Source code for proteobench.io.params.msangel

"""
MSAngel creates modular pipelines that allow several search engines to identify
peptides, which are then quantified with Proline.
The parameters are provided in a .json file.
MSAngel allows for multiple search engines to be used in the same pipeline. So it
requires a list of search engines and their respective parameters, which are then
concatenated.

Relevant information in file:
-
"""

import json
import os
import pathlib
from typing import Union

import pandas as pd

from proteobench.io.params import ProteoBenchParameters
from proteobench.io.params.maxquant import _homogenize_mods


def _homogenize_mod_xtandem(mod_str: str) -> str:
    """Convert MSAngel X!Tandem modification format to ProForma-like notation.

    Format: ``{modname} of {residue}``, e.g. ``Oxidation of M``,
    ``Acetylation of protein N-term``.
    """
    mod_str = mod_str.strip()
    if " of " not in mod_str:
        return mod_str
    name, residue_part = mod_str.split(" of ", 1)
    residue_part = residue_part.strip()
    lower = residue_part.lower()
    if "protein n-term" in lower:
        return f"Protein N-term[{name}]"
    elif "n-term" in lower:
        return f"N-term[{name}]"
    elif "protein c-term" in lower:
        return f"Protein C-term[{name}]"
    elif "c-term" in lower:
        return f"C-term[{name}]"
    else:
        return f"{residue_part.upper()}[{name}]"


# Mapping from Mascot enzyme name strings to canonical ProteoBench names.
# Keys are lowercase to allow case-insensitive lookup.
_ENZYME_MAP = {
    "trypsin": "Trypsin",
    "trypsin (kr/np)": "Trypsin",
    "trypsin/p": "Trypsin/P",
    "lysc": "Lys-C",
    "lys-c": "Lys-C",
    "argc": "Arg-C",
    "arg-c": "Arg-C",
    "aspn": "Asp-N",
    "asp-n": "Asp-N",
    "gluc": "Glu-C",
    "glu-c": "Glu-C",
    "chymotrypsin": "Chymotrypsin",
}


def extract_search_engine(search_params: dict) -> Union[str, None]:
    """
    Extract the search engine name from the JSON data.

    Only the first search engine of the first operation that declares
    ``searchEnginesWithForms`` is considered, so this only works for workflows
    using a single search engine.

    Parameters
    ----------
    search_params : dict
        The parsed MSAngel JSON document. Its ``"operations"`` entry is
        iterated to find the search engine definition.

    Returns
    -------
    str or None
        The search engine name (the caller compares it against ``"Mascot"``
        and ``"X!Tandem"``), or ``None`` when no operation contains a
        ``searchEnginesWithForms`` entry.

    Raises
    ------
    KeyError
        If ``search_params`` has no ``"operations"`` entry.
    """
    for operation in search_params["operations"]:
        if "searchEnginesWithForms" in operation:
            # Each entry is an (engine name, engine form) pair; take the name
            # of the first engine.
            return operation["searchEnginesWithForms"][0][0]
    return None
def extract_params_mascot_specific(search_params: dict, input_params: ProteoBenchParameters) -> ProteoBenchParameters:
    """
    Extract search parameters from the JSON data of a workflow running Mascot.

    Adds them to the partially completed input_params ProteoBenchParameters object,
    which is modified in place and also returned.

    Parameters
    ----------
    search_params : dict
        The parsed MSAngel JSON document. Its ``"operations"`` entry is iterated.
    input_params : ProteoBenchParameters
        The partially completed input_params object.

    Returns
    -------
    ProteoBenchParameters
        The input_params object with the extracted parameters added.
    """
    for operation in search_params["operations"]:
        if "searchEnginesWithForms" in operation:
            # Parameter map of the first search engine in this operation.
            param_map = operation["searchEnginesWithForms"][0][1]["paramMap"]

            # Fall back to the raw label when the enzyme is not in the lookup table.
            raw_enzyme = param_map["CLE"]
            input_params.enzyme = _ENZYME_MAP.get(raw_enzyme.strip().lower(), raw_enzyme)

            input_params.fixed_mods = _homogenize_mods(param_map["MODS"])
            input_params.variable_mods = _homogenize_mods(param_map["IT_MODS"])

            # Missed cleavages come from "PFA". They must not be overwritten by
            # the error-tolerant flag, which only drives `second_pass`.
            input_params.allowed_miscleavages = param_map["PFA"]
            input_params.second_pass = param_map["ERRORTOLERANT"] == "1"

            # Precursor tolerance is reported as a symmetric interval.
            tol = float(param_map["TOL"])
            unit = param_map["TOLU"]
            input_params.precursor_mass_tolerance = f"[-{tol} {unit}, +{tol} {unit}]"

        if "validationConfig" in operation:
            # The expected FDR is divided by 100 (stored as a percentage).
            input_params.ident_fdr_psm = operation["validationConfig"]["psmExpectedFdr"] / 100
            # TODO: validationConfig["psmFilters"] may hold a peptide length
            # filter; it is unclear whether it is the max or min length, so
            # min_peptide_length is not set here.
    return input_params
def extract_params_xtandem_specific(search_params: dict, input_params: ProteoBenchParameters) -> ProteoBenchParameters:
    """
    Extract search parameters from the JSON data of a workflow running X!Tandem.

    Adds them to the partially completed input_params ProteoBenchParameters object,
    which is modified in place and also returned.

    Parameters
    ----------
    search_params : dict
        The parsed MSAngel JSON document. Its ``"operations"`` entry is iterated.
    input_params : ProteoBenchParameters
        The partially completed input_params object.

    Returns
    -------
    ProteoBenchParameters
        The input_params object with the extracted parameters added.
    """
    xtandem_type = "com.compomics.util.parameters.identification.tool_specific.XtandemParameters"

    for operation in search_params["operations"]:
        if "searchEnginesWithForms" in operation:
            # Parameter map of the first search engine in this operation.
            param_map = operation["searchEnginesWithForms"][0][1]["paramMap"]
            digestion = param_map["digestionParameters"]
            modifications = param_map["modificationParameters"]

            # Fall back to the raw label when the enzyme is not in the lookup table.
            raw_enzyme = digestion["enzymes"][0]["name"]
            input_params.enzyme = _ENZYME_MAP.get(raw_enzyme.strip().lower(), raw_enzyme)

            input_params.fixed_mods = ", ".join(
                _homogenize_mod_xtandem(mod) for mod in modifications["fixedModifications"]
            )
            # Collected as a list so the "hidden" modifications below can be
            # appended without producing a leading ", " when the list is empty.
            variable_mods = [_homogenize_mod_xtandem(mod) for mod in modifications["variableModifications"]]

            # Use the raw enzyme name for the missed cleavages lookup
            # (the dict is keyed by the name as written in the file).
            input_params.allowed_miscleavages = digestion["nMissedCleavages"].get(raw_enzyme)

            # Precursor tolerance, reported as a symmetric interval.
            precursor_tol = float(param_map["precursorTolerance"])
            precursor_unit = param_map["precursorAccuracyType"]
            input_params.precursor_mass_tolerance = (
                f"[-{precursor_tol} {precursor_unit}, +{precursor_tol} {precursor_unit}]"
            )

            # Fragment tolerance, reported as a symmetric interval.
            fragment_tol = float(param_map["fragmentIonMZTolerance"])
            fragment_unit = param_map["fragmentAccuracyType"]
            input_params.fragment_mass_tolerance = (
                f"[-{fragment_tol} {fragment_unit}, +{fragment_tol} {fragment_unit}]"
            )

            # Add "hidden" modifications enabled through X!Tandem-specific switches.
            for algorithm in param_map["algorithmParameters"].values():
                if algorithm["type"] != xtandem_type:
                    continue
                # JSON booleans decode to Python bool, so compare by identity.
                if algorithm["data"]["proteinQuickAcetyl"] is True:
                    variable_mods.append("N-term[Acetyl]")
                if algorithm["data"]["quickPyrolidone"] is True:
                    variable_mods.append("N-term[Pyrolidone]")

            input_params.variable_mods = ", ".join(variable_mods)

        if "validationConfig" in operation:
            # The expected FDR is divided by 100 (stored as a percentage).
            input_params.ident_fdr_psm = operation["validationConfig"]["psmExpectedFdr"] / 100
            # TODO: validationConfig["psmFilters"] may hold a peptide length
            # filter; it is unclear whether it is the max or min length, so
            # min_peptide_length is not set here.
    return input_params
def extract_params(
    fname: Union[str, pathlib.Path],
    json_file=os.path.join(os.path.dirname(__file__), "json/Quant/quant_lfq_DDA_ion.json"),
) -> ProteoBenchParameters:
    """
    Parse MSAngel quantification tool JSON parameter file and extract relevant parameters.

    Parameters
    ----------
    fname : str or pathlib.Path
        The path to the MSAngel JSON parameter file. An in-memory buffer
        exposing ``getvalue()`` (e.g. ``io.BytesIO`` or ``io.StringIO``) is
        accepted as well.
    json_file : str
        Path handed to ``ProteoBenchParameters(filename=...)``. Defaults to
        ``json/Quant/quant_lfq_DDA_ion.json`` next to this module.

    Returns
    -------
    ProteoBenchParameters
        The extracted parameters as a `ProteoBenchParameters` object.
    """
    params = ProteoBenchParameters(filename=json_file)

    if hasattr(fname, "getvalue"):
        # In-memory buffer: BytesIO yields bytes, StringIO yields str.
        file_contents = fname.getvalue()
        if isinstance(file_contents, (bytes, bytearray)):
            file_contents = file_contents.decode("utf-8")
        data = json.loads(file_contents)
    else:
        # Otherwise, treat it as a file path.
        with open(fname, "r", encoding="utf-8") as handle:
            data = json.load(handle)

    # Extract parameters from the JSON data
    params.software_name = "MSAngel"
    params.software_version = data["msAngelVersion"]
    params.search_engine = extract_search_engine(data)

    # Params fixed in MSAngel
    params.enable_match_between_runs = True

    # Parameter parsing depends on the search engine used; the helpers fill
    # `params` in place. Any other engine leaves the remaining fields unset.
    if params.search_engine == "Mascot":
        extract_params_mascot_specific(data, params)
    elif params.search_engine == "X!Tandem":
        extract_params_xtandem_specific(data, params)

    params.fill_none()
    return params
if __name__ == "__main__":
    # Extract parameters from MSAngel JSON files and save them as CSV.
    from pathlib import Path

    param_files = [
        Path("../../../test/params/MSAngel_Xtandem-export-param.json"),
        Path("../../../test/params/MSAngel_fromRAWtoQUANT-Mascot-export-param.json"),
    ]

    for param_path in param_files:
        # Parse the export, then view the parameter attributes as a pandas Series.
        extracted = extract_params(param_path)
        as_series = pd.Series(vars(extracted))
        print(as_series)

        # Store the Series next to the input file, with a .csv suffix.
        as_series.to_csv(param_path.with_suffix(".csv"))