Source code for proteobench.io.params.msangel

"""
MSAngel creates modular pipelines that allow several search engines to identify
peptides, which are then quantified with Proline.
The parameters are provided in a .json file.
MSAngel allows for multiple search engines to be used in the same pipeline. So it
requires a list of search engines and their respective parameters, which are then
concatenated.

Relevant information in file:
-
"""

import json
import os
import pathlib
from typing import Union

import pandas as pd

from proteobench.io.params import ProteoBenchParameters


def extract_search_engine(search_params: dict) -> Union[str, None]:
    """
    Extract the search engine name from the JSON data.

    It only works for workflows using a single search engine: the first
    operation that carries a ``searchEnginesWithForms`` entry decides the
    result, and only the first engine of that entry is read.

    Parameters
    ----------
    search_params : dict
        The parsed MSAngel JSON data. Its ``"operations"`` entry is scanned.

    Returns
    -------
    str or None
        The search engine name (the caller compares it against ``"Mascot"``
        and ``"X!Tandem"``), or ``None`` when no operation declares a search
        engine.

    Raises
    ------
    KeyError
        If ``search_params`` has no ``"operations"`` entry.
    """
    for operation in search_params["operations"]:
        if "searchEnginesWithForms" in operation:
            # Entries are indexed as [engine name, engine form].
            return operation["searchEnginesWithForms"][0][0]
    return None
def extract_params_mascot_specific(search_params: dict, input_params: ProteoBenchParameters) -> ProteoBenchParameters:
    """
    Extract search parameters from the JSON data of a workflow running Mascot.

    Adds them to the partially completed input_params ProteoBenchParameters
    object, which is modified in place and also returned.

    Parameters
    ----------
    search_params : dict
        The parsed MSAngel JSON data. Its ``"operations"`` entry is scanned.
    input_params : ProteoBenchParameters
        The partially completed input_params object.

    Returns
    -------
    ProteoBenchParameters
        The input_params object with the extracted parameters added.

    Raises
    ------
    KeyError
        If an expected key is missing from the Mascot ``paramMap`` or from
        ``validationConfig``.
    """
    for operation in search_params["operations"]:
        if "searchEnginesWithForms" in operation:
            # Entries are indexed as [engine name, engine form]; only the first engine is read.
            # NOTE: the search engine version is not extracted here.
            param_map = operation["searchEnginesWithForms"][0][1]["paramMap"]

            input_params.enzyme = param_map["CLE"]
            input_params.fixed_mods = param_map["MODS"]
            input_params.variable_mods = param_map["IT_MODS"]
            input_params.allowed_miscleavages = param_map["PFA"]

            # ERRORTOLERANT only drives second_pass. It must not be written to
            # allowed_miscleavages, which would discard the PFA value read above.
            input_params.second_pass = param_map["ERRORTOLERANT"] == "1"

            # Precursor tolerance is reported as a symmetric window around zero.
            tol = param_map["TOL"]
            unit = param_map["TOLU"]
            tol = float(tol)
            input_params.precursor_mass_tolerance = f"[-{tol} {unit}, +{tol} {unit}]"

        if "validationConfig" in operation:
            # The file value is divided by 100 (presumably stored as a percentage - confirm).
            input_params.ident_fdr_psm = operation["validationConfig"]["psmExpectedFdr"] / 100
            # TODO: min_peptide_length might be available in validationConfig["psmFilters"],
            # but it is unclear whether that filter is the max or min length.
    return input_params
def extract_params_xtandem_specific(search_params: dict, input_params: ProteoBenchParameters) -> ProteoBenchParameters:
    """
    Extract search parameters from the JSON data of a workflow running X!Tandem.

    Adds them to the partially completed input_params ProteoBenchParameters
    object, which is modified in place and also returned.

    Parameters
    ----------
    search_params : dict
        The parsed MSAngel JSON data. Its ``"operations"`` entry is scanned.
    input_params : ProteoBenchParameters
        The partially completed input_params object.

    Returns
    -------
    ProteoBenchParameters
        The input_params object with the extracted parameters added.

    Raises
    ------
    KeyError
        If an expected key is missing from the X!Tandem ``paramMap`` or from
        ``validationConfig``.
    """
    xtandem_type = "com.compomics.util.parameters.identification.tool_specific.XtandemParameters"

    for operation in search_params["operations"]:
        if "searchEnginesWithForms" in operation:
            # Entries are indexed as [engine name, engine form]; only the first engine is read.
            # NOTE: the search engine version is not extracted here.
            param_map = operation["searchEnginesWithForms"][0][1]["paramMap"]
            digestion = param_map["digestionParameters"]

            input_params.enzyme = digestion["enzymes"][0]["name"]

            modifications = param_map["modificationParameters"]
            input_params.fixed_mods = ", ".join(modifications["fixedModifications"])
            input_params.variable_mods = ", ".join(modifications["variableModifications"])

            # Missed cleavages are stored per enzyme name; None when the
            # selected enzyme has no entry.
            input_params.allowed_miscleavages = digestion["nMissedCleavages"].get(input_params.enzyme, None)

            # Precursor tolerance, reported as a symmetric window around zero.
            tol = param_map["precursorTolerance"]
            unit = param_map["precursorAccuracyType"]
            tol = float(tol)
            input_params.precursor_mass_tolerance = f"[-{tol} {unit}, +{tol} {unit}]"

            # Fragment tolerance, same representation.
            tol2 = param_map["fragmentIonMZTolerance"]
            unit2 = param_map["fragmentAccuracyType"]
            tol2 = float(tol2)
            input_params.fragment_mass_tolerance = f"[-{tol2} {unit2}, +{tol2} {unit2}]"

            # Add "hidden" modifications when using X!Tandem: these are toggled
            # in the algorithm-specific section rather than listed with the
            # variable modifications. Only a literal JSON ``true`` enables them.
            for algorithm in param_map["algorithmParameters"].values():
                if algorithm["type"] != xtandem_type:
                    continue
                if algorithm["data"]["proteinQuickAcetyl"] is True:
                    input_params.variable_mods = input_params.variable_mods + ";Acetyl(N-term)"
                if algorithm["data"]["quickPyrolidone"] is True:
                    input_params.variable_mods = input_params.variable_mods + ";Pyrolidone(N-term)"

        if "validationConfig" in operation:
            # The file value is divided by 100 (presumably stored as a percentage - confirm).
            input_params.ident_fdr_psm = operation["validationConfig"]["psmExpectedFdr"] / 100
            # TODO: min_peptide_length might be available in validationConfig["psmFilters"],
            # but it is unclear whether that filter is the max or min length.
    return input_params
def extract_params(
    fname: Union[str, pathlib.Path],
    json_file=os.path.join(os.path.dirname(__file__), "json/Quant/quant_lfq_DDA_ion.json"),
) -> ProteoBenchParameters:
    """
    Parse MSAngel quantification tool JSON parameter file and extract relevant parameters.

    Parameters
    ----------
    fname : str or pathlib.Path
        The path to the MSAngel JSON parameter file. An in-memory buffer that
        exposes ``getvalue()`` (returning ``bytes`` or ``str``) is accepted as
        well.
    json_file : str, optional
        Passed as ``filename`` to ``ProteoBenchParameters``. Defaults to the
        ``json/Quant/quant_lfq_DDA_ion.json`` file next to this module.

    Returns
    -------
    ProteoBenchParameters
        The extracted parameters as a `ProteoBenchParameters` object.
    """
    params = ProteoBenchParameters(filename=json_file)

    if hasattr(fname, "getvalue"):
        # In-memory buffer: binary buffers return bytes, text buffers
        # (e.g. StringIO) already return str and must not be decoded.
        file_contents = fname.getvalue()
        if isinstance(file_contents, (bytes, bytearray)):
            file_contents = file_contents.decode("utf-8")
        data = json.loads(file_contents)
    else:
        # Otherwise, treat it as a file path; read as UTF-8 to match the
        # decoding used for in-memory buffers.
        with open(fname, "r", encoding="utf-8") as file_handle:
            data = json.load(file_handle)

    # Extract parameters from the JSON data
    params.software_name = "MSAngel"
    params.software_version = data["msAngelVersion"]
    params.search_engine = extract_search_engine(data)

    # Params fixed in MSAngel
    params.enable_match_between_runs = True

    # Parameter parsing depends on the search engine used; the helpers fill
    # `params` in place. Any other engine gets no engine-specific parameters.
    if params.search_engine == "Mascot":
        extract_params_mascot_specific(data, params)
    elif params.search_engine == "X!Tandem":
        extract_params_xtandem_specific(data, params)

    params.fill_none()
    return params
if __name__ == "__main__":
    # Extract parameters from MSAngel JSON files and save them as CSV.
    from pathlib import Path

    # Example exports, addressed relative to the current working directory.
    test_params_dir = Path("../../../test/params")
    example_names = (
        "MSAngel_Xtandem-export-param.json",
        "MSAngel_fromRAWtoQUANT-Mascot-export-param.json",
    )

    for example_name in example_names:
        json_path = test_params_dir / example_name

        # Parse the export, then flatten the parameter object's attributes
        # into a pandas Series.
        extracted = extract_params(json_path)
        as_series = pd.Series(vars(extracted))

        # Write the Series next to the input file, swapping .json for .csv.
        as_series.to_csv(json_path.with_suffix(".csv"))