Source code for proteobench.io.params.msangel

"""
MSAngel creates modular pipelines that allow several search engines to identify
peptides, which are then quantified with Proline.
The parameters are provided in a .json file.
MSAngel allows for multiple search engines to be used in the same pipeline. So it
requires a list of search engines and their respective parameters, which are then
concatenated.

Relevant information in file:
-
"""

import json
import os
import pathlib
from typing import Union

import pandas as pd

from proteobench.io.params import ProteoBenchParameters



[docs]
def extract_search_engine(search_params: dict) -> Union[str, None]:
    """
    Extract the search engine name from the parsed MSAngel JSON data.

    Only the first engine entry of the first operation that declares a
    non-empty ``searchEnginesWithForms`` list is reported, so the result is
    only meaningful for workflows using a single search engine.

    Parameters
    ----------
    search_params : dict
        The parsed JSON content. It must provide an ``"operations"`` entry
        that can be iterated over.

    Returns
    -------
    str or None
        The first element of the first engine entry (the engine name), or
        ``None`` when no operation declares a search engine.
    """
    for operation in search_params["operations"]:
        if "searchEnginesWithForms" not in operation:
            continue
        engines_with_forms = operation["searchEnginesWithForms"]
        # An empty engine list carries no name; treat it like a missing key
        # instead of failing with an IndexError.
        if engines_with_forms:
            return engines_with_forms[0][0]

    return None




[docs]
def extract_params_mascot_specific(search_params: dict, input_params: ProteoBenchParameters) -> ProteoBenchParameters:
    """
    Extract search parameters from the JSON data of a workflow running Mascot.

    The values are written onto the partially completed ``input_params``
    object, which is modified in place and also returned.

    Parameters
    ----------
    search_params : dict
        The parsed JSON content. It must provide an ``"operations"`` entry
        that can be iterated over.
    input_params : ProteoBenchParameters
        The partially completed parameter object to fill.

    Returns
    -------
    ProteoBenchParameters
        The same ``input_params`` object with the extracted parameters added.

    Raises
    ------
    KeyError
        If an operation declaring ``searchEnginesWithForms`` lacks one of the
        Mascot keys read below.
    """
    for operation in search_params["operations"]:
        if "searchEnginesWithForms" in operation:
            # Only the first engine entry is read; its second element holds
            # the form whose "paramMap" carries the Mascot settings.
            param_map = operation["searchEnginesWithForms"][0][1]["paramMap"]

            # TODO: the search engine version is not extracted yet.
            input_params.enzyme = param_map["CLE"]
            input_params.fixed_mods = param_map["MODS"]
            input_params.variable_mods = param_map["IT_MODS"]
            input_params.allowed_miscleavages = param_map["PFA"]

            # The error-tolerant flag only drives ``second_pass``; it must not
            # be written to ``allowed_miscleavages`` (set from "PFA" above).
            input_params.second_pass = param_map["ERRORTOLERANT"] == "1"

            # Precursor tolerance is reported as a symmetric interval.
            tol = param_map["TOL"]
            unit = param_map["TOLU"]
            tol = float(tol)
            input_params.precursor_mass_tolerance = f"[-{tol} {unit}, +{tol} {unit}]"

        if "validationConfig" in operation:
            # The stored value is divided by 100 — presumably a percentage; confirm against MSAngel exports.
            input_params.ident_fdr_psm = operation["validationConfig"]["psmExpectedFdr"] / 100
            # TODO: "psmFilters" may hold a peptide length limit, but it is unclear whether it is the min or max length.

    return input_params




[docs]
def extract_params_xtandem_specific(search_params: list, input_params: ProteoBenchParameters) -> ProteoBenchParameters:
    """
    Extract search parameters from the JSON data of a workflow running X!Tandem.

    The values are written onto the partially completed ``input_params``
    object, which is modified in place and also returned.

    Parameters
    ----------
    search_params : list
        The search parameters extracted from the JSON file. The code accesses
        it by key (``search_params["operations"]``) and iterates the result.
    input_params : ProteoBenchParameters
        The partially completed parameter object to fill.

    Returns
    -------
    ProteoBenchParameters
        The same ``input_params`` object with the extracted parameters added.
    """
    xtandem_type = "com.compomics.util.parameters.identification.tool_specific.XtandemParameters"

    for operation in search_params["operations"]:
        if "searchEnginesWithForms" in operation:
            # Only the first engine entry is read; its second element holds
            # the form whose "paramMap" carries the search settings.
            param_map = operation["searchEnginesWithForms"][0][1]["paramMap"]

            # Enzyme: name of the first configured enzyme.
            digestion = param_map["digestionParameters"]
            input_params.enzyme = digestion["enzymes"][0]["name"]

            # Modifications are flattened into comma-separated strings.
            modifications = param_map["modificationParameters"]
            input_params.fixed_mods = ", ".join(modifications["fixedModifications"])
            input_params.variable_mods = ", ".join(modifications["variableModifications"])

            # Missed cleavages are stored per enzyme name; None if the enzyme has no entry.
            input_params.allowed_miscleavages = digestion["nMissedCleavages"].get(input_params.enzyme)

            # Precursor tolerance as a symmetric interval string.
            precursor_tol = param_map["precursorTolerance"]
            precursor_unit = param_map["precursorAccuracyType"]
            precursor_bound = str(float(precursor_tol)) + " " + precursor_unit
            input_params.precursor_mass_tolerance = "[-" + precursor_bound + ", +" + precursor_bound + "]"

            # Fragment tolerance, same format.
            fragment_tol = param_map["fragmentIonMZTolerance"]
            fragment_unit = param_map["fragmentAccuracyType"]
            fragment_bound = str(float(fragment_tol)) + " " + fragment_unit
            input_params.fragment_mass_tolerance = "[-" + fragment_bound + ", +" + fragment_bound + "]"

            # X!Tandem can enable extra ("hidden") variable modifications through
            # its tool-specific options; append them when switched on.
            for algorithm in param_map["algorithmParameters"].values():
                if algorithm["type"] != xtandem_type:
                    continue
                xtandem_options = algorithm["data"]
                # Explicit comparison against True is kept on purpose so that
                # arbitrary truthy values are not treated as enabled.
                if xtandem_options["proteinQuickAcetyl"] == True:
                    input_params.variable_mods += ";Acetyl(N-term)"
                if xtandem_options["quickPyrolidone"] == True:
                    input_params.variable_mods += ";Pyrolidone(N-term)"

        if "validationConfig" in operation:
            # The stored value is divided by 100 — presumably a percentage; confirm against MSAngel exports.
            input_params.ident_fdr_psm = operation["validationConfig"]["psmExpectedFdr"] / 100
            # TODO: "psmFilters" may hold a peptide length limit, but it is unclear whether it is the min or max length.

    return input_params




[docs]
def extract_params(
    fname: Union[str, pathlib.Path],
    json_file=os.path.join(os.path.dirname(__file__), "json/Quant/quant_lfq_DDA_ion.json"),
) -> ProteoBenchParameters:
    """
    Parse an MSAngel JSON parameter file and extract the relevant parameters.

    Parameters
    ----------
    fname : str, pathlib.Path or in-memory buffer
        The path to the MSAngel JSON parameter file, or a buffer object
        exposing ``getvalue()`` (for example ``io.BytesIO`` or
        ``io.StringIO``) that holds the JSON content.
    json_file : str, optional
        Passed as ``filename`` to the ``ProteoBenchParameters`` constructor.
        Defaults to ``json/Quant/quant_lfq_DDA_ion.json`` next to this module.

    Returns
    -------
    ProteoBenchParameters
        The extracted parameters as a `ProteoBenchParameters` object.
    """
    params = ProteoBenchParameters(filename=json_file)

    getvalue = getattr(fname, "getvalue", None)
    if getvalue is not None:
        # In-memory buffer: bytes buffers need decoding, text buffers already
        # yield ``str`` (which has no ``decode`` method).
        file_contents = getvalue()
        if isinstance(file_contents, (bytes, bytearray)):
            file_contents = file_contents.decode("utf-8")
        data = json.loads(file_contents)
    else:
        # Otherwise, treat it as a file path; read as UTF-8 like the buffer branch.
        with open(fname, "r", encoding="utf-8") as file_handle:
            data = json.load(file_handle)

    # Extract parameters from the JSON data
    params.software_name = "MSAngel"
    params.software_version = data["msAngelVersion"]
    params.search_engine = extract_search_engine(data)

    # Params fixed in MSAngel
    params.enable_match_between_runs = True

    # Parameter parsing depends on the search engine used; the helpers fill
    # ``params`` in place. Any other engine name leaves the remaining fields untouched.
    if params.search_engine == "Mascot":
        extract_params_mascot_specific(data, params)
    elif params.search_engine == "X!Tandem":
        extract_params_xtandem_specific(data, params)

    params.fill_none()

    return params



if __name__ == "__main__":
    # Script entry point: extract the parameters from the MSAngel example
    # JSON files and store each result as a CSV file next to its input.
    from pathlib import Path

    example_files = (
        Path("../../../test/params/MSAngel_Xtandem-export-param.json"),
        Path("../../../test/params/MSAngel_fromRAWtoQUANT-Mascot-export-param.json"),
    )

    for json_path in example_files:
        extracted = extract_params(json_path)

        # One row per parameter attribute, written with the same file stem and a .csv suffix.
        csv_path = json_path.with_suffix(".csv")
        pd.Series(extracted.__dict__).to_csv(csv_path)