Source code for proteobench.io.params.msangel

"""
MSAngel creates modular pipelines that allow several search engines to identify
peptides, which are then quantified with Proline.
The parameters are provided in a .json file.
MSAngel allows for multiple search engines to be used in the same pipeline. So it
requires a list of search engines and their respective parameters, which are then
concatenated.

Relevant information in file:
-
"""

import json
import os
import pathlib
from typing import Union

import pandas as pd

from proteobench.io.params import ProteoBenchParameters
from proteobench.io.params.maxquant import _homogenize_mods


def _homogenize_mod_xtandem(mod_str: str) -> str:
    """Convert MSAngel X!Tandem modification format to ProForma-like notation.

    Format: ``{modname} of {residue}``, e.g. ``Oxidation of M``,
    ``Acetylation of protein N-term``.
    """
    mod_str = mod_str.strip()
    if " of " not in mod_str:
        return mod_str
    name, residue_part = mod_str.split(" of ", 1)
    residue_part = residue_part.strip()
    lower = residue_part.lower()
    if "protein n-term" in lower:
        return f"Protein N-term[{name}]"
    elif "n-term" in lower:
        return f"N-term[{name}]"
    elif "protein c-term" in lower:
        return f"Protein C-term[{name}]"
    elif "c-term" in lower:
        return f"C-term[{name}]"
    else:
        return f"{residue_part.upper()}[{name}]"


# Mapping from enzyme name strings found in MSAngel search parameters to
# canonical ProteoBench names. It is used for both the Mascot ("CLE") and the
# X!Tandem (digestionParameters) enzyme lookups in this module.
# Keys are lowercase to allow case-insensitive lookup: callers strip and
# lowercase the raw name first, and fall back to the raw name when no entry
# matches.
_ENZYME_MAP = {
    "trypsin": "Trypsin",
    "trypsin (kr/np)": "Trypsin",
    "trypsin/p": "Trypsin/P",
    "lysc": "Lys-C",
    "lys-c": "Lys-C",
    "argc": "Arg-C",
    "arg-c": "Arg-C",
    "aspn": "Asp-N",
    "asp-n": "Asp-N",
    "gluc": "Glu-C",
    "glu-c": "Glu-C",
    "chymotrypsin": "Chymotrypsin",
}



[docs]
def extract_search_engine(search_params: dict) -> Union[str, None]:
    """
    Extract the search engine name from the JSON data.

    Only the first search engine of the first operation that declares
    ``searchEnginesWithForms`` is read, so this only works for workflows
    using a single search engine.

    Parameters
    ----------
    search_params : dict
        The parsed MSAngel JSON parameters; must contain an ``"operations"``
        entry that can be iterated.

    Returns
    -------
    str or None
        The search engine name, or None when no operation declares
        ``searchEnginesWithForms``.
    """
    for operation in search_params["operations"]:
        if "searchEnginesWithForms" in operation:
            # Each entry is indexed as (engine name, engine form); take the name of the first one.
            return operation["searchEnginesWithForms"][0][0]

    # No operation carries search engine information.
    return None




[docs]
def extract_params_mascot_specific(search_params: dict, input_params: ProteoBenchParameters) -> ProteoBenchParameters:
    """
    Extract search parameters from the JSON data of a workflow running Mascot.
    Adds them to the partially completed input_params ProteoBenchParameters object.

    From the first search engine form of every operation declaring
    ``searchEnginesWithForms``, the Mascot ``paramMap`` keys are read as follows:
    ``CLE`` (enzyme), ``MODS`` / ``IT_MODS`` (fixed / variable modifications),
    ``PFA`` (allowed missed cleavages), ``ERRORTOLERANT`` (second pass) and
    ``TOL`` / ``TOLU`` (precursor tolerance and its unit). The PSM FDR is taken
    from operations declaring ``validationConfig``.

    Parameters
    ----------
    search_params : dict
        The parsed MSAngel JSON parameters; must contain an ``"operations"`` entry.
    input_params : ProteoBenchParameters
        The partially completed input_params object. It is modified in place.

    Returns
    -------
    ProteoBenchParameters
        The input_params object with the extracted parameters added.

    Raises
    ------
    KeyError
        If an expected key is missing from the JSON data.
    """
    for operation in search_params["operations"]:
        if "searchEnginesWithForms" in operation:
            # Only the first search engine is considered; its second element holds the form.
            param_map = operation["searchEnginesWithForms"][0][1]["paramMap"]

            raw_enzyme = param_map["CLE"]
            # Fall back to the raw name when the enzyme is not in the canonical map.
            input_params.enzyme = _ENZYME_MAP.get(raw_enzyme.strip().lower(), raw_enzyme)

            input_params.fixed_mods = _homogenize_mods(param_map["MODS"])
            input_params.variable_mods = _homogenize_mods(param_map["IT_MODS"])

            input_params.allowed_miscleavages = param_map["PFA"]
            # The error-tolerant flag only drives second_pass; it must not
            # overwrite the missed-cleavage setting read above.
            input_params.second_pass = param_map["ERRORTOLERANT"] == "1"

            # Precursor tolerance, rendered as a symmetric interval.
            tol = float(param_map["TOL"])
            unit = param_map["TOLU"]
            input_params.precursor_mass_tolerance = f"[-{tol} {unit}, +{tol} {unit}]"

        if "validationConfig" in operation:
            # psmExpectedFdr is divided by 100 (presumably stored as a percentage; confirm).
            input_params.ident_fdr_psm = operation["validationConfig"]["psmExpectedFdr"] / 100
            # TODO: min_peptide_length could come from validationConfig["psmFilters"],
            # but it is unclear whether that filter is the max or the min length.

    return input_params




[docs]
def extract_params_xtandem_specific(search_params: dict, input_params: ProteoBenchParameters) -> ProteoBenchParameters:
    """
    Extract search parameters from the JSON data of a workflow running X!Tandem.
    Adds them to the partially completed input_params ProteoBenchParameters object.

    From the first search engine form of every operation declaring
    ``searchEnginesWithForms``, this reads the enzyme, fixed and variable
    modifications, allowed missed cleavages, and precursor / fragment
    tolerances. Modifications enabled through the X!Tandem specific options
    ``proteinQuickAcetyl`` and ``quickPyrolidone`` are appended to the variable
    modifications. The PSM FDR is taken from operations declaring
    ``validationConfig``.

    Parameters
    ----------
    search_params : dict
        The parsed MSAngel JSON parameters; must contain an ``"operations"`` entry.
    input_params : ProteoBenchParameters
        The partially completed input_params object. It is modified in place.

    Returns
    -------
    ProteoBenchParameters
        The input_params object with the extracted parameters added.

    Raises
    ------
    KeyError
        If an expected key is missing from the JSON data.
    """
    xtandem_type = "com.compomics.util.parameters.identification.tool_specific.XtandemParameters"

    for operation in search_params["operations"]:
        if "searchEnginesWithForms" in operation:
            # Only the first search engine is considered; its second element holds the form.
            param_map = operation["searchEnginesWithForms"][0][1]["paramMap"]
            digestion = param_map["digestionParameters"]
            modifications = param_map["modificationParameters"]

            raw_enzyme = digestion["enzymes"][0]["name"]
            # Fall back to the raw name when the enzyme is not in the canonical map.
            input_params.enzyme = _ENZYME_MAP.get(raw_enzyme.strip().lower(), raw_enzyme)

            input_params.fixed_mods = ", ".join(
                _homogenize_mod_xtandem(mod) for mod in modifications["fixedModifications"]
            )
            # Kept as a list until the "hidden" modifications are known, so the
            # final string never starts with a stray separator.
            variable_mods = [_homogenize_mod_xtandem(mod) for mod in modifications["variableModifications"]]

            # Use the raw enzyme name for the missed cleavages lookup (dict keys are vendor names).
            input_params.allowed_miscleavages = digestion["nMissedCleavages"].get(raw_enzyme)

            # Precursor tolerance, rendered as a symmetric interval.
            precursor_tol = float(param_map["precursorTolerance"])
            precursor_unit = param_map["precursorAccuracyType"]
            input_params.precursor_mass_tolerance = (
                f"[-{precursor_tol} {precursor_unit}, +{precursor_tol} {precursor_unit}]"
            )

            # Fragment tolerance, same format.
            fragment_tol = float(param_map["fragmentIonMZTolerance"])
            fragment_unit = param_map["fragmentAccuracyType"]
            input_params.fragment_mass_tolerance = (
                f"[-{fragment_tol} {fragment_unit}, +{fragment_tol} {fragment_unit}]"
            )

            # Add "hidden" modifications when using X!Tandem: they are switched on
            # through algorithm options rather than listed with the other modifications.
            for algorithm in param_map["algorithmParameters"].values():
                if algorithm["type"] != xtandem_type:
                    continue
                xtandem_options = algorithm["data"]
                # JSON booleans decode to Python bools, so an identity check is exact.
                if xtandem_options["proteinQuickAcetyl"] is True:
                    variable_mods.append("N-term[Acetyl]")
                if xtandem_options["quickPyrolidone"] is True:
                    variable_mods.append("N-term[Pyrolidone]")

            input_params.variable_mods = ", ".join(variable_mods)

        if "validationConfig" in operation:
            # psmExpectedFdr is divided by 100 (presumably stored as a percentage; confirm).
            input_params.ident_fdr_psm = operation["validationConfig"]["psmExpectedFdr"] / 100
            # TODO: min_peptide_length could come from validationConfig["psmFilters"],
            # but it is unclear whether that filter is the max or the min length.

    return input_params




[docs]
def extract_params(
    fname: Union[str, pathlib.Path],
    json_file=os.path.join(os.path.dirname(__file__), "json/Quant/quant_lfq_DDA_ion.json"),
) -> ProteoBenchParameters:
    """
    Parse MSAngel quantification tool JSON parameter file and extract relevant parameters.

    Parameters
    ----------
    fname : str or pathlib.Path or file-like
        The path to the MSAngel JSON parameter file, or an in-memory file-like
        object exposing ``getvalue()`` (returning either bytes or text).
    json_file : str, optional
        Passed as ``filename`` to `ProteoBenchParameters`. Defaults to
        ``json/Quant/quant_lfq_DDA_ion.json`` next to this module
        (presumably the parameter field definitions; confirm).

    Returns
    -------
    ProteoBenchParameters
        The extracted parameters as a `ProteoBenchParameters` object.
    """
    params = ProteoBenchParameters(filename=json_file)

    if hasattr(fname, "getvalue"):
        # In-memory file-like object (e.g. BytesIO or StringIO): getvalue()
        # yields bytes for binary buffers and str for text buffers.
        file_contents = fname.getvalue()
        if isinstance(file_contents, (bytes, bytearray)):
            file_contents = file_contents.decode("utf-8")
        data = json.loads(file_contents)
    else:
        # Otherwise, treat it as a file path
        with open(fname, "r", encoding="utf-8") as json_handle:
            data = json.load(json_handle)

    # Extract parameters from the JSON data
    params.software_name = "MSAngel"
    params.software_version = data["msAngelVersion"]
    params.search_engine = extract_search_engine(data)

    # Params fixed in MSAngel
    params.enable_match_between_runs = True

    # Parameter parsing depends on the search engine used; for any other
    # engine no engine-specific parameters are extracted.
    if params.search_engine == "Mascot":
        extract_params_mascot_specific(data, params)
    elif params.search_engine == "X!Tandem":
        extract_params_xtandem_specific(data, params)

    params.fill_none()

    return params



if __name__ == "__main__":
    # Script entry point: extract parameters from the MSAngel example JSON
    # files and save each result as a CSV file next to its input.
    from pathlib import Path

    example_files = (
        Path("../../../test/params/MSAngel_Xtandem-export-param.json"),
        Path("../../../test/params/MSAngel_fromRAWtoQUANT-Mascot-export-param.json"),
    )

    for json_path in example_files:
        extracted = extract_params(json_path)

        # One row per parameter attribute, indexed by attribute name.
        as_series = pd.Series(vars(extracted))
        print(as_series)

        # Same file name as the input, with a .csv suffix.
        as_series.to_csv(json_path.with_suffix(".csv"))