# Source code for proteobench.io.params.fragger

"""
Functionality to parse FragPipe fragger.params parameter files.

FragPipe has a text based parameter file format which
separates parameters and their value using an equal sign. Optional comments are
expressed with a hash sign.
"""

from __future__ import annotations

import logging
import os
import pathlib
import re
from collections import namedtuple
from io import BytesIO
from typing import List

import pandas as pd

from proteobench.io.params import ProteoBenchParameters

# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# One parsed line of a parameter file: the parameter name, its value and any
# trailing comment. Parts that are missing on a line are stored as None.
Parameter = namedtuple("Parameter", ["name", "value", "comment"])

# Captures the text between "MSFragger-" and ".jar"; it is applied to the jar
# file name taken from the ``fragpipe-config.bin-msfragger`` entry to obtain
# the MSFragger version.
VERSION_NO_PATTERN = r"MSFragger-(.+)\.jar"

# Common mass shifts mapped to modification names (ProForma notation)
# NOTE(review): keys are presumably monoisotopic mass shifts in Da -- confirm.
# Different masses may map to the same name (see the two "Pyro-glu" entries).
MASS_TO_MOD = {
    57.02146: "Carbamidomethyl",
    15.9949: "Oxidation",
    42.0106: "Acetyl",
    79.96633: "Phospho",
    114.04293: "GG",
    -17.0265: "Pyro-glu",
    -18.0106: "Pyro-glu",
    4.025107: "Label:2H(4)",
    6.020129: "Label:13C(6)",
    8.014199: "Label:13C(6)15N(2)",
    10.008269: "Label:13C(6)15N(4)",
}
# Absolute tolerance (same unit as the MASS_TO_MOD keys) used both to match a
# mass against MASS_TO_MOD and to skip entries whose mass shift is ~0.
MASS_TOLERANCE = 0.001


def _lookup_mod_name(mass: float) -> str | None:
    """Return the name of the first ``MASS_TO_MOD`` entry matching ``mass``.

    An entry matches when its reference mass differs from ``mass`` by less
    than ``MASS_TOLERANCE``. ``None`` is returned when nothing matches.
    """
    candidates = (
        label for known_mass, label in MASS_TO_MOD.items() if abs(mass - known_mass) < MASS_TOLERANCE
    )
    return next(candidates, None)


def _parse_fixed_mods(raw: str) -> str:
    """Parse MSFragger fixed modifications string into ProForma-like format.

    Input format: ``mass,residue_description,active,num_sites`` entries separated by ``; ``.
    Example: ``57.02146,C (cysteine),true,-1``
    """
    if not raw or not raw.strip():
        return ""
    results = []
    for entry in raw.split("; "):
        parts = entry.strip().split(",", 3)
        if len(parts) < 3:
            continue
        mass_str, residue_desc, active = parts[0].strip(), parts[1].strip(), parts[2].strip()
        if active != "true":
            continue
        mass = float(mass_str)
        if abs(mass) < MASS_TOLERANCE:
            continue
        mod_name = _lookup_mod_name(mass) or mass_str.strip()
        residue_match = re.match(r"^([A-Z])\s*\(", residue_desc)
        if residue_match:
            residue = residue_match.group(1)
        elif "N-Term" in residue_desc:
            residue = "N-term"
        elif "C-Term" in residue_desc:
            residue = "C-term"
        else:
            residue = residue_desc
        results.append(f"{residue}[{mod_name}]")
    return ", ".join(results)


def _parse_variable_mods(raw: str) -> str:
    """Parse MSFragger variable modifications string into ProForma-like format.

    Input format: ``mass,residue,active,max_occurrences`` entries separated by ``; ``.
    Special residue notations: ``[^`` = protein N-term, ``nX`` = peptide N-term of residue X.
    """
    if not raw or not raw.strip():
        return ""
    results = []
    for entry in raw.split("; "):
        parts = entry.strip().split(",", 3)
        if len(parts) < 3:
            continue
        mass_str, residue_field, active = parts[0].strip(), parts[1].strip(), parts[2].strip()
        if active != "true":
            continue
        mass = float(mass_str)
        if abs(mass) < MASS_TOLERANCE:
            continue
        mod_name = _lookup_mod_name(mass) or mass_str.strip()
        if residue_field == "[^":
            results.append(f"N-term[{mod_name}]")
        elif residue_field.startswith("n"):
            aa_residues = re.findall(r"n([A-Z])", residue_field)
            if aa_residues:
                for aa in aa_residues:
                    results.append(f"N-term {aa}[{mod_name}]")
            else:
                results.append(f"N-term[{mod_name}]")
        else:
            results.append(f"{residue_field}[{mod_name}]")
    return ", ".join(results)



# [docs]
def parse_phi_report_filters(phi_report_cmd: str) -> tuple[float, float, float]:
    """
    Parse the FDR filters from the phi-report command string.

    The ``--psm``, ``--pep`` and ``--prot`` options are looked up in the
    command. Their value may be separated from the flag by whitespace or an
    equal sign and may be written as an integer (``1``), a decimal (``0.01``,
    ``.05``) or in scientific notation (``1e-2``).

    Parameters
    ----------
    phi_report_cmd : str
        The command string from the phi-report filter. ``None`` or an empty
        string is treated as a command without any FDR option.

    Returns
    -------
    tuple of (float, float, float)
        A tuple containing the PSM, peptide, and protein FDR values. An option
        that is absent from the command is reported as ``0.01``.
    """
    # Value used for every level that is not set explicitly in the command
    default_fdr = 0.01

    # Integer, decimal (with or without leading digit) or scientific notation.
    # A plain ``\d+\.\d+`` would silently ignore values such as ``--prot 1``.
    number = r"((?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?)"

    # The separator requirement keeps ``--pep`` from matching longer option
    # names that merely start with the same letters.
    fdr_patterns = {
        "psm": r"--psm(?:\s+|=)\s*" + number,
        "peptide": r"--pep(?:\s+|=)\s*" + number,
        "protein": r"--prot(?:\s+|=)\s*" + number,
    }

    command = phi_report_cmd or ""

    # Extract FDR values using regex
    fdr_values = {
        key: float(match.group(1)) if (match := re.search(pattern, command)) else default_fdr
        for key, pattern in fdr_patterns.items()
    }

    return fdr_values["psm"], fdr_values["peptide"], fdr_values["protein"]




# [docs]
def parse_params(l_of_str: List[str], sep: str = " = ") -> List[Parameter]:
    """
    Parse the FragPipe parameter file and return a list of Parameter objects.

    Blank lines and lines starting with ``#`` are skipped. On every other
    line, everything after the first ``#`` is taken as the comment and the
    remainder is split once on ``sep`` into name and value.

    Parameters
    ----------
    l_of_str : List[str]
        The lines of the FragPipe parameter file as a list of strings.
    sep : str, optional
        The separator between parameter names and values. Default is " = ".

    Returns
    -------
    List[Parameter]
        A list of Parameter namedtuples containing the parameter name, value, and any comment.
        ``value`` is None when the line does not contain ``sep``; ``comment`` is None when
        the line does not contain ``#``.
    """
    data = []
    for line in l_of_str:
        line = line.strip()
        logger.debug(line)
        if not line or line.startswith("#"):
            continue  # Skip empty lines and full-line comments

        # Split on the FIRST '#' only: further '#' characters belong to the
        # comment text and must not break the two-part split.
        param, hash_sign, comment = line.partition("#")
        comment = comment.strip() if hash_sign else None

        name, sep_found, value = param.strip().partition(sep)
        if not sep_found:
            # No separator on this line: keep the text as a name without value
            data.append(Parameter(name.strip(), None, comment))
            continue
        data.append(Parameter(name.strip(), value.strip(), comment))
    return data




# [docs]
def read_fragpipe_workflow(
    file: BytesIO, sep: str = "="
) -> tuple[str, str | None, str | None, list[Parameter]]:
    """
    Read the FragPipe workflow file and return header, versions and a list of Parameter objects.

    Parameters
    ----------
    file : BytesIO
        The FragPipe workflow file to read. Its content is decoded as UTF-8.
    sep : str, optional
        The separator used between parameter names and values. Default is "=".

    Returns
    -------
    tuple of (str, str or None, str or None, list of Parameter)
        The header (first line without its leading character; empty string for
        an empty file), the MSFragger version, the FragPipe version and the
        parsed parameters. A version is None when it cannot be found.
        The MSFragger version is taken from a ``# MSFragger version`` comment
        line if there is one, otherwise from the jar file name in the
        ``fragpipe-config.bin-msfragger`` entry.
    """
    l_of_str = file.read().decode("utf-8").splitlines()
    header = l_of_str[0][1:].strip() if l_of_str else ""  # Skip leading '#' in the header

    commented_version = None  # from a "# MSFragger version ..." comment line
    jar_version = None  # from the MSFragger jar file name
    fragpipe_version = None
    # All lines are scanned so that the FragPipe version is found regardless
    # of where it appears relative to the MSFragger version comment.
    for ss in l_of_str[1:]:
        if ss.startswith("# MSFragger version"):
            if commented_version is None:
                # split() without argument ignores trailing whitespace
                commented_version = ss.split()[-1]
        elif ss.startswith("# FragPipe version"):
            fragpipe_version = ss.split()[-1]
        elif ss.startswith("fragpipe-config.bin-msfragger"):
            path = ss.split("=")[-1].strip()
            # Keep only the file name; both POSIX and Windows separators occur
            filename = re.split(r"[/\\]", path)[-1]
            match = re.search(VERSION_NO_PATTERN, filename)
            if match:
                jar_version = match.group(1)

    # The explicit comment line takes precedence over the jar file name
    msfragger_version = commented_version if commented_version is not None else jar_version
    return header, msfragger_version, fragpipe_version, parse_params(l_of_str, sep=sep)




# [docs]
def extract_params(
    file: BytesIO, json_file=os.path.join(os.path.dirname(__file__), "json/Quant/quant_lfq_DDA_ion.json")
) -> ProteoBenchParameters:
    """
    Build a `ProteoBenchParameters` object from a FragPipe workflow file.

    Parameters
    ----------
    file : BytesIO
        The FragPipe workflow file to parse.
    json_file : str, optional
        Path handed to `ProteoBenchParameters` as ``filename``. Defaults to
        ``json/Quant/quant_lfq_DDA_ion.json`` next to this module.

    Returns
    -------
    ProteoBenchParameters
        The parameters extracted from the workflow file.
    """
    header, msfragger_version, fragpipe_version, records = read_fragpipe_workflow(file)
    # Series of parameter values indexed by parameter name
    table = pd.DataFrame.from_records(records, columns=Parameter._fields)
    settings = table.set_index(Parameter._fields[0])["value"]

    def is_on(key):
        """Tell whether the workflow sets ``key`` to the literal ``true``."""
        return settings.loc[key] == "true"

    # Fall back to the version given in the header line
    if not fragpipe_version:
        header_match = re.match(r"FragPipe \((\d+\.\d+.*)\)", header)
        fragpipe_version = header_match.group(1)

    # Software and search engine identification
    params = ProteoBenchParameters(filename=json_file)
    params.software_name = "FragPipe"
    params.software_version = fragpipe_version
    params.search_engine = "MSFragger"
    params.search_engine_version = msfragger_version

    # Enzyme(s): a second enzyme, if any, is appended after a pipe
    enzyme_name = settings.loc["msfragger.search_enzyme_name_1"]
    second_enzyme = settings.loc["msfragger.search_enzyme_name_2"]
    if second_enzyme != "null":
        enzyme_name += f"|{second_enzyme}"
    # strict trypsin: always cut after K and R; trypsin: do not cut before P
    enzyme_labels = {"stricttrypsin": "Trypsin/P", "trypsin": "Trypsin"}
    params.enzyme = enzyme_labels.get(enzyme_name, enzyme_name)
    params.allowed_miscleavages = int(settings.loc["msfragger.allowed_missed_cleavage_1"])

    # 2 is ENZYMATIC, 1 is SEMI, 3 is SEMI_N_TERM, 0 is NONSPECIFIC
    params.semi_enzymatic = settings.loc["msfragger.num_enzyme_termini"] != "2"

    # Modifications
    params.fixed_mods = _parse_fixed_mods(settings.loc["msfragger.table.fix-mods"])
    params.variable_mods = _parse_variable_mods(settings.loc["msfragger.table.var-mods"])
    params.max_mods = int(settings.loc["msfragger.max_variable_mods_per_peptide"])

    # Peptide length
    params.min_peptide_length = int(settings.loc["msfragger.digest_min_length"])
    params.max_peptide_length = int(settings.loc["msfragger.digest_max_length"])

    # Precursor mass tolerance: a non-zero unit flag selects ppm, zero selects Da
    precursor_unit = "ppm" if int(settings.loc["msfragger.precursor_mass_units"]) else "Da"
    precursor_lower = settings.loc["msfragger.precursor_mass_lower"]
    precursor_upper = settings.loc["msfragger.precursor_mass_upper"]
    params.precursor_mass_tolerance = (
        f"[{precursor_lower} {precursor_unit}, {precursor_upper} {precursor_unit}]"
    )

    # Fragment mass tolerance: symmetric window around zero
    fragment_unit = "ppm" if int(settings.loc["msfragger.fragment_mass_units"]) else "Da"
    fragment_tolerance = settings.loc["msfragger.fragment_mass_tolerance"]
    params.fragment_mass_tolerance = (
        f"[-{fragment_tolerance} {fragment_unit}, {fragment_tolerance} {fragment_unit}]"
    )

    # FDR thresholds: DIA-NN q-value when DIA-NN is run, phi-report filter otherwise
    if is_on("diann.run-dia-nn"):
        q_value = settings.loc["diann.q-value"]
        params.ident_fdr_protein = q_value
        params.ident_fdr_peptide = None
        params.ident_fdr_psm = q_value
        params.abundance_normalization_ions = None
    else:
        psm_fdr, peptide_fdr, protein_fdr = parse_phi_report_filters(settings.loc["phi-report.filter"])
        params.ident_fdr_psm = psm_fdr
        params.ident_fdr_peptide = peptide_fdr
        params.ident_fdr_protein = protein_fdr

    # Precursor charge range is only taken from the file when the override is active
    if is_on("msfragger.override_charge"):
        params.min_precursor_charge = int(settings.loc["msfragger.misc.fragger.precursor-charge-lo"])
        params.max_precursor_charge = int(settings.loc["msfragger.misc.fragger.precursor-charge-hi"])
    else:
        params.min_precursor_charge = 1
        params.max_precursor_charge = None

    # Precursor m/z limits derived from the digest mass range and the charge range
    if params.max_precursor_charge:
        params.min_precursor_mz = (
            int(settings.loc["msfragger.misc.fragger.digest-mass-lo"]) / params.max_precursor_charge
        )
    else:
        params.min_precursor_mz = None
    if params.min_precursor_charge:
        params.max_precursor_mz = (
            int(settings.loc["msfragger.misc.fragger.digest-mass-hi"]) / params.min_precursor_charge
        )
    else:
        params.max_precursor_mz = None

    params.min_fragment_mz = None
    params.max_fragment_mz = None

    # Match between runs and quantification method settings
    if is_on("quantitation.run-label-free-quant"):
        params.enable_match_between_runs = bool(int(settings.loc["ionquant.mbr"]))
    elif is_on("diann.run-dia-nn"):
        # MBR counts as enabled when either DIA-NN option string requests --reanalyse
        params.enable_match_between_runs = any(
            key in settings.index and "--reanalyse" in settings.loc[key]
            for key in ("diann.fragpipe.cmd-opts", "diann.cmd-opts")
        )
        strategy_names = {
            1: "Any LC (high accuracy)",
            2: "Any LC (high precision)",
            3: "Robust LC (high accuracy)",
            4: "Robust LC (high precision)",
        }
        params.quantification_method = strategy_names[int(settings.loc["diann.quantification-strategy"])]

    # Protein inference settings
    if is_on("protein-prophet.run-protein-prophet"):
        params.protein_inference = f"ProteinProphet: {settings.loc['protein-prophet.cmd-opts']}"

    params.fill_none()

    return params



if __name__ == "__main__":
    # Developer helper: for each test workflow file, dump the raw parsed
    # parameters and the extracted ProteoBench parameters as CSV files next
    # to the workflow file, and print the extracted parameters.
    files = [
        "../../../test/params/fragpipe.workflow",
        "../../../test/params/fragpipe_older.workflow",
        "../../../test/params/fragpipe_win_paths.workflow",
        "../../../test/params/fragpipe_v22.workflow",
        "../../../test/params/fragpipe_fdr_test.workflow",
        "../../../test/params/fragpipe-version.workflow",
        "../../../test/params/fragpipe_v23_noMBR.workflow",
    ]

    for workflow_name in files:
        workflow_path = pathlib.Path(workflow_name)

        # Raw name/value/comment table
        with workflow_path.open("rb") as handle:
            _header, _engine_version, _pipeline_version, parsed = read_fragpipe_workflow(handle)
        raw_table = pd.DataFrame.from_records(parsed, columns=Parameter._fields)
        raw_table = raw_table.set_index(Parameter._fields[0])
        raw_table.to_csv(workflow_path.with_suffix(".csv"))

        # Extracted ProteoBench parameters
        with workflow_path.open("rb") as handle:
            extracted = extract_params(handle)
        extracted_series = pd.Series(extracted.__dict__)
        print(extracted_series)
        print("\n")
        extracted_series.to_csv(workflow_path.parent / f"{workflow_path.stem}_extracted_params.csv")