# Source code for proteobench.io.params.fragger

"""
Functionality to parse FragPipe fragger.params parameter files.

FragPipe has a text-based parameter file format which
separates parameters and their values using an equal sign. Optional comments are
expressed with a hash sign.
"""

from __future__ import annotations

import logging
import os
import pathlib
import re
from collections import namedtuple
from io import BytesIO
from typing import List

import pandas as pd

from proteobench.io.params import ProteoBenchParameters

logger = logging.getLogger(__name__)

# One parsed line of a parameter file: the parameter name, its value and the
# trailing comment (value and comment are None when the line has none).
Parameter = namedtuple("Parameter", ["name", "value", "comment"])

# Regex whose single group captures the version part of an MSFragger jar file
# name of the form "MSFragger-<version>.jar".
VERSION_NO_PATTERN = r"MSFragger-(.+)\.jar"

# Common mass shifts mapped to modification names (ProForma notation)
# NOTE(review): -17.0265 and -18.0106 both map to "Pyro-glu", so the two shifts
# cannot be told apart in the output — confirm this is intended.
MASS_TO_MOD = {
    57.02146: "Carbamidomethyl",
    15.9949: "Oxidation",
    42.0106: "Acetyl",
    79.96633: "Phospho",
    114.04293: "GG",
    -17.0265: "Pyro-glu",
    -18.0106: "Pyro-glu",
    4.025107: "Label:2H(4)",
    6.020129: "Label:13C(6)",
    8.014199: "Label:13C(6)15N(2)",
    10.008269: "Label:13C(6)15N(4)",
}
# Absolute tolerance for mass comparisons: a mass matches a table entry when it
# differs from it by less than this, and mass shifts smaller than this are
# treated as "no modification" by the modification parsers.
MASS_TOLERANCE = 0.001


def _lookup_mod_name(mass: float) -> str | None:
    """Return the name of the first known modification matching ``mass``.

    A table entry matches when its reference mass differs from ``mass`` by less
    than ``MASS_TOLERANCE``. ``None`` is returned when no entry matches.
    """
    return next(
        (
            mod_label
            for known_mass, mod_label in MASS_TO_MOD.items()
            if abs(mass - known_mass) < MASS_TOLERANCE
        ),
        None,
    )


def _parse_fixed_mods(raw: str) -> str:
    """Convert an MSFragger fixed-modification table into a ProForma-like string.

    ``raw`` holds ``mass,residue_description,active,num_sites`` entries joined by
    ``; `` (for example ``57.02146,C (cysteine),true,-1``). Entries with fewer
    than three fields, entries not flagged ``true`` and entries whose absolute
    mass shift is below ``MASS_TOLERANCE`` are left out. The remaining ones are
    rendered as ``residue[modification]`` and joined with ``, ``.
    """
    if not raw or not raw.strip():
        return ""

    rendered = []
    for chunk in raw.split("; "):
        fields = chunk.strip().split(",", 3)
        if len(fields) < 3:
            continue
        mass_text, description, enabled = (field.strip() for field in fields[:3])
        if enabled != "true":
            continue
        shift = float(mass_text)
        if abs(shift) < MASS_TOLERANCE:
            continue
        # Unknown shifts are reported by their mass as written in the file
        mod_name = _lookup_mod_name(shift) or mass_text

        # "C (cysteine)" style descriptions reduce to the one-letter code;
        # terminal descriptions reduce to "N-term" / "C-term".
        if single_letter := re.match(r"^([A-Z])\s*\(", description):
            residue = single_letter.group(1)
        elif "N-Term" in description:
            residue = "N-term"
        elif "C-Term" in description:
            residue = "C-term"
        else:
            residue = description
        rendered.append(f"{residue}[{mod_name}]")
    return ", ".join(rendered)


def _parse_variable_mods(raw: str) -> str:
    """Convert an MSFragger variable-modification table into a ProForma-like string.

    ``raw`` holds ``mass,residue,active,max_occurrences`` entries joined by ``; ``.
    Entries with fewer than three fields, entries not flagged ``true`` and
    entries whose absolute mass shift is below ``MASS_TOLERANCE`` are left out.
    Special residue notations: ``[^`` = protein N-term, ``nX`` = peptide N-term
    of residue X.
    """
    if not raw or not raw.strip():
        return ""

    rendered = []
    for chunk in raw.split("; "):
        fields = chunk.strip().split(",", 3)
        if len(fields) < 3:
            continue
        mass_text, residue_field, enabled = (field.strip() for field in fields[:3])
        if enabled != "true":
            continue
        shift = float(mass_text)
        if abs(shift) < MASS_TOLERANCE:
            continue
        # Unknown shifts are reported by their mass as written in the file
        mod_name = _lookup_mod_name(shift) or mass_text

        # NOTE(review): only N-terminal notations are special-cased here; any
        # other residue field is emitted unchanged.
        if residue_field == "[^":
            rendered.append(f"N-term[{mod_name}]")
        elif not residue_field.startswith("n"):
            rendered.append(f"{residue_field}[{mod_name}]")
        else:
            n_term_residues = re.findall(r"n([A-Z])", residue_field)
            if n_term_residues:
                # One entry per residue listed after an "n" marker
                rendered.extend(f"N-term {aa}[{mod_name}]" for aa in n_term_residues)
            else:
                rendered.append(f"N-term[{mod_name}]")
    return ", ".join(rendered)


def parse_phi_report_filters(phi_report_cmd: str) -> tuple[float, float, float]:
    """
    Parse the FDR filters from the phi-report command string.

    The values following the ``--psm``, ``--pep`` and ``--prot`` flags are read.
    Integer values (``--prot 1``), values with a leading dot (``.05``) and
    scientific notation (``1e-2``) are accepted in addition to plain decimals.

    Parameters
    ----------
    phi_report_cmd : str
        The command string from the phi-report filter.

    Returns
    -------
    tuple of (float, float, float)
        A tuple containing the PSM, peptide, and protein FDR values. A level
        whose flag is absent from the command falls back to 0.01.
    """
    # Default FDR used for every level that is not set explicitly
    default_fdr = 0.01

    # A missing parameter value (anything that is not a string) carries no flags
    if not isinstance(phi_report_cmd, str):
        return default_fdr, default_fdr, default_fdr

    # Decimal number: "0.01", "1", "1.", ".05", optionally with an exponent
    number = r"((?:\d+(?:\.\d*)?|\.\d+)(?:[eE][-+]?\d+)?)"
    fdr_patterns = {
        "psm": r"--psm\s+" + number,
        "peptide": r"--pep\s+" + number,
        "protein": r"--prot\s+" + number,
    }

    # Extract FDR values using regex
    fdr_values = {
        key: float(match.group(1)) if (match := re.search(pattern, phi_report_cmd)) else default_fdr
        for key, pattern in fdr_patterns.items()
    }

    return fdr_values["psm"], fdr_values["peptide"], fdr_values["protein"]
def parse_params(l_of_str: List[str], sep: str = " = ") -> List[Parameter]:
    """
    Parse the lines of a FragPipe parameter file into Parameter objects.

    Empty lines and lines starting with a hash sign are skipped. On every other
    line, the text after the first hash sign (if any) is taken as the comment,
    and the remainder is split once at ``sep`` into parameter name and value.

    Parameters
    ----------
    l_of_str : List[str]
        The lines of the FragPipe parameter file as a list of strings.
    sep : str, optional
        The separator between parameter names and values. Default is " = ".

    Returns
    -------
    List[Parameter]
        A list of Parameter namedtuples containing the parameter name, value, and any comment.
        The value is None when the line holds no separator, and the comment is
        None when the line holds no hash sign.
    """
    data = []
    for line in l_of_str:
        line = line.strip()
        logger.debug(line)
        if not line or line.startswith("#"):
            continue  # Skip empty lines and full-line comments

        # Split at the FIRST hash sign only: any further hash signs belong to
        # the comment instead of breaking the parameter/comment split.
        param, hash_sign, comment = line.partition("#")
        comment = comment.strip() if hash_sign else None

        # Split name and value once; a line without separator has no value
        name, found_sep, value = param.strip().partition(sep)
        data.append(Parameter(name.strip(), value.strip() if found_sep else None, comment))

    return data
def read_fragpipe_workflow(
    file: BytesIO, sep: str = "="
) -> tuple[str, str | None, str | None, list[Parameter]]:
    """
    Read the FragPipe workflow file and return its header, versions and parameters.

    The MSFragger version is taken from a ``# MSFragger version`` comment line
    when one is present; otherwise from the jar file name found in a
    ``fragpipe-config.bin-msfragger`` entry. The FragPipe version is taken from
    a ``# FragPipe version`` comment line. All lines are scanned, so the order
    in which these lines appear does not matter.

    Parameters
    ----------
    file : BytesIO
        The FragPipe workflow file to read.
    sep : str, optional
        The separator used between parameter names and values. Default is "=".

    Returns
    -------
    tuple of (str, str or None, str or None, list of Parameter)
        The header (first line without its first character; empty for an empty
        file), the MSFragger version, the FragPipe version, and the parsed
        parameters. A version is None when the file does not state it.
    """
    l_of_str = file.read().decode("utf-8").splitlines()
    # Skip leading '#' in the header; an empty file has no header line
    header = l_of_str[0][1:].strip() if l_of_str else ""

    comment_version = None  # from a "# MSFragger version ..." line (first one wins)
    jar_version = None  # from the MSFragger jar file name (fallback)
    fragpipe_version = None

    for ss in l_of_str[1:]:
        if ss.startswith("# MSFragger version"):
            if comment_version is None:
                comment_version = ss.split(" ")[-1].strip()
        elif ss.startswith("fragpipe-config.bin-msfragger"):
            path = ss.split("=")[-1].strip()
            # The file name is the last component for both "/" and "\" separated paths
            filename = re.split(r"[/\\]", path)[-1]
            match = re.search(VERSION_NO_PATTERN, filename)
            if match:
                jar_version = match.group(1)
        elif ss.startswith("# FragPipe version"):
            fragpipe_version = ss.split(" ")[-1].strip()

    # The explicit comment line takes precedence over the jar file name
    msfragger_version = comment_version if comment_version is not None else jar_version

    return header, msfragger_version, fragpipe_version, parse_params(l_of_str, sep=sep)
def extract_params(
    file: BytesIO, json_file=os.path.join(os.path.dirname(__file__), "json/Quant/quant_lfq_DDA_ion.json")
) -> ProteoBenchParameters:
    """
    Extract the relevant settings of a FragPipe workflow file into a `ProteoBenchParameters` object.

    Parameters
    ----------
    file : BytesIO
        The FragPipe parameter file to parse.
    json_file : str, optional
        Path passed to `ProteoBenchParameters` as ``filename``. Defaults to
        ``json/Quant/quant_lfq_DDA_ion.json`` inside this module's directory.

    Returns
    -------
    ProteoBenchParameters
        The extracted parameters encapsulated in a `ProteoBenchParameters` object.
    """
    header, msfragger_version, fragpipe_version, records = read_fragpipe_workflow(file)

    # Series of parameter values indexed by parameter name
    table = pd.DataFrame.from_records(records, columns=Parameter._fields)
    fragpipe_params = table.set_index(Parameter._fields[0])["value"]

    # Extract version from header when the file has no explicit version line
    # NOTE(review): re.match returns None for a header that does not look like
    # "FragPipe (<major>.<minor>...)", in which case .group raises AttributeError.
    if not fragpipe_version:
        fragpipe_version = re.match(r"FragPipe \((\d+\.\d+.*)\)", header).group(1)

    # Initialize ProteoBenchParameters
    params = ProteoBenchParameters(filename=json_file)
    params.software_name = "FragPipe"
    params.software_version = fragpipe_version
    params.search_engine = "MSFragger"
    params.search_engine_version = msfragger_version

    # Enzyme and cleavage settings
    enzyme = fragpipe_params.loc["msfragger.search_enzyme_name_1"]
    if fragpipe_params.loc["msfragger.search_enzyme_name_2"] != "null":
        enzyme += f"|{fragpipe_params.loc['msfragger.search_enzyme_name_2']}"
    if enzyme == "stricttrypsin":
        enzyme = "Trypsin/P"  # strict trypsin: always cut after K and R
    elif enzyme == "trypsin":
        enzyme = "Trypsin"  # trypsin: do not cut before P
    params.enzyme = enzyme
    params.allowed_miscleavages = int(fragpipe_params.loc["msfragger.allowed_missed_cleavage_1"])
    # 2 is ENZYMATIC, 1 is SEMI, 3 is SEMI_N_TERM, 0 is NONSPECIFIC
    params.semi_enzymatic = bool(fragpipe_params.loc["msfragger.num_enzyme_termini"] != "2")

    # Modifications
    params.fixed_mods = _parse_fixed_mods(fragpipe_params.loc["msfragger.table.fix-mods"])
    params.variable_mods = _parse_variable_mods(fragpipe_params.loc["msfragger.table.var-mods"])
    params.max_mods = int(fragpipe_params.loc["msfragger.max_variable_mods_per_peptide"])

    # Peptide length
    params.min_peptide_length = int(fragpipe_params.loc["msfragger.digest_min_length"])
    params.max_peptide_length = int(fragpipe_params.loc["msfragger.digest_max_length"])

    # Precursor mass tolerance (a non-zero unit flag selects ppm, zero selects Da)
    precursor_mass_units = "ppm" if int(fragpipe_params.loc["msfragger.precursor_mass_units"]) else "Da"
    params.precursor_mass_tolerance = f'[{fragpipe_params.loc["msfragger.precursor_mass_lower"]} {precursor_mass_units}, {fragpipe_params.loc["msfragger.precursor_mass_upper"]} {precursor_mass_units}]'

    # Fragment mass tolerance (symmetric window around zero)
    fragment_mass_units = "ppm" if int(fragpipe_params.loc["msfragger.fragment_mass_units"]) else "Da"
    params.fragment_mass_tolerance = f'[-{fragpipe_params.loc["msfragger.fragment_mass_tolerance"]} {fragment_mass_units}, {fragpipe_params.loc["msfragger.fragment_mass_tolerance"]} {fragment_mass_units}]'

    # Identification FDR: DIA-NN q-value when DIA-NN is run, phi-report filter otherwise
    if fragpipe_params.loc["diann.run-dia-nn"] == "true":
        params.ident_fdr_protein = fragpipe_params.loc["diann.q-value"]
        params.ident_fdr_peptide = None
        params.ident_fdr_psm = fragpipe_params.loc["diann.q-value"]
        params.abundance_normalization_ions = None
    else:
        phi_report_cmd = fragpipe_params.loc["phi-report.filter"]
        (
            params.ident_fdr_psm,
            params.ident_fdr_peptide,
            params.ident_fdr_protein,
        ) = parse_phi_report_filters(phi_report_cmd)

    # Precursor charge settings
    if fragpipe_params.loc["msfragger.override_charge"] == "true":
        params.min_precursor_charge = int(fragpipe_params.loc["msfragger.misc.fragger.precursor-charge-lo"])
        params.max_precursor_charge = int(fragpipe_params.loc["msfragger.misc.fragger.precursor-charge-hi"])
    else:
        params.min_precursor_charge = 1
        params.max_precursor_charge = None

    # Precursor m/z bounds: digest mass limits divided by the charge limits;
    # left unset when the corresponding charge limit is missing.
    if params.max_precursor_charge:
        params.min_precursor_mz = (
            int(fragpipe_params.loc["msfragger.misc.fragger.digest-mass-lo"]) / params.max_precursor_charge
        )
    else:
        params.min_precursor_mz = None
    if params.min_precursor_charge:
        params.max_precursor_mz = (
            int(fragpipe_params.loc["msfragger.misc.fragger.digest-mass-hi"]) / params.min_precursor_charge
        )
    else:
        params.max_precursor_mz = None
    params.min_fragment_mz = None
    params.max_fragment_mz = None

    # Match between runs and quantification method settings
    if fragpipe_params.loc["quantitation.run-label-free-quant"] == "true":
        params.enable_match_between_runs = bool(int(fragpipe_params.loc["ionquant.mbr"]))
    elif fragpipe_params.loc["diann.run-dia-nn"] == "true":
        diann_quant_dict = {
            1: "Any LC (high accuracy)",
            2: "Any LC (high precision)",
            3: "Robust LC (high accuracy)",
            4: "Robust LC (high precision)",
        }
        # Enabled when either DIA-NN command-option entry is present and asks for "--reanalyse"
        params.enable_match_between_runs = any(
            option_key in fragpipe_params.index and "--reanalyse" in fragpipe_params.loc[option_key]
            for option_key in ("diann.fragpipe.cmd-opts", "diann.cmd-opts")
        )
        params.quantification_method = diann_quant_dict[int(fragpipe_params.loc["diann.quantification-strategy"])]

    # Protein inference settings
    if fragpipe_params.loc["protein-prophet.run-protein-prophet"] == "true":
        params.protein_inference = f"ProteinProphet: {fragpipe_params.loc['protein-prophet.cmd-opts']}"

    params.fill_none()
    return params
if __name__ == "__main__":
    # Process FragPipe workflow file and extract parameters
    files = [
        "../../../test/params/fragpipe.workflow",
        "../../../test/params/fragpipe_older.workflow",
        "../../../test/params/fragpipe_win_paths.workflow",
        "../../../test/params/fragpipe_v22.workflow",
        "../../../test/params/fragpipe_fdr_test.workflow",
        "../../../test/params/fragpipe-version.workflow",
        "../../../test/params/fragpipe_v23_noMBR.workflow",
    ]

    for file_path in files:
        file = pathlib.Path(file_path)

        # Dump every parsed parameter as CSV next to the workflow file
        with file.open("rb") as handle:
            _, _, _, records = read_fragpipe_workflow(handle)
        parameter_table = pd.DataFrame.from_records(records, columns=Parameter._fields)
        parameter_table.set_index(Parameter._fields[0]).to_csv(file.with_suffix(".csv"))

        # Print the extracted ProteoBench parameters and store them as CSV too
        with file.open("rb") as handle:
            params = extract_params(handle)
        series = pd.Series(params.__dict__)
        print(series)
        print("\n")
        series.to_csv(file.parent / f"{file.stem}_extracted_params.csv")