# Source code for proteobench.io.params.fragger

"""
Functionality to parse FragPipe fragger.params parameter files.

FragPipe has a text-based parameter file format that separates each parameter
name from its value with an equals sign. Optional comments are introduced with
a hash sign.
"""

from __future__ import annotations

import logging
import pathlib
import re
from collections import namedtuple
from io import BytesIO
from typing import List

import pandas as pd

from proteobench.io.params import ProteoBenchParameters

logger = logging.getLogger(__name__)

Parameter = namedtuple("Parameter", ["name", "value", "comment"])

VERSION_NO_PATTERN = r"MSFragger-(.+)\.jar"



[docs]
def parse_phi_report_filters(phi_report_cmd: str) -> tuple[float, float, float]:
    """
    Parse the FDR thresholds from the phi-report filter command string.

    Parameters
    ----------
    phi_report_cmd : str
        The command string from the phi-report filter, e.g.
        ``"--sequential --prot 0.01"``.

    Returns
    -------
    tuple of (float, float, float)
        The PSM, peptide, and protein FDR values, in that order. A threshold
        whose flag (``--psm``, ``--pep``, ``--prot``) is not present in the
        command falls back to 0.01.
    """
    default_fdr = 0.01

    # Accept every plain numeric spelling, not only "<digits>.<digits>":
    # "0.01", "1", "1.", ".05" and scientific notation such as "1e-3".
    # Requiring a decimal point would silently replace e.g. "--prot 1"
    # with the default value.
    number = r"((?:\d+\.?\d*|\.\d+)(?:[eE][+-]?\d+)?)"
    fdr_patterns = {
        "psm": rf"--psm\s+{number}",
        "peptide": rf"--pep\s+{number}",
        "protein": rf"--prot\s+{number}",
    }

    # Look each flag up independently so the flags may appear in any order.
    fdr_values = {
        key: float(match.group(1)) if (match := re.search(pattern, phi_report_cmd)) else default_fdr
        for key, pattern in fdr_patterns.items()
    }

    return fdr_values["psm"], fdr_values["peptide"], fdr_values["protein"]




[docs]
def parse_params(l_of_str: List[str], sep: str = " = ") -> List[Parameter]:
    """
    Parse the lines of a FragPipe parameter file into Parameter records.

    Blank lines and lines that start with a hash sign are skipped. For every
    other line, the text after the first hash sign (if any) is taken as the
    comment, and the remaining text is split once at `sep` into name and value.

    Parameters
    ----------
    l_of_str : List[str]
        The lines of the FragPipe parameter file as a list of strings.
    sep : str, optional
        The separator between parameter names and values. Default is " = "
        (with surrounding spaces); pass "=" for files written without spaces.

    Returns
    -------
    List[Parameter]
        One Parameter namedtuple per parsed line holding the parameter name,
        its value (None when `sep` does not occur in the line), and the inline
        comment (None when the line has no hash sign).
    """
    data = []
    for line in l_of_str:
        line = line.strip()
        logger.debug(line)
        # Blank lines and full-line comments carry no parameter.
        if not line or line.startswith("#"):
            continue
        # Split at the FIRST hash only: everything after it is the comment,
        # even when the comment itself contains further hash signs. An
        # unbounded split would yield more than two parts for such lines and
        # break the two-way unpacking.
        param, hash_sign, comment = line.partition("#")
        comment = comment.strip() if hash_sign else None
        # Split once at the separator so values may contain it themselves.
        name, sep_found, value = param.strip().partition(sep)
        data.append(Parameter(name.strip(), value.strip() if sep_found else None, comment))
    return data




[docs]
def read_fragpipe_workflow(file: BytesIO, sep: str = "=") -> tuple[str, str | None, list[Parameter]]:
    """
    Read a FragPipe workflow file and return its header, MSFragger version and parameters.

    Parameters
    ----------
    file : BytesIO
        The FragPipe workflow file to read. Its content is decoded as UTF-8.
    sep : str, optional
        The separator used between parameter names and values. Default is "=".

    Returns
    -------
    tuple of (str, str or None, list of Parameter)
        The header (first line without its leading character, stripped; an
        empty string when the file has no lines), the MSFragger version (None
        when it cannot be determined), and the parsed Parameter objects.
    """
    l_of_str = file.read().decode("utf-8").splitlines()
    # An empty file has no header line; fall back to an empty header instead
    # of failing on the index access.
    header = l_of_str[0][1:].strip() if l_of_str else ""  # Skip leading '#' in the header
    msfragger_version = None
    for ss in l_of_str[1:]:
        if ss.startswith("# MSFragger version"):
            # An explicit version comment is authoritative: stop searching.
            msfragger_version = ss.split(" ")[-1].strip()
            break
        if ss.startswith("fragpipe-config.bin-msfragger"):
            path = ss.split("=")[-1].strip()
            # Take the last path component, splitting on both "/" and "\" so
            # that paths mixing the two separators are handled as well.
            filename = re.split(r"[\\/]", path)[-1]
            match = re.search(VERSION_NO_PATTERN, filename)
            if match:
                msfragger_version = match.group(1)
    return header, msfragger_version, parse_params(l_of_str, sep=sep)




[docs]
def extract_params(file: BytesIO) -> ProteoBenchParameters:
    """
    Parse a FragPipe workflow file and extract relevant parameters into a `ProteoBenchParameters` object.

    Parameters
    ----------
    file : BytesIO
        The FragPipe parameter file to parse.

    Returns
    -------
    ProteoBenchParameters
        The extracted parameters encapsulated in a `ProteoBenchParameters` object.

    Raises
    ------
    KeyError
        If a parameter that is looked up unconditionally is missing from the file.
    """
    header, msfragger_version, fragpipe_params = read_fragpipe_workflow(file)
    # Turn the parsed records into a Series of values indexed by parameter name.
    fragpipe_params = pd.DataFrame.from_records(fragpipe_params, columns=Parameter._fields).set_index(
        Parameter._fields[0]
    )["value"]

    # Extract version from header
    match = re.search(VERSION_NO_PATTERN, header)
    if match:
        header = match.group()

    # Initialize ProteoBenchParameters
    params = ProteoBenchParameters()
    params.software_name = "FragPipe"
    params.software_version = header
    params.search_engine = "MSFragger"
    params.search_engine_version = msfragger_version

    # Enzyme and cleavage settings
    enzyme = fragpipe_params.loc["msfragger.search_enzyme_name_1"]
    if fragpipe_params.loc["msfragger.search_enzyme_name_2"] != "null":
        enzyme += f"|{fragpipe_params.loc['msfragger.search_enzyme_name_2']}"
    if enzyme == "stricttrypsin":
        enzyme = "Trypsin/P"  # strict trypsin: always cut after K and R
    elif enzyme == "trypsin":
        enzyme = "Trypsin"  # trypsin: do not cut before P
    params.enzyme = enzyme
    params.allowed_miscleavages = int(fragpipe_params.loc["msfragger.allowed_missed_cleavage_1"])

    # Modifications
    params.fixed_mods = fragpipe_params.loc["msfragger.table.fix-mods"]
    params.variable_mods = fragpipe_params.loc["msfragger.table.var-mods"]
    params.max_mods = int(fragpipe_params.loc["msfragger.max_variable_mods_per_peptide"])

    # Peptide length
    params.min_peptide_length = int(fragpipe_params.loc["msfragger.digest_min_length"])
    params.max_peptide_length = int(fragpipe_params.loc["msfragger.digest_max_length"])

    # Precursor mass tolerance: a non-zero unit flag selects ppm, zero selects Da
    precursor_mass_units = "Da"
    if int(fragpipe_params.loc["msfragger.precursor_mass_units"]):
        precursor_mass_units = "ppm"
    params.precursor_mass_tolerance = f'[{fragpipe_params.loc["msfragger.precursor_mass_lower"]} {precursor_mass_units}, {fragpipe_params.loc["msfragger.precursor_mass_upper"]} {precursor_mass_units}]'

    # Fragment mass tolerance: a single value applied symmetrically
    fragment_mass_units = "Da"
    if int(fragpipe_params.loc["msfragger.fragment_mass_units"]):
        fragment_mass_units = "ppm"
    params.fragment_mass_tolerance = f'[-{fragpipe_params.loc["msfragger.fragment_mass_tolerance"]} {fragment_mass_units}, {fragpipe_params.loc["msfragger.fragment_mass_tolerance"]} {fragment_mass_units}]'

    # FDR thresholds: taken from the DIA-NN q-value when DIA-NN is run,
    # otherwise from the phi-report filter command
    if fragpipe_params.loc["diann.run-dia-nn"] == "true":
        params.ident_fdr_protein = fragpipe_params.loc["diann.q-value"]
        params.ident_fdr_peptide = None
        params.ident_fdr_psm = fragpipe_params.loc["diann.q-value"]
        params.abundance_normalization_ions = None

    else:
        phi_report_cmd = fragpipe_params.loc["phi-report.filter"]
        params.ident_fdr_psm, params.ident_fdr_peptide, params.ident_fdr_protein = parse_phi_report_filters(
            phi_report_cmd
        )

    # Precursor charge settings
    if fragpipe_params.loc["msfragger.override_charge"] == "true":
        params.min_precursor_charge = int(fragpipe_params.loc["msfragger.misc.fragger.precursor-charge-lo"])
        params.max_precursor_charge = int(fragpipe_params.loc["msfragger.misc.fragger.precursor-charge-hi"])
    else:
        params.min_precursor_charge = 1
        params.max_precursor_charge = None

    # Match between runs and quantification method settings
    if fragpipe_params.loc["quantitation.run-label-free-quant"] == "true":
        # The value is read from the file as text, so bool() on it would be
        # True for any non-empty string, including "0" and "false". Compare
        # against the spellings that actually mean "enabled" instead.
        mbr_setting = str(fragpipe_params.loc["ionquant.mbr"]).strip().lower()
        params.enable_match_between_runs = mbr_setting in ("1", "true")
    elif fragpipe_params.loc["diann.run-dia-nn"] == "true":
        diann_quant_dict = {
            1: "Any LC (high accuracy)",
            2: "Any LC (high precision)",
            3: "Robust LC (high accuracy)",
            4: "Robust LC (high precision)",
        }
        # For DIA-NN, match between runs is considered enabled when
        # "--reanalyse" appears in either of the command-option parameters.
        params.enable_match_between_runs = (
            "diann.fragpipe.cmd-opts" in fragpipe_params.index
            and "--reanalyse" in fragpipe_params.loc["diann.fragpipe.cmd-opts"]
        ) or ("diann.cmd-opts" in fragpipe_params.index and "--reanalyse" in fragpipe_params.loc["diann.cmd-opts"])
        params.quantification_method = diann_quant_dict[int(fragpipe_params.loc["diann.quantification-strategy"])]

    # Protein inference settings
    if fragpipe_params.loc["protein-prophet.run-protein-prophet"] == "true":
        params.protein_inference = f"ProteinProphet: {fragpipe_params.loc['protein-prophet.cmd-opts']}"

    params.fill_none()

    return params



if __name__ == "__main__":
    # For each test workflow file, dump the raw parsed parameters and the
    # extracted ProteoBench parameters as CSV files next to the workflow file.
    workflow_files = (
        "../../../test/params/fragpipe.workflow",
        "../../../test/params/fragpipe_older.workflow",
        "../../../test/params/fragpipe_win_paths.workflow",
        "../../../test/params/fragpipe_v22.workflow",
        "../../../test/params/fragpipe_fdr_test.workflow",
    )
    name_column = Parameter._fields[0]

    for workflow_file in workflow_files:
        workflow_path = pathlib.Path(workflow_file)

        # Raw parameters, indexed by parameter name -> <stem>.csv
        with workflow_path.open("rb") as handle:
            parsed = read_fragpipe_workflow(handle)[2]
        raw_table = pd.DataFrame.from_records(parsed, columns=Parameter._fields)
        raw_table = raw_table.set_index(name_column)
        raw_table.to_csv(workflow_path.with_suffix(".csv"))

        # Extracted parameters -> <stem>_extracted_params.csv
        with workflow_path.open("rb") as handle:
            extracted = extract_params(handle)
        extracted_series = pd.Series(extracted.__dict__)
        print(extracted_series)
        extracted_series.to_csv(workflow_path.parent / f"{workflow_path.stem}_extracted_params.csv")