Source code for proteobench.io.params.fragger
"""
Functionality to parse FragPipe fragger.params parameter files.
FragPipe has a text-based parameter file format which
separates parameters and their values using an equal sign. Optional comments are
expressed with a hash sign.
"""
from __future__ import annotations
import logging
import os
import pathlib
import re
from collections import namedtuple
from io import BytesIO
from typing import List
import pandas as pd
from proteobench.io.params import ProteoBenchParameters
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# One parsed line of a parameter file: the parameter name, its value and the
# inline comment (each may be None, see parse_params).
Parameter = namedtuple("Parameter", ["name", "value", "comment"])
# Captures the version string found between "MSFragger-" and ".jar"; it is applied
# to the file name of the ``fragpipe-config.bin-msfragger`` path in
# read_fragpipe_workflow.
VERSION_NO_PATTERN = r"MSFragger-(.+)\.jar"
# Common mass shifts mapped to modification names (ProForma notation)
# Keys are matched approximately (within MASS_TOLERANCE) by _lookup_mod_name,
# which scans the entries in insertion order.
# NOTE(review): the keys are presumably mass deltas in Da -- confirm.
MASS_TO_MOD = {
    57.02146: "Carbamidomethyl",
    15.9949: "Oxidation",
    42.0106: "Acetyl",
    79.96633: "Phospho",
    114.04293: "GG",
    -17.0265: "Pyro-glu",
    -18.0106: "Pyro-glu",
    4.025107: "Label:2H(4)",
    6.020129: "Label:13C(6)",
    8.014199: "Label:13C(6)15N(2)",
    10.008269: "Label:13C(6)15N(4)",
}
# Absolute tolerance used when comparing a mass against the MASS_TO_MOD keys.
# The fixed/variable modification parsers also skip entries whose absolute mass
# is below this value.
MASS_TOLERANCE = 0.001
def _lookup_mod_name(mass: float) -> str | None:
    """Return the modification name whose reference mass matches ``mass``.

    ``MASS_TO_MOD`` is scanned in insertion order and the first entry whose
    reference mass differs from ``mass`` by less than ``MASS_TOLERANCE`` wins.
    ``None`` is returned when no entry is close enough.
    """
    candidates = (
        mod_name
        for known_mass, mod_name in MASS_TO_MOD.items()
        if abs(mass - known_mass) < MASS_TOLERANCE
    )
    return next(candidates, None)
def _parse_fixed_mods(raw: str) -> str:
    """Convert an MSFragger fixed-modification table into a ProForma-like string.

    ``raw`` holds ``mass,residue_description,active,num_sites`` entries joined by
    ``; `` (for example ``57.02146,C (cysteine),true,-1``). Entries that have
    fewer than three fields, are not flagged ``true``, or carry a mass below
    ``MASS_TOLERANCE`` in absolute value are ignored. Every remaining entry is
    rendered as ``residue[name]`` and the pieces are joined with ``, ``.
    An empty or whitespace-only input yields ``""``.
    """
    if not (raw and raw.strip()):
        return ""

    formatted = []
    for item in raw.split("; "):
        fields = [field.strip() for field in item.strip().split(",", 3)]
        if len(fields) < 3 or fields[2] != "true":
            continue

        mass_text, site_text = fields[0], fields[1]
        shift = float(mass_text)
        if abs(shift) < MASS_TOLERANCE:
            continue

        # Unknown masses keep their textual mass as the label.
        label = _lookup_mod_name(shift) or mass_text

        # A description such as "C (cysteine)" is reduced to its one-letter code;
        # terminal descriptions are normalised, anything else is kept verbatim.
        single_residue = re.match(r"^([A-Z])\s*\(", site_text)
        if single_residue is not None:
            site = single_residue.group(1)
        elif "N-Term" in site_text:
            site = "N-term"
        elif "C-Term" in site_text:
            site = "C-term"
        else:
            site = site_text

        formatted.append(f"{site}[{label}]")

    return ", ".join(formatted)
def _parse_variable_mods(raw: str) -> str:
    """Convert an MSFragger variable-modification table into a ProForma-like string.

    ``raw`` holds ``mass,residue,active,max_occurrences`` entries joined by ``; ``.
    The residue field may use special notations: ``[^`` for the protein N-term
    and ``nX`` for the peptide N-term of residue ``X``. Entries that have fewer
    than three fields, are not flagged ``true``, or carry a mass below
    ``MASS_TOLERANCE`` in absolute value are ignored. An empty or
    whitespace-only input yields ``""``.
    """
    if not (raw and raw.strip()):
        return ""

    formatted = []
    for item in raw.split("; "):
        fields = [field.strip() for field in item.strip().split(",", 3)]
        if len(fields) < 3 or fields[2] != "true":
            continue

        mass_text, site = fields[0], fields[1]
        shift = float(mass_text)
        if abs(shift) < MASS_TOLERANCE:
            continue

        # Unknown masses keep their textual mass as the label.
        label = _lookup_mod_name(shift) or mass_text

        is_protein_nterm = site == "[^"
        if not is_protein_nterm and not site.startswith("n"):
            # Plain residue list: emitted verbatim.
            formatted.append(f"{site}[{label}]")
            continue

        # N-terminal entry: one item per "nX" residue, or a bare N-term otherwise.
        nterm_residues = [] if is_protein_nterm else re.findall(r"n([A-Z])", site)
        if nterm_residues:
            formatted.extend(f"N-term {aa}[{label}]" for aa in nterm_residues)
        else:
            formatted.append(f"N-term[{label}]")

    return ", ".join(formatted)
[docs]
def parse_phi_report_filters(phi_report_cmd: str) -> tuple[float, float, float]:
    """
    Parse the FDR filters from the phi-report command string.

    The values following the ``--psm``, ``--pep`` and ``--prot`` options are
    extracted. Integers (``--prot 1``), decimals (``0.01``, ``.05``) and
    scientific notation (``1e-2``) are all accepted. An option that is absent
    from the command falls back to the default of 0.01.

    Parameters
    ----------
    phi_report_cmd : str
        The command string from the phi-report filter. A non-string value
        (e.g. ``None`` for a parameter without a value) is treated as an
        empty command.

    Returns
    -------
    tuple of (float, float, float)
        A tuple containing the PSM, peptide, and protein FDR values.
    """
    default_fdr = 0.01
    if not isinstance(phi_report_cmd, str):
        phi_report_cmd = ""

    # The former pattern (digits, dot, digits) rejected integer thresholds such as
    # "--prot 1", which were then silently reported as the 0.01 default.
    number = r"((?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?)"

    fdr_values = []
    for option in ("--psm", "--pep", "--prot"):
        match = re.search(rf"{option}\s+{number}", phi_report_cmd)
        fdr_values.append(float(match.group(1)) if match else default_fdr)

    psm_fdr, peptide_fdr, protein_fdr = fdr_values
    return psm_fdr, peptide_fdr, protein_fdr
[docs]
def parse_params(l_of_str: List[str], sep: str = " = ") -> List[Parameter]:
    """
    Parse the FragPipe parameter file and return a list of Parameter objects.

    Blank lines and lines starting with ``#`` are skipped. On every other line
    the text after the first ``#`` is taken as the inline comment, and the text
    before it is split once on ``sep`` into a name and a value.

    Parameters
    ----------
    l_of_str : List[str]
        The lines of the FragPipe parameter file as a list of strings.
    sep : str, optional
        The separator between parameter names and values. Default is " = ".

    Returns
    -------
    List[Parameter]
        A list of Parameter namedtuples containing the parameter name, value, and any comment.
        ``value`` is None when the line does not contain ``sep``; ``comment`` is None
        when the line has no ``#``.
    """
    data = []
    for line in l_of_str:
        line = line.strip()
        logger.debug(line)
        if not line or line.startswith("#"):
            continue  # Skip empty lines and full-line comments
        # Split at the first hash only: a second '#' (inside the comment or the
        # value) used to break the two-name unpacking and raise ValueError.
        assignment, hash_sign, comment = line.partition("#")
        comment = comment.strip() if hash_sign else None
        name, found_sep, value = assignment.strip().partition(sep)
        data.append(Parameter(name.strip(), value.strip() if found_sep else None, comment))
    return data
[docs]
def read_fragpipe_workflow(
    file: BytesIO, sep: str = "="
) -> tuple[str, str | None, str | None, list[Parameter]]:
    """
    Read the FragPipe workflow file and return its header, versions and parameters.

    The MSFragger version is taken from the first ``# MSFragger version`` comment
    line if there is one; otherwise it is derived from the jar file name of the
    ``fragpipe-config.bin-msfragger`` entry. The FragPipe version is taken from a
    ``# FragPipe version`` comment line, wherever it appears in the file.

    Parameters
    ----------
    file : BytesIO
        The FragPipe workflow file to read (UTF-8 encoded).
    sep : str, optional
        The separator used between parameter names and values. Default is "=".

    Returns
    -------
    tuple of (str, str or None, str or None, list of Parameter)
        The header (first line without its leading character), the MSFragger
        version, the FragPipe version and the parsed parameters. Either version
        is None when it could not be found.
    """
    l_of_str = file.read().decode("utf-8").splitlines()
    header = l_of_str[0][1:].strip()  # Skip leading '#' in the header
    msfragger_version = None
    fragpipe_version = None
    # Once an explicit "# MSFragger version" comment has been seen it wins over
    # the jar path. Scanning continues (no early break) so that a
    # "# FragPipe version" line placed after it is still picked up.
    version_from_comment = False
    for ss in l_of_str[1:]:
        if ss.startswith("# MSFragger version"):
            if not version_from_comment:
                msfragger_version = ss.split(" ")[-1].strip()
                version_from_comment = True
        elif ss.startswith("fragpipe-config.bin-msfragger"):
            if version_from_comment:
                continue
            path = ss.split("=")[-1].strip()
            # Split on both separator styles so that mixed or Windows paths
            # still reduce to the bare jar file name.
            filename = re.split(r"[/\\]", path)[-1]
            match = re.search(VERSION_NO_PATTERN, filename)
            if match:
                msfragger_version = match.group(1)
        elif ss.startswith("# FragPipe version"):
            fragpipe_version = ss.split(" ")[-1].strip()
    return header, msfragger_version, fragpipe_version, parse_params(l_of_str, sep=sep)
[docs]
def extract_params(
    file: BytesIO, json_file=os.path.join(os.path.dirname(__file__), "json/Quant/quant_lfq_DDA_ion.json")
) -> ProteoBenchParameters:
    """
    Build a `ProteoBenchParameters` object from a FragPipe workflow file.

    Parameters
    ----------
    file : BytesIO
        The FragPipe parameter file to parse.
    json_file : str, optional
        Path passed to `ProteoBenchParameters` as ``filename``. Defaults to
        ``json/Quant/quant_lfq_DDA_ion.json`` next to this module.

    Returns
    -------
    ProteoBenchParameters
        The extracted parameters encapsulated in a `ProteoBenchParameters` object.

    Notes
    -----
    Settings are read with ``.loc`` from a Series indexed by parameter name, so a
    workflow file lacking one of the required keys raises ``KeyError``.
    """
    header, msfragger_version, fragpipe_version, parsed = read_fragpipe_workflow(file)
    records = pd.DataFrame.from_records(parsed, columns=Parameter._fields)
    fragpipe_params = records.set_index(Parameter._fields[0])["value"]

    def setting(key):
        """Return the raw (string) value stored for ``key``."""
        return fragpipe_params.loc[key]

    # No explicit version line: fall back to the one embedded in the header,
    # which is expected to look like "FragPipe (<major>.<minor>...)".
    if not fragpipe_version:
        fragpipe_version = re.match(r"FragPipe \((\d+\.\d+.*)\)", header).group(1)

    # Software identification
    params = ProteoBenchParameters(filename=json_file)
    params.software_name = "FragPipe"
    params.software_version = fragpipe_version
    params.search_engine = "MSFragger"
    params.search_engine_version = msfragger_version

    # Enzyme: a second enzyme, when set, is appended after a pipe.
    enzyme = setting("msfragger.search_enzyme_name_1")
    second_enzyme = setting("msfragger.search_enzyme_name_2")
    if second_enzyme != "null":
        enzyme += f"|{second_enzyme}"
    enzyme_aliases = {
        "stricttrypsin": "Trypsin/P",  # strict trypsin: always cut after K and R
        "trypsin": "Trypsin",  # trypsin: do not cut before P
    }
    params.enzyme = enzyme_aliases.get(enzyme, enzyme)
    params.allowed_miscleavages = int(setting("msfragger.allowed_missed_cleavage_1"))
    # 2 is ENZYMATIC, 1 is SEMI, 3 is SEMI_N_TERM, 0 is NONSPECIFIC
    params.semi_enzymatic = setting("msfragger.num_enzyme_termini") != "2"

    # Modifications
    params.fixed_mods = _parse_fixed_mods(setting("msfragger.table.fix-mods"))
    params.variable_mods = _parse_variable_mods(setting("msfragger.table.var-mods"))
    params.max_mods = int(setting("msfragger.max_variable_mods_per_peptide"))

    # Peptide length
    params.min_peptide_length = int(setting("msfragger.digest_min_length"))
    params.max_peptide_length = int(setting("msfragger.digest_max_length"))

    # Precursor mass tolerance: a non-zero unit flag means ppm, zero means Da.
    precursor_unit = "ppm" if int(setting("msfragger.precursor_mass_units")) else "Da"
    precursor_lower = setting("msfragger.precursor_mass_lower")
    precursor_upper = setting("msfragger.precursor_mass_upper")
    params.precursor_mass_tolerance = (
        f"[{precursor_lower} {precursor_unit}, {precursor_upper} {precursor_unit}]"
    )

    # Fragment mass tolerance: symmetric window around zero, same unit rule.
    fragment_unit = "ppm" if int(setting("msfragger.fragment_mass_units")) else "Da"
    fragment_tolerance = setting("msfragger.fragment_mass_tolerance")
    params.fragment_mass_tolerance = (
        f"[-{fragment_tolerance} {fragment_unit}, {fragment_tolerance} {fragment_unit}]"
    )

    # FDR thresholds: DIA-NN q-value when DIA-NN runs, phi-report filters otherwise.
    if setting("diann.run-dia-nn") == "true":
        diann_q_value = setting("diann.q-value")
        params.ident_fdr_protein = diann_q_value
        params.ident_fdr_peptide = None
        params.ident_fdr_psm = diann_q_value
        params.abundance_normalization_ions = None
    else:
        filters = parse_phi_report_filters(setting("phi-report.filter"))
        params.ident_fdr_psm, params.ident_fdr_peptide, params.ident_fdr_protein = filters

    # Precursor charge settings
    if setting("msfragger.override_charge") == "true":
        params.min_precursor_charge = int(setting("msfragger.misc.fragger.precursor-charge-lo"))
        params.max_precursor_charge = int(setting("msfragger.misc.fragger.precursor-charge-hi"))
    else:
        params.min_precursor_charge = 1
        params.max_precursor_charge = None

    # m/z bounds are derived from the digest mass range and the charge range;
    # a bound is left unset when the charge it depends on is missing.
    if params.max_precursor_charge:
        params.min_precursor_mz = (
            int(setting("msfragger.misc.fragger.digest-mass-lo")) / params.max_precursor_charge
        )
    else:
        params.min_precursor_mz = None
    if params.min_precursor_charge:
        params.max_precursor_mz = (
            int(setting("msfragger.misc.fragger.digest-mass-hi")) / params.min_precursor_charge
        )
    else:
        params.max_precursor_mz = None
    params.min_fragment_mz = None
    params.max_fragment_mz = None

    # Match between runs and quantification method settings
    if setting("quantitation.run-label-free-quant") == "true":
        params.enable_match_between_runs = bool(int(setting("ionquant.mbr")))
    elif setting("diann.run-dia-nn") == "true":
        diann_quant_dict = {
            1: "Any LC (high accuracy)",
            2: "Any LC (high precision)",
            3: "Robust LC (high accuracy)",
            4: "Robust LC (high precision)",
        }
        # MBR counts as enabled when "--reanalyse" appears in either of the
        # DIA-NN command-option entries that is present in the file.
        params.enable_match_between_runs = any(
            "--reanalyse" in fragpipe_params.loc[option_key]
            for option_key in ("diann.fragpipe.cmd-opts", "diann.cmd-opts")
            if option_key in fragpipe_params.index
        )
        params.quantification_method = diann_quant_dict[int(setting("diann.quantification-strategy"))]

    # Protein inference settings
    if setting("protein-prophet.run-protein-prophet") == "true":
        params.protein_inference = f"ProteinProphet: {setting('protein-prophet.cmd-opts')}"

    params.fill_none()
    return params
if __name__ == "__main__":
    # Developer helper: for each test workflow file, dump the raw parsed parameters
    # to "<name>.csv" and the extracted ProteoBench parameters to
    # "<name>_extracted_params.csv" next to it (the latter are also printed).
    workflow_files = (
        "../../../test/params/fragpipe.workflow",
        "../../../test/params/fragpipe_older.workflow",
        "../../../test/params/fragpipe_win_paths.workflow",
        "../../../test/params/fragpipe_v22.workflow",
        "../../../test/params/fragpipe_fdr_test.workflow",
        "../../../test/params/fragpipe-version.workflow",
        "../../../test/params/fragpipe_v23_noMBR.workflow",
    )
    for workflow_path in map(pathlib.Path, workflow_files):
        # Raw name/value/comment table
        with workflow_path.open("rb") as handle:
            _, _, _, parsed_lines = read_fragpipe_workflow(handle)
        raw_table = pd.DataFrame.from_records(parsed_lines, columns=Parameter._fields)
        raw_table = raw_table.set_index(Parameter._fields[0])
        raw_table.to_csv(workflow_path.with_suffix(".csv"))

        # Extracted ProteoBench parameters
        with workflow_path.open("rb") as handle:
            extracted = extract_params(handle)
        extracted_series = pd.Series(extracted.__dict__)
        print(extracted_series)
        print("\n")
        extracted_series.to_csv(workflow_path.parent / f"{workflow_path.stem}_extracted_params.csv")