Source code for proteobench.io.params.fragger
"""
Functionality to parse FragPipe fragger.params parameter files.
FragPipe has a text based parameter file format which
separates parameters and their value using an equal sign. Optional comments are
expressed with a hash sign.
"""
from __future__ import annotations
import logging
import os
import pathlib
import re
from collections import namedtuple
from io import BytesIO
from typing import List
import pandas as pd
from proteobench.io.params import ProteoBenchParameters
logger = logging.getLogger(__name__)
Parameter = namedtuple("Parameter", ["name", "value", "comment"])
VERSION_NO_PATTERN = r"MSFragger-(.+)\.jar"
[docs]
def parse_phi_report_filters(phi_report_cmd: str) -> tuple[float, float, float]:
"""
Parse the filters from the phi-report command string.
Parameters
----------
phi_report_cmd : str
The command string from the phi-report filter.
Returns
-------
tuple of (float, float, float)
A tuple containing the PSM, peptide, and protein FDR values.
"""
# Define default FDR values
default_fdr = 0.01
# Define regex patterns for FDR values
fdr_patterns = {
"psm": r"--psm\s+(\d+\.\d+)",
"peptide": r"--pep\s+(\d+\.\d+)",
"protein": r"--prot\s+(\d+\.\d+)",
}
# Extract FDR values using regex
fdr_values = {
key: float(match.group(1)) if (match := re.search(pattern, phi_report_cmd)) else default_fdr
for key, pattern in fdr_patterns.items()
}
return fdr_values["psm"], fdr_values["peptide"], fdr_values["protein"]
[docs]
def parse_params(l_of_str: List[str], sep: str = " = ") -> List[Parameter]:
"""
Parse the FragPipe parameter file and return a list of Parameter objects.
Parameters
----------
l_of_str : List[str]
The lines of the FragPipe parameter file as a list of strings.
sep : str, optional
The separator between parameter names and values. Default is " = ".
Returns
-------
List[Parameter]
A list of Parameter namedtuples containing the parameter name, value, and any comment.
"""
data = []
for line in l_of_str:
line = line.strip()
logger.debug(line)
if line.startswith("#"):
continue # Skip comments
if not line:
continue # Skip empty lines
if "#" in line: # Handle lines with inline comments
res = line.split("#")
if len(res) == 1:
comment = res[0]
data.append(Parameter(None, None, comment.strip()))
continue
param, comment = [x.strip() for x in res]
else:
param = line
comment = None
res = param.strip().split(sep, maxsplit=1)
if len(res) == 1:
param = res[0].strip()
data.append(Parameter(param, None, comment))
continue
param, value = [x.strip() for x in res]
data.append(Parameter(param, value, comment))
return data
[docs]
def read_fragpipe_workflow(file: BytesIO, sep: str = "=") -> tuple[str, str | None, list[Parameter]]:
"""
Read the FragPipe workflow file and return the header and a list of Parameter objects.
Parameters
----------
file : BytesIO
The FragPipe workflow file to read.
sep : str, optional
The separator used between parameter names and values. Default is "=".
Returns
-------
tuple of (str, list of Parameter)
A tuple containing the header and a list of Parameter objects.
"""
l_of_str = file.read().decode("utf-8").splitlines()
header = l_of_str[0][1:].strip() # Skip leading '#' in the header
msfragger_version = None
fragpipe_version = None
for ss in l_of_str[1:]:
if ss.startswith("# MSFragger version"):
msfragger_version = ss.split(" ")[-1].strip()
break
elif ss.startswith("fragpipe-config.bin-msfragger"):
path = ss.split("=")[-1].strip()
if "/" in path:
filename = path.split("/")[-1]
elif "\\" in path:
filename = path.split("\\")[-1]
else:
filename = path
match = re.search(VERSION_NO_PATTERN, filename)
if match:
msfragger_version = match.group(1)
if ss.startswith("# FragPipe version"):
fragpipe_version = ss.split(" ")[-1].strip()
return header, msfragger_version, fragpipe_version, parse_params(l_of_str, sep=sep)
[docs]
def extract_params(
file: BytesIO, json_file=os.path.join(os.path.dirname(__file__), "json/Quant/quant_lfq_DDA_ion.json")
) -> ProteoBenchParameters:
"""
Parse FragPipe parameter files and extract relevant parameters into a `ProteoBenchParameters` object.
Parameters
----------
file : BytesIO
The FragPipe parameter file to parse.
Returns
-------
ProteoBenchParameters
The extracted parameters encapsulated in a `ProteoBenchParameters` object.
"""
header, msfragger_version, fragpipe_version, fragpipe_params = read_fragpipe_workflow(file)
fragpipe_params = pd.DataFrame.from_records(fragpipe_params, columns=Parameter._fields).set_index(
Parameter._fields[0]
)["value"]
# Extract version from header
if not fragpipe_version:
fragpipe_version = re.match(r"FragPipe \((\d+\.\d+.*)\)", header).group(1)
# Initialize ProteoBenchParameters
params = ProteoBenchParameters(filename=json_file)
params.software_name = "FragPipe"
params.software_version = fragpipe_version
params.search_engine = "MSFragger"
params.search_engine_version = msfragger_version
# Enzyme and cleavage settings
enzyme = fragpipe_params.loc["msfragger.search_enzyme_name_1"]
if fragpipe_params.loc["msfragger.search_enzyme_name_2"] != "null":
enzyme += f"|{fragpipe_params.loc['msfragger.search_enzyme_name_2']}"
if enzyme == "stricttrypsin":
enzyme = "Trypsin/P" # strict trypsin: always cut after K and R
elif enzyme == "trypsin":
enzyme = "Trypsin" # trypsin: do not cut before P
params.enzyme = enzyme
params.allowed_miscleavages = int(fragpipe_params.loc["msfragger.allowed_missed_cleavage_1"])
if fragpipe_params.loc["msfragger.num_enzyme_termini"] == "2":
# 2 is ENZYMATIC, 1 is SEMI, 3 is SEMI_N_TERM, 0 is NONSPECIFIC
params.semi_enzymatic = False
else:
params.semi_enzymatic = True
# Modifications
params.fixed_mods = fragpipe_params.loc["msfragger.table.fix-mods"]
params.variable_mods = fragpipe_params.loc["msfragger.table.var-mods"]
params.max_mods = int(fragpipe_params.loc["msfragger.max_variable_mods_per_peptide"])
# Peptide length
params.min_peptide_length = int(fragpipe_params.loc["msfragger.digest_min_length"])
params.max_peptide_length = int(fragpipe_params.loc["msfragger.digest_max_length"])
# Precursor mass tolerance
precursor_mass_units = "Da"
if int(fragpipe_params.loc["msfragger.precursor_mass_units"]):
precursor_mass_units = "ppm"
params.precursor_mass_tolerance = f'[{fragpipe_params.loc["msfragger.precursor_mass_lower"]} {precursor_mass_units}, {fragpipe_params.loc["msfragger.precursor_mass_upper"]} {precursor_mass_units}]'
# Fragment mass tolerance
fragment_mass_units = "Da"
if int(fragpipe_params.loc["msfragger.fragment_mass_units"]):
fragment_mass_units = "ppm"
params.fragment_mass_tolerance = f'[-{fragpipe_params.loc["msfragger.fragment_mass_tolerance"]} {fragment_mass_units}, {fragpipe_params.loc["msfragger.fragment_mass_tolerance"]} {fragment_mass_units}]'
if fragpipe_params.loc["diann.run-dia-nn"] == "true":
params.ident_fdr_protein = fragpipe_params.loc["diann.q-value"]
params.ident_fdr_peptide = None
params.ident_fdr_psm = fragpipe_params.loc["diann.q-value"]
params.abundance_normalization_ions = None
else:
phi_report_cmd = fragpipe_params.loc["phi-report.filter"]
params.ident_fdr_psm, params.ident_fdr_peptide, params.ident_fdr_protein = parse_phi_report_filters(
phi_report_cmd
)
# Precursor charge settings
if fragpipe_params.loc["msfragger.override_charge"] == "true":
params.min_precursor_charge = int(fragpipe_params.loc["msfragger.misc.fragger.precursor-charge-lo"])
params.max_precursor_charge = int(fragpipe_params.loc["msfragger.misc.fragger.precursor-charge-hi"])
else:
params.min_precursor_charge = 1
params.max_precursor_charge = None
params.min_precursor_mz = (
int(fragpipe_params.loc["msfragger.misc.fragger.digest-mass-lo"]) / params.max_precursor_charge
if params.max_precursor_charge
else None
)
params.max_precursor_mz = (
int(fragpipe_params.loc["msfragger.misc.fragger.digest-mass-hi"]) / params.min_precursor_charge
if params.min_precursor_charge
else None
)
params.min_fragment_mz = None
params.max_fragment_mz = None
# Match between runs and quantification method settings
if fragpipe_params.loc["quantitation.run-label-free-quant"] == "true":
params.enable_match_between_runs = bool(int(fragpipe_params.loc["ionquant.mbr"]))
elif fragpipe_params.loc["diann.run-dia-nn"] == "true":
diann_quant_dict = {
1: "Any LC (high accuracy)",
2: "Any LC (high precision)",
3: "Robust LC (high accuracy)",
4: "Robust LC (high precision)",
}
params.enable_match_between_runs = (
"diann.fragpipe.cmd-opts" in fragpipe_params.index
and "--reanalyse" in fragpipe_params.loc["diann.fragpipe.cmd-opts"]
) or ("diann.cmd-opts" in fragpipe_params.index and "--reanalyse" in fragpipe_params.loc["diann.cmd-opts"])
params.quantification_method = diann_quant_dict[int(fragpipe_params.loc["diann.quantification-strategy"])]
# Protein inference settings
if fragpipe_params.loc["protein-prophet.run-protein-prophet"] == "true":
params.protein_inference = f"ProteinProphet: {fragpipe_params.loc['protein-prophet.cmd-opts']}"
params.fill_none()
return params
if __name__ == "__main__":
# Process FragPipe workflow file and extract parameters
files = [
"../../../test/params/fragpipe.workflow",
"../../../test/params/fragpipe_older.workflow",
"../../../test/params/fragpipe_win_paths.workflow",
"../../../test/params/fragpipe_v22.workflow",
"../../../test/params/fragpipe_fdr_test.workflow",
"../../../test/params/fragpipe-version.workflow",
"../../../test/params/fragpipe_v23_noMBR.workflow",
]
for file_path in files:
file = pathlib.Path(file_path)
with open(file, "rb") as f:
_, _, _, data = read_fragpipe_workflow(f)
df = pd.DataFrame.from_records(data, columns=Parameter._fields).set_index(Parameter._fields[0])
df.to_csv(file.with_suffix(".csv"))
with open(file, "rb") as f:
params = extract_params(f)
series = pd.Series(params.__dict__)
print(series)
print("\n")
series.to_csv(file.parent / f"{file.stem}_extracted_params.csv")