# Source code for proteobench.io.params.spectronaut
"""
Spectronaut parameter parsing.
"""
import re
from pathlib import Path
from typing import List, Optional, Tuple
import pandas as pd
from proteobench.io.params import ProteoBenchParameters
# Maps the value of the "Vendor:" entry of a settings file to the system label
# that read_spectronaut_settings passes on to extract_mass_tolerance, where it is
# matched against the start of a line inside the tolerance section.
VENDOR_SYSTEM_MAP = {
    "Thermo": "Thermo Orbitrap",
    "Bruker": "TOF",
}
# Tolerance patterns used when the calibration method is "Static" (values in Th).
# NOTE: (\d*) captures a run of digits only, and that run may be empty; a value
# such as "0.5" is therefore captured as "0".
ms1_tolerance_static = re.compile(r"MS1 Tolerance \(Th\):\s*(\d*)")
ms2_tolerance_static = re.compile(r"MS2 Tolerance \(Th\):\s*(\d*)")
# Tolerance patterns used when the calibration method is "Relative" (values in ppm).
ms1_tolerance_relative = re.compile(r"MS1 Tolerance \(ppm\):\s*(\d*)")
ms2_tolerance_relative = re.compile(r"MS2 Tolerance \(ppm\):\s*(\d*)")
# Captures everything that follows "Main Search:" up to the end of the line;
# extract_calibration_method strips it and returns it as the calibration method.
main_search_regex = re.compile(r"Main Search:\s*(.*)")
[docs]
def clean_text(text: str) -> str:
    """
    Strip surrounding whitespace, colons and commas from a string.

    Parameters
    ----------
    text : str
        Raw text, for example the part of a settings line that follows a key.

    Returns
    -------
    str
        ``text`` without any leading or trailing run of whitespace, ``:`` or ``,``
        characters. Characters in the interior of the string are left untouched.
    """
    # A single alternation covers both ends: a run anchored at the start of the
    # string, or a run anchored at its end.
    edge_junk = r"^[\s:,\t]+|[\s:,\t]+$"
    return re.sub(edge_junk, "", text)
[docs]
def extract_value(lines: List[str], search_term: str) -> Optional[str]:
    """
    Return the cleaned value that follows a search term in the first matching line.

    Parameters
    ----------
    lines : List[str]
        Lines to scan, in order.
    search_term : str
        Literal substring that marks the value, e.g. ``"Vendor:"``.

    Returns
    -------
    Optional[str]
        The text after the first occurrence of ``search_term`` (up to a second
        occurrence, if the line contains one), passed through ``clean_text``.
        None if no line contains ``search_term``.
    """
    for candidate in lines:
        if search_term in candidate:
            # Element 1 of the split is whatever sits directly after the term.
            after_term = candidate.split(search_term)[1]
            return clean_text(after_term)
    return None
[docs]
def extract_calibration_method(line: str) -> Optional[str]:
    """
    Read the calibration method from a 'Main Search:' line.

    Parameters
    ----------
    line : str
        A single line of the settings file.

    Returns
    -------
    Optional[str]
        The text following ``Main Search:`` with surrounding whitespace removed,
        or None if the line does not contain ``Main Search:``.
    """
    found = main_search_regex.search(line)
    if found is None:
        return None
    return found.group(1).strip()
[docs]
def extract_tolerances(line: str, calibration_method: str, MS1_tol: Optional[str], MS2_tol: Optional[str]) -> tuple:
    """
    Pick the tolerance patterns for a calibration method and apply them to a line.

    Parameters
    ----------
    line : str
        The line to search for tolerance values.
    calibration_method : str
        ``"Static"`` selects the Th patterns and ``"Relative"`` the ppm patterns.
        Any other value leaves both tolerances unchanged.
    MS1_tol : Optional[str]
        MS1 tolerance found so far; kept as is when not None.
    MS2_tol : Optional[str]
        MS2 tolerance found so far; kept as is when not None.

    Returns
    -------
    tuple
        ``(MS1_tol, MS2_tol)`` after the line has been inspected.
    """
    if calibration_method == "Static":
        ms1_pattern, ms2_pattern = ms1_tolerance_static, ms2_tolerance_static
    elif calibration_method == "Relative":
        ms1_pattern, ms2_pattern = ms1_tolerance_relative, ms2_tolerance_relative
    else:
        # No patterns are defined for this method, so nothing can be extracted.
        return MS1_tol, MS2_tol

    # The helper only fills in values that are still None.
    updated_ms1, updated_ms2 = extract_tolerances_with_regex(line, MS1_tol, MS2_tol, ms1_pattern, ms2_pattern)
    return updated_ms1, updated_ms2
[docs]
def extract_tolerances_with_regex(
    line: str,
    MS1_tol: Optional[str],
    MS2_tol: Optional[str],
    ms1_tolerance_regex: re.Pattern,
    ms2_tolerance_regex: re.Pattern,
) -> Tuple[Optional[str], Optional[str]]:
    """
    Fill in missing MS1/MS2 tolerances from a line, never replacing existing ones.

    Parameters
    ----------
    line : str
        The line to search.
    MS1_tol : Optional[str]
        Current MS1 tolerance; only looked up in ``line`` when it is None.
    MS2_tol : Optional[str]
        Current MS2 tolerance; only looked up in ``line`` when it is None.
    ms1_tolerance_regex : re.Pattern
        Pattern whose first group captures the MS1 tolerance.
    ms2_tolerance_regex : re.Pattern
        Pattern whose first group captures the MS2 tolerance.

    Returns
    -------
    Tuple[Optional[str], Optional[str]]
        ``(MS1_tol, MS2_tol)``. Each entry is the value passed in if it was already
        set, otherwise the first captured group of the matching pattern, or None
        when the pattern does not match ``line``.
    """
    if MS1_tol is None:
        ms1_hit = ms1_tolerance_regex.search(line)
        if ms1_hit:
            MS1_tol = ms1_hit.group(1)

    if MS2_tol is None:
        ms2_hit = ms2_tolerance_regex.search(line)
        if ms2_hit:
            MS2_tol = ms2_hit.group(1)

    return MS1_tol, MS2_tol
[docs]
def extract_mass_tolerance(lines: List[str], system="Thermo Orbitrap") -> Optional[Tuple[str, str]]:
    """
    Extract the MS1 and MS2 mass tolerances for one system from the settings lines.

    The lines are scanned in order. Nothing is inspected until a line starting with
    ``Pulsar Search\\Tolerances`` has been seen, and after that until a line starting
    with ``system`` has been seen. From then on the first ``Main Search:`` line
    provides the calibration method, which decides how the tolerances are read.

    Parameters
    ----------
    lines : List[str]
        Lines of the settings file. Matching uses ``str.startswith``, so any
        leading decoration must already have been removed.
    system : str
        Prefix of the line that opens the block of the instrument system of
        interest, by default ``"Thermo Orbitrap"``.

    Returns
    -------
    Optional[Tuple[str, str]]
        ``(precursor_tolerance, fragment_tolerance)``:

        * ``("Dynamic", "Dynamic")`` when the calibration method is ``"Dynamic"``;
        * two strings of the form ``"[-X unit, X unit]"`` once both values have
          been found, with unit ``Th`` for ``"Static"`` and ``ppm`` otherwise;
        * None when the sections or both values are never found. Callers that
          unpack the result must handle this case.
    """
    in_tolerance_section = False
    in_system_section = False
    calibration_method = None
    MS1_tol = MS2_tol = None

    for line in lines:
        # Stage 1: wait for the tolerance section header.
        if line.startswith("Pulsar Search\\Tolerances"):
            in_tolerance_section = True
            continue
        if not in_tolerance_section:
            continue

        # Stage 2: wait for the block of the requested system.
        if line.startswith(system):
            in_system_section = True
            continue
        if not in_system_section:
            continue

        # Stage 3: the first non-empty 'Main Search:' value fixes the calibration method.
        if "Main Search:" in line and not calibration_method:
            calibration_method = extract_calibration_method(line)

        if not calibration_method:
            continue

        if calibration_method == "Dynamic":
            # Dynamic calibration has no fixed tolerance values to report.
            return "Dynamic", "Dynamic"

        unit = "Th" if calibration_method == "Static" else "ppm"
        # Values that were already found on earlier lines are kept.
        MS1_tol, MS2_tol = extract_tolerances(line, calibration_method, MS1_tol, MS2_tol)

        if MS1_tol is not None and MS2_tol is not None:
            return (
                f"[-{MS1_tol} {unit}, {MS1_tol} {unit}]",
                f"[-{MS2_tol} {unit}, {MS2_tol} {unit}]",
            )

    return None
[docs]
def extract_value_regex(lines: List[str], search_term: str) -> Optional[str]:
    """
    Return the cleaned value that follows a regex match in the first matching line.

    Parameters
    ----------
    lines : List[str]
        Lines to scan, in order.
    search_term : str
        Regular expression that marks the value, e.g. ``"^Variable Modifications:"``.

    Returns
    -------
    Optional[str]
        Element 1 of ``re.split(search_term, line)`` for the first line in which
        ``search_term`` is found, passed through ``clean_text``. None if the
        pattern is found in no line.
    """
    for candidate in lines:
        if re.search(search_term, candidate):
            pieces = re.split(search_term, candidate)
            return clean_text(pieces[1])
    return None
[docs]
def _parse_int(value: Optional[str]) -> Optional[int]:
    """Convert an extracted settings value to int; a missing value (None) stays None."""
    return None if value is None else int(value)


def _parse_float(value: Optional[str]) -> Optional[float]:
    """Convert an extracted settings value to float, accepting a decimal comma; None stays None."""
    return None if value is None else float(value.replace(",", "."))


def _parse_charge(value: Optional[str]) -> Optional[int]:
    """Convert the 'Peptide Charge:' value to int; a missing value or "False" gives None."""
    return None if value in (None, "False") else int(value)


def read_spectronaut_settings(file_path: str, system="Thermo Orbitrap") -> ProteoBenchParameters:
    """
    Read a Spectronaut settings file, extract parameters, and return them as a `ProteoBenchParameters` object.

    Parameters
    ----------
    file_path : str
        The path to the Spectronaut settings file. An object with a ``read()``
        method is also accepted; its content may be ``bytes`` (decoded as UTF-8)
        or ``str``.
    system : str
        Kept for backward compatibility. The value is not used: the system is
        always derived from the ``Vendor:`` entry of the file via
        ``VENDOR_SYSTEM_MAP``.

    Returns
    -------
    ProteoBenchParameters
        The extracted parameters encapsulated in a `ProteoBenchParameters` object.
        Numeric settings that are absent from the file, and mass tolerances that
        cannot be located, are set to None.

    Raises
    ------
    IOError
        If ``file_path`` is a path and the file cannot be opened or read.
    ValueError
        If the ``Vendor:`` entry is missing or not a key of ``VENDOR_SYSTEM_MAP``,
        or if a numeric setting is present but cannot be converted.
    """
    if hasattr(file_path, "read"):
        # File-like object: binary streams yield bytes, text streams yield str.
        raw = file_path.read()
        if isinstance(raw, (bytes, bytearray)):
            raw = raw.decode("utf-8")
        lines = raw.splitlines()
    else:
        try:
            with open(file_path, encoding="utf-8") as f:
                lines = f.readlines()
        except Exception as e:
            # Chain the cause so the original traceback is not lost.
            raise IOError(f"Failed to open or read the file at {file_path}. Error: {e}") from e

    # Remove surrounding whitespace (including the newline) from each line
    lines = [line.strip() for line in lines]

    # The vendor decides which system block of the tolerance section is read below.
    system = extract_value(lines, "Vendor:")
    if system in VENDOR_SYSTEM_MAP:
        system = VENDOR_SYSTEM_MAP[system]
    else:
        raise ValueError(
            f"Unknown system: {system}. Supported systems are: {', '.join(VENDOR_SYSTEM_MAP.keys())}. Did you upload the correct settings file?"
        )

    params = ProteoBenchParameters()
    params.software_name = "Spectronaut"
    # The version is taken as the second whitespace-separated token of the first line.
    params.software_version = lines[0].split()[1]
    params.search_engine = "Spectronaut"
    params.search_engine_version = params.software_version

    # Strip the tree-drawing prefix of each line: whitespace plus the box-drawing
    # characters U+2502, U+251C, U+2500 and U+2514. They are written as \u escapes
    # so the pattern does not depend on how this source file is encoded.
    # NOTE(review): the character class was garbled in the reviewed copy and has
    # been restored on the assumption that it held these four characters; confirm
    # against a real settings file.
    lines = [re.sub(r"^[\s\u2502\u251c\u2500\u2514]*", "", line).strip() for line in lines]

    params.ident_fdr_psm = _parse_float(extract_value(lines, "Precursor Qvalue Cutoff:"))
    params.ident_fdr_peptide = None
    params.ident_fdr_protein = _parse_float(extract_value(lines, "Protein Qvalue Cutoff (Experiment):"))
    params.enable_match_between_runs = False  # https://x.com/OliverMBernhar1/status/1656220095553601537

    # extract_mass_tolerance returns None when nothing was found; unpacking that
    # directly would raise a TypeError.
    tolerances = extract_mass_tolerance(lines, system=system)
    if tolerances is None:
        params.precursor_mass_tolerance = None
        params.fragment_mass_tolerance = None
    else:
        params.precursor_mass_tolerance, params.fragment_mass_tolerance = tolerances

    params.enzyme = extract_value(lines, "Enzymes / Cleavage Rules:")
    params.allowed_miscleavages = _parse_int(extract_value(lines, "Missed Cleavages:"))
    params.max_peptide_length = _parse_int(extract_value(lines, "Max Peptide Length:"))
    params.min_peptide_length = _parse_int(extract_value(lines, "Min Peptide Length:"))
    params.fixed_mods = extract_value(lines, "Fixed Modifications:")
    params.variable_mods = extract_value_regex(lines, "^Variable Modifications:")
    params.max_mods = _parse_int(extract_value(lines, "Max Variable Modifications:"))

    # NOTE(review): minimum and maximum charge are both read from the same
    # "Peptide Charge:" entry, so they always end up equal; verify which keys
    # the settings file really uses for the two bounds.
    peptide_charge = extract_value(lines, "Peptide Charge:")
    params.min_precursor_charge = _parse_charge(peptide_charge)
    params.max_precursor_charge = _parse_charge(peptide_charge)

    params.scan_window = extract_value(lines, "XIC IM Extraction Window:")
    params.quantification_method = extract_value(
        lines, "Quantity MS Level:"
    )  # "Quantity MS Level:" or "Protein LFQ Method:" or "Quantity Type:"
    params.protein_inference = extract_value(lines, "Inference Algorithm:")  # or Protein Inference Workflow:
    params.predictors_library = None
    params.abundance_normalization_ions = extract_value(lines, "Cross-Run Normalization:")

    return params
if __name__ == "__main__":
    # Script entry point: parse a fixed set of Spectronaut settings files and
    # write the extracted parameters of each one to a CSV file next to it.
    settings_files = (
        "../../../test/params/spectronaut_Experiment1_ExperimentSetupOverview_BGS_Factory_Settings.txt",
        "../../../test/params/Spectronaut_dynamic.txt",
        "../../../test/params/Spectronaut_static.txt",
        "../../../test/params/Spectronaut_relative.txt",
    )

    for settings_file in settings_files:
        parsed = read_spectronaut_settings(settings_file)

        # Same path as the input, with the extension replaced by .csv.
        csv_path = Path(settings_file).with_suffix(".csv")
        pd.Series(parsed.__dict__).to_csv(csv_path)

        # Echo the parsed parameters to the console.
        print(parsed)