Source code for proteobench.io.params.spectronaut

"""
Spectronaut parameter parsing.
"""

import os
import re
from pathlib import Path
from typing import List, Optional, Tuple

import pandas as pd

from proteobench.io.params import ProteoBenchParameters

# Maps the vendor name found on the "Vendor:" line of a settings file to the
# instrument-system heading that is looked up inside the tolerance section.
VENDOR_SYSTEM_MAP = {
    "Thermo": "Thermo Orbitrap",
    "Bruker": "TOF",
}

# Tolerance patterns capture the numeric value verbatim. At least one digit is
# required (so an empty value is "not found" rather than an empty string), and
# an optional decimal part written with either "." or "," is kept, so that a
# value such as "0.02" is not truncated to "0".
ms1_tolerance_static = re.compile(r"MS1 Tolerance \(Th\):\s*(\d+(?:[.,]\d+)?)")
ms2_tolerance_static = re.compile(r"MS2 Tolerance \(Th\):\s*(\d+(?:[.,]\d+)?)")
ms1_tolerance_relative = re.compile(r"MS1 Tolerance \(ppm\):\s*(\d+(?:[.,]\d+)?)")
ms2_tolerance_relative = re.compile(r"MS2 Tolerance \(ppm\):\s*(\d+(?:[.,]\d+)?)")

# Captures everything after "Main Search:" (the calibration method name).
main_search_regex = re.compile(r"Main Search:\s*(.*)")


def clean_text(text: str) -> str:
    """
    Strip whitespace, colons and commas from both ends of a string.

    Only the leading and trailing runs of these characters are removed;
    occurrences in the middle of the text are left untouched.

    Parameters
    ----------
    text : str
        The raw text.

    Returns
    -------
    str
        The text without the surrounding separator characters.
    """
    return re.sub(r"^[\s:,\t]+|[\s:,\t]+$", "", text)
def extract_value(lines: List[str], search_term: str) -> Optional[str]:
    """
    Return the cleaned value that follows a literal search term.

    The lines are scanned in order and the first one containing
    ``search_term`` is used. The line is split on the term and the piece
    directly after its first occurrence is cleaned with ``clean_text``.

    Parameters
    ----------
    lines : List[str]
        The lines to scan.
    search_term : str
        The literal text that precedes the wanted value.

    Returns
    -------
    Optional[str]
        The cleaned value, or None when no line contains the term.
    """
    for candidate in lines:
        if search_term in candidate:
            pieces = candidate.split(search_term)
            return clean_text(pieces[1])
    return None
def extract_calibration_method(line: str) -> Optional[str]:
    """
    Read the calibration method named on a 'Main Search' line.

    Parameters
    ----------
    line : str
        A single line of the settings file.

    Returns
    -------
    Optional[str]
        The stripped text following "Main Search:", or None when the line
        does not contain that label.
    """
    found = main_search_regex.search(line)
    if found is None:
        return None
    return found.group(1).strip()
def extract_tolerances(line: str, calibration_method: str, MS1_tol: Optional[str], MS2_tol: Optional[str]) -> tuple:
    """
    Pick the tolerance patterns for a calibration method and apply them to a line.

    Values that are already set are never overwritten. For any calibration
    method other than "Static" or "Relative" the incoming values are
    returned unchanged.

    Parameters
    ----------
    line : str
        The line to inspect.
    calibration_method : str
        The calibration method ("Static" uses the Th patterns, "Relative"
        uses the ppm patterns).
    MS1_tol : Optional[str]
        MS1 tolerance found so far, or None.
    MS2_tol : Optional[str]
        MS2 tolerance found so far, or None.

    Returns
    -------
    tuple
        The pair ``(MS1_tol, MS2_tol)`` after processing the line.
    """
    patterns_by_method = {
        "Static": (ms1_tolerance_static, ms2_tolerance_static),
        "Relative": (ms1_tolerance_relative, ms2_tolerance_relative),
    }
    selected = patterns_by_method.get(calibration_method)
    if selected is not None:
        ms1_pattern, ms2_pattern = selected
        MS1_tol, MS2_tol = extract_tolerances_with_regex(line, MS1_tol, MS2_tol, ms1_pattern, ms2_pattern)
    return MS1_tol, MS2_tol
def extract_tolerances_with_regex(
    line: str,
    MS1_tol: Optional[str],
    MS2_tol: Optional[str],
    ms1_tolerance_regex: re.Pattern,
    ms2_tolerance_regex: re.Pattern,
) -> Tuple[Optional[str], Optional[str]]:
    """
    Fill in missing MS1/MS2 tolerances from a line using the given patterns.

    A tolerance that is already set (not None) is returned as is. A missing
    one is replaced by the first capture group of its pattern when the
    pattern matches the line, and stays None otherwise.

    Args:
        line: The line from which tolerances should be extracted.
        MS1_tol: Existing MS1 tolerance (retained if already set).
        MS2_tol: Existing MS2 tolerance (retained if already set).
        ms1_tolerance_regex: Regex pattern for MS1 tolerance.
        ms2_tolerance_regex: Regex pattern for MS2 tolerance.

    Returns:
        A tuple (MS1_tol, MS2_tol) with updated or retained values.
    """
    resolved = []
    for existing, regex in ((MS1_tol, ms1_tolerance_regex), (MS2_tol, ms2_tolerance_regex)):
        if existing is not None:
            # Keep a value that was found on an earlier line.
            resolved.append(existing)
            continue
        hit = regex.search(line)
        resolved.append(hit.group(1) if hit else None)
    return resolved[0], resolved[1]
def extract_mass_tolerance(lines: List[str], system="Thermo Orbitrap") -> Optional[Tuple[str, str]]:
    """
    Extract mass tolerances from the 'Main Search' section based on the system and calibration method.

    The lines are scanned in order. Nothing is interpreted until the Pulsar
    Search tolerances heading has been seen, and within that section nothing
    is interpreted until a line starting with ``system`` has been seen.
    From there on, the first 'Main Search' line determines the calibration
    method and the following lines are searched for the MS1 and MS2 values.

    Parameters
    ----------
    lines : List[str]
        The lines of the settings file, with any tree-drawing prefix removed.
    system : str
        The instrument-system heading whose tolerances should be read.

    Returns
    -------
    Optional[Tuple[str, str]]
        ``("Dynamic", "Dynamic")`` for dynamic calibration; otherwise the
        precursor and fragment tolerances formatted as
        ``"[-<value> <unit>, <value> <unit>]"``, where the unit is "Th" for
        static calibration and "ppm" for any other method. None when the
        section, the system or both tolerance values are not found.
    """
    in_tolerance_section = False
    in_system_section = False
    calibration_method = None
    unit = None
    MS1_tol = MS2_tol = None

    for line in lines:
        if line.startswith("Pulsar Search\\Tolerances"):
            in_tolerance_section = True
            continue
        if not in_tolerance_section:
            continue

        if line.startswith(system):
            in_system_section = True
            continue
        if not in_system_section:
            continue

        # The first 'Main Search' line fixes the calibration method and unit.
        if "Main Search:" in line and not calibration_method:
            calibration_method = extract_calibration_method(line)
            if calibration_method == "Dynamic":
                return "Dynamic", "Dynamic"
            if calibration_method:
                unit = "Th" if calibration_method == "Static" else "ppm"

        # Values are only filled in while still None, so the first match of
        # each tolerance wins.
        MS1_tol, MS2_tol = extract_tolerances(line, calibration_method, MS1_tol, MS2_tol)

        if MS1_tol is not None and MS2_tol is not None:
            return (
                f"[-{MS1_tol} {unit}, {MS1_tol} {unit}]",
                f"[-{MS2_tol} {unit}, {MS2_tol} {unit}]",
            )

    return None
def extract_value_regex(lines: List[str], search_term: str) -> Optional[str]:
    """
    Return the cleaned value that follows a regular-expression match.

    The lines are scanned in order and the first one in which
    ``search_term`` matches is used. The line is split on the pattern and
    the second element of the split is cleaned with ``clean_text``.

    Parameters
    ----------
    lines : List[str]
        The lines to scan.
    search_term : str
        The regular expression that precedes the wanted value.

    Returns
    -------
    Optional[str]
        The cleaned value, or None when the pattern matches no line.
    """
    for candidate in lines:
        if re.search(search_term, candidate):
            pieces = re.split(search_term, candidate)
            return clean_text(pieces[1])
    return None
def read_spectronaut_settings(
    file_path: str,
    system="Thermo Orbitrap",
    json_file=os.path.join(os.path.dirname(__file__), "json/Quant/quant_lfq_DIA_ion.json"),
) -> ProteoBenchParameters:
    """
    Read a Spectronaut settings file, extract parameters, and return them as a `ProteoBenchParameters` object.

    Parameters
    ----------
    file_path : str
        The path to the Spectronaut settings file, or an object with a
        ``read()`` method returning the file content as bytes (decoded as
        UTF-8) or as text.
    system : str
        Kept for backward compatibility. The value is currently not used:
        the system is always derived from the "Vendor:" line of the file.
    json_file : str
        Path handed to `ProteoBenchParameters` as ``filename``.

    Returns
    -------
    ProteoBenchParameters
        The extracted parameters encapsulated in a `ProteoBenchParameters` object.

    Raises
    ------
    IOError
        If the file at ``file_path`` cannot be opened or read.
    ValueError
        If the vendor named in the file is not a key of ``VENDOR_SYSTEM_MAP``.
    """
    if hasattr(file_path, "read"):
        # File-like object: binary streams yield bytes, text streams yield str.
        content = file_path.read()
        if isinstance(content, (bytes, bytearray)):
            content = content.decode("utf-8")
        lines = content.splitlines()
    else:
        try:
            with open(file_path, encoding="utf-8") as f:
                lines = f.readlines()
        except Exception as e:
            raise IOError(f"Failed to open or read the file at {file_path}. Error: {e}") from e

    # Remove surrounding whitespace (including newlines) from each line
    lines = [line.strip() for line in lines]

    system = extract_value(lines, "Vendor:")
    if system not in VENDOR_SYSTEM_MAP:
        raise ValueError(
            f"Unknown system: {system}. Supported systems are: {', '.join(VENDOR_SYSTEM_MAP.keys())}. Did you upload the correct settings file?"
        )
    system = VENDOR_SYSTEM_MAP[system]

    params = ProteoBenchParameters(filename=json_file)
    params.software_name = "Spectronaut"
    # The version is the second whitespace-separated token of the first line.
    params.software_version = lines[0].split()[1]
    params.search_engine = "Spectronaut"
    params.search_engine_version = params.software_version

    # Strip the tree-drawing prefix (whitespace and the box-drawing characters
    # U+2502, U+251C, U+2500, U+2514) so that labels start at column 0.
    # Written as escapes to keep the pattern independent of file encoding.
    lines = [re.sub(r"^[\s\u2502\u251c\u2500\u2514]*", "", line).strip() for line in lines]

    # Decimal commas are normalised to points before converting to float.
    params.ident_fdr_psm = float(extract_value(lines, "Precursor Qvalue Cutoff:").replace(",", "."))
    params.ident_fdr_peptide = None
    params.ident_fdr_protein = float(extract_value(lines, "Protein Qvalue Cutoff (Experiment):").replace(",", "."))
    params.enable_match_between_runs = False  # https://x.com/OliverMBernhar1/status/1656220095553601537
    params.precursor_mass_tolerance, params.fragment_mass_tolerance = extract_mass_tolerance(lines, system=system)
    params.enzyme = extract_value(lines, "Enzymes / Cleavage Rules:")
    params.semi_specific = extract_value(lines, "Digest Type:") != "Specific"
    params.allowed_miscleavages = int(extract_value(lines, "Missed Cleavages:"))
    params.max_peptide_length = int(extract_value(lines, "Max Peptide Length:"))
    params.min_peptide_length = int(extract_value(lines, "Min Peptide Length:"))
    params.fixed_mods = extract_value(lines, "Fixed Modifications:")
    # Anchored so that "Max Variable Modifications:" is not picked up instead.
    params.variable_mods = extract_value_regex(lines, "^Variable Modifications:")
    params.max_mods = int(extract_value(lines, "Max Variable Modifications:"))

    # NOTE(review): minimum and maximum precursor charge are both read from
    # the same "Peptide Charge:" setting; "False" means no restriction.
    # Confirm how Spectronaut reports an enabled charge filter.
    _peptide_charge = extract_value(lines, "Peptide Charge:")
    _charge = None if _peptide_charge == "False" else int(_peptide_charge)
    params.min_precursor_charge = _charge
    params.max_precursor_charge = _charge

    params.min_fragment_mz = None  # Spectronaut does not provide this information
    params.max_fragment_mz = None  # Spectronaut does not provide this information
    params.max_precursor_mz = None  # Spectronaut does not provide this information
    params.min_precursor_mz = None  # Spectronaut does not provide this information

    params.scan_window = extract_value(lines, "XIC IM Extraction Window:")
    params.quantification_method = extract_value(
        lines, "Quantity MS Level:"
    )  # "Quantity MS Level:" or "Protein LFQ Method:" or "Quantity Type:"
    params.protein_inference = extract_value(lines, "Inference Algorithm:")  # or Protein Inference Workflow:
    params.predictors_library = None
    params.abundance_normalization_ions = extract_value(lines, "Cross-Run Normalization:")

    return params
if __name__ == "__main__":
    # Script entry point: parse each example settings file, store the
    # extracted parameters as a CSV next to it, and print them.
    example_files = [
        "../../../test/params/spectronaut_Experiment1_ExperimentSetupOverview_BGS_Factory_Settings.txt",
        "../../../test/params/Spectronaut_dynamic.txt",
        "../../../test/params/Spectronaut_static.txt",
        "../../../test/params/Spectronaut_relative.txt",
    ]

    for settings_path in example_files:
        parsed = read_spectronaut_settings(settings_path)

        # One row per parameter, written alongside the input with a .csv suffix
        as_series = pd.Series(parsed.__dict__)
        csv_target = Path(settings_path).with_suffix(".csv")
        as_series.to_csv(csv_target)

        print(parsed)