Source code for proteobench.io.params.peaks
"""
Peaks parameter parsing.
"""
import os
import re
from pathlib import Path
from typing import List, Optional
import pandas as pd
import yaml
from proteobench.io.params import ProteoBenchParameters
[docs]
def clean_text(text: str) -> str:
    """
    Trim whitespace, colons and commas from both ends of a string.

    Parameters
    ----------
    text : str
        The raw text to tidy up.

    Returns
    -------
    str
        ``text`` without any leading or trailing run of whitespace, ``:`` or ``,``.
        Characters in the middle of the string are left untouched.
    """
    # One alternation handles both edges: a run anchored at the start, or a run anchored at the end.
    edge_noise = r"^[\s:,\t]+|[\s:,\t]+$"
    return re.sub(edge_noise, "", text)
[docs]
def extract_value(lines: List[str], search_term: str) -> Optional[str]:
    """
    Return the cleaned text following ``search_term`` on the first line that contains it.

    Parameters
    ----------
    lines : List[str]
        The lines to scan, in order.
    search_term : str
        The literal label to look for (substring match).

    Returns
    -------
    Optional[str]
        Everything after the first occurrence of ``search_term`` on the first
        matching line, passed through ``clean_text``; None when no line matches.
    """
    hit = None
    for candidate in lines:
        if search_term in candidate:
            hit = candidate
            break

    # An empty matching line is treated exactly like "no match".
    if not hit:
        return None

    # maxsplit=1 keeps any later repetition of the label inside the value.
    remainder = hit.split(search_term, 1)[1]
    return clean_text(remainder)
[docs]
def extract_mass_tolerance(lines: List[str], search_term: str) -> Optional[str]:
    """
    Look up a mass tolerance setting, translating "System Default" into "40 ppm".

    Parameters
    ----------
    lines : List[str]
        The lines to scan, in order.
    search_term : str
        The literal label to look for (substring match).

    Returns
    -------
    Optional[str]
        The cleaned text after ``search_term`` on the first matching line,
        "40 ppm" if that text is exactly "System Default", or None when no
        line contains ``search_term``.
    """
    tolerance = None
    for candidate in lines:
        if search_term in candidate:
            tolerance = clean_text(candidate.split(search_term)[1])
            break

    if tolerance == "System Default":
        return "40 ppm"
    return tolerance
[docs]
def extract_value_regex(lines: List[str], search_term: str) -> Optional[str]:
    """
    Return the cleaned text following a regular-expression match on the first matching line.

    Parameters
    ----------
    lines : List[str]
        The lines to scan, in order.
    search_term : str
        The regular expression used both to select the line and to split it.

    Returns
    -------
    Optional[str]
        The second element of ``re.split(search_term, line)`` for the first
        line where the pattern is found, passed through ``clean_text``;
        None when the pattern matches no line.
    """
    for candidate in lines:
        if re.search(search_term, candidate):
            # Element 1 is whatever re.split places right after the first match
            # (for a pattern with capture groups that is the first group).
            pieces = re.split(search_term, candidate)
            return clean_text(pieces[1])
    return None
[docs]
def get_items_between(lines: list, start: str, end: str, only_last: bool = False) -> list:
    """
    Collect the "- " bullet lines that sit between a ``start`` line and an ``end`` line.

    Lines are compared after stripping surrounding whitespace. A line beginning
    with ``start`` opens a fresh section (discarding bullets of a section that was
    still open), and a line beginning with ``end`` closes the open section.

    Parameters
    ----------
    lines : list
        The lines to scan, in order.
    start : str
        Prefix that opens a section.
    end : str
        Prefix that closes a section.
    only_last : bool
        If False, bullets from every closed section are concatenated.
        If True, only the bullets of the most recent section are returned; a
        section still open at the end of ``lines`` counts as the most recent.

    Returns
    -------
    list
        The bullet texts with the leading "- " removed and whitespace trimmed.
    """
    collected = []
    pending = []
    in_section = False

    for raw in lines:
        text = raw.strip()

        if text.startswith(start):
            # (Re)open a section; anything gathered since the previous opener is dropped.
            in_section = True
            pending = []
            continue

        if not in_section:
            continue

        if text.startswith(end):
            in_section = False
            if only_last:
                collected = list(pending)
            else:
                collected += pending
            pending = []
        elif text.startswith("- "):
            # Drop the dash and the space that follows it.
            pending.append(text[2:].strip())

    # With only_last, an unterminated trailing section supersedes earlier ones.
    if in_section and only_last:
        collected = pending
    return collected
[docs]
def _parse_int_pair(raw: str, separator: str) -> tuple:
    """Split ``raw`` on ``separator`` and return its first two fields as a pair of integers."""
    fields = raw.split(separator)
    return int(fields[0]), int(fields[1])


def extract_params(
    file_path: str, json_file=os.path.join(os.path.dirname(__file__), "json/Quant/quant_lfq_DDA_ion.json")
) -> ProteoBenchParameters:
    """
    Read a PEAKS settings file, extract parameters, and return them as a `ProteoBenchParameters` object.

    Parameters
    ----------
    file_path : str
        The path to the PEAKS settings file. An object with a ``read()`` method
        is also accepted; its content may be bytes (decoded as UTF-8) or text.
    json_file : str
        Passed to ``ProteoBenchParameters`` as its ``filename`` argument.

    Returns
    -------
    ProteoBenchParameters
        The extracted parameters encapsulated in a `ProteoBenchParameters` object.

    Raises
    ------
    IOError
        If ``file_path`` is a path and the file cannot be opened or read.
    TypeError
        If "Max Missed Cleavage:" or "Max Variable PTM per Peptide:" is absent.
    AttributeError
        If neither peptide-length label, or neither charge-range label, is present.
    ValueError
        If a numeric setting does not contain integers.
    """
    if hasattr(file_path, "read"):
        content = file_path.read()
        # Binary streams yield bytes, text-mode handles already yield str.
        if isinstance(content, (bytes, bytearray)):
            content = content.decode("utf-8")
        lines = content.splitlines()
    else:
        try:
            with open(file_path, encoding="utf-8") as f:
                lines = f.readlines()
        except Exception as e:
            # Chain the cause so the original traceback is not lost.
            raise IOError(f"Failed to open or read the file at {file_path}. Error: {e}") from e
    lines = [line.strip() for line in lines]

    params = ProteoBenchParameters(filename=json_file)
    params.software_name = "PEAKS"
    params.software_version = extract_value(lines, "PEAKS Version:")
    params.search_engine = "PEAKS"
    params.search_engine_version = params.software_version

    # It's either "Precursor FDR:" (DIA) or "PSM FDR:" (DDA)
    psm_fdr = extract_value(lines, "Precursor FDR:")
    if not psm_fdr:
        psm_fdr = extract_value(lines, "PSM FDR:")
    peptide_fdr = extract_value(lines, "Peptide FDR:")
    params.ident_fdr_peptide = peptide_fdr
    params.ident_fdr_psm = psm_fdr
    # peaks uses Proteins -10LgP >= 15.0 instead of FDR
    protein_fdr = extract_value(lines, "Protein Group FDR:")
    params.ident_fdr_protein = protein_fdr

    params.enable_match_between_runs = extract_value(lines, "Match Between Run:") == "Yes"
    params.precursor_mass_tolerance = extract_mass_tolerance(lines, "Precursor Mass Error Tolerance:")
    params.fragment_mass_tolerance = extract_mass_tolerance(lines, "Fragment Mass Error Tolerance:")
    params.enzyme = extract_value(lines, "Enzyme:")
    # Anything other than "Specific" (including a missing entry) is reported as semi-enzymatic.
    params.semi_enzymatic = extract_value(lines, "Digest Mode:") != "Specific"
    params.allowed_miscleavages = int(extract_value(lines, "Max Missed Cleavage:"))

    # Two label/format variants exist: "Peptide Length between: a,b" or "Peptide Length Range: a - b".
    peptide_length = extract_value(lines, "Peptide Length between:")
    if peptide_length is not None:
        length_bounds = _parse_int_pair(peptide_length, ",")
    else:
        length_bounds = _parse_int_pair(extract_value(lines, "Peptide Length Range:"), " - ")
    params.min_peptide_length, params.max_peptide_length = length_bounds

    fixed = get_items_between(lines, "Fixed Modifications:", "Variable Modifications:", only_last=True)
    params.fixed_mods = " ,".join(fixed)
    varmods = get_items_between(lines, "Variable Modifications:", "Database:", only_last=True)
    params.variable_mods = " ,".join(varmods)
    params.max_mods = int(extract_value(lines, "Max Variable PTM per Peptide:"))

    # Two label/format variants exist: "Precursor Charge between: a,b" or "Charge between: [a - b]".
    precursor_charge = extract_value(lines, "Precursor Charge between:")
    if precursor_charge is not None:
        charge_bounds = _parse_int_pair(precursor_charge, ",")
    else:
        bracketed = extract_value(lines, "Charge between:").replace("[", "").replace("]", "")
        charge_bounds = _parse_int_pair(bracketed, " - ")
    params.min_precursor_charge, params.max_precursor_charge = charge_bounds

    # The M/Z ranges are optional (absent for DDA). Each one is handled on its
    # own so that a missing fragment range does not discard the precursor range.
    precursor_mz = extract_value(lines, "Precursor M/Z between:")
    if precursor_mz is not None:
        params.min_precursor_mz, params.max_precursor_mz = _parse_int_pair(precursor_mz, ",")
    else:
        params.min_precursor_mz = None
        params.max_precursor_mz = None

    fragment_mz = extract_value(lines, "Fragment M/Z between:")
    if fragment_mz is not None:
        params.min_fragment_mz, params.max_fragment_mz = _parse_int_pair(fragment_mz, ",")
    else:
        params.min_fragment_mz = None
        params.max_fragment_mz = None

    params.scan_window = None
    params.quantification_method = extract_value(
        lines, "LFQ Method:"
    )  # "Quantity MS Level:" or "Protein LFQ Method:" or "Quantity Type:"
    params.protein_inference = None
    params.predictors_library = None
    params.abundance_normalization_ions = extract_value(lines, "Normalization Method:")
    return params
if __name__ == "__main__":
    # Script entry point: parse each listed PEAKS settings file, write the
    # extracted parameters to a CSV with the same name, and echo them.
    settings_files = (
        "../../../test/params/PEAKS_parameters.txt",
        "../../../test/params/PEAKS_parameters_DDA.txt",
        "../../../test/params/PEAKS_parameters_DIA.txt",
        "../../../test/params/PEAKS_parameters_DDA_new.txt",
        "../../../test/params/PEAKS_diaPASEF.txt",
    )

    for settings_file in settings_files:
        parsed = extract_params(settings_file)

        # One row per parameter attribute, saved alongside the input as <name>.csv
        csv_target = Path(settings_file).with_suffix(".csv")
        pd.Series(parsed.__dict__).to_csv(csv_target)

        print(parsed)