Source code for proteobench.io.params.peaks

"""
Peaks parameter parsing.
"""

import os
import re
from pathlib import Path
from typing import List, Optional

import pandas as pd
import yaml

from proteobench.io.params import ProteoBenchParameters



[docs]
def clean_text(text: str) -> str:
    """
    Strip separator noise from both ends of a string.

    Any run of whitespace, colons or commas at the very start or the very end
    of ``text`` is removed; characters in the middle are left untouched.

    Parameters
    ----------
    text : str
        The raw text to tidy up.

    Returns
    -------
    str
        ``text`` without the leading/trailing whitespace, colons and commas.
    """
    # One alternation handles both edges: a leading run OR a trailing run.
    edge_noise = r"^[\s:,\t]+|[\s:,\t]+$"
    return re.sub(edge_noise, "", text)




[docs]
def extract_value(lines: List[str], search_term: str) -> Optional[str]:
    """
    Return the cleaned text that follows ``search_term`` on the first line containing it.

    Parameters
    ----------
    lines : List[str]
        The lines to scan, in order.
    search_term : str
        The literal text to look for.

    Returns
    -------
    Optional[str]
        Everything after the first occurrence of ``search_term`` on the first
        matching line, passed through `clean_text`; None when no line contains
        the term.
    """
    for candidate in lines:
        if search_term not in candidate:
            continue
        # Only the first line containing the term is ever considered.
        if not candidate:
            return None
        _, _, remainder = candidate.partition(search_term)
        return clean_text(remainder)
    return None




[docs]
def extract_mass_tolerance(lines: List[str], search_term: str) -> Optional[str]:
    """
    Look up a mass tolerance setting, substituting "40 ppm" for "System Default".

    Parameters
    ----------
    lines : List[str]
        The lines to scan, in order.
    search_term : str
        The literal text that precedes the tolerance value.

    Returns
    -------
    Optional[str]
        The cleaned tolerance text from the first line containing
        ``search_term``, "40 ppm" if that text is exactly "System Default",
        or None when no line contains the term.
    """
    tolerance = None
    for entry in lines:
        if search_term in entry:
            # Take the text right after the term on the first matching line.
            tolerance = clean_text(entry.split(search_term)[1])
            break

    if tolerance == "System Default":
        return "40 ppm"
    return tolerance




[docs]
def extract_value_regex(lines: List[str], search_term: str) -> Optional[str]:
    """
    Return the cleaned text following a regular-expression match on the first matching line.

    Parameters
    ----------
    lines : List[str]
        The lines to scan, in order.
    search_term : str
        The regular expression to look for in each line.

    Returns
    -------
    Optional[str]
        The second element of ``re.split(search_term, line)`` for the first
        line the pattern matches, passed through `clean_text`; None when the
        pattern matches no line.
    """
    for entry in lines:
        if re.search(search_term, entry) is None:
            continue
        # Element 1 of the split is what comes right after the first match.
        pieces = re.split(search_term, entry)
        return clean_text(pieces[1])
    return None




[docs]
def get_items_between(lines: list, start: str, end: str, only_last: bool = False) -> list:
    """
    Collect the "- " bullet lines that sit between a ``start`` line and an ``end`` line.

    Each line is stripped before it is inspected. A line beginning with
    ``start`` opens a section (discarding any bullets of a section that was
    still open); a line beginning with ``end`` closes the open section.

    Parameters
    ----------
    lines : list
        The lines to scan, in order.
    start : str
        Prefix that marks the beginning of a section.
    end : str
        Prefix that marks the end of a section.
    only_last : bool
        If True, return only the bullets of the last section encountered,
        including a final section that was opened but never closed. If False,
        return the bullets of every closed section, concatenated.

    Returns
    -------
    list
        The bullet texts with the leading "- " removed and whitespace stripped.
    """
    collected = []
    pending = []
    inside = False

    for raw in lines:
        text = raw.strip()

        if text.startswith(start):
            # A new section begins; whatever was pending is thrown away.
            inside, pending = True, []
            continue

        if not inside:
            continue

        if text.startswith(end):
            inside = False
            if only_last:
                collected = list(pending)
            else:
                collected += pending
            pending = []
        elif text.startswith("- "):
            # Drop the dash-and-space marker, then trim what is left.
            pending.append(text[2:].strip())

    # An unterminated final section only counts in only_last mode.
    if inside and only_last:
        collected = pending

    return collected




[docs]
def extract_params(
    file_path: str, json_file=os.path.join(os.path.dirname(__file__), "json/Quant/quant_lfq_DDA_ion.json")
) -> ProteoBenchParameters:
    """
    Read a PEAKS settings file, extract parameters, and return them as a `ProteoBenchParameters` object.

    Parameters
    ----------
    file_path : str
        The path to the PEAKS settings file. An object exposing a ``read()``
        method is also accepted; its content may be ``str`` or UTF-8 encoded
        bytes.
    json_file : str
        Path passed as ``filename`` to `ProteoBenchParameters`. Defaults to
        ``json/Quant/quant_lfq_DDA_ion.json`` next to this module.

    Returns
    -------
    ProteoBenchParameters
        The extracted parameters encapsulated in a `ProteoBenchParameters` object.

    Raises
    ------
    IOError
        If ``file_path`` is a path and the file cannot be opened or read.
    TypeError
        If "Max Missed Cleavage:" or "Max Variable PTM per Peptide:" is absent
        (``int()`` receives None).
    AttributeError
        If neither peptide-length key, or neither precursor-charge key, is
        present in the file.
    ValueError
        If a value that must be an integer cannot be parsed as one.
    """
    if hasattr(file_path, "read"):
        content = file_path.read()
        # Binary streams return bytes and need decoding; text streams already return str.
        if not isinstance(content, str):
            content = content.decode("utf-8")
        lines = content.splitlines()
    else:
        try:
            with open(file_path, encoding="utf-8") as f:
                lines = f.readlines()
        except Exception as e:
            # Chain the original exception so the root cause stays in the traceback.
            raise IOError(f"Failed to open or read the file at {file_path}. Error: {e}") from e

    lines = [line.strip() for line in lines]

    params = ProteoBenchParameters(filename=json_file)

    params.software_name = "PEAKS"
    params.software_version = extract_value(lines, "PEAKS Version:")
    params.search_engine = "PEAKS"
    params.search_engine_version = params.software_version

    psm_fdr = extract_value(lines, "Precursor FDR:")
    # Its either "Precursor FDR:" (DIA) or "PSM FDR:" (DDA)
    if not psm_fdr:
        psm_fdr = extract_value(lines, "PSM FDR:")
    peptide_fdr = extract_value(lines, "Peptide FDR:")
    params.ident_fdr_peptide = peptide_fdr
    params.ident_fdr_psm = psm_fdr
    # peaks uses  Proteins -10LgP >= 15.0  instead of FDR
    protein_fdr = extract_value(lines, "Protein Group FDR:")
    params.ident_fdr_protein = protein_fdr
    params.enable_match_between_runs = extract_value(lines, "Match Between Run:") == "Yes"
    params.precursor_mass_tolerance = extract_mass_tolerance(lines, "Precursor Mass Error Tolerance:")
    params.fragment_mass_tolerance = extract_mass_tolerance(lines, "Fragment Mass Error Tolerance:")
    params.enzyme = extract_value(lines, "Enzyme:")
    # Anything other than "Specific" (including a missing entry) counts as semi-enzymatic.
    params.semi_enzymatic = extract_value(lines, "Digest Mode:") != "Specific"
    params.allowed_miscleavages = int(extract_value(lines, "Max Missed Cleavage:"))

    # Two spellings are handled: "Peptide Length between: a,b" and
    # "Peptide Length Range: a - b". A missing first key yields None, whose
    # .split raises AttributeError and triggers the fallback.
    try:
        peptide_length_range = extract_value(lines, "Peptide Length between:").split(",")
    except AttributeError:
        peptide_length_range = extract_value(lines, "Peptide Length Range:").split(" - ")
    params.max_peptide_length = int(peptide_length_range[1])
    params.min_peptide_length = int(peptide_length_range[0])

    # only_last=True: keep the modifications of the last such section in the file.
    fixed = get_items_between(lines, "Fixed Modifications:", "Variable Modifications:", only_last=True)
    params.fixed_mods = " ,".join(fixed)
    varmods = get_items_between(lines, "Variable Modifications:", "Database:", only_last=True)
    params.variable_mods = " ,".join(varmods)
    params.max_mods = int(extract_value(lines, "Max Variable PTM per Peptide:"))

    # Same two-spelling fallback as for the peptide length; the second form
    # is written with square brackets, e.g. "[a - b]".
    try:
        precursor_charge_between = extract_value(lines, "Precursor Charge between:").split(",")
    except AttributeError:
        precursor_charge_between = (
            extract_value(lines, "Charge between:").replace("[", "").replace("]", "").split(" - ")
        )
    params.min_precursor_charge = int(precursor_charge_between[0])
    params.max_precursor_charge = int(precursor_charge_between[1])

    # If either m/z key is missing, all four m/z bounds are reset to None.
    try:
        precursor_mz_between = extract_value(lines, "Precursor M/Z between:").split(",")
        params.min_precursor_mz = int(precursor_mz_between[0])
        params.max_precursor_mz = int(precursor_mz_between[1])
        fragment_mz_between = extract_value(lines, "Fragment M/Z between:").split(",")
        params.min_fragment_mz = int(fragment_mz_between[0])
        params.max_fragment_mz = int(fragment_mz_between[1])
    except AttributeError:  # DDA
        params.min_precursor_mz = None
        params.max_precursor_mz = None
        params.min_fragment_mz = None
        params.max_fragment_mz = None

    params.scan_window = None

    params.quantification_method = extract_value(
        lines, "LFQ Method:"
    )  # "Quantity MS Level:" or "Protein LFQ Method:" or "Quantity Type:"
    params.protein_inference = None
    params.predictors_library = None
    params.abundance_normalization_ions = extract_value(lines, "Normalization Method:")
    return params



if __name__ == "__main__":
    # Parse each listed PEAKS settings file, write the extracted parameters to
    # a CSV file with the same stem, and echo them to the console.
    settings_files = [
        "../../../test/params/PEAKS_parameters.txt",
        "../../../test/params/PEAKS_parameters_DDA.txt",
        "../../../test/params/PEAKS_parameters_DIA.txt",
        "../../../test/params/PEAKS_parameters_DDA_new.txt",
        "../../../test/params/PEAKS_diaPASEF.txt",
    ]

    for settings_path in settings_files:
        parsed = extract_params(settings_path)

        # One row per parameter attribute, saved next to the input file.
        csv_path = Path(settings_path).with_suffix(".csv")
        pd.Series(parsed.__dict__).to_csv(csv_path)

        print(parsed)