# Source code for proteobench.io.params.peaks

"""
Peaks parameter parsing.
"""

import re
from pathlib import Path
from typing import List, Optional

import pandas as pd
import yaml

from proteobench.io.params import ProteoBenchParameters



# [docs]
def clean_text(text: str) -> str:
    """
    Strip separator characters from both ends of a string.

    Every run of whitespace (tabs and newlines included), colons, or commas
    at the very start or the very end of ``text`` is removed. Characters in
    the interior of the string are left untouched.

    Parameters
    ----------
    text : str
        The raw text to tidy up.

    Returns
    -------
    str
        ``text`` without leading/trailing whitespace, colons, and commas.
    """
    # One alternation handles both edges: the left branch is anchored at the
    # start of the string, the right branch at its end.
    edge_separators = r"^[\s:,\t]+|[\s:,\t]+$"
    return re.sub(edge_separators, "", text)




# [docs]
def extract_value(lines: List[str], search_term: str) -> Optional[str]:
    """
    Return the cleaned text that follows ``search_term`` on the first line containing it.

    Parameters
    ----------
    lines : List[str]
        The lines to scan, in order.
    search_term : str
        The literal text to look for (plain substring match, not a regex).

    Returns
    -------
    Optional[str]
        Everything after the first occurrence of ``search_term`` on the first
        matching line, passed through `clean_text`. None if no line contains
        ``search_term`` (or if the first matching line is an empty string).
    """
    for candidate in lines:
        if search_term not in candidate:
            continue
        # Only the first matching line is ever considered; an empty one
        # yields None rather than moving on to later lines.
        if not candidate:
            return None
        _, remainder = candidate.split(search_term, 1)
        return clean_text(remainder)
    return None




# [docs]
def extract_mass_tolerance(lines: List[str], search_term: str) -> Optional[str]:
    """
    Extract the mass tolerance value associated with a search term, with special handling for "System Default".

    The first line containing ``search_term`` is used. Everything after the
    first occurrence of ``search_term`` on that line is cleaned with
    `clean_text`. If the cleaned value is exactly ``"System Default"``, the
    fixed string ``"40 ppm"`` is returned in its place.

    Parameters
    ----------
    lines : List[str]
        The list of lines to search through.
    search_term : str
        The term to search for in the lines (plain substring match).

    Returns
    -------
    Optional[str]
        The extracted mass tolerance value, or None if the search term is not found.
    """
    # maxsplit=1 keeps the whole remainder of the line as the value, matching
    # how `extract_value` splits; without it a value that happens to contain
    # the search term again would be cut short.
    value = next(
        (clean_text(line.split(search_term, 1)[1]) for line in lines if search_term in line),
        None,
    )
    # "System Default" carries no numeric tolerance, so substitute the fixed fallback.
    return "40 ppm" if value == "System Default" else value




# [docs]
def extract_value_regex(lines: List[str], search_term: str) -> Optional[str]:
    """
    Return the cleaned text following a regular-expression match on the first matching line.

    Parameters
    ----------
    lines : List[str]
        The lines to scan, in order.
    search_term : str
        The regular expression to look for in each line.

    Returns
    -------
    Optional[str]
        Element 1 of ``re.split(search_term, line)`` for the first line the
        pattern matches, passed through `clean_text`. That is the text between
        the first and second match, or the first captured group when the
        pattern contains capturing groups. None if no line matches.
    """
    for line in lines:
        if not re.search(search_term, line):
            continue
        pieces = re.split(search_term, line)
        return clean_text(pieces[1])
    return None




# [docs]
def get_items_between(lines: list, start: str, end: str, only_last: bool = False) -> list:
    """
    Collect the "- " bullet items that appear between a start marker and an end marker.

    Each line is stripped of surrounding whitespace before it is inspected. A
    line beginning with ``start`` opens a section, and a line beginning with
    ``end`` closes it. While a section is open, every line beginning with
    ``"- "`` (dash followed by a space) is recorded without that prefix.

    A ``start`` line seen while a section is already open restarts the
    section, discarding what was gathered since the previous ``start``. If
    the input ends while a section is still open, the end of input is treated
    as the closing marker, so the pending items are kept.

    Parameters
    ----------
    lines : list
        The list of lines to search through.
    start : str
        Prefix of the line that opens a section.
    end : str
        Prefix of the line that closes a section.
    only_last : bool
        If True, only the items of the last section are returned. If False,
        the items of all sections are concatenated in order.

    Returns
    -------
    list
        The items found, as strings without the leading dash.
    """
    collected = []
    section = []
    in_section = False

    for line in lines:
        stripped = line.strip()

        if stripped.startswith(start):
            # (Re)open a section; items from an unclosed earlier start are dropped.
            in_section = True
            section = []
        elif in_section and stripped.startswith(end):
            in_section = False
            if only_last:
                collected = section
            else:
                collected.extend(section)
            section = []
        elif in_section and stripped.startswith("- "):
            # Drop the "- " prefix and any padding around the item text.
            section.append(stripped[2:].strip())

    if in_section:
        # Input ended before the end marker: close the open section the same
        # way for both modes instead of dropping it when only_last is False.
        if only_last:
            collected = section
        else:
            collected.extend(section)

    return collected




# [docs]
def read_peaks_settings(file_path: str) -> ProteoBenchParameters:
    """
    Read a PEAKS settings file, extract parameters, and return them as a `ProteoBenchParameters` object.

    Parameters
    ----------
    file_path : str
        The path to the PEAKS settings file. An object exposing a ``read()``
        method is accepted as well; its ``read()`` result is decoded as UTF-8,
        so it must return bytes.

    Returns
    -------
    ProteoBenchParameters
        The extracted parameters encapsulated in a `ProteoBenchParameters` object.

    Raises
    ------
    IOError
        If ``file_path`` is a path and the file cannot be opened or read.
    TypeError
        If "Max Missed Cleavage:" or "Max Variable PTM per Peptide:" is absent
        (``int(None)``).
    AttributeError
        If neither spelling of the peptide length range or of the precursor
        charge range is present.
    """
    if hasattr(file_path, "read"):
        # File-like input (e.g. an uploaded file): read bytes and decode.
        lines = file_path.read().decode("utf-8").splitlines()
    else:
        try:
            with open(file_path, encoding="utf-8") as f:
                lines = f.readlines()
        except Exception as e:
            # Chain the original exception so the root cause stays in the traceback.
            raise IOError(f"Failed to open or read the file at {file_path}. Error: {e}") from e

    lines = [line.strip() for line in lines]

    params = ProteoBenchParameters()

    params.software_name = "PEAKS"
    params.software_version = extract_value(lines, "PEAKS Version:")
    params.search_engine = "PEAKS"
    params.search_engine_version = params.software_version

    # Its either "Precursor FDR:" (DIA) or "PSM FDR:" (DDA).
    # Previously the "Precursor FDR:" result was unconditionally overwritten by
    # the "PSM FDR:" lookup, so it was always lost; use it as the fallback.
    psm_fdr = extract_value(lines, "PSM FDR:")
    if psm_fdr is None:
        psm_fdr = extract_value(lines, "Precursor FDR:")
    peptide_fdr = extract_value(lines, "Peptide FDR:")
    params.ident_fdr_peptide = peptide_fdr
    params.ident_fdr_psm = psm_fdr
    # peaks uses  Proteins -10LgP >= 15.0  instead of FDR
    params.ident_fdr_protein = None
    params.enable_match_between_runs = extract_value(lines, "Match Between Run:") == "Yes"
    params.precursor_mass_tolerance = extract_mass_tolerance(lines, "Precursor Mass Error Tolerance:")
    params.fragment_mass_tolerance = extract_mass_tolerance(lines, "Fragment Mass Error Tolerance:")
    params.enzyme = extract_value(lines, "Enzyme:")
    params.allowed_miscleavages = int(extract_value(lines, "Max Missed Cleavage:"))

    # Two spellings are handled: "Peptide Length between: a,b" and
    # "Peptide Length Range: a - b". The AttributeError comes from calling
    # .split on None when the first key is missing.
    try:
        peptide_length_range = extract_value(lines, "Peptide Length between:").split(",")
    except AttributeError:
        peptide_length_range = extract_value(lines, "Peptide Length Range:").split(" - ")
    params.max_peptide_length = int(peptide_length_range[1])
    params.min_peptide_length = int(peptide_length_range[0])

    # NOTE(review): the separator is " ," (space before the comma). Kept as-is
    # because stored outputs may depend on it; confirm whether ", " was intended.
    fixed = get_items_between(lines, "Fixed Modifications:", "Variable Modifications:", only_last=True)
    params.fixed_mods = " ,".join(fixed)
    varmods = get_items_between(lines, "Variable Modifications:", "Database:", only_last=True)
    params.variable_mods = " ,".join(varmods)
    params.max_mods = int(extract_value(lines, "Max Variable PTM per Peptide:"))

    # Same two-spelling fallback as above; the second form may be bracketed,
    # e.g. "[a - b]", so the brackets are removed before splitting.
    try:
        precursor_charge_between = extract_value(lines, "Precursor Charge between:").split(",")
    except AttributeError:
        precursor_charge_between = (
            extract_value(lines, "Charge between:").replace("[", "").replace("]", "").split(" - ")
        )
    params.min_precursor_charge = int(precursor_charge_between[0])
    params.max_precursor_charge = int(precursor_charge_between[1])

    params.scan_window = None

    params.quantification_method = extract_value(
        lines, "LFQ Method:"
    )  # "Quantity MS Level:" or "Protein LFQ Method:" or "Quantity Type:"
    params.protein_inference = None
    params.predictors_library = None
    params.abundance_normalization_ions = extract_value(lines, "Normalization Method:")
    return params



if __name__ == "__main__":
    # Developer helper: parse each bundled PEAKS settings file, write the
    # parsed parameters next to it as a CSV, and echo them to the console.
    settings_files = [
        "../../../test/params/PEAKS_parameters.txt",
        "../../../test/params/PEAKS_parameters_DDA.txt",
        "../../../test/params/PEAKS_parameters_DIA.txt",
        "../../../test/params/PEAKS_parameters_DDA_new.txt",
    ]

    for settings_file in settings_files:
        parsed = read_peaks_settings(settings_file)

        # One row per parameter; the CSV shares the input's name with a .csv suffix.
        as_series = pd.Series(parsed.__dict__)
        csv_path = Path(settings_file).with_suffix(".csv")
        as_series.to_csv(csv_path)

        print(parsed)