# Source code for proteobench.io.params.proline

"""
Proline Studio is a quantification tool. The search engine is often Mascot.
The parameters are provided per raw file in separate sheets of an Excel file.

Relevant information in sheets:
- "Search settings and infos"
- "Import and filters"
- "Quant config"
- "Dataset statistics and infos" (optional; its "version" entry is used as the software version)
"""

import os
import re
from pathlib import Path
from typing import List

import pandas as pd

from proteobench.io.params import ProteoBenchParameters

# Columns to read from each sheet of the Excel file, keyed by sheet name.
# The mapping is filled sheet by sheet so each column list sits next to its sheet.
use_columns = {}

use_columns["Search settings and infos"] = [
    "software_name",
    "software_version",
    "enzymes",
    "max_missed_cleavages",
    "fixed_ptms",
    "variable_ptms",
    "peptide_charge_states",
    "peptide_mass_error_tolerance",
    "fragment_mass_error_tolerance",
]

use_columns["Import and filters"] = [
    "psm_filter_expected_fdr",
    "psm_filter_2",
]

# No columns are selected from this sheet.
use_columns["Quant config"] = []

# Regular expressions for finding minimum peptide length and charge states.
# The threshold is captured as digits only: a greedy ".*" would run on to the
# last "]" in the string and swallow any bracketed property that follows.
PATTERN_MIN_PEP_LENGTH = r"\[threshold_value=([0-9]+)\]"
# One charge token is a run of digits followed by a single "+" (e.g. "2+").
# A character class such as "[\d+]+" would also match a lone "+" and would
# fuse adjacent tokens like "2+3+" into one match.
PATTERN_CHARGE = r"\d+\+"



# [docs]
def find_charge(string: str) -> List[int]:
    """
    Extract charge states from a string using a regular expression.

    The string is tokenised with ``PATTERN_CHARGE`` and every run of digits
    inside a token is converted to an integer, so ``"2+ and 3+"`` yields
    ``[2, 3]``.

    Parameters
    ----------
    string : str
        The string containing charge states.

    Returns
    -------
    List[int]
        A list of charge states as integers, in order of appearance. The list
        is empty when the string contains no charge state.
    """
    charges: List[int] = []
    for token in re.findall(PATTERN_CHARGE, string):
        # Keep only the digits of each token. Dropping the last character
        # unconditionally (``token[:-1]``) truncated tokens that do not end in
        # "+" and raised ``ValueError`` when no digit was left.
        charges.extend(int(digits) for digits in re.findall(r"\d+", token))
    return charges




# [docs]
def find_min_pep_length(string: str) -> int:
    """
    Parse the minimum peptide length out of a filter description.

    The value is taken from the first match of ``PATTERN_MIN_PEP_LENGTH`` in
    the text (the pattern's single capture group).

    Parameters
    ----------
    string : str
        The text that contains the minimum peptide length.

    Returns
    -------
    int
        The minimum peptide length.

    Raises
    ------
    IndexError
        If the pattern does not occur in ``string``.
    ValueError
        If the captured text cannot be converted to an integer.
    """
    # findall returns the captured group of every match; only the first one is used.
    return int(re.findall(PATTERN_MIN_PEP_LENGTH, string)[0])




# [docs]
def extract_params(
    fname: str, json_file=os.path.join(os.path.dirname(__file__), "json/Quant/quant_lfq_DDA_ion.json")
) -> ProteoBenchParameters:
    """
    Parse Proline Studio parameter file (Excel) and extract relevant parameters.

    Parameters
    ----------
    fname : str
        The path to the Proline Studio Excel parameter file.
    json_file : str, optional
        Passed as ``filename`` to `ProteoBenchParameters`. Defaults to
        ``json/Quant/quant_lfq_DDA_ion.json`` in the directory of this module.

    Returns
    -------
    ProteoBenchParameters
        The extracted parameters encapsulated in a `ProteoBenchParameters` object.

    Raises
    ------
    AssertionError
        If one of the columns listed in ``use_columns`` holds more than one
        distinct value in the "Search settings and infos" or
        "Import and filters" sheet.
    """
    params = ProteoBenchParameters(filename=json_file)

    # The context manager closes the workbook even when parsing fails part-way.
    with pd.ExcelFile(fname) as excel:
        # Parse the "Search settings and infos" sheet
        sheet_name = "Search settings and infos"
        cols = use_columns[sheet_name]
        # Transposed so that the entries of the first column become the column labels.
        sheet = excel.parse(sheet_name, dtype="object", index_col=0).T
        idx = sheet["quant_channel_name"].to_list()
        stats = sheet.describe()

        # Validate unique entries in the selected columns. Raised explicitly
        # (rather than with ``assert``) so the check also runs under ``python -O``.
        if not all(stats.loc["unique", cols] == 1):
            raise AssertionError("Not all columns are unique")

        # Filter and reset index
        sheet = sheet[cols].drop_duplicates().reset_index(drop=True)

        # Extract relevant parameters from the sheet
        params.software_name = "ProlineStudio"
        params.search_engine = sheet.loc[0, "software_name"]
        params.search_engine_version = sheet.loc[0, "software_version"]
        params.enzyme = sheet.loc[0, "enzymes"]
        params.allowed_miscleavages = sheet.loc[0, "max_missed_cleavages"]
        params.fixed_mods = sheet.loc[0, "fixed_ptms"]
        params.variable_mods = sheet.loc[0, "variable_ptms"]
        # Tolerances are stored as a symmetric "[-tol, tol]" string.
        _precursor_mass_tolerance = sheet.loc[0, "peptide_mass_error_tolerance"]
        params.precursor_mass_tolerance = f"[-{_precursor_mass_tolerance}, {_precursor_mass_tolerance}]"
        _fragment_mass_tolerance = sheet.loc[0, "fragment_mass_error_tolerance"]
        params.fragment_mass_tolerance = f"[-{_fragment_mass_tolerance}, {_fragment_mass_tolerance}]"

        # Extract charge states and set min/max precursor charge
        charges = find_charge(sheet.loc[0, "peptide_charge_states"])
        params.min_precursor_charge = min(charges)
        params.max_precursor_charge = max(charges)

        # Parse the "Import and filters" sheet, restricted to the rows named by
        # the quant channel names collected from the first sheet.
        sheet_name = "Import and filters"
        cols = use_columns[sheet_name]
        sheet = excel.parse(sheet_name, dtype="object", index_col=0).T.loc[idx, cols]
        stats = sheet.describe()
        if not all(stats.loc["unique", cols] == 1):
            raise AssertionError("Not all columns are unique")
        sheet = sheet[cols].drop_duplicates().reset_index(drop=True)

        # Extract FDR and peptide length information. The FDR is divided by 100
        # when it converts to an integer; otherwise the raw cell value is kept.
        try:
            params.ident_fdr_psm = int(sheet.loc[0, "psm_filter_expected_fdr"]) / 100
        except ValueError:
            params.ident_fdr_psm = sheet.loc[0, "psm_filter_expected_fdr"]
        params.min_peptide_length = find_min_pep_length(sheet.loc[0, "psm_filter_2"])

        # Parse the "Quant config" sheet for match between runs (MBR) information:
        # MBR is reported as enabled when any row label mentions "cross assignment".
        sheet_name = "Quant config"
        sheet = excel.parse(sheet_name, dtype="object", index_col=0)
        enable_match_between_runs = sheet.index.str.contains("cross assignment").any()
        params.enable_match_between_runs = bool(enable_match_between_runs)

        # Try to extract software version from "Dataset statistics and infos" sheet.
        # This is best effort: a KeyError or ValueError raised while reading the
        # sheet or its "version" entry leaves the software version unset.
        try:
            sheet_name = "Dataset statistics and infos"
            sheet = excel.parse(sheet_name, dtype="object", index_col=0, header=None).squeeze()
            params.software_version = sheet.loc["version"]
        except (KeyError, ValueError):
            pass

    params.fill_none()

    return params



if __name__ == "__main__":
    # Script entry point: extract parameters from the example Proline Studio
    # parameter files and write them to CSV files alongside the inputs.
    example_files = (
        "../../../test/params/Proline_example_w_Mascot_wo_proteinSets.xlsx",
        "../../../test/params/Proline_example_2.xlsx",
        "../../../test/params/ProlineStudio_withMBR.xlsx",
        "../../../test/params/ProlineStudio_241024.xlsx",
    )

    for example in example_files:
        excel_path = Path(example)
        extracted = extract_params(excel_path)

        # The instance attributes become a pandas Series (one entry per
        # parameter), saved under the input's name with a ".csv" suffix.
        pd.Series(vars(extracted)).to_csv(excel_path.with_suffix(".csv"))