Source code for proteobench.io.params.metamorpheus

"""
Extract parameters from a MetaMorpheus TOML file and convert them to a pandas Series.
"""

import os
import tomllib as toml
from io import BytesIO
from pathlib import Path, PosixPath
from typing import IO, Tuple, Union

import pandas as pd

from proteobench.io.params import ProteoBenchParameters



[docs]
def load_files(file1: Union[str, IO], file2: Union[str, IO]) -> Tuple[Union[str, None], Union[dict, None]]:
    """
    Load two files (IO objects or file paths), returning:
    - The first line from a plain text file as the version string
    - A dictionary parsed from a TOML file

    Returns
    -------
    Tuple[Union[str, None], Union[dict, None]]
        versions_line, settings_dict
    """
    versions_line = None
    settings = None

    def try_parse(file: Union[str, IO]):
        nonlocal versions_line, settings

        # Case 1: Path
        if isinstance(file, (str, PosixPath, Path)):
            # Try TOML
            try:
                with open(file, "rb") as f:
                    settings_candidate = toml.load(f)
                    settings = settings_candidate
                    return
            except Exception:
                pass
            # Try version line
            try:
                with open(file, "r", encoding="utf-8") as f:
                    versions_line = f.readline().strip()
                    return
            except Exception:
                pass

        # Case 2: IO object
        elif hasattr(file, "read"):
            try:
                file.seek(0)
                # Try loading directly (only works if binary)
                settings_candidate = toml.load(file)
                settings = settings_candidate
                return
            except Exception:
                pass
            try:
                # Try to convert to binary buffer if in text mode
                file.seek(0)
                content = file.read()
                if isinstance(content, str):
                    buffer = BytesIO(content.encode("utf-8"))
                    settings_candidate = toml.load(buffer)
                    settings = settings_candidate
                    return
            except Exception:
                pass
            try:
                file.seek(0)
                line = file.readline()
                if isinstance(line, bytes):
                    line = line.decode("utf-8", errors="replace")
                versions_line = line.strip()
                return
            except Exception:
                pass

    for f in (file1, file2):
        try_parse(f)

    if versions_line and settings:
        print("Successfully parsed both versions and settings.")
    else:
        print("Could not identify both versions and settings from the provided files.")

    return versions_line, settings




[docs]
def parse_modifications(mods: str) -> Union[str, list]:
    r"""
    Extract the modification names from a MetaMorpheus modification string.

    Entries are separated by a double tab; inside an entry the fields are
    separated by a single tab and the second field is the modification name.

    Parameters
    ----------
    mods : str
        Modifications in string format
        (e.g., "Common Fixed\tCarbamidomethyl on C\t\tCommon Fixed\tCarbamidomethyl on U").
        May be empty when no modification is configured.

    Returns
    -------
    Union[str, list]
        The modification names joined with ";" (e.g.
        "Carbamidomethyl on C;Carbamidomethyl on U"), or an empty list when
        ``mods`` contains no entry.

    Raises
    ------
    IndexError
        If a non-empty entry does not contain a tab-separated second field.
    """
    if not mods:
        return []

    # Blank entries (empty input, trailing separator) carry no modification
    # and would otherwise fail on the [1] lookup.
    names = [entry.split("\t")[1] for entry in mods.split("\t\t") if entry.strip()]

    return ";".join(names) if names else []




[docs]
def format_tolerances(tolerance: str) -> str:
    """
    Turn a symmetric MetaMorpheus mass tolerance into an explicit interval string.

    Parameters
    ----------
    tolerance : str
        Tolerance made of a value and a unit separated by whitespace,
        the value optionally wrapped in "±" (e.g., "±20.0000 PPM").

    Returns
    -------
    str
        Interval with the value rendered to two decimals on both sides
        (e.g., "[-20.00 PPM, 20.00 PPM]").

    Raises
    ------
    ValueError
        If the input does not consist of exactly two whitespace-separated
        parts, or the value part is not a number.
    """
    magnitude_text, unit = tolerance.split()
    magnitude = float(magnitude_text.strip("±"))
    # The same "<value> <unit>" text is used for the lower and the upper bound.
    bound = f"{magnitude:.2f} {unit}"
    return f"[-{bound}, {bound}]"




[docs]
def extract_params(file_path_1, file_path_2) -> ProteoBenchParameters:
    """
    Build a ProteoBenchParameters object from a MetaMorpheus settings/version file pair.

    Both inputs are handed to ``load_files``, which decides by content which
    one is the TOML settings file and which one carries the version line.

    Parameters
    ----------
    file_path_1, file_path_2
        Passed unchanged to ``load_files`` (file paths or IO objects).

    Returns
    -------
    ProteoBenchParameters
        Parameters populated from the version line and the
        "CommonParameters" / "SearchParameters" tables of the settings.
    """
    params = ProteoBenchParameters()

    versions_line, settings = load_files(file_path_1, file_path_2)

    params.software_name = params.search_engine = "MetaMorpheus"
    # The version is the third whitespace-separated token of the version line.
    params.software_version = versions_line.split()[2]

    common = settings["CommonParameters"]
    digestion = common["DigestionParams"]
    params.enzyme = digestion["Protease"]
    params.semi_specific = digestion["FragmentationTerminus"] != "Both"
    params.allowed_miscleavages = digestion["MaxMissedCleavages"]
    params.fixed_mods = parse_modifications(common["ListOfModsFixed"])
    params.variable_mods = parse_modifications(common["ListOfModsVariable"])
    params.precursor_mass_tolerance = format_tolerances(common["PrecursorMassTolerance"])
    params.fragment_mass_tolerance = format_tolerances(common["ProductMassTolerance"])
    params.min_peptide_length = digestion["MinPeptideLength"]
    params.max_peptide_length = digestion["MaxPeptideLength"]
    params.max_mods = digestion["MaxModsForPeptide"]

    deconvolution = common["PrecursorDeconvolutionParameters"]
    params.min_precursor_charge = deconvolution["MinAssumedChargeState"]
    params.max_precursor_charge = deconvolution["MaxAssumedChargeState"]

    search = settings["SearchParameters"]
    params.enable_match_between_runs = bool(search["MatchBetweenRuns"])
    params.quantification_method = "FlashLFQ"
    # Equality with True (not plain truthiness) is kept on purpose so that
    # non-boolean values are classified exactly as before.
    if search["DoParsimony"] == True:  # noqa: E712
        params.protein_inference = "Parsimony"
    else:
        params.protein_inference = None
    params.abundance_normalization_ions = bool(search["Normalize"] == True)  # noqa: E712
    params.ident_fdr_psm = f"{common['QValueThreshold']}"

    # Not filled in from the MetaMorpheus files.
    params.ident_fdr_peptide = None
    params.ident_fdr_protein = None
    params.search_engine_version = None
    return params



if __name__ == "__main__":
    # Manual smoke run: parse the test fixtures in both argument orders, once
    # from paths and once from open file objects, and dump the result to CSV.
    # The relative paths are resolved against the current working directory.
    config_toml = "../../../test/params/metamorpheus_search_task_config.toml"
    version_txt = "../../../test/params/metamorpheus_version_result.txt"
    input_pairs = (
        (config_toml, version_txt),
        # Reverse order
        (version_txt, config_toml),
    )

    for first_path, second_path in input_pairs:
        # Extract parameters from the file paths
        parameters = extract_params(first_path, second_path)
        print(parameters.__dict__)
        print("\n")

        # With streamlit the IO object is used -> open files
        with open(first_path, "r") as first_handle, open(second_path, "r") as second_handle:
            parameters = extract_params(first_handle, second_handle)

            first_handle.seek(0)
            second_handle.seek(0)
        print("\n")
        print(parameters.__dict__)

        # The CSV is rewritten on every iteration; the last pair's result remains.
        pd.Series(parameters.__dict__).to_csv("../../../test/params/metamorpheus_parameters.csv")
        print("\n")