Source code for proteobench.io.params.diann

"""
DIA-NN parameter parsing.
"""

import logging
import os
import pathlib
import re
from typing import Any, List, Optional

import pandas as pd
from packaging.version import Version

from proteobench.io.params import ProteoBenchParameters

# Regexes applied to the text of the DIA-NN log file.
# Unless noted otherwise, capture group 1 holds the value of interest.

# Mass accuracies (in ppm) as reported in the log.
fragment_mass_tolerance_regex = r"Optimised mass accuracy: (\d*\.?\d+) ppm"
precursor_mass_tolerance_regex = r"Recommended MS1 mass accuracy setting: (\d*\.?\d+) ppm"
# Version string located between "DIA-NN " and the program's long name.
software_version_regex = r"DIA-NN\s(.*?)\s\(Data-Independent Acquisition by Neural Networks\)"
scan_window_regex = r"Scan window radius set to (\d+)"
# Only matches values written with a decimal point (e.g. "0.01").
fdr_regex = r"Output will be filtered at (\d+\.\d+) FDR"
# Integer limits on peptide length, precursor charge and m/z ranges.
min_pep_len_regex = r"Min peptide length set to (\d+)"
max_pep_len_regex = r"Max peptide length set to (\d+)"
min_z_regex = r"Min precursor charge set to (\d+)"
max_z_regex = r"Max precursor charge set to (\d+)"
min_mz_prec_regex = r"Min precursor m/z set to (\d+)"
max_mz_prec_regex = r"Max precursor m/z set to (\d+)"
min_mz_frag_regex = r"Min fragment m/z set to (\d+)"
max_mz_frag_regex = r"Max fragment m/z set to (\d+)"
# Digestion rules: cleavage sites and the sites excluded from cleavage.
cleavage_regex = r"In silico digest will involve cuts at (.*)"
cleavage_exc_regex = r"But excluding cuts at (.*)"
missed_cleavages_regex = r"Maximum number of missed cleavages set to (\d+)"
max_mods_regex = r"Maximum number of variable modifications set to (\d+)"
# Fixed modifications are matched by two different log phrasings.
fixed_mods_regex_1 = r"(.*) enabled as a fixed modification"
fixed_mods_regex_2 = r"Modification (.*) with mass delta \d+\.*\d* at .+ will be considered as fixed"
var_mods_regex = r"Modification (.*) with mass delta \d+\.*\d* at .+ will be considered as variable"
# Captures everything on the line that precedes " quantification mode".
quant_mode_regex = r"(.*?) quantification mode"
# Captures the text between "Implicit protein grouping: " and the last ";" on the line.
protein_inference_regex = r"Implicit protein grouping: (.*);"

# Flags: used as a presence test (see extract_params), not for value extraction.
enable_match_between_runs_regex = r"(MBR enabled)|(reanalyse them)"  # If present, MBR is enabled

# Maps ProteoBench parameter names to the log-file regex (or list of regexes)
# that recovers them. Both FDR entries share the same pattern.
# NOTE(review): in the code visible in this module only the "fixed_mods" and
# "variable_mods" entries are looked up (by extract_params); confirm whether
# the remaining entries are used elsewhere.
PARAM_REGEX_DICT = {
    "ident_fdr_psm": fdr_regex,
    "ident_fdr_protein": fdr_regex,
    "precursor_mass_tolerance": precursor_mass_tolerance_regex,
    "fragment_mass_tolerance": fragment_mass_tolerance_regex,
    "enzyme": cleavage_regex,
    "allowed_miscleavages": missed_cleavages_regex,
    "min_peptide_length": min_pep_len_regex,
    "max_peptide_length": max_pep_len_regex,
    "fixed_mods": [fixed_mods_regex_1, fixed_mods_regex_2],
    "variable_mods": var_mods_regex,
    "max_mods": max_mods_regex,
    "min_precursor_charge": min_z_regex,
    "max_precursor_charge": max_z_regex,
    "scan_window": scan_window_regex,
    "enable_match_between_runs": enable_match_between_runs_regex,
}


# Maps ProteoBench parameter names to the DIA-NN command-line option that sets
# them, written without the leading "--" (the form produced by
# parse_cmdline_string).
PARAM_CMD_DICT = {
    "ident_fdr_psm": "qvalue",
    "enable_match_between_runs": "reanalyse",
    "precursor_mass_tolerance": "mass-acc-ms1",
    "fragment_mass_tolerance": "mass-acc",
    "enzyme": "cut",
    "allowed_miscleavages": "missed-cleavages",
    "min_peptide_length": "min-pep-len",
    "max_peptide_length": "max-pep-len",
    "min_fragment_mz": "min-fr-mz",
    "max_fragment_mz": "max-fr-mz",
    "min_precursor_mz": "min-pr-mz",
    "max_precursor_mz": "max-pr-mz",
    "fixed_mods": "mod",
    "variable_mods": "var-mod",
    "max_mods": "var-mods",
    "min_precursor_charge": "min-pr-charge",
    "max_precursor_charge": "max-pr-charge",
    "scan_window": "window",
    "protein_inference": "pg-level",
}
# ProteoBench parameters that parse_setting converts to float.
SETTINGS_PB_FLOAT = [
    "ident_fdr_psm",
    "ident_fdr_peptide",
    "ident_fdr_protein",
    "precursor_mass_tolerance",
    "fragment_mass_tolerance",
]
# ProteoBench parameters that parse_setting converts to int.
SETTINGS_PB_INT = [
    "allowed_miscleavages",
    "min_peptide_length",
    "max_peptide_length",
    "max_mods",
    "min_precursor_charge",
    "max_precursor_charge",
    "scan_window",
]
# ProteoBench parameters whose values parse_setting joins with commas.
SETTINGS_PB_MOD = ["fixed_mods", "variable_mods"]

# Maps the text captured by protein_inference_regex to the ProteoBench label;
# extract_params falls back to "Genes" for anything not listed here.
PROT_INF_MAP = {"isoform IDs": "Isoforms", "protein names": "Protein_names", "genes": "Genes"}



[docs]
def find_cmdline_string(lines: List[str]) -> Optional[str]:
    """
    Locate the DIA-NN execution command inside the log file.

    The command is expected to sit on a single line; the first line that
    contains both ``diann`` and ``--`` is taken to be that command.

    Parameters
    ----------
    lines : list[str]
        All input lines from the DIA-NN log file.

    Returns
    -------
    str or None
        The matching line with surrounding whitespace stripped, or ``None``
        when no line qualifies.
    """
    candidates = (entry.strip() for entry in lines if "diann" in entry and "--" in entry)
    return next(candidates, None)




[docs]
def parse_cmdline_string(cmd_line: str, software_version: str) -> dict:
    """
    Parse a DIA-NN command line string into a dictionary of settings.

    The string is split on ``" --"`` and every resulting segment is split on
    whitespace into an option name followed by its values. The segment that
    precedes the first ``" --"`` (normally the executable) is handled like any
    other segment. Empty segments (for example a trailing ``" --"``) are
    skipped.

    Parameters
    ----------
    cmd_line : str
        The command line string to parse.
    software_version : str
        The version of the DIA-NN software, e.g., "1.8". Only the part before
        the first space is interpreted as the version number.

    Returns
    -------
    dict
        Parsed settings in dictionary format. Keys are setting names, and values are:
        - List of inputs for settings that carry values (a repeated option
          keeps only its last occurrence).
        - Boolean `True` for flag-like settings (without values).
        - ``"var-mod"``: list with one entry per ``--var-mod`` occurrence (values
          concatenated, commas replaced by ``/``), plus "Oxidation (M)" for
          ``--unimod35`` on versions below 1.8.
        - ``"mod"``: the values of ``--mod`` when given, otherwise the list of
          modifications collected from ``--unimod*`` flags.

    Raises
    ------
    ValueError
        If a ``unimod`` flag is followed by extra arguments.
    packaging.version.InvalidVersion
        If the leading part of `software_version` is not a valid version.
    """
    settings_dict = {}
    variable_modifications = []
    fixed_modifications = []

    def add_modification(mod_list, setting, description=None):
        """
        Add a modification to the specified list.

        Parameters
        ----------
        mod_list : list
            The list of parsed modifications.
        setting : list
            The whitespace-split setting; must consist of the flag only.
        description : str, optional
            Modification description that is stored instead of the flag name.
        """
        if len(setting) != 1:
            raise ValueError(f"Invalid `unimod` format: {setting}")
        mod_list.append(description or setting[0])

    is_version_below_1_8 = Version(software_version.split(" ")[0]) < Version("1.8")

    for raw_setting in cmd_line.split(" --"):
        setting_parts = raw_setting.split()
        if not setting_parts:
            # Nothing between two separators (or an empty command line):
            # there is no option name to index, so skip the segment.
            continue

        key = setting_parts[0]
        values = setting_parts[1:]

        if key.startswith("unimod"):
            if is_version_below_1_8:
                # Older versions: only these two flags are translated, any
                # other unimod flag is ignored.
                if key == "unimod4":
                    add_modification(fixed_modifications, setting_parts, "Carbamidomethyl (C)")
                elif key == "unimod35":
                    add_modification(variable_modifications, setting_parts, "Oxidation (M)")
            else:
                add_modification(fixed_modifications, setting_parts)

        elif len(setting_parts) == 1:  # Boolean flag
            settings_dict[key] = True

        elif key == "var-mod":  # Handle variable modifications
            variable_modifications.append("".join(values).replace(",", "/"))

        else:  # General key-value settings
            settings_dict[key] = values

    # Add modifications to the settings dictionary
    settings_dict["var-mod"] = variable_modifications
    if "mod" not in settings_dict:
        settings_dict["mod"] = fixed_modifications

    return settings_dict




[docs]
def parse_setting(setting_name: str, setting_list: list) -> Any:
    """
    Convert the raw command-line values of one setting to its final type.

    Parameters
    ----------
    setting_name : str
        The name of the setting (ProteoBench).
    setting_list : list
        The raw values given for that setting.

    Returns
    -------
    Any
        - ``float`` for names listed in ``SETTINGS_PB_FLOAT``
        - ``int`` for names listed in ``SETTINGS_PB_INT``
        - comma-joined string for names listed in ``SETTINGS_PB_MOD``
        - the values concatenated without separator for everything else
    """
    # Numeric settings must come with exactly one value.
    for numeric_names, caster in ((SETTINGS_PB_FLOAT, float), (SETTINGS_PB_INT, int)):
        if setting_name in numeric_names:
            assert len(setting_list) == 1
            return caster(setting_list[0])

    separator = "," if setting_name in SETTINGS_PB_MOD else ""
    return separator.join(setting_list)




[docs]
def extract_with_regex(lines: List[str], regex, search_all=False) -> str:
    """
    Search the log lines for a pattern and return its first capture group.

    Parameters
    ----------
    lines : list[str]
        All input lines from the DIA-NN log file.
    regex : str
        The regex pattern to be matched; it must define capture group 1.
    search_all : bool, optional
        When ``False`` (default) the first matching line wins. When ``True``
        every line is inspected and the last matching line wins.

    Returns
    -------
    str or None
        Capture group 1 of the selected match, or ``None`` if no line matches.
    """
    # Lazily yields group 1 of every matching line, in file order.
    hits = (found.group(1) for found in (re.search(regex, text) for text in lines) if found)

    if not search_all:
        return next(hits, None)

    last_hit = None
    for last_hit in hits:
        pass
    return last_hit




[docs]
def parse_protein_inference_method(cmdline_dict: dict) -> str:
    """
    Parse the protein inference method from the parsed execution command string.

    This setting is defined by disparate setting tags, namely:
    - no-prot-inf: No protein inference
    - pg-level: Code specifies inference method

    ``no-prot-inf`` takes precedence over ``pg-level`` when both are present.

    Parameters
    ----------
    cmdline_dict : dict
        Parsed execution command string.

    Returns
    -------
    str
        The protein inference method.
        Possibilities:
        - Disabled
        - Isoforms
        - Protein_names
        - Genes

    Raises
    ------
    ValueError
        If ``--pg-level`` carries a code other than "0", "1" or "2".
    """
    if "no-prot-inf" in cmdline_dict:
        return "Disabled"

    if "pg-level" not in cmdline_dict:
        # Default value, when --pg-level is not changed in the GUI it does not appear in the command string
        return "Genes"

    pg_setting = cmdline_dict["pg-level"][0]
    pg_level_mapping = {"0": "Isoforms", "1": "Protein_names", "2": "Genes"}
    try:
        return pg_level_mapping[pg_setting]
    except KeyError:
        # The exception used to be constructed without being raised, which made
        # the function silently return None for unknown codes.
        raise ValueError(f"Unexpected setting passed to --pg-level in diann.exe: {pg_setting}") from None




[docs]
def parse_quantification_strategy(cmdline_dict: dict):
    """
    Derive the quantification method from the parsed execution command string.

    The method is signalled by the presence of one of these tags, checked in
    this order:
    - direct-quant: use legacy quantification within DIANN
    - high-acc: QuantUMS high-accuracy setting
    - neither tag: Default is QuantUMS high-precision

    Parameters
    ----------
    cmdline_dict : dict
        Parsed execution command string.

    Returns
    -------
    str
        The quantification method.
        Possibilities:
        - Legacy
        - QuantUMS high-accuracy
        - QuantUMS high-precision
    """
    tag_to_method = (
        ("direct-quant", "Legacy"),
        ("high-acc", "QuantUMS high-accuracy"),
    )
    for tag, method in tag_to_method:
        if tag in cmdline_dict:
            return method
    # Default value
    return "QuantUMS high-precision"




[docs]
def parse_predictors_library(cmdline_dict: dict):
    """
    Determine where the spectral library predictions came from.

    For now, only 'DIANN' and 'User defined speclib' are supported.
    In the future, the user might specify which algorithm was used for library generation.

    Parameters
    ----------
    cmdline_dict : dict
        Parsed execution command string.

    Returns
    -------
    dict or None
        Dictionary specifying algorithm name for RT, IM and MS2_int.
        ``None`` when neither ``predictor`` nor a ``lib`` option with a value
        is present.
    """
    if "predictor" in cmdline_dict:
        source = "DIANN"
    elif "lib" in cmdline_dict and not isinstance(cmdline_dict["lib"], bool):
        # A bare --lib flag (stored as a bool) does not name a library.
        source = "User defined speclib"
    else:
        return None
    return {"RT": source, "IM": source, "MS2_int": source}




[docs]
def extract_cfg_parameter(lines: List[str], regex: str, cast_type: type = str, default=None, search_all=False) -> Any:
    """
    Extract a parameter from the log with a regex and cast it.

    Parameters
    ----------
    lines : list[str]
        All input lines from the DIA-NN log file.
    regex : str
        The regex pattern whose first capture group holds the value.
    cast_type : type, optional
        Callable applied to the captured text (default ``str``).
    default : Any, optional
        Value returned when nothing matches or the cast raises ``ValueError``.
    search_all : bool, optional
        Forwarded to ``extract_with_regex``.

    Returns
    -------
    Any
        The cast value, or `default`.
    """
    raw_value = extract_with_regex(lines, regex, search_all=search_all)
    if raw_value is not None:
        try:
            return cast_type(raw_value)
        except ValueError:
            pass
    return default




[docs]
def extract_modifications(lines: List[str], regexes: List[str]) -> Optional[str]:
    """
    Collect modification names from the log and join them with commas.

    Parameters
    ----------
    lines : list[str]
        All input lines from the DIA-NN log file.
    regexes : list[str]
        Patterns whose first capture group holds a modification name. They
        are applied one after another to the newline-joined log text.

    Returns
    -------
    str or None
        The captured names (newlines removed) joined by ``,`` in pattern
        order, or ``None`` if no pattern matched.
    """
    log_text = "\n".join(lines)
    found = [
        hit.group(1).replace("\n", "")
        for pattern in regexes
        for hit in re.finditer(pattern, log_text)
    ]
    if not found:
        return None
    return ",".join(found)




[docs]
def _format_ppm_tolerance(tolerance: Any) -> Optional[str]:
    """Format a symmetric ppm tolerance as ``[-x ppm, x ppm]``; ``None`` stays ``None``."""
    if tolerance is None:
        return None
    return f"[-{tolerance} ppm, {tolerance} ppm]"


def extract_params(
    fname: str, json=os.path.join(os.path.dirname(__file__), "json/Quant/quant_lfq_DIA_ion.json")
) -> ProteoBenchParameters:
    """
    Parse DIA-NN log file and extract relevant parameters.

    Logic:
    1. Read the log file and extract the software version.
    2. Find the command line string that was used to run DIA-NN.
    3. Parse the command line string to extract settings.
    Default values are set for parameters that are not specified in the command line.
    4. If the --cfg flag is used (meaning a configuration file was used),
      the parameters are parsed from the free text underneath the cmd line.


    Parameters
    ----------
    fname : str
        Parameter file name path. An object exposing ``read()`` (returning
        UTF-8 encoded bytes or text) is accepted as well.
    json : str, optional
        Path handed to ``ProteoBenchParameters`` as its ``filename`` argument.

    Returns
    -------
    ProteoBenchParameters
        The parsed ProteoBenchParameters object.

    Raises
    ------
    ValueError
        If the log contains no DIA-NN version line or no command line.
    OSError
        If `fname` is a path that cannot be opened.
    """
    logger = logging.getLogger(__name__)
    logger.debug("JSON file used for DIA-NN parameters: %s", json)

    # Some default and flag settings
    parameters = {
        "software_name": "DIA-NN",
        "search_engine": "DIA-NN",
        "enable_match_between_runs": False,
        "quantification_method": "QuantUMS high-precision",
        "protein_inference": "Genes",  # Default value, if not specified in the command line
        "min_precursor_charge": 1,
        "max_precursor_charge": 4,
        "min_peptide_length": 7,
        "max_peptide_length": 30,
        "min_fragment_mz": 200,
        "max_fragment_mz": 1800,
        "min_precursor_mz": 300,
        "max_precursor_mz": 1800,
    }

    # Read in the log file. File-like objects are detected explicitly instead
    # of through a bare ``except``, so genuine I/O errors on a path are not
    # masked by an unrelated AttributeError.
    if hasattr(fname, "read"):
        raw_log = fname.read()
        log_text = raw_log.decode("utf-8") if isinstance(raw_log, (bytes, bytearray)) else raw_log
        lines = log_text.splitlines()
    else:
        with open(fname) as f:
            lines = f.readlines()

    # Extract software versions from the log file.
    software_version = search_engine_version = extract_with_regex(lines, software_version_regex)
    if software_version is None:
        raise ValueError("Could not find the DIA-NN version in the log file.")
    parameters["software_version"] = software_version
    parameters["search_engine_version"] = search_engine_version

    # Get settings from the execution command string
    cmdline_string = find_cmdline_string(lines)
    if cmdline_string is None:
        raise ValueError("Could not find the DIA-NN command line in the log file.")
    # If a configuration file was used, the parameters are specified in the free text below the cmd line.
    cfg_used = "--cfg" in cmdline_string
    cmdline_dict = parse_cmdline_string(cmdline_string, software_version)

    # Parse most settings as possible from the execution command using PARAM_CMD_DICT for mapping.
    for proteobench_setting, cmd_setting in PARAM_CMD_DICT.items():
        if cmd_setting not in cmdline_dict:
            continue
        cmd_value = cmdline_dict[cmd_setting]
        if isinstance(cmd_value, bool):
            parameters[proteobench_setting] = cmd_value
        else:
            parameters[proteobench_setting] = parse_setting(proteobench_setting, cmd_value)

    # Settings derived from several tags. They are assigned after the generic
    # mapping above: PARAM_CMD_DICT also maps "protein_inference" to
    # "pg-level", and running the generic loop last would replace the
    # readable label with the raw --pg-level code.
    parameters["quantification_method"] = parse_quantification_strategy(cmdline_dict)
    parameters["protein_inference"] = parse_protein_inference_method(cmdline_dict)
    parameters["predictors_library"] = parse_predictors_library(cmdline_dict)

    # Parse cut parameter to standard enzyme name
    if "enzyme" not in parameters:  # This happens when running fragpipe-diann
        parameters["enzyme"] = "cut"
    elif parameters["enzyme"] == "K*,R*":
        parameters["enzyme"] = "Trypsin/P"
    elif parameters["enzyme"] == "K*,R*,!*P":
        parameters["enzyme"] = "Trypsin"

    # If a mass accuracy flag is not present in the cmdline string, extract the
    # value from the log file. When neither source provides it the tolerance
    # is left as None.
    for tolerance_key, tolerance_regex in (
        ("fragment_mass_tolerance", fragment_mass_tolerance_regex),
        ("precursor_mass_tolerance", precursor_mass_tolerance_regex),
    ):
        tolerance = parameters.get(tolerance_key)
        if tolerance is None:
            tolerance = extract_with_regex(lines, tolerance_regex)
        parameters[tolerance_key] = _format_ppm_tolerance(tolerance)

    # The scan window reported in the log takes precedence; without such a
    # line the command-line value (if any) is kept, otherwise None.
    scan_window = extract_with_regex(lines, scan_window_regex)
    if scan_window is not None:
        parameters["scan_window"] = int(scan_window)
    else:
        parameters.setdefault("scan_window", None)
    parameters["abundance_normalization_ions"] = None

    # If cfg file is used, extract the parameters from the free text below the cmd line.
    if cfg_used:
        logger.debug("Extracting parameters from the configuration file.")
        parameters.update(
            {
                "ident_fdr_psm": extract_cfg_parameter(lines, fdr_regex, float),
                "ident_fdr_protein": None,
                "enable_match_between_runs": bool(re.search(enable_match_between_runs_regex, "".join(lines))),
                "enzyme": (
                    f"{extract_cfg_parameter(lines, cleavage_regex) or ''},!{extract_cfg_parameter(lines, cleavage_exc_regex) or ''}"
                ),
                "allowed_miscleavages": extract_cfg_parameter(lines, missed_cleavages_regex, int),
                "min_peptide_length": extract_cfg_parameter(lines, min_pep_len_regex, int),
                "max_peptide_length": extract_cfg_parameter(lines, max_pep_len_regex, int),
                "min_precursor_charge": extract_cfg_parameter(lines, min_z_regex, int),
                "max_precursor_charge": extract_cfg_parameter(lines, max_z_regex, int),
                "max_mods": extract_cfg_parameter(lines, max_mods_regex, int),
                "quantification_method": extract_cfg_parameter(
                    lines, quant_mode_regex, str, "QuantUMS high-precision", search_all=True
                ),
                "fixed_mods": extract_modifications(lines, PARAM_REGEX_DICT["fixed_mods"]),
                "variable_mods": extract_modifications(lines, [PARAM_REGEX_DICT["variable_mods"]]),
                "min_fragment_mz": extract_cfg_parameter(lines, min_mz_frag_regex, int),
                "max_fragment_mz": extract_cfg_parameter(lines, max_mz_frag_regex, int),
                "min_precursor_mz": extract_cfg_parameter(lines, min_mz_prec_regex, int),
                "max_precursor_mz": extract_cfg_parameter(lines, max_mz_prec_regex, int),
            }
        )

        protein_inference = extract_cfg_parameter(lines, protein_inference_regex)
        parameters["protein_inference"] = PROT_INF_MAP.get(protein_inference, "Genes")

    return ProteoBenchParameters(**parameters, filename=json)



if __name__ == "__main__":
    # Manual smoke run: parse each example log and write the extracted
    # parameters next to it as a CSV file.
    example_logs = (
        "../../../test/params/DIANN_output_20240229_report.log.txt",
        "../../../test/params/Version1_9_Predicted_Library_report.log.txt",
        "../../../test/params/DIANN_WU304578_report.log.txt",
        "../../../test/params/DIANN_1.7.16.log.txt",
        "../../../test/params/DIANN_cfg_settings.txt",
        "../../../test/params/DIANN_cfg_MBR.txt",
        "../../../test/params/DIA-NN_cfg_directq.txt",
    )
    for log_name in example_logs:
        log_path = pathlib.Path(log_name)
        parsed = extract_params(log_path)
        summary = pd.Series(parsed.__dict__)
        print(summary)
        summary.to_csv(log_path.with_suffix(".csv"))