# Source code for proteobench.io.params.diann

"""
DIA-NN parameter parsing.
"""

import os
import pathlib
import re
from typing import Any, List, Optional

import pandas as pd
from packaging.version import Version

from proteobench.io.params import ProteoBenchParameters

# Regexes
# Patterns searched in the text of the DIA-NN log file. In every pattern the
# first capture group holds the value that the extraction helpers return.

# Mass accuracies reported in the log (used when not given on the command line).
fragment_mass_tolerance_regex = r"Optimised mass accuracy: (\d*\.?\d+) ppm"
precursor_mass_tolerance_regex = r"Recommended MS1 mass accuracy setting: (\d*\.?\d+) ppm"
# Version string located between "DIA-NN " and the parenthesised program name.
software_version_regex = r"DIA-NN\s(.*?)\s\(Data-Independent Acquisition by Neural Networks\)"
scan_window_regex = r"Scan window radius set to (\d+)"
# Only matches values written with a decimal point (digits on both sides of the dot).
fdr_regex = r"Output will be filtered at (\d+\.\d+) FDR"
# Peptide length, precursor charge and m/z bounds (integers only).
min_pep_len_regex = r"Min peptide length set to (\d+)"
max_pep_len_regex = r"Max peptide length set to (\d+)"
min_z_regex = r"Min precursor charge set to (\d+)"
max_z_regex = r"Max precursor charge set to (\d+)"
min_mz_prec_regex = r"Min precursor m/z set to (\d+)"
max_mz_prec_regex = r"Max precursor m/z set to (\d+)"
min_mz_frag_regex = r"Min fragment m/z set to (\d+)"
max_mz_frag_regex = r"Max fragment m/z set to (\d+)"
# Digestion settings: cleavage sites, excluded sites and missed cleavages.
cleavage_regex = r"In silico digest will involve cuts at (.*)"
cleavage_exc_regex = r"But excluding cuts at (.*)"
missed_cleavages_regex = r"Maximum number of missed cleavages set to (\d+)"
# Modifications. Fixed modifications are covered by two different log phrasings.
max_mods_regex = r"Maximum number of variable modifications set to (\d+)"
fixed_mods_regex_1 = r"(.*) enabled as a fixed modification"
fixed_mods_regex_2 = r"Modification (.*) with mass delta \d+\.*\d* at .+ will be considered as fixed"
var_mods_regex = r"Modification (.*) with mass delta \d+\.*\d* at .+ will be considered as variable"
# Captures (non-greedily) the text that precedes " quantification mode".
quant_mode_regex = r"(.*?) quantification mode"
# Captures the text between "Implicit protein grouping: " and the last ";" on the line.
protein_inference_regex = r"Implicit protein grouping: (.*);"

# Flags
# Presence-only pattern: a match anywhere in the log text is interpreted as a boolean.
enable_match_between_runs_regex = r"(MBR enabled)|(reanalyse them)"  # If present, MBR is enabled

# Maps ProteoBench parameter names to the log-file regex (or list of regexes)
# that extracts them. "ident_fdr_psm" and "ident_fdr_protein" share one pattern,
# and "fixed_mods" is the only entry holding a list of patterns.
PARAM_REGEX_DICT = {
    "ident_fdr_psm": fdr_regex,
    "ident_fdr_protein": fdr_regex,
    "precursor_mass_tolerance": precursor_mass_tolerance_regex,
    "fragment_mass_tolerance": fragment_mass_tolerance_regex,
    "enzyme": cleavage_regex,
    "allowed_miscleavages": missed_cleavages_regex,
    "min_peptide_length": min_pep_len_regex,
    "max_peptide_length": max_pep_len_regex,
    "fixed_mods": [fixed_mods_regex_1, fixed_mods_regex_2],
    "variable_mods": var_mods_regex,
    "max_mods": max_mods_regex,
    "min_precursor_charge": min_z_regex,
    "max_precursor_charge": max_z_regex,
    "scan_window": scan_window_regex,
    "enable_match_between_runs": enable_match_between_runs_regex,
}


# Maps ProteoBench parameter names to the DIA-NN command line option that sets
# them. Option names are written without the leading "--", matching the keys of
# the dictionary produced by parse_cmdline_string.
PARAM_CMD_DICT = {
    "ident_fdr_psm": "qvalue",
    "enable_match_between_runs": "reanalyse",
    "precursor_mass_tolerance": "mass-acc-ms1",
    "fragment_mass_tolerance": "mass-acc",
    "enzyme": "cut",
    "allowed_miscleavages": "missed-cleavages",
    "min_peptide_length": "min-pep-len",
    "max_peptide_length": "max-pep-len",
    "min_fragment_mz": "min-fr-mz",
    "max_fragment_mz": "max-fr-mz",
    "min_precursor_mz": "min-pr-mz",
    "max_precursor_mz": "max-pr-mz",
    "fixed_mods": "mod",
    "variable_mods": "var-mod",
    "max_mods": "var-mods",
    "min_precursor_charge": "min-pr-charge",
    "max_precursor_charge": "max-pr-charge",
    "scan_window": "window",
    "protein_inference": "pg-level",
}
# ProteoBench parameters that parse_setting converts to float
# (exactly one command line value is expected).
SETTINGS_PB_FLOAT = [
    "ident_fdr_psm",
    "ident_fdr_peptide",
    "ident_fdr_protein",
    "precursor_mass_tolerance",
    "fragment_mass_tolerance",
]
# ProteoBench parameters that parse_setting converts to int
# (exactly one command line value is expected).
SETTINGS_PB_INT = [
    "allowed_miscleavages",
    "min_peptide_length",
    "max_peptide_length",
    "max_mods",
    "min_precursor_charge",
    "max_precursor_charge",
    "scan_window",
]
# Modification parameters: parse_setting joins their values with a comma.
SETTINGS_PB_MOD = ["fixed_mods", "variable_mods"]

# Translates the text captured by protein_inference_regex into the
# protein inference labels reported by ProteoBench.
PROT_INF_MAP = {"isoform IDs": "Isoforms", "protein names": "Protein_names", "genes": "Genes"}


def find_cmdline_string(lines: List[str]) -> Optional[str]:
    """
    Locate the DIA-NN execution command inside the log file.

    The command is expected to sit on one single line. The first line that
    contains both ``diann`` and ``--`` is taken to be that command.

    Parameters
    ----------
    lines : list[str]
        All input lines from the DIA-NN log file.

    Returns
    -------
    str or None
        The command line with surrounding whitespace stripped, or None when
        no line qualifies.
    """
    candidates = (entry.strip() for entry in lines if "diann" in entry and "--" in entry)
    return next(candidates, None)
def parse_cmdline_string(cmd_line: str, software_version: str) -> dict:
    """
    Parse a DIA-NN command line string into a dictionary of settings.

    The command is split on ``" --"`` and every resulting segment is split on
    whitespace: the first token is the setting name, the remaining tokens are
    its values. The leading segment (the executable) is treated like any other
    segment, so it ends up as a key as well.

    Parameters
    ----------
    cmd_line : str
        The command line string to parse.
    software_version : str
        The version of the DIA-NN software, e.g., "1.8". Only the part before
        the first space is used for the version comparison.

    Returns
    -------
    dict
        Parsed settings in dictionary format. Keys are setting names, and values are:
        - List of inputs for multi-value settings.
        - Boolean `True` for flag-like settings (without values).
        - ``"var-mod"``: always present, the list of variable modifications
          (values concatenated, commas replaced by ``/``).
        - ``"mod"``: the fixed modifications collected from ``unimod`` flags,
          unless a ``--mod`` option was given, in which case its raw values
          are kept.

    Raises
    ------
    ValueError
        If an unsupported setting format is detected (e.g., `unimod` with extra arguments).
    """
    settings_dict = {}
    variable_modifications = []
    fixed_modifications = []

    def add_modification(mod_list, setting, description=None):
        """
        Add a modification to the specified list.

        Parameters
        ----------
        mod_list : list
            The list of parsed modifications.
        setting : list
            The tokens of the parsed setting; must contain the flag name only.
        description : str, optional
            Modification description that overwrites the parsed flag name.
        """
        if len(setting) != 1:
            raise ValueError(f"Invalid `unimod` format: {setting}")
        mod_list.append(description or setting[0])

    is_version_below_1_8 = Version(software_version.split(" ")[0]) < Version("1.8")

    for raw_setting in cmd_line.split(" --"):
        setting_parts = raw_setting.split()
        if not setting_parts:
            # A leading or doubled " --" yields an empty segment; there is
            # nothing to record for it and indexing it would raise IndexError.
            continue

        key = setting_parts[0]
        values = setting_parts[1:]

        if key.startswith("unimod"):
            if is_version_below_1_8:
                # Before 1.8 only these two shorthand flags are translated;
                # any other unimod flag is ignored.
                if key == "unimod4":
                    add_modification(fixed_modifications, setting_parts, "Carbamidomethyl (C)")
                elif key == "unimod35":
                    add_modification(variable_modifications, setting_parts, "Oxidation (M)")
            else:
                # From 1.8 onwards the flag name itself is stored as a fixed modification.
                add_modification(fixed_modifications, setting_parts)
        elif len(setting_parts) == 1:
            # Boolean flag
            settings_dict[key] = True
        elif key == "var-mod":
            # Handle variable modifications
            variable_modifications.append("".join(values).replace(",", "/"))
        else:
            # General key-value settings
            settings_dict[key] = values

    # Add modifications to the settings dictionary
    settings_dict["var-mod"] = variable_modifications
    if "mod" not in settings_dict:
        settings_dict["mod"] = fixed_modifications

    return settings_dict
def parse_setting(setting_name: str, setting_list: list) -> Any:
    """
    Convert the raw command line values of one setting to its final type.

    Parameters
    ----------
    setting_name : str
        The name of the setting (ProteoBench).
    setting_list : list
        The input value(s) of the given setting.

    Returns
    -------
    Any
        A float or int for numeric settings (exactly one value is required),
        a comma-joined string for modification settings, and the plain
        concatenation of all values for every other setting.
    """
    numeric_casts = ((SETTINGS_PB_FLOAT, float), (SETTINGS_PB_INT, int))
    for setting_names, cast in numeric_casts:
        if setting_name in setting_names:
            assert len(setting_list) == 1
            return cast(setting_list[0])

    separator = "," if setting_name in SETTINGS_PB_MOD else ""
    return separator.join(setting_list)
def extract_with_regex(lines: List[str], regex: str, search_all: bool = False) -> Optional[str]:
    """
    Search the log lines for a regex and return its first capture group.

    Every line is searched individually with ``re.search``.

    Parameters
    ----------
    lines : list[str]
        All input lines from the DIA-NN log file.
    regex : str
        The regex pattern to be matched. It must define at least one capture group.
    search_all : bool, optional
        If False (default), the value of the first matching line is returned.
        If True, all lines are scanned and the value of the last matching
        line is returned.

    Returns
    -------
    str or None
        The content of capture group 1 of the selected match, or None when no
        line matches.
    """
    last_value = None
    for line in lines:
        regex_match = re.search(regex, line)
        if not regex_match:
            continue
        if not search_all:
            return regex_match.group(1)
        # Keep overwriting so that the last match wins once the loop ends.
        last_value = regex_match.group(1)
    return last_value
def parse_protein_inference_method(cmdline_dict: dict) -> str:
    """
    Parse the protein inference method from the parsed execution command string.

    This setting is defined by disparate setting tags, namely:
    - no-prot-inf: No protein inference
    - pg-level: Code specifies inference method

    ``no-prot-inf`` takes precedence over ``pg-level`` when both are present.

    Parameters
    ----------
    cmdline_dict : dict
        Parsed execution command string.

    Returns
    -------
    str
        The protein inference method. Possibilities:
        - Disabled
        - Isoforms
        - Protein_names
        - Genes

    Raises
    ------
    ValueError
        If ``--pg-level`` carries a value other than "0", "1" or "2".
    """
    if "no-prot-inf" in cmdline_dict:
        return "Disabled"

    if "pg-level" not in cmdline_dict:
        # Default value, when --pg-level is not changed in the GUI it does not
        # appear in the command string
        return "Genes"

    pg_setting = cmdline_dict["pg-level"][0]
    pg_level_mapping = {"0": "Isoforms", "1": "Protein_names", "2": "Genes"}
    try:
        return pg_level_mapping[pg_setting]
    except KeyError:
        # The exception used to be constructed without being raised, which
        # made the function silently return None for unknown levels.
        raise ValueError(f"Unexpected setting passed to --pg-level in diann.exe: {pg_setting}") from None
def parse_quantification_strategy(cmdline_dict: dict):
    """
    Derive the quantification method from the parsed execution command string.

    The method is signalled by the presence of one of these tags, checked in
    this order:
    - direct-quant: use legacy quantification within DIANN
    - high-acc: QuantUMS high-accuracy setting
    - no tag: Default is QuantUMS high-precision

    Parameters
    ----------
    cmdline_dict : dict
        Parsed execution command string.

    Returns
    -------
    str
        The quantification method. Possibilities:
        - Legacy
        - QuantUMS high-accuracy
        - QuantUMS high-precision
    """
    tag_to_method = (
        ("direct-quant", "Legacy"),
        ("high-acc", "QuantUMS high-accuracy"),
    )
    for tag, method in tag_to_method:
        if tag in cmdline_dict:
            return method

    # Default value
    return "QuantUMS high-precision"
def parse_predictors_library(cmdline_dict: dict):
    """
    Determine which tool produced the spectral library predictions.

    For now, only 'DIANN' and 'User defined speclib' are supported. In the
    future, the user might specify which algorithm was used for library
    generation.

    Parameters
    ----------
    cmdline_dict : dict
        Parsed execution command string.

    Returns
    -------
    dict or None
        Dictionary specifying the algorithm name for RT, IM and MS2_int.
        None is returned when neither a ``predictor`` tag nor a ``lib`` tag
        with a value is present.
    """
    if "predictor" in cmdline_dict:
        algorithm = "DIANN"
    elif "lib" in cmdline_dict and not isinstance(cmdline_dict["lib"], bool):
        # A bare --lib flag (stored as a boolean) does not name a library.
        algorithm = "User defined speclib"
    else:
        return None

    return {"RT": algorithm, "IM": algorithm, "MS2_int": algorithm}
def extract_cfg_parameter(lines: List[str], regex: str, cast_type: type = str, default=None, search_all=False) -> Any:
    """
    Extract a parameter from the log lines with a regex and cast it.

    Parameters
    ----------
    lines : list[str]
        All input lines from the DIA-NN log file.
    regex : str
        The regex pattern whose first capture group holds the value.
    cast_type : type, optional
        Callable applied to the captured text (default: str).
    default : Any, optional
        Value returned when nothing matches or the cast raises ValueError.
    search_all : bool, optional
        Forwarded to extract_with_regex.

    Returns
    -------
    Any
        The cast value, or `default`.
    """
    raw_value = extract_with_regex(lines, regex, search_all=search_all)
    if raw_value is not None:
        try:
            return cast_type(raw_value)
        except ValueError:
            # Fall through to the default below.
            pass
    return default
def extract_modifications(lines: List[str], regexes: List[str]) -> Optional[str]:
    """
    Collect modification names from the log and join them with commas.

    The lines are joined with newlines into one text, which is scanned once
    per regex. Matches are reported regex by regex, in order of appearance.

    Parameters
    ----------
    lines : list[str]
        All input lines from the DIA-NN log file.
    regexes : list[str]
        Patterns whose first capture group holds a modification name.

    Returns
    -------
    str or None
        Comma-separated modification names with newline characters removed,
        or None when no pattern matches.
    """
    log_text = "\n".join(lines)
    found = [
        hit.group(1).replace("\n", "")
        for pattern in regexes
        for hit in re.finditer(pattern, log_text)
    ]
    if not found:
        return None
    return ",".join(found)
def extract_params(
    fname: str, json=os.path.join(os.path.dirname(__file__), "json/Quant/quant_lfq_DIA_ion.json")
) -> ProteoBenchParameters:
    """
    Parse DIA-NN log file and extract relevant parameters.

    Logic:
    1. Read the log file and extract the software version.
    2. Find the command line string that was used to run DIA-NN.
    3. Parse the command line string to extract settings. Default values are set for parameters that are not specified
       in the command line.
    4. If the --cfg flag is used (meaning a configuration file was used), the parameters are parsed from the free text
       underneath the cmd line.

    Parameters
    ----------
    fname : str
        Parameter file name path. When it cannot be opened as a path, it is
        treated as an object whose ``read()`` returns UTF-8 encoded bytes.
    json : str
        Path handed to ProteoBenchParameters as ``filename``. Defaults to
        ``json/Quant/quant_lfq_DIA_ion.json`` next to this module.

    Returns
    -------
    ProteoBenchParameters
        The parsed ProteoBenchParameters object.
    """
    print("JSON file used for DIA-NN parameters:", json)
    print("\n" * 5)
    cfg_used = False
    # Some default and flag settings
    parameters = {
        "software_name": "DIA-NN",
        "search_engine": "DIA-NN",
        "enable_match_between_runs": False,
        "quantification_method": "QuantUMS high-precision",
        "protein_inference": "Genes",  # Default value, if not specified in the command line
        "min_precursor_charge": 1,
        "max_precursor_charge": 4,
        "min_peptide_length": 7,
        "max_peptide_length": 30,
        "min_fragment_mz": 200,
        "max_fragment_mz": 1800,
        "min_precursor_mz": 300,
        "max_precursor_mz": 1800,
    }

    # NOTE(review): the bare `except` also swallows errors such as a missing
    # file; the fallback then fails on `fname.read` instead of reporting the
    # original problem.
    try:
        # Read in the log file
        with open(fname) as f:
            lines = f.readlines()
    except:
        # Fallback for file-like inputs that deliver bytes.
        lines = [l for l in fname.read().decode("utf-8").splitlines()]

    # Extract software versions from the log file.
    software_version = search_engine_version = extract_with_regex(lines, software_version_regex)
    parameters["software_version"] = software_version
    parameters["search_engine_version"] = search_engine_version

    # Get settings from the execution command string
    cmdline_string = find_cmdline_string(lines)
    if cmdline_string and "--cfg" in cmdline_string:
        cfg_used = True  # If a configuration file was used, the parameters are specified in the free text below the cmd line.

    # NOTE(review): `cmdline_string` and `software_version` can both be None
    # here (no matching line in the log); parse_cmdline_string does not guard
    # against that.
    cmdline_dict = parse_cmdline_string(cmdline_string, software_version)
    parameters["quantification_method"] = parse_quantification_strategy(cmdline_dict)
    parameters["protein_inference"] = parse_protein_inference_method(cmdline_dict)
    parameters["predictors_library"] = parse_predictors_library(cmdline_dict)

    # Parse as many settings as possible from the execution command using PARAM_CMD_DICT for mapping.
    for proteobench_setting, cmd_setting in PARAM_CMD_DICT.items():
        if cmd_setting in cmdline_dict.keys():
            if isinstance(cmdline_dict[cmd_setting], bool):
                # Flags without a value are stored as they are.
                parameters[proteobench_setting] = cmdline_dict[cmd_setting]
            else:
                parameters[proteobench_setting] = parse_setting(proteobench_setting, cmdline_dict[cmd_setting])

    # Parse cut parameter to standard enzyme name
    if "enzyme" not in parameters.keys():  # This happens when running fragpipe-diann
        parameters["enzyme"] = "cut"
    elif parameters["enzyme"] == "K*,R*":
        parameters["enzyme"] = "Trypsin/P"
    elif parameters["enzyme"] == "K*,R*,!*P":
        parameters["enzyme"] = "Trypsin"

    # If mass-acc flag is not present in cmdline string, extract it from the log file.
    # Either way the tolerance is reported as a symmetric "[-x ppm, x ppm]" string.
    # NOTE(review): if the log lacks the accuracy line, the extracted value is
    # None and the string concatenation below raises TypeError.
    if "fragment_mass_tolerance" not in parameters.keys():
        fragment_mass_tol = extract_with_regex(lines, fragment_mass_tolerance_regex)
        parameters["fragment_mass_tolerance"] = "[-" + fragment_mass_tol + " ppm" + ", " + fragment_mass_tol + " ppm]"
    else:
        parameters["fragment_mass_tolerance"] = (
            "[-"
            + str(parameters["fragment_mass_tolerance"])
            + " ppm"
            + ", "
            + str(parameters["fragment_mass_tolerance"])
            + " ppm]"
        )
    if "precursor_mass_tolerance" not in parameters.keys():
        precursor_mass_tol = extract_with_regex(lines, precursor_mass_tolerance_regex)
        parameters["precursor_mass_tolerance"] = (
            "[-" + precursor_mass_tol + " ppm" + ", " + precursor_mass_tol + " ppm]"
        )
    else:
        parameters["precursor_mass_tolerance"] = (
            "[-"
            + str(parameters["precursor_mass_tolerance"])
            + " ppm"
            + ", "
            + str(parameters["precursor_mass_tolerance"])
            + " ppm]"
        )

    # Extract the scan window from the log file.
    # NOTE(review): this assignment is unconditional, so it replaces a value
    # parsed from `--window`, and it raises TypeError when the log has no
    # scan window line (int(None)).
    parameters["scan_window"] = int(extract_with_regex(lines, scan_window_regex))
    parameters["abundance_normalization_ions"] = None

    # If cfg file is used, extract the parameters from the free text below the cmd line.
    if cfg_used:
        print("DEBUG: Extracting parameters from the configuration file.")
        # Values missing from the log become None (or the given default) and
        # overwrite whatever was set above.
        parameters.update(
            {
                "ident_fdr_psm": extract_cfg_parameter(lines, fdr_regex, float),
                "ident_fdr_protein": None,
                "enable_match_between_runs": bool(re.search(enable_match_between_runs_regex, "".join(lines))),
                "enzyme": (
                    f"{extract_cfg_parameter(lines, cleavage_regex) or ''},!{extract_cfg_parameter(lines, cleavage_exc_regex) or ''}"
                ),
                "allowed_miscleavages": extract_cfg_parameter(lines, missed_cleavages_regex, int),
                "min_peptide_length": extract_cfg_parameter(lines, min_pep_len_regex, int),
                "max_peptide_length": extract_cfg_parameter(lines, max_pep_len_regex, int),
                "min_precursor_charge": extract_cfg_parameter(lines, min_z_regex, int),
                "max_precursor_charge": extract_cfg_parameter(lines, max_z_regex, int),
                "max_mods": extract_cfg_parameter(lines, max_mods_regex, int),
                "quantification_method": extract_cfg_parameter(
                    lines, quant_mode_regex, str, "QuantUMS high-precision", search_all=True
                ),
                "fixed_mods": extract_modifications(lines, PARAM_REGEX_DICT["fixed_mods"]),
                "variable_mods": extract_modifications(lines, [PARAM_REGEX_DICT["variable_mods"]]),
                "min_fragment_mz": extract_cfg_parameter(lines, min_mz_frag_regex, int),
                "max_fragment_mz": extract_cfg_parameter(lines, max_mz_frag_regex, int),
                "min_precursor_mz": extract_cfg_parameter(lines, min_mz_prec_regex, int),
                "max_precursor_mz": extract_cfg_parameter(lines, max_mz_prec_regex, int),
            }
        )
        # Unknown or missing grouping text falls back to "Genes".
        protein_inference = extract_cfg_parameter(lines, protein_inference_regex)
        parameters["protein_inference"] = PROT_INF_MAP.get(protein_inference, "Genes")

    return ProteoBenchParameters(**parameters, filename=json)
if __name__ == "__main__":
    # Parse every example log listed below, print the extracted parameters
    # and write them to a CSV file next to the log (same name, ".csv" suffix).
    example_logs = [
        "../../../test/params/DIANN_output_20240229_report.log.txt",
        "../../../test/params/Version1_9_Predicted_Library_report.log.txt",
        "../../../test/params/DIANN_WU304578_report.log.txt",
        "../../../test/params/DIANN_1.7.16.log.txt",
        "../../../test/params/DIANN_cfg_settings.txt",
        "../../../test/params/DIANN_cfg_MBR.txt",
        "../../../test/params/DIA-NN_cfg_directq.txt",
    ]
    for example_log in example_logs:
        log_path = pathlib.Path(example_log)
        parsed = extract_params(log_path)
        as_series = pd.Series(parsed.__dict__)
        print(as_series)
        as_series.to_csv(log_path.with_suffix(".csv"))