"""
DIA-NN parameter parsing.
"""
import pathlib
import re
from typing import Any, List, Optional
import pandas as pd
from packaging.version import Version
from proteobench.io.params import ProteoBenchParameters
# Regexes
# Each pattern targets one line of DIA-NN log text; where a capture group is present
# it holds the value of interest. The code that applies these patterns is not part of
# the visible source - presumably the parameter extraction routine; verify there.
fragment_mass_tolerance_regex = r"Optimised mass accuracy: (\d*\.?\d+) ppm"
precursor_mass_tolerance_regex = r"Recommended MS1 mass accuracy setting: (\d*\.?\d+) ppm"
software_version_regex = r"DIA-NN\s(.*?)\s\(Data-Independent Acquisition by Neural Networks\)"
scan_window_regex = r"Scan window radius set to (\d+)"
# Requires a decimal point: a value written without one (e.g. "1 FDR") does not match.
fdr_regex = r"Output will be filtered at (\d+\.\d+) FDR"
min_pep_len_regex = r"Min peptide length set to (\d+)"
max_pep_len_regex = r"Max peptide length set to (\d+)"
min_z_regex = r"Min precursor charge set to (\d+)"
max_z_regex = r"Max precursor charge set to (\d+)"
cleavage_regex = r"In silico digest will involve cuts at (.*)"
cleavage_exc_regex = r"But excluding cuts at (.*)"
missed_cleavages_regex = r"Maximum number of missed cleavages set to (\d+)"
max_mods_regex = r"Maximum number of variable modifications set to (\d+)"
# Two alternative phrasings for fixed modifications; both are listed under "fixed_mods"
# in PARAM_REGEX_DICT.
fixed_mods_regex_1 = r"(.*) enabled as a fixed modification"
# NOTE(review): `\d+\.*\d*` accepts any number of dots and no leading sign, so a
# negative mass delta would not match - confirm whether that can occur in the log.
fixed_mods_regex_2 = r"Modification (.*) with mass delta \d+\.*\d* at .+ will be considered as fixed"
var_mods_regex = r"Modification (.*) with mass delta \d+\.*\d* at .+ will be considered as variable"
# The two patterns below, like software_version_regex and cleavage_exc_regex, are not
# referenced by PARAM_REGEX_DICT; presumably they are used directly elsewhere.
quant_mode_regex = r"(.*?) quantification mode"
protein_inference_regex = r"Implicit protein grouping: (.*);"
# Flags
enable_match_between_runs_regex = "MBR enabled" # If present, MBR is enabled
# ProteoBench parameter name -> log pattern (or list of patterns) that yields its value.
# "ident_fdr_psm" and "ident_fdr_protein" deliberately share the same FDR pattern.
PARAM_REGEX_DICT = {
"ident_fdr_psm": fdr_regex,
"ident_fdr_protein": fdr_regex,
"precursor_mass_tolerance": precursor_mass_tolerance_regex,
"fragment_mass_tolerance": fragment_mass_tolerance_regex,
"enzyme": cleavage_regex,
"allowed_miscleavages": missed_cleavages_regex,
"min_peptide_length": min_pep_len_regex,
"max_peptide_length": max_pep_len_regex,
"fixed_mods": [fixed_mods_regex_1, fixed_mods_regex_2],
"variable_mods": var_mods_regex,
"max_mods": max_mods_regex,
"min_precursor_charge": min_z_regex,
"max_precursor_charge": max_z_regex,
"scan_window": scan_window_regex,
"enable_match_between_runs": enable_match_between_runs_regex,
}
# ProteoBench parameter name -> DIA-NN command line option, written without the leading
# "--" (the form in which parse_cmdline_string stores its keys).
PARAM_CMD_DICT = {
"ident_fdr_psm": "qvalue",
"enable_match_between_runs": "reanalyse",
"precursor_mass_tolerance": "mass-acc-ms1",
"fragment_mass_tolerance": "mass-acc",
"enzyme": "cut",
"allowed_miscleavages": "missed-cleavages",
"min_peptide_length": "min-pep-len",
"max_peptide_length": "max-pep-len",
"fixed_mods": "mod",
"variable_mods": "var-mod",
"max_mods": "var-mods",
"min_precursor_charge": "min-pr-charge",
"max_precursor_charge": "max-pr-charge",
"scan_window": "window",
"protein_inference": "pg-level",
}
# ProteoBench parameters that parse_setting converts with float().
SETTINGS_PB_FLOAT = [
"ident_fdr_psm",
"ident_fdr_peptide",
"ident_fdr_protein",
"precursor_mass_tolerance",
"fragment_mass_tolerance",
]
# ProteoBench parameters that parse_setting converts with int().
SETTINGS_PB_INT = [
"allowed_miscleavages",
"min_peptide_length",
"max_peptide_length",
"max_mods",
"min_precursor_charge",
"max_precursor_charge",
"scan_window",
]
# ProteoBench parameters whose values parse_setting joins with commas.
SETTINGS_PB_MOD = ["fixed_mods", "variable_mods"]
# Protein grouping wording -> protein inference label. Presumably keyed by the text
# captured by protein_inference_regex - TODO confirm against the code that uses it.
PROT_INF_MAP = {"isoform IDs": "Isoforms", "protein names": "Protein_names", "genes": "Genes"}
def find_cmdline_string(lines: List[str]) -> Optional[str]:
    """
    Locate the DIA-NN command line statement among the lines of a log file.

    The statement is assumed to sit on a single line: the first line containing
    both the substring "diann" and the substring "--" is taken to be it.

    Parameters
    ----------
    lines : list[str]
        All input lines from the DIA-NN log file.

    Returns
    -------
    str or None
        The first matching line with surrounding whitespace stripped, or None
        when no line matches.
    """
    # Lazy generator: scanning stops at the first line that qualifies.
    candidates = (entry.strip() for entry in lines if "--" in entry and "diann" in entry)
    return next(candidates, None)
def parse_cmdline_string(cmd_line: str, software_version: str) -> dict:
    """
    Parse a DIA-NN command line string into a dictionary of settings.

    The string is cut at every occurrence of " --" and each piece is split on
    whitespace into an option name followed by its values. The text before the
    first " --" (normally the executable) is processed like any other piece.
    Pieces that contain no tokens at all are ignored.

    Parameters
    ----------
    cmd_line : str
        The command line string to parse.
    software_version : str
        The version of the DIA-NN software, e.g., "1.8". Only the text before the
        first space is used for the version comparison.

    Returns
    -------
    dict
        Parsed settings in dictionary format. Keys are setting names, and values are:
        - List of inputs for multi-value settings.
        - Boolean `True` for flag-like settings (without values).
        - Modified settings for variable and fixed modifications: "var-mod" is
          always set to the list of collected variable modifications, and "mod"
          is set to the collected fixed modifications unless the command line
          already supplied a "mod" setting.

    Raises
    ------
    ValueError
        If an unsupported setting format is detected (e.g., `unimod` with extra arguments).
    """
    settings_dict = {}
    variable_modifications = []
    fixed_modifications = []

    def add_modification(mod_list, setting, description=None):
        """
        Add a modification to the specified list.

        Parameters
        ----------
        mod_list : list
            The list of parsed modifications.
        setting : list
            The tokens of the parsed setting; must consist of the option name only.
        description : str, optional
            Modification description that overwrites the parsed setting name.
        """
        if len(setting) != 1:
            raise ValueError(f"Invalid `unimod` format: {setting}")
        mod_list.append(description or setting[0])

    is_version_below_1_8 = Version(software_version.split(" ")[0]) < Version("1.8")

    for raw_setting in cmd_line.split(" --"):
        setting_parts = raw_setting.split()
        if not setting_parts:
            # Empty command line, or a leading/trailing " --": nothing to record.
            continue
        key, *values = setting_parts

        if key.startswith("unimod"):
            # NOTE(review): the nesting here was reconstructed from a source without
            # indentation. The final `else` is read as the fallback of the
            # unimod4/unimod35 chain, so `unimod*` options are only interpreted for
            # versions below 1.8 and are skipped otherwise - confirm against the
            # project repository.
            if is_version_below_1_8:
                if key == "unimod4":
                    add_modification(fixed_modifications, setting_parts, "Carbamidomethyl (C)")
                elif key == "unimod35":
                    add_modification(variable_modifications, setting_parts, "Oxidation (M)")
                else:
                    add_modification(fixed_modifications, setting_parts)
        elif not values:  # Boolean flag
            settings_dict[key] = True
        elif key == "var-mod":  # Handle variable modifications
            variable_modifications.append("".join(values).replace(",", "/"))
        else:  # General key-value settings
            settings_dict[key] = values

    # Add modifications to the settings dictionary
    settings_dict["var-mod"] = variable_modifications
    if "mod" not in settings_dict:
        settings_dict["mod"] = fixed_modifications
    return settings_dict
def parse_setting(setting_name: str, setting_list: list) -> Any:
    """
    Parse individual settings based on their setting type.

    Parameters
    ----------
    setting_name : str
        The name of the setting (ProteoBench).
    setting_list : list
        The input value of a given setting. Flag-like settings, which carry the
        boolean `True` instead of a list, are accepted for settings that are not
        numeric or modification settings.

    Returns
    -------
    Any
        The parsed setting:
        - float for names in SETTINGS_PB_FLOAT,
        - int for names in SETTINGS_PB_INT,
        - the values joined with "," for names in SETTINGS_PB_MOD,
        - the boolean itself for a flag-like value,
        - otherwise the values concatenated without a separator.

    Raises
    ------
    AssertionError
        If a numeric setting does not carry exactly one value.
    """

    def single_value():
        # Raised explicitly instead of using `assert`, so the check is still
        # performed when Python runs with optimisations (-O) enabled.
        if len(setting_list) != 1:
            raise AssertionError(f"Setting '{setting_name}' expects exactly one value, got: {setting_list}")
        return setting_list[0]

    if setting_name in SETTINGS_PB_FLOAT:
        return float(single_value())
    if setting_name in SETTINGS_PB_INT:
        return int(single_value())
    if setting_name in SETTINGS_PB_MOD:
        return ",".join(setting_list)
    if isinstance(setting_list, bool):
        # Flag without values: there is nothing to join, keep the boolean.
        return setting_list
    return "".join(setting_list)
def parse_protein_inference_method(cmdline_dict: dict) -> str:
    """
    Parse the protein inference method from the parsed execution command string.

    This setting is defined by disparate setting tags, namely:
    - no-prot-inf: No protein inference
    - pg-level: Code specifies inference method

    Parameters
    ----------
    cmdline_dict : dict
        Parsed execution command string.

    Returns
    -------
    str
        The protein inference method.
        Possibilities:
        - Disabled
        - Isoforms
        - Protein_names
        - Genes

    Raises
    ------
    ValueError
        If `pg-level` is present but its first value is not one of "0", "1" or "2"
        (including a `pg-level` given without any value).
    """
    # `no-prot-inf` takes precedence over any `pg-level` value.
    if "no-prot-inf" in cmdline_dict:
        return "Disabled"
    if "pg-level" not in cmdline_dict:
        # Default value, when --pg-level is not changed in the GUI it does not appear in the command string
        return "Genes"

    pg_level_mapping = {"0": "Isoforms", "1": "Protein_names", "2": "Genes"}
    raw_value = cmdline_dict["pg-level"]
    try:
        pg_setting = raw_value[0]
    except (IndexError, TypeError):
        # Empty value list, or a bare flag stored as `True`: report the raw value.
        pg_setting = raw_value

    if isinstance(pg_setting, str) and pg_setting in pg_level_mapping:
        return pg_level_mapping[pg_setting]
    # Previously the exception object was created but never raised, so an
    # unknown level silently produced None.
    raise ValueError(f"Unexpected setting passed to --pg-level in diann.exe: {pg_setting}")
def parse_quantification_strategy(cmdline_dict: dict):
    """
    Derive the quantification method from the parsed execution command string.

    The method is signalled by the presence of one of these setting tags:
    - direct-quant: use legacy quantification within DIANN
    - high-acc: QuantUMS high-accuracy setting
    - neither tag: default, QuantUMS high-precision

    Parameters
    ----------
    cmdline_dict : dict
        Parsed execution command string.

    Returns
    -------
    str
        The quantification method.
        Possibilities:
        - Legacy
        - QuantUMS high-accuracy
        - QuantUMS high-precision
    """
    # Ordered by priority: the first tag found in the command line decides.
    tag_to_strategy = (
        ("direct-quant", "Legacy"),
        ("high-acc", "QuantUMS high-accuracy"),
    )
    for tag, strategy in tag_to_strategy:
        if tag in cmdline_dict:
            return strategy
    # No tag present: default value
    return "QuantUMS high-precision"
def parse_predictors_library(cmdline_dict: dict):
    """
    Derive the spectral library predictors from the parsed execution command string.

    For now, only 'DIANN' and 'User defined speclib' are supported.
    In the future, the user might specify which algorithm was used for library generation.

    Parameters
    ----------
    cmdline_dict : dict
        Parsed execution command string.

    Returns
    -------
    dict or None
        Dictionary specifying algorithm name for RT, IM and MS2_int: "DIANN" when
        the `predictor` tag is present, otherwise "User defined speclib" when `lib`
        is present with a non-boolean value. None when neither case applies.
    """
    predicted_properties = ("RT", "IM", "MS2_int")

    if "predictor" in cmdline_dict:
        algorithm = "DIANN"
    elif "lib" in cmdline_dict and not isinstance(cmdline_dict["lib"], bool):
        # A boolean `lib` is a bare flag without a library path and does not count.
        algorithm = "User defined speclib"
    else:
        return None

    # The same algorithm name applies to every predicted property.
    return dict.fromkeys(predicted_properties, algorithm)
if __name__ == "__main__":
    # Manual run: parse each example log, print the extracted parameters and write
    # them to a ".csv" file next to the log.
    # NOTE(review): `extract_params` is not defined in the visible part of this
    # file; it is expected to be available in this module.
    example_logs = (
        "../../../test/params/DIANN_output_20240229_report.log.txt",
        "../../../test/params/Version1_9_Predicted_Library_report.log.txt",
        "../../../test/params/DIANN_WU304578_report.log.txt",
        "../../../test/params/DIANN_1.7.16.log.txt",
        "../../../test/params/DIANN_cfg_settings.txt",
    )
    for log_name in example_logs:
        log_path = pathlib.Path(log_name)
        extracted = extract_params(log_path)
        # One row per attribute of the returned parameter object.
        overview = pd.Series(extracted.__dict__)
        print(overview)
        overview.to_csv(log_path.with_suffix(".csv"))