"""
AlphaDIA parameter parsing.
"""
import os
import pathlib
import re
from typing import Dict, List, Tuple
import pandas as pd
from proteobench.io.params import ProteoBenchParameters
# Pre-compiled patterns for stripping log decoration and value markers.
# The first four are applied, in this order, by clean_line().

# ANSI/CSI escape sequences: an introducer (0x9B or ESC followed by "["),
# then parameter bytes, intermediate bytes and a single final byte.
ANSI_REGEX = re.compile(r"(\x9B|\x1B\[)[0-?]*[ -\/]*[@-~]")
# Time stamps such as "0:00:01.234", optionally preceded by "N day(s), ".
TIMESTAMP_REGEX = re.compile(r"(\d+ days?,\s*)?(\d+):\d{2}:\d{2}\.?\d*")
# A log-level tag immediately followed by a colon, e.g. "INFO:".
DEBUG_LEVEL_REGEX = re.compile(r"(PROGRESS|INFO|WARNING|ERROR|CRITICAL|DEBUG):")
# Tree-drawing glyphs ("├──", "└──", "│") together with the whitespace around them.
TREE_REGEX = re.compile(r"^\s*(├──|└──|\│)\s*|\s*(├──|└──|\│)\s*")
# The marker "user defined", optionally wrapped in brackets or parentheses.
USER_DEFINED_REGEX = re.compile(r"(\[|\()?user defined(\]|\))?")
# The marker "default" with an optional trailing colon, optionally wrapped in
# brackets or parentheses.
DEFAULT_REGEX = re.compile(r"(\[|\()?default:?(\]|\))?")
# Maps a source parameter key to the parameter name(s) it is reported under.
# NOTE(review): the keys look like the names used in the AlphaDIA log/config and
# the values like ProteoBench parameter names — confirm against the caller.
# A list value means a single source key fills several target parameters
# (only "fdr" does so here).
# Within this file only the values are consulted: clean_up_parameters() keeps a
# key when it appears among them.
CONFIG_KEY_MAPPER = {
"version": "software_version",
"software_name": "software_name",
"search_engine": "search_engine",
"search_engine_version": "search_engine_version",
"fdr": ["ident_fdr_psm", "ident_fdr_protein"],
"mbr_step_enabled": "enable_match_between_runs",
"target_ms1_tolerance": "precursor_mass_tolerance",
"target_ms2_tolerance": "fragment_mass_tolerance",
"min_fragment_mz": "min_fragment_mz",
"max_fragment_mz": "max_fragment_mz",
"min_precursor_mz": "min_precursor_mz",
"max_precursor_mz": "max_precursor_mz",
"enzyme": "enzyme",
"missed_cleavages": "allowed_miscleavages",
"min_peptide_length": "min_peptide_length",
"max_peptide_length": "max_peptide_length",
"fixed_modifications": "fixed_mods",
"variable_modifications": "variable_mods",
"max_var_mod_num": "max_mods",
"min_precursor_charge": "min_precursor_charge",
"max_precursor_charge": "max_precursor_charge",
"quantification_method": "quantification_method",
"inference_strategy": "protein_inference",
"predictors_library": "predictors_library",
}
[docs]
def clean_line(line: str) -> str:
    """
    Strip log decoration from a single line.

    ANSI escape codes, time stamps, log-level tags and tree-drawing glyphs are
    removed in that order, then surrounding whitespace is trimmed.

    Parameters
    ----------
    line : str
        The raw log line.

    Returns
    -------
    str
        The line with all decoration removed.
    """
    # Order matters: each pattern runs on the output of the previous one.
    for pattern in (ANSI_REGEX, TIMESTAMP_REGEX, DEBUG_LEVEL_REGEX, TREE_REGEX):
        line = pattern.sub("", line)
    return line.strip()
[docs]
def parse_key_value(line: str) -> Tuple[str, str]:
    """
    Split a 'key: value' log line into its two parts.

    Only the first colon separates key from value; any further colons stay in
    the value.

    Parameters
    ----------
    line : str
        The line to parse.

    Returns
    -------
    Tuple[str, str]
        The key and the value, each stripped of surrounding whitespace.

    Raises
    ------
    ValueError
        If the line contains no colon.
    """
    # Unpacking into exactly two names fails with ValueError when the colon is missing.
    key, value = (part.strip() for part in line.split(":", 1))
    return key, value
[docs]
def detect_newer_version(lines: List[str]) -> bool:
    """
    Tell whether a log was written by a newer AlphaDIA version (>= 1.10).

    A log counts as newer when any line, after cleaning, mentions
    'user defined' and 'default' together.

    Parameters
    ----------
    lines : List[str]
        List of log lines.

    Returns
    -------
    bool
        True if the log file is from a newer version, False otherwise.
    """
    marker = "user defined, default"
    # any() stops at the first matching line, so later lines are not cleaned.
    return any(marker in clean_line(raw_line) for raw_line in lines)
def _extract_values_newer_version(lines: List[str], start_index: int) -> List[int]:
    """
    Collect the integers listed below a key in a newer-format (>= 1.10) log.

    Starting at ``start_index``, the first integer of every non-empty cleaned
    line is collected until three have been found or the lines run out. The
    last collected integer is then dropped.

    Parameters
    ----------
    lines : List[str]
        List of log lines.
    start_index : int
        Index of the first line to scan.

    Returns
    -------
    List[int]
        The collected integers without the last one (at most two values).
    """
    collected: List[int] = []
    for raw_line in lines[start_index:]:
        text = clean_line(raw_line)
        match = re.search(r"\d+", text) if text else None
        if match is not None:
            collected.append(int(match.group(0)))
        if len(collected) == 3:  # three values are enough
            break
    # The final value is discarded regardless of how many were collected.
    return collected[:-1]
def _extract_values_older_version(lines: List[str], start_index: int) -> List[int]:
    """
    Collect the integers listed below a key in an older-format (< 1.10) log.

    Starting at ``start_index``, the first integer of every non-empty cleaned
    line is collected until three values are held. A line marked
    'user defined' replaces the value collected just before it. The last
    collected integer is dropped before returning.

    Parameters
    ----------
    lines : List[str]
        List of log lines.
    start_index : int
        Index of the first line to scan.

    Returns
    -------
    List[int]
        The collected integers without the last one (at most two values).
    """
    collected: List[int] = []
    for raw_line in lines[start_index:]:
        if len(collected) == 3:
            break
        text = clean_line(raw_line)
        if not text:
            continue
        if "user defined" in text:
            # A user-defined entry overwrites the previously collected value.
            if collected:
                collected.pop()
            text = USER_DEFINED_REGEX.sub("", text)
        match = re.search(r"\d+", text)
        if match:
            collected.append(int(match.group(0)))
    # The final value is discarded regardless of how many were collected.
    return collected[:-1]
[docs]
def read_file_lines(file_path: str) -> List[str]:
    """
    Read the lines of a log given either as a path or as an open file object.

    Parameters
    ----------
    file_path : str
        Path to the file (``str`` or path-like), or an already opened
        file-like object exposing ``read()``. Content returned as bytes is
        decoded as UTF-8.

    Returns
    -------
    List[str]
        The lines of the file. Lines read from a path keep their trailing
        newline; lines read from a file-like object do not.

    Raises
    ------
    OSError
        If a path is given and the file cannot be opened (e.g. it does not exist).
    UnicodeDecodeError
        If the content is not valid UTF-8.
    """
    # Dispatch on the argument explicitly instead of a catch-all ``except``,
    # so genuine I/O and decoding errors reach the caller unchanged.
    if hasattr(file_path, "read"):
        content = file_path.read()
        if isinstance(content, (bytes, bytearray)):
            content = content.decode("utf-8")
        return content.splitlines()
    with open(file_path, encoding="utf-8") as f:
        return f.readlines()
[docs]
def initialize_default_parameters() -> Dict[str, str]:
    """
    Build the parameter dictionary pre-filled with fixed defaults.

    Returns
    -------
    Dict[str, str]
        A new dictionary on every call, holding the software and search
        engine name, the quantification method, the predictors library and
        match-between-runs switched off (stored as the boolean ``False``).
    """
    defaults = dict(
        software_name="AlphaDIA",
        search_engine="AlphaDIA",
        quantification_method="DirectLFQ",
        predictors_library="AlphaPeptDeep",
        enable_match_between_runs=False,
    )
    return defaults
[docs]
def process_key_value_line(cleaned_line: str, all_parameters: Dict[str, str], version_filled: bool) -> bool:
    """
    Record the key-value pair of a cleaned log line in the parameter dictionary.

    A new key is stored with its value. A repeated key has the new value
    appended, separated by ", ", unless the new value is numerically equal to
    the most recently stored one (avoiding duplicates like "10, 10.0"). Only
    the first "version" entry is recorded; later ones are ignored.

    Parameters
    ----------
    cleaned_line : str
        A log line already passed through ``clean_line``.
    all_parameters : Dict[str, str]
        Parameter dictionary, updated in place.
    version_filled : bool
        Whether a "version" entry has already been recorded.

    Returns
    -------
    bool
        The updated ``version_filled`` flag.
    """
    # A line without a colon is not a key-value line; parse_key_value would
    # raise on it, so leave the state untouched instead.
    if ":" not in cleaned_line:
        return version_filled

    key, value = parse_key_value(cleaned_line)
    if not (key and value):
        return version_filled

    if key == "version":
        if version_filled:
            return version_filled
        version_filled = True

    if key not in all_parameters:
        all_parameters[key] = value
        return version_filled

    # The dictionary also holds non-string entries (a boolean default, integer
    # bounds); compare and concatenate on their text form so that a repeated
    # key does not fail on ``.split`` or ``+=``.
    existing_value = str(all_parameters[key])
    try:
        # Compare the leading token of the new value with that of the most
        # recently stored value.
        if float(value.split()[0]) == float(existing_value.split(",")[-1].split()[0]):
            # Numerically equal: keep the existing entry as it is.
            return version_filled
    except (ValueError, IndexError):
        # Not numeric or not parseable: fall through to concatenation.
        pass
    all_parameters[key] = f"{existing_value}, {value}"
    return version_filled
[docs]
def process_precursor_len(lines: List[str], index: int, all_parameters: Dict[str, str]) -> None:
    """
    Store the minimum and maximum peptide length.

    The values are extracted from the lines following ``index``; the first one
    becomes the minimum and the last one the maximum.

    Parameters
    ----------
    lines : List[str]
        List of log lines.
    index : int
        Index of the line holding the key; extraction starts on the next line.
    all_parameters : Dict[str, str]
        Parameter dictionary, updated in place.
    """
    bounds = extract_values_from_nested_lines(lines, index + 1)
    lower, upper = bounds[0], bounds[-1]
    all_parameters["min_peptide_length"] = lower
    all_parameters["max_peptide_length"] = upper
[docs]
def process_precursor_charge(lines: List[str], index: int, all_parameters: Dict[str, str]) -> None:
    """
    Store the minimum and maximum precursor charge.

    The values are extracted from the lines following ``index``; the first one
    becomes the minimum and the last one the maximum.

    Parameters
    ----------
    lines : List[str]
        List of log lines.
    index : int
        Index of the line holding the key; extraction starts on the next line.
    all_parameters : Dict[str, str]
        Parameter dictionary, updated in place.
    """
    bounds = extract_values_from_nested_lines(lines, index + 1)
    lower, upper = bounds[0], bounds[-1]
    all_parameters["min_precursor_charge"] = lower
    all_parameters["max_precursor_charge"] = upper
[docs]
def process_precursor_mz(lines: List[str], index: int, all_parameters: Dict[str, str]) -> None:
    """
    Store the minimum and maximum precursor m/z.

    The values are extracted from the lines following ``index``; the first one
    becomes the minimum and the last one the maximum.

    Parameters
    ----------
    lines : List[str]
        List of log lines.
    index : int
        Index of the line holding the key; extraction starts on the next line.
    all_parameters : Dict[str, str]
        Parameter dictionary, updated in place.
    """
    bounds = extract_values_from_nested_lines(lines, index + 1)
    lower, upper = bounds[0], bounds[-1]
    all_parameters["min_precursor_mz"] = lower
    all_parameters["max_precursor_mz"] = upper
[docs]
def process_fragment_mz(lines: List[str], index: int, all_parameters: Dict[str, str]) -> None:
    """
    Store the minimum and maximum fragment m/z.

    The values are extracted from the lines following ``index``; the first one
    becomes the minimum and the last one the maximum.

    Parameters
    ----------
    lines : List[str]
        List of log lines.
    index : int
        Index of the line holding the key; extraction starts on the next line.
    all_parameters : Dict[str, str]
        Parameter dictionary, updated in place.
    """
    bounds = extract_values_from_nested_lines(lines, index + 1)
    lower, upper = bounds[0], bounds[-1]
    all_parameters["min_fragment_mz"] = lower
    all_parameters["max_fragment_mz"] = upper
[docs]
def clean_up_parameters(all_parameters: Dict[str, str]) -> None:
    """
    Reduce the collected parameters to the reported keys and tidy their values.

    The dictionary is modified in place in three steps:

    1. An "fdr" entry is copied to "ident_fdr_psm" and "ident_fdr_protein".
    2. "fdr" and every key that is not a target name in ``CONFIG_KEY_MAPPER``
       are removed.
    3. String values holding several ", "-separated candidates are reduced to
       one: a candidate marked 'user defined' wins over candidates marked
       'default', and the markers themselves are stripped. List values are
       de-duplicated.

    Candidates are de-duplicated in order of first appearance, so the outcome
    does not depend on hash ordering.

    Parameters
    ----------
    all_parameters : Dict[str, str]
        Parameter dictionary, updated in place.
    """
    fdr_targets = ("ident_fdr_psm", "ident_fdr_protein")

    # Copy the FDR value before pruning. This is done outside any loop over
    # the dictionary: inserting keys while iterating it raises RuntimeError.
    if "fdr" in all_parameters:
        fdr_value = all_parameters["fdr"]
        for target in fdr_targets:
            all_parameters[target] = fdr_value

    # Target names to keep; list-valued mapper entries contribute each member.
    allowed_keys = set(fdr_targets)
    for mapped in CONFIG_KEY_MAPPER.values():
        if isinstance(mapped, str):
            allowed_keys.add(mapped)
        else:
            allowed_keys.update(mapped)

    # Snapshot the keys to drop first, then delete.
    for key in [k for k in all_parameters if k == "fdr" or k not in allowed_keys]:
        del all_parameters[key]

    # Re-assigning existing keys does not change the dictionary's size, so
    # iterating over items() here is safe.
    for key, value in all_parameters.items():
        if isinstance(value, str):
            candidates = list(dict.fromkeys(value.split(", ")))
            if len(candidates) == 1:
                all_parameters[key] = DEFAULT_REGEX.sub("", candidates[0])
                continue
            for candidate in candidates:
                if "default" in candidate:
                    all_parameters[key] = DEFAULT_REGEX.sub("", candidate).replace("]", "").strip()
                if "user defined" in candidate:
                    # A user-defined value takes precedence; stop looking.
                    all_parameters[key] = USER_DEFINED_REGEX.sub("", candidate)
                    break
        elif isinstance(value, list):
            all_parameters[key] = list(dict.fromkeys(value))
# Manual run: parse each example log listed below and write the extracted
# parameters to a CSV file next to it (same name, ".csv" suffix).
# The paths are relative, so they resolve against the current working directory.
# NOTE(review): extract_params is expected to be defined elsewhere in this module.
if __name__ == "__main__":
for fname in [
"../../../test/params/log_alphadia_1.txt",
"../../../test/params/log_alphadia_2.txt",
"../../../test/params/log_alphadia_1.8.txt",
"../../../test/params/log_alphadia_1.10.txt",
"../../../test/params/log_alphadia_1.12.txt",
"../../../test/params/log_alphadia_1.12MBR.txt",
"../../../test/params/alphadia_weird_lengths.txt",
]:
file = pathlib.Path(fname)
pb_params = extract_params(file)
# Dump the parameter object's attributes as a one-column table.
params = pb_params.__dict__
series = pd.Series(params)
series.to_csv(file.with_suffix(".csv"))
# Emit blank lines as a visual separator in the console output.
print("\n" * 3)