# Source code for proteobench.io.params.alphadia

"""
AlphaDIA parameter parsing.
"""

import os
import pathlib
import re
from typing import Dict, List, Tuple

import pandas as pd

from proteobench.io.params import ProteoBenchParameters

# Regular expressions used to strip log decorations from a line before parsing.

# ANSI escape codes (control sequences introduced by 0x9B or by ESC followed by "[").
ANSI_REGEX = re.compile(r"(\x9B|\x1B\[)[0-?]*[ -\/]*[@-~]")
# Timestamp pieces: a day count ("1 day," or "2 days,") or an "H:MM:SS[.fraction]" clock.
# The plural form is accepted because Python's string form of a timedelta pluralizes
# the day count; without it the "N days," prefix would survive cleaning.
TIMESTAMP_REGEX = re.compile(r"(\d+ days?,)|(\d+):\d{2}:\d{2}\.?\d*")
# Log-level tag followed by a colon, e.g. "INFO:".
DEBUG_LEVEL_REGEX = re.compile(r"(PROGRESS|INFO|WARNING|ERROR|CRITICAL|DEBUG):")
# Box-drawing characters used to render nested (tree-like) output, with surrounding whitespace.
TREE_REGEX = re.compile(r"^\s*(├──|└──|\│)\s*|\s*(├──|└──|\│)\s*")
# "user defined" marker, optionally wrapped in square brackets or parentheses.
USER_DEFINED_REGEX = re.compile(r"(\[|\()?user defined(\]|\))?")
# "default" / "default:" marker, optionally wrapped in square brackets or parentheses.
DEFAULT_REGEX = re.compile(r"(\[|\()?default:?(\]|\))?")

# Maps a key as collected during parsing (left) to the key(s) under which the
# value is stored in the final parameter dictionary (right). A list on the
# right-hand side means the same value is copied to every listed key.
# Keys that do not appear on the right-hand side are discarded by
# clean_up_parameters before the ProteoBenchParameters object is built.
CONFIG_KEY_MAPPER = {
    "version": "software_version",
    "software_name": "software_name",
    "search_engine": "search_engine",
    "search_engine_version": "search_engine_version",
    "fdr": ["ident_fdr_psm", "ident_fdr_protein"],
    "mbr_step_enabled": "enable_match_between_runs",
    "target_ms1_tolerance": "precursor_mass_tolerance",
    "target_ms2_tolerance": "fragment_mass_tolerance",
    "min_fragment_mz": "min_fragment_mz",
    "max_fragment_mz": "max_fragment_mz",
    "min_precursor_mz": "min_precursor_mz",
    "max_precursor_mz": "max_precursor_mz",
    "enzyme": "enzyme",
    "missed_cleavages": "allowed_miscleavages",
    "min_peptide_length": "min_peptide_length",
    "max_peptide_length": "max_peptide_length",
    "fixed_modifications": "fixed_mods",
    "variable_modifications": "variable_mods",
    "max_var_mod_num": "max_mods",
    "min_precursor_charge": "min_precursor_charge",
    "max_precursor_charge": "max_precursor_charge",
    "quantification_method": "quantification_method",
    "inference_strategy": "protein_inference",
    "predictors_library": "predictors_library",
}


def clean_line(line: str) -> str:
    """
    Strip log decorations from a single line.

    ANSI escape codes, timestamps, log-level tags and tree-drawing characters
    are removed (in that order) and the result is trimmed of surrounding
    whitespace.

    Parameters
    ----------
    line : str
        The raw log line.

    Returns
    -------
    str
        The cleaned line.
    """
    for pattern in (ANSI_REGEX, TIMESTAMP_REGEX, DEBUG_LEVEL_REGEX, TREE_REGEX):
        line = pattern.sub("", line)
    return line.strip()
def parse_key_value(line: str) -> Tuple[str, str]:
    """
    Split a 'key: value' line at its first colon.

    Parameters
    ----------
    line : str
        The line to parse. It must contain at least one colon.

    Returns
    -------
    Tuple[str, str]
        The key and the value, each stripped of surrounding whitespace.

    Raises
    ------
    ValueError
        If the line contains no colon.
    """
    # Only the first colon separates key from value; later colons belong to the value.
    key, value = (part.strip() for part in line.split(":", 1))
    return key, value
def detect_newer_version(lines: List[str]) -> bool:
    """
    Tell whether the log was written by a newer AlphaDIA version (>= 1.10).

    The newer format is recognised by a cleaned line containing the text
    'user defined, default', i.e. both markers on the same line.

    Parameters
    ----------
    lines : List[str]
        List of log lines.

    Returns
    -------
    bool
        True if any line carries the newer-format marker, False otherwise.
    """
    return any("user defined, default" in clean_line(raw_line) for raw_line in lines)
def _extract_values_newer_version(lines: List[str], start_index: int) -> List[int]:
    """
    Collect nested integer values in the newer log format (>= 1.10).

    Starting at ``start_index``, the first run of digits on each non-empty
    cleaned line is collected until three values have been gathered. The last
    collected value is then dropped.

    Parameters
    ----------
    lines : List[str]
        List of log lines.
    start_index : int
        Index of the first line to inspect.

    Returns
    -------
    List[int]
        The collected integers without the last one.
    """
    collected = []
    for raw_line in lines[start_index:]:
        cleaned = clean_line(raw_line)
        if not cleaned:
            continue
        match = re.search(r"\d+", cleaned)
        if match:
            collected.append(int(match.group(0)))
        if len(collected) == 3:
            break
    return collected[:-1]


def _extract_values_older_version(lines: List[str], start_index: int) -> List[int]:
    """
    Collect nested integer values in the older log format (< 1.10).

    Starting at ``start_index``, the first run of digits on each non-empty
    cleaned line is collected until three values have been gathered. A line
    containing 'user defined' overrides the value collected just before it:
    that value is removed and the marker is stripped before the number is
    read. The last collected value is then dropped.

    Parameters
    ----------
    lines : List[str]
        List of log lines.
    start_index : int
        Index of the first line to inspect.

    Returns
    -------
    List[int]
        The collected integers without the last one.
    """
    collected = []
    for raw_line in lines[start_index:]:
        if len(collected) == 3:
            break
        cleaned = clean_line(raw_line)
        if not cleaned:
            continue
        if "user defined" in cleaned:
            # The user-defined entry replaces the previously collected value.
            if collected:
                collected.pop()
            cleaned = USER_DEFINED_REGEX.sub("", cleaned)
        match = re.search(r"\d+", cleaned)
        if match:
            collected.append(int(match.group(0)))
    return collected[:-1]
def extract_values_from_nested_lines(lines: List[str], start_index: int, debug: bool = False) -> List[int]:
    """
    Extract the integer values nested below a parameter such as precursor_len.

    The log format is detected from the lines at and after ``start_index`` and
    the matching extraction routine is used.

    Parameters
    ----------
    lines : List[str]
        List of log lines.
    start_index : int
        Index of the first line to inspect.
    debug : bool, optional
        Accepted but not used by this function. Default is False.

    Returns
    -------
    List[int]
        A list of integers representing the values extracted from the nested lines.
    """
    if detect_newer_version(lines[start_index:]):
        extractor = _extract_values_newer_version
    else:
        extractor = _extract_values_older_version
    return extractor(lines, start_index)
def read_file_lines(file_path: str) -> List[str]:
    """
    Read the lines of a log file.

    Parameters
    ----------
    file_path : str
        Either a path to the log file or an already opened file-like object
        (anything exposing ``read()``), returning bytes or text.

    Returns
    -------
    List[str]
        The lines of the file. Lines read from a path keep their line
        endings; lines read from a file-like object do not.

    Raises
    ------
    OSError
        If a path is given and the file cannot be opened (e.g. it does not exist).
    UnicodeDecodeError
        If the content is not valid UTF-8.
    """
    # File-like objects are read directly instead of relying on open() failing,
    # so genuine I/O errors on real paths are no longer masked.
    if hasattr(file_path, "read"):
        content = file_path.read()
        if isinstance(content, (bytes, bytearray)):
            content = content.decode("utf-8")
        return content.splitlines()

    with open(file_path, encoding="utf-8") as handle:
        return handle.readlines()
def initialize_default_parameters() -> Dict[str, str]:
    """
    Build the parameter values assumed before the log is parsed.

    Returns
    -------
    Dict[str, str]
        A new dictionary with the default software name, search engine,
        quantification method, predictor library and match-between-runs flag.
    """
    defaults = dict(
        software_name="AlphaDIA",
        search_engine="AlphaDIA",
        quantification_method="DirectLFQ",
        predictors_library="AlphaPeptDeep",
        enable_match_between_runs=False,
    )
    return defaults
def process_key_value_line(cleaned_line: str, all_parameters: Dict[str, str], version_filled: bool) -> bool:
    """
    Record the key-value pair found on a cleaned log line.

    A new key is stored as-is. A repeated key has the new value appended,
    separated by ', ', unless the new value is numerically equal to the most
    recently stored one. Only the first 'version' entry is recorded.

    Parameters
    ----------
    cleaned_line : str
        A cleaned log line containing a colon.
    all_parameters : Dict[str, str]
        The parameters collected so far; updated in place.
    version_filled : bool
        Whether a 'version' entry has already been recorded.

    Returns
    -------
    bool
        The updated ``version_filled`` flag.
    """
    key, value = parse_key_value(cleaned_line)
    if not (key and value):
        return version_filled

    if key == "version":
        if version_filled:
            # Later version entries are ignored.
            return version_filled
        version_filled = True

    if key not in all_parameters:
        all_parameters[key] = value
        return version_filled

    previous = all_parameters[key]
    try:
        # Compare leading numbers to avoid duplicates such as "10, 10.0".
        same_number = float(value.split()[0]) == float(previous.split(",")[-1].split()[0])
    except (ValueError, IndexError):
        # Not numeric or not parseable: treat as a different value.
        same_number = False

    if not same_number:
        all_parameters[key] += f", {value}"
    return version_filled
def process_precursor_len(lines: List[str], index: int, all_parameters: Dict[str, str]) -> None:
    """
    Store the minimum and maximum peptide length.

    The values are read from the lines following ``index``; the first
    extracted value becomes the minimum and the last one the maximum.

    Parameters
    ----------
    lines : List[str]
        List of log lines.
    index : int
        Index of the line on which 'precursor_len' was found.
    all_parameters : Dict[str, str]
        The parameters collected so far; updated in place.
    """
    bounds = extract_values_from_nested_lines(lines, index + 1)
    lowest, highest = bounds[0], bounds[-1]
    all_parameters["min_peptide_length"] = lowest
    all_parameters["max_peptide_length"] = highest
def process_precursor_charge(lines: List[str], index: int, all_parameters: Dict[str, str]) -> None:
    """
    Store the minimum and maximum precursor charge.

    The values are read from the lines following ``index``; the first
    extracted value becomes the minimum and the last one the maximum.

    Parameters
    ----------
    lines : List[str]
        List of log lines.
    index : int
        Index of the line on which 'precursor_charge' was found.
    all_parameters : Dict[str, str]
        The parameters collected so far; updated in place.
    """
    bounds = extract_values_from_nested_lines(lines, index + 1)
    lowest, highest = bounds[0], bounds[-1]
    all_parameters["min_precursor_charge"] = lowest
    all_parameters["max_precursor_charge"] = highest
def process_precursor_mz(lines: List[str], index: int, all_parameters: Dict[str, str]) -> None:
    """
    Store the minimum and maximum precursor m/z.

    The values are read from the lines following ``index``; the first
    extracted value becomes the minimum and the last one the maximum.

    Parameters
    ----------
    lines : List[str]
        List of log lines.
    index : int
        Index of the line on which 'precursor_mz' was found.
    all_parameters : Dict[str, str]
        The parameters collected so far; updated in place.
    """
    bounds = extract_values_from_nested_lines(lines, index + 1)
    lowest, highest = bounds[0], bounds[-1]
    all_parameters["min_precursor_mz"] = lowest
    all_parameters["max_precursor_mz"] = highest
def process_fragment_mz(lines: List[str], index: int, all_parameters: Dict[str, str]) -> None:
    """
    Store the minimum and maximum fragment m/z.

    The values are read from the lines following ``index``; the first
    extracted value becomes the minimum and the last one the maximum.

    Parameters
    ----------
    lines : List[str]
        List of log lines.
    index : int
        Index of the line on which 'fragment_mz' was found.
    all_parameters : Dict[str, str]
        The parameters collected so far; updated in place.
    """
    bounds = extract_values_from_nested_lines(lines, index + 1)
    lowest, highest = bounds[0], bounds[-1]
    all_parameters["min_fragment_mz"] = lowest
    all_parameters["max_fragment_mz"] = highest
def map_keys_to_desired_format(all_parameters: Dict[str, str]) -> None:
    """
    Copy parsed values to the key names defined in CONFIG_KEY_MAPPER.

    For every key present both in ``all_parameters`` and in the mapper, the
    value is stored under the mapped name, or under each mapped name when the
    mapper holds a list. The original keys are left in place.

    Parameters
    ----------
    all_parameters : Dict[str, str]
        The parameters collected so far; updated in place.
    """
    # Iterate over a snapshot because new keys are inserted along the way.
    for source_key in list(all_parameters.keys()):
        if source_key not in CONFIG_KEY_MAPPER:
            continue
        target = CONFIG_KEY_MAPPER[source_key]
        target_keys = target if isinstance(target, list) else [target]
        for target_key in target_keys:
            all_parameters[target_key] = all_parameters[source_key]
def clean_up_parameters(all_parameters: Dict[str, str]) -> None:
    """
    Remove keys that are not part of the final parameter set and tidy values.

    Steps, all applied in place:

    1. An 'fdr' entry is copied to 'ident_fdr_psm' and 'ident_fdr_protein'
       and then removed.
    2. Every key that is neither a mapped name in CONFIG_KEY_MAPPER nor one
       of the two FDR keys is removed.
    3. String values are split on ', ' and de-duplicated, keeping first
       occurrences in order. A single remaining entry has its 'default'
       marker stripped. With several entries, an entry tagged 'user defined'
       takes precedence over entries tagged 'default'; if no entry carries
       either tag the value is left unchanged.
    4. List values are de-duplicated, keeping first occurrences in order.

    Parameters
    ----------
    all_parameters : Dict[str, str]
        The parameters collected so far; updated in place.
    """
    fdr_keys = ["ident_fdr_psm", "ident_fdr_protein"]

    # Iterate over a snapshot: the FDR keys may be inserted during the loop,
    # and changing a dict's size while iterating over it raises RuntimeError.
    keys_to_remove = []
    for key in list(all_parameters):
        if key == "fdr":
            for fdr_key in fdr_keys:
                all_parameters[fdr_key] = all_parameters[key]
            keys_to_remove.append(key)
        elif (key not in CONFIG_KEY_MAPPER.values()) and (key not in fdr_keys):
            keys_to_remove.append(key)

    for key in keys_to_remove:
        del all_parameters[key]

    # Only existing keys are reassigned below, so iterating items() is safe.
    for key, value in all_parameters.items():
        if isinstance(value, str):
            # dict.fromkeys de-duplicates while keeping first-seen order, so the
            # selected entry does not depend on string hash ordering.
            candidates = list(dict.fromkeys(value.split(", ")))
            if len(candidates) == 1:
                all_parameters[key] = DEFAULT_REGEX.sub("", candidates[0])
                continue
            for candidate in candidates:
                if "default" in candidate:
                    all_parameters[key] = DEFAULT_REGEX.sub("", candidate).replace("]", "").strip()
                if "user defined" in candidate:
                    # A user-defined entry wins over any default; stop looking.
                    all_parameters[key] = USER_DEFINED_REGEX.sub("", candidate)
                    break
        elif isinstance(value, list):
            all_parameters[key] = list(dict.fromkeys(value))
def extract_params(
    file_path: str, json_file=os.path.join(os.path.dirname(__file__), "json/Quant/quant_lfq_DIA_ion.json")
) -> Dict[str, str]:
    """
    Extract the search parameters from an AlphaDIA log file.

    The log is read line by line: key-value lines are recorded, and the
    nested precursor length, precursor charge, precursor m/z and fragment m/z
    sections are parsed into min/max values. Keys are then mapped to their
    final names, cleaned up, and a few values are reformatted.

    Parameters
    ----------
    file_path : str
        The path to the log file.
    json_file : str, optional
        Passed as ``filename`` to ProteoBenchParameters. Defaults to the
        'json/Quant/quant_lfq_DIA_ion.json' file next to this module.

    Returns
    -------
    Dict[str, str]
        A ProteoBenchParameters object built from the extracted parameters.
    """
    lines = read_file_lines(file_path)
    all_parameters = initialize_default_parameters()

    version_filled = False
    for position, raw_line in enumerate(lines):
        cleaned = clean_line(raw_line)

        if ":" in cleaned:
            version_filled = process_key_value_line(cleaned, all_parameters, version_filled)

        if "precursor_len" in cleaned:
            process_precursor_len(lines, position, all_parameters)

        if "precursor_charge" in cleaned:
            process_precursor_charge(lines, position, all_parameters)

        # Tolerance settings share the "precursor_mz"/"fragment_mz" prefix and must be skipped.
        is_range_line = "tolerance" not in cleaned

        # Only the first precursor_mz section is used.
        if "precursor_mz" in cleaned and is_range_line and not all_parameters.get("min_precursor_mz"):
            process_precursor_mz(lines, position, all_parameters)

        # Only the first fragment_mz section is used.
        if "fragment_mz" in cleaned and is_range_line and not all_parameters.get("min_fragment_mz"):
            process_fragment_mz(lines, position, all_parameters)

    map_keys_to_desired_format(all_parameters)
    clean_up_parameters(all_parameters)

    # Express each mass tolerance as a symmetric ppm window.
    for tolerance_key in ("precursor_mass_tolerance", "fragment_mass_tolerance"):
        tolerance = all_parameters[tolerance_key]
        all_parameters[tolerance_key] = "[-" + tolerance + "ppm, " + tolerance + "ppm]"

    # Convert the textual 'True' / 'False' to a boolean.
    if isinstance(all_parameters.get("enable_match_between_runs"), str):
        mbr_text = all_parameters["enable_match_between_runs"]
        all_parameters["enable_match_between_runs"] = mbr_text.strip() == "True"
    else:
        all_parameters["enable_match_between_runs"] = bool(all_parameters["enable_match_between_runs"])

    return ProteoBenchParameters(**all_parameters, filename=json_file)
if __name__ == "__main__":
    # Parse each example log and write the extracted parameters next to it as CSV.
    example_logs = [
        "../../../test/params/log_alphadia_1.txt",
        "../../../test/params/log_alphadia_2.txt",
        "../../../test/params/log_alphadia_1.8.txt",
        "../../../test/params/log_alphadia_1.10.txt",
        "../../../test/params/log_alphadia_1.12.txt",
        "../../../test/params/log_alphadia_1.12MBR.txt",
    ]
    for log_name in example_logs:
        log_path = pathlib.Path(log_name)
        parsed = extract_params(log_path)
        pd.Series(parsed.__dict__).to_csv(log_path.with_suffix(".csv"))
        print("\n" * 3)