Source code for proteobench.io.params.alphadia

"""
AlphaDIA parameter parsing.
"""

import os
import pathlib
import re
from typing import Dict, List, Tuple

import pandas as pd

from proteobench.io.params import ProteoBenchParameters

# Regular expressions used to strip log decoration from a line before parsing.
# ANSI control sequences: a CSI introducer (single byte \x9B or ESC + "["),
# followed by parameter bytes, intermediate bytes and one final byte.
ANSI_REGEX = re.compile(r"(\x9B|\x1B\[)[0-?]*[ -\/]*[@-~]")
# Timestamps of the form "H:MM:SS" with an optional fractional part,
# optionally prefixed by "<n> day," / "<n> days,".
TIMESTAMP_REGEX = re.compile(r"(\d+ days?,\s*)?(\d+):\d{2}:\d{2}\.?\d*")
# Log level tag followed by a colon, e.g. "INFO:".
DEBUG_LEVEL_REGEX = re.compile(r"(PROGRESS|INFO|WARNING|ERROR|CRITICAL|DEBUG):")
# Tree-drawing glyphs (├──, └──, │) together with the whitespace around them.
TREE_REGEX = re.compile(r"^\s*(├──|└──|\│)\s*|\s*(├──|└──|\│)\s*")
# The literal text "user defined", optionally wrapped in [ ] or ( ).
USER_DEFINED_REGEX = re.compile(r"(\[|\()?user defined(\]|\))?")
# The literal text "default" with an optional trailing colon, optionally
# wrapped in [ ] or ( ).
DEFAULT_REGEX = re.compile(r"(\[|\()?default:?(\]|\))?")

# Maps a key as collected while parsing the log (left) to the parameter name
# it is copied to by map_keys_to_desired_format (right). A list on the right
# means the same value is copied to every name in the list. The right-hand
# names are also what clean_up_parameters uses to decide which keys to keep.
CONFIG_KEY_MAPPER = {
    "version": "software_version",
    "software_name": "software_name",
    "search_engine": "search_engine",
    "search_engine_version": "search_engine_version",
    # One FDR value is used for both the PSM and the protein level.
    "fdr": ["ident_fdr_psm", "ident_fdr_protein"],
    "mbr_step_enabled": "enable_match_between_runs",
    "target_ms1_tolerance": "precursor_mass_tolerance",
    "target_ms2_tolerance": "fragment_mass_tolerance",
    "min_fragment_mz": "min_fragment_mz",
    "max_fragment_mz": "max_fragment_mz",
    "min_precursor_mz": "min_precursor_mz",
    "max_precursor_mz": "max_precursor_mz",
    "enzyme": "enzyme",
    "missed_cleavages": "allowed_miscleavages",
    "min_peptide_length": "min_peptide_length",
    "max_peptide_length": "max_peptide_length",
    "fixed_modifications": "fixed_mods",
    "variable_modifications": "variable_mods",
    "max_var_mod_num": "max_mods",
    "min_precursor_charge": "min_precursor_charge",
    "max_precursor_charge": "max_precursor_charge",
    "quantification_method": "quantification_method",
    "inference_strategy": "protein_inference",
    "predictors_library": "predictors_library",
}



[docs]
def clean_line(line: str) -> str:
    """
    Strip log decoration from a single line.

    ANSI escape codes, timestamps, log-level tags and tree-drawing glyphs are
    removed in that order; the result is then stripped of surrounding whitespace.

    Parameters
    ----------
    line : str
        The raw log line.

    Returns
    -------
    str
        The line with all decoration removed.
    """
    # The order matters only in that every pattern sees the output of the previous one.
    for pattern in (ANSI_REGEX, TIMESTAMP_REGEX, DEBUG_LEVEL_REGEX, TREE_REGEX):
        line = pattern.sub("", line)
    return line.strip()




[docs]
def parse_key_value(line: str) -> Tuple[str, str]:
    """
    Split a ``key: value`` line at its first colon.

    Parameters
    ----------
    line : str
        The line to split. It must contain at least one ``:``.

    Returns
    -------
    Tuple[str, str]
        The key and the value, each stripped of surrounding whitespace.

    Raises
    ------
    ValueError
        If the line contains no colon, so it cannot be unpacked into two parts.
    """
    # maxsplit=1 keeps any further colons (e.g. in paths or times) inside the value.
    raw_key, raw_value = line.split(":", 1)
    stripped_pair = (raw_key.strip(), raw_value.strip())
    return stripped_pair




[docs]
def detect_newer_version(lines: List[str]) -> bool:
    """
    Tell whether the log lines use the newer AlphaDIA layout (documented as >= 1.10).

    The newer layout is recognised by a cleaned line containing the text
    ``user defined, default``, i.e. both markers on the same line.

    Parameters
    ----------
    lines : List[str]
        List of log lines.

    Returns
    -------
    bool
        True if any cleaned line contains the marker, False otherwise.
    """
    marker = "user defined, default"
    # any() stops at the first matching line, like an early return would.
    return any(marker in clean_line(raw) for raw in lines)



def _extract_values_newer_version(lines: List[str], start_index: int) -> List[int]:
    """
    Collect the nested integer values of a parameter in the newer log layout (>=1.10).

    Starting at ``start_index``, the first run of digits on each non-empty
    cleaned line is collected until three values have been gathered or the
    lines run out. The last collected value is then dropped.

    Parameters
    ----------
    lines : List[str]
        List of log lines.
    start_index : int
        Index of the first line to scan.

    Returns
    -------
    List[int]
        The collected integers without the last one (at most two values).
    """
    collected = []
    for raw in lines[start_index:]:
        text = clean_line(raw)
        found = re.search(r"\d+", text) if text else None
        if found is not None:
            collected.append(int(found.group(0)))
        if len(collected) == 3:
            break
    # The final collected value is discarded regardless of how many were found.
    return collected[:-1]


def _extract_values_older_version(lines: List[str], start_index: int) -> List[int]:
    """
    Collect the nested integer values of a parameter in the older log layout (<1.10).

    Starting at ``start_index``, the first run of digits on each non-empty
    cleaned line is collected until three values have been gathered or the
    lines run out. A line containing ``user defined`` overrides the value
    collected just before it. The last collected value is then dropped.

    Parameters
    ----------
    lines : List[str]
        List of log lines.
    start_index : int
        Index of the first line to scan.

    Returns
    -------
    List[int]
        The collected integers without the last one (at most two values).
    """
    collected = []
    for raw in lines[start_index:]:
        if len(collected) == 3:
            break

        text = clean_line(raw)
        if not text:
            continue

        if "user defined" in text:
            # A user-defined line replaces the value gathered immediately before it.
            if collected:
                collected.pop()
            text = USER_DEFINED_REGEX.sub("", text)

        found = re.search(r"\d+", text)
        if found:
            collected.append(int(found.group(0)))

    return collected[:-1]



[docs]
def extract_values_from_nested_lines(lines: List[str], start_index: int, debug: bool = False) -> List[int]:
    """
    Extract the nested values of a parameter such as precursor_len.

    The log layout is detected from the lines at and after ``start_index``,
    and the matching version-specific extractor is applied.

    Parameters
    ----------
    lines : List[str]
        List of log lines.
    start_index : int
        Index of the first line to scan.
    debug : bool, optional
        Accepted for compatibility; not used by this function.

    Returns
    -------
    List[int]
        A list of integers representing the values extracted from the nested lines.
    """
    remaining = lines[start_index:]
    if detect_newer_version(remaining):
        extractor = _extract_values_newer_version
    else:
        extractor = _extract_values_older_version
    return extractor(lines, start_index)




[docs]
def read_file_lines(file_path: str) -> List[str]:
    """
    Read the lines of a log given either as a path or as an open stream.

    Parameters
    ----------
    file_path : str
        A filesystem path (``str`` or ``os.PathLike``), or a file-like object
        exposing ``read()`` that returns bytes (decoded as UTF-8) or text.

    Returns
    -------
    List[str]
        The lines of the log. Lines read from a path keep their line endings
        (``readlines``); lines read from a stream do not (``splitlines``).

    Raises
    ------
    OSError
        If a path is given and the file cannot be opened, e.g.
        ``FileNotFoundError`` for a missing file.
    UnicodeDecodeError
        If the content is not valid UTF-8.
    """
    # Dispatch on the input explicitly instead of relying on a bare ``except``:
    # that hid real I/O errors behind an AttributeError from ``str.read`` and
    # also swallowed KeyboardInterrupt/SystemExit.
    if hasattr(file_path, "read"):
        content = file_path.read()
        if isinstance(content, (bytes, bytearray)):
            content = content.decode("utf-8")
        return content.splitlines()

    with open(file_path, encoding="utf-8") as f:
        return f.readlines()




[docs]
def initialize_default_parameters() -> Dict[str, str]:
    """
    Build the parameter dictionary pre-filled with the fixed defaults.

    Returns
    -------
    Dict[str, str]
        A new dictionary on every call, holding the software and search engine
        names, the quantification method, the predictors library and a
        disabled match-between-runs flag (a bool, not a string).
    """
    defaults = {}
    defaults["software_name"] = "AlphaDIA"
    defaults["search_engine"] = "AlphaDIA"
    defaults["quantification_method"] = "DirectLFQ"
    defaults["predictors_library"] = "AlphaPeptDeep"
    defaults["enable_match_between_runs"] = False
    return defaults




[docs]
def process_key_value_line(cleaned_line: str, all_parameters: Dict[str, str], version_filled: bool) -> bool:
    """
    Record the key-value pair found on a cleaned log line.

    A new key is stored as is. A repeated key has the new value appended to
    the stored one as ``", <value>"``, unless both start with the same number
    (so that e.g. ``10`` and ``10.0`` are not recorded twice). Only the first
    ``version`` line is recorded; later ones are ignored.

    Parameters
    ----------
    cleaned_line : str
        A cleaned log line containing at least one ``:``.
    all_parameters : Dict[str, str]
        The parameters collected so far; updated in place.
    version_filled : bool
        Whether a ``version`` line has already been recorded.

    Returns
    -------
    bool
        The updated ``version_filled`` flag.
    """
    key, value = parse_key_value(cleaned_line)
    if not (key and value):
        return version_filled

    if key == "version":
        if version_filled:
            return version_filled
        version_filled = True

    if key not in all_parameters:
        all_parameters[key] = value
        return version_filled

    # NOTE(review): the stored value is assumed to be a str here (``.split`` is
    # called on it) -- confirm no caller stores a non-string under a log key.
    stored = all_parameters[key]
    try:
        incoming_number = float(value.split()[0])
        latest_number = float(stored.split(",")[-1].split()[0])
    except (ValueError, IndexError):
        # Not numeric or not parseable: fall through to concatenation.
        pass
    else:
        if incoming_number == latest_number:
            # Numerically equal to the latest stored entry: nothing to add.
            return version_filled

    all_parameters[key] += f", {value}"
    return version_filled




[docs]
def process_precursor_len(lines: List[str], index: int, all_parameters: Dict[str, str]) -> None:
    """
    Store the peptide length bounds found below a ``precursor_len`` line.

    Parameters
    ----------
    lines : List[str]
        List of log lines.
    index : int
        Index of the line holding the key; scanning starts on the next line.
    all_parameters : Dict[str, str]
        The parameters collected so far; updated in place.

    Raises
    ------
    IndexError
        If no values could be extracted.
    """
    bounds = extract_values_from_nested_lines(lines, index + 1)
    shortest, longest = bounds[0], bounds[-1]
    all_parameters["min_peptide_length"] = shortest
    all_parameters["max_peptide_length"] = longest




[docs]
def process_precursor_charge(lines: List[str], index: int, all_parameters: Dict[str, str]) -> None:
    """
    Store the precursor charge bounds found below a ``precursor_charge`` line.

    Parameters
    ----------
    lines : List[str]
        List of log lines.
    index : int
        Index of the line holding the key; scanning starts on the next line.
    all_parameters : Dict[str, str]
        The parameters collected so far; updated in place.

    Raises
    ------
    IndexError
        If no values could be extracted.
    """
    bounds = extract_values_from_nested_lines(lines, index + 1)
    lowest, highest = bounds[0], bounds[-1]
    all_parameters["min_precursor_charge"] = lowest
    all_parameters["max_precursor_charge"] = highest




[docs]
def process_precursor_mz(lines: List[str], index: int, all_parameters: Dict[str, str]) -> None:
    """
    Store the precursor m/z bounds found below a ``precursor_mz`` line.

    Parameters
    ----------
    lines : List[str]
        List of log lines.
    index : int
        Index of the line holding the key; scanning starts on the next line.
    all_parameters : Dict[str, str]
        The parameters collected so far; updated in place.

    Raises
    ------
    IndexError
        If no values could be extracted.
    """
    bounds = extract_values_from_nested_lines(lines, index + 1)
    lowest, highest = bounds[0], bounds[-1]
    all_parameters["min_precursor_mz"] = lowest
    all_parameters["max_precursor_mz"] = highest




[docs]
def process_fragment_mz(lines: List[str], index: int, all_parameters: Dict[str, str]) -> None:
    """
    Store the fragment m/z bounds found below a ``fragment_mz`` line.

    Parameters
    ----------
    lines : List[str]
        List of log lines.
    index : int
        Index of the line holding the key; scanning starts on the next line.
    all_parameters : Dict[str, str]
        The parameters collected so far; updated in place.

    Raises
    ------
    IndexError
        If no values could be extracted.
    """
    bounds = extract_values_from_nested_lines(lines, index + 1)
    lowest, highest = bounds[0], bounds[-1]
    all_parameters["min_fragment_mz"] = lowest
    all_parameters["max_fragment_mz"] = highest




[docs]
def map_keys_to_desired_format(all_parameters: Dict[str, str]) -> None:
    """
    Copy every mapped value to its target key(s), in place.

    Each key present in ``CONFIG_KEY_MAPPER`` has its value copied to the
    mapped name, or to every mapped name when the mapping is a list. The
    source keys themselves are left in the dictionary.

    Parameters
    ----------
    all_parameters : Dict[str, str]
        The parameters collected so far; updated in place.
    """
    # Snapshot the keys first: the loop body may add new keys to the dictionary.
    for source_key in tuple(all_parameters):
        if source_key not in CONFIG_KEY_MAPPER:
            continue
        target = CONFIG_KEY_MAPPER[source_key]
        target_names = target if isinstance(target, list) else [target]
        for target_name in target_names:
            all_parameters[target_name] = all_parameters[source_key]




[docs]
def clean_up_parameters(all_parameters: Dict[str, str]) -> None:
    """
    Drop unmapped keys and collapse multi-valued entries, in place.

    First, an ``fdr`` entry is copied to ``ident_fdr_psm`` and
    ``ident_fdr_protein`` and removed, and every key that is neither a target
    name in ``CONFIG_KEY_MAPPER`` nor one of the two FDR keys is removed.

    Then every remaining string value is split on ``", "`` and de-duplicated
    (keeping first-seen order):

    * a single candidate is stored with its ``default`` marker removed;
    * with several candidates, a ``user defined`` candidate wins over a
      ``default`` one, with the respective marker removed;
    * if no candidate carries a marker, the value is left unchanged.

    List values are de-duplicated, keeping first-seen order.

    Parameters
    ----------
    all_parameters : Dict[str, str]
        The parameters to clean; updated in place.
    """
    fdr_keys = ["ident_fdr_psm", "ident_fdr_protein"]
    keys_to_remove = []
    # Iterate over a snapshot of the keys: the "fdr" branch may insert the FDR
    # keys, and inserting into a dict while iterating its live view raises
    # RuntimeError.
    for key in list(all_parameters):
        if key == "fdr":
            for fdr_key in fdr_keys:
                all_parameters[fdr_key] = all_parameters[key]
            keys_to_remove.append(key)
        elif (key not in CONFIG_KEY_MAPPER.values()) and (key not in fdr_keys):
            keys_to_remove.append(key)

    for key in keys_to_remove:
        del all_parameters[key]

    for key, value in list(all_parameters.items()):
        if isinstance(value, str):
            # dict.fromkeys de-duplicates while keeping the order of appearance;
            # a set would make the winning candidate depend on hash ordering.
            candidates = list(dict.fromkeys(value.split(", ")))
            if len(candidates) == 1:
                all_parameters[key] = DEFAULT_REGEX.sub("", candidates[0])
                continue

            for candidate in candidates:
                if "default" in candidate:
                    all_parameters[key] = DEFAULT_REGEX.sub("", candidate).replace("]", "").strip()

                # Deliberately not ``elif``: a user-defined value takes
                # precedence and ends the search.
                if "user defined" in candidate:
                    all_parameters[key] = USER_DEFINED_REGEX.sub("", candidate)
                    break
        elif isinstance(value, list):
            all_parameters[key] = list(dict.fromkeys(value))




[docs]
def extract_params(
    file_path: str, json_file=os.path.join(os.path.dirname(__file__), "json/Quant/quant_lfq_DIA_ion.json")
) -> Dict[str, str]:
    """
    Parse an AlphaDIA log and build the corresponding parameter object.

    Parameters
    ----------
    file_path : str
        The log to parse, in any form accepted by ``read_file_lines``.
    json_file : str, optional
        Passed to ``ProteoBenchParameters`` as ``filename``. Defaults to
        ``json/Quant/quant_lfq_DIA_ion.json`` next to this module.

    Returns
    -------
    Dict[str, str]
        Despite the annotation, a ``ProteoBenchParameters`` instance built
        from the extracted parameters.

    Raises
    ------
    KeyError
        If the log yields no precursor or fragment mass tolerance.
    """
    log_lines = read_file_lines(file_path)
    all_parameters = initialize_default_parameters()
    version_filled = False

    for position, raw_line in enumerate(log_lines):
        text = clean_line(raw_line)

        if ":" in text:
            version_filled = process_key_value_line(text, all_parameters, version_filled)

        if "precursor_len" in text:
            process_precursor_len(log_lines, position, all_parameters)

        if "precursor_charge" in text:
            process_precursor_charge(log_lines, position, all_parameters)

        # ``*_mz_tolerance`` lines mention the range keys but are not range definitions.
        not_tolerance = "tolerance" not in text

        # Only the first precursor m/z range found is used.
        if "precursor_mz" in text and not_tolerance and not all_parameters.get("min_precursor_mz"):
            process_precursor_mz(log_lines, position, all_parameters)

        # Only the first fragment m/z range found is used.
        if "fragment_mz" in text and not_tolerance and not all_parameters.get("min_fragment_mz"):
            process_fragment_mz(log_lines, position, all_parameters)

    map_keys_to_desired_format(all_parameters)
    clean_up_parameters(all_parameters)

    # Render each tolerance as a symmetric ppm window.
    for tolerance_key in ("precursor_mass_tolerance", "fragment_mass_tolerance"):
        tolerance = all_parameters[tolerance_key]
        all_parameters[tolerance_key] = "[-" + tolerance + "ppm, " + tolerance + "ppm]"

    # 'True' and 'False' to boolean
    mbr_setting = all_parameters.get("enable_match_between_runs")
    if isinstance(mbr_setting, str):
        mbr_enabled = mbr_setting.strip() == "True"
    else:
        mbr_enabled = bool(all_parameters["enable_match_between_runs"])
    all_parameters["enable_match_between_runs"] = mbr_enabled

    return ProteoBenchParameters(**all_parameters, filename=json_file)



if __name__ == "__main__":
    # Parse each example log and write the extracted parameters next to it as CSV.
    example_logs = (
        "../../../test/params/log_alphadia_1.txt",
        "../../../test/params/log_alphadia_2.txt",
        "../../../test/params/log_alphadia_1.8.txt",
        "../../../test/params/log_alphadia_1.10.txt",
        "../../../test/params/log_alphadia_1.12.txt",
        "../../../test/params/log_alphadia_1.12MBR.txt",
        "../../../test/params/alphadia_weird_lengths.txt",
    )
    for fname in example_logs:
        log_path = pathlib.Path(fname)
        extracted = extract_params(log_path)
        as_series = pd.Series(extracted.__dict__)
        as_series.to_csv(log_path.with_suffix(".csv"))
        print("\n" * 3)