Source code for proteobench.validation.config

"""
Module-level validation configuration.

:class:`ModuleValidationConfig` collects the small amount of per-module
information the validator needs that is not part of the standardized result
DataFrame or the parsed parameters: the standardized column names, the
protein-group separators, the contaminant flag and decoy prefixes used to skip
non-target identifiers, and the reference FASTA location.

The ``validation_profile`` field selects which set of checks the orchestrator
runs. It is the name of a profile registered in
:mod:`proteobench.validation.profiles`. It is resolved (in order of precedence):

1. an explicit ``[validation].profile`` key in the module's ``module_settings.toml``
   (the declarative path: adding a new module of an existing category is config-only);
2. inferred from the module's parser class via the existing ``MODULE_TO_CLASS``
   registry (``ParseSettingsQuant`` -> ``"quant_lfq"``, ``ParseSettingsDeNovo`` -> ``"denovo"``);
3. the :data:`DEFAULT_VALIDATION_PROFILE` fallback.

A genuinely new category of module is supported by registering a new profile
in ``profiles.py`` (or from third-party code) and pointing the module at it via
the TOML key; the orchestrator itself never changes.

The reference FASTA is read from an optional ``[reference_database]`` section in
the module's ``module_settings.toml`` (beside ``[species_expected_ratio]`` and
``[general]``). Module types whose reference is not a FASTA (e.g. de novo, which
compares against a ground-truth table) simply omit ``fasta_url``.

Example ``module_settings.toml`` sections::

    [reference_database]
    "fasta_url" = "https://proteobench.cubimed.rub.de/fasta/ProteoBenchFASTA_MixedSpecies_HYE.zip"

    [validation]
    "profile" = "quant_lfq"
    # optional mass-tolerance plausibility ceilings (no default; skipped if unset):
    # "max_plausible_ppm" = 1000.0
    # "max_plausible_dalton" = 10.0
"""

from __future__ import annotations

import os
from dataclasses import dataclass, field
from typing import Optional, Tuple

import toml

from proteobench.validation.protein_ids import DEFAULT_GROUP_SEPARATORS

#: Profile used when none can be resolved from config or the parser class.
DEFAULT_VALIDATION_PROFILE = "quant_lfq"

#: Maps a parser class name to the default validation profile for that family.
#: Resolution falls back to this when no ``[validation].profile`` is declared.
_PROFILE_BY_PARSER_CLASS = {
    "ParseSettingsQuant": "quant_lfq",
    "ParseSettingsDeNovo": "denovo",
}

#: Common decoy-identifier prefixes. The ParseSettings configuration marks
#: decoys via a boolean ``Reverse`` column rather than an accession prefix, so
#: these defaults provide a tool-agnostic fallback for skipping decoy proteins.
DEFAULT_DECOY_PREFIXES = ("rev_", "rev__", "decoy_", "decoy", "reverse_", "##")


def _resolve_profile(module_id: str, declared_profile: Optional[str]) -> str:
    """
    Resolve the validation profile name for a module.

    Resolution order: an explicit profile declared in ``module_settings.toml``
    wins; otherwise the profile is inferred from the module's parser class via
    the existing ``MODULE_TO_CLASS`` registry; otherwise
    :data:`DEFAULT_VALIDATION_PROFILE` is used.

    Parameters
    ----------
    module_id : str
        The module identifier.
    declared_profile : str or None
        The profile name declared in ``[validation].profile``, if any.

    Returns
    -------
    str
        The resolved profile name.
    """
    if isinstance(declared_profile, str) and declared_profile:
        return declared_profile

    try:
        from proteobench.io.parsing.parse_settings import MODULE_TO_CLASS

        parser_cls = MODULE_TO_CLASS.get(module_id)
        if parser_cls is not None:
            inferred = _PROFILE_BY_PARSER_CLASS.get(parser_cls.__name__)
            if inferred:
                return inferred
    except Exception:
        pass

    return DEFAULT_VALIDATION_PROFILE


[docs] @dataclass class ModuleValidationConfig: """ Per-module configuration for submission validation. Attributes ---------- protein_column : str, optional Column holding protein identifiers in the standardized DataFrame. Default ``"Proteins"``. sequence_column : str, optional Column holding the (plain) peptide sequence. Default ``"Sequence"``. charge_column : str, optional Column holding the precursor charge. Default ``"Charge"``. proforma_column : str, optional Column holding the ProForma modified sequence. Default ``"proforma"``. contaminant_column : str, optional Boolean column flagging contaminant rows. Default ``"contaminant"``. contaminant_flag : str, optional Substring marking contaminant proteins (from the tool parse settings, e.g. ``"Cont_"``). decoy_prefixes : tuple of str, optional Prefixes marking decoy proteins. Defaults to :data:`DEFAULT_DECOY_PREFIXES`. protein_group_separators : tuple of str, optional Separators used to split protein groups. Defaults to :data:`~proteobench.validation.protein_ids.DEFAULT_GROUP_SEPARATORS`. fasta_url : str, optional URL of the reference FASTA / zip / gzip for the module. fasta_filename : str, optional Preferred FASTA member name when the resource is an archive. species_flags : tuple of str, optional Species names configured for the module (e.g. ``("YEAST", "ECOLI", "HUMAN")``), derived from the tool's species mapper. Currently informational. recommended_max_fdr_psm : float, optional Recommended maximum PSM-level FDR for the benchmark. A parsed FDR above this value produces a warning. Default ``0.01`` (1%). Set to ``None`` to disable the recommendation check. max_plausible_ppm : float, optional Plausibility ceiling for ppm mass tolerances. A parsed tolerance above this value produces a warning. No default (``None``); when unset, the implausible-value check is skipped. Set via ``[validation]`` in ``module_settings.toml``. max_plausible_dalton : float, optional Plausibility ceiling for absolute (Da / Th / amu) mass tolerances, scaled by 1000 for mmu. No default (``None``); when unset, the implausible-value check is skipped. Set via ``[validation]`` in ``module_settings.toml``. validation_profile : str, optional Name of the registered profile whose checks the orchestrator runs. Set automatically by :meth:`from_parse_settings`; defaults to :data:`DEFAULT_VALIDATION_PROFILE` for direct construction so that the existing quant behaviour is preserved. """ protein_column: str = "Proteins" sequence_column: str = "Sequence" charge_column: str = "Charge" proforma_column: str = "proforma" contaminant_column: str = "contaminant" contaminant_flag: Optional[str] = None decoy_prefixes: Tuple[str, ...] = DEFAULT_DECOY_PREFIXES protein_group_separators: Tuple[str, ...] = tuple(DEFAULT_GROUP_SEPARATORS) fasta_url: Optional[str] = None fasta_filename: Optional[str] = None species_flags: Tuple[str, ...] = field(default_factory=tuple) recommended_max_fdr_psm: Optional[float] = 0.01 max_plausible_ppm: Optional[float] = None max_plausible_dalton: Optional[float] = None validation_profile: str = DEFAULT_VALIDATION_PROFILE
[docs] @classmethod def from_parse_settings( cls, parse_settings_dir: str, module_id: str, input_format: str, ) -> "ModuleValidationConfig": """ Build a config from the existing parse settings of a module/tool. This reuses :class:`~proteobench.io.parsing.parse_settings.ParseSettingsBuilder` to read the contaminant flag and species flags for the selected tool, reads the optional ``[reference_database]`` and ``[validation]`` sections from the module's ``module_settings.toml``, and resolves the validation profile. Parameters ---------- parse_settings_dir : str Directory containing the module's parse settings (the module's ``parse_settings_dir`` attribute). module_id : str The module identifier (e.g. ``"quant_lfq_DDA_ion_QExactive"``). input_format : str The selected software tool (e.g. ``"MaxQuant"``). Returns ------- ModuleValidationConfig Configuration populated from the parse settings. Falls back to the defaults for any value that cannot be read. """ config = cls() # Best effort: read the contaminant flag and species from the tool parser. # Wrapped defensively so validation never crashes on a parser issue. try: from proteobench.io.parsing.parse_settings import ParseSettingsBuilder builder = ParseSettingsBuilder(parse_settings_dir=parse_settings_dir, module_id=module_id) parser = builder.build_parser(input_format) config.contaminant_flag = getattr(parser, "contaminant_flag", None) species = parser.species_dict() if hasattr(parser, "species_dict") else {} config.species_flags = tuple(species.values()) except Exception: pass # Read the module settings directly from disk (independent of the parser) # so the reference and profile resolve even if the parser cannot be built. module_settings = {} try: module_settings = toml.load(os.path.join(parse_settings_dir, "module_settings.toml")) except Exception: module_settings = {} reference = module_settings.get("reference_database", {}) or {} config.fasta_url = reference.get("fasta_url") config.fasta_filename = reference.get("fasta_filename") validation_section = module_settings.get("validation", {}) or {} declared_profile = validation_section.get("profile") config.validation_profile = _resolve_profile(module_id, declared_profile) config.max_plausible_ppm = validation_section.get("max_plausible_ppm") config.max_plausible_dalton = validation_section.get("max_plausible_dalton") return config
[docs] @staticmethod def read_reference_database(parse_settings_dir: str) -> dict: """ Read the ``[reference_database]`` section of a module's settings. Parameters ---------- parse_settings_dir : str Directory containing the module's ``module_settings.toml``. Returns ------- dict The ``[reference_database]`` table, or an empty dict if absent. """ path = os.path.join(parse_settings_dir, "module_settings.toml") try: module_settings = toml.load(path) except Exception: return {} return module_settings.get("reference_database", {}) or {}