"""
Module-level validation configuration.
:class:`ModuleValidationConfig` collects the small amount of per-module
information the validator needs that is not part of the standardized result
DataFrame or the parsed parameters: the standardized column names, the
protein-group separators, the contaminant flag and decoy prefixes used to skip
non-target identifiers, and the reference FASTA location.
The ``validation_profile`` field selects which set of checks the orchestrator
runs. It is the name of a profile registered in
:mod:`proteobench.validation.profiles`. It is resolved from the first of the
following sources that provides one (in order of precedence):

1. an explicit ``[validation].profile`` key in the module's ``module_settings.toml``
   (the declarative path: adding a new module of an existing category is config-only);
2. the profile inferred from the module's parser class via the existing ``MODULE_TO_CLASS``
   registry (``ParseSettingsQuant`` -> ``"quant_lfq"``, ``ParseSettingsDeNovo`` -> ``"denovo"``);
3. the :data:`DEFAULT_VALIDATION_PROFILE` fallback.
A genuinely new category of module is supported by registering a new profile
in ``profiles.py`` (or from third-party code) and pointing the module at it via
the TOML key; the orchestrator itself never changes.
The reference FASTA is read from an optional ``[reference_database]`` section in
the module's ``module_settings.toml`` (beside ``[species_expected_ratio]`` and
``[general]``). Module types whose reference is not a FASTA (e.g. de novo, which
compares against a ground-truth table) simply omit ``fasta_url``.
Example ``module_settings.toml`` sections::

    [reference_database]
    "fasta_url" = "https://proteobench.cubimed.rub.de/fasta/ProteoBenchFASTA_MixedSpecies_HYE.zip"

    [validation]
    "profile" = "quant_lfq"
    # optional mass-tolerance plausibility ceilings (no default; skipped if unset):
    # "max_plausible_ppm" = 1000.0
    # "max_plausible_dalton" = 10.0
"""
from __future__ import annotations
import os
from dataclasses import dataclass, field
from typing import Optional, Tuple
import toml
from proteobench.validation.protein_ids import DEFAULT_GROUP_SEPARATORS
#: Fallback profile name, used when neither the module configuration nor the
#: parser class yields one.
DEFAULT_VALIDATION_PROFILE = "quant_lfq"

#: Default validation profile per parser-class name. Only consulted when the
#: module declares no ``[validation].profile`` of its own.
_PROFILE_BY_PARSER_CLASS = dict(
    ParseSettingsQuant="quant_lfq",
    ParseSettingsDeNovo="denovo",
)

#: Frequently used decoy-identifier prefixes. The ParseSettings configuration
#: flags decoys through a boolean ``Reverse`` column instead of an accession
#: prefix, so this tuple serves as a tool-agnostic fallback for skipping decoy
#: proteins.
DEFAULT_DECOY_PREFIXES = (
    "rev_",
    "rev__",
    "decoy_",
    "decoy",
    "reverse_",
    "##",
)
def _resolve_profile(module_id: str, declared_profile: Optional[str]) -> str:
    """
    Resolve the validation profile name for a module.

    Resolution order: an explicit profile declared in ``module_settings.toml``
    wins; otherwise the profile is inferred from the module's parser class via
    the existing ``MODULE_TO_CLASS`` registry; otherwise
    :data:`DEFAULT_VALIDATION_PROFILE` is used.

    Parameters
    ----------
    module_id : str
        The module identifier.
    declared_profile : str or None
        The profile name declared in ``[validation].profile``, if any.
        Surrounding whitespace is ignored; a non-string or blank value is
        treated as "not declared".

    Returns
    -------
    str
        The resolved profile name.
    """
    # An explicit declaration always wins. Strip it first so a stray space or
    # newline in the TOML value cannot yield a name that matches no profile.
    if isinstance(declared_profile, str):
        declared = declared_profile.strip()
        if declared:
            return declared

    inferred = None
    # The registry is imported inside the ``try`` so that an import failure is
    # handled like any other inference failure. The catch is deliberately
    # broad: resolution is best-effort and falls through to the default.
    try:
        from proteobench.io.parsing.parse_settings import MODULE_TO_CLASS

        parser_cls = MODULE_TO_CLASS.get(module_id)
        if parser_cls is not None:
            inferred = _PROFILE_BY_PARSER_CLASS.get(parser_cls.__name__)
    except Exception:
        import logging

        # Keep the fallback, but leave a trace instead of failing silently.
        logging.getLogger(__name__).debug(
            "Could not infer a validation profile for module %r; using the default.",
            module_id,
            exc_info=True,
        )
        inferred = None

    return inferred or DEFAULT_VALIDATION_PROFILE
[docs]
@dataclass
class ModuleValidationConfig:
    """
    Per-module configuration for submission validation.

    Attributes
    ----------
    protein_column : str, optional
        Column holding protein identifiers in the standardized DataFrame.
        Default ``"Proteins"``.
    sequence_column : str, optional
        Column holding the (plain) peptide sequence. Default ``"Sequence"``.
    charge_column : str, optional
        Column holding the precursor charge. Default ``"Charge"``.
    proforma_column : str, optional
        Column holding the ProForma modified sequence. Default ``"proforma"``.
    contaminant_column : str, optional
        Boolean column flagging contaminant rows. Default ``"contaminant"``.
    contaminant_flag : str, optional
        Substring marking contaminant proteins (from the tool parse settings,
        e.g. ``"Cont_"``).
    decoy_prefixes : tuple of str, optional
        Prefixes marking decoy proteins. Defaults to :data:`DEFAULT_DECOY_PREFIXES`.
    protein_group_separators : tuple of str, optional
        Separators used to split protein groups. Defaults to
        :data:`~proteobench.validation.protein_ids.DEFAULT_GROUP_SEPARATORS`.
    fasta_url : str, optional
        URL of the reference FASTA / zip / gzip for the module.
    fasta_filename : str, optional
        Preferred FASTA member name when the resource is an archive.
    species_flags : tuple of str, optional
        Species names configured for the module (e.g. ``("YEAST", "ECOLI", "HUMAN")``),
        derived from the tool's species mapper. Currently informational.
    recommended_max_fdr_psm : float, optional
        Recommended maximum PSM-level FDR for the benchmark. A parsed FDR above
        this value produces a warning. Default ``0.01`` (1%). Set to ``None`` to
        disable the recommendation check.
    max_plausible_ppm : float, optional
        Plausibility ceiling for ppm mass tolerances. A parsed tolerance above
        this value produces a warning. No default (``None``); when unset, the
        implausible-value check is skipped. Set via ``[validation]`` in
        ``module_settings.toml``.
    max_plausible_dalton : float, optional
        Plausibility ceiling for absolute (Da / Th / amu) mass tolerances, scaled
        by 1000 for mmu. No default (``None``); when unset, the implausible-value
        check is skipped. Set via ``[validation]`` in ``module_settings.toml``.
    validation_profile : str, optional
        Name of the registered profile whose checks the orchestrator runs. Set
        automatically by :meth:`from_parse_settings`; defaults to
        :data:`DEFAULT_VALIDATION_PROFILE` for direct construction so that the
        existing quant behaviour is preserved.
    """

    protein_column: str = "Proteins"
    sequence_column: str = "Sequence"
    charge_column: str = "Charge"
    proforma_column: str = "proforma"
    contaminant_column: str = "contaminant"
    contaminant_flag: Optional[str] = None
    decoy_prefixes: Tuple[str, ...] = DEFAULT_DECOY_PREFIXES
    protein_group_separators: Tuple[str, ...] = tuple(DEFAULT_GROUP_SEPARATORS)
    fasta_url: Optional[str] = None
    fasta_filename: Optional[str] = None
    species_flags: Tuple[str, ...] = field(default_factory=tuple)
    recommended_max_fdr_psm: Optional[float] = 0.01
    max_plausible_ppm: Optional[float] = None
    max_plausible_dalton: Optional[float] = None
    validation_profile: str = DEFAULT_VALIDATION_PROFILE

    @classmethod
    def from_parse_settings(
        cls,
        parse_settings_dir: str,
        module_id: str,
        input_format: str,
    ) -> "ModuleValidationConfig":
        """
        Build a config from the existing parse settings of a module/tool.

        This reuses :class:`~proteobench.io.parsing.parse_settings.ParseSettingsBuilder`
        to read the contaminant flag and species flags for the selected tool,
        reads the optional ``[reference_database]`` and ``[validation]`` sections
        from the module's ``module_settings.toml``, and resolves the validation
        profile.

        Parameters
        ----------
        parse_settings_dir : str
            Directory containing the module's parse settings (the module's
            ``parse_settings_dir`` attribute).
        module_id : str
            The module identifier (e.g. ``"quant_lfq_DDA_ion_QExactive"``).
        input_format : str
            The selected software tool (e.g. ``"MaxQuant"``).

        Returns
        -------
        ModuleValidationConfig
            Configuration populated from the parse settings. Falls back to the
            defaults for any value that cannot be read, including sections
            that are not TOML tables and plausibility ceilings that are not
            numbers.
        """
        config = cls()

        # Best effort: read the contaminant flag and species from the tool parser.
        # Deliberately broad so validation never crashes on a parser issue; the
        # affected fields simply keep the values they had when the error occurred.
        try:
            from proteobench.io.parsing.parse_settings import ParseSettingsBuilder

            builder = ParseSettingsBuilder(parse_settings_dir=parse_settings_dir, module_id=module_id)
            parser = builder.build_parser(input_format)
            config.contaminant_flag = getattr(parser, "contaminant_flag", None)
            species = parser.species_dict() if hasattr(parser, "species_dict") else {}
            config.species_flags = tuple(species.values())
        except Exception:
            pass

        # Read the module settings directly from disk (independent of the parser)
        # so the reference and profile resolve even if the parser cannot be built.
        module_settings = cls._load_module_settings(parse_settings_dir)

        reference = cls._settings_table(module_settings, "reference_database")
        config.fasta_url = reference.get("fasta_url")
        config.fasta_filename = reference.get("fasta_filename")

        validation_section = cls._settings_table(module_settings, "validation")
        config.validation_profile = _resolve_profile(module_id, validation_section.get("profile"))
        # Only real numbers are usable as ceilings; anything else counts as unset.
        config.max_plausible_ppm = cls._optional_number(validation_section.get("max_plausible_ppm"))
        config.max_plausible_dalton = cls._optional_number(validation_section.get("max_plausible_dalton"))
        return config

    @staticmethod
    def read_reference_database(parse_settings_dir: str) -> dict:
        """
        Read the ``[reference_database]`` section of a module's settings.

        Parameters
        ----------
        parse_settings_dir : str
            Directory containing the module's ``module_settings.toml``.

        Returns
        -------
        dict
            The ``[reference_database]`` table, or an empty dict if the file
            cannot be read or the section is absent or not a table.
        """
        module_settings = ModuleValidationConfig._load_module_settings(parse_settings_dir)
        return ModuleValidationConfig._settings_table(module_settings, "reference_database")

    @staticmethod
    def _load_module_settings(parse_settings_dir: str) -> dict:
        """Load ``module_settings.toml`` from *parse_settings_dir*; return ``{}`` on any failure."""
        # Building the path is inside the ``try`` as well, so an unusable
        # directory value is handled like a missing or malformed file.
        try:
            loaded = toml.load(os.path.join(parse_settings_dir, "module_settings.toml"))
        except Exception:
            return {}
        return loaded if isinstance(loaded, dict) else {}

    @staticmethod
    def _settings_table(module_settings: dict, name: str) -> dict:
        """Return the table stored under *name*, or ``{}`` when it is absent or not a table."""
        section = module_settings.get(name)
        return section if isinstance(section, dict) else {}

    @staticmethod
    def _optional_number(value: object) -> Optional[float]:
        """Return *value* unchanged when it is an int or float (not a bool), else ``None``."""
        # ``bool`` is a subclass of ``int``; a TOML ``true``/``false`` is not a ceiling.
        if isinstance(value, (int, float)) and not isinstance(value, bool):
            return value
        return None