# Source code for proteobench.validation.checks

"""
Individual validation checks operating on the standardized result DataFrame.

Every check is a pure function that takes the standardized DataFrame, the
parsed :class:`~proteobench.io.params.ProteoBenchParameters` (or any object with
the same attributes), and a :class:`~proteobench.validation.config.ModuleValidationConfig`,
and returns a list of :class:`~proteobench.validation.report.ValidationIssue`.

The checks are deliberately generic: they read the standardized columns
(``Proteins``, ``Sequence``, ``Charge``, ``proforma``) and the parameter
attributes, never tool-specific result columns. Missing or unparsed parameters
yield warnings rather than errors, so a submission is never blocked merely
because a value could not be parsed.

Documented limitations and intentionally skipped checks:

* **Enzyme specificity**: a missed-cleavage heuristic is implemented for common
  C-terminal cleaving enzymes (trypsin, trypsin/P, Lys-C, Arg-C, Glu-C,
  chymotrypsin) and only as a *warning*. It ignores protein N-/C-termini and
  ragged ends (resolving those would need the reference protein sequences), and
  N-terminal cleavers (Asp-N, Lys-N) are skipped.
* **Modifications**: cross-tool modification representations are not normalized
  (human-readable names, UniMod accessions, and raw masses all occur). Only
  human-readable modification names observed in the ``proforma`` column are
  compared, as warnings; mass-only / UniMod-only tokens are skipped. The
  maximum-modifications count includes any fixed modifications written into the
  sequence, so it is an upper bound (warning only).
* **Mass tolerances**: there is no per-result tolerance to compare against, so
  the precursor/fragment tolerances are only sanity-checked (present, numeric,
  positive), as warnings. An optional plausibility ceiling
  (``max_plausible_ppm`` / ``max_plausible_dalton`` on the config) has no
  default; the implausible-value check is skipped unless a module configures it.
* **PSM FDR**: validated against the valid ``[0, 1]`` range and the benchmark's
  recommended maximum (configurable), as warnings.
* **Run identity**: ``ProteoBenchParameters`` does not expose raw-file, sample,
  or experiment identifiers, so result-vs-parameter run matching is limited to
  software identity. This is reported as info.
"""

from __future__ import annotations

import re
from typing import Any, List, Optional

import numpy as np
import pandas as pd

from proteobench.validation.config import ModuleValidationConfig
from proteobench.validation.fasta import FastaReference
from proteobench.validation.protein_ids import extract_identifiers, is_decoy_or_contaminant, split_protein_groups
from proteobench.validation.report import ValidationIssue, ValidationReport

#: Maximum number of example offending protein identifiers to report
#: (the cap applied to the ``examples`` list in the protein-identifier check).
MAX_PROTEIN_EXAMPLES = 20

#: Maximum number of example offending rows to report for other checks
#: (peptides, peptidoforms, modification names, ...).
MAX_ROW_EXAMPLES = 10

#: Matches a bracketed modification label inside a ProForma string.
#: The single capture group is the text between ``[`` and ``]``.
_PROFORMA_MOD = re.compile(r"\[([^\]]+)\]")

#: C-terminal cleavage rules per normalized enzyme name: a tuple of
#: (residues the enzyme cleaves after, whether it cleaves when proline follows).
#: Keys are in the lower-case, separator-free form produced by
#: ``_normalize_enzyme`` (e.g. ``"lysc"`` for ``"Lys-C"``).
#: A value of ``None`` marks an N-terminal cleaver, for which the simple
#: internal-site count does not apply (skipped with an info message).
#: The rules follow the MaxQuant built-in enzyme defaults (e.g. Glu-C cleaves
#: after D and E). Because the missed-cleavage check is warning-only, these are
#: convention-dependent heuristics, not authoritative cleavage definitions.
_ENZYME_CLEAVAGE_RULES = {
    "trypsin": ("KR", False),
    "trypsin/p": ("KR", True),
    "lysc": ("K", True),
    "argc": ("R", False),
    "gluc": ("DE", True),
    "chymotrypsin": ("FYWL", False),
    "lysn": None,
    "aspn": None,
}

#: Matches a signed number (including scientific notation) and an optional unit
#: in a tolerance string such as ``"[-20.0 ppm, 20.0 ppm]"`` or ``"2e-3 Da"``.
#: Group 1 is the number, group 2 the unit (matched case-insensitively; absent
#: when no recognised unit follows). The number must start with a digit, so a
#: leading-dot decimal such as ``".5"`` is matched only from the ``5`` onwards.
_TOLERANCE_TOKEN = re.compile(r"(-?\d+(?:\.\d+)?(?:[eE][+-]?\d+)?)\s*(ppm|da|th|mmu|amu)?", re.IGNORECASE)


def _normalize_enzyme(name: str) -> str:
    """
    Normalize an enzyme name for cleavage-rule lookup.

    Lower-cases the name and removes spaces, underscores, and hyphens so that
    ``"Lys-C"``, ``"lys_c"``, and ``"LysC"`` all map to ``"lysc"`` (the slash in
    ``"Trypsin/P"`` is preserved).

    Parameters
    ----------
    name : str
        The raw enzyme name.

    Returns
    -------
    str
        The normalized enzyme key.
    """
    text = str(name).strip().lower()
    for char in (" ", "_", "-"):
        text = text.replace(char, "")
    return text


def _is_missing(value: Any) -> bool:
    """
    Determine whether a parameter value should be treated as "not provided".

    Treats ``None``, ``np.nan``, and the literal strings ``"None"``/``"nan"``/``""``
    as missing (matching how :class:`ProteoBenchParameters` represents absent values).

    Parameters
    ----------
    value : Any
        The value to test.

    Returns
    -------
    bool
        ``True`` if the value is missing.
    """
    if value is None:
        return True
    if isinstance(value, float) and np.isnan(value):
        return True
    if isinstance(value, str) and value.strip().lower() in {"", "none", "nan"}:
        return True
    return False


def _as_int(value: Any) -> Optional[int]:
    """
    Coerce a value to ``int`` if possible.

    The value is first converted to ``float`` and then truncated toward zero,
    so ``"2.0"`` and ``2.7`` both become ``2``. Values that cannot be expressed
    as a finite integer (missing, non-numeric, or infinite) yield ``None``
    instead of raising, so a mis-parsed parameter never aborts validation.

    Parameters
    ----------
    value : Any
        The value to coerce.

    Returns
    -------
    int or None
        The integer value, or ``None`` if it is missing or not convertible.
    """
    if _is_missing(value):
        return None
    try:
        return int(float(value))
    except (TypeError, ValueError, OverflowError):
        # OverflowError: int(float("inf")) -- an infinite value is as unusable
        # as an unparsable one, so report it the same way.
        return None


def _as_float(value: Any) -> Optional[float]:
    """
    Convert a value to ``float`` when that is possible.

    Parameters
    ----------
    value : Any
        The value to coerce.

    Returns
    -------
    float or None
        The converted number, or ``None`` when the value is missing or the
        conversion raises ``TypeError`` / ``ValueError``.
    """
    if _is_missing(value):
        return None
    try:
        number = float(value)
    except (TypeError, ValueError):
        return None
    else:
        return number


def _format_range(minimum: Optional[int], maximum: Optional[int]) -> str:
    """
    Format an inclusive numeric range for display.

    Parameters
    ----------
    minimum : int or None
        Lower bound (``None`` means unbounded).
    maximum : int or None
        Upper bound (``None`` means unbounded).

    Returns
    -------
    str
        A string such as ``"[2, 4]"`` or ``"[2, unbounded]"``.
    """
    low = "unbounded" if minimum is None else str(minimum)
    high = "unbounded" if maximum is None else str(maximum)
    return f"[{low}, {high}]"


def _identifier_series(df: pd.DataFrame, config: ModuleValidationConfig) -> Optional[pd.Series]:
    """
    Pick the best per-row identifier column for example reporting.

    Parameters
    ----------
    df : pandas.DataFrame
        The standardized result DataFrame.
    config : ModuleValidationConfig
        Module validation configuration.

    Returns
    -------
    pandas.Series or None
        A series of human-readable row identifiers, or ``None`` if unavailable.
    """
    for column in ("precursor ion", "peptidoform", config.sequence_column):
        if column in df.columns:
            return df[column].astype(str)
    return None


def check_protein_ids(
    df: pd.DataFrame,
    fasta: FastaReference,
    config: ModuleValidationConfig,
) -> List[ValidationIssue]:
    """
    Compare the reported protein identifiers with the reference FASTA.

    Each cell of the protein column is split into its protein-group members.
    Decoy and contaminant members are ignored; every remaining unique
    identifier must be known to the reference, otherwise a single error
    summarising the unknown identifiers is raised.

    Parameters
    ----------
    df : pandas.DataFrame
        The standardized result DataFrame.
    fasta : FastaReference
        Reference protein identifiers.
    config : ModuleValidationConfig
        Module validation configuration.

    Returns
    -------
    list of ValidationIssue
        A warning when the column is absent or holds no target identifiers,
        an error listing identifiers missing from the reference, or an info
        confirming that all identifiers were found.
    """
    check = "protein_ids"
    column = config.protein_column
    report = ValidationReport()

    if column not in df.columns:
        report.add_warning(
            "protein_column_missing",
            f"Protein column '{column}' not found in the standardized results; "
            "protein-identifier validation was skipped.",
            check,
            field=column,
        )
        return report.issues

    # Gather the unique target (non-decoy, non-contaminant) tokens.
    targets: set = set()
    for cell in df[column].dropna().unique():
        for token in split_protein_groups(cell, config.protein_group_separators):
            if not is_decoy_or_contaminant(token, config.contaminant_flag, config.decoy_prefixes):
                targets.add(token)

    if not targets:
        report.add_warning(
            "no_protein_ids",
            "No target protein identifiers were found in the results (all empty, decoy, or contaminant).",
            check,
            field=column,
        )
        return report.issues

    absent = sorted(token for token in targets if not fasta.contains_any(extract_identifiers(token)))
    n_unique = len(targets)
    n_missing = len(absent)
    n_found = n_unique - n_missing

    if n_missing == 0:
        report.add_info(
            "protein_ids_ok",
            f"All {n_unique} unique protein identifiers were found in the reference database.",
            check,
            field=column,
            observed={"n_unique": n_unique, "n_found": n_found, "n_missing": 0},
        )
        return report.issues

    report.add_error(
        "protein_not_in_fasta",
        f"{n_missing} of {n_unique} unique protein identifiers are not present in the reference "
        f"database ({n_found} found). These are non-decoy, non-contaminant identifiers and likely "
        "indicate the wrong FASTA was used or proteins outside the benchmark.",
        check,
        field=column,
        observed={"n_unique": n_unique, "n_found": n_found, "n_missing": n_missing},
        expected="all identifiers present in the module reference database",
        examples=absent[:MAX_PROTEIN_EXAMPLES],
    )
    return report.issues
def check_charge_range(
    df: pd.DataFrame,
    params: Any,
    config: ModuleValidationConfig,
) -> List[ValidationIssue]:
    """
    Verify that observed precursor charges lie inside the searched range.

    The bounds come from ``params.min_precursor_charge`` and
    ``params.max_precursor_charge``; either may be absent, in which case that
    side is unbounded. Charges that cannot be read as numbers are ignored.

    Parameters
    ----------
    df : pandas.DataFrame
        The standardized result DataFrame.
    params : Any
        Parsed parameters (object with ``min_precursor_charge`` /
        ``max_precursor_charge`` attributes).
    config : ModuleValidationConfig
        Module validation configuration.

    Returns
    -------
    list of ValidationIssue
        An error describing out-of-range charges, or a warning when neither
        bound was parsed or the charge column is unavailable.
    """
    check = "charge_range"
    report = ValidationReport()

    lower = _as_int(getattr(params, "min_precursor_charge", None))
    upper = _as_int(getattr(params, "max_precursor_charge", None))
    if lower is None and upper is None:
        report.add_warning(
            "charge_range_not_parsed",
            "Could not validate precursor charge because no minimum/maximum charge constraint "
            "was parsed from the parameter file.",
            check,
            field="precursor_charge",
        )
        return report.issues

    if config.charge_column not in df.columns:
        report.add_warning(
            "charge_column_missing",
            f"Charge column '{config.charge_column}' not found in the standardized results; "
            "charge-range validation was skipped.",
            check,
            field=config.charge_column,
        )
        return report.issues

    charges = pd.to_numeric(df[config.charge_column], errors="coerce")
    readable = charges.notna()

    # Start from "nothing offends" and OR in each configured bound.
    out_of_range = pd.Series(False, index=df.index)
    if lower is not None:
        out_of_range = out_of_range | (readable & charges.lt(lower))
    if upper is not None:
        out_of_range = out_of_range | (readable & charges.gt(upper))

    n_offending = int(out_of_range.sum())
    if n_offending == 0:
        return report.issues

    offending_charges = sorted({int(charge) for charge in charges[out_of_range].dropna().unique()})
    identifiers = _identifier_series(df, config)
    if identifiers is None:
        # No identifier column: fall back to the offending charge values.
        examples = offending_charges[:MAX_ROW_EXAMPLES]
    else:
        examples = identifiers[out_of_range].unique().tolist()[:MAX_ROW_EXAMPLES]

    searched = _format_range(lower, upper)
    report.add_error(
        "charge_out_of_range",
        f"{n_offending} result row(s) have a precursor charge outside the searched range "
        f"{searched} (observed charges: {offending_charges}).",
        check,
        field=config.charge_column,
        observed=offending_charges,
        expected=_format_range(lower, upper),
        examples=examples,
    )
    return report.issues
def check_peptide_length(
    df: pd.DataFrame,
    params: Any,
    config: ModuleValidationConfig,
) -> List[ValidationIssue]:
    """
    Validate that peptide lengths fall within the parsed peptide-length range.

    The length of a peptide is the number of ASCII letters in its sequence
    string. Rows whose sequence is missing (NaN/None) are not evaluated:
    stringifying them would otherwise produce the text ``"nan"`` and report a
    spurious three-residue peptide.

    Parameters
    ----------
    df : pandas.DataFrame
        The standardized result DataFrame.
    params : Any
        Parsed parameters (object with ``min_peptide_length`` /
        ``max_peptide_length`` attributes).
    config : ModuleValidationConfig
        Module validation configuration.

    Returns
    -------
    list of ValidationIssue
        Issues describing out-of-range peptide lengths, or warnings when the
        constraint or column is unavailable.
    """
    report = ValidationReport()
    check = "peptide_length"

    min_len = _as_int(getattr(params, "min_peptide_length", None))
    max_len = _as_int(getattr(params, "max_peptide_length", None))

    if min_len is None and max_len is None:
        report.add_warning(
            "peptide_length_not_parsed",
            "Could not validate peptide length because no minimum/maximum peptide-length "
            "constraint was parsed from the parameter file.",
            check,
            field="peptide_length",
        )
        return report.issues

    if config.sequence_column not in df.columns:
        report.add_warning(
            "sequence_column_missing",
            f"Sequence column '{config.sequence_column}' not found in the standardized results; "
            "peptide-length validation was skipped.",
            check,
            field=config.sequence_column,
        )
        return report.issues

    raw_sequences = df[config.sequence_column]
    # Remember which rows actually carry a sequence before stringifying,
    # because astype(str) turns a missing value into the literal "nan".
    present = raw_sequences.notna()
    sequences = raw_sequences.astype(str)
    lengths = sequences.str.count(r"[A-Za-z]")

    mask = pd.Series(False, index=df.index)
    if min_len is not None:
        mask = mask | (present & (lengths < min_len))
    if max_len is not None:
        mask = mask | (present & (lengths > max_len))

    n_offending = int(mask.sum())
    if n_offending > 0:
        examples = sequences[mask].unique().tolist()[:MAX_ROW_EXAMPLES]
        offending_lengths = sorted({int(length) for length in lengths[mask].unique()})
        report.add_error(
            "peptide_length_out_of_range",
            f"{n_offending} result row(s) have a peptide length outside the searched range "
            f"{_format_range(min_len, max_len)} (observed lengths: {offending_lengths}).",
            check,
            field=config.sequence_column,
            observed=offending_lengths,
            expected=_format_range(min_len, max_len),
            examples=examples,
        )

    return report.issues
def _count_missed_cleavages(sequence: str, residues: str, cleave_before_proline: bool) -> int: """ Count internal missed cleavages for a C-terminal cleaving enzyme. A missed cleavage is an internal cleavage residue (one not at the C-terminus). For proline-restricted enzymes a cleavage residue immediately followed by ``P`` does not count. Parameters ---------- sequence : str Peptide sequence (plain amino-acid letters). residues : str Residues the enzyme cleaves C-terminal to (e.g. ``"KR"`` for trypsin). cleave_before_proline : bool ``True`` if the enzyme still cleaves when proline follows the residue. Returns ------- int Number of internal missed cleavages. """ seq = "".join(ch for ch in str(sequence) if ch.isalpha()).upper() if len(seq) < 2: return 0 residue_set = set(residues) count = 0 for i in range(len(seq) - 1): if seq[i] in residue_set and (cleave_before_proline or seq[i + 1] != "P"): count += 1 return count
def _is_semi_enzymatic(value: Any) -> bool:
    """
    Interpret the ``semi_enzymatic`` parameter as a boolean flag.

    Missing values (``None``, NaN, ``"None"``, ``"nan"``, ``""``) and explicit
    negative strings such as ``"False"`` or ``"no"`` mean "not semi-enzymatic".
    Plain ``bool()`` would treat NaN and every non-empty string as true.

    Parameters
    ----------
    value : Any
        The raw ``semi_enzymatic`` parameter value.

    Returns
    -------
    bool
        ``True`` only when the value positively indicates a semi-enzymatic search.
    """
    if _is_missing(value):
        return False
    if isinstance(value, str):
        return value.strip().lower() not in {"false", "no", "n", "0", "off"}
    return bool(value)


def check_enzyme(
    df: pd.DataFrame,
    params: Any,
    config: ModuleValidationConfig,
) -> List[ValidationIssue]:
    """
    Best-effort enzyme/specificity check (missed cleavages, warning only).

    Supports common C-terminal cleaving enzymes via :data:`_ENZYME_CLEAVAGE_RULES`
    (trypsin, trypsin/P, Lys-C, Arg-C, Glu-C, chymotrypsin). For each unique
    peptide sequence it counts the internal cleavage residues and warns when
    any sequence has more of them than ``allowed_miscleavages``. This is a
    heuristic: it ignores ragged termini and protein ends, so it can only be a
    warning. N-terminal cleavers (Asp-N, Lys-N) and unknown enzymes are
    reported as info (skipped), as are semi-enzymatic searches.

    Parameters
    ----------
    df : pandas.DataFrame
        The standardized result DataFrame.
    params : Any
        Parsed parameters (object with ``enzyme``, ``semi_enzymatic``,
        ``allowed_miscleavages`` attributes).
    config : ModuleValidationConfig
        Module validation configuration.

    Returns
    -------
    list of ValidationIssue
        Warnings for peptides exceeding the allowed missed cleavages, or
        info/warning describing why the check was skipped.
    """
    report = ValidationReport()
    check = "enzyme"

    enzyme = getattr(params, "enzyme", None)
    if _is_missing(enzyme):
        report.add_warning(
            "enzyme_not_parsed",
            "Could not validate enzyme specificity because no enzyme was parsed from the parameter file.",
            check,
            field="enzyme",
        )
        return report.issues

    normalized = _normalize_enzyme(enzyme)
    if normalized not in _ENZYME_CLEAVAGE_RULES:
        report.add_info(
            "enzyme_check_unsupported",
            f"Enzyme-specificity validation is not implemented for enzyme '{enzyme}'; check skipped.",
            check,
            field="enzyme",
            observed=enzyme,
        )
        return report.issues

    rule = _ENZYME_CLEAVAGE_RULES[normalized]
    if rule is None:
        report.add_info(
            "enzyme_check_unsupported",
            f"Enzyme '{enzyme}' cleaves N-terminal to its residue; the missed-cleavage heuristic "
            "does not apply and the check was skipped.",
            check,
            field="enzyme",
            observed=enzyme,
        )
        return report.issues
    residues, cleave_before_proline = rule

    # A missing flag (None/NaN) must not be mistaken for a semi-enzymatic search.
    if _is_semi_enzymatic(getattr(params, "semi_enzymatic", False)):
        report.add_info(
            "enzyme_semi_skipped",
            "Semi-enzymatic search detected; the missed-cleavage heuristic was skipped.",
            check,
            field="semi_enzymatic",
        )
        return report.issues

    allowed = _as_int(getattr(params, "allowed_miscleavages", None))
    if allowed is None:
        report.add_warning(
            "miscleavages_not_parsed",
            "Could not validate missed cleavages because 'allowed_miscleavages' was not parsed "
            "from the parameter file.",
            check,
            field="allowed_miscleavages",
        )
        return report.issues

    if config.sequence_column not in df.columns:
        report.add_warning(
            "sequence_column_missing",
            f"Sequence column '{config.sequence_column}' not found; missed-cleavage check skipped.",
            check,
            field=config.sequence_column,
        )
        return report.issues

    # Count once per unique sequence and keep the count next to the sequence so
    # the examples below do not have to recompute it.
    unique_sequences = df[config.sequence_column].dropna().astype(str).unique()
    offending = []
    for seq in unique_sequences:
        n_missed = _count_missed_cleavages(seq, residues, cleave_before_proline)
        if n_missed > allowed:
            offending.append((seq, n_missed))

    if offending:
        examples = [f"{seq} ({n_missed} MC)" for seq, n_missed in offending[:MAX_ROW_EXAMPLES]]
        report.add_warning(
            "missed_cleavages_exceeded",
            f"{len(offending)} unique peptide sequence(s) appear to exceed the allowed missed "
            f"cleavages ({allowed}) for {enzyme}. This is a heuristic (ignores ragged termini and "
            "protein ends); review before submitting.",
            check,
            field="allowed_miscleavages",
            observed=f"{len(offending)} sequences with > {allowed} internal cleavage sites",
            expected=f"<= {allowed} internal missed cleavages",
            examples=examples,
        )

    return report.issues
def check_modifications(
    df: pd.DataFrame,
    params: Any,
    config: ModuleValidationConfig,
) -> List[ValidationIssue]:
    """
    Best-effort modification compatibility check (warnings only).

    Compares human-readable modification names observed in the ``proforma``
    column against the parsed fixed/variable modification settings. Mass-only
    and UniMod-only modification tokens are not compared because their
    representation is not normalized across tools.

    The comparison is a case-insensitive substring match with spaces removed
    on *both* sides, so an observed name that contains a space can still match
    an identically written declaration.

    Parameters
    ----------
    df : pandas.DataFrame
        The standardized result DataFrame.
    params : Any
        Parsed parameters (object with ``fixed_mods`` / ``variable_mods``).
    config : ModuleValidationConfig
        Module validation configuration.

    Returns
    -------
    list of ValidationIssue
        Warnings for observed modification names not found in the declared
        settings, or a warning/info describing why the check was limited.
    """
    report = ValidationReport()
    check = "modifications"

    if config.proforma_column not in df.columns:
        report.add_info(
            "modifications_no_proforma",
            f"No '{config.proforma_column}' column in the results; modification validation was skipped.",
            check,
            field=config.proforma_column,
        )
        return report.issues

    observed: set = set()
    for value in df[config.proforma_column].dropna().astype(str).unique():
        observed.update(label.strip() for label in _PROFORMA_MOD.findall(value))

    if not observed:
        report.add_info(
            "modifications_none_observed",
            "No modifications were observed in the results; nothing to validate.",
            check,
        )
        return report.issues

    fixed_mods = getattr(params, "fixed_mods", None)
    variable_mods = getattr(params, "variable_mods", None)
    declared_parts = [str(value) for value in (fixed_mods, variable_mods) if not _is_missing(value)]
    declared_text = " ".join(declared_parts).lower()

    if not declared_text:
        report.add_warning(
            "modifications_not_parsed",
            "Could not validate modifications because no fixed/variable modification settings were "
            "parsed from the parameter file.",
            check,
            field="variable_mods",
            observed=sorted(observed)[:MAX_ROW_EXAMPLES],
        )
        return report.issues

    # Observed names are compared with their spaces removed, so the declared
    # text must be compacted the same way; otherwise a name containing a space
    # could never match, even against an identical declaration.
    declared_compact = declared_text.replace(" ", "")

    # Only compare clean, human-readable modification names; skip mass/UniMod tokens.
    unmatched = []
    for token in sorted(observed):
        name = token.replace(" ", "")
        if len(name) < 3 or not name.isalpha():
            continue
        if name.lower() not in declared_compact:
            unmatched.append(token)

    if unmatched:
        report.add_warning(
            "modification_not_declared",
            f"{len(unmatched)} observed modification name(s) were not found in the declared "
            "fixed/variable modifications. Modification names differ across tools, so this is a "
            "heuristic; review before submitting.",
            check,
            field="variable_mods",
            observed=unmatched[:MAX_ROW_EXAMPLES],
            expected="modifications declared in the parameter file",
            examples=unmatched[:MAX_ROW_EXAMPLES],
        )

    return report.issues
def check_run_consistency(
    df: pd.DataFrame,
    params: Any,
    input_format: Optional[str],
    config: ModuleValidationConfig,
) -> List[ValidationIssue]:
    """
    Check, as far as possible, that the parameter file belongs to the run.

    The only comparable piece of identity is the software name, because
    :class:`ProteoBenchParameters` does not expose raw-file, sample, or
    experiment identifiers. A software-name mismatch (ignoring case and
    surrounding whitespace) is an error; a missing software version and the
    unavailable run-level matching are each reported as info.

    Parameters
    ----------
    df : pandas.DataFrame
        The standardized result DataFrame (unused for software identity but
        kept for signature consistency and future extension).
    params : Any
        Parsed parameters (object with ``software_name`` / ``software_version``).
    input_format : str or None
        The selected software tool used to parse the results.
    config : ModuleValidationConfig
        Module validation configuration.

    Returns
    -------
    list of ValidationIssue
        Issues describing software-identity mismatches and the documented
        limitation on run-level matching.
    """

    def _folded(text: Any) -> str:
        # Case- and whitespace-insensitive form used for the comparison.
        return str(text).strip().lower()

    check = "run_consistency"
    report = ValidationReport()

    software_name = getattr(params, "software_name", None)
    comparable = bool(input_format) and not _is_missing(software_name)
    if comparable and _folded(software_name) != _folded(input_format):
        report.add_error(
            "software_mismatch",
            f"The parameter file reports software '{software_name}', but the results were "
            f"submitted as '{input_format}'. The parameter file may belong to a different run.",
            check,
            field="software_name",
            observed=software_name,
            expected=input_format,
        )

    software_version = getattr(params, "software_version", None)
    if _is_missing(software_version):
        report.add_info(
            "software_version_missing",
            "The software version could not be parsed from the parameter file.",
            check,
            field="software_version",
        )

    # Always state the limitation so readers know what was not checked.
    report.add_info(
        "run_identity_limited",
        "Run-level matching (raw-file, sample, or experiment names) is not available because the "
        "parsed parameters do not expose these identifiers; only software identity was checked.",
        check,
    )
    return report.issues
def check_max_modifications(
    df: pd.DataFrame,
    params: Any,
    config: ModuleValidationConfig,
) -> List[ValidationIssue]:
    """
    Warn when a peptidoform carries more modifications than ``max_mods``.

    The number of bracketed labels in each unique ``proforma`` string is
    compared with ``params.max_mods``. Fixed modifications written into the
    sequence are counted too, so the count is an upper bound on the number of
    variable modifications and the result is only ever a warning.

    Parameters
    ----------
    df : pandas.DataFrame
        The standardized result DataFrame.
    params : Any
        Parsed parameters (object with a ``max_mods`` attribute).
    config : ModuleValidationConfig
        Module validation configuration.

    Returns
    -------
    list of ValidationIssue
        A warning for peptides exceeding ``max_mods``, or a warning/info
        describing why the check was skipped.
    """
    check = "max_modifications"
    report = ValidationReport()

    max_mods = _as_int(getattr(params, "max_mods", None))
    if max_mods is None:
        report.add_warning(
            "max_mods_not_parsed",
            "Could not validate the maximum number of modifications because 'max_mods' was not "
            "parsed from the parameter file.",
            check,
            field="max_mods",
        )
        return report.issues

    if config.proforma_column not in df.columns:
        report.add_info(
            "max_mods_no_proforma",
            f"No '{config.proforma_column}' column in the results; the maximum-modifications check was skipped.",
            check,
            field=config.proforma_column,
        )
        return report.issues

    # Pair every unique peptidoform with its number of bracketed labels.
    peptidoforms = df[config.proforma_column].dropna().astype(str).unique()
    tallies = [(peptidoform, len(_PROFORMA_MOD.findall(peptidoform))) for peptidoform in peptidoforms]
    over_limit = [(peptidoform, n_mods) for peptidoform, n_mods in tallies if n_mods > max_mods]

    if not over_limit:
        return report.issues

    n_over = len(over_limit)
    examples = [f"{peptidoform} ({n_mods} mods)" for peptidoform, n_mods in over_limit[:MAX_ROW_EXAMPLES]]
    report.add_warning(
        "max_modifications_exceeded",
        f"{n_over} unique peptidoform(s) carry more than the allowed {max_mods} "
        "modification(s). The count includes any fixed modifications present in the sequence, "
        "so it is an upper bound; review before submitting.",
        check,
        field="max_mods",
        observed=f"{n_over} peptidoforms with > {max_mods} modifications",
        expected=f"<= {max_mods} modifications per peptidoform",
        examples=examples,
    )
    return report.issues
def _parse_tolerance(text: Any) -> tuple:
    """
    Extract a magnitude and a unit from a tolerance string.

    Every number found in the text is considered and the largest absolute
    value wins, so a signed range such as ``"[-20.0 ppm, 20.0 ppm]"`` yields
    ``20.0``. The unit is the last recognised unit in the text, lower-cased.

    Parameters
    ----------
    text : Any
        The tolerance value (typically a formatted string).

    Returns
    -------
    tuple
        ``(magnitude, unit)`` where ``magnitude`` is a float (or ``None`` if
        no number could be parsed) and ``unit`` is a lower-case string or
        ``None``.
    """
    if _is_missing(text):
        return None, None

    largest = None
    unit = None
    for match in _TOLERANCE_TOKEN.finditer(str(text)):
        try:
            magnitude = abs(float(match.group(1)))
        except ValueError:
            continue
        if largest is None or magnitude > largest:
            largest = magnitude
        unit_text = match.group(2)
        if unit_text:
            unit = unit_text.lower()

    if largest is None:
        return None, None
    return largest, unit


def _plausibility_ceiling(unit: Optional[str], config: ModuleValidationConfig) -> Optional[float]:
    """
    Look up the configured plausibility ceiling for a tolerance unit.

    Parameters
    ----------
    unit : str or None
        Lower-case unit returned by :func:`_parse_tolerance`.
    config : ModuleValidationConfig
        Module validation configuration (provides the plausibility ceilings).

    Returns
    -------
    float or None
        The ceiling expressed in ``unit``, or ``None`` when the unit is
        unknown or no ceiling is configured for it.
    """
    if unit == "ppm":
        return config.max_plausible_ppm
    if unit in {"da", "th", "amu"}:
        return config.max_plausible_dalton
    if unit == "mmu":
        dalton_ceiling = config.max_plausible_dalton
        if dalton_ceiling is None:
            return None
        # 1 mmu = 1e-3 Da, so the Dalton ceiling becomes 1000x larger in mmu.
        return dalton_ceiling * 1000
    return None


def _check_one_tolerance(
    report: ValidationReport, value: Any, label: str, field: str, config: ModuleValidationConfig
) -> None:
    """
    Sanity-check one mass-tolerance value, appending at most one warning.

    The value must be present, numeric, and greater than zero; these checks
    always run. The "implausibly large" check runs only when a plausibility
    ceiling is configured for the value's unit (``config.max_plausible_ppm``
    / ``config.max_plausible_dalton``); these have no default, so the
    sub-check is skipped when they are unset.

    Parameters
    ----------
    report : ValidationReport
        Report to append issues to.
    value : Any
        The tolerance value from the parameters.
    label : str
        Human-readable label (e.g. ``"precursor mass tolerance"``).
    field : str
        The parameter field name (used in the issue ``field`` and codes).
    config : ModuleValidationConfig
        Module validation configuration (provides the plausibility ceilings).
    """
    check = "mass_tolerance"

    if _is_missing(value):
        report.add_warning(
            f"{field}_not_parsed",
            f"Could not validate the {label} because it was not parsed from the parameter file.",
            check,
            field=field,
        )
        return

    magnitude, unit = _parse_tolerance(value)
    if magnitude is None:
        report.add_warning(
            f"{field}_unparsable",
            f"The {label} ('{value}') could not be interpreted as a numeric tolerance.",
            check,
            field=field,
            observed=value,
        )
        return

    if magnitude <= 0:
        report.add_warning(
            f"{field}_non_positive",
            f"The {label} ('{value}') is zero or negative, which is not a valid search tolerance.",
            check,
            field=field,
            observed=value,
        )
        return

    ceiling = _plausibility_ceiling(unit, config)
    if ceiling is None or not magnitude > ceiling:
        return

    report.add_warning(
        f"{field}_implausible",
        f"The {label} ('{value}') is unusually large and may indicate a mis-parsed value; "
        "review before submitting.",
        check,
        field=field,
        observed=value,
        expected=f"<= {ceiling:g} {unit}",
    )
def check_mass_tolerances(
    df: pd.DataFrame,
    params: Any,
    config: ModuleValidationConfig,
) -> List[ValidationIssue]:
    """
    Sanity-check the precursor and fragment mass tolerances (warning only).

    No per-result tolerance exists to compare against, so
    ``precursor_mass_tolerance`` and ``fragment_mass_tolerance`` are only
    checked for being present, numeric, and positive. If the module
    configures a plausibility ceiling (``config.max_plausible_ppm`` /
    ``config.max_plausible_dalton``, which have no default), tolerances above
    it are flagged as well; otherwise that sub-check is skipped.

    Parameters
    ----------
    df : pandas.DataFrame
        The standardized result DataFrame (unused; kept for signature consistency).
    params : Any
        Parsed parameters (object with ``precursor_mass_tolerance`` /
        ``fragment_mass_tolerance`` attributes).
    config : ModuleValidationConfig
        Module validation configuration.

    Returns
    -------
    list of ValidationIssue
        Warnings for missing, unparsable, or implausible tolerances.
    """
    tolerance_fields = (
        ("precursor_mass_tolerance", "precursor mass tolerance"),
        ("fragment_mass_tolerance", "fragment mass tolerance"),
    )
    report = ValidationReport()
    for field_name, label in tolerance_fields:
        _check_one_tolerance(report, getattr(params, field_name, None), label, field_name, config)
    return report.issues
def check_fdr_psm(
    df: pd.DataFrame,
    params: Any,
    config: ModuleValidationConfig,
) -> List[ValidationIssue]:
    """
    Sanity-check the PSM-level FDR (warning only).

    ``params.ident_fdr_psm`` must be present and lie within ``[0, 1]``. When
    it is in range, it is additionally compared with the benchmark's
    recommended maximum
    (:attr:`ModuleValidationConfig.recommended_max_fdr_psm`, default 0.01).
    At most one warning is produced.

    Parameters
    ----------
    df : pandas.DataFrame
        The standardized result DataFrame (unused; kept for signature consistency).
    params : Any
        Parsed parameters (object with an ``ident_fdr_psm`` attribute).
    config : ModuleValidationConfig
        Module validation configuration (provides ``recommended_max_fdr_psm``).

    Returns
    -------
    list of ValidationIssue
        Warnings for a missing, out-of-range, or above-recommended PSM FDR.
    """
    check = "fdr"
    report = ValidationReport()

    fdr = _as_float(getattr(params, "ident_fdr_psm", None))
    if fdr is None:
        report.add_warning(
            "fdr_psm_not_parsed",
            "Could not validate the PSM FDR because 'ident_fdr_psm' was not parsed from the parameter file.",
            check,
            field="ident_fdr_psm",
        )
    elif fdr < 0 or fdr > 1:
        report.add_warning(
            "fdr_psm_out_of_range",
            f"The PSM FDR ({fdr}) is outside the valid range [0, 1]; the value may be mis-parsed.",
            check,
            field="ident_fdr_psm",
            observed=fdr,
            expected="[0, 1]",
        )
    else:
        # In range: only now is the benchmark recommendation meaningful.
        recommended = getattr(config, "recommended_max_fdr_psm", None)
        if recommended is not None and fdr > recommended:
            report.add_warning(
                "fdr_psm_above_recommended",
                f"The PSM FDR ({fdr}) is higher than the recommended maximum of {recommended} for this benchmark.",
                check,
                field="ident_fdr_psm",
                observed=fdr,
                expected=f"<= {recommended}",
            )

    return report.issues