# Source code for proteobench.validation.checks

"""
Individual validation checks operating on the standardized result DataFrame.

Every check is a pure function that takes the standardized DataFrame, the
parsed :class:`~proteobench.io.params.ProteoBenchParameters` (or any object with
the same attributes), and a :class:`~proteobench.validation.config.ModuleValidationConfig`,
and returns a list of :class:`~proteobench.validation.report.ValidationIssue`.

The checks are deliberately generic: they read the standardized columns
(``Proteins``, ``Sequence``, ``Charge``, ``proforma``) and the parameter
attributes, never tool-specific result columns. Missing or unparsed parameters
yield warnings rather than errors, so a submission is never blocked merely
because a value could not be parsed.

Documented limitations and intentionally skipped checks:

* **Enzyme specificity**: a missed-cleavage heuristic is implemented for common
  C-terminal cleaving enzymes (trypsin, trypsin/P, Lys-C, Arg-C, Glu-C,
  chymotrypsin) and only as a *warning*. It ignores protein N-/C-termini and
  ragged ends (resolving those would need the reference protein sequences), and
  N-terminal cleavers (Asp-N, Lys-N) are skipped.
* **Modifications**: cross-tool modification representations are not normalized
  (human-readable names, UniMod accessions, and raw masses all occur). Only
  human-readable modification names observed in the ``proforma`` column are
  compared, as warnings; mass-only / UniMod-only tokens are skipped. The
  maximum-modifications count includes any fixed modifications written into the
  sequence, so it is an upper bound (warning only).
* **Mass tolerances**: there is no per-result tolerance to compare against, so
  the precursor/fragment tolerances are only sanity-checked (present, numeric,
  positive), as warnings. An optional plausibility ceiling
  (``max_plausible_ppm`` / ``max_plausible_dalton`` on the config) has no
  default; the implausible-value check is skipped unless a module configures it.
* **PSM FDR**: validated against the valid ``[0, 1]`` range and the benchmark's
  recommended maximum (configurable), as warnings.
* **Run identity**: ``ProteoBenchParameters`` does not expose raw-file, sample,
  or experiment identifiers, so result-vs-parameter run matching is limited to
  software identity. This is reported as info.
"""

from __future__ import annotations

import re
from typing import Any, List, Optional

import numpy as np
import pandas as pd

from proteobench.validation.config import ModuleValidationConfig
from proteobench.validation.fasta import FastaReference
from proteobench.validation.protein_ids import extract_identifiers, is_decoy_or_contaminant, split_protein_groups
from proteobench.validation.report import ValidationIssue, ValidationReport

#: Maximum number of example offending protein identifiers to report.
#: Applied to the sorted list of identifiers missing from the reference.
MAX_PROTEIN_EXAMPLES = 20

#: Maximum number of examples to report for every other check (offending
#: rows, peptide sequences, peptidoforms, or modification names).
MAX_ROW_EXAMPLES = 10

#: Matches a bracketed modification label inside a ProForma string. Group 1
#: captures the (non-empty) text between ``[`` and the next ``]``.
_PROFORMA_MOD = re.compile(r"\[([^\]]+)\]")

#: C-terminal cleavage rules per normalized enzyme name: a tuple of
#: (residues the enzyme cleaves after, whether it cleaves when proline follows).
#: Keys must be written in the form produced by ``_normalize_enzyme``
#: (lower-case, without spaces, underscores, or hyphens), because lookups are
#: done with the normalized name.
#: A value of ``None`` marks an N-terminal cleaver, for which the simple
#: internal-site count does not apply (skipped with an info message).
#: The rules follow the MaxQuant built-in enzyme defaults (e.g. Glu-C cleaves
#: after D and E). Because the missed-cleavage check is warning-only, these are
#: convention-dependent heuristics, not authoritative cleavage definitions.
_ENZYME_CLEAVAGE_RULES = {
    "trypsin": ("KR", False),
    "trypsin/p": ("KR", True),
    "lysc": ("K", True),
    "argc": ("R", False),
    "gluc": ("DE", True),
    "chymotrypsin": ("FYWL", False),
    "lysn": None,
    "aspn": None,
}

#: Matches a signed number (including scientific notation) and an optional unit
#: in a tolerance string such as ``"[-20.0 ppm, 20.0 ppm]"`` or ``"2e-3 Da"``.
_TOLERANCE_TOKEN = re.compile(r"(-?\d+(?:\.\d+)?(?:[eE][+-]?\d+)?)\s*(ppm|da|th|mmu|amu)?", re.IGNORECASE)


def _normalize_enzyme(name: str) -> str:
    """
    Normalize an enzyme name for cleavage-rule lookup.

    Lower-cases the name and removes spaces, underscores, and hyphens so that
    ``"Lys-C"``, ``"lys_c"``, and ``"LysC"`` all map to ``"lysc"`` (the slash in
    ``"Trypsin/P"`` is preserved).

    Parameters
    ----------
    name : str
        The raw enzyme name.

    Returns
    -------
    str
        The normalized enzyme key.
    """
    text = str(name).strip().lower()
    for char in (" ", "_", "-"):
        text = text.replace(char, "")
    return text


def _is_missing(value: Any) -> bool:
    """
    Determine whether a parameter value should be treated as "not provided".

    Treats ``None``, ``np.nan``, and the literal strings ``"None"``/``"nan"``/``""``
    as missing (matching how :class:`ProteoBenchParameters` represents absent values).

    Parameters
    ----------
    value : Any
        The value to test.

    Returns
    -------
    bool
        ``True`` if the value is missing.
    """
    if value is None:
        return True
    if isinstance(value, float) and np.isnan(value):
        return True
    if isinstance(value, str) and value.strip().lower() in {"", "none", "nan"}:
        return True
    return False


def _as_int(value: Any) -> Optional[int]:
    """
    Coerce a value to ``int`` if possible.

    The value is first converted to ``float`` and then truncated toward zero,
    so ``"2.0"`` yields ``2`` and ``"2.7"`` also yields ``2``.

    Parameters
    ----------
    value : Any
        The value to coerce.

    Returns
    -------
    int or None
        The integer value, or ``None`` if it is missing or not convertible
        (including infinite values, which have no integer representation).
    """
    if _is_missing(value):
        return None
    try:
        return int(float(value))
    except (TypeError, ValueError, OverflowError):
        # OverflowError: int(float("inf")) and float() of an oversized int.
        # Treated like any other unparsable value so a check degrades to a
        # "not parsed" warning instead of crashing the validation run.
        return None


def _as_float(value: Any) -> Optional[float]:
    """
    Coerce a value to ``float`` if possible.

    Parameters
    ----------
    value : Any
        The value to coerce.

    Returns
    -------
    float or None
        The float value, or ``None`` if it is missing or not convertible.
    """
    if _is_missing(value):
        return None
    try:
        return float(value)
    except (TypeError, ValueError, OverflowError):
        # OverflowError is raised for integers too large for a float; handle
        # it like any other unparsable value rather than aborting validation.
        return None


def _format_range(minimum: Optional[int], maximum: Optional[int]) -> str:
    """
    Format an inclusive numeric range for display.

    Parameters
    ----------
    minimum : int or None
        Lower bound (``None`` means unbounded).
    maximum : int or None
        Upper bound (``None`` means unbounded).

    Returns
    -------
    str
        A string such as ``"[2, 4]"`` or ``"[2, unbounded]"``.
    """
    low = "unbounded" if minimum is None else str(minimum)
    high = "unbounded" if maximum is None else str(maximum)
    return f"[{low}, {high}]"


def _identifier_series(df: pd.DataFrame, config: ModuleValidationConfig) -> Optional[pd.Series]:
    """
    Pick the best per-row identifier column for example reporting.

    Parameters
    ----------
    df : pandas.DataFrame
        The standardized result DataFrame.
    config : ModuleValidationConfig
        Module validation configuration.

    Returns
    -------
    pandas.Series or None
        A series of human-readable row identifiers, or ``None`` if unavailable.
    """
    for column in ("precursor ion", "peptidoform", config.sequence_column):
        if column in df.columns:
            return df[column].astype(str)
    return None



# [docs]
def check_protein_ids(
    df: pd.DataFrame,
    fasta: FastaReference,
    config: ModuleValidationConfig,
) -> List[ValidationIssue]:
    """
    Compare the reported protein identifiers with the reference FASTA.

    Each non-null cell of the protein column is split into its group members.
    Members recognised as decoy or contaminant are ignored; every remaining
    identifier that the reference does not contain is counted as missing and
    the whole set is reported as a single error.

    Parameters
    ----------
    df : pandas.DataFrame
        The standardized result DataFrame.
    fasta : FastaReference
        Reference protein identifiers.
    config : ModuleValidationConfig
        Module validation configuration.

    Returns
    -------
    list of ValidationIssue
        An error listing identifiers absent from the reference, an info issue
        when all were found, or a warning when the column is absent or holds
        no target identifiers.
    """
    report = ValidationReport()
    check = "protein_ids"
    column = config.protein_column

    if column not in df.columns:
        report.add_warning(
            "protein_column_missing",
            f"Protein column '{column}' not found in the standardized results; "
            "protein-identifier validation was skipped.",
            check,
            field=column,
        )
        return report.issues

    # Unique target tokens: group members that are neither decoys nor contaminants.
    targets: set = {
        token
        for cell in df[column].dropna().unique()
        for token in split_protein_groups(cell, config.protein_group_separators)
        if not is_decoy_or_contaminant(token, config.contaminant_flag, config.decoy_prefixes)
    }

    if len(targets) == 0:
        report.add_warning(
            "no_protein_ids",
            "No target protein identifiers were found in the results (all empty, decoy, or contaminant).",
            check,
            field=column,
        )
        return report.issues

    absent = [token for token in targets if not fasta.contains_any(extract_identifiers(token))]
    n_unique, n_missing = len(targets), len(absent)
    n_found = n_unique - n_missing

    if n_missing == 0:
        report.add_info(
            "protein_ids_ok",
            f"All {n_unique} unique protein identifiers were found in the reference database.",
            check,
            field=column,
            observed={"n_unique": n_unique, "n_found": n_found, "n_missing": 0},
        )
        return report.issues

    report.add_error(
        "protein_not_in_fasta",
        f"{n_missing} of {n_unique} unique protein identifiers are not present in the reference "
        f"database ({n_found} found). These are non-decoy, non-contaminant identifiers and likely "
        "indicate the wrong FASTA was used or proteins outside the benchmark.",
        check,
        field=column,
        observed={"n_unique": n_unique, "n_found": n_found, "n_missing": n_missing},
        expected="all identifiers present in the module reference database",
        examples=sorted(absent)[:MAX_PROTEIN_EXAMPLES],
    )
    return report.issues




# [docs]
def check_charge_range(
    df: pd.DataFrame,
    params: Any,
    config: ModuleValidationConfig,
) -> List[ValidationIssue]:
    """
    Flag result rows whose precursor charge lies outside the searched range.

    The charge column is converted to numbers; values that cannot be converted
    are ignored. A row is offending when its charge is below the parsed
    minimum or above the parsed maximum (either bound may be absent).

    Parameters
    ----------
    df : pandas.DataFrame
        The standardized result DataFrame.
    params : Any
        Parsed parameters (object with ``min_precursor_charge`` /
        ``max_precursor_charge`` attributes).
    config : ModuleValidationConfig
        Module validation configuration.

    Returns
    -------
    list of ValidationIssue
        An error describing out-of-range charges, a warning when neither bound
        was parsed or the charge column is absent, or an empty list when every
        charge is within range.
    """
    report = ValidationReport()
    check = "charge_range"

    lower = _as_int(getattr(params, "min_precursor_charge", None))
    upper = _as_int(getattr(params, "max_precursor_charge", None))

    if lower is None and upper is None:
        report.add_warning(
            "charge_range_not_parsed",
            "Could not validate precursor charge because no minimum/maximum charge constraint "
            "was parsed from the parameter file.",
            check,
            field="precursor_charge",
        )
        return report.issues

    if config.charge_column not in df.columns:
        report.add_warning(
            "charge_column_missing",
            f"Charge column '{config.charge_column}' not found in the standardized results; "
            "charge-range validation was skipped.",
            check,
            field=config.charge_column,
        )
        return report.issues

    charges = pd.to_numeric(df[config.charge_column], errors="coerce")
    is_numeric = charges.notna()

    # Collect one boolean mask per configured bound, then OR them together.
    violations = []
    if lower is not None:
        violations.append(is_numeric & (charges < lower))
    if upper is not None:
        violations.append(is_numeric & (charges > upper))

    out_of_range = pd.Series(False, index=df.index)
    for violation in violations:
        out_of_range = out_of_range | violation

    n_offending = int(out_of_range.sum())
    if n_offending == 0:
        return report.issues

    offending_charges = sorted({int(charge) for charge in charges[out_of_range].dropna().unique()})
    labels = _identifier_series(df, config)
    if labels is None:
        # No identifier column available: fall back to the charge values themselves.
        examples = offending_charges[:MAX_ROW_EXAMPLES]
    else:
        examples = labels[out_of_range].unique().tolist()[:MAX_ROW_EXAMPLES]

    searched = _format_range(lower, upper)
    report.add_error(
        "charge_out_of_range",
        f"{n_offending} result row(s) have a precursor charge outside the searched range "
        f"{searched} (observed charges: {offending_charges}).",
        check,
        field=config.charge_column,
        observed=offending_charges,
        expected=searched,
        examples=examples,
    )
    return report.issues




# [docs]
def check_peptide_length(
    df: pd.DataFrame,
    params: Any,
    config: ModuleValidationConfig,
) -> List[ValidationIssue]:
    """
    Validate that peptide lengths fall within the parsed peptide-length range.

    The length of a peptide is the number of ASCII letters in its sequence
    string. Rows whose sequence is missing (null) are ignored rather than
    being measured.

    Parameters
    ----------
    df : pandas.DataFrame
        The standardized result DataFrame.
    params : Any
        Parsed parameters (object with ``min_peptide_length`` /
        ``max_peptide_length`` attributes).
    config : ModuleValidationConfig
        Module validation configuration.

    Returns
    -------
    list of ValidationIssue
        Issues describing out-of-range peptide lengths, or warnings when the
        constraint or column is unavailable.
    """
    report = ValidationReport()
    check = "peptide_length"

    min_len = _as_int(getattr(params, "min_peptide_length", None))
    max_len = _as_int(getattr(params, "max_peptide_length", None))

    if min_len is None and max_len is None:
        report.add_warning(
            "peptide_length_not_parsed",
            "Could not validate peptide length because no minimum/maximum peptide-length "
            "constraint was parsed from the parameter file.",
            check,
            field="peptide_length",
        )
        return report.issues

    if config.sequence_column not in df.columns:
        report.add_warning(
            "sequence_column_missing",
            f"Sequence column '{config.sequence_column}' not found in the standardized results; "
            "peptide-length validation was skipped.",
            check,
            field=config.sequence_column,
        )
        return report.issues

    raw_sequences = df[config.sequence_column]
    # Record which rows actually carry a sequence *before* the string cast:
    # astype(str) can turn a missing value into the text "nan", which would
    # otherwise be measured as a 3-residue peptide and reported as an error.
    present = raw_sequences.notna()
    sequences = raw_sequences.astype(str)
    lengths = sequences.str.count(r"[A-Za-z]")

    mask = pd.Series(False, index=df.index)
    if min_len is not None:
        mask = mask | (present & (lengths < min_len))
    if max_len is not None:
        mask = mask | (present & (lengths > max_len))

    n_offending = int(mask.sum())
    if n_offending > 0:
        examples = sequences[mask].unique().tolist()[:MAX_ROW_EXAMPLES]
        offending_lengths = sorted({int(length) for length in lengths[mask].unique()})
        report.add_error(
            "peptide_length_out_of_range",
            f"{n_offending} result row(s) have a peptide length outside the searched range "
            f"{_format_range(min_len, max_len)} (observed lengths: {offending_lengths}).",
            check,
            field=config.sequence_column,
            observed=offending_lengths,
            expected=_format_range(min_len, max_len),
            examples=examples,
        )

    return report.issues



def _count_missed_cleavages(sequence: str, residues: str, cleave_before_proline: bool) -> int:
    """
    Count internal missed cleavages for a C-terminal cleaving enzyme.

    A missed cleavage is an internal cleavage residue (one not at the
    C-terminus). For proline-restricted enzymes a cleavage residue immediately
    followed by ``P`` does not count.

    Parameters
    ----------
    sequence : str
        Peptide sequence (plain amino-acid letters).
    residues : str
        Residues the enzyme cleaves C-terminal to (e.g. ``"KR"`` for trypsin).
    cleave_before_proline : bool
        ``True`` if the enzyme still cleaves when proline follows the residue.

    Returns
    -------
    int
        Number of internal missed cleavages.
    """
    seq = "".join(ch for ch in str(sequence) if ch.isalpha()).upper()
    if len(seq) < 2:
        return 0
    residue_set = set(residues)
    count = 0
    for i in range(len(seq) - 1):
        if seq[i] in residue_set and (cleave_before_proline or seq[i + 1] != "P"):
            count += 1
    return count



# [docs]
def check_enzyme(
    df: pd.DataFrame,
    params: Any,
    config: ModuleValidationConfig,
) -> List[ValidationIssue]:
    """
    Best-effort enzyme/specificity check (missed cleavages, warning only).

    Supports common C-terminal cleaving enzymes via :data:`_ENZYME_CLEAVAGE_RULES`
    (trypsin, trypsin/P, Lys-C, Arg-C, Glu-C, chymotrypsin). For each unique
    peptide sequence it counts internal cleavage residues and warns when at
    least one peptide has more of them than ``allowed_miscleavages``. This is a
    heuristic: it ignores ragged termini and protein ends, so it can only be a
    warning. N-terminal cleavers (Asp-N, Lys-N) and unknown enzymes are
    reported as info (skipped).

    The check is skipped for semi-enzymatic searches. A missing or unparsed
    ``semi_enzymatic`` value (``None``, NaN, ``"None"``) and explicit false
    strings such as ``"False"`` are treated as "not semi-enzymatic".

    Parameters
    ----------
    df : pandas.DataFrame
        The standardized result DataFrame.
    params : Any
        Parsed parameters (object with ``enzyme``, ``semi_enzymatic``,
        ``allowed_miscleavages`` attributes).
    config : ModuleValidationConfig
        Module validation configuration.

    Returns
    -------
    list of ValidationIssue
        Warnings for peptides exceeding the allowed missed cleavages, or
        info/warning describing why the check was skipped.
    """
    report = ValidationReport()
    check = "enzyme"

    enzyme = getattr(params, "enzyme", None)
    if _is_missing(enzyme):
        report.add_warning(
            "enzyme_not_parsed",
            "Could not validate enzyme specificity because no enzyme was parsed from the parameter file.",
            check,
            field="enzyme",
        )
        return report.issues

    normalized = _normalize_enzyme(enzyme)
    if normalized not in _ENZYME_CLEAVAGE_RULES:
        report.add_info(
            "enzyme_check_unsupported",
            f"Enzyme-specificity validation is not implemented for enzyme '{enzyme}'; check skipped.",
            check,
            field="enzyme",
            observed=enzyme,
        )
        return report.issues

    rule = _ENZYME_CLEAVAGE_RULES[normalized]
    if rule is None:
        report.add_info(
            "enzyme_check_unsupported",
            f"Enzyme '{enzyme}' cleaves N-terminal to its residue; the missed-cleavage heuristic "
            "does not apply and the check was skipped.",
            check,
            field="enzyme",
            observed=enzyme,
        )
        return report.issues
    residues, cleave_before_proline = rule

    # bool() alone would be wrong here: bool(float("nan")), bool("None") and
    # bool("False") are all True, which would report a semi-enzymatic search
    # whenever the flag was simply not parsed.
    if _semi_enzymatic_enabled(getattr(params, "semi_enzymatic", False)):
        report.add_info(
            "enzyme_semi_skipped",
            "Semi-enzymatic search detected; the missed-cleavage heuristic was skipped.",
            check,
            field="semi_enzymatic",
        )
        return report.issues

    allowed = _as_int(getattr(params, "allowed_miscleavages", None))
    if allowed is None:
        report.add_warning(
            "miscleavages_not_parsed",
            "Could not validate missed cleavages because 'allowed_miscleavages' was not parsed "
            "from the parameter file.",
            check,
            field="allowed_miscleavages",
        )
        return report.issues

    if config.sequence_column not in df.columns:
        report.add_warning(
            "sequence_column_missing",
            f"Sequence column '{config.sequence_column}' not found; missed-cleavage check skipped.",
            check,
            field=config.sequence_column,
        )
        return report.issues

    # Missing sequences are dropped so they are never evaluated as peptides.
    unique_sequences = df[config.sequence_column].dropna().astype(str).unique()
    counted = (
        (sequence, _count_missed_cleavages(sequence, residues, cleave_before_proline))
        for sequence in unique_sequences
    )
    # Keep the count next to each offending sequence so the examples do not
    # need to recompute it.
    offending = [(sequence, count) for sequence, count in counted if count > allowed]

    if offending:
        examples = [f"{sequence} ({count} MC)" for sequence, count in offending[:MAX_ROW_EXAMPLES]]
        report.add_warning(
            "missed_cleavages_exceeded",
            f"{len(offending)} unique peptide sequence(s) appear to exceed the allowed missed "
            f"cleavages ({allowed}) for {enzyme}. This is a heuristic (ignores ragged termini and "
            "protein ends); review before submitting.",
            check,
            field="allowed_miscleavages",
            observed=f"{len(offending)} sequences with > {allowed} internal cleavage sites",
            expected=f"<= {allowed} internal missed cleavages",
            examples=examples,
        )

    return report.issues


def _semi_enzymatic_enabled(value: Any) -> bool:
    """
    Interpret the ``semi_enzymatic`` parameter as a boolean flag.

    Parameters
    ----------
    value : Any
        The raw parameter value.

    Returns
    -------
    bool
        ``False`` for missing values and for the strings ``"false"``, ``"no"``,
        ``"0"``, ``"n"``, ``"f"``, ``"off"`` (case-insensitive, surrounding
        whitespace ignored); any other string is ``True``; non-string values
        use their ordinary truthiness.
    """
    if _is_missing(value):
        return False
    if isinstance(value, str):
        return value.strip().lower() not in {"false", "no", "0", "n", "f", "off"}
    return bool(value)




# [docs]
def check_modifications(
    df: pd.DataFrame,
    params: Any,
    config: ModuleValidationConfig,
) -> List[ValidationIssue]:
    """
    Heuristically compare observed modification names with the declared ones.

    Bracketed labels are collected from the ``proforma`` column. Labels that,
    with spaces removed, are purely alphabetic and at least three characters
    long are looked up (case-insensitively, as substrings) in the combined
    text of the fixed and variable modification settings. Other labels, such
    as masses or UniMod accessions, are not compared because their
    representation is not normalized across tools.

    Parameters
    ----------
    df : pandas.DataFrame
        The standardized result DataFrame.
    params : Any
        Parsed parameters (object with ``fixed_mods`` / ``variable_mods``).
    config : ModuleValidationConfig
        Module validation configuration.

    Returns
    -------
    list of ValidationIssue
        A warning listing observed modification names missing from the
        declared settings, or a warning/info explaining why the check was
        limited or skipped.
    """
    report = ValidationReport()
    check = "modifications"

    if config.proforma_column not in df.columns:
        report.add_info(
            "modifications_no_proforma",
            f"No '{config.proforma_column}' column in the results; modification validation was skipped.",
            check,
            field=config.proforma_column,
        )
        return report.issues

    # Every distinct bracketed label found in any peptidoform.
    observed: set = {
        label.strip()
        for peptidoform in df[config.proforma_column].dropna().astype(str).unique()
        for label in _PROFORMA_MOD.findall(peptidoform)
    }

    if len(observed) == 0:
        report.add_info(
            "modifications_none_observed",
            "No modifications were observed in the results; nothing to validate.",
            check,
        )
        return report.issues

    settings = (getattr(params, "fixed_mods", None), getattr(params, "variable_mods", None))
    declared_text = " ".join(str(setting) for setting in settings if not _is_missing(setting)).lower()

    if declared_text == "":
        report.add_warning(
            "modifications_not_parsed",
            "Could not validate modifications because no fixed/variable modification settings were "
            "parsed from the parameter file.",
            check,
            field="variable_mods",
            observed=sorted(observed)[:MAX_ROW_EXAMPLES],
        )
        return report.issues

    # Only clean, human-readable names are compared; mass/UniMod tokens are skipped.
    unmatched = []
    for label in sorted(observed):
        compact = label.replace(" ", "")
        is_readable_name = len(compact) >= 3 and compact.isalpha()
        if is_readable_name and compact.lower() not in declared_text:
            unmatched.append(label)

    if len(unmatched) > 0:
        shown = unmatched[:MAX_ROW_EXAMPLES]
        report.add_warning(
            "modification_not_declared",
            f"{len(unmatched)} observed modification name(s) were not found in the declared "
            "fixed/variable modifications. Modification names differ across tools, so this is a "
            "heuristic; review before submitting.",
            check,
            field="variable_mods",
            observed=shown,
            expected="modifications declared in the parameter file",
            examples=unmatched[:MAX_ROW_EXAMPLES],
        )

    return report.issues




# [docs]
def check_run_consistency(
    df: pd.DataFrame,
    params: Any,
    input_format: Optional[str],
    config: ModuleValidationConfig,
) -> List[ValidationIssue]:
    """
    Verify, as far as possible, that the parameter file belongs to this run.

    The only comparison made is between the software name in the parameters
    and the selected input format (case-insensitive, surrounding whitespace
    ignored); a difference is an error. A missing software version is noted
    as info, and an info issue always records that run-level matching is not
    possible because :class:`ProteoBenchParameters` does not expose raw-file,
    sample, or experiment identifiers.

    Parameters
    ----------
    df : pandas.DataFrame
        The standardized result DataFrame (unused for software identity but kept
        for signature consistency and future extension).
    params : Any
        Parsed parameters (object with ``software_name`` / ``software_version``).
    input_format : str or None
        The selected software tool used to parse the results.
    config : ModuleValidationConfig
        Module validation configuration.

    Returns
    -------
    list of ValidationIssue
        Issues describing software-identity mismatches and the documented
        limitation on run-level matching.
    """
    report = ValidationReport()
    check = "run_consistency"

    software_name = getattr(params, "software_name", None)
    # The comparison needs both sides: a selected format and a parsed name.
    comparable = bool(input_format) and not _is_missing(software_name)
    if comparable:
        declared = str(software_name).strip().lower()
        selected = str(input_format).strip().lower()
        if declared != selected:
            report.add_error(
                "software_mismatch",
                f"The parameter file reports software '{software_name}', but the results were "
                f"submitted as '{input_format}'. The parameter file may belong to a different run.",
                check,
                field="software_name",
                observed=software_name,
                expected=input_format,
            )

    software_version = getattr(params, "software_version", None)
    if _is_missing(software_version):
        report.add_info(
            "software_version_missing",
            "The software version could not be parsed from the parameter file.",
            check,
            field="software_version",
        )

    # Always emitted: documents the limitation of this check.
    report.add_info(
        "run_identity_limited",
        "Run-level matching (raw-file, sample, or experiment names) is not available because the "
        "parsed parameters do not expose these identifiers; only software identity was checked.",
        check,
    )

    return report.issues




# [docs]
def check_max_modifications(
    df: pd.DataFrame,
    params: Any,
    config: ModuleValidationConfig,
) -> List[ValidationIssue]:
    """
    Warn about peptidoforms carrying more modifications than ``max_mods``.

    The number of bracketed labels in each unique, non-null ``proforma``
    string is compared with ``max_mods``. Fixed modifications written into the
    sequence are counted as well, so the count is an upper bound on the number
    of variable modifications and the result is a warning only.

    Parameters
    ----------
    df : pandas.DataFrame
        The standardized result DataFrame.
    params : Any
        Parsed parameters (object with a ``max_mods`` attribute).
    config : ModuleValidationConfig
        Module validation configuration.

    Returns
    -------
    list of ValidationIssue
        A warning for peptides exceeding ``max_mods``, or a warning/info
        describing why the check was skipped.
    """
    report = ValidationReport()
    check = "max_modifications"

    max_mods = _as_int(getattr(params, "max_mods", None))
    if max_mods is None:
        report.add_warning(
            "max_mods_not_parsed",
            "Could not validate the maximum number of modifications because 'max_mods' was not "
            "parsed from the parameter file.",
            check,
            field="max_mods",
        )
        return report.issues

    if config.proforma_column not in df.columns:
        report.add_info(
            "max_mods_no_proforma",
            f"No '{config.proforma_column}' column in the results; the maximum-modifications check was skipped.",
            check,
            field=config.proforma_column,
        )
        return report.issues

    peptidoforms = df[config.proforma_column].dropna().astype(str).unique()
    # Pair every unique peptidoform with its number of bracketed labels.
    counted = [(peptidoform, len(_PROFORMA_MOD.findall(peptidoform))) for peptidoform in peptidoforms]
    offending = [(peptidoform, n_mods) for peptidoform, n_mods in counted if n_mods > max_mods]

    if not offending:
        return report.issues

    examples = [f"{peptidoform} ({n_mods} mods)" for peptidoform, n_mods in offending[:MAX_ROW_EXAMPLES]]
    report.add_warning(
        "max_modifications_exceeded",
        f"{len(offending)} unique peptidoform(s) carry more than the allowed {max_mods} "
        "modification(s). The count includes any fixed modifications present in the sequence, "
        "so it is an upper bound; review before submitting.",
        check,
        field="max_mods",
        observed=f"{len(offending)} peptidoforms with > {max_mods} modifications",
        expected=f"<= {max_mods} modifications per peptidoform",
        examples=examples,
    )
    return report.issues



def _parse_tolerance(text: Any) -> tuple:
    """
    Extract the magnitude and unit from a tolerance value.

    All number tokens in the text are read; the result is the largest
    absolute value among them, so a signed range such as
    ``"[-20.0 ppm, 20.0 ppm]"`` yields ``20.0``. The unit is the last unit
    token found, lower-cased.

    Parameters
    ----------
    text : Any
        The tolerance value (typically a formatted string).

    Returns
    -------
    tuple
        ``(magnitude, unit)`` where ``magnitude`` is a float (or ``None`` if no
        number could be parsed) and ``unit`` is a lower-case string or ``None``.
    """
    if _is_missing(text):
        return None, None

    largest = None
    unit = None
    for raw_number, raw_unit in _TOLERANCE_TOKEN.findall(str(text)):
        try:
            magnitude = abs(float(raw_number))
        except ValueError:
            continue
        # Track the running maximum instead of collecting all magnitudes.
        largest = magnitude if largest is None else max(largest, magnitude)
        if raw_unit:
            unit = raw_unit.lower()

    if largest is None:
        return None, None
    return largest, unit


def _check_one_tolerance(
    report: ValidationReport, value: Any, label: str, field: str, config: ModuleValidationConfig
) -> None:
    """
    Run the sanity checks for one mass-tolerance parameter.

    At most one warning is appended to ``report``. The checks are applied in
    order and stop at the first one that fires: value missing, value not
    interpretable as a number, magnitude not positive, magnitude above the
    configured plausibility ceiling.

    The ceiling comparison only happens when a ceiling applies to the parsed
    unit: ``config.max_plausible_ppm`` for ``ppm``, and
    ``config.max_plausible_dalton`` for ``da`` / ``th`` / ``amu`` (scaled by
    1000 for ``mmu``). A ceiling of ``None`` or an unrecognised unit skips the
    comparison.

    Parameters
    ----------
    report : ValidationReport
        Report that receives the warning, if any.
    value : Any
        The tolerance value taken from the parameters.
    label : str
        Human-readable name used in messages (e.g. ``"precursor mass tolerance"``).
    field : str
        Parameter field name; used as the issue ``field`` and as the prefix of
        the issue code.
    config : ModuleValidationConfig
        Module validation configuration (source of the plausibility ceilings).
    """
    check = "mass_tolerance"

    def warn(code: str, message: str, **details: Any) -> None:
        # Every warning from this function shares the same check name and field.
        report.add_warning(code, message, check, field=field, **details)

    if _is_missing(value):
        warn(
            f"{field}_not_parsed",
            f"Could not validate the {label} because it was not parsed from the parameter file.",
        )
        return

    magnitude, unit = _parse_tolerance(value)

    if magnitude is None:
        warn(
            f"{field}_unparsable",
            f"The {label} ('{value}') could not be interpreted as a numeric tolerance.",
            observed=value,
        )
        return

    if magnitude <= 0:
        warn(
            f"{field}_non_positive",
            f"The {label} ('{value}') is zero or negative, which is not a valid search tolerance.",
            observed=value,
        )
        return

    # Pick the ceiling expressed in the same unit as the parsed tolerance.
    if unit == "ppm":
        limit = config.max_plausible_ppm
    elif unit in ("da", "th", "amu", "mmu"):
        limit = config.max_plausible_dalton
        if unit == "mmu" and limit is not None:
            # 1 mmu = 1e-3 Da, so the Dalton ceiling becomes 1000x larger in mmu.
            limit = limit * 1000
    else:
        limit = None

    if limit is not None and magnitude > limit:
        warn(
            f"{field}_implausible",
            f"The {label} ('{value}') is unusually large and may indicate a mis-parsed value; "
            "review before submitting.",
            observed=value,
            expected=f"<= {limit:g} {unit}",
        )



[docs]
def check_mass_tolerances(
    df: pd.DataFrame,
    params: Any,
    config: ModuleValidationConfig,
) -> List[ValidationIssue]:
    """
    Sanity-check the precursor and fragment mass tolerances (warning only).

    Results carry no per-row tolerance to compare against, so the check is
    limited to the parameters themselves: ``precursor_mass_tolerance`` is
    examined first, then ``fragment_mass_tolerance``. Each must be present,
    numeric, and positive. If the module configures a plausibility ceiling
    (``config.max_plausible_ppm`` / ``config.max_plausible_dalton``, which have
    no default), values above it are flagged too; without a ceiling that
    sub-check is skipped.

    Parameters
    ----------
    df : pandas.DataFrame
        The standardized result DataFrame (unused; kept for signature consistency).
    params : Any
        Parsed parameters (object with ``precursor_mass_tolerance`` /
        ``fragment_mass_tolerance`` attributes). A missing attribute is treated
        like a missing value.
    config : ModuleValidationConfig
        Module validation configuration.

    Returns
    -------
    list of ValidationIssue
        Warnings for missing, unparsable, non-positive, or implausible tolerances.
    """
    # (parameter attribute, label used in messages), in reporting order.
    tolerances = (
        ("precursor_mass_tolerance", "precursor mass tolerance"),
        ("fragment_mass_tolerance", "fragment mass tolerance"),
    )

    report = ValidationReport()
    for attribute, label in tolerances:
        _check_one_tolerance(report, getattr(params, attribute, None), label, attribute, config)
    return report.issues




[docs]
def check_fdr_psm(
    df: pd.DataFrame,
    params: Any,
    config: ModuleValidationConfig,
) -> List[ValidationIssue]:
    """
    Sanity-check the PSM-level FDR (warning only).

    Exactly one of three conditions can be reported for ``ident_fdr_psm``: it
    could not be obtained as a float, it lies outside ``[0, 1]``, or it is
    above the benchmark's recommended maximum
    (:attr:`ModuleValidationConfig.recommended_max_fdr_psm`, default 0.01).
    The recommended-maximum comparison is skipped when the config provides no
    such value.

    Parameters
    ----------
    df : pandas.DataFrame
        The standardized result DataFrame (unused; kept for signature consistency).
    params : Any
        Parsed parameters (object with an ``ident_fdr_psm`` attribute).
    config : ModuleValidationConfig
        Module validation configuration (provides ``recommended_max_fdr_psm``).

    Returns
    -------
    list of ValidationIssue
        Warnings for a missing, out-of-range, or above-recommended PSM FDR.
    """
    check = "fdr"
    field = "ident_fdr_psm"
    report = ValidationReport()

    fdr = _as_float(getattr(params, field, None))

    if fdr is None:
        report.add_warning(
            "fdr_psm_not_parsed",
            "Could not validate the PSM FDR because 'ident_fdr_psm' was not parsed from the parameter file.",
            check,
            field=field,
        )
    elif fdr < 0 or fdr > 1:
        # An FDR is a proportion; anything outside [0, 1] points at a parsing problem.
        report.add_warning(
            "fdr_psm_out_of_range",
            f"The PSM FDR ({fdr}) is outside the valid range [0, 1]; the value may be mis-parsed.",
            check,
            field=field,
            observed=fdr,
            expected="[0, 1]",
        )
    else:
        recommended = getattr(config, "recommended_max_fdr_psm", None)
        if recommended is not None and fdr > recommended:
            report.add_warning(
                "fdr_psm_above_recommended",
                f"The PSM FDR ({fdr}) is higher than the recommended maximum of {recommended} for this benchmark.",
                check,
                field=field,
                observed=fdr,
                expected=f"<= {recommended}",
            )

    return report.issues