Source code for proteobench.io.params

"""
Parameter handling for ProteoBench.

``ProteoBenchParameters`` is initialized from a JSON field-definition file
(default: ``json/Quant/quant_lfq_DDA_ion.json``) and populated by
per-software ``extract_params`` functions in the sibling modules.
After population, every parser calls ``fill_none()``, which coerces values
to canonical types via ``normalize()``.

``normalize_dataframe_columns`` applies the same coercion rules to a full
DataFrame of historical datapoints loaded from the results repository.

Normalization rules (applied by ``normalize()`` / ``normalize_dataframe_columns``):

- Missing sentinel strings (``"None"``, ``"N/A"``, ``""``, ``"unknown"``,
  etc.) → ``np.nan``
- ``ident_fdr_psm``, ``ident_fdr_peptide``, ``ident_fdr_protein`` → float
  in [0, 1]; values ≥ 1 are treated as percentages and divided by 100
- ``allowed_miscleavages``, ``min/max_peptide_length``,
  ``min/max_precursor_charge``, ``max_mods``,
  ``min/max_precursor_mz``, ``min/max_fragment_mz``,
  ``n_beams``, ``n_peaks``, ``min_mz``, ``max_mz`` → int
- ``enable_match_between_runs`` → bool
- ``enzyme`` → canonical name via ``_ENZYME_MAP``
  (e.g. ``"trypsin"`` → ``"Trypsin"``, ``"kr|p,true"`` → ``"Trypsin"``)
- ``precursor_mass_tolerance``, ``fragment_mass_tolerance`` → mapped to
  ``"Automatic calibration"`` when a known auto-calibration sentinel is
  detected (e.g. ``"dynamic"``, ``"0 ppm"``)

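NOT normalized (kept as delivered by the parsers, which are expected to
homogenize these fields themselves):

- ``precursor_mass_tolerance``, ``fragment_mass_tolerance``,
  ``remove_precursor_tol`` — string, format varies by tool; apart from the
  auto-calibration mapping described above (which applies only to the two
  ``*_mass_tolerance`` fields) the value is left untouched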
- ``fixed_mods``, ``variable_mods`` — string, tool-specific format
- ``quantification_method``, ``protein_inference``,
  ``abundance_normalization_ions`` — string
- ``software_name``, ``software_version``, ``search_engine``,
  ``search_engine_version`` — string
- ``min_intensity``, ``max_intensity`` — float/int, kept as-is
- ``tokens`` — string, semicolon-separated amino acids/modifications
- ``isotope_error_range`` — string (e.g. ``"[0, 2]"``)
- ``decoding_strategy``, ``checkpoint`` — string, tool-specific

Classes
-------
ProteoBenchParameters
    Parameter container initialized from a JSON field-definition file.

Functions
---------
normalize_dataframe_columns
    Apply the same normalization rules to a historical-results DataFrame.
"""

# Reference for parameter names
# https://github.com/bigbio/proteomics-sample-metadata/blob/master/sdrf-proteomics/assets/param2sdrf.yml
import json
import os
from dataclasses import dataclass, field
from typing import Optional

import numpy as np
import pandas as pd

# Lower-cased, whitespace-stripped strings that stand for "no value provided".
_MISSING_SENTINELS = frozenset(
    (
        "none",
        "n/a",
        "not specified",
        "unknown",
        "placeholder",
        "na",
        "nan",
        "",
        "-",
    )
)

# Enzyme aliases -> canonical display name. Lookups lower-case and strip the
# raw value first, so every key added here must be lowercase.
_ENZYME_MAP = {
    "trypsin": "Trypsin",
    "trypsin/p": "Trypsin/P",
    "stricttrypsin": "Trypsin/P",
    "k*,r*,!p*": "Trypsin",
    "[rk]|{p}": "Trypsin",
    "[rk]": "Trypsin/P",
    "kr": "Trypsin/P",
    "kr|p,true": "Trypsin",
    "kr|p,t": "Trypsin",
    "kr,true": "Trypsin/P",
    "kr,t": "Trypsin/P",
    "lys-c": "Lys-C",
    "lysc": "Lys-C",
    "arg-c": "Arg-C",
    "argc": "Arg-C",
    "asp-n": "Asp-N",
    "aspn": "Asp-N",
    "chymotrypsin": "Chymotrypsin",
    "gluc": "Glu-C",
    "glu-c": "Glu-C",
}

# Tolerance values that are replaced by the auto-calibration label. String
# entries are lowercase; the integer 0 covers numeric (non-string) zeros.
_AUTO_CALIBRATION_SENTINELS = frozenset(
    (
        "dynamic",
        "auto detected",
        "0",
        0,
        "0 ppm",
        "[-0.0 ppm, 0.0 ppm]",
    )
)
_AUTO_CALIBRATION_LABEL = "Automatic calibration"

# Only these tolerance fields are subject to the auto-calibration mapping.
_TOLERANCE_FIELDS = (
    "precursor_mass_tolerance",
    "fragment_mass_tolerance",
)

# FDR fields, coerced to float and expressed as a decimal fraction.
_FLOAT_FIELDS = (
    "ident_fdr_psm",
    "ident_fdr_peptide",
    "ident_fdr_protein",
)

# Fields coerced to int, assembled from three groups.
_INT_FIELDS = (
    # Shared between quantification and de novo
    (
        "allowed_miscleavages",
        "min_peptide_length",
        "max_peptide_length",
        "min_precursor_charge",
        "max_precursor_charge",
        "max_mods",
    )
    # Quantification only (m/z ranges)
    + (
        "min_precursor_mz",
        "max_precursor_mz",
        "min_fragment_mz",
        "max_fragment_mz",
    )
    # De novo only
    + (
        "n_beams",
        "n_peaks",
        "min_mz",
        "max_mz",
    )
)



[docs]
# NOTE(review): ``@dataclass`` is applied to a class that declares no fields, so
# the generated ``__eq__`` compares empty field tuples (any two instances
# compare equal) and instances are unhashable. The decorator is kept so the
# class's existing behaviour stays unchanged.
@dataclass
class ProteoBenchParameters:
    """
    Parameter container for a single ProteoBench submission.

    Attributes are determined at runtime by the JSON field-definition file;
    only fields present in that file are set as instance attributes.

    Parameters
    ----------
    filename : str or os.PathLike
        Path to a JSON field-definition file. Defaults to
        ``json/Quant/quant_lfq_DDA_ion.json`` (relative to this package).
    **kwargs
        Optional attribute overrides applied after JSON initialization.
        A string value of ``"None"`` is coerced to ``np.nan``.
    """

    def __init__(self, filename=os.path.join(os.path.dirname(__file__), "json/Quant/quant_lfq_DDA_ion.json"), **kwargs):
        """
        Initialize attributes from *filename* and apply any *kwargs* overrides.

        For every top-level key of the JSON document the attribute is set to
        the entry's ``"value"`` if present, else to its ``"placeholder"``,
        else to ``None``. Entries that are not JSON objects carry neither key
        and therefore also yield ``None``.

        If *filename* does not exist, an error message is printed and the
        instance is left without any attributes.

        Parameters
        ----------
        filename : str or os.PathLike
            Path to a JSON field-definition file.
        **kwargs
            Attribute overrides. Only names that already exist on the instance
            are applied; a value of ``"None"`` is stored as ``np.nan``.
        """
        if not os.path.isfile(filename):
            print(f"Error: File '{filename}' not found.")
            return  # No initialization happens if the file is missing

        with open(filename, "r", encoding="utf-8") as file:
            json_dict = json.load(file)

        # Initialize only the fields present in the JSON
        for key, spec in json_dict.items():
            # Guard against non-object entries: ``"value" in spec`` would be a
            # substring test on a str and a TypeError on a number.
            if isinstance(spec, dict) and "value" in spec:
                default = spec["value"]
            elif isinstance(spec, dict) and "placeholder" in spec:
                default = spec["placeholder"]
            else:
                default = None
            setattr(self, key, default)

        for key, value in kwargs.items():
            if not hasattr(self, key):
                continue  # unknown names are ignored
            # isinstance guard: ``value == "None"`` is ambiguous for array-likes.
            is_none_string = isinstance(value, str) and value == "None"
            setattr(self, key, np.nan if is_none_string else value)

    def __repr__(self):
        """
        Custom string representation to only show initialized attributes.

        Returns
        -------
        str
            ``str()`` of a dict holding every instance attribute whose value
            is not ``None``.
        """
        return str({key: value for key, value in self.__dict__.items() if value is not None})

    def fill_none(self):
        """
        Convert string ``"None"`` sentinels to ``np.nan`` and call ``normalize()``.

        Every ``extract_params`` function should call this at the end of
        parameter extraction so that normalization is applied uniformly.
        """
        # Iterate over a snapshot (as ``normalize`` does) because attributes
        # are reassigned inside the loop.
        for key, value in list(self.__dict__.items()):
            if isinstance(value, str) and value == "None":
                setattr(self, key, np.nan)
        self.normalize()

    def _is_missing(self, val) -> bool:
        """
        Return True if *val* represents a missing / unset value.

        Missing means ``None``, a float NaN (Python or NumPy float), or a
        string whose stripped, lower-cased form is in ``_MISSING_SENTINELS``.
        """
        if val is None:
            return True
        if isinstance(val, (float, np.floating)) and np.isnan(val):
            return True
        if isinstance(val, str) and val.strip().lower() in _MISSING_SENTINELS:
            return True
        return False

    def normalize(self):
        """
        Coerce parsed parameter values to their canonical types.

        This method is called automatically at the end of ``fill_none()`` so
        that every parser benefits without per-parser changes.

        Notes
        -----
        The rules are applied in this order:

        1. Any attribute that is ``None``, NaN, or a missing sentinel string
           (e.g. ``"not specified"``, ``"N/A"``, ``"None"``, ``""``) is set
           to ``np.nan``.
        2. Fields in ``_FLOAT_FIELDS`` (FDR) are coerced to ``float``. Values
           ``>= 1`` are assumed to be percentages and divided by 100.
           Unparseable values become ``np.nan``.
        3. Fields in ``_INT_FIELDS`` are coerced via ``int(float(value))``
           (truncating). Unparseable or infinite values become ``np.nan``.
        4. ``enable_match_between_runs`` is coerced to ``bool``; strings are
           true only for ``"true"``, ``"1"`` or ``"yes"`` (case-insensitive).
        5. ``enzyme`` is mapped to a canonical name via ``_ENZYME_MAP``;
           unknown names are kept unchanged.
        6. Fields in ``_TOLERANCE_FIELDS`` are replaced by
           ``_AUTO_CALIBRATION_LABEL`` when they match an entry of
           ``_AUTO_CALIBRATION_SENTINELS``.
        """
        # A. Sanitize missing values across ALL attributes. After this step a
        #    missing value is always NaN (or None for an absent attribute), so
        #    ``_is_missing`` is a sufficient guard in the steps below.
        for key, val in list(self.__dict__.items()):
            if self._is_missing(val):
                setattr(self, key, np.nan)

        # B. Float coercion (FDR fields)
        for fld in _FLOAT_FIELDS:
            val = getattr(self, fld, None)
            if self._is_missing(val):
                continue
            try:
                number = float(val)
            except (ValueError, TypeError, OverflowError):
                number = np.nan
            else:
                if number >= 1:  # percentage -> decimal (FDR is always < 1)
                    number /= 100
            setattr(self, fld, number)

        # C. Integer coercion
        for fld in _INT_FIELDS:
            val = getattr(self, fld, None)
            if self._is_missing(val):
                continue
            try:
                number = int(float(val))
            except (ValueError, TypeError, OverflowError):
                # OverflowError: int(float("inf")) cannot be represented.
                number = np.nan
            setattr(self, fld, number)

        # D. Boolean coercion
        val = getattr(self, "enable_match_between_runs", None)
        if not self._is_missing(val) and not isinstance(val, bool):
            if isinstance(val, str):
                flag = val.strip().lower() in ("true", "1", "yes")
            else:
                try:
                    flag = bool(val)
                except (ValueError, TypeError):
                    flag = np.nan
            setattr(self, "enable_match_between_runs", flag)

        # E. Enzyme name normalization
        val = getattr(self, "enzyme", None)
        if isinstance(val, str) and not self._is_missing(val):
            canonical = _ENZYME_MAP.get(val.strip().lower())
            if canonical is not None:
                setattr(self, "enzyme", canonical)
            # If not in map, keep original value as-is

        # F. Tolerance auto-calibration mapping
        for fld in _TOLERANCE_FIELDS:
            val = getattr(self, fld, None)
            if self._is_missing(val):
                continue
            check = val.strip().lower() if isinstance(val, str) else val
            try:
                is_auto = check in _AUTO_CALIBRATION_SENTINELS
            except TypeError:
                # Unhashable value (e.g. list/dict) cannot be a sentinel.
                is_auto = False
            if is_auto:
                setattr(self, fld, _AUTO_CALIBRATION_LABEL)




# NOTE: the function below should become removable once all datapoints have been resubmitted.

[docs]
def normalize_dataframe_columns(df: pd.DataFrame) -> pd.DataFrame:
    """
    Apply the coercion rules of :meth:`ProteoBenchParameters.normalize` to a DataFrame.

    Operates **in-place** on *df* and also returns it for convenience.
    Columns that are absent from *df* are skipped.

    Parameters
    ----------
    df : pandas.DataFrame
        Historical results; one row per datapoint, one column per parameter.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with normalized columns.

    Notes
    -----
    Integer fields are rounded (``Series.round``) before being cast to the
    nullable ``Int64`` dtype, whereas ``ProteoBenchParameters.normalize``
    truncates via ``int(float(value))``.
    """
    # A. Sanitize missing sentinel strings across all object/string columns
    for col in df.columns:
        column = df[col]
        if column.dtype == object or pd.api.types.is_string_dtype(column):
            # astype(bool) keeps the mask boolean even for an empty column.
            mask = column.apply(
                lambda v: isinstance(v, str) and v.strip().lower() in _MISSING_SENTINELS
            ).astype(bool)
            df.loc[mask, col] = np.nan

    # B. Float coercion (FDR fields, decimal fraction)
    for col in _FLOAT_FIELDS:
        if col not in df.columns:
            continue
        df[col] = pd.to_numeric(df[col], errors="coerce").astype(float)
        # Values >= 1 are assumed to be percentages
        pct_mask = df[col] >= 1
        df.loc[pct_mask, col] = df.loc[pct_mask, col] / 100

    # C. Integer coercion (nullable Int64 so NaN is preserved)
    for col in _INT_FIELDS:
        if col not in df.columns:
            continue
        numeric = pd.to_numeric(df[col], errors="coerce")
        if pd.api.types.is_float_dtype(numeric):
            # +/-inf cannot be cast to Int64; treat it as missing.
            numeric = numeric.replace([np.inf, -np.inf], np.nan)
        df[col] = numeric.round().astype("Int64")

    # D. Boolean coercion
    if "enable_match_between_runs" in df.columns:
        df["enable_match_between_runs"] = df["enable_match_between_runs"].apply(_coerce_bool_cell)

    # E. Enzyme name normalization (unknown names are kept unchanged)
    if "enzyme" in df.columns:
        df["enzyme"] = df["enzyme"].apply(
            lambda v: _ENZYME_MAP.get(v.strip().lower(), v) if isinstance(v, str) else v
        )

    # F. Tolerance auto-calibration mapping
    for col in _TOLERANCE_FIELDS:
        if col not in df.columns:
            continue
        df[col] = df[col].apply(_map_tolerance_cell)

    return df


def _coerce_bool_cell(value):
    """Coerce one ``enable_match_between_runs`` cell to bool, or NaN if missing."""
    if isinstance(value, (bool, np.bool_)):
        return value
    if isinstance(value, str):
        return value.strip().lower() in ("true", "1", "yes")
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return np.nan
    # Non-string scalars use truthiness, matching ProteoBenchParameters.normalize;
    # stringifying them would turn a float 1.0 into "1.0" and hence False.
    try:
        return bool(value)
    except (ValueError, TypeError):
        return np.nan


def _map_tolerance_cell(value):
    """Return the auto-calibration label if *value* is a known sentinel, else *value*."""
    key = value.strip().lower() if isinstance(value, str) else value
    try:
        is_auto = key in _AUTO_CALIBRATION_SENTINELS
    except TypeError:
        # Unhashable cell (e.g. a list) cannot be a sentinel.
        return value
    return _AUTO_CALIBRATION_LABEL if is_auto else value



# When executed as a script, build the container from the default
# field-definition file and print its populated attributes.
if __name__ == "__main__":
    print(ProteoBenchParameters())