# Source code for proteobench.io.params

"""
Parameter handling for ProteoBench.

``ProteoBenchParameters`` is initialized from a JSON field-definition file
(default: ``json/Quant/quant_lfq_DDA_ion.json``) and populated by
per-software ``extract_params`` functions in the sibling modules.
After population, every parser calls ``fill_none()``, which coerces values
to canonical types via ``normalize()``.

``normalize_dataframe_columns`` applies the same coercion rules to a full
DataFrame of historical datapoints loaded from the results repository.

Normalization rules (applied by ``normalize()`` / ``normalize_dataframe_columns``):

- Missing sentinel strings (``"None"``, ``"N/A"``, ``""``, ``"unknown"``,
  etc.) → ``np.nan``
- ``ident_fdr_psm``, ``ident_fdr_peptide``, ``ident_fdr_protein`` → float
  in [0, 1]; values ≥ 1 are treated as percentages and divided by 100
- ``allowed_miscleavages``, ``min/max_peptide_length``,
  ``min/max_precursor_charge``, ``max_mods``,
  ``min/max_precursor_mz``, ``min/max_fragment_mz``,
  ``n_beams``, ``n_peaks``, ``min_mz``, ``max_mz`` → int
- ``enable_match_between_runs`` → bool
- ``enzyme`` → canonical name via ``_ENZYME_MAP``
  (e.g. ``"trypsin"`` → ``"Trypsin"``, ``"kr|p,true"`` → ``"Trypsin"``)
- ``precursor_mass_tolerance``, ``fragment_mass_tolerance`` → mapped to
  ``"Automatic calibration"`` when a known auto-calibration sentinel is
  detected (e.g. ``"dynamic"``, ``"0 ppm"``)

NOT normalized (kept as-is from parsers, parsers should homogenize themselves):

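- ``precursor_mass_tolerance``, ``fragment_mass_tolerance``,
  ``remove_precursor_tol`` — string, format varies by tool (apart from the
  auto-calibration mapping described above, the mass-tolerance strings are
  kept exactly as the parser produced them)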
- ``fixed_mods``, ``variable_mods`` — string, tool-specific format
- ``quantification_method``, ``protein_inference``,
  ``abundance_normalization_ions`` — string
- ``software_name``, ``software_version``, ``search_engine``,
  ``search_engine_version`` — string
- ``min_intensity``, ``max_intensity`` — float/int, kept as-is
- ``tokens`` — string, semicolon-separated amino acids/modifications
- ``isotope_error_range`` — string (e.g. ``"[0, 2]"``)
- ``decoding_strategy``, ``checkpoint`` — string, tool-specific

Classes
-------
ProteoBenchParameters
    Parameter container initialized from a JSON field-definition file.

Functions
---------
normalize_dataframe_columns
    Apply the same normalization rules to a historical-results DataFrame.
"""

# Reference for parameter names
# https://github.com/bigbio/proteomics-sample-metadata/blob/master/sdrf-proteomics/assets/param2sdrf.yml
import json
import os
from dataclasses import dataclass, field
from typing import Optional

import numpy as np
import pandas as pd

# Lower-cased, stripped strings that stand for "no value was provided".
_MISSING_SENTINELS = frozenset(
    (
        "none",
        "n/a",
        "not specified",
        "unknown",
        "placeholder",
        "na",
        "nan",
        "",
        "-",
    )
)

# Enzyme aliases → canonical display name. Lookups are done on the stripped,
# lower-cased input, so every key added here must be lowercase.
_ENZYME_MAP = dict(
    (
        ("trypsin", "Trypsin"),
        ("trypsin/p", "Trypsin/P"),
        ("stricttrypsin", "Trypsin/P"),
        ("k*,r*,!p*", "Trypsin"),
        ("[rk]|{p}", "Trypsin"),
        ("[rk]", "Trypsin/P"),
        ("kr", "Trypsin/P"),
        ("kr|p,true", "Trypsin"),
        ("kr|p,t", "Trypsin"),
        ("kr,true", "Trypsin/P"),
        ("kr,t", "Trypsin/P"),
        ("lys-c", "Lys-C"),
        ("lysc", "Lys-C"),
        ("arg-c", "Arg-C"),
        ("argc", "Arg-C"),
        ("asp-n", "Asp-N"),
        ("aspn", "Asp-N"),
        ("chymotrypsin", "Chymotrypsin"),
        ("gluc", "Glu-C"),
        ("glu-c", "Glu-C"),
    )
)

# Tolerance values (strings compared in lowercase, plus the integer 0) that are
# interpreted as "the tool calibrated the tolerance automatically".
_AUTO_CALIBRATION_SENTINELS = frozenset(
    (
        "dynamic",
        "auto detected",
        "0",
        0,
        "0 ppm",
        "[-0.0 ppm, 0.0 ppm]",
    )
)
# Replacement written into a tolerance field when one of the sentinels above matches.
_AUTO_CALIBRATION_LABEL = "Automatic calibration"

# Fields subject to the auto-calibration mapping.
_TOLERANCE_FIELDS = (
    "precursor_mass_tolerance",
    "fragment_mass_tolerance",
)

# FDR fields: coerced to float and expressed as a decimal fraction.
_FLOAT_FIELDS = (
    "ident_fdr_psm",
    "ident_fdr_peptide",
    "ident_fdr_protein",
)

# Fields coerced to int, grouped by the module family that uses them.
_INT_FIELDS = (
    # Shared between quantification and de novo modules
    (
        "allowed_miscleavages",
        "min_peptide_length",
        "max_peptide_length",
        "min_precursor_charge",
        "max_precursor_charge",
        "max_mods",
    )
    # Quantification only (m/z ranges)
    + (
        "min_precursor_mz",
        "max_precursor_mz",
        "min_fragment_mz",
        "max_fragment_mz",
    )
    # De novo only
    + (
        "n_beams",
        "n_peaks",
        "min_mz",
        "max_mz",
    )
)


@dataclass
class ProteoBenchParameters:
    """
    Parameter container for a single ProteoBench submission.

    Attributes are determined at runtime by the JSON field-definition file;
    only fields present in that file are set as instance attributes.

    Parameters
    ----------
    filename : str or os.PathLike
        Path to a JSON field-definition file. Defaults to
        ``json/Quant/quant_lfq_DDA_ion.json`` (relative to this package).
    **kwargs
        Optional attribute overrides applied after JSON initialization.
        A string value of ``"None"`` is coerced to ``np.nan``.
    """

    # NOTE(review): no dataclass fields are declared and ``__init__`` / ``__repr__`` are hand-written,
    # so ``@dataclass`` keeps both and mainly contributes a field-less ``__eq__`` — confirm it is wanted.

    def __init__(
        self,
        filename=os.path.join(os.path.dirname(__file__), "json/Quant/quant_lfq_DDA_ion.json"),
        **kwargs,
    ):
        """
        Initialize attributes from *filename* and apply any *kwargs* overrides.

        Parameters
        ----------
        filename : str or os.PathLike
            Path to a JSON field-definition file.
        **kwargs
            Attribute overrides. A value of ``"None"`` is stored as ``np.nan``.
            Overrides whose name is not already an attribute of the instance
            are ignored.
        """
        if not os.path.isfile(filename):
            # Best-effort behaviour: report the problem and leave the instance without attributes.
            print(f"Error: File '{filename}' not found.")
            return

        with open(filename, "r", encoding="utf-8") as file:
            json_dict = json.load(file)

        # Initialize only the fields present in the JSON; an explicit "value"
        # wins over a "placeholder", and a field with neither starts as None.
        for key, value in json_dict.items():
            if "value" in value:
                setattr(self, key, value["value"])
            elif "placeholder" in value:
                setattr(self, key, value["placeholder"])
            else:
                setattr(self, key, None)

        for key, value in kwargs.items():
            if not hasattr(self, key):
                continue
            # Only compare strings against "None": ``==`` on array-like values is ambiguous.
            is_none_string = isinstance(value, str) and value == "None"
            setattr(self, key, np.nan if is_none_string else value)

    def __repr__(self):
        """
        Custom string representation to only show initialized attributes.

        Returns
        -------
        str
            ``str()`` of a dict holding every instance attribute whose value
            is not ``None``.
        """
        return str({key: value for key, value in self.__dict__.items() if value is not None})

    def fill_none(self):
        """
        Convert string ``"None"`` sentinels to ``np.nan`` and call ``normalize()``.

        Every ``extract_params`` function should call this at the end of
        parameter extraction so that normalization is applied uniformly.
        """
        # Iterate over a snapshot; only strings are compared so array-like values cannot raise.
        for key, value in list(self.__dict__.items()):
            if isinstance(value, str) and value == "None":
                setattr(self, key, np.nan)
        self.normalize()

    def _is_missing(self, val) -> bool:
        """
        Return True if *val* represents a missing / unset value.

        Missing values are ``None``, ``pd.NA``, a float NaN (Python or NumPy
        floating type) and any string found in ``_MISSING_SENTINELS`` after
        stripping and lower-casing.
        """
        if val is None or val is pd.NA:
            return True
        # np.floating covers NumPy floats that are not ``float`` subclasses (e.g. np.float32).
        if isinstance(val, (float, np.floating)) and np.isnan(val):
            return True
        if isinstance(val, str) and val.strip().lower() in _MISSING_SENTINELS:
            return True
        return False

    def normalize(self):
        """
        Coerce parsed parameter values to their canonical types.

        This method is called automatically at the end of ``fill_none()`` so
        that every parser benefits without per-parser changes.

        Normalization rules
        -------------------
        1. Any attribute whose value is a missing sentinel string (e.g.
           ``"not specified"``, ``"N/A"``, ``"None"``, ``""``) is set to ``np.nan``.
        2. FDR fields are coerced to ``float``. Values ``>= 1`` are assumed to
           be percentages and divided by 100; values that cannot be converted
           or are not finite become ``np.nan``.
        3. Integer fields (``_INT_FIELDS``) are coerced to ``int`` by
           truncation; values that cannot be converted (including infinities)
           become ``np.nan``.
        4. ``enable_match_between_runs`` is coerced to ``bool``. Strings are
           ``True`` only for ``"true"``, ``"1"`` or ``"yes"`` (case-insensitive);
           any other string becomes ``False``.
        5. ``enzyme`` is mapped to a canonical name via ``_ENZYME_MAP``;
           unknown names are kept unchanged.
        6. ``precursor_mass_tolerance`` / ``fragment_mass_tolerance`` are
           replaced by ``"Automatic calibration"`` when they match a known
           auto-calibration sentinel.
        """
        # A. Sanitize missing values across ALL attributes
        for key, val in list(self.__dict__.items()):
            if self._is_missing(val):
                setattr(self, key, np.nan)

        # B. Float coercion (FDR fields)
        for fld in _FLOAT_FIELDS:
            val = getattr(self, fld, None)
            if self._is_missing(val):
                continue  # absent or already NaN: nothing to coerce
            try:
                number = float(val)
            except (ValueError, TypeError):
                number = np.nan
            if not np.isfinite(number):
                number = np.nan  # an infinite FDR carries no information
            elif number >= 1:  # percentage → decimal (FDR is always < 1)
                number /= 100
            setattr(self, fld, number)

        # C. Integer coercion
        for fld in _INT_FIELDS:
            val = getattr(self, fld, None)
            if self._is_missing(val):
                continue
            try:
                coerced = int(float(val))
            except (ValueError, TypeError, OverflowError):
                # OverflowError: int() of an infinite float (e.g. the string "inf")
                coerced = np.nan
            setattr(self, fld, coerced)

        # D. Boolean coercion
        val = getattr(self, "enable_match_between_runs", None)
        if not self._is_missing(val) and not isinstance(val, bool):
            if isinstance(val, str):
                flag = val.strip().lower() in ("true", "1", "yes")
            else:
                try:
                    flag = bool(val)
                except (ValueError, TypeError):
                    flag = np.nan
            setattr(self, "enable_match_between_runs", flag)

        # E. Enzyme name normalization
        val = getattr(self, "enzyme", None)
        if isinstance(val, str):
            canonical = _ENZYME_MAP.get(val.strip().lower())
            if canonical is not None:
                setattr(self, "enzyme", canonical)
            # If not in map, keep original value as-is

        # F. Tolerance auto-calibration mapping
        for fld in _TOLERANCE_FIELDS:
            val = getattr(self, fld, None)
            if self._is_missing(val):
                continue
            check = val.strip().lower() if isinstance(val, str) else val
            try:
                is_auto = check in _AUTO_CALIBRATION_SENTINELS
            except TypeError:
                # Unhashable values (e.g. a list) can never equal a sentinel.
                is_auto = False
            if is_auto:
                setattr(self, fld, _AUTO_CALIBRATION_LABEL)
# Note: this function should become removable once all historical datapoints have been resubmitted.
def normalize_dataframe_columns(df: pd.DataFrame) -> pd.DataFrame:
    """
    Apply the coercion rules of :meth:`ProteoBenchParameters.normalize` to a DataFrame.

    Intended for a DataFrame of historical results. Operates **in-place** on
    *df* and also returns it for convenience. Columns that are not present
    are skipped.

    Parameters
    ----------
    df : pd.DataFrame
        Table whose parameter columns should be normalized.

    Returns
    -------
    pd.DataFrame
        The same object that was passed in, after modification.

    Notes
    -----
    Integer columns are rounded to the nearest integer and stored with the
    nullable ``Int64`` dtype, whereas ``ProteoBenchParameters.normalize``
    truncates via ``int(float(value))``.
    """

    def _is_sentinel(cell) -> bool:
        """Return True for strings that denote a missing value."""
        return isinstance(cell, str) and cell.strip().lower() in _MISSING_SENTINELS

    def _to_bool(cell):
        """Coerce one cell to bool, keeping missing values as NaN."""
        if isinstance(cell, (bool, np.bool_)):
            return cell
        if isinstance(cell, str):
            return cell.strip().lower() in ("true", "1", "yes")
        try:
            if pd.isna(cell):
                return np.nan
            # Same rule as ProteoBenchParameters.normalize: plain truthiness for
            # non-string values, so a numeric 1.0 is True rather than False.
            return bool(cell)
        except (ValueError, TypeError):
            # e.g. array-like cells whose truth value is ambiguous
            return np.nan

    def _to_enzyme(cell):
        """Map a known enzyme alias to its canonical name; keep anything else."""
        if isinstance(cell, str):
            return _ENZYME_MAP.get(cell.strip().lower(), cell)
        return cell

    def _to_tolerance(cell):
        """Replace auto-calibration sentinels by the canonical label."""
        key = cell.strip().lower() if isinstance(cell, str) else cell
        try:
            matched = key in _AUTO_CALIBRATION_SENTINELS
        except TypeError:
            # Unhashable cells (e.g. lists) cannot be sentinels.
            return cell
        return _AUTO_CALIBRATION_LABEL if matched else cell

    # A. Sanitize missing sentinel strings across all object/string columns
    for col in df.columns:
        if df[col].dtype == object or pd.api.types.is_string_dtype(df[col]):
            # astype(bool) keeps the mask boolean even when the column is empty
            mask = df[col].apply(_is_sentinel).astype(bool)
            df.loc[mask, col] = np.nan

    # B. Float coercion (FDR fields — decimal in [0, 1])
    for col in _FLOAT_FIELDS:
        if col not in df.columns:
            continue
        fdr = pd.to_numeric(df[col], errors="coerce").astype(float)
        fdr = fdr.replace([np.inf, -np.inf], np.nan)  # infinite FDR is treated as missing
        # Values >= 1 are assumed to be percentages
        df[col] = fdr.mask(fdr >= 1, fdr / 100)

    # C. Integer coercion (nullable Int64 so NaN is preserved)
    for col in _INT_FIELDS:
        if col not in df.columns:
            continue
        numbers = pd.to_numeric(df[col], errors="coerce")
        if numbers.dtype.kind == "f":
            # Infinities cannot be cast to Int64; store them as missing instead of raising.
            numbers = numbers.replace([np.inf, -np.inf], np.nan)
        df[col] = numbers.round().astype("Int64")

    # D. Boolean coercion
    if "enable_match_between_runs" in df.columns:
        df["enable_match_between_runs"] = df["enable_match_between_runs"].apply(_to_bool)

    # E. Enzyme name normalization
    if "enzyme" in df.columns:
        df["enzyme"] = df["enzyme"].apply(_to_enzyme)

    # F. Tolerance auto-calibration mapping
    for col in _TOLERANCE_FIELDS:
        if col not in df.columns:
            continue
        df[col] = df[col].apply(_to_tolerance)

    return df
# When executed as a script, build a parameter set from the default field-definition JSON and print it.
if __name__ == "__main__":
    print(ProteoBenchParameters())