Source code for proteobench.datapoint.quant_datapoint

"""
This module provides functionality for handling and processing quantitative datapoints in the ProteoBench framework.
"""

from __future__ import annotations

import dataclasses
import hashlib
import logging
from collections import ChainMap, defaultdict
from dataclasses import dataclass
from datetime import datetime
from typing import Any, Dict

import pandas as pd

import proteobench



[docs]
def filter_df_numquant_epsilon(row: Dict[str, Any], min_quant: int = 3, metric: str = "median") -> float | None:
    """
    Extract the '<metric>_abs_epsilon' value stored under ``min_quant`` in a results dictionary.

    Parameters
    ----------
    row : dict
        Mapping from a minimum-quantification cutoff (int or str key) to a
        dictionary of metrics. Anything that is not a non-empty dict yields None.
    min_quant : int or str, optional
        The cutoff whose metrics should be read. Defaults to 3. When the first
        key of ``row`` is a string, ``min_quant`` is converted to a string
        before the lookup.
    metric : str
        Prefix of the metric to read, e.g. "median" or "mean". Defaults to "median".

    Returns
    -------
    float or None
        The '<metric>_abs_epsilon' value if found, otherwise None.
    """
    # Validate the type *before* touching dict-only methods, so that missing
    # values (None, NaN, ...) and empty dicts return None instead of raising.
    if not isinstance(row, dict) or not row:
        return None

    # The first key decides whether cutoffs are looked up as str or as given.
    lookup_key = str(min_quant) if isinstance(next(iter(row)), str) else min_quant

    metrics = row.get(lookup_key)
    if not isinstance(metrics, dict):
        return None

    return metrics.get(f"{metric}_abs_epsilon")




[docs]
def filter_df_numquant_nr_prec(row: Dict[Any, Any], min_quant: int = 3) -> int | None:
    """
    Extract the 'nr_prec' value stored under ``min_quant`` in a results dictionary.

    Parameters
    ----------
    row : dict
        Mapping from a minimum-quantification cutoff (int or str key) to a
        dictionary of metrics. Only dict input can produce a value; any other
        input (None, NaN, a pandas Series, ...) and an empty dict yield None.
    min_quant : int or str, optional
        The cutoff whose metrics should be read. Defaults to 3. When the first
        key of ``row`` is a string, ``min_quant`` is converted to a string
        before the lookup.

    Returns
    -------
    int, None
        The 'nr_prec' value if found, otherwise None.
    """
    # Validate the type *before* inspecting keys: an empty dict has no first
    # key and non-dict values have no usable mapping interface here.
    if not isinstance(row, dict) or not row:
        return None

    # The first key decides whether cutoffs are looked up as str or as given.
    lookup_key = str(min_quant) if isinstance(next(iter(row)), str) else min_quant

    metrics = row.get(lookup_key)
    if not isinstance(metrics, dict):
        return None

    return metrics.get("nr_prec")




[docs]
@dataclass
class QuantDatapoint:
    """
    A data structure used to store the results of a benchmark run.

    Attributes:
        id (str): Unique identifier for the benchmark run.
        software_name (str): Name of the software used in the benchmark.
        software_version (str): Version of the software.
        search_engine (str): Name of the search engine used.
        search_engine_version (str): Version of the search engine.
        ident_fdr_psm (float): False discovery rate for PSMs.
        ident_fdr_peptide (float): False discovery rate for peptides.
        ident_fdr_protein (float): False discovery rate for proteins.
        enable_match_between_runs (bool): Whether matching between runs is enabled.
        precursor_mass_tolerance (str): Mass tolerance for precursor ions.
        fragment_mass_tolerance (str): Mass tolerance for fragment ions.
        enzyme (str): Enzyme used for digestion.
        allowed_miscleavages (int): Number of allowed miscleavages.
        min_peptide_length (int): Minimum peptide length.
        max_peptide_length (int): Maximum peptide length.
        is_temporary (bool): Whether the data is temporary.
        intermediate_hash (str): Hash of the intermediate result.
        results (dict): A dictionary of metrics for the benchmark run.
        median_abs_epsilon (float): Median absolute epsilon value for the benchmark.
        mean_abs_epsilon (float): Mean absolute epsilon value for the benchmark.
        nr_prec (int): Number of precursors identified.
        comments (str): Any additional comments.
        proteobench_version (str): Version of the Proteobench tool used.
    """

    # NOTE: default values are kept exactly as before for backward
    # compatibility; only the annotations were aligned with the documented
    # types and with the ``None`` defaults.
    id: str | None = None
    software_name: str | None = None
    software_version: str | int = 0
    search_engine: str | None = None
    search_engine_version: str | int = 0
    ident_fdr_psm: float = 0
    ident_fdr_peptide: float = 0
    ident_fdr_protein: float = 0
    enable_match_between_runs: bool = False
    precursor_mass_tolerance: str | None = None
    fragment_mass_tolerance: str | None = None
    enzyme: str | None = None
    allowed_miscleavages: int = 0
    min_peptide_length: int = 0
    max_peptide_length: int = 0
    is_temporary: bool = True
    intermediate_hash: str = ""
    results: dict | None = None
    median_abs_epsilon: float = 0
    mean_abs_epsilon: float = 0
    nr_prec: int = 0
    comments: str = ""
    proteobench_version: str = ""

    def generate_id(self) -> None:
        """
        Generate an ID for the benchmark run by combining the software name and a timestamp.

        The ID has the form ``<software_name>_<YYYYmmdd_HHMMSS>`` and is stored
        in ``self.id``.

        Raises
        ------
        TypeError
            If ``software_name`` is not a string (e.g. still None).
        """
        time_stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        self.id = "_".join([self.software_name, time_stamp])
        logging.info("Assigned the following ID to this run: %s", self.id)

    @staticmethod
    def generate_datapoint(
        intermediate: pd.DataFrame, input_format: str, user_input: dict, default_cutoff_min_prec: int = 3
    ) -> pd.Series:
        """
        Generate a Datapoint object containing metadata and results from the benchmark run.

        Parameters
        ----------
        intermediate : pd.DataFrame
            The intermediate DataFrame containing benchmark results. Must provide
            the columns read by ``get_metrics`` ("nr_observed", "epsilon",
            "CV_A", "CV_B").
        input_format : str
            The format of the input data; stored as ``software_name``.
        user_input : dict
            User-defined input values for the benchmark. It is not modified:
            a cleaned copy is used in which None values are replaced by "" and
            a missing "comments_for_plotting" entry defaults to "".
        default_cutoff_min_prec : int, optional
            The cutoff (between 1 and 6) whose metrics are copied to the
            top-level fields of the datapoint. Defaults to 3.

        Returns
        -------
        pd.Series
            A Pandas Series containing the Datapoint's attributes as key-value pairs.

        Raises
        ------
        KeyError
            If a required key is missing from ``user_input`` (and it has no
            default factory), or if ``default_cutoff_min_prec`` is not one of
            the computed cutoffs 1..6.
        """
        # Work on a cleaned copy so the caller's mapping is left untouched.
        cleaned = {key: ("" if value is None else value) for key, value in user_input.items()}
        cleaned.setdefault("comments_for_plotting", "")
        if hasattr(user_input, "default_factory"):
            # Preserve defaultdict semantics for keys the user did not supply.
            cleaned = defaultdict(user_input.default_factory, cleaned)

        # No ``id`` is passed here: ``generate_id()`` below assigns the final
        # one, so any value given to the constructor would be overwritten.
        result_datapoint = QuantDatapoint(
            software_name=input_format,
            software_version=cleaned["software_version"],
            search_engine=cleaned["search_engine"],
            search_engine_version=cleaned["search_engine_version"],
            ident_fdr_psm=cleaned["ident_fdr_psm"],
            ident_fdr_peptide=cleaned["ident_fdr_peptide"],
            ident_fdr_protein=cleaned["ident_fdr_protein"],
            enable_match_between_runs=cleaned["enable_match_between_runs"],
            precursor_mass_tolerance=cleaned["precursor_mass_tolerance"],
            fragment_mass_tolerance=cleaned["fragment_mass_tolerance"],
            enzyme=cleaned["enzyme"],
            allowed_miscleavages=cleaned["allowed_miscleavages"],
            min_peptide_length=cleaned["min_peptide_length"],
            max_peptide_length=cleaned["max_peptide_length"],
            intermediate_hash=hashlib.sha1(intermediate.to_string().encode("utf-8")).hexdigest(),
            comments=cleaned["comments_for_plotting"],
            proteobench_version=proteobench.__version__,
        )
        result_datapoint.generate_id()

        # One metrics dictionary per cutoff 1..6, merged into a single mapping.
        per_cutoff = [QuantDatapoint.get_metrics(intermediate, nr_observed) for nr_observed in range(1, 7)]
        result_datapoint.results = dict(ChainMap(*per_cutoff))

        # Promote the metrics of the default cutoff to top-level fields.
        default_metrics = result_datapoint.results[default_cutoff_min_prec]
        result_datapoint.median_abs_epsilon = default_metrics["median_abs_epsilon"]
        result_datapoint.mean_abs_epsilon = default_metrics["mean_abs_epsilon"]
        result_datapoint.nr_prec = default_metrics["nr_prec"]

        return pd.Series(dataclasses.asdict(result_datapoint))

    @staticmethod
    def get_metrics(df: pd.DataFrame, min_nr_observed: int = 1) -> dict[int, dict[str, float]]:
        """
        Compute statistical metrics from the provided DataFrame for the benchmark.

        Produces the same metrics as ``get_metrics_old`` but with fewer passes
        over the data.

        Parameters
        ----------
        df : pd.DataFrame
            The DataFrame containing the benchmark results. The columns
            "nr_observed", "epsilon", "CV_A" and "CV_B" are read.
        min_nr_observed : int, optional
            Rows with ``nr_observed`` below this value are ignored. Defaults to 1.

        Returns
        -------
        dict
            ``{min_nr_observed: metrics}`` where ``metrics`` holds
            'median_abs_epsilon', 'mean_abs_epsilon', 'variance_epsilon',
            'nr_prec', 'CV_median', 'CV_q75', 'CV_q90' and 'CV_q95'.
        """
        # 1) Filter once
        df_slice = df[df["nr_observed"] >= min_nr_observed]

        # 2) Compute abs-epsilon only once
        abs_eps = df_slice["epsilon"].abs()

        # 3) Batch the CV quantiles in one go (rows indexed by quantile),
        #    then average the two CV columns for each quantile.
        cv_avg = df_slice[["CV_A", "CV_B"]].quantile([0.5, 0.75, 0.9, 0.95]).mean(axis=1)

        return {
            min_nr_observed: {
                "median_abs_epsilon": abs_eps.median(),
                "mean_abs_epsilon": abs_eps.mean(),
                "variance_epsilon": df_slice["epsilon"].var(),
                "nr_prec": len(df_slice),
                "CV_median": cv_avg.loc[0.50],
                "CV_q75": cv_avg.loc[0.75],
                "CV_q90": cv_avg.loc[0.90],
                "CV_q95": cv_avg.loc[0.95],
            }
        }

    @staticmethod
    def get_metrics_old(df: pd.DataFrame, min_nr_observed: int = 1) -> Dict[int, Dict[str, float]]:
        """
        Compute various statistical metrics from the provided DataFrame for the benchmark.

        Parameters
        ----------
        df : pd.DataFrame
            The DataFrame containing the benchmark results. The columns
            "nr_observed", "epsilon", "CV_A" and "CV_B" are read.
        min_nr_observed : int, optional
            Rows with ``nr_observed`` below this value are ignored. Defaults to 1.

        Returns
        -------
        dict
            A dictionary containing computed metrics such as 'median_abs_epsilon', 'variance_epsilon', etc.
        """
        # Filter DataFrame by the minimum number of observations
        df_slice = df[df["nr_observed"] >= min_nr_observed]
        nr_prec = len(df_slice)

        # Calculate the median absolute epsilon (insensitive to outliers)
        median_abs_epsilon = df_slice["epsilon"].abs().median()
        # Calculate the mean absolute epsilon (sensitive to outliers)
        mean_abs_epsilon = df_slice["epsilon"].abs().mean()

        # Calculate the variance of epsilon (sensitive to outliers)
        variance_epsilon = df_slice["epsilon"].var()

        # Average the quantiles of the coefficient of variation over 'CV_A' and 'CV_B'
        cv_median = (df_slice["CV_A"].median() + df_slice["CV_B"].median()) / 2
        cv_q75 = (df_slice["CV_A"].quantile(0.75) + df_slice["CV_B"].quantile(0.75)) / 2
        cv_q90 = (df_slice["CV_A"].quantile(0.9) + df_slice["CV_B"].quantile(0.9)) / 2
        cv_q95 = (df_slice["CV_A"].quantile(0.95) + df_slice["CV_B"].quantile(0.95)) / 2

        return {
            min_nr_observed: {
                "median_abs_epsilon": median_abs_epsilon,
                "mean_abs_epsilon": mean_abs_epsilon,
                "variance_epsilon": variance_epsilon,
                "nr_prec": nr_prec,
                "CV_median": cv_median,
                "CV_q90": cv_q90,
                "CV_q75": cv_q75,
                "CV_q95": cv_q95,
            }
        }