Source code for proteobench.datapoint.entrapment_datapoint

"""
This module provides the EntrapmentDatapoint data structure, which stores the metadata and metrics of an entrapment benchmark run in the ProteoBench framework.
"""

from __future__ import annotations

import dataclasses
import hashlib
import logging
from collections import ChainMap, defaultdict
from dataclasses import dataclass
from datetime import datetime
from typing import Any, Dict

import numpy as np
import pandas as pd

import proteobench
from proteobench.datapoint.datapoint_base import DatapointBase
from proteobench.score.entrapmentscores import EntrapmentScores



[docs]
@dataclass
class EntrapmentDatapoint(DatapointBase):
    """
    A data structure used to store the results of an entrapment benchmark run.

    This class extends DatapointBase to implement entrapment-specific metrics and metadata
    storage for LFQ benchmarking runs.

    Every field declared here has a default, so an instance can be created empty and filled
    in afterwards. ``generate_datapoint`` builds a populated instance and returns it as a
    ``pd.Series``.

    Attributes:
        id (str): Unique identifier for the benchmark run (assigned by ``generate_id``).
        software_name (str): Name of the software used in the benchmark.
        software_version (str): Version of the software.
        search_engine (str): Name of the search engine used.
        search_engine_version (str): Version of the search engine.
        ident_fdr_psm (float): False discovery rate for PSMs.
        ident_fdr_peptide (float): False discovery rate for peptides.
        ident_fdr_protein (float): False discovery rate for proteins.
        enable_match_between_runs (bool): Whether matching between runs is enabled.
        precursor_mass_tolerance (str): Mass tolerance for precursor ions.
        fragment_mass_tolerance (str): Mass tolerance for fragment ions.
        enzyme (str): Enzyme used for digestion.
        allowed_miscleavages (int): Number of allowed miscleavages.
        min_peptide_length (int): Minimum peptide length.
        max_peptide_length (int): Maximum peptide length.
        is_temporary (bool): Whether the data is temporary.
        intermediate_hash (str): Hash of the intermediate result (``generate_datapoint`` stores
            the SHA-1 hex digest of the intermediate DataFrame's string rendering).
        results (dict): A dictionary of metrics for the benchmark run.
        nr_id_features (int): Number of identified features.
        lower_bound_FDP (float): estimated false discovery proportion based on entrapment IDs
            (filled from the ``lower_bound_FDP`` metric).
        combined_FDP (float): estimated False discovery proportion based on entrapment IDs
            (filled from the ``combined_FDP`` metric).
        category_combined (str): Filled from the ``category_combined`` metric; presumably a
            label accompanying ``combined_FDP`` -- TODO confirm against EntrapmentScores.
        category_paired (str): Filled from the ``category_paired`` metric; presumably a
            label accompanying ``paired_FDP`` -- TODO confirm against EntrapmentScores.
        paired_FDP (float): estimated False discovery proportion based on entrapment IDs
            (filled from the ``paired_FDP`` metric).
        reported_fdr_parsed_from_input (float): FDR threshold inferred from the input data (max Q-value).
        fdp_curve (dict): Filled from the ``fdp_curve`` metric; structure is defined by
            EntrapmentScores and is not visible here.
        comments (str): Any additional comments.
        proteobench_version (str): Version of the Proteobench tool used.
    """

    # NOTE: field order is part of the interface -- it determines the positional order of the
    # dataclass-generated __init__ and the key order of dataclasses.asdict().

    # --- Run identity and user-supplied search parameters ---
    id: str = None
    software_name: str = None
    # NOTE(review): annotated int with default 0, but the class docstring lists str and
    # generate_datapoint concatenates this value with strings -- confirm the intended type.
    software_version: int = 0
    search_engine: str = None
    # NOTE(review): annotated int, documented as str -- confirm the intended type.
    search_engine_version: int = 0
    # NOTE(review): the three FDR fields are annotated int but documented as float.
    ident_fdr_psm: int = 0
    ident_fdr_peptide: int = 0
    ident_fdr_protein: int = 0
    enable_match_between_runs: bool = False
    precursor_mass_tolerance: str = None
    fragment_mass_tolerance: str = None
    enzyme: str = None
    allowed_miscleavages: int = 0
    min_peptide_length: int = 0
    max_peptide_length: int = 0

    # --- Bookkeeping ---
    is_temporary: bool = True
    intermediate_hash: str = ""

    # --- Metrics (populated by generate_datapoint from the metrics mapping) ---
    results: dict = None
    nr_id_features: int = 0
    lower_bound_FDP: float = np.nan
    combined_FDP: float = np.nan
    category_combined: str = ""
    category_paired: str = ""
    paired_FDP: float = np.nan
    reported_fdr_parsed_from_input: float = np.nan
    fdp_curve: dict = None

    # --- Free text and provenance ---
    comments: str = ""
    proteobench_version: str = ""


[docs]
    def generate_id(self) -> None:
        """
        Assign ``self.id`` from the software name and the current local time.

        The resulting ID has the form ``<software_name>_<YYYYmmdd_HHMMSS>`` and is
        also reported through the root logger at INFO level.
        """
        run_stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        # str.join (rather than string formatting) keeps the TypeError that is raised
        # when software_name is not a string, e.g. still at its None default.
        id_parts = (self.software_name, run_stamp)
        self.id = "_".join(id_parts)
        logging.info(f"Assigned the following ID to this run: {self.id}")



[docs]
    @staticmethod
    def generate_datapoint(
        intermediate: pd.DataFrame,
        input_format: str,
        user_input: dict,
    ) -> pd.Series:
        """
        Generate a Datapoint object containing metadata and results from the benchmark run.

        The caller's ``user_input`` mapping is left untouched: the datapoint is built from a
        normalised copy in which ``None`` values are replaced by empty strings and a missing
        ``comments_for_plotting`` entry defaults to ``""``.

        Parameters
        ----------
        intermediate : pd.DataFrame
            The intermediate DataFrame containing benchmark results. It is hashed (via its
            string rendering) and passed to ``EntrapmentDatapoint.get_metrics``.
        input_format : str
            The format of the input data (e.g., file format). Also stored as the datapoint's
            ``software_name``.
        user_input : dict
            User-defined input values for the benchmark. The following keys are read:
            ``software_version``, ``search_engine``, ``search_engine_version``,
            ``ident_fdr_psm``, ``ident_fdr_peptide``, ``ident_fdr_protein``,
            ``enable_match_between_runs``, ``precursor_mass_tolerance``,
            ``fragment_mass_tolerance``, ``enzyme``, ``allowed_miscleavages``,
            ``min_peptide_length``, ``max_peptide_length`` and, optionally,
            ``comments_for_plotting``. If the mapping exposes a ``default_factory``
            (e.g. a ``defaultdict``), that factory is preserved for missing keys.

        Returns
        -------
        pd.Series
            A Pandas Series containing the Datapoint's attributes as key-value pairs.

        Raises
        ------
        KeyError
            If ``user_input`` is a plain ``dict`` lacking one of the required keys.
        TypeError
            If ``software_version`` (after ``None`` -> ``""`` normalisation) is not a string.
        """
        formatted_datetime = datetime.now().strftime("%Y%m%d_%H%M%S_%f")

        # Build a normalised copy so the caller's mapping is never mutated.
        cleaned = {key: ("" if value is None else value) for key, value in user_input.items()}
        try:
            default_factory = user_input.default_factory
        except AttributeError:
            params = cleaned
        else:
            # Keep defaultdict semantics so absent keys still resolve through the factory.
            params = defaultdict(default_factory, cleaned)
        params.setdefault("comments_for_plotting", "")

        result_datapoint = EntrapmentDatapoint(
            # Provisional id; generate_id() below replaces it.
            id=input_format + "_" + params["software_version"] + "_" + formatted_datetime,
            software_name=input_format,
            software_version=params["software_version"],
            search_engine=params["search_engine"],
            search_engine_version=params["search_engine_version"],
            ident_fdr_psm=params["ident_fdr_psm"],
            ident_fdr_peptide=params["ident_fdr_peptide"],
            ident_fdr_protein=params["ident_fdr_protein"],
            enable_match_between_runs=params["enable_match_between_runs"],
            precursor_mass_tolerance=params["precursor_mass_tolerance"],
            fragment_mass_tolerance=params["fragment_mass_tolerance"],
            enzyme=params["enzyme"],
            allowed_miscleavages=params["allowed_miscleavages"],
            min_peptide_length=params["min_peptide_length"],
            max_peptide_length=params["max_peptide_length"],
            # SHA-1 serves as a content fingerprint of the intermediate table here.
            intermediate_hash=hashlib.sha1(intermediate.to_string().encode("utf-8")).hexdigest(),
            comments=params["comments_for_plotting"],
            proteobench_version=proteobench.__version__,
        )

        result_datapoint.generate_id()
        metrics = EntrapmentDatapoint.get_metrics(intermediate)

        # Promote selected metrics to top-level fields; the full mapping is kept in `results`.
        promoted_metrics = (
            "reported_fdr_parsed_from_input",
            "nr_id_features",
            "lower_bound_FDP",
            "combined_FDP",
            "paired_FDP",
            "category_combined",
            "category_paired",
            "fdp_curve",
        )
        for metric_name in promoted_metrics:
            setattr(result_datapoint, metric_name, metrics[metric_name])

        result_datapoint.results = metrics

        return pd.Series(dataclasses.asdict(result_datapoint))



[docs]
    @staticmethod
    def get_metrics(intermediate: pd.DataFrame) -> Dict[str, Any]:
        """
        Compute the entrapment metrics for an intermediate result table.

        Parameters
        ----------
        intermediate : pd.DataFrame
            The intermediate DataFrame containing benchmark results.

        Returns
        -------
        Dict[str, Any]
            The value returned by ``EntrapmentScores.calculate_metrics``, passed through unchanged.
        """
        return EntrapmentScores.calculate_metrics(intermediate)