Source code for proteobench.datapoint.entrapment_datapoint
"""
This module provides functionality for handling and processing entrapment benchmark datapoints in the ProteoBench framework.
"""
from __future__ import annotations
import dataclasses
import hashlib
import logging
from collections import ChainMap, defaultdict
from dataclasses import dataclass
from datetime import datetime
from typing import Any, Dict
import numpy as np
import pandas as pd
import proteobench
from proteobench.datapoint.datapoint_base import DatapointBase
from proteobench.score.entrapmentscores import EntrapmentScores
[docs]
@dataclass
class EntrapmentDatapoint(DatapointBase):
    """
    A data structure used to store the results of an entrapment benchmark run.

    This class extends DatapointBase to implement entrapment-specific metrics and metadata
    storage for LFQ benchmarking runs.

    Attributes:
        id (str): Unique identifier for the benchmark run.
        software_name (str): Name of the software used in the benchmark.
        software_version (str): Version of the software.
        search_engine (str): Name of the search engine used.
        search_engine_version (str): Version of the search engine.
        ident_fdr_psm (float): False discovery rate for PSMs.
        ident_fdr_peptide (float): False discovery rate for peptides.
        ident_fdr_protein (float): False discovery rate for proteins.
        enable_match_between_runs (bool): Whether matching between runs is enabled.
        precursor_mass_tolerance (str): Mass tolerance for precursor ions.
        fragment_mass_tolerance (str): Mass tolerance for fragment ions.
        enzyme (str): Enzyme used for digestion.
        allowed_miscleavages (int): Number of allowed miscleavages.
        min_peptide_length (int): Minimum peptide length.
        max_peptide_length (int): Maximum peptide length.
        is_temporary (bool): Whether the data is temporary.
        intermediate_hash (str): Hash of the intermediate result.
        results (dict): A dictionary of metrics for the benchmark run.
        nr_id_features (int): Number of identified features.
        lower_bound_FDP (float): Estimated false discovery proportion based on entrapment IDs.
        combined_FDP (float): Estimated false discovery proportion based on entrapment IDs.
        category_combined (str): Value of the ``category_combined`` metric returned by the
            scoring step.
        category_paired (str): Value of the ``category_paired`` metric returned by the
            scoring step.
        paired_FDP (float): Estimated false discovery proportion based on entrapment IDs.
        reported_fdr_parsed_from_input (float): FDR threshold inferred from the input data (max Q-value).
        fdp_curve (dict): Value of the ``fdp_curve`` metric returned by the scoring step.
        comments (str): Any additional comments.
        proteobench_version (str): Version of the Proteobench tool used.
    """

    id: str = None
    software_name: str = None
    # NOTE(review): the two version fields are annotated ``int`` with default 0, but
    # generate_datapoint() stores the user-supplied value, which the class docstring
    # describes as a string. Defaults are left untouched so stored datapoints do not change.
    software_version: int = 0
    search_engine: str = None
    search_engine_version: int = 0
    ident_fdr_psm: float = 0
    ident_fdr_peptide: float = 0
    ident_fdr_protein: float = 0
    enable_match_between_runs: bool = False
    precursor_mass_tolerance: str = None
    fragment_mass_tolerance: str = None
    enzyme: str = None
    allowed_miscleavages: int = 0
    min_peptide_length: int = 0
    max_peptide_length: int = 0
    is_temporary: bool = True
    intermediate_hash: str = ""
    results: dict = None
    nr_id_features: int = 0
    lower_bound_FDP: float = np.nan
    combined_FDP: float = np.nan
    category_combined: str = ""
    category_paired: str = ""
    paired_FDP: float = np.nan
    reported_fdr_parsed_from_input: float = np.nan
    fdp_curve: dict = None
    comments: str = ""
    proteobench_version: str = ""

    def generate_id(self) -> None:
        """
        Generate an ID for the benchmark run by combining the software name and a timestamp.

        The ID has the form ``<software_name>_<YYYYmmdd_HHMMSS>`` and is stored in
        ``self.id``, replacing any previous value. The assigned ID is logged at INFO level.
        """
        time_stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        self.id = "_".join([self.software_name, time_stamp])
        logging.info("Assigned the following ID to this run: %s", self.id)

    @staticmethod
    def generate_datapoint(
        intermediate: pd.DataFrame,
        input_format: str,
        user_input: dict,
    ) -> pd.Series:
        """
        Generate a Datapoint object containing metadata and results from the benchmark run.

        The caller's ``user_input`` mapping is not modified: a sanitised copy is
        used internally in which ``None`` values are replaced by empty strings and
        a missing ``"comments_for_plotting"`` entry defaults to ``""``.

        Parameters
        ----------
        intermediate : pd.DataFrame
            The intermediate DataFrame containing benchmark results.
        input_format : str
            The format of the input data (e.g., file format). It is also stored as
            the datapoint's ``software_name``.
        user_input : dict
            User-defined input values for the benchmark. If it is a
            ``collections.defaultdict`` its default factory is preserved for lookups
            of missing keys.

        Returns
        -------
        pd.Series
            A Pandas Series containing the Datapoint's attributes as key-value pairs.

        Raises
        ------
        KeyError
            If a required parameter is missing from ``user_input`` (and it provides
            no default factory), or if an expected metric is missing from the result
            of :meth:`get_metrics`.
        """
        formatted_datetime = datetime.now().strftime("%Y%m%d_%H%M%S_%f")

        # Build a cleaned copy instead of editing the caller's mapping in place.
        cleaned_input = {key: ("" if value is None else value) for key, value in user_input.items()}
        cleaned_input.setdefault("comments_for_plotting", "")
        if hasattr(user_input, "default_factory"):
            # Preserve the default factory so missing keys keep resolving as before.
            user_input = defaultdict(user_input.default_factory, cleaned_input)
        else:
            user_input = cleaned_input

        result_datapoint = EntrapmentDatapoint(
            # Provisional value only: generate_id() below replaces it. The f-string
            # avoids a TypeError when the version is not supplied as a string.
            id=f"{input_format}_{user_input['software_version']}_{formatted_datetime}",
            software_name=input_format,
            software_version=user_input["software_version"],
            search_engine=user_input["search_engine"],
            search_engine_version=user_input["search_engine_version"],
            ident_fdr_psm=user_input["ident_fdr_psm"],
            ident_fdr_peptide=user_input["ident_fdr_peptide"],
            ident_fdr_protein=user_input["ident_fdr_protein"],
            enable_match_between_runs=user_input["enable_match_between_runs"],
            precursor_mass_tolerance=user_input["precursor_mass_tolerance"],
            fragment_mass_tolerance=user_input["fragment_mass_tolerance"],
            enzyme=user_input["enzyme"],
            allowed_miscleavages=user_input["allowed_miscleavages"],
            min_peptide_length=user_input["min_peptide_length"],
            max_peptide_length=user_input["max_peptide_length"],
            # SHA-1 over the full text rendering of the intermediate table.
            intermediate_hash=str(hashlib.sha1(intermediate.to_string().encode("utf-8")).hexdigest()),
            comments=user_input["comments_for_plotting"],
            proteobench_version=proteobench.__version__,
        )
        result_datapoint.generate_id()

        metrics = EntrapmentDatapoint.get_metrics(intermediate)
        # Copy the headline metrics onto same-named attributes of the datapoint.
        metric_fields = (
            "reported_fdr_parsed_from_input",
            "nr_id_features",
            "lower_bound_FDP",
            "combined_FDP",
            "paired_FDP",
            "category_combined",
            "category_paired",
            "fdp_curve",
        )
        for field_name in metric_fields:
            setattr(result_datapoint, field_name, metrics[field_name])
        # The complete metrics mapping is kept as well.
        result_datapoint.results = metrics

        return pd.Series(dataclasses.asdict(result_datapoint))

    @staticmethod
    def get_metrics(intermediate: pd.DataFrame) -> Dict[str, Any]:
        """
        Compute the entrapment metrics for an intermediate result table.

        This is a thin wrapper around ``EntrapmentScores.calculate_metrics``.

        Parameters
        ----------
        intermediate : pd.DataFrame
            The intermediate DataFrame containing benchmark results.

        Returns
        -------
        Dict[str, Any]
            The metrics as returned by ``EntrapmentScores.calculate_metrics``.
            ``generate_datapoint`` reads the keys ``reported_fdr_parsed_from_input``,
            ``nr_id_features``, ``lower_bound_FDP``, ``combined_FDP``, ``paired_FDP``,
            ``category_combined``, ``category_paired`` and ``fdp_curve`` from it.
        """
        return EntrapmentScores.calculate_metrics(intermediate)