Source code for proteobench.modules.template.module
from __future__ import annotations
import datetime
from dataclasses import asdict
import pandas as pd
from typing import Any, Optional, Tuple
from proteobench.modules.template.datapoint import Datapoint
from proteobench.modules.template.parse import ParseInputs
from proteobench.modules.template.parse_settings import TEMPLATE_RESULTS_PATH, ParseSettings
class Module:
    """Template benchmarking module.

    Bundles the steps needed to turn a workflow output file into a benchmark
    datapoint and to append that datapoint to the previously collected ones.
    """

    @staticmethod
    def is_implemented() -> bool:
        """Return whether the module is fully implemented."""
        return True

    @staticmethod
    def generate_datapoint(intermediate: pd.DataFrame, input_format: str, user_input: dict) -> pd.Series:
        """
        Compute the benchmark datapoint for the provided intermediate structure.

        Declared as a static method so that it can be called both on the class
        and on an instance (``benchmarking`` calls it through ``self``).

        Parameters
        ----------
        intermediate
            The intermediate data structure.
        input_format
            The format of the input file; also stored as the search engine name.
        user_input
            The user input settings. The keys ``version``, ``fdr_psm``,
            ``fdr_peptide``, ``fdr_protein``, ``mbr``,
            ``precursor_mass_tolerance``, ``fragment_mass_tolerance``,
            ``search_enzyme_name``, ``allowed_missed_cleavage``,
            ``min_peptide_length`` and ``max_peptide_length`` are read.

        Returns
        -------
        pd.Series
            The fields of the populated ``Datapoint`` as a Series.

        Raises
        ------
        KeyError
            If one of the required keys is missing from ``user_input``.
        """
        # Leave these lines as they are
        result_datapoint = Datapoint(
            id=input_format + "_" + user_input["version"] + "_" + str(datetime.datetime.now()),
            # Add/remove your own metadata here
            search_engine=input_format,
            software_version=user_input["version"],
            fdr_psm=user_input["fdr_psm"],
            fdr_peptide=user_input["fdr_peptide"],
            fdr_protein=user_input["fdr_protein"],
            MBR=user_input["mbr"],
            precursor_tol=user_input["precursor_mass_tolerance"],
            fragment_tol=user_input["fragment_mass_tolerance"],
            enzyme_name=user_input["search_enzyme_name"],
            missed_cleavages=user_input["allowed_missed_cleavage"],
            min_pep_length=user_input["min_peptide_length"],
            max_pep_length=user_input["max_peptide_length"],
        )
        # NOTE(review): generate_id() presumably replaces the provisional id
        # built above -- confirm against Datapoint.
        result_datapoint.generate_id()
        result_datapoint.calculate_plot_data(intermediate)

        # Flatten the dataclass into a Series so it can be concatenated with
        # the table of earlier datapoints.
        return pd.Series(asdict(result_datapoint))

    def add_current_data_point(
        self, current_datapoint: pd.Series, all_datapoints: Optional[pd.DataFrame] = None
    ) -> pd.DataFrame:
        """
        Add current data point to all data points and load them from file if empty.

        Parameters
        ----------
        current_datapoint
            The current data point to be added.
        all_datapoints
            The data points from previous runs, one row per data point. When
            this is not a DataFrame (e.g. ``None``), the data points are read
            from ``TEMPLATE_RESULTS_PATH`` instead.

        Returns
        -------
        pd.DataFrame
            The data points with the current data point added as the last row
            and a fresh ``0..n-1`` index.
        """
        if not isinstance(all_datapoints, pd.DataFrame):
            # NOTE(review): unlike the branch below, the frame loaded from the
            # results file is not transposed before the column-wise concat.
            # This is only correct if the file stores one datapoint per
            # column -- confirm against the results file layout.
            all_datapoints = pd.read_json(TEMPLATE_RESULTS_PATH)
        else:
            # Rows are datapoints; flip so that each datapoint is a column and
            # the new Series can be appended as one more column.
            all_datapoints = all_datapoints.T

        all_datapoints = pd.concat([all_datapoints, current_datapoint], axis=1)
        # Flip back to one row per datapoint and discard the (possibly
        # duplicated) column labels that became the index.
        all_datapoints = all_datapoints.T.reset_index(drop=True)
        return all_datapoints

    def benchmarking(
        self, input_file: str, input_format: str, user_input: dict, all_datapoints
    ) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """
        Main workflow of the module. Used to benchmark workflow results.

        Parameters
        ----------
        input_file
            Path to the workflow output file.
        input_format
            Format of the workflow output file.
        user_input
            User provided parameters for plotting.
        all_datapoints
            DataFrame containing all datapoints from the proteobench repo, or
            a non-DataFrame value (e.g. ``None``) to load them from file.

        Returns
        -------
        tuple[DataFrame, DataFrame]
            Tuple containing the intermediate data structure, and all datapoints.
        """
        # Read input file.
        # NOTE(review): load_input_file() and generate_intermediate() are not
        # defined in the visible part of this class; they are expected to be
        # provided elsewhere -- confirm.
        input_df = self.load_input_file(input_file, input_format)

        # Parse user config
        parse_settings = ParseSettings(input_format)

        # Convert uploaded data to standard format
        standard_format = ParseInputs().convert_to_standard_format(input_df, parse_settings)

        # Create intermediate data structure for benchmarking
        intermediate_data_structure = self.generate_intermediate(standard_format, parse_settings)

        # Compute performance metrics
        current_datapoint = self.generate_datapoint(intermediate_data_structure, input_format, user_input)

        # Add data point to all data points. Keyword arguments are used so the
        # call cannot silently disagree with the parameter order of
        # add_current_data_point().
        all_datapoints = self.add_current_data_point(
            current_datapoint=current_datapoint, all_datapoints=all_datapoints
        )

        return intermediate_data_structure, all_datapoints