Source code for proteobench.modules.template.module

from __future__ import annotations

import datetime
from dataclasses import asdict

import pandas as pd

from typing import Any, Optional, Tuple

from proteobench.modules.template.datapoint import Datapoint
from proteobench.modules.template.parse import ParseInputs
from proteobench.modules.template.parse_settings import TEMPLATE_RESULTS_PATH, ParseSettings


[docs]class Module: """Description of the Module."""
[docs] def is_implemented() -> bool: """Returns whether the module is fully implemented.""" return True
[docs] def generate_intermediate(standard_format: dict, parse_settings: ParseSettings) -> pd.DataFrame: """ Calculate intermediate values from the uploaded file. Parameters ---------- standard_format The uploaded file in a standard format. parse_settings The settings used to parse the uploaded file. Returns ------- intermediate The intermediate values calculated from the uploaded file. """ # TODO calculate intermediate values intermediate = pd.DataFrame() return intermediate
[docs] def generate_datapoint(intermediate: pd.DataFrame, input_format: str, user_input: dict) -> Datapoint: """ Method used to compute benchmarks for the provided intermediate structure. Parameters ---------- intermediate The intermediate data structure. input_format The format of the input file. user_input The user input settings. Returns ------- df The computed benchmark values. """ # Leave these lines as they are result_datapoint = Datapoint( id=input_format + "_" + user_input["version"] + "_" + str(datetime.datetime.now()), # Add/remove your own metadata here search_engine=input_format, software_version=user_input["version"], fdr_psm=user_input["fdr_psm"], fdr_peptide=user_input["fdr_peptide"], fdr_protein=user_input["fdr_protein"], MBR=user_input["mbr"], precursor_tol=user_input["precursor_mass_tolerance"], fragment_tol=user_input["fragment_mass_tolerance"], enzyme_name=user_input["search_enzyme_name"], missed_cleavages=user_input["allowed_missed_cleavage"], min_pep_length=user_input["min_peptide_length"], max_pep_length=user_input["max_peptide_length"], ) result_datapoint.generate_id() result_datapoint.calculate_plot_data(intermediate) df = pd.Series(asdict(result_datapoint)) return df
[docs] def load_input_file(input_csv: str, input_format: str) -> pd.DataFrame: """ Method loads dataframe from a input file depending on its format. Parameters ---------- input_csv The path to the input file. input_format The format of the input file. Returns ------- input_data_frame The dataframe loaded from the input file. """ input_data_frame: pd.DataFrame # Format1 are the results from e.g. different search engines # Add simple format manupulations here if necessary if input_format == "Format1": input_data_frame = pd.read_csv(input_csv, sep="\t", low_memory=False) elif input_format == "Format2": input_data_frame = pd.read_csv(input_csv, low_memory=False) return input_data_frame
[docs] def add_current_data_point( self, current_datapoint: pd.Series, all_datapoints: Optional[pd.DataFrame] = None ) -> pd.DataFrame: """ Add current data point to all data points and load them from file if empty. Parameters ---------- all_datapoints The data points from previous runs. current_datapoint The current data point to be added. Returns ------- all_datapoints The data points with the current data point added. """ if not isinstance(all_datapoints, pd.DataFrame): all_datapoints = pd.read_json(TEMPLATE_RESULTS_PATH) else: all_datapoints = all_datapoints.T all_datapoints = pd.concat([all_datapoints, current_datapoint], axis=1) all_datapoints = all_datapoints.T.reset_index(drop=True) return all_datapoints
[docs] def benchmarking(self, input_file: str, input_format: str, user_input: dict, all_datapoints): """ Main workflow of the module. Used to benchmark workflow results. Parameters ---------- input_file Path to the workflow output file. input_format Format of the workflow output file. user_input User provided parameters for plotting. all_datapoints DataFrame containing all datapoints from the proteobench repo. default_cutoff_min_prec Minimum number of runs an ion has to be identified in. Returns ------- tuple[DataFrame, DataFrame] Tuple containing the intermediate data structure, and all datapoints. """ # Read input file # call load_input_file() method input_df = self.load_input_file(input_file, input_format) # Parse user config parse_settings = ParseSettings(input_format) # Converte uploaded data to standard format standard_format = ParseInputs().convert_to_standard_format(input_df, parse_settings) # Create intermediate data structure for benchmarking intermediate_data_structure = self.generate_intermediate(standard_format, parse_settings) # Compute performance metrics current_datapoint = self.generate_datapoint(intermediate_data_structure, input_format, user_input) # Add data point to all data points all_datapoints = self.add_current_data_point(all_datapoints, current_datapoint) return intermediate_data_structure, all_datapoints