Source code for proteobench.modules.denovo.denovo_DDA_HCD
"""
De Novo Module DDA-HCD spectra.
"""
from __future__ import annotations

from typing import Any

import pandas as pd
from pandas import DataFrame

from proteobench.datapoint.denovo_datapoint import DenovoDatapoint
from proteobench.exceptions import (
    ConvertStandardFormatError,
    IntermediateFormatGenerationError,
    ParseError,
    ParseSettingsError,
    QuantificationError,
)
from proteobench.io.parsing.parse_denovo import load_input_file
from proteobench.io.parsing.parse_settings import ParseSettingsBuilder
from proteobench.modules.constants import MODULE_SETTINGS_DIRS
from proteobench.modules.denovo.denovo_base import DeNovoModule
from proteobench.score.denovoscores import DenovoScores
[docs]
class DDAHCDDeNovoModule(DeNovoModule):
    """
    De novo benchmarking module for DDA-HCD spectra.

    Attributes
    ----------
    module_id : str
        Identifier of this module. It is used as the key into
        ``MODULE_SETTINGS_DIRS`` and is forwarded to the base class.
    """

    module_id = "denovo_DDA_HCD"

    def __init__(
        self,
        token: str,
        proteobot_repo_name: str = "Proteobot/Results_denovo_lfq_DDA_HCD",
        proteobench_repo_name: str = "Proteobench/Results_denovo_lfq_DDA_HCD",
    ):
        """
        Initialize the DDA-HCD de novo module.

        Parameters
        ----------
        token : str
            GitHub token for the user.
        proteobot_repo_name : str, optional
            Name of the repository for pull requests and where new points are added,
            by default "Proteobot/Results_denovo_lfq_DDA_HCD".
        proteobench_repo_name : str, optional
            Name of the repository where the benchmarking results will be stored,
            by default "Proteobench/Results_denovo_lfq_DDA_HCD".
        """
        super().__init__(
            token,
            proteobot_repo_name=proteobot_repo_name,
            proteobench_repo_name=proteobench_repo_name,
            parse_settings_dir=MODULE_SETTINGS_DIRS[self.module_id],
            module_id=self.module_id,
        )

    def is_implemented(self) -> bool:
        """
        Return whether the module is fully implemented.

        Returns
        -------
        bool
            Always returns False in this implementation.
        """
        return False

    def benchmarking(
        self,
        input_file_loc: Any,
        input_format: str,
        user_input: dict,
        all_datapoints: pd.DataFrame,
        evaluation_type: str = "mass",
    ) -> tuple[DataFrame, DataFrame, DataFrame]:
        """
        Main workflow of the module. Used to benchmark workflow results.

        The steps are: load the workflow output, build the parser for
        ``input_format``, convert the output to the standard format, compute
        the intermediate de novo scores, create a datapoint from them and add
        that datapoint to ``all_datapoints``.

        Parameters
        ----------
        input_file_loc : Any
            Path to the workflow output file.
        input_format : str
            Format of the workflow output file.
        user_input : dict
            User provided parameters, forwarded to the datapoint generation.
        all_datapoints : pd.DataFrame
            DataFrame containing the existing datapoints; the new datapoint is added to it.
        evaluation_type : str
            The evaluation type for precision calculation. Either `exact` or `mass`-based,
            by default "mass".

        Returns
        -------
        tuple[DataFrame, DataFrame, DataFrame]
            Tuple containing the intermediate data structure, all datapoints, and the input DataFrame.

        Raises
        ------
        ParseError
            If pandas fails to parse the workflow output file.
        ParseSettingsError
            If loading the input file fails for any other reason, or if the
            parse settings cannot be found or built.
        ConvertStandardFormatError
            If the conversion to the standard format fails.
        IntermediateFormatGenerationError
            If the intermediate data structure cannot be generated.
        """
        # Parse workflow output file
        try:
            input_df = load_input_file(input_file_loc, input_format)
        except pd.errors.ParserError as e:
            raise ParseError(
                f"Error parsing {input_format} file, please make sure the format is correct and the correct software tool is chosen: {e}"
            ) from e
        except Exception as e:
            raise ParseSettingsError("Error parsing the input file.") from e

        # Shared context appended to every parse-settings error message below.
        msg = f"Folder: {self.parse_settings_dir}, Module: {self.module_id}"

        # Parse settings file
        try:
            parse_settings = ParseSettingsBuilder(
                parse_settings_dir=self.parse_settings_dir, module_id=self.module_id
            ).build_parser(input_format)
        except KeyError as e:
            raise ParseSettingsError(
                f"Error parsing settings file for parsing, settings seem to be missing: {msg}"
            ) from e
        except FileNotFoundError as e:
            raise ParseSettingsError(f"Could not find the parsing settings file: {msg}") from e
        except Exception as e:
            raise ParseSettingsError(f"Error parsing settings file for parsing: {msg}") from e

        # Convert the tool-specific output to the standard format
        try:
            standard_format = parse_settings.convert_to_standard_format(input_df)
        except KeyError as e:
            raise ConvertStandardFormatError("Error converting to standard format, key missing.") from e
        except Exception as e:
            raise ConvertStandardFormatError("Error converting to standard format.") from e

        # Instantiate de novo scores
        denovo_score = DenovoScores()

        # Generate intermediate data structure (calculate the scores)
        try:
            intermediate_metric_structure = denovo_score.generate_intermediate(standard_format)
        except Exception as e:
            raise IntermediateFormatGenerationError("Error generating intermediate data structure.") from e

        # NOTE: datapoint generation and appending are not wrapped in a
        # try/except, so any error raised here propagates to the caller as-is.
        dtp = DenovoDatapoint()
        current_datapoint = dtp.generate_datapoint(
            intermediate=intermediate_metric_structure,
            input_format=input_format,
            user_input=user_input,
            evaluation_type=evaluation_type,
        )

        all_datapoints = self.add_current_data_point(current_datapoint, all_datapoints=all_datapoints)

        return (
            intermediate_metric_structure,
            all_datapoints,
            input_df,
        )