Source code for proteobench.io.parsing.parse_denovo
"""
Module for parsing results data from various de novo sequencing engines.
"""
import pandas as pd
from pyteomics.mztab import MzTab
def _load_adanovo(input_mztab: str) -> pd.DataFrame:
"""
Load an AdaNovo output file.
Parameters
----------
input_mztab: str
The path to the AdaNovo output file.
Returns
-------
pd.DataFrame
The loaded dataframe.
"""
input_data_frame = MzTab(input_mztab, encoding="utf-8")
input_data_frame = input_data_frame.spectrum_match_table
return input_data_frame
def _load_casanovo(input_mztab: str) -> pd.DataFrame:
"""
Load a Casanovo output file.
Parameters
----------
input_mztab: str
The path to the Casanovo output file.
Returns
-------
pd.DataFrame
The loaded dataframe.
"""
input_data_frame = MzTab(input_mztab)
input_data_frame = input_data_frame.spectrum_match_table
return input_data_frame
def _load_deepnovo(input_csv: str) -> pd.DataFrame:
"""
Load a DeepNovo output file.
Parameters
----------
input_path: str
The path to the DeepNovo output file.
Returns
-------
pd.DataFrame
The loaded dataframe.
"""
return pd.read_csv(input_csv, sep="\t", low_memory=False)
def _load_instanovo(input_mztab: str) -> pd.DataFrame:
"""
Load an InstaNovo output file.
Parameters
----------
input_mztab: str
The path to the InstaNovo output file.
Returns
-------
pd.DataFrame
The loaded dataframe.
"""
input_data_frame = MzTab(input_mztab)
input_data_frame = input_data_frame.spectrum_match_table
return input_data_frame
def _load_pepnet(input_csv: str) -> pd.DataFrame:
"""
Load a PepNet output file.
Parameters
----------
input_path: str
The path to the PepNet output file.
Returns
-------
pd.DataFrame
The loaded dataframe.
"""
return pd.read_csv(input_csv, sep="\t", low_memory=False)
def _load_pihelixnovo(input_path: str) -> pd.DataFrame:
"""
Load a Pi-HelixNovo output file.
Parameters
----------
input_path: str
The path to the Pi-HelixNovo output file.
Returns
-------
pd.DataFrame
The loaded dataframe.
"""
input_data_frame = pd.read_csv(input_path, sep="\t", low_memory=False, header=None)
return input_data_frame.rename(columns={0: "0", 1: "1", 2: "2"})
def _load_piprimenovo(input_path: str) -> pd.DataFrame:
"""
Load a Pi-PrimeNovo output file.
Parameters
----------
input_path: str
The path to the Pi-PrimeNovo output file.
Returns
-------
pd.DataFrame
The loaded dataframe.
"""
return pd.read_csv(input_path, sep="\t", low_memory=False).dropna()
def _load_custom(input_path: str) -> pd.DataFrame:
"""
Load a de novo output file in custom (generic) format.
Parameters
----------
input_path: str
The path to the output file
Returns
-------
pd.DataFrame
The loaded dataframe
"""
return pd.read_csv(input_path, low_memory=False)
_LOAD_FUNCTIONS = {
"AdaNovo": _load_adanovo,
"Casanovo": _load_casanovo,
"DeepNovo": _load_deepnovo,
"InstaNovo": _load_instanovo,
"PepNet": _load_pepnet,
"Pi-HelixNovo": _load_pihelixnovo,
"Pi-PrimeNovo": _load_piprimenovo,
"Custom": _load_custom,
}