Source code for proteobench.plotting.plot_denovo

"""
Module for plotting results of de novo models
"""

import ast
from typing import Dict, List

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

EPSILON = 0.0001



[docs]
def flatten_results_column(df):
    """
    Flatten the nested ``results`` entries of ``df`` into one column per metric.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with a ``software_name`` column and a ``results`` column. Each
        ``results`` value is indexed as ``results[level][evaluation][metric]``
        with level in ("peptide", "aa"), evaluation in ("mass", "exact") and
        metric in ("precision", "recall", "coverage").

    Returns
    -------
    pd.DataFrame
        Frame with a fresh default index, an ``engine`` column (copied from
        ``software_name``) and twelve ``{level}_{evaluation}_{metric}`` columns.
    """
    levels = ("peptide", "aa")
    evaluations = ("mass", "exact")
    metrics = ("precision", "recall", "coverage")

    # Pre-create every output column so the column order is fixed and an empty
    # input still yields all thirteen columns.
    flat = {"engine": []}
    for level in levels:
        for evaluation in evaluations:
            for metric in metrics:
                flat[f"{level}_{evaluation}_{metric}"] = []

    for _, record in df.iterrows():
        flat["engine"].append(record["software_name"])
        nested = record["results"]
        for level in levels:
            for evaluation in evaluations:
                scores = nested[level][evaluation]
                for metric in metrics:
                    flat[f"{level}_{evaluation}_{metric}"].append(scores[metric])

    return pd.DataFrame(flat)




[docs]
class PlotDataPoint:
    """
    Class for plotting data points.
    """


[docs]
    @staticmethod
    def plot_metric(
        benchmark_metrics_df: pd.DataFrame,
        level: str = "precision",
        evaluation_type: str = "mass",
        software_colors: Dict[str, str] = {
            "AdaNovo": "#8b26ff",
            "Casanovo": "#8bc6fd",
            "DeepNovo": "#108E2E",
            "PepNet": "#F89008",
            "Pi-HelixNovo": "#E43924",
            "Pi-PrimeNovo": "#663200",
            "PEAKS": "#f032e6",
        },
        mapping: Dict[str, int] = {"old": 10, "new": 20},
        highlight_color: str = "#d30067",
        label: str = "None",
    ) -> go.Figure:
        """
        Plot the amino-acid-level score against the peptide-level score of every data point.

        Parameters
        ----------
        benchmark_metrics_df : pd.DataFrame
            One row per data point. Columns read for every row: ``results``
            (flattened with ``flatten_results_column``), ``software_name``,
            ``software_version``, ``id``, ``is_temporary`` and ``old_new``.
            Rows whose ``is_temporary`` is not True additionally need the
            parameter columns listed in the hover text below. ``comments``,
            ``submission_comments`` and ``Highlight`` are used when present.
        level : str
            Metric suffix of the flattened columns ("precision", "recall" or "coverage").
        evaluation_type : str
            Evaluation infix of the flattened columns ("mass" or "exact").
        software_colors : Dict[str, str]
            Marker color per software name; unknown names are drawn in gray.
            The default dictionary is only read, never modified.
        mapping : Dict[str, int]
            Marker size for each value found in the ``old_new`` column.
        highlight_color : str
            Color used for rows whose ``Highlight`` value is truthy.
        label : str
            Column whose values are drawn as text next to the markers, or the
            string "None" for markers only.

        Returns
        -------
        go.Figure
            Scatter plot with one trace per (color, software) combination.
        """
        # flatten_results_column returns a frame with a fresh 0..n-1 index and
        # pd.concat(axis=1) aligns on index labels, so the input must use the
        # same positional index or the rows would be misaligned and NaN-padded.
        benchmark_metrics_df = benchmark_metrics_df.reset_index(drop=True)
        results_df = flatten_results_column(benchmark_metrics_df)
        benchmark_metrics_df = pd.concat([benchmark_metrics_df, results_df], axis=1)

        x_column = f"peptide_{evaluation_type}_{level}"
        y_column = f"aa_{evaluation_type}_{level}"

        def _shorten(comment: str) -> str:
            # Show at most 10 characters; the ellipsis marks actual truncation only.
            return comment[:10] + "..." if len(comment) > 10 else comment

        # Add hover text with detailed information for each data point
        hover_texts = []
        for idx, _ in benchmark_metrics_df.iterrows():
            datapoint_text = ""
            # Explicit comparison on purpose: a missing flag (NaN) is truthy
            # but must be treated as "not temporary".
            if benchmark_metrics_df.is_temporary[idx] == True:  # noqa: E712
                datapoint_text = (
                    f"ProteoBench ID: {benchmark_metrics_df.id[idx]}<br>"
                    + f"Software tool: {benchmark_metrics_df.software_name[idx]} {benchmark_metrics_df.software_version[idx]}<br>"
                )
                if "comments" in benchmark_metrics_df.columns:
                    comment = benchmark_metrics_df.comments[idx]
                    if isinstance(comment, str):
                        datapoint_text += f"Comment (private submission): {_shorten(comment)}"
            else:
                # TODO: Determine parameters based on module
                datapoint_text = (
                    f"ProteoBench ID: {benchmark_metrics_df.id[idx]}<br>"
                    + f"Software tool: {benchmark_metrics_df.software_name[idx]} {benchmark_metrics_df.software_version[idx]}<br>"
                    + f"Model checkpoint: {benchmark_metrics_df.checkpoint[idx]}<br>"
                    + f"Number of Beams: {benchmark_metrics_df.n_beams[idx]}<br>"
                    + f"Decoding Strategy: {benchmark_metrics_df.decoding_strategy[idx]}<br>"
                    + f"Precursor Tolerance: {benchmark_metrics_df.precursor_mass_tolerance[idx]}<br>"
                    + f"Tolerance for precursor removal: {benchmark_metrics_df.remove_precursor_tol[idx]}<br>"
                    + f"Number of peaks: {benchmark_metrics_df.n_peaks[idx]} <br>"
                    + f"Min mz: {benchmark_metrics_df.min_mz[idx]}<br>"
                    + f"Max mz: {benchmark_metrics_df.max_mz[idx]}<br>"
                    + f"Min peptide length: {benchmark_metrics_df.min_peptide_length[idx]}<br>"
                    + f"Max peptide length: {benchmark_metrics_df.max_peptide_length[idx]}<br>"
                    + f"Min intensity: {benchmark_metrics_df.min_intensity[idx]}<br>"
                    + f"Max intensity: {benchmark_metrics_df.max_intensity[idx]}<br>"
                    + f"Max precursor charge: {benchmark_metrics_df.max_precursor_charge[idx]}<br>"
                    + f"Isotope error range: {benchmark_metrics_df.isotope_error_range[idx]}<br>"
                )
                if "submission_comments" in benchmark_metrics_df.columns:
                    comment = benchmark_metrics_df.submission_comments[idx]
                    if isinstance(comment, str):
                        datapoint_text += f"Comment (public submission): {_shorten(comment)}"

            hover_texts.append(datapoint_text)

        has_highlight = "Highlight" in benchmark_metrics_df.columns

        # Marker size depends on old/new status; highlighted points are doubled.
        scatter_size = [mapping[item] for item in benchmark_metrics_df["old_new"]]
        if has_highlight:
            scatter_size = [
                item * 2 if highlight else item
                for item, highlight in zip(scatter_size, benchmark_metrics_df["Highlight"])
            ]

        # Color plot based on software tool; unknown tools fall back to gray,
        # as in the other plots of this class.
        colors = [software_colors.get(software, "gray") for software in benchmark_metrics_df["software_name"]]
        if has_highlight:
            colors = [
                highlight_color if highlight else item
                for item, highlight in zip(colors, benchmark_metrics_df["Highlight"])
            ]

        benchmark_metrics_df["color"] = colors
        benchmark_metrics_df["hover_text"] = hover_texts
        benchmark_metrics_df["scatter_size"] = scatter_size

        # Pad both axes by 5 % of the extreme values. Only the two plotted
        # columns are reduced, so the string "engine" column is never involved.
        x_min, x_max = results_df[x_column].min(), results_df[x_column].max()
        y_min, y_max = results_df[y_column].min(), results_df[y_column].max()
        layout_xaxis_range = [x_min - x_min * 0.05, x_max + x_max * 0.05]
        layout_yaxis_range = [y_min - y_min * 0.05, y_max + y_max * 0.05]
        layout_xaxis_title = f"Peptide {level.capitalize()}"
        layout_yaxis_title = f"Amino Acid {level.capitalize()}"

        fig = go.Figure(
            layout_yaxis_range=layout_yaxis_range,
            layout_xaxis_range=layout_xaxis_range,
        )

        # Get all unique color-software combinations (necessary for highlighting:
        # a highlighted point of a tool gets its own trace)
        color_software_combinations = benchmark_metrics_df[["color", "software_name"]].drop_duplicates()

        # plot the data points, one trace per color-software combination
        for _, row in color_software_combinations.iterrows():
            color = row["color"]
            software = row["software_name"]

            tmp_df = benchmark_metrics_df[
                (benchmark_metrics_df["color"] == color) & (benchmark_metrics_df["software_name"] == software)
            ]

            fig.add_trace(
                go.Scatter(
                    x=tmp_df[x_column],
                    y=tmp_df[y_column],
                    mode="markers" if label == "None" else "markers+text",
                    hovertext=tmp_df["hover_text"],
                    text=tmp_df[label] if label != "None" else None,
                    marker=dict(color=tmp_df["color"], showscale=False),
                    marker_size=tmp_df["scatter_size"],
                    name=tmp_df["software_name"].iloc[0],
                )
            )

        fig.update_layout(
            width=700,
            height=700,
            xaxis=dict(
                title=layout_xaxis_title,
                gridcolor="white",
                gridwidth=2,
                linecolor="black",
            ),
            yaxis=dict(
                title=layout_yaxis_title,
                gridcolor="white",
                gridwidth=2,
                linecolor="black",
            ),
        )
        fig.update_xaxes(showgrid=True, gridcolor="lightgray", gridwidth=1)
        fig.update_yaxes(showgrid=True, gridcolor="lightgray", gridwidth=1)

        # Large translucent watermark in the middle of the plotting area.
        fig.add_annotation(
            x=0.5,
            y=0.5,
            xref="paper",
            yref="paper",
            text="-Alpha-",
            font=dict(size=50, color="rgba(0,0,0,0.1)"),
            showarrow=False,
        )

        fig.update_layout(clickmode="event+select")

        return fig



[docs]
    def plot_ptm_overview(
        self,
        benchmark_metrics_df: pd.DataFrame,
        mod_labels: List[str],
        software_colors: Dict[str, str] = {
            "AdaNovo": "#8b26ff",
            "Casanovo": "#8bc6fd",
            "DeepNovo": "#108E2E",
            "PepNet": "#F89008",
            "Pi-HelixNovo": "#E43924",
            "Pi-PrimeNovo": "#663200",
            "PEAKS": "#f032e6",
        },
    ):
        """
        Draw one line per data point with its score for each requested modification.

        Parameters
        ----------
        benchmark_metrics_df : pd.DataFrame
            One row per data point; ``results["in_depth"]["PTM"]`` and
            ``software_name`` are read from every row.
        mod_labels : List[str]
            Modification labels, passed on to ``get_modification_scores``.
        software_colors : Dict[str, str]
            Marker color per software name; every plotted tool must be a key.

        Returns
        -------
        go.Figure
            Figure with one "lines+markers" trace per row.
        """
        figure = go.Figure()

        for _, record in benchmark_metrics_df.iterrows():
            ptm_results = record["results"]["in_depth"]["PTM"]
            labels, scores = self.get_modification_scores(ptm_results, mod_labels=mod_labels)
            software = record["software_name"]
            trace = go.Scatter(
                x=labels,
                y=scores,
                mode="lines+markers",
                name=software,
                marker={"color": software_colors[software]},
            )
            figure.add_trace(trace)

        figure.update_layout(
            width=700,
            height=400,
            xaxis={"title": "Modification", "color": "black", "gridwidth": 2, "linecolor": "black"},
            yaxis={"linecolor": "black"},
        )
        # Single y-axis update; the final grid width is 1.
        figure.update_yaxes(
            title="Precision",
            color="black",
            showgrid=True,
            gridcolor="lightgray",
            gridwidth=1,
        )

        return figure



[docs]
    def plot_ptm_specific(
        self,
        benchmark_metrics_df,
        mod_label,
        software_colors: Dict[str, str] = {
            "AdaNovo": "#8b26ff",
            "Casanovo": "#8bc6fd",
            "DeepNovo": "#108E2E",
            "PepNet": "#F89008",
            "Pi-HelixNovo": "#E43924",
            "Pi-PrimeNovo": "#663200",
            "PEAKS": "#f032e6",
        },
    ):
        """
        Plot, for one modification, a single point per data point.

        The x value is ``correct_gt / counts_gt`` and the y value is
        ``correct_dn / counts_dn`` of the selected modification.

        Parameters
        ----------
        benchmark_metrics_df : pd.DataFrame
            One row per data point; ``results["in_depth"]["PTM"][mod_label]``
            and ``software_name`` are read from every row.
        mod_label : str
            Key of the modification inside the PTM results.
        software_colors : Dict[str, str]
            Marker color per software name; every plotted tool must be a key.

        Returns
        -------
        go.Figure
            Figure with one single-point trace per row.
        """
        figure = go.Figure()

        for _, record in benchmark_metrics_df.iterrows():
            counts = record["results"]["in_depth"]["PTM"][mod_label]

            # EPSILON in the denominator rules out a division by zero.
            gt_score = counts["correct_gt"] / (counts["counts_gt"] + EPSILON)
            dn_score = counts["correct_dn"] / (counts["counts_dn"] + EPSILON)

            software = record["software_name"]
            point = go.Scatter(
                x=[gt_score],
                y=[dn_score],
                name=software,
                marker={"color": software_colors[software]},
            )
            figure.add_trace(point)

        axis_style = {"color": "black", "gridwidth": 2}
        figure.update_layout(
            width=500,
            height=500,
            xaxis={"title": "Precision (Ground-truth)", **axis_style},
            yaxis={"title": "Precision (denovo)", **axis_style},
        )

        return figure



[docs]
    @staticmethod
    def get_modification_scores(mod_dict, mod_labels):
        """
        Compute ``correct_gt / counts_gt`` for each requested modification.

        Parameters
        ----------
        mod_dict : dict
            Maps a modification label to a mapping with the keys
            ``correct_gt`` and ``counts_gt``.
        mod_labels : iterable of str
            Labels to look up in ``mod_dict``, in plotting order.

        Returns
        -------
        tuple of (list, list)
            The labels and, in the same order, their scores.

        Raises
        ------
        KeyError
            If a label or one of the two count keys is missing.
        """
        x = list(mod_labels)
        # EPSILON belongs in the denominator: it keeps a zero count from
        # raising ZeroDivisionError without offsetting the score itself.
        y = [mod_dict[mod_label]["correct_gt"] / (mod_dict[mod_label]["counts_gt"] + EPSILON) for mod_label in x]
        return x, y



[docs]
    def plot_spectrum_feature(
        self,
        benchmark_metrics_df,
        feature,
        evaluation_type="mass",
        software_colors={
            "AdaNovo": "#8b26ff",
            "Casanovo": "#8bc6fd",
            "DeepNovo": "#108E2E",
            "PepNet": "#F89008",
            "Pi-HelixNovo": "#E43924",
            "Pi-PrimeNovo": "#663200",
            "PEAKS": "#f032e6",
            "test": "black",
        },
    ):
        """
        Plot a score per value of a spectrum feature, with spectrum counts underneath.

        The upper panel holds one line per data point; the lower panel holds a
        bar per feature value with the median ``n_spectra`` over all data points.

        Parameters
        ----------
        benchmark_metrics_df : pd.DataFrame
            One row per data point with the columns ``results``,
            ``software_name`` and ``id``.
            ``results["in_depth"]["Spectrum"][feature]`` must map each feature
            value to a mapping holding ``evaluation_type`` and ``n_spectra``.
        feature : str
            Key of the spectrum feature; also used as the x-axis title.
        evaluation_type : str
            Key of the score to plot inside each metrics mapping.
        software_colors : dict
            Marker color per software name; unknown names are drawn in gray.
            The default dictionary is only read, never modified.

        Returns
        -------
        go.Figure
            Two-row figure sharing the x-axis. Empty placeholder traces are
            used when ``benchmark_metrics_df`` has no rows.
        """
        # Create a subplot with 2 rows, shared x-axis
        fig = make_subplots(
            rows=2,
            cols=1,
            shared_xaxes=True,
            row_heights=[0.8, 0.2],
            vertical_spacing=0,
            subplot_titles=(f"{feature} vs Precision", None),
        )

        def _finalize(figure):
            # Styling shared by the empty and the populated figure.
            figure.update_layout(
                height=600,
                width=600,
                xaxis=dict(title=None, color="black"),
                yaxis=dict(title="Precision", color="black"),
                xaxis2=dict(title=f"{feature}", color="black"),
                yaxis2=dict(title="Number of Spectra", color="black"),
                margin=dict(t=50),
            )
            # Reverse the y-axis of the count panel.
            figure.update_yaxes(autorange="reversed", row=2, col=1)
            return figure

        if len(benchmark_metrics_df) == 0:
            fig.add_trace(go.Scatter())
            fig.add_trace(go.Bar())
            return _finalize(fig)

        ### Reformat df
        # A positional index is needed so the axis=1 concat below lines up.
        benchmark_metrics_df = benchmark_metrics_df.reset_index(drop=True)

        dtps_to_plot = [x["in_depth"]["Spectrum"][feature] for x in benchmark_metrics_df["results"].tolist()]
        # Stringify the keys of the datapoint to plot and convert to dataframe
        df = pd.DataFrame([{str(k): v for k, v in i.items()} for i in dtps_to_plot])

        # fillna cannot take a dict as a scalar fill value, so missing cells are
        # filled with the string form of the default and parsed back below.
        df = df.fillna(str({"exact": 0.0, "mass": 0.0, "n_spectra": 0}))
        df = (
            pd.concat([df, benchmark_metrics_df[["software_name", "id"]]], axis=1)
            .melt(id_vars=["id", "software_name"])
            .rename(columns={"variable": feature, "value": "metrics"})
        )
        # SECURITY: these strings come from the data frame, so they are parsed
        # with ast.literal_eval (literals only) instead of eval.
        df["metrics"] = df["metrics"].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

        ### Create the scatter-lineplot of the feature
        for dtp_id in df["id"].unique():
            df_dtp = df.loc[df["id"] == dtp_id]
            tool = df_dtp["software_name"].iloc[0]

            fig.add_trace(
                go.Scatter(
                    x=df_dtp[feature].tolist(),
                    y=df_dtp["metrics"].apply(lambda x: x[evaluation_type]).tolist(),
                    name=tool,
                    marker=dict(color=software_colors.get(tool, "gray")),
                    mode="lines+markers",
                ),
                row=1,
                col=1,
            )

        ### Create the bar chart
        # Extract the counts as medians for all plotted points
        bar_data = df.groupby(feature)["metrics"].apply(lambda x: np.median([i["n_spectra"] for i in x]))
        bar_counts = bar_data.tolist()
        bar_xaxis = bar_data.index.tolist()

        # Construct hover text
        def create_hovertext(group: pd.DataFrame):
            text = "Number of spectra for each tool"
            for point_id, metric in zip(group["id"], group["metrics"]):
                text += f'<br>{point_id}: {metric["n_spectra"]}'
            return text

        # Select the needed columns explicitly so apply does not operate on the
        # grouping column (deprecated in recent pandas). Both groupbys sort by
        # key, so the hover texts stay aligned with the bars.
        hovertexts = df.groupby(feature)[["id", "metrics"]].apply(create_hovertext).tolist()

        # Construct the barchart
        fig.add_trace(
            go.Bar(x=bar_xaxis, y=bar_counts, hovertext=hovertexts, marker=dict(color="gray"), showlegend=False),
            row=2,
            col=1,
        )

        return _finalize(fig)



[docs]
    def plot_species_overview(
        self,
        benchmark_metrics_df,
        evaluation_type="mass",
        software_colors={
            "AdaNovo": "#8b26ff",
            "Casanovo": "#8bc6fd",
            "DeepNovo": "#108E2E",
            "PepNet": "#F89008",
            "Pi-HelixNovo": "#E43924",
            "Pi-PrimeNovo": "#663200",
            "PEAKS": "#f032e6",
            "test": "black",
        },
    ):
        """
        Plot a score per species, with spectrum counts underneath.

        The upper panel holds one line per data point; the lower panel holds a
        bar per species with the median ``n_spectra`` over all data points.

        Parameters
        ----------
        benchmark_metrics_df : pd.DataFrame
            One row per data point with the columns ``results``,
            ``software_name`` and ``id``. ``results["in_depth"]["Species"]``
            must map each species to a mapping holding ``evaluation_type`` and
            ``n_spectra``.
        evaluation_type : str
            Key of the score to plot inside each metrics mapping.
        software_colors : dict
            Marker color per software name; unknown names are drawn in gray.
            The default dictionary is only read, never modified.

        Returns
        -------
        go.Figure
            Two-row figure sharing the x-axis. Empty placeholder traces are
            used when ``benchmark_metrics_df`` has no rows.
        """
        # Create a subplot with 2 rows, shared x-axis
        fig = make_subplots(
            rows=2,
            cols=1,
            shared_xaxes=True,
            row_heights=[0.8, 0.2],
            vertical_spacing=0,
            subplot_titles=("Species vs Precision", None),
        )

        def _finalize(figure):
            # Styling shared by the empty and the populated figure.
            figure.update_layout(
                height=600,
                width=600,
                xaxis=dict(title=None, color="black"),
                yaxis=dict(title="Precision", color="black"),
                xaxis2=dict(title="Species", color="black"),
                yaxis2=dict(title="Number of Spectra", color="black"),
                margin=dict(t=50),
            )
            # Reverse the y-axis of the count panel.
            figure.update_yaxes(autorange="reversed", row=2, col=1)
            return figure

        if len(benchmark_metrics_df) == 0:
            fig.add_trace(go.Scatter())
            fig.add_trace(go.Bar())
            return _finalize(fig)

        # A positional index is needed so the axis=1 concat below lines up.
        benchmark_metrics_df = benchmark_metrics_df.reset_index(drop=True)

        # fillna cannot take a dict as a scalar fill value, so missing cells are
        # filled with the string form of the default and parsed back below.
        df = pd.DataFrame([x["in_depth"]["Species"] for x in benchmark_metrics_df["results"].tolist()]).fillna(
            str({"exact": 0.0, "mass": 0.0, "n_spectra": 0})
        )
        df = (
            pd.concat([df, benchmark_metrics_df[["software_name", "id"]]], axis=1)
            .melt(id_vars=["id", "software_name"])
            .rename(columns={"variable": "Species", "value": "metrics"})
        )
        # SECURITY: these strings come from the data frame, so they are parsed
        # with ast.literal_eval (literals only) instead of eval. One pass after
        # the melt covers every species cell.
        df["metrics"] = df["metrics"].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

        ### Create the scatter-lineplot per data point
        for dtp_id in df["id"].unique():
            df_dtp = df.loc[df["id"] == dtp_id]
            tool = df_dtp["software_name"].iloc[0]

            fig.add_trace(
                go.Scatter(
                    x=df_dtp["Species"].tolist(),
                    y=df_dtp["metrics"].apply(lambda x: x[evaluation_type]).tolist(),
                    name=tool,
                    marker=dict(color=software_colors.get(tool, "gray")),
                    mode="lines+markers",
                ),
                row=1,
                col=1,
            )

        ### Create the bar chart
        # Extract the counts as medians for all plotted points
        bar_data = df.groupby("Species")["metrics"].apply(lambda x: np.median([i["n_spectra"] for i in x]))
        bar_counts = bar_data.tolist()
        bar_xaxis = bar_data.index.tolist()

        # Construct hover text
        def create_hovertext(group: pd.DataFrame):
            text = "Number of spectra for each tool"
            for point_id, metric in zip(group["id"], group["metrics"]):
                text += f'<br>{point_id}: {metric["n_spectra"]}'
            return text

        # Select the needed columns explicitly so apply does not operate on the
        # grouping column (deprecated in recent pandas). Both groupbys sort by
        # key, so the hover texts stay aligned with the bars.
        hovertexts = df.groupby("Species")[["id", "metrics"]].apply(create_hovertext).tolist()

        # Construct the barchart
        fig.add_trace(
            go.Bar(x=bar_xaxis, y=bar_counts, hovertext=hovertexts, marker=dict(color="gray"), showlegend=False),
            row=2,
            col=1,
        )

        return _finalize(fig)



[docs]
    def plot_species_specific():
        pass