Source code for proteobench.plotting.plot_generator_lfq_HYE

from typing import Dict, List, Tuple, Union

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.figure_factory import create_distplot

from proteobench.plotting.plot_generator_base import PlotGeneratorBase


[docs] class LFQHYEPlotGenerator(PlotGeneratorBase): """ Plot generator for LFQ HYE (Human-Yeast-Ecoli) quantification modules. Used by DIA/DDA ion modules that use the HYE benchmark dataset. """
def generate_in_depth_plots(
    self, performance_data: pd.DataFrame, parse_settings: any, **kwargs
) -> Dict[str, go.Figure]:
    """
    Build the standard set of in-depth LFQ HYE figures.

    Parameters
    ----------
    performance_data : pd.DataFrame
        The performance data to plot.
    parse_settings : ParseSettings
        The parse settings for the module; its ``species_expected_ratio()``
        method supplies the expected ratios handed to the plotting helpers.
    **kwargs : dict
        Additional module-specific parameters (not read by this method).

    Returns
    -------
    Dict[str, go.Figure]
        Figures keyed by plot name: ``"logfc"``, ``"cv"`` and ``"ma_plot"``.
    """
    # Expected ratios are looked up once and shared by the plots that need them
    expected_ratios = parse_settings.species_expected_ratio()

    # Entries are evaluated top to bottom: fold-change histogram, CV violin, MA plot
    return {
        "logfc": self._plot_fold_change_histogram(performance_data, expected_ratios),
        "cv": self._plot_cv_violinplot(performance_data),
        "ma_plot": self._plot_ma_plot(performance_data, expected_ratios),
    }
def get_in_depth_plot_layout(self) -> list:
    """
    Describe how the in-depth LFQ HYE plots are arranged.

    Returns
    -------
    list
        One dictionary per display row, each with the plot names (``"plots"``),
        the number of columns (``"columns"``) and a title per plot (``"titles"``).
    """
    # First row: the two distribution plots side by side
    distribution_row = {
        "plots": ["logfc", "cv"],
        "columns": 2,
        "titles": {
            "logfc": "Log2 Fold Change distributions by species.",
            "cv": "Coefficient of variation distribution in Condition A and B.",
        },
    }

    # Second row: the MA plot on its own
    ma_row = {
        "plots": ["ma_plot"],
        "columns": 1,
        "titles": {"ma_plot": "MA plot"},
    }

    return [distribution_row, ma_row]
def get_in_depth_plot_descriptions(self) -> Dict[str, str]:
    """
    Provide a short description for every in-depth plot.

    Returns
    -------
    Dict[str, str]
        Mapping from plot name (``"logfc"``, ``"cv"``, ``"ma_plot"``) to its description.
    """
    described_plots = (
        ("logfc", "log2 fold changes calculated from the performance data"),
        ("cv", "CVs calculated from the performance data"),
        ("ma_plot", "MA plot calculated from the performance data"),
    )
    return dict(described_plots)
def _get_metric_column_name(self, metric: str, mode: str) -> Tuple[str, str, str]: """ Get the appropriate metric column names based on metric type and calculation mode. Parameters ---------- metric : str The metric type: "Median" or "Mean" mode : str The calculation mode: "Global" or "Equal weighted species" Returns ------- Tuple[str, str, str] Tuple of (metric_lower, mode_suffix, plot_title) for constructing column names and plot title. """ metric_lower = metric.lower() mode_suffix = "global" if mode == "Global" else "eq_species" mode_description = "global" if mode == "Global" else "equal weighted species" plot_title = ( f"{metric} absolute difference between measured and expected log2-transformed fold change " f"(calculated {mode_description})" ) return metric_lower, mode_suffix, plot_title
def plot_main_metric(
    self,
    result_df: pd.DataFrame,
    metric: str = "Median",
    mode: str = "Species-weighted",
    colorblind_mode: bool = False,
    software_colors: Dict[str, str] = {
        "MaxQuant": "#88ccef",
        "AlphaPept": "#cc6777",
        "ProlineStudio": "#ddcc77",
        "MSAngel": "#147733",
        "FragPipe": "#342288",
        "i2MassChroQ": "#aa4599",
        "Sage": "#671100",
        "WOMBAT": "#44aa9a",
        "DIA-NN": "#999934",
        "AlphaDIA": "#1D2732",
        "Custom": "#000000",
        "Spectronaut": "#007548",
        "FragPipe (DIA-NN quant)": "#F89008",
        "MSAID": "#bfef45",
        "MetaMorpheus": "#637C7A",
        "Proteome Discoverer": "#911eb4",
        "PEAKS": "#f032e6",
        "quantms": "#f5e830",
    },
    software_markers: Dict[str, str] = {
        "MaxQuant": "circle",
        "AlphaPept": "square",
        "ProlineStudio": "diamond",
        "MSAngel": "cross",
        "FragPipe": "x",
        "i2MassChroQ": "triangle-up",
        "Sage": "triangle-down",
        "WOMBAT": "pentagon",
        "DIA-NN": "star",
        "AlphaDIA": "star-triangle-up",
        "Custom": "star-square",
        "Spectronaut": "diamond-tall",
        "FragPipe (DIA-NN quant)": "circle-x",
        "MSAID": "square-cross",
        "MetaMorpheus": "asterisk",
        "Proteome Discoverer": "hash",
        "PEAKS": "diamond-wide",
        "quantms": "hexagram",
    },
    mapping: Dict[str, int] = {"old": 10, "new": 20},
    highlight_color: str = "#d30067",
    label: str = "",
    legend_name_map: Dict[str, str] = {"AlphaPept": "AlphaPept (legacy tool)"},
    annotation: str = "",
    **kwargs,
) -> go.Figure:
    """
    Generate the main performance metric plot for LFQ HYE modules.

    The dictionary defaults are only ever read, never modified, so sharing them
    between calls is safe. ``result_df`` is not modified either: all helper
    columns are added to an internal copy.

    Parameters
    ----------
    result_df : pd.DataFrame
        DataFrame containing the results to plot.
    metric : str, optional
        Metric to plot: "Median", "Mean" or "ROC-AUC".
    mode : str, optional
        Mode of calculation, either "Species-weighted" or "Global". Case-insensitive.
    colorblind_mode : bool, optional
        If True, use a different marker shape per software tool.
    software_colors : Dict[str, str]
        Mapping of software names to colors.
    software_markers : Dict[str, str]
        Mapping of software names to marker symbols.
    mapping : Dict[str, int]
        Mapping of the values in the ``old_new`` column to marker sizes.
    highlight_color : str
        Color used for rows whose ``Highlight`` value is truthy.
    label : str
        Name of the column whose values are drawn as text next to the markers.
        An empty string, "None", or a name that is not a column disables labels.
    legend_name_map : Dict[str, str]
        Mapping of software names to the names shown in the legend.
    annotation : str
        Text drawn as a large, faint watermark in the centre of the plot.
    **kwargs : dict
        Additional module-specific parameters.

    Returns
    -------
    go.Figure
        Plotly figure with the main performance metric plot.
    """
    # Get metric column names and plot_title based on selected metric and mode
    metric_lower, mode_suffix, plot_title = self._get_metric_column_name(metric, mode)

    if metric == "ROC-AUC":
        # ROC-AUC is a special case - direct column name, no mode suffix, no legacy column
        metric_col_name = "roc_auc"
        legacy_metric_col_name = None
        result_df = self._filter_datapoints_with_metric(result_df, metric_col_name)
    else:
        metric_col_name = f"{metric_lower}_abs_epsilon_{mode_suffix}"
        legacy_metric_col_name = f"{metric_lower}_abs_epsilon"
        # In "Species-weighted" mode only datapoints that carry the new metric are shown.
        # Compared case-insensitively, as documented for `mode`.
        if mode.lower() == "species-weighted":
            result_df = self._filter_datapoints_with_metric(result_df, metric_col_name)

    # Work on a private copy: helper columns are added below and must not leak to the caller
    result_df = result_df.copy()

    # Collect every value of the selected metric (new key first, legacy key as fallback)
    # and every precursor count; both are only used to size the axes.
    all_metric_values = []
    for thresholds in result_df["results"]:
        for scores in thresholds.values():
            value = scores.get(metric_col_name)
            if value is None and legacy_metric_col_name is not None:
                value = scores.get(legacy_metric_col_name)
            if value is not None:
                all_metric_values.append(value)

    all_nr_prec = [scores["nr_prec"] for thresholds in result_df["results"] for scores in thresholds.values()]

    # Add hover text with detailed information for each data point
    hover_texts = []
    for idx, _ in result_df.iterrows():
        datapoint_text = ""
        if result_df.is_temporary[idx] == True:  # noqa: E712 - elementwise value, may be a numpy bool
            datapoint_text = (
                f"ProteoBench ID: {result_df.id[idx]}<br>"
                + f"Software tool: {result_df.software_name[idx]} {result_df.software_version[idx]}<br>"
            )
            # Add keyword if present
            # TODO: potentially make more generic so that this does not have to be added in multiple plot_generator classes
            if "Keyword" in result_df.columns:
                keyword = result_df.Keyword[idx]
                if isinstance(keyword, str) and keyword.strip():
                    datapoint_text = datapoint_text + f"Keyword: {keyword}<br>"
            if "comments" in result_df.columns:
                comment = result_df.comments[idx]
                if isinstance(comment, str):
                    datapoint_text = (
                        datapoint_text
                        + f"Comment (private submission): {comment[:10] + '...' if len(comment) > 10 else comment}..."
                    )
        else:
            # TODO: Determine parameters based on module
            datapoint_text = (
                f"ProteoBench ID: {result_df.id[idx]}<br>"
                + f"Software tool: {result_df.software_name[idx]} {result_df.software_version[idx]}<br>"
                + f"Search engine: {result_df.search_engine[idx]} {result_df.search_engine_version[idx]}<br>"
                + f"FDR psm: {result_df.ident_fdr_psm[idx]}<br>"
                + f"MBR: {result_df.enable_match_between_runs[idx]}<br>"
                + f"Precursor Tolerance: {result_df.precursor_mass_tolerance[idx]}<br>"
                + f"Fragment Tolerance: {result_df.fragment_mass_tolerance[idx]}<br>"
                + f"Enzyme: {result_df.enzyme[idx]} <br>"
                + f"Missed Cleavages: {result_df.allowed_miscleavages[idx]}<br>"
                + f"Min peptide length: {result_df.min_peptide_length[idx]}<br>"
                + f"Max peptide length: {result_df.max_peptide_length[idx]}<br>"
            )
            if "submission_comments" in result_df.columns:
                comment = result_df.submission_comments[idx]
                if isinstance(comment, str):
                    datapoint_text = (
                        datapoint_text
                        + f"Comment (public submission): {comment[:10] + '...' if len(comment) > 10 else comment}..."
                    )
        hover_texts.append(datapoint_text)

    # Marker size, color and symbol per datapoint, based on old/new status and software tool
    scatter_size = [mapping[item] for item in result_df["old_new"]]
    colors = [software_colors[software] for software in result_df["software_name"]]
    markers = [software_markers[software] for software in result_df["software_name"]]

    if "Highlight" in result_df.columns:
        # Highlighted datapoints are drawn twice as large and in the highlight color
        scatter_size = [
            item * 2 if highlight else item for item, highlight in zip(scatter_size, result_df["Highlight"])
        ]
        colors = [highlight_color if highlight else item for item, highlight in zip(colors, result_df["Highlight"])]

    result_df["color"] = colors
    result_df["hover_text"] = hover_texts
    result_df["scatter_size"] = scatter_size
    result_df["marker"] = markers if colorblind_mode else "circle"

    # Axis ranges: 5 % padding on x; on y 5 % of the maximum, capped at 2000
    if all_metric_values:
        layout_xaxis_range = [
            min(all_metric_values) - min(all_metric_values) * 0.05,
            max(all_metric_values) + max(all_metric_values) * 0.05,
        ]
    else:
        layout_xaxis_range = [0, 1]

    if all_nr_prec:
        y_padding = min(max(all_nr_prec) * 0.05, 2000)
        layout_yaxis_range = [min(all_nr_prec) - y_padding, max(all_nr_prec) + y_padding]
    else:
        layout_yaxis_range = [0, 1000]

    fig = go.Figure(
        layout_yaxis_range=layout_yaxis_range,
        layout_xaxis_range=layout_xaxis_range,
    )

    # Get all unique color-software combinations (necessary for highlighting)
    color_software_combinations = result_df[["color", "software_name", "marker"]].drop_duplicates()

    # to do: remove this cast as soon as parameters are homogeneous, see #380
    if "enable_match_between_runs" in result_df.columns:
        result_df["enable_match_between_runs"] = result_df["enable_match_between_runs"].astype(str)

    # Text labels are only drawn when `label` names an existing column; the default ""
    # and the sentinel "None" both mean "markers only".
    show_labels = bool(label) and label != "None" and label in result_df.columns

    # plot the data points, one trace per software tool (and per color when highlighting)
    for _, row in color_software_combinations.iterrows():
        color = row["color"]
        software = row["software_name"]

        tmp_df = result_df[(result_df["color"] == color) & (result_df["software_name"] == software)]

        if metric_col_name in tmp_df.columns and tmp_df[metric_col_name].notna().any():
            # use new column, but fill null values with legacy if available
            if legacy_metric_col_name is not None and legacy_metric_col_name in tmp_df.columns:
                x_values = tmp_df[metric_col_name].fillna(tmp_df[legacy_metric_col_name])
            else:
                x_values = tmp_df[metric_col_name]
        elif legacy_metric_col_name is not None:
            # fall back to legacy column if new not available
            x_values = tmp_df[legacy_metric_col_name]
        else:
            # No fallback available (e.g. ROC-AUC case)
            x_values = tmp_df[metric_col_name]

        fig.add_trace(
            go.Scatter(
                x=x_values,
                y=tmp_df["nr_prec"].tolist(),
                mode="markers+text" if show_labels else "markers",
                hovertext=tmp_df["hover_text"].tolist(),
                text=tmp_df[label].tolist() if show_labels else None,
                marker=dict(color=tmp_df["color"].tolist(), showscale=False, symbol=tmp_df["marker"].tolist()),
                marker_size=tmp_df["scatter_size"].tolist(),
                name=legend_name_map.get(software, software),
            )
        )

    fig.update_layout(
        width=None,
        height=700,
        autosize=True,
        xaxis=dict(
            title=plot_title,
            gridcolor="white",
            gridwidth=2,
            linecolor="black",
        ),
        yaxis=dict(
            title="Total number of precursor ions quantified in the selected number of raw files",
            gridcolor="white",
            gridwidth=2,
            linecolor="black",
        ),
        margin=dict(l=80, r=20, t=50, b=80),
    )
    fig.update_xaxes(showgrid=True, gridcolor="lightgray", gridwidth=1)
    fig.update_yaxes(showgrid=True, gridcolor="lightgray", gridwidth=1)

    # Faint watermark in the centre of the plotting area
    fig.add_annotation(
        x=0.5,
        y=0.5,
        xref="paper",
        yref="paper",
        text=annotation,
        font=dict(size=50, color="rgba(0,0,0,0.1)"),
        showarrow=False,
    )
    fig.update_layout(clickmode="event+select")

    return fig
def _plot_fold_change_histogram( self, performance_data: pd.DataFrame, species_expected_ratio: Dict[str, Dict[str, Union[float, str]]] ) -> go.Figure: """ Generate fold change histogram plot. Parameters ---------- performance_data : pd.DataFrame Performance data containing log2_A_vs_B column species_expected_ratio : Dict[str, Dict[str, Union[float, str]]] Dictionary with expected ratios for each species, and colors Returns ------- go.Figure Plotly figure with fold change distributions """ species_list = list(species_expected_ratio.keys()) # Filter to rows where at least one species is present performance_data = performance_data[performance_data[species_list].any(axis=1)] performance_data["species"] = performance_data[species_list].apply(lambda x: species_list[np.argmax(x)], axis=1) # Prepare plot data hist_data = [] group_labels = [] colors = [] for species in species_list: hist_data.append( performance_data.loc[performance_data["species"] == species, "log2_A_vs_B"].dropna().tolist() ) group_labels.append(species) colors.append(species_expected_ratio[species].get("color", "#000000")) # Create distribution plot if hist_data: fig = create_distplot( hist_data, group_labels, show_hist=False, show_rug=False, colors=colors, ) for trace in fig.data: if trace.mode == "lines": trace.update(fill="tozeroy", opacity=0.4) fig.update_layout( xaxis=dict( title="Log2(Condition A / Condition B)", color="black", gridwidth=1, gridcolor="lightgray", range=[-4, 4], ), yaxis=dict(title="Density", color="black", gridwidth=1, gridcolor="lightgray"), ) ratio_map = {species: np.log2(data["A_vs_B"]) for species, data in species_expected_ratio.items()} for species, ratio in ratio_map.items(): fig.add_vline( x=ratio, line_dash="dash", line_color=species_expected_ratio[species].get("color", "#000000"), annotation_text=f"Expected {species}", ) else: # Create empty figure if no data fig = go.Figure() fig.add_annotation( text="No data available for fold change plot", xref="paper", yref="paper", 
x=0.5, y=0.5, showarrow=False, ) return fig def _plot_cv_violinplot(self, performance_data: pd.DataFrame) -> go.Figure: """ Generate coefficient of variation violin plot. Parameters ---------- performance_data : pd.DataFrame Performance data containing CV_A and CV_B columns Returns ------- go.Figure Plotly figure with CV violin plots """ # Prepare data for violin plot cv_data = [] conditions = [] # Add CV data for Condition A if "CV_A" in performance_data.columns: cv_a = performance_data["CV_A"].replace([np.inf, -np.inf], np.nan).dropna() cv_data.extend(cv_a) conditions.extend(["Condition A"] * len(cv_a)) # Add CV data for Condition B if "CV_B" in performance_data.columns: cv_b = performance_data["CV_B"].replace([np.inf, -np.inf], np.nan).dropna() cv_data.extend(cv_b) conditions.extend(["Condition B"] * len(cv_b)) # Create violin plot if cv_data: df_plot = pd.DataFrame({"CV": cv_data, "Condition": conditions}) fig = px.violin(df_plot, y="CV", x="Condition", box=True, points=False) else: # Create empty figure if no data fig = go.Figure() fig.add_annotation( text="No CV data available", xref="paper", yref="paper", x=0.5, y=0.5, showarrow=False, ) return fig def _plot_ma_plot( self, performance_data: pd.DataFrame, species_expected_ratio: Dict[str, Dict[str, Union[float, str]]] ) -> go.Figure: """ Generate MA plot (M vs A plot) but with A on the y-axis and M on the x-axis. 
Parameters ---------- performance_data : pd.DataFrame Performance data containing log2_A_vs_B and mean abundance columns species_expected_ratio : Dict[str, Dict[str, Union[float, str]]] Expected ratios for each species and their colors Returns ------- go.Figure Plotly figure with MA plot (M on x, A on y) """ # Define colors for species color_map = {species: data["color"] for species, data in species_expected_ratio.items()} performance_data["logIntensityMean"] = ( performance_data["log_Intensity_mean_A"] + performance_data["log_Intensity_mean_B"] ) / 2 fig = px.scatter( performance_data, x="log2_A_vs_B", y="logIntensityMean", color="species", color_discrete_map=color_map, labels={"log2_A_vs_B": "M (Log2 Fold Change(A:B))", "logIntensityMean": "A (Mean Abundance)"}, title="MA Plot", size_max=10, opacity=0.2, ) # Add vertical lines for expected M values (since M is on x-axis) across the A range if fig.data: ratio_map = {species: np.log2(data["A_vs_B"]) for species, data in species_expected_ratio.items()} for species, ratio in ratio_map.items(): fig.add_vline( x=ratio, line_dash="dash", line_color=species_expected_ratio[species].get("color", "#000000"), annotation_text=f"Expected {species}", ) fig.update_traces(marker=dict(size=6)) else: fig.add_annotation( text="No data available for MA plot", xref="paper", yref="paper", x=0.5, y=0.5, showarrow=False, ) return fig def _get_metric_column_name(self, metric: str, mode: str) -> Tuple[str, str, str]: """ Get the appropriate metric column names based on the specified metric and mode. Parameters ---------- metric : str The metric to plot: "Median", "Mean", or "ROC-AUC". mode : str The mode for filtering, either "global" or "eq_species". Ignored for ROC-AUC. 
Returns ------- Tuple[str, str, str] A tuple containing the metric_lower, mode_suffix, and plot_title """ # ROC-AUC is a special case - no mode suffix, single column name if metric == "ROC-AUC": return "roc_auc", None, "ROC-AUC score for distinguishing changed from unchanged species" metric_lower = metric.lower() mode_suffix = "global" if mode.lower() == "global" else "eq_species" mode_description = "globally" if mode.lower() == "global" else "using equally weighted species averages" plot_title = f"{metric} absolute difference between measured and expected log2-transformed fold change (calculated {mode_description})" return metric_lower, mode_suffix, plot_title def _filter_datapoints_with_metric(self, result_df: pd.DataFrame, metric_col_name: str) -> pd.DataFrame: """ Filter datapoints to only include those that have the specified metric calculated. This is used when the user selects "Species-weighted" or "ROC-AUC" mode to ensure only datapoints with the new metric calculation are displayed (avoiding visual confusion with legacy metric calucations). Parameters ---------- result_df : pd.DataFrame DataFrame containing benchmark metrics for datapoints. metric_col_name : str The name of the metric column to filter on. Returns ------- pd.DataFrame Filtered DataFrame containing only datapoints with the specified metric. """ def has_metric(results_dict): """Check if the results dictionary contains the specified metric.""" try: for threshold_dict in results_dict.values(): if metric_col_name in threshold_dict: return True except (TypeError, AttributeError): pass return False # Filter to only datapoints that have the specified metric calculated return result_df[result_df["results"].apply(has_metric)].copy()