"""
Plot generator for LFQ PYE (Plasma-Yeast-Ecoli) quantification modules.
"""
from typing import Dict, List, Tuple, Union
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.figure_factory import create_distplot
from proteobench.plotting.plot_generator_base import PlotGeneratorBase
[docs]
class LFQPYEPlotGenerator(PlotGeneratorBase):
"""
Plot generator for LFQ PYE (Plasma-Yeast-Ecoli) quantification modules.
Used by plasma benchmarking modules that use human plasma, yeast, and E. coli species.
"""
[docs]
def generate_in_depth_plots(
    self, performance_data: pd.DataFrame, parse_settings: any, **kwargs
) -> Dict[str, go.Figure]:
    """
    Generate standard LFQ PYE plots from intermediate data.

    Parameters
    ----------
    performance_data : pd.DataFrame
        The intermediate performance data to plot
    parse_settings : ParseSettings
        The parse settings for the module. Its ``species_expected_ratio()``
        method is called to obtain the expected ratios and colors.
    **kwargs : dict
        Additional module-specific parameters (not read by this method)

    Returns
    -------
    Dict[str, go.Figure]
        Dictionary mapping plot names (``logfc``, ``cv``, ``ma_plot``,
        ``dynamic_range_plot``, ``missing_values_plot``) to plotly figures
    """
    # Expected ratios are optional: any ordinary failure to obtain them falls
    # back to an empty mapping. ``Exception`` (rather than a bare ``except``)
    # keeps the best-effort behaviour while letting SystemExit and
    # KeyboardInterrupt propagate.
    try:
        species_expected_ratio = parse_settings.species_expected_ratio()
    except Exception:
        species_expected_ratio = {}

    # Dict literals evaluate in order, so the plots are built in the order
    # listed here.
    return {
        "logfc": self._plot_fold_change_histogram(performance_data, species_expected_ratio),
        "cv": self._plot_cv_violinplot(performance_data),
        "ma_plot": self._plot_ma_plot(performance_data, species_expected_ratio),
        "dynamic_range_plot": self._plot_dynamic_range(performance_data, species_expected_ratio),
        "missing_values_plot": self._plot_missing_values(performance_data),
    }
[docs]
def get_in_depth_plot_layout(self) -> list:
    """
    Define layout for LFQ PYE plots.

    Returns
    -------
    list
        List of in-depth plot configurations, one per display row. Each entry
        has the keys ``plots`` (plot names, in order), ``columns`` (number of
        columns in the row) and ``titles`` (plot name -> title).
    """

    def _row(titles: dict) -> dict:
        # Each row shows its plots side by side, so the plot list and the
        # column count both follow from the titles mapping.
        return {"plots": list(titles), "columns": len(titles), "titles": titles}

    overview_row = _row(
        {
            "dynamic_range_plot": "Dynamic Range in Condition A and B.",
            "missing_values_plot": "Missing Values Distribution across runs.",
        }
    )
    distribution_row = _row(
        {
            "logfc": "Log2 Fold Change distributions by species (Human plasma, Yeast, E. coli).",
            "cv": "Coefficient of variation distribution in Condition A and B.",
        }
    )
    ma_row = _row({"ma_plot": "MA Plot"})
    return [overview_row, distribution_row, ma_row]
[docs]
def get_in_depth_plot_descriptions(self) -> Dict[str, str]:
    """
    Get descriptions for each plot.

    Returns
    -------
    Dict[str, str]
        Dictionary mapping plot names to a one-line description of the plot
    """
    descriptions = {}
    descriptions["logfc"] = "log2 fold changes calculated from the intermediate data"
    descriptions["cv"] = "CVs calculated from the intermediate data"
    descriptions["ma_plot"] = "MA plot (M vs A plot) showing log2 fold changes against mean abundance"
    # TODO: improve
    descriptions["dynamic_range_plot"] = "Dynamic range of human precursor intensities in Condition A and B"
    descriptions["missing_values_plot"] = "Distribution of missing values (%) of quantified human precursors"
    return descriptions
def _plot_fold_change_histogram(
    self, performance_data: pd.DataFrame, species_expected_ratio: Dict[str, Dict[str, Union[float, str]]]
) -> go.Figure:
    """
    Generate fold change histogram plot.

    One density curve is drawn per species, using the ``log2_A_vs_B`` values of
    the rows assigned to that species, plus a dashed vertical line at each
    species' expected log2 ratio.

    Parameters
    ----------
    performance_data : pd.DataFrame
        Intermediate data containing the ``log2_A_vs_B`` column and one
        column per species (named like the keys of ``species_expected_ratio``)
        flagging which species a row belongs to
    species_expected_ratio : Dict[str, Dict[str, Union[float, str]]]
        Dictionary with expected ratios (``A_vs_B``) for each species, and
        colors (``color``, defaulting to black)

    Returns
    -------
    go.Figure
        Plotly figure with fold change distributions, or a figure holding only
        a text annotation when there is nothing to plot
    """
    default_color = "#000000"

    def _message_figure(message: str) -> go.Figure:
        # Placeholder figure that only carries a centred text message.
        placeholder = go.Figure()
        placeholder.add_annotation(
            text=message,
            xref="paper",
            yref="paper",
            x=0.5,
            y=0.5,
            showarrow=False,
        )
        return placeholder

    species_list = list(species_expected_ratio.keys())
    species_cols = [s for s in species_list if s in performance_data.columns]
    if not species_cols:
        return _message_figure("No species data available for fold change plot")

    # Keep only rows where at least one species flag is set.
    performance_data_filtered = performance_data[performance_data[species_cols].any(axis=1)].copy()
    if len(performance_data_filtered) == 0:
        # A row-wise apply on an empty frame does not yield a Series, so the
        # species assignment below would fail; there is nothing to plot anyway.
        return _message_figure("No data available for fold change plot")
    # Assign each row to the first species column holding its maximum flag.
    performance_data_filtered["species"] = performance_data_filtered[species_cols].apply(
        lambda x: species_cols[np.argmax(x)], axis=1
    )

    # Prepare plot data
    hist_data = []
    group_labels = []
    colors = []
    for species in species_list:
        species_values = (
            performance_data_filtered.loc[performance_data_filtered["species"] == species, "log2_A_vs_B"]
            .replace([np.inf, -np.inf], np.nan)  # same clean-up as the CV violin plot
            .dropna()
        )
        # The density curve is a kernel density estimate, which cannot be fitted
        # to fewer than two distinct values; skip such groups instead of
        # letting the whole plot fail.
        if species_values.nunique() < 2:
            continue
        hist_data.append(species_values.tolist())
        group_labels.append(species)
        colors.append(species_expected_ratio[species].get("color", default_color))

    if not hist_data:
        return _message_figure("No data available for fold change plot")

    # Create distribution plot (curves only, no histogram bars or rug)
    fig = create_distplot(
        hist_data,
        group_labels,
        show_hist=False,
        show_rug=False,
        colors=colors,
    )
    for trace in fig.data:
        if trace.mode == "lines":
            trace.update(fill="tozeroy", opacity=0.4)
    fig.update_layout(
        xaxis=dict(
            title="Log2(Condition A / Condition B)",
            color="black",
            gridwidth=1,
            gridcolor="lightgray",
            range=[-4, 4],
        ),
        yaxis=dict(title="Density", color="black", gridwidth=1, gridcolor="lightgray"),
    )

    # Add expected ratio lines for every configured species, including those
    # without a density curve.
    for species, data in species_expected_ratio.items():
        fig.add_vline(
            x=np.log2(data["A_vs_B"]),
            line_dash="dash",
            line_color=data.get("color", default_color),
            annotation_text=f"Expected {species}",
        )
    return fig
def _plot_cv_violinplot(self, performance_data: pd.DataFrame) -> go.Figure:
    """
    Generate coefficient of variation violin plot.

    Parameters
    ----------
    performance_data : pd.DataFrame
        Intermediate data; the ``CV_A`` and ``CV_B`` columns are used when
        present

    Returns
    -------
    go.Figure
        Plotly figure with one violin per condition, or a figure holding only
        a "No CV data available" annotation when no finite CV value exists
    """
    values = []
    labels = []
    for column, label in (("CV_A", "Condition A"), ("CV_B", "Condition B")):
        if column not in performance_data.columns:
            continue
        # Infinite CVs are treated like missing values and dropped.
        finite = performance_data[column].replace([np.inf, -np.inf], np.nan).dropna()
        values.extend(finite)
        labels.extend([label] * len(finite))

    if not values:
        # Nothing to draw: return a placeholder figure with a message.
        empty_fig = go.Figure()
        empty_fig.add_annotation(
            text="No CV data available",
            xref="paper",
            yref="paper",
            x=0.5,
            y=0.5,
            showarrow=False,
        )
        return empty_fig

    long_format = pd.DataFrame({"CV": values, "Condition": labels})
    return px.violin(long_format, y="CV", x="Condition", box=True, points=False)
def _plot_ma_plot(
    self, performance_data: pd.DataFrame, species_expected_ratio: Dict[str, Dict[str, Union[float, str]]]
) -> go.Figure:
    """
    Generate MA plot (M vs A plot) but with A on the y-axis and M on the x-axis.

    Parameters
    ----------
    performance_data : pd.DataFrame
        Performance data containing the ``log2_A_vs_B``, ``species``,
        ``log_Intensity_mean_A`` and ``log_Intensity_mean_B`` columns. The
        frame is not modified.
    species_expected_ratio : Dict[str, Dict[str, Union[float, str]]]
        Expected ratios (``A_vs_B``) for each species and their colors
        (``color``, defaulting to black)

    Returns
    -------
    go.Figure
        Plotly figure with MA plot (M on x, A on y)
    """
    default_color = "#000000"
    # Define colors for species; a missing color falls back to black, matching
    # the expected-ratio lines below.
    color_map = {species: data.get("color", default_color) for species, data in species_expected_ratio.items()}

    # Work on a copy so the derived column does not leak into the caller's frame.
    plot_data = performance_data.copy()
    plot_data["logIntensityMean"] = (plot_data["log_Intensity_mean_A"] + plot_data["log_Intensity_mean_B"]) / 2

    fig = px.scatter(
        plot_data,
        x="log2_A_vs_B",
        y="logIntensityMean",
        color="species",
        color_discrete_map=color_map,
        labels={"log2_A_vs_B": "M (Log2 Fold Change(A:B))", "logIntensityMean": "A (Mean Abundance)"},
        title="MA Plot",
        size_max=10,
        opacity=0.2,
    )

    if not fig.data:
        fig.add_annotation(
            text="No data available for MA plot",
            xref="paper",
            yref="paper",
            x=0.5,
            y=0.5,
            showarrow=False,
        )
        return fig

    # Add vertical lines for expected M values (since M is on x-axis) across the A range
    for species, data in species_expected_ratio.items():
        fig.add_vline(
            x=np.log2(data["A_vs_B"]),
            line_dash="dash",
            line_color=data.get("color", default_color),
            annotation_text=f"Expected {species}",
        )
    fig.update_traces(marker=dict(size=6))
    return fig
def _plot_dynamic_range(self, performance_data: pd.DataFrame, species_expected_ratio: any) -> go.Figure:
    """
    Generate dynamic range plot with a smoothed epsilon trend on a secondary y-axis.

    Precursors are ranked by their mean intensity across conditions A and B
    (normalized to the maximum, in percent). For each of HUMAN, YEAST and ECOLI
    that has data, a scatter of normalized intensity vs. rank and a rolling
    median of the absolute epsilon are added; a dropdown switches between
    species.

    Parameters
    ----------
    performance_data : pd.DataFrame
        Performance data containing the ``Intensity_mean_A``,
        ``Intensity_mean_B``, ``epsilon`` and ``species`` columns. The frame is
        not modified.
    species_expected_ratio : dict
        Per-species settings; only the ``color`` entry is read here (black is
        used when it is missing or when the mapping is empty)

    Returns
    -------
    go.Figure
        Plotly figure with the dynamic range plot, or a figure holding only a
        text annotation when there is nothing to plot
    """
    species_order = ["HUMAN", "YEAST", "ECOLI"]  # Human first as default
    default_color = "#000000"

    def _no_data_figure() -> go.Figure:
        placeholder = go.Figure()
        placeholder.add_annotation(
            text="No data available for dynamic range plot",
            xref="paper",
            yref="paper",
            x=0.5,
            y=0.5,
            showarrow=False,
        )
        return placeholder

    if len(performance_data) == 0:
        return _no_data_figure()

    # Rank precursors by mean intensity across both conditions.
    plot_df = performance_data.copy()
    plot_df["mean_intensity"] = plot_df[["Intensity_mean_A", "Intensity_mean_B"]].mean(axis=1, skipna=True)
    max_intensity = plot_df["mean_intensity"].max()
    if not max_intensity > 0:  # also true for NaN, i.e. no usable intensity at all
        return _no_data_figure()
    plot_df["normalized_intensity"] = plot_df["mean_intensity"] / max_intensity * 100
    plot_df = plot_df.sort_values(by="normalized_intensity", ascending=False)
    plot_df["rank"] = range(1, len(plot_df) + 1)
    plot_df = plot_df[["rank", "normalized_intensity", "epsilon", "species"]]

    if species_expected_ratio:
        color_map = {species: data.get("color", default_color) for species, data in species_expected_ratio.items()}
    else:
        color_map = {}

    fig = go.Figure()
    # Species that actually contributed traces, in trace order. Each one adds
    # exactly two traces (scatter, then epsilon trend), so the dropdown
    # visibility masks must be built from this list rather than from
    # species_order.
    present_species = []
    for species in species_order:
        species_df = plot_df[plot_df["species"] == species]
        if len(species_df) == 0:
            continue
        # The first species with data is the one shown initially.
        shown_by_default = not present_species
        present_species.append(species)
        species_color = color_map.get(species, default_color)

        fig.add_trace(
            go.Scattergl(
                x=species_df["rank"],
                y=species_df["normalized_intensity"],
                mode="markers",
                marker=dict(
                    color=species_color,
                    size=6,
                    opacity=0.3,
                    line=dict(width=0.5, color="white"),
                ),
                name=f"{species} precursors",
                visible=shown_by_default,
                hovertemplate=f"<b>{species}</b><br>Rank: %{{x}}<br>Intensity: %{{y:.2f}}%<extra></extra>",
            )
        )

        # Epsilon trend: centred rolling median of |epsilon| along the rank.
        eps_df = species_df[["rank", "epsilon"]].copy()
        eps_df["absolute_eps"] = eps_df["epsilon"].abs()
        eps_df = eps_df.sort_values("rank")
        # Window of ~10% of the points once there are at least 100, otherwise 5.
        window = len(eps_df) // 10 if len(eps_df) >= 100 else 5
        eps_df["epsilon_trend"] = eps_df["absolute_eps"].rolling(window=window, center=True, min_periods=1).median()

        fig.add_trace(
            go.Scatter(
                x=eps_df["rank"],
                y=eps_df["epsilon_trend"],
                mode="lines",
                name=f"{species} epsilon trend",
                yaxis="y2",
                line=dict(dash="dash", color=species_color, width=2),
                visible=shown_by_default,
                hovertemplate=f"<b>{species} epsilon trend</b><br>Rank: %{{x}}<br>Epsilon: %{{y:.3f}}<extra></extra>",
            )
        )

    if not present_species:
        return _no_data_figure()

    # Create dropdown buttons for species selection
    buttons = []
    for idx, species in enumerate(present_species):
        visibility = [False] * (len(present_species) * 2)
        visibility[idx * 2] = True  # Show scatter for this species
        visibility[idx * 2 + 1] = True  # Show epsilon line for this species
        buttons.append(
            dict(
                label=species,
                method="update",
                args=[{"visible": visibility}],
            )
        )

    fig.update_xaxes(
        title="Intensity Rank (1 = highest intensity)",
        gridcolor="lightgray",
        showgrid=True,
    )
    fig.update_yaxes(
        title="Normalized Intensity (%)",
        type="log",
        dtick="1",
        gridcolor="lightgray",
        showgrid=True,
    )

    # The secondary axis shows absolute epsilon, so its upper bound is taken
    # from the absolute values; the signed quantile could be negative and
    # produce an unusable range.
    epsilon_q85 = plot_df["epsilon"].abs().quantile(0.85)
    if pd.isna(epsilon_q85) or epsilon_q85 <= 0:
        epsilon_q85 = 1.0

    # Update layout with dropdown menu
    fig.update_layout(
        updatemenus=[
            dict(
                buttons=buttons,
                direction="down",
                pad={"r": 10, "t": 10},
                showactive=True,
                x=0.15,
                xanchor="left",
                y=1.15,
                yanchor="top",
            )
        ],
        annotations=[
            dict(
                text="Species:",
                showarrow=False,
                x=0.02,
                xref="paper",
                y=1.13,
                yref="paper",
                align="left",
            )
        ],
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=1.02,
            xanchor="right",
            x=1.0,
        ),
        yaxis2=dict(
            title="Absolute epsilon (rolling median)",
            overlaying="y",
            side="right",
            range=[0, epsilon_q85],
        ),
        margin=dict(l=60, r=80, t=80, b=60),  # Reduce top margin to fill space
        hovermode="closest",
    )
    return fig
def _plot_missing_values(self, performance_data: pd.DataFrame, max_observations=12) -> go.Figure:
    """
    Generate missing values plot with smoothed trend line and color gradient.

    Only rows whose ``species`` is ``"HUMAN"`` are used. They are ranked by the
    mean of ``log_Intensity_mean_A`` and ``log_Intensity_mean_B`` (rank 1 =
    highest), and the percentage of missing observations is plotted against
    that rank together with a rolling median trend.

    Parameters
    ----------
    performance_data : pd.DataFrame
        Performance data containing the ``species``, ``nr_observed``,
        ``log_Intensity_mean_A`` and ``log_Intensity_mean_B`` columns
    max_observations : int
        Maximum number of observations possible (default 12)

    Returns
    -------
    go.Figure
        Plotly figure with the missing values scatter and trend line, or a
        figure holding only a text annotation when there are no human rows
    """
    fig = go.Figure()

    plasma = performance_data[performance_data["species"] == "HUMAN"].copy()
    if not len(plasma):
        fig.add_annotation(
            text="No human plasma data available for missing values plot",
            xref="paper",
            yref="paper",
            x=0.5,
            y=0.5,
            showarrow=False,
        )
        return fig

    # The mean log intensity is computed here rather than taken from another
    # plot method, so this plot has no hidden dependency on _plot_ma_plot.
    mean_log_intensity = (plasma["log_Intensity_mean_A"] + plasma["log_Intensity_mean_B"]) / 2
    plasma["logIntensityMean"] = mean_log_intensity
    plasma = plasma.sort_values(by="logIntensityMean", ascending=False)  # Across conditions
    n_precursors = len(plasma)
    plasma["rank"] = range(1, n_precursors + 1)
    plasma["missingness"] = (1 - plasma["nr_observed"] / max_observations) * 100

    # Smoothed trend: centred rolling median over ~5% of points, minimum 5.
    window = max(5, n_precursors // 20)
    rolling = plasma["missingness"].rolling(window=window, center=True, min_periods=1)
    plasma["missingness_trend"] = rolling.median()

    # Scatter of individual precursors, colored by their own missingness.
    point_style = dict(
        size=4,
        color=plasma["missingness"],
        colorscale="Reds",
        showscale=True,
        colorbar=dict(title="Missing<br>Values (%)", thickness=15, len=0.7),
        cmin=0,
        cmax=100,
        opacity=0.6,
    )
    precursor_trace = go.Scatter(
        x=plasma["rank"],
        y=plasma["missingness"],
        mode="markers",
        marker=point_style,
        name="Precursors",
        hovertemplate="Rank: %{x}<br>Missing: %{y:.1f}%<extra></extra>",
    )
    trend_trace = go.Scatter(
        x=plasma["rank"],
        y=plasma["missingness_trend"],
        mode="lines",
        line=dict(color="darkred", width=3),
        name="Trend (rolling median)",
        hovertemplate="Rank: %{x}<br>Trend: %{y:.1f}%<extra></extra>",
    )
    fig.add_trace(precursor_trace)
    fig.add_trace(trend_trace)

    x_axis = dict(
        title="Intensity Rank (1 = highest intensity)",
        gridcolor="lightgray",
        showgrid=True,
    )
    y_axis = dict(
        title="Missing Values (%)",
        gridcolor="lightgray",
        showgrid=True,
        range=[-5, 105],  # Give some padding
    )
    fig.update_layout(
        xaxis=x_axis,
        yaxis=y_axis,
        hovermode="closest",
        showlegend=True,
        legend=dict(x=0.02, y=0.98, bgcolor="rgba(255,255,255,0.8)"),
        margin=dict(l=60, r=20, t=20, b=60),  # Reduce margins to fill space
    )
    return fig
[docs]
def plot_main_metric(
    self,
    result_df: pd.DataFrame,
    metric: str = "Median",
    mode: str = "Species-weighted",
    # NOTE: the dict defaults below are shared between calls; they are only
    # read and forwarded here, never modified.
    software_colors: Dict[str, str] = {
        "MaxQuant": "#8bc6fd",
        "AlphaPept": "#17212b",
        "ProlineStudio": "#8b26ff",
        "MSAngel": "#C0FA7D",
        "FragPipe": "#F89008",
        "i2MassChroQ": "#108E2E",
        "Sage": "#E43924",
        "WOMBAT": "#663200",
        "DIA-NN": "#d42f2f",
        "AlphaDIA": "#1D2732",
        "Custom": "#000000",
        "Spectronaut": "#007548",
        "FragPipe (DIA-NN quant)": "#F89008",
        "MSAID": "#bfef45",
        "MetaMorpheus": "#637C7A",
        "Proteome Discoverer": "#911eb4",
        "PEAKS": "#f032e6",
        "quantms": "#f5e830",
    },
    software_markers: Dict[str, str] = {
        "MaxQuant": "circle",
        "AlphaPept": "square",
        "ProlineStudio": "diamond",
        "MSAngel": "cross",
        "FragPipe": "x",
        "i2MassChroQ": "triangle-up",
        "Sage": "triangle-down",
        "WOMBAT": "pentagon",
        "DIA-NN": "star",
        "AlphaDIA": "star-triangle-up",
        "Custom": "star-square",
        "Spectronaut": "diamond-tall",
        "FragPipe (DIA-NN quant)": "circle-x",
        "MSAID": "square-cross",
        "MetaMorpheus": "asterisk",
        "Proteome Discoverer": "hash",
        "PEAKS": "diamond-wide",
        "quantms": "hexagram",
    },
    mapping: Dict[str, int] = {"old": 10, "new": 20},
    highlight_color: str = "#d30067",
    label: str = "",
    legend_name_map: Dict[str, str] = {"AlphaPept": "AlphaPept (legacy tool)"},
    hide_annot: bool = False,
    colorblind_mode: bool = False,
    default_cutoff_min_prec: int = 3,
    min_nr_observed: int | None = None,
    annotation: str = "",
    **kwargs,
) -> go.Figure:
    """
    Generate the main plasma benchmarking scatterplot.

    This method resolves the cutoff level and delegates all plotting to
    ``_plot_plasma_scatterplot``.

    Parameters
    ----------
    result_df : pd.DataFrame
        DataFrame containing the results to plot, must have 'results' column with metrics.
    metric : str, optional
        Metric to use for calculations: "Median" or "Mean". Defaults to "Median".
    mode : str, optional
        Mode for metric calculation: "Global" or "Species-weighted".
        Defaults to "Species-weighted".
    software_colors : Dict[str, str]
        Mapping of software names to colors.
    software_markers : Dict[str, str]
        Mapping of software names to plotly marker symbols.
    mapping : Dict[str, int]
        Mapping for marker sizes.
    highlight_color : str
        Color to use for highlighting specific software.
    label : str
        Label for plot annotations.
    legend_name_map : Dict[str, str]
        Mapping for legend names.
    hide_annot : bool
        Whether to hide annotations on the plot.
    colorblind_mode : bool
        Forwarded to the scatterplot, where it selects per-software marker
        symbols instead of plain circles.
    default_cutoff_min_prec : int
        Default min precursor threshold for extracting metrics.
    min_nr_observed : int, optional
        Override the cutoff level with this value if provided (any value other
        than None, including 0).
    annotation : str, optional
        Text annotation to display on the plot (e.g., "-Alpha-", "-Beta-").
    **kwargs : dict
        Additional parameters, forwarded unchanged.

    Returns
    -------
    go.Figure
        Plotly figure with the plasma scatterplot.
    """
    # Use min_nr_observed if provided, otherwise use default_cutoff_min_prec.
    # The explicit None check keeps 0 usable as an override.
    cutoff_level = min_nr_observed if min_nr_observed is not None else default_cutoff_min_prec
    return self._plot_plasma_scatterplot(
        result_df,
        metric=metric,
        mode=mode,
        software_colors=software_colors,
        software_markers=software_markers,
        mapping=mapping,
        highlight_color=highlight_color,
        label=label,
        legend_name_map=legend_name_map,
        hide_annot=hide_annot,
        colorblind_mode=colorblind_mode,
        default_cutoff_min_prec=cutoff_level,
        annotation=annotation,
        **kwargs,
    )
def _plot_plasma_scatterplot(
    self,
    result_df: pd.DataFrame,
    metric: str = "Median",
    mode: str = "Species-weighted",
    # TODO: move software_colors to constants
    # NOTE: the dict defaults below are shared between calls; they are only
    # read here, never modified.
    software_colors: Dict[str, str] = {
        "MaxQuant": "#8bc6fd",
        "AlphaPept": "#17212b",
        "ProlineStudio": "#8b26ff",
        "MSAngel": "#C0FA7D",
        "FragPipe": "#F89008",
        "i2MassChroQ": "#108E2E",
        "Sage": "#E43924",
        "WOMBAT": "#663200",
        "DIA-NN": "#d42f2f",
        "AlphaDIA": "#1D2732",
        "Custom": "#000000",
        "Spectronaut": "#007548",
        "FragPipe (DIA-NN quant)": "#F89008",
        "MSAID": "#bfef45",
        "MetaMorpheus": "#637C7A",
        "Proteome Discoverer": "#911eb4",
        "PEAKS": "#f032e6",
        "quantms": "#f5e830",
    },
    software_markers: Dict[str, str] = {
        "MaxQuant": "circle",
        "AlphaPept": "square",
        "ProlineStudio": "diamond",
        "MSAngel": "cross",
        "FragPipe": "x",
        "i2MassChroQ": "triangle-up",
        "Sage": "triangle-down",
        "WOMBAT": "pentagon",
        "DIA-NN": "star",
        "AlphaDIA": "star-triangle-up",
        "Custom": "star-square",
        "Spectronaut": "diamond-tall",
        "FragPipe (DIA-NN quant)": "circle-x",
        "MSAID": "square-cross",
        "MetaMorpheus": "asterisk",
        "Proteome Discoverer": "hash",
        "PEAKS": "diamond-wide",
        "quantms": "hexagram",
    },
    mapping: Dict[str, int] = {"old": 10, "new": 20},
    highlight_color: str = "#d30067",
    label: str = "",
    legend_name_map: Dict[str, str] = {"AlphaPept": "AlphaPept (legacy tool)"},
    hide_annot: bool = False,
    colorblind_mode: bool = False,
    default_cutoff_min_prec: int = 3,
    annotation: str = "",
    **kwargs,
) -> go.Figure:
    """
    Generate the main plasma benchmarking scatterplot.

    The plot uses four visual dimensions to represent the benchmarking results:

    - X-axis: Absolute log2 fold-change error for yeast and E. coli spike-ins (median or mean based on metric)
    - Y-axis: Number of quantified yeast and E. coli spike-in precursors
    - Dot size: Dynamic range of human plasma precursors (quantification breadth)
    - Dot opacity: Quantification accuracy for human plasma (alpha based on error, median or mean)

    Rows whose ``results`` entry has no metrics dict at the requested cutoff
    are skipped. One trace is created per software.

    Parameters
    ----------
    result_df : pd.DataFrame
        DataFrame containing the results to plot. The ``results``,
        ``software_name``, ``software_version`` and ``id`` columns are read, and
        ``Highlight`` when present.
    metric : str, optional
        Metric to use: "Median" or "Mean". Defaults to "Median".
    mode : str, optional
        Mode for metric calculation: "Global" selects the ``_global`` spike-in
        metric, any other value the ``_eq_species`` one. Defaults to
        "Species-weighted".
    software_colors : Dict[str, str]
        Mapping of software names to colors (black for unknown software).
    software_markers : Dict[str, str]
        Mapping of software names to marker symbols ("circle" for unknown
        software); only applied when ``colorblind_mode`` is true.
    mapping : Dict[str, int]
        Mapping for marker sizes (not read by this method).
    highlight_color : str
        Color to use for rows whose ``Highlight`` value is truthy.
    label : str
        Label for plot annotations (not read by this method).
    legend_name_map : Dict[str, str]
        Mapping for legend names.
    hide_annot : bool
        Whether to blank the annotation texts on the plot.
    colorblind_mode : bool
        Use a distinct marker symbol per software instead of circles.
    default_cutoff_min_prec : int
        Min precursor threshold for extracting metrics.
    annotation : str, optional
        Text annotation to display on the plot (e.g., "-Alpha-", "-Beta-").
    **kwargs : dict
        Additional parameters (not read by this method).

    Returns
    -------
    go.Figure
        Plotly figure with the plasma scatterplot.
    """

    def _value_or(value, default):
        # A stored metric may be an explicit None; treat it like a missing key
        # so comparisons and number formatting below do not fail.
        return default if value is None else value

    fig = go.Figure()

    # Determine which metric keys to use based on selected metric and mode
    metric_lower = metric.lower()
    is_global = mode == "Global"
    mode_suffix = "global" if is_global else "eq_species"
    mode_description = "global" if is_global else "species-weighted"
    x_metric_key = f"{metric_lower}_abs_log2_fc_error_spike_ins_{mode_suffix}"
    # Fallback to legacy key (without suffix) for backwards compatibility with old datapoints
    x_metric_key_legacy = f"{metric_lower}_abs_log2_fc_error_spike_ins"
    # Human plasma metrics don't have mode variants (single species)
    opacity_metric_key = f"{metric_lower}_abs_epsilon_human_plasma"
    size_metric_key = "dynamic_range_human_plasma_mean"

    # Resolve the metrics once per row; rows without metrics at this cutoff
    # are dropped.
    rows_with_metrics = []
    for _, row in result_df.iterrows():
        metrics = self._get_metrics_at_cutoff(row.get("results"), default_cutoff_min_prec)
        if metrics is not None:
            rows_with_metrics.append((row, metrics))

    # Data-driven size normalization: positive dynamic-range values are min-max
    # scaled across the loaded datapoints so small differences stay visible.
    # NOTE(review): earlier comments mentioned a [8, 40] size range, but the
    # scaling factor of 10 yields marker sizes in [8, 18] — confirm which is
    # intended.
    raw_size_vals = []
    for _, metrics in rows_with_metrics:
        size_candidate = _value_or(metrics.get(size_metric_key), 0.0)
        if size_candidate > 0:
            raw_size_vals.append(size_candidate)
    size_min = min(raw_size_vals) if raw_size_vals else 0.0
    size_max = max(raw_size_vals) if raw_size_vals else 1.0
    size_data_range = size_max - size_min if size_max > size_min else 1.0

    has_highlight = "Highlight" in result_df.columns

    # Group by software to create separate traces (allows colorblind markers)
    software_data = {}
    for row, metrics in rows_with_metrics:
        software = row["software_name"]
        if software not in software_data:
            software_data[software] = {
                "x": [],
                "y": [],
                "sizes": [],
                "opacities": [],
                "colors": [],
                "markers": [],
                "hover_texts": [],
            }
        entry = software_data[software]

        # Try new mode-specific key first, fall back to legacy key
        x_val = metrics.get(x_metric_key)
        if x_val is None:
            x_val = _value_or(metrics.get(x_metric_key_legacy), 0.0)
        y_val = _value_or(metrics.get("nr_quantified_spike_ins"), 0)
        size_val = _value_or(metrics.get(size_metric_key), 0.0)
        opacity_val = _value_or(metrics.get(opacity_metric_key), 0.0)

        entry["x"].append(x_val)
        entry["y"].append(y_val)

        if size_val > 0:
            normalized_size = 8 + ((size_val - size_min) / size_data_range) * 10
        else:
            normalized_size = 8
        entry["sizes"].append(normalized_size)

        # Opacity: lower error = higher opacity (higher alpha), floored at 0.2
        entry["opacities"].append(max(0.2, 0.9 - (opacity_val * 0.7)))

        # Read the flag from the row itself: a label lookup on the frame
        # returns several values when the index contains duplicate labels.
        color = software_colors.get(software, "#000000")
        if has_highlight and row["Highlight"]:
            color = highlight_color
        entry["colors"].append(color)

        entry["markers"].append(software_markers.get(software, "circle"))

        entry["hover_texts"].append(
            f"<b>{software} {row['software_version']}</b><br>"
            f"Spike-in error ({metric_lower}, {mode_description}): {x_val:.3f}<br>"
            f"Quantified spike-ins: {y_val}<br>"
            f"Plasma dynamic range: {size_val:.2f}<br>"
            f"Plasma accuracy error ({metric_lower}): {opacity_val:.3f}<br>"
            f"ID: {row['id']}"
        )

    # Add traces for each software
    for software, data in software_data.items():
        # All points of one software share the same symbol
        marker_symbol = data["markers"][0] if colorblind_mode else "circle"
        fig.add_trace(
            go.Scatter(
                x=data["x"],
                y=data["y"],
                mode="markers",
                marker=dict(
                    size=data["sizes"],
                    color=data["colors"],
                    opacity=data["opacities"],
                    symbol=marker_symbol,
                    line=dict(width=1, color="white"),
                ),
                text=data["hover_texts"],
                hovertemplate="%{text}<extra></extra>",
                name=legend_name_map.get(software, software),
            )
        )

    # Update layout
    fig.update_layout(
        width=800,
        height=700,
        xaxis=dict(
            title=f"{metric} absolute log2 fold-change error (spike-ins, {mode_description})",
            gridcolor="lightgray",
            gridwidth=1,
            linecolor="black",
            showgrid=True,
        ),
        yaxis=dict(
            title="Number of quantified spike-in precursors",
            gridcolor="lightgray",
            gridwidth=1,
            linecolor="black",
            showgrid=True,
        ),
    )

    # Add annotation explaining visual dimensions
    # TODO: improve
    annotation_text = (
        "Dot size = dynamic range of quantified human precursors in plasma | "
        "Opacity = plasma quantification accuracy (darker = better)"
    )
    fig.add_annotation(
        text=annotation_text if not hide_annot else "",
        xref="paper",
        yref="paper",
        x=0.5,
        y=-0.15,
        showarrow=False,
        font=dict(size=10, color="gray"),
    )
    # Large, faint watermark-style annotation in the centre of the plot
    fig.add_annotation(
        x=0.5,
        y=0.5,
        xref="paper",
        yref="paper",
        text=annotation if not hide_annot else "",
        font=dict(size=50, color="rgba(0,0,0,0.1)"),
        showarrow=False,
    )
    fig.update_layout(clickmode="event+select")
    return fig
@staticmethod
def _get_metrics_at_cutoff(results: dict, cutoff: int) -> dict | None:
"""Get metrics for a given cutoff level from results with int or string keys."""
if not isinstance(results, dict):
return None
if cutoff in results and isinstance(results[cutoff], dict):
return results[cutoff]
cutoff_str = str(cutoff)
if cutoff_str in results and isinstance(results[cutoff_str], dict):
return results[cutoff_str]
return None
def _get_metric_column_name(self, metric: str, mode: str) -> Tuple[str, str, str]:
"""
Get the appropriate metric column names based on the specified metric and mode.
Note: For plasma (PYE) modules, this is primarily for compatibility with the UI
which offers metric and mode selectors. The plasma plot uses different metrics
(spike-in errors, quantification depth, dynamic range) rather than the
epsilon-based metrics used in HYE modules.
Parameters
----------
metric : str
The metric to plot: "Median" or "Mean".
mode : str
The mode for filtering: "Global" or "Species-weighted".
Returns
-------
Tuple[str, str, str]
A tuple containing (metric_lower, mode_suffix, plot_title).
"""
metric_lower = metric.lower()
mode_suffix = "global" if mode == "Global" else "eq_species"
mode_description = "globally" if mode == "Global" else "using equally weighted species averages"
plot_title = (
f"{metric} absolute difference between measured and expected log2-transformed fold change "
f"(calculated {mode_description})"
)
return metric_lower, mode_suffix, plot_title
def _filter_datapoints_with_metric(self, benchmark_metrics_df: pd.DataFrame, metric_col_name: str) -> pd.DataFrame:
"""
Filter datapoints to only include those that have the specified metric calculated.
For plasma modules, this ensures consistency with HYE module behavior when
filtering for species-weighted metrics.
Parameters
----------
benchmark_metrics_df : pd.DataFrame
DataFrame containing benchmark metrics for datapoints.
metric_col_name : str
The name of the metric column to filter on.
Returns
-------
pd.DataFrame
Filtered DataFrame containing only datapoints with the specified metric.
"""
def has_metric(results_dict):
"""Check if the results dictionary contains the specified metric."""
try:
for threshold_dict in results_dict.values():
if metric_col_name in threshold_dict:
return True
except (TypeError, AttributeError):
pass
return False
# Filter to only datapoints that have the specified metric calculated
return benchmark_metrics_df[benchmark_metrics_df["results"].apply(has_metric)].copy()