Source code for proteobench.plotting.plot_quant

"""
Module for plotting quantitative proteomics data.
"""

from typing import Dict

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.figure_factory import create_distplot


[docs] class PlotDataPoint: """ Class for plotting data points. """
[docs] @staticmethod def plot_fold_change_histogram( result_df: pd.DataFrame, species_ratio: Dict[str, Dict[str, str]], hide_annot: bool = False ) -> go.Figure: """ Plot smooth shaded density distributions of log2 fold changes using Plotly, color-coded by species. Parameters ---------- result_df : pd.DataFrame The results DataFrame containing fold changes and species data. species_ratio : Dict[str, Dict[str, str]] A dictionary mapping species to their respective colors and ratios. Returns ------- go.Figure A Plotly figure object representing the shaded density plots. """ species_list = list(species_ratio.keys()) # Filter to include only rows where any of the species columns are True result_df = result_df[result_df[species_list].any(axis=1)] result_df["kind"] = result_df[species_list].apply(lambda x: species_list[np.argmax(x)], axis=1) # Prepare lists for create_distplot data = [] group_labels = [] colors = [] for species in species_list: data.append(result_df.loc[result_df["kind"] == species, "log2_A_vs_B"].dropna().tolist()) group_labels.append(species) colors.append(species_ratio[species]["color"]) # Create the distplot without histogram or rug fig = create_distplot(data, group_labels, colors=colors, show_hist=False, show_rug=False) # Update each density trace to fill under the curve for trace in fig.data: if trace.mode == "lines": trace.update(fill="tozeroy", opacity=0.4) # adjust opacity as needed # Customize layout fig.update_layout( width=700, height=700, xaxis=dict(title="log2_A_vs_B", color="black", gridwidth=1, linecolor="black", range=[-4, 4]), yaxis=dict(title="Density", color="black", gridwidth=1, linecolor="black"), ) # Add vertical lines for expected ratios ratio_map = {species: np.log2(data["A_vs_B"]) for species, data in species_ratio.items()} for species, ratio in ratio_map.items(): fig.add_vline( x=ratio, line_dash="dash", line_color=species_ratio[species]["color"], annotation_text=species ) fig.add_annotation( x=0.5, y=0.5, xref="paper", yref="paper", text="-Beta-" if not hide_annot else "", font=dict(size=50, color="rgba(0,0,0,0.1)"), showarrow=False, ) return fig
[docs] @staticmethod def plot_metric( benchmark_metrics_df: pd.DataFrame, metric: str = "Median", software_colors: Dict[str, str] = { "MaxQuant": "#8bc6fd", "AlphaPept": "#17212b", "ProlineStudio": "#8b26ff", "MSAngel": "#C0FA7D", "FragPipe": "#F89008", "i2MassChroQ": "#108E2E", "Sage": "#E43924", "WOMBAT": "#663200", "DIA-NN": "#d42f2f", "AlphaDIA": "#1D2732", "Custom": "#000000", "Spectronaut": "#007548", "FragPipe (DIA-NN quant)": "#F89008", "MSAID": "#bfef45", "Proteome Discoverer": "#911eb4", "PEAKS": "#f032e6", "quantms": "#f5e830", }, mapping: Dict[str, int] = {"old": 10, "new": 20}, highlight_color: str = "#d30067", label: str = "None", legend_name_map: Dict[str, str] = { "AlphaPept": "AlphaPept (legacy tool)", }, hide_annot: bool = False, ) -> go.Figure: """ Plot mean metrics in a scatter plot with Plotly, highlighting specific data points. Parameters ---------- benchmark_metrics_df : pd.DataFrame The DataFrame containing benchmark metrics data. metric : str, optional The metric to plot, either "Median" or "Mean", by default "Median". software_colors : Dict[str, str], optional A dictionary mapping software names to their colors, by default predefined colors. mapping : Dict[str, int], optional A dictionary mapping categories to scatter plot sizes, by default {"old": 10, "new": 20}. highlight_color : str, optional The color used for highlighting certain points, by default "#d30067". label : str, optional The column name for labeling data points, by default "None". legend_name_map : Dict[str, str], optional A dictionary mapping software names to legend names, by default None. If None, the software names will be used as legend names. Returns ------- go.Figure A Plotly figure object representing the scatter plot. """ all_median_abs_epsilon = [ v2["median_abs_epsilon"] for v in benchmark_metrics_df["results"] for v2 in v.values() ] all_mean_abs_epsilon = [v2["mean_abs_epsilon"] for v in benchmark_metrics_df["results"] for v2 in v.values()] all_nr_prec = [v2["nr_prec"] for v in benchmark_metrics_df["results"] for v2 in v.values()] # Add hover text with detailed information for each data point hover_texts = [] for idx, _ in benchmark_metrics_df.iterrows(): datapoint_text = "" if benchmark_metrics_df.is_temporary[idx] == True: datapoint_text = ( f"ProteoBench ID: {benchmark_metrics_df.id[idx]}<br>" + f"Software tool: {benchmark_metrics_df.software_name[idx]} {benchmark_metrics_df.software_version[idx]}<br>" ) if "comments" in benchmark_metrics_df.columns: comment = benchmark_metrics_df.comments[idx] if isinstance(comment, str): datapoint_text = ( datapoint_text + f"Comment (private submission): {comment[:10] + '...' if len(comment) > 10 else comment}..." ) else: # TODO: Determine parameters based on module datapoint_text = ( f"ProteoBench ID: {benchmark_metrics_df.id[idx]}<br>" + f"Software tool: {benchmark_metrics_df.software_name[idx]} {benchmark_metrics_df.software_version[idx]}<br>" + f"Search engine: {benchmark_metrics_df.search_engine[idx]} {benchmark_metrics_df.search_engine_version[idx]}<br>" + f"FDR psm: {benchmark_metrics_df.ident_fdr_psm[idx]}<br>" + f"MBR: {benchmark_metrics_df.enable_match_between_runs[idx]}<br>" + f"Precursor Tolerance: {benchmark_metrics_df.precursor_mass_tolerance[idx]}<br>" + f"Fragment Tolerance: {benchmark_metrics_df.fragment_mass_tolerance[idx]}<br>" + f"Enzyme: {benchmark_metrics_df.enzyme[idx]} <br>" + f"Missed Cleavages: {benchmark_metrics_df.allowed_miscleavages[idx]}<br>" + f"Min peptide length: {benchmark_metrics_df.min_peptide_length[idx]}<br>" + f"Max peptide length: {benchmark_metrics_df.max_peptide_length[idx]}<br>" ) if "submission_comments" in benchmark_metrics_df.columns: comment = benchmark_metrics_df.submission_comments[idx] if isinstance(comment, str): datapoint_text = ( datapoint_text + f"Comment (public submission): {comment[:10] + '...' if len(comment) > 10 else comment}..." ) hover_texts.append(datapoint_text) scatter_size = [mapping[item] for item in benchmark_metrics_df["old_new"]] if "Highlight" in benchmark_metrics_df.columns: scatter_size = [ item * 2 if highlight else item for item, highlight in zip(scatter_size, benchmark_metrics_df["Highlight"]) ] # Color plot based on software tool colors = [software_colors[software] for software in benchmark_metrics_df["software_name"]] if "Highlight" in benchmark_metrics_df.columns: colors = [ highlight_color if highlight else item for item, highlight in zip(colors, benchmark_metrics_df["Highlight"]) ] benchmark_metrics_df["color"] = colors benchmark_metrics_df["hover_text"] = hover_texts benchmark_metrics_df["scatter_size"] = scatter_size if metric == "Median": layout_xaxis_range = [ min(all_median_abs_epsilon) - min(all_median_abs_epsilon) * 0.05, max(all_median_abs_epsilon) + max(all_median_abs_epsilon) * 0.05, ] layout_xaxis_title = ( "Median absolute difference between measured and expected log2-transformed fold change." ) elif metric == "Mean": layout_xaxis_range = [ min(all_mean_abs_epsilon) - min(all_mean_abs_epsilon) * 0.05, max(all_mean_abs_epsilon) + max(all_mean_abs_epsilon) * 0.05, ] layout_xaxis_title = "Mean absolute difference between measured and expected log2-transformed fold change." fig = go.Figure( layout_yaxis_range=[ min(all_nr_prec) - min(max(all_nr_prec) * 0.05, 2000), max(all_nr_prec) + min(max(all_nr_prec) * 0.05, 2000), ], layout_xaxis_range=layout_xaxis_range, ) # Get all unique color-software combinations (necessary for highlighting) color_software_combinations = benchmark_metrics_df[["color", "software_name"]].drop_duplicates() benchmark_metrics_df["enable_match_between_runs"] = benchmark_metrics_df["enable_match_between_runs"].astype( str ) # plot the data points, one trace per software tool for _, row in color_software_combinations.iterrows(): color = row["color"] software = row["software_name"] tmp_df = benchmark_metrics_df[ (benchmark_metrics_df["color"] == color) & (benchmark_metrics_df["software_name"] == software) ] # to do: remove this line as soon as parameters are homogeneous, see #380 # tmp_df["enable_match_between_runs"] = tmp_df["enable_match_between_runs"].astype(str) fig.add_trace( go.Scatter( x=tmp_df["{}_abs_epsilon".format(metric.lower())], y=tmp_df["nr_prec"], mode="markers" if label == "None" else "markers+text", hovertext=tmp_df["hover_text"], text=tmp_df[label] if label != "None" else None, marker=dict(color=tmp_df["color"], showscale=False), marker_size=tmp_df["scatter_size"], name=legend_name_map.get(tmp_df["software_name"].iloc[0], tmp_df["software_name"].iloc[0]), ) ) fig.update_layout( width=700, height=700, xaxis=dict( title=layout_xaxis_title, gridcolor="white", gridwidth=2, linecolor="black", ), yaxis=dict( title="Total number of precursor ions quantified in the selected number of raw files", gridcolor="white", gridwidth=2, linecolor="black", ), ) fig.update_xaxes(showgrid=True, gridcolor="lightgray", gridwidth=1) fig.update_yaxes(showgrid=True, gridcolor="lightgray", gridwidth=1) fig.add_annotation( x=0.5, y=0.5, xref="paper", yref="paper", text="-Beta-" if not hide_annot else "", font=dict(size=50, color="rgba(0,0,0,0.1)"), showarrow=False, ) fig.update_layout(clickmode="event+select") return fig
[docs] @staticmethod def plot_CV_violinplot(result_df: pd.DataFrame) -> go.Figure: """ Plot the coefficient of variation (CV) for A and B groups using a violin plot. Parameters ---------- result_df : pd.DataFrame The results DataFrame containing the CV data. Returns ------- go.Figure: A Plotly figure object representing the violin plot. """ fig = px.violin(result_df, y=["CV_A", "CV_B"], box=True, title=None, points=False) fig.update_layout( xaxis_title="Condition", yaxis_title="CV", xaxis=dict(linecolor="black"), # Set the X axis line color to black yaxis=dict(linecolor="black"), # Set the Y axis line color to black ) return fig
[docs] @staticmethod def plot_ma_plot(result_df: pd.DataFrame, species_ratio: Dict[str, Dict[str, str]]) -> go.Figure: """ Plot a MA plot using Plotly. Parameters ---------- result_df : pd.DataFrame The results DataFrame containing the MA plot data. species_ratio : Dict[str, Dict[str, str]] A dictionary mapping species to their respective colors and ratios. Returns ------- go.Figure A Plotly figure object representing the MA plot. """ color_map = {species: data["color"] for species, data in species_ratio.items()} # take mean of log intensity mean a and log intensity mean b result_df["logIntensityMean"] = (result_df["log_Intensity_mean_A"] + result_df["log_Intensity_mean_B"]) / 2 fig = px.scatter( result_df, x="log2_A_vs_B", y="logIntensityMean", color="species", color_discrete_map=color_map, labels={"log2_A_vs_B": "log2_FC(A:B)", "logIntensityMean": "log2_Intensity_Mean", "species": "Organism"}, title="log2FC vs logIntensityMean", size_max=10, opacity=0.6, ) # Add vertical lines as shapes fig.add_shape( type="line", x0=0, x1=0, y0=result_df["logIntensityMean"].min(), y1=result_df["logIntensityMean"].max(), line=dict(color="green", dash="dash"), xref="x", yref="y", name="log2FC = 0", ) fig.add_shape( type="line", x0=1, x1=1, y0=result_df["logIntensityMean"].min(), y1=result_df["logIntensityMean"].max(), line=dict(color="red", dash="dash"), xref="x", yref="y", name="log2FC = 1", ) fig.add_shape( type="line", x0=-2, x1=-2, y0=result_df["logIntensityMean"].min(), y1=result_df["logIntensityMean"].max(), line=dict(color="blue", dash="dash"), xref="x", yref="y", name="log2FC = -2", ) # To show vertical lines in the legend, add dummy traces fig.add_trace( go.Scatter(x=[None], y=[None], mode="lines", line=dict(color="green", dash="dash"), name="log2FC = 0") ) fig.add_trace( go.Scatter(x=[None], y=[None], mode="lines", line=dict(color="red", dash="dash"), name="log2FC = 1") ) fig.add_trace( go.Scatter(x=[None], y=[None], mode="lines", line=dict(color="blue", dash="dash"), name="log2FC = -2") ) fig.update_traces(marker=dict(size=6)) # Marker size approx. equivalent to s=10 in seaborn return fig