Source code for proteobench.score.quant.quantscores

"""
Module containing quantification score calculators.
"""

from typing import Dict

import numpy as np
import pandas as pd

from proteobench.score.quant.score_base import ScoreBase


class QuantScoresHYE(ScoreBase):
    """
    Class for computing quantification scores for LFQ benchmarking.

    This class implements the ScoreBase interface to compute quantification-specific
    metrics including condition statistics, fold changes, and epsilon (difference) values.

    Parameters
    ----------
    precursor_column_name : str
        Name of the precursor column.
    species_expected_ratio : dict
        Dictionary containing the expected ratios for each species.
    species_dict : dict
        Dictionary containing the species names and their column mappings.
    """

    def __init__(self, precursor_column_name: str, species_expected_ratio: dict, species_dict: Dict[str, str]):
        """
        Initialize the QuantScoresHYE object.

        Parameters
        ----------
        precursor_column_name : str
            Name of the precursor column.
        species_expected_ratio : dict
            Dictionary containing the expected ratios for each species. Each value is
            itself indexed with the key ``"A_vs_B"`` when epsilon is computed.
        species_dict : dict
            Dictionary containing the species names; its values are used as column
            names of the input DataFrame.
        """
        self.precursor_column_name = precursor_column_name
        self.species_expected_ratio = species_expected_ratio
        self.species_dict = species_dict

    def generate_intermediate(
        self,
        filtered_df: pd.DataFrame,
        replicate_to_raw: dict,
    ) -> pd.DataFrame:
        """
        Generate intermediate data structure for quantification scores.

        Parameters
        ----------
        filtered_df : pd.DataFrame
            DataFrame containing the filtered data. It must provide the columns
            "Raw file", "Intensity", the precursor column, the species columns named
            in ``species_dict.values()`` and one column per raw file listed in
            ``replicate_to_raw``.
        replicate_to_raw : dict
            Dictionary containing the replicate to raw mapping
            (condition -> iterable of raw file names).

        Returns
        -------
        pd.DataFrame
            DataFrame containing the intermediate data structure: per-precursor
            condition statistics joined with the species columns and the epsilon values.

        Raises
        ------
        ValueError
            If a raw file listed in ``replicate_to_raw`` is not a column of
            ``filtered_df``. ``ValueError`` derives from ``Exception``, so callers
            catching ``Exception`` keep working.
        """
        # select columns which are relevant for the statistics
        # TODO, this should be handled different, probably in the parse settings
        relevant_columns_df = filtered_df[["Raw file", self.precursor_column_name, "Intensity"]].copy()
        replicate_to_raw_df = QuantScoresHYE.convert_replicate_to_raw(replicate_to_raw)

        # Build the column list once instead of once per raw file.
        available_columns = list(filtered_df.columns)
        all_present = all(
            raw_file in available_columns for raw_files in replicate_to_raw.values() for raw_file in raw_files
        )
        if not all_present:
            raise ValueError("Not all runs are present in the quantification file")

        # add column "Condition" to relevant_columns_df using inner join on "Raw file"
        relevant_columns_df = pd.merge(relevant_columns_df, replicate_to_raw_df, on="Raw file", how="inner")

        quant_df = QuantScoresHYE.compute_condition_stats(
            relevant_columns_df,
            min_intensity=0,
            precursor=self.precursor_column_name,
        )

        # One row per precursor with its species columns.
        species_prec_ion = list(self.species_dict.values())
        species_prec_ion.append(self.precursor_column_name)
        prec_ion_to_species = filtered_df[species_prec_ion].drop_duplicates()

        # merge quant_df and prec_ion_to_species using the precursor column as key
        quant_df_withspecies = pd.merge(quant_df, prec_ion_to_species, on=self.precursor_column_name, how="inner")

        return QuantScoresHYE.compute_epsilon(quant_df_withspecies, self.species_expected_ratio)

    @staticmethod
    def convert_replicate_to_raw(replicate_to_raw: dict) -> pd.DataFrame:
        """
        Convert replicate_to_raw dictionary into a dataframe.

        Parameters
        ----------
        replicate_to_raw : dict
            Dictionary containing the replicate to raw mapping
            (condition -> list of raw file names).

        Returns
        -------
        pd.DataFrame
            DataFrame with the columns "Condition" and "Raw file", holding one row
            per (condition, raw file) pair.
        """
        replicate_to_raw_df = pd.DataFrame(replicate_to_raw.items(), columns=["Condition", "Raw file"])
        # Each condition maps to a list of raw files; explode turns that into one row per raw file.
        replicate_to_raw_df = replicate_to_raw_df.explode("Raw file")
        return replicate_to_raw_df

    @staticmethod
    def compute_condition_stats(
        relevant_columns_df: pd.DataFrame,
        min_intensity=0,
        precursor="precursor ion",
    ) -> pd.DataFrame:
        """
        Compute precursor statistics, such as number of observations, CV, mean per condition etc.

        Parameters
        ----------
        relevant_columns_df : pd.DataFrame
            DataFrame containing the relevant columns for the statistics: the
            precursor column, "Raw file", "Condition" and "Intensity". The conditions
            must include "A" and "B", because the log2 fold change is computed from
            the columns derived from exactly those two names.
        min_intensity : int, optional
            Only rows with an intensity strictly greater than this value are kept.
            Defaults to 0.
        precursor : str, optional
            Name of the precursor column. Defaults to "precursor ion".

        Returns
        -------
        pd.DataFrame
            DataFrame containing the precursor statistics: one row per precursor with
            the per-condition statistics, "log2_A_vs_B", one intensity column per raw
            file and "nr_observed".
        """
        # filter for min_intensity
        relevant_columns_df = relevant_columns_df[relevant_columns_df["Intensity"] > min_intensity]

        # TODO: check if this is still needed
        # sum intensity values of the same precursor and "Raw file"
        quant_raw_df_int = (
            relevant_columns_df.groupby([precursor, "Raw file", "Condition"])["Intensity"]
            .agg(Intensity="sum", Count="size")
            .reset_index()
        )

        # add column "log_Intensity" to quant_raw_df_int
        quant_raw_df_int["log_Intensity"] = np.log2(quant_raw_df_int["Intensity"])

        # count in how many raw files each precursor was observed
        quant_raw_df_count = (quant_raw_df_int.groupby([precursor])).agg(nr_observed=("Raw file", "size"))

        # pivot to wide format: index is the precursor, one column per raw file, values are the intensities
        intensities_wide = quant_raw_df_int.pivot(index=precursor, columns="Raw file", values="Intensity").reset_index()

        # summary statistics per precursor and condition
        quant_raw_df = (
            quant_raw_df_int.groupby([precursor, "Condition"])
            .agg(
                log_Intensity_mean=("log_Intensity", "mean"),
                log_Intensity_std=("log_Intensity", "std"),
                Intensity_mean=("Intensity", "mean"),
                Intensity_std=("Intensity", "std"),
                Sum=("Intensity", "sum"),
                nr_obs_group=("Intensity", "size"),
            )
            .reset_index()
        )

        # coefficient of variation (CV) on the raw intensity scale: Intensity_std / Intensity_mean
        quant_raw_df["CV"] = quant_raw_df["Intensity_std"] / quant_raw_df["Intensity_mean"]

        # pivot dataframe wider so for each condition there is a column with
        # log_Intensity_mean, log_Intensity_std, Intensity_mean, Intensity_std and CV
        quant_raw_df = quant_raw_df.pivot(
            index=precursor,
            columns="Condition",
            values=[
                "log_Intensity_mean",
                "log_Intensity_std",
                "Intensity_mean",
                "Intensity_std",
                "CV",
            ],
        ).reset_index()

        # flatten the (statistic, condition) column index into "<statistic>_<condition>";
        # the precursor column has an empty second level and keeps its plain name
        quant_raw_df.columns = [f"{x[0]}_{x[1]}" if len(str(x[1])) > 0 else x[0] for x in quant_raw_df.columns]

        quant_raw_df["log2_A_vs_B"] = quant_raw_df["log_Intensity_mean_A"] - quant_raw_df["log_Intensity_mean_B"]

        quant_raw_df = pd.merge(quant_raw_df, intensities_wide, on=precursor, how="inner")
        quant_raw_df = pd.merge(quant_raw_df, quant_raw_df_count, on=precursor, how="inner")
        return quant_raw_df

    @staticmethod
    def compute_epsilon(withspecies, species_expected_ratio) -> pd.DataFrame:
        """
        Compute epsilon for each species in species_expected_ratio.

        The input DataFrame is not modified; all added columns live on the returned copy.

        Parameters
        ----------
        withspecies : pd.DataFrame
            DataFrame containing the species columns and the log2_A_vs_B column.
        species_expected_ratio : dict
            Dictionary containing the expected ratios for each species. The keys are
            the species column names; each value provides the expected ratio under
            the key ``"A_vs_B"``.

        Returns
        -------
        pd.DataFrame
            DataFrame containing the epsilon values. Only rows for which the species
            columns sum to exactly 1 are kept. Added columns, in order: "unique",
            "species", "log2_expectedRatio", "epsilon", "log2_empirical_median",
            "log2_empirical_mean", "epsilon_precision_median", "epsilon_precision_mean".
        """
        species_columns = list(species_expected_ratio)

        # Sum the species columns per row. The result is kept in a separate Series
        # so that no "unique" column is written into the caller's DataFrame.
        species_count = withspecies[species_columns].sum(axis=1)
        is_unique = species_count == 1

        # keep only the rows assigned to exactly one species
        withspecies_unique = withspecies[is_unique].copy()
        withspecies_unique["unique"] = species_count[is_unique].to_numpy()

        # label each row with its species and the log2 of the expected A/B ratio of that species
        for species in species_columns:
            is_species = withspecies_unique[species] == True  # noqa: E712 - element-wise comparison
            withspecies_unique.loc[is_species, "species"] = species
            withspecies_unique.loc[is_species, "log2_expectedRatio"] = np.log2(
                species_expected_ratio[species]["A_vs_B"]
            )

        # epsilon: deviation of the measured log2 fold change from the expected one
        withspecies_unique["epsilon"] = withspecies_unique["log2_A_vs_B"] - withspecies_unique["log2_expectedRatio"]

        # Compute per-species empirical centers for precision metrics
        log2_ratio_by_species = withspecies_unique.groupby("species")["log2_A_vs_B"]
        withspecies_unique["log2_empirical_median"] = log2_ratio_by_species.transform("median")
        withspecies_unique["log2_empirical_mean"] = log2_ratio_by_species.transform("mean")

        # Epsilon precision: deviation from empirical center (measures consistency, not accuracy)
        withspecies_unique["epsilon_precision_median"] = (
            withspecies_unique["log2_A_vs_B"] - withspecies_unique["log2_empirical_median"]
        )
        withspecies_unique["epsilon_precision_mean"] = (
            withspecies_unique["log2_A_vs_B"] - withspecies_unique["log2_empirical_mean"]
        )
        return withspecies_unique