Source code for proteobench.score.denovo.denovoscores

"""
Module containing the DenovoScores class.
"""

from typing import List, Tuple

import numpy as np
import pandas as pd
from psm_utils import Peptidoform

from proteobench.score.score_base import ScoreBase


[docs] class DenovoScores(ScoreBase): """ Class for computing de novo scores. Parameters ---------- """ def __init__(self): self.AA_MASSES = { "": 0.0, "G": 57.021463719204, "A": 71.037113783, "S": 87.03202840226, "P": 97.052763846796, "V": 99.068413910592, "T": 101.047678466056, "C": 103.00918495654, "L": 113.084063974388, "I": 113.084063974388, "N": 114.042927438408, "D": 115.02694302152, "Q": 128.058577502204, "K": 128.094963010536, "E": 129.04259308531599, "M": 131.040485084132, "H": 137.058911855296, "F": 147.068413910592, "R": 156.10111101903598, "Y": 163.06332852985201, "W": 186.07931294673998, }
[docs] def generate_intermediate(self, filtered_df: pd.DataFrame, replicate_to_raw=None) -> pd.DataFrame: # TODO: Evaluate which PSMs match, and which don't and return new table # Add match type label (exact, mass, mismatch) and the amino acid-level evaluations filtered_df["match_dict"] = filtered_df.apply( lambda x: self.evaluate_match(ground_truth=x["peptidoform_ground_truth"], de_novo=x["peptidoform"]), axis=1 ) filtered_df["match_type"] = filtered_df["match_dict"].apply(lambda x: x["match_type"]) filtered_df["aa_matches_dn"] = filtered_df["match_dict"].apply(lambda x: x["aa_matches_dn"]) filtered_df["aa_matches_gt"] = filtered_df["match_dict"].apply(lambda x: x["aa_matches_gt"]) filtered_df["aa_exact_gt"] = filtered_df["match_dict"].apply(lambda x: x["aa_exact_gt"]) filtered_df["aa_exact_dn"] = filtered_df["match_dict"].apply(lambda x: x["aa_exact_dn"]) filtered_df["pep_match"] = filtered_df["match_dict"].apply(lambda x: x["pep_match"]) _ = filtered_df.pop("match_dict") return filtered_df
[docs] def evaluate_match(self, ground_truth: Peptidoform, de_novo: Peptidoform): """ Return the match type between two peptide sequences. """ gt = self.convert_peptidoform(ground_truth) dn = self.convert_peptidoform(de_novo) if dn is None: return { "match_type": "mismatch", "aa_matches_gt": np.full(len(gt), False), "aa_matches_dn": np.full(len(gt), False), "aa_exact_gt": np.full(len(gt), False), "aa_exact_dn": np.full(len(gt), False), "pep_match": False, } if ground_truth == de_novo: return { "match_type": "exact", "aa_matches_gt": np.full(len(gt), True), "aa_matches_dn": np.full(len(dn), True), "aa_exact_gt": np.full(len(gt), True), "aa_exact_dn": np.full(len(dn), True), "pep_match": True, } aa_matches, pep_match, (aa_matches_1, aa_matches_2), (exact_match_1, exact_match_2) = self.aa_match( gt, dn, ) if pep_match: return { "match_type": "mass", "aa_matches_gt": aa_matches_1, "aa_matches_dn": aa_matches_2, "aa_exact_gt": exact_match_1, "aa_exact_dn": exact_match_2, "pep_match": pep_match, } else: return { "match_type": "mismatch", "aa_matches_gt": aa_matches_1, "aa_matches_dn": aa_matches_2, "aa_exact_gt": exact_match_1, "aa_exact_dn": exact_match_2, "pep_match": pep_match, }
[docs] def aa_match( self, peptide1: List[str], peptide2: List[str], cum_mass_threshold: float = 50, ind_mass_threshold: float = 20, ) -> Tuple[np.ndarray, bool, Tuple[np.ndarray], Tuple[np.ndarray]]: """ Find the matching prefix and suffix amino acids between two peptide sequences. Parameters ---------- peptide1 : List[str] The first tokenized peptide sequence to be compared. peptide2 : List[str] The second tokenized peptide sequence to be compared. cum_mass_threshold : float Mass threshold in Dalton to accept cumulative mass-matching amino acid sequences. ind_mass_threshold : float Mass threshold in Dalton to accept individual mass-matching amino acids. Returns ------- aa_matches : np.ndarray of length max(len(peptide1), len(peptide2)) Boolean flag indicating whether each paired-up amino acid matches across both peptide sequences. pep_match : bool Boolean flag to indicate whether the two peptide sequences fully match. per_seq_aa_matches : Tuple[np.ndarray] TODO. """ # Find longest mass-matching prefix. aa_matches, pep_match, (aa_matches_1, aa_matches_2), (aa_exact_1, aa_exact_2) = self.aa_match_prefix( peptide1, peptide2, cum_mass_threshold, ind_mass_threshold ) # No need to evaluate the suffixes if the sequences already fully match. if pep_match: return aa_matches, pep_match, (aa_matches_1, aa_matches_2), (aa_exact_1, aa_exact_2) # Find longest mass-matching suffix. i1, i2 = len(peptide1) - 1, len(peptide2) - 1 i_stop = np.argwhere(~aa_matches)[0] cum_mass1, cum_mass2 = 0.0, 0.0 while i1 >= i_stop and i2 >= i_stop: aa_mass1 = self.get_token_mass(peptide1[i1]) aa_mass2 = self.get_token_mass(peptide2[i2]) tol_suffix = abs(self.mass_diff(cum_mass1 + aa_mass1, cum_mass2 + aa_mass2, False)) tol_aa = abs(self.mass_diff(aa_mass1, aa_mass2, False)) if tol_suffix < cum_mass_threshold: match = tol_aa < ind_mass_threshold aa_matches[max(i1, i2)] = match aa_matches_1[i1] = match aa_matches_2[i2] = match i1, i2 = i1 - 1, i2 - 1 cum_mass1, cum_mass2 = cum_mass1 + aa_mass1, cum_mass2 + aa_mass2 elif cum_mass2 + aa_mass2 > cum_mass1 + aa_mass1: i1, cum_mass1 = i1 - 1, cum_mass1 + aa_mass1 else: i2, cum_mass2 = i2 - 1, cum_mass2 + aa_mass2 return aa_matches, aa_matches.all(), (aa_matches_1, aa_matches_2), (aa_exact_1, aa_exact_2)
[docs] def aa_match_prefix( self, peptide1: List[str], peptide2: List[str], cum_mass_threshold: float = 50, ind_mass_threshold: float = 20, ) -> Tuple[np.ndarray, bool, Tuple[np.ndarray], Tuple[np.ndarray]]: """ Find the matching prefix amino acids between two peptide sequences. Parameters ---------- peptide1 : List[str] The first tokenized peptide sequence to be compared. peptide2 : List[str] The second tokenized peptide sequence to be compared. cum_mass_threshold : float Mass threshold in Dalton to accept cumulative mass-matching amino acid sequences. ind_mass_threshold : float Mass threshold in Dalton to accept individual mass-matching amino acids. Returns ------- aa_matches : np.ndarray of length max(len(peptide1), len(peptide2)) Boolean flag indicating whether each paired-up amino acid matches across both peptide sequences. pep_match : bool Boolean flag to indicate whether the two peptide sequences fully match. per_seq_aa_matches : Tuple[np.ndarray] TODO. """ aa_matches = np.zeros(max(len(peptide1), len(peptide2)), np.bool_) aa_exact_1 = np.zeros(len(peptide1), np.bool_) aa_exact_2 = np.zeros(len(peptide2), np.bool_) aa_matches_1 = np.zeros(len(peptide1), np.bool_) aa_matches_2 = np.zeros(len(peptide2), np.bool_) # Find longest mass-matching prefix. i1, i2, cum_mass1, cum_mass2 = 0, 0, 0.0, 0.0 while i1 < len(peptide1) and i2 < len(peptide2): # Exact aa_str1 = self.get_token_str(peptide1[i1]) aa_str2 = self.get_token_str(peptide2[i2]) exact_match = aa_str1 == aa_str2 aa_exact_1[i1] = exact_match aa_exact_2[i2] = exact_match # mass-based aa_mass1 = self.get_token_mass(peptide1[i1]) aa_mass2 = self.get_token_mass(peptide2[i2]) tol_prefix = abs(self.mass_diff(cum_mass1 + aa_mass1, cum_mass2 + aa_mass2, False)) tol = abs(self.mass_diff(aa_mass1, aa_mass2, False)) if tol_prefix < cum_mass_threshold: match = tol < ind_mass_threshold aa_matches[max(i1, i2)] = match aa_matches_1[i1] = match aa_matches_2[i2] = match i1, i2 = i1 + 1, i2 + 1 cum_mass1, cum_mass2 = cum_mass1 + aa_mass1, cum_mass2 + aa_mass2 elif cum_mass2 + aa_mass2 > cum_mass1 + aa_mass1: i1, cum_mass1 = i1 + 1, cum_mass1 + aa_mass1 else: i2, cum_mass2 = i2 + 1, cum_mass2 + aa_mass2 return aa_matches, aa_matches.all(), (aa_matches_1, aa_matches_2), (aa_exact_1, aa_exact_2)
[docs] def convert_peptidoform(self, peptidoform: Peptidoform): if not isinstance(peptidoform, Peptidoform): return None out = [] n_mod = peptidoform.properties["n_term"] if n_mod is None: n_mod = [None] # If there is an N-terminal mod, this is separately tokenized. else: out.append(("", n_mod)) for i, aa_mod in enumerate(peptidoform): aa, mod = aa_mod if mod is None: mod = [mod] out.append((aa, mod)) return out
[docs] def get_token_mass(self, token: tuple) -> float: """ Convert the amino acid to a mass while considering modifications as well. """ aa, mods = token mass = self.AA_MASSES[aa] for mod in mods: if mod is None: continue mass += mod.mass return mass
[docs] def get_token_str(self, token: tuple) -> str: """ Convert the amino acid to string format including the modification if present. """ aa, mods = token token_str = aa for mod in mods: if mod is None: continue token_str += "[{}]".format(mod.value) return token_str
[docs] def mass_diff(self, mz1, mz2, mode_is_da): """ Calculate the mass difference(s). Parameters ---------- mz1 First m/z value(s). mz2 Second m/z value(s). mode_is_da : bool Mass difference in Dalton (True) or in ppm (False). Returns ------- The mass difference(s) between the given m/z values. """ return mz1 - mz2 if mode_is_da else (mz1 - mz2) / mz2 * 10**6