"""
Module containing the DenovoScores class.
"""
from typing import List, Tuple
import numpy as np
import pandas as pd
from psm_utils import Peptidoform
from proteobench.score.score_base import ScoreBase
[docs]
class DenovoScores(ScoreBase):
"""
Class for computing de novo scores.
Parameters
----------
"""
def __init__(self):
self.AA_MASSES = {
"": 0.0,
"G": 57.021463719204,
"A": 71.037113783,
"S": 87.03202840226,
"P": 97.052763846796,
"V": 99.068413910592,
"T": 101.047678466056,
"C": 103.00918495654,
"L": 113.084063974388,
"I": 113.084063974388,
"N": 114.042927438408,
"D": 115.02694302152,
"Q": 128.058577502204,
"K": 128.094963010536,
"E": 129.04259308531599,
"M": 131.040485084132,
"H": 137.058911855296,
"F": 147.068413910592,
"R": 156.10111101903598,
"Y": 163.06332852985201,
"W": 186.07931294673998,
}
[docs]
def evaluate_match(self, ground_truth: Peptidoform, de_novo: Peptidoform):
"""
Return the match type between two peptide sequences.
"""
gt = self.convert_peptidoform(ground_truth)
dn = self.convert_peptidoform(de_novo)
if dn is None:
return {
"match_type": "mismatch",
"aa_matches_gt": np.full(len(gt), False),
"aa_matches_dn": np.full(len(gt), False),
"aa_exact_gt": np.full(len(gt), False),
"aa_exact_dn": np.full(len(gt), False),
"pep_match": False,
}
if ground_truth == de_novo:
return {
"match_type": "exact",
"aa_matches_gt": np.full(len(gt), True),
"aa_matches_dn": np.full(len(dn), True),
"aa_exact_gt": np.full(len(gt), True),
"aa_exact_dn": np.full(len(dn), True),
"pep_match": True,
}
aa_matches, pep_match, (aa_matches_1, aa_matches_2), (exact_match_1, exact_match_2) = self.aa_match(
gt,
dn,
)
if pep_match:
return {
"match_type": "mass",
"aa_matches_gt": aa_matches_1,
"aa_matches_dn": aa_matches_2,
"aa_exact_gt": exact_match_1,
"aa_exact_dn": exact_match_2,
"pep_match": pep_match,
}
else:
return {
"match_type": "mismatch",
"aa_matches_gt": aa_matches_1,
"aa_matches_dn": aa_matches_2,
"aa_exact_gt": exact_match_1,
"aa_exact_dn": exact_match_2,
"pep_match": pep_match,
}
[docs]
def aa_match(
self,
peptide1: List[str],
peptide2: List[str],
cum_mass_threshold: float = 50,
ind_mass_threshold: float = 20,
) -> Tuple[np.ndarray, bool, Tuple[np.ndarray], Tuple[np.ndarray]]:
"""
Find the matching prefix and suffix amino acids between two peptide
sequences.
Parameters
----------
peptide1 : List[str]
The first tokenized peptide sequence to be compared.
peptide2 : List[str]
The second tokenized peptide sequence to be compared.
cum_mass_threshold : float
Mass threshold in Dalton to accept cumulative mass-matching amino acid
sequences.
ind_mass_threshold : float
Mass threshold in Dalton to accept individual mass-matching amino acids.
Returns
-------
aa_matches : np.ndarray of length max(len(peptide1), len(peptide2))
Boolean flag indicating whether each paired-up amino acid matches across
both peptide sequences.
pep_match : bool
Boolean flag to indicate whether the two peptide sequences fully match.
per_seq_aa_matches : Tuple[np.ndarray]
TODO.
"""
# Find longest mass-matching prefix.
aa_matches, pep_match, (aa_matches_1, aa_matches_2), (aa_exact_1, aa_exact_2) = self.aa_match_prefix(
peptide1, peptide2, cum_mass_threshold, ind_mass_threshold
)
# No need to evaluate the suffixes if the sequences already fully match.
if pep_match:
return aa_matches, pep_match, (aa_matches_1, aa_matches_2), (aa_exact_1, aa_exact_2)
# Find longest mass-matching suffix.
i1, i2 = len(peptide1) - 1, len(peptide2) - 1
i_stop = np.argwhere(~aa_matches)[0]
cum_mass1, cum_mass2 = 0.0, 0.0
while i1 >= i_stop and i2 >= i_stop:
aa_mass1 = self.get_token_mass(peptide1[i1])
aa_mass2 = self.get_token_mass(peptide2[i2])
tol_suffix = abs(self.mass_diff(cum_mass1 + aa_mass1, cum_mass2 + aa_mass2, False))
tol_aa = abs(self.mass_diff(aa_mass1, aa_mass2, False))
if tol_suffix < cum_mass_threshold:
match = tol_aa < ind_mass_threshold
aa_matches[max(i1, i2)] = match
aa_matches_1[i1] = match
aa_matches_2[i2] = match
i1, i2 = i1 - 1, i2 - 1
cum_mass1, cum_mass2 = cum_mass1 + aa_mass1, cum_mass2 + aa_mass2
elif cum_mass2 + aa_mass2 > cum_mass1 + aa_mass1:
i1, cum_mass1 = i1 - 1, cum_mass1 + aa_mass1
else:
i2, cum_mass2 = i2 - 1, cum_mass2 + aa_mass2
return aa_matches, aa_matches.all(), (aa_matches_1, aa_matches_2), (aa_exact_1, aa_exact_2)
[docs]
def aa_match_prefix(
self,
peptide1: List[str],
peptide2: List[str],
cum_mass_threshold: float = 50,
ind_mass_threshold: float = 20,
) -> Tuple[np.ndarray, bool, Tuple[np.ndarray], Tuple[np.ndarray]]:
"""
Find the matching prefix amino acids between two peptide sequences.
Parameters
----------
peptide1 : List[str]
The first tokenized peptide sequence to be compared.
peptide2 : List[str]
The second tokenized peptide sequence to be compared.
cum_mass_threshold : float
Mass threshold in Dalton to accept cumulative mass-matching amino acid
sequences.
ind_mass_threshold : float
Mass threshold in Dalton to accept individual mass-matching amino acids.
Returns
-------
aa_matches : np.ndarray of length max(len(peptide1), len(peptide2))
Boolean flag indicating whether each paired-up amino acid matches across
both peptide sequences.
pep_match : bool
Boolean flag to indicate whether the two peptide sequences fully match.
per_seq_aa_matches : Tuple[np.ndarray]
TODO.
"""
aa_matches = np.zeros(max(len(peptide1), len(peptide2)), np.bool_)
aa_exact_1 = np.zeros(len(peptide1), np.bool_)
aa_exact_2 = np.zeros(len(peptide2), np.bool_)
aa_matches_1 = np.zeros(len(peptide1), np.bool_)
aa_matches_2 = np.zeros(len(peptide2), np.bool_)
# Find longest mass-matching prefix.
i1, i2, cum_mass1, cum_mass2 = 0, 0, 0.0, 0.0
while i1 < len(peptide1) and i2 < len(peptide2):
# Exact
aa_str1 = self.get_token_str(peptide1[i1])
aa_str2 = self.get_token_str(peptide2[i2])
exact_match = aa_str1 == aa_str2
aa_exact_1[i1] = exact_match
aa_exact_2[i2] = exact_match
# mass-based
aa_mass1 = self.get_token_mass(peptide1[i1])
aa_mass2 = self.get_token_mass(peptide2[i2])
tol_prefix = abs(self.mass_diff(cum_mass1 + aa_mass1, cum_mass2 + aa_mass2, False))
tol = abs(self.mass_diff(aa_mass1, aa_mass2, False))
if tol_prefix < cum_mass_threshold:
match = tol < ind_mass_threshold
aa_matches[max(i1, i2)] = match
aa_matches_1[i1] = match
aa_matches_2[i2] = match
i1, i2 = i1 + 1, i2 + 1
cum_mass1, cum_mass2 = cum_mass1 + aa_mass1, cum_mass2 + aa_mass2
elif cum_mass2 + aa_mass2 > cum_mass1 + aa_mass1:
i1, cum_mass1 = i1 + 1, cum_mass1 + aa_mass1
else:
i2, cum_mass2 = i2 + 1, cum_mass2 + aa_mass2
return aa_matches, aa_matches.all(), (aa_matches_1, aa_matches_2), (aa_exact_1, aa_exact_2)
[docs]
def get_token_mass(self, token: tuple) -> float:
"""
Convert the amino acid to a mass while considering modifications as well.
"""
aa, mods = token
mass = self.AA_MASSES[aa]
for mod in mods:
if mod is None:
continue
mass += mod.mass
return mass
[docs]
def get_token_str(self, token: tuple) -> str:
"""
Convert the amino acid to string format including the modification if present.
"""
aa, mods = token
token_str = aa
for mod in mods:
if mod is None:
continue
token_str += "[{}]".format(mod.value)
return token_str
[docs]
def mass_diff(self, mz1, mz2, mode_is_da):
"""
Calculate the mass difference(s).
Parameters
----------
mz1
First m/z value(s).
mz2
Second m/z value(s).
mode_is_da : bool
Mass difference in Dalton (True) or in ppm (False).
Returns
-------
The mass difference(s) between the given m/z values.
"""
return mz1 - mz2 if mode_is_da else (mz1 - mz2) / mz2 * 10**6