Source code for proteobench.score.denovo.denovoscores

"""
Module containing the DenovoScores class.
"""

from typing import List, Tuple

import numpy as np
import pandas as pd
from psm_utils import Peptidoform

from proteobench.score.score_base import ScoreBase



[docs]
class DenovoScores(ScoreBase):
    """
    Class for computing de novo scores.

    Parameters
    ----------
    """

    def __init__(self):
        self.AA_MASSES = {
            "": 0.0,
            "G": 57.021463719204,
            "A": 71.037113783,
            "S": 87.03202840226,
            "P": 97.052763846796,
            "V": 99.068413910592,
            "T": 101.047678466056,
            "C": 103.00918495654,
            "L": 113.084063974388,
            "I": 113.084063974388,
            "N": 114.042927438408,
            "D": 115.02694302152,
            "Q": 128.058577502204,
            "K": 128.094963010536,
            "E": 129.04259308531599,
            "M": 131.040485084132,
            "H": 137.058911855296,
            "F": 147.068413910592,
            "R": 156.10111101903598,
            "Y": 163.06332852985201,
            "W": 186.07931294673998,
        }


[docs]
    def generate_intermediate(self, filtered_df: pd.DataFrame, replicate_to_raw=None) -> pd.DataFrame:
        # TODO: Evaluate which PSMs match, and which don't and return new table

        # Add match type label (exact, mass, mismatch) and the amino acid-level evaluations
        filtered_df["match_dict"] = filtered_df.apply(
            lambda x: self.evaluate_match(ground_truth=x["peptidoform_ground_truth"], de_novo=x["peptidoform"]), axis=1
        )
        filtered_df["match_type"] = filtered_df["match_dict"].apply(lambda x: x["match_type"])
        filtered_df["aa_matches_dn"] = filtered_df["match_dict"].apply(lambda x: x["aa_matches_dn"])
        filtered_df["aa_matches_gt"] = filtered_df["match_dict"].apply(lambda x: x["aa_matches_gt"])
        filtered_df["aa_exact_gt"] = filtered_df["match_dict"].apply(lambda x: x["aa_exact_gt"])
        filtered_df["aa_exact_dn"] = filtered_df["match_dict"].apply(lambda x: x["aa_exact_dn"])
        filtered_df["pep_match"] = filtered_df["match_dict"].apply(lambda x: x["pep_match"])
        _ = filtered_df.pop("match_dict")
        return filtered_df



[docs]
    def evaluate_match(self, ground_truth: Peptidoform, de_novo: Peptidoform):
        """
        Return the match type between two peptide sequences.
        """
        gt = self.convert_peptidoform(ground_truth)
        dn = self.convert_peptidoform(de_novo)

        if dn is None:
            return {
                "match_type": "mismatch",
                "aa_matches_gt": np.full(len(gt), False),
                "aa_matches_dn": np.full(len(gt), False),
                "aa_exact_gt": np.full(len(gt), False),
                "aa_exact_dn": np.full(len(gt), False),
                "pep_match": False,
            }

        if ground_truth == de_novo:
            return {
                "match_type": "exact",
                "aa_matches_gt": np.full(len(gt), True),
                "aa_matches_dn": np.full(len(dn), True),
                "aa_exact_gt": np.full(len(gt), True),
                "aa_exact_dn": np.full(len(dn), True),
                "pep_match": True,
            }

        aa_matches, pep_match, (aa_matches_1, aa_matches_2), (exact_match_1, exact_match_2) = self.aa_match(
            gt,
            dn,
        )
        if pep_match:
            return {
                "match_type": "mass",
                "aa_matches_gt": aa_matches_1,
                "aa_matches_dn": aa_matches_2,
                "aa_exact_gt": exact_match_1,
                "aa_exact_dn": exact_match_2,
                "pep_match": pep_match,
            }
        else:
            return {
                "match_type": "mismatch",
                "aa_matches_gt": aa_matches_1,
                "aa_matches_dn": aa_matches_2,
                "aa_exact_gt": exact_match_1,
                "aa_exact_dn": exact_match_2,
                "pep_match": pep_match,
            }



[docs]
    def aa_match(
        self,
        peptide1: List[str],
        peptide2: List[str],
        cum_mass_threshold: float = 50,
        ind_mass_threshold: float = 20,
    ) -> Tuple[np.ndarray, bool, Tuple[np.ndarray], Tuple[np.ndarray]]:
        """
        Find the matching prefix and suffix amino acids between two peptide
        sequences.

        Parameters
        ----------
        peptide1 : List[str]
            The first tokenized peptide sequence to be compared.
        peptide2 : List[str]
            The second tokenized peptide sequence to be compared.
        cum_mass_threshold : float
            Mass threshold in Dalton to accept cumulative mass-matching amino acid
            sequences.
        ind_mass_threshold : float
            Mass threshold in Dalton to accept individual mass-matching amino acids.

        Returns
        -------
        aa_matches : np.ndarray of length max(len(peptide1), len(peptide2))
            Boolean flag indicating whether each paired-up amino acid matches across
            both peptide sequences.
        pep_match : bool
            Boolean flag to indicate whether the two peptide sequences fully match.
        per_seq_aa_matches : Tuple[np.ndarray]
            TODO.
        """
        # Find longest mass-matching prefix.
        aa_matches, pep_match, (aa_matches_1, aa_matches_2), (aa_exact_1, aa_exact_2) = self.aa_match_prefix(
            peptide1, peptide2, cum_mass_threshold, ind_mass_threshold
        )

        # No need to evaluate the suffixes if the sequences already fully match.
        if pep_match:
            return aa_matches, pep_match, (aa_matches_1, aa_matches_2), (aa_exact_1, aa_exact_2)

        # Find longest mass-matching suffix.
        i1, i2 = len(peptide1) - 1, len(peptide2) - 1
        i_stop = np.argwhere(~aa_matches)[0]
        cum_mass1, cum_mass2 = 0.0, 0.0

        while i1 >= i_stop and i2 >= i_stop:
            aa_mass1 = self.get_token_mass(peptide1[i1])
            aa_mass2 = self.get_token_mass(peptide2[i2])
            tol_suffix = abs(self.mass_diff(cum_mass1 + aa_mass1, cum_mass2 + aa_mass2, False))
            tol_aa = abs(self.mass_diff(aa_mass1, aa_mass2, False))

            if tol_suffix < cum_mass_threshold:
                match = tol_aa < ind_mass_threshold
                aa_matches[max(i1, i2)] = match
                aa_matches_1[i1] = match
                aa_matches_2[i2] = match

                i1, i2 = i1 - 1, i2 - 1
                cum_mass1, cum_mass2 = cum_mass1 + aa_mass1, cum_mass2 + aa_mass2

            elif cum_mass2 + aa_mass2 > cum_mass1 + aa_mass1:
                i1, cum_mass1 = i1 - 1, cum_mass1 + aa_mass1
            else:
                i2, cum_mass2 = i2 - 1, cum_mass2 + aa_mass2

        return aa_matches, aa_matches.all(), (aa_matches_1, aa_matches_2), (aa_exact_1, aa_exact_2)



[docs]
    def aa_match_prefix(
        self,
        peptide1: List[str],
        peptide2: List[str],
        cum_mass_threshold: float = 50,
        ind_mass_threshold: float = 20,
    ) -> Tuple[np.ndarray, bool, Tuple[np.ndarray], Tuple[np.ndarray]]:
        """
        Find the matching prefix amino acids between two peptide sequences.

        Parameters
        ----------
        peptide1 : List[str]
            The first tokenized peptide sequence to be compared.
        peptide2 : List[str]
            The second tokenized peptide sequence to be compared.
        cum_mass_threshold : float
            Mass threshold in Dalton to accept cumulative mass-matching amino acid
            sequences.
        ind_mass_threshold : float
            Mass threshold in Dalton to accept individual mass-matching amino acids.

        Returns
        -------
        aa_matches : np.ndarray of length max(len(peptide1), len(peptide2))
            Boolean flag indicating whether each paired-up amino acid matches across
            both peptide sequences.
        pep_match : bool
            Boolean flag to indicate whether the two peptide sequences fully match.
        per_seq_aa_matches : Tuple[np.ndarray]
            TODO.
        """
        aa_matches = np.zeros(max(len(peptide1), len(peptide2)), np.bool_)

        aa_exact_1 = np.zeros(len(peptide1), np.bool_)
        aa_exact_2 = np.zeros(len(peptide2), np.bool_)
        aa_matches_1 = np.zeros(len(peptide1), np.bool_)
        aa_matches_2 = np.zeros(len(peptide2), np.bool_)

        # Find longest mass-matching prefix.
        i1, i2, cum_mass1, cum_mass2 = 0, 0, 0.0, 0.0
        while i1 < len(peptide1) and i2 < len(peptide2):
            # Exact
            aa_str1 = self.get_token_str(peptide1[i1])
            aa_str2 = self.get_token_str(peptide2[i2])
            exact_match = aa_str1 == aa_str2
            aa_exact_1[i1] = exact_match
            aa_exact_2[i2] = exact_match

            # mass-based
            aa_mass1 = self.get_token_mass(peptide1[i1])
            aa_mass2 = self.get_token_mass(peptide2[i2])
            tol_prefix = abs(self.mass_diff(cum_mass1 + aa_mass1, cum_mass2 + aa_mass2, False))
            tol = abs(self.mass_diff(aa_mass1, aa_mass2, False))
            if tol_prefix < cum_mass_threshold:
                match = tol < ind_mass_threshold
                aa_matches[max(i1, i2)] = match
                aa_matches_1[i1] = match
                aa_matches_2[i2] = match

                i1, i2 = i1 + 1, i2 + 1
                cum_mass1, cum_mass2 = cum_mass1 + aa_mass1, cum_mass2 + aa_mass2

            elif cum_mass2 + aa_mass2 > cum_mass1 + aa_mass1:
                i1, cum_mass1 = i1 + 1, cum_mass1 + aa_mass1
            else:
                i2, cum_mass2 = i2 + 1, cum_mass2 + aa_mass2
        return aa_matches, aa_matches.all(), (aa_matches_1, aa_matches_2), (aa_exact_1, aa_exact_2)



[docs]
    def convert_peptidoform(self, peptidoform: Peptidoform):
        if not isinstance(peptidoform, Peptidoform):
            return None

        out = []
        n_mod = peptidoform.properties["n_term"]
        if n_mod is None:
            n_mod = [None]

        # If there is an N-terminal mod, this is separately tokenized.
        else:
            out.append(("", n_mod))

        for i, aa_mod in enumerate(peptidoform):
            aa, mod = aa_mod
            if mod is None:
                mod = [mod]

            out.append((aa, mod))
        return out



[docs]
    def get_token_mass(self, token: tuple) -> float:
        """
        Convert the amino acid to a mass while considering modifications as well.
        """
        aa, mods = token
        mass = self.AA_MASSES[aa]
        for mod in mods:
            if mod is None:
                continue
            mass += mod.mass
        return mass



[docs]
    def get_token_str(self, token: tuple) -> str:
        """
        Convert the amino acid to string format including the modification if present.
        """
        aa, mods = token
        token_str = aa
        for mod in mods:
            if mod is None:
                continue
            token_str += "[{}]".format(mod.value)
        return token_str



[docs]
    def mass_diff(self, mz1, mz2, mode_is_da):
        """
        Calculate the mass difference(s).

        Parameters
        ----------
        mz1
            First m/z value(s).
        mz2
            Second m/z value(s).
        mode_is_da : bool
            Mass difference in Dalton (True) or in ppm (False).

        Returns
        -------
            The mass difference(s) between the given m/z values.
        """
        return mz1 - mz2 if mode_is_da else (mz1 - mz2) / mz2 * 10**6