# Source code for proteobench.validation.protein_ids

"""
Protein-identifier extraction helpers for submission validation.

ProteoBench tool outputs store protein identifiers in the standardized
``Proteins`` column. The representation is not fully normalized across tools:

* a single protein may be a UniProt-style triplet such as ``sp|P49327|FAS_HUMAN``
  (the ``|`` separates database/accession/entry-name), a bare accession such as
  ``P49327``, or an isoform such as ``P49327-2``;
* multiple proteins (protein groups) are joined with ``;`` (e.g. MaxQuant) or
  ``,`` (e.g. the FragPipe loader combines ``Protein`` and ``Mapped Proteins``).

These helpers split protein-group strings into individual proteins and extract
the candidate identifiers (accession, entry name, isoform base) used to match
against a FASTA-derived accession set. They are deliberately generic so the
core validator does not embed tool-specific assumptions.
"""

from __future__ import annotations

import re
from typing import Iterable, List, Optional, Set

#: Default separators used to split a protein-group string into individual proteins.
#: The ``|`` character is intentionally excluded because it is a *within-protein*
#: separator in UniProt identifiers (``db|accession|entryname``).
DEFAULT_GROUP_SEPARATORS = (";", ",")

#: Matches a UniProt-style ``db|accession|entryname`` token, case-insensitively.
#: The database prefix must be ``sp``, ``tr`` or ``up``. Group 1 captures the
#: accession (a run of non-``|`` characters) and group 2 captures the entry name
#: (a run of non-whitespace characters reaching the end of the token).
_UNIPROT_TRIPLET = re.compile(r"^(?:sp|tr|up)\|([^|]+)\|(\S+)$", re.IGNORECASE)

#: Matches a trailing isoform suffix such as ``-2`` on an accession: a hyphen
#: followed by one or more digits, anchored at the end of the string.
_ISOFORM_SUFFIX = re.compile(r"-\d+$")


def split_protein_groups(value: str, separators: Iterable[str] = DEFAULT_GROUP_SEPARATORS) -> List[str]:
    """
    Break a protein-group cell apart into its individual protein tokens.

    Parameters
    ----------
    value : str
        Raw content of a ``Proteins`` cell; it may hold one protein or several
        proteins joined by a separator. ``None`` is accepted and yields an
        empty list. Non-string values are converted with ``str()``.
    separators : iterable of str, optional
        Separator strings that delimit proteins inside a group. Empty entries
        are ignored. Defaults to :data:`DEFAULT_GROUP_SEPARATORS`
        (``;`` and ``,``).

    Returns
    -------
    list of str
        The individual protein tokens, each stripped of surrounding
        whitespace, with empty tokens removed. If no usable separator is
        given, the whole (stripped) cell is returned as a single token.
    """
    if value is None:
        return []

    cell = str(value).strip()
    if not cell:
        return []

    # Separators are escaped so that regex metacharacters are matched literally.
    escaped = [re.escape(sep) for sep in separators if sep]
    pieces = re.split("|".join(escaped), cell) if escaped else [cell]

    cleaned = (piece.strip() for piece in pieces)
    return [piece for piece in cleaned if piece]


def extract_identifiers(protein_token: str) -> Set[str]:
    """
    Collect the candidate identifiers carried by a single protein token.

    The stripped token itself is always part of the result. In addition:

    * a UniProt triplet such as ``sp|P49327|FAS_HUMAN`` contributes its
      accession (``P49327``), its entry name (``FAS_HUMAN``) and, when the
      accession ends in an isoform suffix, the accession without that suffix;
    * any other token containing ``|`` contributes each of its non-empty,
      stripped ``|``-separated parts;
    * a token without ``|`` contributes its isoform base when it ends in an
      isoform suffix (``P49327-2`` also yields ``P49327``).

    Parameters
    ----------
    protein_token : str
        A single protein identifier (one element of a protein group).
        ``None`` is accepted; non-string values are converted with ``str()``.

    Returns
    -------
    set of str
        Candidate identifiers usable for FASTA membership testing. Empty when
        the token is ``None`` or blank.
    """
    if protein_token is None:
        return set()

    cleaned = str(protein_token).strip()
    if not cleaned:
        return set()

    candidates: Set[str] = {cleaned}

    uniprot = _UNIPROT_TRIPLET.match(cleaned)
    if uniprot is not None:
        accession, entry_name = uniprot.groups()
        # Stripping the suffix from a non-isoform accession returns the
        # accession itself, which the set simply de-duplicates.
        candidates.update((accession, entry_name, _ISOFORM_SUFFIX.sub("", accession)))
        return candidates

    if "|" in cleaned:
        # Unknown ``a|b|c`` shape: every non-empty part is kept as a candidate.
        stripped_parts = (piece.strip() for piece in cleaned.split("|"))
        candidates.update(piece for piece in stripped_parts if piece)
        return candidates

    # Bare token: adding the isoform base is a no-op when there is no suffix.
    candidates.add(_ISOFORM_SUFFIX.sub("", cleaned))
    return candidates


def is_decoy_or_contaminant(
    protein_token: str,
    contaminant_flag: Optional[str] = None,
    decoy_prefixes: Iterable[str] = (),
) -> bool:
    """
    Determine whether a protein token is a decoy or contaminant marker.

    All comparisons are case-insensitive. The contaminant flag is matched as a
    substring anywhere in the token (mirroring ParseSettings contaminant
    detection), whereas decoy markers must appear at the start of the token.

    Parameters
    ----------
    protein_token : str
        A single protein identifier. ``None`` or a blank token is never
        considered a decoy or contaminant.
    contaminant_flag : str, optional
        Substring marking contaminant proteins (from the tool parse settings,
        e.g. ``"Cont_"``). ``None`` or an empty string disables contaminant
        detection.
    decoy_prefixes : iterable of str, optional
        Prefixes marking decoy proteins (e.g. ``"rev_"``, ``"DECOY_"``).
        A single string is accepted and treated as one prefix. ``None`` or an
        empty iterable disables decoy detection. Empty prefixes are ignored.

    Returns
    -------
    bool
        ``True`` if the token is a decoy or contaminant identifier.
    """
    if protein_token is None:
        return False
    lowered = str(protein_token).strip().lower()
    if not lowered:
        return False

    if contaminant_flag and str(contaminant_flag).lower() in lowered:
        return True

    if decoy_prefixes is None:
        return False
    if isinstance(decoy_prefixes, str):
        # A bare string is one prefix. Iterating it would test every single
        # character as a prefix and flag e.g. any token starting with "r"
        # when the caller passed "rev_".
        decoy_prefixes = (decoy_prefixes,)

    markers = tuple(str(prefix).lower() for prefix in decoy_prefixes if prefix)
    # ``str.startswith`` accepts a tuple; an empty tuple never matches.
    return lowered.startswith(markers)