Source code for proteobench.io.parsing.parse_peptidoform

"""
Module for parsing peptidoform strings and extracting modifications.
"""

import math
import os
import re
import warnings
from typing import Dict

import pandas as pd


def load_input_file(input_csv: str, input_format: str) -> pd.DataFrame:
    """
    Read a result file with the loader registered for ``input_format``.

    Parameters
    ----------
    input_csv : str
        Path of the file to read.
    input_format : str
        Key into the module-level ``_LOAD_FUNCTIONS`` registry
        (e.g., "WOMBAT", "Custom").

    Returns
    -------
    pd.DataFrame
        Whatever the selected loader returns for ``input_csv``.

    Raises
    ------
    ValueError
        If ``input_format`` has no entry in ``_LOAD_FUNCTIONS``.
    """
    # The warning is emitted before the registry lookup, so it is shown even
    # if the lookup subsequently fails.
    if input_format == "MaxQuant":
        warnings.warn(
            """ WARNING: MaxQuant proforma parsing does not take into account fixed modifications\n because they are implicit. Only after providing the appropriate parameter file,\n fixed modifications will be added correctly. """
        )

    try:
        loader = _LOAD_FUNCTIONS[input_format]
    except KeyError as unknown_format:
        raise ValueError(f"Invalid input format: {input_format}") from unknown_format

    return loader(input_csv)
# Split on ';' only when the semicolon is NOT inside a "[...]" group, so an
# entry such as "2xCarbamidomethyl [C10; C13]" is kept in one piece.
_MOD_ENTRY_SEPARATOR = re.compile(r";(?![^\[\]]*\])")
# Leading multiplicity of the bracketed format, e.g. "1x" or "2x".
_MOD_COUNT_PREFIX = re.compile(r"^\d+x")
# "<name> [<positions>]"; the name may contain digits or punctuation
# (e.g. "TMT6plex"), so it is matched lazily up to the first '['.
_BRACKETED_MOD = re.compile(r"(.+?)\s*\[(.+)\]")


def aggregate_modification_column(
    input_string_seq: str,
    input_string_modifications: str,
    special_locations: Dict[str, int] = {
        "Any N-term": 0,
        "Any C-term": -1,
        "Protein N-term": 0,
        "Protein C-term": -1,
        "N-Term": 0,  # Added to handle "N-Term"
        "C-Term": -1,  # If you also expect "C-Term"
    },
) -> str:
    """
    Aggregate modifications into a string representing the modified sequence.

    Two notations are understood, and may be mixed in one ';'-separated list:

    - Parenthesised (e.g. "Methylation (C11)" or "Carbamidomethyl (Any N-term)")
    - Bracketed (e.g. "1xCarbamidomethyl [C11]", "2xCarbamidomethyl [C10; C13]",
      "1xAcetyl [N-Term]")

    Each modification is inserted as ``[name]`` directly after the residue it
    refers to (location 0 places it before the first residue, a C-terminal
    location places it after the last one).

    Parameters
    ----------
    input_string_seq : str
        The input sequence string.
    input_string_modifications : str
        The modifications applied to the sequence. An empty/blank string or a
        non-string value (e.g. ``None`` or NaN from a dataframe) means
        "no modifications".
    special_locations : dict, optional
        Maps named locations to an index; ``-1`` stands for the end of the
        sequence. The default mapping is only read, never modified.

    Returns
    -------
    str
        The modified sequence string with aggregated modifications.

    Raises
    ------
    ValueError
        If a residue location such as "C11" has a non-numeric position part.
    """
    # Missing values and blank strings: nothing to insert.
    if not isinstance(input_string_modifications, str) or not input_string_modifications.strip():
        return input_string_seq

    seq_len = len(input_string_seq)
    entries = [
        piece.strip()
        for piece in _MOD_ENTRY_SEPARATOR.split(input_string_modifications)
        if piece.strip()
    ]

    all_mods = []
    for entry in entries:
        if "(" in entry and "[" not in entry:
            # Parenthesised notation: "<name> (<location>)".
            parts = entry.split(" (")
            if len(parts) < 2:
                continue
            mod_name = parts[0].strip()
            location = parts[1].rstrip(")")
            if location in special_locations:
                loc = special_locations[location]
                if loc == -1:
                    loc = seq_len  # C-term
            else:
                # "C11" -> residue letter followed by a 1-based position.
                loc = int(location[1:])
            all_mods.append((mod_name, loc))
            continue

        # Bracketed notation: optional "<n>x" prefix, name, "[pos; pos; ...]".
        without_count = _MOD_COUNT_PREFIX.sub("", entry, count=1).strip()
        bracketed = _BRACKETED_MOD.match(without_count)
        if not bracketed:
            continue
        mod_name = bracketed.group(1).strip()
        tokens = [tok.strip() for tok in bracketed.group(2).split(";") if tok.strip()]
        for token in tokens:
            if token in special_locations:
                loc = special_locations[token]
                if loc == -1:
                    loc = seq_len
                all_mods.append((mod_name, loc))
            elif len(token) > 1:
                # "C11" / "M4": drop the residue letter, keep the position.
                loc = int(token[1:])
                all_mods.append((mod_name, loc))
            # A bare residue letter carries no position and is skipped.

    # Insert from the end of the sequence towards the start so that earlier
    # insertions do not shift the indices of the ones still to be applied.
    all_mods.sort(key=lambda mod: mod[1], reverse=True)
    for name, loc in all_mods:
        input_string_seq = input_string_seq[:loc] + f"[{name}]" + input_string_seq[loc:]

    return input_string_seq
def count_chars(input_string: str, isalpha: bool = True, isupper: bool = True) -> int:
    """
    Count the characters of a string that pass every enabled filter.

    Parameters
    ----------
    input_string : str
        The input string.
    isalpha : bool, optional
        When True, a character must satisfy ``str.isalpha`` to be counted.
        Defaults to True.
    isupper : bool, optional
        When True, a character must satisfy ``str.isupper`` to be counted.
        Defaults to True.

    Returns
    -------
    int
        The number of characters passing all enabled filters. With both
        filters disabled every character passes, so the result is the length
        of the string (always an ``int``, never ``None``).
    """
    return sum(
        1
        for char in input_string
        # A disabled filter is treated as "always passes".
        if (not isalpha or char.isalpha()) and (not isupper or char.isupper())
    )
def get_stripped_seq(input_string: str, isalpha: bool = True, isupper: bool = True) -> str:
    """
    Keep only the characters of a string that pass every enabled filter.

    Parameters
    ----------
    input_string : str
        The input string.
    isalpha : bool, optional
        When True, a character must satisfy ``str.isalpha`` to be kept.
        Defaults to True.
    isupper : bool, optional
        When True, a character must satisfy ``str.isupper`` to be kept.
        Defaults to True.

    Returns
    -------
    str
        The retained characters in their original order. With both filters
        disabled every character is kept, so the input is returned unchanged
        (always a ``str``, never ``None``).
    """
    return "".join(
        char
        for char in input_string
        # A disabled filter is treated as "always passes".
        if (not isalpha or char.isalpha()) and (not isupper or char.isupper())
    )
def match_brackets(
    input_string: str,
    pattern: str = r"\[([^]]+)\]",
    isalpha: bool = True,
    isupper: bool = True,
) -> tuple:
    """
    Locate every occurrence of ``pattern`` and report its text and position.

    The position of an occurrence is the number of characters in front of it
    that ``count_chars`` counts under the given ``isalpha`` / ``isupper``
    flags.

    Parameters
    ----------
    input_string : str
        The string to search.
    pattern : str, optional
        Regular expression describing one modification.
        Defaults to `r"\\[([^]]+)\\]"`.
    isalpha : bool, optional
        Forwarded to ``count_chars``. Defaults to True.
    isupper : bool, optional
        Forwarded to ``count_chars``. Defaults to True.

    Returns
    -------
    tuple
        ``(mods, positions)`` — two single-use generators of equal length.
        ``mods`` yields the full matched text (delimiters included) and
        ``positions`` yields the corresponding counts.
    """
    # The search itself runs eagerly; only the two views over it are lazy.
    hits = [(hit.group(), hit.start()) for hit in re.finditer(pattern, input_string)]

    mods = (text for text, _ in hits)
    positions = (
        count_chars(input_string[:start], isalpha=isalpha, isupper=isupper)
        for _, start in hits
    )
    return mods, positions
def to_lowercase(match: re.Match) -> str:
    """
    Return the full text of a regex match in lowercase.

    Intended as a replacement callback for ``re.sub``.

    Parameters
    ----------
    match : re.Match
        A match object produced by the ``re`` module.

    Returns
    -------
    str
        The entire matched text (group 0), lowercased.
    """
    whole_match = match[0]
    return whole_match.lower()
def get_proforma_bracketed(
    input_string: str,
    before_aa: bool = True,
    isalpha: bool = True,
    isupper: bool = True,
    pattern: str = r"\[([^]]+)\]",
    modification_dict: Dict[str, str] = {
        "+57.0215": "Carbamidomethyl",
        "+15.9949": "Oxidation",
        "-17.026548": "Gln->pyro-Glu",
        "-18.010565": "Glu->pyro-Glu",
        "+42": "Acetyl",
    },
) -> str:
    """
    Get the proforma sequence with bracketed modifications.

    Every ``pattern`` match is lowercased, located with ``match_brackets``,
    renamed through ``modification_dict`` when an entry exists, and written
    back into the stripped sequence as ``[modification]``.

    Parameters
    ----------
    input_string : str
        The input sequence string.
    before_aa : bool, optional
        Controls whether the residue at a given index is written before
        (True) or after (False) the modification stored for that index.
        Defaults to True.
    isalpha : bool, optional
        Forwarded to ``match_brackets`` and ``get_stripped_seq``.
        Defaults to True.
    isupper : bool, optional
        Forwarded to ``match_brackets`` and ``get_stripped_seq``.
        Defaults to True.
    pattern : str, optional
        The regular expression pattern for matching modifications.
        Defaults to `r"\\[([^]]+)\\]"`.
    modification_dict : dict, optional
        Maps matched modification text to the name to write instead. The
        default mapping is only read, never modified.

    Returns
    -------
    str
        The proforma sequence with bracketed modifications. A modification at
        index 0 is written as ``[mod]-``; one located after the last residue
        is written as ``-[mod]``; all others as ``[mod]``.
    """
    # Lowercase the modification text so its letters are not mistaken for
    # residues when isupper filtering is active.
    input_string = re.sub(pattern, to_lowercase, input_string)
    modifications, positions = match_brackets(input_string, pattern=pattern, isalpha=isalpha, isupper=isupper)

    # NOTE(review): match_brackets yields the full match text (delimiters
    # included), so only dictionary keys written with their delimiters can be
    # found here — confirm the delimiter-less default keys are intended.
    renamed = [modification_dict.get(mod, mod) for mod in modifications]

    # If several modifications share a position, the last one wins.
    pos_mod_dict = dict(zip(positions, renamed))
    stripped_seq = get_stripped_seq(input_string, isalpha=isalpha, isupper=isupper)
    end_idx = len(stripped_seq)

    pieces = []
    for idx, aa in enumerate(stripped_seq):
        if before_aa:
            pieces.append(aa)
        if idx in pos_mod_dict:
            if idx == 0:
                pieces.append(f"[{pos_mod_dict[idx]}]-")
            else:
                pieces.append(f"[{pos_mod_dict[idx]}]")
        if not before_aa:
            pieces.append(aa)

    # A modification located after the final residue has index
    # len(stripped_seq), which the loop above never reaches; previously it was
    # silently dropped. It is written in the "-[mod]" form the original code
    # specified for this index. Position alone cannot tell a modification on
    # the last residue from a C-terminal one.
    if end_idx > 0 and end_idx in pos_mod_dict:
        pieces.append(f"-[{pos_mod_dict[end_idx]}]")

    return "".join(pieces)
def _load_proteome_discoverer(input_csv: str) -> pd.DataFrame:
    """
    Load a Proteome Discoverer output file.

    Reads a tab-separated file and adds a "proforma" column built from the
    "Sequence" and "Modifications" columns.

    Parameters
    ----------
    input_csv : str
        The path to the Proteome Discoverer output file.

    Returns
    -------
    pd.DataFrame
        The loaded dataframe with missing "Modifications" replaced by "" and
        a "proforma" column added.
    """
    input_data_frame = pd.read_csv(input_csv, low_memory=False, sep="\t")
    # Assign the filled column back explicitly: an in-place fillna on a column
    # selection does not reliably update the parent frame (it has no effect
    # under pandas Copy-on-Write).
    input_data_frame["Modifications"] = input_data_frame["Modifications"].fillna("")
    input_data_frame["proforma"] = input_data_frame.apply(
        lambda row: aggregate_modification_column(row["Sequence"], row["Modifications"]),
        axis=1,
    )
    return input_data_frame


def _load_wombat(input_csv: str) -> pd.DataFrame:
    """
    Load a WOMBAT output file.

    Reads a comma-separated file, rewrites "protein_group" using the
    gene_name -> description table in ``io_parse_settings/mapper.csv`` (located
    next to this module), and copies "modified_peptide" into "proforma".

    Parameters
    ----------
    input_csv : str
        The path to the WOMBAT output file.

    Returns
    -------
    pd.DataFrame
        The loaded dataframe with "protein_group" remapped and a "proforma"
        column added.
    """
    input_data_frame = pd.read_csv(input_csv, low_memory=False, sep=",")
    mapper_path = os.path.join(os.path.dirname(__file__), "io_parse_settings/mapper.csv")
    mapper_df = pd.read_csv(mapper_path).set_index("gene_name")
    mapper = mapper_df["description"].to_dict()
    # Groups are ','-separated on input and ';'-separated on output; entries
    # without a mapping are kept as they are. str(x) tolerates non-string cells.
    input_data_frame["protein_group"] = input_data_frame["protein_group"].map(
        lambda x: ";".join(mapper.get(protein, protein) for protein in str(x).split(","))
    )
    input_data_frame["proforma"] = input_data_frame["modified_peptide"]
    return input_data_frame


def _load_custom(input_csv: str) -> pd.DataFrame:
    """
    Load a custom output file.

    Reads a tab-separated file and copies "Modified sequence" into "proforma".

    Parameters
    ----------
    input_csv : str
        The path to the custom output file.

    Returns
    -------
    pd.DataFrame
        The loaded dataframe with a "proforma" column added.
    """
    input_data_frame = pd.read_csv(input_csv, low_memory=False, sep="\t")
    input_data_frame["proforma"] = input_data_frame["Modified sequence"]
    return input_data_frame


def _load_peaks(input_csv: str) -> pd.DataFrame:
    """
    Load a PEAKS output file.

    Reads a comma-separated file; no columns are added or changed.

    Parameters
    ----------
    input_csv : str
        The path to the PEAKS output file.

    Returns
    -------
    pd.DataFrame
        The loaded dataframe.
    """
    input_data_frame = pd.read_csv(input_csv, low_memory=False, sep=",")
    return input_data_frame


# Registry of loaders, keyed by the input format name.
_LOAD_FUNCTIONS = {
    "Proteome Discoverer": _load_proteome_discoverer,
    "WOMBAT": _load_wombat,
    "Custom": _load_custom,
    "PEAKS": _load_peaks,
}