Source code for proteobench.io.parsing.parse_ion

"""
Module for parsing precursor ion data from various formats.
"""

import math
import os
import re
import warnings

import pandas as pd


def load_input_file(input_csv: str, input_format: str, input_csv_secondary: str = None) -> pd.DataFrame:
    """
    Read a search-engine result file into a dataframe using the loader registered for its format.

    Parameters
    ----------
    input_csv : str
        The path to the result file.
    input_format : str
        Name of the producing tool; must be a key of ``_LOAD_FUNCTIONS``
        (e.g., "MaxQuant", "AlphaPept", etc.).
    input_csv_secondary : str, optional
        The path to a second file. It is only forwarded to the loader when
        ``input_format`` is "AlphaDIA"; for every other format it is ignored.

    Returns
    -------
    pd.DataFrame
        The dataframe produced by the format-specific loader.

    Raises
    ------
    ValueError
        If ``input_format`` has no registered loader.
    """
    if input_format == "MaxQuant":
        # Emitted before the loader runs; the literal's whitespace is part of the message.
        warnings.warn(
            """
                WARNING: MaxQuant proforma parsing does not take into account fixed modifications\n
                because they are implicit. Only after providing the appropriate parameter file,\n
                fixed modifications will be added correctly.
                """
        )

    try:
        loader = _LOAD_FUNCTIONS[input_format]
    except KeyError as e:
        raise ValueError(f"Invalid input format: {input_format}") from e

    # Only the AlphaDIA loader accepts a second file.
    use_secondary = input_format == "AlphaDIA" and input_csv_secondary
    if use_secondary:
        return loader(input_csv, input_csv_secondary)
    return loader(input_csv)
def aggregate_modification_column(
    input_string_seq: str,
    input_string_modifications: str,
    special_locations: dict = {
        "Any N-term": 0,
        "Any C-term": -1,
        "Protein N-term": 0,
        "Protein C-term": -1,
    },
) -> str:
    """
    Insert bracketed modifications into a peptide sequence.

    ``input_string_modifications`` is a ``"; "``-separated list of entries of the
    form ``"<name> (<location>)"``. ``<location>`` is either a key of
    ``special_locations`` (terminal modification) or one residue letter followed
    by its 1-based position (e.g. ``"M5"``).

    Parameters
    ----------
    input_string_seq : str
        The input sequence string.
    input_string_modifications : str
        The modifications applied to the sequence.
    special_locations : dict, optional
        Maps terminal location labels to ``0`` (N-terminus) or ``-1`` (C-terminus).
        The default dictionary is only read, never mutated.

    Returns
    -------
    str
        The sequence with ``[name]`` after modified residues, ``[name]-`` in front
        for N-terminal and ``-[name]`` at the end for C-terminal modifications.

    Raises
    ------
    IndexError
        If an entry does not contain ``" ("``.
    ValueError
        If a residue location does not end in an integer position.
    """
    seq_len = len(input_string_seq)

    # Each item: (text to insert, insertion index, is C-terminal).
    insertions = []
    for entry in input_string_modifications.split("; "):
        if not entry:
            continue
        # Split on the LAST " (" so modification names that themselves contain
        # a parenthesised part (e.g. "Label (13C) (K5)") keep their full name.
        parts = entry.rsplit(" (", 1)
        mod_name = parts[0]
        mod_location = parts[1].rstrip(")")

        if mod_location in special_locations:
            if special_locations[mod_location] == -1:  # C-term
                insertions.append((f"-[{mod_name}]", seq_len, True))
            else:  # N-term
                insertions.append((f"[{mod_name}]-", special_locations[mod_location], False))
            continue

        # Drop the residue letter, keep the 1-based position.
        insertions.append((f"[{mod_name}]", int(mod_location[1:]), False))

    # Insert from the right so earlier indices stay valid. A C-terminal
    # modification shares its index with a modification on the last residue;
    # ranking it higher inserts it first, so the residue modification ends up
    # between the residue and the terminal one regardless of input order.
    insertions.sort(key=lambda item: (item[1], item[2]), reverse=True)

    for text, loc, _ in insertions:
        input_string_seq = input_string_seq[:loc] + text + input_string_seq[loc:]

    return input_string_seq
def aggregate_modification_sites_column(
    input_string_seq: str,
    input_string_modifications: str,
    input_string_sites: str,
) -> str:
    """
    Insert bracketed modifications into a sequence using a parallel list of sites.

    ``input_string_modifications`` and ``input_string_sites`` are ``";"``-separated
    and paired by position. Each modification is written as ``"<name>@<target>"``;
    only ``<name>`` is kept. A site of ``0`` is treated as N-terminal, ``-1`` as
    C-terminal, and any other value as the index at which ``[name]`` is inserted
    (i.e. after that many residues).

    Parameters
    ----------
    input_string_seq : str
        The input sequence string.
    input_string_modifications : str
        The modifications applied to the sequence. NaN or an empty string means
        "unmodified" and returns the sequence unchanged.
    input_string_sites : str
        The positions of the modifications. Non-string values are converted with
        ``str``; float-formatted integers such as ``"3.0"`` are accepted.

    Returns
    -------
    str
        The modified sequence string with modification sites.

    Raises
    ------
    ValueError
        If a site cannot be interpreted as an integer.
    """
    # In V1, mods sites column of unmodified peptides is NaN, in V2 it is empty string
    if isinstance(input_string_modifications, float) and math.isnan(input_string_modifications):
        return input_string_seq
    if not input_string_modifications:
        return input_string_seq

    seq_len = len(input_string_seq)
    mods_list = input_string_modifications.split(";")
    # int(float(...)) tolerates sites that went through a float column ("3.0").
    sites_list = [int(float(site)) for site in str(input_string_sites).split(";")]

    # Each item: (insertion index, terminal rank, text). The rank makes a
    # C-terminal modification sort above a modification on the last residue,
    # which shares the same insertion index.
    insertions = []
    for mod, site in zip(mods_list, sites_list):
        if not mod:
            continue
        mod_name = mod.split("@")[0]
        if site == 0:
            insertions.append((0, 0, f"[{mod_name}]-"))
        elif site == -1:
            # Append after the whole sequence; slicing with -1 would place the
            # modification in front of the last residue.
            insertions.append((seq_len, 1, f"-[{mod_name}]"))
        else:
            insertions.append((site, 0, f"[{mod_name}]"))

    # Right-to-left insertion keeps the remaining indices valid.
    insertions.sort(key=lambda item: item[:2], reverse=True)
    for loc, _, text in insertions:
        input_string_seq = input_string_seq[:loc] + text + input_string_seq[loc:]

    return input_string_seq
def count_chars(input_string: str, isalpha: bool = True, isupper: bool = True) -> int:
    """
    Count the characters of a string that satisfy every enabled criterion.

    Parameters
    ----------
    input_string : str
        The input string.
    isalpha : bool, optional
        Require ``str.isalpha()`` for a character to be counted. Defaults to True.
    isupper : bool, optional
        Require ``str.isupper()`` for a character to be counted. Defaults to True.

    Returns
    -------
    int
        The number of matching characters. With both criteria disabled no
        filter applies and every character is counted.
    """
    # A disabled criterion is vacuously satisfied, so the function always
    # returns an int (previously both flags False fell through to None).
    return sum(
        1
        for char in input_string
        if (not isalpha or char.isalpha()) and (not isupper or char.isupper())
    )
def get_stripped_seq(input_string: str, isalpha: bool = True, isupper: bool = True) -> str:
    """
    Keep only the characters of a string that satisfy every enabled criterion.

    Parameters
    ----------
    input_string : str
        The input string.
    isalpha : bool, optional
        Require ``str.isalpha()`` for a character to be kept. Defaults to True.
    isupper : bool, optional
        Require ``str.isupper()`` for a character to be kept. Defaults to True.

    Returns
    -------
    str
        The retained characters in their original order. With both criteria
        disabled no filter applies and the input is returned unchanged.
    """
    # A disabled criterion is vacuously satisfied, so the function always
    # returns a str (previously both flags False fell through to None).
    return "".join(
        char
        for char in input_string
        if (not isalpha or char.isalpha()) and (not isupper or char.isupper())
    )
def match_brackets(
    input_string: str,
    pattern: str = r"\[([^]]+)\]",
    isalpha: bool = True,
    isupper: bool = True,
) -> tuple:
    r"""
    Find every occurrence of ``pattern`` and report where it sits in the sequence.

    Parameters
    ----------
    input_string : str
        The input string.
    pattern : str, optional
        Regular expression locating a modification. Defaults to ``\[([^]]+)\]``.
    isalpha : bool, optional
        Forwarded to ``count_chars`` when computing positions. Defaults to True.
    isupper : bool, optional
        Forwarded to ``count_chars`` when computing positions. Defaults to True.

    Returns
    -------
    tuple
        ``(mods, positions)``, both single-use generators. ``mods`` yields the
        full matched text of each hit (group 0); ``positions`` yields, for the
        same hit, the ``count_chars`` result of the text preceding it.
    """
    # Materialise the hits once so both generators walk the same matches.
    hits = list(re.finditer(pattern, input_string))

    mods = (hit.group() for hit in hits)
    positions = (
        count_chars(input_string[: hit.start()], isalpha=isalpha, isupper=isupper)
        for hit in hits
    )
    return mods, positions
def to_lowercase(match) -> str:
    """
    Return the text of a regex match in lowercase.

    Intended for use as the replacement callable of ``re.sub``.

    Parameters
    ----------
    match : re.Match
        The match object from a regular expression.

    Returns
    -------
    str
        The whole matched text (group 0), lowercased.
    """
    whole_match = match[0]
    return whole_match.lower()
def get_proforma_bracketed(
    input_string: str,
    before_aa: bool = True,
    isalpha: bool = True,
    isupper: bool = True,
    pattern: str = r"\[([^]]+)\]",
    modification_dict: dict = {
        "+57.0215": "Carbamidomethyl",
        "+15.9949": "Oxidation",
        "-17.026548": "Gln->pyro-Glu",
        "-18.010565": "Glu->pyro-Glu",
        "+42": "Acetyl",
    },
) -> str:
    r"""
    Generate a proforma string with bracketed modifications.

    Every match of ``pattern`` is lowercased, located with ``match_brackets`` and
    translated through ``modification_dict``. The residues returned by
    ``get_stripped_seq`` are then re-emitted with the translated modifications
    inserted at their positions.

    Parameters
    ----------
    input_string : str
        The input sequence string.
    before_aa : bool, optional
        If True, each residue is written before the modification recorded at
        its index; if False, the modification is written first. Defaults to True.
    isalpha : bool, optional
        Forwarded to ``match_brackets`` and ``get_stripped_seq``. Defaults to True.
    isupper : bool, optional
        Forwarded to ``match_brackets`` and ``get_stripped_seq``. Defaults to True.
    pattern : str, optional
        Regular expression locating a modification. Defaults to ``\[([^]]+)\]``.
    modification_dict : dict, optional
        Maps matched text to the name written in the output. Lookup uses the
        whole (lowercased) match as yielded by ``match_brackets``; unmatched
        text is written as is. The default dictionary is only read.

    Returns
    -------
    str
        The proforma sequence with bracketed modifications. When several
        modifications share one position only the last one is kept.
    """
    # Lowercase the modification text so it is not mistaken for residues.
    input_string = re.sub(pattern, to_lowercase, input_string)
    modifications, positions = match_brackets(input_string, pattern=pattern, isalpha=isalpha, isupper=isupper)

    # NOTE(review): the default keys carry no brackets while the default
    # pattern's full match does - confirm callers pass a matching dictionary.
    translated = [modification_dict.get(mod, mod) for mod in modifications]
    pos_mod_dict = dict(zip(positions, translated))

    stripped_seq = get_stripped_seq(input_string, isalpha=isalpha, isupper=isupper)
    n_residues = len(stripped_seq)

    pieces = []
    for idx, aa in enumerate(stripped_seq):
        if before_aa:
            pieces.append(aa)
        if idx in pos_mod_dict:
            if idx == 0:
                pieces.append(f"[{pos_mod_dict[idx]}]-")
            else:
                pieces.append(f"[{pos_mod_dict[idx]}]")
        if not before_aa:
            pieces.append(aa)

    # A modification located after the final residue has position
    # len(stripped_seq), which enumerate() never reaches; it used to be dropped
    # silently. Emit it in the C-terminal form the original branch intended.
    # NOTE(review): with before_aa=False this position cannot be told apart
    # from a modification on the last residue - confirm the desired notation.
    if n_residues > 0 and n_residues in pos_mod_dict:
        pieces.append(f"-[{pos_mod_dict[n_residues]}]")

    return "".join(pieces)
def _load_gene_mapper() -> dict:
    """
    Read ``io_parse_settings/mapper.csv`` (next to this module) into a dictionary.

    Returns
    -------
    dict
        Maps each ``gene_name`` entry to its ``description`` entry.
    """
    mapper_path = os.path.join(os.path.dirname(__file__), "io_parse_settings/mapper.csv")
    mapper_df = pd.read_csv(mapper_path).set_index("gene_name")
    return mapper_df["description"].to_dict()


def _map_protein_ids(value, mapper: dict, sep: str = ";", joiner: str = ";"):
    """
    Translate each identifier of a delimited string through ``mapper``.

    Parameters
    ----------
    value : object
        The cell value. Anything that is not a ``str`` (e.g. NaN) is returned
        unchanged instead of raising.
    mapper : dict
        Identifier translation table; unknown identifiers are kept as is.
    sep : str, optional
        Delimiter used to split ``value``. Defaults to ";".
    joiner : str, optional
        Delimiter used to join the translated identifiers. Defaults to ";".

    Returns
    -------
    object
        The translated string, or ``value`` itself when it is not a string.
    """
    if not isinstance(value, str):
        return value
    return joiner.join(mapper.get(protein, protein) for protein in value.split(sep))


def _load_maxquant(input_csv: str) -> pd.DataFrame:
    """
    Load a MaxQuant output file.

    Parameters
    ----------
    input_csv : str
        The path to the MaxQuant output file.

    Returns
    -------
    pd.DataFrame
        The loaded dataframe.
    """
    data = pd.read_csv(input_csv, sep="\t", low_memory=False)
    has_proteins = "Proteins" in data.columns

    # If Proteins is NaN for some entries, fill with "Leading proteins" column if it exists
    # (TODO: Why are some entries Nan and then leading proteins not?)
    if has_proteins and "Leading proteins" in data.columns:
        data["Proteins"] = data["Proteins"].fillna(data["Leading proteins"])

    # If NaN remain, remove those rows because they cannot be used for benchmarking:
    if has_proteins:
        data = data.dropna(subset=["Proteins"])

    # Check if Proteins column contains species information.
    # Not sure if this is best way to check for species information, problems is
    # that we do not want to hardcode species names.
    if has_proteins and not data["Proteins"].str.contains("|", regex=False, na=False).any():
        # Map gene names to descriptions using the mapper.csv file
        mapper = _load_gene_mapper()
        data["Proteins"] = data["Proteins"].map(lambda x: _map_protein_ids(x, mapper))

    return data


def _load_alphapept(input_csv: str) -> pd.DataFrame:
    """
    Load a AlphaPept output file.

    Parameters
    ----------
    input_csv : str
        The path to the AlphaPept output file.

    Returns
    -------
    pd.DataFrame
        The loaded dataframe; the ``charge`` column is read as ``int``.
    """
    return pd.read_csv(input_csv, low_memory=False, dtype={"charge": int})


def _load_sage(input_csv: str) -> pd.DataFrame:
    """
    Load a Sage output file.

    Parameters
    ----------
    input_csv : str
        The path to the Sage output file.

    Returns
    -------
    pd.DataFrame
        The loaded dataframe.
    """
    return pd.read_csv(input_csv, sep="\t", low_memory=False)


def _load_fragpipe(input_csv: str) -> pd.DataFrame:
    """
    Load a FragPipe output file.

    Parameters
    ----------
    input_csv : str
        The path to the FragPipe output file.

    Returns
    -------
    pd.DataFrame
        The loaded dataframe; ``Protein`` is extended with a comma followed by
        ``Mapped Proteins`` (empty when that cell is missing).
    """
    input_data_frame = pd.read_csv(input_csv, low_memory=False, sep="\t")
    input_data_frame["Protein"] = input_data_frame["Protein"] + "," + input_data_frame["Mapped Proteins"].fillna("")
    return input_data_frame


def _load_wombat(input_csv: str) -> pd.DataFrame:
    """
    Load a WOMBAT output file.

    Parameters
    ----------
    input_csv : str
        The path to the WOMBAT output file.

    Returns
    -------
    pd.DataFrame
        The loaded dataframe with ``protein_group`` translated through the gene
        mapper (split on "," and re-joined with ";") and a ``proforma`` column
        copied from ``modified_peptide``.
    """
    input_data_frame = pd.read_csv(input_csv, low_memory=False, sep=",")
    mapper = _load_gene_mapper()
    input_data_frame["protein_group"] = input_data_frame["protein_group"].map(
        lambda x: _map_protein_ids(x, mapper, sep=",")
    )
    input_data_frame["proforma"] = input_data_frame["modified_peptide"]
    return input_data_frame


def _load_prolinestudio_msangel(input_csv: str) -> pd.DataFrame:
    """
    Load a MSAngel/ProlineStudio output file.

    Parameters
    ----------
    input_csv : str
        The path to the MSAngel/ProlineStudio output file (Excel workbook with a
        "Quantified peptide ions" sheet).

    Returns
    -------
    pd.DataFrame
        One row per (``proforma``, ``master_quant_peptide_ion_charge``) with the
        accessions of all merged rows joined in ``proteins``.
    """
    input_data_frame = pd.read_excel(
        input_csv, sheet_name="Quantified peptide ions", header=0, index_col=None, engine="calamine"
    )
    input_data_frame["modifications"] = input_data_frame["modifications"].fillna("")
    input_data_frame["subsets_accessions"] = input_data_frame["subsets_accessions"].fillna("")
    input_data_frame["proforma"] = input_data_frame.apply(
        lambda x: aggregate_modification_column(x.sequence, x.modifications),
        axis=1,
    )

    # combine the sameset and subset accessions:
    # first combine the accessions:
    input_data_frame["proteins"] = input_data_frame["samesets_accessions"] + input_data_frame[
        "subsets_accessions"
    ].apply(lambda x: "; " + x if len(x) > 0 else "")
    # then sort the accessions:
    input_data_frame["proteins"] = input_data_frame["proteins"].apply(lambda x: "; ".join(sorted(x.split("; "))))

    # drop the duplicates:
    input_data_frame.drop_duplicates(subset=["proforma", "master_quant_peptide_ion_charge", "proteins"], inplace=True)

    # combine the duplicated precursor ions because proline reports one row per precursor + accession:
    group_cols = ["proforma", "master_quant_peptide_ion_charge"]
    agg_funcs = {col: "first" for col in input_data_frame.columns if col not in group_cols + ["proteins"]}
    input_data_frame = (
        input_data_frame.groupby(group_cols).agg({"proteins": lambda x: "; ".join(x), **agg_funcs}).reset_index()
    )
    return input_data_frame


def _load_i2masschroq(input_csv: str) -> pd.DataFrame:
    """
    Load a i2MassChroQ output file.

    Parameters
    ----------
    input_csv : str
        The path to the i2MassChroQ output file.

    Returns
    -------
    pd.DataFrame
        The loaded dataframe with ``proforma`` copied from ``ProForma``.
    """
    input_data_frame = pd.read_csv(input_csv, low_memory=False, sep="\t")
    input_data_frame["proforma"] = input_data_frame["ProForma"]
    return input_data_frame


def _load_custom(input_csv: str) -> pd.DataFrame:
    """
    Load a custom output file.

    Parameters
    ----------
    input_csv : str
        The path to the custom output file.

    Returns
    -------
    pd.DataFrame
        The loaded dataframe with ``proforma`` copied from ``Modified sequence``.
    """
    input_data_frame = pd.read_csv(input_csv, low_memory=False, sep="\t")
    input_data_frame["proforma"] = input_data_frame["Modified sequence"]
    return input_data_frame


def _load_diann(input_csv: str) -> pd.DataFrame:
    """
    Load a DIA-NN output file.

    Parameters
    ----------
    input_csv : str
        The path to the DIA-NN output file, or an object exposing the file name
        as ``.name``. Names ending in ".parquet" are read as parquet, everything
        else as tab-separated text.

    Returns
    -------
    pd.DataFrame
        The loaded dataframe with ``Protein.Ids`` translated through the gene mapper.
    """
    if isinstance(input_csv, str):
        filename = input_csv
    else:  # streamlit OpenedFile object
        filename = input_csv.name

    if filename.endswith(".parquet"):
        input_data_frame = pd.read_parquet(input_csv)
    else:
        input_data_frame = pd.read_csv(input_csv, low_memory=False, sep="\t")

    # Map gene names to descriptions
    mapper = _load_gene_mapper()
    input_data_frame["Protein.Ids"] = input_data_frame["Protein.Ids"].map(lambda x: _map_protein_ids(x, mapper))
    return input_data_frame


def _merge_alphadia_files(
    input_csv: str, input_csv_secondary: str, file1_sample: pd.DataFrame, file2_sample: pd.DataFrame
) -> pd.DataFrame:
    """
    Merge two AlphaDIA files (precursor.matrix.tsv and precursors.tsv).

    The file whose sample contains every required metadata column is treated as
    the long format; the other one as the matrix (wide format). If both samples
    qualify, the second file is taken as the long format.

    Parameters
    ----------
    input_csv : str
        The path to the first AlphaDIA output file.
    input_csv_secondary : str
        The path to the second AlphaDIA output file.
    file1_sample : pd.DataFrame
        A sample (first few rows) of the first file for column detection.
    file2_sample : pd.DataFrame
        A sample (first few rows) of the second file for column detection.

    Returns
    -------
    pd.DataFrame
        The matrix left-joined with the precursor metadata on
        ``mod_seq_charge_hash``, with fully duplicated rows removed.

    Raises
    ------
    ValueError
        If the files cannot be identified or merged correctly.
    """
    # Required columns for the merge
    required_merge_columns = [
        "genes",
        "decoy",
        "mods",
        "mod_sites",
        "sequence",
        "charge",
        "mod_seq_charge_hash",
    ]

    # Detect which file is the matrix (wide format) and which is long format
    file1_cols = set(file1_sample.columns)
    file2_cols = set(file2_sample.columns)
    file1_has_required = all(col in file1_cols for col in required_merge_columns)
    file2_has_required = all(col in file2_cols for col in required_merge_columns)

    # The hash is the join key; read it as text in both files.
    read_kwargs = {"low_memory": False, "sep": "\t", "dtype": {"mod_seq_charge_hash": str}, "header": 0}

    if file1_has_required and not file2_has_required:
        # file1 is long format (precursors.tsv), file2 is matrix
        precursors_long = pd.read_csv(input_csv, **read_kwargs)
        precursor_matrix = pd.read_csv(input_csv_secondary, **read_kwargs)
    elif file2_has_required:
        # file2 is long format (precursors.tsv), file1 is matrix
        precursor_matrix = pd.read_csv(input_csv, **read_kwargs)
        precursors_long = pd.read_csv(input_csv_secondary, **read_kwargs)
    else:
        # Neither file has the required columns
        raise ValueError(
            f"Cannot identify the correct AlphaDIA files. Neither file contains all required columns: "
            f"{', '.join(required_merge_columns)}. "
            f"File 1 columns: {', '.join(sorted(file1_cols)[:10])}... "
            f"File 2 columns: {', '.join(sorted(file2_cols)[:10])}... "
            f"Please ensure you are uploading both precursor.matrix.tsv and precursors.tsv files from AlphaDIA output."
        )

    # Select only the columns that exist in precursors_long
    available_merge_columns = [col for col in required_merge_columns if col in precursors_long.columns]

    if not available_merge_columns or "mod_seq_charge_hash" not in available_merge_columns:
        raise ValueError(
            f"Cannot merge AlphaDIA files. The long format file is missing required columns. "
            f"Required: {', '.join(required_merge_columns)}. "
            f"Available in long format: {', '.join(available_merge_columns)}. "
            f"All columns in long format: {', '.join(list(precursors_long.columns)[:20])}. "
            f"Please ensure you are uploading the correct precursors.tsv file."
        )

    # Merge the matrix with precursor info
    merged_df = pd.merge(
        precursor_matrix, precursors_long[available_merge_columns], on="mod_seq_charge_hash", how="left"
    )

    # Remove duplicates that might result from the merge
    merged_df.drop_duplicates(inplace=True)

    return merged_df


def _load_alphadia(input_csv: str, input_csv_secondary: str = None) -> pd.DataFrame:
    """
    Load AlphaDIA output files.

    Parameters
    ----------
    input_csv : str
        The path to one of the AlphaDIA output files.
    input_csv_secondary : str, optional
        The path to the second AlphaDIA output file. If provided, the system will
        automatically detect which file is the precursor.matrix.tsv and which is the
        precursors.tsv (long format), then merge them.

    Returns
    -------
    pd.DataFrame
        The loaded dataframe with a mapped ``genes`` column and a ``proforma`` column.
    """
    if input_csv_secondary:
        # Read samples from both files to detect their structure
        file1_sample = pd.read_csv(
            input_csv, low_memory=False, sep="\t", dtype={"mod_seq_charge_hash": str}, nrows=5, header=0
        )
        file2_sample = pd.read_csv(
            input_csv_secondary, low_memory=False, sep="\t", dtype={"mod_seq_charge_hash": str}, nrows=5, header=0
        )

        # Reset file pointers to beginning (only if they're file objects, not path strings)
        if hasattr(input_csv, "seek"):
            input_csv.seek(0)
        if hasattr(input_csv_secondary, "seek"):
            input_csv_secondary.seek(0)

        # Merge the two files using the helper function
        input_data_frame = _merge_alphadia_files(input_csv, input_csv_secondary, file1_sample, file2_sample)
    elif isinstance(input_csv, str) and input_csv.lower().endswith(".parquet"):
        # Single file: check file extension first for parquet
        input_data_frame = pd.read_parquet(input_csv)
    else:
        try:
            input_data_frame = pd.read_csv(
                input_csv,
                low_memory=False,
                sep="\t",
                dtype={"mod_seq_charge_hash": str, "precursor.mod_seq_charge_hash": str},
                header=0,
            )
        except UnicodeDecodeError:
            # Parquet input, possible from AlphaDIA v2
            input_data_frame = pd.read_parquet(input_csv)

    # Map gene names to descriptions
    mapper = _load_gene_mapper()

    # Absence of "pg.genes" is taken to mean AlphaDIA v1 column naming.
    is_v1 = "pg.genes" not in input_data_frame.columns
    gene_column = "genes" if is_v1 else "pg.genes"
    input_data_frame["genes"] = input_data_frame[gene_column].map(lambda x: _map_protein_ids(x, mapper))

    if not is_v1:
        # AlphaDIA v2: Rename columns first
        input_data_frame.rename(
            columns={
                "precursor.sequence": "sequence",
                "precursor.mods": "mods",
                "precursor.mod_sites": "mod_sites",
                "precursor.charge": "charge",
                "precursor.intensity": "Intensity",
            },
            inplace=True,
        )
        input_data_frame = input_data_frame.dropna(subset=["Intensity"])

        # If data is in long format (has raw.name column), convert to wide format
        if "raw.name" in input_data_frame.columns:
            # Define columns to keep as identifiers (not pivot)
            id_columns = ["sequence", "mods", "mod_sites", "charge", "genes"]

            # Pivot from long to wide format
            input_data_frame = input_data_frame.pivot_table(
                index=id_columns,
                columns="raw.name",
                values="Intensity",
                aggfunc="first",  # Use first value if duplicates exist
            ).reset_index()

            # Flatten column names after pivot
            input_data_frame.columns.name = None

    # For v1 or v2 TSV (already in wide format), generate proforma notation
    input_data_frame["proforma"] = input_data_frame.apply(
        lambda x: aggregate_modification_sites_column(x.sequence, x.mods, x.mod_sites),
        axis=1,
    )

    return input_data_frame


def _load_fragpipe_diann_quant(input_csv: str) -> pd.DataFrame:
    """
    Load a FragPipe (DIA-NN) output file.

    Parameters
    ----------
    input_csv : str
        The path to the FragPipe (DIA-NN) output file.

    Returns
    -------
    pd.DataFrame
        The loaded dataframe. ``Protein.Ids`` is replaced by ``All Mapped Proteins``
        when that column exists, then translated through the gene mapper.
    """
    input_data_frame = pd.read_csv(input_csv, low_memory=False, sep="\t")
    if "All Mapped Proteins" in input_data_frame.columns:
        input_data_frame["Protein.Ids"] = input_data_frame["All Mapped Proteins"]

    mapper = _load_gene_mapper()

    # Map Protein.Ids (gene names) to species-suffixed descriptions (e.g. "sp|Q86U42|PABP2_HUMAN").
    input_data_frame["Protein.Ids"] = input_data_frame["Protein.Ids"].map(lambda x: _map_protein_ids(x, mapper))

    return input_data_frame


def _load_spectronaut(input_csv: str) -> pd.DataFrame:
    """
    Load a Spectronaut output file.

    Parameters
    ----------
    input_csv : str
        The path to the Spectronaut output file.

    Returns
    -------
    pd.DataFrame
        The loaded dataframe with underscores stripped from both ends of
        ``FG.LabeledSequence`` and ``PG.ProteinGroups`` translated through the
        gene mapper.
    """
    input_data_frame = pd.read_csv(input_csv, low_memory=False, sep="\t")

    # A non-numeric FG.Quantity column triggers a re-read with "," as decimal mark.
    if input_data_frame["FG.Quantity"].dtype == object:
        # Path-like inputs have no seek method; only rewind real file objects.
        if hasattr(input_csv, "seek"):
            input_csv.seek(0)
        input_data_frame = pd.read_csv(input_csv, low_memory=False, sep="\t", decimal=",")

    input_data_frame["FG.LabeledSequence"] = input_data_frame["FG.LabeledSequence"].str.strip("_")

    mapper = _load_gene_mapper()
    input_data_frame["PG.ProteinGroups"] = input_data_frame["PG.ProteinGroups"].map(
        lambda x: _map_protein_ids(x, mapper)
    )

    return input_data_frame


def _load_metamorpheus(input_csv: str) -> pd.DataFrame:
    """
    Load a MetaMorpheus output file (FlashLFQ: AllQuantifiedPeaks.tsv).

    Parameters
    ----------
    input_csv : str
        The path to the MetaMorpheus output file.

    Returns
    -------
    pd.DataFrame
        The rows whose ``Full Sequences Mapped`` equals 1, with a ``Proteins``
        column derived from ``Protein Group`` through the gene mapper.
    """
    input_data_frame = pd.read_csv(input_csv, low_memory=False, sep="\t")
    mapper = _load_gene_mapper()
    input_data_frame["Proteins"] = input_data_frame["Protein Group"].map(lambda x: _map_protein_ids(x, mapper))

    # TODO: discuss how to handle multiple mapped precursors
    input_data_frame = input_data_frame[input_data_frame["Full Sequences Mapped"] == 1]
    return input_data_frame


def _load_msaid(input_csv: str) -> pd.DataFrame:
    """
    Load a MSAID output file.

    Parameters
    ----------
    input_csv : str
        The path to the MSAID output file.

    Returns
    -------
    pd.DataFrame
        The loaded dataframe.
    """
    return pd.read_csv(input_csv, low_memory=False, sep="\t")


def _load_peaks(input_csv: str) -> pd.DataFrame:
    """
    Load a PEAKS output file.

    Parameters
    ----------
    input_csv : str
        The path to the PEAKS output file.

    Returns
    -------
    pd.DataFrame
        The loaded dataframe with ".raw"/".mzML" removed from column names where
        the suffix is followed by whitespace.
    """
    df = pd.read_csv(input_csv, low_memory=False, sep=",")
    # Strip .raw or .mzML suffixes that PEAKS may add to sample names (e.g. "Sample.raw Normalized Area")
    df.columns = [re.sub(r"\.(raw|mzML)(\s)", r"\2", c) for c in df.columns]
    return df


def _load_quantms(input_csv: str) -> pd.DataFrame:
    """
    Load a QuantMS output file.

    Parameters
    ----------
    input_csv : str
        The path to the QuantMS output file.

    Returns
    -------
    pd.DataFrame
        The loaded dataframe with ``proforma`` and ``Sequence`` both set to
        ``PeptideSequence`` with every parenthesised group removed.
    """
    input_data_frame = pd.read_csv(input_csv, low_memory=False)

    # Evaluate the regex once; both derived columns hold the same text.
    without_parentheses = input_data_frame["PeptideSequence"].str.replace(r"\(([^)]+)\)", r"", regex=True)
    input_data_frame = input_data_frame.assign(proforma=without_parentheses)
    input_data_frame["Sequence"] = without_parentheses.copy()
    return input_data_frame


# Dispatch table: input format name -> loader.
_LOAD_FUNCTIONS = {
    "MaxQuant": _load_maxquant,
    "AlphaPept": _load_alphapept,
    "Sage": _load_sage,
    "FragPipe": _load_fragpipe,
    "WOMBAT": _load_wombat,
    "ProlineStudio": _load_prolinestudio_msangel,
    "MSAngel": _load_prolinestudio_msangel,
    "i2MassChroQ": _load_i2masschroq,
    "Custom": _load_custom,
    "DIA-NN": _load_diann,
    "AlphaDIA": _load_alphadia,
    "FragPipe (DIA-NN quant)": _load_fragpipe_diann_quant,
    "Spectronaut": _load_spectronaut,
    "MSAID": _load_msaid,
    "PEAKS": _load_peaks,
    "quantms": _load_quantms,
    "MetaMorpheus": _load_metamorpheus,
}