Source code for proteobench.io.params.maxquant
"""Functionality to parse Maxqunt mqpar.xml parameter files."""
from __future__ import annotations
import collections
import logging
import os
import xml.etree.ElementTree as ET
from pathlib import Path
import numpy as np
import pandas as pd
from proteobench.io.params import ProteoBenchParameters
logger = logging.getLogger()
[docs]
def extend_tuple(t, target_length: int):
    """
    Pad a tuple with trailing ``None`` values up to a target length.

    Parameters
    ----------
    t : tuple
        The tuple to extend.
    target_length : int
        The length the returned tuple must have.

    Returns
    -------
    tuple
        The entries of ``t`` followed by as many ``None`` values as are
        needed to reach ``target_length``.

    Raises
    ------
    TypeError
        If ``t`` is not a tuple.
    ValueError
        If ``t`` is longer than ``target_length``.
    """
    if not isinstance(t, tuple):
        raise TypeError(f"Wrong type provided. Expected tuple, got {type(t)} : {t!r}")
    missing = target_length - len(t)
    if missing < 0:
        # Truncating would silently drop index levels, so refuse instead.
        raise ValueError(f"Tuple is too long (got {len(t)}, expected {target_length}): {t!r}")
    return t + (None,) * missing
[docs]
def extend_tuples_with_none(list_of_tuples: list[tuple], target_length: int):
    """
    Pad every tuple in a list with ``None`` values up to a common length.

    Parameters
    ----------
    list_of_tuples : list of tuple
        The tuples to extend.
    target_length : int
        The length each returned tuple must have.

    Returns
    -------
    list of tuple
        A new list holding the extended tuples, in the input order.
    """
    # Validation (type and maximum length) is delegated to extend_tuple.
    return [extend_tuple(entry, target_length) for entry in list_of_tuples]
[docs]
def add_record(data: dict, tag: str, record) -> dict:
    """
    Store a record under a tag, collecting repeated tags into a list.

    Parameters
    ----------
    data : dict
        The dictionary to update; it is modified in place.
    tag : str
        The key to store the record under.
    record : any
        The value to store.

    Returns
    -------
    dict
        The same ``data`` object that was passed in.
    """
    if tag not in data:
        # First occurrence: store the bare value.
        data[tag] = record
        return data

    existing = data[tag]
    if isinstance(existing, list):
        existing.append(record)
    else:
        # Second occurrence: promote the single value to a list.
        data[tag] = [existing, record]
    return data
[docs]
def read_xml_record(element: ET.Element) -> dict:
    """
    Convert an XML element and its descendants into nested dictionaries.

    Attributes of ``element`` become top-level entries. Each child is stored
    under its tag: a child with more than one sub-element becomes a list of
    single-key dictionaries (one per sub-element); any other child is stored
    as its stripped text if it has some, otherwise as the result of reading
    it recursively.

    Parameters
    ----------
    element : xml.etree.ElementTree.Element
        The XML element to read.

    Returns
    -------
    dict
        The nested dictionary structure of the XML element, or ``None`` when
        the element yields no entries at all.
    """

    def text_or_record(node):
        # Non-blank text wins; otherwise descend into the node.
        if node.text and node.text.strip():
            return node.text.strip()
        return read_xml_record(node)

    parsed = dict(element.attrib)
    for node in element:
        if len(node) > 1 and node.tag:
            # Several sub-elements: keep them as a list, one entry per item.
            # This assignment replaces any earlier entry stored under the same tag.
            parsed[node.tag] = [{item.tag: text_or_record(item)} for item in node]
        else:
            parsed = add_record(parsed, node.tag, text_or_record(node))
    # empty strings and None are normalized to None
    return parsed or None
[docs]
def read_file(file: str) -> dict:
    """
    Parse a MaxQuant XML file into a nested dictionary.

    Parameters
    ----------
    file : str
        The path to the XML file.

    Returns
    -------
    dict
        The content of the file's root element as returned by
        ``read_xml_record``.
    """
    return read_xml_record(ET.parse(file).getroot())
[docs]
def flatten_dict_of_dicts(d: dict, parent_key: tuple = ()) -> list:
    """
    Flatten a nested dictionary into (key-path, value) pairs.

    The key paths are tuples, suitable for building a `pandas.MultiIndex`.

    Parameters
    ----------
    d : dict
        Nested dictionary for which all keys are flattened to tuples.
    parent_key : tuple, optional
        Key path of the enclosing level (used for recursion). Any falsy value
        (the default ``()``, or ``""``) means "top level".

    Returns
    -------
    list of tuple
        Pairs of the form ``((outer_key, ..., inner_key), value)`` in
        traversal order. Paths can repeat when a list holds several entries
        with the same key.

    Raises
    ------
    ValueError
        If a list contains an item that is neither a mapping nor a string.
    """
    flattened = []
    for key, value in d.items():
        path = parent_key + (key,) if parent_key else (key,)
        if isinstance(value, collections.abc.MutableMapping):
            flattened.extend(flatten_dict_of_dicts(value, parent_key=path))
        elif isinstance(value, list):
            for item in value:
                if isinstance(item, collections.abc.MutableMapping):
                    # List entries share the path of the list itself.
                    flattened.extend(flatten_dict_of_dicts(item, parent_key=path))
                elif isinstance(item, str):
                    flattened.append((path, item))
                else:
                    # `!r` is the repr conversion; `:r` is an invalid format spec.
                    raise ValueError(f"Unknown item: {item!r}")
        else:
            flattened.append((path, value))
    return flattened
[docs]
def build_Series_from_records(records, index_length=4):
    """
    Turn nested records into a pandas Series with a MultiIndex.

    Parameters
    ----------
    records : dict
        Nested dictionary of records; it is flattened with
        ``flatten_dict_of_dicts``.
    index_length : int, optional
        Number of index levels; shorter key paths are padded with ``None``.
        By default 4.

    Returns
    -------
    pandas.Series
        One entry per flattened record, indexed by its padded key path.
    """
    flat = flatten_dict_of_dicts(records)
    index_tuples = [extend_tuple(path, index_length) for path, _ in flat]
    idx = pd.MultiIndex.from_tuples(index_tuples)
    values = [value for _, value in flat]
    return pd.Series(values, index=idx)
[docs]
def _version_key(version) -> tuple:
    """Convert a dotted version string such as "2.1.3.0" into a tuple of ints."""
    # Non-numeric components are skipped so an unexpected suffix cannot raise here.
    return tuple(int(part) for part in str(version).split(".") if part.isdigit())


def _join_modifications(mods):
    """Return modifications as one comma-separated string; None and plain strings pass through."""
    if mods is None or isinstance(mods, str):
        return mods
    return ",".join(mods)


def extract_params(
    fname, ms2frac="FTMS", json_file=os.path.join(os.path.dirname(__file__), "json/Quant/quant_lfq_DDA_ion.json")
) -> ProteoBenchParameters:
    """
    Extract parameters from a MaxQuant XML file.

    Parameters
    ----------
    fname : str
        The path to the XML file.
    ms2frac : str, optional
        The MS2 fragmentation method, by default "FTMS". Only entries of
        ``msmsParamsArray`` whose ``Name`` equals this value are kept.
    json_file : str, optional
        Path passed as ``filename`` to ``ProteoBenchParameters``. By default
        ``json/Quant/quant_lfq_DDA_ion.json`` next to this module.

    Returns
    -------
    ProteoBenchParameters
        The extracted parameters.
    """
    params = ProteoBenchParameters(filename=json_file)

    raw = read_file(fname)
    # select ms2 fragmentation method specified by parameter
    # MaxQuant does this to our knowledge based on the binary rawfile metadata
    raw["msmsParamsArray"] = [entry for entry in raw["msmsParamsArray"] if entry["msmsParams"]["Name"] == ms2frac]
    record = build_Series_from_records(raw, 4).sort_index()

    version = record.loc["maxQuantVersion"].squeeze()
    params.software_name = "MaxQuant"
    params.search_engine = "Andromeda"
    params.software_version = version
    params.ident_fdr_psm = float(record.loc["peptideFdr"].squeeze())
    params.ident_fdr_peptide = None
    params.ident_fdr_protein = float(record.loc["proteinFdr"].squeeze())
    params.enable_match_between_runs = record.loc["matchBetweenRuns"].squeeze().lower() == "true"

    _precursor_mass_tolerance = record.loc[
        pd.IndexSlice["parameterGroups", "parameterGroup", "mainSearchTol", :]
    ].squeeze()
    _precursor_mass_tolerance = f"{_precursor_mass_tolerance} ppm"
    params.precursor_mass_tolerance = "[-" + _precursor_mass_tolerance + ", " + _precursor_mass_tolerance + "]"

    # ! differences between version >1.6 and <=1.5
    fragment_mass_tolerance = record.loc[pd.IndexSlice["msmsParamsArray", "msmsParams", "MatchTolerance", :]].squeeze()
    # The flag is read from XML as text; bool("False") would be True, so compare
    # the text the same way as for matchBetweenRuns above.
    ppm_flag = record.loc[pd.IndexSlice["msmsParamsArray", "msmsParams", "MatchToleranceInPpm", :]].squeeze()
    in_ppm = ppm_flag.lower() == "true"
    if in_ppm:
        fragment_mass_tolerance = f"{fragment_mass_tolerance} ppm"
    # NOTE(review): when the flag is false the value is reported without a unit — confirm the intended unit.
    fragment_mass_tolerance = f"[-{fragment_mass_tolerance}, {fragment_mass_tolerance}]"
    params.fragment_mass_tolerance = fragment_mass_tolerance

    params.enzyme = record.loc[("parameterGroups", "parameterGroup", "enzymes", "string")].squeeze()
    enzyme_mode = int(record.loc[("parameterGroups", "parameterGroup", "enzymeMode")].squeeze())
    # enzyme mode 0: Fully specific
    # enzyme mode 1: Semi specific free N terminus
    # enzyme mode 2: Semi specific free C terminus
    # enzyme mode 3: Semi specific
    # enzyme mode 4: Unspecific
    # enzyme mode 5: No digestion
    params.semi_enzymatic = enzyme_mode != 0

    params.allowed_miscleavages = int(
        record.loc[pd.IndexSlice["parameterGroups", "parameterGroup", "maxMissedCleavages", :]].squeeze()
    )
    try:
        params.min_peptide_length = int(record.loc["minPepLen"].squeeze())
    except KeyError:
        # Version 2.6 and above
        params.min_peptide_length = int(record.loc["minPeptideLength"].squeeze())
    # minPeptideLengthForUnspecificSearch (what is it?)
    params.max_peptide_length = None

    # fixed mods: looked up inside the parameter group for versions above
    # 1.6.0.0 and at the top level otherwise. Versions are compared as numbers,
    # not as strings, so that e.g. "1.10" sorts after "1.6".
    if _version_key(version) > (1, 6, 0, 0):
        fixed_mods = record.loc[pd.IndexSlice["parameterGroups", "parameterGroup", "fixedModifications", :]].squeeze()
    else:
        fixed_mods = record.loc[pd.IndexSlice["fixedModifications", :]].squeeze()
    params.fixed_mods = _join_modifications(fixed_mods)

    variable_mods = record.loc[pd.IndexSlice["parameterGroups", "parameterGroup", "variableModifications", :]].squeeze()
    params.variable_mods = _join_modifications(variable_mods)

    params.max_mods = int(record.loc[("parameterGroups", "parameterGroup", "maxNmods")].squeeze())
    params.min_precursor_charge = None
    params.max_precursor_charge = int(
        record.loc[pd.IndexSlice["parameterGroups", "parameterGroup", "maxCharge", :]].squeeze()
    )
    params.fill_none()
    return params
# create a first version of json files to match
if __name__ == "__main__":
    import json
    from pprint import pprint

    # For every example mqpar file, write three artefacts next to it:
    # the full parsed record (.json), the flattened record (.csv) and the
    # selected ProteoBench parameters (<stem>_sel.json).
    for test_file in [
        "../../../test/params/mqpar_MQ1.6.3.3_MBR.xml",
        "../../../test/params/mqpar_MQ2.1.3.0_noMBR.xml",
        "../../../test/params/mqpar1.5.3.30_MBR.xml",
        "../../../test/params/mqpar_mq2.6.2.0_1mc_MBR.xml",
    ]:
        print(f"{test_file = }")
        xml_path = Path(test_file)

        parsed = read_file(test_file)
        xml_path.with_suffix(".json").write_text(json.dumps(parsed, indent=4))

        flattened = build_Series_from_records(parsed, 4)
        flattened.to_frame("run_identifier").to_csv(xml_path.with_suffix(".csv"))

        params = extract_params(test_file, ms2frac="FTMS")
        pprint(params.__dict__)

        selected_path = xml_path.with_name(xml_path.stem + "_sel.json")
        with open(selected_path, "w") as f:
            json.dump(params.__dict__, f, indent=4)