Source code for paftacular.parser

import re
from typing import ClassVar

from .annotation import PafAnnotation
from .comps import (
    Adduct,
    ChemicalFormula,
    ImmoniumIon,
    InternalFragment,
    IonType,
    IsotopeSpecification,
    MassError,
    NamedCompound,
    NeutralLoss,
    PeptideIon,
    PrecursorIon,
    ReferenceIon,
    SMILESCompound,
    UnknownIon,
)
from .constants import ADDUCT_REGEX_PATTERN, FULL_PAF_PATTERN, ISOTOPE_REGEX_PATTERN, NEUTRAL_LOSS_REGEX_PATTERN, AminoAcids, IonSeries


[docs] class mzPAFParser: _instance: ClassVar["mzPAFParser | None"] = None def __new__(cls): if cls._instance is None: cls._instance = object.__new__(cls) return cls._instance
[docs] def parse(self, annotation_str: str) -> PafAnnotation: """Parse a single annotation string""" match = FULL_PAF_PATTERN.match(annotation_str) if not match: raise ValueError(f"Invalid mzPAF annotation: '{annotation_str}'") groups = match.groupdict() return PafAnnotation( ion_type=self._parse_ion_type(groups), analyte_reference=self._parse_int(groups, "analyte_reference"), is_auxiliary=bool(groups.get("is_auxiliary")), neutral_losses=self._parse_neutral_losses(groups.get("neutral_losses")), isotopes=self._parse_isotopes(groups.get("isotope")), adducts=self._parse_adducts(groups.get("adducts")), charge=self._parse_int(groups, "charge") or 1, mass_error=self._parse_mass_error(groups), confidence=self._parse_float(groups, "confidence"), )
def _parse_ion_type(self, groups: dict[str, str | None]) -> IonType: """Parse the ion type from regex groups using dispatch pattern""" # Peptide ion series if groups.get("series"): series_str = self._require(groups, "series") ion_series = IonSeries(series_str) return PeptideIon( series=ion_series, position=self._require_int(groups, "ordinal"), sequence=groups.get("sequence_ordinal"), ) # Internal fragment if groups.get("internal_start"): return InternalFragment( start_position=self._require_int(groups, "internal_start"), end_position=self._require_int(groups, "internal_end"), sequence=groups.get("sequence_internal"), ) # Precursor ion if groups.get("precursor"): return PrecursorIon() # Immonium ion if groups.get("immonium"): aa_str = self._require(groups, "immonium") amino_acid = AminoAcids(aa_str) return ImmoniumIon( amino_acid=amino_acid, modification=groups.get("immonium_modification"), ) # Reference ion if groups.get("reference_label"): return ReferenceIon(name=self._require(groups, "reference_label")) # Chemical formula if groups.get("formula"): return ChemicalFormula(formula=self._require(groups, "formula")) # Named compound if groups.get("named_compound"): return NamedCompound(name=self._require(groups, "named_compound")) # SMILES compound if groups.get("smiles"): return SMILESCompound(smiles=self._require(groups, "smiles")) # Unknown/unannotated ion if groups.get("unannotated"): return UnknownIon(label=self._parse_int(groups, "unannotated_label")) # Should never reach here due to regex, but provide helpful error non_null = {k: v for k, v in groups.items() if v is not None} raise ValueError(f"Unable to parse ion type. Available groups: {non_null}") def _parse_isotopes(self, isotope_str: str | None) -> tuple[IsotopeSpecification, ...]: """Parse isotope notation into tuple of specifications Examples: "+i" -> (IsotopeSpecification(count=1),) "-2i13C" -> (IsotopeSpecification(count=-2, element="13C"),) "+i-2i13C+iA" -> (IsotopeSpecification(count=1), IsotopeSpecification(count=-2, element="13C"), IsotopeSpecification(count=1, is_average=True)) """ if not isotope_str: return () # Extract individual isotope strings like "+i", "-2i13C", "+iA" isotope_matches = re.findall(ISOTOPE_REGEX_PATTERN, isotope_str) if not isotope_matches: return () # Parse each isotope component isotopes: list[IsotopeSpecification] = [] for match_groups in isotope_matches: # Reconstruct the isotope string from regex groups sign_str, count_str, element_or_avg = match_groups # Build the isotope string: sign + count + 'i' + element_or_avg isotope_string = f"{sign_str or '+'}{count_str}i{element_or_avg or ''}" isotopes.append(IsotopeSpecification.parse(isotope_string)) return tuple(isotopes) def _parse_neutral_losses(self, losses_str: str | None) -> tuple[NeutralLoss, ...]: """Parse neutral losses/gains into tuple of NeutralLoss objects""" if not losses_str: return () # Pattern matches: [+-] followed by number, formula, or named group loss_strings = re.findall(NEUTRAL_LOSS_REGEX_PATTERN, losses_str) losses: list[NeutralLoss] = [] for loss_str in loss_strings: losses.append(NeutralLoss.parse(loss_str)) return tuple(losses) def _parse_adducts(self, adduct_str: str | None) -> tuple[Adduct, ...]: """Parse adduct notation into tuple of Adduct objects Examples: "M+H" -> (Adduct(count=1, base_formula="H"),) "M+2H+Na" -> (Adduct(count=2, base_formula="H"), Adduct(count=1, base_formula="Na")) "M+NH4" -> (Adduct(count=1, base_formula="NH4"),) "M-H+2Na" -> (Adduct(count=-1, base_formula="H"), Adduct(count=2, base_formula="Na")) """ if not adduct_str: return () # Verify M prefix if not adduct_str.startswith("M"): raise ValueError(f"Adduct string must start with 'M': '{adduct_str}'") content = adduct_str[1:] # Remove 'M' if not content: raise ValueError(f"Adduct string must have components after 'M': '{adduct_str}'") # Extract individual adduct strings like "+H", "+2Na", "-NH4" adduct_strings = re.findall(ADDUCT_REGEX_PATTERN, content) if not adduct_strings: raise ValueError(f"No adduct components found in '{adduct_str}'") # Parse each adduct component adducts: list[Adduct] = [] for match_groups in adduct_strings: # Reconstruct the adduct string from regex groups sign_str, count_str, formula = match_groups adduct_string = f"{sign_str}{count_str}{formula}" adducts.append(Adduct.parse(adduct_string)) return tuple(adducts) def _parse_mass_error(self, groups: dict[str, str | None]) -> MassError | None: """Parse mass error value and unit""" if not groups.get("mass_error"): return None mass_error_str = groups.get("mass_error") if mass_error_str is None: raise ValueError("Mass error value is missing") value = float(mass_error_str) unit = groups.get("mass_error_unit") if unit == "ppm": return MassError(value, "ppm") elif unit is None: return MassError(value, "da") else: raise ValueError(f"Unknown mass error unit: '{unit}'") # Helper methods for common parsing patterns def _require(self, groups: dict[str, str | None], key: str) -> str: """Get required string value from groups, raise if missing""" value = groups.get(key) if value is None or not isinstance(value, str): raise ValueError(f"Required field '{key}' is missing or invalid") return value def _require_int(self, groups: dict[str, str | None], key: str) -> int: """Get required integer value from groups, raise if missing""" value = groups.get(key) if value is None: raise ValueError(f"Required field '{key}' is missing") try: return int(value) except ValueError as e: raise ValueError(f"Field '{key}' must be an integer, got '{value}'") from e def _parse_int(self, groups: dict[str, str | None], key: str) -> int | None: """Parse optional integer from groups""" value = groups.get(key) if value is None: return None try: return int(value) except ValueError as e: raise ValueError(f"Field '{key}' must be an integer, got '{value}'") from e def _parse_float(self, groups: dict[str, str | None], key: str) -> float | None: """Parse optional float from groups""" value = groups.get(key) if value is None: return None try: return float(value) except ValueError as e: raise ValueError(f"Field '{key}' must be a number, got '{value}'") from e
[docs] def parse_multi(self, annotation_str: str) -> list[PafAnnotation]: """Parse potentially multiple comma-separated annotations""" if not annotation_str: return [] annotations: list[PafAnnotation] = [] for part in annotation_str.split(","): part = part.strip() if part: annotations.append(self.parse(part)) return annotations
MZ_PAF_PARSER = mzPAFParser()
[docs] def parse_multi(annotation_str: str) -> list[PafAnnotation]: """parse mzPAF annotation string into list of PafAnnotation""" return MZ_PAF_PARSER.parse_multi(annotation_str)
[docs] def parse(annotation_str: str) -> PafAnnotation | list[PafAnnotation]: """parse single mzPAF annotation string into PafAnnotation""" annots = parse_multi(annotation_str) if len(annots) == 1: return annots[0] return annots
[docs] def parse_single(annotation_str: str) -> PafAnnotation: """backward compatibility alias for parse()""" annots = parse_multi(annotation_str) if len(annots) != 1: raise ValueError(f"Expected single annotation, got {len(annots)}: '{annotation_str}'") return annots[0]