Source code for paftacular.parser

import re
from typing import ClassVar

from .annotation import PafAnnotation
from .comps import (
    Adduct,
    ChemicalFormula,
    ImmoniumIon,
    InternalFragment,
    IonType,
    IsotopeSpecification,
    MassError,
    NamedCompound,
    NeutralLoss,
    PeptideIon,
    PrecursorIon,
    ReferenceIon,
    SMILESCompound,
    UnknownIon,
)
from .constants import ADDUCT_REGEX_PATTERN, FULL_PAF_PATTERN, ISOTOPE_REGEX_PATTERN, NEUTRAL_LOSS_REGEX_PATTERN, AminoAcids, IonSeries



[docs]
class mzPAFParser:
    _instance: ClassVar["mzPAFParser | None"] = None

    def __new__(cls):
        if cls._instance is None:
            cls._instance = object.__new__(cls)
        return cls._instance


[docs]
    def parse(self, annotation_str: str) -> PafAnnotation:
        """Parse one annotation string into a ``PafAnnotation``.

        Args:
            annotation_str: Text of a single annotation.

        Returns:
            The annotation assembled from the named groups that
            ``FULL_PAF_PATTERN`` captured.

        Raises:
            ValueError: If ``FULL_PAF_PATTERN`` does not match the input, or
                if one of the component parsers rejects a captured group.
        """
        matched = FULL_PAF_PATTERN.match(annotation_str)
        if not matched:
            raise ValueError(f"Invalid mzPAF annotation: '{annotation_str}'")

        captured = matched.groupdict()

        # Components are parsed in the same order as the constructor arguments.
        ion_type = self._parse_ion_type(captured)
        analyte_reference = self._parse_int(captured, "analyte_reference")
        is_auxiliary = bool(captured.get("is_auxiliary"))
        neutral_losses = self._parse_neutral_losses(captured.get("neutral_losses"))
        isotopes = self._parse_isotopes(captured.get("isotope"))
        adducts = self._parse_adducts(captured.get("adducts"))
        # A missing charge falls back to 1; because of ``or``, a parsed 0 does too.
        charge = self._parse_int(captured, "charge") or 1
        mass_error = self._parse_mass_error(captured)
        confidence = self._parse_float(captured, "confidence")

        return PafAnnotation(
            ion_type=ion_type,
            analyte_reference=analyte_reference,
            is_auxiliary=is_auxiliary,
            neutral_losses=neutral_losses,
            isotopes=isotopes,
            adducts=adducts,
            charge=charge,
            mass_error=mass_error,
            confidence=confidence,
        )


    def _parse_ion_type(self, groups: dict[str, str | None]) -> IonType:
        """Parse the ion type from regex groups using dispatch pattern"""

        # Peptide ion series
        if groups.get("series"):
            series_str = self._require(groups, "series")
            ion_series = IonSeries(series_str)
            return PeptideIon(
                series=ion_series,
                position=self._require_int(groups, "ordinal"),
                sequence=groups.get("sequence_ordinal"),
            )

        # Internal fragment
        if groups.get("internal_start"):
            return InternalFragment(
                start_position=self._require_int(groups, "internal_start"),
                end_position=self._require_int(groups, "internal_end"),
                sequence=groups.get("sequence_internal"),
            )

        # Precursor ion
        if groups.get("precursor"):
            return PrecursorIon()

        # Immonium ion
        if groups.get("immonium"):
            aa_str = self._require(groups, "immonium")
            amino_acid = AminoAcids(aa_str)
            return ImmoniumIon(
                amino_acid=amino_acid,
                modification=groups.get("immonium_modification"),
            )

        # Reference ion
        if groups.get("reference_label"):
            return ReferenceIon(name=self._require(groups, "reference_label"))

        # Chemical formula
        if groups.get("formula"):
            return ChemicalFormula(formula=self._require(groups, "formula"))

        # Named compound
        if groups.get("named_compound"):
            return NamedCompound(name=self._require(groups, "named_compound"))

        # SMILES compound
        if groups.get("smiles"):
            return SMILESCompound(smiles=self._require(groups, "smiles"))

        # Unknown/unannotated ion
        if groups.get("unannotated"):
            return UnknownIon(label=self._parse_int(groups, "unannotated_label"))

        # Should never reach here due to regex, but provide helpful error
        non_null = {k: v for k, v in groups.items() if v is not None}
        raise ValueError(f"Unable to parse ion type. Available groups: {non_null}")

    def _parse_isotopes(self, isotope_str: str | None) -> tuple[IsotopeSpecification, ...]:
        """Parse isotope notation into tuple of specifications

        Examples:
            "+i" -> (IsotopeSpecification(count=1),)
            "-2i13C" -> (IsotopeSpecification(count=-2, element="13C"),)
            "+i-2i13C+iA" -> (IsotopeSpecification(count=1), IsotopeSpecification(count=-2, element="13C"), IsotopeSpecification(count=1, is_average=True))
        """
        if not isotope_str:
            return ()

        # Extract individual isotope strings like "+i", "-2i13C", "+iA"
        isotope_matches = re.findall(ISOTOPE_REGEX_PATTERN, isotope_str)
        if not isotope_matches:
            return ()

        # Parse each isotope component
        isotopes: list[IsotopeSpecification] = []
        for match_groups in isotope_matches:
            # Reconstruct the isotope string from regex groups
            sign_str, count_str, element_or_avg = match_groups

            # Build the isotope string: sign + count + 'i' + element_or_avg
            isotope_string = f"{sign_str or '+'}{count_str}i{element_or_avg or ''}"
            isotopes.append(IsotopeSpecification.parse(isotope_string))

        return tuple(isotopes)

    def _parse_neutral_losses(self, losses_str: str | None) -> tuple[NeutralLoss, ...]:
        """Parse neutral losses/gains into tuple of NeutralLoss objects"""
        if not losses_str:
            return ()

        # Pattern matches: [+-] followed by number, formula, or named group

        loss_strings = re.findall(NEUTRAL_LOSS_REGEX_PATTERN, losses_str)

        losses: list[NeutralLoss] = []
        for loss_str in loss_strings:
            losses.append(NeutralLoss.parse(loss_str))
        return tuple(losses)

    def _parse_adducts(self, adduct_str: str | None) -> tuple[Adduct, ...]:
        """Parse adduct notation into tuple of Adduct objects

        Examples:
            "M+H" -> (Adduct(count=1, base_formula="H"),)
            "M+2H+Na" -> (Adduct(count=2, base_formula="H"), Adduct(count=1, base_formula="Na"))
            "M+NH4" -> (Adduct(count=1, base_formula="NH4"),)
            "M-H+2Na" -> (Adduct(count=-1, base_formula="H"), Adduct(count=2, base_formula="Na"))
        """
        if not adduct_str:
            return ()

        # Verify M prefix
        if not adduct_str.startswith("M"):
            raise ValueError(f"Adduct string must start with 'M': '{adduct_str}'")

        content = adduct_str[1:]  # Remove 'M'
        if not content:
            raise ValueError(f"Adduct string must have components after 'M': '{adduct_str}'")

        # Extract individual adduct strings like "+H", "+2Na", "-NH4"
        adduct_strings = re.findall(ADDUCT_REGEX_PATTERN, content)
        if not adduct_strings:
            raise ValueError(f"No adduct components found in '{adduct_str}'")

        # Parse each adduct component
        adducts: list[Adduct] = []
        for match_groups in adduct_strings:
            # Reconstruct the adduct string from regex groups
            sign_str, count_str, formula = match_groups
            adduct_string = f"{sign_str}{count_str}{formula}"
            adducts.append(Adduct.parse(adduct_string))

        return tuple(adducts)

    def _parse_mass_error(self, groups: dict[str, str | None]) -> MassError | None:
        """Parse mass error value and unit"""
        if not groups.get("mass_error"):
            return None

        mass_error_str = groups.get("mass_error")
        if mass_error_str is None:
            raise ValueError("Mass error value is missing")
        value = float(mass_error_str)
        unit = groups.get("mass_error_unit")

        if unit == "ppm":
            return MassError(value, "ppm")
        elif unit is None:
            return MassError(value, "da")
        else:
            raise ValueError(f"Unknown mass error unit: '{unit}'")

    # Helper methods for common parsing patterns
    def _require(self, groups: dict[str, str | None], key: str) -> str:
        """Get required string value from groups, raise if missing"""
        value = groups.get(key)
        if value is None or not isinstance(value, str):
            raise ValueError(f"Required field '{key}' is missing or invalid")
        return value

    def _require_int(self, groups: dict[str, str | None], key: str) -> int:
        """Get required integer value from groups, raise if missing"""
        value = groups.get(key)
        if value is None:
            raise ValueError(f"Required field '{key}' is missing")
        try:
            return int(value)
        except ValueError as e:
            raise ValueError(f"Field '{key}' must be an integer, got '{value}'") from e

    def _parse_int(self, groups: dict[str, str | None], key: str) -> int | None:
        """Parse optional integer from groups"""
        value = groups.get(key)
        if value is None:
            return None
        try:
            return int(value)
        except ValueError as e:
            raise ValueError(f"Field '{key}' must be an integer, got '{value}'") from e

    def _parse_float(self, groups: dict[str, str | None], key: str) -> float | None:
        """Parse optional float from groups"""
        value = groups.get(key)
        if value is None:
            return None
        try:
            return float(value)
        except ValueError as e:
            raise ValueError(f"Field '{key}' must be a number, got '{value}'") from e


[docs]
    def parse_multi(self, annotation_str: str) -> list[PafAnnotation]:
        """Parse potentially multiple comma-separated annotations"""
        if not annotation_str:
            return []

        annotations: list[PafAnnotation] = []
        for part in annotation_str.split(","):
            part = part.strip()
            if part:
                annotations.append(self.parse(part))

        return annotations




# Module-level parser instance; the module-level parse_multi() delegates to it.
MZ_PAF_PARSER = mzPAFParser()



[docs]
def parse_multi(annotation_str: str) -> list[PafAnnotation]:
    """Parse an mzPAF annotation string into a list of ``PafAnnotation``.

    Delegates to ``MZ_PAF_PARSER.parse_multi``.

    Args:
        annotation_str: The annotation text to parse.

    Returns:
        The list returned by ``MZ_PAF_PARSER.parse_multi``.
    """
    return MZ_PAF_PARSER.parse_multi(annotation_str)




[docs]
def parse(annotation_str: str) -> PafAnnotation | list[PafAnnotation]:
    """Parse an mzPAF annotation string, unwrapping a lone result.

    Args:
        annotation_str: The annotation text to parse.

    Returns:
        The single ``PafAnnotation`` when exactly one annotation was parsed;
        otherwise the full list from ``parse_multi`` (which may be empty or
        hold several annotations).
    """
    parsed = parse_multi(annotation_str)
    return parsed[0] if len(parsed) == 1 else parsed




[docs]
def parse_single(annotation_str: str) -> PafAnnotation:
    """Parse a string that must contain exactly one mzPAF annotation.

    Kept for backward compatibility. Unlike ``parse()``, this never returns
    a list.

    Args:
        annotation_str: The annotation text to parse.

    Returns:
        The one parsed ``PafAnnotation``.

    Raises:
        ValueError: If ``parse_multi`` yields zero or more than one
            annotation for the input.
    """
    parsed = parse_multi(annotation_str)
    found = len(parsed)
    if found == 1:
        return parsed[0]
    raise ValueError(f"Expected single annotation, got {found}: '{annotation_str}'")