Source code for glypy.io.glyconnect

'''
Glyconnect
----------

A simple dialect of the Glyconnect/GlycoMod glycan composition notation.
'''
import re

from dataclasses import dataclass, field
from typing import Dict, Union, List, Optional, Type, Generic, TypeVar

from glypy.structure.glycan_composition import (
    FrozenGlycanComposition,
    FrozenMonosaccharideResidue,
    SubstituentResidue)
from glypy.structure.glycan import Glycan


try:
    import requests
except ImportError:
    requests = None

#: The set of defined symbols and their mappings.
#:
#: Keys are the textual symbols accepted by :func:`loads`; values are the
#: residue objects they denote. Several symbols are aliases for the same
#: residue (e.g. "S", "Su" and "Sulpho" all denote sulfate).
defined_symbols: Dict[str, Union[SubstituentResidue, FrozenMonosaccharideResidue]] = {
    "Hex": FrozenMonosaccharideResidue.from_iupac_lite("Hex"),
    "HexNAc": FrozenMonosaccharideResidue.from_iupac_lite('HexNAc'),
    "dHex": FrozenMonosaccharideResidue.from_iupac_lite('dHex'),
    "NeuAc": FrozenMonosaccharideResidue.from_iupac_lite("NeuAc"),
    "NeuGc": FrozenMonosaccharideResidue.from_iupac_lite("NeuGc"),
    # Three spellings of the sulfate substituent.
    "S": SubstituentResidue("sulfate"),
    "Su": SubstituentResidue("sulfate"),
    "Sulpho": SubstituentResidue("sulfate"),
    # Three spellings of the phosphate substituent.
    "P": SubstituentResidue("phosphate"),
    "Ph": SubstituentResidue("phosphate"),
    "Phospho": SubstituentResidue("phosphate"),
    "Xyl": FrozenMonosaccharideResidue.from_iupac_lite("Xyl"),
    "HexA": FrozenMonosaccharideResidue.from_iupac_lite("HexA"),
    # The symbol is "Pent" but the name handed to from_iupac_lite is "Pen".
    "Pent": FrozenMonosaccharideResidue.from_iupac_lite("Pen"),
    "Kdn": FrozenMonosaccharideResidue.from_iupac_lite("Kdn"),
}


def _invert_mapping(
        table: Dict[str, Union[SubstituentResidue, FrozenMonosaccharideResidue]]
) -> Dict[Union[SubstituentResidue, FrozenMonosaccharideResidue], str]:
    '''Build the reverse of a symbol table, mapping each residue to one symbol.

    When several symbols map to the same residue, the shortest symbol is
    kept. Among symbols of equal length, the one appearing later in
    ``table`` wins.

    Parameters
    ----------
    table: dict
        Mapping from symbol text to residue

    Returns
    -------
    dict
        Mapping from residue to its chosen symbol
    '''
    reverse = {}
    for symbol, residue in table.items():
        # Replace the stored symbol unless the new one is strictly longer.
        if residue not in reverse or len(symbol) <= len(reverse[residue]):
            reverse[residue] = symbol
    return reverse


#: Reverse lookup from residue to the symbol written out by :func:`dumps`.
monosaccharide_to_symbol = _invert_mapping(defined_symbols)


def _generate_pattern(symbols: List[str]) -> re.Pattern:
    '''Compile a pattern matching any of ``symbols`` followed by a count.

    Parameters
    ----------
    symbols: iterable of str
        The literal symbols to recognize

    Returns
    -------
    re.Pattern
        A pattern with two groups: the matched symbol and its digit string
    '''
    # Try the longest symbols first so that a symbol which is a prefix of
    # another (e.g. "Hex" vs. "HexNAc") does not shadow the longer one.
    ordered = sorted(symbols, key=len, reverse=True)
    # Symbols are literals, so escape them in case one ever contains a
    # regex metacharacter.
    alternatives = '|'.join(map(re.escape, ordered))
    # The count must be greedy: a lazy ``\d+?`` at the end of the pattern
    # stops after a single digit and would read "Hex12" as a count of 1.
    return re.compile(rf"({alternatives})(\d+)")


#: Matches ``symbol:count`` pairs: a run of characters that are neither ':'
#: nor whitespace, then a colon, then one or more digits.
tokenizer = re.compile(r"([^:\s]+):(\d+)")
#: Pattern for a defined symbol immediately followed by a count, with no ':'
#: between them. :func:`loads` falls back to it when ``tokenizer`` finds nothing.
undelimited_tokenizer = _generate_pattern(defined_symbols)


def loads(string):
    '''Parse a GlyConnect glycan composition into a
    :class:`~.FrozenGlycanComposition`.

    The string is first scanned for ``symbol:count`` pairs. If none are
    found, it is scanned again for symbols directly followed by a count.

    Parameters
    ----------
    string: str
        The string to parse

    Returns
    -------
    :class:`~.FrozenGlycanComposition`

    Raises
    ------
    :class:`KeyError`:
        Raised if a key isn't defined by the GlyConnect dialect
    '''
    pairs = tokenizer.findall(string) or undelimited_tokenizer.findall(string)
    composition = FrozenGlycanComposition()
    for symbol, amount in pairs:
        residue = defined_symbols[symbol]
        composition[residue] += int(amount)
    return composition
def dumps(composition):
    '''Encode :class:`~.GlycanComposition` or :class:`~.Glycan` into the
    GlyConnect glycan composition text format.

    Each residue is written as ``symbol:count`` and the pairs are joined
    by single spaces. A :class:`~.Glycan` is first converted to a
    composition.

    Parameters
    ----------
    composition: :class:`~.GlycanComposition` or :class:`~.Glycan`
        The structure to format

    Returns
    -------
    :class:`str`

    Raises
    ------
    :class:`KeyError`:
        Raised if a key isn't defined by the GlyConnect Compozitor dialect
    '''
    if isinstance(composition, Glycan):
        composition = FrozenGlycanComposition.from_glycan(composition)
    return ' '.join(
        "%s:%d" % (monosaccharide_to_symbol[residue], count)
        for residue, count in composition.items()
    )
#: Base URL that every request in this module is built from.
API_SERVER = "https://glyconnect.expasy.org/api"


def from_glytoucan_id(glytoucan_id):
    '''POST a GlyTouCan identifier to the structure search endpoint.

    Parameters
    ----------
    glytoucan_id: str
        The identifier sent as the ``glytoucan_id`` form field

    Returns
    -------
    object
        The decoded JSON body of the response, returned as-is

    Raises
    ------
    requests.HTTPError:
        Raised by ``raise_for_status`` when the server reports an error
    '''
    response = requests.post(
        f"{API_SERVER}/structures/search/glytoucan",
        data={"glytoucan_id": glytoucan_id})
    response.raise_for_status()
    data = response.json()
    return data


@dataclass
class RecordBase:
    '''Common base for record types built from decoded JSON objects.'''

    @classmethod
    def from_dict(cls, data):
        '''Build an instance by passing ``data``'s items as keyword arguments.'''
        return cls(**data)


@dataclass
class TaxonomyRecord(RecordBase):
    '''A taxonomy entry.'''
    id: int
    taxonomy_id: str
    common_name: Optional[str] = None
    species: Optional[str] = None


@dataclass
class UniprotProteinAccessionRecord(RecordBase):
    '''A UniProt accession together with optional cross-references.'''
    uniprot_acc: str
    uniprot_id: Optional[str] = None
    glygen: Optional[str] = None
    nextprot: Optional[str] = None
    genecards: Optional[str] = None
    glycodomain: Optional[str] = None


@dataclass
class ProteinRecord(RecordBase):
    '''A protein with its taxonomy and UniProt accessions.'''
    id: int
    name: str
    # May be None (or another falsy value) when the input has no taxonomy.
    taxonomy: Optional[TaxonomyRecord]
    uniprots: List[UniprotProteinAccessionRecord]

    @classmethod
    def from_dict(cls, data: dict):
        '''Build a record, converting the nested taxonomy and uniprot entries.

        ``taxonomy`` and ``uniprots`` may be absent from ``data``; ``id``
        and ``name`` are required.
        '''
        tax = data.get("taxonomy")
        if tax:
            tax = TaxonomyRecord.from_dict(tax)
        uniprots = list(map(UniprotProteinAccessionRecord.from_dict, data.get("uniprots", [])))
        return cls(data['id'], data['name'], tax, uniprots)


@dataclass
class SourceRecord(RecordBase):
    '''A single source entry.'''
    type: str
    name: str
    id: int
    ref: Optional[str] = None
    ontology: Optional[str] = None
    brenda_id: Optional[str] = None


@dataclass
class Source(RecordBase):
    '''A group of source entries with their associated taxonomy entries.'''
    source: List[SourceRecord]
    taxons: List[TaxonomyRecord]

    @classmethod
    def from_dict(cls, data: dict):
        '''Build a record; missing ``source``/``taxons`` keys become empty lists.'''
        source = [SourceRecord.from_dict(x) for x in data.get("source", [])]
        taxons = [TaxonomyRecord.from_dict(x) for x in data.get("taxons", [])]
        return cls(source, taxons)


@dataclass
class CellLine(RecordBase):
    '''A cell line entry.'''
    cellosaurus_id: str
    id: int
    is_problematic: bool
    name: str


@dataclass
class Disease(RecordBase):
    '''A disease entry with its associated taxonomy entries.

    Derives from :class:`RecordBase` like the other record types so that it
    satisfies the bound on the type variable of
    :class:`APICollectionProperty`.
    '''
    id: int
    name: str
    do_id: Optional[str] = None
    taxons: List[TaxonomyRecord] = field(default_factory=list)

    @classmethod
    def from_dict(cls, data: dict):
        '''Build a record; ``do_id`` and ``taxons`` may be absent from ``data``.'''
        taxons = [TaxonomyRecord.from_dict(x) for x in data.get("taxons", [])]
        return cls(data['id'], data['name'], data.get('do_id'), taxons)


@dataclass
class CompositionRecord(RecordBase):
    '''A glycan composition in several text encodings, with its masses.'''
    format_byonic: str
    format_condensed: str
    format_glyconnect: str
    format_numeric: str
    id: int
    mass: float
    mass_monoisotopic: float
    reviewed: bool
    glytoucan_id: Optional[str] = None

    def parse(self):
        '''Parse ``format_glyconnect`` with :func:`loads`.'''
        return loads(self.format_glyconnect)


@dataclass
class StructureRecord(RecordBase):
    '''A glycan structure entry.'''
    glycan_core: str
    glycan_type: str
    has_image: bool
    id: int
    is_undefined: bool
    reviewed: bool
    glytoucan_id: Optional[str] = None


@dataclass
class CompozitorGlycan(RecordBase):
    '''One query result: a composition and structure, optionally with the
    taxonomy and protein they were reported for.
    '''
    composition: CompositionRecord
    structure: StructureRecord
    taxonomy: Optional[TaxonomyRecord]
    protein: Optional[ProteinRecord]

    @classmethod
    def from_dict(cls, data: dict):
        '''Build a record, converting each nested object to its record type.

        ``composition`` and ``structure`` are required. ``taxonomy`` and
        ``protein`` are optional and become ``None`` when missing or empty.
        '''
        comp = CompositionRecord.from_dict(data['composition'])
        struct = StructureRecord.from_dict(data['structure'])
        raw_taxonomy = data.get('taxonomy')
        taxonomy = TaxonomyRecord.from_dict(raw_taxonomy) if raw_taxonomy else None
        raw_protein = data.get('protein')
        protein = ProteinRecord.from_dict(raw_protein) if raw_protein else None
        # Pass by keyword: the field order is (taxonomy, protein), and a
        # positional call in the other order silently swaps the two.
        return cls(composition=comp, structure=struct,
                   taxonomy=taxonomy, protein=protein)


T = TypeVar("T", bound=RecordBase)


@dataclass
class APICollectionProperty(Generic[T]):
    '''Descriptor that fetches a JSON list from ``url`` on first access.

    Each element of the decoded response is converted with
    ``record_type.from_dict``. The resulting list is stored in the owning
    instance's ``_cache`` dict under ``url`` and reused on later accesses.
    Deleting the attribute removes the cached list.
    '''
    url: str
    record_type: Type[T]

    def __get__(self, obj, objtype=None) -> List[T]:
        # Accessed on the class rather than an instance: return the descriptor.
        if obj is None:
            return self
        result = obj._cache.get(self.url)
        if result is not None:
            return result
        resp = requests.get(self.url)
        resp.raise_for_status()
        data = resp.json()
        result = [self.record_type.from_dict(d) for d in data]
        obj._cache[self.url] = result
        return result

    def __delete__(self, obj):
        del obj._cache[self.url]


@dataclass
class Compozitor:
    '''Client exposing cached reference collections and a glycosylation query.'''
    _cache: dict = field(default_factory=dict, repr=False)

    # These are unannotated class attributes, so the dataclass machinery
    # does not treat them as fields.
    proteins = APICollectionProperty(
        f"{API_SERVER}/proteins-all",
        ProteinRecord
    )

    sources = APICollectionProperty(
        f"{API_SERVER}/sources-all",
        Source
    )

    cell_lines = APICollectionProperty(
        f"{API_SERVER}/cell_lines-all",
        CellLine
    )

    diseases = APICollectionProperty(
        f"{API_SERVER}/diseases-all",
        Disease
    )

    def query(self, taxonomy: Optional[str]=None, cell_line: Optional[str]=None,
              protein: Optional[str]=None, disease: Optional[str]=None):
        '''Request glycosylations matching the given filters.

        Only filters with a truthy value are sent as query parameters.

        Parameters
        ----------
        taxonomy: str, optional
        cell_line: str, optional
        protein: str, optional
        disease: str, optional

        Returns
        -------
        list of :class:`CompozitorGlycan`

        Raises
        ------
        ValueError:
            Raised if the decoded response is a list instead of an object
        requests.HTTPError:
            Raised by ``raise_for_status`` when the server reports an error
        '''
        filters = {
            'taxonomy': taxonomy,
            'cell_line': cell_line,
            'protein': protein,
            'disease': disease,
        }
        params = {key: value for key, value in filters.items() if value}
        resp = requests.get(f"{API_SERVER}/glycosylations", params)
        resp.raise_for_status()
        data = resp.json()
        if isinstance(data, list):
            raise ValueError("Malformed query or invalid response")
        return [CompozitorGlycan.from_dict(res) for res in (data['results'] or [])]


#: Module-level client instance; ``query`` is its bound :meth:`Compozitor.query`.
client = Compozitor()
query = client.query