# Source code for bio2bel_hmdb.manager

# -*- coding: utf-8 -*-

"""The Manager is a key component of HMDB. This class is used to create, populate and query the local HMDB version."""

import logging
from typing import List, Mapping, Optional

from bel_resources import get_bel_resource
from bio2bel import AbstractManager
from tqdm import tqdm

from .constants import DOID, HP, MESHD, MODULE_NAME, ONTOLOGIES, ONTOLOGY_NAMESPACES
from .models import (
    Base, Biofluid, Biofunction, CellularLocation, Disease, Metabolite, MetaboliteBiofluid, MetaboliteCellularLocation,
    MetaboliteDiseaseReference, MetabolitePathway, MetaboliteProtein, MetaboliteReference,
    MetaboliteSynonym, MetaboliteTissue, Pathway, Protein, Reference, SecondaryAccession, Tissue,
)
from .parser import get_data

# Public API of this module.
__all__ = [
    'Manager',
]

# Module-level logger, named after this module.
log = logging.getLogger(__name__)


class Manager(AbstractManager):
    """Manage the local HMDB database of metabolite-protein and metabolite-disease associations."""

    # Module identifier taken from this package's constants.
    module_name = MODULE_NAME
    # Models listed for the Flask admin views (presumably consumed by the bio2bel base manager; verify).
    flask_admin_models = [Metabolite, Disease, Protein, Pathway, Biofluid]
    # Declarative base shared by this package's models.
    _base = Base

[docs]    def is_populated(self) -> bool:
        """Check if the database is already populated."""
        return 0 < self.count_metabolites()

    @staticmethod
    def _get_tag(element_tag) -> str:
        """Delete the XML namespace prefix when calling element.tag

        :param element_tag: tag attribute of an XML element
        """
        return element_tag.split("}")[1]

    def _populate_with_1_layer_elements(
            self,
            element,
            metabolite_instance,
            instance_dict,
            table,
            relation_table,
            column_name: str,
    ):
        """Parse and populate database with metabolite elements, which themselfes have one more layer.

        :param element: the current parent XML element. E.g. "pathways" where the children would have the tag "pathway".
        :param models.Metabolite metabolite_instance: metabolite object which is associated with the instances (e.g. is
                                                      involved in that "pathway")
        :param dict instance_dict: dictionary which tracks if the found instance is already present in the table and can
                                   then refer to it
        :param class table: sqlalchemy class to which the instances belong. E.g. "Pathways"
        :param class relation_table: sqlalchemy class which stores the many to many relation between the instances and
                                     the metabolites
        :param column_name: Name of the column in the relation tables which does not represent the metabolite.
                                e.g. reference, pathway etc
        :rtype: dict
        """
        for instance_element in element:
            instance_dict_key = instance_element.text

            if instance_dict_key not in instance_dict:  # check if instance is already in table
                new_instance_dict = {column_name: instance_dict_key}
                instance_dict[instance_dict_key] = table(**new_instance_dict)
                self.session.add(instance_dict[instance_dict_key])

            # create metabolite-instance relation object
            new_meta_rel_dict = {"metabolite": metabolite_instance, column_name: instance_dict[instance_dict_key]}
            self.session.add(relation_table(**new_meta_rel_dict))
        return instance_dict

    def _populate_with_2_layer_elements(
            self,
            element,
            metabolite_instance,
            instance_dict,
            table,
            relation_table,
            column,
            instance_dict_key=None,
            metabolite_column='metabolite',
    ):
        """Parse and populate database with metabolite elements, which themselves have two more layers.

        :param element: the current parent XML element. E.g. "pathways" where the children would have the tag "pathway".
        :param models.Metabolite metabolite_instance: metabolite object which is associated with the instances (e.g. is
        involved in that "pathway")
        :param dict instance_dict: dictionary which tracks if the found instance is already present in the table and can
        then refer to it
        :param type table: sqlalchemy class to which the instances belong. E.g. "Pathways"
        :param type relation_table: sqlalchemy class which stores the many to many relation between the instances and
        the metabolites
        :param str column: column name of the relation table which is not the metabolite
        :param str instance_dict_key: String which is used as the key for the instance_dict. (to ensure uniqueness in
        the instance_dict)
        :param str metabolite_column: column of the relation table which represents the foreignkey to the main table.
        In our database model the Metabolite table.
        :rtype: dict
        """
        if instance_dict_key is None and len(element) > 0:
            instance_dict_key = self._get_tag(element[0][0].tag)

        for instance_element in element:
            # build pathway object dict to create pathway object
            instance_object_dict = {}

            # create pathway instance
            for instance_sub_element in instance_element:
                cutted_pathway_tag = self._get_tag(instance_sub_element.tag)
                instance_object_dict[cutted_pathway_tag] = instance_sub_element.text

            # add MetabolitePathway relation and continue with next pathway if pathway already present in Pathways
            if instance_object_dict[instance_dict_key] in instance_dict:
                new_meta_rel_dict = {
                    metabolite_column: metabolite_instance,
                    column: instance_dict[instance_object_dict[instance_dict_key]]
                }
                new_meta_rel = relation_table(**new_meta_rel_dict)
                self.session.add(new_meta_rel)
                continue

            instance_dict[instance_object_dict[instance_dict_key]] = table(**instance_object_dict)
            self.session.add(instance_dict[instance_object_dict[instance_dict_key]])

            new_meta_rel_dict = {
                metabolite_column: metabolite_instance,
                column: instance_dict[instance_object_dict[instance_dict_key]]
            }
            new_meta_rel = relation_table(**new_meta_rel_dict)
            self.session.add(new_meta_rel)

        return instance_dict

    def _populate_diseases(
            self,
            element,
            references_dict,
            diseases_dict,
            metabolite_instance,
            disease_ontologies=None,
            map_dis=True,
    ):
        """Populate the database with disease and related reference information.

        A disease is registered (and mapped to the ontologies) when its ``references`` child is
        reached, so the disease attributes are expected to precede ``references`` in the XML.
        NOTE(review): a disease element without a ``references`` child is therefore never added;
        confirm this is intended.

        :param element: Element object from the xml ElementTree
        :param dict references_dict: Dictionary to keep track of which references are already in the database
        :param dict diseases_dict: Dictionary to keep track of which diseases are already in the database
        :param models.Metabolite metabolite_instance: Metabolite object to which the diseases and references are related
        :param dict disease_ontologies: For each ontology, a dictionary from lower-cased disease name to the name used
                                        in that ontology. If it is None or empty no mapping is done.
        :param boolean map_dis: If True the HMDB disease names will be mapped to different ontologies.
        :return: the updated ``references_dict`` and ``diseases_dict``
        :rtype: dict, dict
        """
        # attribute of the Disease model that stores the mapped name for each supported ontology
        ontology_attributes = {DOID: 'dion', HP: 'hpo', MESHD: 'mesh_diseases'}

        for disease_element in element:
            disease_instance = Disease()

            for disease_sub_element in disease_element:
                dtag = self._get_tag(disease_sub_element.tag)

                if dtag != "references":
                    setattr(disease_instance, dtag, disease_sub_element.text)
                    continue

                disease_name = disease_instance.name

                if disease_name not in diseases_dict:  # add disease instance if not already in table
                    # Map to the disease ontologies only when mapping is requested AND lookups were
                    # supplied; the defaults (map_dis=True, disease_ontologies=None) used to crash.
                    if map_dis and disease_ontologies and disease_name is not None:
                        disease_lower = disease_name.lower()  # for case insensitivity
                        for ontology, names in disease_ontologies.items():
                            attribute = ontology_attributes.get(ontology)
                            if attribute is not None and disease_lower in names:
                                setattr(disease_instance, attribute, names[disease_lower])

                    diseases_dict[disease_name] = disease_instance
                    self.session.add(disease_instance)

                for reference_element in disease_sub_element:
                    # keyword arguments of the reference, keyed by namespace-free tag
                    reference_attributes = {
                        self._get_tag(reference_sub_element.tag): reference_sub_element.text
                        for reference_sub_element in reference_element
                    }
                    reference_text = reference_attributes['reference_text']

                    # add if not already in reference table
                    if reference_text not in references_dict:
                        references_dict[reference_text] = Reference(**reference_attributes)
                        self.session.add(references_dict[reference_text])

                    rel_meta_dis_ref = MetaboliteDiseaseReference(
                        metabolite=metabolite_instance,
                        disease=diseases_dict[disease_name],
                        reference=references_dict[reference_text],
                    )
                    self.session.add(rel_meta_dis_ref)

        return references_dict, diseases_dict

    @staticmethod
    def _disease_ontology_dict(ontology: str) -> Mapping[str, str]:
        """Build the lookup used for mapping HMDB disease names to a disease ontology.

        :param ontology: key of the ontology in ``ONTOLOGY_NAMESPACES``
        :return: dictionary from the lower-cased name to the name as listed in the namespace
        """
        namespace = get_bel_resource(ONTOLOGY_NAMESPACES[ontology])

        lookup = {}
        for name in namespace['Values']:
            lookup[name.lower()] = name
        return lookup

[docs]    def populate(self, source: Optional[str] = None, map_dis: bool = True, group_size: int = 500_000):
        """Populate the database with the HMDB data.

        :param source: Path to an .xml file. If None the whole HMDB will be downloaded and used for population.
        :param map_dis: Should diseases be mapped?
        """
        # construct sets for disease ontologies for mapping hmdb diseases
        if not map_dis:
            disease_ontologies = None
        else:
            disease_ontologies = {
                ontology: self._disease_ontology_dict(ontology)
                for ontology in ONTOLOGIES
            }

        # construct xml tree
        tree = get_data(source)
        root = tree.getroot()

        # dicts to check unique constraints for specific tables
        biofluids_dict = {}
        tissues_dict = {}
        pathways_dict = {}
        proteins_dict = {}
        references_dict = {}
        diseases_dict = {}
        # biofunctions_dict = {}
        cellular_locations_dict = {}

        # iterate through xml tree
        for i, elements in enumerate(tqdm(root, desc='HMDB Metabolite')):
            # create metabolite dict used to feed in main metabolite table
            metabolite = Metabolite()

            for element in elements:
                # delete namespace prefix
                tag = self._get_tag(element.tag)

                # handle wikipedia typo in xml tags
                if tag == "wikipidia":
                    log.warning("HMDB fixed the 'wikipidia' tag to 'wikipedia'. Change code.")
                    tag = "wikipedia"

                if tag == "secondary_accessions":
                    self.session.add_all([
                        SecondaryAccession(
                            metabolite=metabolite,
                            secondary_accession=secondary_accession_element.text
                        )
                        for secondary_accession_element in element
                    ])

                elif tag == "synonyms":
                    synonyms = {
                        synonym_element.text
                        for synonym_element in element
                    }
                    self.session.add_all([
                        MetaboliteSynonym(
                            metabolite=metabolite,
                            synonym=synonym,
                        )
                        for synonym in synonyms
                    ])

                elif tag == "taxonomy":  # will be delayed to later versions since not important for BEL
                    continue

                elif tag == "ontology":
                    continue

                elif tag == "cellular_locations":
                    cellular_locations_dict = self._populate_with_1_layer_elements(
                        element,
                        metabolite,
                        cellular_locations_dict,
                        CellularLocation,
                        MetaboliteCellularLocation,
                        "cellular_location"
                    )

                elif tag == "experimental_properties":  # will be delayed to later versions since not important for BEL
                    continue

                elif tag == "predicted_properties":  # will be delayed to later versions since not important for BEL
                    continue

                elif tag == "spectra":  # will not be processed since the corresponding database is down
                    continue

                elif tag == "biospecimen_locations":
                    biofluids_dict = self._populate_with_1_layer_elements(
                        element,
                        metabolite,
                        biofluids_dict,
                        Biofluid,
                        MetaboliteBiofluid,
                        'biofluid',
                    )

                elif tag == "tissue_locations":
                    tissues_dict = self._populate_with_1_layer_elements(
                        element,
                        metabolite,
                        tissues_dict,
                        Tissue,
                        MetaboliteTissue,
                        'tissue',
                    )

                elif tag == "pathways":
                    pathways_dict = self._populate_with_2_layer_elements(
                        element,
                        metabolite,
                        pathways_dict,
                        Pathway,
                        MetabolitePathway,
                        'pathway',
                    )

                elif tag == "normal_concentrations":  # will be delayed to later versions since not important for BEL
                    continue

                elif tag == "abnormal_concentrations":  # will be delayed to later versions since not important for BEL
                    continue

                elif tag == "diseases":
                    references_dict, diseases_dict = self._populate_diseases(
                        element,
                        references_dict,
                        diseases_dict,
                        metabolite,
                        disease_ontologies,
                        map_dis=map_dis,
                    )

                elif tag == "general_references":
                    references_dict = self._populate_with_2_layer_elements(
                        element,
                        metabolite,
                        references_dict,
                        Reference,
                        MetaboliteReference,
                        'reference',
                        "reference_text",
                    )

                elif tag == "protein_associations":
                    proteins_dict = self._populate_with_2_layer_elements(
                        element,
                        metabolite,
                        proteins_dict,
                        Protein,
                        MetaboliteProtein,
                        'protein',
                    )

                else:  # feed in main metabolite table
                    setattr(metabolite, tag, element.text)

            self.session.add(metabolite)

            if (i + 1) % group_size:
                log.warning('committing')
                self.session.commit()

        self.session.commit()

[docs]    def get_metabolite_by_accession(self, hmdb_metabolite_accession: str) -> Optional[Metabolite]:
        """Query the constructed HMDB database and extract a metabolite object.

        :param hmdb_metabolite_accession: HMDB metabolite identifier

        Example:

        >>> import bio2bel_hmdb
        >>> manager = bio2bel_hmdb.Manager()
        >>> manager.get_metabolite_by_accession("HMDB00072")
        """
        return self.session.query(Metabolite).filter(Metabolite.accession == hmdb_metabolite_accession).one_or_none()

[docs]    def query_metabolite_associated_proteins(self, hmdb_metabolite_id: str) -> Optional[List[Protein]]:
        """Query the constructed HMDB database to get the metabolite associated protein relations for BEL enrichment

        :param hmdb_metabolite_id: HMDB metabolite identifier
        """
        metabolite = self.get_metabolite_by_accession(hmdb_metabolite_id)
        if metabolite is not None:
            return metabolite.proteins

[docs]    def query_metabolite_associated_diseases(self, hmdb_metabolite_id: str) -> List[Disease]:
        """Query the constructed HMDB database to get the metabolite associated disease relations for BEL enrichment

        :param hmdb_metabolite_id: HMDB metabolite identifier
        """
        metabolite = self.get_metabolite_by_accession(hmdb_metabolite_id)
        return metabolite.diseases

[docs]    def query_disease_associated_metabolites(self, disease_name: str) -> List[Metabolite]:
        """Query function that returns a list of metabolite-disease interactions, which are associated to a disease.

        :param disease_name: HMDB disease name
        """
        return self.session.query(Disease).filter(Disease.name == disease_name).one_or_none().metabolites

[docs]    def query_protein_associated_metabolites(self, uniprot_id):
        """Query function that returns a list of metabolite-disease interactions, which are associated to a disease.

        :param str uniprot_id: uniprot identifier of a protein for which the associated metabolite relations should be
                                outputted
        :rtype: list
        """
        return self.session.query(Protein).filter(Protein.uniprot_id == uniprot_id).one_or_none().metabolites

[docs]    def get_hmdb_accession(self):
        """Create a list of all HMDB metabolite identifiers present in the database.

        :rtype: list
        """
        accessions = self.session.query(Metabolite.accession).all()
        if not accessions:
            log.warning("Database not populated. Please populate database before calling this function")

        return [a[0] for a in accessions]  # if anybody knows a better way of querying for a flat list. Please change.

[docs]    def get_hmdb_diseases(self):
        """Create a list of all disease names present in the database.

        :rtype: list
        """
        accessions = self.session.query(Disease.name).all()
        if not accessions:
            log.warning("Database not populated. Please populate database before calling this function")

        return [a for a, in accessions]

    def _get_models(self, interaction_table):
        """Extracts all interactions from the many to many interaction table.

        :param type interaction_table: Relation table from the database model. (e.g. MetaboliteProteins)
        :rtype: query
        """
        return self.session.query(interaction_table).all()

    def get_metabolite_disease_interactions(self) -> List[MetaboliteDiseaseReference]:
        """Get all metabolite-disease-reference relations stored in the database."""
        interactions = self._get_models(MetaboliteDiseaseReference)
        return interactions

    def get_metabolite_protein_interactions(self) -> List[MetaboliteProtein]:
        """Get all metabolite-protein relations stored in the database."""
        interactions = self._get_models(MetaboliteProtein)
        return interactions

[docs]    def count_diseases(self) -> int:
        """Count the number of diseases in the database."""
        return self.session.query(Disease).count()

[docs]    def count_cellular_locations(self):
        """Count the number of cellular locations in the database."""
        return self.session.query(CellularLocation).count()

[docs]    def count_references(self):
        """Count the number of literature references in the database."""
        return self.session.query(Reference).count()

[docs]    def get_reference_by_pubmed_id(self, pubmed_id: str) -> Optional[Reference]:
        """Get a reference by its PubMed identifier if it exists.

        :param pubmed_id: The PubMed identifier to search
        """
        return self.session.query(Reference).filter(Reference.pubmed_id == pubmed_id).one_or_none()

[docs]    def count_proteins(self) -> int:
        """Count the number of proteins in the database."""
        return self.session.query(Protein).count()

[docs]    def count_biofunctions(self) -> int:
        """Count the number of biofunctions in the database."""
        return self.session.query(Biofunction).count()

[docs]    def count_metabolites(self) -> int:
        """Count the number of metabolites in the database."""
        return self._count_model(Metabolite)

[docs]    def count_pathways(self) -> int:
        """Count the number of pathways in the database."""
        return self._count_model(Pathway)

[docs]    def count_tissues(self) -> int:
        """Count the number of tissues in the database."""
        return self._count_model(Tissue)

[docs]    def summarize(self) -> Mapping[str, int]:
        """Summarize the contents of the database in a dictionary."""
        return dict(
            proteins=self.count_proteins(),
            diseases=self.count_diseases(),
            biofunctions=self.count_biofunctions(),
            references=self.count_references(),
            cellular_locations=self.count_cellular_locations(),
            metabolites=self.count_metabolites(),
            tissues=self.count_tissues(),
        )