# Source code for bio2bel_hmdb.enrich
# -*- coding: utf-8 -*-
"""
Enrich BEL graphs
=================
In the current build it is possible to enrich BEL graphs containing metabolites with associated
disease or protein information and to enrich BEL graphs containing disease or protein information with associated metabolites.
This can be done with the functions further explained in `BEL Serialization`_
.. _BEL Serialization: bel_serialization.html
2. Enriching BEL graphs
-----------------------
A BEL graph containing metabolites (represented using the `HMDB namespace`_) can be enriched with disease and protein information from HMDB.
.. _HMDB namespace: construct_namspaces.html
2.1 Metabolites-Proteins
~~~~~~~~~~~~~~~~~~~~~~~~
For a graph containing metabolites:
>>> enrich_metabolites_proteins(bel_graph, manager)
The result of this will be a BEL graph which now includes relations between the metabolites and proteins.
For a graph containing proteins (named using uniprot identifiers):
>>> enrich_proteins_metabolites(bel_graph, manager)
This will result in a BEL graph where the proteins are linked to associated metabolites.
2.2 Metabolites-Diseases
~~~~~~~~~~~~~~~~~~~~~~~~
For a graph containing metabolites:
>>> enrich_metabolites_diseases(bel_graph, manager)
The result of this will be a BEL graph which now includes relations between the metabolites and diseases.
For a graph containing diseases (named using HMDB identifiers):
>>> enrich_diseases_metabolites(bel_graph, manager)
This will result in a BEL graph where the diseases are linked to associated metabolites.
"""
import logging
from itertools import groupby
from typing import Optional

from pybel import BELGraph
from pybel.constants import (
    ABUNDANCE, ANNOTATIONS, ASSOCIATION, CITATION, CITATION_REFERENCE, CITATION_TYPE, CITATION_TYPE_PUBMED, EVIDENCE,
    FUNCTION, NAME, NAMESPACE, PATHOLOGY, PROTEIN, RELATION,
)
from pybel.struct.pipeline.decorators import in_place_transformation

from .manager import Manager
log = logging.getLogger(__name__)
def _check_namespaces(data, bel_function, bel_namespace):
    """Tell whether a node has the requested BEL function and namespace.

    :param data: node data, read through the ``FUNCTION`` and ``NAMESPACE`` keys
    :param bel_function: value that ``data[FUNCTION]`` has to equal
    :param bel_namespace: value that ``data[NAMESPACE]`` has to equal
    :return: ``True`` when both the function and the namespace match, ``False`` otherwise.
        A warning is logged when the function matches but the node carries a different namespace.
    """
    # Wrong function, or no namespace at all: reject silently.
    if data[FUNCTION] != bel_function or NAMESPACE not in data:
        return False

    namespace = data[NAMESPACE]
    if namespace == bel_namespace:
        return True

    log.warning("Unable to map namespace: %s", namespace)
    return False
# enrich proteins and metabolites
@in_place_transformation
def enrich_metabolites_proteins(graph: BELGraph, manager: Optional[Manager] = None):
    """Link the HMDB metabolites of a BEL graph to their associated proteins.

    Every node whose function is ``ABUNDANCE`` and whose namespace is ``'HMDB'`` is looked up by
    name with ``manager.query_metabolite_associated_proteins``. For each association returned, the
    protein is serialized to BEL, added to the graph and connected to the metabolite node by an
    ``ASSOCIATION`` edge annotated with the protein's name and type.

    :param graph: BEL graph to enrich; nodes and edges are added to it
    :param manager: manager used for the queries; a new ``Manager()`` is created when omitted
    """
    if manager is None:
        manager = Manager()

    # Walk a snapshot of the nodes, since the loop body adds nodes to the graph.
    for metabolite in list(graph):
        if not _check_namespaces(metabolite, ABUNDANCE, 'HMDB'):
            continue

        associations = manager.query_metabolite_associated_proteins(metabolite[NAME])
        if not associations:
            log.warning("Unable to find node: %s", metabolite)
            continue

        for association in associations:
            protein_node = graph.add_node_from_data(association.protein.serialize_to_bel())
            edge_attributes = {
                RELATION: ASSOCIATION,
                # No evidence text or citation is recorded for this kind of edge.
                EVIDENCE: None,
                CITATION: {
                    CITATION_TYPE: None,
                    CITATION_REFERENCE: None,
                },
                ANNOTATIONS: {
                    'name': association.protein.name,
                    'protein_type': association.protein.protein_type,
                },
            }
            graph.add_edge(protein_node, metabolite, attr_dict=edge_attributes)
@in_place_transformation
def enrich_proteins_metabolites(graph: BELGraph, manager: Optional[Manager] = None):
    """Link the UniProt proteins of a BEL graph to their associated HMDB metabolites.

    Every node whose function is ``PROTEIN`` and whose namespace is ``'UP'`` is looked up by name
    with ``manager.query_protein_associated_metabolites``. For each association returned, the
    metabolite is serialized to BEL, added to the graph and connected to the protein node by an
    ``ASSOCIATION`` edge annotated with the protein's name and type.

    :param graph: BEL graph to enrich; nodes and edges are added to it
    :param manager: manager used for the queries; a new ``Manager()`` is created when omitted
    """
    if manager is None:
        manager = Manager()

    # Walk a snapshot of the nodes, since the loop body adds nodes to the graph.
    for protein in list(graph):
        if not _check_namespaces(protein, PROTEIN, 'UP'):
            continue

        associations = manager.query_protein_associated_metabolites(protein[NAME])
        if associations is None:
            log.warning("Unable to find node: %s", protein)
            continue

        for association in associations:
            metabolite_node = graph.add_node_from_data(association.metabolite.serialize_to_bel())
            edge_attributes = {
                RELATION: ASSOCIATION,
                # No evidence text or citation is recorded for this kind of edge.
                EVIDENCE: None,
                CITATION: {
                    CITATION_TYPE: None,
                    CITATION_REFERENCE: None,
                },
                ANNOTATIONS: {
                    'name': association.protein.name,
                    'protein_type': association.protein.protein_type,
                },
            }
            graph.add_edge(metabolite_node, protein, attr_dict=edge_attributes)
# enrich diseases and metabolites
@in_place_transformation
def enrich_metabolites_diseases(graph: BELGraph, manager: Optional[Manager] = None):
    """Link the HMDB metabolites of a BEL graph to their associated diseases.

    Every node whose function is ``ABUNDANCE`` and whose namespace is ``'HMDB'`` is looked up by
    name with ``manager.query_metabolite_associated_diseases``. Consecutive associations that share
    the same disease are merged into one ``ASSOCIATION`` edge between the disease node and the
    metabolite node: the first PubMed identifier of the run becomes the edge citation and the
    remaining ones are stored under the ``'additional_references'`` annotation.

    :param graph: BEL graph to enrich; nodes and edges are added to it
    :param manager: manager used for the queries; a new ``Manager()`` is created when omitted
    """
    if manager is None:
        manager = Manager()

    # Walk a snapshot of the nodes, since the loop body adds nodes to the graph.
    for data in list(graph):
        if not _check_namespaces(data, ABUNDANCE, 'HMDB'):
            continue

        metabolite_disease_interactions = manager.query_metabolite_associated_diseases(data[NAME])
        if metabolite_disease_interactions is None:
            log.warning("Unable to find node: %s", data)
            continue

        # groupby() only merges *adjacent* associations with an equal disease, which is the same
        # grouping the previous index-based loop produced, but without relying on a caught
        # IndexError to detect the end of the data.
        for disease, associations in groupby(metabolite_disease_interactions, key=lambda a: a.disease):
            # Every group holds at least one association, so references[0] always exists.
            references = [association.reference.pubmed_id for association in associations]

            disease_tuple = graph.add_node_from_data(disease.serialize_to_bel())
            graph.add_edge(disease_tuple, data, attr_dict={
                RELATION: ASSOCIATION,
                EVIDENCE: None,
                CITATION: {
                    CITATION_TYPE: CITATION_TYPE_PUBMED,
                    CITATION_REFERENCE: references[0],
                },
                ANNOTATIONS: {
                    'omim_id': disease.omim_id,
                    'additional_references': references[1:],
                },
            })
@in_place_transformation
def enrich_diseases_metabolites(graph: BELGraph, manager: Optional[Manager] = None):
    """Link the HMDB diseases of a BEL graph to their associated HMDB metabolites.

    Every node whose function is ``PATHOLOGY`` and whose namespace is ``'HMDB_D'`` is looked up by
    name with ``manager.query_disease_associated_metabolites``. Consecutive associations that share
    the same metabolite are merged into one ``ASSOCIATION`` edge between the metabolite node and
    the disease node: the first PubMed identifier of the run becomes the edge citation and the
    remaining ones are stored under the ``'additional_references'`` annotation.

    :param graph: BEL graph to enrich; nodes and edges are added to it
    :param manager: manager used for the queries; a new ``Manager()`` is created when omitted
    """
    if manager is None:
        manager = Manager()

    # Walk a snapshot of the nodes, since the loop body adds nodes to the graph.
    for data in list(graph):
        if not _check_namespaces(data, PATHOLOGY, 'HMDB_D'):
            continue

        disease_metabolite_interactions = manager.query_disease_associated_metabolites(data[NAME])
        if not disease_metabolite_interactions:
            log.warning("Unable to find node: %s", data)
            continue

        # groupby() only merges *adjacent* associations with an equal metabolite, which is the
        # same grouping the previous index-based loop produced, but without relying on a caught
        # IndexError to detect the end of the data.
        for metabolite, group in groupby(disease_metabolite_interactions, key=lambda a: a.metabolite):
            associations = list(group)
            # Every group holds at least one association, so both [0] lookups below are safe.
            references = [association.reference.pubmed_id for association in associations]

            # Add the metabolite node and connect it to the disease node.
            metabolite_tuple = graph.add_node_from_data(metabolite.serialize_to_bel())
            graph.add_edge(metabolite_tuple, data, attr_dict={
                RELATION: ASSOCIATION,
                EVIDENCE: None,
                CITATION: {
                    CITATION_TYPE: CITATION_TYPE_PUBMED,
                    CITATION_REFERENCE: references[0],
                },
                ANNOTATIONS: {
                    # The OMIM id is taken from the first association of the run.
                    'omim_id': associations[0].disease.omim_id,
                    'additional_references': references[1:],
                },
            })