# -*- coding: utf-8 -*-
"""Bio2BEL CTD Manager."""
import logging
from typing import List, Mapping, Optional
import pyctd
import pyctd.manager
import pyctd.manager.database
from pyctd.manager.database import DbManager
from pyctd.manager.query import QueryManager
from pyctd.manager.table import get_table_configurations
from sqlalchemy.ext.declarative import DeclarativeMeta
from tqdm import tqdm
import bio2bel_mesh
import pybel
from bio2bel import AbstractManager
from bio2bel.manager.bel_manager import BELManagerMixin
from bio2bel.manager.flask_manager import FlaskMixin
from bio2bel.utils import get_connection
from pybel import BELGraph
from pybel.constants import IDENTIFIER, NAME, NAMESPACE
from .constants import DATA_DIR, MODULE_NAME
from .enrichment_utils import add_chemical_gene_interaction
from .models import Base, ChemGeneIxn, Chemical, Disease, Gene, Pathway
__all__ = [
'Manager'
]
log = logging.getLogger(__name__)
def _get_connection_string(connection):
return get_connection(module_name=MODULE_NAME, connection=connection)
# Monkey patch PyCTD connection loader
pyctd.manager.database.get_connection_string = _get_connection_string
_exclude_tables = {
'exposure_event',
}
class _PyCTDManager(QueryManager, DbManager):
# Override the directory in which data gets stored
pyctd_data_dir = DATA_DIR
def _get_urls():
return [
pyctd.manager.defaults.url_base + pyctd.manager.table_conf.tables[model]['file_name']
for model in pyctd.manager.table_conf.tables
]
[docs]class Manager(AbstractManager, BELManagerMixin, FlaskMixin, _PyCTDManager):
"""Bio2BEL manager for the CTD."""
module_name = MODULE_NAME
flask_admin_models = [Gene, Chemical, Disease, Pathway, ChemGeneIxn]
# Compensate for some weird structuring of PyCTD code
tables = get_table_configurations()
@property
def _base(self) -> DeclarativeMeta:
return Base
[docs] def is_populated(self) -> bool:
"""Check if the database is already populated."""
return 0 < self.count_chemical_gene_interactions()
[docs] def populate(self, urls=None, force_download=False, only_tables=None, exclude_tables=None) -> None:
"""Updates the CTD database
1. downloads all files from CTD
2. drops all tables in database
3. creates all tables in database
4. import all data from CTD files
:param iter[str] urls: An iterable of URL strings
:param bool force_download: force method to download
"""
if not urls:
urls = _get_urls()
log.info('downloading CTD database from %s', urls)
self.download_urls(urls=urls, force_download=force_download)
log.info('importing tables')
self.import_tables(only_tables=only_tables, exclude_tables=(exclude_tables or _exclude_tables))
[docs] def count_genes(self) -> int:
"""Count the genes in the database."""
return self._count_model(Gene)
[docs] def list_chemicals(self) -> List[Chemical]:
"""List all chemicals."""
return self._list_model(Chemical)
[docs] def count_chemicals(self) -> int:
"""Count the chemicals in the database."""
return self._count_model(Chemical)
[docs] def list_chemical_gene_interactions(self) -> List[ChemGeneIxn]:
"""List all chemical-gene interactions."""
return self._list_model(ChemGeneIxn)
[docs] def count_chemical_gene_interactions(self) -> int:
"""Count the chemical-gene interactions in the database."""
return self._count_model(ChemGeneIxn)
[docs] def count_pathways(self) -> int:
"""Count the pathways in the database."""
return self._count_model(Pathway)
[docs] def count_diseases(self) -> int:
"""Count the diseases in the database."""
return self._count_model(Disease)
[docs] def summarize(self) -> Mapping[str, int]:
"""Return a summary dictionary of the database."""
return dict(
chemicals=self.count_chemicals(),
genes=self.count_genes(),
chemical_gene_interactions=self.count_chemical_gene_interactions(),
diseases=self.count_diseases(),
pathways=self.count_pathways(),
)
[docs] def get_chemical_by_mesh(self, mesh_id: str) -> Optional[Chemical]:
"""Get a chemical by its MeSH identifier, if it exists.
:param mesh_id: A MeSH identifier of a chemical
"""
return self.session.query(Chemical).filter(Chemical.chemical_id == mesh_id).one_or_none()
[docs] def get_chemical_by_cas(self, cas_rn: str) -> Optional[Chemical]:
"""Get a chemical by its CAS Registry Number, if it exists.
:param str cas_rn: A CAS Registry Number
:rtype: Optional[Chemical]
"""
return self.session.query(Chemical).filter(Chemical.cas_rn == cas_rn).one_or_none()
[docs] def get_gene_by_entrez_id(self, entrez_id: str) -> Optional[Gene]:
"""Get a gene by its Entrez Gene identifier, if it exists.
:param entrez_id: An Entrez Gene identifier of a gene
:rtype: Optional[Gene]
"""
return self.session.query(Gene).filter(Gene.gene_id == entrez_id).one_or_none()
[docs] def get_interaction_by_id(self, ixn_id: int) -> Optional[ChemGeneIxn]:
"""Get an interaction by its database identifier
:param ixn_id: An interaction database identifier
"""
return self.session.query(ChemGeneIxn).filter(ChemGeneIxn.id == ixn_id).one_or_none()
[docs] def enrich_graph_chemical(self, graph: BELGraph, mesh_id: str) -> None:
"""Enrich the BEL graph with chemical-gene interactions for the given chemical.
:param graph: A BEL graph
:param mesh_id: A MeSH identifier of a chemical
"""
chemical = self.get_chemical_by_mesh(mesh_id)
if chemical is None:
return
for ixn in chemical.gene_interactions:
add_chemical_gene_interaction(graph, ixn)
[docs] def enrich_graph_gene(self, graph: BELGraph, entrez_id: str) -> None:
"""Enrich the BEL graph with chemical-gene interactions for the given gene.
:param graph: A BEL graph
:param entrez_id: An Entrez Gene identifier of a gene
"""
gene = self.get_gene_by_entrez_id(entrez_id)
if gene is None:
return
for ixn in gene.chemical_interactions:
add_chemical_gene_interaction(graph, ixn)
[docs] def enrich_graph_genes(self, graph: BELGraph) -> None:
"""Enrich the BEL graph with chemical-gene interactions for all Entrez genes.
:param graph: A BEL graph
"""
for gene_node, data in graph.nodes(data=True):
namespace = data.get(NAMESPACE)
if namespace not in {'EG', 'EGID', 'ENTREZ'}:
continue
identifier = data.get(IDENTIFIER)
name = data.get(NAME)
if identifier is not None:
self.enrich_graph_gene(graph, identifier)
elif name is not None:
self.enrich_graph_gene(graph, name)
else:
raise KeyError
[docs] def enrich_chemicals(self, graph: BELGraph) -> None:
"""Find chemicals that can be mapped and enriched with the CTD.
:param pybel.BELGraph graph: A BEL graph
"""
for chemical_node, data in graph.nodes(data=True):
namespace = data.get(NAMESPACE)
if namespace not in {'MESHC', 'MESH'}:
continue
identifier = data.get(IDENTIFIER)
name = data.get(NAME)
if identifier is not None:
self.enrich_graph_chemical(graph, identifier)
elif name is not None:
self.enrich_graph_chemical(graph, name)
else:
raise KeyError
[docs] def to_bel(self) -> BELGraph:
"""Convert all possible aspects of the database to BEL.
.. warning:: Not complete!
To do:
- add namespaces
- use cursors
- multiprocessing
"""
graph = BELGraph(name='CTD', version='1.0.0')
mesh_manager = bio2bel_mesh.Manager(engine=self.engine, session=self.session)
mesh_manager.add_namespace_to_graph(graph)
for chem_gene_ixn in tqdm(self.list_chemical_gene_interactions(),
total=self.count_chemical_gene_interactions()):
add_chemical_gene_interaction(graph, chem_gene_ixn)
return graph