# -*- coding: utf-8 -*-
"""KEGG database models."""
from typing import List, Optional, Set
from sqlalchemy import Column, ForeignKey, Integer, String, Table
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import relationship
import pybel.dsl
from .constants import HGNC, KEGG
# Declarative base class from which every model in this module derives.
Base = declarative_base()

# Every table name is built from one shared prefix.
TABLE_PREFIX = 'kegg'
PATHWAY_TABLE_NAME = f'{TABLE_PREFIX}_pathway'
PATHWAY_TABLE_HIERARCHY = f'{TABLE_PREFIX}_pathway_hierarchy'
PROTEIN_TABLE_NAME = f'{TABLE_PREFIX}_protein'
PROTEIN_PATHWAY_TABLE = f'{TABLE_PREFIX}_protein_pathway'

# Association table for the many-to-many link between proteins and pathways.
# Both foreign keys together form the composite primary key.
protein_pathway = Table(
    PROTEIN_PATHWAY_TABLE,
    Base.metadata,
    Column(
        'protein_id',
        Integer,
        ForeignKey(f'{PROTEIN_TABLE_NAME}.id'),
        primary_key=True,
    ),
    Column(
        'pathway_id',
        Integer,
        ForeignKey(f'{PATHWAY_TABLE_NAME}.id'),
        primary_key=True,
    ),
)
class Pathway(Base):  # type: ignore
    """Pathway Table.

    Each row is one KEGG pathway. Member proteins are reachable through
    ``proteins``; the same relationship adds a ``pathways`` backref on
    ``Protein``.
    """

    __tablename__ = PATHWAY_TABLE_NAME

    id = Column(Integer, primary_key=True)

    kegg_id = Column(String(255), unique=True, nullable=False, index=True, doc='KEGG id of the pathway')
    name = Column(String(255), doc='pathway name')

    # Many-to-many link to proteins through the association table.
    proteins = relationship(
        'Protein',
        secondary=protein_pathway,
        backref='pathways'
    )

    # NOTE(review): presumably the BEL namespace encoding for biological
    # processes -- confirm against the consumer of this attribute.
    bel_encoding = 'B'

    def __repr__(self):
        """Return a debug representation with the KEGG id and the name."""
        return f'Pathway(kegg_id={self.kegg_id}, name="{self.name}")'

    def __str__(self):
        """Return name."""
        return str(self.name)

    def serialize_to_pathway_node(self) -> pybel.dsl.BiologicalProcess:
        """Serialize to PyBEL node data dictionary.

        :return: a biological process node in the KEGG namespace, named after
            the pathway and identified by its KEGG id
        """
        return pybel.dsl.BiologicalProcess(
            namespace=KEGG,
            name=str(self.name),
            identifier=str(self.kegg_id)
        )

    def get_gene_set(self) -> Set[str]:
        """Return the genes associated with the pathway (gene set).

        Note this function restricts to HGNC symbols genes: proteins without
        an HGNC symbol are skipped.

        :return: the HGNC symbols of the proteins in this pathway
        """
        return {
            protein.hgnc_symbol
            for protein in self.proteins
            if protein.hgnc_symbol
        }

    @property
    def resource_id(self) -> str:
        """Return kegg identifier."""
        return self.kegg_id

    @property
    def url(self) -> str:
        """Return url pointing to kegg pathway.

        The leading ``path:`` and ``hsa`` prefixes, when present, are removed
        from the KEGG id before it is appended to the ``map`` url, e.g.
        ``path:hsa04010`` gives ``...pathway+map04010``.
        """
        pathway_number = self.kegg_id
        # Remove the prefixes explicitly and in order. ``str.strip`` takes a
        # *set of characters* and trims both ends, so it is not a prefix
        # remover and could eat characters that belong to the identifier.
        for prefix in ('path:', 'hsa'):
            if pathway_number.startswith(prefix):
                pathway_number = pathway_number[len(prefix):]
        return f'http://www.kegg.jp/dbget-bin/www_bget?pathway+map{pathway_number}'
class Protein(Base):  # type: ignore
    """Protein Table.

    Each row is one KEGG protein entry together with its UniProt and HGNC
    cross-references.
    """

    __tablename__ = PROTEIN_TABLE_NAME

    id = Column(Integer, primary_key=True)

    kegg_id = Column(String(255), nullable=False, index=True, doc='KEGG id of the protein')
    uniprot_id = Column(String(255), doc='uniprot id of the protein (there could be more than one)')
    hgnc_id = Column(String(255), doc='hgnc id of the protein')
    hgnc_symbol = Column(String(255), doc='hgnc symbol of the protein')

    def __repr__(self):
        """Return a debug representation listing all identifiers."""
        return f'Protein(kegg_id={self.kegg_id}, ' \
               f'uniprot_id={self.uniprot_id}, hgnc_id={self.hgnc_id}, hgnc_symbol={self.hgnc_symbol})'

    def __str__(self):
        """Return HGNC symbol."""
        return str(self.hgnc_symbol)

    def to_pybel(self) -> pybel.dsl.Protein:
        """Serialize to PyBEL node data dictionary.

        :return: a protein node in the HGNC namespace, named by the HGNC
            symbol and identified by the HGNC id
        """
        return pybel.dsl.Protein(
            namespace=HGNC,
            name=self.hgnc_symbol,
            identifier=str(self.hgnc_id)
        )

    def get_uniprot_ids(self) -> Optional[List[str]]:
        """Return a list of uniprot ids.

        :return: the whitespace-separated entries of ``uniprot_id``, or
            ``None`` when ``uniprot_id`` is empty or unset
        """
        if not self.uniprot_id:
            return None

        # ``split()`` with no argument collapses runs of whitespace and
        # ignores leading/trailing whitespace, so no empty-string ids are
        # produced (``split(" ")`` would yield them).
        return self.uniprot_id.split()

    def get_pathways_ids(self) -> Set[str]:
        """Return the pathways associated with the protein.

        :return: the KEGG ids of every pathway reachable through the
            ``pathways`` attribute
        """
        return {
            pathway.kegg_id
            for pathway in self.pathways
        }