Source code for bio2bel_reactome.manager

# -*- coding: utf-8 -*-

"""This module populates the tables of bio2bel_reactome."""

import logging
from collections import defaultdict
from typing import Dict, List, Mapping, Optional, Set

import pandas as pd
from tqdm import tqdm

from bio2bel.compath import CompathManager
from pyobo import get_name_id_mapping
from .constants import MODULE_NAME, SPECIES_REMAPPING
from .models import Base, Chemical, Pathway, Protein, Species, chemical_pathway, protein_pathway
from .parsers.entity_pathways import get_procesed_chemical_pathways_df, get_procesed_proteins_pathways_df
from .parsers.pathway_hierarchy import get_pathway_hierarchy_df, parse_pathway_hierarchy
from .parsers.pathway_names import get_pathway_names_df, parse_pathway_names

logger = logging.getLogger(__name__)

__all__ = [
    'Manager',
]


[docs]class Manager(CompathManager): """Protein-pathway and chemical-pathway memberships.""" module_name = MODULE_NAME protein_model = Protein _base = Base edge_model = [protein_pathway, chemical_pathway] namespace_model = pathway_model = Pathway flask_admin_models = [Pathway, Protein, Species, Chemical] has_hierarchy = True # Indicates that this manager can handle hierarchies with the Pathway Model def __init__(self, *args, **kwargs) -> None: # noqa: D107 super().__init__(*args, **kwargs) # Global dictionary self.uniprot_id_to_protein: Dict[str, Protein] = {} self.chebi_id_to_chemical: Dict[str, Chemical] = {}
[docs] def summarize(self) -> Mapping[str, int]: """Summarize the database.""" return { 'pathways': self.count_pathways(), 'proteins': self.count_proteins(), 'chemicals': self.count_chemicals(), 'species': self.count_species(), }
[docs] def count_chemicals(self) -> int: """Count the chemicals in the database.""" return self.session.query(Chemical).count()
[docs] def count_species(self) -> int: """Count the species in the database.""" return self.session.query(Species).count()
[docs] def get_gene_sets(self, only_human: bool = False) -> Mapping[str, Set[str]]: """Return pathway - genesets mapping.""" if only_human: pathways = self.get_human_pathways() else: pathways = self.list_pathways() return { pathway.name: { protein.hgnc_symbol for protein in pathway.proteins if protein.hgnc_symbol } for pathway in pathways if pathway.proteins }
[docs] def get_or_create_pathway( self, *, reactome_id: str, name: str, species: Species, chemicals: Optional[List[Chemical]], ) -> Pathway: """Get a pathway from the database or creates it. :param reactome_id: pathway identifier :param name: name of the pathway :param species: Species object :param chemicals: An optional list of chemicals that belong too this pathway """ pathway = self.get_pathway_by_id(reactome_id) if pathway is None: pathway = Pathway( identifier=reactome_id, name=name, species=species, chemicals=chemicals, ) self.session.add(pathway) return pathway
[docs] def get_or_create_chemical(self, *, chebi_id: str, chebi_name: str) -> Chemical: """Get a Chemical from the database or creates it. :param chebi_id: ChEBI identifier :param chebi_name: ChEBI name """ chemical = self.get_chemical_by_chebi_id(chebi_id) if chemical is None: chemical = Chemical( chebi_id=chebi_id, name=chebi_name, ) self.session.add(chemical) return chemical
[docs] def get_or_create_species(self, *, taxonomy_id: str, name: str) -> Species: """Get a Species from the database or creates it.""" species = self.get_species_by_name(name) if species is None: species = Species(taxonomy_id=taxonomy_id, name=name) self.session.add(species) return species
[docs] def get_or_create_protein( self, uniprot_id: str, hgnc_symbol: Optional[str] = None, hgnc_id: Optional[str] = None, ) -> Protein: """Get a protein from the database or creates it. :param uniprot_id: pathway identifier :param hgnc_symbol: name of the pathway :param hgnc_id: Species object """ protein = self.get_protein_by_uniprot_id(uniprot_id) if protein is not None: return protein protein = self.uniprot_id_to_protein.get(uniprot_id) if protein is not None: self.session.add(protein) return protein protein = self.uniprot_id_to_protein[uniprot_id] = Protein( uniprot_id=uniprot_id, hgnc_symbol=hgnc_symbol, hgnc_id=hgnc_id, ) self.session.add(protein) return protein
[docs] def get_species_by_name(self, species_name: str) -> Optional[Species]: """Get a Species by its species_name. :param species_name: name """ return self.session.query(Species).filter(Species.name == species_name).one_or_none()
[docs] def get_pathway_names_to_ids(self, only_human: bool = False): """Return a dictionary of pathway names to ids. :rtype: dict[str,str] """ if only_human: pathways = self.get_human_pathways() else: pathways = self.list_pathways() return { pathway.name: pathway.identifier for pathway in pathways }
[docs] def get_pathway_parent_by_id(self, reactome_id: str) -> Optional[Pathway]: """Get parent pathway by its reactome id. :param reactome_id: reactome identifier """ pathway = self.get_pathway_by_id(reactome_id) if not pathway or not pathway.parent: return None return pathway.parent
[docs] def get_top_hiearchy_parent_by_id(self, reactome_id: str) -> Optional[Pathway]: """Get the oldest pathway at the top of the hierarchy a pathway by its reactome id. :param reactome_id: reactome identifier """ pathway = self.get_pathway_by_id(reactome_id) if not pathway.parent: return pathway return self.get_top_hiearchy_parent_by_id(pathway.parent.identifier)
[docs] def get_all_top_hierarchy_pathways(self) -> List[Pathway]: """Get all pathways without a parent (top hierarchy).""" all_pathways = self.list_pathways() return [ pathway for pathway in all_pathways if not pathway.parent_id ]
[docs] def get_human_pathways(self) -> List[Pathway]: """Get human pathways.""" return self.get_pathways_by_species('Homo sapiens')
[docs] def get_pathways_by_species(self, species_name: str) -> Optional[List[Pathway]]: """Get pathways by species.""" filtered_species = self.session.query(Species).filter(Species.name == species_name).one_or_none() if not filtered_species: return None return filtered_species.pathways
[docs] def get_chemical_by_chebi_id(self, chebi_id: str) -> Optional[Chemical]: """Get chemical by ChEBI id.""" return self.session.query(Chemical).filter(Chemical.chebi_id == chebi_id).one_or_none()
[docs] def get_protein_by_uniprot_id(self, uniprot_id: str) -> Optional[Protein]: """Get protein by UniProt id.""" return self.session.query(Protein).filter(Protein.uniprot_id == uniprot_id).one_or_none()
"""Custom Methods to Populate the DB""" def _populate_pathways( self, chemical_mapping: Mapping[str, List[Chemical]], url: Optional[str] = None, ) -> None: """Populate the pathway table. :param url: url from pathway table file """ df = get_pathway_names_df(url=url) pathways_dict, species_set = parse_pathway_names(df) species_name_to_id = get_name_id_mapping('ncbitaxon') species_name_to_model = {} for species_name in tqdm(species_set, desc='populating species'): species_name = SPECIES_REMAPPING.get(species_name, species_name) species_name_to_model[species_name] = self.get_or_create_species( name=species_name, taxonomy_id=species_name_to_id[species_name], ) for reactome_id, (name, species_name) in tqdm(pathways_dict.items(), desc='populating pathways'): species_name = SPECIES_REMAPPING.get(species_name, species_name) pathway = self.get_or_create_pathway( reactome_id=reactome_id, name=name, species=species_name_to_model[species_name], chemicals=chemical_mapping.get(reactome_id, []), ) self.session.add(pathway) self.session.commit() def _pathway_hierarchy(self, url: Optional[str] = None) -> None: """Links pathway models through hierarchy. :param url: url from pathway hierarchy file """ df = get_pathway_hierarchy_df(url=url) pathways_hierarchy = parse_pathway_hierarchy(df) for parent_id, child_id in tqdm(pathways_hierarchy, desc='populating pathway hierarchy'): if parent_id is None: logger.warning('parent id is None') continue if child_id is None: logger.warning('child id is None') continue parent = self.get_pathway_by_id(parent_id) child = self.get_pathway_by_id(child_id) parent.children.append(child) self.session.commit() def _pathway_protein(self, url: Optional[str] = None) -> None: """Populate UniProt tables. :param url: url from pathway protein file """ pathways_proteins_df = get_procesed_proteins_pathways_df(url=url) missing_reactome_ids = set() missing_hgnc_info = set() protein_info_df = pathways_proteins_df[ ['uniprot_id', 'uniprot_accession', 'hgnc_id', 'hgnc_symbol']].drop_duplicates() it = tqdm(protein_info_df.values, total=len(protein_info_df.index), desc='populating proteins') for uniprot_id, uniprot_accession, hgnc_id, hgnc_symbol in it: self.uniprot_id_to_protein[uniprot_id] = Protein( uniprot_id=uniprot_id, uniprot_accession=uniprot_accession, hgnc_id=hgnc_id, hgnc_symbol=hgnc_symbol, ) it = tqdm( pathways_proteins_df[['uniprot_id', 'reactome_id']].values, total=len(pathways_proteins_df.index), desc='populating proteins-pathway relations', ) for uniprot_id, reactome_id in it: if uniprot_id is None: logger.debug('uniprot_id is none') continue protein = self.uniprot_id_to_protein[uniprot_id] pathway = self.get_pathway_by_id(reactome_id) if pathway is None: if reactome_id not in missing_reactome_ids: it.write(f'protein/pathway mapping: could not find reactome:{reactome_id}') missing_reactome_ids.add(reactome_id) continue if pathway not in protein.pathways: protein.pathways.append(pathway) self.session.commit() if missing_reactome_ids: logger.warning('missing %d reactome ids', len(missing_reactome_ids)) if missing_hgnc_info: logger.warning('missing %d hgncs', len(missing_hgnc_info)) def _get_chemical_mapping(self, url: Optional[str] = None) -> Mapping[str, List[Chemical]]: """Populate ChEBI tables. :param url: url from pathway chemical file """ chemical_pathways_df = get_procesed_chemical_pathways_df(url=url) chemicals_df = chemical_pathways_df[['chebi_id', 'chebi_name']].drop_duplicates() it = tqdm(chemicals_df.values, total=len(chemicals_df.index), desc='populating chemicals') chebi_id_to_chemical = {} for chebi_id, chebi_name in it: if pd.isna(chebi_id): continue chebi_id_to_chemical[chebi_id] = Chemical(chebi_id=chebi_id, name=chebi_name) rv = defaultdict(list) _slim_chemical_pathways_df = chemical_pathways_df[['chebi_id', 'reactome_id']].drop_duplicates() it = tqdm( _slim_chemical_pathways_df.values, total=len(_slim_chemical_pathways_df.index), desc='populating chemical/reactome', ) for chebi_id, reactome_id in it: chemical = chebi_id_to_chemical[chebi_id] rv[reactome_id].append(chemical) return dict(rv)
[docs] def populate( self, pathways_path: Optional[str] = None, pathways_hierarchy_path: Optional[str] = None, pathways_proteins_path: Optional[str] = None, pathways_chemicals_path: Optional[str] = None, ) -> None: """Populate all tables. :param pathways_path: url from pathway table file :param pathways_hierarchy_path: url from pathway hierarchy file :param pathways_proteins_path: url from pathway protein file :param pathways_chemicals_path: url from pathway chemical file """ chemical_mapping = self._get_chemical_mapping(url=pathways_chemicals_path) self._populate_pathways(url=pathways_path, chemical_mapping=chemical_mapping) self._pathway_hierarchy(url=pathways_hierarchy_path) self._pathway_protein(url=pathways_proteins_path)
def _add_admin(self, app, **kwargs): from flask_admin import Admin from flask_admin.contrib.sqla import ModelView class PathwayView(ModelView): """Pathway view in Flask-admin.""" column_searchable_list = ( Pathway.identifier, Pathway.name, ) class ProteinView(ModelView): """Protein view in Flask-admin.""" column_searchable_list = ( Protein.hgnc_symbol, Protein.uniprot_id, Protein.hgnc_id, ) class SpeciesView(ModelView): """Species view in Flask-admin.""" column_searchable_list = ( Species.taxonomy_id, Species.name, ) class ChemicalView(ModelView): """Chemical view in Flask-admin.""" column_searchable_list = ( Chemical.chebi_id, Chemical.name, ) admin = Admin(app, **kwargs) admin.add_view(PathwayView(Pathway, self.session)) admin.add_view(ProteinView(Protein, self.session)) admin.add_view(ChemicalView(Chemical, self.session)) admin.add_view(SpeciesView(Species, self.session)) return admin