Source code for bio2bel.sources.tfregulons

# -*- coding: utf-8 -*-

"""Exporter for TFregulons."""

import logging
from functools import lru_cache
from typing import Set

import pandas as pd
from pyobo import get_name_id_mapping

import pybel.dsl
from pybel import BELGraph
from .. import ensure_path

# Module-level logger, named after this module
logger = logging.getLogger(__name__)

#: The name of this source module
MODULE = 'tfregulons'
#: The date-stamped release of the TFregulons consensus table
VERSION = '20180915'
#: The download location of the zipped consensus table for :data:`VERSION`
URL = (
    'https://github.com/saezlab/DoRothEA/blob/master/data/'
    f'TFregulons/consensus/table/database_normal_{VERSION}.csv.zip?raw=true'
)


@lru_cache()
def get_df() -> pd.DataFrame:
    """Get the TFregulons dataframe.

    The file at :data:`URL` is resolved to a local path with ``ensure_path`` and
    then parsed by :func:`_read_df`. The result is memoized with
    :func:`functools.lru_cache`, so every call returns the *same* dataframe
    object; callers should derive new frames rather than modify it in place.

    :return: The pre-processed TFregulons dataframe
    """
    path = ensure_path(MODULE, URL)
    return _read_df(path)


def _read_df(path) -> pd.DataFrame:
    """Load the TFregulons table and annotate it with HGNC identifiers.

    :param path: The location of the zipped CSV file, passed to :func:`pandas.read_csv`
    :return: A dataframe containing the columns ``tf_hgnc_symbol``, ``target_hgnc_symbol``,
     ``effect``, ``score``, ``pmids``, ``tf_hgnc_id``, and ``target_hgnc_id``. Only rows
     with a score of A, B, or C are kept, and rows whose transcription factor or target
     symbol can not be mapped to an HGNC identifier are dropped (with a warning).
    """
    df = pd.read_csv(
        path,
        compression='zip',
        usecols=['TF', 'target', 'effect', 'score', 'pubmedID_from_curated_resources'],
    )
    df = df.rename(
        columns={
            'pubmedID_from_curated_resources': 'pmids',
            'TF': 'tf_hgnc_symbol',
            'target': 'target_hgnc_symbol',
        },
    )
    # There are also D and E scores, which are discarded. The explicit copy makes the
    # column assignments below operate on an independent frame instead of on a slice
    # of the frame that was read (which triggers pandas' SettingWithCopyWarning).
    df = df[df['score'].isin(set('ABC'))].copy()

    hgnc_name_to_id = get_name_id_mapping('hgnc')
    df['tf_hgnc_id'] = df['tf_hgnc_symbol'].map(hgnc_name_to_id.get)
    df['target_hgnc_id'] = df['target_hgnc_symbol'].map(hgnc_name_to_id.get)

    tf_missing_id = df['tf_hgnc_id'].isna()
    if tf_missing_id.any():
        missing_tf_symbols = df.loc[tf_missing_id, 'tf_hgnc_symbol'].unique()
        logger.warning('missing HGNC id for %d transcription factors', len(missing_tf_symbols))
        df = df[~tf_missing_id]

    # Computed after the TF filter so the mask lines up with the remaining rows
    target_missing_id = df['target_hgnc_id'].isna()
    if target_missing_id.any():
        missing_target_symbols = df.loc[target_missing_id, 'target_hgnc_symbol'].unique()
        logger.warning('missing HGNC id for %d targets', len(missing_target_symbols))
        df = df[~target_missing_id]

    return df


def get_hgnc_ids(graph: BELGraph) -> Set[str]:
    """Get HGNC identifiers for nodes in the graph.

    :param graph: A BEL graph
    :return: The identifiers of all nodes in the graph that are instances of
     :class:`pybel.dsl.CentralDogma` and whose namespace is ``hgnc`` (compared
     case-insensitively)
    """
    return {
        node.identifier
        for node in graph
        if isinstance(node, pybel.dsl.CentralDogma) and node.namespace.lower() == 'hgnc'
    }


def get_bel() -> BELGraph:
    """Get the entirety of TFregulons as BEL.

    :return: A new BEL graph named ``TFRegulons`` to which every row of the
     dataframe from :func:`get_df` has been added with :func:`_add_rows`
    """
    graph = BELGraph(name='TFRegulons')
    df = get_df()
    _add_rows(df, graph)
    return graph


def enrich_graph(graph: BELGraph) -> None:
    """Enrich a graph with transcription factors affecting the genes/rnas/proteins in the graph.

    Only the TFregulons rows whose target HGNC identifier already appears in the
    graph (see :func:`get_hgnc_ids`) are added. The graph is modified in place.

    :param graph: The BEL graph to enrich
    """
    hgnc_ids = get_hgnc_ids(graph)
    df = get_df()
    # Boolean indexing produces a new frame, so the cached frame from get_df() is untouched
    df = df[df['target_hgnc_id'].isin(hgnc_ids)]
    _add_rows(df, graph)


def _add_rows(df: pd.DataFrame, graph: BELGraph) -> None:
    """Add the transcription factor-target relations in the dataframe to the graph.

    For each row with a non-zero effect and for each of its citations, two edges are
    added: one from the complex of the TF protein and the target gene to the target RNA
    (directly increases/decreases) and one from the TF protein to the target RNA
    (increases/decreases). An effect of ``1`` is treated as an increase and any other
    non-zero effect as a decrease. The transcription edge from the target gene to the
    target RNA is added as well.

    :param df: A dataframe with the columns ``effect``, ``tf_hgnc_id``, ``tf_hgnc_symbol``,
     ``target_hgnc_id``, and ``target_hgnc_symbol`` and optionally ``pmids``
    :param graph: The BEL graph to which the edges are added (modified in place)
    """
    evidence = 'From TFregulons'

    for _, row in df.iterrows():
        effect = row['effect']
        if effect == 0:
            continue  # no binding. Could add negative BEL later

        tf_protein = pybel.dsl.Protein(
            namespace='hgnc',
            identifier=row['tf_hgnc_id'],
            name=row['tf_hgnc_symbol'],
        )
        target_rna = pybel.dsl.Rna(
            namespace='hgnc',
            identifier=row['target_hgnc_id'],
            name=row['target_hgnc_symbol'],
        )
        target_gene = target_rna.get_gene()

        # ``'pmids' in row`` only checks that the column label exists, so a missing (NaN)
        # cell would reach ``.split`` and raise an AttributeError. Look at the value itself
        # and fall back to the database citation when no PubMed identifier is available.
        pmids = row.get('pmids')
        citations = []
        if isinstance(pmids, str):
            citations = [pmid.strip() for pmid in pmids.split(',') if pmid.strip()]
        if not citations:
            citations = [('database', 'tfregulons')]

        # The direction depends on the row only, so choose the adders once per row
        if effect == 1:
            binds_dna_adder, affects_expression_adder = graph.add_directly_increases, graph.add_increases
        else:
            binds_dna_adder, affects_expression_adder = graph.add_directly_decreases, graph.add_decreases

        tf_gene_complex = pybel.dsl.ComplexAbundance([tf_protein, target_gene])

        for citation in citations:
            binds_dna_adder(
                tf_gene_complex,
                target_rna,
                citation=citation,
                evidence=evidence,
            )
            affects_expression_adder(
                tf_protein,
                target_rna,
                citation=citation,
                evidence=evidence,
            )

        # Does not depend on the citation, so it is added once per row
        graph.add_transcription(target_gene, target_rna)