# Source code for bio2bel.sources.biogrid

# -*- coding: utf-8 -*-

"""Download and convert `BioGRID <https://thebiogrid.org>`_ to BEL.

Run this script with ``python -m bio2bel.sources.biogrid``

The interaction information contained in BioGRID can be categorized into protein
interactions, genetic interactions, chemical associations, and post-translational modifications. BioGRID includes
information from major model organisms and humans.

The file downloaded from BioGRID is a zip archive containing a single file formatted in `PSI MITAB level 2.5
<https://wiki.thebiogrid.org/doku.php/psi_mitab_file>`_ compatible Tab Delimited Text file format, containing all
interaction and associated annotation data.

The interaction types in BioGRID were in the `PSI-MI <https://psicquic.github.io/MITAB25Format.html>`_
(Proteomics Standards Initiative - Molecular Interactions Controlled Vocabulary) format and were mapped to BEL
relations. The following table shows examples of how interaction types in BioGRID were mapped to BEL or other ontologies.

+------------------------------------------------------------------------+----------------------------------------+----------------------------+----------------------------+
| PSI-MI (BioGRID)                                                       | Mapped BEL term                        | Source                     | Target                     |
+========================================================================+========================================+============================+============================+
| psi-mi:"MI:0794"(synthetic genetic interaction defined by inequality)  | :code:`pybel.BELGraph.add_association` | :class:`pybel.dsl.Gene`    | :class:`pybel.dsl.Gene`    |
+------------------------------------------------------------------------+----------------------------------------+----------------------------+----------------------------+
| psi-mi:"MI:0915"(physical association)                                 | :code:`pybel.BELGraph.add_association` | :class:`pybel.dsl.Protein` | :class:`pybel.dsl.Protein` |
+------------------------------------------------------------------------+----------------------------------------+----------------------------+----------------------------+
| psi-mi:"MI:0407"(direct interaction)                                   | :code:`pybel.BELGraph.add_binds`       | :class:`pybel.dsl.Protein` | :class:`pybel.dsl.Protein` |
+------------------------------------------------------------------------+----------------------------------------+----------------------------+----------------------------+

Summary statistics of the BEL graph generated in the BioGRID module:

+------------+----------+
| Key        | Value    |
+============+==========+
| Version    | v3.5.183 |
+------------+----------+
| Nodes      | 293030   |
+------------+----------+
| Edges      | 3127695  |
+------------+----------+
| Citations  | 9        |
+------------+----------+
| Components | 1225     |
+------------+----------+
| Density:   | 3.64E-05 |
+------------+----------+
"""

import logging
import os
from functools import lru_cache
from typing import Iterable, List, Optional, Tuple

import click
import pandas as pd
import pyobo.sources.biogrid
from more_click import verbose_option
from pyobo.identifier_utils import normalize_curie
from tqdm import tqdm

import pybel.dsl
from pybel import BELGraph
from ..utils import ensure_path, get_data_dir

#: Public names of this module
__all__ = [
    'get_bel',
]

logger = logging.getLogger(__name__)

#: Evidence text attached to every edge added to the BEL graph
EVIDENCE = 'From BioGRID'
#: Key for this source, passed to ``ensure_path`` and ``get_data_dir``
MODULE_NAME = 'biogrid'

#: BioGRID release to download; also used as the version of the BEL graph
VERSION = '3.5.186'
#: Root of the BioGRID release archive
BASE_URL = 'https://downloads.thebiogrid.org/Download/BioGRID/Release-Archive'
#: URL of the zipped MITAB file containing all interactions for ``VERSION``
URL = f'{BASE_URL}/BIOGRID-{VERSION}/BIOGRID-ALL-{VERSION}.mitab.zip'

"""All of these can be extracted from the original file with the following

cat BIOGRID-ALL-3.5.183.mitab.txt | cut -f 12 | sort | uniq -c

Note that column 12 is the interaction column. You could also do

gzcat BIOGRID-ALL-3.5.183.mitab.zip | cut -f 12 | sort | uniq -c

because the zip file only contains that one file
"""

#: Genetic interaction types in BioGRID that are added to the graph as
#: associations between genes (see ``_add_my_row``)
BIOGRID_GENE_ASSOCIATION = {
    'psi-mi:"MI:0794"(synthetic genetic interaction defined by inequality)',
    'psi-mi:"MI:0799"(additive genetic interaction defined by inequality)',
    'psi-mi:"MI:0796"(suppressive genetic interaction defined by inequality)',
}

#: Relationship types in BioGRID that map to BEL relation 'association'
#: between proteins
BIOGRID_ASSOCIATION_ACTIONS = {
    'psi-mi:"MI:0403"(colocalization)',
    'psi-mi:"MI:0914"(association)',
    # Look on OLS: https://www.ebi.ac.uk/ols/ontologies/mi/terms?iri=http%3A%2F%2Fpurl.obolibrary.org%2Fobo%2FMI_0915
    # They're in a complex together, but not necessarily touching. This is
    # really dumb to put in a binary association database. Check ComplexPortal
    # or other higher granularity sources for more information
    'psi-mi:"MI:0915"(physical association)',
}

#: Relationship types in BioGRID that are added to the graph with
#: ``BELGraph.add_binds`` between proteins
BIOGRID_BINDS_ACTIONS = {
    # https://www.ebi.ac.uk/ols/ontologies/mi/terms?iri=http%3A%2F%2Fpurl.obolibrary.org%2Fobo%2FMI_0407
    'psi-mi:"MI:0407"(direct interaction)',
}


@lru_cache()
def _get_ncbigene_mapping():
    """Return the BioGRID-to-ncbigene mapping provided by PyOBO.

    The result is memoized with :func:`functools.lru_cache`, so the PyOBO
    loader is only called the first time this function is used.
    """
    mapping = pyobo.sources.biogrid.get_ncbigene_mapping()
    return mapping


def _map_ncbigene(identifier):
    """Look up the ncbigene identifier for a BioGRID identifier.

    :param identifier: a BioGRID identifier (without prefix)
    :return: the mapped identifier, or ``None`` if the mapping has no entry
    """
    biogrid_to_ncbigene = _get_ncbigene_mapping()
    return biogrid_to_ncbigene.get(identifier)


#: Manually curated biogrid id to ncbigene id mapping. It is consulted by
#: ``_process_interactor`` only when the PyOBO mapping has no entry. A value
#: of ``None`` marks an identifier for which curation was tried but no
#: ncbigene identifier was found.
BIOGRID_NCBIGENE_REMAPPING = {
    '4349295': None,  # https://www.yeastgenome.org/locus/S000006792
    '4349491': None,  # http://www.candidagenome.org/cgi-bin/locus.pl?locus=CAF0007452
    '4349337': None,  # https://www.yeastgenome.org/locus/S000006962
    '4349775': None,  # http://www.candidagenome.org/cgi-bin/locus.pl?locus=CAL0000184983
    '4349716': None,  # http://www.candidagenome.org/cgi-bin/locus.pl?locus=CAL0000193047
    '4349853': None,  # http://www.candidagenome.org/cgi-bin/locus.pl?locus=CAL0006683
    '4383869': None,  # SARS-CoV2 protein ORF3B, not on uniprot or entrez
    '4383875': None,  # SARS-CoV2 protein ORF9C, not on uniprot or entrez
}

#: Manually curated uniprot id to ncbigene id mapping. This is the only
#: lookup ``_process_interactor`` performs for uniprot interactors. A value
#: of ``None`` marks an identifier for which curation was tried but no
#: ncbigene identifier was found.
UNIPROT_NCBIGENE_REMAPPING = {
    # FIXME
    'P0DTC1': None,  # SARS-CoV2 protein https://swissmodel.expasy.org/repository/uniprot/P0DTC1
    # TODO checkme
    'P0DTD2': '1489679',  # SARS-CoV2 protein https://swissmodel.expasy.org/repository/uniprot/P0DTD2
    'Q7TLC7': None,  # SARS-CoV protein
}


def _process_interactor(s: str) -> Optional[str]:
    """Convert an interactor CURIE to an ncbigene identifier.

    Resolution depends on the prefix returned by ``normalize_curie``:

    - ``ncbigene``: the identifier is returned unchanged
    - ``biogrid``: the PyOBO mapping is tried first, then the curated
      ``BIOGRID_NCBIGENE_REMAPPING`` table
    - ``uniprot``: only the curated ``UNIPROT_NCBIGENE_REMAPPING`` table is used

    :param s: the interactor CURIE as found in the BioGRID file
    :return: the ncbigene identifier, or ``None`` if the CURIE could not be
        parsed, has an unhandled prefix, or has no (successful) curation
    """
    prefix, identifier = normalize_curie(s)

    if prefix is None:
        logger.warning('could not parse %s', s)
        return None

    if prefix == 'ncbigene':
        return identifier

    # Pick the curated fallback table that applies to this prefix
    if prefix == 'biogrid':
        mapped = _map_ncbigene(identifier)
        if mapped is not None:
            return mapped
        curated = BIOGRID_NCBIGENE_REMAPPING
    elif prefix == 'uniprot':
        curated = UNIPROT_NCBIGENE_REMAPPING
    else:
        logger.warning('unhandled interactor: %s (%s:%s)', s, prefix, identifier)
        return None

    if identifier not in curated:
        logger.warning('need to curate: %s', s)
        return None

    remapped = curated[identifier]
    if not remapped:
        # The entry exists but curation did not find an ncbigene identifier
        logger.debug('tried but failed curation on %s', s)
    return remapped


def _process_xrefs(s: str) -> List[Tuple[str, str]]:
    """Split a pipe-delimited string of CURIEs into a list of (prefix, identifier) pairs.

    :param s: pipe-delimited CURIEs
    :return: the pairs yielded by :func:`_iter_process_xrefs`, as a list
    """
    return [*_iter_process_xrefs(s)]


def _iter_process_xrefs(s: str) -> Iterable[Tuple[str, str]]:
    """Split a pipe-delimited string of CURIEs and normalize each one.

    Examples of Compact Uniform Resource Identifiers (CURIEs):

    - ``hgnc:12345``
    - ``ncbigene:12345``
    - ``uniprot:P12345``
    - ``ec-code:1.2.3.15``

    A string such as ``hgnc:1234|ncbigene:1245|uniprot:P12345`` is turned into
    a sequence of (prefix, identifier) tuples. Entries for which
    ``normalize_curie`` returns no prefix are skipped.

    :param s: pipe-delimited CURIEs
    :return: an iterable of (prefix, identifier) tuples
    """
    for raw_curie in s.split('|'):
        prefix, identifier = normalize_curie(raw_curie.strip())
        if prefix is None:
            continue
        yield prefix, identifier


def _process_pmid(s: str) -> str:
    """Process provenance column."""
    if not s.startswith('pubmed:'):
        raise ValueError(f'Non pubmed: {s}')
    return s[len('pubmed:')]


#: Columns read from the BioGRID MITAB file. The order matters: ``get_bel``
#: selects these columns in this order and unpacks each row positionally.
COLUMNS = [
    '#ID Interactor A',
    'ID Interactor B',
    'Interaction Types',
    'Publication Identifiers',
    'Interaction Detection Method',
    'Source Database',
    'Confidence Values',
]


def get_processed_biogrid() -> pd.DataFrame:
    """Load the BioGRID file and normalize its provenance and interactor columns.

    Only the columns listed in ``COLUMNS`` are read, all as strings. The
    publication column is reduced to bare PubMed identifiers and both
    interactor columns are mapped to ncbigene identifiers (``None`` where
    no mapping is available).

    :return: dataframe of preprocessed BioGRID data
    """
    local_path = ensure_path(prefix=MODULE_NAME, url=URL)
    logger.info('reading BioGRID from %s', local_path)
    df = pd.read_csv(local_path, sep='\t', dtype=str, usecols=COLUMNS)

    logger.info('mapping provenance')
    publication_column = 'Publication Identifiers'
    df[publication_column] = df[publication_column].map(_process_pmid)

    logger.info('mapping interactors')
    for interactor_column in ('#ID Interactor A', 'ID Interactor B'):
        df[interactor_column] = df[interactor_column].map(_process_interactor)

    # The 'Alt IDs Interactor A'/'Alt IDs Interactor B' columns are currently
    # not loaded; _process_xrefs is available to map them if they are added.

    return df


def get_bel() -> BELGraph:
    """Get a BEL graph for BioGRID.

    Each row of the processed BioGRID dataframe becomes one edge. Rows whose
    source or target interactor has no ncbigene identifier are skipped.

    :return: a BEL graph named ``BioGRID`` whose version is ``VERSION``
    :raises ValueError: if a row has an interaction type that is not in any
        of the handled relation sets (raised by ``_add_my_row``)
    """
    df = get_processed_biogrid()
    graph = BELGraph(name='BioGRID', version=VERSION)
    # Selecting df[COLUMNS] fixes the column order that the loop below unpacks
    it = tqdm(df[COLUMNS].values, total=len(df.index), desc=f'Convering {MODULE_NAME} to BEL', unit_scale=True)
    for source_ncbigene_id, target_ncbigene_id, relation, pubmed_id, detection_method, source_db, confidence in it:
        # Interactors that could not be mapped are missing values in the dataframe
        if pd.isna(source_ncbigene_id) or pd.isna(target_ncbigene_id):
            continue
        _add_my_row(
            graph,
            relation=relation,
            source_ncbigene_id=source_ncbigene_id,
            target_ncbigene_id=target_ncbigene_id,
            pubmed_id=pubmed_id,
            int_detection_method=detection_method,
            source_database=source_db,
            confidence=confidence,
        )
    return graph


def _add_my_row(
    graph: BELGraph,
    relation: str,
    source_ncbigene_id: str,
    target_ncbigene_id: str,
    pubmed_id: str,
    int_detection_method: str,
    source_database: str,
    confidence: str,
) -> None:  # noqa:C901
    """Add one BioGRID row to the graph as an edge between two ncbigene nodes.

    The relation decides both the node type and the kind of edge: genetic
    interactions become gene-gene associations, association-like interactions
    become protein-protein associations, and direct interactions are added
    with ``add_binds`` between proteins.

    :param graph: graph to add edges to
    :param relation: row value of the interaction type column
    :param source_ncbigene_id: ncbigene identifier of the source interactor
    :param target_ncbigene_id: ncbigene identifier of the target interactor
    :param pubmed_id: row value of the publication column, used as citation
    :param int_detection_method: row value of the interaction detection method column
    :param source_database: row value of the source database column
    :param confidence: row value of the confidence values column
    :raises ValueError: if the relation is not in any of the handled sets
    """
    # Choose the node class and the graph method for this relation
    if relation in BIOGRID_GENE_ASSOCIATION:
        node_cls, add_edge = pybel.dsl.Gene, graph.add_association
    elif relation in BIOGRID_ASSOCIATION_ACTIONS:
        node_cls, add_edge = pybel.dsl.Protein, graph.add_association
    elif relation in BIOGRID_BINDS_ACTIONS:
        node_cls, add_edge = pybel.dsl.Protein, graph.add_binds
    else:
        raise ValueError(f'Unhandled BioGrid relation: {relation}')

    add_edge(
        node_cls(namespace='ncbigene', identifier=source_ncbigene_id),
        node_cls(namespace='ncbigene', identifier=target_ncbigene_id),
        citation=pubmed_id,
        evidence=EVIDENCE,
        annotations={
            'psi-mi': relation,
            'biogrid-detection': int_detection_method,
            'biogrid-source': source_database,
            'biogrid-confidence': confidence,
        },
    )


def _create_table_biogrid():
    """Build a table with one example statement per BioGRID interaction type.

    For every interaction type in the handled relation sets that occurs in
    the processed BioGRID data, the first matching row is used to produce an
    example string.

    :return: a dataframe with the columns ``Source Type``, ``Target Type``,
        ``Interaction Type`` and ``BEL Example``
    """
    df = get_processed_biogrid()

    d = []
    for interaction_set, bel_relation in zip(
        [BIOGRID_BINDS_ACTIONS, BIOGRID_ASSOCIATION_ACTIONS, BIOGRID_GENE_ASSOCIATION],
        ['hasComponent', 'association', 'association'],
    ):
        for interaction in interaction_set:
            # Filter on the interaction type column. Comparing the interactor
            # column to an interaction type never matches and leaves the table empty.
            tmp_df = df[df['Interaction Types'] == interaction]

            if tmp_df.empty:
                continue

            # Genetic interactions are between genes, everything else between proteins
            if interaction in BIOGRID_GENE_ASSOCIATION:
                source = target = 'Gene'
                source_type = target_type = 'g'
            else:
                source = target = 'Protein'
                source_type = target_type = 'p'

            source_identifier = tmp_df['#ID Interactor A'].iloc[0]
            target_identifier = tmp_df['ID Interactor B'].iloc[0]

            bel_example = f'{source_type}{source_identifier} {bel_relation} {target_type}{target_identifier}'

            d.append({
                'Source Type': source,
                'Target Type': target,
                'Interaction Type': interaction,
                'BEL Example': bel_example,
            })

    return pd.DataFrame(d)


@click.command()
@verbose_option
@click.option(
    '-o',
    '--output',
    show_default=True,
    default=os.path.join(get_data_dir(MODULE_NAME), 'biogrid.bel.nodelink.json.gz'),
)
def main(output: Optional[str]):
    """Convert and summarize BioGRID."""
    # Step 1: build the BEL graph from the BioGRID data
    click.echo('Converting')
    biogrid_graph = get_bel()

    # Step 2: print a summary of the graph
    click.echo('Summarizing')
    summary = biogrid_graph.summary_str()
    click.echo(summary)

    # Step 3: write the graph to the requested output path
    click.echo('Outputting')
    pybel.dump(biogrid_graph, output)


# Run the command-line interface when this module is executed as a script,
# e.g. with ``python -m bio2bel.sources.biogrid``
if __name__ == '__main__':
    main()