Start working on experiments/decagon_run.

6 年之前 · 373a12f51a
--- a/docker/postgresql/Dockerfile
+++ b/docker/postgresql/Dockerfile
@@ -1,3 +1,6 @@
 # PostgreSQL server image built on Alpine Linux.
 # NOTE(review): `latest` is a moving tag, so rebuilds are not reproducible.
 # Pin a specific Alpine release once the target PostgreSQL major version is
 # decided -- TODO confirm which release this project is meant to track.
 FROM alpine:latest

 # --no-cache makes apk fetch a fresh package index for this command and keep
 # no index cache in the layer, so no separate `apk update` / cleanup is needed.
 RUN apk add --no-cache postgresql

 # Create the /data directory and hand it to the postgres account (presumably
 # created by the postgresql package -- verify); docker-init.sh runs
 # `initdb -D /data` against this directory.
 RUN mkdir /data && \
  chown postgres:postgres /data
--- a/docker/postgresql/docker-init.sh
+++ b/docker/postgresql/docker-init.sh
@@ -0,0 +1,5 @@
 #!/bin/sh

 # First-run initialisation: create the database cluster in /data unless the
 # PG_VERSION marker file is already present there. The superuser password is
 # taken from the file /superuser_password.
 [ -f /data/PG_VERSION ] || initdb -D /data --pwfile=/superuser_password
--- a/experiments/decagon_run/decagon_run.py
+++ b/experiments/decagon_run/decagon_run.py
@@ -0,0 +1,62 @@
 #!/usr/bin/env python3

 from icosagon.data import Data
 import os
 import pandas as pd
 from bisect import bisect_left
 import torch


 def index(a, x):
    """Return the position of ``x`` in the sorted sequence ``a``.

    The lookup is a binary search via ``bisect_left``, so ``a`` must already
    be sorted in ascending order; when ``x`` occurs several times the
    leftmost position is returned.

    Raises:
        ValueError: if ``x`` does not occur in ``a`` (including empty ``a``).
    """
    i = bisect_left(a, x)
    # bisect_left only yields an insertion point; confirm an exact match.
    if i == len(a) or a[i] != x:
        raise ValueError(f'{x!r} not found in sorted sequence')
    return i


 def main():
    """Load the Decagon CSV tables and start filling an icosagon ``Data`` object.

    Reads the six ``bio-decagon-*.csv`` files, prints the length and columns
    of each table, builds sorted gene and drug vocabularies, registers the
    'Gene' and 'Drug' node types and adds a 'PPI' relation built from the
    protein-protein interaction table.
    """
    base_dir = '/pstore/data/data_science/ref/decagon'

    def load(file_name):
        # Every input table sits directly inside base_dir.
        return pd.read_csv(os.path.join(base_dir, file_name))

    df_combo = load('bio-decagon-combo.csv')
    df_effcat = load('bio-decagon-effectcategories.csv')
    df_mono = load('bio-decagon-mono.csv')
    df_ppi = load('bio-decagon-ppi.csv')
    df_tgtall = load('bio-decagon-targets-all.csv')
    df_tgt = load('bio-decagon-targets.csv')

    # Label/frame pairs, in the order the summary lines are printed.
    tables = (
        ('df_combo', df_combo),
        ('df_effcat', df_effcat),
        ('df_mono', df_mono),
        ('df_ppi', df_ppi),
        ('df_tgtall', df_tgtall),
        ('df_tgt', df_tgt),
    )
    for label, frame in tables:
        print(f'len({label}): {len(frame)}')
        print(f'{label}.columns: {frame.columns}')

    # Gene vocabulary: every gene id seen in the PPI and target tables.
    genes = sorted(set().union(
        df_ppi['Gene 1'], df_ppi['Gene 2'],
        df_tgtall['Gene'], df_tgt['Gene']))
    print('len(genes):', len(genes))

    # Drug vocabulary: every STITCH id seen in the combo, mono and target tables.
    drugs = sorted(set().union(
        df_combo['STITCH 1'], df_combo['STITCH 2'],
        df_mono['STITCH'], df_tgtall['STITCH'], df_tgt['STITCH']))
    print('len(drugs):', len(drugs))

    data = Data()
    data.add_node_type('Gene', len(genes))
    data.add_node_type('Drug', len(drugs))

    # Translate each PPI endpoint into its position in the sorted gene list.
    print('Indexing rows...')
    src = [index(genes, gene) for gene in df_ppi['Gene 1']]
    print('Indexing cols...')
    dst = [index(genes, gene) for gene in df_ppi['Gene 2']]

    # Pair up the endpoints, then transpose to the (2, n_edges) layout that
    # is passed to sparse_coo_tensor as the index tensor.
    indices = torch.tensor(list(zip(src, dst))).transpose(0, 1)
    values = torch.ones(len(src))
    print('indices.shape:', indices.shape, 'values.shape:', values.shape)

    n_genes = len(genes)
    ppi = torch.sparse_coo_tensor(indices, values, size=(n_genes, n_genes))
    # Average with the transpose so the resulting matrix is symmetric.
    ppi = (ppi + ppi.transpose(0, 1)) / 2
    print('adj_mat created')

    # NOTE(review): the arguments 0, 0, True are presumably the two node type
    # indices (Gene/Gene) and a symmetry flag -- confirm against icosagon.data.
    fam = data.add_relation_family('PPI', 0, 0, True)
    fam.add_relation_type('PPI', ppi)


 # Script entry point: run main() only when this file is executed directly,
 # not when it is imported as a module.
 if __name__ == '__main__':
    main()