diff --git a/docker/postgresql/Dockerfile b/docker/postgresql/Dockerfile
index dcfa052..137526d 100644
--- a/docker/postgresql/Dockerfile
+++ b/docker/postgresql/Dockerfile
@@ -1,3 +1,6 @@
 FROM alpine:latest
 
 RUN apk add postgresql
+
+RUN mkdir /data && \
+    chown postgres:postgres /data
diff --git a/docker/postgresql/docker-init.sh b/docker/postgresql/docker-init.sh
new file mode 100644
index 0000000..a69650a
--- /dev/null
+++ b/docker/postgresql/docker-init.sh
@@ -0,0 +1,5 @@
+#!/bin/sh
+
+if [ ! -f /data/PG_VERSION ]; then
+    initdb -D /data --pwfile=/superuser_password
+fi
diff --git a/experiments/decagon_run/decagon_run.py b/experiments/decagon_run/decagon_run.py
new file mode 100644
index 0000000..239250e
--- /dev/null
+++ b/experiments/decagon_run/decagon_run.py
@@ -0,0 +1,76 @@
+#!/usr/bin/env python3
+"""Load the Decagon CSV tables and register the PPI graph with icosagon."""
+
+import os
+from bisect import bisect_left
+
+import pandas as pd
+import torch
+
+from icosagon.data import Data
+
+
+def index(a, x):
+    """Return the position of x in the sorted sequence a.
+
+    Raises ValueError if x does not occur in a.
+    """
+    i = bisect_left(a, x)
+    if i != len(a) and a[i] == x:
+        return i
+    raise ValueError(f'{x!r} not found in sorted sequence')
+
+
+def main(path='/pstore/data/data_science/ref/decagon'):
+    """Read the Decagon CSV files under path and build the PPI relation."""
+    names = {
+        'df_combo': 'bio-decagon-combo.csv',
+        'df_effcat': 'bio-decagon-effectcategories.csv',
+        'df_mono': 'bio-decagon-mono.csv',
+        'df_ppi': 'bio-decagon-ppi.csv',
+        'df_tgtall': 'bio-decagon-targets-all.csv',
+        'df_tgt': 'bio-decagon-targets.csv',
+    }
+    # Keep the frames in a dict instead of looking them up via locals().
+    frames = {nam: pd.read_csv(os.path.join(path, fname))
+              for nam, fname in names.items()}
+    for nam, df in frames.items():
+        print(f'len({nam}): {len(df)}')
+        print(f'{nam}.columns: {df.columns}')
+    df_combo = frames['df_combo']
+    df_mono = frames['df_mono']
+    df_ppi = frames['df_ppi']
+    df_tgtall = frames['df_tgtall']
+    df_tgt = frames['df_tgt']
+
+    # Sorted so that index() can locate each identifier by bisection.
+    genes = sorted(set().union(df_ppi['Gene 1'], df_ppi['Gene 2'],
+                               df_tgtall['Gene'], df_tgt['Gene']))
+    print('len(genes):', len(genes))
+
+    drugs = sorted(set().union(df_combo['STITCH 1'], df_combo['STITCH 2'],
+                               df_mono['STITCH'], df_tgtall['STITCH'],
+                               df_tgt['STITCH']))
+    print('len(drugs):', len(drugs))
+
+    data = Data()
+    data.add_node_type('Gene', len(genes))
+    data.add_node_type('Drug', len(drugs))
+
+    print('Indexing rows...')
+    rows = [index(genes, g) for g in df_ppi['Gene 1']]
+    print('Indexing cols...')
+    cols = [index(genes, g) for g in df_ppi['Gene 2']]
+    indices = torch.tensor(list(zip(rows, cols))).transpose(0, 1)
+    values = torch.ones(len(rows))
+    print('indices.shape:', indices.shape, 'values.shape:', values.shape)
+    adj_mat = torch.sparse_coo_tensor(indices, values, size=(len(genes),) * 2)
+    # Average with the transpose to make the adjacency matrix symmetric.
+    adj_mat = (adj_mat + adj_mat.transpose(0, 1)) / 2
+    print('adj_mat created')
+    fam = data.add_relation_family('PPI', 0, 0, True)
+    fam.add_relation_type('PPI', adj_mat)
+
+
+if __name__ == '__main__':
+    main()