From 13940671ccb2b0381b1ff03fb16f928aa226eb95 Mon Sep 17 00:00:00 2001 From: Stanislaw Adaszewski Date: Fri, 17 Jul 2020 11:55:38 +0200 Subject: [PATCH] Performance tests. --- docs/nodes-involved.svg | 438 ++++++++++++++++++ .../decagon_run_effcat/decagon_run_effcat.py | 131 ++++++ src/icosagon/compile.py | 28 ++ tests/icosagon/test_trainloop.py | 33 ++ 4 files changed, 630 insertions(+) create mode 100644 docs/nodes-involved.svg create mode 100644 experiments/decagon_run_effcat/decagon_run_effcat.py create mode 100644 src/icosagon/compile.py diff --git a/docs/nodes-involved.svg b/docs/nodes-involved.svg new file mode 100644 index 0000000..045d85e --- /dev/null +++ b/docs/nodes-involved.svg @@ -0,0 +1,438 @@ + + + + + + + + + + image/svg+xml + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/experiments/decagon_run_effcat/decagon_run_effcat.py b/experiments/decagon_run_effcat/decagon_run_effcat.py new file mode 100644 index 0000000..afe65ec --- /dev/null +++ b/experiments/decagon_run_effcat/decagon_run_effcat.py @@ -0,0 +1,131 @@ +#!/usr/bin/env python3 + +from icosagon.data import Data +from icosagon.trainprep import TrainValTest, \ + prepare_training +from icosagon.model import Model +from icosagon.trainloop import TrainLoop +import os +import pandas as pd +from bisect import bisect_left +import torch +import sys + + +def index(a, x): + i = bisect_left(a, x) + if i != len(a) and a[i] == x: + return i + raise ValueError + + +def load_data(dev): + path = '/pstore/data/data_science/ref/decagon' + df_combo = pd.read_csv(os.path.join(path, 'bio-decagon-combo.csv')) + df_effcat = pd.read_csv(os.path.join(path, 'bio-decagon-effectcategories.csv')) + df_mono = pd.read_csv(os.path.join(path, 'bio-decagon-mono.csv')) + df_ppi = pd.read_csv(os.path.join(path, 'bio-decagon-ppi.csv')) + df_tgtall = pd.read_csv(os.path.join(path, 'bio-decagon-targets-all.csv')) + df_tgt = pd.read_csv(os.path.join(path, 'bio-decagon-targets.csv')) + lst = [ 'df_combo', 'df_effcat', 'df_mono', 'df_ppi', 'df_tgtall', 'df_tgt' ] + for nam in lst: + print(f'len({nam}): {len(locals()[nam])}') + print(f'{nam}.columns: {locals()[nam].columns}') + + genes = set() + genes = genes.union(df_ppi['Gene 1']).union(df_ppi['Gene 2']) \ + .union(df_tgtall['Gene']).union(df_tgt['Gene']) + genes = sorted(genes) + print('len(genes):', len(genes)) + + drugs = set() + drugs = drugs.union(df_combo['STITCH 1']).union(df_combo['STITCH 2']) \ + .union(df_mono['STITCH']).union(df_tgtall['STITCH']).union(df_tgt['STITCH']) + drugs = sorted(drugs) + print('len(drugs):', len(drugs)) + + data = Data() + data.add_node_type('Gene', len(genes)) + data.add_node_type('Drug', len(drugs)) + + print('Preparing PPI...') + print('Indexing rows...') + rows = [index(genes, g) for g in df_ppi['Gene 1']] + print('Indexing cols...') + cols = [index(genes, g) for g in df_ppi['Gene 2']] + indices = list(zip(rows, cols)) + indices = torch.tensor(indices).transpose(0, 1) + values = torch.ones(len(rows)) + print('indices.shape:', indices.shape, 'values.shape:', values.shape) + adj_mat = torch.sparse_coo_tensor(indices, values, size=(len(genes),) * 2, + device=dev) + adj_mat = (adj_mat + adj_mat.transpose(0, 1)) / 2 + print('adj_mat created') + fam = data.add_relation_family('PPI', 0, 0, True) + rel = fam.add_relation_type('PPI', adj_mat) + print('OK') + + print('Preparing Drug-Gene (Target) edges...') + rows = [index(drugs, d) for d in df_tgtall['STITCH']] + cols = [index(genes, g) for g in df_tgtall['Gene']] + indices = list(zip(rows, cols)) + indices = torch.tensor(indices).transpose(0, 1) + values = torch.ones(len(rows)) + adj_mat = torch.sparse_coo_tensor(indices, values, size=(len(drugs), len(genes)), + device=dev) + fam = data.add_relation_family('Drug-Gene (Target)', 1, 0, True) + rel = fam.add_relation_type('Drug-Gene (Target)', adj_mat) + print('OK') + + df_combo_effcat = df_combo.merge(df_effcat, left_on='Polypharmacy Side Effect', right_on='Side Effect') + disease_classes = [] + + print('Preparing Drug-Drug (Side Effect) edges...') + fam = data.add_relation_family('Drug-Drug (Side Effect)', 1, 1, True) + print('# of side effects:', len(df_combo), 'unique:', len(df_combo['Polypharmacy Side Effect'].unique())) + for discls, df in df_combo_effcat.groupby('Disease Class'): + disease_classes.append(discls) + sys.stdout.write('.') # print(eff, '...') + sys.stdout.flush() + rows = [index(drugs, d) for d in df['STITCH 1']] + cols = [index(drugs, d) for d in df['STITCH 2']] + indices = list(zip(rows, cols)) + indices = torch.tensor(indices).transpose(0, 1) + values = torch.ones(len(rows)) + adj_mat = torch.sparse_coo_tensor(indices, values, size=(len(drugs), len(drugs)), + device=dev) + adj_mat = (adj_mat + adj_mat.transpose(0, 1)) / 2 + rel = fam.add_relation_type(df['Polypharmacy Side Effect'], adj_mat) + print() + print('len(disease_classes):', len(disease_classes)) + print('OK') + + return data + + +def _wrap(obj, method_name): + orig_fn = getattr(obj, method_name) + def fn(*args, **kwargs): + print(f'{method_name}() :: ENTER') + res = orig_fn(*args, **kwargs) + print(f'{method_name}() :: EXIT') + return res + setattr(obj, method_name, fn) + + +def main(): + dev = torch.device('cuda:0') + data = load_data(dev) + prep_d = prepare_training(data, TrainValTest(.8, .1, .1)) + _wrap(Model, 'build') + model = Model(prep_d) + model = model.to(dev) + # model = torch.nn.DataParallel(model, ['cuda:0', 'cuda:1']) + _wrap(TrainLoop, 'build') + _wrap(TrainLoop, 'run_epoch') + loop = TrainLoop(model, batch_size=512, shuffle=True) + loop.run_epoch() + + +if __name__ == '__main__': + main() diff --git a/src/icosagon/compile.py b/src/icosagon/compile.py new file mode 100644 index 0000000..a16c29d --- /dev/null +++ b/src/icosagon/compile.py @@ -0,0 +1,28 @@ +# +# The goal of this module is to make Icosagon more efficient. +# It takes the nice Icosagon model architecture and tries to +# formulate it in terms of batch matrix multiplications instead +# of using Python for loops. +# + +from .weights import init_glorot +from .input +import torch + + +class EncodeLayer(object): + def __init__(self, num_relation_types, input_dim, output_dim): + weights = [ init_glorot(input_dim, output_dim) \ + for _ in range(num_relation_types) ] + weights = torch.cat(weights) + + +class Compiler(object): + def __init__(self, data: Data, layer_dimensions: List[int] = [32, 64]) -> None: + self.data = data + self.layer_dimensions = layer_dimensions + self.build() + + def build(self) -> None: + for fam in data.relation_families: + init_glorot(in_channels, out_channels) diff --git a/tests/icosagon/test_trainloop.py b/tests/icosagon/test_trainloop.py index be6273c..accbea9 100644 --- a/tests/icosagon/test_trainloop.py +++ b/tests/icosagon/test_trainloop.py @@ -79,3 +79,36 @@ def test_timing_01(): for _ in range(1300): _ = torch.sparse.mm(adj_mat, rep) print('Elapsed:', time.time() - t) + + +def test_timing_02(): + adj_mat = (torch.rand(2000, 2000) < .001).to(torch.float32) + adj_mat_batch = [adj_mat.view(1, 2000, 2000)] * 1300 + adj_mat_batch = torch.cat(adj_mat_batch) + rep = torch.eye(2000).requires_grad_(True) + t = time.time() + res = torch.matmul(adj_mat_batch, rep) + print('Elapsed:', time.time() - t) + print('res.shape:', res.shape) + + +def test_timing_03(): + adj_mat = (torch.rand(2000, 2000) < .001).to(torch.float32) + adj_mat_batch = [adj_mat.view(1, 2000, 2000).to_sparse()] * 1300 + adj_mat_batch = torch.cat(adj_mat_batch) + rep = torch.eye(2000).requires_grad_(True) + rep_batch = [rep.view(1, 2000, 2000)] * 1300 + rep_batch = torch.cat(rep_batch) + t = time.time() + with pytest.raises(RuntimeError): + _ = torch.bmm(adj_mat_batch, rep) + print('Elapsed:', time.time() - t) + + +def test_timing_04(): + adj_mat = (torch.rand(2000, 2000) < .0001).to(torch.float32).to_sparse() + rep = torch.eye(2000).requires_grad_(True) + t = time.time() + for _ in range(1300): + _ = torch.sparse.mm(adj_mat, rep) + print('Elapsed:', time.time() - t)