IF YOU WOULD LIKE TO GET AN ACCOUNT, please write an email to s dot adaszewski at gmail dot com. User accounts are meant only to report issues and/or generate pull requests. This is a purpose-specific Git hosting for ADARED projects. Thank you for your understanding!
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

121 lines
4.4KB

  1. #!/usr/bin/env python3
  2. from icosagon.data import Data
  3. from icosagon.trainprep import TrainValTest, \
  4. prepare_training
  5. from icosagon.model import Model
  6. from icosagon.trainloop import TrainLoop
  7. import os
  8. import pandas as pd
  9. from bisect import bisect_left
  10. import torch
  11. import sys
  12. def index(a, x):
  13. i = bisect_left(a, x)
  14. if i != len(a) and a[i] == x:
  15. return i
  16. raise ValueError
  17. def load_data():
  18. path = '/pstore/data/data_science/ref/decagon'
  19. df_combo = pd.read_csv(os.path.join(path, 'bio-decagon-combo.csv'))
  20. df_effcat = pd.read_csv(os.path.join(path, 'bio-decagon-effectcategories.csv'))
  21. df_mono = pd.read_csv(os.path.join(path, 'bio-decagon-mono.csv'))
  22. df_ppi = pd.read_csv(os.path.join(path, 'bio-decagon-ppi.csv'))
  23. df_tgtall = pd.read_csv(os.path.join(path, 'bio-decagon-targets-all.csv'))
  24. df_tgt = pd.read_csv(os.path.join(path, 'bio-decagon-targets.csv'))
  25. lst = [ 'df_combo', 'df_effcat', 'df_mono', 'df_ppi', 'df_tgtall', 'df_tgt' ]
  26. for nam in lst:
  27. print(f'len({nam}): {len(locals()[nam])}')
  28. print(f'{nam}.columns: {locals()[nam].columns}')
  29. genes = set()
  30. genes = genes.union(df_ppi['Gene 1']).union(df_ppi['Gene 2']) \
  31. .union(df_tgtall['Gene']).union(df_tgt['Gene'])
  32. genes = sorted(genes)
  33. print('len(genes):', len(genes))
  34. drugs = set()
  35. drugs = drugs.union(df_combo['STITCH 1']).union(df_combo['STITCH 2']) \
  36. .union(df_mono['STITCH']).union(df_tgtall['STITCH']).union(df_tgt['STITCH'])
  37. drugs = sorted(drugs)
  38. print('len(drugs):', len(drugs))
  39. data = Data()
  40. data.add_node_type('Gene', len(genes))
  41. data.add_node_type('Drug', len(drugs))
  42. print('Preparing PPI...')
  43. print('Indexing rows...')
  44. rows = [index(genes, g) for g in df_ppi['Gene 1']]
  45. print('Indexing cols...')
  46. cols = [index(genes, g) for g in df_ppi['Gene 2']]
  47. indices = list(zip(rows, cols))
  48. indices = torch.tensor(indices).transpose(0, 1)
  49. values = torch.ones(len(rows))
  50. print('indices.shape:', indices.shape, 'values.shape:', values.shape)
  51. adj_mat = torch.sparse_coo_tensor(indices, values, size=(len(genes),) * 2)
  52. adj_mat = (adj_mat + adj_mat.transpose(0, 1)) / 2
  53. print('adj_mat created')
  54. fam = data.add_relation_family('PPI', 0, 0, True)
  55. rel = fam.add_relation_type('PPI', adj_mat)
  56. print('OK')
  57. print('Preparing Drug-Gene (Target) edges...')
  58. rows = [index(drugs, d) for d in df_tgtall['STITCH']]
  59. cols = [index(genes, g) for g in df_tgtall['Gene']]
  60. indices = list(zip(rows, cols))
  61. indices = torch.tensor(indices).transpose(0, 1)
  62. values = torch.ones(len(rows))
  63. adj_mat = torch.sparse_coo_tensor(indices, values, size=(len(drugs), len(genes)))
  64. fam = data.add_relation_family('Drug-Gene (Target)', 1, 0, True)
  65. rel = fam.add_relation_type('Drug-Gene (Target)', adj_mat)
  66. print('OK')
  67. print('Preparing Drug-Drug (Side Effect) edges...')
  68. fam = data.add_relation_family('Drug-Drug (Side Effect)', 1, 1, True)
  69. print('# of side effects:', len(df_combo), 'unique:', len(df_combo['Polypharmacy Side Effect'].unique()))
  70. for eff, df in df_combo.groupby('Polypharmacy Side Effect'):
  71. sys.stdout.write('.') # print(eff, '...')
  72. sys.stdout.flush()
  73. rows = [index(drugs, d) for d in df['STITCH 1']]
  74. cols = [index(drugs, d) for d in df['STITCH 2']]
  75. indices = list(zip(rows, cols))
  76. indices = torch.tensor(indices).transpose(0, 1)
  77. values = torch.ones(len(rows))
  78. adj_mat = torch.sparse_coo_tensor(indices, values, size=(len(drugs), len(drugs)))
  79. adj_mat = (adj_mat + adj_mat.transpose(0, 1)) / 2
  80. rel = fam.add_relation_type(df['Polypharmacy Side Effect'], adj_mat)
  81. print()
  82. print('OK')
  83. return data
  84. def _wrap(obj, method_name):
  85. orig_fn = getattr(obj, method_name)
  86. def fn(*args, **kwargs):
  87. print(f'{method_name}() :: ENTER')
  88. res = orig_fn(*args, **kwargs)
  89. print(f'{method_name}() :: EXIT')
  90. return res
  91. setattr(obj, method_name, fn)
  92. def main():
  93. data = load_data()
  94. prep_d = prepare_training(data, TrainValTest(.8, .1, .1))
  95. _wrap(Model, 'build')
  96. model = Model(prep_d)
  97. _wrap(TrainLoop, 'build')
  98. _wrap(TrainLoop, 'run_epoch')
  99. loop = TrainLoop(model, batch_size=1000000)
  100. loop.run_epoch()
# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()