IF YOU WOULD LIKE TO GET AN ACCOUNT, please send an email to s dot adaszewski at gmail dot com. User accounts are meant only for reporting issues and/or creating pull requests. This is a purpose-specific Git hosting service for ADARED projects. Thank you for your understanding!
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

63 lines
2.2KB

  1. #!/usr/bin/env python3
  2. from icosagon.data import Data
  3. import os
  4. import pandas as pd
  5. from bisect import bisect_left
  6. import torch
  7. def index(a, x):
  8. i = bisect_left(a, x)
  9. if i != len(a) and a[i] == x:
  10. return i
  11. raise ValueError
  12. def main():
  13. path = '/pstore/data/data_science/ref/decagon'
  14. df_combo = pd.read_csv(os.path.join(path, 'bio-decagon-combo.csv'))
  15. df_effcat = pd.read_csv(os.path.join(path, 'bio-decagon-effectcategories.csv'))
  16. df_mono = pd.read_csv(os.path.join(path, 'bio-decagon-mono.csv'))
  17. df_ppi = pd.read_csv(os.path.join(path, 'bio-decagon-ppi.csv'))
  18. df_tgtall = pd.read_csv(os.path.join(path, 'bio-decagon-targets-all.csv'))
  19. df_tgt = pd.read_csv(os.path.join(path, 'bio-decagon-targets.csv'))
  20. lst = [ 'df_combo', 'df_effcat', 'df_mono', 'df_ppi', 'df_tgtall', 'df_tgt' ]
  21. for nam in lst:
  22. print(f'len({nam}): {len(locals()[nam])}')
  23. print(f'{nam}.columns: {locals()[nam].columns}')
  24. genes = set()
  25. genes = genes.union(df_ppi['Gene 1']).union(df_ppi['Gene 2']) \
  26. .union(df_tgtall['Gene']).union(df_tgt['Gene'])
  27. genes = sorted(genes)
  28. print('len(genes):', len(genes))
  29. drugs = set()
  30. drugs = drugs.union(df_combo['STITCH 1']).union(df_combo['STITCH 2']) \
  31. .union(df_mono['STITCH']).union(df_tgtall['STITCH']).union(df_tgt['STITCH'])
  32. drugs = sorted(drugs)
  33. print('len(drugs):', len(drugs))
  34. data = Data()
  35. data.add_node_type('Gene', len(genes))
  36. data.add_node_type('Drug', len(drugs))
  37. print('Indexing rows...')
  38. rows = [index(genes, g) for g in df_ppi['Gene 1']]
  39. print('Indexing cols...')
  40. cols = [index(genes, g) for g in df_ppi['Gene 2']]
  41. indices = list(zip(rows, cols))
  42. indices = torch.tensor(indices).transpose(0, 1)
  43. values = torch.ones(len(rows))
  44. print('indices.shape:', indices.shape, 'values.shape:', values.shape)
  45. adj_mat = torch.sparse_coo_tensor(indices, values, size=(len(genes),) * 2)
  46. adj_mat = (adj_mat + adj_mat.transpose(0, 1)) / 2
  47. print('adj_mat created')
  48. fam = data.add_relation_family('PPI', 0, 0, True)
  49. rel = fam.add_relation_type('PPI', adj_mat)
  50. if __name__ == '__main__':
  51. main()