diff --git a/src/triacontagon/sampling.py b/src/triacontagon/sampling.py
index 7c55944..60b7647 100644
--- a/src/triacontagon/sampling.py
+++ b/src/triacontagon/sampling.py
@@ -8,7 +8,10 @@ import numpy as np
 import torch
 import torch.utils.data
 from typing import List, \
-    Union
+    Union, \
+    Tuple
+from .data import Data, \
+    EdgeType
 
 
 def fixed_unigram_candidate_sampler(
@@ -24,7 +27,7 @@ def fixed_unigram_candidate_sampler(
 
     if len(true_classes.shape) != 2:
         raise ValueError('true_classes must be a 2D matrix with shape (num_samples, num_true)')
-    
+
     num_samples = true_classes.shape[0]
     unigrams = np.array(unigrams)
     if distortion != 1.:
@@ -40,8 +43,101 @@ def fixed_unigram_candidate_sampler(
         # print('candidates:', candidates)
         # print('true_classes:', true_classes[indices, :])
         result[indices] = candidates.T
+        # print('result:', result)
         mask = (candidates == true_classes[indices, :])
         mask = mask.sum(1).astype(np.bool)
         # print('mask:', mask)
         indices = indices[mask]
+        # result[indices] = 0
     return torch.tensor(result)
+
+
+def get_edges_and_degrees(adj_mat: torch.Tensor) -> \
+        Tuple[torch.Tensor, torch.Tensor]:
+    """Return the edges and the per-column degrees of an adjacency matrix.
+
+    Args:
+        adj_mat: 2D adjacency matrix, either dense or sparse COO.
+
+    Returns:
+        A pair ``(edges_pos, degrees)``. ``edges_pos`` holds one
+        ``(row, column)`` pair per non-zero (dense) or stored (sparse)
+        entry. ``degrees`` holds one entry per column: the number of
+        stored entries for a sparse input, the column sum for a dense one.
+    """
+    if adj_mat.is_sparse:
+        # indices() is only available on a coalesced tensor.
+        indices = adj_mat.coalesce().indices()
+        degrees = torch.zeros(adj_mat.shape[1], dtype=torch.int64,
+                              device=adj_mat.device)
+        degrees = degrees.index_add(0, indices[1],
+                                    torch.ones_like(indices[1]))
+        edges_pos = indices.transpose(0, 1)
+    else:
+        degrees = adj_mat.sum(0)
+        edges_pos = torch.nonzero(adj_mat, as_tuple=False)
+    return edges_pos, degrees
+
+
+def negative_sample_adj_mat(adj_mat: torch.Tensor) -> torch.Tensor:
+    """Sample a matrix of negative edges for an adjacency matrix.
+
+    For every edge ``(row, column)`` of ``adj_mat`` a replacement column is
+    drawn by fixed_unigram_candidate_sampler(), which receives the column
+    degrees as unigram counts and a distortion of 0.75. The pair
+    ``(row, replacement column)`` becomes a negative edge.
+
+    Args:
+        adj_mat: 2D adjacency matrix, either dense or sparse COO.
+
+    Returns:
+        Coalesced sparse COO matrix with the shape, dtype and device of
+        ``adj_mat`` in which every stored value is 1. It holds fewer edges
+        than ``adj_mat`` when the same negative edge is drawn repeatedly.
+
+    Raises:
+        ValueError: If ``adj_mat`` is not a torch.Tensor.
+    """
+    if not isinstance(adj_mat, torch.Tensor):
+        raise ValueError('adj_mat must be a torch.Tensor, got: %s' %
+                         adj_mat.__class__.__name__)
+
+    edges_pos, degrees = get_edges_and_degrees(adj_mat)
+
+    neg_neighbors = fixed_unigram_candidate_sampler(
+        edges_pos[:, 1].view(-1, 1), degrees, 0.75).to(adj_mat.device)
+    indices_neg = torch.stack([edges_pos[:, 0], neg_neighbors.view(-1)], 0)
+
+    def binary_matrix(indices: torch.Tensor) -> torch.Tensor:
+        # Create the values with the target dtype on the target device.
+        values = torch.ones(indices.shape[1], dtype=adj_mat.dtype,
+                            device=adj_mat.device)
+        return torch.sparse_coo_tensor(indices, values,
+                                       adj_mat.shape).coalesce()
+
+    # Coalescing adds up the values of duplicate edges, so the matrix is
+    # rebuilt from the unique indices to reset every value to 1.
+    return binary_matrix(binary_matrix(indices_neg).indices())
+
+
+def negative_sample_data(data: Data) -> Data:
+    """Create a negatively sampled counterpart of a Data instance.
+
+    Args:
+        data: Graph whose vertex types are copied and whose adjacency
+            matrices are each passed through negative_sample_adj_mat().
+
+    Returns:
+        New Data instance with the same vertex types and edge types as
+        ``data``, holding the negative adjacency matrices.
+    """
+    res = Data()
+    for vt in data.vertex_types:
+        res.add_vertex_type(vt.name, vt.count)
+    for et in data.edge_types.values():
+        adjacency_matrices_neg = [negative_sample_adj_mat(adj_mat)
+                                  for adj_mat in et.adjacency_matrices]
+        res.add_edge_type(et.name,
+                          et.vertex_type_row, et.vertex_type_column,
+                          adjacency_matrices_neg, et.decoder_factory)
+    return res
diff --git a/src/triacontagon/trainprep.py b/src/triacontagon/trainprep.py
new file mode 100644
index 0000000..dd7a12c
--- /dev/null
+++ b/src/triacontagon/trainprep.py
@@ -0,0 +1,119 @@
+import math
+from typing import List, \
+    Tuple
+
+import torch
+
+from .data import Data, \
+    TrainingBatch, \
+    EdgeType
+from .util import _sparse_coo_tensor
+
+
+def split_adj_mat(adj_mat: torch.Tensor, ratios: List[float]):
+    """Randomly distribute the edges of a sparse matrix over several parts.
+
+    Args:
+        adj_mat: Sparse COO adjacency matrix.
+        ratios: Fraction of the edges assigned to each part.
+
+    Returns:
+        List with one matrix per ratio, each built by _sparse_coo_tensor()
+        with the shape of ``adj_mat``. When the ratios sum to 1, every
+        edge ends up in exactly one part.
+    """
+    # indices() and values() are only available on a coalesced tensor.
+    adj_mat = adj_mat.coalesce()
+    indices = adj_mat.indices()
+    values = adj_mat.values()
+
+    # Shuffle the edges so that each part is a random subset of them.
+    order = torch.randperm(indices.shape[1], device=indices.device)
+    indices = indices[:, order]
+    values = values[order]
+
+    num_edges = len(values)
+    cum_ratio = 0.
+    ofs = 0
+    res = []
+    for r in ratios:
+        cum_ratio += r
+        # Slice bounds must be integers. Rounding the cumulative ratio,
+        # rather than each part, keeps the rounding errors from adding up.
+        end = int(round(cum_ratio * num_edges))
+        end = min(max(end, ofs), num_edges)
+        res.append(_sparse_coo_tensor(indices[:, ofs:end], values[ofs:end],
+                                      adj_mat.shape))
+        ofs = end
+
+    return res
+
+
+def split_edge_type(et: EdgeType, ratios: Tuple[float, float, float]):
+    """Split every adjacency matrix of an edge type with split_adj_mat().
+
+    Args:
+        et: Edge type whose ``adjacency_matrices`` are split.
+        ratios: Fraction of the edges assigned to each part.
+
+    Returns:
+        List with one entry per ratio. Entry ``i`` is the list of the
+        ``i``-th parts of all adjacency matrices of ``et``.
+    """
+    ratios = list(ratios)
+    # One output list per part, not per adjacency matrix.
+    res = [[] for _ in ratios]
+
+    for adj_mat in et.adjacency_matrices:
+        for part, new_adj_mat in zip(res, split_adj_mat(adj_mat, ratios)):
+            part.append(new_adj_mat)
+
+    return res
+
+
+def split_data(data: Data,
+               ratios: List[float]):
+    """Split the edges of a Data instance into several Data instances.
+
+    Args:
+        data: Graph to split.
+        ratios: Fraction of the edges assigned to each part; must be
+            non-negative and sum to 1.
+
+    Returns:
+        List with one Data instance per ratio. Each has the vertex types
+        and edge types of ``data`` and its share of every adjacency matrix.
+
+    Raises:
+        TypeError: If ``data`` is not a Data instance.
+        ValueError: If a ratio is negative or the ratios do not sum to 1.
+    """
+    if not isinstance(data, Data):
+        raise TypeError('data must be an instance of Data')
+
+    ratios = list(ratios)
+
+    if any(r < 0 for r in ratios):
+        raise ValueError('ratios must not be negative')
+
+    # An exact comparison rejects e.g. [0.7, 0.2, 0.1], whose floating
+    # point sum is slightly below 1.
+    if not math.isclose(sum(ratios), 1.):
+        raise ValueError('ratios must sum to 1')
+
+    res = []
+    for _ in ratios:
+        part = Data()
+        for vt in data.vertex_types:
+            part.add_vertex_type(vt.name, vt.count)
+        res.append(part)
+
+    # Iterating the dict itself yields keys only; the edge types are its
+    # values.
+    for et in data.edge_types.values():
+        for part, adjacency_matrices in zip(res, split_edge_type(et, ratios)):
+            part.add_edge_type(et.name,
+                               et.vertex_type_row, et.vertex_type_column,
+                               adjacency_matrices, et.decoder_factory)
+
+    return res
diff --git a/tests/triacontagon/test_sampling.py b/tests/triacontagon/test_sampling.py
new file mode 100644
index 0000000..6d7a155
--- /dev/null
+++ b/tests/triacontagon/test_sampling.py
@@ -0,0 +1,54 @@
+from triacontagon.data import Data
+from triacontagon.sampling import negative_sample_adj_mat, \
+    negative_sample_data
+from triacontagon.decode import dedicom_decoder
+import torch
+
+
+def test_negative_sample_adj_mat_01():
+    adj_mat = torch.tensor([
+        [0, 1, 0, 1, 0],
+        [0, 0, 0, 0, 1],
+        [1, 1, 0, 0, 0],
+        [0, 0, 1, 0, 1],
+        [0, 1, 0, 0, 0]
+    ])
+
+    adj_mat_neg = negative_sample_adj_mat(adj_mat)
+
+    assert adj_mat_neg.is_sparse
+    assert adj_mat_neg.shape == adj_mat.shape
+    assert adj_mat_neg.dtype == adj_mat.dtype
+
+    adj_mat_neg = adj_mat_neg.to_dense()
+    # Duplicate negative edges are collapsed into a single entry.
+    assert ((adj_mat_neg == 0) | (adj_mat_neg == 1)).all()
+    # A negative edge keeps the row of the edge it was sampled for, so a
+    # row gets at least one and at most as many edges as it had.
+    assert (adj_mat_neg.sum(1) >= 1).all()
+    assert (adj_mat_neg.sum(1) <= adj_mat.sum(1)).all()
+
+
+def test_negative_sample_data_01():
+    d = Data()
+    d.add_vertex_type('Gene', 5)
+
+    d.add_edge_type('Gene-Gene', 0, 0, [
+        torch.tensor([
+            [0, 1, 0, 1, 0],
+            [0, 0, 0, 0, 1],
+            [1, 1, 0, 0, 0],
+            [0, 0, 1, 0, 1],
+            [0, 1, 0, 0, 0]
+        ], dtype=torch.float).to_sparse()
+    ], dedicom_decoder)
+
+    d_neg = negative_sample_data(d)
+
+    assert isinstance(d_neg, Data)
+    assert [(vt.name, vt.count) for vt in d_neg.vertex_types] == \
+        [(vt.name, vt.count) for vt in d.vertex_types]
+    assert d_neg.edge_types.keys() == d.edge_types.keys()
+    for et in d_neg.edge_types.values():
+        for adj_mat_neg in et.adjacency_matrices:
+            assert adj_mat_neg.shape == (5, 5)