diff --git a/src/icosagon/sampling.py b/src/icosagon/sampling.py index 8de44f9..28b143d 100644 --- a/src/icosagon/sampling.py +++ b/src/icosagon/sampling.py @@ -13,14 +13,14 @@ from typing import List, \ def fixed_unigram_candidate_sampler( true_classes: Union[np.array, torch.Tensor], - num_samples: int, unigrams: List[Union[int, float]], distortion: float = 1.): if isinstance(true_classes, torch.Tensor): true_classes = true_classes.detach().cpu().numpy() - if true_classes.shape[0] != num_samples: + if len(true_classes.shape) != 2: raise ValueError('true_classes must be a 2D matrix with shape (num_samples, num_true)') + num_samples = true_classes.shape[0] unigrams = np.array(unigrams) if distortion != 1.: unigrams = unigrams.astype(np.float64) ** distortion @@ -39,4 +39,4 @@ def fixed_unigram_candidate_sampler( mask = mask.sum(1).astype(np.bool) # print('mask:', mask) indices = indices[mask] - return result + return torch.tensor(result) diff --git a/src/icosagon/trainprep.py b/src/icosagon/trainprep.py index 38beec6..a999dec 100644 --- a/src/icosagon/trainprep.py +++ b/src/icosagon/trainprep.py @@ -13,6 +13,9 @@ from typing import Any, \ Dict from .data import NodeType from collections import defaultdict +from .normalize import norm_adj_mat_one_node_type, \ + norm_adj_mat_two_node_types +import numpy as np @dataclass @@ -70,28 +73,50 @@ def train_val_test_split_edges(edges: torch.Tensor, return TrainValTest(edges_train, edges_val, edges_test) +def get_edges_and_degrees(adj_mat): + if adj_mat.is_sparse: + adj_mat = adj_mat.coalesce() + degrees = torch.zeros(adj_mat.shape[1], dtype=torch.int64) + degrees = degrees.index_add(0, adj_mat.indices()[1], + torch.ones(adj_mat.indices().shape[1], dtype=torch.int64)) + edges_pos = adj_mat.indices().transpose(0, 1) + else: + degrees = adj_mat.sum(0) + edges_pos = torch.nonzero(adj_mat) + return edges_pos, degrees + + def prepare_adj_mat(adj_mat: torch.Tensor, ratios: TrainValTest) -> Tuple[TrainValTest, TrainValTest]: - degrees = adj_mat.sum(0) - edges_pos = torch.nonzero(adj_mat) + if not isinstance(adj_mat, torch.Tensor): + raise ValueError('adj_mat must be a torch.Tensor') - neg_neighbors = fixed_unigram_candidate_sampler(edges_pos[:, 1], - len(edges), degrees, 0.75) - edges_neg = torch.cat((edges_pos[:, 0], neg_neighbors.view(-1, 1)), 1) + edges_pos, degrees = get_edges_and_degrees(adj_mat) + + neg_neighbors = fixed_unigram_candidate_sampler( + edges_pos[:, 1].view(-1, 1), degrees, 0.75) + print(edges_pos.dtype) + print(neg_neighbors.dtype) + edges_neg = torch.cat((edges_pos[:, 0].view(-1, 1), neg_neighbors.view(-1, 1)), 1) edges_pos = train_val_test_split_edges(edges_pos, ratios) edges_neg = train_val_test_split_edges(edges_neg, ratios) - return edges_pos, edges_neg + adj_mat_train = torch.sparse_coo_tensor(indices = edges_pos.train.transpose(0, 1), + values=torch.ones(len(edges_pos.train), dtype=adj_mat.dtype)) + + return adj_mat_train, edges_pos, edges_neg def prepare_relation(r, ratios): adj_mat = r.adjacency_matrix - edges_pos, edges_neg = prepare_adj_mat(adj_mat) + adj_mat_train, edges_pos, edges_neg = prepare_adj_mat(adj_mat) - adj_mat_train = torch.sparse_coo_tensor(indices = edges_pos[0].transpose(0, 1), - values=torch.ones(len(edges_pos[0]), dtype=adj_mat.dtype)) + if r.node_type_row == r.node_type_column: + adj_mat_train = norm_adj_mat_one_node_type(adj_mat_train) + else: + adj_mat_train = norm_adj_mat_two_node_types(adj_mat_train) return PreparedRelation(r.name, r.node_type_row, r.node_type_column, adj_mat_train, edges_pos, edges_neg) diff --git a/tests/icosagon/test_data.py b/tests/icosagon/test_data.py index ef64e7b..4ec8164 100644 --- a/tests/icosagon/test_data.py +++ b/tests/icosagon/test_data.py @@ -1,3 +1,9 @@ +# +# Copyright (C) Stanislaw Adaszewski, 2020 +# License: GPLv3 +# + + from icosagon import Data import torch import pytest diff --git a/tests/icosagon/test_trainprep.py b/tests/icosagon/test_trainprep.py index 874b2c4..ae9c562 100644 --- a/tests/icosagon/test_trainprep.py +++ b/tests/icosagon/test_trainprep.py @@ -1,8 +1,17 @@ +# +# Copyright (C) Stanislaw Adaszewski, 2020 +# License: GPLv3 +# + + from icosagon.trainprep import TrainValTest, \ - train_val_test_split_edges + train_val_test_split_edges, \ + get_edges_and_degrees, \ + prepare_adj_mat import torch import pytest import numpy as np +from itertools import chain def test_train_val_test_split_edges_01(): @@ -43,16 +52,65 @@ def test_train_val_test_split_edges_01(): res.test.shape == (0, 2) +def test_train_val_test_split_edges_02(): + edges = torch.randint(0, 30, (30, 2)) + ratios = TrainValTest(.8, .1, .1) + res = train_val_test_split_edges(edges, ratios) + edges = [ tuple(a) for a in edges ] + res = [ tuple(a) for a in chain(res.train, res.val, res.test) ] + assert all([ a in edges for a in res ]) + + +def test_get_edges_and_degrees_01(): + adj_mat_dense = (torch.rand((10, 10)) > .5) + adj_mat_sparse = adj_mat_dense.to_sparse() + edges_dense, degrees_dense = get_edges_and_degrees(adj_mat_dense) + edges_sparse, degrees_sparse = get_edges_and_degrees(adj_mat_sparse) + assert torch.all(degrees_dense == degrees_sparse) + edges_dense = [ tuple(a) for a in edges_dense ] + edges_sparse = [ tuple(a) for a in edges_dense ] + assert len(edges_dense) == len(edges_sparse) + assert all([ a in edges_dense for a in edges_sparse ]) + assert all([ a in edges_sparse for a in edges_dense ]) + # assert torch.all(edges_dense == edges_sparse) + + +def test_prepare_adj_mat_01(): + adj_mat = (torch.rand((10, 10)) > .5) + adj_mat = adj_mat.to_sparse() + ratios = TrainValTest(.8, .1, .1) + _ = prepare_adj_mat(adj_mat, ratios) + + +def test_prepare_adj_mat_02(): + adj_mat = (torch.rand((10, 10)) > .5) + adj_mat = adj_mat.to_sparse() + ratios = TrainValTest(.8, .1, .1) + (adj_mat_train, edges_pos, edges_neg) = prepare_adj_mat(adj_mat, ratios) + assert isinstance(adj_mat_train, torch.Tensor) + assert adj_mat_train.is_sparse + assert adj_mat_train.shape == adj_mat.shape + assert adj_mat_train.dtype == adj_mat.dtype + assert isinstance(edges_pos, TrainValTest) + assert isinstance(edges_neg, TrainValTest) + for a in ['train', 'val', 'test']: + for b in [edges_pos, edges_neg]: + edges = getattr(b, a) + assert isinstance(edges, torch.Tensor) + assert len(edges.shape) == 2 + assert edges.shape[1] == 2 - # if ratios.train + ratios.val + ratios.test != 1.0: - # raise ValueError('Train, validation and test ratios must add up to 1') - # - # order = torch.randperm(len(edges)) - # edges = edges[order, :] - # n = round(len(edges) * ratios.train) - # edges_train = edges[:n] - # n_1 = round(len(edges) * (ratios.train + ratios.val)) - # edges_val = edges[n:n_1] - # edges_test = edges[n_1:] - # - # return TrainValTest(edges_train, edges_val, edges_test) +# def prepare_adj_mat(adj_mat: torch.Tensor, +# ratios: TrainValTest) -> Tuple[TrainValTest, TrainValTest]: +# +# degrees = adj_mat.sum(0) +# edges_pos = torch.nonzero(adj_mat) +# +# neg_neighbors = fixed_unigram_candidate_sampler(edges_pos[:, 1], +# len(edges), degrees, 0.75) +# edges_neg = torch.cat((edges_pos[:, 0], neg_neighbors.view(-1, 1)), 1) +# +# edges_pos = train_val_test_split_edges(edges_pos, ratios) +# edges_neg = train_val_test_split_edges(edges_neg, ratios) +# +# return edges_pos, edges_neg