| @@ -13,14 +13,14 @@ from typing import List, \ | |||||
| def fixed_unigram_candidate_sampler( | def fixed_unigram_candidate_sampler( | ||||
| true_classes: Union[np.array, torch.Tensor], | true_classes: Union[np.array, torch.Tensor], | ||||
| num_samples: int, | |||||
| unigrams: List[Union[int, float]], | unigrams: List[Union[int, float]], | ||||
| distortion: float = 1.): | distortion: float = 1.): | ||||
| if isinstance(true_classes, torch.Tensor): | if isinstance(true_classes, torch.Tensor): | ||||
| true_classes = true_classes.detach().cpu().numpy() | true_classes = true_classes.detach().cpu().numpy() | ||||
| if true_classes.shape[0] != num_samples: | |||||
| if len(true_classes.shape) != 2: | |||||
| raise ValueError('true_classes must be a 2D matrix with shape (num_samples, num_true)') | raise ValueError('true_classes must be a 2D matrix with shape (num_samples, num_true)') | ||||
| num_samples = true_classes.shape[0] | |||||
| unigrams = np.array(unigrams) | unigrams = np.array(unigrams) | ||||
| if distortion != 1.: | if distortion != 1.: | ||||
| unigrams = unigrams.astype(np.float64) ** distortion | unigrams = unigrams.astype(np.float64) ** distortion | ||||
| @@ -39,4 +39,4 @@ def fixed_unigram_candidate_sampler( | |||||
| mask = mask.sum(1).astype(np.bool) | mask = mask.sum(1).astype(np.bool) | ||||
| # print('mask:', mask) | # print('mask:', mask) | ||||
| indices = indices[mask] | indices = indices[mask] | ||||
| return result | |||||
| return torch.tensor(result) | |||||
| @@ -13,6 +13,9 @@ from typing import Any, \ | |||||
| Dict | Dict | ||||
| from .data import NodeType | from .data import NodeType | ||||
| from collections import defaultdict | from collections import defaultdict | ||||
| from .normalize import norm_adj_mat_one_node_type, \ | |||||
| norm_adj_mat_two_node_types | |||||
| import numpy as np | |||||
| @dataclass | @dataclass | ||||
| @@ -70,28 +73,50 @@ def train_val_test_split_edges(edges: torch.Tensor, | |||||
| return TrainValTest(edges_train, edges_val, edges_test) | return TrainValTest(edges_train, edges_val, edges_test) | ||||
def get_edges_and_degrees(adj_mat: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
    """Return the edge list and per-column degrees of an adjacency matrix.

    Args:
        adj_mat: A 2D adjacency matrix, either dense or sparse (COO).

    Returns:
        A pair ``(edges_pos, degrees)``. ``edges_pos`` has one row per
        edge holding its ``(row, column)`` coordinates. ``degrees`` has one
        entry per column of ``adj_mat``.

    Note:
        For a sparse input the degree of a column is the number of stored
        entries in that column (stored values are not consulted); for a dense
        input it is the sum of the column. The two agree for 0/1 matrices.
    """
    if adj_mat.is_sparse:
        # indices() is only available on a coalesced sparse tensor.
        adj_mat = adj_mat.coalesce()
        indices = adj_mat.indices()
        columns = indices[1]
        # Allocate the accumulator next to the indices so that index_add
        # also works when the matrix does not live on the default device.
        degrees = torch.zeros(adj_mat.shape[1], dtype=torch.int64,
            device=columns.device)
        degrees = degrees.index_add(0, columns, torch.ones_like(columns))
        edges_pos = indices.transpose(0, 1)
    else:
        degrees = adj_mat.sum(0)
        edges_pos = torch.nonzero(adj_mat)
    return edges_pos, degrees
def prepare_adj_mat(adj_mat: torch.Tensor,
    ratios: TrainValTest) -> Tuple[torch.Tensor, TrainValTest, TrainValTest]:
    """Derive train/val/test edge sets and a training adjacency matrix.

    Args:
        adj_mat: Adjacency matrix as a dense or sparse ``torch.Tensor``.
        ratios: Train/validation/test proportions forwarded to
            ``train_val_test_split_edges``.

    Returns:
        A triple ``(adj_mat_train, edges_pos, edges_neg)``:
        ``adj_mat_train`` is a sparse COO tensor with the shape and dtype of
        ``adj_mat`` holding ones at the positive training edges;
        ``edges_pos`` and ``edges_neg`` are ``TrainValTest`` splits of the
        positive and the sampled negative edges respectively.

    Raises:
        ValueError: If ``adj_mat`` is not a ``torch.Tensor``.
    """
    if not isinstance(adj_mat, torch.Tensor):
        raise ValueError('adj_mat must be a torch.Tensor')

    edges_pos, degrees = get_edges_and_degrees(adj_mat)

    # One negative column is drawn per positive edge; column degrees act as
    # the unigram weights and 0.75 is passed as the sampler's third argument
    # (presumably its distortion exponent -- confirm against the sampler).
    neg_neighbors = fixed_unigram_candidate_sampler(
        edges_pos[:, 1].view(-1, 1), degrees, 0.75)
    # Negative edges keep the row of the positive edge they were drawn for.
    edges_neg = torch.cat((edges_pos[:, 0].view(-1, 1),
        neg_neighbors.view(-1, 1)), 1)

    edges_pos = train_val_test_split_edges(edges_pos, ratios)
    edges_neg = train_val_test_split_edges(edges_neg, ratios)

    # The size is passed explicitly: without it the shape would be inferred
    # from the largest training index and could be smaller than adj_mat.
    adj_mat_train = torch.sparse_coo_tensor(
        indices=edges_pos.train.transpose(0, 1),
        values=torch.ones(len(edges_pos.train), dtype=adj_mat.dtype),
        size=adj_mat.shape)

    return adj_mat_train, edges_pos, edges_neg
def prepare_relation(r, ratios):
    """Build a ``PreparedRelation`` from relation ``r``.

    Args:
        r: Relation object exposing ``name``, ``node_type_row``,
            ``node_type_column`` and ``adjacency_matrix``.
        ratios: Train/validation/test proportions forwarded to
            ``prepare_adj_mat``.

    Returns:
        A ``PreparedRelation`` carrying the normalized training adjacency
        matrix together with the positive and negative edge splits.
    """
    adj_mat = r.adjacency_matrix
    # ratios must be forwarded; prepare_adj_mat has no default for it.
    adj_mat_train, edges_pos, edges_neg = prepare_adj_mat(adj_mat, ratios)

    # Relations within one node type and between two node types use
    # different normalization helpers.
    if r.node_type_row == r.node_type_column:
        adj_mat_train = norm_adj_mat_one_node_type(adj_mat_train)
    else:
        adj_mat_train = norm_adj_mat_two_node_types(adj_mat_train)

    return PreparedRelation(r.name, r.node_type_row, r.node_type_column,
        adj_mat_train, edges_pos, edges_neg)
| @@ -1,3 +1,9 @@ | |||||
| # | |||||
| # Copyright (C) Stanislaw Adaszewski, 2020 | |||||
| # License: GPLv3 | |||||
| # | |||||
| from icosagon import Data | from icosagon import Data | ||||
| import torch | import torch | ||||
| import pytest | import pytest | ||||
| @@ -1,8 +1,17 @@ | |||||
| # | |||||
| # Copyright (C) Stanislaw Adaszewski, 2020 | |||||
| # License: GPLv3 | |||||
| # | |||||
| from icosagon.trainprep import TrainValTest, \ | from icosagon.trainprep import TrainValTest, \ | ||||
| train_val_test_split_edges | |||||
| train_val_test_split_edges, \ | |||||
| get_edges_and_degrees, \ | |||||
| prepare_adj_mat | |||||
| import torch | import torch | ||||
| import pytest | import pytest | ||||
| import numpy as np | import numpy as np | ||||
| from itertools import chain | |||||
| def test_train_val_test_split_edges_01(): | def test_train_val_test_split_edges_01(): | ||||
| @@ -43,16 +52,65 @@ def test_train_val_test_split_edges_01(): | |||||
| res.test.shape == (0, 2) | res.test.shape == (0, 2) | ||||
def test_train_val_test_split_edges_02():
    # Every edge found in any of the three splits must be one of the inputs.
    source_edges = torch.randint(0, 30, (30, 2))
    split = train_val_test_split_edges(source_edges, TrainValTest(.8, .1, .1))
    known = [tuple(row) for row in source_edges]
    produced = [tuple(row) for row in chain(split.train, split.val, split.test)]
    assert all(edge in known for edge in produced)
def test_get_edges_and_degrees_01():
    # The dense and the sparse code path must report the same degrees and
    # the same set of edges for the same random 0/1 matrix.
    adj_mat_dense = (torch.rand((10, 10)) > .5)
    adj_mat_sparse = adj_mat_dense.to_sparse()
    edges_dense, degrees_dense = get_edges_and_degrees(adj_mat_dense)
    edges_sparse, degrees_sparse = get_edges_and_degrees(adj_mat_sparse)
    assert torch.all(degrees_dense == degrees_sparse)
    # Convert to plain int tuples so that the comparison is by value; the
    # sparse list is built from the sparse result, not from the dense one.
    edges_dense = [tuple(a) for a in edges_dense.tolist()]
    edges_sparse = [tuple(a) for a in edges_sparse.tolist()]
    assert len(edges_dense) == len(edges_sparse)
    # Edge order is deliberately not asserted, only membership both ways.
    assert set(edges_dense) == set(edges_sparse)
def test_prepare_adj_mat_01():
    # Smoke test: preparing a random sparse matrix must not raise.
    sparse_adj = (torch.rand((10, 10)) > .5).to_sparse()
    split_ratios = TrainValTest(.8, .1, .1)
    prepare_adj_mat(sparse_adj, split_ratios)
def test_prepare_adj_mat_02():
    # Checks the types and shapes of everything prepare_adj_mat returns.
    adj_mat = (torch.rand((10, 10)) > .5).to_sparse()
    ratios = TrainValTest(.8, .1, .1)

    adj_mat_train, edges_pos, edges_neg = prepare_adj_mat(adj_mat, ratios)

    # The training matrix mirrors the input's kind, shape and dtype.
    assert isinstance(adj_mat_train, torch.Tensor)
    assert adj_mat_train.is_sparse
    assert adj_mat_train.shape == adj_mat.shape
    assert adj_mat_train.dtype == adj_mat.dtype

    assert isinstance(edges_pos, TrainValTest)
    assert isinstance(edges_neg, TrainValTest)

    # Every part of both splits is an (n, 2) tensor of edge coordinates.
    for part_name in ('train', 'val', 'test'):
        for split in (edges_pos, edges_neg):
            part = getattr(split, part_name)
            assert isinstance(part, torch.Tensor)
            assert len(part.shape) == 2
            assert part.shape[1] == 2
| # if ratios.train + ratios.val + ratios.test != 1.0: | |||||
| # raise ValueError('Train, validation and test ratios must add up to 1') | |||||
| # | |||||
| # order = torch.randperm(len(edges)) | |||||
| # edges = edges[order, :] | |||||
| # n = round(len(edges) * ratios.train) | |||||
| # edges_train = edges[:n] | |||||
| # n_1 = round(len(edges) * (ratios.train + ratios.val)) | |||||
| # edges_val = edges[n:n_1] | |||||
| # edges_test = edges[n_1:] | |||||
| # | |||||
| # return TrainValTest(edges_train, edges_val, edges_test) | |||||
| # def prepare_adj_mat(adj_mat: torch.Tensor, | |||||
| # ratios: TrainValTest) -> Tuple[TrainValTest, TrainValTest]: | |||||
| # | |||||
| # degrees = adj_mat.sum(0) | |||||
| # edges_pos = torch.nonzero(adj_mat) | |||||
| # | |||||
| # neg_neighbors = fixed_unigram_candidate_sampler(edges_pos[:, 1], | |||||
| # len(edges), degrees, 0.75) | |||||
| # edges_neg = torch.cat((edges_pos[:, 0], neg_neighbors.view(-1, 1)), 1) | |||||
| # | |||||
| # edges_pos = train_val_test_split_edges(edges_pos, ratios) | |||||
| # edges_neg = train_val_test_split_edges(edges_neg, ratios) | |||||
| # | |||||
| # return edges_pos, edges_neg | |||||