#
# Copyright (C) Stanislaw Adaszewski, 2020
# License: GPLv3
#

# NOTE(review): this span was a git patch flattened onto two physical lines.
# Only its Python definitions are reproduced here (they came from
# src/icosagon/normalize.py and src/icosagon/sampling.py); the SVG hunk and
# the licence-header-only hunks carried no definitions.

from typing import List, Union

import numpy as np
import scipy.sparse as sp
import torch
import torch.utils.data


# ---- src/icosagon/normalize.py ----

def norm_adj_mat_one_node_type(adj):
    """Normalise a square adjacency matrix after adding self-loops.

    With ``A_ = adj + I`` and ``D`` the diagonal matrix of the row sums of
    ``A_`` raised to the power -0.5, the result is ``(A_ D)^T D``.

    Args:
        adj: anything accepted by ``scipy.sparse.coo_matrix``; must be square.

    Returns:
        A scipy sparse matrix with the normalised values.

    Raises:
        AssertionError: if ``adj`` is not square.
    """
    adj = sp.coo_matrix(adj)
    assert adj.shape[0] == adj.shape[1]
    # The added identity gives every row a non-zero sum, so the inverse
    # square root below never divides by zero for non-negative input.
    adj_ = adj + sp.eye(adj.shape[0])
    rowsum = np.array(adj_.sum(1))
    degree_mat_inv_sqrt = np.power(rowsum, -0.5).flatten()
    degree_mat_inv_sqrt = sp.diags(degree_mat_inv_sqrt)
    adj_normalized = adj_.dot(degree_mat_inv_sqrt).transpose().dot(degree_mat_inv_sqrt)
    return adj_normalized


def _inv_sqrt_degree(degree):
    """Return ``degree ** -0.5`` as a flat float array, using 0 where undefined."""
    degree = np.asarray(degree, dtype=np.float64).flatten()
    with np.errstate(divide='ignore', invalid='ignore'):
        inv_sqrt = np.power(degree, -0.5)
    # np.nan_to_num would turn +inf into the largest finite float rather
    # than 0, so zero-degree entries are cleared explicitly.
    inv_sqrt[~np.isfinite(inv_sqrt)] = 0.
    return inv_sqrt


def norm_adj_mat_two_node_types(adj):
    """Normalise a (possibly rectangular) adjacency matrix by row and column sums.

    Computes ``D_r A D_c`` where ``D_r`` / ``D_c`` are diagonal matrices
    holding the row / column sums of ``adj`` raised to the power -0.5.
    Rows or columns whose sum is zero (or negative) get a scale of 0.

    Args:
        adj: anything accepted by ``scipy.sparse.coo_matrix``.

    Returns:
        The normalised matrix in COO format.
    """
    adj = sp.coo_matrix(adj)
    rowsum = np.array(adj.sum(1))
    colsum = np.array(adj.sum(0))
    rowdegree_mat_inv = sp.diags(_inv_sqrt_degree(rowsum))
    coldegree_mat_inv = sp.diags(_inv_sqrt_degree(colsum))
    adj_normalized = rowdegree_mat_inv.dot(adj).dot(coldegree_mat_inv).tocoo()
    return adj_normalized


# ---- src/icosagon/sampling.py ----

def fixed_unigram_candidate_sampler(
        true_classes: Union[np.ndarray, torch.Tensor],
        num_samples: int,
        unigrams: List[Union[int, float]],
        distortion: float = 1.):
    """Draw one class per row that differs from all of that row's true classes.

    Classes are drawn with ``torch.utils.data.WeightedRandomSampler`` using
    ``unigrams ** distortion`` as weights; rows whose draw collides with one
    of their true classes are redrawn until no collisions remain.

    Args:
        true_classes: array or tensor of shape ``(num_samples, num_true)``.
            A 1-D input of length ``num_samples`` is treated as one true
            class per row.
        num_samples: number of rows / samples to produce.
        unigrams: per-class sampling weights.
        distortion: exponent applied to ``unigrams`` (skipped when 1).

    Returns:
        ``np.ndarray`` of dtype int64 and shape ``(num_samples,)``.

    Raises:
        ValueError: if ``true_classes`` is not 2-D (or 1-D) with
            ``num_samples`` rows.

    NOTE(review): if every class with non-zero weight is listed among a
    row's true classes the rejection loop cannot terminate.
    """
    if isinstance(true_classes, torch.Tensor):
        true_classes = true_classes.detach().cpu().numpy()
    true_classes = np.asarray(true_classes)
    if true_classes.ndim == 1:
        true_classes = true_classes.reshape(-1, 1)
    if true_classes.ndim != 2 or true_classes.shape[0] != num_samples:
        raise ValueError('true_classes must be a 2D matrix with shape (num_samples, num_true)')

    unigrams = np.array(unigrams)
    if distortion != 1.:
        unigrams = unigrams.astype(np.float64) ** distortion

    indices = np.arange(num_samples)  # rows that still need a valid draw
    result = np.zeros(num_samples, dtype=np.int64)
    while len(indices) > 0:
        sampler = torch.utils.data.WeightedRandomSampler(unigrams, len(indices))
        candidates = np.array(list(sampler)).reshape(-1, 1)
        result[indices] = candidates[:, 0]
        # A row collides when its candidate equals any of its true classes.
        # (.any replaces the removed ``np.bool`` cast of a row sum.)
        collides = (candidates == true_classes[indices, :]).any(axis=1)
        indices = indices[collides]
    return result
#
# Copyright (C) Stanislaw Adaszewski, 2020
# License: GPLv3
#

# NOTE(review): this span was a git patch flattened onto three physical
# lines. Its Python definitions are reproduced here; they came from
# src/icosagon/trainprep.py and tests/icosagon/test_trainprep.py.

import math
from collections import defaultdict
from dataclasses import dataclass
from typing import Any, Dict, List, Tuple, TYPE_CHECKING

import numpy as np
import pytest
import torch

if TYPE_CHECKING:
    # Only needed for the annotation on PreparedData.node_types.
    from .data import NodeType


# ---- src/icosagon/trainprep.py ----

@dataclass
class TrainValTest(object):
    """A triple of values for the train, validation and test parts."""
    train: Any
    val: Any
    test: Any


@dataclass
class PreparedEdges(object):
    """Positive and negative edges, each split into train/val/test."""
    positive: TrainValTest
    negative: TrainValTest


@dataclass
class PreparedRelationType(object):
    """A relation together with its training adjacency matrix and edge splits."""
    name: str
    node_type_row: int
    node_type_column: int
    adj_mat_train: torch.Tensor
    edges_pos: TrainValTest
    edges_neg: TrainValTest


@dataclass
class PreparedData(object):
    """Node types plus prepared relations indexed by row and column node type."""
    node_types: List['NodeType']
    relation_types: Dict[int, Dict[int, List[PreparedRelationType]]]


def train_val_test_split_edges(edges: torch.Tensor,
        ratios: TrainValTest) -> TrainValTest:
    """Shuffle ``edges`` and cut them into train, validation and test parts.

    Args:
        edges: tensor of shape ``(num_edges, 2)``.
        ratios: fractions for each part; they must add up to 1.

    Returns:
        A ``TrainValTest`` holding the three edge tensors.

    Raises:
        ValueError: if ``edges`` is not a tensor of shape ``(num_edges, 2)``,
            if ``ratios`` is not a ``TrainValTest``, or if the ratios do not
            add up to 1.
    """
    if not isinstance(edges, torch.Tensor):
        raise ValueError('edges must be a torch.Tensor')

    if len(edges.shape) != 2 or edges.shape[1] != 2:
        raise ValueError('edges shape must be (num_edges, 2)')

    if not isinstance(ratios, TrainValTest):
        raise ValueError('ratios must be a TrainValTest')

    # Tolerant comparison: e.g. .7 + .2 + .1 evaluates to 0.9999999999999999
    # in floating point and must still be accepted.
    if not math.isclose(ratios.train + ratios.val + ratios.test, 1.0):
        raise ValueError('Train, validation and test ratios must add up to 1')

    order = torch.randperm(len(edges))
    edges = edges[order, :]
    n = round(len(edges) * ratios.train)
    edges_train = edges[:n]
    n_1 = round(len(edges) * (ratios.train + ratios.val))
    edges_val = edges[n:n_1]
    edges_test = edges[n_1:]

    return TrainValTest(edges_train, edges_val, edges_test)


def prepare_adj_mat(adj_mat: torch.Tensor,
        ratios: TrainValTest) -> Tuple[TrainValTest, TrainValTest]:
    """Build positive and negative edge splits from an adjacency matrix.

    Positive edges are the non-zero positions of ``adj_mat``. For every
    positive edge one negative edge is made from the same row index and a
    column drawn by ``fixed_unigram_candidate_sampler`` (weights: column
    sums of ``adj_mat``, distortion 0.75) that differs from the positive
    edge's column.

    Args:
        adj_mat: adjacency matrix tensor.
        ratios: train/val/test fractions passed to
            ``train_val_test_split_edges``.

    Returns:
        ``(edges_pos, edges_neg)``, each a ``TrainValTest`` of edge tensors.
    """
    # Function-scope import keeps the dataclasses and the splitting helper
    # importable without the sampler module.
    from .sampling import fixed_unigram_candidate_sampler

    degrees = adj_mat.sum(0)
    edges_pos = torch.nonzero(adj_mat)

    # The sampler expects true classes shaped (num_samples, num_true) and
    # returns a NumPy array, so slice to keep the column axis and convert
    # the result back to a tensor.
    neg_neighbors = fixed_unigram_candidate_sampler(
        edges_pos[:, 1:], len(edges_pos), degrees, 0.75)
    neg_neighbors = torch.as_tensor(neg_neighbors,
        dtype=edges_pos.dtype, device=edges_pos.device)
    edges_neg = torch.cat((edges_pos[:, :1], neg_neighbors.view(-1, 1)), 1)

    edges_pos = train_val_test_split_edges(edges_pos, ratios)
    edges_neg = train_val_test_split_edges(edges_neg, ratios)

    return edges_pos, edges_neg


def prepare_relation(r, ratios):
    """Prepare one relation: edge splits plus the training adjacency matrix.

    Args:
        r: object exposing ``adjacency_matrix``, ``name``, ``node_type_row``
            and ``node_type_column``.
        ratios: train/val/test fractions.

    Returns:
        A ``PreparedRelationType``.
    """
    adj_mat = r.adjacency_matrix
    edges_pos, edges_neg = prepare_adj_mat(adj_mat, ratios)

    # The size is given explicitly; otherwise it would be inferred from the
    # largest index present among the training edges.
    adj_mat_train = torch.sparse_coo_tensor(
        indices=edges_pos.train.transpose(0, 1),
        values=torch.ones(len(edges_pos.train), dtype=adj_mat.dtype),
        size=adj_mat.shape)

    return PreparedRelationType(r.name, r.node_type_row, r.node_type_column,
        adj_mat_train, edges_pos, edges_neg)


def prepare_training(data, ratios=None):
    """Prepare every relation of ``data`` for training.

    Args:
        data: object exposing ``node_types`` and ``relation_types``.
            ``relation_types`` may be a mapping from
            ``(node_type_row, node_type_column)`` to a list of relations, or
            an iterable of such ``(key, relations)`` pairs.
            NOTE(review): the Data class is not visible here - confirm
            which of the two it is.
        ratios: train/val/test fractions; defaults to
            ``TrainValTest(.8, .1, .1)`` when omitted.

    Returns:
        A ``PreparedData`` with the prepared relations grouped by row and
        column node type.
    """
    if ratios is None:
        ratios = TrainValTest(.8, .1, .1)

    source = data.relation_types
    if hasattr(source, 'items'):
        source = source.items()

    relation_types = defaultdict(lambda: defaultdict(list))
    for (node_type_row, node_type_column), rels in source:
        for r in rels:
            relation_types[node_type_row][node_type_column].append(
                prepare_relation(r, ratios))
    return PreparedData(data.node_types, relation_types)


# ---- tests/icosagon/test_trainprep.py ----

def test_train_val_test_split_edges_01():
    edges = torch.randint(0, 10, (10, 2))
    with pytest.raises(ValueError):
        _ = train_val_test_split_edges(edges, TrainValTest(.5, .5, .5))
    with pytest.raises(ValueError):
        _ = train_val_test_split_edges(edges, TrainValTest(.2, .2, .2))
    with pytest.raises(ValueError):
        _ = train_val_test_split_edges(edges, None)
    with pytest.raises(ValueError):
        _ = train_val_test_split_edges(edges, (.8, .1, .1))
    with pytest.raises(ValueError):
        _ = train_val_test_split_edges(np.random.randint(0, 10, (10, 2)), TrainValTest(.8, .1, .1))
    with pytest.raises(ValueError):
        _ = train_val_test_split_edges(torch.randint(0, 10, (10, 3)), TrainValTest(.8, .1, .1))
    with pytest.raises(ValueError):
        _ = train_val_test_split_edges(torch.randint(0, 10, (10, 2, 1)), TrainValTest(.8, .1, .1))
    with pytest.raises(ValueError):
        _ = train_val_test_split_edges(None, TrainValTest(.8, .2, .2))
    res = train_val_test_split_edges(edges, TrainValTest(.8, .1, .1))
    assert res.train.shape == (8, 2) and res.val.shape == (1, 2) and \
        res.test.shape == (1, 2)
    res = train_val_test_split_edges(edges, TrainValTest(.8, .0, .2))
    assert res.train.shape == (8, 2) and res.val.shape == (0, 2) and \
        res.test.shape == (2, 2)
    res = train_val_test_split_edges(edges, TrainValTest(.8, .2, .0))
    assert res.train.shape == (8, 2) and res.val.shape == (2, 2) and \
        res.test.shape == (0, 2)
    res = train_val_test_split_edges(edges, TrainValTest(.0, .5, .5))
    assert res.train.shape == (0, 2) and res.val.shape == (5, 2) and \
        res.test.shape == (5, 2)
    res = train_val_test_split_edges(edges, TrainValTest(.0, .0, 1.))
    assert res.train.shape == (0, 2) and res.val.shape == (0, 2) and \
        res.test.shape == (10, 2)
    res = train_val_test_split_edges(edges, TrainValTest(.0, 1., .0))
    assert res.train.shape == (0, 2) and res.val.shape == (10, 2) and \
        res.test.shape == (0, 2)
    # Ratios whose floating-point sum is not exactly 1.0 are still valid.
    res = train_val_test_split_edges(edges, TrainValTest(.7, .2, .1))
    assert res.train.shape == (7, 2) and res.val.shape == (2, 2) and \
        res.test.shape == (1, 2)