import tensorflow as tf import numpy as np from collections import defaultdict import torch import torch.utils.data from typing import List, \ Union import icosagon.sampling import scipy.stats def test_unigram_01(): range_max = 7 distortion = 0.75 batch_size = 500 unigrams = [ 1, 3, 2, 1, 2, 1, 3] num_true = 1 true_classes = np.zeros((batch_size, num_true), dtype=np.int64) for i in range(batch_size): true_classes[i, 0] = i % range_max true_classes = tf.convert_to_tensor(true_classes) neg_samples, _, _ = tf.nn.fixed_unigram_candidate_sampler( true_classes=true_classes, num_true=num_true, num_sampled=batch_size, unique=False, range_max=range_max, distortion=distortion, unigrams=unigrams) assert neg_samples.shape == (batch_size,) for i in range(batch_size): assert neg_samples[i] != true_classes[i, 0] counts = defaultdict(int) with tf.Session() as sess: neg_samples = neg_samples.eval() for x in neg_samples: counts[x] += 1 print('counts:', counts) assert counts[0] < counts[1] and \ counts[0] < counts[2] and \ counts[0] < counts[4] and \ counts[0] < counts[6] assert counts[2] < counts[1] and \ counts[0] < counts[6] assert counts[3] < counts[1] and \ counts[3] < counts[2] and \ counts[3] < counts[4] and \ counts[3] < counts[6] assert counts[4] < counts[1] and \ counts[4] < counts[6] assert counts[5] < counts[1] and \ counts[5] < counts[2] and \ counts[5] < counts[4] and \ counts[5] < counts[6] def test_unigram_02(): range_max = 7 distortion = 0.75 batch_size = 500 unigrams = [ 1, 3, 2, 1, 2, 1, 3] num_true = 1 true_classes = np.zeros((batch_size, num_true), dtype=np.int64) for i in range(batch_size): true_classes[i, 0] = i % range_max true_classes = torch.tensor(true_classes) neg_samples = icosagon.sampling.fixed_unigram_candidate_sampler( true_classes=true_classes, unigrams=unigrams, distortion=distortion) assert neg_samples.shape == (batch_size,) for i in range(batch_size): assert neg_samples[i] != true_classes[i, 0] counts = defaultdict(int) for x in neg_samples: counts[x.item()] += 1 print('counts:', counts) assert counts[0] < counts[1] and \ counts[0] < counts[2] and \ counts[0] < counts[4] and \ counts[0] < counts[6] assert counts[2] < counts[1] and \ counts[0] < counts[6] assert counts[3] < counts[1] and \ counts[3] < counts[2] and \ counts[3] < counts[4] and \ counts[3] < counts[6] assert counts[4] < counts[1] and \ counts[4] < counts[6] assert counts[5] < counts[1] and \ counts[5] < counts[2] and \ counts[5] < counts[4] and \ counts[5] < counts[6] def test_unigram_03(): range_max = 7 distortion = 0.75 batch_size = 25 unigrams = [ 1, 3, 2, 1, 2, 1, 3] num_true = 1 true_classes = np.zeros((batch_size, num_true), dtype=np.int64) for i in range(batch_size): true_classes[i, 0] = i % range_max true_classes_tf = tf.convert_to_tensor(true_classes) true_classes_torch = torch.tensor(true_classes) counts_tf = defaultdict(list) counts_torch = defaultdict(list) for i in range(100): neg_samples, _, _ = tf.nn.fixed_unigram_candidate_sampler( true_classes=true_classes_tf, num_true=num_true, num_sampled=batch_size, unique=False, range_max=range_max, distortion=distortion, unigrams=unigrams) counts = defaultdict(int) with tf.Session() as sess: neg_samples = neg_samples.eval() for x in neg_samples: counts[x.item()] += 1 for k, v in counts.items(): counts_tf[k].append(v) neg_samples = icosagon.sampling.fixed_unigram_candidate_sampler( true_classes=true_classes, distortion=distortion, unigrams=unigrams) counts = defaultdict(int) for x in neg_samples: counts[x.item()] += 1 for k, v in counts.items(): counts_torch[k].append(v) for i in range(range_max): print('counts_tf[%d]:' % i, counts_tf[i]) print('counts_torch[%d]:' % i, counts_torch[i]) for i in range(range_max): statistic, pvalue = scipy.stats.ttest_ind(counts_tf[i], counts_torch[i]) assert pvalue * range_max > .05