// Fills two estimators with the same data, checks that decoding them against each other
// reports zero differences, then removes 100 items from the first estimator and checks
// that the decoded estimate exceeds 100.
public void HybridEstimatorRemoveTest()
        {
            //create data
            var data = DataGenerator.Generate().Take(10000).ToArray();
            //create the estimator
            var configuration = new KeyValueLargeBloomFilterConfiguration();
            var factory       = new HybridEstimatorFactory();
            var estimator     = factory.Create(configuration, data.Length);

            //add data to the estimator
            foreach (var element in data)
            {
                estimator.Add(element);
            }
            //create a second estimator holding exactly the same items
            var estimator2 = factory.Create(configuration, data.Length);

            foreach (var element in data)
            {
                estimator2.Add(element);
            }
            //estimate differences
            var estimateBeforeRemoval = estimator.Decode(estimator2);

            //expected value goes first, so a failure reports expected/actual the right way round
            Assert.AreEqual(0, estimateBeforeRemoval, "Unexpected number of differences before removing items.");
            //remove 100 items from the first estimator only
            foreach (var item in data.Take(100))
            {
                estimator.Remove(item);
            }
            //compare differences
            var estimateAfterRemoval = estimator.Decode(estimator2);

            Assert.IsTrue(estimateAfterRemoval > 100, "Removal from estimator resulted in not enough differences.");
        }
        // Creates two deliberately oversized estimators, adds 50 distinct items to each,
        // compresses the first one, and checks both the block size before/after compression
        // and that the decoded difference estimate is unchanged by compression.
        public void HybridEstimatorCompressTest()
        {
            var data          = DataGenerator.Generate().Take(10000).ToArray();
            var configuration = new KeyValueLargeBloomFilterConfiguration();
            var factory       = new HybridEstimatorFactory();
            var estimator     = factory.Create(configuration, 2000 * data.Length);

            //expected value goes first, so a failure reports expected/actual the right way round
            Assert.AreEqual(1170, estimator.BlockSize, "Unexpected block size before compression.");
            foreach (var element in data.Take(50))
            {
                estimator.Add(element);
            }
            var estimator2 = factory.Create(configuration, 2000 * data.Length);

            //the second estimator gets the next 50 items, so the two estimators share no items
            foreach (var element in data.Skip(50).Take(50))
            {
                estimator2.Add(element);
            }
            var estimateBeforeCompression = estimator.Decode(estimator2);

            estimator.Compress(true);
            Assert.AreEqual(65, estimator.BlockSize, "Compression resulted in unexpected block size.");
            var estimateAfterCompression = estimator.Decode(estimator2);

            //note: rather tricky. Both estimators should have the same item count, otherwise compression does impact the result,
            //since one estimator no longer fits in the compressed one.
            Assert.AreEqual(estimateBeforeCompression, estimateAfterCompression, "Estimate changed due to compression.");
        }
        // Fills one estimator with the generated data, mutates part of that data (Id for one
        // group of items, Value for another), fills a second estimator with the mutated data,
        // and checks that the decoded difference estimate is at least 2 * halfTheDiff.
        public void HybridEstimatorBasicFillAndEstimate()
        {
            var data          = DataGenerator.Generate().Take(10000).ToArray();
            var configuration = new KeyValueLargeBloomFilterConfiguration();
            var factory       = new HybridEstimatorFactory();
            var estimator     = factory.Create(configuration, data.Length);

            foreach (var element in data)
            {
                estimator.Add(element);
            }
            //expected value goes first, so a failure reports expected/actual the right way round
            Assert.AreEqual(data.LongLength, estimator.ItemCount, "Estimator item count is wrong");
            var estimator2  = factory.Create(configuration, data.Length);
            var halfTheDiff = 100;

            //first group: change the identifier
            foreach (var elt in data.Take(halfTheDiff))
            {
                elt.Id += 1000000;
            }
            //second group: change the value. The previous offset (100000) was beyond the end of the
            //10000 item array, so this loop never ran; skip only the first group instead.
            foreach (var elt in data.Skip(halfTheDiff).Take(halfTheDiff))
            {
                elt.Value += 10;
            }
            //fill the second estimator with the partly mutated data
            foreach (var element in data)
            {
                estimator2.Add(element);
            }
            Assert.AreEqual(data.LongLength, estimator2.ItemCount, "Second estimator item count is wrong");
            var differenceCount = estimator.Decode(estimator2);

            Assert.IsTrue(differenceCount >= 2 * halfTheDiff, "Estimate below the difference count.");
        }
Example #4
0
        // Builds two precalculated actors over different data sets, lets the first determine
        // its difference with the second, and checks the reported identifiers against the
        // differences computed directly from the data sets (bounded false positives/negatives).
        public void TestPrecalculatedRoundTrip()
        {
            //choosing a counter type that is too small will result in many overflows (which manifests itself in horribly slow performance).
            //Keep the count type large enough, so extreme folds do not cause overflows. Benefit of folds outweighs benefit of small count types.
            var configuration = new KeyValueLargeBloomFilterConfiguration();
            IHybridEstimatorFactory       estimatorFactory   = new HybridEstimatorFactory();
            IInvertibleBloomFilterFactory bloomFilterFactory = new InvertibleBloomFilterFactory();
            //create the first actor
            var dataSet1 = DataGenerator.Generate().Take(15000).ToList();
            var actor1   = new PrecalculatedActor<short>(
                dataSet1,
                estimatorFactory,
                bloomFilterFactory,
                configuration);
            //create the second actor
            var dataSet2 = DataGenerator.Generate().Take(17000).ToList();

            dataSet2.Modify(1000);
            var actor2 = new PrecalculatedActor<short>(
                dataSet2,
                estimatorFactory,
                bloomFilterFactory,
                configuration);

            //have actor 1 determine the difference with actor 2.
            var timer  = Stopwatch.StartNew();
            var result = actor1.GetDifference(actor2);

            timer.Stop();
            Console.WriteLine($"Time: {timer.ElapsedMilliseconds} ms");

            //analyze results: every identifier reported in any of the three result sets
            var allFound = new HashSet<long>(result.Item1.Union(result.Item2).Union(result.Item3));

            Assert.IsTrue(allFound.Count > 3000, "Less than the expected number of diffferences found.");

            //index both data sets by identifier once, so the comparisons below are hash lookups
            //instead of nested scans over 15000 x 17000 items.
            var idsInSet1    = new HashSet<long>(dataSet1.Select(d => d.Id));
            var idsInSet2    = new HashSet<long>(dataSet2.Select(d => d.Id));
            var valuesInSet2 = dataSet2.ToLookup(d => d.Id, d => d.Value);

            //identifiers that really differ: only in set 1, only in set 2, or in both with a different value.
            var expectedDifferences = new HashSet<long>(
                dataSet1.Where(d => !idsInSet2.Contains(d.Id)).Select(d => d.Id));

            expectedDifferences.UnionWith(
                dataSet2.Where(d => !idsInSet1.Contains(d.Id)).Select(d => d.Id));
            //a lookup yields an empty sequence for an unknown identifier, so items missing from set 2 never count as modified.
            expectedDifferences.UnionWith(
                dataSet1.Where(d => valuesInSet2[d.Id].Any(value => value != d.Value)).Select(d => d.Id));

            //reported, but not an actual difference
            var falsePositiveCount = allFound.Count(id => !expectedDifferences.Contains(id));

            Assert.IsTrue(falsePositiveCount < 50, "Too many false positives found");

            //an actual difference, but not reported
            var falseNegativeCount = expectedDifferences.Count(id => !allFound.Contains(id));

            Assert.IsTrue(falseNegativeCount < 25, "Too many false negatives found");
        }
        // Fills an estimator with 900000 generated items, applies Modify(1000) to the same
        // list, and quasi-decodes the estimator against the modified list.
        // NOTE(review): the result is computed but never asserted, so this test can only fail
        // by throwing. The previous comment quoted an expected difference of about 91500, which
        // looks copied from HybridEstimatorQuasiDecodeTest - confirm the intended bounds.
        public void HybridEstimatorQuasiRandomDecodeTest()
        {
            var items         = DataGenerator.Generate().Take(900000).ToList();
            var configuration = new KeyValueLargeBloomFilterConfiguration();
            var estimator     = new HybridEstimatorFactory().Create(configuration, items.Count);

            //load every generated item into the estimator
            items.ForEach(item => estimator.Add(item));

            //change the list in place, then estimate how far it now is from the estimator's content
            items.Modify(1000);
            var quasiEstimate = estimator.QuasiDecode(configuration, items);
        }
        // Fills an estimator with 100000 generated items, then quasi-decodes it against a
        // different list (8000 generated items after skipping 500, with Modify(1000) applied)
        // and checks that the estimate falls inside the expected window.
        public void HybridEstimatorQuasiDecodeTest()
        {
            var items         = DataGenerator.Generate().Take(100000).ToList();
            var configuration = new KeyValueLargeBloomFilterConfiguration();
            var estimator     = new HybridEstimatorFactory().Create(configuration, items.Count);

            //load every generated item into the estimator
            items.ForEach(item => estimator.Add(item));

            //replace the list by a much smaller, modified one and estimate the difference
            items = DataGenerator.Generate().Skip(500).Take(8000).ToList();
            items.Modify(1000);
            var quasiEstimate = estimator.QuasiDecode(configuration, items);

            //actual difference is expected to be about 91500
            var aboveLowerBound = quasiEstimate > 90500;
            var belowUpperBound = quasiEstimate < 97000;

            Assert.IsTrue(aboveLowerBound && belowUpperBound, "Unexpected estimate for difference.");
        }