/// <summary>
/// Fills two estimators with the same data, checks that they decode to zero differences,
/// then removes 100 items from the first estimator and checks that the decoded estimate
/// rises above 100.
/// </summary>
public void HybridEstimatorRemoveTest()
{
    //create data
    var data = DataGenerator.Generate().Take(10000).ToArray();

    //create the estimator
    var configuration = new KeyValueLargeBloomFilterConfiguration();
    var factory = new HybridEstimatorFactory();
    var estimator = factory.Create(configuration, data.Length);

    //add data to the estimator
    foreach (var element in data)
    {
        estimator.Add(element);
    }

    //create a second estimator holding exactly the same items
    var estimator2 = factory.Create(configuration, data.Length);
    foreach (var element in data)
    {
        estimator2.Add(element);
    }

    //estimate differences
    var estimateBeforeRemoval = estimator.Decode(estimator2);

    //expected value goes first, so a failure message reports expected/actual the right way round
    Assert.AreEqual(0, estimateBeforeRemoval, "Unexpected number of differences before removing items.");

    //remove 100 items from the estimator
    foreach (var item in data.Take(100))
    {
        estimator.Remove(item);
    }

    //compare differences
    var estimateAfterRemoval = estimator.Decode(estimator2);
    Assert.IsTrue(estimateAfterRemoval > 100, "Removal from estimator resulted in not enough differences.");
}
/// <summary>
/// Creates two deliberately oversized estimators, adds 50 distinct items to each, compresses
/// the first one and checks that the block size shrinks from 1170 to 65 while the decoded
/// estimate stays the same.
/// </summary>
public void HybridEstimatorCompressTest()
{
    var data = DataGenerator.Generate().Take(10000).ToArray();
    var configuration = new KeyValueLargeBloomFilterConfiguration();
    var factory = new HybridEstimatorFactory();

    //the estimator is created for far more items than are ever added, leaving room to compress
    var estimator = factory.Create(configuration, 2000 * data.Length);

    //expected value goes first, so a failure message reports expected/actual the right way round
    Assert.AreEqual(1170, estimator.BlockSize, "Unexpected block size before compression.");
    foreach (var element in data.Take(50))
    {
        estimator.Add(element);
    }

    //the second estimator receives the next 50 items, so the two item sets do not overlap
    var estimator2 = factory.Create(configuration, 2000 * data.Length);
    foreach (var element in data.Skip(50).Take(50))
    {
        estimator2.Add(element);
    }

    var estimateBeforeCompression = estimator.Decode(estimator2);

    //NOTE(review): the meaning of the 'true' argument is not visible here - confirm against Compress.
    estimator.Compress(true);
    Assert.AreEqual(65, estimator.BlockSize, "Compression resulted in unexpected block size.");

    var estimateAfterCompression = estimator.Decode(estimator2);

    //note: rather tricky. Both estimators should have the same item count, otherwise compression does impact the result,
    //since one estimator no longer fits in the compressed one.
    Assert.AreEqual(estimateBeforeCompression, estimateAfterCompression, "Estimate changed due to compression.");
}
/// <summary>
/// Fills one estimator with the generated data, changes the id of one group of items and the
/// value of another group, fills a second estimator with the changed data and checks that the
/// decoded estimate is at least the number of changed items.
/// </summary>
public void HybridEstimatorBasicFillAndEstimate()
{
    var data = DataGenerator.Generate().Take(10000).ToArray();
    var configuration = new KeyValueLargeBloomFilterConfiguration();
    var factory = new HybridEstimatorFactory();

    var estimator = factory.Create(configuration, data.Length);
    foreach (var element in data)
    {
        estimator.Add(element);
    }

    //expected value goes first, so a failure message reports expected/actual the right way round
    Assert.AreEqual(data.LongLength, estimator.ItemCount, "Estimator item count is wrong");

    var estimator2 = factory.Create(configuration, data.Length);
    var halfTheDiff = 100;

    //first half of the difference: change the id of the first items
    foreach (var elt in data.Take(halfTheDiff))
    {
        elt.Id += 1000000;
    }

    //second half of the difference: change the value of a disjoint group of items.
    //The offset has to stay within the 10000 generated items; the previous offset of 100000
    //skipped past the end of the array, so this loop never modified anything.
    foreach (var elt in data.Skip(1000).Take(halfTheDiff))
    {
        elt.Value += 10;
    }

    //add the (now partially modified) items to the second estimator
    foreach (var element in data)
    {
        estimator2.Add(element);
    }

    Assert.AreEqual(data.LongLength, estimator2.ItemCount, "Second estimator item count is wrong");

    var differenceCount = estimator.Decode(estimator2);
    Assert.IsTrue(differenceCount >= 2 * halfTheDiff, "Estimate below the difference count.");
}
/// <summary>
/// Lets one <c>PrecalculatedActor</c> determine its difference with a second actor that holds a
/// larger, partially modified data set, then checks the reported ids against the differences
/// computed directly from the two data sets: more than 3000 ids found, fewer than 50 false
/// positives and fewer than 25 false negatives.
/// </summary>
public void TestPrecalculatedRoundTrip()
{
    //choosing a counter type that is too small will result in many overflows (which manifests itself in horribly slow performance).
    //Keep the count type large enough, so extreme folds do not cause overflows. Benefit of folds outweighs benefit of small count types.
    var configuration = new KeyValueLargeBloomFilterConfiguration();
    IHybridEstimatorFactory estimatorFactory = new HybridEstimatorFactory();
    IInvertibleBloomFilterFactory bloomFilterFactory = new InvertibleBloomFilterFactory();

    //create the first actor
    var dataSet1 = DataGenerator.Generate().Take(15000).ToList();
    var actor1 = new PrecalculatedActor<short>(
        dataSet1,
        estimatorFactory,
        bloomFilterFactory,
        configuration);

    //create the second actor
    var dataSet2 = DataGenerator.Generate().Take(17000).ToList();
    dataSet2.Modify(1000);
    var actor2 = new PrecalculatedActor<short>(
        dataSet2,
        estimatorFactory,
        bloomFilterFactory,
        configuration);

    //have actor 1 determine the difference with actor 2.
    var timer = Stopwatch.StartNew();
    var result = actor1.GetDifference(actor2);
    timer.Stop();
    Console.WriteLine($"Time: {timer.ElapsedMilliseconds} ms");

    //analyze results
    var allFound = new HashSet<long>(result.Item1.Union(result.Item2).Union(result.Item3));
    Assert.IsTrue(allFound.Count > 3000, "Less than the expected number of diffferences found.");

    //hash based lookups keep the ground-truth computation linear in the size of the data sets
    var idsInSet1 = new HashSet<long>(dataSet1.Select(d => d.Id));
    var idsInSet2 = new HashSet<long>(dataSet2.Select(d => d.Id));
    var onlyInSet1 = new HashSet<long>(idsInSet1.Where(id => !idsInSet2.Contains(id)));
    var onlyInSet2 = new HashSet<long>(idsInSet2.Where(id => !idsInSet1.Contains(id)));

    //a lookup yields an empty sequence for an id that does not occur in the second data set
    var set2ById = dataSet2.ToLookup(d => d.Id);
    var modified = new HashSet<long>(
        dataSet1
            .Where(d => set2ById[d.Id].Any(d2 => d2.Value != d.Value))
            .Select(d => d.Id));

    //false positives: reported ids that are not an actual difference
    var falsePositives = allFound
        .Where(itm => !onlyInSet1.Contains(itm) && !onlyInSet2.Contains(itm) && !modified.Contains(itm))
        .ToArray();
    Assert.IsTrue(falsePositives.Length < 50, "Too many false positives found");

    //false negatives: actual differences that were not reported (each id counted once)
    var falseNegatives = onlyInSet1
        .Concat(onlyInSet2)
        .Concat(modified)
        .Where(itm => !allFound.Contains(itm))
        .Distinct()
        .ToArray();
    Assert.IsTrue(falseNegatives.Length < 25, "Too many false negatives found");
}
/// <summary>
/// Fills an estimator with 900000 generated items, modifies that same list and runs a quasi
/// decode of the estimator against it. The outcome is not asserted.
/// </summary>
public void HybridEstimatorQuasiRandomDecodeTest()
{
    var items = DataGenerator.Generate().Take(900000).ToList();
    var config = new KeyValueLargeBloomFilterConfiguration();
    var estimatorFactory = new HybridEstimatorFactory();

    var filledEstimator = estimatorFactory.Create(config, items.Count);
    items.ForEach(item => filledEstimator.Add(item));

    //an alternative, much smaller data set (Skip(500).Take(20)) was tried here and left disabled;
    //the list that was added above is modified instead.
    items.Modify(1000);

    //NOTE(review): the original comment expected a difference of about 91500. That figure cannot be
    //confirmed from this code and the result is not checked by any assertion.
    var quasiEstimate = filledEstimator.QuasiDecode(config, items);
}
/// <summary>
/// Fills an estimator with 100000 generated items, quasi decodes it against a smaller, modified
/// data set and checks that the estimate lies strictly between 90500 and 97000.
/// </summary>
public void HybridEstimatorQuasiDecodeTest()
{
    var config = new KeyValueLargeBloomFilterConfiguration();
    var estimatorFactory = new HybridEstimatorFactory();

    //the set the estimator is built from
    var estimatedItems = DataGenerator.Generate().Take(100000).ToList();
    var filledEstimator = estimatorFactory.Create(config, estimatedItems.Count);
    estimatedItems.ForEach(item => filledEstimator.Add(item));

    //the set to compare against: 8000 items taken after skipping the first 500, then modified
    var comparedItems = DataGenerator.Generate().Skip(500).Take(8000).ToList();
    comparedItems.Modify(1000);

    var quasiEstimate = filledEstimator.QuasiDecode(config, comparedItems);

    //actual difference is expected to be about 91500
    var withinExpectedRange = 90500 < quasiEstimate && quasiEstimate < 97000;
    Assert.IsTrue(withinExpectedRange, "Unexpected estimate for difference.");
}