/// <summary>
/// Adds two invertible Bloom filters with different block sizes and checks that every item of
/// both data sets is still reported as contained. The first filter is sized by capacity and
/// error rate; the second reuses the first filter's hash function count but gets a block size
/// divided by the smallest fold factor greater than one. Both filters are populated in parallel.
/// </summary>
public void ParallelInvertibleAddDifferentSizesTest()
{
    var addSize = 10000;
    var testData = DataGenerator.Generate().Take(addSize).ToArray();
    var testData2 = DataGenerator.Generate().Skip(addSize).Take(addSize).ToArray();
    var errorRate = 0.001F;
    var size = testData.Length;
    var configuration = new DefaultBloomFilterConfiguration();

    var bloomFilter = new InvertibleBloomFilter<TestEntity, long, sbyte>(configuration);
    bloomFilter.Initialize(4 * size, errorRate);
    Parallel.ForEach(Partitioner.Create(testData, true), d => bloomFilter.Add(d));

    var bloomFilter2 = new InvertibleBloomFilter<TestEntity, long, sbyte>(configuration);
    //We have to create a foldable version: derive the second block size from the first one.
    var data = bloomFilter.Extract();
    var foldFactor = configuration.FoldingStrategy
        .GetAllFoldFactors(data.BlockSize)
        .Where(f => f > 1)
        .OrderBy(f => f)
        .First();
    bloomFilter2.Initialize(addSize, data.BlockSize / foldFactor, data.HashFunctionCount);
    Parallel.ForEach(Partitioner.Create(testData2, true), d => bloomFilter2.Add(d));

    //add the bloom filters.
    bloomFilter.Add(bloomFilter2);

    var contained = testData.Union(testData2).Count(item => bloomFilter.Contains(item));
    // Expected value goes first so a failure message reports the numbers the right way round.
    Assert.AreEqual(2 * addSize, contained, "Not all items found in added Bloom filters");
}
/// <summary>
/// Adds a set of items, removes the first half of them by key and verifies that the number
/// of items reported as contained before removal is twice the number reported afterwards.
/// </summary>
public void InvertibleRemoveKeyTest()
{
    var itemCount = 10000;
    var entities = DataGenerator.Generate().Take(itemCount).ToArray();
    var errorRate = 0.001F;
    var configuration = new DefaultBloomFilterConfiguration();

    var filter = new InvertibleBloomFilter<TestEntity, long, sbyte>(configuration);
    filter.Initialize(2 * entities.Length, errorRate);
    foreach (var entity in entities)
    {
        filter.Add(entity);
    }

    var foundBeforeRemoval = entities.Count(candidate => filter.Contains(candidate));

    var firstHalf = entities.Take(itemCount / 2);
    foreach (var removed in firstHalf)
    {
        filter.RemoveKey(removed.Id);
    }

    var foundAfterRemoval = entities.Count(candidate => filter.Contains(candidate));

    //tricky: assuming zero false positives.
    Assert.AreEqual(foundBeforeRemoval, foundAfterRemoval * 2, "Wrong item count after removal.");
}
/// <summary>
/// Intersects a filter holding all items with a filter holding all but the first 1000 items,
/// then checks the resulting item count and how many of the shared items are still found.
/// </summary>
public void InvertibleIntersectDifferentFiltersTest()
{
    var itemCount = 10000;
    var entities = DataGenerator.Generate().Take(itemCount).ToArray();
    var errorRate = 0.001F;
    var capacity = 2 * entities.Length;
    var configuration = new DefaultBloomFilterConfiguration();

    // Filter over the complete data set.
    var fullFilter = new InvertibleBloomFilter<TestEntity, long, sbyte>(configuration);
    fullFilter.Initialize(capacity, errorRate);
    foreach (var entity in entities)
    {
        fullFilter.Add(entity);
    }

    // Filter over the data set minus its first 1000 items.
    var partialFilter = new InvertibleBloomFilter<TestEntity, long, sbyte>(configuration);
    partialFilter.Initialize(capacity, errorRate);
    foreach (var entity in entities.Skip(1000))
    {
        partialFilter.Add(entity);
    }

    fullFilter.Intersect(partialFilter);
    Assert.AreEqual(9000, fullFilter.ItemCount);

    var sharedFound = entities.Skip(1000).Count(candidate => fullFilter.Contains(candidate));
    //Note: intersect introduces a horrible error rate when utilizing XOR, so don't actually use intersect.
    //There are however definitions of operations possible where the intersect would not have this horrible effect.
    Assert.IsTrue(sharedFound > 7800);
}
/// <summary>
/// Intersects two filters that were filled with the same items and verifies that the item
/// count is unchanged and that every item is still reported as contained.
/// </summary>
public void InvertibleIntersectEqualFiltersTest()
{
    var itemCount = 10000;
    var entities = DataGenerator.Generate().Take(itemCount).ToArray();
    var errorRate = 0.001F;
    var capacity = 2 * entities.Length;
    var configuration = new DefaultBloomFilterConfiguration();

    var left = new InvertibleBloomFilter<TestEntity, long, sbyte>(configuration);
    left.Initialize(capacity, errorRate);
    foreach (var entity in entities)
    {
        left.Add(entity);
    }

    var right = new InvertibleBloomFilter<TestEntity, long, sbyte>(configuration);
    right.Initialize(capacity, errorRate);
    foreach (var entity in entities)
    {
        right.Add(entity);
    }

    left.Intersect(right);

    Assert.AreEqual(itemCount, left.ItemCount);
    var everythingFound = entities.All(candidate => left.Contains(candidate));
    Assert.IsTrue(everythingFound);
}
/// <summary>
/// Fills two equally sized invertible Bloom filters with disjoint slices of the generated data,
/// adds the second filter to the first and checks that every item of both slices is reported
/// as contained by the combined filter.
/// </summary>
public void InvertibleAddTest()
{
    var addSize = 10000;
    var testData = DataGenerator.Generate().Take(addSize).ToArray();
    var testData2 = DataGenerator.Generate().Skip(addSize).Take(addSize).ToArray();
    var errorRate = 0.001F;
    var size = testData.Length;
    var configuration = new DefaultBloomFilterConfiguration();

    var bloomFilter = new InvertibleBloomFilter<TestEntity, long, sbyte>(configuration);
    bloomFilter.Initialize(2 * size, errorRate);
    foreach (var itm in testData)
    {
        bloomFilter.Add(itm);
    }

    var bloomFilter2 = new InvertibleBloomFilter<TestEntity, long, sbyte>(configuration);
    bloomFilter2.Initialize(2 * size, errorRate);
    foreach (var itm in testData2)
    {
        bloomFilter2.Add(itm);
    }

    bloomFilter.Add(bloomFilter2);

    var contained = testData.Union(testData2).Count(item => bloomFilter.Contains(item));
    // Expected value goes first so a failure message reports the numbers the right way round.
    Assert.AreEqual(2 * addSize, contained, "Not all items found in added Bloom filters");
}
/// <summary>
/// Fills a filter with 100000 generated items, quasi-decodes it against a modified 8000 item
/// slice of the same generator output and checks that the estimate falls in the expected range.
/// </summary>
public void InvertibleBloomFilterQuasiDecodeTest()
{
    var itemCount = 100000;
    var inserted = DataGenerator.Generate().Take(itemCount).ToList();
    var errorRate = 0.001F;
    var configuration = new DefaultBloomFilterConfiguration();

    var filter = new InvertibleBloomFilter<TestEntity, long, sbyte>(configuration);
    filter.Initialize(itemCount, errorRate);
    foreach (var entity in inserted)
    {
        filter.Add(entity);
    }

    // Comparison set: a slice of the generator output with 1000 passed to Modify.
    var comparison = DataGenerator.Generate().Skip(500).Take(8000).ToList();
    comparison.Modify(1000);

    var estimate = filter.QuasiDecode(comparison);
    var withinRange = estimate > 90500 && estimate < 97000;
    Assert.IsTrue(withinRange, "Unexpected estimate for difference.");
}
/// <summary>
/// Folds a 1024 block filter by a factor of 4 and checks that the folded result has a block
/// size of 256, while the filter the fold was requested on still reports the same number of
/// positives and no false negatives.
/// </summary>
public void InvertibleSimpleFold()
{
    var itemCount = 50;
    var entities = DataGenerator.Generate().Take(itemCount).ToArray();
    var capacity = entities.Length;
    var configuration = new DefaultBloomFilterConfiguration();

    var filter = new InvertibleBloomFilter<TestEntity, long, sbyte>(configuration);
    filter.Initialize(capacity, 1024, (uint)3);
    foreach (var entity in entities)
    {
        filter.Add(entity);
    }

    var positivesBeforeFold = DataGenerator.Generate().Take(500).Count(candidate => filter.Contains(candidate));

    var folded = filter.Fold(4);

    // NOTE(review): the checks below query the filter Fold was called on, not the returned
    // 'folded' instance; only the block size assertion inspects 'folded'. Confirm that is intended.
    var positivesAfterFold = DataGenerator.Generate().Take(500).Count(candidate => filter.Contains(candidate));

    Assert.AreEqual(positivesBeforeFold, positivesAfterFold, "False positive count different after fold");
    Assert.AreEqual(256, folded.Extract().BlockSize);
    var noFalseNegatives = entities.All(candidate => filter.Contains(candidate));
    Assert.IsTrue(noFalseNegatives, "False negative found");
}
/// <summary>
/// Builds two filters from the same generated data, where the second data set has had
/// Modify applied, then subtracts and decodes them. Verifies that decoding succeeds and that
/// the number of ids reported as only-in-first and only-in-second matches a direct comparison
/// of the two data sets; the number of ids reported as changed may not exceed the number of
/// ids whose value actually differs.
/// </summary>
public void InvertibleBloomFilterSetDiffTest()
{
    var addSize = 1000;
    var modCount = 50;
    var dataSet1 = DataGenerator.Generate().Take(addSize).ToList();
    var dataSet2 = DataGenerator.Generate().Take(addSize).ToList();
    dataSet2.Modify(modCount);
    var configuration = new DefaultBloomFilterConfiguration();

    var bloomFilter = new InvertibleBloomFilter<TestEntity, long, sbyte>(configuration);
    bloomFilter.Initialize(10 * modCount, 0.0001F);
    foreach (var itm in dataSet1)
    {
        bloomFilter.Add(itm);
    }

    var secondBloomFilter = new InvertibleBloomFilter<TestEntity, long, sbyte>(configuration);
    secondBloomFilter.Initialize(10 * modCount, 0.0001F);
    foreach (var itm in dataSet2)
    {
        secondBloomFilter.Add(itm);
    }

    var changed = new HashSet<long>();
    var onlyInFirst = new HashSet<long>();
    var onlyInSecond = new HashSet<long>();
    var decoded = bloomFilter.SubtractAndDecode(secondBloomFilter, onlyInFirst, onlyInSecond, changed);

    // Reference counts computed directly from the data sets. Only the counts are compared,
    // so the ids are neither sorted nor materialized.
    var expectedOnlyInSet1 = dataSet1.Count(d => dataSet2.All(d2 => d2.Id != d.Id));
    var expectedOnlyInSet2 = dataSet2.Count(d => dataSet1.All(d1 => d1.Id != d.Id));
    var modifiedCount = dataSet1.Count(d => dataSet2.Any(d2 => d2.Id == d.Id && d2.Value != d.Value));

    Assert.IsTrue(decoded.HasValue, "Decoding failed");
    // AreEqual (expected first) reports both values on failure, unlike IsTrue(a == b).
    Assert.AreEqual(expectedOnlyInSet1, onlyInFirst.Count, "Incorrect number of changes detected on 'only in set 1'");
    Assert.AreEqual(expectedOnlyInSet2, onlyInSecond.Count, "Incorrect number of changes detected on 'only in set 2'");
    //very bad at recognizing changes.
    Assert.IsTrue(changed.Count <= modifiedCount, "Incorrect number of modified items detected");
}
/// <summary>
/// Initializes a filter with 50 times the capacity needed for the data, checks the false
/// positive count against the error rate, compresses the filter and then checks both the
/// resulting capacity and the false positive count again.
/// </summary>
public void InvertibleCompressTest()
{
    var addSize = 10000;
    var errorRate = 0.001F;
    var data = DataGenerator.Generate().Take(addSize).ToArray();
    var filter = new InvertibleBloomFilter<TestEntity, long, sbyte>(new DefaultBloomFilterConfiguration());
    filter.Initialize(50 * data.Length, errorRate);
    // Expected value goes first so a failure message reports the numbers the right way round.
    Assert.AreEqual(500000, filter.Capacity, "Unexpected size of Bloom filter.");
    foreach (var item in data)
    {
        filter.Add(item);
    }

    //check error rate: items past the first addSize were never added, so every hit is a false positive.
    var falsePositiveCount = DataGenerator.Generate().Skip(addSize).Take(10000).Count(itm => filter.Contains(itm));
    Assert.IsTrue(falsePositiveCount <= errorRate * addSize, "Uncompressed Bloom filter exceeded error rate.");

    filter.Compress(true);
    Assert.AreEqual(12820, filter.Capacity, "Unexpected size of compressed Bloom filter.");

    var compressedFalsePositiveCount = DataGenerator.Generate().Skip(addSize).Take(10000).Count(itm => filter.Contains(itm));
    Assert.IsTrue(compressedFalsePositiveCount <= errorRate * addSize, "Compressed Bloom filter exceeded error rate.");
}
/// <summary>
/// Checks that every added item is found both by entity and by key (no false negatives), and
/// that the number of never-added items reported as contained stays within the configured
/// error rate for both lookups.
/// </summary>
public void InvertibleFalsePositiveTest()
{
    var itemCount = 10000;
    var entities = DataGenerator.Generate().Take(itemCount).ToArray();
    var errorRate = 0.001F;
    var capacity = entities.Length;
    var configuration = new DefaultBloomFilterConfiguration();

    var filter = new InvertibleBloomFilter<TestEntity, long, sbyte>(configuration);
    filter.Initialize(capacity, errorRate);
    foreach (var entity in entities)
    {
        filter.Add(entity);
    }

    // Added items must always be found, by entity and by key.
    var missedByEntity = entities.Count(candidate => !filter.Contains(candidate));
    Assert.IsTrue(missedByEntity == 0, "False negative error rate violated");

    var missedByKey = entities.Count(candidate => !filter.ContainsKey(candidate.Id));
    Assert.IsTrue(missedByKey == 0, "False negative error rate violated on ContainsKey");

    // Items past the first itemCount were never added, so every hit is a false positive.
    var falseHitsByEntity = DataGenerator.Generate()
        .Skip(itemCount)
        .Take(itemCount)
        .Count(candidate => filter.Contains(candidate));
    Assert.IsTrue(falseHitsByEntity <= errorRate * itemCount, "False positive error rate violated");

    var falseHitsByKey = DataGenerator.Generate()
        .Skip(itemCount)
        .Take(itemCount)
        .Count(candidate => filter.ContainsKey(candidate.Id));
    Assert.IsTrue(falseHitsByKey <= errorRate * itemCount, "False positive error rate violated on ContainsKey");
}
/// <summary>
/// Create filters: recomputes the hash function count and fills every slot of the strata
/// filter array. Slots at or beyond the maximum strata are set to null; all other slots get
/// a lazily created invertible Bloom filter that is initialized and rehydrated on first access.
/// </summary>
/// <param name="estimatorData">Filter data to rehydrate. May be null (the default); in that
/// case null filter data is passed to Rehydrate for every stratum.</param>
private void CreateFilters(IStrataEstimatorData<int, TCount> estimatorData = null)
{
    var configuration = Configuration.ConvertToEstimatorConfiguration();
    HashFunctionCount = configuration.BestHashFunctionCount(BlockSize, ErrorRate);
    for (var idx = 0; idx < StrataFilters.Length; idx++)
    {
        if (idx >= MaxStrata)
        {
            StrataFilters[idx] = null;
            continue;
        }

        // The parameter defaults to null, so guard the lookup instead of dereferencing it.
        // NOTE(review): if GetFilterForStrata is an extension method that already tolerates a
        // null receiver this guard is redundant but harmless - confirm.
        var filterData = estimatorData == null ? null : estimatorData.GetFilterForStrata(idx);

        //lazily create Strata filters.
        StrataFilters[idx] = new Lazy<InvertibleBloomFilter<KeyValuePair<int, int>, int, TCount>>(() =>
        {
            var res = new InvertibleBloomFilter<KeyValuePair<int, int>, int, TCount>(configuration);
            //capacity doesn't really matter, the capacity is basically the block size.
            res.Initialize(BlockSize, BlockSize, HashFunctionCount);
            res.Rehydrate(filterData);
            return res;
        });
    }
}