public void ReverseAddDifferentSizesTest()
{
    var addSize = 10000;
    var testData = DataGenerator.Generate().Take(addSize).ToArray();
    var testData2 = DataGenerator.Generate().Skip(addSize).Take(addSize).ToArray();
    var errorRate = 0.001F;
    var size = testData.Length;
    var configuration = new KeyValueBloomFilterConfiguration();
    var bloomFilter = new InvertibleReverseBloomFilter<TestEntity, long, sbyte>(configuration);
    bloomFilter.Initialize(4 * size, errorRate);
    foreach (var itm in testData)
    {
        bloomFilter.Add(itm);
    }
    var bloomFilter2 = new InvertibleReverseBloomFilter<TestEntity, long, sbyte>(configuration);
    //We have to create a foldable version.
    var data = bloomFilter.Extract();
    var foldFactor = configuration.FoldingStrategy
        .GetAllFoldFactors(data.BlockSize)
        .Where(f => f > 1)
        .OrderBy(f => f)
        .First();
    bloomFilter2.Initialize(addSize, data.BlockSize / foldFactor, data.HashFunctionCount);
    foreach (var itm in testData2)
    {
        bloomFilter2.Add(itm);
    }
    bloomFilter.Add(bloomFilter2);
    var contained = testData.Union(testData2).Count(item => bloomFilter.Contains(item));
    Assert.AreEqual(2 * addSize, contained, "Not all items found in added Bloom filters");
}
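//Note on the fold step above: two filters can only be added when their block sizes line up,
//so the second filter is sized at the first filter's block size divided by one of its valid
//fold factors. A minimal compatibility check, sketched with the same FoldingStrategy API the
//test uses (otherData is a hypothetical extract of the filter to be added):
//  var data = bloomFilter.Extract();
//  var compatible = configuration.FoldingStrategy
//      .GetAllFoldFactors(data.BlockSize)
//      .Any(f => data.BlockSize / f == otherData.BlockSize);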
public void ReverseAddTest()
{
    var addSize = 10000;
    var testData = DataGenerator.Generate().Take(addSize).ToArray();
    var testData2 = DataGenerator.Generate().Skip(addSize).Take(addSize).ToArray();
    var errorRate = 0.001F;
    var size = testData.Length;
    var configuration = new KeyValueBloomFilterConfiguration();
    var bloomFilter = new InvertibleReverseBloomFilter<TestEntity, long, sbyte>(configuration);
    bloomFilter.Initialize(2 * size, errorRate);
    foreach (var itm in testData)
    {
        bloomFilter.Add(itm);
    }
    var bloomFilter2 = new InvertibleReverseBloomFilter<TestEntity, long, sbyte>(configuration);
    bloomFilter2.Initialize(2 * size, errorRate);
    foreach (var itm in testData2)
    {
        bloomFilter2.Add(itm);
    }
    bloomFilter.Add(bloomFilter2);
    var contained = testData.Union(testData2).Count(item => bloomFilter.Contains(item));
    Assert.AreEqual(2 * addSize, contained, "Not all items found in added Bloom filters");
}
public void ReverseRemoveKeyTest()
{
    var addSize = 10000;
    var testData = DataGenerator.Generate().Take(addSize).ToArray();
    var errorRate = 0.001F;
    var size = testData.Length;
    var configuration = new KeyValueBloomFilterConfiguration();
    var bloomFilter = new InvertibleReverseBloomFilter<TestEntity, long, sbyte>(configuration);
    bloomFilter.Initialize(2 * size, errorRate);
    foreach (var itm in testData)
    {
        bloomFilter.Add(itm);
    }
    try
    {
        foreach (var item in testData.Take(addSize / 2))
        {
            bloomFilter.RemoveKey(item.Id);
        }
        Assert.Fail("RemoveKey should not be supported by a reverse invertible Bloom filter");
    }
    catch (NotSupportedException)
    {
        //expected: key-only removal is not supported by the reverse filter.
    }
}
public void ReverseRemoveItemTest()
{
    var addSize = 10000;
    var testData = DataGenerator.Generate().Take(addSize).ToArray();
    var errorRate = 0.001F;
    var size = testData.Length;
    var configuration = new KeyValueBloomFilterConfiguration();
    var bloomFilter = new InvertibleReverseBloomFilter<TestEntity, long, sbyte>(configuration);
    bloomFilter.Initialize(2 * size, errorRate);
    foreach (var itm in testData)
    {
        bloomFilter.Add(itm);
    }
    var contained = testData.Count(item => bloomFilter.Contains(item));
    foreach (var item in testData.Take(addSize / 2))
    {
        bloomFilter.Remove(item);
    }
    var containedAfterRemove = testData.Count(item => bloomFilter.Contains(item));
    //tricky: assuming zero false positives.
    Assert.AreEqual(contained, containedAfterRemove * 2, "Wrong item count after removal.");
}
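//The assertion above works out as follows: with zero false positives, all addSize items are
//found before the removal (contained == addSize) and exactly half remain afterwards
//(containedAfterRemove == addSize / 2), so contained == containedAfterRemove * 2. A single
//false positive on a removed item breaks the equality, which is what makes the test tricky.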
public void ReverseIntersectDifferentFiltersTest()
{
    var addSize = 10000;
    var testData = DataGenerator.Generate().Take(addSize).ToArray();
    var errorRate = 0.001F;
    var size = testData.Length;
    var configuration = new KeyValueBloomFilterConfiguration();
    var bloomFilter = new InvertibleReverseBloomFilter<TestEntity, long, sbyte>(configuration);
    bloomFilter.Initialize(2 * size, errorRate);
    foreach (var itm in testData)
    {
        bloomFilter.Add(itm);
    }
    var bloomFilter2 = new InvertibleReverseBloomFilter<TestEntity, long, sbyte>(configuration);
    bloomFilter2.Initialize(2 * size, errorRate);
    foreach (var itm in testData.Skip(1000))
    {
        bloomFilter2.Add(itm);
    }
    bloomFilter.Intersect(bloomFilter2);
    Assert.AreEqual(9000, bloomFilter.ItemCount);
    var count = testData.Skip(1000).Count(bloomFilter.Contains);
    //Note: intersect introduces a horrible error rate when utilizing XOR, so don't actually use intersect.
    //There are however definitions of the operations possible where intersect would not have this effect.
    Assert.IsTrue(count > 6700);
    Assert.IsTrue(testData.Take(1000).All(d => !bloomFilter.Contains(d)));
}
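//The loose bound above (6700 of 9000) reflects the comment in the test: an XOR-based
//intersect corrupts cells shared between surviving and non-surviving items, so genuine
//members can turn into false negatives. The threshold is an empirical floor for these
//parameters, not a property guaranteed by the filter.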
public void ReverseIntersectEqualFiltersTest()
{
    var addSize = 10000;
    var testData = DataGenerator.Generate().Take(addSize).ToArray();
    var errorRate = 0.001F;
    var size = testData.Length;
    var configuration = new KeyValueBloomFilterConfiguration();
    var bloomFilter = new InvertibleReverseBloomFilter<TestEntity, long, sbyte>(configuration);
    bloomFilter.Initialize(2 * size, errorRate);
    foreach (var itm in testData)
    {
        bloomFilter.Add(itm);
    }
    var bloomFilter2 = new InvertibleReverseBloomFilter<TestEntity, long, sbyte>(configuration);
    bloomFilter2.Initialize(2 * size, errorRate);
    foreach (var itm in testData)
    {
        bloomFilter2.Add(itm);
    }
    bloomFilter.Intersect(bloomFilter2);
    Assert.AreEqual(addSize, bloomFilter.ItemCount);
    Assert.IsTrue(testData.All(bloomFilter.Contains));
}
public void ReverseFalsePositiveTest()
{
    var addSize = 10000;
    var testData = DataGenerator.Generate().Take(addSize).ToArray();
    var errorRate = 0.001F;
    var size = testData.Length;
    var configuration = new KeyValueBloomFilterConfiguration();
    var bloomFilter = new InvertibleReverseBloomFilter<TestEntity, long, sbyte>(configuration);
    bloomFilter.Initialize(size, errorRate);
    foreach (var itm in testData)
    {
        bloomFilter.Add(itm);
    }
    var notFoundCount = testData.Count(itm => !bloomFilter.Contains(itm));
    Assert.IsTrue(notFoundCount == 0, "False negative error rate violated");
    try
    {
        notFoundCount = testData.Count(itm => !bloomFilter.ContainsKey(itm.Id));
        Assert.Fail("Invertible reverse Bloom filter does not support ContainsKey.");
    }
    catch (NotSupportedException) { }
    //Contains on a reverse filter is rather unreliable, hence the generous 20x margin on the error rate.
    var falsePositiveCount = DataGenerator.Generate().Skip(addSize).Take(addSize).Count(itm => bloomFilter.Contains(itm));
    Assert.IsTrue(falsePositiveCount <= 20 * errorRate * addSize, "False positive error rate violated");
    try
    {
        notFoundCount = DataGenerator.Generate().Skip(addSize).Take(addSize).Count(itm => bloomFilter.ContainsKey(itm.Id));
        Assert.Fail("Invertible reverse Bloom filter does not support ContainsKey.");
    }
    catch (NotSupportedException) { }
}
public void ReverseSimpleFold()
{
    var addSize = 50;
    var testData = DataGenerator.Generate().Take(addSize).ToArray();
    var size = testData.Length;
    var configuration = new KeyValueBloomFilterConfiguration();
    var bloomFilter = new InvertibleReverseBloomFilter<TestEntity, long, sbyte>(configuration);
    bloomFilter.Initialize(size, 1024, (uint)3);
    foreach (var itm in testData)
    {
        bloomFilter.Add(itm);
    }
    var positiveCount = DataGenerator.Generate().Take(500).Count(itm => bloomFilter.Contains(itm));
    var folded = bloomFilter.Fold(4);
    //query the folded filter, not the original, so the fold is actually exercised.
    var positiveCountAfterFold = DataGenerator.Generate().Take(500).Count(itm => folded.Contains(itm));
    Assert.AreEqual(positiveCount, positiveCountAfterFold, "False positive count different after fold");
    Assert.AreEqual(256, folded.Extract().BlockSize);
    Assert.IsTrue(testData.All(item => folded.Contains(item)), "False negative found");
}
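//A standalone sketch of the fold operation exercised above, under the same assumptions
//(the three-argument Initialize takes capacity, block size and hash function count):
//  var filter = new InvertibleReverseBloomFilter<TestEntity, long, sbyte>(
//      new KeyValueBloomFilterConfiguration());
//  filter.Initialize(50, 1024, (uint)3);
//  //folding by 4 shrinks the block size from 1024 to 256; items added before the
//  //fold are still reported as present by the folded filter.
//  var folded = filter.Fold(4);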
public void ReverseInvertibleBloomFilterEmptySetDiffTest()
{
    var addSize = 1000;
    var modCount = addSize;
    var dataSet1 = DataGenerator.Generate().Take(0).ToList();
    var dataSet2 = DataGenerator.Generate().Take(addSize).ToList();
    dataSet2.Modify(modCount);
    var configuration = new KeyValueBloomFilterConfiguration();
    var bloomFilter = new InvertibleReverseBloomFilter<TestEntity, long, sbyte>(configuration);
    bloomFilter.Initialize(10 * modCount, 0.0001F);
    foreach (var itm in dataSet1)
    {
        bloomFilter.Add(itm);
    }
    var secondBloomFilter = new InvertibleReverseBloomFilter<TestEntity, long, sbyte>(configuration);
    secondBloomFilter.Initialize(10 * modCount, 0.0001F);
    foreach (var itm in dataSet2)
    {
        secondBloomFilter.Add(itm);
    }
    var changed = new HashSet<long>();
    var onlyInFirst = new HashSet<long>();
    var onlyInSecond = new HashSet<long>();
    var decoded = bloomFilter.SubtractAndDecode(secondBloomFilter, onlyInFirst, onlyInSecond, changed);
    var onlyInSet1 = dataSet1.Where(d => dataSet2.All(d2 => d2.Id != d.Id)).Select(d => d.Id).OrderBy(id => id).ToArray();
    var onlyInSet2 = dataSet2.Where(d => dataSet1.All(d1 => d1.Id != d.Id)).Select(d => d.Id).OrderBy(id => id).ToArray();
    var modified = dataSet1.Where(d => dataSet2.Any(d2 => d2.Id == d.Id && d2.Value != d.Value)).Select(d => d.Id).OrderBy(id => id).ToArray();
    //fairly sensitive to decoding errors, for the same reason Contains is rather unreliable:
    //the pure function does not check the id value and hash value.
    Assert.IsTrue(decoded.HasValue, "Decoding failed");
    Assert.AreEqual(onlyInSet1.Length, onlyInFirst.Count, "Incorrect number of changes detected on 'only in set 1'");
    Assert.AreEqual(onlyInSet2.Length, onlyInSecond.Count, "Incorrect number of changes detected on 'only in set 2'");
    Assert.AreEqual(modified.Length, changed.Count, "Incorrect number of modified items detected");
}
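//SubtractAndDecode above returns a nullable bool; this test only checks decoded.HasValue,
//and the performance tests below treat anything other than true as a failed decode. The
//exact meaning of false versus null is an assumption here: presumably false signals a
//detected decode failure and null an indeterminate result.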
public void ReverseCompressTest()
{
    var addSize = 10000;
    var errorRate = 0.001F;
    var data = DataGenerator.Generate().Take(addSize).ToArray();
    var filter = new InvertibleReverseBloomFilter<TestEntity, long, sbyte>(new KeyValueBloomFilterConfiguration());
    filter.Initialize(50 * data.Length, errorRate);
    Assert.AreEqual(500000, filter.Capacity, "Unexpected size of reverse Bloom filter.");
    foreach (var item in data)
    {
        filter.Add(item);
    }
    //check the error rate before compressing.
    var falsePositiveCount = DataGenerator.Generate().Skip(addSize).Take(10000).Count(itm => filter.Contains(itm));
    Assert.IsTrue(falsePositiveCount <= 4 * errorRate * addSize, "Uncompressed reverse Bloom filter exceeded error rate.");
    filter.Compress(true);
    Assert.AreEqual(12820, filter.Capacity, "Unexpected size of compressed reverse Bloom filter.");
    var compressedFalsePositiveCount = DataGenerator.Generate().Skip(addSize).Take(10000).Count(itm => filter.Contains(itm));
    Assert.IsTrue(compressedFalsePositiveCount <= 4 * errorRate * addSize, "Compressed reverse Bloom filter exceeded error rate.");
}
public void SplitRibfDecodePerformance()
{
    var configuration = new KeyValueLargeBloomFilterConfiguration();
    var size = new[] { 1000, 10000, 100000 };
    var modPercentage = new[] { 0, 0.01D, 0.1D, 0.2D, 0.5D, 1.0D };
    foreach (var s in size)
    {
        using (var writer = new StreamWriter(File.Open($"splitribfdecode-{s}.csv", FileMode.Create)))
        {
            writer.WriteLine("timeInMs,sizeInBytes,capacity,modCount,detectedModCount,countDiff,countDiffSd,decodeSuccessRate");
            foreach (var mod in modPercentage)
            {
                foreach (var capacityPercentage in new[] { 0.5, 1, 2, 5, 10, 100 })
                {
                    //two filters are serialized per run, hence twice as many size measurements.
                    var sizeInBytes = new long[100];
                    var timeSpan = new long[50];
                    var countAggregate = new int[50];
                    var modCountResultAggregate = new int[50];
                    var decodeResult = new int[50];
                    for (var run = 0; run < 50; run++)
                    {
                        var dataSet1 = DataGenerator.Generate().Take(s).ToList();
                        var dataSet2 = DataGenerator.Generate().Take(s).ToList();
                        dataSet2.Modify((int)(s * mod));
                        var onlyInSet1 = dataSet1.Where(d => dataSet2.All(d2 => d2.Id != d.Id)).Select(d => d.Id).OrderBy(id => id).ToArray();
                        var onlyInSet2 = dataSet2.Where(d => dataSet1.All(d1 => d1.Id != d.Id)).Select(d => d.Id).OrderBy(id => id).ToArray();
                        var modified = dataSet1.Where(d => dataSet2.Any(d2 => d2.Id == d.Id && d2.Value != d.Value)).Select(d => d.Id).OrderBy(id => id).ToArray();
                        var idealCapacity = Math.Max(15, onlyInSet1.Length + onlyInSet2.Length + modified.Length);
                        var stopWatch = new Stopwatch();
                        stopWatch.Start();
                        var bloomFilter1 = new InvertibleReverseBloomFilter<TestEntity, long, int>(configuration);
                        bloomFilter1.Initialize((int)(idealCapacity * capacityPercentage), 0.01F);
                        foreach (var item in dataSet1)
                        {
                            bloomFilter1.Add(item);
                        }
                        var bloomFilter2 = new InvertibleReverseBloomFilter<TestEntity, long, int>(configuration);
                        bloomFilter2.Initialize((int)(idealCapacity * capacityPercentage), 0.01F);
                        foreach (var item in dataSet2)
                        {
                            bloomFilter2.Add(item);
                        }
                        var s1 = new HashSet<long>();
                        var s2 = new HashSet<long>();
                        var s3 = new HashSet<long>();
                        var success = bloomFilter1.SubtractAndDecode(bloomFilter2, s1, s2, s3);
                        stopWatch.Stop();
                        using (var stream = new MemoryStream())
                        {
                            _protobufTypeModel.Serialize(stream, bloomFilter1.Extract());
                            stream.Position = 0;
                            sizeInBytes[run] = stream.Length;
                        }
                        using (var stream = new MemoryStream())
                        {
                            _protobufTypeModel.Serialize(stream, bloomFilter2.Extract());
                            stream.Position = 0;
                            sizeInBytes[50 + run] = stream.Length;
                        }
                        timeSpan[run] = stopWatch.ElapsedMilliseconds;
                        countAggregate[run] = onlyInSet1.Length + onlyInSet2.Length + modified.Length;
                        modCountResultAggregate[run] = s1.Union(s2).Union(s3).Count(v => onlyInSet1.Contains(v) || onlyInSet2.Contains(v) || modified.Contains(v));
                        decodeResult[run] = success == true ? 1 : 0;
                    }
                    var countAvg = (long)countAggregate.Average();
                    var modCountResult = (long)modCountResultAggregate.Average();
                    var differenceResult = modCountResultAggregate.Select((r, i) => r - countAggregate[i]).ToArray();
                    var differenceSd = Math.Sqrt(differenceResult.Variance());
                    writer.WriteLine($"{timeSpan.Average()},{sizeInBytes.Average()},{Math.Max(15, capacityPercentage * (int)(s * mod))},{countAvg},{modCountResult},{(long)differenceResult.Average()},{differenceSd},{1.0D * decodeResult.Sum() / 50}");
                }
            }
        }
    }
}
public void RibfDecodePerSizePerformance()
{
    var configuration = new KeyValueLargeBloomFilterConfiguration();
    var size = new[] { 1000, 10000, 100000 };
    var modPercentage = new[] { 0, 0.01D, 0.1D, 0.2D, 0.5D, 1.0D };
    foreach (var s in size)
    {
        using (var writer = new StreamWriter(File.Open($"ribfdecodespersize-{s}.csv", FileMode.Create)))
        {
            writer.WriteLine("capacity,modCount,estimatedModCount,size,decodesPerSize,decodesPerSizeSd,decodeSuccessRate");
            foreach (var mod in modPercentage)
            {
                foreach (var capacity in new[] { 10, 100, 500, 1000, 2000, 5000, 10000 })
                {
                    var countSize = 0;
                    var decodesPerSize = new double[50];
                    var decodeResult = new int[50];
                    var modCountResultAggregate = new int[50];
                    for (var run = 0; run < 50; run++)
                    {
                        var dataSet1 = DataGenerator.Generate().Take(s).ToList();
                        var dataSet2 = DataGenerator.Generate().Take(s).ToList();
                        dataSet2.Modify((int)(s * mod));
                        var onlyInSet1 = dataSet1.Where(d => dataSet2.All(d2 => d2.Id != d.Id)).Select(d => d.Id).OrderBy(id => id).ToArray();
                        var onlyInSet2 = dataSet2.Where(d => dataSet1.All(d1 => d1.Id != d.Id)).Select(d => d.Id).OrderBy(id => id).ToArray();
                        var modified = dataSet1.Where(d => dataSet2.Any(d2 => d2.Id == d.Id && d2.Value != d.Value)).Select(d => d.Id).OrderBy(id => id).ToArray();
                        var bloomFilter1 = new InvertibleReverseBloomFilter<TestEntity, long, int>(configuration);
                        bloomFilter1.Initialize(capacity, 0.01F);
                        foreach (var item in dataSet1)
                        {
                            bloomFilter1.Add(item);
                        }
                        var bloomFilter2 = new InvertibleReverseBloomFilter<TestEntity, long, int>(configuration);
                        bloomFilter2.Initialize(capacity, 0.01F);
                        foreach (var item in dataSet2)
                        {
                            bloomFilter2.Add(item);
                        }
                        var s1 = new HashSet<long>();
                        var s2 = new HashSet<long>();
                        var s3 = new HashSet<long>();
                        decodeResult[run] = bloomFilter1.SubtractAndDecode(bloomFilter2, s1, s2, s3) == true ? 1 : 0;
                        var mods = s1.Union(s2).Union(s3).ToArray();
                        modCountResultAggregate[run] = mods.Count(v => onlyInSet1.Contains(v) || onlyInSet2.Contains(v) || modified.Contains(v));
                        decodesPerSize[run] = 1.0D * modCountResultAggregate[run] / bloomFilter1.Extract().Counts.Length;
                        countSize = bloomFilter1.Extract().Counts.Length;
                    }
                    writer.WriteLine($"{capacity},{s * mod},{modCountResultAggregate.Average()},{countSize},{decodesPerSize.Average()},{Math.Sqrt(decodesPerSize.Variance())},{decodeResult.Average()}");
                }
            }
        }
    }
}