Example #1
0
        /// <summary>
        /// Fills two invertible Bloom filters in parallel, the second one with a block size that is
        /// the first block size divided by a fold factor, adds the second filter to the first and
        /// verifies that every item of both data sets is reported as contained.
        /// </summary>
        public void ParallelInvertibleAddDifferentSizesTest()
        {
            var addSize       = 10000;
            var testData      = DataGenerator.Generate().Take(addSize).ToArray();
            var testData2     = DataGenerator.Generate().Skip(addSize).Take(addSize).ToArray();
            var errorRate     = 0.001F;
            var size          = testData.Length;
            var configuration = new DefaultBloomFilterConfiguration();
            var bloomFilter   = new InvertibleBloomFilter <TestEntity, long, sbyte>(configuration);

            bloomFilter.Initialize(4 * size, errorRate);
            Parallel.ForEach(Partitioner.Create(testData, true), d => bloomFilter.Add(d));

            var bloomFilter2 = new InvertibleBloomFilter <TestEntity, long, sbyte>(configuration);
            //We have to create a foldable version: pick the smallest fold factor above 1 for the first filter's block size.
            var data       = bloomFilter.Extract();
            var foldFactor = configuration.FoldingStrategy.GetAllFoldFactors(data.BlockSize).Where(f => f > 1).OrderBy(f => f).First();

            bloomFilter2.Initialize(addSize, data.BlockSize / foldFactor, data.HashFunctionCount);
            Parallel.ForEach(Partitioner.Create(testData2, true), d => bloomFilter2.Add(d));

            //add the bloom filters.
            bloomFilter.Add(bloomFilter2);
            var contained = testData.Union(testData2).Count(item => bloomFilter.Contains(item));

            //expected value first, actual second, so a failure reports the values the right way around.
            Assert.AreEqual(2 * addSize, contained, "Not all items found in added Bloom filters");
        }
Example #2
0
        /// <summary>
        /// Adds a data set to an invertible Bloom filter, removes the first half of the items by key
        /// and compares the number of contained items before and after the removal.
        /// </summary>
        public void InvertibleRemoveKeyTest()
        {
            var addSize   = 10000;
            var errorRate = 0.001F;
            var entities  = DataGenerator.Generate().Take(addSize).ToArray();
            var filter    = new InvertibleBloomFilter<TestEntity, long, sbyte>(new DefaultBloomFilterConfiguration());

            filter.Initialize(2 * entities.Length, errorRate);
            foreach (var entity in entities)
            {
                filter.Add(entity);
            }

            var countBeforeRemoval = entities.Count(entity => filter.Contains(entity));

            // Remove the first half of the data set, addressing each item by its key only.
            foreach (var removed in entities.Take(addSize / 2))
            {
                filter.RemoveKey(removed.Id);
            }

            var countAfterRemoval = entities.Count(entity => filter.Contains(entity));

            // Only holds when there are no false positives: exactly half of the items must remain.
            Assert.AreEqual(countBeforeRemoval, countAfterRemoval * 2, "Wrong item count after removal.");
        }
Example #3
0
        /// <summary>
        /// Intersects a filter holding the full data set with a filter holding all but the first
        /// 1000 items, then checks the resulting item count and how many of the shared items are
        /// still reported as contained.
        /// </summary>
        public void InvertibleIntersectDifferentFiltersTest()
        {
            var addSize       = 10000;
            var errorRate     = 0.001F;
            var entities      = DataGenerator.Generate().Take(addSize).ToArray();
            var capacity      = 2 * entities.Length;
            var configuration = new DefaultBloomFilterConfiguration();

            // First filter: the complete data set.
            var fullFilter = new InvertibleBloomFilter<TestEntity, long, sbyte>(configuration);
            fullFilter.Initialize(capacity, errorRate);
            foreach (var entity in entities)
            {
                fullFilter.Add(entity);
            }

            // Second filter: everything except the first 1000 items.
            var partialFilter = new InvertibleBloomFilter<TestEntity, long, sbyte>(configuration);
            partialFilter.Initialize(capacity, errorRate);
            foreach (var entity in entities.Skip(1000))
            {
                partialFilter.Add(entity);
            }

            fullFilter.Intersect(partialFilter);
            Assert.AreEqual(9000, fullFilter.ItemCount);

            var stillContained = entities.Skip(1000).Count(fullFilter.Contains);

            // Intersecting with XOR-based sums causes a very high error rate, so intersect should not
            // really be used; other operation definitions could avoid this effect. Hence the loose bound.
            Assert.IsTrue(stillContained > 7800);
        }
Example #4
0
        /// <summary>
        /// Intersects two filters that were filled with the same data set and checks that the
        /// item count is unchanged and that every item is still reported as contained.
        /// </summary>
        public void InvertibleIntersectEqualFiltersTest()
        {
            var addSize       = 10000;
            var errorRate     = 0.001F;
            var entities      = DataGenerator.Generate().Take(addSize).ToArray();
            var capacity      = 2 * entities.Length;
            var configuration = new DefaultBloomFilterConfiguration();

            var first = new InvertibleBloomFilter<TestEntity, long, sbyte>(configuration);
            first.Initialize(capacity, errorRate);
            foreach (var entity in entities)
            {
                first.Add(entity);
            }

            // The second filter receives exactly the same items as the first.
            var second = new InvertibleBloomFilter<TestEntity, long, sbyte>(configuration);
            second.Initialize(capacity, errorRate);
            foreach (var entity in entities)
            {
                second.Add(entity);
            }

            first.Intersect(second);

            Assert.AreEqual(addSize, first.ItemCount);
            Assert.IsTrue(entities.All(first.Contains));
        }
Example #5
0
        /// <summary>
        /// Fills two equally sized invertible Bloom filters with disjoint slices of the generated
        /// data, adds the second filter to the first and verifies that every item of both data sets
        /// is reported as contained.
        /// </summary>
        public void InvertibleAddTest()
        {
            var addSize       = 10000;
            var testData      = DataGenerator.Generate().Take(addSize).ToArray();
            var testData2     = DataGenerator.Generate().Skip(addSize).Take(addSize).ToArray();
            var errorRate     = 0.001F;
            var size          = testData.Length;
            var configuration = new DefaultBloomFilterConfiguration();
            var bloomFilter   = new InvertibleBloomFilter <TestEntity, long, sbyte>(configuration);

            bloomFilter.Initialize(2 * size, errorRate);
            foreach (var itm in testData)
            {
                bloomFilter.Add(itm);
            }
            var bloomFilter2 = new InvertibleBloomFilter <TestEntity, long, sbyte>(configuration);

            bloomFilter2.Initialize(2 * size, errorRate);
            foreach (var itm in testData2)
            {
                bloomFilter2.Add(itm);
            }
            //add the second filter to the first.
            bloomFilter.Add(bloomFilter2);
            var contained = testData.Union(testData2).Count(item => bloomFilter.Contains(item));

            //expected value first, actual second, so a failure reports the values the right way around.
            Assert.AreEqual(2 * addSize, contained, "Not all items found in added Bloom filters");
        }
Example #6
0
        /// <summary>
        /// Fills a filter with 100000 generated items, then quasi-decodes it against a modified
        /// 8000 item slice of the generated data and checks that the returned estimate falls
        /// inside the expected window.
        /// </summary>
        public void InvertibleBloomFilterQuasiDecodeTest()
        {
            var size      = 100000;
            var errorRate = 0.001F;
            var filter    = new InvertibleBloomFilter<TestEntity, long, sbyte>(new DefaultBloomFilterConfiguration());
            var original  = DataGenerator.Generate().Take(size).ToList();

            filter.Initialize(size, errorRate);
            foreach (var entity in original)
            {
                filter.Add(entity);
            }

            // Comparison set: a slice of the generated data with 1000 passed to Modify.
            var comparison = DataGenerator.Generate().Skip(500).Take(8000).ToList();
            comparison.Modify(1000);

            var estimate        = filter.QuasiDecode(comparison);
            var withinTolerance = estimate > 90500 && estimate < 97000;

            Assert.IsTrue(withinTolerance, "Unexpected estimate for difference.");
        }
Example #7
0
        /// <summary>
        /// Folds a small invertible Bloom filter (block size 1024, 3 hash functions) by a factor
        /// of 4 and checks the block size of the folded result, as well as the answers given by
        /// the filter that Fold was called on.
        /// </summary>
        public void InvertibleSimpleFold()
        {
            var addSize  = 50;
            var entities = DataGenerator.Generate().Take(addSize).ToArray();
            var filter   = new InvertibleBloomFilter<TestEntity, long, sbyte>(new DefaultBloomFilterConfiguration());

            filter.Initialize(entities.Length, 1024, (uint)3);
            foreach (var entity in entities)
            {
                filter.Add(entity);
            }

            var positivesBeforeFold = DataGenerator.Generate().Take(500).Count(entity => filter.Contains(entity));
            var folded              = filter.Fold(4);
            // NOTE(review): the checks after the fold query the filter Fold was called on, not the
            // returned 'folded' instance - confirm whether that is intended.
            var positivesAfterFold  = DataGenerator.Generate().Take(500).Count(entity => filter.Contains(entity));

            Assert.AreEqual(positivesBeforeFold, positivesAfterFold, "False positive count different after fold");
            // 1024 folded by a factor of 4.
            Assert.AreEqual(256, folded.Extract().BlockSize);
            Assert.IsTrue(entities.All(entity => filter.Contains(entity)), "False negative found");
        }
Example #8
0
        /// <summary>
        /// Builds two invertible Bloom filters from a data set and a modified copy of it, subtracts
        /// and decodes them, and compares the decoded differences with the differences computed
        /// directly from the two data sets.
        /// </summary>
        public void InvertibleBloomFilterSetDiffTest()
        {
            var addSize  = 1000;
            var modCount = 50;
            var dataSet1 = DataGenerator.Generate().Take(addSize).ToList();
            var dataSet2 = DataGenerator.Generate().Take(addSize).ToList();

            dataSet2.Modify(modCount);
            var configuration = new DefaultBloomFilterConfiguration();
            var bloomFilter   = new InvertibleBloomFilter <TestEntity, long, sbyte>(configuration);

            //the filters are sized on the number of modifications, not on the size of the data sets.
            bloomFilter.Initialize(10 * modCount, 0.0001F);
            foreach (var itm in dataSet1)
            {
                bloomFilter.Add(itm);
            }
            var secondBloomFilter = new InvertibleBloomFilter <TestEntity, long, sbyte>(configuration);

            secondBloomFilter.Initialize(10 * modCount, 0.0001F);
            foreach (var itm in dataSet2)
            {
                secondBloomFilter.Add(itm);
            }
            var changed      = new HashSet <long>();
            var onlyInFirst  = new HashSet <long>();
            var onlyInSecond = new HashSet <long>();
            var decoded      = bloomFilter
                               .SubtractAndDecode(secondBloomFilter, onlyInFirst, onlyInSecond, changed);

            //reference differences computed directly from the data sets; only the counts are compared.
            var onlyInSet1Count = dataSet1.Count(d => dataSet2.All(d2 => d2.Id != d.Id));
            var onlyInSet2Count = dataSet2.Count(d => dataSet1.All(d1 => d1.Id != d.Id));
            var modifiedCount   = dataSet1.Count(d => dataSet2.Any(d2 => d2.Id == d.Id && d2.Value != d.Value));

            Assert.IsTrue(decoded.HasValue, "Decoding failed");
            //AreEqual (expected, actual) reports both values on failure, unlike IsTrue(a == b).
            Assert.AreEqual(onlyInSet1Count, onlyInFirst.Count, "Incorrect number of changes detected on 'only in set 1'");
            Assert.AreEqual(onlyInSet2Count, onlyInSecond.Count, "Incorrect number of changes detected on 'only in set 2'");
            //very bad at recognizing changes, so only an upper bound is checked.
            Assert.IsTrue(changed.Count <= modifiedCount, "Incorrect number of modified items detected");
        }
Example #9
0
        /// <summary>
        /// Creates a heavily oversized invertible Bloom filter, compresses it and verifies the
        /// capacity and the false positive count both before and after the compression.
        /// </summary>
        public void InvertibleCompressTest()
        {
            var addSize   = 10000;
            var errorRate = 0.001F;
            var data      = DataGenerator.Generate().Take(addSize).ToArray();
            var filter    = new InvertibleBloomFilter <TestEntity, long, sbyte>(new DefaultBloomFilterConfiguration());

            filter.Initialize(50 * data.Length, errorRate);
            //expected value first, actual second, so a failure reports the values the right way around.
            Assert.AreEqual(500000, filter.Capacity, "Unexpected size of Bloom filter.");
            foreach (var item in data)
            {
                filter.Add(item);
            }
            //check error rate: none of these items were added, so every hit is a false positive.
            var falsePositiveCount = DataGenerator.Generate().Skip(addSize).Take(10000).Count(itm => filter.Contains(itm));

            Assert.IsTrue(falsePositiveCount <= errorRate * addSize, "Uncompressed Bloom filter exceeded error rate.");
            filter.Compress(true);
            Assert.AreEqual(12820, filter.Capacity, "Unexpected size of compressed Bloom filter.");
            var compressedFalsePositiveCount = DataGenerator.Generate().Skip(addSize).Take(10000).Count(itm => filter.Contains(itm));

            Assert.IsTrue(compressedFalsePositiveCount <= errorRate * addSize, "Compressed Bloom filter exceeded error rate.");
        }
Example #10
0
        /// <summary>
        /// Verifies that a filled invertible Bloom filter produces no false negatives and stays
        /// within the configured error rate for false positives, both when queried by item
        /// (Contains) and when queried by key (ContainsKey).
        /// </summary>
        public void InvertibleFalsePositiveTest()
        {
            var addSize   = 10000;
            var errorRate = 0.001F;
            var entities  = DataGenerator.Generate().Take(addSize).ToArray();
            var filter    = new InvertibleBloomFilter<TestEntity, long, sbyte>(new DefaultBloomFilterConfiguration());

            filter.Initialize(entities.Length, errorRate);
            foreach (var entity in entities)
            {
                filter.Add(entity);
            }

            // Every added item has to be found again, by item and by key.
            var missingItems = entities.Count(entity => !filter.Contains(entity));
            Assert.IsTrue(missingItems == 0, "False negative error rate violated");

            var missingKeys = entities.Count(entity => !filter.ContainsKey(entity.Id));
            Assert.IsTrue(missingKeys == 0, "False negative error rate violated on ContainsKey");

            // Items that were never added may only be reported within the error rate.
            var unexpectedItems = DataGenerator.Generate().Skip(addSize).Take(addSize).Count(entity => filter.Contains(entity));
            Assert.IsTrue(unexpectedItems <= errorRate * addSize, "False positive error rate violated");

            var unexpectedKeys = DataGenerator.Generate().Skip(addSize).Take(addSize).Count(entity => filter.ContainsKey(entity.Id));
            Assert.IsTrue(unexpectedKeys <= errorRate * addSize, "False positive error rate violated on ContainsKey");
        }
Example #11
0
        /// <summary>
        /// Create the strata filters: recomputes <c>HashFunctionCount</c> and fills every slot of
        /// <c>StrataFilters</c> below <c>MaxStrata</c> with a lazily created invertible Bloom filter.
        /// Slots at or beyond <c>MaxStrata</c> are set to <c>null</c>.
        /// </summary>
        /// <param name="estimatorData">
        /// Filter data to rehydrate. Optional: when <c>null</c>, each filter is rehydrated with
        /// <c>null</c> data.
        /// </param>
        private void CreateFilters(IStrataEstimatorData <int, TCount> estimatorData = null)
        {
            var configuration = Configuration.ConvertToEstimatorConfiguration();

            HashFunctionCount = configuration.BestHashFunctionCount(BlockSize, ErrorRate);
            for (var idx = 0; idx < StrataFilters.Length; idx++)
            {
                if (idx >= MaxStrata)
                {
                    StrataFilters[idx] = null;
                    continue;
                }
                //estimatorData defaults to null, so guard explicitly instead of dereferencing it.
                //NOTE(review): Rehydrate then receives null data - presumably a no-op, confirm.
                var filterData = estimatorData == null ? null : estimatorData.GetFilterForStrata(idx);
                //lazily create Strata filters.
                StrataFilters[idx] = new Lazy <InvertibleBloomFilter <KeyValuePair <int, int>, int, TCount> >(() =>
                {
                    var res = new InvertibleBloomFilter <KeyValuePair <int, int>, int, TCount>(configuration);
                    //capacity doesn't really matter, the capacity is basically the block size.
                    res.Initialize(BlockSize, BlockSize, HashFunctionCount);
                    res.Rehydrate(filterData);
                    return(res);
                });
            }
        }