/// <summary>
/// Gets an enumerable whose items are probabilistically distinct. Unlike a normal
/// <see cref="Enumerable.Distinct{TSource}(IEnumerable{TSource})"/> this may exclude some items that are actually distinct
/// </summary>
/// <typeparam name="T">Item type</typeparam>
/// <param name="enumerable">Enumerable to operate over</param>
/// <param name="expectedItems">How many distinct items you expect to process</param>
/// <param name="errorRate">Desired error (false positive) rate expressed as value between 0.0 and 1.0</param>
/// <param name="h1">Hash function</param>
/// <param name="h2">Another hash function</param>
/// <returns>A <see cref="ProbabilisticDistinctEnumerable{T}"/> over <paramref name="enumerable"/> that is given a factory for sparse fast bloom filters</returns>
public static IEnumerable<T> ProbabilisticDistinct<T>(this IEnumerable<T> enumerable, long expectedItems, double errorRate, Func<T, int> h1, Func<T, int> h2)
{
    // The filter sizing is calculated immediately from the expected item count and error rate
    IBloomFilterParameters sizing = BloomUtils.CalculateBloomParameters(expectedItems, errorRate);

    // Filters themselves are only built when the factory is invoked, each invocation yields a fresh filter
    Func<IBloomFilter<T>> createFilter = delegate()
    {
        return new SparseFastBloomFilter<T>(sizing, h1, h2);
    };

    return new ProbabilisticDistinctEnumerable<T>(enumerable, createFilter);
}
/// <summary>
/// Creates a new filter
/// </summary>
/// <param name="storage">Bloom Filter storage</param>
/// <param name="parameters">Parameters</param>
/// <param name="hashFunctions">Hash Functions, any null entries are discarded before the functions are counted</param>
/// <exception cref="ArgumentNullException">Thrown if <paramref name="parameters"/> or <paramref name="hashFunctions"/> is null</exception>
/// <exception cref="ArgumentException">
/// Thrown if the number of bits is less than 1, if fewer than 2 non-null hash functions are supplied,
/// or if the number of bits is not bigger than the number of hash functions
/// </exception>
protected BaseHybridBloomFilter(IBloomFilterStorage storage, IBloomFilterParameters parameters, IEnumerable<Func<T, int>> hashFunctions)
    : base(storage)
{
    // Reject null parameters explicitly rather than failing with a NullReferenceException on first use
    if (parameters == null)
    {
        throw new ArgumentNullException("parameters");
    }
    if (parameters.NumberOfBits < 1)
    {
        throw new ArgumentException("Number of bits must be >= 1", "parameters");
    }
    if (hashFunctions == null)
    {
        throw new ArgumentNullException("hashFunctions");
    }

    // Take a private copy of the hash functions and drop any null entries
    this._hashFunctions = new List<Func<T, int>>(hashFunctions);
    this._hashFunctions.RemoveAll(f => f == null);
    if (this._hashFunctions.Count <= 1)
    {
        throw new ArgumentException("A bloom filter requires at least 2 hash functions", "hashFunctions");
    }

    // Compared against the supplied (non-null) hash functions, not against parameters.NumberOfHashFunctions
    if (parameters.NumberOfBits <= this._hashFunctions.Count)
    {
        throw new ArgumentException("Number of bits must be bigger than the number of hash functions", "parameters");
    }

    this.NumberOfBits = parameters.NumberOfBits;
    this._parameters = parameters;
}
/// <summary>
/// Creates a new filter
/// </summary>
/// <param name="storage">Bloom Filter Storage</param>
/// <param name="parameters">Parameters</param>
/// <param name="h1">First hash function</param>
/// <param name="h2">Second hash function</param>
/// <exception cref="ArgumentNullException">Thrown if <paramref name="parameters"/> is null</exception>
/// <exception cref="ArgumentException">
/// Thrown if either hash function is null, or if the number of bits is not bigger than the number of hash functions
/// </exception>
protected BaseFastBloomFilter(IBloomFilterStorage storage, IBloomFilterParameters parameters, Func<T, int> h1, Func<T, int> h2)
    : base(storage)
{
    if (parameters == null)
    {
        throw new ArgumentNullException("parameters", "Parameters cannot be null");
    }
    if (h1 == null)
    {
        throw new ArgumentException("Hash functions cannot be null", "h1");
    }
    if (h2 == null)
    {
        throw new ArgumentException("Hash functions cannot be null", "h2");
    }

    // The number of hash functions is taken from the parameters, only two functions are actually supplied
    if (parameters.NumberOfBits <= parameters.NumberOfHashFunctions)
    {
        throw new ArgumentException("Number of bits must be bigger than the number of hash functions", "parameters");
    }

    this._parameters = parameters;
    this.NumberOfBits = parameters.NumberOfBits;
    this._h1 = h1;
    this._h2 = h2;
}
public void TestMethod1()
{
    // Prints the calculated sizing for 100 expected items at a 0.01 error rate, nothing is asserted
    IBloomFilterParameters calculated = BloomUtils.CalculateBloomParameters(100, 0.01);

    var bits = calculated.NumberOfBits;
    Console.WriteLine(bits);

    var hashFunctions = calculated.NumberOfHashFunctions;
    Console.WriteLine(hashFunctions);
}
public void CheckParameterCalculation(long expectedItems, long errorRate, int expectedNumBits, int expectedNumHashFunctions)
{
    // NOTE(review): errorRate is an integer here and is later compared against a "1 in p" value
    // by CheckErrorRate, so it appears to be expressed in that form - confirm against BloomUtils
    IBloomFilterParameters calculated = BloomUtils.CalculateBloomParameters(expectedItems, errorRate);

    // The calculated sizing must match the expected values exactly
    Assert.AreEqual(expectedNumBits, calculated.NumberOfBits);
    Assert.AreEqual(expectedNumHashFunctions, calculated.NumberOfHashFunctions);

    // Then verify the error rate that the calculated sizing actually gives
    CheckErrorRate(expectedItems, errorRate, calculated);
}
/// <summary>
/// Creates new storage backed by a block sparse array with one entry per bit
/// </summary>
/// <param name="parameters">Parameters, the number of bits determines the size of the storage</param>
/// <exception cref="ArgumentNullException">Thrown if <paramref name="parameters"/> is null</exception>
/// <exception cref="ArgumentException">Thrown if the number of bits is not greater than zero</exception>
public SparseArrayStorage(IBloomFilterParameters parameters)
{
    if (parameters == null)
    {
        throw new ArgumentNullException("parameters");
    }

    var bitCount = parameters.NumberOfBits;
    if (bitCount <= 0)
    {
        throw new ArgumentException("Number of bits must be > 0", "parameters");
    }

    this._bits = new BlockSparseArray<bool>(bitCount);
}
/*
 * ln p = -(m/n) * ((ln 2)^2)
 *
 * where m is the number of bits and n is the expected number of items
 */

/// <summary>
/// Given some parameters and the expected number of items calculates the error rate
/// </summary>
/// <remarks>
/// Only the number of bits from the parameters is used in the calculation, the number of hash functions is not consulted
/// </remarks>
/// <param name="expectedItems">Expected number of items that will be added to the filter</param>
/// <param name="parameters">Bloom Filter Parameters</param>
/// <returns>Error Rate as a value between 0 and 1.0</returns>
/// <exception cref="ArgumentException">Thrown if <paramref name="expectedItems"/> is less than 1</exception>
/// <exception cref="ArgumentNullException">Thrown if <paramref name="parameters"/> is null</exception>
public static double CalculateErrorRate(long expectedItems, IBloomFilterParameters parameters)
{
    if (expectedItems < 1)
    {
        throw new ArgumentException("expectedItems must be >= 1", "expectedItems");
    }
    if (parameters == null)
    {
        throw new ArgumentNullException("parameters");
    }

    // m/n - cast first so that this is a floating point division
    double bitsPerItem = (double)parameters.NumberOfBits / expectedItems;
    double ln2Squared = Math.Pow(Math.Log(2), 2d);
    double lnP = -bitsPerItem * ln2Squared;

    // p = e^(ln p)
    return Math.Pow(Math.E, lnP);
}
// Test cases are based on values calculated at http://hur.st/bloomfilter
private void CheckErrorRate(long expectedItems, long expectedErrorRate, IBloomFilterParameters parameters)
{
    // Calculates the error rate (expressed as 1 in p) for a given number of items and prints it
    Func<long, long> measure = delegate(long items)
    {
        long oneInP = CalculateErrorRate(items, parameters);
        Console.WriteLine("n = {0}, p = 1 in {1}", items, oneInP);
        return oneInP;
    };

    // At the expected number of items the error rate should match exactly
    Assert.AreEqual(expectedErrorRate, measure(expectedItems));

    // If we halve the number of items we add the error rate should decrease
    // NB - Since we are expressing error rate as 1 in p actual value will increase
    Assert.IsTrue(measure(expectedItems / 2) > expectedErrorRate);

    // If we double the number of items we add the error rate should increase
    // NB - Since we are expressing error rate as 1 in p actual value will decrease
    Assert.IsTrue(measure(expectedItems * 2) < expectedErrorRate);
}
/// <summary>
/// Creates a new filter that uses array storage sized to the number of bits given by the parameters
/// </summary>
/// <param name="parameters">Parameters</param>
/// <param name="hashFunctions">Hash functions</param>
/// <exception cref="ArgumentNullException">Thrown if <paramref name="parameters"/> is null</exception>
public HybridBloomFilter(IBloomFilterParameters parameters, IEnumerable<Func<T, int>> hashFunctions)
    : base(CreateArrayStorage(parameters), parameters, hashFunctions)
{
}

/// <summary>
/// Creates the array storage for the filter, rejecting null parameters with an <see cref="ArgumentNullException"/>
/// rather than letting the constructor fail with a <see cref="NullReferenceException"/>
/// </summary>
/// <param name="parameters">Parameters</param>
/// <returns>Array storage sized to the number of bits</returns>
private static ArrayStorage CreateArrayStorage(IBloomFilterParameters parameters)
{
    if (parameters == null)
    {
        throw new ArgumentNullException("parameters");
    }
    return new ArrayStorage(parameters.NumberOfBits);
}
/// <summary>
/// Creates a new filter that uses sparse array storage created from the given parameters
/// </summary>
/// <param name="parameters">Parameters, used both to create the storage and to configure the filter</param>
/// <param name="h1">Hash function 1</param>
/// <param name="h2">Hash function 2</param>
public SparseFastBloomFilter(IBloomFilterParameters parameters, Func<T, int> h1, Func<T, int> h2)
    : base(new SparseArrayStorage(parameters), parameters, h1, h2)
{
    // All initialisation is done by the base constructor
}
private static long CalculateErrorRate(long expectedItems, IBloomFilterParameters parameters)
{
    // BloomUtils gives the error rate as a probability, the tests express it as "1 in p" instead
    double probability = BloomUtils.CalculateErrorRate(expectedItems, parameters);
    double oneInP = 1 / probability;

    return Convert.ToInt64(oneInP);
}
/// <summary>
/// Creates a new filter that uses array storage sized to the number of bits given by the parameters
/// </summary>
/// <param name="parameters">Parameters</param>
/// <param name="h1">Hash function 1</param>
/// <param name="h2">Hash function 2</param>
/// <exception cref="ArgumentNullException">Thrown if <paramref name="parameters"/> is null</exception>
public FastBloomFilter(IBloomFilterParameters parameters, Func<T, int> h1, Func<T, int> h2)
    : base(CreateArrayStorage(parameters), parameters, h1, h2)
{
}

/// <summary>
/// Creates the array storage for the filter, rejecting null parameters with an <see cref="ArgumentNullException"/>
/// rather than letting the constructor fail with a <see cref="NullReferenceException"/>
/// </summary>
/// <param name="parameters">Parameters</param>
/// <returns>Array storage sized to the number of bits</returns>
private static ArrayStorage CreateArrayStorage(IBloomFilterParameters parameters)
{
    if (parameters == null)
    {
        throw new ArgumentNullException("parameters");
    }
    return new ArrayStorage(parameters.NumberOfBits);
}
/// <summary>
/// Creates a new filter that uses sparse array storage created from the given parameters
/// </summary>
/// <param name="parameters">Parameters, used both to create the storage and to configure the filter</param>
/// <param name="hashFunctions">Hash functions</param>
public SparseHybridBloomFilter(IBloomFilterParameters parameters, IEnumerable<Func<T, int>> hashFunctions)
    : base(new SparseArrayStorage(parameters), parameters, hashFunctions)
{
    // All initialisation is done by the base constructor
}