/// <summary>
/// Statistical bias test for <see cref="UniversalHashFunction"/>: hashes a large number of
/// pseudorandom 8-character strings and verifies that each of the first (most significant)
/// 32 result bits is set roughly half the time — absolute bias below 0.05%.
/// Only the top 32 bits are guaranteed unbiased by the hash's contract.
/// </summary>
public void UniversalHashTestBias()
{
    Pseudorandom pseudo = new Pseudorandom();
    UniversalHashFunction f = new UniversalHashFunction("Louis Tully as played by Rick Moranis!");
    const ulong trials = 100000000;
    ulong[] onesSeenAtBit = new ulong[64];

    for (ulong trial = 0; trial < trials; trial++)
    {
        string randomInput = pseudo.GetString(8);
        UInt64 hashBits = f.Hash(randomInput, UniversalHashFunction.MaximumNumberOfResultBitsAllowing32BiasedBits);
        // Tally a hit for every bit position that came up 1, scanning from the MSB down.
        for (int position = 0; position < onesSeenAtBit.Length; position++)
        {
            ulong mask = 0x8000000000000000ul >> position;
            if ((hashBits & mask) != 0ul)
                onesSeenAtBit[position]++;
        }
    }

    // bias = (expected - observed) / expected, with expected frequency 0.5 per bit.
    double[] biases = new double[onesSeenAtBit.Length];
    for (int position = 0; position < onesSeenAtBit.Length; position++)
    {
        double observedFrequency = ((double)onesSeenAtBit[position]) / ((double)trials);
        biases[position] = (0.5d - observedFrequency) / 0.5d;
    }

    // The first 32 bits should be unbiased.
    for (int position = 0; position < 32; position++)
    {
        Assert.True(Math.Abs(biases[position]) < 0.0005d);
    }
}
/// <summary>
/// Construct a filter array.
/// </summary>
/// <param name="numberOfBitsInArray">The size of the array in bits.</param>
/// <param name="maximumBitIndexesPerElement">The maximum (and default) number of indexes (bits) in the array to associate with elements.</param>
/// <param name="initilizeBitsOfArrayAtRandom">If set to true, the bits of the filter array will be set to 0 or 1 at random (independently, each with probability 0.5).</param>
/// <param name="saltForHashFunctions">A salt used to generate the hash functions.
/// Any two filter arrays generated with the same salt will use the same hash functions.
/// The salt should be kept secret from attackers who might try to manipulate the selection of elements,
/// such as to intentionally cause bit collisions with the array.</param>
public FilterArray(int numberOfBitsInArray, int maximumBitIndexesPerElement, bool initilizeBitsOfArrayAtRandom, string saltForHashFunctions = "")
{
    // Round up to a whole number of bytes so the array never has fewer than numberOfBitsInArray bits.
    int capacityInBytes = (numberOfBitsInArray + 7) / 8;

    // One universal hash function per bit index an element may map to.  Prefixing the salt
    // with the function's position keeps each function distinct from the others.
    HashFunctionsMappingElementsToBitsInTheArray = new UniversalHashFunction[maximumBitIndexesPerElement];
    for (int functionIndex = 0; functionIndex < HashFunctionsMappingElementsToBitsInTheArray.Length; functionIndex++)
    {
        HashFunctionsMappingElementsToBitsInTheArray[functionIndex] =
            new UniversalHashFunction(functionIndex + ":" + saltForHashFunctions, 64);
    }

    if (initilizeBitsOfArrayAtRandom)
    {
        // Fill the array from the cryptographic random number generator so that
        // roughly half the bits start out set to 1.
        byte[] randomInitialBytes = new byte[capacityInBytes];
        StrongRandomNumberGenerator.GetBytes(randomInitialBytes);
        BitArray = new BitArray(randomInitialBytes);
    }
    else
    {
        // Start with every bit of the array set to zero.
        BitArray = new BitArray(capacityInBytes * 8);
    }
}
/// <summary>
/// Construct a binomial sketch, in which a set of k hash functions (k=NumberOfIndexes) will map any
/// key to k points with an array of n bits (sizeInBits).
/// When one Adds a key to a binomial sketch, a random bit among the subset of k that are currently 0 will be set to 1.
/// To ensure roughly half the bits remain zero, a random index from the subset of all k bits that are currently 1 will be set to 0.
///
/// Over time, popular keys will have almost all of their bits set and unpopular keys will be expected to have roughly half their bits set.
/// </summary>
/// <param name="sizeInBits">The total number of bits to maintain in the table. In theoretical discussions of bloom filters and sketches
/// in general, this is usually referred to by the letter n.</param>
/// <param name="numberOfIndexes">The number of indexes to map each key to, each of which is assigned a unique pseudorandom
/// hash function.</param>
/// <param name="keyToPreventAlgorithmicComplexityAttacks">A pseudorandom seed that allows the same sketch to be created
/// twice, but (if kept secret) prevents an attacker from knowing the distribution of hashes and thus counters
/// algorithmic complexity attacks.</param>
public BinomialSketch(int sizeInBits, int numberOfIndexes, string keyToPreventAlgorithmicComplexityAttacks)
{
    NumberOfIndexes = numberOfIndexes;
    string hashFunctionKey = keyToPreventAlgorithmicComplexityAttacks ?? "";
    SizeInBits = sizeInBits;
    // Align on the next byte boundary by rounding up to a multiple of 8.
    // FIX: the previous expression, (sizeInBits + 8) ^ 0x7, XORed with 7 instead of
    // masking the low bits off, so e.g. sizeInBits=9 produced 22 — not byte-aligned at all.
    if ((SizeInBits & 7) != 0)
        SizeInBits = (sizeInBits + 7) & ~7;
    // Compute the aging threshold from the (now byte-aligned) size actually allocated.
    _maxNumberOfObservationsAccountingForAging = (ulong) SizeInBits/(ulong) (NumberOfIndexes*2);
    int capacityInBytes = SizeInBits / 8;

    // One universal hash function per index; numbering them keeps each distinct.
    _universalHashFunctions = new UniversalHashFunction[numberOfIndexes];
    for (int i = 0; i < _universalHashFunctions.Length; i++)
    {
        _universalHashFunctions[i] =
            new UniversalHashFunction(i.ToString() + hashFunctionKey, 64);
    }

    // Initialize the sketch setting ~half the bits randomly to zero by using the
    // cryptographic random number generator.
    byte[] initialSketchValues = new byte[capacityInBytes];
    StrongRandomNumberGenerator.GetBytes(initialSketchValues);
    _sketch = new BitArray(initialSketchValues);

    // binomialProbability[k] = (n choose k) * p^k * (1-p)^(n-k)
    // since p = 0.5, this reduces to (n choose k) * 0.5^n.
    // Exploit symmetry: (n choose k) == (n choose n-k), so fill both ends at once.
    double[] binomialProbability = new double[numberOfIndexes + 1];
    double probabilityOfAnyGivenValue = Math.Pow(0.5d, numberOfIndexes);
    double nChooseK = 1d;
    for (int k = 0; k <= numberOfIndexes/2; k++)
    {
        binomialProbability[k] = binomialProbability[numberOfIndexes - k] =
            nChooseK * probabilityOfAnyGivenValue;
        // Incremental update: (n choose k+1) = (n choose k) * (n-k) / (k+1).
        nChooseK *= (numberOfIndexes - k)/(1d + k);
    }

    // _cumulativeProbabilitySetByChance[k] = probability that at least k of the
    // numberOfIndexes bits are set purely by chance (accumulated from the top down).
    _cumulativeProbabilitySetByChance = new double[numberOfIndexes + 1];
    _cumulativeProbabilitySetByChance[numberOfIndexes] = binomialProbability[numberOfIndexes];
    for (int k = numberOfIndexes; k > 0; k--)
        _cumulativeProbabilitySetByChance[k - 1] =
            _cumulativeProbabilitySetByChance[k] + binomialProbability[k - 1];
}
/// <summary>
/// Create a client for a distributed binomial ladder filter
/// </summary>
/// <param name="numberOfShards">The number of shards that the bit array of the binomial ladder filter will be divided into.
/// The greater the number of shards, the more evenly it can be distributed. However, the number of shards should still
/// be a few orders of magnitude smaller than the ladder height.</param>
/// <param name="defaultHeightOfLadder">The default ladder height for elements on the ladder.</param>
/// <param name="shardToHostMapping">An object that maps each shard number to the host responsible for that shard.</param>
/// <param name="configurationKey">A key used to protect the hashing from algorithmic complexity attacks.
/// This key should not be unique to the application using the filter and should not be known to any untrusted
/// systems that might control which elements get sent to the filter. If an attacker could submit elements to the filter
/// and knew this key, the attacker could arrange for all elements to go to the same shard and in so doing overload that shard.</param>
/// <param name="mininmumCacheFreshnessRequired">The maximum time that an element should be kept in the cache of elements at the top of their ladder.
/// In other words, how long to bound the possible time that an element may still appear to be at the top of its ladder in the cache
/// when it is no longer at the top of the ladder based on the filter array. Defaults to one minute.</param>
public DistributedBinomialLadderFilterClient(int numberOfShards, int defaultHeightOfLadder, IDistributedResponsibilitySet<RemoteHost> shardToHostMapping, string configurationKey, TimeSpan? mininmumCacheFreshnessRequired = null)
{
    NumberOfShards = numberOfShards;
    MaxLadderHeight = defaultHeightOfLadder;
    // FIX: the default was new TimeSpan(0,0,1) — one *second* via the
    // (hours, minutes, seconds) constructor — while the documented contract is one minute.
    MinimumCacheFreshnessRequired = mininmumCacheFreshnessRequired ?? TimeSpan.FromMinutes(1);
    // Size the LRU cache proportionally to the number of shards.
    CacheOfElementsAtTopOfLadder = new FixedSizeLruCache<string, DateTime>(2*NumberOfShards);
    ShardHashFunction = new UniversalHashFunction(configurationKey);
    ShardToHostMapping = shardToHostMapping;
}