コード例 #1
0
        /// <summary>
        /// Compute the ideal error rate together with a function that adjusts a raw membership count
        /// for the difference between the actual and the ideal error rate.
        /// </summary>
        /// <param name="configuration">Size configuration used to determine the best compressed size and the error rates.</param>
        /// <param name="blockSize">The block size for which the actual error rate is determined.</param>
        /// <param name="itemCount">The item count passed to the size and error rate calculations.</param>
        /// <param name="hashFunctionCount">The hash function count passed to the error rate calculations.</param>
        /// <param name="errorRate">The error rate used to determine the best compressed size.</param>
        /// <returns>
        /// A tuple holding the ideal error rate and a function that maps
        /// (membershipCount, sampleCount) to
        /// <c>floor(membershipCount - factor * (sampleCount - membershipCount))</c>.
        /// </returns>
        internal static Tuple<float, Func<long, long, long>> GetAdjustmentFactor(
            IBloomFilterSizeConfiguration configuration,
            long blockSize,
            long itemCount,
            uint hashFunctionCount,
            float errorRate)
        {
            // Error rate at the best compressed size for the requested item count and error rate.
            var idealBlockSize = configuration.BestCompressedSize(itemCount, errorRate);
            var idealErrorRate = configuration.ActualErrorRate(idealBlockSize, itemCount, hashFunctionCount);

            // Error rate at the given block size, never lower than the ideal one.
            var blockErrorRate = configuration.ActualErrorRate(blockSize, itemCount, hashFunctionCount);
            var actualErrorRate = Math.Max(idealErrorRate, blockErrorRate);

            var excessErrorRate = actualErrorRate - idealErrorRate;

            // Arbitrary scaling for extremely high error rates: the estimator then has an extremely
            // high false-positive rate, which is why this approach is not ideal to begin with.
            var needsScaling = blockSize > 0 && actualErrorRate >= 0.9D;
            var factor = needsScaling
                ? 2 * excessErrorRate * ((float)idealBlockSize / blockSize)
                : excessErrorRate;

            Func<long, long, long> adjust = (membershipCount, sampleCount) =>
            {
                var nonMembers = sampleCount - membershipCount;
                return (long)Math.Floor(membershipCount - factor * nonMembers);
            };

            return new Tuple<float, Func<long, long, long>>(idealErrorRate, adjust);
        }
コード例 #2
0
        /// <summary>
        /// Approximate the size of the set difference, given an estimator and a (sub)set of the other set.
        /// </summary>
        /// <typeparam name="TEntity">The entity type</typeparam>
        /// <typeparam name="TId">The identifier type</typeparam>
        /// <typeparam name="TCount">The count type</typeparam>
        /// <param name="estimator">The hybrid estimator. When <c>null</c>, the size of the other set is returned.</param>
        /// <param name="bloomFilterSizeConfiguration">The size configuration passed to <c>QuasiEstimator.GetAdjustmentFactor</c>.</param>
        /// <param name="otherSetSample">A (sub)set to compare against</param>
        /// <param name="otherSetSize">Total size of the set to compare against (when not given, the set sample size is used)</param>
        /// <returns>
        /// An estimate for the number of differences, or <c>null</c> when a reasonable estimate is not possible.
        /// When <paramref name="estimator"/> is <c>null</c>: <paramref name="otherSetSize"/> when given,
        /// otherwise the number of items in <paramref name="otherSetSample"/>, or zero when that is <c>null</c> too.
        /// </returns>
        /// <remarks>
        /// Not an ideal solution, due to the potentially high false positive rate of the estimator. It is useful
        /// when you do not have a local estimator, but do have a (sub)set of the data and know the total size of the data.
        /// A known issue is that small differences on very large sets are either grossly over estimated (when there is
        /// a count difference between the two sets) or not recognized at all (under estimated, when both sets have the
        /// same count, but different values). The estimate can be rather inexact.
        /// See 'Exact Set Reconciliation Based on Bloom Filters', Dafang Zhang, Kun Xie,
        /// 2011 International Conference on Computer Science and Network Technology, page 2001-2009.
        /// </remarks>
        public static long? QuasiDecode<TEntity, TId, TCount>(
            this IHybridEstimator<TEntity, TId, TCount> estimator,
            IBloomFilterSizeConfiguration bloomFilterSizeConfiguration,
            IEnumerable<TEntity> otherSetSample,
            long? otherSetSize = null)
            where TId : struct
            where TCount : struct
        {
            if (estimator == null)
            {
                // Without an estimator, everything in the other set counts as a difference.
                if (otherSetSize.HasValue)
                {
                    return otherSetSize;
                }
                return otherSetSample == null ? 0L : otherSetSample.LongCount();
            }

            // Compensate for the extremely high error rates that can occur with estimators.
            // Without this, the difference goes to infinity.
            var adjustment = QuasiEstimator.GetAdjustmentFactor(
                bloomFilterSizeConfiguration,
                estimator.VirtualBlockSize,
                estimator.ItemCount,
                estimator.HashFunctionCount,
                estimator.ErrorRate);
            var idealErrorRate = adjustment.Item1;
            var adjustMembershipCount = adjustment.Item2;

            return QuasiEstimator.Decode(
                estimator.ItemCount,
                idealErrorRate,
                estimator.Contains,
                otherSetSample,
                otherSetSize,
                adjustMembershipCount);
        }
コード例 #3
0
        /// <summary>
        /// Find the largest fold factor for a Bloom filter of the given size.
        /// </summary>
        /// <param name="configuration">The Bloom filter size configuration. Not used by this implementation.</param>
        /// <param name="blockSize">The size of the Bloom filter</param>
        /// <param name="capacity">The capacity of the Bloom filter. When <paramref name="keyCount"/> is provided, the capacity divided by the fold factor has to be at least the key count.</param>
        /// <param name="keyCount">The number of keys added to the Bloom filter. When not provided, the fold advice will not take the error rate into consideration and provide a maximal fold given the capacity.</param>
        /// <returns>
        /// The largest factor of <paramref name="blockSize"/> that is greater than one, leaves a folded size
        /// greater than one and satisfies the key count constraint; <c>null</c> when no such factor exists or
        /// when <paramref name="keyCount"/> is provided but is not positive.
        /// </returns>
        public uint? FindCompressionFactor(IBloomFilterSizeConfiguration configuration, long blockSize, long capacity, long? keyCount = null)
        {
            if (keyCount.HasValue && !(keyCount > 0))
            {
                return null;
            }

            var max = MathExtensions.GetFactors(blockSize)
                .Where(factor =>
                    // Only factors above one can be returned; filtering them first also keeps a
                    // zero or negative factor away from the divisions below.
                    factor > 1 &&
                    // The result is a uint: exclude factors that would be truncated by the cast.
                    factor <= uint.MaxValue &&
                    // The folded size has to be larger than one (this implies factor < blockSize).
                    blockSize / factor > 1 &&
                    // When the key count is known, the folded capacity still has to hold all keys.
                    (!keyCount.HasValue || capacity / factor >= keyCount.Value))
                .DefaultIfEmpty()
                .Max();

            // DefaultIfEmpty yields the default value (zero) when no factor qualifies.
            return max > 1 ? (uint?)(uint)max : null;
        }