/// <summary>
 /// Initializes the configuration base and, on request, derives the sub filter configuration.
 /// </summary>
 /// <param name="createFilter">When <c>true</c> the sub filter configuration is obtained by converting this configuration with <c>ConvertToKeyValueHash</c>; when <c>false</c> this constructor does not assign it.</param>
 protected ConfigurationBase(bool createFilter = true)
 {
     if (!createFilter)
     {
         // The caller takes care of the sub filter configuration itself.
         return;
     }
     _subFilterConfiguration = this.ConvertToKeyValueHash();
 }
        /// <summary>
        /// Builds and initializes an invertible Bloom filter (regular or reverse).
        /// </summary>
        /// <typeparam name="TEntity">The type of the entity</typeparam>
        /// <typeparam name="TId">The type of the entity identifier</typeparam>
        /// <typeparam name="TCount">The type of the occurrence counter</typeparam>
        /// <param name="bloomFilterConfiguration">The configuration handed to the filter constructor</param>
        /// <param name="capacity">The requested capacity; values of zero or less are replaced by 1</param>
        /// <param name="errorRate">Optional error rate; when omitted the filter is initialized with the capacity only</param>
        /// <param name="reverse">When <c>true</c> an <c>InvertibleReverseBloomFilter</c> is created, otherwise an <c>InvertibleBloomFilter</c></param>
        /// <returns>The initialized Bloom filter</returns>
        /// <remarks>Assumption is that the utilization will be in line with the capacity, thus keeping individual counts low.</remarks>
        public IInvertibleBloomFilter<TEntity, TId, TCount> Create<TEntity, TId, TCount>(
            IInvertibleBloomFilterConfiguration<TEntity, TId, int, TCount> bloomFilterConfiguration,
            long capacity,
            float? errorRate = null,
            bool reverse = false)
            where TId : struct
            where TCount : struct
        {
            // Non-positive capacities are clamped to the smallest usable value.
            var effectiveCapacity = capacity > 0 ? capacity : 1;
            var filter = reverse
                ? new InvertibleReverseBloomFilter<TEntity, TId, TCount>(bloomFilterConfiguration)
                : new InvertibleBloomFilter<TEntity, TId, TCount>(bloomFilterConfiguration);

            if (!errorRate.HasValue)
            {
                filter.Initialize(effectiveCapacity);
                return filter;
            }
            filter.Initialize(effectiveCapacity, errorRate.Value);
            return filter;
        }
Exemple #3
0
 /// <summary>
 /// Creates a Bloom filter instance bound to the given configuration.
 /// </summary>
 /// <param name="bloomFilterConfiguration">The configuration stored in <c>Configuration</c></param>
 /// <param name="validateConfiguration">Stored in <c>ValidateConfiguration</c>. When <c>true</c> the configuration is validated on the first operation, else <c>false</c>.</param>
 public InvertibleBloomFilter(
     IInvertibleBloomFilterConfiguration<TEntity, TId, int, TCount> bloomFilterConfiguration,
     bool validateConfiguration = true)
 {
     this.Configuration = bloomFilterConfiguration;
     this.ValidateConfiguration = validateConfiguration;
 }
Exemple #4
0
        /// <summary>
        /// Determine if the decode succeeded, i.e. every non-pure cell of <paramref name="filter"/> holds the identity values.
        /// </summary>
        /// <typeparam name="TEntity">The entity type</typeparam>
        /// <typeparam name="TId">The identifier type</typeparam>
        /// <typeparam name="THash">The type of the hash</typeparam>
        /// <typeparam name="TCount">The type of the occurrence counter</typeparam>
        /// <param name="filter">The IBF data</param>
        /// <param name="configuration">The Bloom filter configuration supplying the identity values, comparers and purity test</param>
        /// <returns><c>true</c> when the decode was successful (also for a filter without cells), else <c>false</c>.</returns>
        /// <remarks>
        /// Cells are scanned in parallel. The scan is abandoned as soon as any worker finds a cell that is neither pure
        /// nor fully reset to the identity values.
        /// </remarks>
        private static bool IsCompleteDecode<TEntity, TId, THash, TCount>(
            this IInvertibleBloomFilterData<TId, THash, TCount> filter,
            IInvertibleBloomFilterConfiguration<TEntity, TId, THash, TCount> configuration)
            where TCount : struct
            where TId : struct
            where THash : struct
        {
            if (filter.BlockSize < 1)
            {
                // Partitioner.Create rejects an empty range; a filter without cells has nothing left to decode.
                return true;
            }
            var idIdentity = configuration.IdIdentity;
            var hashIdentity = configuration.HashIdentity;
            var countConfiguration = configuration.CountConfiguration;
            var countIdentity = countConfiguration.Identity;
            // 0 while no left-over cell has been seen, 1 once any worker found one.
            var residueFound = 0;

            Parallel.ForEach(
                Partitioner.Create(0L, filter.BlockSize),
                (range, state) =>
                {
                    for (var position = range.Item1; position < range.Item2; position++)
                    {
                        if (state.IsStopped)
                        {
                            // Another worker already found a left-over cell; the answer is known.
                            return;
                        }
                        if (countConfiguration.IsPure(filter.Counts[position]))
                        {
                            // Item is pure and was skipped on purpose.
                            continue;
                        }
                        if (!configuration.IdEqualityComparer.Equals(idIdentity, filter.IdSumProvider[position]) ||
                            !configuration.HashEqualityComparer.Equals(hashIdentity, filter.HashSumProvider[position]) ||
                            countConfiguration.Comparer.Compare(filter.Counts[position], countIdentity) != 0)
                        {
                            Interlocked.Exchange(ref residueFound, 1);
                            state.Stop();
                            // Stop() only prevents new iterations, so leave the current range explicitly.
                            return;
                        }
                    }
                });
            return residueFound == 0;
        }
Exemple #5
0
 /// <summary>
 /// Creates a hybrid Bloom filter: the base filter receives the configuration as is, while an additional reverse
 /// filter over identifier/hash pairs is created from the key/value converted configuration.
 /// </summary>
 /// <param name="bloomFilterConfiguration">The Bloom filter configuration</param>
 /// <remarks>Configuration validation is switched off for this filter.</remarks>
 public InvertibleHybridBloomFilter(
     IInvertibleBloomFilterConfiguration<TEntity, TId, int, TCount> bloomFilterConfiguration)
     : base(bloomFilterConfiguration)
 {
     var keyValueConfiguration = bloomFilterConfiguration.ConvertToKeyValueHash();

     _reverseBloomFilter = new InvertibleReverseBloomFilter<KeyValuePair<TId, int>, TId, TCount>(keyValueConfiguration);
     ValidateConfiguration = false;
 }
Exemple #6
0
 /// <summary>
 /// Determines whether two sets of Bloom filter data can be combined with each other.
 /// </summary>
 /// <typeparam name="TEntity">Type of the entity</typeparam>
 /// <typeparam name="TId">The type of entity identifier</typeparam>
 /// <typeparam name="THash">The type of the hash</typeparam>
 /// <typeparam name="TCount">The type of the occurrence counter for the invertible Bloom filter.</typeparam>
 /// <param name="filter">Bloom filter data</param>
 /// <param name="otherFilter">The Bloom filter data to compare against</param>
 /// <param name="configuration">The Bloom filter configuration (supplies the sub filter configuration and the folding strategy)</param>
 /// <returns>
 /// <c>true</c> when either filter is <c>null</c>, when the block sizes differ but the folding strategy reports a fold
 /// factor above 1 for either side, or when block size and counter array length match. <c>false</c> when either filter is
 /// invalid or when direction, hash function count or sub filters do not match.
 /// </returns>
 public static bool IsCompatibleWith<TEntity, TId, THash, TCount>(
     this IInvertibleBloomFilterData<TId, THash, TCount> filter,
     IInvertibleBloomFilterData<TId, THash, TCount> otherFilter,
     IInvertibleBloomFilterConfiguration<TEntity, TId, THash, TCount> configuration)
     where TId : struct
     where TCount : struct
     where THash : struct
 {
     // A missing filter is considered compatible with anything.
     if (filter == null || otherFilter == null)
     {
         return true;
     }
     if (!filter.IsValid() || !otherFilter.IsValid())
     {
         return false;
     }
     if (filter.IsReverse != otherFilter.IsReverse)
     {
         return false;
     }
     if (filter.HashFunctionCount != otherFilter.HashFunctionCount)
     {
         return false;
     }
     // Sub filters only need a (recursive) compatibility check when they are not the very same value.
     if (filter.SubFilter != otherFilter.SubFilter &&
         !filter.SubFilter.IsCompatibleWith(otherFilter.SubFilter, configuration.SubFilterConfiguration))
     {
         return false;
     }
     if (filter.BlockSize != otherFilter.BlockSize)
     {
         // Different sizes are acceptable as long as folding can bring them in line.
         var foldFactors = configuration.FoldingStrategy?.GetFoldFactors(filter.BlockSize, otherFilter.BlockSize);
         var foldable = foldFactors?.Item1 > 1 || foldFactors?.Item2 > 1;
         if (foldable)
         {
             return true;
         }
     }
     var sameBlockSize = filter.BlockSize == otherFilter.BlockSize;
     return sameBlockSize &&
            filter.IsReverse == otherFilter.IsReverse &&
            filter.Counts?.LongLength == otherFilter.Counts?.LongLength;
 }
Exemple #7
0
        /// <summary>
        /// Fold the hybrid estimator data: the strata estimator is folded by <paramref name="factor"/>, the bit minwise
        /// estimator by the smallest fold factor of its capacity that exceeds <paramref name="factor"/>.
        /// </summary>
        /// <typeparam name="TEntity">The entity type</typeparam>
        /// <typeparam name="TId">The identifier type</typeparam>
        /// <typeparam name="TCount">The count type</typeparam>
        /// <param name="estimatorData">The estimator data to fold</param>
        /// <param name="configuration">The configuration supplying the folding strategy</param>
        /// <param name="factor">The factor to fold by</param>
        /// <returns>A new instance holding the folded data, or <c>null</c> when <paramref name="estimatorData"/> is <c>null</c>.</returns>
        internal static HybridEstimatorFullData<int, TCount> Fold<TEntity, TId, TCount>(
            this IHybridEstimatorFullData<int, TCount> estimatorData,
            IInvertibleBloomFilterConfiguration<TEntity, TId, int, TCount> configuration,
            uint factor)
            where TCount : struct
            where TId : struct
        {
            if (estimatorData == null)
            {
                return null;
            }
            // Smallest available fold factor above the requested one; falls back to 1 (no fold) when there is
            // no folding strategy, and ends up as 1 as well when no factor qualifies.
            var nextFoldFactor = configuration
                .FoldingStrategy?
                .GetAllFoldFactors(estimatorData.BitMinwiseEstimator?.Capacity ?? 1L)
                .OrderBy(candidate => candidate)
                .FirstOrDefault(candidate => candidate > factor) ?? 1L;
            var minWiseFold = Math.Max(1L, nextFoldFactor);

            var folded = new HybridEstimatorFullData<int, TCount>();
            folded.ItemCount = estimatorData.ItemCount;
            folded.BitMinwiseEstimator = estimatorData.BitMinwiseEstimator?.Fold((uint)minWiseFold);
            folded.StrataEstimator =
                estimatorData.StrataEstimator?.Fold(configuration.ConvertToEstimatorConfiguration(), factor);
            return folded;
        }
 /// <summary>
 /// Creates the default configuration, passing a new <c>ByteCountConfiguration</c> and <c>false</c> to the base
 /// constructor and setting up its own value filter configuration.
 /// </summary>
 public DefaultBloomFilterConfiguration() : base(new ByteCountConfiguration(), false)
 {
     // Allows the reverse filter to only use PureCount or the pure function, while this configuration
     // considers both the hash value and the PureCount.
     // Just exploring some flexibility.
     var valueCountConfiguration = new ByteCountConfiguration();

     _valueFilterConfiguration = new KeyValuePairBloomFilterConfiguration(valueCountConfiguration, false);
 }
        /// <summary>
        /// Create new Bloom filter data based upon the size and the hash function count.
        /// </summary>
        /// <typeparam name="TEntity">Type of the entity</typeparam>
        /// <typeparam name="TId">Type of the identifier</typeparam>
        /// <typeparam name="THash">Type of the hash</typeparam>
        /// <typeparam name="TCount">Type of the counter</typeparam>
        /// <param name="configuration">The configuration used to compute the actual error rate and to synchronize the compression providers</param>
        /// <param name="capacity">The capacity recorded on the data</param>
        /// <param name="m">Size per hash function; becomes the block size and the length of the counter array</param>
        /// <param name="k">The number of hash functions.</param>
        /// <returns>The Bloom filter data</returns>
        /// <exception cref="ArgumentOutOfRangeException">When <paramref name="m"/> is less than 1.</exception>
        public InvertibleBloomFilterData<TId, THash, TCount> Create<TEntity, TId, THash, TCount>(
            IInvertibleBloomFilterConfiguration<TEntity, TId, THash, TCount> configuration,
            long capacity,
            long m,
            uint k)
            where TId : struct
            where TCount : struct
            where THash : struct
        {
            // A size below 1 stems from an overflow in the bestM calculation.
            var sizeIsUsable = m >= 1;
            if (!sizeIsUsable)
            {
                throw new ArgumentOutOfRangeException(
                    nameof(m),
                    "The provided capacity and errorRate values would result in an array of length > long.MaxValue. Please reduce either the capacity or the error rate.");
            }

            var filterData = new InvertibleBloomFilterData<TId, THash, TCount>();
            filterData.HashFunctionCount = k;
            filterData.BlockSize = m;
            filterData.Counts = new TCount[m];
            filterData.Capacity = capacity;
            filterData.ErrorRate = configuration.ActualErrorRate(m, capacity, k);

            filterData.SyncCompressionProviders(configuration);
            return filterData;
        }
        /// <summary>
        /// Extract filter data from the given <paramref name="precalculatedFilter"/>, folded down for <paramref name="capacity"/> when the folding strategy finds a usable factor.
        /// </summary>
        /// <typeparam name="TEntity">The entity type</typeparam>
        /// <typeparam name="TId">The identifier type</typeparam>
        /// <typeparam name="TCount">The occurrence count type</typeparam>
        /// <param name="configuration">Configuration (supplies the folding strategy)</param>
        /// <param name="precalculatedFilter">The pre-calculated filter</param>
        /// <param name="capacity">The targeted capacity. A missing value or a value below 10 is treated as 10.</param>
        /// <returns>
        /// <c>null</c> when <paramref name="precalculatedFilter"/> is <c>null</c>; otherwise the extracted data, folded
        /// when a compression factor above 1 was found.
        /// </returns>
        public IInvertibleBloomFilterData<TId, int, TCount> Extract<TEntity, TId, TCount>(
            IInvertibleBloomFilterConfiguration<TEntity, TId, int, TCount> configuration,
            IInvertibleBloomFilter<TEntity, TId, TCount> precalculatedFilter,
            long? capacity)
            where TCount : struct
            where TId : struct
        {
            if (precalculatedFilter == null)
            {
                return null;
            }
            // An absent capacity counts as 0 here, so both "missing" and "too small" end up at the arbitrary low capacity.
            if (capacity.GetValueOrDefault() < 10)
            {
                capacity = 10;
            }

            var data = precalculatedFilter.Extract();
            var foldFactor = configuration.FoldingStrategy?.FindCompressionFactor(configuration, data.BlockSize, data.Capacity, capacity);

            // No folding strategy, no factor, or a factor of at most 1: hand the data back untouched.
            if (!(foldFactor > 1))
            {
                return data;
            }
            return data.Fold(configuration, (uint)foldFactor);
        }
        /// <summary>
        /// Get recommended strata (number of Bloom filters in an estimator)
        /// </summary>
        /// <typeparam name="TEntity">The entity to add</typeparam>
        /// <typeparam name="TId">The entity identifier type</typeparam>
        /// <typeparam name="THash">The hash type</typeparam>
        /// <typeparam name="TCount">The occurrence count type.</typeparam>
        /// <param name="configuration">The Bloom filter configuration (not consulted by this implementation)</param>
        /// <param name="setSize">Number of items to be added</param>
        /// <param name="failedDecodeCount">Number of times the estimator has failed to decode.</param>
        /// <returns>
        /// Without failed decodes: 13 for more than 16000 items, 9 for more than 8000 items, otherwise 7.
        /// After a failed decode: 13 when there are more than 10000 items or more than one failure, otherwise 9.
        /// </returns>
        public byte GetRecommendedStrata<TEntity, TId, THash, TCount>(
            IInvertibleBloomFilterConfiguration<TEntity, TId, THash, TCount> configuration,
            long setSize,
            byte failedDecodeCount = 0)
            where TId : struct
            where THash : struct
            where TCount : struct
        {
            // Failed decodes take precedence over the plain size based recommendation.
            if (failedDecodeCount >= 1)
            {
                var escalate = setSize > 10000L || failedDecodeCount > 1;
                return (byte)(escalate ? 13 : 9);
            }
            if (setSize > 16000L)
            {
                return 13;
            }
            if (setSize > 8000L)
            {
                return 9;
            }
            return 7;
        }
 /// <summary>
 /// Wraps an existing configuration; the base constructor is invoked with <c>false</c>.
 /// </summary>
 /// <param name="configuration">The configuration to wrap</param>
 public ConfigurationKeyValueHashWrapper(
     IInvertibleBloomFilterConfiguration<TEntity, TId, THash, TCount> configuration)
     : base(false)
 {
     _wrappedConfiguration = configuration;
     // hashSum is no longer derived from idSum, so purity is decided on the count alone.
     _isPure = (data, index) => _wrappedConfiguration.CountConfiguration.IsPure(data.Counts[index]);
 }
Exemple #13
0
        /// <summary>
        /// Decode the hybrid estimator data instances.
        /// </summary>
        /// <typeparam name="TEntity">The type of the entity</typeparam>
        /// <typeparam name="TId">The type of the entity identifier</typeparam>
        /// <typeparam name="TCount">The type of the occurrence count for the Bloom filter.</typeparam>
        /// <param name="estimator">The estimator</param>
        /// <param name="otherEstimatorData">The other estimator</param>
        /// <param name="configuration">Configuration</param>
        /// <param name="destructive">When <c>true</c> the values of <paramref name="estimator"/> will be altered rendering it useless, otherwise <c>false</c></param>
        /// <returns>
        /// An estimate of the difference between two sets based upon the estimators: 0 when both are missing, the item
        /// count of one side when the other is missing or empty, and <c>null</c> when the strata estimator is missing or
        /// fails to decode. The estimate never exceeds the sum of both item counts.
        /// </returns>
        internal static long? Decode<TEntity, TId, TCount>(
            this IHybridEstimatorData<int, TCount> estimator,
            IHybridEstimatorData<int, TCount> otherEstimatorData,
            IInvertibleBloomFilterConfiguration<TEntity, TId, int, TCount> configuration,
            bool destructive = false)
            where TCount : struct
            where TId : struct
        {
            if (estimator == null &&
                otherEstimatorData == null)
            {
                return 0L;
            }
            if (estimator == null ||
                estimator.ItemCount <= 0L)
            {
                // Everything on the other side is a difference. The other side can still be missing here
                // (this estimator present but empty), in which case there is no difference at all.
                return otherEstimatorData?.ItemCount ?? 0L;
            }
            if (otherEstimatorData == null ||
                otherEstimatorData.ItemCount <= 0)
            {
                return estimator.ItemCount;
            }
            if (estimator.StrataEstimator == null)
            {
                // Without a strata estimator there is nothing to decode; report "unknown" rather than
                // failing on the StrataCount lookup below.
                return null;
            }
            var decodeFactor = Math.Max(estimator.StrataEstimator?.DecodeCountFactor ?? 1.0D,
                                        otherEstimatorData.StrataEstimator?.DecodeCountFactor ?? 1.0D);
            var strataDecode = estimator
                               .StrataEstimator
                               .Decode(otherEstimatorData.StrataEstimator, configuration, estimator.StrataEstimator.StrataCount, destructive);

            if (!strataDecode.HasValue)
            {
                return null;
            }
            var similarity = estimator.BitMinwiseEstimator?.Similarity(otherEstimatorData.BitMinwiseEstimator);

            if (similarity.HasValue)
            {
                // A similarity implies this side has a bit minwise estimator; the other side is read defensively.
                var minwiseItemCount = estimator.BitMinwiseEstimator.ItemCount +
                                       (otherEstimatorData.BitMinwiseEstimator?.ItemCount ?? 0L);
                strataDecode += (long)(decodeFactor * ((1 - similarity.Value) / (1 + similarity.Value)) * minwiseItemCount);
            }
            var strataMin = Math.Min(
                otherEstimatorData.StrataEstimator?.StrataCount ?? 0,
                estimator.StrataEstimator?.StrataCount ?? 0);

            var decodedItemCount = estimator.StrataEstimator.StrataItemCount(strataMin) +
                                   (similarity.HasValue ? (estimator.BitMinwiseEstimator?.ItemCount ?? 0L) : 0L) +
                                   otherEstimatorData.StrataEstimator.StrataItemCount(strataMin) +
                                   (similarity.HasValue ? (otherEstimatorData.BitMinwiseEstimator?.ItemCount ?? 0L) : 0L);

            if (decodedItemCount > 0)
            {
                // Assume differences for the items counted, but not in the strata estimator or bit minwise estimator, contribute proportionally.
                strataDecode = (long)Math.Ceiling(1.0D * strataDecode.Value * (estimator.ItemCount + otherEstimatorData.ItemCount) / decodedItemCount);
            }
            // Use upper bound on set difference.
            return Math.Min(strataDecode.Value, estimator.ItemCount + otherEstimatorData.ItemCount);
        }
 /// <summary>
 /// Creates a hybrid estimator backed by a new strata estimator.
 /// </summary>
 /// <param name="blockSize">Capacity for strata estimator (good default is 80)</param>
 /// <param name="maxStrata">Maximum strata for the strata estimator.</param>
 /// <param name="configuration">The configuration</param>
 /// <param name="fixedBlockSize">When <c>true</c> the block size should not be modified, else the folding strategy can be applied.</param>
 /// <remarks>The decode count factor is 1.45 when the resulting strata block size is at least 20, otherwise 1.0.</remarks>
 public HybridEstimator(
     long blockSize,
     byte maxStrata,
     IInvertibleBloomFilterConfiguration<TEntity, TId, int, TCount> configuration,
     bool fixedBlockSize = false)
 {
     _strataEstimator = new StrataEstimator<TEntity, TId, TCount>(blockSize, configuration, maxStrata, fixedBlockSize: fixedBlockSize);
     // The factor depends on the block size the strata estimator ended up with, not on the requested one.
     if (_strataEstimator.BlockSize >= 20)
     {
         _strataEstimator.DecodeCountFactor = 1.45D;
     }
     else
     {
         _strataEstimator.DecodeCountFactor = 1.0D;
     }
     _configuration = configuration;
 }
        /// <summary>
        /// Get the recommended bit size.
        /// </summary>
        /// <typeparam name="TEntity">The entity to add</typeparam>
        /// <typeparam name="TId">The entity identifier type</typeparam>
        /// <typeparam name="THash">The hash type</typeparam>
        /// <typeparam name="TCount">The occurrence count type.</typeparam>
        /// <param name="configuration">The Bloom filter configuration (not consulted by this implementation)</param>
        /// <param name="setSize">Number of items to be added (not consulted by this implementation)</param>
        /// <param name="failedDecodeCount">Number of times the estimator has failed to decode (not consulted by this implementation).</param>
        /// <returns>Always 2.</returns>
        public byte GetRecommendedBitSize<TEntity, TId, THash, TCount>(
            IInvertibleBloomFilterConfiguration<TEntity, TId, THash, TCount> configuration,
            long setSize,
            byte failedDecodeCount = 0)
            where TId : struct
            where THash : struct
            where TCount : struct
        {
            const byte recommendedBitSize = 2;

            return recommendedBitSize;
        }
 /// <summary>
 /// Convert a known Bloom filter configuration <paramref name="configuration"/> to a configuration suitable for an estimator.
 /// </summary>
 /// <typeparam name="TEntity">The entity type</typeparam>
 /// <typeparam name="TId">The entity identifier type</typeparam>
 /// <typeparam name="TCount">The type for the occurrence counter</typeparam>
 /// <param name="configuration">The configuration to convert</param>
 /// <returns>A <c>ConfigurationEstimatorWrapper</c> around <paramref name="configuration"/>, or <c>null</c> when it is <c>null</c>.</returns>
 /// <remarks>Remarkably strange plumbing: for estimators, we want to handle the entity hash as the identifier.</remarks>
 internal static IInvertibleBloomFilterConfiguration<KeyValuePair<int, int>, int, int, TCount> ConvertToEstimatorConfiguration
 <TEntity, TId, TCount>(
     this IInvertibleBloomFilterConfiguration<TEntity, TId, int, TCount> configuration)
     where TCount : struct
     where TId : struct
 {
     return configuration == null
         ? null
         : new ConfigurationEstimatorWrapper<TEntity, TId, TCount>(configuration);
 }
Exemple #17
0
 /// <summary>
 /// Creates an actor owning a data set, together with the factories and configuration it works with.
 /// </summary>
 /// <param name="dataSet">The data set for this actor</param>
 /// <param name="hybridEstimatorFactory">Factory for creating estimators</param>
 /// <param name="bloomFilterFactory">Factory for creating Bloom filters</param>
 /// <param name="configuration">Bloom filter configuration to use</param>
 /// <remarks>A new protobuf type model is created with <c>UseImplicitZeroDefaults</c> enabled.</remarks>
 public Actor(
     IList<TestEntity> dataSet,
     IHybridEstimatorFactory hybridEstimatorFactory,
     IInvertibleBloomFilterFactory bloomFilterFactory,
     IInvertibleBloomFilterConfiguration<TestEntity, long, int, TCount> configuration)
 {
     var model = TypeModel.Create();
     model.UseImplicitZeroDefaults = true;
     _protobufModel = model;

     _dataSet = dataSet;
     _configuration = configuration;
     _hybridEstimatorFactory = hybridEstimatorFactory;
     _bloomFilterFactory = bloomFilterFactory;
 }
        /// <summary>
        /// Decode the given strata estimators, walking from the highest shared strata down to strata 0.
        /// </summary>
        /// <typeparam name="TEntity">The entity type</typeparam>
        /// <typeparam name="TId">The type of the entity identifier</typeparam>
        /// <typeparam name="TCount">The type of the Bloom filter occurrence count</typeparam>
        /// <param name="data">Estimator data</param>
        /// <param name="otherEstimatorData">The other estimate</param>
        /// <param name="configuration">The Bloom filter configuration</param>
        /// <param name="maxStrata">The maximum strata</param>
        /// <param name="destructive">When <c>true</c> the <paramref name="data"/> will be altered and no longer usable, else <c>false</c></param>
        /// <returns>
        /// The estimated number of differences, or <c>null</c> when either estimator is missing or no strata could be
        /// decoded before the first failure.
        /// </returns>
        internal static long? Decode<TEntity, TId, TCount>(
            this IStrataEstimatorData<int, TCount> data,
            IStrataEstimatorData<int, TCount> otherEstimatorData,
            IInvertibleBloomFilterConfiguration<TEntity, TId, int, TCount> configuration,
            byte maxStrata,
            bool destructive = false)
            where TId : struct
            where TCount : struct
        {
            if (data == null || otherEstimatorData == null)
            {
                return null;
            }
            var strataConfig = configuration.ConvertToEstimatorConfiguration();
            var decodeFactor = Math.Max(data.DecodeCountFactor, otherEstimatorData.DecodeCountFactor);
            // All decoded identifiers, regardless of which side they came from, are collected in one set.
            var decodedIds = new HashSet<int>();
            var sharedStrata = Math.Min(data.StrataCount, otherEstimatorData.StrataCount);
            var anyDecoded = false;

            for (var strata = sharedStrata - 1; strata >= 0; strata--)
            {
                var ownFilter = data.GetFilterForStrata(strata);
                var otherFilter = strata < otherEstimatorData.StrataCount
                    ? otherEstimatorData.GetFilterForStrata(strata)
                    : null;

                if (ownFilter == null &&
                    otherFilter == null)
                {
                    // Nothing stored at this strata on either side; counts as decoded when within the maximum strata.
                    anyDecoded |= strata < maxStrata;
                    continue;
                }

                var decodeResult = ownFilter.SubtractAndDecode(otherFilter, strataConfig, decodedIds, decodedIds, decodedIds, destructive);
                if (decodeResult == true)
                {
                    anyDecoded = true;
                    continue;
                }
                if (!anyDecoded)
                {
                    return null;
                }
                // Compensate for the fact that a failed decode can still contribute counts by lowering the i+1 as more decodes succeeded.
                var addedFactor = decodeResult.HasValue ? 1 / Math.Pow(2, data.StrataCount - (strata + 1)) : 1;
                return (long)(Math.Pow(2, strata + addedFactor) * decodeFactor * decodedIds.Count);
            }

            if (anyDecoded)
            {
                return (long)(decodeFactor * decodedIds.Count);
            }
            return null;
        }
Exemple #19
0
 ConvertToKeyValueHash
 <TEntity, TId, THash, TCount>(
     this IInvertibleBloomFilterConfiguration <TEntity, TId, THash, TCount> configuration)
     where TCount : struct
     where TId : struct
     where THash : struct
 {
     // A missing configuration has nothing to wrap, so null is passed straight through.
     if (configuration == null)
     {
         return(null);
     }
     // Wrap the supplied configuration in a ConfigurationKeyValueHashWrapper.
     return(new ConfigurationKeyValueHashWrapper <TEntity, TId, THash, TCount>(configuration));
 }
        /// <summary>
        /// Create an invertible Bloom filter that is compatible with the given Bloom filter data.
        /// </summary>
        /// <typeparam name="TEntity">The type of the entity</typeparam>
        /// <typeparam name="TId">The type of the entity identifier</typeparam>
        /// <typeparam name="TCount">The type of the counter</typeparam>
        /// <param name="bloomFilterConfiguration">The Bloom filter configuration</param>
        /// <param name="capacity">The capacity for the filter</param>
        /// <param name="invertibleBloomFilterData">The data to match: its direction (reverse or not), block size and hash function count are copied.</param>
        /// <returns>The created Bloom filter</returns>
        /// <remarks>For the scenario where you need to match a received filter with the set you own, so you can find the differences.</remarks>
        public IInvertibleBloomFilter<TEntity, TId, TCount> CreateMatchingHighUtilizationFilter<TEntity, TId, TCount>(
            IInvertibleBloomFilterConfiguration<TEntity, TId, int, TCount> bloomFilterConfiguration,
            long capacity,
            IInvertibleBloomFilterData<TId, int, TCount> invertibleBloomFilterData)
            where TId : struct
            where TCount : struct
        {
            var matchingFilter = invertibleBloomFilterData.IsReverse
                ? new InvertibleReverseBloomFilter<TEntity, TId, TCount>(bloomFilterConfiguration)
                : new InvertibleBloomFilter<TEntity, TId, TCount>(bloomFilterConfiguration);

            // Size the new filter exactly like the received data.
            matchingFilter.Initialize(
                capacity,
                invertibleBloomFilterData.BlockSize,
                invertibleBloomFilterData.HashFunctionCount);
            return matchingFilter;
        }
Exemple #21
0
 /// <summary>
 /// Creates a b-bit minwise hash estimator.
 /// </summary>
 /// <param name="configuration">The configuration</param>
 /// <param name="bitSize">The number of bits to store per hash</param>
 /// <param name="hashCount">The number of hash functions to use.</param>
 /// <param name="capacity">The capacity (should be a close approximation of the number of elements added). When a folding strategy is configured, the foldable size it computes is used instead.</param>
 /// <remarks>By using bitSize = 1 or bitSize = 2, the accuracy is decreased, thus the hashCount needs to be increased. However, when resemblance is not too small, for example > 0.5, bitSize = 1 can yield similar results as bitSize = 64 with only 3 times the hash count.</remarks>
 public BitMinwiseHashEstimator(
     IInvertibleBloomFilterConfiguration <TEntity, TId, int, TCount> configuration,
     byte bitSize,
     int hashCount,
     long capacity)
 {
     _hashCount     = hashCount;
     _configuration = configuration;
     // NOTE(review): GenerateHashes runs after the hash count and configuration are assigned but before the
     // bit size and capacity are; presumably it only depends on the former - confirm before reordering.
     _hashFunctions = GenerateHashes();
     _bitSize       = bitSize;
     // Without a folding strategy the requested capacity is used unchanged.
     _capacity      = _configuration.FoldingStrategy?.ComputeFoldableSize(capacity, 0) ?? capacity;
     // Entity hash and identifier hash are summed, then truncated to 32 bits without overflow checking.
     _entityHash    = e => unchecked ((int)(ulong)(_configuration.EntityHash(e) + configuration.IdHash(_configuration.GetId(e))));
     // The slots are only computed on first access.
     _slots         = new Lazy <int[]>(() => GetMinHashSlots(_hashCount, _capacity));
 }
        /// <summary>
        /// Compress the strata estimator data by folding it with the factor suggested by the folding strategy.
        /// </summary>
        /// <typeparam name="TEntity">The entity type</typeparam>
        /// <typeparam name="TId">The identifier type</typeparam>
        /// <typeparam name="TCount">The occurrence count type</typeparam>
        /// <param name="estimatorData">The estimator data to compress</param>
        /// <param name="configuration">The configuration supplying the folding strategy</param>
        /// <returns>
        /// The folded data, or <c>null</c> when there is no configuration, no folding strategy, no estimator data or no
        /// compression factor.
        /// </returns>
        internal static StrataEstimatorData<int, TCount> Compress<TEntity, TId, TCount>(
            this IStrataEstimatorData<int, TCount> estimatorData,
            IInvertibleBloomFilterConfiguration<TEntity, TId, int, TCount> configuration)
            where TCount : struct
            where TId : struct
        {
            if (configuration?.FoldingStrategy == null || estimatorData == null)
            {
                return null;
            }
            // The block size is passed both as block size and as capacity; the item count is the target.
            var fold = configuration.FoldingStrategy?.FindCompressionFactor(
                configuration,
                estimatorData.BlockSize,
                estimatorData.BlockSize,
                estimatorData.ItemCount);

            if (!fold.HasValue)
            {
                return null;
            }
            return estimatorData.Fold(configuration, fold.Value);
        }
Exemple #23
0
 /// <summary>
 /// Clear the data: the item count is reset, the compression providers are synchronized with
 /// <paramref name="configuration"/> and every cell is set back to the identity values.
 /// </summary>
 /// <typeparam name="TEntity">The entity type of the configuration</typeparam>
 /// <param name="configuration">The configuration supplying the identity values for identifier, hash and count</param>
 /// <remarks>Cells are reset in parallel. Data without cells (block size below 1) is handled without touching any cell.</remarks>
 public void Clear<TEntity>(IInvertibleBloomFilterConfiguration<TEntity, TId, THash, TCount> configuration)
 {
     ItemCount = 0;
     SyncCompressionProviders(configuration);
     if (BlockSize < 1)
     {
         // Partitioner.Create rejects an empty range, and there are no cells to reset anyway.
         return;
     }
     // Read the identity values once instead of once per cell.
     var idIdentity = configuration.IdIdentity;
     var hashIdentity = configuration.HashIdentity;
     var countIdentity = configuration.CountConfiguration.Identity;

     Parallel.ForEach(
         Partitioner.Create(0L, BlockSize),
         (range, state) =>
         {
             for (var i = range.Item1; i < range.Item2; i++)
             {
                 IdSumProvider[i] = idIdentity;
                 HashSumProvider[i] = hashIdentity;
                 Counts[i] = countIdentity;
             }
         });
 }
 /// <summary>
 /// Wraps a configuration for use by estimators, where identifiers are integer hashes.
 /// </summary>
 /// <param name="configuration">The original configuration.</param>
 public ConfigurationEstimatorWrapper(
     IInvertibleBloomFilterConfiguration<TEntity, TId, int, TCount> configuration)
 {
     // Seed for the second hashing round that keeps Id and IdHash apart.
     const int idHashSeed = 12345678;

     _wrappedConfiguration = configuration;
     _idEqualityComparer = EqualityComparer<int>.Default;

     // The identifier is a full hash over key and value combined: the value is hashed with the key as seed.
     _getId = pair => BitConverter.ToInt32(
         _murmurHash.Hash(BitConverter.GetBytes(pair.Value), unchecked((uint)pair.Key)),
         0);
     // Additional hash to ensure Id and IdHash are different.
     _idHash = identifier => BitConverter.ToInt32(
         _murmurHash.Hash(BitConverter.GetBytes(identifier), idHashSeed),
         0);
     // Entity hash equals identifier hash.
     _entityHash = pair => _idHash(_getId(pair));

     // The estimator uses XOR for both adding and removing identifiers, and AND for intersecting them.
     _idAdd = _idRemove = (left, right) => left ^ right;
     _idIntersect = (left, right) => left & right;

     // Pure means: the count is pure and the stored hash sum matches the hash of the stored identifier sum.
     _isPure = (data, index) =>
         _wrappedConfiguration.CountConfiguration.IsPure(data.Counts[index]) &&
         _idHash(data.IdSumProvider[index]) == data.HashSumProvider[index];
 }
Exemple #25
0
        /// <summary>
        /// Fold the data by the given factor
        /// </summary>
        /// <typeparam name="TEntity">The entity type of the Bloom filter configuration</typeparam>
        /// <typeparam name="TId">The type of the entity identifier</typeparam>
        /// <typeparam name="THash">The type of the hash</typeparam>
        /// <typeparam name="TCount">The type of the occurence count</typeparam>
        /// <param name="data">The data to fold. Not modified, apart from the call to <c>SyncCompressionProviders</c>.</param>
        /// <param name="configuration">The Bloom filter configuration: provides the data factory and the add operations used for folding.</param>
        /// <param name="factor">The factor to fold by. Has to be positive and has to divide the block size of <paramref name="data"/>.</param>
        /// <returns>
        /// New data with capacity and block size divided by <paramref name="factor"/>,
        /// or <c>null</c> when <paramref name="data"/> is <c>null</c>.
        /// </returns>
        /// <exception cref="ArgumentException">
        /// When <paramref name="factor"/> is zero, or when the block size of <paramref name="data"/> is not a multiple of <paramref name="factor"/>.
        /// </exception>
        /// <remarks>Captures the concept of reducing the size of a Bloom filter.</remarks>
        internal static InvertibleBloomFilterData <TId, THash, TCount> Fold <TEntity, TId, THash, TCount>(
            this IInvertibleBloomFilterData <TId, THash, TCount> data,
            IInvertibleBloomFilterConfiguration <TEntity, TId, THash, TCount> configuration,
            uint factor)
            where TId : struct
            where TCount : struct
            where THash : struct
        {
            //factor is unsigned: zero is the only non-positive value it can hold.
            if (factor == 0)
            {
                throw new ArgumentException($"Fold factor should be a positive number (given value was {factor}).", nameof(factor));
            }
            if (data == null)
            {
                return(null);
            }
            if (data.BlockSize % factor != 0)
            {
                throw new ArgumentException($"Bloom filter data of size {data.BlockSize} cannot be folded by factor {factor}.", nameof(factor));
            }
            data.SyncCompressionProviders(configuration);
            var res = configuration.DataFactory.Create(
                configuration,
                data.Capacity / factor,
                data.BlockSize / factor,
                data.HashFunctionCount);

            res.IsReverse = data.IsReverse;
            res.ItemCount = data.ItemCount;
            //fill each position of the smaller result from the original data, in parallel.
            Parallel.ForEach(
                Partitioner.Create(0L, res.BlockSize),
                (range, state) =>
            {
                for (var i = range.Item1; i < range.Item2; i++)
                {
                    res.Counts[i]          = data.Counts.GetFolded(i, factor, configuration.CountConfiguration.Add);
                    res.HashSumProvider[i] = data.HashSumProvider.GetFolded(i, data.BlockSize, factor, configuration.HashAdd);
                    res.IdSumProvider[i]   = data.IdSumProvider.GetFolded(i, data.BlockSize, factor, configuration.IdAdd);
                }
            });
            //a sub filter (when present) is folded by the same factor, using the sub filter configuration.
            res.SubFilter = data
                            .SubFilter?
                            .Fold(configuration.SubFilterConfiguration, factor);
            return(res);
        }
        /// <summary>
        /// Extract the estimator data from a precalculated hybrid estimator, folding it to a smaller size when a suitable fold factor exists.
        /// </summary>
        /// <typeparam name="TEntity">The entity type</typeparam>
        /// <typeparam name="TId">The type of the entity identifier</typeparam>
        /// <typeparam name="TCount">The type of occurence count.</typeparam>
        /// <param name="configuration">Bloom filter configuration</param>
        /// <param name="precalculatedEstimator">The estimator to extract the data from</param>
        /// <param name="failedDecodeCount">Number of times decoding has failed based upon the provided estimator.</param>
        /// <returns>The estimator data, or <c>null</c> when <paramref name="precalculatedEstimator"/> is <c>null</c>.</returns>
        /// <remarks>
        /// When the configuration has no folding strategy there are no fold factors: the data is then extracted without folding.
        /// </remarks>
        public IHybridEstimatorData <int, TCount> Extract <TEntity, TId, TCount>(
            IInvertibleBloomFilterConfiguration <TEntity, TId, int, TCount> configuration,
            HybridEstimator <TEntity, TId, TCount> precalculatedEstimator,
            byte failedDecodeCount = 0)
            where TCount : struct
            where TId : struct
        {
            if (precalculatedEstimator == null)
            {
                return(null);
            }
            //after more than two failed attempts, don't fold.
            if (failedDecodeCount > 2)
            {
                return(precalculatedEstimator.Extract());
            }
            var data       = precalculatedEstimator.FullExtract();
            var strata     = GetRecommendedStrata(configuration, data.ItemCount, failedDecodeCount);
            var blockSize  = GetRecommendedBlockSize(configuration, data.ItemCount, failedDecodeCount);
            //factors is null when no folding strategy is configured.
            var factors    = configuration.FoldingStrategy?.GetAllFoldFactors(precalculatedEstimator.BlockSize);
            //largest factor first; each failed decode skips one more candidate. No factors means no fold (0).
            var foldFactor = blockSize > 0L ?
                             (uint)(factors?
                                    .OrderByDescending(f => f)
                                    //for estimators: capacity is the block size.
                                    .Where(f => f > 1 && precalculatedEstimator.BlockSize / f > blockSize)
                                    .Skip(failedDecodeCount)
                                    .FirstOrDefault() ?? 0) :
                             0L;

            if (failedDecodeCount > 1)
            {
                //after more than 1 failed attempt, go for the lowest fold factor (or no fold when there are no factors).
                foldFactor = (uint)(factors?
                                    .OrderBy(f => f)
                                    .FirstOrDefault(f => f > 1 && precalculatedEstimator.BlockSize / f > blockSize) ?? 0);
            }
            if (foldFactor > 1)
            {
                data = data.Fold(configuration, (uint)foldFactor);
            }
            data.StrataEstimator.LowerStrata(strata);
            if (failedDecodeCount > 1)
            {
                data.StrataEstimator.DecodeCountFactor = Math.Pow(2, failedDecodeCount);
            }
            return(data.ToEstimatorData());
        }
Exemple #27
0
        /// <summary>
        /// Intersect two hybrid estimators
        /// </summary>
        /// <typeparam name="TEntity">The entity type of the Bloom filter configuration</typeparam>
        /// <typeparam name="TId">The type of the entity identifier</typeparam>
        /// <typeparam name="TCount">The type of the occurence count</typeparam>
        /// <param name="estimatorData">The estimator data to intersect</param>
        /// <param name="configuration">The Bloom filter configuration</param>
        /// <param name="otherEstimatorData">The estimator data to intersect with</param>
        /// <returns>
        /// New estimator data holding the intersected bit minwise and strata estimators,
        /// or <c>null</c> when both <paramref name="estimatorData"/> and <paramref name="otherEstimatorData"/> are <c>null</c>.
        /// </returns>
        /// <remarks>The item count of the result is the sum of the item counts of the two intersected estimators (zero for an estimator that is <c>null</c>).</remarks>
        internal static IHybridEstimatorFullData <int, TCount> Intersect <TEntity, TId, TCount>(
            this IHybridEstimatorFullData <int, TCount> estimatorData,
            IInvertibleBloomFilterConfiguration <TEntity, TId, int, TCount> configuration,
            IHybridEstimatorFullData <int, TCount> otherEstimatorData)
            where TId : struct
            where TCount : struct
        {
            var hasAnyData = estimatorData != null || otherEstimatorData != null;

            if (!hasAnyData)
            {
                return null;
            }
            //both estimators stay null when estimatorData itself is null.
            var intersection = new HybridEstimatorFullData <int, TCount>
            {
                BitMinwiseEstimator = estimatorData?.BitMinwiseEstimator.Intersect(otherEstimatorData?.BitMinwiseEstimator, configuration.FoldingStrategy),
                StrataEstimator     = estimatorData?.StrataEstimator.Intersect(otherEstimatorData?.StrataEstimator, configuration)
            };
            var minwiseItemCount = intersection.BitMinwiseEstimator?.ItemCount ?? 0L;
            var strataItemCount  = intersection.StrataEstimator?.ItemCount ?? 0L;

            intersection.ItemCount = minwiseItemCount + strataItemCount;
            return intersection;
        }
Exemple #28
0
 /// <summary>
 /// Add an item at the given position.
 /// </summary>
 /// <typeparam name="TEntity">The entity type</typeparam>
 /// <typeparam name="TId">The type of the entity identifier</typeparam>
 /// <typeparam name="TCount">The type of the Bloom filter occurence count</typeparam>
 /// <param name="filter">The filter data to add to. Nothing happens when it is <c>null</c>.</param>
 /// <param name="configuration">The Bloom filter configuration: provides the count increase and the hash and identifier add operations</param>
 /// <param name="idValue">The identifier to add to the identifier sum</param>
 /// <param name="hashValue">The hash to add to the hash sum</param>
 /// <param name="position">The position to update</param>
 internal static void Add <TEntity, TId, TCount>(
     this IInvertibleBloomFilterData <TId, int, TCount> filter,
     IInvertibleBloomFilterConfiguration <TEntity, TId, int, TCount> configuration,
     TId idValue,
     int hashValue,
     long position)
     where TCount : struct
     where TId : struct
 {
     if (filter != null)
     {
         //count, hash sum and identifier sum of the position are updated together inside ExecuteExclusively.
         filter.ExecuteExclusively(position, () =>
         {
             var increasedCount = configuration.CountConfiguration.Increase(filter.Counts[position]);
             filter.Counts[position] = increasedCount;

             var updatedHashSum = configuration.HashAdd(filter.HashSumProvider[position], hashValue);
             filter.HashSumProvider[position] = updatedHashSum;

             var updatedIdSum = configuration.IdAdd(filter.IdSumProvider[position], idValue);
             filter.IdSumProvider[position] = updatedIdSum;
         });
     }
 }
Exemple #29
0
        /// <summary>
        /// Create new invertible Bloom filter data with the same dimensions as the given data
        /// </summary>
        /// <typeparam name="TEntity">The entity type of the Bloom filter configuration</typeparam>
        /// <typeparam name="TId">The entity identifier type</typeparam>
        /// <typeparam name="THash">The hash type</typeparam>
        /// <typeparam name="TCount">The occurence count type</typeparam>
        /// <param name="data">The data to take capacity, block size, hash function count and reverse flag from.</param>
        /// <param name="configuration">The Bloom filter configuration (provides the data factory)</param>
        /// <returns>
        /// Bloom filter data created by the data factory with the dimensions of <paramref name="data"/>,
        /// or <c>null</c> when <paramref name="data"/> is <c>null</c>.
        /// </returns>
        /// <remarks>Only the reverse flag is copied: none of the contents of <paramref name="data"/> are copied.</remarks>
        private static InvertibleBloomFilterData <TId, THash, TCount> CreateDummy <TEntity, TId, THash, TCount>(
            this IInvertibleBloomFilterData <TId, THash, TCount> data,
            IInvertibleBloomFilterConfiguration <TEntity, TId, THash, TCount> configuration)
            where TCount : struct
            where TId : struct
            where THash : struct
        {
            if (data == null)
            {
                return null;
            }
            var capacity          = data.Capacity;
            var blockSize         = data.BlockSize;
            var hashFunctionCount = data.HashFunctionCount;
            var dummy             = configuration.DataFactory.Create(configuration, capacity, blockSize, hashFunctionCount);

            dummy.IsReverse = data.IsReverse;
            return dummy;
        }
        /// <summary>
        /// Get the recommended minwise hash count for a set of the given size.
        /// </summary>
        /// <typeparam name="TEntity">The entity type</typeparam>
        /// <typeparam name="TId">The entity identifier type</typeparam>
        /// <typeparam name="THash">The hash type</typeparam>
        /// <typeparam name="TCount">The occurence count type.</typeparam>
        /// <param name="configuration">The Bloom filter configuration (not used by this implementation)</param>
        /// <param name="setSize">Number of items to be added</param>
        /// <param name="failedDecodeCount">Number of times the estimator has failed to decode (not used by this implementation).</param>
        /// <returns>15 for more than 16000 items, 10 for more than 8000 items, else 8.</returns>
        public int GetRecommendedMinwiseHashCount <TEntity, TId, THash, TCount>(
            IInvertibleBloomFilterConfiguration <TEntity, TId, THash, TCount> configuration,
            long setSize,
            byte failedDecodeCount = 0)
            where TId : struct
            where THash : struct
            where TCount : struct
        {
            //largest threshold first: the first match wins.
            if (setSize > 16000L)
            {
                return 15;
            }
            if (setSize > 8000L)
            {
                return 10;
            }
            return 8;
        }