/// <summary>
/// Creates a hybrid invertible Bloom filter from the given configuration. Besides initializing the
/// base filter, this sets up a reverse Bloom filter over <see cref="KeyValuePair{TKey,TValue}"/> entries
/// of identifier and <c>int</c>, using the configuration returned by <c>ConvertToKeyValueHash()</c>.
/// </summary>
/// <param name="bloomFilterConfiguration">The Bloom filter configuration</param>
public InvertibleHybridBloomFilter(
    IInvertibleBloomFilterConfiguration<TEntity, TId, int, TCount> bloomFilterConfiguration)
    : base(bloomFilterConfiguration)
{
    // The reverse filter cannot use the entity configuration directly; it needs the key/value variant.
    var keyValueConfiguration = bloomFilterConfiguration.ConvertToKeyValueHash();
    _reverseBloomFilter =
        new InvertibleReverseBloomFilter<KeyValuePair<TId, int>, TId, TCount>(keyValueConfiguration);

    ValidateConfiguration = false;
}
/// <summary>
/// Subtracts the given Bloom filter from this one and decodes the difference.
/// The filter's data is extracted and handed to the data-based overload.
/// </summary>
/// <param name="filter">Bloom filter to subtract</param>
/// <param name="listA">Items in this filter, but not in <paramref name="filter"/></param>
/// <param name="listB">Items not in this filter, but in <paramref name="filter"/></param>
/// <param name="modifiedEntities">Entities in both filters, but with a different value</param>
/// <returns>
/// <c>true</c> when the decode was successful, otherwise <c>false</c>.
/// NOTE(review): the return type is nullable; the meaning of <c>null</c> is defined by the overload this delegates to.
/// </returns>
public bool? SubtractAndDecode(
    IInvertibleBloomFilter<TEntity, TId, TCount> filter,
    HashSet<TId> listA,
    HashSet<TId> listB,
    HashSet<TId> modifiedEntities)
{
    var subtrahendData = filter.Extract();
    return SubtractAndDecode(listA, listB, modifiedEntities, subtrahendData);
}
/// <summary>
/// Extracts the data of <paramref name="precalculatedFilter"/> and, when the configured folding strategy
/// reports a compression factor greater than one for <paramref name="capacity"/>, folds the data by that factor.
/// </summary>
/// <typeparam name="TEntity">The entity type</typeparam>
/// <typeparam name="TId">The identifier type</typeparam>
/// <typeparam name="TCount">The occurrence count type</typeparam>
/// <param name="configuration">Configuration; supplies the (optional) folding strategy and is passed to the fold.</param>
/// <param name="precalculatedFilter">The pre-calculated filter. When <c>null</c>, <c>null</c> is returned.</param>
/// <param name="capacity">The targeted capacity. A missing value, or a value below 10, is treated as 10.</param>
/// <returns>
/// The extracted data, folded when a compression factor above one was found; otherwise the extracted data as is.
/// <c>null</c> when <paramref name="precalculatedFilter"/> is <c>null</c>.
/// </returns>
public IInvertibleBloomFilterData<TId, int, TCount> Extract<TEntity, TId, TCount>(
    IInvertibleBloomFilterConfiguration<TEntity, TId, int, TCount> configuration,
    IInvertibleBloomFilter<TEntity, TId, TCount> precalculatedFilter,
    long? capacity)
    where TCount : struct
    where TId : struct
{
    if (precalculatedFilter == null)
    {
        return null;
    }

    // Floor the capacity at an arbitrary low value; a missing capacity counts as zero here.
    if (capacity.GetValueOrDefault() < 10)
    {
        capacity = 10;
    }

    var extracted = precalculatedFilter.Extract();

    // Without a folding strategy the factor is null, and the lifted comparison below is false.
    var compressionFactor = configuration.FoldingStrategy?.FindCompressionFactor(
        configuration,
        extracted.BlockSize,
        extracted.Capacity,
        capacity);

    var shouldFold = compressionFactor > 1;
    if (!shouldFold)
    {
        return extracted;
    }

    return extracted.Fold(configuration, (uint)compressionFactor);
}
/// <summary>
/// Intersect a Bloom filter with the current Bloom filter. The data of both filters is extracted,
/// intersected, and the result is loaded back into this filter.
/// </summary>
/// <param name="bloomFilter">Bloom filter to intersect with</param>
/// <exception cref="ArgumentNullException"><paramref name="bloomFilter"/> is <c>null</c></exception>
/// <exception cref="ArgumentException">Bloom filter is not compatible</exception>
public void Intersect(IInvertibleBloomFilter<TEntity, TId, TCount> bloomFilter)
{
    // Fail with a descriptive argument exception instead of a NullReferenceException further down.
    // Unlike adding, intersecting with "nothing" has no sensible no-op meaning, so this is an error.
    if (bloomFilter == null)
    {
        throw new ArgumentNullException(nameof(bloomFilter));
    }

    var result = Extract().Intersect(Configuration, bloomFilter.Extract());

    // A null result is how the data-level intersect signals incompatible filters.
    if (result == null)
    {
        throw new ArgumentException("An incompatible Bloom filter cannot be intersected.", nameof(bloomFilter));
    }

    Rehydrate(result);
}
/// <summary>
/// Adds the contents of another Bloom filter to this one. The data of both filters is extracted,
/// combined, and the combined data is loaded back into this filter.
/// </summary>
/// <param name="bloomFilter">Bloom filter to add. A <c>null</c> filter is ignored.</param>
/// <exception cref="ArgumentException">Bloom filter is not compatible</exception>
public void Add(IInvertibleBloomFilter<TEntity, TId, TCount> bloomFilter)
{
    if (bloomFilter == null)
    {
        // Nothing to add.
        return;
    }

    var combined = Extract().Add(Configuration, bloomFilter.Extract());
    if (combined != null)
    {
        Rehydrate(combined);
        return;
    }

    // A null result is how the data-level add signals incompatible filters.
    throw new ArgumentException("An incompatible Bloom filter cannot be added.", nameof(bloomFilter));
}
/// <summary>
/// Creates the actor and pre-calculates both an estimator and a Bloom filter over the given data set.
/// </summary>
/// <param name="dataSet">The data set for this actor; every item is added to the estimator and to the filter</param>
/// <param name="hybridEstimatorFactory">Factory for creating estimators</param>
/// <param name="bloomFilterFactory">Factory for creating Bloom filters</param>
/// <param name="configuration">Bloom filter configuration to use</param>
public PrecalculatedActor(
    IList<TestEntity> dataSet,
    IHybridEstimatorFactory hybridEstimatorFactory,
    IInvertibleBloomFilterFactory bloomFilterFactory,
    IInvertibleBloomFilterConfiguration<TestEntity, long, int, TCount> configuration)
{
    _hybridEstimatorFactory = hybridEstimatorFactory;
    _configuration = configuration;

    var model = TypeModel.Create();
    model.UseImplicitZeroDefaults = true;
    _protobufModel = model;

    // The estimator is deliberately (and heavily) over-sized.
    _estimator = _hybridEstimatorFactory.Create(_configuration, 100000);
    foreach (var entity in dataSet)
    {
        _estimator.Add(entity);
    }

    // The filter is sized to the number of differences it can handle, not to the size of the data.
    _filter = bloomFilterFactory.Create(_configuration, 5000, 0.001F, true);
    foreach (var entity in dataSet)
    {
        _filter.Add(entity);
    }
}
/// <summary>
/// Quasi decode a given <paramref name="filter">filter</paramref> against a sample of another set.
/// </summary>
/// <typeparam name="TEntity">The entity type</typeparam>
/// <typeparam name="TId">The identifier type</typeparam>
/// <typeparam name="TCount">The count type</typeparam>
/// <param name="filter">The Bloom filter. When <c>null</c>, no decoding takes place (see return value).</param>
/// <param name="otherSetSample">Entities of the other set; each is tested for membership through the filter's <c>Contains</c>.</param>
/// <param name="otherSetSize">Optional size of the other set; passed through to the decode.</param>
/// <returns>
/// The result of the quasi decode. When <paramref name="filter"/> is <c>null</c>: <paramref name="otherSetSize"/> if given,
/// otherwise the number of items in <paramref name="otherSetSample"/>, or zero when that is <c>null</c> as well.
/// </returns>
public static long? QuasiDecode<TEntity, TId, TCount>(
    this IInvertibleBloomFilter<TEntity, TId, TCount> filter,
    IEnumerable<TEntity> otherSetSample,
    long? otherSetSize = null)
    where TId : struct
    where TCount : struct
{
    if (filter == null)
    {
        // No filter to decode against: fall back to the best known size of the other set.
        if (otherSetSize.HasValue)
        {
            return otherSetSize;
        }

        return otherSetSample?.LongCount() ?? 0L;
    }

    // Compensate for extremely high error rates that can occur with estimators.
    // Without this, the difference goes to infinity.
    var adjustment = QuasiEstimator.GetAdjustmentFactor(
        filter.Configuration,
        filter.BlockSize,
        filter.ItemCount,
        filter.HashFunctionCount,
        filter.ErrorRate);

    return QuasiEstimator.Decode(
        filter.ItemCount,
        adjustment.Item1,
        filter.Contains,
        otherSetSample,
        otherSetSize,
        adjustment.Item2);
}