/// <summary>
/// Builds a <see cref="PreloadedSpellChecker"/> over <paramref name="testData"/>, backed by a
/// <see cref="SimpleBloomFilter"/> that uses the DJB2 and SDBM hashers.
/// </summary>
/// <param name="testData">Words handed to the spell checker; its length also drives the filter size.</param>
/// <returns>The newly created spell checker.</returns>
private ISpellChecker CreateSpellChecker(string[] testData)
{
    var log = LogManager.GetLogger(typeof(SpellCheckerTests));

    // The filter is sized to the number of words plus a fixed slack of 23.
    int filterSize = testData.Length + 23;
    var hashers = new IHasher[] { new DJB2Hasher(), new SDBMHasher() };
    var filter = new SimpleBloomFilter(filterSize, hashers, log);

    return new PreloadedSpellChecker(filter, testData, log);
}
/// <summary>
/// Returns the entries stored in the range [<paramref name="from"/>, <paramref name="to"/>] of the
/// backend, after dropping entries that carry the prepare flag and entries rejected by the bloom filters.
/// </summary>
/// <param name="from">Lower bound of the key range (also used for the range lock).</param>
/// <param name="to">Upper bound of the key range (also used for the range lock).</param>
/// <param name="contentKeyBloomFilter">Filter applied to each entry's content key; null means "no filter".</param>
/// <param name="versionKeyBloomFilter">Filter applied to each entry's version key; null means "no filter".</param>
/// <param name="contentBloomFilter">Filter applied to each entry's data hash; null means "no filter".</param>
/// <param name="limit">Passed through to the backend's SubMap call.</param>
/// <param name="ascending">Passed through to the backend's SubMap call.</param>
/// <param name="isBloomFilterAnd">
/// When true an entry is kept only if every supplied filter contains it; when false an entry is kept
/// only if no supplied filter contains it.
/// </param>
/// <returns>The map obtained from the backend with the rejected entries removed.</returns>
public SortedDictionary<Number640, Data> Get(Number640 from, Number640 to, SimpleBloomFilter<Number160> contentKeyBloomFilter, SimpleBloomFilter<Number160> versionKeyBloomFilter, SimpleBloomFilter<Number160> contentBloomFilter, int limit, bool ascending, bool isBloomFilterAnd)
{
    var rLock = RangeLock.Lock(from, to);
    try
    {
        var tmp = _backend.SubMap(from, to, limit, ascending);

        // Iterate over a snapshot so that entries can be removed from the original map.
        foreach (var kvp in tmp.ToList())
        {
            if (kvp.Value.HasPrepareFlag)
            {
                tmp.Remove(kvp.Key);
                continue;
            }

            // A filter rejects an entry when its answer differs from the requested mode:
            //   AND mode     -> reject when the filter does NOT contain the value
            //   non-AND mode -> reject when the filter DOES contain the value
            // A null filter imposes no restriction (previously this threw a
            // NullReferenceException; Digest already treats null this way).
            bool rejected =
                (contentKeyBloomFilter != null && contentKeyBloomFilter.Contains(kvp.Key.ContentKey) != isBloomFilterAnd)
                || (versionKeyBloomFilter != null && versionKeyBloomFilter.Contains(kvp.Key.VersionKey) != isBloomFilterAnd)
                || (contentBloomFilter != null && contentBloomFilter.Contains(kvp.Value.Hash) != isBloomFilterAnd);

            if (rejected)
            {
                tmp.Remove(kvp.Key);
            }
        }

        return tmp;
    }
    finally
    {
        rLock.Unlock();
    }
}
/// <summary>
/// Creates search values that look for multiple content keys by means of a bloom filter.
/// Because a bloom filter is used, there may be false positives.
/// </summary>
/// <param name="locationKey">The location key.</param>
/// <param name="domainKey">The domain key.</param>
/// <param name="keyBloomFilter">For Get() and Remove() one can provide a bloom filter of
/// content keys and the remote peer indicates if those keys are on that peer.</param>
public SearchValues(Number160 locationKey, Number160 domainKey, SimpleBloomFilter<Number160> keyBloomFilter)
{
    // Values supplied by the caller.
    LocationKey = locationKey;
    DomainKey = domainKey;
    KeyBloomFilter = keyBloomFilter;

    // Everything else is explicitly left unset for this kind of search.
    ContentKey = null;
    ContentBloomFilter = null;
    From = null;
    To = null;
}
/// <summary>
/// Appends a bloom filter to this message. Unless the content types were preset,
/// the <c>Content.BloomFilter</c> content type is registered first.
/// </summary>
/// <param name="bloomFilter">The bloom filter to append.</param>
/// <returns>This message, to allow call chaining.</returns>
public Message SetBloomFilter(SimpleBloomFilter<Number160> bloomFilter)
{
    if (!_presetContentTypes)
    {
        SetContentType(Content.BloomFilter);
    }

    // The backing list is created lazily on first use.
    _bloomFilterList = _bloomFilterList ?? new List<SimpleBloomFilter<Number160>>(1);
    _bloomFilterList.Add(bloomFilter);

    return this;
}
/// <summary>
/// Adds two strings to a <see cref="SimpleBloomFilter{T}"/> and checks that they are reported as
/// <c>Existence.MAYBE</c>, while two strings that were never added are reported as <c>Existence.NO</c>.
/// </summary>
public void TestCase()
{
    var hashes = new Func<string, uint>[] { StringHashFirstHalf, StringHashLastHalf };

    // 2^16 slots, computed with integer arithmetic instead of a floating-point round trip.
    const int size = 1 << 16;
    IBloomFilter<string> bloomFilter = new SimpleBloomFilter<string>(size, hashes);

    bloomFilter.Add("haochi");
    bloomFilter.Add("chen");

    // Assert.AreEqual takes (expected, actual); the original passed them the other way
    // round, which makes failure messages report the two values swapped.
    Assert.AreEqual(Existence.MAYBE, bloomFilter.Query("haochi"));
    Assert.AreEqual(Existence.MAYBE, bloomFilter.Query("chen"));
    Assert.AreEqual(Existence.NO, bloomFilter.Query("orlando"));
    Assert.AreEqual(Existence.NO, bloomFilter.Query("bloom"));
}
/// <summary>
/// Builds a digest of the entries stored under <paramref name="locationAndDomainKey"/>,
/// optionally restricted by bloom filters on the content key and on the data hash.
/// Entries with the prepare flag set are never part of the digest.
/// </summary>
/// <param name="locationAndDomainKey">Location/domain key; the full content and version key range below it is scanned.</param>
/// <param name="keyBloomFilter">Filter on each entry's content key, or null for no restriction.</param>
/// <param name="contentBloomFilter">Filter on each entry's data hash, or null for no restriction.</param>
/// <param name="limit">Passed through to the backend's SubMap call.</param>
/// <param name="ascending">Passed through to the backend's SubMap call.</param>
/// <param name="isBloomFilterAnd">
/// When true an entry must be contained in every supplied filter; when false it must be contained in none.
/// </param>
/// <returns>The digest of all entries that passed the filters.</returns>
public DigestInfo Digest(Number320 locationAndDomainKey, SimpleBloomFilter<Number160> keyBloomFilter, SimpleBloomFilter<Number160> contentBloomFilter, int limit, bool ascending, bool isBloomFilterAnd)
{
    var digestInfo = new DigestInfo();
    var rLock = Lock(locationAndDomainKey);
    try
    {
        var from = new Number640(locationAndDomainKey, Number160.Zero, Number160.Zero);
        var to = new Number640(locationAndDomainKey, Number160.MaxValue, Number160.MaxValue);

        foreach (var entry in _backend.SubMap(from, to, limit, ascending))
        {
            // A supplied filter rejects the entry when its answer differs from the requested mode.
            if (keyBloomFilter != null && keyBloomFilter.Contains(entry.Key.ContentKey) != isBloomFilterAnd)
            {
                continue;
            }
            if (contentBloomFilter != null && contentBloomFilter.Contains(entry.Value.Hash) != isBloomFilterAnd)
            {
                continue;
            }
            if (entry.Value.HasPrepareFlag)
            {
                continue;
            }

            digestInfo.Put(entry.Key, entry.Value.BasedOnSet);
        }

        return digestInfo;
    }
    finally
    {
        rLock.Unlock();
    }
}
/// <summary>
/// Creates a dummy message carrying eight bloom filters: five distinct filters holding
/// one to five of the sample keys, followed by the first three filters again.
/// </summary>
/// <returns>The dummy message with the bloom filters attached.</returns>
private static Message CreateMessageBloomFilter()
{
    var oneKey = new SimpleBloomFilter<Number160>(2, 5);
    oneKey.Add(_sample160_1);

    var twoKeys = new SimpleBloomFilter<Number160>(2, 5);
    twoKeys.Add(_sample160_2);
    twoKeys.Add(_sample160_1);

    var threeKeys = new SimpleBloomFilter<Number160>(2, 5);
    threeKeys.Add(_sample160_1);
    threeKeys.Add(_sample160_2);
    threeKeys.Add(_sample160_3);

    var fourKeys = new SimpleBloomFilter<Number160>(2, 5);
    fourKeys.Add(_sample160_1);
    fourKeys.Add(_sample160_2);
    fourKeys.Add(_sample160_3);
    fourKeys.Add(_sample160_4);

    var fiveKeys = new SimpleBloomFilter<Number160>(2, 5);
    fiveKeys.Add(_sample160_1);
    fiveKeys.Add(_sample160_2);
    fiveKeys.Add(_sample160_3);
    fiveKeys.Add(_sample160_4);
    fiveKeys.Add(_sample160_5);

    var message = Utils2.CreateDummyMessage();

    // Attach order matters: all five filters, then the first three once more.
    var attachOrder = new[]
    {
        oneKey, twoKeys, threeKeys, fourKeys, fiveKeys,
        oneKey, twoKeys, threeKeys
    };
    foreach (var filter in attachOrder)
    {
        message.SetBloomFilter(filter);
    }

    return message;
}
/// <summary>
/// Adds <paramref name="bloomFilter"/> to the bloom filters carried by this message.
/// If the content types have not been preset, <c>Content.BloomFilter</c> is registered as content type.
/// </summary>
/// <param name="bloomFilter">The bloom filter to add.</param>
/// <returns>This instance, for fluent chaining.</returns>
public Message SetBloomFilter(SimpleBloomFilter<Number160> bloomFilter)
{
    if (!_presetContentTypes)
    {
        SetContentType(Content.BloomFilter);
    }

    var filters = _bloomFilterList;
    if (filters == null)
    {
        // First bloom filter on this message: allocate the list on demand.
        filters = new List<SimpleBloomFilter<Number160>>(1);
        _bloomFilterList = filters;
    }
    filters.Add(bloomFilter);

    return this;
}
/// <summary>
/// Stores the given bloom filter in <c>ContentBloomFilter</c>.
/// </summary>
/// <param name="contentBloomFilter">The bloom filter to store.</param>
/// <returns>This builder, for fluent chaining.</returns>
public DigestBuilder SetContentBloomFilter(SimpleBloomFilter<Number160> contentBloomFilter)
{
    ContentBloomFilter = contentBloomFilter;
    return this;
}
/// <summary>
/// Stores the given bloom filter in <c>KeyBloomFilter</c>.
/// </summary>
/// <param name="keyBloomFilter">The bloom filter to store.</param>
/// <returns>This builder, for fluent chaining.</returns>
public DigestBuilder SetKeyBloomFilter(SimpleBloomFilter<Number160> keyBloomFilter)
{
    KeyBloomFilter = keyBloomFilter;
    return this;
}
/// <summary>
/// Collects a digest over every entry below <paramref name="locationAndDomainKey"/> that passes
/// the optional bloom filters and does not carry the prepare flag.
/// </summary>
/// <param name="locationAndDomainKey">Location/domain key whose whole content/version key range is scanned.</param>
/// <param name="keyBloomFilter">Optional filter on the content key; null disables it.</param>
/// <param name="contentBloomFilter">Optional filter on the data hash; null disables it.</param>
/// <param name="limit">Forwarded to the backend's SubMap call.</param>
/// <param name="ascending">Forwarded to the backend's SubMap call.</param>
/// <param name="isBloomFilterAnd">
/// True: keep entries contained in the supplied filters. False: keep entries not contained in them.
/// </param>
/// <returns>The digest information of the matching entries.</returns>
public DigestInfo Digest(Number320 locationAndDomainKey, SimpleBloomFilter<Number160> keyBloomFilter, SimpleBloomFilter<Number160> contentBloomFilter, int limit, bool ascending, bool isBloomFilterAnd)
{
    var digestInfo = new DigestInfo();
    var rLock = Lock(locationAndDomainKey);
    try
    {
        var lowerBound = new Number640(locationAndDomainKey, Number160.Zero, Number160.Zero);
        var upperBound = new Number640(locationAndDomainKey, Number160.MaxValue, Number160.MaxValue);
        var candidates = _backend.SubMap(lowerBound, upperBound, limit, ascending);

        foreach (var pair in candidates)
        {
            // In AND mode a filter must contain the value, otherwise it must not contain it;
            // either way the filter's answer has to equal isBloomFilterAnd.
            bool keyAccepted = keyBloomFilter == null
                || keyBloomFilter.Contains(pair.Key.ContentKey) == isBloomFilterAnd;

            // The content filter is only consulted when the key filter accepted the entry.
            bool contentAccepted = keyAccepted
                && (contentBloomFilter == null
                    || contentBloomFilter.Contains(pair.Value.Hash) == isBloomFilterAnd);

            if (contentAccepted && !pair.Value.HasPrepareFlag)
            {
                digestInfo.Put(pair.Key, pair.Value.BasedOnSet);
            }
        }

        return digestInfo;
    }
    finally
    {
        rLock.Unlock();
    }
}
/// <summary>
/// Fetches the entries in the key range [<paramref name="from"/>, <paramref name="to"/>] from the backend
/// and removes those that carry the prepare flag or are rejected by one of the bloom filters.
/// </summary>
/// <param name="from">Lower bound of the key range; also the lower bound of the range lock.</param>
/// <param name="to">Upper bound of the key range; also the upper bound of the range lock.</param>
/// <param name="contentKeyBloomFilter">Filter on the content key, or null to skip this check.</param>
/// <param name="versionKeyBloomFilter">Filter on the version key, or null to skip this check.</param>
/// <param name="contentBloomFilter">Filter on the data hash, or null to skip this check.</param>
/// <param name="limit">Forwarded to the backend's SubMap call.</param>
/// <param name="ascending">Forwarded to the backend's SubMap call.</param>
/// <param name="isBloomFilterAnd">
/// True: an entry survives only if each supplied filter contains it.
/// False: an entry survives only if no supplied filter contains it.
/// </param>
/// <returns>The backend's result map with all rejected entries removed.</returns>
public SortedDictionary<Number640, Data> Get(Number640 from, Number640 to, SimpleBloomFilter<Number160> contentKeyBloomFilter, SimpleBloomFilter<Number160> versionKeyBloomFilter, SimpleBloomFilter<Number160> contentBloomFilter, int limit, bool ascending, bool isBloomFilterAnd)
{
    var rLock = RangeLock.Lock(from, to);
    try
    {
        var tmp = _backend.SubMap(from, to, limit, ascending);

        // Enumerate a copy; removals are applied to the original map.
        foreach (var kvp in tmp.ToList())
        {
            if (kvp.Value.HasPrepareFlag)
            {
                tmp.Remove(kvp.Key);
                continue;
            }

            // Null filters are skipped instead of being dereferenced: the original code threw a
            // NullReferenceException here, whereas Digest treats a null filter as "no restriction".
            // For a supplied filter, the entry is removed when Contains(..) disagrees with the mode
            // (AND: must be contained; otherwise: must not be contained).
            if (contentKeyBloomFilter != null
                && contentKeyBloomFilter.Contains(kvp.Key.ContentKey) != isBloomFilterAnd)
            {
                tmp.Remove(kvp.Key);
                continue;
            }

            if (versionKeyBloomFilter != null
                && versionKeyBloomFilter.Contains(kvp.Key.VersionKey) != isBloomFilterAnd)
            {
                tmp.Remove(kvp.Key);
                continue;
            }

            if (contentBloomFilter != null
                && contentBloomFilter.Contains(kvp.Value.Hash) != isBloomFilterAnd)
            {
                tmp.Remove(kvp.Key);
            }
        }

        return tmp;
    }
    finally
    {
        rLock.Unlock();
    }
}
/// <summary>
/// Stores the given bloom filter in <c>VersionKeyBloomFilter</c>.
/// </summary>
/// <param name="versionKeyBloomFilter">The bloom filter to store.</param>
/// <returns>This builder, for fluent chaining.</returns>
public GetBuilder SetVersionKeyBloomFilter(SimpleBloomFilter<Number160> versionKeyBloomFilter)
{
    VersionKeyBloomFilter = versionKeyBloomFilter;
    return this;
}
/// <summary>
/// Similar to <seealso cref="BooleanQueryWithExclusionsFastAlternativeVersion"/>, but uses a bloom filter instead of a HashSet.
/// Loads the bloom filter with the exclusions, then walks the base query until pageSize + skip items
/// that are not reported by the bloom filter have been found.
/// Expensive when there are LOTS of exclusions, but cheaper when the base query is large because not all
/// of it is processed (enumeration stops as soon as enough items were found).
/// </summary>
/// <param name="type">Selects the per-tag lookup (and, in DEBUG builds, the field printed for each question).</param>
/// <param name="tag">The tag whose entries form the base query.</param>
/// <param name="excludedTags">Tags whose entries are added to the bloom filter and thereby excluded.</param>
/// <param name="pageSize">Maximum number of questions to return.</param>
/// <param name="skip">Number of non-excluded base query entries to skip before taking results.</param>
/// <returns>
/// Up to <paramref name="pageSize"/> questions from the base query for which the bloom filter answered
/// "does not exist". Because a bloom filter can report false positives, some entries that are not
/// actually excluded may be left out.
/// </returns>
internal List <Question> BooleanQueryWithExclusionsBloomFilterVersion(QueryType type, string tag, IList <string> excludedTags, int pageSize, int skip)
{
    var gcInfo = new GCCollectionInfo();
    var timer = Stopwatch.StartNew();
    TagByQueryLookup queryInfo = GetTagByQueryLookup(type);
    // NOTE: fieldSelector is only read inside the DEBUG-only block at the end of this method.
    Func <Question, string> fieldSelector = GetFieldSelector(type);
    ThrowIfInvalidParameters(tag, pageSize, queryInfo);

    //int bloomFilterSize = 40 * 1000 * 1000; // million's, 40mil produces several False +ve's
    int bloomFilterSize = 100 * 1000 * 1000; // million's
#if DEBUG
    // DEBUG builds additionally time and log the creation of the bloom filter.
    var bloomFilterCreationTimer = Stopwatch.StartNew();
    var bloomFilter = new SimpleBloomFilter(bloomFilterSize);
    bloomFilterCreationTimer.Stop();
    Logger.Log("Took {0} ({1:N2} ms) to create the bloom filter with {2:N0} bits ({3:N2} bytes)", bloomFilterCreationTimer.Elapsed, bloomFilterCreationTimer.Elapsed.TotalMilliseconds, bloomFilterSize, bloomFilterSize / 8);
#else
    var bloomFilter = new SimpleBloomFilter(bloomFilterSize);
#endif

#if DEBUG
    //var tests = new[] { 1066589, 2793150, 364114, 910374 }; // These are the Question Id's NOT the array index ([]) values!!
    var tests = new[] { 192257, 616585, 53029, 158368 }; // These ARE the array index ([]) values
    // Exact set kept next to the bloom filter so that false positives can be detected and logged.
    var debugging = cache.Value.GetCachedHashSet();
#endif
    // Load every entry of every excluded tag into the bloom filter.
    foreach (var excludedTag in excludedTags)
    {
        foreach (var qu in queryInfo[excludedTag])
        {
            bloomFilter.Add(qu);
#if DEBUG
            debugging.Add(qu);
            if (tests.Contains(qu))
            {
                // If it's false, it's DEFINITELY false
                // If it's true, it could really be false (false +ve)
                var possiblyExists = bloomFilter.PossiblyExists(qu, debugInfo: true);
                Logger.Log("Bloom Filter.PossiblyExists - {0,8} = {1} ****", qu, possiblyExists);
                Logger.Log(" DebuggingHashSet.Contains - {0,8} = {1} ****", qu, debugging.Contains(qu));
            }
#endif
        }
    }

    var baseQuery = queryInfo[tag];
    // The Where(...) call differs between DEBUG and release builds; the Skip/Take/Select/ToList
    // chain after the #endif is shared and completes whichever "var result = ..." was compiled.
#if DEBUG
    var result = baseQuery.Where(b =>
        {
            var possiblyExists = bloomFilter.PossiblyExists(b);
            if (possiblyExists == false)
            {
                return(true); // we can use it
            }
            // The bloom filter says "maybe"; if the exact set disagrees, this is a false positive.
            if (debugging.Contains(b) == false)
            {
                var qu = questions[b];
                Logger.Log("FALSE +VE: {0,8}, PossiblyExists = {1}, debugging.Contains() = {2}, Id = {3,8}, Tags = {4}", b, possiblyExists, debugging.Contains(b), qu.Id, string.Join(",", qu.Tags));
            }
            return(false); // we can't use it
        })
#else
    var result = baseQuery.Where(b => bloomFilter.PossiblyExists(b) == false)
#endif
        .Skip(skip)
        .Take(pageSize)
        // Base query values are used as indexes into the questions collection.
        .Select(i => questions[i])
        .ToList();
    timer.Stop();
    gcInfo.UpdateCollectionInfo();

    Logger.Log("Base Query: {0}, there are {1:N0} Excluded Tags", tag, excludedTags.Count);
    Results.AddData(timer.Elapsed.TotalMilliseconds.ToString("#.##"));

    using (Utils.SetConsoleColour(Utils.GetColorForTimespan(timer.Elapsed)))
    {
        Logger.Log("Boolean Query {0} against tag \"{1}\", pageSize = {2}, skip = {3}, took {4} ({5:N2} ms) - BLOOM", type, tag, pageSize, skip, timer.Elapsed, timer.Elapsed.TotalMilliseconds);
    }
    //Log("Got {0} results, Bloom Filter contains {1:N0} items (some could be dupes), Truthiness {2:N2}",
    //    result.Count(), bloomFilter.NumberOfItems, bloomFilter.Truthiness);
    Logger.Log("Got {0} results, Bloom Filter contains {1:N0} items (some could be dupes)", result.Count(), bloomFilter.NumberOfItems);
    Logger.Log(gcInfo.ToString());
    //var formattedResults = result.Select(r => string.Format("Id: {0,8}, {1}: {2,4}, Tags: {3}, ", r.Id, type, fieldSelector(r), string.Join(",", r.Tags)));
    //Log(" {0}", string.Join("\n ", formattedResults));
    Logger.Log("");

#if DEBUG
    // Re-check the hand-picked test values against both the bloom filter and the exact set.
    foreach (var item in tests)
    {
        var possiblyExists = bloomFilter.PossiblyExists(item, debugInfo: true);
        Logger.Log("Bloom Filter.PossiblyExists - {0,8} = {1}", item, possiblyExists);
        Logger.Log(" DebuggingHashSet.Contains - {0,8} = {1}", item, debugging.Contains(item));
        Logger.Log("");
    }
    // When the values in "tests" represent Question Id
    //var testResults = tests.Select(t => questions.First(qu => qu.Id == t))
    //                       .Select(r => string.Format("Id: {0,8}, {1}: {2,4}, Tags: {3}, ", r.Id, type, fieldSelector(r), string.Join(",", r.Tags)));
    // When the values in "tests" represent array indexes, i.e. questions[x]
    var testResults = tests.Select(t => questions[t])
        .Select(r => string.Format("Id: {0,8}, {1}: {2,4}, Tags: {3}, ", r.Id, type, fieldSelector(r), string.Join(",", r.Tags)));
    Logger.Log(" {0}", string.Join("\n ", testResults));
#endif
    return(result);
}