/// <summary>
/// Re-indexes document <paramref name="recnum"/> against a new version of its text:
/// clears index bits for words that disappeared, sets bits for words that appeared,
/// and creates fresh index entries for words never seen before.
/// </summary>
/// <param name="recnum">Record/document number whose bits are updated.</param>
/// <param name="text">The new full text of the document.</param>
/// <returns>An <see cref="IndexResult"/> listing the words added (with frequency) and removed.</returns>
public IndexResult UpdateExistingIndex(int recnum, string text)
{
    Log("UpdateExistingIndex started");
    LogIndex();
    Log(string.Format("text size = {0}", text.Length));
    Dictionary<string, int> wordFrequencies = GenerateWordFreq(text);
    Log(string.Format("word count = {0}", wordFrequencies.Count));
    var result = new IndexResult { DocNumber = recnum };

    using (var bmp = CreateBitmapStream())
    {
        // Reconcile every already-indexed word with the new text in a single pass.
        foreach (KeyValuePair<string, Cache> vc in _index)
        {
            string indexedWord = vc.Key;
            Cache cache = vc.Value;
            LoadCacheIfNotLoaded(cache, bmp);

            bool isWordIndexed = cache.GetBitmap().Get(recnum);
            // TryGetValue instead of ContainsKey + indexer: one hash lookup, not two.
            int frequency;
            bool isWordInText = wordFrequencies.TryGetValue(indexedWord, out frequency);

            if (isWordIndexed && isWordInText)
            {
                // Word unchanged: consume it so only brand-new words remain below.
                wordFrequencies.Remove(indexedWord);
            }
            else if (isWordIndexed)
            {
                // Word vanished from the document: clear its bit.
                result.WordsRemoved.Add(indexedWord);
                cache.SetBit(recnum, false);
            }
            else if (isWordInText)
            {
                // Known word newly present in this document: set its bit.
                result.WordsAdded.Add(indexedWord, frequency);
                wordFrequencies.Remove(indexedWord);
                cache.SetBit(recnum, true);
            }
        }
    }

    // Whatever survived the loop was never indexed: create new cache entries for it.
    foreach (var wordFrequency in wordFrequencies)
    {
        result.WordsAdded.Add(wordFrequency.Key, wordFrequency.Value);
        var cache = new Cache { isLoaded = true };
        cache.SetBit(recnum, true);
        _index.Add(wordFrequency.Key, cache);
    }

    LogIndex();
    Log("UpdateExistingIndex ended");
    return result;
}
/// <summary>
/// Adds document <paramref name="recnum"/> to the index: sets the document's bit for
/// every word of <paramref name="text"/>, creating index entries for unseen words.
/// </summary>
/// <param name="recnum">Record/document number to index.</param>
/// <param name="text">The document's full text.</param>
/// <returns>An <see cref="IndexResult"/> with every word of the text in <c>WordsAdded</c>.</returns>
private IndexResult AddtoIndex(int recnum, string text /*, bool optimizeIndexes*/)
{
    _log("text size = " + text.Length);
    Dictionary<string, int> wordFrequences = GenerateWordFreq(text);
    _log("word count = " + wordFrequences.Count);
    var result = new IndexResult { DocNumber = recnum };

    using (var bmp = CreateBitmapStream())
    {
        // Iterate key/value pairs so the frequency comes along for free
        // (the original iterated .Keys and re-indexed the dictionary per word).
        foreach (KeyValuePair<string, int> wordFrequency in wordFrequences)
        {
            string word = wordFrequency.Key;
            Cache cache;
            if (_index.TryGetValue(word, out cache))
            {
                // Existing word: make sure its bitmap is in memory, then flag this doc.
                LoadCacheIfNotLoaded(cache, bmp);
                cache.SetBit(recnum, true);
            }
            else
            {
                // First time this word is seen: new in-memory cache entry.
                cache = new Cache { isLoaded = true };
                cache.SetBit(recnum, true);
                _index.Add(word, cache);
            }
            result.WordsAdded.Add(word, wordFrequency.Value);
        }
    }
    return result;
}
/// <summary>
/// Evaluates a space-separated query against the index and returns the matching
/// documents as a bitmap. Each term may be prefixed with <c>+</c> (AND) or
/// <c>-</c> (AND NOT); unprefixed terms are OR-ed. Terms containing <c>*</c> or
/// <c>?</c> are treated as wildcards and resolved in a single parallel scan of
/// the index once the last term is reached. Deleted documents are masked out.
/// </summary>
/// <param name="filter">The query string, e.g. "+apple -banana ch*".</param>
/// <param name="freeCache">When true, unloads every bitmap touched by this query afterwards.</param>
/// <returns>The result bitmap; an empty <see cref="WAHBitArray"/> when nothing matched.</returns>
public WAHBitArray ExecutionPlan(string filter, bool freeCache = false)
{
    _log(string.Format("Hoot::ExecutionPlan start freeCache is {0}", freeCache));
    _log(string.Format("query : {0}", filter));
    DateTime now = FastDateTime.Now;
    // Longest terms first: most selective words tend to be evaluated early.
    string[] words = filter.Split(' ').OrderByDescending(w => w.Length).ToArray();

    WAHBitArray bits = null;
    var wildcardMatchers = new List<WildcardMatcher>();
    var cacheToFree = new List<Tuple<string, Cache>>();

    for (int i = 0; i < words.Length; i++)
    {
        string originWord = words[i];
        string preparedWord = words[i];
        var op = Cache.OPERATION.OR;
        if (originWord.StartsWith("+"))
        {
            op = Cache.OPERATION.AND;
            // BUGFIX: strip only the leading operator. Replace("+", "") removed every
            // '+' in the word, corrupting terms like "+c++" into "c".
            preparedWord = originWord.Substring(1);
        }
        else if (originWord.StartsWith("-"))
        {
            op = Cache.OPERATION.ANDNOT;
            // BUGFIX: same as above — "-e-mail" used to become "email".
            preparedWord = originWord.Substring(1);
        }

        if (originWord.Contains("*") || originWord.Contains("?"))
        {
            // Wildcard terms are collected and resolved together after the last word.
            wildcardMatchers.Add(new WildcardMatcher(originWord.ToLower()));
        }
        else
        {
            Cache c;
            var lowerWord = preparedWord.ToLowerInvariant();
            if (_index.TryGetValue(lowerWord, out c))
            {
                LoadCacheIfNotLoaded(c);
                cacheToFree.Add(Tuple.Create(lowerWord, c));
                bits = DoBitOperation(bits, c, op);
            }
            else if (op == Cache.OPERATION.AND)
            {
                // AND with a word that is not indexed must force an empty result;
                // feed an all-false bitmap through the same operation path.
                var cache = new Cache { isLoaded = true };
                cache.SetBit(0, false);
                bits = DoBitOperation(bits, cache, op);
            }
        }

        if (i == words.Length - 1)
        {
            //asc: brutal hack - only for wildcards
            op = Cache.OPERATION.AND;
            WAHBitArray wildbits = null;

            // One parallel sweep over all index keys; each key is claimed by the
            // first matcher it satisfies (break), so no key is counted twice.
            var foundMatcherWords = wildcardMatchers.ToDictionary(w => w, _ => new ConcurrentQueue<string>());
            Parallel.ForEach(_index.Keys, w =>
            {
                foreach (var matcher in wildcardMatchers)
                {
                    if (matcher.IsMatch(w))
                    {
                        foundMatcherWords[matcher].Enqueue(w);
                        break;
                    }
                }
            });

            var loadWatch = Stopwatch.StartNew();
            using (var bmp = CreateBitmapStream())
            {
                // Load every matched word's bitmap up front, sharing one stream.
                foreach (string key in foundMatcherWords.Values.SelectMany(_ => _))
                {
                    var c = _index[key];
                    LoadCacheIfNotLoaded(c, bmp);
                    cacheToFree.Add(Tuple.Create(key, c));
                }
            }
            _log(string.Format("Hoot wildcard load operation: {0} ms", loadWatch.Elapsed.TotalMilliseconds));

            var bitWatch = Stopwatch.StartNew();
            // Per matcher: OR together all of its matched words. A matcher with no
            // matches contributes an all-false plan so the final AND yields nothing.
            var matcherPlans = foundMatcherWords.Select(p =>
            {
                if (p.Value.Count == 0)
                {
                    var falsePlan = new WAHBitArray();
                    falsePlan.Set(0, false);
                    return falsePlan;
                }
                WAHBitArray matcherPlan = null;
                foreach (string word in p.Value)
                {
                    matcherPlan = DoBitOperation(matcherPlan, _index[word], Cache.OPERATION.OR);
                }
                return matcherPlan;
            }).Where(p => p != null).ToList();

            // AND the per-matcher plans together (every wildcard term must match).
            wildbits = matcherPlans.Aggregate(wildbits, (acc, matcherPlan) => acc != null ? acc.And(matcherPlan) : matcherPlan);
            _log(string.Format("Hoot wildcard bit operation: {0} ms", bitWatch.Elapsed.TotalMilliseconds));

            if (wildbits != null)
            {
                bits = bits == null ? wildbits : (op == Cache.OPERATION.AND ? bits.And(wildbits) : bits.Or(wildbits));
            }
        }
    }

    if (bits == null)
    {
        return new WAHBitArray();
    }

    // remove deleted docs — align lengths first so the AND is well-defined.
    if (bits.Length > _deleted.Length)
    {
        _deleted.Length = bits.Length;
    }
    else if (bits.Length < _deleted.Length)
    {
        bits.Length = _deleted.Length;
    }
    WAHBitArray result = bits.And(_deleted.Not());

    _log(string.Format("Hoot::ExecutionPlan freeCache is {0}", freeCache));
    if (freeCache)
    {
        foreach (var c in cacheToFree)
        {
            _log(string.Format("Free cache from ExecutionPlan::ExecutionPlan for {0}", c.Item1));
            c.Item2.FreeMemory(unload: true, freeUncompressedMemory: false);
        }
        //asc: clean cache buckets cache
        _hash.Commit();
    }
    _log(string.Format("query time (ms) = {0}", FastDateTime.Now.Subtract(now).TotalMilliseconds));
    return result;
}