/// <summary>
/// Compute the union of the provided sets. This method is much faster than
/// computing the union manually since it operates directly at the byte level.
/// </summary>
/// <param name="docIdSets">The sets to merge. May be empty.</param>
/// <param name="indexInterval">Index interval handed to the <c>WordBuilder</c> that assembles the result.</param>
/// <returns>
/// <c>EMPTY</c> when no set is supplied or when the first word of the queue's top
/// iterator is already at <see cref="int.MaxValue"/>; the single supplied set itself
/// (not a copy) when exactly one set is supplied; otherwise a newly built set.
/// </returns>
/// <exception cref="System.ArgumentNullException"><paramref name="docIdSets"/> is <c>null</c>.</exception>
public static WAH8DocIdSet Union(ICollection<WAH8DocIdSet> docIdSets, int indexInterval)
{
    if (docIdSets == null)
    {
        throw new System.ArgumentNullException("docIdSets");
    }

    switch (docIdSets.Count)
    {
        case 0:
            return EMPTY;

        case 1:
            // IEnumerator<T> is IDisposable: release it once the only element has been read.
            using (var single = docIdSets.GetEnumerator())
            {
                single.MoveNext();
                return single.Current;
            }
    }

    // The logic below is very similar to DisjunctionScorer
    int numSets = docIdSets.Count;
    PriorityQueue<Iterator> iterators = new PriorityQueueAnonymousInnerClassHelper(numSets);
    foreach (WAH8DocIdSet set in docIdSets)
    {
        // Position every iterator on its first word before it enters the queue.
        Iterator iterator = (Iterator)set.GetIterator();
        iterator.NextWord();
        iterators.Add(iterator);
    }

    Iterator top = iterators.Top();
    if (top.WordNum == int.MaxValue)
    {
        // The queue's top iterator has no word at all; this method treats
        // int.MaxValue as the "no more words" marker.
        // NOTE(review): presumably the queue orders by WordNum, so every set is empty here - confirm.
        return EMPTY;
    }

    // wordNum/word accumulate the word currently being merged.
    int wordNum = top.WordNum;
    byte word = top.Word;
    WordBuilder builder = (WordBuilder)(new WordBuilder()).SetIndexInterval(indexInterval);

    while (true)
    {
        // Move the iterator we just consumed forward, then let the queue pick its new top.
        top.NextWord();
        iterators.UpdateTop();
        top = iterators.Top();

        if (top.WordNum == wordNum)
        {
            // Another set has bits in the same word: OR them into the accumulator.
            word |= top.Word;
        }
        else
        {
            // The top moved to a different word number: flush the accumulated word.
            builder.AddWord(wordNum, word);
            if (top.WordNum == int.MaxValue)
            {
                break;
            }

            wordNum = top.WordNum;
            word = top.Word;
        }
    }

    return builder.Build();
}
/// <summary>
/// Compute the intersection of the provided sets. This method is much faster than
/// computing the intersection manually since it operates directly at the byte level.
/// </summary>
/// <param name="docIdSets">The sets to intersect. Must contain at least one set.</param>
/// <param name="indexInterval">Index interval handed to the <c>WordBuilder</c> that assembles the result.</param>
/// <returns>
/// The single supplied set itself (not a copy) when exactly one set is supplied;
/// otherwise a newly built set containing, for every word number present in all
/// sets, the bitwise AND of the words (words whose AND is zero are skipped).
/// </returns>
/// <exception cref="System.ArgumentNullException"><paramref name="docIdSets"/> is <c>null</c>.</exception>
/// <exception cref="System.ArgumentException"><paramref name="docIdSets"/> is empty.</exception>
public static WAH8DocIdSet Intersect(ICollection<WAH8DocIdSet> docIdSets, int indexInterval)
{
    if (docIdSets == null)
    {
        throw new System.ArgumentNullException("docIdSets");
    }

    switch (docIdSets.Count)
    {
        case 0:
            throw new System.ArgumentException("There must be at least one set to intersect");

        case 1:
            // IEnumerator<T> is IDisposable: release it once the only element has been read.
            using (var single = docIdSets.GetEnumerator())
            {
                single.MoveNext();
                return single.Current;
            }
    }

    // The logic below is similar to ConjunctionScorer
    int numSets = docIdSets.Count;
    var iterators = new Iterator[numSets];
    int i = 0;
    foreach (WAH8DocIdSet set in docIdSets)
    {
        iterators[i++] = (Iterator)set.GetIterator();
    }

    // Order the iterators with SERIALIZED_LENGTH_COMPARATOR so that iterators[0]
    // is the one that leads the search.
    // NOTE(review): presumably the set with the shortest serialized form - confirm.
    Array.Sort(iterators, SERIALIZED_LENGTH_COMPARATOR);

    WordBuilder builder = (WordBuilder)(new WordBuilder()).SetIndexInterval(indexInterval);
    int wordNum = 0;

    while (true)
    {
        // Advance the least costly iterator first
        iterators[0].AdvanceWord(wordNum);
        wordNum = iterators[0].WordNum;
        if (wordNum == DocIdSetIterator.NO_MORE_DOCS)
        {
            break;
        }

        byte word = iterators[0].Word;
        bool isCommonWord = true;

        for (int k = 1; k < numSets; ++k)
        {
            if (iterators[k].WordNum < wordNum)
            {
                iterators[k].AdvanceWord(wordNum);
            }

            if (iterators[k].WordNum > wordNum)
            {
                // This set has no word at wordNum; restart the search from the
                // word number it landed on.
                wordNum = iterators[k].WordNum;
                isCommonWord = false;
                break;
            }

            Debug.Assert(iterators[k].WordNum == wordNum);
            word &= iterators[k].Word;
            if (word == 0)
            {
                // There are common words, but they don't share any bit
                ++wordNum;
                isCommonWord = false;
                break;
            }
        }

        if (isCommonWord)
        {
            // Found a common word
            Debug.Assert(word != 0);
            builder.AddWord(wordNum, word);
            ++wordNum;
        }
    }

    return builder.Build();
}