/// <summary>
/// Merge a list of sorted temporary files (partitions) into an output file.
/// </summary>
internal void MergePartitions(IList<FileInfo> merges, FileInfo outputFile)
{
    long start = J2N.Time.NanoTime() / J2N.Time.MillisecondsPerNanosecond; // LUCENENET: Use NanoTime() rather than CurrentTimeMilliseconds() for more accurate/reliable results
    var @out = new ByteSequencesWriter(outputFile);
    PriorityQueue<FileAndTop> queue = new PriorityQueueAnonymousClass(this, merges.Count);
    var streams = new ByteSequencesReader[merges.Count];
    try
    {
        // Open a stream for each file and read its first (top) line
        for (int i = 0; i < merges.Count; i++)
        {
            streams[i] = new ByteSequencesReader(merges[i]);
            byte[] line = streams[i].Read();
            if (line != null)
            {
                queue.InsertWithOverflow(new FileAndTop(i, line));
            }
        }

        // The Unix sort utility uses an ordered array of files to pick the next line from,
        // updating it as it reads new lines. The PQ used here is a more elegant solution and has
        // a nicer theoretical complexity bound :) The entire sorting process is I/O bound anyway,
        // so it shouldn't make much of a difference (didn't check).
        FileAndTop top;
        while ((top = queue.Top) != null)
        {
            @out.Write(top.Current);
            if (!streams[top.Fd].Read(top.Current))
            {
                queue.Pop();
            }
            else
            {
                queue.UpdateTop();
            }
        }

        sortInfo.MergeTime += (J2N.Time.NanoTime() / J2N.Time.MillisecondsPerNanosecond) - start; // LUCENENET: Use NanoTime() rather than CurrentTimeMilliseconds() for more accurate/reliable results
        sortInfo.MergeRounds++;
    }
    finally
    {
        // The logic below is: if closing @out throws, that exception takes priority
        // over any exception thrown while closing the individual streams.
        try
        {
            IOUtils.Dispose(streams);
        }
        finally
        {
            IOUtils.Dispose(@out);
        }
    }
}
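// To make the merge pattern above concrete, here is a minimal, self-contained
// sketch of the same priority-queue-based k-way merge. It is an illustration,
// not Lucene.NET's implementation: it assumes .NET 6+'s
// System.Collections.Generic.PriorityQueue<TElement, TPriority> in place of
// Lucene.Net.Util.PriorityQueue<T>, and merges in-memory int sequences
// rather than ByteSequencesReader streams.
using System.Collections.Generic;

internal static class KWayMergeSketch
{
    // Merges k individually sorted sequences into one sorted sequence.
    // Each enumerator plays the role of a FileAndTop: the queue is keyed
    // by the current (top) value of each input.
    public static IEnumerable<int> Merge(IEnumerable<IEnumerable<int>> sortedInputs)
    {
        var queue = new PriorityQueue<IEnumerator<int>, int>();

        // Seed the queue with the head of every non-empty input
        // (mirrors the InsertWithOverflow loop above).
        foreach (var input in sortedInputs)
        {
            IEnumerator<int> e = input.GetEnumerator();
            if (e.MoveNext())
            {
                queue.Enqueue(e, e.Current);
            }
        }

        // Repeatedly emit the smallest head; advance that input and re-queue
        // it, or drop it when exhausted (mirrors Pop() / UpdateTop() above).
        while (queue.TryDequeue(out IEnumerator<int> cursor, out int value))
        {
            yield return value;
            if (cursor.MoveNext())
            {
                queue.Enqueue(cursor, cursor.Current);
            }
        }
    }
}

// Usage: KWayMergeSketch.Merge(new[] { new[] { 1, 4, 9 }, new[] { 2, 3, 8 } })
// yields 1, 2, 3, 4, 8, 9.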
/// <summary>
/// Compute the union of the provided sets. This method is much faster than
/// computing the union manually since it operates directly at the byte level.
/// </summary>
public static WAH8DocIdSet Union(ICollection<WAH8DocIdSet> docIdSets, int indexInterval)
{
    switch (docIdSets.Count)
    {
        case 0:
            return EMPTY;
        case 1:
            return docIdSets.First();
    }

    // The logic below is very similar to DisjunctionScorer
    int numSets = docIdSets.Count;
    PriorityQueue<Iterator> iterators = new PriorityQueueAnonymousClass(numSets);
    foreach (WAH8DocIdSet set in docIdSets)
    {
        Iterator iterator = (Iterator)set.GetIterator();
        iterator.NextWord();
        iterators.Add(iterator);
    }

    Iterator top = iterators.Top;
    if (top.wordNum == int.MaxValue)
    {
        return EMPTY;
    }

    int wordNum = top.wordNum;
    byte word = top.word;
    WordBuilder builder = (WordBuilder)(new WordBuilder()).SetIndexInterval(indexInterval);
    while (true)
    {
        top.NextWord();
        iterators.UpdateTop();
        top = iterators.Top;
        if (top.wordNum == wordNum)
        {
            word |= top.word;
        }
        else
        {
            builder.AddWord(wordNum, word);
            if (top.wordNum == int.MaxValue)
            {
                break;
            }
            wordNum = top.wordNum;
            word = top.word;
        }
    }
    return builder.Build();
}
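// A hedged usage sketch of the method above. It assumes the
// WAH8DocIdSet.Builder API (a chainable Add(int docId) plus Build()) and
// DocIdSetIterator.NO_MORE_DOCS from Lucene.NET 4.8; the index interval of 8
// matches the default used elsewhere in this class.
using System.Collections.Generic;
using Lucene.Net.Search;
using Lucene.Net.Util;

public static class Wah8UnionExample
{
    public static void Main()
    {
        // Build two small sets (Builder API assumed as described above).
        WAH8DocIdSet a = new WAH8DocIdSet.Builder().Add(1).Add(3).Add(64).Build();
        WAH8DocIdSet b = new WAH8DocIdSet.Builder().Add(3).Add(5).Build();

        // Word-level union of the two sets: expected docs are 1, 3, 5, 64.
        WAH8DocIdSet union = WAH8DocIdSet.Union(new List<WAH8DocIdSet> { a, b }, 8);

        DocIdSetIterator it = union.GetIterator();
        for (int doc = it.NextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.NextDoc())
        {
            System.Console.WriteLine(doc);
        }
    }
}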
public void TestRandomIndex()
{
    Directory dir = NewDirectory();
    MockAnalyzer analyzer = new MockAnalyzer(Random);
    analyzer.MaxTokenLength = TestUtil.NextInt32(Random, 1, IndexWriter.MAX_TERM_LENGTH);
    RandomIndexWriter w = new RandomIndexWriter(
#if FEATURE_INSTANCE_TESTDATA_INITIALIZATION
        this,
#endif
        Random, dir, analyzer);
    CreateRandomIndex(AtLeast(50), w, Random.NextInt64());
    DirectoryReader reader = w.GetReader();
    AtomicReader wrapper = SlowCompositeReaderWrapper.Wrap(reader);
    string field = "body";
    Terms terms = wrapper.GetTerms(field);
    var lowFreqQueue = new PriorityQueueAnonymousClass(5);
    var highFreqQueue = new PriorityQueueAnonymousClass1(5);
    try
    {
        TermsEnum iterator = terms.GetEnumerator();
        while (iterator.MoveNext())
        {
            if (highFreqQueue.Count < 5)
            {
                highFreqQueue.Add(new TermAndFreq(BytesRef.DeepCopyOf(iterator.Term), iterator.DocFreq));
                lowFreqQueue.Add(new TermAndFreq(BytesRef.DeepCopyOf(iterator.Term), iterator.DocFreq));
            }
            else
            {
                if (highFreqQueue.Top.freq < iterator.DocFreq)
                {
                    highFreqQueue.Top.freq = iterator.DocFreq;
                    highFreqQueue.Top.term = BytesRef.DeepCopyOf(iterator.Term);
                    highFreqQueue.UpdateTop();
                }
                if (lowFreqQueue.Top.freq > iterator.DocFreq)
                {
                    lowFreqQueue.Top.freq = iterator.DocFreq;
                    lowFreqQueue.Top.term = BytesRef.DeepCopyOf(iterator.Term);
                    lowFreqQueue.UpdateTop();
                }
            }
        }

        int lowFreq = lowFreqQueue.Top.freq;
        int highFreq = highFreqQueue.Top.freq;
        AssumeTrue("unlucky index", highFreq - 1 > lowFreq);
        List<TermAndFreq> highTerms = QueueToList(highFreqQueue);
        List<TermAndFreq> lowTerms = QueueToList(lowFreqQueue);

        IndexSearcher searcher = NewSearcher(reader);
        Occur lowFreqOccur = RandomOccur(Random);
        BooleanQuery verifyQuery = new BooleanQuery();
        CommonTermsQuery cq = new CommonTermsQuery(RandomOccur(Random), lowFreqOccur, highFreq - 1, Random.NextBoolean());
        foreach (TermAndFreq termAndFreq in lowTerms)
        {
            cq.Add(new Term(field, termAndFreq.term));
            verifyQuery.Add(new BooleanClause(new TermQuery(new Term(field, termAndFreq.term)), lowFreqOccur));
        }
        foreach (TermAndFreq termAndFreq in highTerms)
        {
            cq.Add(new Term(field, termAndFreq.term));
        }

        TopDocs cqSearch = searcher.Search(cq, reader.MaxDoc);
        TopDocs verifySearch = searcher.Search(verifyQuery, reader.MaxDoc);
        assertEquals(verifySearch.TotalHits, cqSearch.TotalHits);
        var hits = new JCG.HashSet<int>();
        foreach (ScoreDoc doc in verifySearch.ScoreDocs)
        {
            hits.Add(doc.Doc);
        }
        foreach (ScoreDoc doc in cqSearch.ScoreDocs)
        {
            assertTrue(hits.Remove(doc.Doc));
        }
        assertTrue(hits.Count == 0);

        /*
         * Need to force merge here since QueryUtils adds checks based
         * on leaf readers, which have different statistics than the
         * top-level reader if we have more than one segment. This could
         * result in a different query / results.
         */
        w.ForceMerge(1);
        DirectoryReader reader2 = w.GetReader();
        QueryUtils.Check(
#if FEATURE_INSTANCE_TESTDATA_INITIALIZATION
            this,
#endif
            Random, cq, NewSearcher(reader2));
        reader2.Dispose();
    }
    finally
    {
        reader.Dispose();
        wrapper.Dispose();
        w.Dispose();
        dir.Dispose();
    }
}
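// The two anonymous queue classes used above are not shown in this excerpt.
// The following is a sketch of what they plausibly contain, inferred from how
// the test uses them (the class names and the exact LessThan signature are
// assumptions, mirroring the upstream Lucene test). Lucene.Net.Util.PriorityQueue<T>
// is specialized by overriding LessThan, so flipping the comparison turns the
// same structure from a min-heap into a max-heap.

// highFreqQueue: keeps the 5 highest-frequency terms. Top is the *smallest*
// kept frequency, i.e. the first candidate to be replaced.
internal sealed class PriorityQueueAnonymousClass1Sketch : Lucene.Net.Util.PriorityQueue<TermAndFreq>
{
    public PriorityQueueAnonymousClass1Sketch(int size) : base(size) { }

    protected internal override bool LessThan(TermAndFreq a, TermAndFreq b)
        => a.freq < b.freq;
}

// lowFreqQueue: keeps the 5 lowest-frequency terms. The comparison is
// inverted so Top is the *largest* kept frequency, evicted first.
internal sealed class PriorityQueueAnonymousClassSketch : Lucene.Net.Util.PriorityQueue<TermAndFreq>
{
    public PriorityQueueAnonymousClassSketch(int size) : base(size) { }

    protected internal override bool LessThan(TermAndFreq a, TermAndFreq b)
        => a.freq > b.freq;
}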