internal void Flush(string fieldName, FieldsConsumer consumer, SegmentWriteState state)
{
    if (!fieldInfo.IsIndexed)
    {
        return; // nothing to flush, don't bother the codec with the unindexed field
    }

    TermsConsumer termsConsumer = consumer.AddField(fieldInfo);
    IComparer<BytesRef> termComp = termsConsumer.Comparer;

    // CONFUSING: this.indexOptions holds the index options
    // that were current when we first saw this field. But
    // it's possible this has changed, eg when other
    // documents are indexed that cause a "downgrade" of the
    // IndexOptions. So we must decode the in-RAM buffer
    // according to this.indexOptions, but then write the
    // new segment to the directory according to
    // currentFieldIndexOptions:
    IndexOptions currentFieldIndexOptions = fieldInfo.IndexOptions;
    Debug.Assert(currentFieldIndexOptions != IndexOptions.NONE);

    bool writeTermFreq = currentFieldIndexOptions.CompareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
    bool writePositions = currentFieldIndexOptions.CompareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
    bool writeOffsets = currentFieldIndexOptions.CompareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;

    bool readTermFreq = this.hasFreq;
    bool readPositions = this.hasProx;
    bool readOffsets = this.hasOffsets;

    //System.out.println("flush readTF=" + readTermFreq + " readPos=" + readPositions + " readOffs=" + readOffsets);

    // Make sure FieldInfo.update is working correctly!:
    Debug.Assert(!writeTermFreq || readTermFreq);
    Debug.Assert(!writePositions || readPositions);
    Debug.Assert(!writeOffsets || readOffsets);
    Debug.Assert(!writeOffsets || writePositions);

    IDictionary<Term, int?> segDeletes;
    if (state.SegUpdates != null && state.SegUpdates.terms.Count > 0)
    {
        segDeletes = state.SegUpdates.terms;
    }
    else
    {
        segDeletes = null;
    }

    int[] termIDs = termsHashPerField.SortPostings(termComp);
    int numTerms = termsHashPerField.bytesHash.Count;
    BytesRef text = new BytesRef();
    FreqProxPostingsArray postings = (FreqProxPostingsArray)termsHashPerField.postingsArray;
    ByteSliceReader freq = new ByteSliceReader();
    ByteSliceReader prox = new ByteSliceReader();

    FixedBitSet visitedDocs = new FixedBitSet(state.SegmentInfo.DocCount);
    long sumTotalTermFreq = 0;
    long sumDocFreq = 0;

    Term protoTerm = new Term(fieldName);
    for (int i = 0; i < numTerms; i++)
    {
        int termID = termIDs[i];

        // Get BytesRef
        int textStart = postings.textStarts[termID];
        termsHashPerField.bytePool.SetBytesRef(text, textStart);

        termsHashPerField.InitReader(freq, termID, 0);
        if (readPositions || readOffsets)
        {
            termsHashPerField.InitReader(prox, termID, 1);
        }

        // TODO: really TermsHashPerField should take over most
        // of this loop, including merge sort of terms from
        // multiple threads and interacting with the
        // TermsConsumer, only calling out to us (passing us the
        // DocsConsumer) to handle delivery of docs/positions

        PostingsConsumer postingsConsumer = termsConsumer.StartTerm(text);

        int? delDocLimit;
        if (segDeletes != null)
        {
            protoTerm.Bytes = text;
            int? docIDUpto;
            segDeletes.TryGetValue(protoTerm, out docIDUpto);
            if (docIDUpto != null)
            {
                delDocLimit = docIDUpto;
            }
            else
            {
                delDocLimit = 0;
            }
        }
        else
        {
            delDocLimit = 0;
        }

        // Now termStates has numToMerge FieldMergeStates
        // which all share the same term. Now we must
        // interleave the docID streams.
        int docFreq = 0;
        long totalTermFreq = 0;
        int docID = 0;

        while (true)
        {
            //System.out.println(" cycle");
            int termFreq;
            if (freq.Eof())
            {
                if (postings.lastDocCodes[termID] != -1)
                {
                    // Return last doc
                    docID = postings.lastDocIDs[termID];
                    if (readTermFreq)
                    {
                        termFreq = postings.termFreqs[termID];
                    }
                    else
                    {
                        termFreq = -1;
                    }
                    postings.lastDocCodes[termID] = -1;
                }
                else
                {
                    // EOF
                    break;
                }
            }
            else
            {
                int code = freq.ReadVInt32();
                if (!readTermFreq)
                {
                    docID += code;
                    termFreq = -1;
                }
                else
                {
                    docID += (int)((uint)code >> 1);
                    if ((code & 1) != 0)
                    {
                        termFreq = 1;
                    }
                    else
                    {
                        termFreq = freq.ReadVInt32();
                    }
                }

                Debug.Assert(docID != postings.lastDocIDs[termID]);
            }

            docFreq++;
            Debug.Assert(docID < state.SegmentInfo.DocCount, "doc=" + docID + " maxDoc=" + state.SegmentInfo.DocCount);

            // NOTE: we could check here if the docID was
            // deleted, and skip it. However, this is somewhat
            // dangerous because it can yield non-deterministic
            // behavior since we may see the docID before we see
            // the term that caused it to be deleted. this
            // would mean some (but not all) of its postings may
            // make it into the index, which'd alter the docFreq
            // for those terms. We could fix this by doing two
            // passes, ie first sweep marks all del docs, and
            // 2nd sweep does the real flush, but I suspect
            // that'd add too much time to flush.
            visitedDocs.Set(docID);
            postingsConsumer.StartDoc(docID, writeTermFreq ? termFreq : -1);
            if (docID < delDocLimit)
            {
                // Mark it deleted. TODO: we could also skip
                // writing its postings; this would be
                // deterministic (just for this Term's docs).

                // TODO: can we do this reach-around in a cleaner way????
                if (state.LiveDocs == null)
                {
                    state.LiveDocs = docState.docWriter.codec.LiveDocsFormat.NewLiveDocs(state.SegmentInfo.DocCount);
                }
                if (state.LiveDocs.Get(docID))
                {
                    state.DelCountOnFlush++;
                    state.LiveDocs.Clear(docID);
                }
            }

            totalTermFreq += termFreq;

            // Carefully copy over the prox + payload info,
            // changing the format to match Lucene's segment
            // format.

            if (readPositions || readOffsets)
            {
                // we did record positions (& maybe payload) and/or offsets
                int position = 0;
                int offset = 0;
                for (int j = 0; j < termFreq; j++)
                {
                    BytesRef thisPayload;

                    if (readPositions)
                    {
                        int code = prox.ReadVInt32();
                        position += (int)((uint)code >> 1);

                        if ((code & 1) != 0)
                        {
                            // this position has a payload
                            int payloadLength = prox.ReadVInt32();

                            if (payload == null)
                            {
                                payload = new BytesRef();
                                payload.Bytes = new byte[payloadLength];
                            }
                            else if (payload.Bytes.Length < payloadLength)
                            {
                                payload.Grow(payloadLength);
                            }

                            prox.ReadBytes(payload.Bytes, 0, payloadLength);
                            payload.Length = payloadLength;
                            thisPayload = payload;
                        }
                        else
                        {
                            thisPayload = null;
                        }

                        if (readOffsets)
                        {
                            int startOffset = offset + prox.ReadVInt32();
                            int endOffset = startOffset + prox.ReadVInt32();
                            if (writePositions)
                            {
                                if (writeOffsets)
                                {
                                    Debug.Assert(startOffset >= 0 && endOffset >= startOffset, "startOffset=" + startOffset + ",endOffset=" + endOffset + ",offset=" + offset);
                                    postingsConsumer.AddPosition(position, thisPayload, startOffset, endOffset);
                                }
                                else
                                {
                                    postingsConsumer.AddPosition(position, thisPayload, -1, -1);
                                }
                            }
                            offset = startOffset;
                        }
                        else if (writePositions)
                        {
                            postingsConsumer.AddPosition(position, thisPayload, -1, -1);
                        }
                    }
                }
            }
            postingsConsumer.FinishDoc();
        }
        termsConsumer.FinishTerm(text, new TermStats(docFreq, writeTermFreq ? totalTermFreq : -1));
        sumTotalTermFreq += totalTermFreq;
        sumDocFreq += docFreq;
    }

    termsConsumer.Finish(writeTermFreq ? sumTotalTermFreq : -1, sumDocFreq, visitedDocs.Cardinality());
}
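// ---------------------------------------------------------------------------
// Illustrative sketch, not part of the original class: the decode branch in
// Flush() reads each posting from the in-RAM freq stream as a VInt "code"
// whose upper bits hold the doc delta and whose low bit flags the common
// termFreq == 1 case (so the frequency VInt is omitted); otherwise the
// frequency follows as a separate VInt. The hypothetical helpers below only
// mirror that convention with plain integers, assuming the write side packs
// the code the same way; they do not touch ByteSliceWriter/ByteSliceReader.
internal static class FreqCodeSketch
{
    // Pack one posting: returns the code value plus an optional explicit
    // frequency that would have to be written immediately after it.
    internal static (int Code, int? ExplicitFreq) Encode(int docDelta, int termFreq)
    {
        if (termFreq == 1)
        {
            return ((docDelta << 1) | 1, null); // low bit set => freq is implicitly 1
        }
        return (docDelta << 1, termFreq);       // low bit clear => freq VInt follows
    }

    // Unpack mirrors the branch in Flush(): shift to recover the delta,
    // test the low bit to decide whether a frequency VInt follows.
    internal static (int DocDelta, bool FreqFollows) Decode(int code)
    {
        int docDelta = (int)((uint)code >> 1);
        bool freqFollows = (code & 1) == 0;
        return (docDelta, freqFollows);
    }
}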
public virtual void TestBasic()
{
    // LUCENENET specific: NUnit will crash with an OOM if we do the full test
    // with verbosity enabled. So, making this a manual setting that can be
    // turned on if, and only if, needed for debugging. If the setting is turned
    // on, we are decreasing the number of iterations by 1/3, which seems to
    // keep it from crashing.
    bool isVerbose = false;
    if (!isVerbose)
    {
        Console.WriteLine("Verbosity disabled to keep NUnit from running out of memory - enable manually");
    }

    ByteBlockPool pool = new ByteBlockPool(new RecyclingByteBlockAllocator(ByteBlockPool.BYTE_BLOCK_SIZE, Random.Next(100)));

    int NUM_STREAM = AtLeast(100);

    ByteSliceWriter writer = new ByteSliceWriter(pool);

    int[] starts = new int[NUM_STREAM];
    int[] uptos = new int[NUM_STREAM];
    int[] counters = new int[NUM_STREAM];

    ByteSliceReader reader = new ByteSliceReader();

    for (int ti = 0; ti < 100; ti++)
    {
        for (int stream = 0; stream < NUM_STREAM; stream++)
        {
            starts[stream] = -1;
            counters[stream] = 0;
        }

        // LUCENENET NOTE: Since upgrading to NUnit 3, this test
        // will crash if VERBOSE is true because of an OutOfMemoryException.
        // This not only keeps this test from finishing, it crashes NUnit
        // and no other tests will run.
        // So, we need to allocate a smaller size to ensure this
        // doesn't happen with verbosity enabled.
        int num = isVerbose ? AtLeast(2000) : AtLeast(3000);
        for (int iter = 0; iter < num; iter++)
        {
            int stream;
            if (Random.NextBoolean())
            {
                stream = Random.Next(3);
            }
            else
            {
                stream = Random.Next(NUM_STREAM);
            }

            if (isVerbose)
            {
                Console.WriteLine("write stream=" + stream);
            }

            if (starts[stream] == -1)
            {
                int spot = pool.NewSlice(ByteBlockPool.FIRST_LEVEL_SIZE);
                starts[stream] = uptos[stream] = spot + pool.ByteOffset;
                if (isVerbose)
                {
                    Console.WriteLine(" init to " + starts[stream]);
                }
            }

            writer.Init(uptos[stream]);
            int numValue;
            if (Random.Next(10) == 3)
            {
                numValue = Random.Next(100);
            }
            else if (Random.Next(5) == 3)
            {
                numValue = Random.Next(3);
            }
            else
            {
                numValue = Random.Next(20);
            }

            for (int j = 0; j < numValue; j++)
            {
                if (isVerbose)
                {
                    Console.WriteLine(" write " + (counters[stream] + j));
                }
                // write some large (incl. negative) ints:
                writer.WriteVInt32(Random.Next());
                writer.WriteVInt32(counters[stream] + j);
            }
            counters[stream] += numValue;
            uptos[stream] = writer.Address;
            if (isVerbose)
            {
                Console.WriteLine(" addr now " + uptos[stream]);
            }
        }

        for (int stream = 0; stream < NUM_STREAM; stream++)
        {
            if (isVerbose)
            {
                Console.WriteLine(" stream=" + stream + " count=" + counters[stream]);
            }

            if (starts[stream] != -1 && starts[stream] != uptos[stream])
            {
                reader.Init(pool, starts[stream], uptos[stream]);
                for (int j = 0; j < counters[stream]; j++)
                {
                    reader.ReadVInt32();
                    Assert.AreEqual(j, reader.ReadVInt32());
                }
            }
        }

        pool.Reset();
    }
}
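// ---------------------------------------------------------------------------
// Minimal sketch, distilled from TestBasic above and not part of the original
// test class: a single-stream write/read round trip using only calls that
// appear in the test. A stream is started with NewSlice + ByteOffset, appended
// to by re-initializing the writer at the last known address, and read back by
// initializing the reader over the [start, end) range. The method name is
// hypothetical.
public virtual void SingleSliceRoundTripSketch()
{
    ByteBlockPool pool = new ByteBlockPool(
        new RecyclingByteBlockAllocator(ByteBlockPool.BYTE_BLOCK_SIZE, 10));
    ByteSliceWriter writer = new ByteSliceWriter(pool);

    // Start one stream and remember its absolute start address in the pool.
    int start = pool.NewSlice(ByteBlockPool.FIRST_LEVEL_SIZE) + pool.ByteOffset;
    int upto = start;

    writer.Init(upto);
    for (int i = 0; i < 10; i++)
    {
        writer.WriteVInt32(i);
    }
    upto = writer.Address; // current end of the written data

    // Read back only when something was written (start != upto), as TestBasic does.
    ByteSliceReader reader = new ByteSliceReader();
    reader.Init(pool, start, upto);
    for (int i = 0; i < 10; i++)
    {
        Assert.AreEqual(i, reader.ReadVInt32());
    }
}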