public void InitReader(ByteSliceReader reader, RawPostingList p, int stream)
{
    System.Diagnostics.Debug.Assert(stream < streamCount);
    int[] ints = intPool.buffers[p.intStart >> DocumentsWriter.INT_BLOCK_SHIFT];
    int upto = p.intStart & DocumentsWriter.INT_BLOCK_MASK;
    reader.Init(bytePool, p.byteStart + stream * ByteBlockPool.FIRST_LEVEL_SIZE, ints[upto + stream]);
}
public void InitReader(ByteSliceReader reader, int termID, int stream)
{
    Debug.Assert(stream < streamCount);
    int intStart = postingsArray.intStarts[termID];
    int[] ints = intPool.Buffers[intStart >> Int32BlockPool.INT32_BLOCK_SHIFT];
    int upto = intStart & Int32BlockPool.INT32_BLOCK_MASK;
    reader.Init(bytePool, postingsArray.byteStarts[termID] + stream * ByteBlockPool.FIRST_LEVEL_SIZE, ints[upto + stream]);
}
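The InitReader variants above all turn a term's pool-global intStart/byteStart offsets into a (buffer, offset-within-buffer) pair with a shift and a mask, and seek the reader to stream `s` at `byteStart + s * FIRST_LEVEL_SIZE`. The following standalone sketch shows just that addressing arithmetic; it is not the Lucene.NET implementation, and the block size of 8192 ints (shift 13, mask 0x1FFF) is an assumption mirroring the Int32BlockPool constants referenced above.

```csharp
// Minimal sketch of paged-buffer addressing, assuming 2^13 ints per block.
using System;

public static class BlockAddressingSketch
{
    private const int BlockShift = 13;              // assumed block shift
    private const int BlockSize = 1 << BlockShift;  // 8192
    private const int BlockMask = BlockSize - 1;    // 0x1FFF

    // Splits a pool-global offset into (buffer index, slot within buffer),
    // the same `>> SHIFT` / `& MASK` arithmetic used by InitReader above.
    public static (int buffer, int upto) Locate(int globalStart)
    {
        return (globalStart >> BlockShift, globalStart & BlockMask);
    }

    public static void Main()
    {
        // e.g. global offset 20000 lands in buffer 2, slot 3616 (20000 - 2 * 8192)
        var (buffer, upto) = Locate(20000);
        Console.WriteLine($"buffer={buffer} upto={upto}");
    }
}
```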
/* Walk through all unique text tokens (Posting
 * instances) found in this field and serialize them
 * into a single RAM segment. */
internal void Flush(string fieldName, FieldsConsumer consumer, SegmentWriteState state)
{
    if (!fieldInfo.Indexed)
    {
        return; // nothing to flush, don't bother the codec with the unindexed field
    }

    TermsConsumer termsConsumer = consumer.AddField(fieldInfo);
    IComparer<BytesRef> termComp = termsConsumer.Comparator;

    // CONFUSING: this.indexOptions holds the index options
    // that were current when we first saw this field. But
    // it's possible this has changed, eg when other
    // documents are indexed that cause a "downgrade" of the
    // IndexOptions. So we must decode the in-RAM buffer
    // according to this.indexOptions, but then write the
    // new segment to the directory according to
    // currentFieldIndexOptions:
    FieldInfo.IndexOptions? currentFieldIndexOptions = fieldInfo.FieldIndexOptions;
    Debug.Assert(currentFieldIndexOptions != null);

    bool writeTermFreq = currentFieldIndexOptions >= FieldInfo.IndexOptions.DOCS_AND_FREQS;
    bool writePositions = currentFieldIndexOptions >= FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS;
    bool writeOffsets = currentFieldIndexOptions >= FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS;

    bool readTermFreq = this.HasFreq;
    bool readPositions = this.HasProx;
    bool readOffsets = this.HasOffsets;

    //System.out.println("flush readTF=" + readTermFreq + " readPos=" + readPositions + " readOffs=" + readOffsets);

    // Make sure FieldInfo.update is working correctly!:
    Debug.Assert(!writeTermFreq || readTermFreq);
    Debug.Assert(!writePositions || readPositions);
    Debug.Assert(!writeOffsets || readOffsets);
    Debug.Assert(!writeOffsets || writePositions);

    IDictionary<Term, int?> segDeletes;
    if (state.SegUpdates != null && state.SegUpdates.Terms.Count > 0)
    {
        segDeletes = state.SegUpdates.Terms;
    }
    else
    {
        segDeletes = null;
    }

    int[] termIDs = TermsHashPerField.SortPostings(termComp);
    int numTerms = TermsHashPerField.BytesHash.Size();
    BytesRef text = new BytesRef();
    FreqProxPostingsArray postings = (FreqProxPostingsArray)TermsHashPerField.PostingsArray;
    ByteSliceReader freq = new ByteSliceReader();
    ByteSliceReader prox = new ByteSliceReader();

    FixedBitSet visitedDocs = new FixedBitSet(state.SegmentInfo.DocCount);
    long sumTotalTermFreq = 0;
    long sumDocFreq = 0;

    Term protoTerm = new Term(fieldName);
    for (int i = 0; i < numTerms; i++)
    {
        int termID = termIDs[i];

        // Get BytesRef
        int textStart = postings.TextStarts[termID];
        TermsHashPerField.BytePool.SetBytesRef(text, textStart);

        TermsHashPerField.InitReader(freq, termID, 0);
        if (readPositions || readOffsets)
        {
            TermsHashPerField.InitReader(prox, termID, 1);
        }

        // TODO: really TermsHashPerField should take over most
        // of this loop, including merge sort of terms from
        // multiple threads and interacting with the
        // TermsConsumer, only calling out to us (passing us the
        // DocsConsumer) to handle delivery of docs/positions

        PostingsConsumer postingsConsumer = termsConsumer.StartTerm(text);

        int? delDocLimit;
        if (segDeletes != null)
        {
            protoTerm.Bytes_Renamed = text;
            int? docIDUpto;
            segDeletes.TryGetValue(protoTerm, out docIDUpto);
            if (docIDUpto != null)
            {
                delDocLimit = docIDUpto;
            }
            else
            {
                delDocLimit = 0;
            }
        }
        else
        {
            delDocLimit = 0;
        }

        // Now termStates has numToMerge FieldMergeStates
        // which all share the same term. Now we must
        // interleave the docID streams.
        int docFreq = 0;
        long totalTermFreq = 0;
        int docID = 0;

        while (true)
        {
            //System.out.println(" cycle");
            int termFreq;
            if (freq.Eof())
            {
                if (postings.LastDocCodes[termID] != -1)
                {
                    // Return last doc
                    docID = postings.LastDocIDs[termID];
                    if (readTermFreq)
                    {
                        termFreq = postings.TermFreqs[termID];
                    }
                    else
                    {
                        termFreq = -1;
                    }
                    postings.LastDocCodes[termID] = -1;
                }
                else
                {
                    // EOF
                    break;
                }
            }
            else
            {
                int code = freq.ReadVInt();
                if (!readTermFreq)
                {
                    docID += code;
                    termFreq = -1;
                }
                else
                {
                    docID += (int)((uint)code >> 1);
                    if ((code & 1) != 0)
                    {
                        termFreq = 1;
                    }
                    else
                    {
                        termFreq = freq.ReadVInt();
                    }
                }

                Debug.Assert(docID != postings.LastDocIDs[termID]);
            }

            docFreq++;
            Debug.Assert(docID < state.SegmentInfo.DocCount, "doc=" + docID + " maxDoc=" + state.SegmentInfo.DocCount);

            // NOTE: we could check here if the docID was
            // deleted, and skip it. However, this is somewhat
            // dangerous because it can yield non-deterministic
            // behavior since we may see the docID before we see
            // the term that caused it to be deleted. this
            // would mean some (but not all) of its postings may
            // make it into the index, which'd alter the docFreq
            // for those terms. We could fix this by doing two
            // passes, ie first sweep marks all del docs, and
            // 2nd sweep does the real flush, but I suspect
            // that'd add too much time to flush.
            visitedDocs.Set(docID);
            postingsConsumer.StartDoc(docID, writeTermFreq ? termFreq : -1);
            if (docID < delDocLimit)
            {
                // Mark it deleted. TODO: we could also skip
                // writing its postings; this would be
                // deterministic (just for this Term's docs).

                // TODO: can we do this reach-around in a cleaner way????
                if (state.LiveDocs == null)
                {
                    state.LiveDocs = DocState.DocWriter.Codec.LiveDocsFormat().NewLiveDocs(state.SegmentInfo.DocCount);
                }
                if (state.LiveDocs.Get(docID))
                {
                    state.DelCountOnFlush++;
                    state.LiveDocs.Clear(docID);
                }
            }

            totalTermFreq += termFreq;

            // Carefully copy over the prox + payload info,
            // changing the format to match Lucene's segment
            // format.

            if (readPositions || readOffsets)
            {
                // we did record positions (& maybe payload) and/or offsets
                int position = 0;
                int offset = 0;
                for (int j = 0; j < termFreq; j++)
                {
                    BytesRef thisPayload;

                    if (readPositions)
                    {
                        int code = prox.ReadVInt();
                        position += (int)((uint)code >> 1);

                        if ((code & 1) != 0)
                        {
                            // this position has a payload
                            int payloadLength = prox.ReadVInt();

                            if (Payload == null)
                            {
                                Payload = new BytesRef();
                                Payload.Bytes = new sbyte[payloadLength];
                            }
                            else if (Payload.Bytes.Length < payloadLength)
                            {
                                Payload.Grow(payloadLength);
                            }

                            prox.ReadBytes(Payload.Bytes, 0, payloadLength);
                            Payload.Length = payloadLength;
                            thisPayload = Payload;
                        }
                        else
                        {
                            thisPayload = null;
                        }

                        if (readOffsets)
                        {
                            int startOffset = offset + prox.ReadVInt();
                            int endOffset = startOffset + prox.ReadVInt();
                            if (writePositions)
                            {
                                if (writeOffsets)
                                {
                                    Debug.Assert(startOffset >= 0 && endOffset >= startOffset, "startOffset=" + startOffset + ",endOffset=" + endOffset + ",offset=" + offset);
                                    postingsConsumer.AddPosition(position, thisPayload, startOffset, endOffset);
                                }
                                else
                                {
                                    postingsConsumer.AddPosition(position, thisPayload, -1, -1);
                                }
                            }
                            offset = startOffset;
                        }
                        else if (writePositions)
                        {
                            postingsConsumer.AddPosition(position, thisPayload, -1, -1);
                        }
                    }
                }
            }

            postingsConsumer.FinishDoc();
        }
        termsConsumer.FinishTerm(text, new TermStats(docFreq, writeTermFreq ? totalTermFreq : -1));
        sumTotalTermFreq += totalTermFreq;
        sumDocFreq += docFreq;
    }

    termsConsumer.Finish(writeTermFreq ? sumTotalTermFreq : -1, sumDocFreq, visitedDocs.Cardinality());
}
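The `while (true)` loop above decodes the in-RAM freq stream: each code carries the doc-ID delta shifted left by one, with the low bit meaning "the term frequency is exactly 1"; otherwise the frequency follows as its own value. The sketch below models that packing with plain int lists instead of VInt-encoded byte slices; it is an illustration of the format the loop reads, not the writer's actual code.

```csharp
// Minimal sketch of the doc-delta / term-frequency packing decoded above.
using System;
using System.Collections.Generic;

public static class FreqStreamSketch
{
    public static List<int> Encode(IEnumerable<(int docId, int freq)> postings)
    {
        var output = new List<int>();
        int lastDocId = 0;
        foreach (var (docId, freq) in postings)
        {
            int delta = docId - lastDocId;
            lastDocId = docId;
            if (freq == 1)
            {
                output.Add((delta << 1) | 1);   // low bit set: freq == 1, nothing follows
            }
            else
            {
                output.Add(delta << 1);         // low bit clear: explicit freq follows
                output.Add(freq);
            }
        }
        return output;
    }

    public static IEnumerable<(int docId, int freq)> Decode(IReadOnlyList<int> stream)
    {
        int docId = 0;
        for (int i = 0; i < stream.Count; i++)
        {
            int code = stream[i];
            docId += (int)((uint)code >> 1);
            int freq = (code & 1) != 0 ? 1 : stream[++i];
            yield return (docId, freq);
        }
    }

    public static void Main()
    {
        var postings = new[] { (docId: 3, freq: 1), (docId: 7, freq: 5), (docId: 8, freq: 2) };
        foreach (var (doc, freq) in Decode(Encode(postings)))
            Console.WriteLine($"doc={doc} freq={freq}");
    }
}
```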
internal void FinishDocument()
{
    Debug.Assert(docState.TestPoint("TermVectorsTermsWriterPerField.finish start"));

    int numPostings = termsHashPerField.bytesHash.Count;

    BytesRef flushTerm = termsWriter.flushTerm;

    Debug.Assert(numPostings >= 0);

    if (numPostings > maxNumPostings)
    {
        maxNumPostings = numPostings;
    }

    // this is called once, after inverting all occurrences
    // of a given field in the doc. At this point we flush
    // our hash into the DocWriter.

    Debug.Assert(termsWriter.VectorFieldsInOrder(fieldInfo));

    TermVectorsPostingsArray postings = (TermVectorsPostingsArray)termsHashPerField.postingsArray;
    TermVectorsWriter tv = termsWriter.writer;

    int[] termIDs = termsHashPerField.SortPostings(tv.Comparer);

    tv.StartField(fieldInfo, numPostings, doVectorPositions, doVectorOffsets, hasPayloads);

    ByteSliceReader posReader = doVectorPositions ? termsWriter.vectorSliceReaderPos : null;
    ByteSliceReader offReader = doVectorOffsets ? termsWriter.vectorSliceReaderOff : null;

    ByteBlockPool termBytePool = termsHashPerField.termBytePool;

    for (int j = 0; j < numPostings; j++)
    {
        int termID = termIDs[j];
        int freq = postings.freqs[termID];

        // Get BytesRef
        termBytePool.SetBytesRef(flushTerm, postings.textStarts[termID]);
        tv.StartTerm(flushTerm, freq);

        if (doVectorPositions || doVectorOffsets)
        {
            if (posReader != null)
            {
                termsHashPerField.InitReader(posReader, termID, 0);
            }
            if (offReader != null)
            {
                termsHashPerField.InitReader(offReader, termID, 1);
            }
            tv.AddProx(freq, posReader, offReader);
        }
        tv.FinishTerm();
    }
    tv.FinishField();

    termsHashPerField.Reset();

    fieldInfo.SetStoreTermVectors();
}
/// <summary>Called once per field per document if term vectors /// are enabled, to write the vectors to /// RAMOutputStream, which is then quickly flushed to /// * the real term vectors files in the Directory. /// </summary> internal override void Finish() { System.Diagnostics.Debug.Assert(docState.TestPoint("TermVectorsTermsWriterPerField.finish start")); int numPostings = termsHashPerField.numPostings; System.Diagnostics.Debug.Assert(numPostings >= 0); if (!doVectors || numPostings == 0) { return; } if (numPostings > maxNumPostings) { maxNumPostings = numPostings; } IndexOutput tvf = perThread.doc.tvf; // This is called once, after inverting all occurences // of a given field in the doc. At this point we flush // our hash into the DocWriter. System.Diagnostics.Debug.Assert(fieldInfo.storeTermVector); System.Diagnostics.Debug.Assert(perThread.VectorFieldsInOrder(fieldInfo)); perThread.doc.AddField(termsHashPerField.fieldInfo.number); RawPostingList[] postings = termsHashPerField.SortPostings(); tvf.WriteVInt(numPostings); byte bits = (byte)(0x0); if (doVectorPositions) { bits |= TermVectorsReader.STORE_POSITIONS_WITH_TERMVECTOR; } if (doVectorOffsets) { bits |= TermVectorsReader.STORE_OFFSET_WITH_TERMVECTOR; } tvf.WriteByte(bits); int encoderUpto = 0; int lastTermBytesCount = 0; ByteSliceReader reader = perThread.vectorSliceReader; char[][] charBuffers = perThread.termsHashPerThread.charPool.buffers; for (int j = 0; j < numPostings; j++) { TermVectorsTermsWriter.PostingList posting = (TermVectorsTermsWriter.PostingList)postings[j]; int freq = posting.freq; char[] text2 = charBuffers[posting.textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT]; int start2 = posting.textStart & DocumentsWriter.CHAR_BLOCK_MASK; // We swap between two encoders to save copying // last Term's byte array UnicodeUtil.UTF8Result utf8Result = perThread.utf8Results[encoderUpto]; // TODO: we could do this incrementally UnicodeUtil.UTF16toUTF8(text2, start2, utf8Result); int termBytesCount = utf8Result.length; // TODO: UTF16toUTF8 could tell us this prefix // Compute common prefix between last term and // this term int prefix = 0; if (j > 0) { byte[] lastTermBytes = perThread.utf8Results[1 - encoderUpto].result; byte[] termBytes = perThread.utf8Results[encoderUpto].result; while (prefix < lastTermBytesCount && prefix < termBytesCount) { if (lastTermBytes[prefix] != termBytes[prefix]) { break; } prefix++; } } encoderUpto = 1 - encoderUpto; lastTermBytesCount = termBytesCount; int suffix = termBytesCount - prefix; tvf.WriteVInt(prefix); tvf.WriteVInt(suffix); tvf.WriteBytes(utf8Result.result, prefix, suffix); tvf.WriteVInt(freq); if (doVectorPositions) { termsHashPerField.InitReader(reader, posting, 0); reader.WriteTo(tvf); } if (doVectorOffsets) { termsHashPerField.InitReader(reader, posting, 1); reader.WriteTo(tvf); } } termsHashPerField.Reset(); perThread.termsHashPerThread.Reset(false); }
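The tvf writer above saves space by writing only the suffix of each term's UTF-8 bytes that differs from the previous (sorted) term, preceded by the length of the shared prefix. Below is a minimal standalone sketch of that prefix compression, using Encoding.UTF8 in place of UnicodeUtil and a tuple list in place of the tvf stream; it illustrates the technique only and is not the writer's code.

```csharp
// Minimal sketch of prefix compression over sorted term bytes.
using System;
using System.Collections.Generic;
using System.Text;

public static class TermPrefixCompressionSketch
{
    public static List<(int prefix, byte[] suffix)> Compress(IEnumerable<string> sortedTerms)
    {
        var result = new List<(int prefix, byte[] suffix)>();
        byte[] last = Array.Empty<byte>();
        foreach (var term in sortedTerms)
        {
            byte[] current = Encoding.UTF8.GetBytes(term);

            // Common prefix with the previous term's bytes -- same loop as above.
            int prefix = 0;
            while (prefix < last.Length && prefix < current.Length && last[prefix] == current[prefix])
                prefix++;

            byte[] suffix = new byte[current.Length - prefix];
            Array.Copy(current, prefix, suffix, 0, suffix.Length);
            result.Add((prefix, suffix));
            last = current;
        }
        return result;
    }

    public static void Main()
    {
        // "applet" shares 5 bytes with "apple", "apply" shares 4 with "applet", etc.
        foreach (var (prefix, suffix) in Compress(new[] { "apple", "applet", "apply", "banana" }))
            Console.WriteLine($"prefix={prefix} suffix={Encoding.UTF8.GetString(suffix)}");
    }
}
```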
public virtual void TestBasic()
{
    ByteBlockPool pool = new ByteBlockPool(new RecyclingByteBlockAllocator(ByteBlockPool.BYTE_BLOCK_SIZE, Random().Next(100)));

    int NUM_STREAM = AtLeast(100);

    ByteSliceWriter writer = new ByteSliceWriter(pool);

    int[] starts = new int[NUM_STREAM];
    int[] uptos = new int[NUM_STREAM];
    int[] counters = new int[NUM_STREAM];

    ByteSliceReader reader = new ByteSliceReader();

    for (int ti = 0; ti < 100; ti++)
    {
        for (int stream = 0; stream < NUM_STREAM; stream++)
        {
            starts[stream] = -1;
            counters[stream] = 0;
        }

        int num = AtLeast(3000);

        for (int iter = 0; iter < num; iter++)
        {
            int stream;
            if (Random().NextBoolean())
            {
                stream = Random().Next(3);
            }
            else
            {
                stream = Random().Next(NUM_STREAM);
            }

            if (VERBOSE)
            {
                Console.WriteLine("write stream=" + stream);
            }

            if (starts[stream] == -1)
            {
                int spot = pool.NewSlice(ByteBlockPool.FIRST_LEVEL_SIZE);
                starts[stream] = uptos[stream] = spot + pool.ByteOffset;
                if (VERBOSE)
                {
                    Console.WriteLine(" init to " + starts[stream]);
                }
            }

            writer.Init(uptos[stream]);
            int numValue;
            if (Random().Next(10) == 3)
            {
                numValue = Random().Next(100);
            }
            else if (Random().Next(5) == 3)
            {
                numValue = Random().Next(3);
            }
            else
            {
                numValue = Random().Next(20);
            }

            for (int j = 0; j < numValue; j++)
            {
                if (VERBOSE)
                {
                    Console.WriteLine(" write " + (counters[stream] + j));
                }
                // write some large (incl. negative) ints:
                writer.WriteVInt(Random().Next());
                writer.WriteVInt(counters[stream] + j);
            }
            counters[stream] += numValue;
            uptos[stream] = writer.Address;
            if (VERBOSE)
            {
                Console.WriteLine(" addr now " + uptos[stream]);
            }
        }

        for (int stream = 0; stream < NUM_STREAM; stream++)
        {
            if (VERBOSE)
            {
                Console.WriteLine(" stream=" + stream + " count=" + counters[stream]);
            }

            if (starts[stream] != -1 && starts[stream] != uptos[stream])
            {
                reader.Init(pool, starts[stream], uptos[stream]);
                for (int j = 0; j < counters[stream]; j++)
                {
                    reader.ReadVInt();
                    Assert.AreEqual(j, reader.ReadVInt());
                }
            }
        }

        pool.Reset();
    }
}
public virtual void TestBasic()
{
    ByteBlockPool pool = new ByteBlockPool(new ByteBlockAllocator(), false);
    int NUM_STREAM = 25;
    ByteSliceWriter writer = new ByteSliceWriter(pool);
    int[] starts = new int[NUM_STREAM];
    int[] uptos = new int[NUM_STREAM];
    int[] counters = new int[NUM_STREAM];
    System.Random r = NewRandom();

    ByteSliceReader reader = new ByteSliceReader();

    for (int ti = 0; ti < 100; ti++)
    {
        for (int stream = 0; stream < NUM_STREAM; stream++)
        {
            starts[stream] = -1;
            counters[stream] = 0;
        }

        bool debug = false;

        for (int iter = 0; iter < 10000; iter++)
        {
            int stream = r.Next(NUM_STREAM);
            if (debug)
            {
                System.Console.Out.WriteLine("write stream=" + stream);
            }

            if (starts[stream] == -1)
            {
                int spot = pool.NewSlice(ByteBlockPool.FIRST_LEVEL_SIZE_ForNUnit);
                starts[stream] = uptos[stream] = spot + pool.byteOffset;
                if (debug)
                {
                    System.Console.Out.WriteLine(" init to " + starts[stream]);
                }
            }

            writer.Init(uptos[stream]);
            int numValue = r.Next(20);
            for (int j = 0; j < numValue; j++)
            {
                if (debug)
                {
                    System.Console.Out.WriteLine(" write " + (counters[stream] + j));
                }
                writer.WriteVInt(counters[stream] + j);
                //writer.writeVInt(ti);
            }
            counters[stream] += numValue;
            uptos[stream] = writer.GetAddress();
            if (debug)
            {
                System.Console.Out.WriteLine(" addr now " + uptos[stream]);
            }
        }

        for (int stream = 0; stream < NUM_STREAM; stream++)
        {
            if (debug)
            {
                System.Console.Out.WriteLine(" stream=" + stream + " count=" + counters[stream]);
            }

            if (starts[stream] != uptos[stream])
            {
                reader.Init(pool, starts[stream], uptos[stream]);
                for (int j = 0; j < counters[stream]; j++)
                {
                    Assert.AreEqual(j, reader.ReadVInt());
                }
                //Assert.AreEqual(ti, reader.readVInt());
            }
        }

        pool.Reset();
    }
}
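Both tests exercise WriteVInt/ReadVInt round-trips through byte slices. For reference, here is a minimal sketch of the variable-length int encoding itself: 7 payload bits per byte, high bit as a continuation flag, so negative values always take five bytes (which is why the newer test deliberately writes "large (incl. negative) ints"). This is a standalone illustration against a plain List<byte>, not the ByteSliceWriter/ByteSliceReader implementation.

```csharp
// Minimal sketch of 7-bits-per-byte variable-length int encoding.
using System;
using System.Collections.Generic;

public static class VIntSketch
{
    public static void WriteVInt(List<byte> output, int value)
    {
        uint v = (uint)value;
        while (v > 0x7F)
        {
            output.Add((byte)((v & 0x7F) | 0x80)); // low 7 bits, continuation bit set
            v >>= 7;
        }
        output.Add((byte)v);                       // final byte, continuation bit clear
    }

    public static int ReadVInt(IReadOnlyList<byte> input, ref int pos)
    {
        int result = 0;
        int shift = 0;
        byte b;
        do
        {
            b = input[pos++];
            result |= (b & 0x7F) << shift;
            shift += 7;
        } while ((b & 0x80) != 0);
        return result;
    }

    public static void Main()
    {
        var buf = new List<byte>();
        foreach (int v in new[] { 0, 1, 127, 128, 300, int.MaxValue, -1 })
            WriteVInt(buf, v);

        int pos = 0;
        while (pos < buf.Count)
            Console.WriteLine(ReadVInt(buf, ref pos));
    }
}
```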
/* Walk through all unique text tokens (Posting * instances) found in this field and serialize them * into a single RAM segment. */ internal void AppendPostings(FreqProxTermsWriterPerField[] fields, FormatPostingsFieldsConsumer consumer) { int numFields = fields.Length; FreqProxFieldMergeState[] mergeStates = new FreqProxFieldMergeState[numFields]; for (int i = 0; i < numFields; i++) { FreqProxFieldMergeState fms = mergeStates[i] = new FreqProxFieldMergeState(fields[i]); System.Diagnostics.Debug.Assert(fms.field.fieldInfo == fields [0].fieldInfo); // Should always be true bool result = fms.NextTerm(); System.Diagnostics.Debug.Assert(result); } FormatPostingsTermsConsumer termsConsumer = consumer.AddField(fields[0].fieldInfo); FreqProxFieldMergeState[] termStates = new FreqProxFieldMergeState[numFields]; bool currentFieldOmitTermFreqAndPositions = fields[0].fieldInfo.omitTermFreqAndPositions; while (numFields > 0) { // Get the next term to merge termStates[0] = mergeStates[0]; int numToMerge = 1; for (int i = 1; i < numFields; i++) { char[] text = mergeStates[i].text; int textOffset = mergeStates[i].textOffset; int cmp = compareText(text, textOffset, termStates[0].text, termStates[0].textOffset); if (cmp < 0) { termStates[0] = mergeStates[i]; numToMerge = 1; } else if (cmp == 0) { termStates[numToMerge++] = mergeStates[i]; } } FormatPostingsDocsConsumer docConsumer = termsConsumer.AddTerm(termStates[0].text, termStates[0].textOffset); // Now termStates has numToMerge FieldMergeStates // which all share the same term. Now we must // interleave the docID streams. while (numToMerge > 0) { FreqProxFieldMergeState minState = termStates[0]; for (int i = 1; i < numToMerge; i++) { if (termStates[i].docID < minState.docID) { minState = termStates[i]; } } int termDocFreq = minState.termFreq; FormatPostingsPositionsConsumer posConsumer = docConsumer.AddDoc(minState.docID, termDocFreq); ByteSliceReader prox = minState.prox; // Carefully copy over the prox + payload info, // changing the format to match Lucene's segment // format. if (!currentFieldOmitTermFreqAndPositions) { // omitTermFreqAndPositions == false so we do write positions & // payload int position = 0; for (int j = 0; j < termDocFreq; j++) { int code = prox.ReadVInt(); position += (code >> 1); int payloadLength; if ((code & 1) != 0) { // This position has a payload payloadLength = prox.ReadVInt(); if (payloadBuffer == null || payloadBuffer.Length < payloadLength) { payloadBuffer = new byte[payloadLength]; } prox.ReadBytes(payloadBuffer, 0, payloadLength); } else { payloadLength = 0; } posConsumer.AddPosition(position, payloadBuffer, 0, payloadLength); } //End for posConsumer.Finish(); } if (!minState.NextDoc()) { // Remove from termStates int upto = 0; for (int i = 0; i < numToMerge; i++) { if (termStates[i] != minState) { termStates[upto++] = termStates[i]; } } numToMerge--; System.Diagnostics.Debug.Assert(upto == numToMerge); // Advance this state to the next term if (!minState.NextTerm()) { // OK, no more terms, so remove from mergeStates // as well upto = 0; for (int i = 0; i < numFields; i++) { if (mergeStates[i] != minState) { mergeStates[upto++] = mergeStates[i]; } } numFields--; System.Diagnostics.Debug.Assert(upto == numFields); } } } docConsumer.Finish(); } termsConsumer.Finish(); }
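The prox-reading loop above expects each position as a delta from the previous position, shifted left by one, with the low bit flagging an attached payload whose length and bytes follow. The sketch below encodes and decodes that layout over a flat list of ints purely for illustration; the real streams are VInt-encoded byte slices, and the helper names here are hypothetical.

```csharp
// Minimal sketch of the position/payload packing read by the prox loop above.
using System;
using System.Collections.Generic;

public static class ProxStreamSketch
{
    public static List<int> Encode(IEnumerable<(int position, byte[] payload)> postings)
    {
        var stream = new List<int>();
        int last = 0;
        foreach (var (position, payload) in postings)
        {
            int delta = position - last;
            last = position;
            if (payload == null || payload.Length == 0)
            {
                stream.Add(delta << 1);            // low bit clear: no payload
            }
            else
            {
                stream.Add((delta << 1) | 1);      // low bit set: payload length + bytes follow
                stream.Add(payload.Length);
                foreach (byte b in payload) stream.Add(b);
            }
        }
        return stream;
    }

    public static IEnumerable<(int position, byte[] payload)> Decode(IReadOnlyList<int> stream)
    {
        int position = 0;
        for (int i = 0; i < stream.Count; i++)
        {
            int code = stream[i];
            position += code >> 1;
            byte[] payload = null;
            if ((code & 1) != 0)
            {
                int length = stream[++i];
                payload = new byte[length];
                for (int j = 0; j < length; j++) payload[j] = (byte)stream[++i];
            }
            yield return (position, payload);
        }
    }

    public static void Main()
    {
        var input = new[]
        {
            (position: 2, payload: (byte[])null),
            (position: 5, payload: new byte[] { 0x0A, 0x0B }),
            (position: 9, payload: (byte[])null),
        };
        foreach (var (pos, payload) in Decode(Encode(input)))
            Console.WriteLine($"pos={pos} payload={(payload == null ? "none" : BitConverter.ToString(payload))}");
    }
}
```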
public void InitReader(ByteSliceReader reader, int termID, int stream)
{
    Debug.Assert(stream < StreamCount);
    int intStart = PostingsArray.IntStarts[termID];
    int[] ints = IntPool.Buffers[intStart >> IntBlockPool.INT_BLOCK_SHIFT];
    int upto = intStart & IntBlockPool.INT_BLOCK_MASK;
    reader.Init(BytePool, PostingsArray.ByteStarts[termID] + stream * ByteBlockPool.FIRST_LEVEL_SIZE, ints[upto + stream]);
}
public virtual void TestBasic() { // LUCENENET specific: NUnit will crash with an OOM if we do the full test // with verbosity enabled. So, making this a manual setting that can be // turned on if, and only if, needed for debugging. If the setting is turned // on, we are decresing the number of iterations by 1/3, which seems to // keep it from crashing. bool isVerbose = false; if (!isVerbose) { Console.WriteLine("Verbosity disabled to keep NUnit from running out of memory - enable manually"); } ByteBlockPool pool = new ByteBlockPool(new RecyclingByteBlockAllocator(ByteBlockPool.BYTE_BLOCK_SIZE, Random.Next(100))); int NUM_STREAM = AtLeast(100); ByteSliceWriter writer = new ByteSliceWriter(pool); int[] starts = new int[NUM_STREAM]; int[] uptos = new int[NUM_STREAM]; int[] counters = new int[NUM_STREAM]; ByteSliceReader reader = new ByteSliceReader(); for (int ti = 0; ti < 100; ti++) { for (int stream = 0; stream < NUM_STREAM; stream++) { starts[stream] = -1; counters[stream] = 0; } // LUCENENET NOTE: Since upgrading to NUnit 3, this test // will crash if VERBOSE is true because of an OutOfMemoryException. // This not only keeps this test from finishing, it crashes NUnit // and no other tests will run. // So, we need to allocate a smaller size to ensure this // doesn't happen with verbosity enabled. int num = isVerbose ? AtLeast(2000) : AtLeast(3000); for (int iter = 0; iter < num; iter++) { int stream; if (Random.NextBoolean()) { stream = Random.Next(3); } else { stream = Random.Next(NUM_STREAM); } if (isVerbose) { Console.WriteLine("write stream=" + stream); } if (starts[stream] == -1) { int spot = pool.NewSlice(ByteBlockPool.FIRST_LEVEL_SIZE); starts[stream] = uptos[stream] = spot + pool.ByteOffset; if (isVerbose) { Console.WriteLine(" init to " + starts[stream]); } } writer.Init(uptos[stream]); int numValue; if (Random.Next(10) == 3) { numValue = Random.Next(100); } else if (Random.Next(5) == 3) { numValue = Random.Next(3); } else { numValue = Random.Next(20); } for (int j = 0; j < numValue; j++) { if (isVerbose) { Console.WriteLine(" write " + (counters[stream] + j)); } // write some large (incl. negative) ints: writer.WriteVInt32(Random.Next()); writer.WriteVInt32(counters[stream] + j); } counters[stream] += numValue; uptos[stream] = writer.Address; if (isVerbose) { Console.WriteLine(" addr now " + uptos[stream]); } } for (int stream = 0; stream < NUM_STREAM; stream++) { if (isVerbose) { Console.WriteLine(" stream=" + stream + " count=" + counters[stream]); } if (starts[stream] != -1 && starts[stream] != uptos[stream]) { reader.Init(pool, starts[stream], uptos[stream]); for (int j = 0; j < counters[stream]; j++) { reader.ReadVInt32(); Assert.AreEqual(j, reader.ReadVInt32()); } } } pool.Reset(); } }
public virtual void TestBasic() { ByteBlockPool pool = new ByteBlockPool(new ByteBlockAllocator(), false); int NUM_STREAM = 25; ByteSliceWriter writer = new ByteSliceWriter(pool); int[] starts = new int[NUM_STREAM]; int[] uptos = new int[NUM_STREAM]; int[] counters = new int[NUM_STREAM]; System.Random r = NewRandom(); ByteSliceReader reader = new ByteSliceReader(); for (int ti = 0; ti < 100; ti++) { for (int stream = 0; stream < NUM_STREAM; stream++) { starts[stream] = - 1; counters[stream] = 0; } bool debug = false; for (int iter = 0; iter < 10000; iter++) { int stream = r.Next(NUM_STREAM); if (debug) System.Console.Out.WriteLine("write stream=" + stream); if (starts[stream] == - 1) { int spot = pool.NewSlice(ByteBlockPool.FIRST_LEVEL_SIZE_ForNUnit); starts[stream] = uptos[stream] = spot + pool.byteOffset; if (debug) System.Console.Out.WriteLine(" init to " + starts[stream]); } writer.Init(uptos[stream]); int numValue = r.Next(20); for (int j = 0; j < numValue; j++) { if (debug) System.Console.Out.WriteLine(" write " + (counters[stream] + j)); writer.WriteVInt(counters[stream] + j); //writer.writeVInt(ti); } counters[stream] += numValue; uptos[stream] = writer.GetAddress(); if (debug) System.Console.Out.WriteLine(" addr now " + uptos[stream]); } for (int stream = 0; stream < NUM_STREAM; stream++) { if (debug) System.Console.Out.WriteLine(" stream=" + stream + " count=" + counters[stream]); if (starts[stream] != uptos[stream]) { reader.Init(pool, starts[stream], uptos[stream]); for (int j = 0; j < counters[stream]; j++) Assert.AreEqual(j, reader.ReadVInt()); //Assert.AreEqual(ti, reader.readVInt()); } } pool.Reset(); } }
/* Walk through all unique text tokens (Posting * instances) found in this field and serialize them * into a single RAM segment. */ void AppendPostings(DocumentsWriter.FlushState flushState, FreqProxTermsWriterPerField[] fields, TermInfosWriter termsOut, IndexOutput freqOut, IndexOutput proxOut, DefaultSkipListWriter skipListWriter) { int fieldNumber = fields[0].fieldInfo.number; int numFields = fields.Length; FreqProxFieldMergeState[] mergeStates = new FreqProxFieldMergeState[numFields]; for (int i = 0; i < numFields; i++) { FreqProxFieldMergeState fms = mergeStates[i] = new FreqProxFieldMergeState(fields[i]); System.Diagnostics.Debug.Assert(fms.field.fieldInfo == fields[0].fieldInfo); // Should always be true bool result = fms.nextTerm(); System.Diagnostics.Debug.Assert(result); } int skipInterval = termsOut.skipInterval; bool currentFieldOmitTf = fields[0].fieldInfo.omitTf; // If current field omits tf then it cannot store // payloads. We silently drop the payloads in this case: bool currentFieldStorePayloads = currentFieldOmitTf ? false : fields[0].fieldInfo.storePayloads; FreqProxFieldMergeState[] termStates = new FreqProxFieldMergeState[numFields]; while (numFields > 0) { // Get the next term to merge termStates[0] = mergeStates[0]; int numToMerge = 1; for (int i = 1; i < numFields; i++) { char[] text = mergeStates[i].text; int textOffset = mergeStates[i].textOffset; int cmp = compareText(text, textOffset, termStates[0].text, termStates[0].textOffset); if (cmp < 0) { termStates[0] = mergeStates[i]; numToMerge = 1; } else if (cmp == 0) { termStates[numToMerge++] = mergeStates[i]; } } int df = 0; int lastPayloadLength = -1; int lastDoc = 0; char[] text_Renamed = termStates[0].text; int start = termStates[0].textOffset; long freqPointer = freqOut.GetFilePointer(); long proxPointer; if (proxOut != null) { proxPointer = proxOut.GetFilePointer(); } else { proxPointer = 0; } skipListWriter.ResetSkip(); // Now termStates has numToMerge FieldMergeStates // which all share the same term. Now we must // interleave the docID streams. while (numToMerge > 0) { if ((++df % skipInterval) == 0) { skipListWriter.SetSkipData(lastDoc, currentFieldStorePayloads, lastPayloadLength); skipListWriter.BufferSkip(df); } FreqProxFieldMergeState minState = termStates[0]; for (int i = 1; i < numToMerge; i++) { if (termStates[i].docID < minState.docID) { minState = termStates[i]; } } int doc = minState.docID; int termDocFreq = minState.termFreq; System.Diagnostics.Debug.Assert(doc < flushState.numDocsInRAM); System.Diagnostics.Debug.Assert(doc > lastDoc || df == 1); ByteSliceReader prox = minState.prox; // Carefully copy over the prox + payload info, // changing the format to match Lucene's segment // format. 
if (!currentFieldOmitTf) { // omitTf == false so we do write positions & payload System.Diagnostics.Debug.Assert(proxOut != null); for (int j = 0; j < termDocFreq; j++) { int code = prox.ReadVInt(); if (currentFieldStorePayloads) { int payloadLength; if ((code & 1) != 0) { // This position has a payload payloadLength = prox.ReadVInt(); } else { payloadLength = 0; } if (payloadLength != lastPayloadLength) { proxOut.WriteVInt(code | 1); proxOut.WriteVInt(payloadLength); lastPayloadLength = payloadLength; } else { proxOut.WriteVInt(code & (~1)); } if (payloadLength > 0) { copyBytes(prox, proxOut, payloadLength); } } else { System.Diagnostics.Debug.Assert(0 == (code & 1)); proxOut.WriteVInt(code >> 1); } } //End for int newDocCode = (doc - lastDoc) << 1; if (1 == termDocFreq) { freqOut.WriteVInt(newDocCode | 1); } else { freqOut.WriteVInt(newDocCode); freqOut.WriteVInt(termDocFreq); } } else { // omitTf==true: we store only the docs, without // term freq, positions, payloads freqOut.WriteVInt(doc - lastDoc); } lastDoc = doc; if (!minState.nextDoc()) { // Remove from termStates int upto = 0; for (int i = 0; i < numToMerge; i++) { if (termStates[i] != minState) { termStates[upto++] = termStates[i]; } } numToMerge--; System.Diagnostics.Debug.Assert(upto == numToMerge); // Advance this state to the next term if (!minState.nextTerm()) { // OK, no more terms, so remove from mergeStates // as well upto = 0; for (int i = 0; i < numFields; i++) { if (mergeStates[i] != minState) { mergeStates[upto++] = mergeStates[i]; } } numFields--; System.Diagnostics.Debug.Assert(upto == numFields); } } } System.Diagnostics.Debug.Assert(df > 0); // Done merging this term long skipPointer = skipListWriter.WriteSkip(freqOut); // Write term termInfo.Set(df, freqPointer, proxPointer, (int)(skipPointer - freqPointer)); // TODO: we could do this incrementally UnicodeUtil.UTF16toUTF8(text_Renamed, start, termsUTF8); // TODO: we could save O(n) re-scan of the term by // computing the shared prefix with the last term // while during the UTF8 encoding termsOut.Add(fieldNumber, termsUTF8.result, termsUTF8.length, termInfo); } }
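When copying positions to the segment's prox file, the code above reuses the low bit of each position code to mean "the payload length changed and follows", so a run of positions with the same payload length stores that length only once. Below is a minimal sketch of that re-encoding; int lists stand in for the VInt streams, payload bytes themselves are omitted, and the helper names are hypothetical.

```csharp
// Minimal sketch of payload-length run compression in the on-disk prox stream.
using System;
using System.Collections.Generic;

public static class ProxPayloadLengthSketch
{
    // Input: (positionDelta, payloadLength) pairs; payloadLength == 0 means no payload.
    public static List<int> WriteProx(IEnumerable<(int positionDelta, int payloadLength)> postings)
    {
        var output = new List<int>();
        int lastPayloadLength = -1;   // forces the first length to be written
        foreach (var (delta, payloadLength) in postings)
        {
            if (payloadLength != lastPayloadLength)
            {
                output.Add((delta << 1) | 1);   // low bit set: new payload length follows
                output.Add(payloadLength);
                lastPayloadLength = payloadLength;
            }
            else
            {
                output.Add(delta << 1);         // low bit clear: reuse previous length
            }
            // (payload bytes themselves would be copied here when payloadLength > 0)
        }
        return output;
    }

    public static void Main()
    {
        var postings = new[] { (1, 4), (2, 4), (3, 4), (1, 0), (2, 0) };
        Console.WriteLine(string.Join(" ", WriteProx(postings)));
        // -> 3 4 4 6 3 0 4  (length 4 written once and reused, then switches to 0)
    }
}
```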