private void MergeTerms(SegmentWriteState segmentWriteState)
{
    IList<Fields> fields = new JCG.List<Fields>();
    IList<ReaderSlice> slices = new JCG.List<ReaderSlice>();

    int docBase = 0;

    for (int readerIndex = 0; readerIndex < mergeState.Readers.Count; readerIndex++)
    {
        AtomicReader reader = mergeState.Readers[readerIndex];
        Fields f = reader.Fields;
        int maxDoc = reader.MaxDoc;
        if (f != null)
        {
            slices.Add(new ReaderSlice(docBase, maxDoc, readerIndex));
            fields.Add(f);
        }
        docBase += maxDoc;
    }

    FieldsConsumer consumer = codec.PostingsFormat.FieldsConsumer(segmentWriteState);
    bool success = false;
    try
    {
        consumer.Merge(mergeState, new MultiFields(fields.ToArray(/*Fields.EMPTY_ARRAY*/), slices.ToArray(/*ReaderSlice.EMPTY_ARRAY*/)));
        success = true;
    }
    finally
    {
        if (success)
        {
            IOUtils.Dispose(consumer);
        }
        else
        {
            IOUtils.DisposeWhileHandlingException(consumer);
        }
    }
}
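// A minimal sketch (not from the source; the per-reader sizes are assumed values) of
// the doc-base bookkeeping above: MultiFields needs each sub-reader's Fields plus a
// ReaderSlice recording where that reader's documents begin in the merged doc-ID
// space. Note that docBase advances even for readers that contribute no Fields.
int docBase = 0;
var sliceStarts = new System.Collections.Generic.List<int>();
foreach (int maxDoc in new[] { 100, 250, 50 }) // hypothetical per-reader MaxDoc values
{
    sliceStarts.Add(docBase);  // this reader's local doc 0 maps to merged doc 'docBase'
    docBase += maxDoc;         // the next reader starts where this one ends
}
// mergedDocId = sliceStarts[readerIndex] + localDocId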
private void Write(FieldInfos fieldInfos, Directory dir, FieldData[] fields, bool allowPreFlex)
{
    int termIndexInterval = TestUtil.NextInt32(Random, 13, 27);
    Codec codec = Codec.Default;
    SegmentInfo si = new SegmentInfo(dir, Constants.LUCENE_MAIN_VERSION, SEGMENT, 10000, false, codec, null);
    SegmentWriteState state = new SegmentWriteState((InfoStream)InfoStream.Default, dir, si, fieldInfos, termIndexInterval, null, NewIOContext(Random));

    // LUCENENET specific - BUG: we must wrap this in a using block in case anything in the below loop throws
    using FieldsConsumer consumer = codec.PostingsFormat.FieldsConsumer(state);

    Array.Sort(fields);
    foreach (FieldData field in fields)
    {
#pragma warning disable 612, 618
        if (!allowPreFlex && codec is Lucene3xCodec)
#pragma warning restore 612, 618
        {
            // code below expects unicode sort order
            continue;
        }
        field.Write(consumer);
    }
}
private void Write(FieldInfos fieldInfos, Directory dir, FieldData[] fields, bool allowPreFlex)
{
    int termIndexInterval = TestUtil.NextInt(Random(), 13, 27);
    Codec codec = Codec.Default;
    SegmentInfo si = new SegmentInfo(dir, Constants.LUCENE_MAIN_VERSION, SEGMENT, 10000, false, codec, null);
    SegmentWriteState state = new SegmentWriteState(InfoStream.Default, dir, si, fieldInfos, termIndexInterval, null, NewIOContext(Random()));

    // Dispose via using so the consumer is released even if field.Write throws
    using (FieldsConsumer consumer = codec.PostingsFormat.FieldsConsumer(state))
    {
        Array.Sort(fields);
        foreach (FieldData field in fields)
        {
#pragma warning disable 612, 618
            if (!allowPreFlex && codec is Lucene3xCodec)
#pragma warning restore 612, 618
            {
                // code below expects unicode sort order
                continue;
            }
            field.Write(consumer);
        }
    }
}
/* Walk through all unique text tokens (Posting
 * instances) found in this field and serialize them
 * into a single RAM segment. */
internal void Flush(string fieldName, FieldsConsumer consumer, SegmentWriteState state)
{
    if (!fieldInfo.Indexed)
    {
        return; // nothing to flush, don't bother the codec with the unindexed field
    }

    TermsConsumer termsConsumer = consumer.AddField(fieldInfo);
    IComparer<BytesRef> termComp = termsConsumer.Comparator;

    // CONFUSING: this.indexOptions holds the index options
    // that were current when we first saw this field. But
    // it's possible this has changed, eg when other
    // documents are indexed that cause a "downgrade" of the
    // IndexOptions. So we must decode the in-RAM buffer
    // according to this.indexOptions, but then write the
    // new segment to the directory according to
    // currentFieldIndexOptions:
    FieldInfo.IndexOptions? currentFieldIndexOptions = fieldInfo.FieldIndexOptions;
    Debug.Assert(currentFieldIndexOptions != null);

    bool writeTermFreq = currentFieldIndexOptions >= FieldInfo.IndexOptions.DOCS_AND_FREQS;
    bool writePositions = currentFieldIndexOptions >= FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS;
    bool writeOffsets = currentFieldIndexOptions >= FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS;

    bool readTermFreq = this.HasFreq;
    bool readPositions = this.HasProx;
    bool readOffsets = this.HasOffsets;

    //System.out.println("flush readTF=" + readTermFreq + " readPos=" + readPositions + " readOffs=" + readOffsets);

    // Make sure FieldInfo.update is working correctly!:
    Debug.Assert(!writeTermFreq || readTermFreq);
    Debug.Assert(!writePositions || readPositions);
    Debug.Assert(!writeOffsets || readOffsets);
    Debug.Assert(!writeOffsets || writePositions);

    IDictionary<Term, int?> segDeletes;
    if (state.SegUpdates != null && state.SegUpdates.Terms.Count > 0)
    {
        segDeletes = state.SegUpdates.Terms;
    }
    else
    {
        segDeletes = null;
    }

    int[] termIDs = TermsHashPerField.SortPostings(termComp);
    int numTerms = TermsHashPerField.BytesHash.Size();
    BytesRef text = new BytesRef();
    FreqProxPostingsArray postings = (FreqProxPostingsArray)TermsHashPerField.PostingsArray;
    ByteSliceReader freq = new ByteSliceReader();
    ByteSliceReader prox = new ByteSliceReader();

    FixedBitSet visitedDocs = new FixedBitSet(state.SegmentInfo.DocCount);
    long sumTotalTermFreq = 0;
    long sumDocFreq = 0;

    Term protoTerm = new Term(fieldName);
    for (int i = 0; i < numTerms; i++)
    {
        int termID = termIDs[i];
        // Get BytesRef
        int textStart = postings.TextStarts[termID];
        TermsHashPerField.BytePool.SetBytesRef(text, textStart);

        TermsHashPerField.InitReader(freq, termID, 0);
        if (readPositions || readOffsets)
        {
            TermsHashPerField.InitReader(prox, termID, 1);
        }

        // TODO: really TermsHashPerField should take over most
        // of this loop, including merge sort of terms from
        // multiple threads and interacting with the
        // TermsConsumer, only calling out to us (passing us the
        // DocsConsumer) to handle delivery of docs/positions

        PostingsConsumer postingsConsumer = termsConsumer.StartTerm(text);

        int? delDocLimit;
        if (segDeletes != null)
        {
            protoTerm.Bytes_Renamed = text;
            int? docIDUpto;
            segDeletes.TryGetValue(protoTerm, out docIDUpto);
            if (docIDUpto != null)
            {
                delDocLimit = docIDUpto;
            }
            else
            {
                delDocLimit = 0;
            }
        }
        else
        {
            delDocLimit = 0;
        }

        // Now termStates has numToMerge FieldMergeStates
        // which all share the same term. Now we must
        // interleave the docID streams.
        int docFreq = 0;
        long totalTermFreq = 0;
        int docID = 0;

        while (true)
        {
            //System.out.println("  cycle");
            int termFreq;
            if (freq.Eof())
            {
                if (postings.LastDocCodes[termID] != -1)
                {
                    // Return last doc
                    docID = postings.LastDocIDs[termID];
                    if (readTermFreq)
                    {
                        termFreq = postings.TermFreqs[termID];
                    }
                    else
                    {
                        termFreq = -1;
                    }
                    postings.LastDocCodes[termID] = -1;
                }
                else
                {
                    // EOF
                    break;
                }
            }
            else
            {
                int code = freq.ReadVInt();
                if (!readTermFreq)
                {
                    docID += code;
                    termFreq = -1;
                }
                else
                {
                    docID += (int)((uint)code >> 1);
                    if ((code & 1) != 0)
                    {
                        termFreq = 1;
                    }
                    else
                    {
                        termFreq = freq.ReadVInt();
                    }
                }

                Debug.Assert(docID != postings.LastDocIDs[termID]);
            }

            docFreq++;
            Debug.Assert(docID < state.SegmentInfo.DocCount, "doc=" + docID + " maxDoc=" + state.SegmentInfo.DocCount);

            // NOTE: we could check here if the docID was
            // deleted, and skip it. However, this is somewhat
            // dangerous because it can yield non-deterministic
            // behavior since we may see the docID before we see
            // the term that caused it to be deleted. this
            // would mean some (but not all) of its postings may
            // make it into the index, which'd alter the docFreq
            // for those terms. We could fix this by doing two
            // passes, ie first sweep marks all del docs, and
            // 2nd sweep does the real flush, but I suspect
            // that'd add too much time to flush.
            visitedDocs.Set(docID);
            postingsConsumer.StartDoc(docID, writeTermFreq ? termFreq : -1);
            if (docID < delDocLimit)
            {
                // Mark it deleted. TODO: we could also skip
                // writing its postings; this would be
                // deterministic (just for this Term's docs).

                // TODO: can we do this reach-around in a cleaner way????
                if (state.LiveDocs == null)
                {
                    state.LiveDocs = DocState.DocWriter.Codec.LiveDocsFormat().NewLiveDocs(state.SegmentInfo.DocCount);
                }
                if (state.LiveDocs.Get(docID))
                {
                    state.DelCountOnFlush++;
                    state.LiveDocs.Clear(docID);
                }
            }

            totalTermFreq += termFreq;

            // Carefully copy over the prox + payload info,
            // changing the format to match Lucene's segment
            // format.

            if (readPositions || readOffsets)
            {
                // we did record positions (& maybe payload) and/or offsets
                int position = 0;
                int offset = 0;
                for (int j = 0; j < termFreq; j++)
                {
                    BytesRef thisPayload;

                    if (readPositions)
                    {
                        int code = prox.ReadVInt();
                        position += (int)((uint)code >> 1);

                        if ((code & 1) != 0)
                        {
                            // this position has a payload
                            int payloadLength = prox.ReadVInt();

                            if (Payload == null)
                            {
                                Payload = new BytesRef();
                                Payload.Bytes = new sbyte[payloadLength];
                            }
                            else if (Payload.Bytes.Length < payloadLength)
                            {
                                Payload.Grow(payloadLength);
                            }

                            prox.ReadBytes(Payload.Bytes, 0, payloadLength);
                            Payload.Length = payloadLength;
                            thisPayload = Payload;
                        }
                        else
                        {
                            thisPayload = null;
                        }

                        if (readOffsets)
                        {
                            int startOffset = offset + prox.ReadVInt();
                            int endOffset = startOffset + prox.ReadVInt();
                            if (writePositions)
                            {
                                if (writeOffsets)
                                {
                                    Debug.Assert(startOffset >= 0 && endOffset >= startOffset, "startOffset=" + startOffset + ",endOffset=" + endOffset + ",offset=" + offset);
                                    postingsConsumer.AddPosition(position, thisPayload, startOffset, endOffset);
                                }
                                else
                                {
                                    postingsConsumer.AddPosition(position, thisPayload, -1, -1);
                                }
                            }
                            offset = startOffset;
                        }
                        else if (writePositions)
                        {
                            postingsConsumer.AddPosition(position, thisPayload, -1, -1);
                        }
                    }
                }
            }
            postingsConsumer.FinishDoc();
        }
        termsConsumer.FinishTerm(text, new TermStats(docFreq, writeTermFreq ? totalTermFreq : -1));
        sumTotalTermFreq += totalTermFreq;
        sumDocFreq += docFreq;
    }

    termsConsumer.Finish(writeTermFreq ? sumTotalTermFreq : -1, sumDocFreq, visitedDocs.Cardinality());
}
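// A minimal sketch (hypothetical helper, not from the source) of the posting encoding
// the while-loop above decodes: the freq stream stores (docDelta << 1) | 1 when the
// term occurs exactly once in the doc, and (docDelta << 1) followed by a separate
// termFreq VInt otherwise, so the common freq==1 case costs no extra value.
static (int docDelta, int termFreq, int valuesConsumed) DecodePosting(int[] decodedVInts, int pos)
{
    int code = decodedVInts[pos];
    int docDelta = (int)((uint)code >> 1);       // upper bits carry the doc-ID delta
    if ((code & 1) != 0)
    {
        return (docDelta, 1, 1);                 // low bit set: termFreq is implicitly 1
    }
    return (docDelta, decodedVInts[pos + 1], 2); // otherwise termFreq follows explicitly
}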
public override void Flush(IDictionary<string, TermsHashConsumerPerField> fieldsToFlush, SegmentWriteState state)
{
    // Gather all FieldData's that have postings, across all
    // ThreadStates
    IList<FreqProxTermsWriterPerField> allFields = new List<FreqProxTermsWriterPerField>();

    foreach (TermsHashConsumerPerField f in fieldsToFlush.Values)
    {
        FreqProxTermsWriterPerField perField = (FreqProxTermsWriterPerField)f;
        if (perField.termsHashPerField.bytesHash.Count > 0)
        {
            allFields.Add(perField);
        }
    }

    int numAllFields = allFields.Count;

    // Sort by field name
    CollectionUtil.IntroSort(allFields);

    FieldsConsumer consumer = state.SegmentInfo.Codec.PostingsFormat.FieldsConsumer(state);

    bool success = false;

    try
    {
        TermsHash termsHash = null;

        /*
         * Current writer chain:
         * FieldsConsumer
         * -> IMPL: FormatPostingsTermsDictWriter
         *   -> TermsConsumer
         *     -> IMPL: FormatPostingsTermsDictWriter.TermsWriter
         *       -> DocsConsumer
         *         -> IMPL: FormatPostingsDocsWriter
         *           -> PositionsConsumer
         *             -> IMPL: FormatPostingsPositionsWriter
         */

        for (int fieldNumber = 0; fieldNumber < numAllFields; fieldNumber++)
        {
            FieldInfo fieldInfo = allFields[fieldNumber].fieldInfo;
            FreqProxTermsWriterPerField fieldWriter = allFields[fieldNumber];

            // If this field has postings then add them to the
            // segment
            fieldWriter.Flush(fieldInfo.Name, consumer, state);

            TermsHashPerField perField = fieldWriter.termsHashPerField;
            if (Debugging.AssertsEnabled)
            {
                Debugging.Assert(termsHash == null || termsHash == perField.termsHash);
            }
            termsHash = perField.termsHash;
            int numPostings = perField.bytesHash.Count;
            perField.Reset();
            perField.ShrinkHash(numPostings);
            fieldWriter.Reset();
        }

        if (termsHash != null)
        {
            termsHash.Reset();
        }
        success = true;
    }
    finally
    {
        if (success)
        {
            IOUtils.Dispose(consumer);
        }
        else
        {
            IOUtils.DisposeWhileHandlingException(consumer);
        }
    }
}
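// A minimal sketch of the success-flag idiom used in both Flush and MergeTerms above:
// on the happy path the consumer is disposed normally (and any error it throws
// surfaces), but if the try block failed, DisposeWhileHandlingException suppresses
// secondary errors so the original exception propagates unmasked.
// 'DoFlushWork' is a hypothetical stand-in for the real work.
bool success = false;
try
{
    DoFlushWork(consumer); // hypothetical work that may throw
    success = true;
}
finally
{
    if (success)
    {
        IOUtils.Dispose(consumer);                       // normal cleanup; errors surface
    }
    else
    {
        IOUtils.DisposeWhileHandlingException(consumer); // swallowed so the first error wins
    }
}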
internal AssertingFieldsConsumer(FieldsConsumer @in) { this.@in = @in; }
public BloomFilteredFieldsConsumer(BloomFilteringPostingsFormat outerInstance, FieldsConsumer fieldsConsumer, SegmentWriteState state)
{
    this.outerInstance = outerInstance;
    _delegateFieldsConsumer = fieldsConsumer;
    _state = state;
}
public BloomFilteredFieldsConsumer(FieldsConsumer fieldsConsumer, SegmentWriteState state, BloomFilteringPostingsFormat bfpf)
{
    _delegateFieldsConsumer = fieldsConsumer;
    _state = state;
    _bfpf = bfpf;
}
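// A minimal sketch (hypothetical class, not the library's implementation) of the
// decorator pattern these constructors set up: the wrapper holds the delegate
// consumer and forwards each call, layering extra behavior (assertions, bloom-filter
// bookkeeping) around the delegation. The Dispose(bool) override assumes the
// standard Lucene.NET dispose pattern on FieldsConsumer.
internal class ForwardingFieldsConsumer : FieldsConsumer
{
    private readonly FieldsConsumer @in;

    internal ForwardingFieldsConsumer(FieldsConsumer @in)
    {
        this.@in = @in;
    }

    public override TermsConsumer AddField(FieldInfo field)
    {
        // a real wrapper would validate or record per-field state here before delegating
        return @in.AddField(field);
    }

    protected override void Dispose(bool disposing)
    {
        if (disposing)
        {
            @in.Dispose(); // release the wrapped consumer's resources
        }
    }
}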