/// <summary>
/// Merges the term dictionaries of all source readers into this segment's
/// postings files (.frq freq/doc stream, .prx positions stream) plus the
/// terms dictionary written by <c>termInfosWriter</c>.
/// Skip parameters are taken from the terms writer and fed to the shared
/// <c>DefaultSkipListWriter</c>.
/// </summary>
private void MergeTerms()
{
    try
    {
        freqOutput = directory.CreateOutput(segment + ".frq");
        proxOutput = directory.CreateOutput(segment + ".prx");
        termInfosWriter = new TermInfosWriter(directory, segment, fieldInfos, termIndexInterval);
        skipInterval = termInfosWriter.skipInterval;
        maxSkipLevels = termInfosWriter.maxSkipLevels;
        skipListWriter = new DefaultSkipListWriter(skipInterval, maxSkipLevels, mergedDocs, freqOutput, proxOutput);
        queue = new SegmentMergeQueue(readers.Count);

        MergeTermInfos();
    }
    finally
    {
        // Fix: the original closed the four resources sequentially, so an
        // exception from an earlier Close() leaked every later resource.
        // Nested finally blocks guarantee that each Close() is attempted
        // while still propagating the first exception thrown.
        try
        {
            if (freqOutput != null)
                freqOutput.Close();
        }
        finally
        {
            try
            {
                if (proxOutput != null)
                    proxOutput.Close();
            }
            finally
            {
                try
                {
                    if (termInfosWriter != null)
                        termInfosWriter.Close();
                }
                finally
                {
                    if (queue != null)
                        queue.Close();
                }
            }
        }
    }
}
/// <summary>
/// Sets up the document-level postings writer: opens the segment's .frq
/// output, registers it with the flush bookkeeping and with the shared
/// skip-list writer, and creates the child positions writer.
/// </summary>
internal FormatPostingsDocsWriter(SegmentWriteState state, FormatPostingsTermsWriter parent, IState s) : base()
{
    this.parent = parent;

    // Record the .frq file as flushed before opening it for writing.
    string frqName = IndexFileNames.SegmentFileName(parent.parent.segment, IndexFileNames.FREQ_EXTENSION);
    state.flushedFiles.Add(frqName);
    out_Renamed = parent.parent.dir.CreateOutput(frqName, s);

    totalNumDocs = parent.parent.totalNumDocs;

    // TODO: abstraction violation -- we reach through the parent chain to
    // grab skip parameters that belong to the terms writer.
    skipInterval = parent.parent.termsOut.skipInterval;
    skipListWriter = parent.parent.skipListWriter;
    skipListWriter.SetFreqOutput(out_Renamed);

    posWriter = new FormatPostingsPositionsWriter(state, this, s);
}
/// <summary>
/// Sets up the document-level postings writer: opens the segment's .frq
/// output, registers it with the flush bookkeeping (deduplicated via
/// AddIfNotContains) and with the shared skip-list writer, and creates the
/// child positions writer.
/// </summary>
internal FormatPostingsDocsWriter(SegmentWriteState state, FormatPostingsTermsWriter parent):base()
{
    this.parent = parent;

    // Record the .frq file as flushed (without duplicating an existing
    // entry) before opening it for writing.
    string frqName = IndexFileNames.SegmentFileName(parent.parent.segment, IndexFileNames.FREQ_EXTENSION);
    SupportClass.CollectionsHelper.AddIfNotContains(state.flushedFiles, frqName);
    out_Renamed = parent.parent.dir.CreateOutput(frqName);

    totalNumDocs = parent.parent.totalNumDocs;

    // TODO: abstraction violation -- we reach through the parent chain to
    // grab skip parameters that belong to the terms writer.
    skipInterval = parent.parent.termsOut.skipInterval;
    skipListWriter = parent.parent.skipListWriter;
    skipListWriter.SetFreqOutput(out_Renamed);

    posWriter = new FormatPostingsPositionsWriter(state, this);
}
/// <summary>
/// Top-level postings consumer for a flushing segment: creates the terms
/// dictionary writer, the shared skip-list writer (freq/prox outputs are
/// attached later by child writers), records the terms files as flushed,
/// and creates the per-terms child writer.
/// </summary>
public FormatPostingsFieldsWriter(SegmentWriteState state, FieldInfos fieldInfos) : base()
{
    dir = state.directory;
    segment = state.segmentName;
    totalNumDocs = state.numDocs;
    this.fieldInfos = fieldInfos;

    termsOut = new TermInfosWriter(dir, segment, fieldInfos, state.termIndexInterval);

    // TODO: this is a nasty abstraction violation (that we
    // peek down to find freqOut/proxOut) -- we need a
    // better abstraction here whereby these child consumers
    // can provide skip data or not.
    // The null freq/prox outputs are wired in by the child doc/pos writers.
    skipListWriter = new DefaultSkipListWriter(termsOut.skipInterval, termsOut.maxSkipLevels, totalNumDocs, null, null);

    // Register the terms dictionary and terms index files as flushed.
    state.flushedFiles.Add(state.SegmentFileName(IndexFileNames.TERMS_EXTENSION));
    state.flushedFiles.Add(state.SegmentFileName(IndexFileNames.TERMS_INDEX_EXTENSION));

    termsWriter = new FormatPostingsTermsWriter(state, this);
}
/// <summary>
/// Top-level postings consumer for a flushing segment: creates the terms
/// dictionary writer, the shared skip-list writer (freq/prox outputs are
/// attached later by child writers), records the terms files as flushed
/// (deduplicated via AddIfNotContains), and creates the per-terms child
/// writer.
/// </summary>
public FormatPostingsFieldsWriter(SegmentWriteState state, FieldInfos fieldInfos):base()
{
    dir = state.directory;
    segment = state.segmentName;
    totalNumDocs = state.numDocs;
    this.fieldInfos = fieldInfos;

    termsOut = new TermInfosWriter(dir, segment, fieldInfos, state.termIndexInterval);

    // TODO: this is a nasty abstraction violation (that we
    // peek down to find freqOut/proxOut) -- we need a
    // better abstraction here whereby these child consumers
    // can provide skip data or not.
    // The null freq/prox outputs are wired in by the child doc/pos writers.
    skipListWriter = new DefaultSkipListWriter(termsOut.skipInterval, termsOut.maxSkipLevels, totalNumDocs, null, null);

    // Register the terms dictionary and terms index files as flushed,
    // without duplicating entries that are already present.
    SupportClass.CollectionsHelper.AddIfNotContains(state.flushedFiles, state.SegmentFileName(IndexFileNames.TERMS_EXTENSION));
    SupportClass.CollectionsHelper.AddIfNotContains(state.flushedFiles, state.SegmentFileName(IndexFileNames.TERMS_INDEX_EXTENSION));

    termsWriter = new FormatPostingsTermsWriter(state, this);
}
// TODO: it would be nice to factor out more of this, e.g. the
// FreqProxFieldMergeState, and the code that visits all fields
// under the same FieldInfo together, up into TermsHash*.
// Other writers would presumably share a lot of this...
internal override void flush(IDictionary<object, object> threadsAndFields, DocumentsWriter.FlushState state)
{
    // Gather every per-thread field instance that actually buffered
    // postings, across all ThreadStates.
    List<object> allFields = new List<object>();
    foreach (KeyValuePair<object, object> entry in threadsAndFields)
    {
        foreach (object o in (ICollection<object>)entry.Value)
        {
            FreqProxTermsWriterPerField perField = (FreqProxTermsWriterPerField)o;
            if (perField.termsHashPerField.numPostings > 0)
                allFields.Add(perField);
        }
    }

    // Sort by field name so that instances of the same field (from
    // different threads) become adjacent and can be merged in one pass.
    allFields.Sort();
    int numAllFields = allFields.Count;

    TermInfosWriter termsOut = new TermInfosWriter(state.directory, state.segmentName, fieldInfos, state.docWriter.writer.GetTermIndexInterval());

    IndexOutput freqOut = state.directory.CreateOutput(state.SegmentFileName(IndexFileNames.FREQ_EXTENSION));
    // Only materialize a .prx stream when some field stores positions.
    IndexOutput proxOut = fieldInfos.HasProx() ? state.directory.CreateOutput(state.SegmentFileName(IndexFileNames.PROX_EXTENSION)) : null;

    DefaultSkipListWriter skipListWriter = new DefaultSkipListWriter(termsOut.skipInterval, termsOut.maxSkipLevels, state.numDocsInRAM, freqOut, proxOut);

    int start = 0;
    while (start < numAllFields)
    {
        // [start, end) spans all thread-local instances of one field.
        FieldInfo fieldInfo = ((FreqProxTermsWriterPerField)allFields[start]).fieldInfo;
        string fieldName = fieldInfo.name;

        int end = start + 1;
        while (end < numAllFields && ((FreqProxTermsWriterPerField)allFields[end]).fieldInfo.name.Equals(fieldName))
            end++;

        FreqProxTermsWriterPerField[] fields = new FreqProxTermsWriterPerField[end - start];
        for (int i = start; i < end; i++)
        {
            fields[i - start] = (FreqProxTermsWriterPerField)allFields[i];

            // Aggregate the storePayloads flag as seen by the same field
            // across multiple threads.
            fieldInfo.storePayloads |= fields[i - start].hasPayloads;
        }

        // This field has postings, so append them to the segment.
        AppendPostings(state, fields, termsOut, freqOut, proxOut, skipListWriter);

        foreach (FreqProxTermsWriterPerField field in fields)
        {
            TermsHashPerField perField = field.termsHashPerField;
            int numPostings = perField.numPostings;
            perField.reset();
            perField.shrinkHash(numPostings);
            field.reset();
        }

        start = end;
    }

    // Reset every per-thread terms hash for the next round of indexing.
    foreach (KeyValuePair<object, object> entry in threadsAndFields)
        ((FreqProxTermsWriterPerThread)entry.Key).termsHashPerThread.reset(true);

    freqOut.Close();
    if (proxOut != null)
    {
        state.flushedFiles[state.SegmentFileName(IndexFileNames.PROX_EXTENSION)] = state.SegmentFileName(IndexFileNames.PROX_EXTENSION);
        proxOut.Close();
    }
    termsOut.Close();

    // Record all files we have flushed (flushedFiles is used as a set:
    // key == value == file name).
    state.flushedFiles[state.SegmentFileName(IndexFileNames.FIELD_INFOS_EXTENSION)] = state.SegmentFileName(IndexFileNames.FIELD_INFOS_EXTENSION);
    state.flushedFiles[state.SegmentFileName(IndexFileNames.FREQ_EXTENSION)] = state.SegmentFileName(IndexFileNames.FREQ_EXTENSION);
    state.flushedFiles[state.SegmentFileName(IndexFileNames.TERMS_EXTENSION)] = state.SegmentFileName(IndexFileNames.TERMS_EXTENSION);
    state.flushedFiles[state.SegmentFileName(IndexFileNames.TERMS_INDEX_EXTENSION)] = state.SegmentFileName(IndexFileNames.TERMS_INDEX_EXTENSION);
}
/* Walk through all unique text tokens (Posting
 * instances) found in this field and serialize them
 * into a single RAM segment.
 *
 * All entries of `fields` are thread-local instances of the SAME field;
 * their term streams are k-way merged in term order, and for each term the
 * per-document streams are interleaved in docID order.  Doc IDs are
 * delta-encoded; when tf is written, the low bit of the doc delta code
 * flags termDocFreq == 1.  A skip entry is buffered every skipInterval
 * documents. */
void AppendPostings(DocumentsWriter.FlushState flushState, FreqProxTermsWriterPerField[] fields, TermInfosWriter termsOut, IndexOutput freqOut, IndexOutput proxOut, DefaultSkipListWriter skipListWriter)
{
    int fieldNumber = fields[0].fieldInfo.number;
    int numFields = fields.Length;

    // One merge state (term + doc cursor) per thread-local field instance.
    FreqProxFieldMergeState[] mergeStates = new FreqProxFieldMergeState[numFields];

    for (int i = 0; i < numFields; i++)
    {
        FreqProxFieldMergeState fms = mergeStates[i] = new FreqProxFieldMergeState(fields[i]);

        System.Diagnostics.Debug.Assert(fms.field.fieldInfo == fields[0].fieldInfo); // Should always be true

        // Each field in this run has at least one term (numPostings > 0),
        // so the first advance must succeed.
        bool result = fms.nextTerm();
        System.Diagnostics.Debug.Assert(result);
    }

    int skipInterval = termsOut.skipInterval;

    bool currentFieldOmitTf = fields[0].fieldInfo.omitTf;

    // If current field omits tf then it cannot store
    // payloads.  We silently drop the payloads in this case:
    bool currentFieldStorePayloads = currentFieldOmitTf ? false : fields[0].fieldInfo.storePayloads;

    // States that currently share the smallest (next) term.
    FreqProxFieldMergeState[] termStates = new FreqProxFieldMergeState[numFields];

    while (numFields > 0)
    {
        // Get the next term to merge: find the minimum term across all
        // states and collect every state positioned on that same term.
        termStates[0] = mergeStates[0];
        int numToMerge = 1;

        for (int i = 1; i < numFields; i++)
        {
            char[] text = mergeStates[i].text;
            int textOffset = mergeStates[i].textOffset;
            int cmp = compareText(text, textOffset, termStates[0].text, termStates[0].textOffset);

            if (cmp < 0)
            {
                termStates[0] = mergeStates[i];
                numToMerge = 1;
            }
            else if (cmp == 0)
                termStates[numToMerge++] = mergeStates[i];
        }

        int df = 0;                   // document frequency for this term
        int lastPayloadLength = -1;   // -1 forces an explicit length on first payload

        int lastDoc = 0;              // base for doc-delta encoding

        char[] text_Renamed = termStates[0].text;
        int start = termStates[0].textOffset;

        long freqPointer = freqOut.GetFilePointer();
        long proxPointer;
        if (proxOut != null)
            proxPointer = proxOut.GetFilePointer();
        else
            proxPointer = 0;

        skipListWriter.ResetSkip();

        // Now termStates has numToMerge FieldMergeStates
        // which all share the same term.  Now we must
        // interleave the docID streams.
        while (numToMerge > 0)
        {
            // Buffer a skip entry every skipInterval documents.
            if ((++df % skipInterval) == 0)
            {
                skipListWriter.SetSkipData(lastDoc, currentFieldStorePayloads, lastPayloadLength);
                skipListWriter.BufferSkip(df);
            }

            // Pick the state with the smallest next docID.
            FreqProxFieldMergeState minState = termStates[0];
            for (int i = 1; i < numToMerge; i++)
                if (termStates[i].docID < minState.docID)
                    minState = termStates[i];

            int doc = minState.docID;
            int termDocFreq = minState.termFreq;

            System.Diagnostics.Debug.Assert(doc < flushState.numDocsInRAM);
            System.Diagnostics.Debug.Assert(doc > lastDoc || df == 1);

            ByteSliceReader prox = minState.prox;

            // Carefully copy over the prox + payload info,
            // changing the format to match Lucene's segment
            // format.
            if (!currentFieldOmitTf)
            {
                // omitTf == false so we do write positions & payload
                System.Diagnostics.Debug.Assert(proxOut != null);
                for (int j = 0; j < termDocFreq; j++)
                {
                    int code = prox.ReadVInt();
                    if (currentFieldStorePayloads)
                    {
                        int payloadLength;
                        if ((code & 1) != 0)
                        {
                            // This position has a payload
                            payloadLength = prox.ReadVInt();
                        }
                        else
                            payloadLength = 0;
                        // Payload length is only re-written when it changes;
                        // the low bit of the position code flags that case.
                        if (payloadLength != lastPayloadLength)
                        {
                            proxOut.WriteVInt(code | 1);
                            proxOut.WriteVInt(payloadLength);
                            lastPayloadLength = payloadLength;
                        }
                        else
                            proxOut.WriteVInt(code & (~1));
                        if (payloadLength > 0)
                            copyBytes(prox, proxOut, payloadLength);
                    }
                    else
                    {
                        // Without payloads the low bit must be clear; strip it.
                        System.Diagnostics.Debug.Assert(0 == (code & 1));
                        proxOut.WriteVInt(code >> 1);
                    }
                } //End for

                // Doc delta shifted left one bit; low bit set means freq == 1
                // so the explicit freq VInt can be omitted.
                int newDocCode = (doc - lastDoc) << 1;

                if (1 == termDocFreq)
                {
                    freqOut.WriteVInt(newDocCode | 1);
                }
                else
                {
                    freqOut.WriteVInt(newDocCode);
                    freqOut.WriteVInt(termDocFreq);
                }
            }
            else
            {
                // omitTf==true: we store only the docs, without
                // term freq, positions, payloads
                freqOut.WriteVInt(doc - lastDoc);
            }

            lastDoc = doc;

            if (!minState.nextDoc())
            {
                // Remove from termStates (compact the array in place)
                int upto = 0;
                for (int i = 0; i < numToMerge; i++)
                    if (termStates[i] != minState)
                        termStates[upto++] = termStates[i];
                numToMerge--;
                System.Diagnostics.Debug.Assert(upto == numToMerge);

                // Advance this state to the next term

                if (!minState.nextTerm())
                {
                    // OK, no more terms, so remove from mergeStates
                    // as well
                    upto = 0;
                    for (int i = 0; i < numFields; i++)
                        if (mergeStates[i] != minState)
                            mergeStates[upto++] = mergeStates[i];
                    numFields--;
                    System.Diagnostics.Debug.Assert(upto == numFields);
                }
            }
        }

        System.Diagnostics.Debug.Assert(df > 0);

        // Done merging this term

        long skipPointer = skipListWriter.WriteSkip(freqOut);

        // Write term
        termInfo.Set(df, freqPointer, proxPointer, (int) (skipPointer - freqPointer));

        // TODO: we could do this incrementally
        UnicodeUtil.UTF16toUTF8(text_Renamed, start, termsUTF8);

        // TODO: we could save O(n) re-scan of the term by
        // computing the shared prefix with the last term
        // while during the UTF8 encoding
        termsOut.Add(fieldNumber, termsUTF8.result, termsUTF8.length, termInfo);
    }
}
/// <summary>Creates a segment from all Postings in the Postings
/// hashes across all ThreadStates &amp; FieldDatas.
/// Returns the list of file names flushed for this segment and resets the
/// in-RAM postings state afterwards.
/// </summary>
private System.Collections.IList WriteSegment()
{
    System.Diagnostics.Debug.Assert(AllThreadsIdle());

    System.Diagnostics.Debug.Assert(nextDocID == numDocsInRAM);

    System.String segmentName;

    segmentName = segment;

    TermInfosWriter termsOut = new TermInfosWriter(directory, segmentName, fieldInfos, writer.GetTermIndexInterval());

    IndexOutput freqOut = directory.CreateOutput(segmentName + ".frq");
    IndexOutput proxOut = directory.CreateOutput(segmentName + ".prx");

    // Gather all FieldData's that have postings, across all
    // ThreadStates
    System.Collections.ArrayList allFields = new System.Collections.ArrayList();
    System.Diagnostics.Debug.Assert(AllThreadsIdle());
    for (int i = 0; i < threadStates.Length; i++)
    {
        ThreadState state = threadStates[i];
        state.TrimFields();
        int numFields = state.numAllFieldData;
        for (int j = 0; j < numFields; j++)
        {
            ThreadState.FieldData fp = state.allFieldDataArray[j];
            if (fp.numPostings > 0)
                allFields.Add(fp);
        }
    }

    // Sort by field name, so that all thread-local instances of the same
    // field become adjacent and can be appended in one run.
    allFields.Sort();
    int numAllFields = allFields.Count;

    skipListWriter = new DefaultSkipListWriter(termsOut.skipInterval, termsOut.maxSkipLevels, numDocsInRAM, freqOut, proxOut);

    int start = 0;
    while (start < numAllFields)
    {
        // [start, end) covers every FieldData with the same field name.
        System.String fieldName = ((ThreadState.FieldData) allFields[start]).fieldInfo.name;

        int end = start + 1;
        while (end < numAllFields && ((ThreadState.FieldData) allFields[end]).fieldInfo.name.Equals(fieldName))
            end++;

        ThreadState.FieldData[] fields = new ThreadState.FieldData[end - start];
        for (int i = start; i < end; i++)
            fields[i - start] = (ThreadState.FieldData) allFields[i];

        // If this field has postings then add them to the
        // segment
        AppendPostings(fields, termsOut, freqOut, proxOut);

        for (int i = 0; i < fields.Length; i++)
            fields[i].ResetPostingArrays();

        start = end;
    }

    freqOut.Close();
    proxOut.Close();
    termsOut.Close();

    // Record all files we have flushed
    System.Collections.IList flushedFiles = new System.Collections.ArrayList();
    flushedFiles.Add(SegmentFileName(IndexFileNames.FIELD_INFOS_EXTENSION));
    flushedFiles.Add(SegmentFileName(IndexFileNames.FREQ_EXTENSION));
    flushedFiles.Add(SegmentFileName(IndexFileNames.PROX_EXTENSION));
    flushedFiles.Add(SegmentFileName(IndexFileNames.TERMS_EXTENSION));
    flushedFiles.Add(SegmentFileName(IndexFileNames.TERMS_INDEX_EXTENSION));

    if (hasNorms)
    {
        WriteNorms(segmentName, numDocsInRAM);
        flushedFiles.Add(SegmentFileName(IndexFileNames.NORMS_EXTENSION));
    }

    if (infoStream != null)
    {
        // Log RAM-vs-flushed size statistics for this segment.
        long newSegmentSize = SegmentSize(segmentName);
        System.String message = String.Format(nf, " oldRAMSize={0:d} newFlushedSize={1:d} docs/MB={2:f} new/old={3:%}", new Object[] { numBytesUsed, newSegmentSize, (numDocsInRAM / (newSegmentSize / 1024.0 / 1024.0)), (newSegmentSize / numBytesUsed) });
        infoStream.WriteLine(message);
    }

    ResetPostingsData();

    // Reset per-flush counters; the buffered docs are now on disk.
    nextDocID = 0;
    nextWriteDocID = 0;
    numDocsInRAM = 0;
    files = null; // NOTE(review): presumably invalidates a cached file list -- confirm

    // Maybe downsize postingsFreeList array: shrink by 20% steps until it
    // is within 25% of the number of free postings actually held.
    if (postingsFreeList.Length > 1.5 * postingsFreeCount)
    {
        int newSize = postingsFreeList.Length;
        while (newSize > 1.25 * postingsFreeCount)
        {
            newSize = (int) (newSize * 0.8);
        }
        Posting[] newArray = new Posting[newSize];
        Array.Copy(postingsFreeList, 0, newArray, 0, postingsFreeCount);
        postingsFreeList = newArray;
    }

    return flushedFiles;
}
/// <summary>
/// Merges the term dictionaries of all source readers into this segment's
/// postings files. The .prx positions stream is only created when at least
/// one field stores positions (<c>HasProx()</c>); otherwise
/// <c>proxOutput</c> remains null.
/// </summary>
private void MergeTerms()
{
    try
    {
        freqOutput = directory.CreateOutput(segment + ".frq");
        if (HasProx())
            proxOutput = directory.CreateOutput(segment + ".prx");
        termInfosWriter = new TermInfosWriter(directory, segment, fieldInfos, termIndexInterval);
        skipInterval = termInfosWriter.skipInterval;
        maxSkipLevels = termInfosWriter.maxSkipLevels;
        skipListWriter = new DefaultSkipListWriter(skipInterval, maxSkipLevels, mergedDocs, freqOutput, proxOutput);
        queue = new SegmentMergeQueue(readers.Count);

        MergeTermInfos();
    }
    finally
    {
        // Fix: the original closed the four resources sequentially, so an
        // exception from an earlier Close() leaked every later resource.
        // Nested finally blocks guarantee that each Close() is attempted
        // while still propagating the first exception thrown.
        try
        {
            if (freqOutput != null)
                freqOutput.Close();
        }
        finally
        {
            try
            {
                if (proxOutput != null)
                    proxOutput.Close();
            }
            finally
            {
                try
                {
                    if (termInfosWriter != null)
                        termInfosWriter.Close();
                }
                finally
                {
                    if (queue != null)
                        queue.Close();
                }
            }
        }
    }
}
// TODO: would be nice to factor out more of this, e.g. the
// FreqProxFieldMergeState, and code to visit all Fields
// under the same FieldInfo together, up into TermsHash*.
// Other writers would presumably share a lot of this...
//
// Flushes all buffered postings (across every indexing thread) into the
// on-disk postings files for the new segment, then resets the per-thread
// term hashes for the next round of indexing.
internal override void flush(IDictionary<object, object> threadsAndFields, DocumentsWriter.FlushState state)
{
    // Gather all FieldData's that have postings, across all
    // ThreadStates
    List<object> allFields = new List<object>();

    IEnumerator<KeyValuePair<object, object>> it = threadsAndFields.GetEnumerator();
    while (it.MoveNext())
    {
        KeyValuePair<object, object> entry = (KeyValuePair<object, object>) it.Current;

        ICollection<object> fields = (ICollection<object>) entry.Value;

        IEnumerator<object> fieldsIt = fields.GetEnumerator();
        while (fieldsIt.MoveNext())
        {
            FreqProxTermsWriterPerField perField = (FreqProxTermsWriterPerField) fieldsIt.Current;
            // Only fields that actually buffered postings take part in the flush.
            if (perField.termsHashPerField.numPostings > 0)
            {
                allFields.Add(perField);
            }
        }
    }

    // Sort by field name, so thread-local instances of the same field
    // become adjacent and can be merged in a single run below.
    allFields.Sort();
    int numAllFields = allFields.Count;

    TermInfosWriter termsOut = new TermInfosWriter(state.directory, state.segmentName, fieldInfos, state.docWriter.writer.GetTermIndexInterval());

    IndexOutput freqOut = state.directory.CreateOutput(state.SegmentFileName(IndexFileNames.FREQ_EXTENSION));
    // The .prx stream only exists if some field stores positions.
    IndexOutput proxOut;
    if (fieldInfos.HasProx())
    {
        proxOut = state.directory.CreateOutput(state.SegmentFileName(IndexFileNames.PROX_EXTENSION));
    }
    else
    {
        proxOut = null;
    }

    DefaultSkipListWriter skipListWriter = new DefaultSkipListWriter(termsOut.skipInterval, termsOut.maxSkipLevels, state.numDocsInRAM, freqOut, proxOut);

    int start = 0;
    while (start < numAllFields)
    {
        // [start, end) covers every thread-local instance of one field.
        FieldInfo fieldInfo = ((FreqProxTermsWriterPerField) allFields[start]).fieldInfo;
        string fieldName = fieldInfo.name;

        int end = start + 1;
        while (end < numAllFields && ((FreqProxTermsWriterPerField) allFields[end]).fieldInfo.name.Equals(fieldName))
        {
            end++;
        }

        FreqProxTermsWriterPerField[] fields = new FreqProxTermsWriterPerField[end - start];
        for (int i = start; i < end; i++)
        {
            fields[i - start] = (FreqProxTermsWriterPerField) allFields[i];

            // Aggregate the storePayload as seen by the same
            // field across multiple threads
            fieldInfo.storePayloads |= fields[i - start].hasPayloads;
        }

        // If this field has postings then add them to the
        // segment
        AppendPostings(state, fields, termsOut, freqOut, proxOut, skipListWriter);

        for (int i = 0; i < fields.Length; i++)
        {
            TermsHashPerField perField = fields[i].termsHashPerField;
            int numPostings = perField.numPostings;
            perField.reset();
            perField.shrinkHash(numPostings);
            fields[i].reset();
        }

        start = end;
    }

    // Reset each per-thread terms hash for the next round of indexing.
    it = threadsAndFields.GetEnumerator();
    while (it.MoveNext())
    {
        KeyValuePair<object, object> entry = (KeyValuePair<object, object>) it.Current;
        FreqProxTermsWriterPerThread perThread = (FreqProxTermsWriterPerThread) entry.Key;
        perThread.termsHashPerThread.reset(true);
    }

    freqOut.Close();
    if (proxOut != null)
    {
        state.flushedFiles[state.SegmentFileName(IndexFileNames.PROX_EXTENSION)] = state.SegmentFileName(IndexFileNames.PROX_EXTENSION);
        proxOut.Close();
    }
    termsOut.Close();

    // Record all files we have flushed (flushedFiles is used as a set:
    // key == value == file name).
    state.flushedFiles[state.SegmentFileName(IndexFileNames.FIELD_INFOS_EXTENSION)] = state.SegmentFileName(IndexFileNames.FIELD_INFOS_EXTENSION);
    state.flushedFiles[state.SegmentFileName(IndexFileNames.FREQ_EXTENSION)] = state.SegmentFileName(IndexFileNames.FREQ_EXTENSION);
    state.flushedFiles[state.SegmentFileName(IndexFileNames.TERMS_EXTENSION)] = state.SegmentFileName(IndexFileNames.TERMS_EXTENSION);
    state.flushedFiles[state.SegmentFileName(IndexFileNames.TERMS_INDEX_EXTENSION)] = state.SegmentFileName(IndexFileNames.TERMS_INDEX_EXTENSION);
}
/* Walk through all unique text tokens (Posting
 * instances) found in this field and serialize them
 * into a single RAM segment.
 *
 * All entries of `fields` are thread-local instances of the SAME field;
 * their term streams are k-way merged in term order, and for each term the
 * per-document streams are interleaved in docID order.  Doc IDs are
 * delta-encoded; when tf is written, the low bit of the doc delta code
 * flags termDocFreq == 1.  A skip entry is buffered every skipInterval
 * documents. */
void AppendPostings(DocumentsWriter.FlushState flushState, FreqProxTermsWriterPerField[] fields, TermInfosWriter termsOut, IndexOutput freqOut, IndexOutput proxOut, DefaultSkipListWriter skipListWriter)
{
    int fieldNumber = fields[0].fieldInfo.number;
    int numFields = fields.Length;

    // One merge state (term + doc cursor) per thread-local field instance.
    FreqProxFieldMergeState[] mergeStates = new FreqProxFieldMergeState[numFields];

    for (int i = 0; i < numFields; i++)
    {
        FreqProxFieldMergeState fms = mergeStates[i] = new FreqProxFieldMergeState(fields[i]);

        System.Diagnostics.Debug.Assert(fms.field.fieldInfo == fields[0].fieldInfo); // Should always be true

        // Each field in this run has at least one term (numPostings > 0),
        // so the first advance must succeed.
        bool result = fms.nextTerm();
        System.Diagnostics.Debug.Assert(result);
    }

    int skipInterval = termsOut.skipInterval;

    bool currentFieldOmitTf = fields[0].fieldInfo.omitTf;

    // If current field omits tf then it cannot store
    // payloads.  We silently drop the payloads in this case:
    bool currentFieldStorePayloads = currentFieldOmitTf ? false : fields[0].fieldInfo.storePayloads;

    // States that currently share the smallest (next) term.
    FreqProxFieldMergeState[] termStates = new FreqProxFieldMergeState[numFields];

    while (numFields > 0)
    {
        // Get the next term to merge: find the minimum term across all
        // states and collect every state positioned on that same term.
        termStates[0] = mergeStates[0];
        int numToMerge = 1;

        for (int i = 1; i < numFields; i++)
        {
            char[] text = mergeStates[i].text;
            int textOffset = mergeStates[i].textOffset;
            int cmp = compareText(text, textOffset, termStates[0].text, termStates[0].textOffset);

            if (cmp < 0)
            {
                termStates[0] = mergeStates[i];
                numToMerge = 1;
            }
            else if (cmp == 0)
            {
                termStates[numToMerge++] = mergeStates[i];
            }
        }

        int df = 0;                   // document frequency for this term
        int lastPayloadLength = -1;   // -1 forces an explicit length on first payload

        int lastDoc = 0;              // base for doc-delta encoding

        char[] text_Renamed = termStates[0].text;
        int start = termStates[0].textOffset;

        long freqPointer = freqOut.GetFilePointer();
        long proxPointer;
        if (proxOut != null)
        {
            proxPointer = proxOut.GetFilePointer();
        }
        else
        {
            proxPointer = 0;
        }

        skipListWriter.ResetSkip();

        // Now termStates has numToMerge FieldMergeStates
        // which all share the same term.  Now we must
        // interleave the docID streams.
        while (numToMerge > 0)
        {
            // Buffer a skip entry every skipInterval documents.
            if ((++df % skipInterval) == 0)
            {
                skipListWriter.SetSkipData(lastDoc, currentFieldStorePayloads, lastPayloadLength);
                skipListWriter.BufferSkip(df);
            }

            // Pick the state with the smallest next docID.
            FreqProxFieldMergeState minState = termStates[0];
            for (int i = 1; i < numToMerge; i++)
            {
                if (termStates[i].docID < minState.docID)
                {
                    minState = termStates[i];
                }
            }

            int doc = minState.docID;
            int termDocFreq = minState.termFreq;

            System.Diagnostics.Debug.Assert(doc < flushState.numDocsInRAM);
            System.Diagnostics.Debug.Assert(doc > lastDoc || df == 1);

            ByteSliceReader prox = minState.prox;

            // Carefully copy over the prox + payload info,
            // changing the format to match Lucene's segment
            // format.
            if (!currentFieldOmitTf)
            {
                // omitTf == false so we do write positions & payload
                System.Diagnostics.Debug.Assert(proxOut != null);
                for (int j = 0; j < termDocFreq; j++)
                {
                    int code = prox.ReadVInt();
                    if (currentFieldStorePayloads)
                    {
                        int payloadLength;
                        if ((code & 1) != 0)
                        {
                            // This position has a payload
                            payloadLength = prox.ReadVInt();
                        }
                        else
                        {
                            payloadLength = 0;
                        }
                        // Payload length is only re-written when it changes;
                        // the low bit of the position code flags that case.
                        if (payloadLength != lastPayloadLength)
                        {
                            proxOut.WriteVInt(code | 1);
                            proxOut.WriteVInt(payloadLength);
                            lastPayloadLength = payloadLength;
                        }
                        else
                        {
                            proxOut.WriteVInt(code & (~1));
                        }
                        if (payloadLength > 0)
                        {
                            copyBytes(prox, proxOut, payloadLength);
                        }
                    }
                    else
                    {
                        // Without payloads the low bit must be clear; strip it.
                        System.Diagnostics.Debug.Assert(0 == (code & 1));
                        proxOut.WriteVInt(code >> 1);
                    }
                } //End for

                // Doc delta shifted left one bit; low bit set means freq == 1
                // so the explicit freq VInt can be omitted.
                int newDocCode = (doc - lastDoc) << 1;

                if (1 == termDocFreq)
                {
                    freqOut.WriteVInt(newDocCode | 1);
                }
                else
                {
                    freqOut.WriteVInt(newDocCode);
                    freqOut.WriteVInt(termDocFreq);
                }
            }
            else
            {
                // omitTf==true: we store only the docs, without
                // term freq, positions, payloads
                freqOut.WriteVInt(doc - lastDoc);
            }

            lastDoc = doc;

            if (!minState.nextDoc())
            {
                // Remove from termStates (compact the array in place)
                int upto = 0;
                for (int i = 0; i < numToMerge; i++)
                {
                    if (termStates[i] != minState)
                    {
                        termStates[upto++] = termStates[i];
                    }
                }
                numToMerge--;
                System.Diagnostics.Debug.Assert(upto == numToMerge);

                // Advance this state to the next term

                if (!minState.nextTerm())
                {
                    // OK, no more terms, so remove from mergeStates
                    // as well
                    upto = 0;
                    for (int i = 0; i < numFields; i++)
                    {
                        if (mergeStates[i] != minState)
                        {
                            mergeStates[upto++] = mergeStates[i];
                        }
                    }
                    numFields--;
                    System.Diagnostics.Debug.Assert(upto == numFields);
                }
            }
        }

        System.Diagnostics.Debug.Assert(df > 0);

        // Done merging this term

        long skipPointer = skipListWriter.WriteSkip(freqOut);

        // Write term
        termInfo.Set(df, freqPointer, proxPointer, (int) (skipPointer - freqPointer));

        // TODO: we could do this incrementally
        UnicodeUtil.UTF16toUTF8(text_Renamed, start, termsUTF8);

        // TODO: we could save O(n) re-scan of the term by
        // computing the shared prefix with the last term
        // while during the UTF8 encoding
        termsOut.Add(fieldNumber, termsUTF8.result, termsUTF8.length, termInfo);
    }
}