/// <summary>Sets the values for the current skip data.</summary>
internal virtual void SetSkipData(int doc, bool storePayloads, int payloadLength)
{
    this.curDoc = doc;
    this.curStorePayloads = storePayloads;
    this.curPayloadLength = payloadLength;
    this.curFreqPointer = freqOutput.GetFilePointer();
    this.curProxPointer = proxOutput.GetFilePointer();
}
/// <summary>
/// Sets the values for the current skip data.
/// </summary>
public virtual void SetSkipData(int doc, bool storePayloads, int payloadLength)
{
    this.CurDoc = doc;
    this.CurStorePayloads = storePayloads;
    this.CurPayloadLength = payloadLength;
    this.CurFreqPointer = FreqOutput.GetFilePointer();
    if (ProxOutput != null)
    {
        this.CurProxPointer = ProxOutput.GetFilePointer();
    }
}
/// <summary>Copy the contents of the file with the specified extension into the
/// provided output stream. Use the provided buffer for moving data
/// to reduce memory allocation.
/// </summary>
private void CopyFile(FileEntry source, IndexOutput os, byte[] buffer)
{
    IndexInput is_Renamed = null;
    try
    {
        long startPtr = os.GetFilePointer();

        is_Renamed = directory.OpenInput(source.file);
        long length = is_Renamed.Length();
        long remainder = length;
        int chunk = buffer.Length;

        while (remainder > 0)
        {
            int len = (int) System.Math.Min(chunk, remainder);
            is_Renamed.ReadBytes(buffer, 0, len, false);
            os.WriteBytes(buffer, len);
            remainder -= len;
            if (checkAbort != null)
            {
                // Roughly every 2 MB we will check if
                // it's time to abort
                checkAbort.Work(80);
            }
        }

        // Verify that remainder is 0
        if (remainder != 0)
        {
            throw new System.IO.IOException("Non-zero remainder length after copying: " + remainder + " (id: " + source.file + ", length: " + length + ", buffer size: " + chunk + ")");
        }

        // Verify that the output length diff is equal to the original file length
        long endPtr = os.GetFilePointer();
        long diff = endPtr - startPtr;
        if (diff != length)
        {
            throw new System.IO.IOException("Difference in the output file offsets " + diff + " does not match the original file length " + length);
        }
    }
    finally
    {
        if (is_Renamed != null)
        {
            is_Renamed.Close();
        }
    }
}
private TermInfo termInfo = new TermInfo(); // minimize consing

/// <summary>Merge one term found in one or more segments. The array <code>smis</code>
/// contains segments that are positioned at the same term. <code>N</code>
/// is the number of cells in the array actually occupied.
/// </summary>
/// <param name="smis">array of segments</param>
/// <param name="n">number of cells in the array actually occupied</param>
private void MergeTermInfo(SegmentMergeInfo[] smis, int n)
{
    long freqPointer = freqOutput.GetFilePointer();
    long proxPointer = proxOutput.GetFilePointer();

    int df = AppendPostings(smis, n); // append posting data

    long skipPointer = WriteSkip();

    if (df > 0)
    {
        // add an entry to the dictionary with pointers to prox and freq files
        termInfo.Set(df, freqPointer, proxPointer, (int) (skipPointer - freqPointer));
        termInfosWriter.Add(smis[0].term, termInfo);
    }
}
/// <summary>Fills in no-term-vectors for all docs we haven't seen
/// since the last doc that had term vectors.
/// </summary>
internal void Fill(int docID)
{
    int docStoreOffset = docWriter.GetDocStoreOffset();
    int end = docID + docStoreOffset;
    if (lastDocID < end)
    {
        long tvfPosition = tvf.GetFilePointer();
        while (lastDocID < end)
        {
            // each missing doc gets a tvd entry with zero fields and reuses the current tvf position
            tvx.WriteLong(tvd.GetFilePointer());
            tvd.WriteVInt(0);
            tvx.WriteLong(tvfPosition);
            lastDocID++;
        }
    }
}
public virtual void TestLargeWrites()
{
    IndexOutput os = dir.CreateOutput("testBufferStart.txt");

    // reuse a single Random; creating a new time-seeded Random per iteration
    // would produce runs of identical bytes
    System.Random rnd = new System.Random();
    byte[] largeBuf = new byte[2048];
    for (int i = 0; i < largeBuf.Length; i++)
    {
        largeBuf[i] = (byte) (rnd.NextDouble() * 256);
    }

    long currentPos = os.GetFilePointer();
    os.WriteBytes(largeBuf, largeBuf.Length);

    try
    {
        Assert.AreEqual(currentPos + largeBuf.Length, os.GetFilePointer());
    }
    finally
    {
        os.Close();
    }
}
/// <summary>Writes the buffered skip lists to the given output.</summary>
/// <param name="output">the IndexOutput the skip lists shall be written to</param>
/// <returns>the pointer where the skip list starts</returns>
internal virtual long WriteSkip(IndexOutput output)
{
    long skipPointer = output.GetFilePointer();
    if (skipBuffer == null || skipBuffer.Length == 0)
    {
        return skipPointer;
    }

    for (int level = numberOfSkipLevels - 1; level > 0; level--)
    {
        long length = skipBuffer[level].GetFilePointer();
        if (length > 0)
        {
            output.WriteVLong(length);
            skipBuffer[level].WriteTo(output);
        }
    }
    skipBuffer[0].WriteTo(output);

    return skipPointer;
}
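// A minimal reader-side sketch (not the actual MultiLevelSkipListReader) showing how the
// layout produced by WriteSkip above can be located again: every level above 0 is prefixed
// with its byte length as a VLong, highest level first, and level 0 follows last with no
// prefix. It assumes every level below numberOfSkipLevels is non-empty; the helper name
// LocateSkipLevels is hypothetical.
internal static long[] LocateSkipLevels(IndexInput input, long skipPointer, int numberOfSkipLevels)
{
    long[] levelStart = new long[numberOfSkipLevels];
    input.Seek(skipPointer);
    for (int level = numberOfSkipLevels - 1; level > 0; level--)
    {
        long length = input.ReadVLong();              // byte length of this level's data
        levelStart[level] = input.GetFilePointer();
        input.Seek(levelStart[level] + length);       // jump past the data to the next length prefix
    }
    levelStart[0] = input.GetFilePointer();           // lowest level is last and has no length prefix
    return levelStart;
}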
internal void AddDocument(Document doc)
{
    indexStream.WriteLong(fieldsStream.GetFilePointer());

    int storedCount = 0;
    foreach (Field field in doc.Fields())
    {
        if (field.IsStored())
        {
            storedCount++;
        }
    }
    fieldsStream.WriteVInt(storedCount);

    foreach (Field field in doc.Fields())
    {
        if (field.IsStored())
        {
            fieldsStream.WriteVInt(fieldInfos.FieldNumber(field.Name()));

            byte bits = 0;
            if (field.IsTokenized())
            {
                bits |= FieldsWriter.FIELD_IS_TOKENIZED;
            }
            if (field.IsBinary())
            {
                bits |= FieldsWriter.FIELD_IS_BINARY;
            }
            if (field.IsCompressed())
            {
                bits |= FieldsWriter.FIELD_IS_COMPRESSED;
            }
            fieldsStream.WriteByte(bits);

            if (field.IsCompressed())
            {
                // compression is enabled for the current field
                byte[] data = null;
                // check if it is a binary field
                if (field.IsBinary())
                {
                    data = Compress(field.BinaryValue());
                }
                else
                {
                    data = Compress(System.Text.Encoding.GetEncoding("UTF-8").GetBytes(field.StringValue()));
                }
                int len = data.Length;
                fieldsStream.WriteVInt(len);
                fieldsStream.WriteBytes(data, len);
            }
            else
            {
                // compression is disabled for the current field
                if (field.IsBinary())
                {
                    byte[] data = field.BinaryValue();
                    int len = data.Length;
                    fieldsStream.WriteVInt(len);
                    fieldsStream.WriteBytes(data, len);
                }
                else
                {
                    fieldsStream.WriteString(field.StringValue());
                }
            }
        }
    }
}
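// The Compress helper called above is not shown in these snippets. Below is a minimal sketch
// of what such a helper could look like, using System.IO.Compression.DeflateStream. This is
// only an assumption about the interface (byte[] in, compressed byte[] out); the shipped code
// used a zlib-style Deflater at best compression, so the exact on-disk bytes may differ.
private static byte[] Compress(byte[] input)
{
    using (System.IO.MemoryStream buffer = new System.IO.MemoryStream())
    {
        using (System.IO.Compression.DeflateStream deflater =
            new System.IO.Compression.DeflateStream(buffer, System.IO.Compression.CompressionMode.Compress))
        {
            deflater.Write(input, 0, input.Length);
        }
        return buffer.ToArray();
    }
}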
/// <summary>Produce _X.nrm if any document had a field with norms
/// not disabled
/// </summary>
public override void Flush(System.Collections.IDictionary threadsAndFields, SegmentWriteState state)
{
    System.Collections.IDictionary byField = new System.Collections.Hashtable();

    // Typically, each thread will have encountered the same
    // field. So first we collate by field, ie, all
    // per-thread field instances that correspond to the
    // same FieldInfo
    System.Collections.IEnumerator it = new System.Collections.Hashtable(threadsAndFields).GetEnumerator();
    while (it.MoveNext())
    {
        System.Collections.DictionaryEntry entry = (System.Collections.DictionaryEntry) it.Current;
        System.Collections.ICollection fields = (System.Collections.ICollection) entry.Value;
        System.Collections.IEnumerator fieldsIt = fields.GetEnumerator();
        System.Collections.ArrayList fieldsToRemove = new System.Collections.ArrayList();

        while (fieldsIt.MoveNext())
        {
            NormsWriterPerField perField = (NormsWriterPerField) ((System.Collections.DictionaryEntry) fieldsIt.Current).Key;

            if (perField.upto > 0)
            {
                // It has some norms
                System.Collections.IList l = (System.Collections.IList) byField[perField.fieldInfo];
                if (l == null)
                {
                    l = new System.Collections.ArrayList();
                    byField[perField.fieldInfo] = l;
                }
                l.Add(perField);
            }
            else
            {
                // Remove this field since we haven't seen it
                // since the previous flush
                fieldsToRemove.Add(perField);
            }
        }

        System.Collections.Hashtable fieldsHT = (System.Collections.Hashtable) fields;
        for (int i = 0; i < fieldsToRemove.Count; i++)
        {
            fieldsHT.Remove(fieldsToRemove[i]);
        }
    }

    System.String normsFileName = state.segmentName + "." + IndexFileNames.NORMS_EXTENSION;
    state.flushedFiles[normsFileName] = normsFileName;
    IndexOutput normsOut = state.directory.CreateOutput(normsFileName);

    try
    {
        normsOut.WriteBytes(SegmentMerger.NORMS_HEADER, 0, SegmentMerger.NORMS_HEADER.Length);

        int numField = fieldInfos.Size();

        int normCount = 0;

        for (int fieldNumber = 0; fieldNumber < numField; fieldNumber++)
        {
            FieldInfo fieldInfo = fieldInfos.FieldInfo(fieldNumber);

            System.Collections.IList toMerge = (System.Collections.IList) byField[fieldInfo];
            int upto = 0;
            if (toMerge != null)
            {
                int numFields = toMerge.Count;

                normCount++;

                NormsWriterPerField[] fields = new NormsWriterPerField[numFields];
                int[] uptos = new int[numFields];

                for (int j = 0; j < numFields; j++)
                {
                    fields[j] = (NormsWriterPerField) toMerge[j];
                }

                int numLeft = numFields;

                while (numLeft > 0)
                {
                    System.Diagnostics.Debug.Assert(uptos[0] < fields[0].docIDs.Length, " uptos[0]=" + uptos[0] + " len=" + (fields[0].docIDs.Length));

                    int minLoc = 0;
                    int minDocID = fields[0].docIDs[uptos[0]];

                    for (int j = 1; j < numLeft; j++)
                    {
                        int docID = fields[j].docIDs[uptos[j]];
                        if (docID < minDocID)
                        {
                            minDocID = docID;
                            minLoc = j;
                        }
                    }

                    System.Diagnostics.Debug.Assert(minDocID < state.numDocs);

                    // Fill hole
                    for (; upto < minDocID; upto++)
                    {
                        normsOut.WriteByte(defaultNorm);
                    }

                    normsOut.WriteByte(fields[minLoc].norms[uptos[minLoc]]);
                    (uptos[minLoc])++;
                    upto++;

                    if (uptos[minLoc] == fields[minLoc].upto)
                    {
                        fields[minLoc].Reset();
                        if (minLoc != numLeft - 1)
                        {
                            fields[minLoc] = fields[numLeft - 1];
                            uptos[minLoc] = uptos[numLeft - 1];
                        }
                        numLeft--;
                    }
                }

                // Fill final hole with defaultNorm
                for (; upto < state.numDocs; upto++)
                {
                    normsOut.WriteByte(defaultNorm);
                }
            }
            else if (fieldInfo.isIndexed && !fieldInfo.omitNorms)
            {
                normCount++;
                // Fill entire field with default norm:
                for (; upto < state.numDocs; upto++)
                {
                    normsOut.WriteByte(defaultNorm);
                }
            }

            System.Diagnostics.Debug.Assert(4 + normCount * state.numDocs == normsOut.GetFilePointer(), ".nrm file size mismatch: expected=" + (4 + normCount * state.numDocs) + " actual=" + normsOut.GetFilePointer());
        }
    }
    finally
    {
        normsOut.Close();
    }
}
private void WritePostings(Posting[] postings, System.String segment)
{
    IndexOutput freq = null, prox = null;
    TermInfosWriter tis = null;
    TermVectorsWriter termVectorWriter = null;
    try
    {
        // open files for inverse index storage
        freq = directory.CreateOutput(segment + ".frq");
        prox = directory.CreateOutput(segment + ".prx");
        tis = new TermInfosWriter(directory, segment, fieldInfos, termIndexInterval);
        TermInfo ti = new TermInfo();
        System.String currentField = null;

        for (int i = 0; i < postings.Length; i++)
        {
            Posting posting = postings[i];

            // add an entry to the dictionary with pointers to prox and freq files
            ti.Set(1, freq.GetFilePointer(), prox.GetFilePointer(), -1);
            tis.Add(posting.term, ti);

            // add an entry to the freq file
            int postingFreq = posting.freq;
            if (postingFreq == 1)
            {
                // optimize freq=1: set low bit of doc num
                freq.WriteVInt(1);
            }
            else
            {
                freq.WriteVInt(0);           // the document number
                freq.WriteVInt(postingFreq); // frequency in doc
            }

            // write positions
            int lastPosition = 0;
            int[] positions = posting.positions;
            for (int j = 0; j < postingFreq; j++)
            {
                // use delta-encoding
                int position = positions[j];
                prox.WriteVInt(position - lastPosition);
                lastPosition = position;
            }

            // check to see if we switched to a new field
            System.String termField = posting.term.Field();
            if (currentField != termField)
            {
                // changing field - see if there is something to save
                currentField = termField;
                FieldInfo fi = fieldInfos.FieldInfo(currentField);
                if (fi.storeTermVector)
                {
                    if (termVectorWriter == null)
                    {
                        termVectorWriter = new TermVectorsWriter(directory, segment, fieldInfos);
                        termVectorWriter.OpenDocument();
                    }
                    termVectorWriter.OpenField(currentField);
                }
                else if (termVectorWriter != null)
                {
                    termVectorWriter.CloseField();
                }
            }
            if (termVectorWriter != null && termVectorWriter.IsFieldOpen())
            {
                termVectorWriter.AddTerm(posting.term.Text(), postingFreq, posting.positions, posting.offsets);
            }
        }
        if (termVectorWriter != null)
        {
            termVectorWriter.CloseDocument();
        }
    }
    finally
    {
        // make an effort to close all streams we can but remember and re-throw
        // the first exception encountered in this process
        System.IO.IOException keep = null;
        if (freq != null)
        {
            try { freq.Close(); }
            catch (System.IO.IOException e) { if (keep == null) keep = e; }
        }
        if (prox != null)
        {
            try { prox.Close(); }
            catch (System.IO.IOException e) { if (keep == null) keep = e; }
        }
        if (tis != null)
        {
            try { tis.Close(); }
            catch (System.IO.IOException e) { if (keep == null) keep = e; }
        }
        if (termVectorWriter != null)
        {
            try { termVectorWriter.Close(); }
            catch (System.IO.IOException e) { if (keep == null) keep = e; }
        }
        if (keep != null)
        {
            // re-throw the first exception rather than wrapping its stack trace as a message
            throw keep;
        }
    }
}
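// Reader-side sketch of the .frq entry format written above and in the AppendPostings
// snippets below (a hypothetical helper, not the shipped SegmentTermDocs): the doc delta is
// stored shifted left by one, and a set low bit means the frequency is exactly 1; otherwise
// the frequency follows as its own VInt.
private static void ReadFreqEntry(IndexInput freqIn, out int docDelta, out int freqValue)
{
    int docCode = freqIn.ReadVInt();
    docDelta = docCode >> 1;                 // doc delta lives in the upper bits
    if ((docCode & 1) != 0)
    {
        freqValue = 1;                       // low bit set: frequency is exactly 1
    }
    else
    {
        freqValue = freqIn.ReadVInt();       // low bit clear: frequency follows
    }
}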
/// <summary>Merge files with the extensions added up to now.
/// All files with these extensions are combined sequentially into the
/// compound stream. After successful merge, the source files
/// are deleted.
/// </summary>
/// <throws>IllegalStateException if close() had been called before or
/// if no file has been added to this object
/// </throws>
public void Close()
{
    if (merged)
    {
        throw new System.SystemException("Merge already performed");
    }

    if (entries.Count == 0)
    {
        throw new System.SystemException("No entries to merge have been defined");
    }

    merged = true;

    // open the compound stream
    IndexOutput os = null;
    try
    {
        os = directory.CreateOutput(fileName);

        // Write the number of entries
        os.WriteVInt(entries.Count);

        // Write the directory with all offsets at 0.
        // Remember the positions of directory entries so that we can
        // adjust the offsets later
        System.Collections.IEnumerator it = entries.GetEnumerator();
        long totalSize = 0;
        while (it.MoveNext())
        {
            FileEntry fe = (FileEntry) it.Current;
            fe.directoryOffset = os.GetFilePointer();
            os.WriteLong(0); // for now
            os.WriteString(fe.file);
            totalSize += directory.FileLength(fe.file);
        }

        // Pre-allocate size of file as optimization --
        // this can potentially help IO performance as
        // we write the file and also later during
        // searching. It also uncovers a disk-full
        // situation earlier and hopefully without
        // actually filling disk to 100%:
        long finalLength = totalSize + os.GetFilePointer();
        os.SetLength(finalLength);

        // Open the files and copy their data into the stream.
        // Remember the locations of each file's data section.
        byte[] buffer = new byte[16384];
        it = entries.GetEnumerator();
        while (it.MoveNext())
        {
            FileEntry fe = (FileEntry) it.Current;
            fe.dataOffset = os.GetFilePointer();
            CopyFile(fe, os, buffer);
        }

        // Write the data offsets into the directory of the compound stream
        it = entries.GetEnumerator();
        while (it.MoveNext())
        {
            FileEntry fe = (FileEntry) it.Current;
            os.Seek(fe.directoryOffset);
            os.WriteLong(fe.dataOffset);
        }

        System.Diagnostics.Debug.Assert(finalLength == os.Length());

        // Close the output stream. Set the os to null before trying to
        // close so that if an exception occurs during the close, the
        // finally clause below will not attempt to close the stream
        // the second time.
        IndexOutput tmp = os;
        os = null;
        tmp.Close();
    }
    finally
    {
        if (os != null)
        {
            try
            {
                os.Close();
            }
            catch (System.IO.IOException)
            {
            }
        }
    }
}
/// <summary>Merge files with the extensions added up to now.
/// All files with these extensions are combined sequentially into the
/// compound stream. After successful merge, the source files
/// are deleted.
/// </summary>
/// <throws>IllegalStateException if close() had been called before or
/// if no file has been added to this object
/// </throws>
public void Close()
{
    if (merged)
    {
        throw new System.SystemException("Merge already performed");
    }

    if (entries.Count == 0)
    {
        throw new System.SystemException("No entries to merge have been defined");
    }

    merged = true;

    // open the compound stream
    IndexOutput os = null;
    try
    {
        os = directory.CreateOutput(fileName);

        // Write the number of entries
        os.WriteVInt(entries.Count);

        // Write the directory with all offsets at 0.
        // Remember the positions of directory entries so that we can
        // adjust the offsets later
        System.Collections.IEnumerator it = entries.GetEnumerator();
        while (it.MoveNext())
        {
            FileEntry fe = (FileEntry) it.Current;
            fe.directoryOffset = os.GetFilePointer();
            os.WriteLong(0); // for now
            os.WriteString(fe.file);
        }

        // Open the files and copy their data into the stream.
        // Remember the locations of each file's data section.
        byte[] buffer = new byte[1024];
        it = entries.GetEnumerator();
        while (it.MoveNext())
        {
            FileEntry fe = (FileEntry) it.Current;
            fe.dataOffset = os.GetFilePointer();
            CopyFile(fe, os, buffer);
        }

        // Write the data offsets into the directory of the compound stream
        it = entries.GetEnumerator();
        while (it.MoveNext())
        {
            FileEntry fe = (FileEntry) it.Current;
            os.Seek(fe.directoryOffset);
            os.WriteLong(fe.dataOffset);
        }

        // Close the output stream. Set the os to null before trying to
        // close so that if an exception occurs during the close, the
        // finally clause below will not attempt to close the stream
        // the second time.
        IndexOutput tmp = os;
        os = null;
        tmp.Close();
    }
    finally
    {
        if (os != null)
        {
            try
            {
                os.Close();
            }
            catch (System.IO.IOException)
            {
            }
        }
    }
}
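// Reader-side sketch of the directory layout that both Close() variants above produce (a
// hypothetical helper, not the shipped CompoundFileReader): the compound file begins with a
// VInt entry count, followed by one (dataOffset long, file name string) pair per entry, where
// the offsets are patched in by the final Seek/WriteLong pass. Each entry's length can be
// derived from the next entry's offset (or the end of the file).
private static System.Collections.Generic.Dictionary<string, long> ReadCompoundDirectory(IndexInput input)
{
    System.Collections.Generic.Dictionary<string, long> offsets =
        new System.Collections.Generic.Dictionary<string, long>();
    int count = input.ReadVInt();
    for (int i = 0; i < count; i++)
    {
        long dataOffset = input.ReadLong();   // patched offset of this entry's data section
        string name = input.ReadString();
        offsets[name] = dataOffset;
    }
    return offsets;
}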
public virtual void TestEncodeDecode()
{
    int iterations = RandomInts.RandomInt32Between(Random, 1, 1000);
    float AcceptableOverheadRatio = (float) Random.NextDouble();
    int[] values = new int[(iterations - 1) * Lucene41PostingsFormat.BLOCK_SIZE + ForUtil.MAX_DATA_SIZE];
    for (int i = 0; i < iterations; ++i)
    {
        int bpv = Random.Next(32);
        if (bpv == 0)
        {
            int value = RandomInts.RandomInt32Between(Random, 0, int.MaxValue);
            for (int j = 0; j < Lucene41PostingsFormat.BLOCK_SIZE; ++j)
            {
                values[i * Lucene41PostingsFormat.BLOCK_SIZE + j] = value;
            }
        }
        else
        {
            for (int j = 0; j < Lucene41PostingsFormat.BLOCK_SIZE; ++j)
            {
                values[i * Lucene41PostingsFormat.BLOCK_SIZE + j] = RandomInts.RandomInt32Between(Random, 0, (int) PackedInt32s.MaxValue(bpv));
            }
        }
    }

    Directory d = new RAMDirectory();
    long endPointer;

    {
        // encode
        IndexOutput @out = d.CreateOutput("test.bin", IOContext.DEFAULT);
        ForUtil forUtil = new ForUtil(AcceptableOverheadRatio, @out);

        for (int i = 0; i < iterations; ++i)
        {
            forUtil.WriteBlock(Arrays.CopyOfRange(values, i * Lucene41PostingsFormat.BLOCK_SIZE, values.Length), new byte[Lucene41.ForUtil.MAX_ENCODED_SIZE], @out);
        }
        endPointer = @out.GetFilePointer();
        @out.Dispose();
    }

    {
        // decode
        IndexInput @in = d.OpenInput("test.bin", IOContext.READ_ONCE);
        ForUtil forUtil = new ForUtil(@in);
        for (int i = 0; i < iterations; ++i)
        {
            if (Random.NextBoolean())
            {
                forUtil.SkipBlock(@in);
                continue;
            }
            int[] restored = new int[Lucene41.ForUtil.MAX_DATA_SIZE];
            forUtil.ReadBlock(@in, new byte[Lucene41.ForUtil.MAX_ENCODED_SIZE], restored);
            Assert.AreEqual(Arrays.CopyOfRange(values, i * Lucene41PostingsFormat.BLOCK_SIZE, (i + 1) * Lucene41PostingsFormat.BLOCK_SIZE), Arrays.CopyOf(restored, Lucene41PostingsFormat.BLOCK_SIZE));
        }
        assertEquals(endPointer, @in.GetFilePointer());
        @in.Dispose();
    }
}
/* Walk through all unique text tokens (Posting
 * instances) found in this field and serialize them
 * into a single RAM segment. */
internal void AppendPostings(ThreadState.FieldData[] fields, TermInfosWriter termsOut, IndexOutput freqOut, IndexOutput proxOut)
{
    int fieldNumber = fields[0].fieldInfo.number;
    int numFields = fields.Length;

    FieldMergeState[] mergeStates = new FieldMergeState[numFields];

    for (int i = 0; i < numFields; i++)
    {
        FieldMergeState fms = mergeStates[i] = new FieldMergeState();
        fms.field = fields[i];
        fms.postings = fms.field.SortPostings();

        System.Diagnostics.Debug.Assert(fms.field.fieldInfo == fields[0].fieldInfo); // Should always be true

        bool result = fms.NextTerm();
        System.Diagnostics.Debug.Assert(result);
    }

    int skipInterval = termsOut.skipInterval;
    currentFieldStorePayloads = fields[0].fieldInfo.storePayloads;

    FieldMergeState[] termStates = new FieldMergeState[numFields];

    while (numFields > 0)
    {
        // Get the next term to merge
        termStates[0] = mergeStates[0];
        int numToMerge = 1;

        for (int i = 1; i < numFields; i++)
        {
            char[] text = mergeStates[i].text;
            int textOffset = mergeStates[i].textOffset;
            int cmp = CompareText(text, textOffset, termStates[0].text, termStates[0].textOffset);

            if (cmp < 0)
            {
                termStates[0] = mergeStates[i];
                numToMerge = 1;
            }
            else if (cmp == 0)
            {
                termStates[numToMerge++] = mergeStates[i];
            }
        }

        int df = 0;
        int lastPayloadLength = -1;

        int lastDoc = 0;

        char[] text2 = termStates[0].text;
        int start = termStates[0].textOffset;
        int pos = start;
        while (text2[pos] != 0xffff)
        {
            pos++;
        }

        long freqPointer = freqOut.GetFilePointer();
        long proxPointer = proxOut.GetFilePointer();

        skipListWriter.ResetSkip();

        // Now termStates has numToMerge FieldMergeStates
        // which all share the same term. Now we must
        // interleave the docID streams.
        while (numToMerge > 0)
        {
            if ((++df % skipInterval) == 0)
            {
                skipListWriter.SetSkipData(lastDoc, currentFieldStorePayloads, lastPayloadLength);
                skipListWriter.BufferSkip(df);
            }

            FieldMergeState minState = termStates[0];
            for (int i = 1; i < numToMerge; i++)
            {
                if (termStates[i].docID < minState.docID)
                {
                    minState = termStates[i];
                }
            }

            int doc = minState.docID;
            int termDocFreq = minState.termFreq;

            System.Diagnostics.Debug.Assert(doc < numDocsInRAM);
            System.Diagnostics.Debug.Assert(doc > lastDoc || df == 1);

            int newDocCode = (doc - lastDoc) << 1;
            lastDoc = doc;

            ByteSliceReader prox = minState.prox;

            // Carefully copy over the prox + payload info,
            // changing the format to match Lucene's segment
            // format.
            for (int j = 0; j < termDocFreq; j++)
            {
                int code = prox.ReadVInt();
                if (currentFieldStorePayloads)
                {
                    int payloadLength;
                    if ((code & 1) != 0)
                    {
                        // This position has a payload
                        payloadLength = prox.ReadVInt();
                    }
                    else
                    {
                        payloadLength = 0;
                    }
                    if (payloadLength != lastPayloadLength)
                    {
                        proxOut.WriteVInt(code | 1);
                        proxOut.WriteVInt(payloadLength);
                        lastPayloadLength = payloadLength;
                    }
                    else
                    {
                        proxOut.WriteVInt(code & (~1));
                    }
                    if (payloadLength > 0)
                    {
                        CopyBytes(prox, proxOut, payloadLength);
                    }
                }
                else
                {
                    System.Diagnostics.Debug.Assert(0 == (code & 1));
                    proxOut.WriteVInt(code >> 1);
                }
            }

            if (1 == termDocFreq)
            {
                freqOut.WriteVInt(newDocCode | 1);
            }
            else
            {
                freqOut.WriteVInt(newDocCode);
                freqOut.WriteVInt(termDocFreq);
            }

            if (!minState.NextDoc())
            {
                // Remove from termStates
                int upto = 0;
                for (int i = 0; i < numToMerge; i++)
                {
                    if (termStates[i] != minState)
                    {
                        termStates[upto++] = termStates[i];
                    }
                }
                numToMerge--;
                System.Diagnostics.Debug.Assert(upto == numToMerge);

                // Advance this state to the next term
                if (!minState.NextTerm())
                {
                    // OK, no more terms, so remove from mergeStates
                    // as well
                    upto = 0;
                    for (int i = 0; i < numFields; i++)
                    {
                        if (mergeStates[i] != minState)
                        {
                            mergeStates[upto++] = mergeStates[i];
                        }
                    }
                    numFields--;
                    System.Diagnostics.Debug.Assert(upto == numFields);
                }
            }
        }

        System.Diagnostics.Debug.Assert(df > 0);

        // Done merging this term

        long skipPointer = skipListWriter.WriteSkip(freqOut);

        // Write term
        termInfo.Set(df, freqPointer, proxPointer, (int) (skipPointer - freqPointer));
        termsOut.Add(fieldNumber, text2, start, pos - start, termInfo);
    }
}
internal void AddDocument(Document doc)
{
    indexStream.WriteLong(fieldsStream.GetFilePointer());

    int storedCount = 0;
    System.Collections.IEnumerator fieldIterator = doc.GetFields().GetEnumerator();
    while (fieldIterator.MoveNext())
    {
        Fieldable field = (Fieldable) fieldIterator.Current;
        if (field.IsStored())
        {
            storedCount++;
        }
    }
    fieldsStream.WriteVInt(storedCount);

    fieldIterator = doc.GetFields().GetEnumerator();
    while (fieldIterator.MoveNext())
    {
        Fieldable field = (Fieldable) fieldIterator.Current;
        // if the field is an instance of FieldsReader.FieldForMerge, we're in merge mode
        // and field.BinaryValue() already returns the compressed value for a field
        // with IsCompressed()==true, so we disable compression in that case
        bool disableCompression = (field is FieldsReader.FieldForMerge);
        if (field.IsStored())
        {
            fieldsStream.WriteVInt(fieldInfos.FieldNumber(field.Name()));

            byte bits = 0;
            if (field.IsTokenized())
            {
                bits |= FieldsWriter.FIELD_IS_TOKENIZED;
            }
            if (field.IsBinary())
            {
                bits |= FieldsWriter.FIELD_IS_BINARY;
            }
            if (field.IsCompressed())
            {
                bits |= FieldsWriter.FIELD_IS_COMPRESSED;
            }
            fieldsStream.WriteByte(bits);

            if (field.IsCompressed())
            {
                // compression is enabled for the current field
                byte[] data = null;

                if (disableCompression)
                {
                    // optimized case for merging, the data
                    // is already compressed
                    data = field.BinaryValue();
                }
                else
                {
                    // check if it is a binary field
                    if (field.IsBinary())
                    {
                        data = Compress(field.BinaryValue());
                    }
                    else
                    {
                        data = Compress(System.Text.Encoding.GetEncoding("UTF-8").GetBytes(field.StringValue()));
                    }
                }
                int len = data.Length;
                fieldsStream.WriteVInt(len);
                fieldsStream.WriteBytes(data, len);
            }
            else
            {
                // compression is disabled for the current field
                if (field.IsBinary())
                {
                    byte[] data = field.BinaryValue();
                    int len = data.Length;
                    fieldsStream.WriteVInt(len);
                    fieldsStream.WriteBytes(data, len);
                }
                else
                {
                    fieldsStream.WriteString(field.StringValue());
                }
            }
        }
    }
}
/* Walk through all unique text tokens (Posting
 * instances) found in this field and serialize them
 * into a single RAM segment. */
void AppendPostings(DocumentsWriter.FlushState flushState, FreqProxTermsWriterPerField[] fields, TermInfosWriter termsOut, IndexOutput freqOut, IndexOutput proxOut, DefaultSkipListWriter skipListWriter)
{
    int fieldNumber = fields[0].fieldInfo.number;
    int numFields = fields.Length;

    FreqProxFieldMergeState[] mergeStates = new FreqProxFieldMergeState[numFields];

    for (int i = 0; i < numFields; i++)
    {
        FreqProxFieldMergeState fms = mergeStates[i] = new FreqProxFieldMergeState(fields[i]);

        System.Diagnostics.Debug.Assert(fms.field.fieldInfo == fields[0].fieldInfo); // Should always be true

        bool result = fms.nextTerm();
        System.Diagnostics.Debug.Assert(result);
    }

    int skipInterval = termsOut.skipInterval;

    bool currentFieldOmitTf = fields[0].fieldInfo.omitTf;

    // If current field omits tf then it cannot store
    // payloads. We silently drop the payloads in this case:
    bool currentFieldStorePayloads = currentFieldOmitTf ? false : fields[0].fieldInfo.storePayloads;

    FreqProxFieldMergeState[] termStates = new FreqProxFieldMergeState[numFields];

    while (numFields > 0)
    {
        // Get the next term to merge
        termStates[0] = mergeStates[0];
        int numToMerge = 1;

        for (int i = 1; i < numFields; i++)
        {
            char[] text = mergeStates[i].text;
            int textOffset = mergeStates[i].textOffset;
            int cmp = compareText(text, textOffset, termStates[0].text, termStates[0].textOffset);

            if (cmp < 0)
            {
                termStates[0] = mergeStates[i];
                numToMerge = 1;
            }
            else if (cmp == 0)
            {
                termStates[numToMerge++] = mergeStates[i];
            }
        }

        int df = 0;
        int lastPayloadLength = -1;

        int lastDoc = 0;

        char[] text_Renamed = termStates[0].text;
        int start = termStates[0].textOffset;

        long freqPointer = freqOut.GetFilePointer();
        long proxPointer;
        if (proxOut != null)
        {
            proxPointer = proxOut.GetFilePointer();
        }
        else
        {
            proxPointer = 0;
        }

        skipListWriter.ResetSkip();

        // Now termStates has numToMerge FieldMergeStates
        // which all share the same term. Now we must
        // interleave the docID streams.
        while (numToMerge > 0)
        {
            if ((++df % skipInterval) == 0)
            {
                skipListWriter.SetSkipData(lastDoc, currentFieldStorePayloads, lastPayloadLength);
                skipListWriter.BufferSkip(df);
            }

            FreqProxFieldMergeState minState = termStates[0];
            for (int i = 1; i < numToMerge; i++)
            {
                if (termStates[i].docID < minState.docID)
                {
                    minState = termStates[i];
                }
            }

            int doc = minState.docID;
            int termDocFreq = minState.termFreq;

            System.Diagnostics.Debug.Assert(doc < flushState.numDocsInRAM);
            System.Diagnostics.Debug.Assert(doc > lastDoc || df == 1);

            ByteSliceReader prox = minState.prox;

            // Carefully copy over the prox + payload info,
            // changing the format to match Lucene's segment
            // format.
            if (!currentFieldOmitTf)
            {
                // omitTf == false so we do write positions & payload
                System.Diagnostics.Debug.Assert(proxOut != null);
                for (int j = 0; j < termDocFreq; j++)
                {
                    int code = prox.ReadVInt();
                    if (currentFieldStorePayloads)
                    {
                        int payloadLength;
                        if ((code & 1) != 0)
                        {
                            // This position has a payload
                            payloadLength = prox.ReadVInt();
                        }
                        else
                        {
                            payloadLength = 0;
                        }
                        if (payloadLength != lastPayloadLength)
                        {
                            proxOut.WriteVInt(code | 1);
                            proxOut.WriteVInt(payloadLength);
                            lastPayloadLength = payloadLength;
                        }
                        else
                        {
                            proxOut.WriteVInt(code & (~1));
                        }
                        if (payloadLength > 0)
                        {
                            copyBytes(prox, proxOut, payloadLength);
                        }
                    }
                    else
                    {
                        System.Diagnostics.Debug.Assert(0 == (code & 1));
                        proxOut.WriteVInt(code >> 1);
                    }
                } // End for

                int newDocCode = (doc - lastDoc) << 1;

                if (1 == termDocFreq)
                {
                    freqOut.WriteVInt(newDocCode | 1);
                }
                else
                {
                    freqOut.WriteVInt(newDocCode);
                    freqOut.WriteVInt(termDocFreq);
                }
            }
            else
            {
                // omitTf==true: we store only the docs, without
                // term freq, positions, payloads
                freqOut.WriteVInt(doc - lastDoc);
            }

            lastDoc = doc;

            if (!minState.nextDoc())
            {
                // Remove from termStates
                int upto = 0;
                for (int i = 0; i < numToMerge; i++)
                {
                    if (termStates[i] != minState)
                    {
                        termStates[upto++] = termStates[i];
                    }
                }
                numToMerge--;
                System.Diagnostics.Debug.Assert(upto == numToMerge);

                // Advance this state to the next term
                if (!minState.nextTerm())
                {
                    // OK, no more terms, so remove from mergeStates
                    // as well
                    upto = 0;
                    for (int i = 0; i < numFields; i++)
                    {
                        if (mergeStates[i] != minState)
                        {
                            mergeStates[upto++] = mergeStates[i];
                        }
                    }
                    numFields--;
                    System.Diagnostics.Debug.Assert(upto == numFields);
                }
            }
        }

        System.Diagnostics.Debug.Assert(df > 0);

        // Done merging this term

        long skipPointer = skipListWriter.WriteSkip(freqOut);

        // Write term
        termInfo.Set(df, freqPointer, proxPointer, (int) (skipPointer - freqPointer));

        // TODO: we could do this incrementally
        UnicodeUtil.UTF16toUTF8(text_Renamed, start, termsUTF8);

        // TODO: we could save O(n) re-scan of the term by
        // computing the shared prefix with the last term
        // during the UTF8 encoding
        termsOut.Add(fieldNumber, termsUTF8.result, termsUTF8.length, termInfo);
    }
}
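// Reader-side sketch of the .prx entries written by the AppendPostings snippets above (a
// hypothetical helper, not the shipped SegmentTermPositions): when payloads are stored, the
// position delta is shifted left by one and a set low bit means a new payload length follows
// (otherwise the previous length is reused); without payloads the entry is the plain delta.
private static int ReadProxEntry(IndexInput proxIn, bool storePayloads, ref int payloadLength)
{
    int code = proxIn.ReadVInt();
    if (!storePayloads)
    {
        return code;                         // plain position delta, no payload bookkeeping
    }
    if ((code & 1) != 0)
    {
        payloadLength = proxIn.ReadVInt();   // low bit set: the payload length changed
    }
    // the payload bytes (payloadLength of them) follow and would be read or skipped here
    return code >> 1;                        // upper bits hold the position delta
}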
// Writes the contents of buffer into the fields stream
// and adds a new entry for this document into the index
// stream. This assumes the buffer was already written
// in the correct fields format.
internal void FlushDocument(int numStoredFields, RAMOutputStream buffer)
{
    indexStream.WriteLong(fieldsStream.GetFilePointer());
    fieldsStream.WriteVInt(numStoredFields);
    buffer.WriteTo(fieldsStream);
}
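// A minimal reader-side sketch (hypothetical helper, not the shipped FieldsReader): because
// the index stream holds one 8-byte pointer per document into the fields stream, the data
// for document n starts at the long stored at offset formatSize + n * 8. The formatSize
// parameter is an assumption covering whatever header the concrete format writes, if any.
private static long SeekToDocument(IndexInput indexStream, IndexInput fieldsStream, int n, long formatSize)
{
    indexStream.Seek(formatSize + n * 8L);
    long position = indexStream.ReadLong();   // pointer recorded by FlushDocument/AddDocument
    fieldsStream.Seek(position);              // stored fields for document n start here
    return position;
}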
/** Produce _X.nrm if any document had a field with norms
 * not disabled */
internal override void flush(IDictionary<object, ICollection<object>> threadsAndFields, DocumentsWriter.FlushState state)
{
    IDictionary<object, object> byField = new Dictionary<object, object>();

    // Typically, each thread will have encountered the same
    // field. So first we collate by field, ie, all
    // per-thread field instances that correspond to the
    // same FieldInfo
    IEnumerator<KeyValuePair<object, ICollection<object>>> it = threadsAndFields.GetEnumerator();
    while (it.MoveNext())
    {
        KeyValuePair<object, ICollection<object>> entry = it.Current;

        ICollection<object> fields = entry.Value;
        IEnumerator<object> fieldsIt = fields.GetEnumerator();
        List<object> fieldsToRemove = new List<object>(fields.Count);

        while (fieldsIt.MoveNext())
        {
            NormsWriterPerField perField = (NormsWriterPerField) fieldsIt.Current;

            if (perField.upto > 0)
            {
                // It has some norms
                IList<object> l;
                if (byField.ContainsKey(perField.fieldInfo))
                {
                    l = (IList<object>) byField[perField.fieldInfo];
                }
                else
                {
                    l = new List<object>();
                    byField[perField.fieldInfo] = l;
                }
                l.Add(perField);
            }
            else
            {
                // Remove this field since we haven't seen it
                // since the previous flush
                fieldsToRemove.Add(perField);
            }
        }

        for (int i = 0; i < fieldsToRemove.Count; i++)
        {
            fields.Remove(fieldsToRemove[i]);
        }
    }

    string normsFileName = state.segmentName + "." + IndexFileNames.NORMS_EXTENSION;
    state.flushedFiles[normsFileName] = normsFileName;
    IndexOutput normsOut = state.directory.CreateOutput(normsFileName);

    try
    {
        normsOut.WriteBytes(SegmentMerger.NORMS_HEADER, 0, SegmentMerger.NORMS_HEADER.Length);

        int numField = fieldInfos.Size();

        int normCount = 0;

        for (int fieldNumber = 0; fieldNumber < numField; fieldNumber++)
        {
            FieldInfo fieldInfo = fieldInfos.FieldInfo(fieldNumber);

            List<object> toMerge;
            int upto = 0;
            if (byField.ContainsKey(fieldInfo))
            {
                toMerge = (List<object>) byField[fieldInfo];

                int numFields = toMerge.Count;

                normCount++;

                NormsWriterPerField[] fields = new NormsWriterPerField[numFields];
                int[] uptos = new int[numFields];

                for (int j = 0; j < numFields; j++)
                {
                    fields[j] = (NormsWriterPerField) toMerge[j];
                }

                int numLeft = numFields;

                while (numLeft > 0)
                {
                    System.Diagnostics.Debug.Assert(uptos[0] < fields[0].docIDs.Length, " uptos[0]=" + uptos[0] + " len=" + (fields[0].docIDs.Length));

                    int minLoc = 0;
                    int minDocID = fields[0].docIDs[uptos[0]];

                    for (int j = 1; j < numLeft; j++)
                    {
                        int docID = fields[j].docIDs[uptos[j]];
                        if (docID < minDocID)
                        {
                            minDocID = docID;
                            minLoc = j;
                        }
                    }

                    System.Diagnostics.Debug.Assert(minDocID < state.numDocsInRAM);

                    // Fill hole
                    for (; upto < minDocID; upto++)
                    {
                        normsOut.WriteByte(defaultNorm);
                    }

                    normsOut.WriteByte(fields[minLoc].norms[uptos[minLoc]]);
                    (uptos[minLoc])++;
                    upto++;

                    if (uptos[minLoc] == fields[minLoc].upto)
                    {
                        fields[minLoc].reset();
                        if (minLoc != numLeft - 1)
                        {
                            fields[minLoc] = fields[numLeft - 1];
                            uptos[minLoc] = uptos[numLeft - 1];
                        }
                        numLeft--;
                    }
                }

                // Fill final hole with defaultNorm
                for (; upto < state.numDocsInRAM; upto++)
                {
                    normsOut.WriteByte(defaultNorm);
                }
            }
            else if (fieldInfo.isIndexed && !fieldInfo.omitNorms)
            {
                normCount++;
                // Fill entire field with default norm:
                for (; upto < state.numDocsInRAM; upto++)
                {
                    normsOut.WriteByte(defaultNorm);
                }
            }

            System.Diagnostics.Debug.Assert(4 + normCount * state.numDocsInRAM == normsOut.GetFilePointer(), ".nrm file size mismatch: expected=" + (4 + normCount * state.numDocsInRAM) + " actual=" + normsOut.GetFilePointer());
        }
    }
    finally
    {
        normsOut.Close();
    }
}