/// <summary> Save this segment's info.</summary>
internal void Write(IndexOutput output)
{
    output.WriteString(name);
    output.WriteInt(docCount);
    output.WriteLong(delGen);
    output.WriteInt(docStoreOffset);
    if (docStoreOffset != -1)
    {
        output.WriteString(docStoreSegment);
        output.WriteByte((byte) (docStoreIsCompoundFile ? 1 : 0));
    }
    output.WriteByte((byte) (hasSingleNormFile ? 1 : 0));
    if (normGen == null)
    {
        output.WriteInt(NO);
    }
    else
    {
        output.WriteInt(normGen.Length);
        for (int j = 0; j < normGen.Length; j++)
        {
            output.WriteLong(normGen[j]);
        }
    }
    output.WriteByte((byte) isCompoundFile);
    output.WriteInt(delCount);
    output.WriteByte((byte) (hasProx ? 1 : 0));
    output.WriteStringStringMap(diagnostics);
}
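// For orientation, a minimal sketch of the matching read path. It simply mirrors
// the write order above into local variables and assumes IndexInput exposes the
// usual counterparts (ReadString, ReadInt, ReadLong, ReadByte, ReadStringStringMap).
// ReadSketch is a hypothetical name, not the library's actual reader.
internal static void ReadSketch(IndexInput input)
{
    string name = input.ReadString();
    int docCount = input.ReadInt();
    long delGen = input.ReadLong();
    int docStoreOffset = input.ReadInt();
    if (docStoreOffset != -1)
    {
        string docStoreSegment = input.ReadString();
        bool docStoreIsCompoundFile = input.ReadByte() == 1;
    }
    bool hasSingleNormFile = input.ReadByte() == 1;

    long[] normGen = null;
    int numNormGen = input.ReadInt();   // NO means normGen was written as null
    if (numNormGen != NO)
    {
        normGen = new long[numNormGen];
        for (int j = 0; j < numNormGen; j++)
        {
            normGen[j] = input.ReadLong();
        }
    }

    byte isCompoundFile = input.ReadByte();
    int delCount = input.ReadInt();
    bool hasProx = input.ReadByte() == 1;
    input.ReadStringStringMap();        // diagnostics map
}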
private void MergeNorms()
{
    byte[] normBuffer = null;
    IndexOutput output = null;
    try
    {
        int numFieldInfos = fieldInfos.Size();
        for (int i = 0; i < numFieldInfos; i++)
        {
            FieldInfo fi = fieldInfos.FieldInfo(i);
            if (fi.isIndexed && !fi.omitNorms)
            {
                if (output == null)
                {
                    output = directory.CreateOutput(segment + "." + IndexFileNames.NORMS_EXTENSION);
                    output.WriteBytes(NORMS_HEADER, NORMS_HEADER.Length);
                }
                for (System.Collections.IEnumerator iter = readers.GetEnumerator(); iter.MoveNext();)
                {
                    IndexReader reader = (IndexReader) iter.Current;
                    int maxDoc = reader.MaxDoc();
                    if (normBuffer == null || normBuffer.Length < maxDoc)
                    {
                        // the buffer is too small for the current segment
                        normBuffer = new byte[maxDoc];
                    }
                    reader.Norms(fi.name, normBuffer, 0);
                    if (!reader.HasDeletions())
                    {
                        // optimized case for segments without deleted docs
                        output.WriteBytes(normBuffer, maxDoc);
                    }
                    else
                    {
                        // this segment has deleted docs, so we have to
                        // check for every doc if it is deleted or not
                        for (int k = 0; k < maxDoc; k++)
                        {
                            if (!reader.IsDeleted(k))
                            {
                                output.WriteByte(normBuffer[k]);
                            }
                        }
                    }
                    checkAbort.Work(maxDoc);
                }
            }
        }
    }
    finally
    {
        if (output != null)
        {
            output.Close();
        }
    }
}
/// <summary>Write as a d-gaps list </summary>
private void WriteDgaps(IndexOutput output)
{
    output.WriteInt(-1);       // mark using d-gaps
    output.WriteInt(Size());   // write size
    output.WriteInt(Count());  // write count
    int last = 0;
    int n = Count();
    int m = bits.Length;
    for (int i = 0; i < m && n > 0; i++)
    {
        if (bits[i] != 0)
        {
            output.WriteVInt(i - last);
            output.WriteByte(bits[i]);
            last = i;
            n -= BYTE_COUNTS[bits[i] & 0xFF];
        }
    }
}
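// A minimal decode sketch, mirroring the format written above: the -1 marker,
// the size and count, then one (VInt gap, byte) pair per non-zero byte of the
// bit array, where each gap is the distance from the previous non-zero byte's
// index. ReadDgapsSketch is a hypothetical name, not the library's reader.
private void ReadDgapsSketch(IndexInput input)
{
    input.ReadInt();                    // the -1 marker that says "d-gaps format"
    int numBits = input.ReadInt();      // size: total number of bits
    int numSet = input.ReadInt();       // count: number of set bits
    bits = new byte[(numBits >> 3) + 1];
    int last = 0;
    int n = numSet;
    while (n > 0)
    {
        last += input.ReadVInt();       // undo the delta (gap) encoding to get the byte index
        bits[last] = input.ReadByte();  // the non-zero byte itself
        n -= BYTE_COUNTS[bits[last] & 0xFF];
    }
}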
public void Write(IndexOutput output)
{
    output.WriteVInt(CURRENT_FORMAT);
    output.WriteVInt(Size());
    for (int i = 0; i < Size(); i++)
    {
        FieldInfo fi = FieldInfo(i);
        byte bits = (byte) 0x0;
        if (fi.isIndexed)
        {
            bits |= IS_INDEXED;
        }
        if (fi.storeTermVector)
        {
            bits |= STORE_TERMVECTOR;
        }
        if (fi.storePositionWithTermVector)
        {
            bits |= STORE_POSITIONS_WITH_TERMVECTOR;
        }
        if (fi.storeOffsetWithTermVector)
        {
            bits |= STORE_OFFSET_WITH_TERMVECTOR;
        }
        if (fi.omitNorms)
        {
            bits |= OMIT_NORMS;
        }
        if (fi.storePayloads)
        {
            bits |= STORE_PAYLOADS;
        }
        if (fi.omitTermFreqAndPositions)
        {
            bits |= OMIT_TERM_FREQ_AND_POSITIONS;
        }
        output.WriteString(fi.name);
        output.WriteByte(bits);
    }
}
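// For illustration only: unpacking the flag byte written above. The constants
// are the same ones used by Write; the local booleans are just a sketch of how
// a reader would interpret each bit, not the library's actual read code.
private static void DecodeBitsSketch(byte bits)
{
    bool isIndexed = (bits & IS_INDEXED) != 0;
    bool storeTermVector = (bits & STORE_TERMVECTOR) != 0;
    bool storePositionsWithTermVector = (bits & STORE_POSITIONS_WITH_TERMVECTOR) != 0;
    bool storeOffsetWithTermVector = (bits & STORE_OFFSET_WITH_TERMVECTOR) != 0;
    bool omitNorms = (bits & OMIT_NORMS) != 0;
    bool storePayloads = (bits & STORE_PAYLOADS) != 0;
    bool omitTermFreqAndPositions = (bits & OMIT_TERM_FREQ_AND_POSITIONS) != 0;
}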
/// <summary>Produce _X.nrm if any document had a field with norms
/// not disabled
/// </summary>
public override void Flush(System.Collections.IDictionary threadsAndFields, SegmentWriteState state)
{
    System.Collections.IDictionary byField = new System.Collections.Hashtable();

    // Typically, each thread will have encountered the same
    // field. So first we collate by field, i.e. all
    // per-thread field instances that correspond to the
    // same FieldInfo
    System.Collections.IEnumerator it = new System.Collections.Hashtable(threadsAndFields).GetEnumerator();
    while (it.MoveNext())
    {
        System.Collections.DictionaryEntry entry = (System.Collections.DictionaryEntry) it.Current;
        System.Collections.ICollection fields = (System.Collections.ICollection) entry.Value;
        System.Collections.IEnumerator fieldsIt = fields.GetEnumerator();
        System.Collections.ArrayList fieldsToRemove = new System.Collections.ArrayList();
        while (fieldsIt.MoveNext())
        {
            NormsWriterPerField perField = (NormsWriterPerField) ((System.Collections.DictionaryEntry) fieldsIt.Current).Key;
            if (perField.upto > 0)
            {
                // It has some norms
                System.Collections.IList l = (System.Collections.IList) byField[perField.fieldInfo];
                if (l == null)
                {
                    l = new System.Collections.ArrayList();
                    byField[perField.fieldInfo] = l;
                }
                l.Add(perField);
            }
            else
            {
                // Remove this field since we haven't seen it
                // since the previous flush
                fieldsToRemove.Add(perField);
            }
        }

        System.Collections.Hashtable fieldsHT = (System.Collections.Hashtable) fields;
        for (int i = 0; i < fieldsToRemove.Count; i++)
        {
            fieldsHT.Remove(fieldsToRemove[i]);
        }
    }

    System.String normsFileName = state.segmentName + "." + IndexFileNames.NORMS_EXTENSION;
    state.flushedFiles[normsFileName] = normsFileName;
    IndexOutput normsOut = state.directory.CreateOutput(normsFileName);

    try
    {
        normsOut.WriteBytes(SegmentMerger.NORMS_HEADER, 0, SegmentMerger.NORMS_HEADER.Length);

        int numField = fieldInfos.Size();
        int normCount = 0;

        for (int fieldNumber = 0; fieldNumber < numField; fieldNumber++)
        {
            FieldInfo fieldInfo = fieldInfos.FieldInfo(fieldNumber);
            System.Collections.IList toMerge = (System.Collections.IList) byField[fieldInfo];
            int upto = 0;
            if (toMerge != null)
            {
                int numFields = toMerge.Count;
                normCount++;

                NormsWriterPerField[] fields = new NormsWriterPerField[numFields];
                int[] uptos = new int[numFields];

                for (int j = 0; j < numFields; j++)
                {
                    fields[j] = (NormsWriterPerField) toMerge[j];
                }

                int numLeft = numFields;

                while (numLeft > 0)
                {
                    System.Diagnostics.Debug.Assert(uptos[0] < fields[0].docIDs.Length, " uptos[0]=" + uptos[0] + " len=" + (fields[0].docIDs.Length));

                    int minLoc = 0;
                    int minDocID = fields[0].docIDs[uptos[0]];

                    for (int j = 1; j < numLeft; j++)
                    {
                        int docID = fields[j].docIDs[uptos[j]];
                        if (docID < minDocID)
                        {
                            minDocID = docID;
                            minLoc = j;
                        }
                    }

                    System.Diagnostics.Debug.Assert(minDocID < state.numDocs);

                    // Fill hole
                    for (; upto < minDocID; upto++)
                    {
                        normsOut.WriteByte(defaultNorm);
                    }

                    normsOut.WriteByte(fields[minLoc].norms[uptos[minLoc]]);
                    (uptos[minLoc])++;
                    upto++;

                    if (uptos[minLoc] == fields[minLoc].upto)
                    {
                        fields[minLoc].Reset();
                        if (minLoc != numLeft - 1)
                        {
                            fields[minLoc] = fields[numLeft - 1];
                            uptos[minLoc] = uptos[numLeft - 1];
                        }
                        numLeft--;
                    }
                }

                // Fill final hole with defaultNorm
                for (; upto < state.numDocs; upto++)
                {
                    normsOut.WriteByte(defaultNorm);
                }
            }
            else if (fieldInfo.isIndexed && !fieldInfo.omitNorms)
            {
                normCount++;
                // Fill entire field with default norm:
                for (; upto < state.numDocs; upto++)
                {
                    normsOut.WriteByte(defaultNorm);
                }
            }

            System.Diagnostics.Debug.Assert(4 + normCount * state.numDocs == normsOut.GetFilePointer(), ".nrm file size mismatch: expected=" + (4 + normCount * state.numDocs) + " actual=" + normsOut.GetFilePointer());
        }
    }
    finally
    {
        normsOut.Close();
    }
}
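// A minimal sketch of the .nrm layout implied by the assert above
// ("4 + normCount * state.numDocs"): a 4-byte NORMS_HEADER, then one norm byte
// per document for each field that keeps norms, field after field.
// ReadNormsSketch and its parameters are illustrative, not the library's reader.
private static byte[][] ReadNormsSketch(IndexInput normsIn, int normCount, int numDocs)
{
    byte[] header = new byte[4];
    normsIn.ReadBytes(header, 0, 4);             // the NORMS_HEADER bytes
    byte[][] norms = new byte[normCount][];
    for (int i = 0; i < normCount; i++)
    {
        norms[i] = new byte[numDocs];
        normsIn.ReadBytes(norms[i], 0, numDocs); // one byte per doc for this field
    }
    return norms;
}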
internal void WriteField(FieldInfo fi, Fieldable field)
{
    // If the field is an instance of FieldsReader.FieldForMerge, we're in merge mode
    // and field.GetBinaryValue() already returns the compressed value for a field
    // with IsCompressed()==true, so we disable compression in that case
    bool disableCompression = (field is FieldsReader.FieldForMerge);
    fieldsStream.WriteVInt(fi.number);
    byte bits = 0;
    if (field.IsTokenized())
    {
        bits |= FieldsWriter.FIELD_IS_TOKENIZED;
    }
    if (field.IsBinary())
    {
        bits |= FieldsWriter.FIELD_IS_BINARY;
    }
    if (field.IsCompressed())
    {
        bits |= FieldsWriter.FIELD_IS_COMPRESSED;
    }
    fieldsStream.WriteByte(bits);

    if (field.IsCompressed())
    {
        // compression is enabled for the current field
        byte[] data;
        int len;
        int offset;
        if (disableCompression)
        {
            // optimized case for merging, the data
            // is already compressed
            data = field.GetBinaryValue();
            System.Diagnostics.Debug.Assert(data != null);
            len = field.GetBinaryLength();
            offset = field.GetBinaryOffset();
        }
        else
        {
            // check if it is a binary field
            if (field.IsBinary())
            {
                data = CompressionTools.Compress(field.GetBinaryValue(), field.GetBinaryOffset(), field.GetBinaryLength());
            }
            else
            {
                byte[] x = System.Text.Encoding.GetEncoding("UTF-8").GetBytes(field.StringValue());
                data = CompressionTools.Compress(x, 0, x.Length);
            }
            len = data.Length;
            offset = 0;
        }
        fieldsStream.WriteVInt(len);
        fieldsStream.WriteBytes(data, offset, len);
    }
    else
    {
        // compression is disabled for the current field
        if (field.IsBinary())
        {
            byte[] data = field.GetBinaryValue();
            int len = field.GetBinaryLength();
            int offset = field.GetBinaryOffset();
            fieldsStream.WriteVInt(len);
            fieldsStream.WriteBytes(data, offset, len);
        }
        else
        {
            fieldsStream.WriteString(field.StringValue());
        }
    }
}
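// A minimal sketch of how the record written above would be consumed, assuming
// the usual IndexInput counterparts (ReadVInt, ReadByte, ReadString, ReadBytes).
// ReadFieldSketch and the local names are illustrative only; decompression of
// FIELD_IS_COMPRESSED data is omitted.
internal static void ReadFieldSketch(IndexInput fieldsStream)
{
    int fieldNumber = fieldsStream.ReadVInt();                       // which field this record belongs to
    byte bits = fieldsStream.ReadByte();                             // the flag byte written by WriteField
    bool tokenized = (bits & FieldsWriter.FIELD_IS_TOKENIZED) != 0;
    bool binary = (bits & FieldsWriter.FIELD_IS_BINARY) != 0;
    bool compressed = (bits & FieldsWriter.FIELD_IS_COMPRESSED) != 0;
    if (binary || compressed)
    {
        int len = fieldsStream.ReadVInt();                           // length prefix, then raw (possibly compressed) bytes
        byte[] data = new byte[len];
        fieldsStream.ReadBytes(data, 0, len);
    }
    else
    {
        string value = fieldsStream.ReadString();                    // plain string value
    }
}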
/// <summary>Called once per field per document if term vectors
/// are enabled, to write the vectors to
/// RAMOutputStream, which is then quickly flushed to
/// the real term vectors files in the Directory.
/// </summary>
internal override void Finish()
{
    System.Diagnostics.Debug.Assert(docState.TestPoint("TermVectorsTermsWriterPerField.finish start"));

    int numPostings = termsHashPerField.numPostings;

    System.Diagnostics.Debug.Assert(numPostings >= 0);

    if (!doVectors || numPostings == 0)
    {
        return;
    }

    if (numPostings > maxNumPostings)
    {
        maxNumPostings = numPostings;
    }

    IndexOutput tvf = perThread.doc.perDocTvf;

    // This is called once, after inverting all occurrences
    // of a given field in the doc. At this point we flush
    // our hash into the DocWriter.
    System.Diagnostics.Debug.Assert(fieldInfo.storeTermVector);
    System.Diagnostics.Debug.Assert(perThread.VectorFieldsInOrder(fieldInfo));

    perThread.doc.AddField(termsHashPerField.fieldInfo.number);

    RawPostingList[] postings = termsHashPerField.SortPostings();

    tvf.WriteVInt(numPostings);
    byte bits = (byte) 0x0;
    if (doVectorPositions)
    {
        bits |= TermVectorsReader.STORE_POSITIONS_WITH_TERMVECTOR;
    }
    if (doVectorOffsets)
    {
        bits |= TermVectorsReader.STORE_OFFSET_WITH_TERMVECTOR;
    }
    tvf.WriteByte(bits);

    int encoderUpto = 0;
    int lastTermBytesCount = 0;

    ByteSliceReader reader = perThread.vectorSliceReader;
    char[][] charBuffers = perThread.termsHashPerThread.charPool.buffers;

    for (int j = 0; j < numPostings; j++)
    {
        TermVectorsTermsWriter.PostingList posting = (TermVectorsTermsWriter.PostingList) postings[j];
        int freq = posting.freq;

        char[] text2 = charBuffers[posting.textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT];
        int start2 = posting.textStart & DocumentsWriter.CHAR_BLOCK_MASK;

        // We swap between two encoders to save copying
        // last Term's byte array
        UnicodeUtil.UTF8Result utf8Result = perThread.utf8Results[encoderUpto];

        // TODO: we could do this incrementally
        UnicodeUtil.UTF16toUTF8(text2, start2, utf8Result);
        int termBytesCount = utf8Result.length;

        // TODO: UTF16toUTF8 could tell us this prefix
        // Compute common prefix between last term and
        // this term
        int prefix = 0;
        if (j > 0)
        {
            byte[] lastTermBytes = perThread.utf8Results[1 - encoderUpto].result;
            byte[] termBytes = perThread.utf8Results[encoderUpto].result;
            while (prefix < lastTermBytesCount && prefix < termBytesCount)
            {
                if (lastTermBytes[prefix] != termBytes[prefix])
                {
                    break;
                }
                prefix++;
            }
        }
        encoderUpto = 1 - encoderUpto;
        lastTermBytesCount = termBytesCount;

        int suffix = termBytesCount - prefix;
        tvf.WriteVInt(prefix);
        tvf.WriteVInt(suffix);
        tvf.WriteBytes(utf8Result.result, prefix, suffix);
        tvf.WriteVInt(freq);

        if (doVectorPositions)
        {
            termsHashPerField.InitReader(reader, posting, 0);
            reader.WriteTo(tvf);
        }

        if (doVectorOffsets)
        {
            termsHashPerField.InitReader(reader, posting, 1);
            reader.WriteTo(tvf);
        }
    }

    termsHashPerField.Reset();

    // NOTE: we clear, per-field, at the thread level,
    // because term vectors fully write themselves on each
    // field; this saves RAM (e.g. if a large doc has two large
    // fields w/ term vectors on) because we recycle/reuse
    // all RAM after each field:
    perThread.termsHashPerThread.Reset(false);
}
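// A small sketch of the prefix/suffix term encoding used above: each term is
// written as a shared-prefix length, a suffix length, and the suffix bytes, so
// term j is rebuilt by keeping the first `prefix` bytes of term j-1 and
// appending the new suffix. DecodeTermsSketch is a hypothetical helper, not the
// library's TermVectorsReader; it reads the per-term freq but skips the
// positions/offsets data that may follow each term in the real .tvf record.
private static byte[][] DecodeTermsSketch(IndexInput tvf, int numTerms)
{
    byte[][] terms = new byte[numTerms][];
    byte[] lastTerm = new byte[0];
    for (int j = 0; j < numTerms; j++)
    {
        int prefix = tvf.ReadVInt();                     // bytes shared with the previous term
        int suffix = tvf.ReadVInt();                     // new bytes for this term
        byte[] term = new byte[prefix + suffix];
        System.Array.Copy(lastTerm, 0, term, 0, prefix); // reuse the shared prefix
        tvf.ReadBytes(term, prefix, suffix);             // read the suffix bytes
        int freq = tvf.ReadVInt();                       // term frequency (positions/offsets not handled here)
        terms[j] = term;
        lastTerm = term;
    }
    return terms;
}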