/// <summary>Add a new position &amp; payload </summary>
internal override void AddPosition(int position, byte[] payload, int payloadOffset, int payloadLength)
{
    System.Diagnostics.Debug.Assert(!omitTermFreqAndPositions, "omitTermFreqAndPositions is true");
    System.Diagnostics.Debug.Assert(out_Renamed != null);

    int delta = position - lastPosition;
    lastPosition = position;

    if (storePayloads)
    {
        if (payloadLength != lastPayloadLength)
        {
            // payload length changed: shift the delta left, set the low bit,
            // and write the new length
            lastPayloadLength = payloadLength;
            out_Renamed.WriteVInt((delta << 1) | 1);
            out_Renamed.WriteVInt(payloadLength);
        }
        else
        {
            out_Renamed.WriteVInt(delta << 1);
        }

        if (payloadLength > 0)
        {
            // note: payloadOffset is not consulted here; the payload is assumed
            // to start at offset 0, matching the upstream Lucene writer
            out_Renamed.WriteBytes(payload, payloadLength);
        }
    }
    else
    {
        out_Renamed.WriteVInt(delta);
    }
}
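// A minimal decoding sketch (hypothetical helper, not part of the writer above): reverses
// the (delta << 1 | lengthChanged) scheme used when payloads are stored. payloadLength
// carries the running payload length across calls.
internal static int ReadPositionDelta(IndexInput prox, ref int payloadLength)
{
    int code = prox.ReadVInt();
    if ((code & 1) != 0)
    {
        payloadLength = prox.ReadVInt(); // low bit set: a new payload length follows
    }
    return (int)((uint)code >> 1); // the remaining bits hold the position delta
}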
internal virtual void AddPositions(DocsAndPositionsEnum @in, IndexOutput @out)
{
    int freq = @in.Freq();
    @out.WriteVInt(freq);
    int previousPosition = 0;
    int previousEndOffset = 0;
    for (int i = 0; i < freq; i++)
    {
        int pos = @in.NextPosition();
        BytesRef payload = @in.Payload;
        // The low-order bit of token is set only if there is a payload; the
        // remaining bits are the delta-encoded position.
        int token = (pos - previousPosition) << 1 | (payload == null ? 0 : 1);
        @out.WriteVInt(token);
        previousPosition = pos;
        if (storeOffsets) // don't encode offsets if they are not stored
        {
            int startOffset = @in.StartOffset();
            int endOffset = @in.EndOffset();
            @out.WriteVInt(startOffset - previousEndOffset);
            @out.WriteVInt(endOffset - startOffset);
            previousEndOffset = endOffset;
        }
        if (payload != null)
        {
            @out.WriteVInt(payload.Length);
            @out.WriteBytes(payload.Bytes, payload.Offset, payload.Length);
        }
    }
}
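// Hypothetical counterpart to the token written above: the reader splits the VInt back
// into a position delta and a has-payload flag. Illustrative only.
internal static int ReadPositionToken(IndexInput input, int previousPosition, out bool hasPayload)
{
    int token = input.ReadVInt();
    hasPayload = (token & 1) != 0;                     // low-order bit: payload present?
    return previousPosition + (int)((uint)token >> 1); // remaining bits: position delta
}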
/// <summary>Adds a new doc in this term. If this returns null
/// then we just skip consuming positions/payloads.
/// </summary>
internal override FormatPostingsPositionsConsumer AddDoc(int docID, int termDocFreq)
{
    int delta = docID - lastDocID;

    if (docID < 0 || (df > 0 && delta <= 0))
    {
        throw new CorruptIndexException("docs out of order (" + docID + " <= " + lastDocID + " )");
    }

    if ((++df % skipInterval) == 0)
    {
        // TODO: abstraction violation
        skipListWriter.SetSkipData(lastDocID, storePayloads, posWriter.lastPayloadLength);
        skipListWriter.BufferSkip(df);
    }

    System.Diagnostics.Debug.Assert(docID < totalNumDocs, "docID=" + docID + " totalNumDocs=" + totalNumDocs);

    lastDocID = docID;
    if (omitTermFreqAndPositions)
    {
        out_Renamed.WriteVInt(delta);
    }
    else if (1 == termDocFreq)
    {
        // freq == 1 is folded into the delta: shift left and set the low bit
        out_Renamed.WriteVInt((delta << 1) | 1);
    }
    else
    {
        out_Renamed.WriteVInt(delta << 1);
        out_Renamed.WriteVInt(termDocFreq);
    }

    return posWriter;
}
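// Hedged sketch of how a reader undoes the doc encoding above when freqs are stored
// (i.e. !omitTermFreqAndPositions): the low bit folds the common termDocFreq == 1 case
// into the delta VInt. Hypothetical helper, not the real reader.
internal static int ReadDocEntry(IndexInput freqIn, int lastDocID, out int freq)
{
    int code = freqIn.ReadVInt();
    freq = (code & 1) != 0 ? 1 : freqIn.ReadVInt(); // low bit set means freq == 1
    return lastDocID + (int)((uint)code >> 1);      // upper bits: doc delta
}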
protected override void Dispose(bool disposing)
{
    _wrappedPostingsWriter.Dispose();

    if (_wrappedPostingsWriter is PulsingPostingsWriter || VERSION_CURRENT < VERSION_META_ARRAY)
    {
        return;
    }

    var summaryFileName = IndexFileNames.SegmentFileName(_segmentState.SegmentInfo.Name, _segmentState.SegmentSuffix,
        SUMMARY_EXTENSION);
    IndexOutput output = null;
    try
    {
        output = _segmentState.Directory.CreateOutput(summaryFileName, _segmentState.Context);
        CodecUtil.WriteHeader(output, CODEC, VERSION_CURRENT);
        output.WriteVInt(_fields.Count);
        foreach (var field in _fields)
        {
            output.WriteVInt(field.FieldNumber);
            output.WriteVInt(field.LongsSize);
        }
        output.Dispose();
    }
    finally
    {
        IOUtils.CloseWhileHandlingException(output);
    }
}
/// <summary>
/// Adds a new &lt;&lt;fieldNumber, termBytes&gt;, TermInfo&gt; pair to the set.
/// Term must be lexicographically greater than all previous Terms added.
/// TermInfo pointers must be positive and greater than all previous.
/// </summary>
public void Add(int fieldNumber, BytesRef term, TermInfo ti)
{
    Debug.Assert(CompareToLastTerm(fieldNumber, term) < 0 || (IsIndex && term.Length == 0 && LastTerm.Length == 0),
        "Terms are out of order: field=" + FieldName(FieldInfos, fieldNumber) + " (number " + fieldNumber + ")" +
        " lastField=" + FieldName(FieldInfos, LastFieldNumber) + " (number " + LastFieldNumber + ")" +
        " text=" + term.Utf8ToString() + " lastText=" + LastTerm.Utf8ToString());

    Debug.Assert(ti.FreqPointer >= LastTi.FreqPointer, "freqPointer out of order (" + ti.FreqPointer + " < " + LastTi.FreqPointer + ")");
    Debug.Assert(ti.ProxPointer >= LastTi.ProxPointer, "proxPointer out of order (" + ti.ProxPointer + " < " + LastTi.ProxPointer + ")");

    if (!IsIndex && Size % IndexInterval == 0)
    {
        Other.Add(LastFieldNumber, LastTerm, LastTi); // add an index term
    }

    WriteTerm(fieldNumber, term); // write term

    Output.WriteVInt(ti.DocFreq); // write doc freq
    Output.WriteVLong(ti.FreqPointer - LastTi.FreqPointer); // write pointers
    Output.WriteVLong(ti.ProxPointer - LastTi.ProxPointer);

    if (ti.DocFreq >= SkipInterval)
    {
        Output.WriteVInt(ti.SkipOffset);
    }

    if (IsIndex)
    {
        Output.WriteVLong(Other.Output.FilePointer - LastIndexPointer);
        LastIndexPointer = Other.Output.FilePointer; // write pointer
    }

    LastFieldNumber = fieldNumber;
    LastTi.Set(ti);
    Size++;
}
private void Persist()
{
    lock (this)
    {
        string fileName = SNAPSHOTS_PREFIX + NextWriteGen;
        IndexOutput @out = Dir.CreateOutput(fileName, IOContext.DEFAULT);
        bool success = false;
        try
        {
            CodecUtil.WriteHeader(@out, CODEC_NAME, VERSION_CURRENT);
            @out.WriteVInt(RefCounts.Count);
            foreach (KeyValuePair<long, int> ent in RefCounts)
            {
                @out.WriteVLong(ent.Key);
                @out.WriteVInt(ent.Value);
            }
            success = true;
        }
        finally
        {
            if (!success)
            {
                IOUtils.CloseWhileHandlingException(@out);
                try
                {
                    Dir.DeleteFile(fileName);
                }
                catch (Exception)
                {
                    // Suppress so we keep throwing original exception
                }
            }
            else
            {
                IOUtils.Close(@out);
            }
        }

        Dir.Sync(/*Collections.singletonList(*/ new[] { fileName } /*)*/);

        if (NextWriteGen > 0)
        {
            string lastSaveFile = SNAPSHOTS_PREFIX + (NextWriteGen - 1);
            try
            {
                Dir.DeleteFile(lastSaveFile);
            }
            catch (IOException)
            {
                // OK: likely it didn't exist
            }
        }

        NextWriteGen++;
    }
}
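// Hypothetical reader for the snapshots file persisted above: header, then a VInt count
// of (VLong generation, VInt refCount) pairs. Passing VERSION_CURRENT as both min and
// max version is an assumption for this sketch.
private static Dictionary<long, int> ReadSnapshots(IndexInput @in)
{
    CodecUtil.CheckHeader(@in, CODEC_NAME, VERSION_CURRENT, VERSION_CURRENT);
    int count = @in.ReadVInt();
    var refCounts = new Dictionary<long, int>();
    for (int i = 0; i < count; i++)
    {
        refCounts[@in.ReadVLong()] = @in.ReadVInt();
    }
    return refCounts;
}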
private void WriteHeader(int docBase, int numBufferedDocs, int[] numStoredFields, int[] lengths)
{
    // save docBase and numBufferedDocs
    FieldsStream.WriteVInt(docBase);
    FieldsStream.WriteVInt(numBufferedDocs);

    // save numStoredFields
    SaveInts(numStoredFields, numBufferedDocs, FieldsStream);

    // save lengths
    SaveInts(lengths, numBufferedDocs, FieldsStream);
}
/// <summary>
/// Sole constructor. </summary>
public CompressingTermVectorsWriter(Directory directory, SegmentInfo si, string segmentSuffix, IOContext context,
    string formatName, CompressionMode compressionMode, int chunkSize)
{
    Debug.Assert(directory != null);
    this.Directory = directory;
    this.Segment = si.Name;
    this.SegmentSuffix = segmentSuffix;
    this.CompressionMode = compressionMode;
    this.Compressor = compressionMode.NewCompressor();
    this.ChunkSize = chunkSize;

    NumDocs = 0;
    PendingDocs = new LinkedList<DocData>();
    TermSuffixes = new GrowableByteArrayDataOutput(ArrayUtil.Oversize(chunkSize, 1));
    PayloadBytes = new GrowableByteArrayDataOutput(ArrayUtil.Oversize(1, 1));
    LastTerm = new BytesRef(ArrayUtil.Oversize(30, 1));

    bool success = false;
    IndexOutput indexStream = directory.CreateOutput(IndexFileNames.SegmentFileName(Segment, segmentSuffix, VECTORS_INDEX_EXTENSION), context);
    try
    {
        VectorsStream = directory.CreateOutput(IndexFileNames.SegmentFileName(Segment, segmentSuffix, VECTORS_EXTENSION), context);

        string codecNameIdx = formatName + CODEC_SFX_IDX;
        string codecNameDat = formatName + CODEC_SFX_DAT;
        CodecUtil.WriteHeader(indexStream, codecNameIdx, VERSION_CURRENT);
        CodecUtil.WriteHeader(VectorsStream, codecNameDat, VERSION_CURRENT);
        Debug.Assert(CodecUtil.HeaderLength(codecNameDat) == VectorsStream.FilePointer);
        Debug.Assert(CodecUtil.HeaderLength(codecNameIdx) == indexStream.FilePointer);

        IndexWriter = new CompressingStoredFieldsIndexWriter(indexStream);
        indexStream = null;

        VectorsStream.WriteVInt(PackedInts.VERSION_CURRENT);
        VectorsStream.WriteVInt(chunkSize);
        Writer = new BlockPackedWriter(VectorsStream, BLOCK_SIZE);

        PositionsBuf = new int[1024];
        StartOffsetsBuf = new int[1024];
        LengthsBuf = new int[1024];
        PayloadLengthsBuf = new int[1024];

        success = true;
    }
    finally
    {
        if (!success)
        {
            IOUtils.CloseWhileHandlingException(indexStream);
            Abort();
        }
    }
}
protected internal FixedIntBlockIndexOutput(IndexOutput output, int fixedBlockSize)
{
    blockSize = fixedBlockSize;
    this.output = output;
    output.WriteVInt(blockSize);
    buffer = new int[blockSize];
}
public override void Init(IndexOutput termsOut)
{
    _termsOut = termsOut;
    CodecUtil.WriteHeader(termsOut, CODEC, VERSION_CURRENT);
    termsOut.WriteVInt(_pending.Length); // encode maxPositions in header
    _wrappedPostingsWriter.Init(termsOut);
}
/// <summary>
/// Write a block of data (<code>For</code> format).
/// </summary>
/// <param name="data"> the data to write </param>
/// <param name="encoded"> a buffer to use to encode data </param>
/// <param name="out"> the destination output </param>
/// <exception cref="IOException"> If there is a low-level I/O error </exception>
public void WriteBlock(int[] data, byte[] encoded, IndexOutput @out)
{
    if (IsAllEqual(data))
    {
        // degenerate case: a single marker byte plus the repeated value
        @out.WriteByte((byte)(sbyte)ALL_VALUES_EQUAL);
        @out.WriteVInt(data[0]);
        return;
    }

    int numBits = BitsRequired(data);
    Debug.Assert(numBits > 0 && numBits <= 32, numBits.ToString());
    PackedInts.Encoder encoder = Encoders[numBits];
    int iters = Iterations[numBits];
    Debug.Assert(iters * encoder.ByteValueCount() >= Lucene41PostingsFormat.BLOCK_SIZE);
    int encodedSize = EncodedSizes[numBits];
    Debug.Assert(iters * encoder.ByteBlockCount() >= encodedSize);

    @out.WriteByte((byte)numBits);

    encoder.Encode(data, 0, encoded, 0, iters);
    @out.WriteBytes(encoded, encodedSize);
}
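// Hedged sketch of the matching read path (the real decoder is ForUtil's read side; the
// Decoders table is assumed here by symmetry with the Encoders used above). A block is
// either the ALL_VALUES_EQUAL marker byte plus one VInt, or a bit-width byte followed by
// the packed bytes for one block of values.
public void ReadBlockSketch(IndexInput @in, byte[] encoded, int[] decoded)
{
    int numBits = @in.ReadByte();
    if (numBits == ALL_VALUES_EQUAL)
    {
        int value = @in.ReadVInt();
        for (int i = 0; i < Lucene41PostingsFormat.BLOCK_SIZE; i++)
        {
            decoded[i] = value; // every value in the block is the same
        }
        return;
    }
    @in.ReadBytes(encoded, 0, EncodedSizes[numBits]);
    Decoders[numBits].Decode(encoded, 0, decoded, 0, Iterations[numBits]);
}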
public override void Finish(long sumTotalTermFreq, long sumDocFreq, int docCount)
{
    if (termCount > 0)
    {
        @out.WriteVInt(termCount);
        @out.WriteVInt(field.Number);
        if (field.FieldIndexOptions != IndexOptions.DOCS_ONLY)
        {
            @out.WriteVLong(sumTotalTermFreq);
        }
        @out.WriteVLong(sumDocFreq);
        @out.WriteVInt(docCount);
        FST<BytesRef> fst = builder.Finish();
        fst.Save(@out);
        //System.out.println("finish field=" + field.name + " fp=" + out.getFilePointer());
    }
}
private int NumBufferedDocs; // docBase + numBufferedDocs == current doc ID

/// <summary>
/// Sole constructor. </summary>
public CompressingStoredFieldsWriter(Directory directory, SegmentInfo si, string segmentSuffix, IOContext context,
    string formatName, CompressionMode compressionMode, int chunkSize)
{
    Debug.Assert(directory != null);
    this.Directory = directory;
    this.Segment = si.Name;
    this.SegmentSuffix = segmentSuffix;
    this.CompressionMode = compressionMode;
    this.Compressor = compressionMode.NewCompressor();
    this.ChunkSize = chunkSize;
    this.DocBase = 0;
    this.BufferedDocs = new GrowableByteArrayDataOutput(chunkSize);
    this.NumStoredFields = new int[16];
    this.EndOffsets = new int[16];
    this.NumBufferedDocs = 0;

    bool success = false;
    IndexOutput indexStream = directory.CreateOutput(IndexFileNames.SegmentFileName(Segment, segmentSuffix, Lucene40StoredFieldsWriter.FIELDS_INDEX_EXTENSION), context);
    try
    {
        FieldsStream = directory.CreateOutput(IndexFileNames.SegmentFileName(Segment, segmentSuffix, Lucene40StoredFieldsWriter.FIELDS_EXTENSION), context);

        string codecNameIdx = formatName + CODEC_SFX_IDX;
        string codecNameDat = formatName + CODEC_SFX_DAT;
        CodecUtil.WriteHeader(indexStream, codecNameIdx, VERSION_CURRENT);
        CodecUtil.WriteHeader(FieldsStream, codecNameDat, VERSION_CURRENT);
        Debug.Assert(CodecUtil.HeaderLength(codecNameDat) == FieldsStream.FilePointer);
        Debug.Assert(CodecUtil.HeaderLength(codecNameIdx) == indexStream.FilePointer);

        IndexWriter = new CompressingStoredFieldsIndexWriter(indexStream);
        indexStream = null;

        FieldsStream.WriteVInt(chunkSize);
        FieldsStream.WriteVInt(PackedInts.VERSION_CURRENT);

        success = true;
    }
    finally
    {
        if (!success)
        {
            IOUtils.CloseWhileHandlingException(indexStream);
            Abort();
        }
    }
}
/// <summary>Fills in no-term-vectors for all docs we haven't seen
/// since the last doc that had term vectors.
/// </summary>
internal void Fill(int docID)
{
    int docStoreOffset = docWriter.DocStoreOffset;
    int end = docID + docStoreOffset;
    if (lastDocID < end)
    {
        long tvfPosition = tvf.FilePointer;
        while (lastDocID < end)
        {
            tvx.WriteLong(tvd.FilePointer);
            tvd.WriteVInt(0);
            tvx.WriteLong(tvfPosition);
            lastDocID++;
        }
    }
}
/// <summary>
/// Add a new position &amp; payload </summary>
public override void AddPosition(int position, BytesRef payload, int startOffset, int endOffset)
{
    // if (DEBUG) {
    //   System.out.println("FPW.addPosition pos=" + position + " posBufferUpto=" + posBufferUpto + (fieldHasPayloads ? " payloadByteUpto=" + payloadByteUpto: ""));
    // }
    PosDeltaBuffer[PosBufferUpto] = position - LastPosition;
    if (FieldHasPayloads)
    {
        if (payload == null || payload.Length == 0)
        {
            // no payload
            PayloadLengthBuffer[PosBufferUpto] = 0;
        }
        else
        {
            PayloadLengthBuffer[PosBufferUpto] = payload.Length;
            if (PayloadByteUpto + payload.Length > PayloadBytes.Length)
            {
                PayloadBytes = ArrayUtil.Grow(PayloadBytes, PayloadByteUpto + payload.Length);
            }
            Array.Copy(payload.Bytes, payload.Offset, PayloadBytes, PayloadByteUpto, payload.Length);
            PayloadByteUpto += payload.Length;
        }
    }

    if (FieldHasOffsets)
    {
        Debug.Assert(startOffset >= LastStartOffset);
        Debug.Assert(endOffset >= startOffset);
        OffsetStartDeltaBuffer[PosBufferUpto] = startOffset - LastStartOffset;
        OffsetLengthBuffer[PosBufferUpto] = endOffset - startOffset;
        LastStartOffset = startOffset;
    }

    PosBufferUpto++;
    LastPosition = position;
    if (PosBufferUpto == Lucene41PostingsFormat.BLOCK_SIZE)
    {
        // if (DEBUG) {
        //   System.out.println("  write pos bulk block @ fp=" + posOut.getFilePointer());
        // }
        ForUtil.WriteBlock(PosDeltaBuffer, Encoded, PosOut);

        if (FieldHasPayloads)
        {
            ForUtil.WriteBlock(PayloadLengthBuffer, Encoded, PayOut);
            PayOut.WriteVInt(PayloadByteUpto);
            PayOut.WriteBytes(PayloadBytes, 0, PayloadByteUpto);
            PayloadByteUpto = 0;
        }
        if (FieldHasOffsets)
        {
            ForUtil.WriteBlock(OffsetStartDeltaBuffer, Encoded, PayOut);
            ForUtil.WriteBlock(OffsetLengthBuffer, Encoded, PayOut);
        }
        PosBufferUpto = 0;
    }
}
internal CompressingStoredFieldsIndexWriter(IndexOutput indexOutput)
{
    this.FieldsIndexOut = indexOutput;
    Reset();
    TotalDocs = 0;
    DocBaseDeltas = new int[BLOCK_SIZE];
    StartPointerDeltas = new long[BLOCK_SIZE];
    FieldsIndexOut.WriteVInt(PackedInts.VERSION_CURRENT);
}
/// <summary>
/// Add a new position &amp; payload </summary>
public override void AddPosition(int position, BytesRef payload, int startOffset, int endOffset)
{
    //if (DEBUG) System.out.println("SPW: addPos pos=" + position + " payload=" + (payload == null ? "null" : (payload.Length + " bytes")) + " proxFP=" + proxOut.getFilePointer());
    Debug.Assert(IndexOptions >= FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS, "invalid indexOptions: " + IndexOptions);
    Debug.Assert(ProxOut != null);

    int delta = position - LastPosition;

    Debug.Assert(delta >= 0, "position=" + position + " lastPosition=" + LastPosition); // not quite right (if pos=0 is repeated twice we don't catch it)

    LastPosition = position;

    int payloadLength = 0;

    if (StorePayloads)
    {
        payloadLength = payload == null ? 0 : payload.Length;

        if (payloadLength != LastPayloadLength)
        {
            LastPayloadLength = payloadLength;
            ProxOut.WriteVInt((delta << 1) | 1);
            ProxOut.WriteVInt(payloadLength);
        }
        else
        {
            ProxOut.WriteVInt(delta << 1);
        }
    }
    else
    {
        ProxOut.WriteVInt(delta);
    }

    if (StoreOffsets)
    {
        // don't use startOffset - lastEndOffset, because this creates lots of negative vints for synonyms,
        // and the numbers aren't that much smaller anyways.
        int offsetDelta = startOffset - LastOffset;
        int offsetLength = endOffset - startOffset;
        Debug.Assert(offsetDelta >= 0 && offsetLength >= 0, "startOffset=" + startOffset + ",lastOffset=" + LastOffset + ",endOffset=" + endOffset);
        if (offsetLength != LastOffsetLength)
        {
            ProxOut.WriteVInt(offsetDelta << 1 | 1);
            ProxOut.WriteVInt(offsetLength);
        }
        else
        {
            ProxOut.WriteVInt(offsetDelta << 1);
        }
        LastOffset = startOffset;
        LastOffsetLength = offsetLength;
    }

    if (payloadLength > 0)
    {
        ProxOut.WriteBytes(payload.Bytes, payload.Offset, payloadLength);
    }
}
public void Write(IndexOutput output)
{
    output.WriteVInt(CURRENT_FORMAT);
    output.WriteVInt(Size());
    for (int i = 0; i < Size(); i++)
    {
        FieldInfo fi = FieldInfo(i);
        var bits = (byte)0x0;
        if (fi.isIndexed)
        {
            bits |= IS_INDEXED;
        }
        if (fi.storeTermVector)
        {
            bits |= STORE_TERMVECTOR;
        }
        if (fi.storePositionWithTermVector)
        {
            bits |= STORE_POSITIONS_WITH_TERMVECTOR;
        }
        if (fi.storeOffsetWithTermVector)
        {
            bits |= STORE_OFFSET_WITH_TERMVECTOR;
        }
        if (fi.omitNorms)
        {
            bits |= OMIT_NORMS;
        }
        if (fi.storePayloads)
        {
            bits |= STORE_PAYLOADS;
        }
        if (fi.omitTermFreqAndPositions)
        {
            bits |= OMIT_TERM_FREQ_AND_POSITIONS;
        }
        output.WriteString(fi.name);
        output.WriteByte(bits);
    }
}
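// Illustrative decode of the flag byte written above (hypothetical helper; the real
// reader lives in the matching FieldInfos read path). Each option is a single bit,
// tested with the same masks the writer sets.
private static bool HasFlag(byte bits, byte flag)
{
    return (bits & flag) != 0;
}
// e.g. HasFlag(bits, IS_INDEXED), HasFlag(bits, STORE_PAYLOADS), ...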
public override void Dispose()
{
    if (Output == null)
    {
        return;
    }

    try
    {
        long dirStart = Output.FilePointer;
        int fieldCount = _fields.Count;

        int nonNullFieldCount = 0;
        for (int i = 0; i < fieldCount; i++)
        {
            FstFieldWriter field = _fields[i];
            if (field.Fst != null)
            {
                nonNullFieldCount++;
            }
        }

        Output.WriteVInt(nonNullFieldCount);
        for (int i = 0; i < fieldCount; i++)
        {
            FstFieldWriter field = _fields[i];
            if (field.Fst != null)
            {
                Output.WriteVInt(field.FieldInfo.Number);
                Output.WriteVLong(field.IndexStart);
            }
        }
        WriteTrailer(dirStart);
        CodecUtil.WriteFooter(Output);
    }
    finally
    {
        Output.Dispose();
        Output = null;
    }
}
public override void Dispose()
{
    // EOF marker:
    try
    {
        @out.WriteVInt(0);
        CodecUtil.WriteFooter(@out);
    }
    finally
    {
        @out.Dispose();
    }
}
public override void StartDoc(int docID, int termDocFreq)
{
    // if (DEBUG) System.out.println("SPW: startDoc seg=" + segment + " docID=" + docID + " tf=" + termDocFreq + " freqOut.fp=" + freqOut.getFilePointer());

    int delta = docID - LastDocID;

    if (docID < 0 || (Df > 0 && delta <= 0))
    {
        throw new CorruptIndexException("docs out of order (" + docID + " <= " + LastDocID + " ) (freqOut: " + FreqOut + ")");
    }

    if ((++Df % SkipInterval) == 0)
    {
        SkipListWriter.SetSkipData(LastDocID, StorePayloads, LastPayloadLength, StoreOffsets, LastOffsetLength);
        SkipListWriter.BufferSkip(Df);
    }

    Debug.Assert(docID < TotalNumDocs, "docID=" + docID + " totalNumDocs=" + TotalNumDocs);

    LastDocID = docID;
    if (IndexOptions == FieldInfo.IndexOptions.DOCS_ONLY)
    {
        FreqOut.WriteVInt(delta);
    }
    else if (1 == termDocFreq)
    {
        FreqOut.WriteVInt((delta << 1) | 1);
    }
    else
    {
        FreqOut.WriteVInt(delta << 1);
        FreqOut.WriteVInt(termDocFreq);
    }

    LastPosition = 0;
    LastOffset = 0;
}
/// <summary>Adds a new &lt;&lt;fieldNumber, termBytes&gt;, TermInfo&gt; pair to the set.
/// Term must be lexicographically greater than all previous Terms added.
/// TermInfo pointers must be positive and greater than all previous.
/// </summary>
internal void Add(int fieldNumber, byte[] termBytes, int termBytesLength, TermInfo ti)
{
    System.Diagnostics.Debug.Assert(CompareToLastTerm(fieldNumber, termBytes, termBytesLength) < 0 ||
        (isIndex && termBytesLength == 0 && lastTermBytesLength == 0),
        "Terms are out of order: field=" + fieldInfos.FieldName(fieldNumber) + " (number " + fieldNumber + ")" +
        " lastField=" + fieldInfos.FieldName(lastFieldNumber) + " (number " + lastFieldNumber + ")" +
        " text=" + System.Text.Encoding.UTF8.GetString(termBytes, 0, termBytesLength) +
        " lastText=" + System.Text.Encoding.UTF8.GetString(lastTermBytes, 0, lastTermBytesLength));

    System.Diagnostics.Debug.Assert(ti.freqPointer >= lastTi.freqPointer, "freqPointer out of order (" + ti.freqPointer + " < " + lastTi.freqPointer + ")");
    System.Diagnostics.Debug.Assert(ti.proxPointer >= lastTi.proxPointer, "proxPointer out of order (" + ti.proxPointer + " < " + lastTi.proxPointer + ")");

    if (!isIndex && size % indexInterval == 0)
    {
        other.Add(lastFieldNumber, lastTermBytes, lastTermBytesLength, lastTi); // add an index term
    }

    WriteTerm(fieldNumber, termBytes, termBytesLength); // write term

    output.WriteVInt(ti.docFreq); // write doc freq
    output.WriteVLong(ti.freqPointer - lastTi.freqPointer); // write pointers
    output.WriteVLong(ti.proxPointer - lastTi.proxPointer);

    if (ti.docFreq >= skipInterval)
    {
        output.WriteVInt(ti.skipOffset);
    }

    if (isIndex)
    {
        output.WriteVLong(other.output.FilePointer - lastIndexPointer);
        lastIndexPointer = other.output.FilePointer; // write pointer
    }

    lastFieldNumber = fieldNumber;
    lastTi.Set(ti);
    size++;
}
protected internal override void WriteSkipData(int level, IndexOutput skipBuffer)
{
    // To efficiently store payloads in the posting lists we do not store the length of
    // every payload. Instead we omit the length for a payload if the previous payload had
    // the same length.
    // However, in order to support skipping, the payload length at every skip point must be known.
    // So we use the same length encoding that we use for the posting lists for the skip data as well:
    // Case 1: current field does not store payloads
    //           SkipDatum                 --> DocSkip, FreqSkip, ProxSkip
    //           DocSkip,FreqSkip,ProxSkip --> VInt
    //           DocSkip records the document number before every SkipInterval th document in TermFreqs.
    //           Document numbers are represented as differences from the previous value in the sequence.
    // Case 2: current field stores payloads
    //           SkipDatum                 --> DocSkip, PayloadLength?, FreqSkip, ProxSkip
    //           DocSkip,FreqSkip,ProxSkip --> VInt
    //           PayloadLength             --> VInt
    //         In this case DocSkip/2 is the difference between
    //         the current and the previous value. If DocSkip
    //         is odd, then a PayloadLength encoded as VInt follows,
    //         if DocSkip is even, then it is assumed that the
    //         current payload length equals the length at the previous
    //         skip point
    if (curStorePayloads)
    {
        int delta = curDoc - lastSkipDoc[level];
        if (curPayloadLength == lastSkipPayloadLength[level])
        {
            // the current payload length equals the length at the previous skip point,
            // so we don't store the length again
            skipBuffer.WriteVInt(delta * 2);
        }
        else
        {
            // the payload length is different from the previous one. We shift the DocSkip,
            // set the lowest bit and store the current payload length as VInt.
            skipBuffer.WriteVInt(delta * 2 + 1);
            skipBuffer.WriteVInt(curPayloadLength);
            lastSkipPayloadLength[level] = curPayloadLength;
        }
    }
    else
    {
        // current field does not store payloads
        skipBuffer.WriteVInt(curDoc - lastSkipDoc[level]);
    }
    skipBuffer.WriteVInt((int)(curFreqPointer - lastSkipFreqPointer[level]));
    skipBuffer.WriteVInt((int)(curProxPointer - lastSkipProxPointer[level]));

    lastSkipDoc[level] = curDoc;
    //System.out.println("write doc at level " + level + ": " + curDoc);

    lastSkipFreqPointer[level] = curFreqPointer;
    lastSkipProxPointer[level] = curProxPointer;
}
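// Hedged reader-side sketch of the SkipDatum layout documented above (mirrors what the
// matching skip-list reader does; writer field names are reused for illustration only).
private int ReadSkipDatum(int level, IndexInput skipStream)
{
    int delta = skipStream.ReadVInt();
    if (curStorePayloads)
    {
        if ((delta & 1) != 0)
        {
            // odd DocSkip: a new payload length follows
            lastSkipPayloadLength[level] = skipStream.ReadVInt();
        }
        delta = (int)((uint)delta >> 1); // the even bits carry the doc delta
    }
    lastSkipFreqPointer[level] += skipStream.ReadVInt();
    lastSkipProxPointer[level] += skipStream.ReadVInt();
    lastSkipDoc[level] += delta;
    return lastSkipDoc[level];
}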
private void Flush()
{
    int chunkDocs = PendingDocs.Count;
    Debug.Assert(chunkDocs > 0, chunkDocs.ToString());

    // write the index file
    IndexWriter.WriteIndex(chunkDocs, VectorsStream.FilePointer);

    int docBase = NumDocs - chunkDocs;
    VectorsStream.WriteVInt(docBase);
    VectorsStream.WriteVInt(chunkDocs);

    // total number of fields of the chunk
    int totalFields = FlushNumFields(chunkDocs);

    if (totalFields > 0)
    {
        // unique field numbers (sorted)
        int[] fieldNums = FlushFieldNums();
        // offsets in the array of unique field numbers
        FlushFields(totalFields, fieldNums);
        // flags (does the field have positions, offsets, payloads?)
        FlushFlags(totalFields, fieldNums);
        // number of terms of each field
        FlushNumTerms(totalFields);
        // prefix and suffix lengths for each field
        FlushTermLengths();
        // term freqs - 1 (because termFreq is always >=1) for each term
        FlushTermFreqs();
        // positions for all terms, when enabled
        FlushPositions();
        // offsets for all terms, when enabled
        FlushOffsets(fieldNums);
        // payload lengths for all terms, when enabled
        FlushPayloadLengths();

        // compress terms and payloads and write them to the output
        Compressor.Compress(TermSuffixes.Bytes, 0, TermSuffixes.Length, VectorsStream);
    }

    // reset
    PendingDocs.Clear();
    CurDoc = null;
    CurField = null;
    TermSuffixes.Length = 0;
}
/// <summary>Write as a d-gaps list </summary>
private void WriteDgaps(IndexOutput output)
{
    output.WriteInt(-1); // mark using d-gaps
    output.WriteInt(Size()); // write size
    output.WriteInt(Count()); // write count
    int last = 0;
    int n = Count();
    int m = bits.Length;
    for (int i = 0; i < m && n > 0; i++)
    {
        if (bits[i] != 0)
        {
            output.WriteVInt(i - last); // gap from the previous non-zero byte
            output.WriteByte(bits[i]);
            last = i;
            n -= BYTE_COUNTS[bits[i] & 0xFF]; // set bits remaining to account for
        }
    }
}
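// Hypothetical reader sketch for the d-gaps format written above: after the -1 marker,
// size and count, each non-zero byte is stored as (gap from the previous byte index,
// the raw byte) until all set bits are accounted for. BYTE_COUNTS is the same
// bits-per-byte table the writer uses.
private static void ReadDgapsSketch(IndexInput input, byte[] bits, int count)
{
    int last = 0;
    int n = count;
    while (n > 0)
    {
        last += input.ReadVInt();      // d-gap to the next non-zero byte
        bits[last] = input.ReadByte(); // the byte itself, verbatim
        n -= BYTE_COUNTS[bits[last] & 0xFF];
    }
}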
/// <summary>
/// Write as a d-gaps list </summary>
private void WriteClearedDgaps(IndexOutput output)
{
    output.WriteInt(-1); // mark using d-gaps
    output.WriteInt(Size()); // write size
    output.WriteInt(Count()); // write count
    int last = 0;
    int numCleared = Size() - Count();
    for (int i = 0; i < Bits.Length && numCleared > 0; i++)
    {
        if (Bits[i] != unchecked((byte)0xff))
        {
            output.WriteVInt(i - last);
            output.WriteByte(Bits[i]);
            last = i;
            numCleared -= (8 - BitUtil.BitCount(Bits[i]));
            Debug.Assert(numCleared >= 0 || (i == (Bits.Length - 1) && numCleared == -(8 - (Size_Renamed & 7))));
        }
    }
}
public override FieldsConsumer FieldsConsumer(SegmentWriteState writeState)
{
    int id = (int)NextID.IncrementAndGet();

    // TODO -- ok to do this up front instead of
    // on close....?  should be ok?

    // Write our ID:
    string idFileName = IndexFileNames.SegmentFileName(writeState.SegmentInfo.Name, writeState.SegmentSuffix, ID_EXTENSION);
    IndexOutput @out = writeState.Directory.CreateOutput(idFileName, writeState.Context);
    bool success = false;
    try
    {
        CodecUtil.WriteHeader(@out, RAM_ONLY_NAME, VERSION_LATEST);
        @out.WriteVInt(id);
        success = true;
    }
    finally
    {
        if (!success)
        {
            IOUtils.CloseWhileHandlingException(@out);
        }
        else
        {
            IOUtils.Close(@out);
        }
    }

    RAMPostings postings = new RAMPostings();
    RAMFieldsConsumer consumer = new RAMFieldsConsumer(postings);

    lock (State)
    {
        State[id] = postings;
    }

    return consumer;
}
/// <summary>
/// expert: writes a value dictionary for a sorted/sortedset field </summary>
protected internal virtual void AddTermsDict(FieldInfo field, IEnumerable<BytesRef> values)
{
    // first check if it's a "fixed-length" terms dict
    int minLength = int.MaxValue;
    int maxLength = int.MinValue;
    foreach (BytesRef v in values)
    {
        minLength = Math.Min(minLength, v.Length);
        maxLength = Math.Max(maxLength, v.Length);
    }
    if (minLength == maxLength)
    {
        // no index needed: direct addressing by mult
        AddBinaryField(field, values);
    }
    else
    {
        // header
        Meta.WriteVInt(field.Number);
        Meta.WriteByte((byte)Lucene45DocValuesFormat.BINARY);
        Meta.WriteVInt(BINARY_PREFIX_COMPRESSED);
        Meta.WriteLong(-1L);

        // now write the bytes: sharing prefixes within a block
        long startFP = Data.FilePointer;

        // currently, we have to store the delta from expected for every 1/nth term
        // we could avoid this, but it's not much and less overall RAM than the previous approach!
        RAMOutputStream addressBuffer = new RAMOutputStream();
        MonotonicBlockPackedWriter termAddresses = new MonotonicBlockPackedWriter(addressBuffer, BLOCK_SIZE);

        BytesRef lastTerm = new BytesRef();
        long count = 0;
        foreach (BytesRef v in values)
        {
            if (count % ADDRESS_INTERVAL == 0)
            {
                termAddresses.Add(Data.FilePointer - startFP);
                // force the first term in a block to be abs-encoded
                lastTerm.Length = 0;
            }

            // prefix-code
            int sharedPrefix = StringHelper.BytesDifference(lastTerm, v);
            Data.WriteVInt(sharedPrefix);
            Data.WriteVInt(v.Length - sharedPrefix);
            Data.WriteBytes(v.Bytes, v.Offset + sharedPrefix, v.Length - sharedPrefix);
            lastTerm.CopyBytes(v);
            count++;
        }

        long indexStartFP = Data.FilePointer;
        // write addresses of indexed terms
        termAddresses.Finish();
        addressBuffer.WriteTo(Data);
        addressBuffer = null;
        termAddresses = null;

        Meta.WriteVInt(minLength);
        Meta.WriteVInt(maxLength);
        Meta.WriteVLong(count);
        Meta.WriteLong(startFP);
        Meta.WriteVInt(ADDRESS_INTERVAL);
        Meta.WriteLong(indexStartFP);
        Meta.WriteVInt(PackedInts.VERSION_CURRENT);
        Meta.WriteVInt(BLOCK_SIZE);
    }
}
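// Worked example of the prefix coding above (illustrative values): writing "lucid" after
// "lucene" shares the 3-byte prefix "luc", so the stream gets VInt(3), VInt(2), bytes "id".
// A hedged decode sketch for one term within a block (hypothetical helper, not the real reader):
private static BytesRef ReadPrefixCodedTerm(IndexInput data, BytesRef lastTerm)
{
    int sharedPrefix = data.ReadVInt();
    int suffixLength = data.ReadVInt();
    var term = new BytesRef(sharedPrefix + suffixLength);
    Array.Copy(lastTerm.Bytes, lastTerm.Offset, term.Bytes, 0, sharedPrefix); // reuse shared prefix
    data.ReadBytes(term.Bytes, sharedPrefix, suffixLength);                   // read the new suffix
    term.Length = sharedPrefix + suffixLength;
    return term;
}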
public override void Write(Directory directory, string segmentName, string segmentSuffix, FieldInfos infos, IOContext context)
{
    string fileName = IndexFileNames.SegmentFileName(segmentName, "", FIELD_INFOS_EXTENSION);
    IndexOutput output = directory.CreateOutput(fileName, context);
    bool success = false;
    try
    {
        output.WriteVInt(FORMAT_PREFLEX_RW);
        output.WriteVInt(infos.Size());
        foreach (FieldInfo fi in infos)
        {
            sbyte bits = 0x0;
            if (fi.HasVectors())
            {
                bits |= STORE_TERMVECTOR;
            }
            if (fi.OmitsNorms())
            {
                bits |= OMIT_NORMS;
            }
            if (fi.HasPayloads())
            {
                bits |= STORE_PAYLOADS;
            }
            if (fi.Indexed)
            {
                bits |= IS_INDEXED;
                Debug.Assert(fi.FieldIndexOptions == FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS || !fi.HasPayloads());
                if (fi.FieldIndexOptions == FieldInfo.IndexOptions.DOCS_ONLY)
                {
                    bits |= OMIT_TERM_FREQ_AND_POSITIONS;
                }
                else if (fi.FieldIndexOptions == FieldInfo.IndexOptions.DOCS_AND_FREQS)
                {
                    bits |= OMIT_POSITIONS;
                }
            }
            output.WriteString(fi.Name);
            /*
             * we need to write the field number since IW tries
             * to stabilize the field numbers across segments so the
             * FI ordinal is not necessarily equivalent to the field number
             */
            output.WriteInt(fi.Number);
            output.WriteByte((byte)bits);
            if (fi.Indexed && !fi.OmitsNorms())
            {
                // to allow null norm types we need to indicate if norms are written
                // only in RW case
                output.WriteByte((byte)(fi.NormType == null ? 0 : 1));
            }
            Debug.Assert(fi.Attributes() == null); // not used or supported
        }
        success = true;
    }
    finally
    {
        if (success)
        {
            output.Dispose();
        }
        else
        {
            IOUtils.CloseWhileHandlingException(output);
        }
    }
}
/// <summary>Merge files with the extensions added up to now.
/// All files with these extensions are combined sequentially into the
/// compound stream. After successful merge, the source files
/// are deleted.
/// </summary>
/// <exception cref="SystemException"> if this method has already been called,
/// or if no file has been added to this object </exception>
public void Dispose()
{
    // Extract into protected method if class ever becomes unsealed

    // TODO: Dispose shouldn't throw exceptions!
    if (merged)
    {
        throw new SystemException("Merge already performed");
    }

    if (entries.Count == 0)
    {
        throw new SystemException("No entries to merge have been defined");
    }

    merged = true;

    // open the compound stream
    IndexOutput os = null;
    try
    {
        os = directory.CreateOutput(fileName);

        // Write the number of entries
        os.WriteVInt(entries.Count);

        // Write the directory with all offsets at 0.
        // Remember the positions of directory entries so that we can
        // adjust the offsets later
        long totalSize = 0;
        foreach (FileEntry fe in entries)
        {
            fe.directoryOffset = os.FilePointer;
            os.WriteLong(0); // for now
            os.WriteString(fe.file);
            totalSize += directory.FileLength(fe.file);
        }

        // Pre-allocate size of file as optimization --
        // this can potentially help IO performance as
        // we write the file and also later during
        // searching. It also uncovers a disk-full
        // situation earlier and hopefully without
        // actually filling disk to 100%:
        long finalLength = totalSize + os.FilePointer;
        os.SetLength(finalLength);

        // Open the files and copy their data into the stream.
        // Remember the locations of each file's data section.
        var buffer = new byte[16384];
        foreach (FileEntry fe in entries)
        {
            fe.dataOffset = os.FilePointer;
            CopyFile(fe, os, buffer);
        }

        // Write the data offsets into the directory of the compound stream
        foreach (FileEntry fe in entries)
        {
            os.Seek(fe.directoryOffset);
            os.WriteLong(fe.dataOffset);
        }

        System.Diagnostics.Debug.Assert(finalLength == os.Length);

        // Close the output stream. Set the os to null before trying to
        // close so that if an exception occurs during the close, the
        // finally clause below will not attempt to close the stream
        // the second time.
        IndexOutput tmp = os;
        os = null;
        tmp.Close();
    }
    finally
    {
        if (os != null)
        {
            try
            {
                os.Close();
            }
            catch (System.IO.IOException)
            {
            }
        }
    }
}
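// Hedged sketch of how a reader recovers the directory written above: a VInt entry count,
// then (dataOffset long, fileName string) pairs; each entry's length is the gap to the
// next entry's offset (or to the stream length for the last one). Hypothetical helper.
private static IDictionary<string, long> ReadCompoundDirectory(IndexInput stream)
{
    int count = stream.ReadVInt();
    var offsets = new Dictionary<string, long>();
    for (int i = 0; i < count; i++)
    {
        long dataOffset = stream.ReadLong();
        string fileName = stream.ReadString();
        offsets[fileName] = dataOffset;
    }
    return offsets;
}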
protected internal override void WriteSkipData(int level, IndexOutput skipBuffer)
{
    // To efficiently store payloads/offsets in the posting lists we do not store the length of
    // every payload/offset. Instead we omit the length if the previous lengths were the same.
    //
    // However, in order to support skipping, the length at every skip point must be known.
    // So we use the same length encoding that we use for the posting lists for the skip data as well:
    // Case 1: current field does not store payloads/offsets
    //           SkipDatum                 --> DocSkip, FreqSkip, ProxSkip
    //           DocSkip,FreqSkip,ProxSkip --> VInt
    //           DocSkip records the document number before every SkipInterval th document in TermFreqs.
    //           Document numbers are represented as differences from the previous value in the sequence.
    // Case 2: current field stores payloads/offsets
    //           SkipDatum                 --> DocSkip, PayloadLength?, OffsetLength?, FreqSkip, ProxSkip
    //           DocSkip,FreqSkip,ProxSkip --> VInt
    //           PayloadLength,OffsetLength--> VInt
    //         In this case DocSkip/2 is the difference between
    //         the current and the previous value. If DocSkip
    //         is odd, then a PayloadLength encoded as VInt follows,
    //         if DocSkip is even, then it is assumed that the
    //         current payload/offset lengths equal the lengths at the previous
    //         skip point
    int delta = CurDoc - LastSkipDoc[level];

    if (CurStorePayloads || CurStoreOffsets)
    {
        Debug.Assert(CurStorePayloads || CurPayloadLength == LastSkipPayloadLength[level]);
        Debug.Assert(CurStoreOffsets || CurOffsetLength == LastSkipOffsetLength[level]);

        if (CurPayloadLength == LastSkipPayloadLength[level] && CurOffsetLength == LastSkipOffsetLength[level])
        {
            // the current payload/offset lengths equal the lengths at the previous skip point,
            // so we don't store the lengths again
            skipBuffer.WriteVInt(delta << 1);
        }
        else
        {
            // the payload and/or offset length is different from the previous one. We shift the DocSkip,
            // set the lowest bit and store the current payload and/or offset lengths as VInts.
            skipBuffer.WriteVInt(delta << 1 | 1);

            if (CurStorePayloads)
            {
                skipBuffer.WriteVInt(CurPayloadLength);
                LastSkipPayloadLength[level] = CurPayloadLength;
            }
            if (CurStoreOffsets)
            {
                skipBuffer.WriteVInt(CurOffsetLength);
                LastSkipOffsetLength[level] = CurOffsetLength;
            }
        }
    }
    else
    {
        // current field does not store payloads or offsets
        skipBuffer.WriteVInt(delta);
    }

    skipBuffer.WriteVInt((int)(CurFreqPointer - LastSkipFreqPointer[level]));
    skipBuffer.WriteVInt((int)(CurProxPointer - LastSkipProxPointer[level]));

    LastSkipDoc[level] = CurDoc;

    LastSkipFreqPointer[level] = CurFreqPointer;
    LastSkipProxPointer[level] = CurProxPointer;
}
public override void Init(IndexOutput termsOut)
{
    CodecUtil.WriteHeader(termsOut, TERMS_CODEC, VERSION_CURRENT);
    termsOut.WriteVInt(Lucene41PostingsFormat.BLOCK_SIZE);
}
protected override void WriteSkipData(int level, IndexOutput skipBuffer)
{
    // To efficiently store payloads in the posting lists we do not store the length of
    // every payload. Instead we omit the length for a payload if the previous payload had
    // the same length.
    // However, in order to support skipping, the payload length at every skip point must be known.
    // So we use the same length encoding that we use for the posting lists for the skip data as well:
    // Case 1: current field does not store payloads
    //           SkipDatum                 --> DocSkip, FreqSkip, ProxSkip
    //           DocSkip,FreqSkip,ProxSkip --> VInt
    //           DocSkip records the document number before every SkipInterval th document in TermFreqs.
    //           Document numbers are represented as differences from the previous value in the sequence.
    // Case 2: current field stores payloads
    //           SkipDatum                 --> DocSkip, PayloadLength?, FreqSkip, ProxSkip
    //           DocSkip,FreqSkip,ProxSkip --> VInt
    //           PayloadLength             --> VInt
    //         In this case DocSkip/2 is the difference between
    //         the current and the previous value. If DocSkip
    //         is odd, then a PayloadLength encoded as VInt follows,
    //         if DocSkip is even, then it is assumed that the
    //         current payload length equals the length at the previous
    //         skip point
    Debug.Assert(_indexOptions == FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS || !_curStorePayloads);

    if (_curStorePayloads)
    {
        int delta = _curDoc - _lastSkipDoc[level];
        if (_curPayloadLength == _lastSkipPayloadLength[level])
        {
            // the current payload length equals the length at the previous skip point,
            // so we don't store the length again
            skipBuffer.WriteVInt(delta << 1);
        }
        else
        {
            // the payload length is different from the previous one. We shift the DocSkip,
            // set the lowest bit and store the current payload length as VInt.
            skipBuffer.WriteVInt(delta << 1 | 1);
            skipBuffer.WriteVInt(_curPayloadLength);
            _lastSkipPayloadLength[level] = _curPayloadLength;
        }
    }
    else
    {
        // current field does not store payloads
        skipBuffer.WriteVInt(_curDoc - _lastSkipDoc[level]);
    }

    if (_indexOptions != FieldInfo.IndexOptions.DOCS_ONLY)
    {
        _freqIndex[level].Mark();
        _freqIndex[level].Write(skipBuffer, false);
    }
    _docIndex[level].Mark();
    _docIndex[level].Write(skipBuffer, false);
    if (_indexOptions == FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
    {
        _posIndex[level].Mark();
        _posIndex[level].Write(skipBuffer, false);
        if (_curStorePayloads)
        {
            skipBuffer.WriteVInt((int)(_curPayloadPointer - _lastSkipPayloadPointer[level]));
        }
    }

    _lastSkipDoc[level] = _curDoc;
    _lastSkipPayloadPointer[level] = _curPayloadPointer;
}