/// <summary>
/// Appends one skip entry to <c>_skipOut</c> and re-bases the "last block" markers.
/// The entry holds, as VInt64 deltas against the previous skip point, the current
/// positions of the stats, meta-longs and meta-bytes streams, followed by one delta
/// per slot of <c>_lastLongs</c>.
/// </summary>
private void BufferSkip()
{
    // Read each stream position once; the same value is written as a delta
    // and then stored as the new base.
    var statsFp = _statsOut.GetFilePointer();
    var metaLongsFp = _metaLongsOut.GetFilePointer();
    var metaBytesFp = _metaBytesOut.GetFilePointer();

    _skipOut.WriteVInt64(statsFp - _lastBlockStatsFp);
    _skipOut.WriteVInt64(metaLongsFp - _lastBlockMetaLongsFp);
    _skipOut.WriteVInt64(metaBytesFp - _lastBlockMetaBytesFp);

    for (var slot = 0; slot < _longsSize; slot++)
    {
        _skipOut.WriteVInt64(_lastLongs[slot] - _lastBlockLongs[slot]);
    }

    // Everything written above becomes the baseline for the next skip entry.
    _lastBlockStatsFp = statsFp;
    _lastBlockMetaLongsFp = metaLongsFp;
    _lastBlockMetaBytesFp = metaBytesFp;
    Array.Copy(_lastLongs, 0, _lastBlockLongs, 0, _longsSize);
}
/// <summary>
/// Finishes the current term: builds its <c>FSTTermOutputs.TermData</c> (stats,
/// encoded longs and optional metadata bytes) and adds it to <c>_builder</c>
/// under the term's int-sequence key.
/// </summary>
/// <param name="text">The term bytes.</param>
/// <param name="stats">Document frequency and total term frequency of the term.</param>
public override void FinishTerm(BytesRef text, TermStats stats)
{
    var postings = _outerInstance._postingsWriter;

    // The term state and the stored metadata carry the same statistics.
    var state = postings.NewTermState();
    state.DocFreq = stats.DocFreq;
    state.TotalTermFreq = stats.TotalTermFreq;

    var meta = new FSTTermOutputs.TermData
    {
        longs = new long[_longsSize],
        bytes = null,
        docFreq = stats.DocFreq,
        totalTermFreq = stats.TotalTermFreq
    };

    postings.FinishTerm(state);
    postings.EncodeTerm(meta.longs, _metaWriter, _fieldInfo, state, true);

    // Whatever EncodeTerm left in the scratch writer becomes the term's byte payload;
    // an empty writer leaves meta.bytes as null.
    var metaLength = (int)_metaWriter.GetFilePointer();
    if (metaLength > 0)
    {
        meta.bytes = new byte[metaLength];
        _metaWriter.WriteTo(meta.bytes, 0);
        _metaWriter.Reset();
    }

    var termKey = Util.ToInt32sRef(text, _scratchTerm);
    _builder.Add(termKey, meta);
    _numTerms++;
}
/// <summary>
/// Finishes the current term: writes its statistics to <c>_statsOut</c>, its
/// delta-encoded metadata longs and metadata-byte length to <c>_metaLongsOut</c>,
/// and adds the term to <c>_builder</c> with the running term count as its value.
/// A skip entry is buffered first whenever the term count is a non-zero multiple
/// of <c>SKIP_INTERVAL</c>.
/// </summary>
/// <param name="text">The term bytes.</param>
/// <param name="stats">Document frequency and total term frequency of the term.</param>
public override void FinishTerm(BytesRef text, TermStats stats)
{
    if (_numTerms > 0 && _numTerms % SKIP_INTERVAL == 0)
    {
        BufferSkip();
    }

    // Stats encoding:
    //   TotalTermFreq <= 0            -> plain docFreq
    //   TotalTermFreq == DocFreq      -> (docFreq << 1) | 1
    //   otherwise                     -> (docFreq << 1), then the difference as VInt64
    long delta = stats.TotalTermFreq - stats.DocFreq;
    if (stats.TotalTermFreq <= 0)
    {
        _statsOut.WriteVInt32(stats.DocFreq);
    }
    else if (delta == 0)
    {
        _statsOut.WriteVInt32((stats.DocFreq << 1) | 1);
    }
    else
    {
        _statsOut.WriteVInt32(stats.DocFreq << 1);
        _statsOut.WriteVInt64(delta);
    }

    var postings = _outerInstance.postingsWriter;
    var state = postings.NewTermState();
    state.DocFreq = stats.DocFreq;
    state.TotalTermFreq = stats.TotalTermFreq;
    postings.FinishTerm(state);

    var longs = new long[_longsSize];
    postings.EncodeTerm(longs, _metaBytesOut, _fieldInfo, state, true);

    // Each metadata long is stored relative to the previous term's value.
    for (var slot = 0; slot < _longsSize; slot++)
    {
        _metaLongsOut.WriteVInt64(longs[slot] - _lastLongs[slot]);
        _lastLongs[slot] = longs[slot];
    }

    // Number of metadata bytes this term appended to _metaBytesOut.
    var metaBytesFp = _metaBytesOut.GetFilePointer();
    _metaLongsOut.WriteVInt64(metaBytesFp - _lastMetaBytesFp);

    _builder.Add(Util.ToInt32sRef(text, _scratchTerm), _numTerms);
    _numTerms++;
    _lastMetaBytesFp = metaBytesFp;
}
/// <summary>
/// Restores the per-term tracking fields to their initial values. In debug builds
/// it asserts that <c>buffer</c> is empty first.
/// </summary>
/// <returns>This instance.</returns>
public virtual PostingsWriter Reset()
{
    Debug.Assert(buffer.GetFilePointer() == 0);

    docCount = 0;
    lastDocID = 0;
    lastPayloadLen = 0;
    lastOffsetLength = -1;

    return this;
}
/// <summary>
/// Finishes the current term: concatenates a small stats header (written via
/// <c>buffer2</c>) with the postings accumulated in <c>postingsWriter.buffer</c>
/// into <c>finalBuffer</c>, then adds a deep copy of those bytes to <c>builder</c>
/// under the term's int-sequence key.
/// </summary>
/// <param name="text">The term bytes.</param>
/// <param name="stats">Document frequency and total term frequency of the term.</param>
public override void FinishTerm(BytesRef text, TermStats stats)
{
    if (Debugging.AssertsEnabled)
    {
        Debugging.Assert(postingsWriter.docCount == stats.DocFreq);
        Debugging.Assert(buffer2.GetFilePointer() == 0);
    }

    // Header: docFreq, plus (totalTermFreq - docFreq) unless the field is DOCS_ONLY.
    buffer2.WriteVInt32(stats.DocFreq);
    bool writesTotalTermFreq = field.IndexOptions != IndexOptions.DOCS_ONLY;
    if (writesTotalTermFreq)
    {
        buffer2.WriteVInt64(stats.TotalTermFreq - stats.DocFreq);
    }

    int headerLength = (int)buffer2.GetFilePointer();
    buffer2.WriteTo(finalBuffer, 0);
    buffer2.Reset();

    // Postings bytes follow the header; grow the shared buffer only when needed.
    var postingsBuffer = postingsWriter.buffer;
    int totalBytes = headerLength + (int)postingsBuffer.GetFilePointer();
    if (finalBuffer.Length < totalBytes)
    {
        finalBuffer = ArrayUtil.Grow(finalBuffer, totalBytes);
    }
    postingsBuffer.WriteTo(finalBuffer, headerLength);
    postingsBuffer.Reset();

    // spare only points at finalBuffer, which is reused across terms, so the
    // value handed to the builder is a deep copy.
    spare.Bytes = finalBuffer;
    spare.Length = totalBytes;

    var termKey = Util.ToInt32sRef(text, scratchIntsRef);
    builder.Add(termKey, BytesRef.DeepCopyOf(spare));
    termCount++;
}
/// <summary>
/// Drains <paramref name="in"/>, remapping every doc ID through
/// <paramref name="docMap"/> and writing each document's positions into an
/// in-memory file, then sorts the (doc, file offset) pairs and opens an input
/// over that file.
/// </summary>
/// <param name="maxDoc">Stored on the instance and used to size a new sorter.</param>
/// <param name="reuse">Optional previous instance whose arrays, payload, file and
/// (when <c>maxDoc</c> matches) sorter are reused; may be <c>null</c>.</param>
/// <param name="in">The wrapped enum to read documents and positions from.</param>
/// <param name="docMap">Maps old doc IDs to new doc IDs.</param>
/// <param name="storeOffsets">Stored on the instance.</param>
internal SortingDocsAndPositionsEnum(int maxDoc, SortingDocsAndPositionsEnum reuse, DocsAndPositionsEnum @in, Sorter.DocMap docMap, bool storeOffsets)
    : base(@in)
{
    this.maxDoc = maxDoc;
    this.storeOffsets = storeOffsets;

    if (reuse == null)
    {
        docs = new int[32];
        offsets = new long[32];
        payload = new BytesRef(32);
        file = new RAMFile();
        sorter = new DocOffsetSorter(maxDoc);
    }
    else
    {
        docs = reuse.docs;
        offsets = reuse.offsets;
        payload = reuse.payload;
        file = reuse.file;
        // A sorter is only reusable when it was built for the same maxDoc.
        sorter = reuse.maxDoc == maxDoc ? reuse.sorter : new DocOffsetSorter(maxDoc);
    }

    // Explicit using block: the output is disposed before the input is opened below.
    using (IndexOutput @out = new RAMOutputStream(file))
    {
        int count = 0;
        for (int doc = @in.NextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = @in.NextDoc())
        {
            if (count == docs.Length)
            {
                // docs and offsets are grown to the same new length.
                int grownLength = ArrayUtil.Oversize(count + 1, 4);
                docs = Arrays.CopyOf(docs, grownLength);
                offsets = Arrays.CopyOf(offsets, grownLength);
            }

            docs[count] = docMap.OldToNew(doc);
            offsets[count] = @out.GetFilePointer(); // where this doc's positions start
            AddPositions(@in, @out);
            count++;
        }

        upto = count;
        sorter.Reset(docs, offsets);
        sorter.Sort(0, upto);
    }

    this.postingInput = new RAMInputStream("", file);
}
/// <summary>
/// Called when we are done adding docs to this term.
/// When <c>_pendingCount</c> is -1 the term is finished through the wrapped
/// postings writer; otherwise the pending entries are serialized into
/// <c>_buffer</c> and stored on the term state as a byte array.
/// </summary>
/// <param name="state">The term state; it is cast to <c>PulsingTermState</c>.</param>
public override void FinishTerm(BlockTermState state)
{
    var pulsingState = (PulsingTermState)state;

    Debug.Assert(_pendingCount > 0 || _pendingCount == -1);

    if (_pendingCount == -1)
    {
        // Hand the term over to the wrapped writer; nothing is inlined.
        pulsingState.wrappedState.DocFreq = pulsingState.DocFreq;
        pulsingState.wrappedState.TotalTermFreq = pulsingState.TotalTermFreq;
        pulsingState.bytes = null;
        _wrappedPostingsWriter.FinishTerm(pulsingState.wrappedState);
    }
    else
    {
        // There were few enough total occurrences for this term, so the postings
        // data is fully inlined into the terms dict.

        // TODO: it'd be better to share this encoding logic
        // in some inner codec that knows how to write a
        // single doc / single position, etc.  This way if a
        // given codec wants to store other interesting
        // stuff, it could use this pulsing codec to do so

        if (_indexOptions.CompareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0)
        {
            // _indexOptions does not change while encoding, so test for offsets once.
            bool writeOffsets = _indexOptions.CompareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;

            var prevDocId = 0;
            var prevPayloadLength = -1;
            var prevOffsetLength = -1;
            var cursor = 0;

            while (cursor < _pendingCount)
            {
                // The first pending entry of a document carries its docID and termFreq.
                var doc = _pending[cursor];
                var docDelta = doc.docID - prevDocId;
                prevDocId = doc.docID;

                // Low bit set means "freq is 1"; otherwise the freq follows explicitly.
                if (doc.termFreq == 1)
                {
                    _buffer.WriteVInt32((docDelta << 1) | 1);
                }
                else
                {
                    _buffer.WriteVInt32(docDelta << 1);
                    _buffer.WriteVInt32(doc.termFreq);
                }

                var prevPos = 0;
                var prevOffset = 0;
                for (var occurrence = 0; occurrence < doc.termFreq; occurrence++)
                {
                    var pos = _pending[cursor++];
                    Debug.Assert(pos.docID == doc.docID);

                    var posDelta = pos.pos - prevPos;
                    prevPos = pos.pos;

                    var payloadLength = pos.payload == null ? 0 : pos.payload.Length;

                    if (!_storePayloads)
                    {
                        _buffer.WriteVInt32(posDelta);
                    }
                    else if (payloadLength == prevPayloadLength)
                    {
                        _buffer.WriteVInt32(posDelta << 1);
                    }
                    else
                    {
                        // Low bit flags a payload-length change; the new length follows.
                        _buffer.WriteVInt32((posDelta << 1) | 1);
                        _buffer.WriteVInt32(payloadLength);
                        prevPayloadLength = payloadLength;
                    }

                    if (writeOffsets)
                    {
                        var offsetDelta = pos.startOffset - prevOffset;
                        var offsetLength = pos.endOffset - pos.startOffset;
                        if (offsetLength == prevOffsetLength)
                        {
                            _buffer.WriteVInt32(offsetDelta << 1);
                        }
                        else
                        {
                            // Low bit flags an offset-length change; the new length follows.
                            _buffer.WriteVInt32((offsetDelta << 1) | 1);
                            _buffer.WriteVInt32(offsetLength);
                        }
                        prevOffset = pos.startOffset;
                        prevOffsetLength = offsetLength;
                    }

                    if (payloadLength > 0)
                    {
                        Debug.Assert(_storePayloads);
                        _buffer.WriteBytes(pos.payload.Bytes, 0, pos.payload.Length);
                    }
                }
            }
        }
        else if (_indexOptions == IndexOptions.DOCS_AND_FREQS)
        {
            // One pending entry per document: doc delta plus freq, same low-bit scheme.
            int prevDocId = 0;
            for (int i = 0; i < _pendingCount; i++)
            {
                Position doc = _pending[i];
                int docDelta = doc.docID - prevDocId;
                Debug.Assert(doc.termFreq != 0);

                if (doc.termFreq == 1)
                {
                    _buffer.WriteVInt32((docDelta << 1) | 1);
                }
                else
                {
                    _buffer.WriteVInt32(docDelta << 1);
                    _buffer.WriteVInt32(doc.termFreq);
                }
                prevDocId = doc.docID;
            }
        }
        else if (_indexOptions == IndexOptions.DOCS_ONLY)
        {
            // Only doc deltas are written.
            int prevDocId = 0;
            for (int i = 0; i < _pendingCount; i++)
            {
                Position doc = _pending[i];
                _buffer.WriteVInt32(doc.docID - prevDocId);
                prevDocId = doc.docID;
            }
        }

        // Move the encoded bytes onto the term state and clear the scratch buffer.
        pulsingState.bytes = new byte[(int)_buffer.GetFilePointer()];
        _buffer.WriteTo(pulsingState.bytes, 0);
        _buffer.Reset();
    }

    _pendingCount = 0;
}
/// <summary>
/// Writes all pending terms to the output as one block: a header (term count and
/// common prefix length) followed by three length-prefixed blobs for term suffixes,
/// term stats and term metadata. It then records the last term as the previous
/// term for the next block and clears the pending count.
/// </summary>
private void FlushBlock()
{
    var output = outerInstance.m_output;

    // First pass: compute the common prefix of all terms in the block, measured
    // against the term that preceded the first term of this block.
    int commonPrefix = SharedPrefix(lastPrevTerm, pendingTerms[0].Term);
    for (int i = 1; i < pendingCount; i++)
    {
        int shared = SharedPrefix(lastPrevTerm, pendingTerms[i].Term);
        if (shared < commonPrefix)
        {
            commonPrefix = shared;
        }
    }

    output.WriteVInt32(pendingCount);
    output.WriteVInt32(commonPrefix);

    // Second pass: write suffixes, as a separate byte[] blob.
    for (int i = 0; i < pendingCount; i++)
    {
        var term = pendingTerms[i].Term;
        int suffixLength = term.Length - commonPrefix;
        // TODO: cutover to better intblock codec, instead
        // of interleaving here:
        bytesWriter.WriteVInt32(suffixLength);
        bytesWriter.WriteBytes(term.Bytes, commonPrefix, suffixLength);
    }
    output.WriteVInt32((int)bytesWriter.GetFilePointer());
    bytesWriter.WriteTo(output);
    bytesWriter.Reset();

    // Third pass: write the freqs as a byte[] blob.
    // TODO: cutover to better intblock codec.  simple64?
    for (int i = 0; i < pendingCount; i++)
    {
        BlockTermState termState = pendingTerms[i].State;
        Debug.Assert(termState != null);

        bytesWriter.WriteVInt32(termState.DocFreq);
        if (fieldInfo.IndexOptions != IndexOptions.DOCS_ONLY)
        {
            bytesWriter.WriteVInt64(termState.TotalTermFreq - termState.DocFreq);
        }
    }
    output.WriteVInt32((int)bytesWriter.GetFilePointer());
    bytesWriter.WriteTo(output);
    bytesWriter.Reset();

    // Fourth pass: write the metadata. Only the first term of the block is
    // encoded with absolute == true.
    long[] metaLongs = new long[longsSize];
    for (int i = 0; i < pendingCount; i++)
    {
        BlockTermState termState = pendingTerms[i].State;
        postingsWriter.EncodeTerm(metaLongs, bufferWriter, fieldInfo, termState, i == 0);

        for (int slot = 0; slot < longsSize; slot++)
        {
            bytesWriter.WriteVInt64(metaLongs[slot]);
        }
        bufferWriter.WriteTo(bytesWriter);
        bufferWriter.Reset();
    }
    output.WriteVInt32((int)bytesWriter.GetFilePointer());
    bytesWriter.WriteTo(output);
    bytesWriter.Reset();

    lastPrevTerm.CopyBytes(pendingTerms[pendingCount - 1].Term);
    pendingCount = 0;
}