/// <summary>
/// add a term
/// </summary>
public virtual void Add(Term term)
{
    Debug.Assert(LastTerm.Equals(new Term("")) || term.CompareTo(LastTerm) > 0);

    try
    {
        int prefix = SharedPrefix(LastTerm.Bytes_Renamed, term.Bytes_Renamed);
        int suffix = term.Bytes_Renamed.Length - prefix;
        if (term.Field_Renamed.Equals(LastTerm.Field_Renamed))
        {
            Output.WriteVInt(prefix << 1);
        }
        else
        {
            Output.WriteVInt(prefix << 1 | 1);
            Output.WriteString(term.Field_Renamed);
        }
        Output.WriteVInt(suffix);
        Output.WriteBytes(term.Bytes_Renamed.Bytes, term.Bytes_Renamed.Offset + prefix, suffix);
        LastTerm.Bytes_Renamed.CopyBytes(term.Bytes_Renamed);
        LastTerm.Field_Renamed = term.Field_Renamed;
    }
    catch (IOException e)
    {
        throw new Exception(e.Message, e);
    }
}
/// <summary>
/// add a term
/// </summary>
public virtual void Add(Term term)
{
    Debug.Assert(lastTerm.Equals(new Term("")) || term.CompareTo(lastTerm) > 0);

    try
    {
        int prefix = SharedPrefix(lastTerm.Bytes, term.Bytes);
        int suffix = term.Bytes.Length - prefix;
        if (term.Field.Equals(lastTerm.Field, StringComparison.Ordinal))
        {
            output.WriteVInt32(prefix << 1);
        }
        else
        {
            output.WriteVInt32(prefix << 1 | 1);
            output.WriteString(term.Field);
        }
        output.WriteVInt32(suffix);
        output.WriteBytes(term.Bytes.Bytes, term.Bytes.Offset + prefix, suffix);
        lastTerm.Bytes.CopyBytes(term.Bytes);
        lastTerm.Field = term.Field;
    }
    catch (IOException e)
    {
        throw new Exception(e.ToString(), e);
    }
}
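// For reference, a minimal sketch of the SharedPrefix helper both Add variants
// call: it returns the length of the common byte prefix of two BytesRef values.
// Illustrative only, assuming the usual BytesRef shape (Bytes/Offset/Length);
// the writer's actual helper may differ in detail.
private static int SharedPrefix(BytesRef term1, BytesRef term2)
{
    int pos = 0;
    int end = Math.Min(term1.Length, term2.Length);
    while (pos < end)
    {
        if (term1.Bytes[term1.Offset + pos] != term2.Bytes[term2.Offset + pos])
        {
            return pos;
        }
        pos++;
    }
    return pos; // one term is a prefix of the other
}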
public override void AddPosition(int pos, BytesRef payload, int startOffset, int endOffset)
{
    Debug.Assert(payload == null || outerInstance.field.HasPayloads());
    //System.out.println("  addPos pos=" + pos + " payload=" + payload);

    int delta = pos - lastPos;
    Debug.Assert(delta >= 0);
    lastPos = pos;

    int payloadLen = 0;
    if (outerInstance.field.HasPayloads())
    {
        payloadLen = payload == null ? 0 : payload.Length;
        if (payloadLen != lastPayloadLen)
        {
            lastPayloadLen = payloadLen;
            buffer.WriteVInt((delta << 1) | 1);
            buffer.WriteVInt(payloadLen);
        }
        else
        {
            buffer.WriteVInt(delta << 1);
        }
    }
    else
    {
        buffer.WriteVInt(delta);
    }

    if (outerInstance.field.IndexOptions.CompareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0)
    {
        // don't use startOffset - lastEndOffset, because this creates lots of negative vints for synonyms,
        // and the numbers aren't that much smaller anyways.
        int offsetDelta = startOffset - lastOffset;
        int offsetLength = endOffset - startOffset;
        if (offsetLength != lastOffsetLength)
        {
            buffer.WriteVInt(offsetDelta << 1 | 1);
            buffer.WriteVInt(offsetLength);
        }
        else
        {
            buffer.WriteVInt(offsetDelta << 1);
        }
        lastOffset = startOffset;
        lastOffsetLength = offsetLength;
    }

    if (payloadLen > 0)
    {
        buffer.WriteBytes(payload.Bytes, payload.Offset, payloadLen);
    }
}
public override void AddPosition(int pos, BytesRef payload, int startOffset, int endOffset)
{
    Debug.Assert(payload == null || outerInstance.field.HasPayloads());
    //System.out.println("  addPos pos=" + pos + " payload=" + payload);

    int delta = pos - lastPos;
    Debug.Assert(delta >= 0);
    lastPos = pos;

    int payloadLen = 0;
    if (outerInstance.field.HasPayloads())
    {
        payloadLen = payload == null ? 0 : payload.Length;
        if (payloadLen != lastPayloadLen)
        {
            lastPayloadLen = payloadLen;
            buffer.WriteVInt((delta << 1) | 1);
            buffer.WriteVInt(payloadLen);
        }
        else
        {
            buffer.WriteVInt(delta << 1);
        }
    }
    else
    {
        buffer.WriteVInt(delta);
    }

    if (outerInstance.field.FieldIndexOptions.Value.CompareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0)
    {
        // don't use startOffset - lastEndOffset, because this creates lots of negative vints for synonyms,
        // and the numbers aren't that much smaller anyways.
        int offsetDelta = startOffset - lastOffset;
        int offsetLength = endOffset - startOffset;
        if (offsetLength != lastOffsetLength)
        {
            buffer.WriteVInt(offsetDelta << 1 | 1);
            buffer.WriteVInt(offsetLength);
        }
        else
        {
            buffer.WriteVInt(offsetDelta << 1);
        }
        lastOffset = startOffset;
        lastOffsetLength = offsetLength;
    }

    if (payloadLen > 0)
    {
        buffer.WriteBytes(payload.Bytes, payload.Offset, payloadLen);
    }
}
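// The (delta << 1) | 1 pattern above packs a boolean flag into bit 0 of the
// vint: the flag says "the payload length changed, a second vint follows".
// A hypothetical reader-side sketch of the same scheme (names are illustrative
// and assume the Lucene.NET 4.8 DataInput API, not the actual reader code):
static int ReadPositionDelta(DataInput input, ref int lastPayloadLen)
{
    int code = input.ReadVInt32();
    if ((code & 1) != 0)
    {
        lastPayloadLen = input.ReadVInt32(); // flag set: new payload length follows
    }
    return (int)((uint)code >> 1);           // the actual position delta
}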
/// <summary>
/// Called when we are done adding docs to this term
/// </summary>
/// <param name="state"></param>
public override void FinishTerm(BlockTermState state)
{
    var state2 = (PulsingTermState)state;
    Debug.Assert(_pendingCount > 0 || _pendingCount == -1);

    if (_pendingCount == -1)
    {
        state2.wrappedState.DocFreq = state2.DocFreq;
        state2.wrappedState.TotalTermFreq = state2.TotalTermFreq;
        state2.bytes = null;
        _wrappedPostingsWriter.FinishTerm(state2.wrappedState);
    }
    else
    {
        // There were few enough total occurrences for this
        // term, so we fully inline our postings data into
        // terms dict, now:

        // TODO: it'd be better to share this encoding logic
        // in some inner codec that knows how to write a
        // single doc / single position, etc. This way if a
        // given codec wants to store other interesting
        // stuff, it could use this pulsing codec to do so

        if (_indexOptions.CompareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0)
        {
            var lastDocID = 0;
            var pendingIDX = 0;
            var lastPayloadLength = -1;
            var lastOffsetLength = -1;
            while (pendingIDX < _pendingCount)
            {
                var doc = _pending[pendingIDX];
                var delta = doc.docID - lastDocID;
                lastDocID = doc.docID;

                // if (DEBUG) System.out.println("  write doc=" + doc.docID + " freq=" + doc.termFreq);

                if (doc.termFreq == 1)
                {
                    _buffer.WriteVInt32((delta << 1) | 1);
                }
                else
                {
                    _buffer.WriteVInt32(delta << 1);
                    _buffer.WriteVInt32(doc.termFreq);
                }

                var lastPos = 0;
                var lastOffset = 0;
                for (var posIDX = 0; posIDX < doc.termFreq; posIDX++)
                {
                    var pos = _pending[pendingIDX++];
                    Debug.Assert(pos.docID == doc.docID);
                    var posDelta = pos.pos - lastPos;
                    lastPos = pos.pos;

                    var payloadLength = pos.payload == null ? 0 : pos.payload.Length;
                    if (_storePayloads)
                    {
                        if (payloadLength != lastPayloadLength)
                        {
                            _buffer.WriteVInt32((posDelta << 1) | 1);
                            _buffer.WriteVInt32(payloadLength);
                            lastPayloadLength = payloadLength;
                        }
                        else
                        {
                            _buffer.WriteVInt32(posDelta << 1);
                        }
                    }
                    else
                    {
                        _buffer.WriteVInt32(posDelta);
                    }

                    if (_indexOptions.CompareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0)
                    {
                        //System.out.println("write=" + pos.startOffset + "," + pos.endOffset);
                        var offsetDelta = pos.startOffset - lastOffset;
                        var offsetLength = pos.endOffset - pos.startOffset;
                        if (offsetLength != lastOffsetLength)
                        {
                            _buffer.WriteVInt32(offsetDelta << 1 | 1);
                            _buffer.WriteVInt32(offsetLength);
                        }
                        else
                        {
                            _buffer.WriteVInt32(offsetDelta << 1);
                        }
                        lastOffset = pos.startOffset;
                        lastOffsetLength = offsetLength;
                    }

                    if (payloadLength > 0)
                    {
                        Debug.Assert(_storePayloads);
                        _buffer.WriteBytes(pos.payload.Bytes, 0, pos.payload.Length);
                    }
                }
            }
        }
        else if (_indexOptions == IndexOptions.DOCS_AND_FREQS)
        {
            int lastDocId = 0;
            for (int posIdx = 0; posIdx < _pendingCount; posIdx++)
            {
                Position doc = _pending[posIdx];
                int delta = doc.docID - lastDocId;
                Debug.Assert(doc.termFreq != 0);

                if (doc.termFreq == 1)
                {
                    _buffer.WriteVInt32((delta << 1) | 1);
                }
                else
                {
                    _buffer.WriteVInt32(delta << 1);
                    _buffer.WriteVInt32(doc.termFreq);
                }
                lastDocId = doc.docID;
            }
        }
        else if (_indexOptions == IndexOptions.DOCS_ONLY)
        {
            int lastDocId = 0;
            for (int posIdx = 0; posIdx < _pendingCount; posIdx++)
            {
                Position doc = _pending[posIdx];
                _buffer.WriteVInt32(doc.docID - lastDocId);
                lastDocId = doc.docID;
            }
        }

        state2.bytes = new byte[(int)_buffer.GetFilePointer()];
        _buffer.WriteTo(state2.bytes, 0);
        _buffer.Reset();
    }

    _pendingCount = 0;
}
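// Reader-side counterpart (sketch) for the inlined doc/freq encoding produced
// by FinishTerm when freqs are indexed: bit 0 of the doc code means freq == 1,
// otherwise an explicit vint freq follows. This is a hypothetical helper, not
// the actual PulsingPostingsReader code:
private static void ReadDoc(DataInput input, ref int lastDocID, out int freq)
{
    int code = input.ReadVInt32();
    lastDocID += (int)((uint)code >> 1);              // doc deltas accumulate
    freq = (code & 1) != 0 ? 1 : input.ReadVInt32();  // flag set: freq is exactly 1
}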
public override void AddPosition(int pos, BytesRef payload, int startOffset, int endOffset)
{
    if (Debugging.AssertsEnabled) Debugging.Assert(payload == null || outerInstance.field.HasPayloads);
    //System.out.println("  addPos pos=" + pos + " payload=" + payload);

    int delta = pos - lastPos;
    if (Debugging.AssertsEnabled) Debugging.Assert(delta >= 0);
    lastPos = pos;

    int payloadLen = 0;
    if (outerInstance.field.HasPayloads)
    {
        payloadLen = payload == null ? 0 : payload.Length;
        if (payloadLen != lastPayloadLen)
        {
            lastPayloadLen = payloadLen;
            buffer.WriteVInt32((delta << 1) | 1);
            buffer.WriteVInt32(payloadLen);
        }
        else
        {
            buffer.WriteVInt32(delta << 1);
        }
    }
    else
    {
        buffer.WriteVInt32(delta);
    }

    // LUCENENET specific - to avoid boxing, changed from CompareTo() to IndexOptionsComparer.Compare()
    if (IndexOptionsComparer.Default.Compare(outerInstance.field.IndexOptions, IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0)
    {
        // don't use startOffset - lastEndOffset, because this creates lots of negative vints for synonyms,
        // and the numbers aren't that much smaller anyways.
        int offsetDelta = startOffset - lastOffset;
        int offsetLength = endOffset - startOffset;
        if (offsetLength != lastOffsetLength)
        {
            buffer.WriteVInt32(offsetDelta << 1 | 1);
            buffer.WriteVInt32(offsetLength);
        }
        else
        {
            buffer.WriteVInt32(offsetDelta << 1);
        }
        lastOffset = startOffset;
        lastOffsetLength = offsetLength;
    }

    if (payloadLen > 0)
    {
        buffer.WriteBytes(payload.Bytes, payload.Offset, payloadLen);
    }
}
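// A worked example of the "don't use startOffset - lastEndOffset" comment above:
// two synonym tokens covering the same span [10, 15). Deltas against the previous
// *start* offset stay non-negative, while deltas against the previous *end* offset
// go negative, and a negative value always costs 5 bytes as a vint. (Standalone
// illustration; these numbers are made up.)
int lastStart = 10, lastEnd = 15;
int start = 10;                        // second synonym starts at the same offset
int deltaVsStart = start - lastStart;  // 0  -> 1-byte vint
int deltaVsEnd = start - lastEnd;      // -5 -> sign bit set, 5-byte vint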
private void FlushBlock()
{
    //System.out.println("BTW.flushBlock seg=" + segment + " pendingCount=" + pendingCount + " fp=" + out.getFilePointer());

    // First pass: compute common prefix for all terms
    // in the block, against term before first term in
    // this block:
    int commonPrefix = SharedPrefix(lastPrevTerm, pendingTerms[0].Term);
    for (int termCount = 1; termCount < pendingCount; termCount++)
    {
        commonPrefix = Math.Min(commonPrefix, SharedPrefix(lastPrevTerm, pendingTerms[termCount].Term));
    }

    outerInstance.m_output.WriteVInt32(pendingCount);
    outerInstance.m_output.WriteVInt32(commonPrefix);

    // 2nd pass: write suffixes, as separate byte[] blob
    for (int termCount = 0; termCount < pendingCount; termCount++)
    {
        int suffix = pendingTerms[termCount].Term.Length - commonPrefix;
        // TODO: cutover to better intblock codec, instead
        // of interleaving here:
        bytesWriter.WriteVInt32(suffix);
        bytesWriter.WriteBytes(pendingTerms[termCount].Term.Bytes, commonPrefix, suffix);
    }
    outerInstance.m_output.WriteVInt32((int)bytesWriter.Position); // LUCENENET specific: Renamed from getFilePointer() to match FileStream
    bytesWriter.WriteTo(outerInstance.m_output);
    bytesWriter.Reset();

    // 3rd pass: write the freqs as byte[] blob
    // TODO: cutover to better intblock codec. simple64?
    // write prefix, suffix first:
    for (int termCount = 0; termCount < pendingCount; termCount++)
    {
        BlockTermState state = pendingTerms[termCount].State;
        if (Debugging.AssertsEnabled) Debugging.Assert(state != null);
        bytesWriter.WriteVInt32(state.DocFreq);
        if (fieldInfo.IndexOptions != IndexOptions.DOCS_ONLY)
        {
            bytesWriter.WriteVInt64(state.TotalTermFreq - state.DocFreq);
        }
    }
    outerInstance.m_output.WriteVInt32((int)bytesWriter.Position); // LUCENENET specific: Renamed from getFilePointer() to match FileStream
    bytesWriter.WriteTo(outerInstance.m_output);
    bytesWriter.Reset();

    // 4th pass: write the metadata
    long[] longs = new long[longsSize];
    bool absolute = true;
    for (int termCount = 0; termCount < pendingCount; termCount++)
    {
        BlockTermState state = pendingTerms[termCount].State;
        postingsWriter.EncodeTerm(longs, bufferWriter, fieldInfo, state, absolute);
        for (int i = 0; i < longsSize; i++)
        {
            bytesWriter.WriteVInt64(longs[i]);
        }
        bufferWriter.WriteTo(bytesWriter);
        bufferWriter.Reset();
        absolute = false;
    }
    outerInstance.m_output.WriteVInt32((int)bytesWriter.Position); // LUCENENET specific: Renamed from getFilePointer() to match FileStream
    bytesWriter.WriteTo(outerInstance.m_output);
    bytesWriter.Reset();

    lastPrevTerm.CopyBytes(pendingTerms[pendingCount - 1].Term);
    pendingCount = 0;
}
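// The stream layout FlushBlock produces, shown as a hypothetical reader-side
// sketch (the real decoding lives in BlockTermsReader; SkipBlock is illustrative
// and assumes the Lucene.NET 4.8 DataInput API):
private static void SkipBlock(DataInput input)
{
    int termCount = input.ReadVInt32();    // number of terms in this block
    int commonPrefix = input.ReadVInt32(); // prefix shared against the prior term
    input.SkipBytes(input.ReadVInt32());   // blob 1: (suffixLen, suffixBytes) per term
    input.SkipBytes(input.ReadVInt32());   // blob 2: docFreq (+ totalTermFreq delta) per term
    input.SkipBytes(input.ReadVInt32());   // blob 3: metadata longs + postings-writer bytes
}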
public DirectField(SegmentReadState state, string field, Terms termsIn, int minSkipCount, int lowFreqCutoff)
{
    FieldInfo fieldInfo = state.FieldInfos.FieldInfo(field);

    sumTotalTermFreq = termsIn.SumTotalTermFreq;
    sumDocFreq = termsIn.SumDocFreq;
    docCount = termsIn.DocCount;

    int numTerms = (int)termsIn.Size();
    if (numTerms == -1)
    {
        throw new System.ArgumentException("codec does not provide Terms.size()");
    }
    terms = new TermAndSkip[numTerms];
    termOffsets = new int[1 + numTerms];

    byte[] termBytes = new byte[1024];

    this.minSkipCount = minSkipCount;

    hasFreq = fieldInfo.FieldIndexOptions.Value.CompareTo(IndexOptions.DOCS_ONLY) > 0;
    hasPos = fieldInfo.FieldIndexOptions.Value.CompareTo(IndexOptions.DOCS_AND_FREQS) > 0;
    hasOffsets_Renamed = fieldInfo.FieldIndexOptions.Value.CompareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) > 0;
    hasPayloads_Renamed = fieldInfo.HasPayloads();

    BytesRef term;
    DocsEnum docsEnum = null;
    DocsAndPositionsEnum docsAndPositionsEnum = null;
    TermsEnum termsEnum = termsIn.Iterator(null);
    int termOffset = 0;

    IntArrayWriter scratch = new IntArrayWriter();

    // Used for payloads, if any:
    RAMOutputStream ros = new RAMOutputStream();

    // if (DEBUG) {
    //   System.out.println("\nLOAD terms seg=" + state.segmentInfo.name + " field=" + field + " hasOffsets=" + hasOffsets + " hasFreq=" + hasFreq + " hasPos=" + hasPos + " hasPayloads=" + hasPayloads);
    // }

    while ((term = termsEnum.Next()) != null)
    {
        int docFreq = termsEnum.DocFreq();
        long totalTermFreq = termsEnum.TotalTermFreq();

        // if (DEBUG) {
        //   System.out.println("  term=" + term.utf8ToString());
        // }

        termOffsets[count] = termOffset;

        if (termBytes.Length < (termOffset + term.Length))
        {
            termBytes = ArrayUtil.Grow(termBytes, termOffset + term.Length);
        }
        Array.Copy(term.Bytes, term.Offset, termBytes, termOffset, term.Length);
        termOffset += term.Length;
        termOffsets[count + 1] = termOffset;

        if (hasPos)
        {
            docsAndPositionsEnum = termsEnum.DocsAndPositions(null, docsAndPositionsEnum);
        }
        else
        {
            docsEnum = termsEnum.Docs(null, docsEnum);
        }

        TermAndSkip ent;

        DocsEnum docsEnum2 = hasPos ? docsAndPositionsEnum : docsEnum;

        int docID;

        if (docFreq <= lowFreqCutoff)
        {
            ros.Reset();

            // Pack postings for low-freq terms into a single int[]:
            while ((docID = docsEnum2.NextDoc()) != DocsEnum.NO_MORE_DOCS)
            {
                scratch.Add(docID);
                if (hasFreq)
                {
                    int freq = docsEnum2.Freq();
                    scratch.Add(freq);
                    if (hasPos)
                    {
                        for (int pos = 0; pos < freq; pos++)
                        {
                            scratch.Add(docsAndPositionsEnum.NextPosition());
                            if (hasOffsets_Renamed)
                            {
                                scratch.Add(docsAndPositionsEnum.StartOffset());
                                scratch.Add(docsAndPositionsEnum.EndOffset());
                            }
                            if (hasPayloads_Renamed)
                            {
                                BytesRef payload = docsAndPositionsEnum.Payload;
                                if (payload != null)
                                {
                                    scratch.Add(payload.Length);
                                    ros.WriteBytes(payload.Bytes, payload.Offset, payload.Length);
                                }
                                else
                                {
                                    scratch.Add(0);
                                }
                            }
                        }
                    }
                }
            }

            byte[] payloads;
            if (hasPayloads_Renamed)
            {
                ros.Flush();
                payloads = new byte[(int)ros.Length];
                ros.WriteTo(payloads, 0);
            }
            else
            {
                payloads = null;
            }

            int[] postings = scratch.Get();

            ent = new LowFreqTerm(postings, payloads, docFreq, (int)totalTermFreq);
        }
        else
        {
            var docs = new int[docFreq];
            int[] freqs;
            int[][] positions;
            byte[][][] payloads;

            if (hasFreq)
            {
                freqs = new int[docFreq];
                if (hasPos)
                {
                    positions = new int[docFreq][];
                    if (hasPayloads_Renamed)
                    {
                        payloads = new byte[docFreq][][];
                    }
                    else
                    {
                        payloads = null;
                    }
                }
                else
                {
                    positions = null;
                    payloads = null;
                }
            }
            else
            {
                freqs = null;
                positions = null;
                payloads = null;
            }

            // Use separate int[] for the postings for high-freq terms:
            int upto = 0;
            while ((docID = docsEnum2.NextDoc()) != DocsEnum.NO_MORE_DOCS)
            {
                docs[upto] = docID;
                if (hasFreq)
                {
                    int freq = docsEnum2.Freq();
                    freqs[upto] = freq;

                    if (hasPos)
                    {
                        int mult;
                        if (hasOffsets_Renamed)
                        {
                            mult = 3;
                        }
                        else
                        {
                            mult = 1;
                        }
                        if (hasPayloads_Renamed)
                        {
                            payloads[upto] = new byte[freq][];
                        }
                        positions[upto] = new int[mult * freq];
                        int posUpto = 0;
                        for (int pos = 0; pos < freq; pos++)
                        {
                            positions[upto][posUpto] = docsAndPositionsEnum.NextPosition();
                            if (hasPayloads_Renamed)
                            {
                                BytesRef payload = docsAndPositionsEnum.Payload;
                                if (payload != null)
                                {
                                    var payloadBytes = new byte[payload.Length];
                                    Array.Copy(payload.Bytes, payload.Offset, payloadBytes, 0, payload.Length);
                                    payloads[upto][pos] = payloadBytes;
                                }
                            }
                            posUpto++;
                            if (hasOffsets_Renamed)
                            {
                                positions[upto][posUpto++] = docsAndPositionsEnum.StartOffset();
                                positions[upto][posUpto++] = docsAndPositionsEnum.EndOffset();
                            }
                        }
                    }
                }
                upto++;
            }
            Debug.Assert(upto == docFreq);
            ent = new HighFreqTerm(docs, freqs, positions, payloads, totalTermFreq);
        }

        terms[count] = ent;
        SetSkips(count, termBytes);
        count++;
    }

    // End sentinel:
    termOffsets[count] = termOffset;

    FinishSkips();

    //System.out.println(skipCount + " skips: " + field);

    this.termBytes = new byte[termOffset];
    Array.Copy(termBytes, 0, this.termBytes, 0, termOffset);

    // Pack skips:
    this.skips = new int[skipCount];
    this.skipOffsets = new int[1 + numTerms];

    int skipOffset = 0;
    for (int i = 0; i < numTerms; i++)
    {
        int[] termSkips = terms[i].skips;
        skipOffsets[i] = skipOffset;
        if (termSkips != null)
        {
            Array.Copy(termSkips, 0, skips, skipOffset, termSkips.Length);
            skipOffset += termSkips.Length;
            terms[i].skips = null;
        }
    }
    this.skipOffsets[numTerms] = skipOffset;
    Debug.Assert(skipOffset == skipCount);
}
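// Layout of the packed low-freq int[] built above, depending on the index options
// (payload bytes live in the side byte[]; only their lengths are in the int[]):
//   [docID, (freq, (pos, (startOffset, endOffset)?, (payloadLen)?){freq})?]*
// A hypothetical walker over that layout, for illustration only:
private static int CountDocs(int[] postings, bool hasFreq, bool hasPos, bool hasOffsets, bool hasPayloads)
{
    int count = 0, upto = 0;
    int intsPerPos = 1 + (hasOffsets ? 2 : 0) + (hasPayloads ? 1 : 0);
    while (upto < postings.Length)
    {
        upto++;                        // docID
        count++;
        if (!hasFreq) continue;
        int freq = postings[upto++];   // term frequency
        if (hasPos)
        {
            upto += freq * intsPerPos; // skip positions (+ offsets, + payload lengths)
        }
    }
    return count;
}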
private void FlushBlock()
{
    // First pass: compute common prefix for all terms
    // in the block, against term before first term in
    // this block:
    int commonPrefix = SharedPrefix(_lastPrevTerm, _pendingTerms[0].Term);
    for (int termCount = 1; termCount < _pendingCount; termCount++)
    {
        commonPrefix = Math.Min(commonPrefix, SharedPrefix(_lastPrevTerm, _pendingTerms[termCount].Term));
    }

    _btw._output.WriteVInt(_pendingCount);
    _btw._output.WriteVInt(commonPrefix);

    // 2nd pass: write suffixes, as separate byte[] blob
    for (var termCount = 0; termCount < _pendingCount; termCount++)
    {
        var suffix = _pendingTerms[termCount].Term.Length - commonPrefix;
        // TODO: cutover to better intblock codec, instead
        // of interleaving here:
        _bytesWriter.WriteVInt(suffix);
        _bytesWriter.WriteBytes(_pendingTerms[termCount].Term.Bytes, commonPrefix, suffix);
    }
    _btw._output.WriteVInt((int)_bytesWriter.FilePointer);
    _bytesWriter.WriteTo(_btw._output);
    _bytesWriter.Reset();

    // 3rd pass: write the freqs as byte[] blob
    // TODO: cutover to better intblock codec. simple64?
    // write prefix, suffix first:
    for (int termCount = 0; termCount < _pendingCount; termCount++)
    {
        BlockTermState state = _pendingTerms[termCount].State;
        Debug.Assert(state != null);
        _bytesWriter.WriteVInt(state.DocFreq);
        if (_fieldInfo.FieldIndexOptions != FieldInfo.IndexOptions.DOCS_ONLY)
        {
            _bytesWriter.WriteVLong(state.TotalTermFreq - state.DocFreq);
        }
    }
    _btw._output.WriteVInt((int)_bytesWriter.FilePointer);
    _bytesWriter.WriteTo(_btw._output);
    _bytesWriter.Reset();

    // 4th pass: write the metadata
    var longs = new long[_longsSize];
    bool absolute = true;
    for (int termCount = 0; termCount < _pendingCount; termCount++)
    {
        BlockTermState state = _pendingTerms[termCount].State;
        _postingsWriter.EncodeTerm(longs, _bufferWriter, _fieldInfo, state, absolute);
        for (int i = 0; i < _longsSize; i++)
        {
            _bytesWriter.WriteVLong(longs[i]);
        }
        _bufferWriter.WriteTo(_bytesWriter);
        _bufferWriter.Reset();
        absolute = false;
    }
    _btw._output.WriteVInt((int)_bytesWriter.FilePointer);
    _bytesWriter.WriteTo(_btw._output);
    _bytesWriter.Reset();

    _lastPrevTerm.CopyBytes(_pendingTerms[_pendingCount - 1].Term);
    _pendingCount = 0;
}
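// The 4th pass above delegates per-term metadata to the postings writer's
// EncodeTerm. A sketch of that contract (MyTermState and lastDocStartFP are
// hypothetical, not part of any real postings writer): when absolute is true
// the longs carry raw file pointers; on later calls they carry deltas against
// the previous term, which keeps the vints FlushBlock writes small.
public override void EncodeTerm(long[] longs, DataOutput output, FieldInfo fieldInfo, BlockTermState state, bool absolute)
{
    var myState = (MyTermState)state;
    longs[0] = absolute
        ? myState.DocStartFP                   // raw file pointer for the first term
        : myState.DocStartFP - lastDocStartFP; // delta vs. the previous term
    lastDocStartFP = myState.DocStartFP;
}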