public override int SetField(FieldInfo fieldInfo)
{
    IndexOptions indexOptions = fieldInfo.IndexOptions;
    fieldHasFreqs = indexOptions.CompareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
    fieldHasPositions = indexOptions.CompareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
    fieldHasOffsets = indexOptions.CompareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
    fieldHasPayloads = fieldInfo.HasPayloads;

    skipWriter.SetField(fieldHasPositions, fieldHasOffsets, fieldHasPayloads);
    lastState = emptyState;

    if (fieldHasPositions)
    {
        if (fieldHasPayloads || fieldHasOffsets)
        {
            return 3; // doc + pos + pay FP
        }
        else
        {
            return 2; // doc + pos FP
        }
    }
    else
    {
        return 1; // doc FP
    }
}
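The comparison above, like the rest of the snippets in this section, relies on the IndexOptions constants being declared in increasing order of indexed detail (DOCS_ONLY, then DOCS_AND_FREQS, then DOCS_AND_FREQS_AND_POSITIONS, then DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS), so CompareTo reads as "indexes at least this much detail". A minimal sketch of that pattern in isolation (the FieldCaps helper is hypothetical, not a Lucene.Net type):

using Lucene.Net.Index; // IndexOptions (namespace may vary across Lucene.Net 4.x releases)

// Hypothetical helper: derives per-field capability flags the same way the
// postings writers in this section do, by comparing against the enum ordering.
internal static class FieldCaps
{
    public static bool HasFreqs(IndexOptions options)
    {
        return options.CompareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
    }

    public static bool HasPositions(IndexOptions options)
    {
        return options.CompareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
    }

    public static bool HasOffsets(IndexOptions options)
    {
        return options.CompareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
    }
}

Enum.CompareTo compares the underlying numeric values, which is why the declaration order of the constants is what makes this idiom work.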
public SimpleTextPostingsWriter(SimpleTextFieldsWriter outerInstance, FieldInfo field)
{
    _outerInstance = outerInstance;
    _indexOptions = field.FieldIndexOptions.Value;
    _writePositions = _indexOptions.CompareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
    _writeOffsets = _indexOptions.CompareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
}
/// <summary>
/// Add a new position & payload
/// </summary>
public override void AddPosition(int position, BytesRef payload, int startOffset, int endOffset)
{
    //if (DEBUG) System.out.println("SPW: addPos pos=" + position + " payload=" + (payload == null ? "null" : (payload.Length + " bytes")) + " proxFP=" + proxOut.getFilePointer());
    Debug.Assert(IndexOptions.CompareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0, "invalid indexOptions: " + IndexOptions);
    Debug.Assert(ProxOut != null);

    int delta = position - LastPosition;

    Debug.Assert(delta >= 0, "position=" + position + " lastPosition=" + LastPosition); // not quite right (if pos=0 is repeated twice we don't catch it)

    LastPosition = position;

    int payloadLength = 0;

    if (StorePayloads)
    {
        payloadLength = payload == null ? 0 : payload.Length;

        if (payloadLength != LastPayloadLength)
        {
            LastPayloadLength = payloadLength;
            ProxOut.WriteVInt32((delta << 1) | 1);
            ProxOut.WriteVInt32(payloadLength);
        }
        else
        {
            ProxOut.WriteVInt32(delta << 1);
        }
    }
    else
    {
        ProxOut.WriteVInt32(delta);
    }

    if (StoreOffsets)
    {
        // don't use startOffset - lastEndOffset, because this creates lots of negative vints for synonyms,
        // and the numbers aren't that much smaller anyways.
        int offsetDelta = startOffset - LastOffset;
        int offsetLength = endOffset - startOffset;
        Debug.Assert(offsetDelta >= 0 && offsetLength >= 0, "startOffset=" + startOffset + ",lastOffset=" + LastOffset + ",endOffset=" + endOffset);
        if (offsetLength != LastOffsetLength)
        {
            ProxOut.WriteVInt32(offsetDelta << 1 | 1);
            ProxOut.WriteVInt32(offsetLength);
        }
        else
        {
            ProxOut.WriteVInt32(offsetDelta << 1);
        }
        LastOffset = startOffset;
        LastOffsetLength = offsetLength;
    }

    if (payloadLength > 0)
    {
        ProxOut.WriteBytes(payload.Bytes, payload.Offset, payloadLength);
    }
}
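The writer above leans on a small convention that the matching readers (see the NextDoc snippet at the end of this section) undo: each position delta is shifted left by one and the low bit marks "a new payload length follows". A self-contained sketch of just that convention, using a List&lt;int&gt; as a stand-in for the VInt output stream (PositionDeltaSketch and its names are hypothetical, not Lucene.Net code):

using System.Collections.Generic;

// Sketch of the low-bit convention used by AddPosition above: deltas are doubled,
// and the low bit signals that a new payload length follows in the stream.
internal static class PositionDeltaSketch
{
    public static List<int> Encode(IList<(int Pos, int PayloadLen)> positions)
    {
        var output = new List<int>(); // stand-in for the VInt stream ProxOut receives
        int lastPos = 0;
        int lastPayloadLen = -1;
        foreach (var (pos, payloadLen) in positions)
        {
            int delta = pos - lastPos;
            lastPos = pos;
            if (payloadLen != lastPayloadLen)
            {
                output.Add((delta << 1) | 1); // low bit set: payload length changed
                output.Add(payloadLen);
                lastPayloadLen = payloadLen;
            }
            else
            {
                output.Add(delta << 1); // low bit clear: reuse the previous payload length
            }
        }
        return output;
    }
}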
private void SetIndexOptions(IndexOptions indexOptions)
{
    if (indexOptions == IndexOptions.NONE)
    {
        // field could later be updated with indexed=true, so set everything on
        hasFreq = hasProx = hasOffsets = true;
    }
    else
    {
        hasFreq = indexOptions.CompareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
        hasProx = indexOptions.CompareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
        hasOffsets = indexOptions.CompareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
    }
}
public virtual SimpleTextDocsAndPositionsEnum Reset(long fp, IBits liveDocs, IndexOptions indexOptions, int docFreq)
{
    _liveDocs = liveDocs;
    _nextDocStart = fp;
    _docId = -1;
    _readPositions = indexOptions.CompareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
    _readOffsets = indexOptions.CompareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;

    if (!_readOffsets)
    {
        _startOffset = -1;
        _endOffset = -1;
    }
    _cost = docFreq;
    return this;
}
private bool CheckConsistency()
{
    if (!indexed)
    {
        if (Debugging.AssertsEnabled)
        {
            Debugging.Assert(!storeTermVector);
            Debugging.Assert(!storePayloads);
            Debugging.Assert(!omitNorms);
            Debugging.Assert(normType == DocValuesType.NONE);
            Debugging.Assert(indexOptions == IndexOptions.NONE);
        }
    }
    else
    {
        if (Debugging.AssertsEnabled)
        {
            Debugging.Assert(indexOptions != IndexOptions.NONE);
        }
        if (omitNorms)
        {
            if (Debugging.AssertsEnabled)
            {
                Debugging.Assert(normType == DocValuesType.NONE);
            }
        }
        // Cannot store payloads unless positions are indexed:
        if (Debugging.AssertsEnabled)
        {
            Debugging.Assert(indexOptions.CompareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0 || !this.storePayloads);
        }
    }

    return true;
}
public override DocsEnum Docs(IBits liveDocs, DocsEnum reuse, DocsFlags flags)
{
    DocsEnum inReuse;
    SortingDocsEnum wrapReuse;
    if (reuse != null && reuse is SortingDocsEnum)
    {
        // if we're asked to reuse the given DocsEnum and it is Sorting, return
        // the wrapped one, since some Codecs expect it.
        wrapReuse = (SortingDocsEnum)reuse;
        inReuse = wrapReuse.Wrapped;
    }
    else
    {
        wrapReuse = null;
        inReuse = reuse;
    }

    DocsEnum inDocs = m_input.Docs(NewToOld(liveDocs), inReuse, flags);
    bool withFreqs = indexOptions.CompareTo(IndexOptions.DOCS_AND_FREQS) >= 0 && (flags & DocsFlags.FREQS) != 0;
    return new SortingDocsEnum(docMap.Count, wrapReuse, inDocs, withFreqs, docMap);
}
// Currently, this instance is re-used across fields, so
// our parent calls setField whenever the field changes
public override int SetField(FieldInfo fieldInfo)
{
    this.fieldInfo = fieldInfo;
    this.indexOptions = fieldInfo.IndexOptions;

    if (indexOptions.CompareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0)
    {
        throw new System.NotSupportedException("this codec cannot index offsets");
    }

    skipListWriter.SetIndexOptions(indexOptions);
    storePayloads = indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS && fieldInfo.HasPayloads;
    lastPayloadFP = 0;
    lastSkipFP = 0;
    lastState = SetEmptyState();
    return 0;
}
public override DocsAndPositionsEnum DocsAndPositions(IBits liveDocs, DocsAndPositionsEnum reuse, DocsAndPositionsFlags flags)
{
    if (_indexOptions.CompareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) < 0)
    {
        // Positions were not indexed
        return null;
    }

    SimpleTextDocsAndPositionsEnum docsAndPositionsEnum;
    if (reuse != null && reuse is SimpleTextDocsAndPositionsEnum && ((SimpleTextDocsAndPositionsEnum)reuse).CanReuse(_outerInstance._input))
    {
        docsAndPositionsEnum = (SimpleTextDocsAndPositionsEnum)reuse;
    }
    else
    {
        docsAndPositionsEnum = new SimpleTextDocsAndPositionsEnum(_outerInstance);
    }

    return docsAndPositionsEnum.Reset(_docsStart, liveDocs, _indexOptions, _docFreq);
}
// Currently, this instance is re-used across fields, so
// our parent calls setField whenever the field changes
public override int SetField(FieldInfo fieldInfo)
{
    //System.out.println("SPW: setField");
    /*
     * if (BlockTreeTermsWriter.DEBUG && fieldInfo.Name.Equals("id", StringComparison.Ordinal)) {
     *   DEBUG = true;
     * } else {
     *   DEBUG = false;
     * }
     */
    this.FieldInfo = fieldInfo;
    IndexOptions = fieldInfo.IndexOptions;

    StoreOffsets = IndexOptions.CompareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
    StorePayloads = fieldInfo.HasPayloads;
    LastState = EmptyState;

    //System.out.println("  set init blockFreqStart=" + freqStart);
    //System.out.println("  set init blockProxStart=" + proxStart);
    return 0;
}
/// <summary>
/// Called when we are done adding docs to this term
/// </summary>
/// <param name="state"></param>
public override void FinishTerm(BlockTermState state)
{
    var state2 = (PulsingTermState)state;

    Debug.Assert(_pendingCount > 0 || _pendingCount == -1);

    if (_pendingCount == -1)
    {
        state2.wrappedState.DocFreq = state2.DocFreq;
        state2.wrappedState.TotalTermFreq = state2.TotalTermFreq;
        state2.bytes = null;
        _wrappedPostingsWriter.FinishTerm(state2.wrappedState);
    }
    else
    {
        // There were few enough total occurrences for this
        // term, so we fully inline our postings data into
        // terms dict, now:

        // TODO: it'd be better to share this encoding logic
        // in some inner codec that knows how to write a
        // single doc / single position, etc. This way if a
        // given codec wants to store other interesting
        // stuff, it could use this pulsing codec to do so

        if (_indexOptions.CompareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0)
        {
            var lastDocID = 0;
            var pendingIDX = 0;
            var lastPayloadLength = -1;
            var lastOffsetLength = -1;

            while (pendingIDX < _pendingCount)
            {
                var doc = _pending[pendingIDX];

                var delta = doc.docID - lastDocID;
                lastDocID = doc.docID;

                // if (DEBUG) System.out.println("  write doc=" + doc.docID + " freq=" + doc.termFreq);

                if (doc.termFreq == 1)
                {
                    _buffer.WriteVInt32((delta << 1) | 1);
                }
                else
                {
                    _buffer.WriteVInt32(delta << 1);
                    _buffer.WriteVInt32(doc.termFreq);
                }

                var lastPos = 0;
                var lastOffset = 0;
                for (var posIDX = 0; posIDX < doc.termFreq; posIDX++)
                {
                    var pos = _pending[pendingIDX++];
                    Debug.Assert(pos.docID == doc.docID);

                    var posDelta = pos.pos - lastPos;
                    lastPos = pos.pos;

                    var payloadLength = pos.payload == null ? 0 : pos.payload.Length;
                    if (_storePayloads)
                    {
                        if (payloadLength != lastPayloadLength)
                        {
                            _buffer.WriteVInt32((posDelta << 1) | 1);
                            _buffer.WriteVInt32(payloadLength);
                            lastPayloadLength = payloadLength;
                        }
                        else
                        {
                            _buffer.WriteVInt32(posDelta << 1);
                        }
                    }
                    else
                    {
                        _buffer.WriteVInt32(posDelta);
                    }

                    if (_indexOptions.CompareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0)
                    {
                        //System.out.println("write=" + pos.startOffset + "," + pos.endOffset);
                        var offsetDelta = pos.startOffset - lastOffset;
                        var offsetLength = pos.endOffset - pos.startOffset;
                        if (offsetLength != lastOffsetLength)
                        {
                            _buffer.WriteVInt32(offsetDelta << 1 | 1);
                            _buffer.WriteVInt32(offsetLength);
                        }
                        else
                        {
                            _buffer.WriteVInt32(offsetDelta << 1);
                        }
                        lastOffset = pos.startOffset;
                        lastOffsetLength = offsetLength;
                    }

                    if (payloadLength > 0)
                    {
                        Debug.Assert(_storePayloads);
                        _buffer.WriteBytes(pos.payload.Bytes, 0, pos.payload.Length);
                    }
                }
            }
        }
        else if (_indexOptions == IndexOptions.DOCS_AND_FREQS)
        {
            int lastDocId = 0;
            for (int posIdx = 0; posIdx < _pendingCount; posIdx++)
            {
                Position doc = _pending[posIdx];
                int delta = doc.docID - lastDocId;

                Debug.Assert(doc.termFreq != 0);

                if (doc.termFreq == 1)
                {
                    _buffer.WriteVInt32((delta << 1) | 1);
                }
                else
                {
                    _buffer.WriteVInt32(delta << 1);
                    _buffer.WriteVInt32(doc.termFreq);
                }
                lastDocId = doc.docID;
            }
        }
        else if (_indexOptions == IndexOptions.DOCS_ONLY)
        {
            int lastDocId = 0;
            for (int posIdx = 0; posIdx < _pendingCount; posIdx++)
            {
                Position doc = _pending[posIdx];
                _buffer.WriteVInt32(doc.docID - lastDocId);
                lastDocId = doc.docID;
            }
        }

        state2.bytes = new byte[(int)_buffer.GetFilePointer()];
        _buffer.WriteTo(state2.bytes, 0);
        _buffer.Reset();
    }
    _pendingCount = 0;
}
public override void Write(Directory directory, string segmentName, string segmentSuffix, FieldInfos infos, IOContext context)
{
    string fileName = IndexFileNames.SegmentFileName(segmentName, "", Lucene42FieldInfosFormat.EXTENSION);
    IndexOutput output = directory.CreateOutput(fileName, context);
    bool success = false;
    try
    {
        CodecUtil.WriteHeader(output, Lucene42FieldInfosFormat.CODEC_NAME, Lucene42FieldInfosFormat.FORMAT_CURRENT);
        output.WriteVInt32(infos.Count);
        foreach (FieldInfo fi in infos)
        {
            IndexOptions indexOptions = fi.IndexOptions;
            sbyte bits = 0x0;
            if (fi.HasVectors)
            {
                bits |= Lucene42FieldInfosFormat.STORE_TERMVECTOR;
            }
            if (fi.OmitsNorms)
            {
                bits |= Lucene42FieldInfosFormat.OMIT_NORMS;
            }
            if (fi.HasPayloads)
            {
                bits |= Lucene42FieldInfosFormat.STORE_PAYLOADS;
            }
            if (fi.IsIndexed)
            {
                bits |= Lucene42FieldInfosFormat.IS_INDEXED;
                Debug.Assert(indexOptions.CompareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0 || !fi.HasPayloads);
                if (indexOptions == IndexOptions.DOCS_ONLY)
                {
                    bits |= Lucene42FieldInfosFormat.OMIT_TERM_FREQ_AND_POSITIONS;
                }
                else if (indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS)
                {
                    bits |= Lucene42FieldInfosFormat.STORE_OFFSETS_IN_POSTINGS;
                }
                else if (indexOptions == IndexOptions.DOCS_AND_FREQS)
                {
                    bits |= Lucene42FieldInfosFormat.OMIT_POSITIONS;
                }
            }
            output.WriteString(fi.Name);
            output.WriteVInt32(fi.Number);
            output.WriteByte((byte)bits);

            // pack the DV types in one byte
            var dv = DocValuesByte(fi.DocValuesType);
            var nrm = DocValuesByte(fi.NormType);
            Debug.Assert((dv & (~0xF)) == 0 && (nrm & (~0x0F)) == 0);
            var val = (byte)(0xff & ((nrm << 4) | (byte)dv));
            output.WriteByte(val);
            output.WriteStringStringMap(fi.Attributes);
        }
        success = true;
    }
    finally
    {
        if (success)
        {
            output.Dispose();
        }
        else
        {
            IOUtils.DisposeWhileHandlingException(output);
        }
    }
}
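For reference, the inverse of the bit packing above: a hedged sketch of how the packed byte could be mapped back to an IndexOptions value. It reuses the Lucene42FieldInfosFormat flag constants the writer references (in Lucene.Net they are internal to the codec assembly), so treat it as illustrative rather than the shipped reader:

// Illustrative inverse of the writer above: maps the packed flag byte back to an
// IndexOptions value, mirroring the writer's branch order. Not the shipped reader.
private static IndexOptions DecodeIndexOptions(sbyte bits)
{
    if ((bits & Lucene42FieldInfosFormat.IS_INDEXED) == 0)
    {
        return IndexOptions.NONE; // field was not indexed
    }
    if ((bits & Lucene42FieldInfosFormat.OMIT_TERM_FREQ_AND_POSITIONS) != 0)
    {
        return IndexOptions.DOCS_ONLY;
    }
    if ((bits & Lucene42FieldInfosFormat.OMIT_POSITIONS) != 0)
    {
        return IndexOptions.DOCS_AND_FREQS;
    }
    if ((bits & Lucene42FieldInfosFormat.STORE_OFFSETS_IN_POSTINGS) != 0)
    {
        return IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS;
    }
    // no "omit"/"offsets" flag set: freqs and positions were indexed
    return IndexOptions.DOCS_AND_FREQS_AND_POSITIONS;
}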
internal void Flush(string fieldName, FieldsConsumer consumer, SegmentWriteState state)
{
    if (!fieldInfo.IsIndexed)
    {
        return; // nothing to flush, don't bother the codec with the unindexed field
    }

    TermsConsumer termsConsumer = consumer.AddField(fieldInfo);
    IComparer<BytesRef> termComp = termsConsumer.Comparer;

    // CONFUSING: this.indexOptions holds the index options
    // that were current when we first saw this field. But
    // it's possible this has changed, eg when other
    // documents are indexed that cause a "downgrade" of the
    // IndexOptions. So we must decode the in-RAM buffer
    // according to this.indexOptions, but then write the
    // new segment to the directory according to
    // currentFieldIndexOptions:
    IndexOptions currentFieldIndexOptions = fieldInfo.IndexOptions;
    Debug.Assert(currentFieldIndexOptions != IndexOptions.NONE);

    bool writeTermFreq = currentFieldIndexOptions.CompareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
    bool writePositions = currentFieldIndexOptions.CompareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
    bool writeOffsets = currentFieldIndexOptions.CompareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;

    bool readTermFreq = this.hasFreq;
    bool readPositions = this.hasProx;
    bool readOffsets = this.hasOffsets;

    //System.out.println("flush readTF=" + readTermFreq + " readPos=" + readPositions + " readOffs=" + readOffsets);

    // Make sure FieldInfo.update is working correctly!:
    Debug.Assert(!writeTermFreq || readTermFreq);
    Debug.Assert(!writePositions || readPositions);
    Debug.Assert(!writeOffsets || readOffsets);
    Debug.Assert(!writeOffsets || writePositions);

    IDictionary<Term, int?> segDeletes;
    if (state.SegUpdates != null && state.SegUpdates.terms.Count > 0)
    {
        segDeletes = state.SegUpdates.terms;
    }
    else
    {
        segDeletes = null;
    }

    int[] termIDs = termsHashPerField.SortPostings(termComp);
    int numTerms = termsHashPerField.bytesHash.Count;
    BytesRef text = new BytesRef();
    FreqProxPostingsArray postings = (FreqProxPostingsArray)termsHashPerField.postingsArray;
    ByteSliceReader freq = new ByteSliceReader();
    ByteSliceReader prox = new ByteSliceReader();

    FixedBitSet visitedDocs = new FixedBitSet(state.SegmentInfo.DocCount);
    long sumTotalTermFreq = 0;
    long sumDocFreq = 0;

    Term protoTerm = new Term(fieldName);
    for (int i = 0; i < numTerms; i++)
    {
        int termID = termIDs[i];
        // Get BytesRef
        int textStart = postings.textStarts[termID];
        termsHashPerField.bytePool.SetBytesRef(text, textStart);

        termsHashPerField.InitReader(freq, termID, 0);
        if (readPositions || readOffsets)
        {
            termsHashPerField.InitReader(prox, termID, 1);
        }

        // TODO: really TermsHashPerField should take over most
        // of this loop, including merge sort of terms from
        // multiple threads and interacting with the
        // TermsConsumer, only calling out to us (passing us the
        // DocsConsumer) to handle delivery of docs/positions

        PostingsConsumer postingsConsumer = termsConsumer.StartTerm(text);

        int? delDocLimit;
        if (segDeletes != null)
        {
            protoTerm.Bytes = text;
            int? docIDUpto;
            segDeletes.TryGetValue(protoTerm, out docIDUpto);
            if (docIDUpto != null)
            {
                delDocLimit = docIDUpto;
            }
            else
            {
                delDocLimit = 0;
            }
        }
        else
        {
            delDocLimit = 0;
        }

        // Now termStates has numToMerge FieldMergeStates
        // which all share the same term. Now we must
        // interleave the docID streams.
        int docFreq = 0;
        long totalTermFreq = 0;
        int docID = 0;

        while (true)
        {
            //System.out.println("  cycle");
            int termFreq;
            if (freq.Eof())
            {
                if (postings.lastDocCodes[termID] != -1)
                {
                    // Return last doc
                    docID = postings.lastDocIDs[termID];
                    if (readTermFreq)
                    {
                        termFreq = postings.termFreqs[termID];
                    }
                    else
                    {
                        termFreq = -1;
                    }
                    postings.lastDocCodes[termID] = -1;
                }
                else
                {
                    // EOF
                    break;
                }
            }
            else
            {
                int code = freq.ReadVInt32();
                if (!readTermFreq)
                {
                    docID += code;
                    termFreq = -1;
                }
                else
                {
                    docID += (int)((uint)code >> 1);
                    if ((code & 1) != 0)
                    {
                        termFreq = 1;
                    }
                    else
                    {
                        termFreq = freq.ReadVInt32();
                    }
                }

                Debug.Assert(docID != postings.lastDocIDs[termID]);
            }

            docFreq++;
            Debug.Assert(docID < state.SegmentInfo.DocCount, "doc=" + docID + " maxDoc=" + state.SegmentInfo.DocCount);

            // NOTE: we could check here if the docID was
            // deleted, and skip it. However, this is somewhat
            // dangerous because it can yield non-deterministic
            // behavior since we may see the docID before we see
            // the term that caused it to be deleted. this
            // would mean some (but not all) of its postings may
            // make it into the index, which'd alter the docFreq
            // for those terms. We could fix this by doing two
            // passes, ie first sweep marks all del docs, and
            // 2nd sweep does the real flush, but I suspect
            // that'd add too much time to flush.
            visitedDocs.Set(docID);
            postingsConsumer.StartDoc(docID, writeTermFreq ? termFreq : -1);
            if (docID < delDocLimit)
            {
                // Mark it deleted. TODO: we could also skip
                // writing its postings; this would be
                // deterministic (just for this Term's docs).

                // TODO: can we do this reach-around in a cleaner way????
                if (state.LiveDocs == null)
                {
                    state.LiveDocs = docState.docWriter.codec.LiveDocsFormat.NewLiveDocs(state.SegmentInfo.DocCount);
                }
                if (state.LiveDocs.Get(docID))
                {
                    state.DelCountOnFlush++;
                    state.LiveDocs.Clear(docID);
                }
            }

            totalTermFreq += termFreq;

            // Carefully copy over the prox + payload info,
            // changing the format to match Lucene's segment
            // format.

            if (readPositions || readOffsets)
            {
                // we did record positions (& maybe payload) and/or offsets
                int position = 0;
                int offset = 0;
                for (int j = 0; j < termFreq; j++)
                {
                    BytesRef thisPayload;

                    if (readPositions)
                    {
                        int code = prox.ReadVInt32();
                        position += (int)((uint)code >> 1);

                        if ((code & 1) != 0)
                        {
                            // this position has a payload
                            int payloadLength = prox.ReadVInt32();

                            if (payload == null)
                            {
                                payload = new BytesRef();
                                payload.Bytes = new byte[payloadLength];
                            }
                            else if (payload.Bytes.Length < payloadLength)
                            {
                                payload.Grow(payloadLength);
                            }

                            prox.ReadBytes(payload.Bytes, 0, payloadLength);
                            payload.Length = payloadLength;
                            thisPayload = payload;
                        }
                        else
                        {
                            thisPayload = null;
                        }

                        if (readOffsets)
                        {
                            int startOffset = offset + prox.ReadVInt32();
                            int endOffset = startOffset + prox.ReadVInt32();
                            if (writePositions)
                            {
                                if (writeOffsets)
                                {
                                    Debug.Assert(startOffset >= 0 && endOffset >= startOffset, "startOffset=" + startOffset + ",endOffset=" + endOffset + ",offset=" + offset);
                                    postingsConsumer.AddPosition(position, thisPayload, startOffset, endOffset);
                                }
                                else
                                {
                                    postingsConsumer.AddPosition(position, thisPayload, -1, -1);
                                }
                            }
                            offset = startOffset;
                        }
                        else if (writePositions)
                        {
                            postingsConsumer.AddPosition(position, thisPayload, -1, -1);
                        }
                    }
                }
            }

            postingsConsumer.FinishDoc();
        }
        termsConsumer.FinishTerm(text, new TermStats(docFreq, writeTermFreq ? totalTermFreq : -1));
        sumTotalTermFreq += totalTermFreq;
        sumDocFreq += docFreq;
    }

    termsConsumer.Finish(writeTermFreq ? sumTotalTermFreq : -1, sumDocFreq, visitedDocs.Cardinality());
}
public PulsingDocsAndPositionsEnum(FieldInfo fieldInfo)
{
    _indexOptions = fieldInfo.IndexOptions;
    _storePayloads = fieldInfo.HasPayloads;
    _storeOffsets = _indexOptions.CompareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
}
public override int NextDoc()
{
    while (true)
    {
        if (_postings.Eof)
        {
            return _docId = NO_MORE_DOCS;
        }

        var code = _postings.ReadVInt32();
        if (_indexOptions == IndexOptions.DOCS_ONLY)
        {
            _accum += code;
        }
        else
        {
            _accum += (int)((uint)code >> 1); // shift off low bit
            _freq = (code & 1) != 0 ? 1 : _postings.ReadVInt32();

            if (_indexOptions.CompareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0)
            {
                // Skip positions
                if (_storePayloads)
                {
                    for (var pos = 0; pos < _freq; pos++)
                    {
                        var posCode = _postings.ReadVInt32();
                        if ((posCode & 1) != 0)
                        {
                            _payloadLength = _postings.ReadVInt32();
                        }
                        if (_storeOffsets && (_postings.ReadVInt32() & 1) != 0)
                        {
                            // new offset length
                            _postings.ReadVInt32();
                        }
                        if (_payloadLength != 0)
                        {
                            _postings.SkipBytes(_payloadLength);
                        }
                    }
                }
                else
                {
                    for (var pos = 0; pos < _freq; pos++)
                    {
                        // TODO: skipVInt
                        _postings.ReadVInt32();
                        if (_storeOffsets && (_postings.ReadVInt32() & 1) != 0)
                        {
                            // new offset length
                            _postings.ReadVInt32();
                        }
                    }
                }
            }
        }

        if (_liveDocs == null || _liveDocs.Get(_accum))
        {
            return _docId = _accum;
        }
    }
}