// DocValues updates
private void ApplyDocValuesUpdates<T1>(IEnumerable<T1> updates, ReadersAndUpdates rld, SegmentReader reader, DocValuesFieldUpdates.Container dvUpdatesContainer)
    where T1 : DocValuesUpdate
{
    lock (this)
    {
        Fields fields = reader.Fields;
        if (fields == null)
        {
            // this reader has no postings
            return;
        }

        // TODO: we can process the updates per DV field, from last to first so that
        // if multiple terms affect same document for the same field, we add an update
        // only once (that of the last term). To do that, we can keep a bitset which
        // marks which documents have already been updated. So e.g. if term T1
        // updates doc 7, and then we process term T2 and it updates doc 7 as well,
        // we don't apply the update since we know T1 came last and therefore wins
        // the update.
        // We can also use that bitset as 'liveDocs' to pass to TermEnum.docs(), so
        // that these documents aren't even returned.

        string currentField = null;
        TermsEnum termsEnum = null;
        DocsEnum docs = null;

        //System.out.println(Thread.currentThread().getName() + " numericDVUpdate reader=" + reader);
        foreach (DocValuesUpdate update in updates)
        {
            Term term = update.term;
            int limit = update.docIDUpto;

            // TODO: we traverse the terms in update order (not term order) so that we
            // apply the updates in the correct order, i.e. if two terms update the
            // same document, the last one that came in wins, irrespective of the
            // terms lexical order.
            // we can apply the updates in terms order if we keep an updatesGen (and
            // increment it with every update) and attach it to each NumericUpdate. Note
            // that we cannot rely only on docIDUpto because an app may send two updates
            // which will get same docIDUpto, yet will still need to respect the order
            // those updates arrived.

            if (!string.Equals(term.Field, currentField, StringComparison.Ordinal))
            {
                // if we change the code to process updates in terms order, enable this assert
                // assert currentField == null || currentField.CompareToOrdinal(term.Field) < 0;
                currentField = term.Field;
                Terms terms = fields.GetTerms(currentField);
                if (terms != null)
                {
                    termsEnum = terms.GetIterator(termsEnum);
                }
                else
                {
                    termsEnum = null;
                    continue; // no terms in that field
                }
            }

            if (termsEnum == null)
            {
                continue;
            }
            // System.out.println("  term=" + term);

            if (termsEnum.SeekExact(term.Bytes))
            {
                // we don't need term frequencies for this
                DocsEnum docsEnum = termsEnum.Docs(rld.LiveDocs, docs, DocsFlags.NONE);
                //System.out.println("BDS: got docsEnum=" + docsEnum);

                DocValuesFieldUpdates dvUpdates = dvUpdatesContainer.GetUpdates(update.field, update.type);
                if (dvUpdates == null)
                {
                    dvUpdates = dvUpdatesContainer.NewUpdates(update.field, update.type, reader.MaxDoc);
                }

                int doc;
                while ((doc = docsEnum.NextDoc()) != DocIdSetIterator.NO_MORE_DOCS)
                {
                    //System.out.println(Thread.currentThread().getName() + " numericDVUpdate term=" + term + " doc=" + docID);
                    if (doc >= limit)
                    {
                        break; // no more docs that can be updated for this term
                    }
                    dvUpdates.Add(doc, update.value);
                }
            }
        }
    }
}
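// Example (not part of the original source): for context, the updates resolved
// above originate from the public IndexWriter API. This is a minimal sketch,
// assuming the Lucene.NET 4.8 IndexWriter.UpdateNumericDocValue overload; the
// "id"/"popularity" field names and the RAMDirectory are hypothetical choices
// used only for illustration.
using Lucene.Net.Analysis.Standard;
using Lucene.Net.Documents;
using Lucene.Net.Index;
using Lucene.Net.Store;
using Lucene.Net.Util;

public static class NumericDocValuesUpdateExample
{
    public static void Run()
    {
        using (var dir = new RAMDirectory())
        using (var writer = new IndexWriter(dir,
            new IndexWriterConfig(LuceneVersion.LUCENE_48,
                new StandardAnalyzer(LuceneVersion.LUCENE_48))))
        {
            var doc = new Document();
            doc.Add(new StringField("id", "doc-1", Field.Store.NO));
            doc.Add(new NumericDocValuesField("popularity", 1L));
            writer.AddDocument(doc);
            writer.Commit();

            // Queue a doc-values-only update for every document matching the term.
            // The update is buffered and later resolved to docIDs by code such as
            // ApplyDocValuesUpdates above, without re-indexing the document.
            writer.UpdateNumericDocValue(new Term("id", "doc-1"), "popularity", 42L);
            writer.Commit();
        }
    }
}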
/// <summary>
/// Returns the number of documents containing the
/// <paramref name="term"/>. This method returns 0 if the term or
/// field does not exist. This method does not take into
/// account deleted documents that have not yet been merged
/// away. </summary>
/// <seealso cref="TermsEnum.DocFreq"/>
public abstract int DocFreq(Term term);
/// <summary>
/// Returns the total number of occurrences of <paramref name="term"/> across all
/// documents (the sum of the Freq for each doc that has this term). This
/// will be -1 if the codec doesn't support this measure. Note that, like other
/// term measures, this measure does not take deleted documents into account.
/// </summary>
public abstract long TotalTermFreq(Term term);
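// Example (not part of the original source): a minimal sketch of how a caller
// typically consumes the two statistics declared above, assuming the
// Lucene.Net.Index types are in scope; the "body"/"lucene" term is hypothetical.
// DocFreq returns 0 when the term or field is missing, TotalTermFreq returns -1
// when the codec does not record it, and neither accounts for deleted documents
// that have not yet been merged away.
public static class TermStatisticsExample
{
    public static void PrintTermStats(IndexReader reader)
    {
        Term term = new Term("body", "lucene");
        int docsWithTerm = reader.DocFreq(term);            // number of docs containing the term
        long totalOccurrences = reader.TotalTermFreq(term); // sum of per-doc freqs, or -1
        System.Console.WriteLine("docFreq=" + docsWithTerm + " totalTermFreq=" + totalOccurrences);
    }
}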
internal void Flush(string fieldName, FieldsConsumer consumer, SegmentWriteState state)
{
    if (!fieldInfo.IsIndexed)
    {
        return; // nothing to flush, don't bother the codec with the unindexed field
    }

    TermsConsumer termsConsumer = consumer.AddField(fieldInfo);
    IComparer<BytesRef> termComp = termsConsumer.Comparer;

    // CONFUSING: this.indexOptions holds the index options
    // that were current when we first saw this field. But
    // it's possible this has changed, eg when other
    // documents are indexed that cause a "downgrade" of the
    // IndexOptions. So we must decode the in-RAM buffer
    // according to this.indexOptions, but then write the
    // new segment to the directory according to
    // currentFieldIndexOptions:
    IndexOptions currentFieldIndexOptions = fieldInfo.IndexOptions;
    if (Debugging.AssertsEnabled) Debugging.Assert(currentFieldIndexOptions != IndexOptions.NONE);

    // LUCENENET specific - to avoid boxing, changed from CompareTo() to IndexOptionsComparer.Compare()
    bool writeTermFreq = IndexOptionsComparer.Default.Compare(currentFieldIndexOptions, IndexOptions.DOCS_AND_FREQS) >= 0;
    bool writePositions = IndexOptionsComparer.Default.Compare(currentFieldIndexOptions, IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
    bool writeOffsets = IndexOptionsComparer.Default.Compare(currentFieldIndexOptions, IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;

    bool readTermFreq = this.hasFreq;
    bool readPositions = this.hasProx;
    bool readOffsets = this.hasOffsets;

    //System.out.println("flush readTF=" + readTermFreq + " readPos=" + readPositions + " readOffs=" + readOffsets);

    // Make sure FieldInfo.update is working correctly!:
    if (Debugging.AssertsEnabled)
    {
        Debugging.Assert(!writeTermFreq || readTermFreq);
        Debugging.Assert(!writePositions || readPositions);
        Debugging.Assert(!writeOffsets || readOffsets);
        Debugging.Assert(!writeOffsets || writePositions);
    }

    IDictionary<Term, int?> segDeletes;
    if (state.SegUpdates != null && state.SegUpdates.terms.Count > 0)
    {
        segDeletes = state.SegUpdates.terms;
    }
    else
    {
        segDeletes = null;
    }

    int[] termIDs = termsHashPerField.SortPostings(termComp);
    int numTerms = termsHashPerField.bytesHash.Count;
    BytesRef text = new BytesRef();
    FreqProxPostingsArray postings = (FreqProxPostingsArray)termsHashPerField.postingsArray;
    ByteSliceReader freq = new ByteSliceReader();
    ByteSliceReader prox = new ByteSliceReader();

    FixedBitSet visitedDocs = new FixedBitSet(state.SegmentInfo.DocCount);
    long sumTotalTermFreq = 0;
    long sumDocFreq = 0;

    Term protoTerm = new Term(fieldName);
    for (int i = 0; i < numTerms; i++)
    {
        int termID = termIDs[i];

        // Get BytesRef
        int textStart = postings.textStarts[termID];
        termsHashPerField.bytePool.SetBytesRef(text, textStart);

        termsHashPerField.InitReader(freq, termID, 0);
        if (readPositions || readOffsets)
        {
            termsHashPerField.InitReader(prox, termID, 1);
        }

        // TODO: really TermsHashPerField should take over most
        // of this loop, including merge sort of terms from
        // multiple threads and interacting with the
        // TermsConsumer, only calling out to us (passing us the
        // DocsConsumer) to handle delivery of docs/positions

        PostingsConsumer postingsConsumer = termsConsumer.StartTerm(text);

        int? delDocLimit;
        if (segDeletes != null)
        {
            protoTerm.Bytes = text;
            if (segDeletes.TryGetValue(protoTerm, out int? docIDUpto) && docIDUpto != null)
            {
                delDocLimit = docIDUpto;
            }
            else
            {
                delDocLimit = 0;
            }
        }
        else
        {
            delDocLimit = 0;
        }

        // Now termStates has numToMerge FieldMergeStates
        // which all share the same term. Now we must
        // interleave the docID streams.
        int docFreq = 0;
        long totalTermFreq = 0;
        int docID = 0;

        while (true)
        {
            //System.out.println("  cycle");
            int termFreq;
            if (freq.Eof())
            {
                if (postings.lastDocCodes[termID] != -1)
                {
                    // Return last doc
                    docID = postings.lastDocIDs[termID];
                    if (readTermFreq)
                    {
                        termFreq = postings.termFreqs[termID];
                    }
                    else
                    {
                        termFreq = -1;
                    }
                    postings.lastDocCodes[termID] = -1;
                }
                else
                {
                    // EOF
                    break;
                }
            }
            else
            {
                int code = freq.ReadVInt32();
                if (!readTermFreq)
                {
                    docID += code;
                    termFreq = -1;
                }
                else
                {
                    docID += (int)((uint)code >> 1);
                    if ((code & 1) != 0)
                    {
                        termFreq = 1;
                    }
                    else
                    {
                        termFreq = freq.ReadVInt32();
                    }
                }

                if (Debugging.AssertsEnabled) Debugging.Assert(docID != postings.lastDocIDs[termID]);
            }

            docFreq++;
            if (Debugging.AssertsEnabled) Debugging.Assert(docID < state.SegmentInfo.DocCount, "doc={0} maxDoc={1}", docID, state.SegmentInfo.DocCount);

            // NOTE: we could check here if the docID was
            // deleted, and skip it. However, this is somewhat
            // dangerous because it can yield non-deterministic
            // behavior since we may see the docID before we see
            // the term that caused it to be deleted. this
            // would mean some (but not all) of its postings may
            // make it into the index, which'd alter the docFreq
            // for those terms. We could fix this by doing two
            // passes, ie first sweep marks all del docs, and
            // 2nd sweep does the real flush, but I suspect
            // that'd add too much time to flush.
            visitedDocs.Set(docID);
            postingsConsumer.StartDoc(docID, writeTermFreq ? termFreq : -1);
            if (docID < delDocLimit)
            {
                // Mark it deleted. TODO: we could also skip
                // writing its postings; this would be
                // deterministic (just for this Term's docs).

                // TODO: can we do this reach-around in a cleaner way????
                if (state.LiveDocs == null)
                {
                    state.LiveDocs = docState.docWriter.codec.LiveDocsFormat.NewLiveDocs(state.SegmentInfo.DocCount);
                }
                if (state.LiveDocs.Get(docID))
                {
                    state.DelCountOnFlush++;
                    state.LiveDocs.Clear(docID);
                }
            }

            totalTermFreq += termFreq;

            // Carefully copy over the prox + payload info,
            // changing the format to match Lucene's segment
            // format.

            if (readPositions || readOffsets)
            {
                // we did record positions (& maybe payload) and/or offsets
                int position = 0;
                int offset = 0;
                for (int j = 0; j < termFreq; j++)
                {
                    BytesRef thisPayload;

                    if (readPositions)
                    {
                        int code = prox.ReadVInt32();
                        position += (int)((uint)code >> 1);

                        if ((code & 1) != 0)
                        {
                            // this position has a payload
                            int payloadLength = prox.ReadVInt32();

                            if (payload == null)
                            {
                                payload = new BytesRef();
                                payload.Bytes = new byte[payloadLength];
                            }
                            else if (payload.Bytes.Length < payloadLength)
                            {
                                payload.Grow(payloadLength);
                            }

                            prox.ReadBytes(payload.Bytes, 0, payloadLength);
                            payload.Length = payloadLength;
                            thisPayload = payload;
                        }
                        else
                        {
                            thisPayload = null;
                        }

                        if (readOffsets)
                        {
                            int startOffset = offset + prox.ReadVInt32();
                            int endOffset = startOffset + prox.ReadVInt32();
                            if (writePositions)
                            {
                                if (writeOffsets)
                                {
                                    if (Debugging.AssertsEnabled) Debugging.Assert(startOffset >= 0 && endOffset >= startOffset, "startOffset={0},endOffset={1},offset={2}", startOffset, endOffset, offset);
                                    postingsConsumer.AddPosition(position, thisPayload, startOffset, endOffset);
                                }
                                else
                                {
                                    postingsConsumer.AddPosition(position, thisPayload, -1, -1);
                                }
                            }
                            offset = startOffset;
                        }
                        else if (writePositions)
                        {
                            postingsConsumer.AddPosition(position, thisPayload, -1, -1);
                        }
                    }
                }
            }
            postingsConsumer.FinishDoc();
        }
        termsConsumer.FinishTerm(text, new TermStats(docFreq, writeTermFreq ? totalTermFreq : -1));
        sumTotalTermFreq += totalTermFreq;
        sumDocFreq += docFreq;
    }

    termsConsumer.Finish(writeTermFreq ? sumTotalTermFreq : -1, sumDocFreq, visitedDocs.Cardinality());
}
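// Example (not part of the original source): a standalone sketch of the
// doc-delta/frequency packing that the while-loop above decodes from the
// in-RAM freq stream when frequencies were recorded (the readTermFreq case).
// Each entry starts with a VInt "code": the upper bits hold the docID delta,
// and a set low bit means termFreq == 1; otherwise the frequency follows as a
// second VInt. Any Lucene.Net.Store.DataInput-style reader (ByteSliceReader is
// one such reader) can stand in for the input here; names are illustrative.
public static class FreqStreamDecodingSketch
{
    // Returns the decoded docID for one entry and reports its term frequency.
    public static int ReadNext(Lucene.Net.Store.DataInput input, int lastDocID, out int termFreq)
    {
        int code = input.ReadVInt32();
        int docID = lastDocID + (int)((uint)code >> 1);      // upper bits: docID delta
        termFreq = (code & 1) != 0 ? 1 : input.ReadVInt32(); // low bit set => freq == 1
        return docID;
    }
}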