private void VerifyCount(IndexReader ir)
{
    Fields fields = MultiFields.GetFields(ir);
    if (fields == null)
    {
        return;
    }
    foreach (string field in fields)
    {
        Terms terms = fields.Terms(field);
        if (terms == null)
        {
            continue;
        }
        int docCount = terms.DocCount;
        FixedBitSet visited = new FixedBitSet(ir.MaxDoc);
        TermsEnum te = terms.Iterator(null);
        while (te.Next() != null)
        {
            DocsEnum de = TestUtil.Docs(Random(), te, null, null, DocsEnum.FLAG_NONE);
            while (de.NextDoc() != DocIdSetIterator.NO_MORE_DOCS)
            {
                visited.Set(de.DocID());
            }
        }
        Assert.AreEqual(visited.Cardinality(), docCount);
    }
}
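// The check above relies on a simple invariant: Terms.DocCount for a field must equal the number
// of distinct documents containing at least one term of that field. A FixedBitSet deduplicates the
// doc IDs seen across all terms, so its Cardinality() is exactly that count. The helper below is a
// hypothetical, stand-alone sketch of that dedup-and-count pattern (it is not called by the test
// above, and it only uses calls that already appear in this file).
private static int CountDistinctDocs(Terms terms, int maxDoc)
{
    FixedBitSet visited = new FixedBitSet(maxDoc);               // one bit per document
    TermsEnum te = terms.Iterator(null);
    while (te.Next() != null)
    {
        DocsEnum de = te.Docs(null, null, DocsEnum.FLAG_NONE);   // doc IDs only, no freqs needed
        while (de.NextDoc() != DocIdSetIterator.NO_MORE_DOCS)
        {
            visited.Set(de.DocID());                             // setting the same bit twice is a no-op
        }
    }
    return visited.Cardinality();                                // number of distinct documents seen
}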
private static void CheckSortedDocValues(string fieldName, AtomicReader reader, SortedDocValues dv, Bits docsWithField)
{
    CheckBinaryDocValues(fieldName, reader, dv, docsWithField);
    int maxOrd = dv.ValueCount - 1;
    FixedBitSet seenOrds = new FixedBitSet(dv.ValueCount);
    int maxOrd2 = -1;
    for (int i = 0; i < reader.MaxDoc; i++)
    {
        int ord = dv.GetOrd(i);
        if (ord == -1)
        {
            if (docsWithField.Get(i))
            {
                throw new Exception("dv for field: " + fieldName + " has -1 ord but is not marked missing for doc: " + i);
            }
        }
        else if (ord < -1 || ord > maxOrd)
        {
            throw new Exception("ord out of bounds: " + ord);
        }
        else
        {
            if (!docsWithField.Get(i))
            {
                throw new Exception("dv for field: " + fieldName + " is missing but has ord=" + ord + " for doc: " + i);
            }
            maxOrd2 = Math.Max(maxOrd2, ord);
            seenOrds.Set(ord);
        }
    }

    if (maxOrd != maxOrd2)
    {
        throw new Exception("dv for field: " + fieldName + " reports wrong maxOrd=" + maxOrd + " but this is not the case: " + maxOrd2);
    }
    if (seenOrds.Cardinality() != dv.ValueCount)
    {
        throw new Exception("dv for field: " + fieldName + " has holes in its ords, valueCount=" + dv.ValueCount + " but only used: " + seenOrds.Cardinality());
    }

    BytesRef lastValue = null;
    BytesRef scratch = new BytesRef();
    for (int i = 0; i <= maxOrd; i++)
    {
        dv.LookupOrd(i, scratch);
        Debug.Assert(scratch.Valid);
        if (lastValue != null)
        {
            if (scratch.CompareTo(lastValue) <= 0)
            {
                throw new Exception("dv for field: " + fieldName + " has ords out of order: " + lastValue + " >=" + scratch);
            }
        }
        lastValue = BytesRef.DeepCopyOf(scratch);
    }
}
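// CheckSortedDocValues verifies three invariants: (1) a document either has ord -1 and is marked
// missing, or has an ord in [0, ValueCount - 1]; (2) every ord is used by at least one document,
// i.e. there are no holes, so seenOrds.Cardinality() == ValueCount; (3) LookupOrd returns values in
// strictly increasing byte order as the ord grows. The helper below is a hypothetical sketch of the
// "no holes" part over a plain array of per-document ords; it is not called by the method above.
private static bool OrdsAreDense(int[] ords, int valueCount)
{
    FixedBitSet seen = new FixedBitSet(valueCount);
    foreach (int ord in ords)
    {
        if (ord == -1)
        {
            continue;                  // missing value, carries no ord
        }
        if (ord < 0 || ord >= valueCount)
        {
            return false;              // out of bounds
        }
        seen.Set(ord);
    }
    // dense means every ord in [0, valueCount) was seen at least once
    return seen.Cardinality() == valueCount;
}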
/// <summary>
/// Checks that the Fields API is consistent with itself.
/// Searcher is optional, to verify with queries. Can be null.
/// </summary>
private static Status.TermIndexStatus CheckFields(Fields fields, Bits liveDocs, int maxDoc, FieldInfos fieldInfos, bool doPrint, bool isVectors, TextWriter infoStream, bool verbose)
{
    // TODO: we should probably return our own stats thing...?!
    Status.TermIndexStatus status = new Status.TermIndexStatus();
    int computedFieldCount = 0;

    if (fields == null)
    {
        Msg(infoStream, "OK [no fields/terms]");
        return status;
    }

    DocsEnum docs = null;
    DocsEnum docsAndFreqs = null;
    DocsAndPositionsEnum postings = null;

    string lastField = null;
    foreach (string field in fields)
    {
        // MultiFieldsEnum relies upon this order...
        if (lastField != null && field.CompareTo(lastField) <= 0)
        {
            throw new Exception("fields out of order: lastField=" + lastField + " field=" + field);
        }
        lastField = field;

        // check that the field is in fieldinfos, and is indexed.
        // TODO: add a separate test to check this for different reader impls
        FieldInfo fieldInfo = fieldInfos.FieldInfo(field);
        if (fieldInfo == null)
        {
            throw new Exception("fieldsEnum inconsistent with fieldInfos, no fieldInfos for: " + field);
        }
        if (!fieldInfo.Indexed)
        {
            throw new Exception("fieldsEnum inconsistent with fieldInfos, isIndexed == false for: " + field);
        }

        // TODO: really the codec should not return a field
        // from FieldsEnum if it has no Terms... but we do
        // this today:
        // assert fields.terms(field) != null;
        computedFieldCount++;

        Terms terms = fields.Terms(field);
        if (terms == null)
        {
            continue;
        }

        bool hasFreqs = terms.HasFreqs();
        bool hasPositions = terms.HasPositions();
        bool hasPayloads = terms.HasPayloads();
        bool hasOffsets = terms.HasOffsets();

        // term vectors cannot omit TF:
        bool expectedHasFreqs = (isVectors || fieldInfo.FieldIndexOptions >= FieldInfo.IndexOptions.DOCS_AND_FREQS);

        if (hasFreqs != expectedHasFreqs)
        {
            throw new Exception("field \"" + field + "\" should have hasFreqs=" + expectedHasFreqs + " but got " + hasFreqs);
        }

        if (hasFreqs == false)
        {
            if (terms.SumTotalTermFreq != -1)
            {
                throw new Exception("field \"" + field + "\" hasFreqs is false, but Terms.getSumTotalTermFreq()=" + terms.SumTotalTermFreq + " (should be -1)");
            }
        }

        if (!isVectors)
        {
            bool expectedHasPositions = fieldInfo.FieldIndexOptions >= FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS;
            if (hasPositions != expectedHasPositions)
            {
                throw new Exception("field \"" + field + "\" should have hasPositions=" + expectedHasPositions + " but got " + hasPositions);
            }

            bool expectedHasPayloads = fieldInfo.HasPayloads();
            if (hasPayloads != expectedHasPayloads)
            {
                throw new Exception("field \"" + field + "\" should have hasPayloads=" + expectedHasPayloads + " but got " + hasPayloads);
            }

            bool expectedHasOffsets = fieldInfo.FieldIndexOptions >= FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS;
            if (hasOffsets != expectedHasOffsets)
            {
                throw new Exception("field \"" + field + "\" should have hasOffsets=" + expectedHasOffsets + " but got " + hasOffsets);
            }
        }

        TermsEnum termsEnum = terms.Iterator(null);

        bool hasOrd = true;
        long termCountStart = status.DelTermCount + status.TermCount;

        BytesRef lastTerm = null;

        IComparer<BytesRef> termComp = terms.Comparator;

        long sumTotalTermFreq = 0;
        long sumDocFreq = 0;
        FixedBitSet visitedDocs = new FixedBitSet(maxDoc);
        while (true)
        {
            BytesRef term = termsEnum.Next();
            if (term == null)
            {
                break;
            }

            Debug.Assert(term.Valid);

            // make sure terms arrive in order according to
            // the comp
            if (lastTerm == null)
            {
                lastTerm = BytesRef.DeepCopyOf(term);
            }
            else
            {
                if (termComp.Compare(lastTerm, term) >= 0)
                {
                    throw new Exception("terms out of order: lastTerm=" + lastTerm + " term=" + term);
                }
                lastTerm.CopyBytes(term);
            }

            int docFreq = termsEnum.DocFreq();
            if (docFreq <= 0)
            {
                throw new Exception("docfreq: " + docFreq + " is out of bounds");
            }
            sumDocFreq += docFreq;

            docs = termsEnum.Docs(liveDocs, docs);
            postings = termsEnum.DocsAndPositions(liveDocs, postings);

            if (hasFreqs == false)
            {
                if (termsEnum.TotalTermFreq() != -1)
                {
                    throw new Exception("field \"" + field + "\" hasFreqs is false, but TermsEnum.totalTermFreq()=" + termsEnum.TotalTermFreq() + " (should be -1)");
                }
            }

            if (hasOrd)
            {
                long ord = -1;
                try
                {
                    ord = termsEnum.Ord();
                }
                catch (System.NotSupportedException)
                {
                    hasOrd = false;
                }

                if (hasOrd)
                {
                    long ordExpected = status.DelTermCount + status.TermCount - termCountStart;
                    if (ord != ordExpected)
                    {
                        throw new Exception("ord mismatch: TermsEnum has ord=" + ord + " vs actual=" + ordExpected);
                    }
                }
            }

            DocsEnum docs2;
            if (postings != null)
            {
                docs2 = postings;
            }
            else
            {
                docs2 = docs;
            }

            int lastDoc = -1;
            int docCount = 0;
            long totalTermFreq = 0;
            while (true)
            {
                int doc = docs2.NextDoc();
                if (doc == DocIdSetIterator.NO_MORE_DOCS)
                {
                    break;
                }
                status.TotFreq++;
                visitedDocs.Set(doc);
                int freq = -1;
                if (hasFreqs)
                {
                    freq = docs2.Freq();
                    if (freq <= 0)
                    {
                        throw new Exception("term " + term + ": doc " + doc + ": freq " + freq + " is out of bounds");
                    }
                    status.TotPos += freq;
                    totalTermFreq += freq;
                }
                else
                {
                    // When a field didn't index freq, it must
                    // consistently "lie" and pretend that freq was
                    // 1:
                    if (docs2.Freq() != 1)
                    {
                        throw new Exception("term " + term + ": doc " + doc + ": freq " + freq + " != 1 when Terms.hasFreqs() is false");
                    }
                }
                docCount++;

                if (doc <= lastDoc)
                {
                    throw new Exception("term " + term + ": doc " + doc + " <= lastDoc " + lastDoc);
                }
                if (doc >= maxDoc)
                {
                    throw new Exception("term " + term + ": doc " + doc + " >= maxDoc " + maxDoc);
                }

                lastDoc = doc;

                int lastPos = -1;
                int lastOffset = 0;
                if (hasPositions)
                {
                    for (int j = 0; j < freq; j++)
                    {
                        int pos = postings.NextPosition();

                        if (pos < 0)
                        {
                            throw new Exception("term " + term + ": doc " + doc + ": pos " + pos + " is out of bounds");
                        }
                        if (pos < lastPos)
                        {
                            throw new Exception("term " + term + ": doc " + doc + ": pos " + pos + " < lastPos " + lastPos);
                        }
                        lastPos = pos;
                        BytesRef payload = postings.Payload;
                        if (payload != null)
                        {
                            Debug.Assert(payload.Valid);
                        }
                        if (payload != null && payload.Length < 1)
                        {
                            throw new Exception("term " + term + ": doc " + doc + ": pos " + pos + " payload length is out of bounds " + payload.Length);
                        }
                        if (hasOffsets)
                        {
                            int startOffset = postings.StartOffset();
                            int endOffset = postings.EndOffset();
                            // NOTE: we cannot enforce any bounds whatsoever on vectors... they were a free-for-all before?
                            // but for offsets in the postings lists these checks are fine: they were always enforced by IndexWriter
                            if (!isVectors)
                            {
                                if (startOffset < 0)
                                {
                                    throw new Exception("term " + term + ": doc " + doc + ": pos " + pos + ": startOffset " + startOffset + " is out of bounds");
                                }
                                if (startOffset < lastOffset)
                                {
                                    throw new Exception("term " + term + ": doc " + doc + ": pos " + pos + ": startOffset " + startOffset + " < lastStartOffset " + lastOffset);
                                }
                                if (endOffset < 0)
                                {
                                    throw new Exception("term " + term + ": doc " + doc + ": pos " + pos + ": endOffset " + endOffset + " is out of bounds");
                                }
                                if (endOffset < startOffset)
                                {
                                    throw new Exception("term " + term + ": doc " + doc + ": pos " + pos + ": endOffset " + endOffset + " < startOffset " + startOffset);
                                }
                            }
                            lastOffset = startOffset;
                        }
                    }
                }
            }

            if (docCount != 0)
            {
                status.TermCount++;
            }
            else
            {
                status.DelTermCount++;
            }

            long totalTermFreq2 = termsEnum.TotalTermFreq();
            bool hasTotalTermFreq = hasFreqs && totalTermFreq2 != -1;

            // Re-count if there are deleted docs:
            if (liveDocs != null)
            {
                if (hasFreqs)
                {
                    DocsEnum docsNoDel = termsEnum.Docs(null, docsAndFreqs);
                    docCount = 0;
                    totalTermFreq = 0;
                    while (docsNoDel.NextDoc() != DocIdSetIterator.NO_MORE_DOCS)
                    {
                        visitedDocs.Set(docsNoDel.DocID());
                        docCount++;
                        totalTermFreq += docsNoDel.Freq();
                    }
                }
                else
                {
                    DocsEnum docsNoDel = termsEnum.Docs(null, docs, DocsEnum.FLAG_NONE);
                    docCount = 0;
                    totalTermFreq = -1;
                    while (docsNoDel.NextDoc() != DocIdSetIterator.NO_MORE_DOCS)
                    {
                        visitedDocs.Set(docsNoDel.DocID());
                        docCount++;
                    }
                }
            }

            if (docCount != docFreq)
            {
                throw new Exception("term " + term + " docFreq=" + docFreq + " != tot docs w/o deletions " + docCount);
            }
            if (hasTotalTermFreq)
            {
                if (totalTermFreq2 <= 0)
                {
                    throw new Exception("totalTermFreq: " + totalTermFreq2 + " is out of bounds");
                }
                sumTotalTermFreq += totalTermFreq;
                if (totalTermFreq != totalTermFreq2)
                {
                    throw new Exception("term " + term + " totalTermFreq=" + totalTermFreq2 + " != recomputed totalTermFreq=" + totalTermFreq);
                }
            }

            // Test skipping
            if (hasPositions)
            {
                for (int idx = 0; idx < 7; idx++)
                {
                    int skipDocID = (int)(((idx + 1) * (long)maxDoc) / 8);
                    postings = termsEnum.DocsAndPositions(liveDocs, postings);
                    int docID = postings.Advance(skipDocID);
                    if (docID == DocIdSetIterator.NO_MORE_DOCS)
                    {
                        break;
                    }
                    else
                    {
                        if (docID < skipDocID)
                        {
                            throw new Exception("term " + term + ": advance(docID=" + skipDocID + ") returned docID=" + docID);
                        }
                        int freq = postings.Freq();
                        if (freq <= 0)
                        {
                            throw new Exception("termFreq " + freq + " is out of bounds");
                        }
                        int lastPosition = -1;
                        int lastOffset = 0;
                        for (int posUpto = 0; posUpto < freq; posUpto++)
                        {
                            int pos = postings.NextPosition();

                            if (pos < 0)
                            {
                                throw new Exception("position " + pos + " is out of bounds");
                            }
                            if (pos < lastPosition)
                            {
                                throw new Exception("position " + pos + " is < lastPosition " + lastPosition);
                            }
                            lastPosition = pos;
                            if (hasOffsets)
                            {
                                int startOffset = postings.StartOffset();
                                int endOffset = postings.EndOffset();
                                // NOTE: we cannot enforce any bounds whatsoever on vectors... they were a free-for-all before?
                                // but for offsets in the postings lists these checks are fine: they were always enforced by IndexWriter
                                if (!isVectors)
                                {
                                    if (startOffset < 0)
                                    {
                                        throw new Exception("term " + term + ": doc " + docID + ": pos " + pos + ": startOffset " + startOffset + " is out of bounds");
                                    }
                                    if (startOffset < lastOffset)
                                    {
                                        throw new Exception("term " + term + ": doc " + docID + ": pos " + pos + ": startOffset " + startOffset + " < lastStartOffset " + lastOffset);
                                    }
                                    if (endOffset < 0)
                                    {
                                        throw new Exception("term " + term + ": doc " + docID + ": pos " + pos + ": endOffset " + endOffset + " is out of bounds");
                                    }
                                    if (endOffset < startOffset)
                                    {
                                        throw new Exception("term " + term + ": doc " + docID + ": pos " + pos + ": endOffset " + endOffset + " < startOffset " + startOffset);
                                    }
                                }
                                lastOffset = startOffset;
                            }
                        }

                        int nextDocID = postings.NextDoc();
                        if (nextDocID == DocIdSetIterator.NO_MORE_DOCS)
                        {
                            break;
                        }
                        if (nextDocID <= docID)
                        {
                            throw new Exception("term " + term + ": advance(docID=" + skipDocID + "), then .next() returned docID=" + nextDocID + " vs prev docID=" + docID);
                        }
                    }
                }
            }
            else
            {
                for (int idx = 0; idx < 7; idx++)
                {
                    int skipDocID = (int)(((idx + 1) * (long)maxDoc) / 8);
                    docs = termsEnum.Docs(liveDocs, docs, DocsEnum.FLAG_NONE);
                    int docID = docs.Advance(skipDocID);
                    if (docID == DocIdSetIterator.NO_MORE_DOCS)
                    {
                        break;
                    }
                    else
                    {
                        if (docID < skipDocID)
                        {
                            throw new Exception("term " + term + ": advance(docID=" + skipDocID + ") returned docID=" + docID);
                        }
                        int nextDocID = docs.NextDoc();
                        if (nextDocID == DocIdSetIterator.NO_MORE_DOCS)
                        {
                            break;
                        }
                        if (nextDocID <= docID)
                        {
                            throw new Exception("term " + term + ": advance(docID=" + skipDocID + "), then .next() returned docID=" + nextDocID + " vs prev docID=" + docID);
                        }
                    }
                }
            }
        }

        Terms fieldTerms = fields.Terms(field);
        if (fieldTerms == null)
        {
            // Unusual: the FieldsEnum returned a field but
            // the Terms for that field is null; this should
            // only happen if it's a ghost field (field with
            // no terms, eg there used to be terms but all
            // docs got deleted and then merged away):
        }
        else
        {
            if (fieldTerms is BlockTreeTermsReader.FieldReader)
            {
                BlockTreeTermsReader.Stats stats = ((BlockTreeTermsReader.FieldReader)fieldTerms).ComputeStats();
                Debug.Assert(stats != null);
                if (status.BlockTreeStats == null)
                {
                    status.BlockTreeStats = new Dictionary<string, BlockTreeTermsReader.Stats>();
                }
                status.BlockTreeStats[field] = stats;
            }

            if (sumTotalTermFreq != 0)
            {
                long v = fields.Terms(field).SumTotalTermFreq;
                if (v != -1 && sumTotalTermFreq != v)
                {
                    throw new Exception("sumTotalTermFreq for field " + field + "=" + v + " != recomputed sumTotalTermFreq=" + sumTotalTermFreq);
                }
            }

            if (sumDocFreq != 0)
            {
                long v = fields.Terms(field).SumDocFreq;
                if (v != -1 && sumDocFreq != v)
                {
                    throw new Exception("sumDocFreq for field " + field + "=" + v + " != recomputed sumDocFreq=" + sumDocFreq);
                }
            }

            if (fieldTerms != null)
            {
                int v = fieldTerms.DocCount;
                if (v != -1 && visitedDocs.Cardinality() != v)
                {
                    throw new Exception("docCount for field " + field + "=" + v + " != recomputed docCount=" + visitedDocs.Cardinality());
                }
            }

            // Test seek to last term:
            if (lastTerm != null)
            {
                if (termsEnum.SeekCeil(lastTerm) != TermsEnum.SeekStatus.FOUND)
                {
                    throw new Exception("seek to last term " + lastTerm + " failed");
                }

                int expectedDocFreq = termsEnum.DocFreq();
                DocsEnum d = termsEnum.Docs(null, null, DocsEnum.FLAG_NONE);
                int docFreq = 0;
                while (d.NextDoc() != DocIdSetIterator.NO_MORE_DOCS)
                {
                    docFreq++;
                }
                if (docFreq != expectedDocFreq)
                {
                    throw new Exception("docFreq for last term " + lastTerm + "=" + expectedDocFreq + " != recomputed docFreq=" + docFreq);
                }
            }

            // check unique term count
            long termCount = -1;

            if ((status.DelTermCount + status.TermCount) - termCountStart > 0)
            {
                termCount = fields.Terms(field).Size();

                if (termCount != -1 && termCount != status.DelTermCount + status.TermCount - termCountStart)
                {
                    throw new Exception("termCount mismatch " + (status.DelTermCount + termCount) + " vs " + (status.TermCount - termCountStart));
                }
            }

            // Test seeking by ord
            if (hasOrd && status.TermCount - termCountStart > 0)
            {
                int seekCount = (int)Math.Min(10000L, termCount);
                if (seekCount > 0)
                {
                    BytesRef[] seekTerms = new BytesRef[seekCount];

                    // Seek by ord
                    for (int i = seekCount - 1; i >= 0; i--)
                    {
                        long ord = i * (termCount / seekCount);
                        termsEnum.SeekExact(ord);
                        seekTerms[i] = BytesRef.DeepCopyOf(termsEnum.Term());
                    }

                    // Seek by term
                    long totDocCount = 0;
                    for (int i = seekCount - 1; i >= 0; i--)
                    {
                        if (termsEnum.SeekCeil(seekTerms[i]) != TermsEnum.SeekStatus.FOUND)
                        {
                            throw new Exception("seek to existing term " + seekTerms[i] + " failed");
                        }

                        docs = termsEnum.Docs(liveDocs, docs, DocsEnum.FLAG_NONE);
                        if (docs == null)
                        {
                            throw new Exception("null DocsEnum from to existing term " + seekTerms[i]);
                        }

                        while (docs.NextDoc() != DocIdSetIterator.NO_MORE_DOCS)
                        {
                            totDocCount++;
                        }
                    }

                    long totDocCountNoDeletes = 0;
                    long totDocFreq = 0;
                    for (int i = 0; i < seekCount; i++)
                    {
                        if (!termsEnum.SeekExact(seekTerms[i]))
                        {
                            throw new Exception("seek to existing term " + seekTerms[i] + " failed");
                        }

                        totDocFreq += termsEnum.DocFreq();
                        docs = termsEnum.Docs(null, docs, DocsEnum.FLAG_NONE);
                        if (docs == null)
                        {
                            throw new Exception("null DocsEnum from to existing term " + seekTerms[i]);
                        }

                        while (docs.NextDoc() != DocIdSetIterator.NO_MORE_DOCS)
                        {
                            totDocCountNoDeletes++;
                        }
                    }

                    if (totDocCount > totDocCountNoDeletes)
                    {
                        throw new Exception("more postings with deletes=" + totDocCount + " than without=" + totDocCountNoDeletes);
                    }

                    if (totDocCountNoDeletes != totDocFreq)
                    {
                        throw new Exception("docfreqs=" + totDocFreq + " != recomputed docfreqs=" + totDocCountNoDeletes);
                    }
                }
            }
        }
    }

    int fieldCount = fields.Size;

    if (fieldCount != -1)
    {
        if (fieldCount < 0)
        {
            throw new Exception("invalid fieldCount: " + fieldCount);
        }
        if (fieldCount != computedFieldCount)
        {
            throw new Exception("fieldCount mismatch " + fieldCount + " vs recomputed field count " + computedFieldCount);
        }
    }

    // for most implementations, this is boring (just the sum across all fields)
    // but codecs that don't work per-field like preflex actually implement this,
    // but don't implement it on Terms, so the check isn't redundant.
    long uniqueTermCountAllFields = fields.UniqueTermCount;

    if (uniqueTermCountAllFields != -1 && status.TermCount + status.DelTermCount != uniqueTermCountAllFields)
    {
        throw new Exception("termCount mismatch " + uniqueTermCountAllFields + " vs " + (status.TermCount + status.DelTermCount));
    }

    if (doPrint)
    {
        Msg(infoStream, "OK [" + status.TermCount + " terms; " + status.TotFreq + " terms/docs pairs; " + status.TotPos + " tokens]");
    }

    if (verbose && status.BlockTreeStats != null && infoStream != null && status.TermCount > 0)
    {
        foreach (KeyValuePair<string, BlockTreeTermsReader.Stats> ent in status.BlockTreeStats)
        {
            infoStream.WriteLine(" field \"" + ent.Key + "\":");
            infoStream.WriteLine(" " + ent.Value.ToString().Replace("\n", "\n "));
        }
    }

    return status;
}
public virtual void TestBooleanScorerMax()
{
    Directory dir = NewDirectory();
    RandomIndexWriter riw = new RandomIndexWriter(Random(), dir, NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random())));

    int docCount = AtLeast(10000);
    for (int i = 0; i < docCount; i++)
    {
        Document doc = new Document();
        doc.Add(NewField("field", "a", TextField.TYPE_NOT_STORED));
        riw.AddDocument(doc);
    }

    riw.ForceMerge(1);
    IndexReader r = riw.Reader;
    riw.Dispose();

    IndexSearcher s = NewSearcher(r);
    BooleanQuery bq = new BooleanQuery();
    bq.Add(new TermQuery(new Term("field", "a")), BooleanClause.Occur.SHOULD);
    bq.Add(new TermQuery(new Term("field", "a")), BooleanClause.Occur.SHOULD);

    Weight w = s.CreateNormalizedWeight(bq);

    Assert.AreEqual(1, s.IndexReader.Leaves.Count);
    BulkScorer scorer = w.BulkScorer(s.IndexReader.Leaves[0], false, null);

    FixedBitSet hits = new FixedBitSet(docCount);
    AtomicInteger end = new AtomicInteger();
    Collector c = new CollectorAnonymousInnerClassHelper(this, scorer, hits, end);

    while (end.Get() < docCount)
    {
        int inc = TestUtil.NextInt(Random(), 1, 1000);
        end.AddAndGet(inc);
        scorer.Score(c, end.Get());
    }

    Assert.AreEqual(docCount, hits.Cardinality());

    r.Dispose();
    dir.Dispose();
}
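// Note on the loop above: BulkScorer.Score(collector, max) scores documents strictly below the
// given "max" boundary and keeps track of where it stopped, so calling it repeatedly with an
// increasing boundary walks the whole segment in chunks. The test grows the boundary by random
// steps of 1..1000; the CollectorAnonymousInnerClassHelper (defined alongside this test, not shown
// here) records each collected doc in the shared FixedBitSet, and the final assertion
// hits.Cardinality() == docCount verifies that no document is skipped at a chunk boundary.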
// maxAllowed = the "highest" we can index, but we will still
// randomly index at lower IndexOption
private FieldsProducer BuildIndex(Directory dir, FieldInfo.IndexOptions maxAllowed, bool allowPayloads, bool alwaysTestMax)
{
    Codec codec = Codec;
    SegmentInfo segmentInfo = new SegmentInfo(dir, Constants.LUCENE_MAIN_VERSION, "_0", MaxDoc, false, codec, null);

    int maxIndexOption = Enum.GetValues(typeof(FieldInfo.IndexOptions)).Cast<FieldInfo.IndexOptions>().ToList().IndexOf(maxAllowed);
    if (VERBOSE)
    {
        Console.WriteLine("\nTEST: now build index");
    }

    int maxIndexOptionNoOffsets = Enum.GetValues(typeof(FieldInfo.IndexOptions)).Cast<FieldInfo.IndexOptions>().ToList().IndexOf(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);

    // TODO use allowPayloads

    var newFieldInfoArray = new FieldInfo[Fields.Count];
    for (int fieldUpto = 0; fieldUpto < Fields.Count; fieldUpto++)
    {
        FieldInfo oldFieldInfo = FieldInfos.FieldInfo(fieldUpto);

        string pf = TestUtil.GetPostingsFormat(codec, oldFieldInfo.Name);
        int fieldMaxIndexOption;
        if (DoesntSupportOffsets.Contains(pf))
        {
            fieldMaxIndexOption = Math.Min(maxIndexOptionNoOffsets, maxIndexOption);
        }
        else
        {
            fieldMaxIndexOption = maxIndexOption;
        }

        // Randomly picked the IndexOptions to index this
        // field with:
        FieldInfo.IndexOptions indexOptions = Enum.GetValues(typeof(FieldInfo.IndexOptions)).Cast<FieldInfo.IndexOptions>().ToArray()[alwaysTestMax ? fieldMaxIndexOption : Random().Next(1 + fieldMaxIndexOption)];
        bool doPayloads = indexOptions.CompareTo(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0 && allowPayloads;

        newFieldInfoArray[fieldUpto] = new FieldInfo(oldFieldInfo.Name, true, fieldUpto, false, false, doPayloads, indexOptions, null, DocValuesType.NUMERIC, null);
    }

    FieldInfos newFieldInfos = new FieldInfos(newFieldInfoArray);

    // Estimate that flushed segment size will be 25% of
    // what we use in RAM:
    long bytes = TotalPostings * 8 + TotalPayloadBytes;

    SegmentWriteState writeState = new SegmentWriteState(null, dir, segmentInfo, newFieldInfos, 32, null, new IOContext(new FlushInfo(MaxDoc, bytes)));
    FieldsConsumer fieldsConsumer = codec.PostingsFormat().FieldsConsumer(writeState);

    foreach (KeyValuePair<string, SortedDictionary<BytesRef, long>> fieldEnt in Fields)
    {
        string field = fieldEnt.Key;
        IDictionary<BytesRef, long> terms = fieldEnt.Value;

        FieldInfo fieldInfo = newFieldInfos.FieldInfo(field);

        FieldInfo.IndexOptions? indexOptions = fieldInfo.FieldIndexOptions;

        if (VERBOSE)
        {
            Console.WriteLine("field=" + field + " indexOptions=" + indexOptions);
        }

        bool doFreq = indexOptions >= FieldInfo.IndexOptions.DOCS_AND_FREQS;
        bool doPos = indexOptions >= FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS;
        bool doPayloads = indexOptions >= FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS && allowPayloads;
        bool doOffsets = indexOptions >= FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS;

        TermsConsumer termsConsumer = fieldsConsumer.AddField(fieldInfo);
        long sumTotalTF = 0;
        long sumDF = 0;
        FixedBitSet seenDocs = new FixedBitSet(MaxDoc);
        foreach (KeyValuePair<BytesRef, long> termEnt in terms)
        {
            BytesRef term = termEnt.Key;
            SeedPostings postings = GetSeedPostings(term.Utf8ToString(), termEnt.Value, false, maxAllowed);
            if (VERBOSE)
            {
                Console.WriteLine(" term=" + field + ":" + term.Utf8ToString() + " docFreq=" + postings.DocFreq + " seed=" + termEnt.Value);
            }

            PostingsConsumer postingsConsumer = termsConsumer.StartTerm(term);
            long totalTF = 0;
            int docID = 0;
            while ((docID = postings.NextDoc()) != DocsEnum.NO_MORE_DOCS)
            {
                int freq = postings.Freq();
                if (VERBOSE)
                {
                    Console.WriteLine(" " + postings.Upto + ": docID=" + docID + " freq=" + postings.Freq_Renamed);
                }
                postingsConsumer.StartDoc(docID, doFreq ? postings.Freq_Renamed : -1);
                seenDocs.Set(docID);
                if (doPos)
                {
                    totalTF += postings.Freq_Renamed;
                    for (int posUpto = 0; posUpto < freq; posUpto++)
                    {
                        int pos = postings.NextPosition();
                        BytesRef payload = postings.Payload;

                        if (VERBOSE)
                        {
                            if (doPayloads)
                            {
                                Console.WriteLine(" pos=" + pos + " payload=" + (payload == null ? "null" : payload.Length + " bytes"));
                            }
                            else
                            {
                                Console.WriteLine(" pos=" + pos);
                            }
                        }
                        postingsConsumer.AddPosition(pos, doPayloads ? payload : null, doOffsets ? postings.StartOffset() : -1, doOffsets ? postings.EndOffset() : -1);
                    }
                }
                else if (doFreq)
                {
                    totalTF += freq;
                }
                else
                {
                    totalTF++;
                }
                postingsConsumer.FinishDoc();
            }
            termsConsumer.FinishTerm(term, new TermStats(postings.DocFreq, doFreq ? totalTF : -1));
            sumTotalTF += totalTF;
            sumDF += postings.DocFreq;
        }

        termsConsumer.Finish(doFreq ? sumTotalTF : -1, sumDF, seenDocs.Cardinality());
    }

    fieldsConsumer.Dispose();

    if (VERBOSE)
    {
        Console.WriteLine("TEST: after indexing: files=");
        foreach (string file in dir.ListAll())
        {
            Console.WriteLine(" " + file + ": " + dir.FileLength(file) + " bytes");
        }
    }

    CurrentFieldInfos = newFieldInfos;

    SegmentReadState readState = new SegmentReadState(dir, segmentInfo, newFieldInfos, IOContext.READ, 1);

    return codec.PostingsFormat().FieldsProducer(readState);
}
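// BuildIndex drives the codec's write path directly, without an IndexWriter. The call sequence it
// follows for every field, term, and document is the one visible in the loop above, summarized here
// (variable names fc/tc/pc are illustrative only):
//
//   FieldsConsumer fc = codec.PostingsFormat().FieldsConsumer(writeState);
//   TermsConsumer tc = fc.AddField(fieldInfo);              // once per field
//   PostingsConsumer pc = tc.StartTerm(term);               // once per term
//   pc.StartDoc(docID, freqOrMinus1);                       // once per posting
//   pc.AddPosition(pos, payloadOrNull, startOff, endOff);   // freq times, only if positions are indexed
//   pc.FinishDoc();
//   tc.FinishTerm(term, new TermStats(docFreq, totalTFOrMinus1));
//   tc.Finish(sumTotalTFOrMinus1, sumDocFreq, docCount);    // per-field statistics
//   fc.Dispose();                                           // closes the segment's postings files
//
// The -1 sentinels are passed whenever a statistic is not tracked for the chosen IndexOptions
// (e.g. totalTermFreq when freqs are omitted), which matches the invariants CheckFields verifies.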
public override DocIdSet GetDocIdSet(AtomicReaderContext context, Bits acceptDocs)
{
    Assert.IsNull(acceptDocs, "acceptDocs should be null, as we have an index without deletions");

    FixedBitSet set = new FixedBitSet(context.Reader.MaxDoc);
    int docBase = context.DocBase;
    int limit = docBase + context.Reader.MaxDoc;
    for (int index = 0; index < Docs.Length; index++)
    {
        int docId = Docs[index];
        if (docId >= docBase && docId < limit)
        {
            set.Set(docId - docBase);
        }
    }
    return set.Cardinality() == 0 ? null : set;
}
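// A FixedBitSet doubles as the returned DocIdSet here, so the filter can hand back the bit set it
// just filled. Two details of the method above are worth noting: Docs holds global doc IDs, so each
// one is shifted by context.DocBase into the segment-local space before its bit is set (for a
// hypothetical segment with docBase 10 and MaxDoc 10, global doc 17 becomes local bit 7), and
// returning null instead of an empty set is the conventional way for a filter to say "no documents
// match in this segment".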