/// <summary>
/// Builds the matching doc set for one segment: a document is accepted when the
/// ordinal of its value in <c>field</c> is one of the ordinals resolved from <c>terms</c>.
/// </summary>
/// <param name="context">Segment whose reader supplies the terms index and <c>MaxDoc</c>.</param>
/// <param name="acceptDocs">Passed through unchanged to the returned <c>FieldCacheDocIdSet</c>.</param>
/// <returns>A <c>FieldCacheDocIdSet</c> whose match callback consults the ordinal bit set.</returns>
public override DocIdSet GetDocIdSet(AtomicReaderContext context, IBits acceptDocs)
{
    SortedDocValues termsIndex = FieldCache.GetTermsIndex((context.AtomicReader), field);

    // One bit per ordinal of this segment; a set bit marks a requested term.
    FixedBitSet wantedOrds = new FixedBitSet(termsIndex.ValueCount);
    for (int t = 0; t < terms.Length; t++)
    {
        int termOrd = termsIndex.LookupTerm(terms[t]);
        if (termOrd < 0)
        {
            // Negative lookup result: nothing is marked for this term.
            continue;
        }
        wantedOrds.Set(termOrd);
    }

    return new FieldCacheDocIdSet(context.Reader.MaxDoc, acceptDocs, docId =>
    {
        int docOrd = termsIndex.GetOrd(docId);
        // An ordinal of -1 means the document is missing a value.
        return docOrd != -1 && wantedOrds.Get(docOrd);
    });
}
/// <summary>
/// Returns a bit set sized to the segment's <c>MaxDoc</c> in which only <c>Doc</c>
/// is set, unless <paramref name="acceptDocs"/> is non-null and rejects <c>Doc</c>,
/// in which case the set is empty.
/// </summary>
/// <param name="context">Segment context; only its reader's <c>MaxDoc</c> is read.</param>
/// <param name="acceptDocs">Optional filter consulted for <c>Doc</c>; may be null.</param>
public override DocIdSet GetDocIdSet(AtomicReaderContext context, Bits acceptDocs)
{
    var single = new FixedBitSet(context.Reader.MaxDoc);
    single.Set(Doc);

    bool filteredOut = acceptDocs != null && !acceptDocs.Get(Doc);
    if (filteredOut)
    {
        single.Clear(Doc);
    }

    return single;
}
/// <summary>
/// Produces a doc set for the segment that contains <c>Doc</c> alone; the bit is
/// removed again when a non-null <paramref name="acceptDocs"/> does not accept it.
/// </summary>
/// <param name="context">Segment context; only its reader's <c>MaxDoc</c> is read.</param>
/// <param name="acceptDocs">Optional filter consulted for <c>Doc</c>; may be null.</param>
public override DocIdSet GetDocIdSet(AtomicReaderContext context, Bits acceptDocs)
{
    FixedBitSet hit = new FixedBitSet(context.Reader.MaxDoc);
    hit.Set(Doc);

    if (acceptDocs == null)
    {
        // No filter supplied: the single bit stays set.
        return hit;
    }

    if (!acceptDocs.Get(Doc))
    {
        hit.Clear(Doc);
    }
    return hit;
}
/// <summary>
/// Resolves every entry of <c>terms</c> to its ordinal in the segment's terms index
/// for <c>field</c>, records the found ordinals in a bit set, and hands both to a
/// <c>FieldCacheDocIdSetAnonymousInnerClassHelper</c>.
/// </summary>
/// <param name="context">Segment whose reader supplies the terms index and <c>MaxDoc</c>.</param>
/// <param name="acceptDocs">Passed through unchanged to the returned helper.</param>
public override DocIdSet GetDocIdSet(AtomicReaderContext context, IBits acceptDocs)
{
    SortedDocValues termsIndex = FieldCache.GetTermsIndex((context.AtomicReader), field);

    // One bit per ordinal of this segment; a set bit marks a requested term.
    FixedBitSet wantedOrds = new FixedBitSet(termsIndex.ValueCount);
    for (int t = 0; t < terms.Length; t++)
    {
        int termOrd = termsIndex.LookupTerm(terms[t]);
        if (termOrd < 0)
        {
            // Negative lookup result: nothing is marked for this term.
            continue;
        }
        wantedOrds.Set(termOrd);
    }

    int maxDoc = context.Reader.MaxDoc;
    return new FieldCacheDocIdSetAnonymousInnerClassHelper(this, maxDoc, acceptDocs, termsIndex, wantedOrds);
}
/// <summary>
/// For every field of <paramref name="ir"/> that has terms, walks all postings,
/// collects the distinct doc ids and asserts that their number equals the
/// field's reported <c>DocCount</c>.
/// </summary>
/// <param name="ir">Reader to verify; nothing is checked when it exposes no fields.</param>
private void VerifyCount(IndexReader ir)
{
    Fields allFields = MultiFields.GetFields(ir);
    if (allFields == null)
    {
        return;
    }

    foreach (string fieldName in allFields)
    {
        Terms fieldTerms = allFields.Terms(fieldName);
        if (fieldTerms != null)
        {
            int reportedDocCount = fieldTerms.DocCount;
            FixedBitSet seenDocs = new FixedBitSet(ir.MaxDoc);

            TermsEnum termsEnum = fieldTerms.Iterator(null);
            while (termsEnum.Next() != null)
            {
                DocsEnum postings = TestUtil.Docs(Random(), termsEnum, null, null, DocsEnum.FLAG_NONE);
                while (postings.NextDoc() != DocIdSetIterator.NO_MORE_DOCS)
                {
                    seenDocs.Set(postings.DocID());
                }
            }

            Assert.AreEqual(seenDocs.Cardinality(), reportedDocCount);
        }
    }
}
/// <summary>
/// Validates a sorted doc-values field. After running the binary doc-values checks it
/// verifies that every document's ordinal is either -1 (and the doc is not marked as
/// having the field) or within <c>[0, ValueCount - 1]</c> (and the doc is marked),
/// that the highest ordinal seen is <c>ValueCount - 1</c>, that every ordinal is used,
/// and that the values looked up by ascending ordinal are strictly increasing.
/// </summary>
/// <param name="fieldName">Field name, used in failure messages and forwarded to the binary check.</param>
/// <param name="reader">Reader whose <c>MaxDoc</c> bounds the document scan.</param>
/// <param name="dv">Sorted doc values under test.</param>
/// <param name="docsWithField">Bits telling which documents have a value.</param>
/// <exception cref="Exception">Thrown on the first inconsistency found.</exception>
private static void CheckSortedDocValues(string fieldName, AtomicReader reader, SortedDocValues dv, Bits docsWithField)
{
    CheckBinaryDocValues(fieldName, reader, dv, docsWithField);

    int valueCount = dv.ValueCount;
    int highestOrd = valueCount - 1;
    FixedBitSet usedOrds = new FixedBitSet(valueCount);
    int highestSeenOrd = -1;

    for (int docId = 0; docId < reader.MaxDoc; docId++)
    {
        int ord = dv.GetOrd(docId);

        if (ord == -1)
        {
            // -1 must coincide with "document has no value".
            if (docsWithField.Get(docId))
            {
                throw new Exception("dv for field: " + fieldName + " has -1 ord but is not marked missing for doc: " + docId);
            }
            continue;
        }

        if (ord < -1 || ord > highestOrd)
        {
            throw new Exception("ord out of bounds: " + ord);
        }

        if (!docsWithField.Get(docId))
        {
            throw new Exception("dv for field: " + fieldName + " is missing but has ord=" + ord + " for doc: " + docId);
        }

        if (ord > highestSeenOrd)
        {
            highestSeenOrd = ord;
        }
        usedOrds.Set(ord);
    }

    if (highestOrd != highestSeenOrd)
    {
        throw new Exception("dv for field: " + fieldName + " reports wrong maxOrd=" + highestOrd + " but this is not the case: " + highestSeenOrd);
    }

    int usedOrdCount = usedOrds.Cardinality();
    if (usedOrdCount != valueCount)
    {
        throw new Exception("dv for field: " + fieldName + " has holes in its ords, valueCount=" + valueCount + " but only used: " + usedOrdCount);
    }

    // Values must be strictly increasing in ordinal order.
    BytesRef previous = null;
    BytesRef current = new BytesRef();
    for (int ord = 0; ord <= highestOrd; ord++)
    {
        dv.LookupOrd(ord, current);
        Debug.Assert(current.Valid);

        if (previous != null && current.CompareTo(previous) <= 0)
        {
            throw new Exception("dv for field: " + fieldName + " has ords out of order: " + previous + " >=" + current);
        }

        previous = BytesRef.DeepCopyOf(current);
    }
}
/// <summary>
/// Checks that the Fields API is consistent with itself: field order, agreement with
/// <paramref name="fieldInfos"/>, term order, per-term and per-field statistics,
/// postings iteration, skipping (<c>Advance</c>) and seeking by term and by ord.
/// </summary>
/// <param name="fields">Fields to check; when null, "OK [no fields/terms]" is reported and an empty status is returned.</param>
/// <param name="liveDocs">Live-docs bits handed to the postings enums; when non-null, postings are re-counted without deletions.</param>
/// <param name="maxDoc">Exclusive upper bound for doc ids; also sizes the visited-docs bit set.</param>
/// <param name="fieldInfos">Field metadata every enumerated field must be present in (and be indexed).</param>
/// <param name="doPrint">When true, a summary line is written through <c>Msg</c>.</param>
/// <param name="isVectors">When true, freqs are always expected, the positions/payloads/offsets expectations are skipped, and offset bounds are not enforced.</param>
/// <param name="infoStream">Destination for messages; the block-tree stats dump is skipped when it is null.</param>
/// <param name="verbose">When true (and stats exist), per-field block-tree stats are written to <paramref name="infoStream"/>.</param>
/// <returns>The accumulated term index status.</returns>
/// <exception cref="Exception">Thrown on the first inconsistency found.</exception>
private static Status.TermIndexStatus CheckFields(Fields fields, Bits liveDocs, int maxDoc, FieldInfos fieldInfos, bool doPrint, bool isVectors, TextWriter infoStream, bool verbose)
{
    // TODO: we should probably return our own stats thing...?!
    Status.TermIndexStatus status = new Status.TermIndexStatus();
    int computedFieldCount = 0;

    if (fields == null)
    {
        Msg(infoStream, "OK [no fields/terms]");
        return status;
    }

    // Enum instances are threaded through the loops below so they can be passed back for reuse.
    DocsEnum docs = null;
    DocsEnum docsAndFreqs = null;
    DocsAndPositionsEnum postings = null;

    string lastField = null;
    foreach (string field in fields)
    {
        // MultiFieldsEnum relies upon this order...
        if (lastField != null && field.CompareTo(lastField) <= 0)
        {
            throw new Exception("fields out of order: lastField=" + lastField + " field=" + field);
        }
        lastField = field;

        // check that the field is in fieldinfos, and is indexed.
        // TODO: add a separate test to check this for different reader impls
        FieldInfo fieldInfo = fieldInfos.FieldInfo(field);
        if (fieldInfo == null)
        {
            throw new Exception("fieldsEnum inconsistent with fieldInfos, no fieldInfos for: " + field);
        }
        if (!fieldInfo.Indexed)
        {
            throw new Exception("fieldsEnum inconsistent with fieldInfos, isIndexed == false for: " + field);
        }

        // TODO: really the codec should not return a field
        // from FieldsEnum if it has no Terms... but we do
        // this today:
        // assert fields.terms(field) != null;
        computedFieldCount++;

        Terms terms = fields.Terms(field);
        if (terms == null)
        {
            continue;
        }

        bool hasFreqs = terms.HasFreqs();
        bool hasPositions = terms.HasPositions();
        bool hasPayloads = terms.HasPayloads();
        bool hasOffsets = terms.HasOffsets();

        // term vectors cannot omit TF:
        bool expectedHasFreqs = (isVectors || fieldInfo.FieldIndexOptions >= FieldInfo.IndexOptions.DOCS_AND_FREQS);

        if (hasFreqs != expectedHasFreqs)
        {
            throw new Exception("field \"" + field + "\" should have hasFreqs=" + expectedHasFreqs + " but got " + hasFreqs);
        }

        if (hasFreqs == false)
        {
            if (terms.SumTotalTermFreq != -1)
            {
                throw new Exception("field \"" + field + "\" hasFreqs is false, but Terms.getSumTotalTermFreq()=" + terms.SumTotalTermFreq + " (should be -1)");
            }
        }

        // The positions/payloads/offsets flags are only compared against FieldInfo for postings, not for vectors.
        if (!isVectors)
        {
            bool expectedHasPositions = fieldInfo.FieldIndexOptions >= FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS;
            if (hasPositions != expectedHasPositions)
            {
                throw new Exception("field \"" + field + "\" should have hasPositions=" + expectedHasPositions + " but got " + hasPositions);
            }

            bool expectedHasPayloads = fieldInfo.HasPayloads();
            if (hasPayloads != expectedHasPayloads)
            {
                throw new Exception("field \"" + field + "\" should have hasPayloads=" + expectedHasPayloads + " but got " + hasPayloads);
            }

            bool expectedHasOffsets = fieldInfo.FieldIndexOptions >= FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS;
            if (hasOffsets != expectedHasOffsets)
            {
                throw new Exception("field \"" + field + "\" should have hasOffsets=" + expectedHasOffsets + " but got " + hasOffsets);
            }
        }

        TermsEnum termsEnum = terms.Iterator(null);

        bool hasOrd = true;
        // Terms counted before this field; differences against it give this field's term count.
        long termCountStart = status.DelTermCount + status.TermCount;

        BytesRef lastTerm = null;

        IComparer<BytesRef> termComp = terms.Comparator;

        long sumTotalTermFreq = 0;
        long sumDocFreq = 0;
        // Every doc id seen in any posting of this field (with or without deletions).
        FixedBitSet visitedDocs = new FixedBitSet(maxDoc);
        while (true)
        {
            BytesRef term = termsEnum.Next();
            if (term == null)
            {
                break;
            }

            Debug.Assert(term.Valid);

            // make sure terms arrive in order according to
            // the comp
            if (lastTerm == null)
            {
                lastTerm = BytesRef.DeepCopyOf(term);
            }
            else
            {
                if (termComp.Compare(lastTerm, term) >= 0)
                {
                    throw new Exception("terms out of order: lastTerm=" + lastTerm + " term=" + term);
                }
                lastTerm.CopyBytes(term);
            }

            int docFreq = termsEnum.DocFreq();
            if (docFreq <= 0)
            {
                throw new Exception("docfreq: " + docFreq + " is out of bounds");
            }
            sumDocFreq += docFreq;

            docs = termsEnum.Docs(liveDocs, docs);
            postings = termsEnum.DocsAndPositions(liveDocs, postings);

            if (hasFreqs == false)
            {
                if (termsEnum.TotalTermFreq() != -1)
                {
                    throw new Exception("field \"" + field + "\" hasFreqs is false, but TermsEnum.totalTermFreq()=" + termsEnum.TotalTermFreq() + " (should be -1)");
                }
            }

            if (hasOrd)
            {
                long ord = -1;
                try
                {
                    ord = termsEnum.Ord();
                }
                catch (System.NotSupportedException uoe)
                {
                    // Ord() is unsupported by this enum: stop checking ords for the rest of the field.
                    hasOrd = false;
                }

                if (hasOrd)
                {
                    long ordExpected = status.DelTermCount + status.TermCount - termCountStart;
                    if (ord != ordExpected)
                    {
                        throw new Exception("ord mismatch: TermsEnum has ord=" + ord + " vs actual=" + ordExpected);
                    }
                }
            }

            // Iterate through the positions enum when one was returned, otherwise the plain docs enum.
            DocsEnum docs2;
            if (postings != null)
            {
                docs2 = postings;
            }
            else
            {
                docs2 = docs;
            }

            int lastDoc = -1;
            int docCount = 0;
            long totalTermFreq = 0;
            while (true)
            {
                int doc = docs2.NextDoc();
                if (doc == DocIdSetIterator.NO_MORE_DOCS)
                {
                    break;
                }
                status.TotFreq++;
                visitedDocs.Set(doc);
                int freq = -1;
                if (hasFreqs)
                {
                    freq = docs2.Freq();
                    if (freq <= 0)
                    {
                        throw new Exception("term " + term + ": doc " + doc + ": freq " + freq + " is out of bounds");
                    }
                    status.TotPos += freq;
                    totalTermFreq += freq;
                }
                else
                {
                    // When a field didn't index freq, it must
                    // consistently "lie" and pretend that freq was
                    // 1:
                    // NOTE(review): the message below prints the local 'freq', which is still -1 on this
                    // path, not the value returned by docs2.Freq() - confirm whether that is intended.
                    if (docs2.Freq() != 1)
                    {
                        throw new Exception("term " + term + ": doc " + doc + ": freq " + freq + " != 1 when Terms.hasFreqs() is false");
                    }
                }
                docCount++;

                if (doc <= lastDoc)
                {
                    throw new Exception("term " + term + ": doc " + doc + " <= lastDoc " + lastDoc);
                }
                if (doc >= maxDoc)
                {
                    throw new Exception("term " + term + ": doc " + doc + " >= maxDoc " + maxDoc);
                }

                lastDoc = doc;

                int lastPos = -1;
                int lastOffset = 0;
                if (hasPositions)
                {
                    for (int j = 0; j < freq; j++)
                    {
                        int pos = postings.NextPosition();

                        if (pos < 0)
                        {
                            throw new Exception("term " + term + ": doc " + doc + ": pos " + pos + " is out of bounds");
                        }
                        if (pos < lastPos)
                        {
                            throw new Exception("term " + term + ": doc " + doc + ": pos " + pos + " < lastPos " + lastPos);
                        }
                        lastPos = pos;
                        BytesRef payload = postings.Payload;
                        if (payload != null)
                        {
                            Debug.Assert(payload.Valid);
                        }
                        if (payload != null && payload.Length < 1)
                        {
                            throw new Exception("term " + term + ": doc " + doc + ": pos " + pos + " payload length is out of bounds " + payload.Length);
                        }
                        if (hasOffsets)
                        {
                            int startOffset = postings.StartOffset();
                            int endOffset = postings.EndOffset();
                            // NOTE: we cannot enforce any bounds whatsoever on vectors... they were a free-for-all before?
                            // but for offsets in the postings lists these checks are fine: they were always enforced by IndexWriter
                            if (!isVectors)
                            {
                                if (startOffset < 0)
                                {
                                    throw new Exception("term " + term + ": doc " + doc + ": pos " + pos + ": startOffset " + startOffset + " is out of bounds");
                                }
                                if (startOffset < lastOffset)
                                {
                                    throw new Exception("term " + term + ": doc " + doc + ": pos " + pos + ": startOffset " + startOffset + " < lastStartOffset " + lastOffset);
                                }
                                if (endOffset < 0)
                                {
                                    throw new Exception("term " + term + ": doc " + doc + ": pos " + pos + ": endOffset " + endOffset + " is out of bounds");
                                }
                                if (endOffset < startOffset)
                                {
                                    throw new Exception("term " + term + ": doc " + doc + ": pos " + pos + ": endOffset " + endOffset + " < startOffset " + startOffset);
                                }
                            }
                            lastOffset = startOffset;
                        }
                    }
                }
            }

            // A term with no docs in the iteration above is counted as a deleted term.
            if (docCount != 0)
            {
                status.TermCount++;
            }
            else
            {
                status.DelTermCount++;
            }

            long totalTermFreq2 = termsEnum.TotalTermFreq();
            bool hasTotalTermFreq = hasFreqs && totalTermFreq2 != -1;

            // Re-count if there are deleted docs:
            if (liveDocs != null)
            {
                if (hasFreqs)
                {
                    DocsEnum docsNoDel = termsEnum.Docs(null, docsAndFreqs);
                    docCount = 0;
                    totalTermFreq = 0;
                    while (docsNoDel.NextDoc() != DocIdSetIterator.NO_MORE_DOCS)
                    {
                        visitedDocs.Set(docsNoDel.DocID());
                        docCount++;
                        totalTermFreq += docsNoDel.Freq();
                    }
                }
                else
                {
                    DocsEnum docsNoDel = termsEnum.Docs(null, docs, DocsEnum.FLAG_NONE);
                    docCount = 0;
                    totalTermFreq = -1;
                    while (docsNoDel.NextDoc() != DocIdSetIterator.NO_MORE_DOCS)
                    {
                        visitedDocs.Set(docsNoDel.DocID());
                        docCount++;
                    }
                }
            }

            if (docCount != docFreq)
            {
                throw new Exception("term " + term + " docFreq=" + docFreq + " != tot docs w/o deletions " + docCount);
            }
            if (hasTotalTermFreq)
            {
                if (totalTermFreq2 <= 0)
                {
                    throw new Exception("totalTermFreq: " + totalTermFreq2 + " is out of bounds");
                }
                sumTotalTermFreq += totalTermFreq;
                if (totalTermFreq != totalTermFreq2)
                {
                    throw new Exception("term " + term + " totalTermFreq=" + totalTermFreq2 + " != recomputed totalTermFreq=" + totalTermFreq);
                }
            }

            // Test skipping
            // Targets are 1/8, 2/8, ... 7/8 of maxDoc; the long cast keeps the product from overflowing int.
            if (hasPositions)
            {
                for (int idx = 0; idx < 7; idx++)
                {
                    int skipDocID = (int)(((idx + 1) * (long)maxDoc) / 8);
                    postings = termsEnum.DocsAndPositions(liveDocs, postings);
                    int docID = postings.Advance(skipDocID);
                    if (docID == DocIdSetIterator.NO_MORE_DOCS)
                    {
                        break;
                    }
                    else
                    {
                        if (docID < skipDocID)
                        {
                            throw new Exception("term " + term + ": advance(docID=" + skipDocID + ") returned docID=" + docID);
                        }
                        int freq = postings.Freq();
                        if (freq <= 0)
                        {
                            throw new Exception("termFreq " + freq + " is out of bounds");
                        }
                        int lastPosition = -1;
                        int lastOffset = 0;
                        for (int posUpto = 0; posUpto < freq; posUpto++)
                        {
                            int pos = postings.NextPosition();

                            if (pos < 0)
                            {
                                throw new Exception("position " + pos + " is out of bounds");
                            }
                            if (pos < lastPosition)
                            {
                                throw new Exception("position " + pos + " is < lastPosition " + lastPosition);
                            }
                            lastPosition = pos;
                            if (hasOffsets)
                            {
                                int startOffset = postings.StartOffset();
                                int endOffset = postings.EndOffset();
                                // NOTE: we cannot enforce any bounds whatsoever on vectors... they were a free-for-all before?
                                // but for offsets in the postings lists these checks are fine: they were always enforced by IndexWriter
                                if (!isVectors)
                                {
                                    if (startOffset < 0)
                                    {
                                        throw new Exception("term " + term + ": doc " + docID + ": pos " + pos + ": startOffset " + startOffset + " is out of bounds");
                                    }
                                    if (startOffset < lastOffset)
                                    {
                                        throw new Exception("term " + term + ": doc " + docID + ": pos " + pos + ": startOffset " + startOffset + " < lastStartOffset " + lastOffset);
                                    }
                                    if (endOffset < 0)
                                    {
                                        throw new Exception("term " + term + ": doc " + docID + ": pos " + pos + ": endOffset " + endOffset + " is out of bounds");
                                    }
                                    if (endOffset < startOffset)
                                    {
                                        throw new Exception("term " + term + ": doc " + docID + ": pos " + pos + ": endOffset " + endOffset + " < startOffset " + startOffset);
                                    }
                                }
                                lastOffset = startOffset;
                            }
                        }

                        int nextDocID = postings.NextDoc();
                        if (nextDocID == DocIdSetIterator.NO_MORE_DOCS)
                        {
                            break;
                        }
                        if (nextDocID <= docID)
                        {
                            throw new Exception("term " + term + ": advance(docID=" + skipDocID + "), then .next() returned docID=" + nextDocID + " vs prev docID=" + docID);
                        }
                    }
                }
            }
            else
            {
                for (int idx = 0; idx < 7; idx++)
                {
                    int skipDocID = (int)(((idx + 1) * (long)maxDoc) / 8);
                    docs = termsEnum.Docs(liveDocs, docs, DocsEnum.FLAG_NONE);
                    int docID = docs.Advance(skipDocID);
                    if (docID == DocIdSetIterator.NO_MORE_DOCS)
                    {
                        break;
                    }
                    else
                    {
                        if (docID < skipDocID)
                        {
                            throw new Exception("term " + term + ": advance(docID=" + skipDocID + ") returned docID=" + docID);
                        }
                        int nextDocID = docs.NextDoc();
                        if (nextDocID == DocIdSetIterator.NO_MORE_DOCS)
                        {
                            break;
                        }
                        if (nextDocID <= docID)
                        {
                            throw new Exception("term " + term + ": advance(docID=" + skipDocID + "), then .next() returned docID=" + nextDocID + " vs prev docID=" + docID);
                        }
                    }
                }
            }
        }

        Terms fieldTerms = fields.Terms(field);
        if (fieldTerms == null)
        {
            // Unusual: the FieldsEnum returned a field but
            // the Terms for that field is null; this should
            // only happen if it's a ghost field (field with
            // no terms, eg there used to be terms but all
            // docs got deleted and then merged away):
        }
        else
        {
            if (fieldTerms is BlockTreeTermsReader.FieldReader)
            {
                BlockTreeTermsReader.Stats stats = ((BlockTreeTermsReader.FieldReader)fieldTerms).ComputeStats();
                Debug.Assert(stats != null);
                if (status.BlockTreeStats == null)
                {
                    status.BlockTreeStats = new Dictionary<string, BlockTreeTermsReader.Stats>();
                }
                status.BlockTreeStats[field] = stats;
            }

            // A reported value of -1 is skipped by each of the three statistic comparisons below.
            if (sumTotalTermFreq != 0)
            {
                long v = fields.Terms(field).SumTotalTermFreq;
                if (v != -1 && sumTotalTermFreq != v)
                {
                    throw new Exception("sumTotalTermFreq for field " + field + "=" + v + " != recomputed sumTotalTermFreq=" + sumTotalTermFreq);
                }
            }

            if (sumDocFreq != 0)
            {
                long v = fields.Terms(field).SumDocFreq;
                if (v != -1 && sumDocFreq != v)
                {
                    throw new Exception("sumDocFreq for field " + field + "=" + v + " != recomputed sumDocFreq=" + sumDocFreq);
                }
            }

            // NOTE(review): this null test is always true here, since we are in the else branch of
            // 'fieldTerms == null'.
            if (fieldTerms != null)
            {
                int v = fieldTerms.DocCount;
                if (v != -1 && visitedDocs.Cardinality() != v)
                {
                    throw new Exception("docCount for field " + field + "=" + v + " != recomputed docCount=" + visitedDocs.Cardinality());
                }
            }

            // Test seek to last term:
            if (lastTerm != null)
            {
                if (termsEnum.SeekCeil(lastTerm) != TermsEnum.SeekStatus.FOUND)
                {
                    throw new Exception("seek to last term " + lastTerm + " failed");
                }

                int expectedDocFreq = termsEnum.DocFreq();
                DocsEnum d = termsEnum.Docs(null, null, DocsEnum.FLAG_NONE);
                int docFreq = 0;
                while (d.NextDoc() != DocIdSetIterator.NO_MORE_DOCS)
                {
                    docFreq++;
                }
                if (docFreq != expectedDocFreq)
                {
                    throw new Exception("docFreq for last term " + lastTerm + "=" + expectedDocFreq + " != recomputed docFreq=" + docFreq);
                }
            }

            // check unique term count
            long termCount = -1;

            if ((status.DelTermCount + status.TermCount) - termCountStart > 0)
            {
                termCount = fields.Terms(field).Size();

                // NOTE(review): the condition compares termCount with (DelTermCount + TermCount - termCountStart),
                // but the message prints (DelTermCount + termCount) vs (TermCount - termCountStart) - the
                // reported operands do not match the compared ones; confirm before relying on the text.
                if (termCount != -1 && termCount != status.DelTermCount + status.TermCount - termCountStart)
                {
                    throw new Exception("termCount mismatch " + (status.DelTermCount + termCount) + " vs " + (status.TermCount - termCountStart));
                }
            }

            // Test seeking by ord
            if (hasOrd && status.TermCount - termCountStart > 0)
            {
                // At most 10000 probes, spread evenly over the field's ords.
                int seekCount = (int)Math.Min(10000L, termCount);
                if (seekCount > 0)
                {
                    BytesRef[] seekTerms = new BytesRef[seekCount];

                    // Seek by ord
                    for (int i = seekCount - 1; i >= 0; i--)
                    {
                        long ord = i * (termCount / seekCount);
                        termsEnum.SeekExact(ord);
                        seekTerms[i] = BytesRef.DeepCopyOf(termsEnum.Term());
                    }

                    // Seek by term
                    long totDocCount = 0;
                    for (int i = seekCount - 1; i >= 0; i--)
                    {
                        if (termsEnum.SeekCeil(seekTerms[i]) != TermsEnum.SeekStatus.FOUND)
                        {
                            throw new Exception("seek to existing term " + seekTerms[i] + " failed");
                        }

                        docs = termsEnum.Docs(liveDocs, docs, DocsEnum.FLAG_NONE);
                        if (docs == null)
                        {
                            throw new Exception("null DocsEnum from to existing term " + seekTerms[i]);
                        }

                        while (docs.NextDoc() != DocIdSetIterator.NO_MORE_DOCS)
                        {
                            totDocCount++;
                        }
                    }

                    long totDocCountNoDeletes = 0;
                    long totDocFreq = 0;
                    for (int i = 0; i < seekCount; i++)
                    {
                        if (!termsEnum.SeekExact(seekTerms[i]))
                        {
                            throw new Exception("seek to existing term " + seekTerms[i] + " failed");
                        }

                        totDocFreq += termsEnum.DocFreq();
                        docs = termsEnum.Docs(null, docs, DocsEnum.FLAG_NONE);
                        if (docs == null)
                        {
                            throw new Exception("null DocsEnum from to existing term " + seekTerms[i]);
                        }

                        while (docs.NextDoc() != DocIdSetIterator.NO_MORE_DOCS)
                        {
                            totDocCountNoDeletes++;
                        }
                    }

                    if (totDocCount > totDocCountNoDeletes)
                    {
                        throw new Exception("more postings with deletes=" + totDocCount + " than without=" + totDocCountNoDeletes);
                    }

                    if (totDocCountNoDeletes != totDocFreq)
                    {
                        throw new Exception("docfreqs=" + totDocFreq + " != recomputed docfreqs=" + totDocCountNoDeletes);
                    }
                }
            }
        }
    }

    int fieldCount = fields.Size;

    // A reported field count of -1 is not checked.
    if (fieldCount != -1)
    {
        if (fieldCount < 0)
        {
            throw new Exception("invalid fieldCount: " + fieldCount);
        }
        if (fieldCount != computedFieldCount)
        {
            throw new Exception("fieldCount mismatch " + fieldCount + " vs recomputed field count " + computedFieldCount);
        }
    }

    // for most implementations, this is boring (just the sum across all fields)
    // but codecs that don't work per-field like preflex actually implement this,
    // but don't implement it on Terms, so the check isn't redundant.
    long uniqueTermCountAllFields = fields.UniqueTermCount;

    if (uniqueTermCountAllFields != -1 && status.TermCount + status.DelTermCount != uniqueTermCountAllFields)
    {
        throw new Exception("termCount mismatch " + uniqueTermCountAllFields + " vs " + (status.TermCount + status.DelTermCount));
    }

    if (doPrint)
    {
        Msg(infoStream, "OK [" + status.TermCount + " terms; " + status.TotFreq + " terms/docs pairs; " + status.TotPos + " tokens]");
    }

    if (verbose && status.BlockTreeStats != null && infoStream != null && status.TermCount > 0)
    {
        foreach (KeyValuePair<string, BlockTreeTermsReader.Stats> ent in status.BlockTreeStats)
        {
            infoStream.WriteLine(" field \"" + ent.Key + "\":");
            infoStream.WriteLine(" " + ent.Value.ToString().Replace("\n", "\n "));
        }
    }

    return status;
}
/// <summary>
/// Appends every doc/value entry of <paramref name="other"/> to this instance, growing
/// <c>Docs</c>, <c>Values</c> and <c>DocsWithField</c> first so the combined entries fit.
/// </summary>
/// <param name="other">Updates to append; expected to be a <c>NumericDocValuesFieldUpdates</c>.</param>
/// <exception cref="InvalidOperationException">
/// Thrown when the combined number of entries would exceed <c>int.MaxValue</c>.
/// </exception>
public override void Merge(DocValuesFieldUpdates other)
{
    Debug.Assert(other is NumericDocValuesFieldUpdates);
    NumericDocValuesFieldUpdates otherUpdates = (NumericDocValuesFieldUpdates)other;

    // Compare against the remaining headroom instead of adding first: with int operands
    // "Size + otherUpdates.Size" wraps around on overflow (unchecked arithmetic), so a
    // "sum > int.MaxValue" test can never be true and the guard would be dead code.
    if (otherUpdates.Size > int.MaxValue - Size)
    {
        throw new InvalidOperationException("cannot support more than Integer.MAX_VALUE doc/value entries; size=" + Size + " other.size=" + otherUpdates.Size);
    }

    Docs = Docs.Grow(Size + otherUpdates.Size);
    Values = Values.Grow(Size + otherUpdates.Size);
    DocsWithField = FixedBitSet.EnsureCapacity(DocsWithField, (int)Docs.Size());

    for (int i = 0; i < otherUpdates.Size; i++)
    {
        int doc = (int)otherUpdates.Docs.Get(i);

        // Carry over the "has a value" flag to the slot the entry is appended at.
        if (otherUpdates.DocsWithField.Get(i))
        {
            DocsWithField.Set(Size);
        }
        Docs.Set(Size, doc);
        Values.Set(Size, otherUpdates.Values.Get(i));
        ++Size;
    }
}
/// <summary>
/// Records <paramref name="value"/> for <paramref name="docID"/>. Documents between the
/// last buffered one and <paramref name="docID"/> are padded with <c>MISSING</c>, and the
/// document is flagged in <c>DocsWithField</c> when that bit set is in use.
/// </summary>
/// <param name="docID">Document receiving the value; must not be below the number of values already buffered.</param>
/// <param name="value">Value to buffer.</param>
/// <exception cref="System.ArgumentException">
/// Thrown when <paramref name="docID"/> is smaller than the current size of <c>Pending</c>.
/// </exception>
public virtual void AddValue(int docID, long value)
{
    bool alreadyHasValue = docID < Pending.Size();
    if (alreadyHasValue)
    {
        throw new System.ArgumentException("DocValuesField \"" + FieldInfo.Name + "\" appears more than once in this document (only one value is allowed per field)");
    }

    // Pad the skipped documents so that the value lands at index docID.
    int nextSlot = (int)Pending.Size();
    while (nextSlot < docID)
    {
        Pending.Add(MISSING);
        ++nextSlot;
    }
    Pending.Add(value);

    if (DocsWithField != null)
    {
        FixedBitSet grown = FixedBitSet.EnsureCapacity(DocsWithField, docID);
        DocsWithField = grown;
        DocsWithField.Set(docID);
    }

    UpdateBytesUsed();
}
/// <summary>
/// Test filter: asserts that no <paramref name="acceptDocs"/> were supplied, then marks
/// every id from <c>Docs</c> that falls inside this segment's
/// <c>[DocBase, DocBase + MaxDoc)</c> range, rebased to the segment.
/// </summary>
/// <param name="context">Segment context providing <c>DocBase</c> and the reader's <c>MaxDoc</c>.</param>
/// <param name="acceptDocs">Must be null; asserted.</param>
/// <returns>The populated bit set, or null when no id fell into this segment.</returns>
public override DocIdSet GetDocIdSet(AtomicReaderContext context, Bits acceptDocs)
{
    Assert.IsNull(acceptDocs, "acceptDocs should be null, as we have an index without deletions");

    FixedBitSet segmentHits = new FixedBitSet(context.Reader.MaxDoc);
    int segmentStart = context.DocBase;
    int segmentEnd = segmentStart + context.Reader.MaxDoc;

    for (int k = 0; k < Docs.Length; k++)
    {
        int globalId = Docs[k];
        if (globalId < segmentStart || globalId >= segmentEnd)
        {
            // Outside this segment's id range.
            continue;
        }
        segmentHits.Set(globalId - segmentStart);
    }

    if (segmentHits.Cardinality() == 0)
    {
        return null;
    }
    return segmentHits;
}
/// <summary>
/// Asserts that <paramref name="terms"/> (the term vector of a single document) matches the
/// token stream it was built from: statistics and capability flags, term order, and per-term
/// docs, freqs, positions, offsets and payloads. Ends with a few random seeks to known terms.
/// </summary>
/// <param name="tk">Token stream holding the expected terms, freqs, positions, offsets and payloads.</param>
/// <param name="ft">Field type whose StoreTermVector* flags determine what <paramref name="terms"/> must expose.</param>
/// <param name="terms">Terms under test.</param>
protected internal virtual void AssertEquals(RandomTokenStream tk, FieldType ft, Terms terms)
{
    Assert.AreEqual(1, terms.DocCount);
    // Number of distinct terms in the token stream.
    int termCount = (new HashSet<string>(Arrays.AsList(tk.Terms))).Count;
    Assert.AreEqual(termCount, terms.Size());
    Assert.AreEqual(termCount, terms.SumDocFreq);
    Assert.AreEqual(ft.StoreTermVectorPositions, terms.HasPositions());
    Assert.AreEqual(ft.StoreTermVectorOffsets, terms.HasOffsets());
    Assert.AreEqual(ft.StoreTermVectorPayloads && tk.HasPayloads(), terms.HasPayloads());

    // Expected terms, sorted with the comparator of the Terms under test.
    HashSet<BytesRef> uniqueTerms = new HashSet<BytesRef>();
    foreach (string term in tk.Freqs.Keys)
    {
        uniqueTerms.Add(new BytesRef(term));
    }
    BytesRef[] sortedTerms = uniqueTerms.ToArray(/*new BytesRef[0]*/);
    Array.Sort(sortedTerms, terms.Comparator);

    // Randomly either request a fresh enum or pass in the previously stored one.
    TermsEnum termsEnum = terms.Iterator(Random().NextBoolean() ? null : this.termsEnum.Value);
    this.termsEnum.Value = termsEnum;
    for (int i = 0; i < sortedTerms.Length; ++i)
    {
        BytesRef nextTerm = termsEnum.Next();
        Assert.AreEqual(sortedTerms[i], nextTerm);
        Assert.AreEqual(sortedTerms[i], termsEnum.Term());
        Assert.AreEqual(1, termsEnum.DocFreq());

        // With an all-clear bit set as live docs, doc 0 is filtered out and the enum must be exhausted.
        FixedBitSet bits = new FixedBitSet(1);
        DocsEnum docsEnum = termsEnum.Docs(bits, Random().NextBoolean() ? null : this.docsEnum.Value);
        Assert.AreEqual(DocsEnum.NO_MORE_DOCS, docsEnum.NextDoc());
        // With bit 0 set (or no live docs at all), doc 0 must be returned exactly once.
        bits.Set(0);

        docsEnum = termsEnum.Docs(Random().NextBoolean() ? bits : null, Random().NextBoolean() ? null : docsEnum);
        Assert.IsNotNull(docsEnum);
        Assert.AreEqual(0, docsEnum.NextDoc());
        Assert.AreEqual(0, docsEnum.DocID());
        Assert.AreEqual(tk.Freqs[termsEnum.Term().Utf8ToString()], (int?)docsEnum.Freq());
        Assert.AreEqual(DocsEnum.NO_MORE_DOCS, docsEnum.NextDoc());
        this.docsEnum.Value = docsEnum;

        // Same two-step check for the positions enum, which must exist exactly when
        // offsets or positions are stored.
        bits.Clear(0);
        DocsAndPositionsEnum docsAndPositionsEnum = termsEnum.DocsAndPositions(bits, Random().NextBoolean() ? null : this.docsAndPositionsEnum.Value);
        Assert.AreEqual(ft.StoreTermVectorOffsets || ft.StoreTermVectorPositions, docsAndPositionsEnum != null);
        if (docsAndPositionsEnum != null)
        {
            Assert.AreEqual(DocsEnum.NO_MORE_DOCS, docsAndPositionsEnum.NextDoc());
        }
        bits.Set(0);

        docsAndPositionsEnum = termsEnum.DocsAndPositions(Random().NextBoolean() ? bits : null, Random().NextBoolean() ? null : docsAndPositionsEnum);
        Assert.AreEqual(ft.StoreTermVectorOffsets || ft.StoreTermVectorPositions, docsAndPositionsEnum != null);
        if (terms.HasPositions() || terms.HasOffsets())
        {
            Assert.AreEqual(0, docsAndPositionsEnum.NextDoc());
            int freq = docsAndPositionsEnum.Freq();
            Assert.AreEqual(tk.Freqs[termsEnum.Term().Utf8ToString()], (int?)freq);
            // NOTE(review): docsAndPositionsEnum has already been dereferenced above, so this
            // null test cannot be false when reached.
            if (docsAndPositionsEnum != null)
            {
                for (int k = 0; k < freq; ++k)
                {
                    int position = docsAndPositionsEnum.NextPosition();
                    // Candidate token indexes, looked up by position when available, else by start offset.
                    ISet<int?> indexes;
                    if (terms.HasPositions())
                    {
                        indexes = tk.PositionToTerms[position];
                        Assert.IsNotNull(indexes);
                    }
                    else
                    {
                        indexes = tk.StartOffsetToTerms[docsAndPositionsEnum.StartOffset()];
                        Assert.IsNotNull(indexes);
                    }
                    // For each stored attribute, some candidate token must match the current term and value.
                    if (terms.HasPositions())
                    {
                        bool foundPosition = false;
                        foreach (int index in indexes)
                        {
                            if (tk.TermBytes[index].Equals(termsEnum.Term()) && tk.Positions[index] == position)
                            {
                                foundPosition = true;
                                break;
                            }
                        }
                        Assert.IsTrue(foundPosition);
                    }
                    if (terms.HasOffsets())
                    {
                        bool foundOffset = false;
                        foreach (int index in indexes)
                        {
                            if (tk.TermBytes[index].Equals(termsEnum.Term()) && tk.StartOffsets[index] == docsAndPositionsEnum.StartOffset() && tk.EndOffsets[index] == docsAndPositionsEnum.EndOffset())
                            {
                                foundOffset = true;
                                break;
                            }
                        }
                        Assert.IsTrue(foundOffset);
                    }
                    if (terms.HasPayloads())
                    {
                        bool foundPayload = false;
                        foreach (int index in indexes)
                        {
                            if (tk.TermBytes[index].Equals(termsEnum.Term()) && Equals(tk.Payloads[index], docsAndPositionsEnum.Payload))
                            {
                                foundPayload = true;
                                break;
                            }
                        }
                        Assert.IsTrue(foundPayload);
                    }
                }
                // Reading one position more than freq is expected to throw.
                // NOTE(review): if Assert.Fail() signals failure with an Exception-derived type, the
                // catch below swallows it too and this check can never fail - confirm against the
                // test framework in use.
                try
                {
                    docsAndPositionsEnum.NextPosition();
                    Assert.Fail();
                }
                catch (Exception e)
                {
                    // ok
                }
            }
            Assert.AreEqual(DocsEnum.NO_MORE_DOCS, docsAndPositionsEnum.NextDoc());
        }
        this.docsAndPositionsEnum.Value = docsAndPositionsEnum;
    }
    Assert.IsNull(termsEnum.Next());
    // Five random seeks to terms of the token stream, by exact match or by ceiling.
    for (int i = 0; i < 5; ++i)
    {
        if (Random().NextBoolean())
        {
            Assert.IsTrue(termsEnum.SeekExact(RandomInts.RandomFrom(Random(), tk.TermBytes)));
        }
        else
        {
            Assert.AreEqual(SeekStatus.FOUND, termsEnum.SeekCeil(RandomInts.RandomFrom(Random(), tk.TermBytes)));
        }
    }
}
/// <summary>
/// Writes the seed postings held in <c>Fields</c> into <paramref name="dir"/> as segment "_0"
/// through the codec's postings format, then opens and returns a producer over the result.
/// </summary>
/// <param name="dir">Directory the segment is written to and read back from.</param>
/// <param name="maxAllowed">
/// maxAllowed = the "highest" we can index, but we will still
/// randomly index at lower IndexOption
/// </param>
/// <param name="allowPayloads">When false, payloads are never written.</param>
/// <param name="alwaysTestMax">When true, each field is indexed at its maximum option instead of a random one.</param>
/// <returns>A <c>FieldsProducer</c> opened on the freshly written segment.</returns>
private FieldsProducer BuildIndex(Directory dir, FieldInfo.IndexOptions maxAllowed, bool allowPayloads, bool alwaysTestMax)
{
    Codec codec = Codec;
    SegmentInfo segmentInfo = new SegmentInfo(dir, Constants.LUCENE_MAIN_VERSION, "_0", MaxDoc, false, codec, null);

    // Position of maxAllowed within the enum's value list, used as an index below.
    int maxIndexOption = Enum.GetValues(typeof(FieldInfo.IndexOptions)).Cast<FieldInfo.IndexOptions>().ToList().IndexOf(maxAllowed);
    if (VERBOSE)
    {
        Console.WriteLine("\nTEST: now build index");
    }

    int maxIndexOptionNoOffsets = Enum.GetValues(typeof(FieldInfo.IndexOptions)).Cast<FieldInfo.IndexOptions>().ToList().IndexOf(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);

    // TODO use allowPayloads

    var newFieldInfoArray = new FieldInfo[Fields.Count];
    for (int fieldUpto = 0; fieldUpto < Fields.Count; fieldUpto++)
    {
        FieldInfo oldFieldInfo = FieldInfos.FieldInfo(fieldUpto);

        // Cap the option at "positions" for postings formats listed in DoesntSupportOffsets.
        string pf = TestUtil.GetPostingsFormat(codec, oldFieldInfo.Name);
        int fieldMaxIndexOption;
        if (DoesntSupportOffsets.Contains(pf))
        {
            fieldMaxIndexOption = Math.Min(maxIndexOptionNoOffsets, maxIndexOption);
        }
        else
        {
            fieldMaxIndexOption = maxIndexOption;
        }

        // Randomly picked the IndexOptions to index this
        // field with:
        FieldInfo.IndexOptions indexOptions = Enum.GetValues(typeof(FieldInfo.IndexOptions)).Cast<FieldInfo.IndexOptions>().ToArray()[alwaysTestMax ? fieldMaxIndexOption : Random().Next(1 + fieldMaxIndexOption)];
        bool doPayloads = indexOptions.CompareTo(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0 && allowPayloads;

        newFieldInfoArray[fieldUpto] = new FieldInfo(oldFieldInfo.Name, true, fieldUpto, false, false, doPayloads, indexOptions, null, DocValuesType.NUMERIC, null);
    }

    FieldInfos newFieldInfos = new FieldInfos(newFieldInfoArray);

    // Estimate that flushed segment size will be 25% of
    // what we use in RAM:
    long bytes = TotalPostings * 8 + TotalPayloadBytes;

    SegmentWriteState writeState = new SegmentWriteState(null, dir, segmentInfo, newFieldInfos, 32, null, new IOContext(new FlushInfo(MaxDoc, bytes)));
    FieldsConsumer fieldsConsumer = codec.PostingsFormat().FieldsConsumer(writeState);

    foreach (KeyValuePair<string, SortedDictionary<BytesRef, long>> fieldEnt in Fields)
    {
        string field = fieldEnt.Key;
        IDictionary<BytesRef, long> terms = fieldEnt.Value;

        FieldInfo fieldInfo = newFieldInfos.FieldInfo(field);

        FieldInfo.IndexOptions? indexOptions = fieldInfo.FieldIndexOptions;

        if (VERBOSE)
        {
            Console.WriteLine("field=" + field + " indexOtions=" + indexOptions);
        }

        // What to write for this field, derived from the option chosen for it above.
        bool doFreq = indexOptions >= FieldInfo.IndexOptions.DOCS_AND_FREQS;
        bool doPos = indexOptions >= FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS;
        bool doPayloads = indexOptions >= FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS && allowPayloads;
        bool doOffsets = indexOptions >= FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS;

        TermsConsumer termsConsumer = fieldsConsumer.AddField(fieldInfo);
        long sumTotalTF = 0;
        long sumDF = 0;
        // Distinct docs written for this field; its cardinality is reported as the field's doc count.
        FixedBitSet seenDocs = new FixedBitSet(MaxDoc);
        foreach (KeyValuePair<BytesRef, long> termEnt in terms)
        {
            BytesRef term = termEnt.Key;
            // Regenerate the term's postings from its stored seed.
            SeedPostings postings = GetSeedPostings(term.Utf8ToString(), termEnt.Value, false, maxAllowed);
            if (VERBOSE)
            {
                Console.WriteLine(" term=" + field + ":" + term.Utf8ToString() + " docFreq=" + postings.DocFreq + " seed=" + termEnt.Value);
            }

            PostingsConsumer postingsConsumer = termsConsumer.StartTerm(term);
            long totalTF = 0;
            int docID = 0;
            while ((docID = postings.NextDoc()) != DocsEnum.NO_MORE_DOCS)
            {
                int freq = postings.Freq();
                if (VERBOSE)
                {
                    Console.WriteLine(" " + postings.Upto + ": docID=" + docID + " freq=" + postings.Freq_Renamed);
                }
                // -1 is passed as the freq when frequencies are not indexed.
                postingsConsumer.StartDoc(docID, doFreq ? postings.Freq_Renamed : -1);
                seenDocs.Set(docID);
                if (doPos)
                {
                    totalTF += postings.Freq_Renamed;
                    for (int posUpto = 0; posUpto < freq; posUpto++)
                    {
                        int pos = postings.NextPosition();
                        BytesRef payload = postings.Payload;

                        if (VERBOSE)
                        {
                            if (doPayloads)
                            {
                                Console.WriteLine(" pos=" + pos + " payload=" + (payload == null ? "null" : payload.Length + " bytes"));
                            }
                            else
                            {
                                Console.WriteLine(" pos=" + pos);
                            }
                        }
                        // Payload is replaced by null and offsets by -1 when not being indexed.
                        postingsConsumer.AddPosition(pos, doPayloads ? payload : null, doOffsets ? postings.StartOffset() : -1, doOffsets ? postings.EndOffset() : -1);
                    }
                }
                else if (doFreq)
                {
                    totalTF += freq;
                }
                else
                {
                    totalTF++;
                }
                postingsConsumer.FinishDoc();
            }
            termsConsumer.FinishTerm(term, new TermStats(postings.DocFreq, doFreq ? totalTF : -1));
            sumTotalTF += totalTF;
            sumDF += postings.DocFreq;
        }

        termsConsumer.Finish(doFreq ? sumTotalTF : -1, sumDF, seenDocs.Cardinality());
    }

    // NOTE(review): Dispose() is only reached when no exception was thrown above.
    fieldsConsumer.Dispose();

    if (VERBOSE)
    {
        Console.WriteLine("TEST: after indexing: files=");
        foreach (string file in dir.ListAll())
        {
            Console.WriteLine(" " + file + ": " + dir.FileLength(file) + " bytes");
        }
    }

    CurrentFieldInfos = newFieldInfos;

    SegmentReadState readState = new SegmentReadState(dir, segmentInfo, newFieldInfos, IOContext.READ, 1);

    return codec.PostingsFormat().FieldsProducer(readState);
}
/// <summary>
/// Rebuilds the shared random postings fixture: resets the posting counters,
/// creates between 1 and 5 uniquely named fields, assigns each field a set of
/// random terms (each paired with a random seed), derives <c>MaxDoc</c> by
/// enumerating every term's seeded postings, randomly marks documents as live,
/// and finally collects every (field, term) pair into <c>AllTerms</c>.
/// </summary>
public static void CreatePostings()
{
    TotalPostings = 0;
    TotalPayloadBytes = 0;
    Fields = new SortedDictionary<string, SortedDictionary<BytesRef, long>>();

    int numFields = TestUtil.NextInt(Random(), 1, 5);
    if (VERBOSE)
    {
        Console.WriteLine("TEST: " + numFields + " fields");
    }
    MaxDoc = 0;

    FieldInfo[] infos = new FieldInfo[numFields];
    int fieldCount = 0;
    while (fieldCount < numFields)
    {
        string fieldName = TestUtil.RandomSimpleString(Random());
        if (Fields.ContainsKey(fieldName))
        {
            // Name collision: draw another random name.
            continue;
        }

        infos[fieldCount] = new FieldInfo(fieldName, true, fieldCount, false, false, true, FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS, null, DocValuesType.NUMERIC, null);
        fieldCount++;

        SortedDictionary<BytesRef, long> termSeeds = new SortedDictionary<BytesRef, long>();
        Fields[fieldName] = termSeeds;

        HashSet<string> seenTerms = new HashSet<string>();

        // Occasionally generate a field with many terms.
        int numTerms = Random().Next(10) == 7
            ? AtLeast(50)
            : TestUtil.NextInt(Random(), 2, 20);

        // fieldCount was already incremented, so 1 identifies the first field.
        bool isFirstField = fieldCount == 1;

        for (int termUpto = 0; termUpto < numTerms; termUpto++)
        {
            string term = TestUtil.RandomSimpleString(Random());
            if (!seenTerms.Add(term))
            {
                // Duplicate base term: skip it (the slot is not retried).
                continue;
            }

            string prefix;
            if (TEST_NIGHTLY && termUpto == 0 && isFirstField)
            {
                // Make 1 big term:
                prefix = "big_";
            }
            else if (termUpto == 1 && isFirstField)
            {
                // Make 1 medium term:
                prefix = "medium_";
            }
            else if (Random().NextBoolean())
            {
                // Low freq term:
                prefix = "low_";
            }
            else
            {
                // Very low freq term (don't multiply by RANDOM_MULTIPLIER):
                prefix = "verylow_";
            }
            term = prefix + term;

            long termSeed = Random().NextLong();
            termSeeds[new BytesRef(term)] = termSeed;

            // NOTE: sort of silly: we enum all the docs just to
            // get the maxDoc
            DocsEnum docsEnum = GetSeedPostings(term, termSeed, false, FieldInfo.IndexOptions.DOCS_ONLY);
            int lastDoc = 0;
            for (int doc = docsEnum.NextDoc(); doc != DocsEnum.NO_MORE_DOCS; doc = docsEnum.NextDoc())
            {
                lastDoc = doc;
            }
            if (lastDoc > MaxDoc)
            {
                MaxDoc = lastDoc;
            }
        }
    }

    FieldInfos = new FieldInfos(infos);

    // It's the count, not the last docID:
    MaxDoc++;

    GlobalLiveDocs = new FixedBitSet(MaxDoc);
    double liveRatio = Random().NextDouble();
    for (int docID = 0; docID < MaxDoc; docID++)
    {
        bool isLive = Random().NextDouble() <= liveRatio;
        if (isLive)
        {
            GlobalLiveDocs.Set(docID);
        }
    }

    AllTerms = new List<FieldAndTerm>();
    foreach (KeyValuePair<string, SortedDictionary<BytesRef, long>> fieldEntry in Fields)
    {
        foreach (KeyValuePair<BytesRef, long> termEntry in fieldEntry.Value.EntrySet())
        {
            AllTerms.Add(new FieldAndTerm(fieldEntry.Key, termEntry.Key));
        }
    }

    if (VERBOSE)
    {
        Console.WriteLine("TEST: done init postings; " + AllTerms.Count + " total terms, across " + FieldInfos.Size() + " fields");
    }
}
/// <summary>
/// Indexes at least 2000 documents, each containing one of two random terms
/// (repeated 1 to 4 times) in the field "field" using the given
/// <paramref name="options"/>, then repeatedly walks the postings of a randomly
/// chosen term with a random mix of <c>NextDoc</c> and <c>Advance</c> calls,
/// verifying every returned doc ID against the expected one computed from the
/// in-memory assignment of terms to documents. When frequencies are indexed,
/// the reported frequency is occasionally checked to lie in [1, 4].
/// </summary>
/// <param name="options">Index options applied to the test field.</param>
public virtual void DoTestLongPostingsNoPositions(FieldInfo.IndexOptions options)
{
    // Don't use TestUtil.getTempDir so that we own the
    // randomness (ie same seed will point to same dir):
    Directory dir = NewFSDirectory(CreateTempDir("longpostings" + "." + Random().NextLong()));

    int numDocs = AtLeast(2000);
    if (VERBOSE)
    {
        Console.WriteLine("TEST: NUM_DOCS=" + numDocs);
    }

    string s1 = GetRandomTerm(null);
    string s2 = GetRandomTerm(s1);
    if (VERBOSE)
    {
        Console.WriteLine("\nTEST: s1=" + s1 + " s2=" + s2);
    }

    // Bit idx is set when document idx carries s1; otherwise it carries s2.
    FixedBitSet isS1 = new FixedBitSet(numDocs);
    for (int idx = 0; idx < numDocs; idx++)
    {
        if (Random().NextBoolean())
        {
            isS1.Set(idx);
        }
    }

    // Build the index. (The original wrapped this in "if (true) ... else" with
    // an unreachable branch that re-opened an existing directory.)
    IndexWriterConfig iwc = NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random())).SetOpenMode(IndexWriterConfig.OpenMode_e.CREATE).SetMergePolicy(NewLogMergePolicy());
    iwc.SetRAMBufferSizeMB(16.0 + 16.0 * Random().NextDouble());
    iwc.SetMaxBufferedDocs(-1);
    RandomIndexWriter riw = new RandomIndexWriter(Random(), dir, iwc);

    FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
    ft.IndexOptions = options;
    for (int idx = 0; idx < numDocs; idx++)
    {
        Document doc = new Document();
        string s = isS1.Get(idx) ? s1 : s2;
        Field f = NewField("field", s, ft);
        // Adding the same field instance several times yields a term freq of 1..4.
        int count = TestUtil.NextInt(Random(), 1, 4);
        for (int ct = 0; ct < count; ct++)
        {
            doc.Add(f);
        }
        riw.AddDocument(doc);
    }

    IndexReader r = riw.Reader;
    riw.Dispose();

    Assert.AreEqual(numDocs, r.NumDocs);
    Assert.IsTrue(r.DocFreq(new Term("field", s1)) > 0);
    Assert.IsTrue(r.DocFreq(new Term("field", s2)) > 0);

    int num = AtLeast(1000);
    for (int iter = 0; iter < num; iter++)
    {
        bool doS1 = Random().NextBoolean();
        string term = doS1 ? s1 : s2;

        if (VERBOSE)
        {
            Console.WriteLine("\nTEST: iter=" + iter + " doS1=" + doS1 + " term=" + term);
        }

        DocsEnum docs;
        DocsEnum postings;
        if (options == FieldInfo.IndexOptions.DOCS_ONLY)
        {
            docs = TestUtil.Docs(Random(), r, "field", new BytesRef(term), null, null, DocsEnum.FLAG_NONE);
            postings = null;
        }
        else
        {
            docs = postings = TestUtil.Docs(Random(), r, "field", new BytesRef(term), null, null, DocsEnum.FLAG_FREQS);
            Debug.Assert(postings != null);
        }
        Debug.Assert(docs != null);

        int docID = -1;
        while (docID < DocIdSetIterator.NO_MORE_DOCS)
        {
            int what = Random().Next(3);
            if (what == 0)
            {
                if (VERBOSE)
                {
                    Console.WriteLine("TEST: docID=" + docID + "; do next()");
                }

                // nextDoc: scan forward to the next document holding the chosen term.
                int expected = docID + 1;
                while (expected != numDocs && isS1.Get(expected) != doS1)
                {
                    expected++;
                }
                if (expected == numDocs)
                {
                    // Ran off the end: the enum must report exhaustion.
                    expected = int.MaxValue;
                }

                docID = docs.NextDoc();
                if (VERBOSE)
                {
                    Console.WriteLine(" got docID=" + docID);
                }
                Assert.AreEqual(expected, docID);
                if (docID == DocIdSetIterator.NO_MORE_DOCS)
                {
                    break;
                }

                if (Random().Next(6) == 3 && postings != null)
                {
                    int freq = postings.Freq();
                    Assert.IsTrue(freq >= 1 && freq <= 4, "got invalid freq=" + freq);
                }
            }
            else
            {
                // advance
                int targetDocID;
                if (docID == -1)
                {
                    targetDocID = Random().Next(numDocs + 1);
                }
                else
                {
                    targetDocID = docID + TestUtil.NextInt(Random(), 1, numDocs - docID);
                }

                if (VERBOSE)
                {
                    Console.WriteLine("TEST: docID=" + docID + "; do advance(" + targetDocID + ")");
                }

                // First document at or after the target that holds the chosen term.
                int expected = targetDocID;
                while (expected != numDocs && isS1.Get(expected) != doS1)
                {
                    expected++;
                }
                if (expected == numDocs)
                {
                    expected = int.MaxValue;
                }

                docID = docs.Advance(targetDocID);
                if (VERBOSE)
                {
                    Console.WriteLine(" got docID=" + docID);
                }
                Assert.AreEqual(expected, docID);
                if (docID == DocIdSetIterator.NO_MORE_DOCS)
                {
                    break;
                }

                if (Random().Next(6) == 3 && postings != null)
                {
                    int freq = postings.Freq();
                    Assert.IsTrue(freq >= 1 && freq <= 4, "got invalid freq=" + freq);
                }
            }
        }
    }

    r.Dispose();
    dir.Dispose();
}
/// <summary>
/// Builds a doc-id set for one segment: each configured term is looked up in
/// the segment's terms index obtained from the field cache, the ordinals that
/// come back non-negative are recorded in a bit set, and the result is wrapped
/// in a <c>FieldCacheDocIdSetAnonymousInnerClassHelper</c> together with the
/// segment's max doc and <paramref name="acceptDocs"/>.
/// </summary>
/// <param name="context">Segment context whose reader is consulted.</param>
/// <param name="acceptDocs">Passed through unchanged to the returned set.</param>
/// <returns>The doc-id set backed by the collected term ordinals.</returns>
public override DocIdSet GetDocIdSet(AtomicReaderContext context, Bits acceptDocs)
{
    SortedDocValues termsIndex = FieldCache.GetTermsIndex((context.AtomicReader), Field);

    // One bit per distinct value in the terms index.
    FixedBitSet matchingOrds = new FixedBitSet(termsIndex.ValueCount);
    foreach (var term in Terms)
    {
        int ord = termsIndex.LookupTerm(term);
        if (ord < 0)
        {
            // Negative lookup result: this term contributes no ordinal.
            continue;
        }
        matchingOrds.Set(ord);
    }

    return new FieldCacheDocIdSetAnonymousInnerClassHelper(this, context.Reader.MaxDoc, acceptDocs, termsIndex, matchingOrds);
}
/// <summary>
/// Creates a bit set of <paramref name="maxDoc"/> bits in which each bit is set
/// when a fresh draw from <paramref name="random"/> is less than or equal to
/// <paramref name="pctLive"/>.
/// </summary>
/// <param name="maxDoc">Number of bits to allocate and populate.</param>
/// <param name="pctLive">Threshold each random draw is compared against.</param>
/// <param name="random">Source of the per-bit random draws.</param>
internal RandomBits(int maxDoc, double pctLive, Random random)
{
    Bits = new FixedBitSet(maxDoc);

    int docID = 0;
    while (docID < maxDoc)
    {
        // Exactly one draw per document, in increasing doc order.
        bool isLive = random.NextDouble() <= pctLive;
        if (isLive)
        {
            Bits.Set(docID);
        }
        docID++;
    }
}
/// <summary>
/// Appends every entry of <paramref name="other"/> to this instance: doc ids,
/// lengths and "has value" flags are copied as-is, offsets are shifted by the
/// current length of <c>Values</c>, and finally the other instance's value
/// bytes are appended to <c>Values</c>.
/// </summary>
/// <param name="other">
/// The updates to merge in; cast to <c>BinaryDocValuesFieldUpdates</c>.
/// </param>
/// <exception cref="InvalidOperationException">
/// Thrown when the combined number of entries would exceed <c>int.MaxValue</c>.
/// </exception>
public override void Merge(DocValuesFieldUpdates other)
{
    BinaryDocValuesFieldUpdates otherUpdates = (BinaryDocValuesFieldUpdates)other;

    // Check BEFORE adding: an int sum wraps silently in unchecked context, so
    // comparing the already-computed sum against int.MaxValue can never fail.
    if (otherUpdates.Size > int.MaxValue - Size)
    {
        throw new InvalidOperationException("cannot support more than Integer.MAX_VALUE doc/value entries; size=" + Size + " other.size=" + otherUpdates.Size);
    }
    int newSize = Size + otherUpdates.Size;

    Docs = Docs.Grow(newSize);
    Offsets = Offsets.Grow(newSize);
    Lengths = Lengths.Grow(newSize);
    DocsWithField = FixedBitSet.EnsureCapacity(DocsWithField, (int)Docs.Size());

    for (int i = 0; i < otherUpdates.Size; i++)
    {
        int doc = (int)otherUpdates.Docs.Get(i);
        if (otherUpdates.DocsWithField.Get(i))
        {
            DocsWithField.Set(Size);
        }
        Docs.Set(Size, doc);
        Offsets.Set(Size, Values.Length + otherUpdates.Offsets.Get(i)); // correct relative offset
        Lengths.Set(Size, otherUpdates.Lengths.Get(i));
        ++Size;
    }

    // Appended last so that Values.Length above is still the pre-merge length.
    Values.Append(otherUpdates.Values);
}
/// <summary>
/// Marks a random subset of up to 128 documents as live, builds a
/// <c>MergeState.DocMap</c> from it, and verifies the reported counts as well
/// as the mapping itself: deleted documents map to -1 and live documents map
/// to their index minus the number of deleted documents preceding them.
/// </summary>
public virtual void TestBuildDocMap()
{
    int maxDoc = TestUtil.NextInt(Random(), 1, 128);
    int numDocs = TestUtil.NextInt(Random(), 0, maxDoc);
    int numDeletedDocs = maxDoc - numDocs;

    FixedBitSet liveDocs = new FixedBitSet(maxDoc);
    for (int marked = 0; marked < numDocs; marked++)
    {
        // Keep drawing until we hit a document that is not live yet.
        int candidate;
        do
        {
            candidate = Random().Next(maxDoc);
        } while (liveDocs.Get(candidate));
        liveDocs.Set(candidate);
    }

    MergeState.DocMap docMap = MergeState.DocMap.Build(maxDoc, liveDocs);

    Assert.AreEqual(maxDoc, docMap.MaxDoc);
    Assert.AreEqual(numDocs, docMap.NumDocs);
    Assert.AreEqual(numDeletedDocs, docMap.NumDeletedDocs);

    // assert the mapping is compact
    int deletedSoFar = 0;
    for (int docID = 0; docID < maxDoc; docID++)
    {
        if (liveDocs.Get(docID))
        {
            Assert.AreEqual(docID - deletedSoFar, docMap.Get(docID));
        }
        else
        {
            Assert.AreEqual(-1, docMap.Get(docID));
            deletedSoFar++;
        }
    }
}