private void VerifyCount(IndexReader ir)
{
    Fields fields = MultiFields.GetFields(ir);
    if (fields == null)
    {
        return;
    }
    foreach (string field in fields)
    {
        Terms terms = fields.Terms(field);
        if (terms == null)
        {
            continue;
        }
        int docCount = terms.DocCount;
        FixedBitSet visited = new FixedBitSet(ir.MaxDoc);
        TermsEnum te = terms.Iterator(null);
        while (te.Next() != null)
        {
            DocsEnum de = TestUtil.Docs(Random(), te, null, null, DocsEnum.FLAG_NONE);
            while (de.NextDoc() != DocIdSetIterator.NO_MORE_DOCS)
            {
                visited.Set(de.DocID());
            }
        }
        Assert.AreEqual(visited.Cardinality(), docCount);
    }
}
private bool OpenBitSetContains(int[] expectedDocs, FixedBitSet actual, int maxDoc)
{
    if (expectedDocs.Length != actual.Cardinality())
    {
        return false;
    }
    FixedBitSet expected = new FixedBitSet(maxDoc);
    foreach (int expectedDoc in expectedDocs)
    {
        expected.Set(expectedDoc);
    }
    int docId;
    DocIdSetIterator iterator = expected.GetIterator();
    while ((docId = iterator.NextDoc()) != DocIdSetIterator.NO_MORE_DOCS)
    {
        if (!actual.Get(docId))
        {
            return false;
        }
    }
    return true;
}
/// <param name="targetMaxSaturation"> /// A number between 0 and 1 describing the % of bits that would ideally be set in the result. /// Lower values have better accuracy but require more space. /// </param> /// <return>A smaller <see cref="FuzzySet"/> or <c>null</c> if the current set is already over-saturated.</return> public virtual FuzzySet Downsize(float targetMaxSaturation) { var numBitsSet = _filter.Cardinality(); FixedBitSet rightSizedBitSet; var rightSizedBitSetSize = _bloomSize; //Hopefully find a smaller size bitset into which we can project accumulated values while maintaining desired saturation level for (int i = 0; i < _usableBitSetSizes.Length; i++) { int candidateBitsetSize = _usableBitSetSizes[i]; float candidateSaturation = (float)numBitsSet / (float)candidateBitsetSize; if (candidateSaturation <= targetMaxSaturation) { rightSizedBitSetSize = candidateBitsetSize; break; } } // Re-project the numbers to a smaller space if necessary if (rightSizedBitSetSize < _bloomSize) { // Reset the choice of bitset to the smaller version rightSizedBitSet = new FixedBitSet(rightSizedBitSetSize + 1); // Map across the bits from the large set to the smaller one var bitIndex = 0; do { bitIndex = _filter.NextSetBit(bitIndex); if (bitIndex < 0) { continue; } // Project the larger number into a smaller one effectively // modulo-ing by using the target bitset size as a mask var downSizedBitIndex = bitIndex & rightSizedBitSetSize; rightSizedBitSet.Set(downSizedBitIndex); bitIndex++; } while ((bitIndex >= 0) && (bitIndex <= _bloomSize)); } else { return(null); } return(new FuzzySet(rightSizedBitSet, rightSizedBitSetSize, _hashFunction)); }
public void TestMissingTerms()
{
    string fieldName = "field1";
    Directory rd = NewDirectory();
    RandomIndexWriter w = new RandomIndexWriter(
#if FEATURE_INSTANCE_TESTDATA_INITIALIZATION
        this,
#endif
        Random, rd);
    for (int i = 0; i < 100; i++)
    {
        Document doc = new Document();
        int term = i * 10; // terms are units of 10
        doc.Add(NewStringField(fieldName, "" + term, Field.Store.YES));
        w.AddDocument(doc);
    }
    IndexReader reader = SlowCompositeReaderWrapper.Wrap(w.GetReader());
    assertTrue(reader.Context is AtomicReaderContext);
    AtomicReaderContext context = (AtomicReaderContext)reader.Context;
    w.Dispose();

    IList<Term> terms = new List<Term>();
    terms.Add(new Term(fieldName, "19"));
    FixedBitSet bits = (FixedBitSet)TermsFilter(Random.NextBoolean(), terms).GetDocIdSet(context, context.AtomicReader.LiveDocs);
    assertNull("Must match nothing", bits);

    terms.Add(new Term(fieldName, "20"));
    bits = (FixedBitSet)TermsFilter(Random.NextBoolean(), terms).GetDocIdSet(context, context.AtomicReader.LiveDocs);
    assertEquals("Must match 1", 1, bits.Cardinality());

    terms.Add(new Term(fieldName, "10"));
    bits = (FixedBitSet)TermsFilter(Random.NextBoolean(), terms).GetDocIdSet(context, context.AtomicReader.LiveDocs);
    assertEquals("Must match 2", 2, bits.Cardinality());

    terms.Add(new Term(fieldName, "00"));
    bits = (FixedBitSet)TermsFilter(Random.NextBoolean(), terms).GetDocIdSet(context, context.AtomicReader.LiveDocs);
    assertEquals("Must match 2", 2, bits.Cardinality());

    reader.Dispose();
    rd.Dispose();
}
public void TestConcurrentSpan()
{
    String TEXT = "the fox jumped";
    Directory directory = NewDirectory();
    IndexWriter indexWriter = new IndexWriter(directory,
        NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random(), MockTokenizer.WHITESPACE, false)));
    try
    {
        Document document = new Document();
        FieldType customType = new FieldType(TextField.TYPE_NOT_STORED);
        customType.StoreTermVectorOffsets = true;
        customType.StoreTermVectorPositions = true;
        customType.StoreTermVectors = true;
        document.Add(new Field(FIELD, new TokenStreamConcurrent(), customType));
        indexWriter.AddDocument(document);
    }
    finally
    {
        indexWriter.Dispose();
    }
    IndexReader indexReader = DirectoryReader.Open(directory);
    try
    {
        assertEquals(1, indexReader.NumDocs);
        IndexSearcher indexSearcher = NewSearcher(indexReader);
        Query phraseQuery = new SpanNearQuery(new SpanQuery[]
        {
            new SpanTermQuery(new Term(FIELD, "fox")),
            new SpanTermQuery(new Term(FIELD, "jumped"))
        }, 0, true);
        FixedBitSet bitset = new FixedBitSet(indexReader.MaxDoc);
        indexSearcher.Search(phraseQuery, new ConcurrentSpanCollectorAnonymousHelper(this, bitset));
        assertEquals(1, bitset.Cardinality());
        int maxDoc = indexReader.MaxDoc;
        Highlighter highlighter = new Highlighter(
            new SimpleHTMLFormatter(),
            new SimpleHTMLEncoder(),
            new QueryScorer(phraseQuery));
        for (int position = bitset.NextSetBit(0);
             position >= 0 && position < maxDoc - 1;
             position = bitset.NextSetBit(position + 1))
        {
            assertEquals(0, position);
            TokenStream tokenStream = TokenSources.GetTokenStream(
                indexReader.GetTermVector(position, FIELD), false);
            assertEquals(highlighter.GetBestFragment(new TokenStreamConcurrent(), TEXT),
                highlighter.GetBestFragment(tokenStream, TEXT));
        }
    }
    finally
    {
        indexReader.Dispose();
        directory.Dispose();
    }
}
public void testSmallBitSets()
{
    // Make sure size 0-10 bit sets are OK:
    for (int numBits = 0; numBits < 10; numBits++)
    {
        FixedBitSet b1 = new FixedBitSet(numBits);
        FixedBitSet b2 = new FixedBitSet(numBits);
        Assert.IsTrue(b1.Equals(b2));
        Assert.AreEqual(b1.GetHashCode(), b2.GetHashCode());
        Assert.AreEqual(0, b1.Cardinality());
        if (numBits > 0)
        {
            b1.Set(0, numBits);
            Assert.AreEqual(numBits, b1.Cardinality());
            //b1.Flip(0, numBits);
            //Assert.AreEqual(0, b1.Cardinality());
        }
    }
}
public void TestMissingField()
{
    string fieldName = "field1";
    Directory rd1 = NewDirectory();
    RandomIndexWriter w1 = new RandomIndexWriter(
#if FEATURE_INSTANCE_TESTDATA_INITIALIZATION
        this,
#endif
        Random, rd1);
    Document doc = new Document();
    doc.Add(NewStringField(fieldName, "content1", Field.Store.YES));
    w1.AddDocument(doc);
    IndexReader reader1 = w1.GetReader();
    w1.Dispose();

    fieldName = "field2";
    Directory rd2 = NewDirectory();
    RandomIndexWriter w2 = new RandomIndexWriter(
#if FEATURE_INSTANCE_TESTDATA_INITIALIZATION
        this,
#endif
        Random, rd2);
    doc = new Document();
    doc.Add(NewStringField(fieldName, "content2", Field.Store.YES));
    w2.AddDocument(doc);
    IndexReader reader2 = w2.GetReader();
    w2.Dispose();

    TermsFilter tf = new TermsFilter(new Term(fieldName, "content1"));
    MultiReader multi = new MultiReader(reader1, reader2);
    foreach (AtomicReaderContext context in multi.Leaves)
    {
        DocIdSet docIdSet = tf.GetDocIdSet(context, context.AtomicReader.LiveDocs);
        if (context.Reader.DocFreq(new Term(fieldName, "content1")) == 0)
        {
            assertNull(docIdSet);
        }
        else
        {
            FixedBitSet bits = (FixedBitSet)docIdSet;
            assertTrue("Must be >= 0", bits.Cardinality() >= 0);
        }
    }
    multi.Dispose();
    reader1.Dispose();
    reader2.Dispose();
    rd1.Dispose();
    rd2.Dispose();
}
public void testMissingTerms()
{
    String fieldName = "field1";
    Directory rd = new RAMDirectory();
    var w = new IndexWriter(rd, new KeywordAnalyzer(), IndexWriter.MaxFieldLength.UNLIMITED);
    for (int i = 0; i < 100; i++)
    {
        var doc = new Document();
        int term = i * 10; // terms are units of 10
        doc.Add(new Field(fieldName, "" + term, Field.Store.YES, Field.Index.ANALYZED));
        w.AddDocument(doc);
    }
    IndexReader reader = w.GetReader();
    w.Close();

    TermsFilter tf = new TermsFilter();
    tf.AddTerm(new Term(fieldName, "19"));
    FixedBitSet bits = (FixedBitSet)tf.GetDocIdSet(reader);
    Assert.AreEqual(0, bits.Cardinality(), "Must match nothing");

    tf.AddTerm(new Term(fieldName, "20"));
    bits = (FixedBitSet)tf.GetDocIdSet(reader);
    Assert.AreEqual(1, bits.Cardinality(), "Must match 1");

    tf.AddTerm(new Term(fieldName, "10"));
    bits = (FixedBitSet)tf.GetDocIdSet(reader);
    Assert.AreEqual(2, bits.Cardinality(), "Must match 2");

    tf.AddTerm(new Term(fieldName, "00"));
    bits = (FixedBitSet)tf.GetDocIdSet(reader);
    Assert.AreEqual(2, bits.Cardinality(), "Must match 2");

    reader.Close();
    rd.Close();
}
public void TestSkipField()
{
    Directory dir = NewDirectory();
    RandomIndexWriter w = new RandomIndexWriter(
#if FEATURE_INSTANCE_TESTDATA_INITIALIZATION
        this,
#endif
        Random, dir);
    int num = AtLeast(10);
    var terms = new JCG.HashSet<Term>();
    for (int i = 0; i < num; i++)
    {
        string field = "field" + Random.Next(100);
        terms.Add(new Term(field, "content1"));
        Document doc = new Document();
        doc.Add(NewStringField(field, "content1", Field.Store.YES));
        w.AddDocument(doc);
    }
    int randomFields = Random.Next(10);
    for (int i = 0; i < randomFields; i++)
    {
        while (true)
        {
            string field = "field" + Random.Next(100);
            Term t = new Term(field, "content1");
            if (!terms.Contains(t))
            {
                terms.Add(t);
                break;
            }
        }
    }
    w.ForceMerge(1);
    IndexReader reader = w.GetReader();
    w.Dispose();

    assertEquals(1, reader.Leaves.size());
    AtomicReaderContext context = reader.Leaves.First();
    TermsFilter tf = new TermsFilter(terms.ToList());
    FixedBitSet bits = (FixedBitSet)tf.GetDocIdSet(context, context.AtomicReader.LiveDocs);
    assertEquals(context.Reader.NumDocs, bits.Cardinality());
    reader.Dispose();
    dir.Dispose();
}
public override DocIdSet GetDocIdSet(AtomicReaderContext context, IBits acceptDocs)
{
    Assert.IsNull(acceptDocs, "acceptDocs should be null, as we have an index without deletions");
    FixedBitSet set = new FixedBitSet(context.Reader.MaxDoc);
    int docBase = context.DocBase;
    int limit = docBase + context.Reader.MaxDoc;
    for (int index = 0; index < Docs.Length; index++)
    {
        int docId = Docs[index];
        if (docId >= docBase && docId < limit)
        {
            set.Set(docId - docBase);
        }
    }
    return set.Cardinality() == 0 ? null : set;
}
/// <summary>
/// pp was just advanced. If that caused a repeater collision, resolve by advancing the lesser
/// of the two colliding pps. Note that there can only be one collision, as by the initialization
/// there were no collisions before pp was advanced.
/// </summary>
private bool AdvanceRpts(PhrasePositions pp)
{
    if (pp.rptGroup < 0)
    {
        return true; // not a repeater
    }
    PhrasePositions[] rg = rptGroups[pp.rptGroup];
    FixedBitSet bits = new FixedBitSet(rg.Length); // for re-queuing after collisions are resolved
    int k0 = pp.rptInd;
    int k;
    while ((k = Collide(pp)) >= 0)
    {
        pp = Lesser(pp, rg[k]); // always advance the lesser of the (only) two colliding pps
        if (!AdvancePP(pp))
        {
            return false; // exhausted
        }
        if (k != k0) // careful: mark only those currently in the queue
        {
            bits = FixedBitSet.EnsureCapacity(bits, k);
            bits.Set(k); // mark that pp2 needs to be re-queued
        }
    }
    // collisions resolved, now re-queue
    // empty (partially) the queue until seeing all pps advanced for resolving collisions
    int n = 0;
    // TODO would be good if we can avoid calling cardinality() in each iteration!
    int numBits = bits.Length; // largest bit we set
    while (bits.Cardinality() > 0)
    {
        PhrasePositions pp2 = pq.Pop();
        rptStack[n++] = pp2;
        if (pp2.rptGroup >= 0 && pp2.rptInd < numBits && bits.Get(pp2.rptInd)) // this bit may not have been set
        {
            bits.Clear(pp2.rptInd);
        }
    }
    // add back to queue
    for (int i = n - 1; i >= 0; i--)
    {
        pq.Add(rptStack[i]);
    }
    return true;
}
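The reassignment bits = FixedBitSet.EnsureCapacity(bits, k) above matters because, judging from this usage, EnsureCapacity may hand back a new, larger instance rather than growing the original in place. A small hedged illustration of that pattern (assumed semantics, hypothetical wrapper class):

using Lucene.Net.Util; // FixedBitSet

internal static class EnsureCapacitySketch
{
    // Assumed semantics, mirroring AdvanceRpts above: capture the return value of
    // EnsureCapacity before setting a bit beyond the original capacity.
    public static int GrowAndCount()
    {
        FixedBitSet bits = new FixedBitSet(4);
        bits = FixedBitSet.EnsureCapacity(bits, 100); // bit 100 is now addressable
        bits.Set(100);
        return bits.Cardinality(); // 1, regardless of how the set was grown
    }
}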
public virtual void TestBooleanScorerMax()
{
    Directory dir = NewDirectory();
    RandomIndexWriter riw = new RandomIndexWriter(Random(), dir,
        NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random())));
    int docCount = AtLeast(10000);
    for (int i = 0; i < docCount; i++)
    {
        Document doc = new Document();
        doc.Add(NewField("field", "a", TextField.TYPE_NOT_STORED));
        riw.AddDocument(doc);
    }
    riw.ForceMerge(1);
    IndexReader r = riw.Reader;
    riw.Dispose();

    IndexSearcher s = NewSearcher(r);
    BooleanQuery bq = new BooleanQuery();
    bq.Add(new TermQuery(new Term("field", "a")), BooleanClause.Occur.SHOULD);
    bq.Add(new TermQuery(new Term("field", "a")), BooleanClause.Occur.SHOULD);

    Weight w = s.CreateNormalizedWeight(bq);
    Assert.AreEqual(1, s.IndexReader.Leaves.Count);
    BulkScorer scorer = w.BulkScorer(s.IndexReader.Leaves[0], false, null);

    FixedBitSet hits = new FixedBitSet(docCount);
    AtomicInteger end = new AtomicInteger();
    Collector c = new CollectorAnonymousInnerClassHelper(this, scorer, hits, end);

    while (end.Get() < docCount)
    {
        int inc = TestUtil.NextInt(Random(), 1, 1000);
        end.AddAndGet(inc);
        scorer.Score(c, end.Get());
    }

    Assert.AreEqual(docCount, hits.Cardinality());
    r.Dispose();
    dir.Dispose();
}
public DocumentFilteredAtomicIndexReader(AtomicReaderContext context, Filter preserveFilter, bool negateFilter)
    : base(context.AtomicReader)
{
    int maxDoc = m_input.MaxDoc;
    FixedBitSet bits = new FixedBitSet(maxDoc);
    // ignore livedocs here, as we filter them later:
    DocIdSet docs = preserveFilter.GetDocIdSet(context, null);
    if (docs != null)
    {
        DocIdSetIterator it = docs.GetIterator();
        if (it != null)
        {
            bits.Or(it);
        }
    }
    if (negateFilter)
    {
        bits.Flip(0, maxDoc);
    }
    if (m_input.HasDeletions)
    {
        IBits oldLiveDocs = m_input.LiveDocs;
        if (Debugging.AssertsEnabled)
        {
            Debugging.Assert(oldLiveDocs != null);
        }
        DocIdSetIterator it = bits.GetIterator();
        for (int i = it.NextDoc(); i < maxDoc; i = it.NextDoc())
        {
            if (!oldLiveDocs.Get(i))
            {
                // we can safely modify the current bit, as the iterator already stepped over it:
                bits.Clear(i);
            }
        }
    }
    this.liveDocs = bits;
    this.numDocs = bits.Cardinality();
}
public override void AddSortedField(FieldInfo field, IEnumerable<BytesRef> values, IEnumerable<long?> docToOrd)
{
    int valueCount = 0;
    BytesRef lastValue = null;
    foreach (BytesRef b in values)
    {
        Debug.Assert(b != null);
        Debug.Assert(b.IsValid());
        if (valueCount > 0)
        {
            Debug.Assert(b.CompareTo(lastValue) > 0);
        }
        lastValue = BytesRef.DeepCopyOf(b);
        valueCount++;
    }
    Debug.Assert(valueCount <= maxDoc);

    FixedBitSet seenOrds = new FixedBitSet(valueCount);
    int count = 0;
    foreach (long? v in docToOrd)
    {
        Debug.Assert(v != null);
        int ord = (int)v.Value;
        Debug.Assert(ord >= -1 && ord < valueCount);
        if (ord >= 0)
        {
            seenOrds.Set(ord);
        }
        count++;
    }
    Debug.Assert(count == maxDoc);
    Debug.Assert(seenOrds.Cardinality() == valueCount);

    CheckIterator(values.GetEnumerator(), valueCount, false);
    CheckIterator(docToOrd.GetEnumerator(), maxDoc, false);
    @in.AddSortedField(field, values, docToOrd);
}
internal virtual void LoadTerms()
{
    PositiveIntOutputs posIntOutputs = PositiveIntOutputs.Singleton;
    Builder<PairOutputs.Pair<long?, PairOutputs.Pair<long?, long?>>> b;
    PairOutputs<long?, long?> outputsInner = new PairOutputs<long?, long?>(posIntOutputs, posIntOutputs);
    PairOutputs<long?, PairOutputs.Pair<long?, long?>> outputs =
        new PairOutputs<long?, PairOutputs.Pair<long?, long?>>(posIntOutputs, outputsInner);
    b = new Builder<PairOutputs.Pair<long?, PairOutputs.Pair<long?, long?>>>(FST.INPUT_TYPE.BYTE1, outputs);
    IndexInput @in = (IndexInput)outerInstance._input.Clone();
    @in.Seek(termsStart);

    BytesRef lastTerm = new BytesRef(10);
    long lastDocsStart = -1;
    int docFreq = 0;
    long totalTermFreq = 0;
    FixedBitSet visitedDocs = new FixedBitSet(maxDoc);
    IntsRef scratchIntsRef = new IntsRef();

    while (true)
    {
        SimpleTextUtil.ReadLine(@in, scratch);
        if (scratch.Equals(END) || StringHelper.StartsWith(scratch, FIELD))
        {
            if (lastDocsStart != -1)
            {
                b.Add(Util.ToIntsRef(lastTerm, scratchIntsRef),
                    outputs.NewPair(lastDocsStart, outputsInner.NewPair((long)docFreq, totalTermFreq)));
                sumTotalTermFreq += totalTermFreq;
            }
            break;
        }
        else if (StringHelper.StartsWith(scratch, DOC))
        {
            docFreq++;
            sumDocFreq++;
            UnicodeUtil.UTF8toUTF16(scratch.Bytes, scratch.Offset + DOC.Length, scratch.Length - DOC.Length, scratchUTF16);
            int docID = ArrayUtil.ParseInt(scratchUTF16.Chars, 0, scratchUTF16.Length);
            visitedDocs.Set(docID);
        }
        else if (StringHelper.StartsWith(scratch, FREQ))
        {
            UnicodeUtil.UTF8toUTF16(scratch.Bytes, scratch.Offset + FREQ.Length, scratch.Length - FREQ.Length, scratchUTF16);
            totalTermFreq += ArrayUtil.ParseInt(scratchUTF16.Chars, 0, scratchUTF16.Length);
        }
        else if (StringHelper.StartsWith(scratch, TERM))
        {
            if (lastDocsStart != -1)
            {
                b.Add(Util.ToIntsRef(lastTerm, scratchIntsRef),
                    outputs.NewPair(lastDocsStart, outputsInner.NewPair((long)docFreq, totalTermFreq)));
            }
            lastDocsStart = @in.FilePointer;
            int len = scratch.Length - TERM.Length;
            if (len > lastTerm.Length)
            {
                lastTerm.Grow(len);
            }
            Array.Copy(scratch.Bytes, TERM.Length, lastTerm.Bytes, 0, len);
            lastTerm.Length = len;
            docFreq = 0;
            sumTotalTermFreq += totalTermFreq;
            totalTermFreq = 0;
            termCount++;
        }
    }
    docCount = visitedDocs.Cardinality();
    fst = b.Finish();
}
private static void CheckSortedDocValues(string fieldName, AtomicReader reader, SortedDocValues dv, Bits docsWithField)
{
    CheckBinaryDocValues(fieldName, reader, dv, docsWithField);
    int maxOrd = dv.ValueCount - 1;
    FixedBitSet seenOrds = new FixedBitSet(dv.ValueCount);
    int maxOrd2 = -1;
    for (int i = 0; i < reader.MaxDoc; i++)
    {
        int ord = dv.GetOrd(i);
        if (ord == -1)
        {
            if (docsWithField.Get(i))
            {
                throw new Exception("dv for field: " + fieldName + " has -1 ord but is not marked missing for doc: " + i);
            }
        }
        else if (ord < -1 || ord > maxOrd)
        {
            throw new Exception("ord out of bounds: " + ord);
        }
        else
        {
            if (!docsWithField.Get(i))
            {
                throw new Exception("dv for field: " + fieldName + " is missing but has ord=" + ord + " for doc: " + i);
            }
            maxOrd2 = Math.Max(maxOrd2, ord);
            seenOrds.Set(ord);
        }
    }
    if (maxOrd != maxOrd2)
    {
        throw new Exception("dv for field: " + fieldName + " reports wrong maxOrd=" + maxOrd + " but this is not the case: " + maxOrd2);
    }
    if (seenOrds.Cardinality() != dv.ValueCount)
    {
        throw new Exception("dv for field: " + fieldName + " has holes in its ords, valueCount=" + dv.ValueCount + " but only used: " + seenOrds.Cardinality());
    }
    BytesRef lastValue = null;
    BytesRef scratch = new BytesRef();
    for (int i = 0; i <= maxOrd; i++)
    {
        dv.LookupOrd(i, scratch);
        Debug.Assert(scratch.Valid);
        if (lastValue != null)
        {
            if (scratch.CompareTo(lastValue) <= 0)
            {
                throw new Exception("dv for field: " + fieldName + " has ords out of order: " + lastValue + " >=" + scratch);
            }
        }
        lastValue = BytesRef.DeepCopyOf(scratch);
    }
}
/* Walk through all unique text tokens (Posting * instances) found in this field and serialize them * into a single RAM segment. */ internal void Flush(string fieldName, FieldsConsumer consumer, SegmentWriteState state) { if (!fieldInfo.Indexed) { return; // nothing to flush, don't bother the codec with the unindexed field } TermsConsumer termsConsumer = consumer.AddField(fieldInfo); IComparer <BytesRef> termComp = termsConsumer.Comparator; // CONFUSING: this.indexOptions holds the index options // that were current when we first saw this field. But // it's possible this has changed, eg when other // documents are indexed that cause a "downgrade" of the // IndexOptions. So we must decode the in-RAM buffer // according to this.indexOptions, but then write the // new segment to the directory according to // currentFieldIndexOptions: FieldInfo.IndexOptions?currentFieldIndexOptions = fieldInfo.FieldIndexOptions; Debug.Assert(currentFieldIndexOptions != null); bool writeTermFreq = currentFieldIndexOptions >= FieldInfo.IndexOptions.DOCS_AND_FREQS; bool writePositions = currentFieldIndexOptions >= FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS; bool writeOffsets = currentFieldIndexOptions >= FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS; bool readTermFreq = this.HasFreq; bool readPositions = this.HasProx; bool readOffsets = this.HasOffsets; //System.out.println("flush readTF=" + readTermFreq + " readPos=" + readPositions + " readOffs=" + readOffsets); // Make sure FieldInfo.update is working correctly!: Debug.Assert(!writeTermFreq || readTermFreq); Debug.Assert(!writePositions || readPositions); Debug.Assert(!writeOffsets || readOffsets); Debug.Assert(!writeOffsets || writePositions); IDictionary <Term, int?> segDeletes; if (state.SegUpdates != null && state.SegUpdates.Terms.Count > 0) { segDeletes = state.SegUpdates.Terms; } else { segDeletes = null; } int[] termIDs = TermsHashPerField.SortPostings(termComp); int numTerms = TermsHashPerField.BytesHash.Size(); BytesRef text = new BytesRef(); FreqProxPostingsArray postings = (FreqProxPostingsArray)TermsHashPerField.PostingsArray; ByteSliceReader freq = new ByteSliceReader(); ByteSliceReader prox = new ByteSliceReader(); FixedBitSet visitedDocs = new FixedBitSet(state.SegmentInfo.DocCount); long sumTotalTermFreq = 0; long sumDocFreq = 0; Term protoTerm = new Term(fieldName); for (int i = 0; i < numTerms; i++) { int termID = termIDs[i]; // Get BytesRef int textStart = postings.TextStarts[termID]; TermsHashPerField.BytePool.SetBytesRef(text, textStart); TermsHashPerField.InitReader(freq, termID, 0); if (readPositions || readOffsets) { TermsHashPerField.InitReader(prox, termID, 1); } // TODO: really TermsHashPerField should take over most // of this loop, including merge sort of terms from // multiple threads and interacting with the // TermsConsumer, only calling out to us (passing us the // DocsConsumer) to handle delivery of docs/positions PostingsConsumer postingsConsumer = termsConsumer.StartTerm(text); int?delDocLimit; if (segDeletes != null) { protoTerm.Bytes_Renamed = text; int?docIDUpto; segDeletes.TryGetValue(protoTerm, out docIDUpto); if (docIDUpto != null) { delDocLimit = docIDUpto; } else { delDocLimit = 0; } } else { delDocLimit = 0; } // Now termStates has numToMerge FieldMergeStates // which all share the same term. Now we must // interleave the docID streams. 
int docFreq = 0; long totalTermFreq = 0; int docID = 0; while (true) { //System.out.println(" cycle"); int termFreq; if (freq.Eof()) { if (postings.LastDocCodes[termID] != -1) { // Return last doc docID = postings.LastDocIDs[termID]; if (readTermFreq) { termFreq = postings.TermFreqs[termID]; } else { termFreq = -1; } postings.LastDocCodes[termID] = -1; } else { // EOF break; } } else { int code = freq.ReadVInt(); if (!readTermFreq) { docID += code; termFreq = -1; } else { docID += (int)((uint)code >> 1); if ((code & 1) != 0) { termFreq = 1; } else { termFreq = freq.ReadVInt(); } } Debug.Assert(docID != postings.LastDocIDs[termID]); } docFreq++; Debug.Assert(docID < state.SegmentInfo.DocCount, "doc=" + docID + " maxDoc=" + state.SegmentInfo.DocCount); // NOTE: we could check here if the docID was // deleted, and skip it. However, this is somewhat // dangerous because it can yield non-deterministic // behavior since we may see the docID before we see // the term that caused it to be deleted. this // would mean some (but not all) of its postings may // make it into the index, which'd alter the docFreq // for those terms. We could fix this by doing two // passes, ie first sweep marks all del docs, and // 2nd sweep does the real flush, but I suspect // that'd add too much time to flush. visitedDocs.Set(docID); postingsConsumer.StartDoc(docID, writeTermFreq ? termFreq : -1); if (docID < delDocLimit) { // Mark it deleted. TODO: we could also skip // writing its postings; this would be // deterministic (just for this Term's docs). // TODO: can we do this reach-around in a cleaner way???? if (state.LiveDocs == null) { state.LiveDocs = DocState.DocWriter.Codec.LiveDocsFormat().NewLiveDocs(state.SegmentInfo.DocCount); } if (state.LiveDocs.Get(docID)) { state.DelCountOnFlush++; state.LiveDocs.Clear(docID); } } totalTermFreq += termFreq; // Carefully copy over the prox + payload info, // changing the format to match Lucene's segment // format. if (readPositions || readOffsets) { // we did record positions (& maybe payload) and/or offsets int position = 0; int offset = 0; for (int j = 0; j < termFreq; j++) { BytesRef thisPayload; if (readPositions) { int code = prox.ReadVInt(); position += (int)((uint)code >> 1); if ((code & 1) != 0) { // this position has a payload int payloadLength = prox.ReadVInt(); if (Payload == null) { Payload = new BytesRef(); Payload.Bytes = new sbyte[payloadLength]; } else if (Payload.Bytes.Length < payloadLength) { Payload.Grow(payloadLength); } prox.ReadBytes(Payload.Bytes, 0, payloadLength); Payload.Length = payloadLength; thisPayload = Payload; } else { thisPayload = null; } if (readOffsets) { int startOffset = offset + prox.ReadVInt(); int endOffset = startOffset + prox.ReadVInt(); if (writePositions) { if (writeOffsets) { Debug.Assert(startOffset >= 0 && endOffset >= startOffset, "startOffset=" + startOffset + ",endOffset=" + endOffset + ",offset=" + offset); postingsConsumer.AddPosition(position, thisPayload, startOffset, endOffset); } else { postingsConsumer.AddPosition(position, thisPayload, -1, -1); } } offset = startOffset; } else if (writePositions) { postingsConsumer.AddPosition(position, thisPayload, -1, -1); } } } } postingsConsumer.FinishDoc(); } termsConsumer.FinishTerm(text, new TermStats(docFreq, writeTermFreq ? totalTermFreq : -1)); sumTotalTermFreq += totalTermFreq; sumDocFreq += docFreq; } termsConsumer.Finish(writeTermFreq ? sumTotalTermFreq : -1, sumDocFreq, visitedDocs.Cardinality()); }
public virtual void Merge(MergeState mergeState, IndexOptions indexOptions, TermsEnum termsEnum) { BytesRef term; Debug.Assert(termsEnum != null); long sumTotalTermFreq = 0; long sumDocFreq = 0; long sumDFsinceLastAbortCheck = 0; FixedBitSet visitedDocs = new FixedBitSet(mergeState.SegmentInfo.DocCount); if (indexOptions == IndexOptions.DOCS_ONLY) { if (docsEnum == null) { docsEnum = new MappingMultiDocsEnum(); } docsEnum.MergeState = mergeState; MultiDocsEnum docsEnumIn = null; while ((term = termsEnum.Next()) != null) { // We can pass null for liveDocs, because the // mapping enum will skip the non-live docs: docsEnumIn = (MultiDocsEnum)termsEnum.Docs(null, docsEnumIn, DocsFlags.NONE); if (docsEnumIn != null) { docsEnum.Reset(docsEnumIn); PostingsConsumer postingsConsumer = StartTerm(term); TermStats stats = postingsConsumer.Merge(mergeState, indexOptions, docsEnum, visitedDocs); if (stats.DocFreq > 0) { FinishTerm(term, stats); sumTotalTermFreq += stats.DocFreq; sumDFsinceLastAbortCheck += stats.DocFreq; sumDocFreq += stats.DocFreq; if (sumDFsinceLastAbortCheck > 60000) { mergeState.CheckAbort.Work(sumDFsinceLastAbortCheck / 5.0); sumDFsinceLastAbortCheck = 0; } } } } } else if (indexOptions == IndexOptions.DOCS_AND_FREQS) { if (docsAndFreqsEnum == null) { docsAndFreqsEnum = new MappingMultiDocsEnum(); } docsAndFreqsEnum.MergeState = mergeState; MultiDocsEnum docsAndFreqsEnumIn = null; while ((term = termsEnum.Next()) != null) { // We can pass null for liveDocs, because the // mapping enum will skip the non-live docs: docsAndFreqsEnumIn = (MultiDocsEnum)termsEnum.Docs(null, docsAndFreqsEnumIn); Debug.Assert(docsAndFreqsEnumIn != null); docsAndFreqsEnum.Reset(docsAndFreqsEnumIn); PostingsConsumer postingsConsumer = StartTerm(term); TermStats stats = postingsConsumer.Merge(mergeState, indexOptions, docsAndFreqsEnum, visitedDocs); if (stats.DocFreq > 0) { FinishTerm(term, stats); sumTotalTermFreq += stats.TotalTermFreq; sumDFsinceLastAbortCheck += stats.DocFreq; sumDocFreq += stats.DocFreq; if (sumDFsinceLastAbortCheck > 60000) { mergeState.CheckAbort.Work(sumDFsinceLastAbortCheck / 5.0); sumDFsinceLastAbortCheck = 0; } } } } else if (indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) { if (postingsEnum == null) { postingsEnum = new MappingMultiDocsAndPositionsEnum(); } postingsEnum.MergeState = mergeState; MultiDocsAndPositionsEnum postingsEnumIn = null; while ((term = termsEnum.Next()) != null) { // We can pass null for liveDocs, because the // mapping enum will skip the non-live docs: postingsEnumIn = (MultiDocsAndPositionsEnum)termsEnum.DocsAndPositions(null, postingsEnumIn, DocsAndPositionsFlags.PAYLOADS); Debug.Assert(postingsEnumIn != null); postingsEnum.Reset(postingsEnumIn); PostingsConsumer postingsConsumer = StartTerm(term); TermStats stats = postingsConsumer.Merge(mergeState, indexOptions, postingsEnum, visitedDocs); if (stats.DocFreq > 0) { FinishTerm(term, stats); sumTotalTermFreq += stats.TotalTermFreq; sumDFsinceLastAbortCheck += stats.DocFreq; sumDocFreq += stats.DocFreq; if (sumDFsinceLastAbortCheck > 60000) { mergeState.CheckAbort.Work(sumDFsinceLastAbortCheck / 5.0); sumDFsinceLastAbortCheck = 0; } } } } else { Debug.Assert(indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); if (postingsEnum == null) { postingsEnum = new MappingMultiDocsAndPositionsEnum(); } postingsEnum.MergeState = mergeState; MultiDocsAndPositionsEnum postingsEnumIn = null; while ((term = termsEnum.Next()) != null) { // We can pass null for liveDocs, because the // mapping 
enum will skip the non-live docs: postingsEnumIn = (MultiDocsAndPositionsEnum)termsEnum.DocsAndPositions(null, postingsEnumIn); Debug.Assert(postingsEnumIn != null); postingsEnum.Reset(postingsEnumIn); PostingsConsumer postingsConsumer = StartTerm(term); TermStats stats = postingsConsumer.Merge(mergeState, indexOptions, postingsEnum, visitedDocs); if (stats.DocFreq > 0) { FinishTerm(term, stats); sumTotalTermFreq += stats.TotalTermFreq; sumDFsinceLastAbortCheck += stats.DocFreq; sumDocFreq += stats.DocFreq; if (sumDFsinceLastAbortCheck > 60000) { mergeState.CheckAbort.Work(sumDFsinceLastAbortCheck / 5.0); sumDFsinceLastAbortCheck = 0; } } } } Finish(indexOptions == IndexOptions.DOCS_ONLY ? -1 : sumTotalTermFreq, sumDocFreq, visitedDocs.Cardinality()); }
/// <summary> /// Default merge impl </summary> public virtual void Merge(MergeState mergeState, FieldInfo.IndexOptions? indexOptions, TermsEnum termsEnum) { BytesRef term; Debug.Assert(termsEnum != null); long sumTotalTermFreq = 0; long sumDocFreq = 0; long sumDFsinceLastAbortCheck = 0; FixedBitSet visitedDocs = new FixedBitSet(mergeState.SegmentInfo.DocCount); if (indexOptions == FieldInfo.IndexOptions.DOCS_ONLY) { if (DocsEnum == null) { DocsEnum = new MappingMultiDocsEnum(); } DocsEnum.MergeState = mergeState; MultiDocsEnum docsEnumIn = null; while ((term = termsEnum.Next()) != null) { // We can pass null for liveDocs, because the // mapping enum will skip the non-live docs: docsEnumIn = (MultiDocsEnum)termsEnum.Docs(null, docsEnumIn, Index.DocsEnum.FLAG_NONE); if (docsEnumIn != null) { DocsEnum.Reset(docsEnumIn); PostingsConsumer postingsConsumer = StartTerm(term); TermStats stats = postingsConsumer.Merge(mergeState, indexOptions, DocsEnum, visitedDocs); if (stats.DocFreq > 0) { FinishTerm(term, stats); sumTotalTermFreq += stats.DocFreq; sumDFsinceLastAbortCheck += stats.DocFreq; sumDocFreq += stats.DocFreq; if (sumDFsinceLastAbortCheck > 60000) { mergeState.checkAbort.Work(sumDFsinceLastAbortCheck / 5.0); sumDFsinceLastAbortCheck = 0; } } } } } else if (indexOptions == FieldInfo.IndexOptions.DOCS_AND_FREQS) { if (DocsAndFreqsEnum == null) { DocsAndFreqsEnum = new MappingMultiDocsEnum(); } DocsAndFreqsEnum.MergeState = mergeState; MultiDocsEnum docsAndFreqsEnumIn = null; while ((term = termsEnum.Next()) != null) { // We can pass null for liveDocs, because the // mapping enum will skip the non-live docs: docsAndFreqsEnumIn = (MultiDocsEnum)termsEnum.Docs(null, docsAndFreqsEnumIn); Debug.Assert(docsAndFreqsEnumIn != null); DocsAndFreqsEnum.Reset(docsAndFreqsEnumIn); PostingsConsumer postingsConsumer = StartTerm(term); TermStats stats = postingsConsumer.Merge(mergeState, indexOptions, DocsAndFreqsEnum, visitedDocs); if (stats.DocFreq > 0) { FinishTerm(term, stats); sumTotalTermFreq += stats.TotalTermFreq; sumDFsinceLastAbortCheck += stats.DocFreq; sumDocFreq += stats.DocFreq; if (sumDFsinceLastAbortCheck > 60000) { mergeState.checkAbort.Work(sumDFsinceLastAbortCheck / 5.0); sumDFsinceLastAbortCheck = 0; } } } } else if (indexOptions == FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) { if (PostingsEnum == null) { PostingsEnum = new MappingMultiDocsAndPositionsEnum(); } PostingsEnum.MergeState = mergeState; MultiDocsAndPositionsEnum postingsEnumIn = null; while ((term = termsEnum.Next()) != null) { // We can pass null for liveDocs, because the // mapping enum will skip the non-live docs: postingsEnumIn = (MultiDocsAndPositionsEnum)termsEnum.DocsAndPositions(null, postingsEnumIn, DocsAndPositionsEnum.FLAG_PAYLOADS); Debug.Assert(postingsEnumIn != null); PostingsEnum.Reset(postingsEnumIn); PostingsConsumer postingsConsumer = StartTerm(term); TermStats stats = postingsConsumer.Merge(mergeState, indexOptions, PostingsEnum, visitedDocs); if (stats.DocFreq > 0) { FinishTerm(term, stats); sumTotalTermFreq += stats.TotalTermFreq; sumDFsinceLastAbortCheck += stats.DocFreq; sumDocFreq += stats.DocFreq; if (sumDFsinceLastAbortCheck > 60000) { mergeState.checkAbort.Work(sumDFsinceLastAbortCheck / 5.0); sumDFsinceLastAbortCheck = 0; } } } } else { Debug.Assert(indexOptions == FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); if (PostingsEnum == null) { PostingsEnum = new MappingMultiDocsAndPositionsEnum(); } PostingsEnum.MergeState = mergeState; MultiDocsAndPositionsEnum 
postingsEnumIn = null; while ((term = termsEnum.Next()) != null) { // We can pass null for liveDocs, because the // mapping enum will skip the non-live docs: postingsEnumIn = (MultiDocsAndPositionsEnum)termsEnum.DocsAndPositions(null, postingsEnumIn); Debug.Assert(postingsEnumIn != null); PostingsEnum.Reset(postingsEnumIn); PostingsConsumer postingsConsumer = StartTerm(term); TermStats stats = postingsConsumer.Merge(mergeState, indexOptions, PostingsEnum, visitedDocs); if (stats.DocFreq > 0) { FinishTerm(term, stats); sumTotalTermFreq += stats.TotalTermFreq; sumDFsinceLastAbortCheck += stats.DocFreq; sumDocFreq += stats.DocFreq; if (sumDFsinceLastAbortCheck > 60000) { mergeState.checkAbort.Work(sumDFsinceLastAbortCheck / 5.0); sumDFsinceLastAbortCheck = 0; } } } } Finish(indexOptions == FieldInfo.IndexOptions.DOCS_ONLY ? -1 : sumTotalTermFreq, sumDocFreq, visitedDocs.Cardinality()); }
public void TestFieldNotPresent()
{
    Directory dir = NewDirectory();
    RandomIndexWriter w = new RandomIndexWriter(Random, dir, Similarity, TimeZone);
    int num = AtLeast(3);
    int skip = Random.Next(num);
    var terms = new List<Term>();
    for (int i = 0; i < num; i++)
    {
        terms.Add(new Term("field" + i, "content1"));
        Document doc = new Document();
        if (skip == i)
        {
            continue;
        }
        doc.Add(NewStringField("field" + i, "content1", Field.Store.YES));
        w.AddDocument(doc);
    }
    w.ForceMerge(1);
    IndexReader reader = w.GetReader();
    w.Dispose();

    assertEquals(1, reader.Leaves.size());
    AtomicReaderContext context = reader.Leaves.First();
    TermsFilter tf = new TermsFilter(terms);
    FixedBitSet bits = (FixedBitSet)tf.GetDocIdSet(context, context.AtomicReader.LiveDocs);
    assertEquals("Must be num fields - 1 since we skip only one field", num - 1, bits.Cardinality());
    reader.Dispose();
    dir.Dispose();
}
private void LoadTerms()
{
    var posIntOutputs = PositiveInt32Outputs.Singleton;
    var outputsInner = new PairOutputs<long?, long?>(posIntOutputs, posIntOutputs);
    var outputs = new PairOutputs<long?, PairOutputs<long?, long?>.Pair>(posIntOutputs, outputsInner);
    // honestly, wtf kind of generic mess is this.
    var b = new Builder<PairOutputs<long?, PairOutputs<long?, long?>.Pair>.Pair>(FST.INPUT_TYPE.BYTE1, outputs);
    var input = (IndexInput)_outerInstance._input.Clone();
    input.Seek(_termsStart);

    var lastTerm = new BytesRef(10);
    long lastDocsStart = -1;
    int docFreq = 0;
    long totalTermFreq = 0;
    var visitedDocs = new FixedBitSet(_maxDoc);
    var scratchIntsRef = new Int32sRef();

    while (true)
    {
        SimpleTextUtil.ReadLine(input, _scratch);
        if (_scratch.Equals(SimpleTextFieldsWriter.END) || StringHelper.StartsWith(_scratch, SimpleTextFieldsWriter.FIELD))
        {
            if (lastDocsStart != -1)
            {
                b.Add(Util.ToInt32sRef(lastTerm, scratchIntsRef),
                    outputs.NewPair(lastDocsStart, outputsInner.NewPair(docFreq, totalTermFreq)));
                _sumTotalTermFreq += totalTermFreq;
            }
            break;
        }

        if (StringHelper.StartsWith(_scratch, SimpleTextFieldsWriter.DOC))
        {
            docFreq++;
            _sumDocFreq++;
            UnicodeUtil.UTF8toUTF16(_scratch.Bytes, _scratch.Offset + SimpleTextFieldsWriter.DOC.Length,
                _scratch.Length - SimpleTextFieldsWriter.DOC.Length, _scratchUtf16);
            int docId = ArrayUtil.ParseInt32(_scratchUtf16.Chars, 0, _scratchUtf16.Length);
            visitedDocs.Set(docId);
        }
        else if (StringHelper.StartsWith(_scratch, SimpleTextFieldsWriter.FREQ))
        {
            UnicodeUtil.UTF8toUTF16(_scratch.Bytes, _scratch.Offset + SimpleTextFieldsWriter.FREQ.Length,
                _scratch.Length - SimpleTextFieldsWriter.FREQ.Length, _scratchUtf16);
            totalTermFreq += ArrayUtil.ParseInt32(_scratchUtf16.Chars, 0, _scratchUtf16.Length);
        }
        else if (StringHelper.StartsWith(_scratch, SimpleTextFieldsWriter.TERM))
        {
            if (lastDocsStart != -1)
            {
                b.Add(Util.ToInt32sRef(lastTerm, scratchIntsRef),
                    outputs.NewPair(lastDocsStart, outputsInner.NewPair(docFreq, totalTermFreq)));
            }
            lastDocsStart = input.GetFilePointer();
            int len = _scratch.Length - SimpleTextFieldsWriter.TERM.Length;
            if (len > lastTerm.Length)
            {
                lastTerm.Grow(len);
            }
            Array.Copy(_scratch.Bytes, SimpleTextFieldsWriter.TERM.Length, lastTerm.Bytes, 0, len);
            lastTerm.Length = len;
            docFreq = 0;
            _sumTotalTermFreq += totalTermFreq;
            totalTermFreq = 0;
            _termCount++;
        }
    }
    _docCount = visitedDocs.Cardinality();
    _fst = b.Finish();
}
/// <summary> /// checks Fields api is consistent with itself. /// searcher is optional, to verify with queries. Can be null. /// </summary> private static Status.TermIndexStatus CheckFields(Fields fields, Bits liveDocs, int maxDoc, FieldInfos fieldInfos, bool doPrint, bool isVectors, StreamWriter infoStream, bool verbose) { // TODO: we should probably return our own stats thing...?! Status.TermIndexStatus status = new Status.TermIndexStatus(); int computedFieldCount = 0; if (fields == null) { Msg(infoStream, "OK [no fields/terms]"); return status; } DocsEnum docs = null; DocsEnum docsAndFreqs = null; DocsAndPositionsEnum postings = null; string lastField = null; foreach (string field in fields) { // MultiFieldsEnum relies upon this order... if (lastField != null && field.CompareTo(lastField) <= 0) { throw new Exception("fields out of order: lastField=" + lastField + " field=" + field); } lastField = field; // check that the field is in fieldinfos, and is indexed. // TODO: add a separate test to check this for different reader impls FieldInfo fieldInfo = fieldInfos.FieldInfo(field); if (fieldInfo == null) { throw new Exception("fieldsEnum inconsistent with fieldInfos, no fieldInfos for: " + field); } if (!fieldInfo.Indexed) { throw new Exception("fieldsEnum inconsistent with fieldInfos, isIndexed == false for: " + field); } // TODO: really the codec should not return a field // from FieldsEnum if it has no Terms... but we do // this today: // assert fields.terms(field) != null; computedFieldCount++; Terms terms = fields.Terms(field); if (terms == null) { continue; } bool hasFreqs = terms.HasFreqs(); bool hasPositions = terms.HasPositions(); bool hasPayloads = terms.HasPayloads(); bool hasOffsets = terms.HasOffsets(); // term vectors cannot omit TF: bool expectedHasFreqs = (isVectors || fieldInfo.FieldIndexOptions >= FieldInfo.IndexOptions.DOCS_AND_FREQS); if (hasFreqs != expectedHasFreqs) { throw new Exception("field \"" + field + "\" should have hasFreqs=" + expectedHasFreqs + " but got " + hasFreqs); } if (hasFreqs == false) { if (terms.SumTotalTermFreq != -1) { throw new Exception("field \"" + field + "\" hasFreqs is false, but Terms.getSumTotalTermFreq()=" + terms.SumTotalTermFreq + " (should be -1)"); } } if (!isVectors) { bool expectedHasPositions = fieldInfo.FieldIndexOptions >= FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS; if (hasPositions != expectedHasPositions) { throw new Exception("field \"" + field + "\" should have hasPositions=" + expectedHasPositions + " but got " + hasPositions); } bool expectedHasPayloads = fieldInfo.HasPayloads(); if (hasPayloads != expectedHasPayloads) { throw new Exception("field \"" + field + "\" should have hasPayloads=" + expectedHasPayloads + " but got " + hasPayloads); } bool expectedHasOffsets = fieldInfo.FieldIndexOptions >= FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS; if (hasOffsets != expectedHasOffsets) { throw new Exception("field \"" + field + "\" should have hasOffsets=" + expectedHasOffsets + " but got " + hasOffsets); } } TermsEnum termsEnum = terms.Iterator(null); bool hasOrd = true; long termCountStart = status.DelTermCount + status.TermCount; BytesRef lastTerm = null; IComparer<BytesRef> termComp = terms.Comparator; long sumTotalTermFreq = 0; long sumDocFreq = 0; FixedBitSet visitedDocs = new FixedBitSet(maxDoc); while (true) { BytesRef term = termsEnum.Next(); if (term == null) { break; } Debug.Assert(term.Valid); // make sure terms arrive in order according to // the comp if (lastTerm == null) { lastTerm 
= BytesRef.DeepCopyOf(term); } else { if (termComp.Compare(lastTerm, term) >= 0) { throw new Exception("terms out of order: lastTerm=" + lastTerm + " term=" + term); } lastTerm.CopyBytes(term); } int docFreq = termsEnum.DocFreq(); if (docFreq <= 0) { throw new Exception("docfreq: " + docFreq + " is out of bounds"); } sumDocFreq += docFreq; docs = termsEnum.Docs(liveDocs, docs); postings = termsEnum.DocsAndPositions(liveDocs, postings); if (hasFreqs == false) { if (termsEnum.TotalTermFreq() != -1) { throw new Exception("field \"" + field + "\" hasFreqs is false, but TermsEnum.totalTermFreq()=" + termsEnum.TotalTermFreq() + " (should be -1)"); } } if (hasOrd) { long ord = -1; try { ord = termsEnum.Ord(); } catch (System.NotSupportedException uoe) { hasOrd = false; } if (hasOrd) { long ordExpected = status.DelTermCount + status.TermCount - termCountStart; if (ord != ordExpected) { throw new Exception("ord mismatch: TermsEnum has ord=" + ord + " vs actual=" + ordExpected); } } } DocsEnum docs2; if (postings != null) { docs2 = postings; } else { docs2 = docs; } int lastDoc = -1; int docCount = 0; long totalTermFreq = 0; while (true) { int doc = docs2.NextDoc(); if (doc == DocIdSetIterator.NO_MORE_DOCS) { break; } status.TotFreq++; visitedDocs.Set(doc); int freq = -1; if (hasFreqs) { freq = docs2.Freq(); if (freq <= 0) { throw new Exception("term " + term + ": doc " + doc + ": freq " + freq + " is out of bounds"); } status.TotPos += freq; totalTermFreq += freq; } else { // When a field didn't index freq, it must // consistently "lie" and pretend that freq was // 1: if (docs2.Freq() != 1) { throw new Exception("term " + term + ": doc " + doc + ": freq " + freq + " != 1 when Terms.hasFreqs() is false"); } } docCount++; if (doc <= lastDoc) { throw new Exception("term " + term + ": doc " + doc + " <= lastDoc " + lastDoc); } if (doc >= maxDoc) { throw new Exception("term " + term + ": doc " + doc + " >= maxDoc " + maxDoc); } lastDoc = doc; int lastPos = -1; int lastOffset = 0; if (hasPositions) { for (int j = 0; j < freq; j++) { int pos = postings.NextPosition(); if (pos < 0) { throw new Exception("term " + term + ": doc " + doc + ": pos " + pos + " is out of bounds"); } if (pos < lastPos) { throw new Exception("term " + term + ": doc " + doc + ": pos " + pos + " < lastPos " + lastPos); } lastPos = pos; BytesRef payload = postings.Payload; if (payload != null) { Debug.Assert(payload.Valid); } if (payload != null && payload.Length < 1) { throw new Exception("term " + term + ": doc " + doc + ": pos " + pos + " payload length is out of bounds " + payload.Length); } if (hasOffsets) { int startOffset = postings.StartOffset(); int endOffset = postings.EndOffset(); // NOTE: we cannot enforce any bounds whatsoever on vectors... they were a free-for-all before? 
// but for offsets in the postings lists these checks are fine: they were always enforced by IndexWriter if (!isVectors) { if (startOffset < 0) { throw new Exception("term " + term + ": doc " + doc + ": pos " + pos + ": startOffset " + startOffset + " is out of bounds"); } if (startOffset < lastOffset) { throw new Exception("term " + term + ": doc " + doc + ": pos " + pos + ": startOffset " + startOffset + " < lastStartOffset " + lastOffset); } if (endOffset < 0) { throw new Exception("term " + term + ": doc " + doc + ": pos " + pos + ": endOffset " + endOffset + " is out of bounds"); } if (endOffset < startOffset) { throw new Exception("term " + term + ": doc " + doc + ": pos " + pos + ": endOffset " + endOffset + " < startOffset " + startOffset); } } lastOffset = startOffset; } } } } if (docCount != 0) { status.TermCount++; } else { status.DelTermCount++; } long totalTermFreq2 = termsEnum.TotalTermFreq(); bool hasTotalTermFreq = hasFreqs && totalTermFreq2 != -1; // Re-count if there are deleted docs: if (liveDocs != null) { if (hasFreqs) { DocsEnum docsNoDel = termsEnum.Docs(null, docsAndFreqs); docCount = 0; totalTermFreq = 0; while (docsNoDel.NextDoc() != DocIdSetIterator.NO_MORE_DOCS) { visitedDocs.Set(docsNoDel.DocID()); docCount++; totalTermFreq += docsNoDel.Freq(); } } else { DocsEnum docsNoDel = termsEnum.Docs(null, docs, DocsEnum.FLAG_NONE); docCount = 0; totalTermFreq = -1; while (docsNoDel.NextDoc() != DocIdSetIterator.NO_MORE_DOCS) { visitedDocs.Set(docsNoDel.DocID()); docCount++; } } } if (docCount != docFreq) { throw new Exception("term " + term + " docFreq=" + docFreq + " != tot docs w/o deletions " + docCount); } if (hasTotalTermFreq) { if (totalTermFreq2 <= 0) { throw new Exception("totalTermFreq: " + totalTermFreq2 + " is out of bounds"); } sumTotalTermFreq += totalTermFreq; if (totalTermFreq != totalTermFreq2) { throw new Exception("term " + term + " totalTermFreq=" + totalTermFreq2 + " != recomputed totalTermFreq=" + totalTermFreq); } } // Test skipping if (hasPositions) { for (int idx = 0; idx < 7; idx++) { int skipDocID = (int)(((idx + 1) * (long)maxDoc) / 8); postings = termsEnum.DocsAndPositions(liveDocs, postings); int docID = postings.Advance(skipDocID); if (docID == DocIdSetIterator.NO_MORE_DOCS) { break; } else { if (docID < skipDocID) { throw new Exception("term " + term + ": advance(docID=" + skipDocID + ") returned docID=" + docID); } int freq = postings.Freq(); if (freq <= 0) { throw new Exception("termFreq " + freq + " is out of bounds"); } int lastPosition = -1; int lastOffset = 0; for (int posUpto = 0; posUpto < freq; posUpto++) { int pos = postings.NextPosition(); if (pos < 0) { throw new Exception("position " + pos + " is out of bounds"); } if (pos < lastPosition) { throw new Exception("position " + pos + " is < lastPosition " + lastPosition); } lastPosition = pos; if (hasOffsets) { int startOffset = postings.StartOffset(); int endOffset = postings.EndOffset(); // NOTE: we cannot enforce any bounds whatsoever on vectors... they were a free-for-all before? 
// but for offsets in the postings lists these checks are fine: they were always enforced by IndexWriter if (!isVectors) { if (startOffset < 0) { throw new Exception("term " + term + ": doc " + docID + ": pos " + pos + ": startOffset " + startOffset + " is out of bounds"); } if (startOffset < lastOffset) { throw new Exception("term " + term + ": doc " + docID + ": pos " + pos + ": startOffset " + startOffset + " < lastStartOffset " + lastOffset); } if (endOffset < 0) { throw new Exception("term " + term + ": doc " + docID + ": pos " + pos + ": endOffset " + endOffset + " is out of bounds"); } if (endOffset < startOffset) { throw new Exception("term " + term + ": doc " + docID + ": pos " + pos + ": endOffset " + endOffset + " < startOffset " + startOffset); } } lastOffset = startOffset; } } int nextDocID = postings.NextDoc(); if (nextDocID == DocIdSetIterator.NO_MORE_DOCS) { break; } if (nextDocID <= docID) { throw new Exception("term " + term + ": advance(docID=" + skipDocID + "), then .next() returned docID=" + nextDocID + " vs prev docID=" + docID); } } } } else { for (int idx = 0; idx < 7; idx++) { int skipDocID = (int)(((idx + 1) * (long)maxDoc) / 8); docs = termsEnum.Docs(liveDocs, docs, DocsEnum.FLAG_NONE); int docID = docs.Advance(skipDocID); if (docID == DocIdSetIterator.NO_MORE_DOCS) { break; } else { if (docID < skipDocID) { throw new Exception("term " + term + ": advance(docID=" + skipDocID + ") returned docID=" + docID); } int nextDocID = docs.NextDoc(); if (nextDocID == DocIdSetIterator.NO_MORE_DOCS) { break; } if (nextDocID <= docID) { throw new Exception("term " + term + ": advance(docID=" + skipDocID + "), then .next() returned docID=" + nextDocID + " vs prev docID=" + docID); } } } } } Terms fieldTerms = fields.Terms(field); if (fieldTerms == null) { // Unusual: the FieldsEnum returned a field but // the Terms for that field is null; this should // only happen if it's a ghost field (field with // no terms, eg there used to be terms but all // docs got deleted and then merged away): } else { if (fieldTerms is BlockTreeTermsReader.FieldReader) { BlockTreeTermsReader.Stats stats = ((BlockTreeTermsReader.FieldReader)fieldTerms).ComputeStats(); Debug.Assert(stats != null); if (status.BlockTreeStats == null) { status.BlockTreeStats = new Dictionary<string, BlockTreeTermsReader.Stats>(); } status.BlockTreeStats[field] = stats; } if (sumTotalTermFreq != 0) { long v = fields.Terms(field).SumTotalTermFreq; if (v != -1 && sumTotalTermFreq != v) { throw new Exception("sumTotalTermFreq for field " + field + "=" + v + " != recomputed sumTotalTermFreq=" + sumTotalTermFreq); } } if (sumDocFreq != 0) { long v = fields.Terms(field).SumDocFreq; if (v != -1 && sumDocFreq != v) { throw new Exception("sumDocFreq for field " + field + "=" + v + " != recomputed sumDocFreq=" + sumDocFreq); } } if (fieldTerms != null) { int v = fieldTerms.DocCount; if (v != -1 && visitedDocs.Cardinality() != v) { throw new Exception("docCount for field " + field + "=" + v + " != recomputed docCount=" + visitedDocs.Cardinality()); } } // Test seek to last term: if (lastTerm != null) { if (termsEnum.SeekCeil(lastTerm) != TermsEnum.SeekStatus.FOUND) { throw new Exception("seek to last term " + lastTerm + " failed"); } int expectedDocFreq = termsEnum.DocFreq(); DocsEnum d = termsEnum.Docs(null, null, DocsEnum.FLAG_NONE); int docFreq = 0; while (d.NextDoc() != DocIdSetIterator.NO_MORE_DOCS) { docFreq++; } if (docFreq != expectedDocFreq) { throw new Exception("docFreq for last term " + lastTerm + "=" + expectedDocFreq + 
" != recomputed docFreq=" + docFreq); } } // check unique term count long termCount = -1; if ((status.DelTermCount + status.TermCount) - termCountStart > 0) { termCount = fields.Terms(field).Size(); if (termCount != -1 && termCount != status.DelTermCount + status.TermCount - termCountStart) { throw new Exception("termCount mismatch " + (status.DelTermCount + termCount) + " vs " + (status.TermCount - termCountStart)); } } // Test seeking by ord if (hasOrd && status.TermCount - termCountStart > 0) { int seekCount = (int)Math.Min(10000L, termCount); if (seekCount > 0) { BytesRef[] seekTerms = new BytesRef[seekCount]; // Seek by ord for (int i = seekCount - 1; i >= 0; i--) { long ord = i * (termCount / seekCount); termsEnum.SeekExact(ord); seekTerms[i] = BytesRef.DeepCopyOf(termsEnum.Term()); } // Seek by term long totDocCount = 0; for (int i = seekCount - 1; i >= 0; i--) { if (termsEnum.SeekCeil(seekTerms[i]) != TermsEnum.SeekStatus.FOUND) { throw new Exception("seek to existing term " + seekTerms[i] + " failed"); } docs = termsEnum.Docs(liveDocs, docs, DocsEnum.FLAG_NONE); if (docs == null) { throw new Exception("null DocsEnum from to existing term " + seekTerms[i]); } while (docs.NextDoc() != DocIdSetIterator.NO_MORE_DOCS) { totDocCount++; } } long totDocCountNoDeletes = 0; long totDocFreq = 0; for (int i = 0; i < seekCount; i++) { if (!termsEnum.SeekExact(seekTerms[i])) { throw new Exception("seek to existing term " + seekTerms[i] + " failed"); } totDocFreq += termsEnum.DocFreq(); docs = termsEnum.Docs(null, docs, DocsEnum.FLAG_NONE); if (docs == null) { throw new Exception("null DocsEnum from to existing term " + seekTerms[i]); } while (docs.NextDoc() != DocIdSetIterator.NO_MORE_DOCS) { totDocCountNoDeletes++; } } if (totDocCount > totDocCountNoDeletes) { throw new Exception("more postings with deletes=" + totDocCount + " than without=" + totDocCountNoDeletes); } if (totDocCountNoDeletes != totDocFreq) { throw new Exception("docfreqs=" + totDocFreq + " != recomputed docfreqs=" + totDocCountNoDeletes); } } } } } int fieldCount = fields.Size; if (fieldCount != -1) { if (fieldCount < 0) { throw new Exception("invalid fieldCount: " + fieldCount); } if (fieldCount != computedFieldCount) { throw new Exception("fieldCount mismatch " + fieldCount + " vs recomputed field count " + computedFieldCount); } } // for most implementations, this is boring (just the sum across all fields) // but codecs that don't work per-field like preflex actually implement this, // but don't implement it on Terms, so the check isn't redundant. long uniqueTermCountAllFields = fields.UniqueTermCount; if (uniqueTermCountAllFields != -1 && status.TermCount + status.DelTermCount != uniqueTermCountAllFields) { throw new Exception("termCount mismatch " + uniqueTermCountAllFields + " vs " + (status.TermCount + status.DelTermCount)); } if (doPrint) { Msg(infoStream, "OK [" + status.TermCount + " terms; " + status.TotFreq + " terms/docs pairs; " + status.TotPos + " tokens]"); } if (verbose && status.BlockTreeStats != null && infoStream != null && status.TermCount > 0) { foreach (KeyValuePair<string, BlockTreeTermsReader.Stats> ent in status.BlockTreeStats) { infoStream.WriteLine(" field \"" + ent.Key + "\":"); infoStream.WriteLine(" " + ent.Value.ToString().Replace("\n", "\n ")); } } return status; }
// maxAllowed = the "highest" we can index, but we will still // randomly index at lower IndexOption private FieldsProducer BuildIndex(Directory dir, FieldInfo.IndexOptions maxAllowed, bool allowPayloads, bool alwaysTestMax) { Codec codec = Codec; SegmentInfo segmentInfo = new SegmentInfo(dir, Constants.LUCENE_MAIN_VERSION, "_0", MaxDoc, false, codec, null); int maxIndexOption = Enum.GetValues(typeof(FieldInfo.IndexOptions)).Cast<FieldInfo.IndexOptions>().ToList().IndexOf(maxAllowed); if (VERBOSE) { Console.WriteLine("\nTEST: now build index"); } int maxIndexOptionNoOffsets = Enum.GetValues(typeof(FieldInfo.IndexOptions)).Cast<FieldInfo.IndexOptions>().ToList().IndexOf(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS); // TODO use allowPayloads var newFieldInfoArray = new FieldInfo[Fields.Count]; for (int fieldUpto = 0; fieldUpto < Fields.Count; fieldUpto++) { FieldInfo oldFieldInfo = FieldInfos.FieldInfo(fieldUpto); string pf = TestUtil.GetPostingsFormat(codec, oldFieldInfo.Name); int fieldMaxIndexOption; if (DoesntSupportOffsets.Contains(pf)) { fieldMaxIndexOption = Math.Min(maxIndexOptionNoOffsets, maxIndexOption); } else { fieldMaxIndexOption = maxIndexOption; } // Randomly picked the IndexOptions to index this // field with: FieldInfo.IndexOptions indexOptions = Enum.GetValues(typeof(FieldInfo.IndexOptions)).Cast<FieldInfo.IndexOptions>().ToArray()[alwaysTestMax ? fieldMaxIndexOption : Random().Next(1 + fieldMaxIndexOption)]; bool doPayloads = indexOptions.CompareTo(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0 && allowPayloads; newFieldInfoArray[fieldUpto] = new FieldInfo(oldFieldInfo.Name, true, fieldUpto, false, false, doPayloads, indexOptions, null, DocValuesType.NUMERIC, null); } FieldInfos newFieldInfos = new FieldInfos(newFieldInfoArray); // Estimate that flushed segment size will be 25% of // what we use in RAM: long bytes = TotalPostings * 8 + TotalPayloadBytes; SegmentWriteState writeState = new SegmentWriteState(null, dir, segmentInfo, newFieldInfos, 32, null, new IOContext(new FlushInfo(MaxDoc, bytes))); FieldsConsumer fieldsConsumer = codec.PostingsFormat().FieldsConsumer(writeState); foreach (KeyValuePair<string, SortedDictionary<BytesRef, long>> fieldEnt in Fields) { string field = fieldEnt.Key; IDictionary<BytesRef, long> terms = fieldEnt.Value; FieldInfo fieldInfo = newFieldInfos.FieldInfo(field); FieldInfo.IndexOptions? 
indexOptions = fieldInfo.FieldIndexOptions; if (VERBOSE) { Console.WriteLine("field=" + field + " indexOtions=" + indexOptions); } bool doFreq = indexOptions >= FieldInfo.IndexOptions.DOCS_AND_FREQS; bool doPos = indexOptions >= FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS; bool doPayloads = indexOptions >= FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS && allowPayloads; bool doOffsets = indexOptions >= FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS; TermsConsumer termsConsumer = fieldsConsumer.AddField(fieldInfo); long sumTotalTF = 0; long sumDF = 0; FixedBitSet seenDocs = new FixedBitSet(MaxDoc); foreach (KeyValuePair<BytesRef, long> termEnt in terms) { BytesRef term = termEnt.Key; SeedPostings postings = GetSeedPostings(term.Utf8ToString(), termEnt.Value, false, maxAllowed); if (VERBOSE) { Console.WriteLine(" term=" + field + ":" + term.Utf8ToString() + " docFreq=" + postings.DocFreq + " seed=" + termEnt.Value); } PostingsConsumer postingsConsumer = termsConsumer.StartTerm(term); long totalTF = 0; int docID = 0; while ((docID = postings.NextDoc()) != DocsEnum.NO_MORE_DOCS) { int freq = postings.Freq(); if (VERBOSE) { Console.WriteLine(" " + postings.Upto + ": docID=" + docID + " freq=" + postings.Freq_Renamed); } postingsConsumer.StartDoc(docID, doFreq ? postings.Freq_Renamed : -1); seenDocs.Set(docID); if (doPos) { totalTF += postings.Freq_Renamed; for (int posUpto = 0; posUpto < freq; posUpto++) { int pos = postings.NextPosition(); BytesRef payload = postings.Payload; if (VERBOSE) { if (doPayloads) { Console.WriteLine(" pos=" + pos + " payload=" + (payload == null ? "null" : payload.Length + " bytes")); } else { Console.WriteLine(" pos=" + pos); } } postingsConsumer.AddPosition(pos, doPayloads ? payload : null, doOffsets ? postings.StartOffset() : -1, doOffsets ? postings.EndOffset() : -1); } } else if (doFreq) { totalTF += freq; } else { totalTF++; } postingsConsumer.FinishDoc(); } termsConsumer.FinishTerm(term, new TermStats(postings.DocFreq, doFreq ? totalTF : -1)); sumTotalTF += totalTF; sumDF += postings.DocFreq; } termsConsumer.Finish(doFreq ? sumTotalTF : -1, sumDF, seenDocs.Cardinality()); } fieldsConsumer.Dispose(); if (VERBOSE) { Console.WriteLine("TEST: after indexing: files="); foreach (string file in dir.ListAll()) { Console.WriteLine(" " + file + ": " + dir.FileLength(file) + " bytes"); } } CurrentFieldInfos = newFieldInfos; SegmentReadState readState = new SegmentReadState(dir, segmentInfo, newFieldInfos, IOContext.READ, 1); return codec.PostingsFormat().FieldsProducer(readState); }
private void ExecuteRandomJoin(bool multipleValuesPerDocument, int maxIndexIter, int maxSearchIter, int numberOfDocumentsToIndex)
{
    for (int indexIter = 1; indexIter <= maxIndexIter; indexIter++)
    {
        if (VERBOSE)
        {
            Console.WriteLine("indexIter=" + indexIter);
        }
        Directory dir = NewDirectory();
        RandomIndexWriter w = new RandomIndexWriter(Random(), dir,
            NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random(), MockTokenizer.KEYWORD, false))
                .SetMergePolicy(NewLogMergePolicy()));
        bool scoreDocsInOrder = TestJoinUtil.Random().NextBoolean();
        IndexIterationContext context = CreateContext(numberOfDocumentsToIndex, w, multipleValuesPerDocument, scoreDocsInOrder);
        IndexReader topLevelReader = w.Reader;
        w.Dispose();
        for (int searchIter = 1; searchIter <= maxSearchIter; searchIter++)
        {
            if (VERBOSE)
            {
                Console.WriteLine("searchIter=" + searchIter);
            }
            IndexSearcher indexSearcher = NewSearcher(topLevelReader);

            int r = Random().Next(context.RandomUniqueValues.Length);
            bool from = context.RandomFrom[r];
            string randomValue = context.RandomUniqueValues[r];
            FixedBitSet expectedResult = CreateExpectedResult(randomValue, from, indexSearcher.IndexReader, context);

            Query actualQuery = new TermQuery(new Term("value", randomValue));
            if (VERBOSE)
            {
                Console.WriteLine("actualQuery=" + actualQuery);
            }
            var scoreModeLength = Enum.GetNames(typeof(ScoreMode)).Length;
            ScoreMode scoreMode = (ScoreMode)Random().Next(scoreModeLength);
            if (VERBOSE)
            {
                Console.WriteLine("scoreMode=" + scoreMode);
            }

            Query joinQuery;
            if (from)
            {
                joinQuery = JoinUtil.CreateJoinQuery("from", multipleValuesPerDocument, "to", actualQuery, indexSearcher, scoreMode);
            }
            else
            {
                joinQuery = JoinUtil.CreateJoinQuery("to", multipleValuesPerDocument, "from", actualQuery, indexSearcher, scoreMode);
            }
            if (VERBOSE)
            {
                Console.WriteLine("joinQuery=" + joinQuery);
            }

            // Need to know all documents that have matches. TopDocs doesn't give me that and then I'd be also testing TopDocsCollector...
            FixedBitSet actualResult = new FixedBitSet(indexSearcher.IndexReader.MaxDoc);
            TopScoreDocCollector topScoreDocCollector = TopScoreDocCollector.Create(10, false);
            indexSearcher.Search(joinQuery, new CollectorAnonymousInnerClassHelper2(this, scoreDocsInOrder, context, actualResult, topScoreDocCollector));

            // Asserting bit set...
            if (VERBOSE)
            {
                Console.WriteLine("expected cardinality:" + expectedResult.Cardinality());
                DocIdSetIterator iterator = expectedResult.GetIterator();
                for (int doc = iterator.NextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = iterator.NextDoc())
                {
                    Console.WriteLine(string.Format("Expected doc[{0}] with id value {1}", doc, indexSearcher.Doc(doc).Get("id")));
                }
                Console.WriteLine("actual cardinality:" + actualResult.Cardinality());
                iterator = actualResult.GetIterator();
                for (int doc = iterator.NextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = iterator.NextDoc())
                {
                    Console.WriteLine(string.Format("Actual doc[{0}] with id value {1}", doc, indexSearcher.Doc(doc).Get("id")));
                }
            }
            assertEquals(expectedResult, actualResult);

            // Asserting TopDocs...
            TopDocs expectedTopDocs = CreateExpectedTopDocs(randomValue, from, scoreMode, context);
            TopDocs actualTopDocs = topScoreDocCollector.GetTopDocs();
            assertEquals(expectedTopDocs.TotalHits, actualTopDocs.TotalHits);
            assertEquals(expectedTopDocs.ScoreDocs.Length, actualTopDocs.ScoreDocs.Length);
            if (scoreMode == ScoreMode.None)
            {
                continue;
            }
            assertEquals(expectedTopDocs.MaxScore, actualTopDocs.MaxScore, 0.0f);
            for (int i = 0; i < expectedTopDocs.ScoreDocs.Length; i++)
            {
                if (VERBOSE)
                {
                    string.Format("Expected doc: {0} | Actual doc: {1}\n", expectedTopDocs.ScoreDocs[i].Doc, actualTopDocs.ScoreDocs[i].Doc);
                    string.Format("Expected score: {0} | Actual score: {1}\n", expectedTopDocs.ScoreDocs[i].Score, actualTopDocs.ScoreDocs[i].Score);
                }
                assertEquals(expectedTopDocs.ScoreDocs[i].Doc, actualTopDocs.ScoreDocs[i].Doc);
                assertEquals(expectedTopDocs.ScoreDocs[i].Score, actualTopDocs.ScoreDocs[i].Score, 0.0f);
                Explanation explanation = indexSearcher.Explain(joinQuery, expectedTopDocs.ScoreDocs[i].Doc);
                assertEquals(expectedTopDocs.ScoreDocs[i].Score, explanation.Value, 0.0f);
            }
        }
        topLevelReader.Dispose();
        dir.Dispose();
    }
}