public void TestConcurrentSpan()
{
    string TEXT = "the fox jumped";
    Directory directory = NewDirectory();
    IndexWriter indexWriter = new IndexWriter(directory,
        NewIndexWriterConfig(TEST_VERSION_CURRENT,
            new MockAnalyzer(Random(), MockTokenizer.WHITESPACE, false)));
    try
    {
        Document document = new Document();
        FieldType customType = new FieldType(TextField.TYPE_NOT_STORED);
        customType.StoreTermVectorOffsets = true;
        customType.StoreTermVectorPositions = true;
        customType.StoreTermVectors = true;
        document.Add(new Field(FIELD, new TokenStreamConcurrent(), customType));
        indexWriter.AddDocument(document);
    }
    finally
    {
        indexWriter.Dispose();
    }
    IndexReader indexReader = DirectoryReader.Open(directory);
    try
    {
        assertEquals(1, indexReader.NumDocs);
        IndexSearcher indexSearcher = NewSearcher(indexReader);
        Query phraseQuery = new SpanNearQuery(new SpanQuery[]
        {
            new SpanTermQuery(new Term(FIELD, "fox")),
            new SpanTermQuery(new Term(FIELD, "jumped"))
        }, 0, true);
        FixedBitSet bitset = new FixedBitSet(indexReader.MaxDoc);
        indexSearcher.Search(phraseQuery, new ConcurrentSpanCollectorAnonymousHelper(this, bitset));
        assertEquals(1, bitset.Cardinality());
        int maxDoc = indexReader.MaxDoc;
        Highlighter highlighter = new Highlighter(
            new SimpleHTMLFormatter(),
            new SimpleHTMLEncoder(),
            new QueryScorer(phraseQuery));
        for (int position = bitset.NextSetBit(0);
            position >= 0 && position < maxDoc - 1;
            position = bitset.NextSetBit(position + 1))
        {
            assertEquals(0, position);
            TokenStream tokenStream = TokenSources.GetTokenStream(
                indexReader.GetTermVector(position, FIELD), false);
            assertEquals(highlighter.GetBestFragment(new TokenStreamConcurrent(), TEXT),
                highlighter.GetBestFragment(tokenStream, TEXT));
        }
    }
    finally
    {
        indexReader.Dispose();
        directory.Dispose();
    }
}
void doNextSetBit(BitArray a, FixedBitSet b)
{
    // Walk both bitsets in lockstep and assert they report the same
    // sequence of set bits; NextSetBit returns -1 when exhausted.
    int aa = -1, bb = -1;
    do
    {
        aa = a.NextSetBit(aa + 1);
        bb = bb < b.Length - 1 ? b.NextSetBit(bb + 1) : -1;
        Assert.AreEqual(aa, bb);
    } while (aa >= 0);
}
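// A minimal driver sketch for the helper above (hypothetical, not part of the
// original tests; the method name, seed, and sizes are made up). It assumes the
// BitArray.NextSetBit extension from Lucene.Net.Support is in scope: mirror the
// same random bits into a BitArray and a FixedBitSet, then assert agreement.
private void SketchNextSetBitAgreement()
{
    var random = new Random(42);           // arbitrary seed
    var a = new BitArray(100);             // System.Collections
    var b = new FixedBitSet(100);          // Lucene.Net.Util
    for (int i = 0; i < 20; i++)
    {
        int bit = random.Next(100);
        a.Set(bit, true);                  // mirror each bit in both sets
        b.Set(bit);
    }
    doNextSetBit(a, b);                    // must enumerate identical set bits
}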
/// <param name="targetMaxSaturation">
/// A number between 0 and 1 describing the % of bits that would ideally be
/// set in the result. Lower values have better accuracy but require more space.
/// </param>
/// <returns>A smaller <see cref="FuzzySet"/>, or <c>null</c> if the current set is already over-saturated.</returns>
public virtual FuzzySet Downsize(float targetMaxSaturation)
{
    var numBitsSet = _filter.Cardinality();
    FixedBitSet rightSizedBitSet;
    var rightSizedBitSetSize = _bloomSize;

    // Hopefully find a smaller bitset size into which we can project the
    // accumulated values while maintaining the desired saturation level
    for (int i = 0; i < _usableBitSetSizes.Length; i++)
    {
        int candidateBitsetSize = _usableBitSetSizes[i];
        float candidateSaturation = (float)numBitsSet / (float)candidateBitsetSize;
        if (candidateSaturation <= targetMaxSaturation)
        {
            rightSizedBitSetSize = candidateBitsetSize;
            break;
        }
    }

    // Re-project the numbers to a smaller space if necessary
    if (rightSizedBitSetSize < _bloomSize)
    {
        // Reset the choice of bitset to the smaller version
        rightSizedBitSet = new FixedBitSet(rightSizedBitSetSize + 1);

        // Map across the bits from the large set to the smaller one
        var bitIndex = 0;
        do
        {
            bitIndex = _filter.NextSetBit(bitIndex);
            if (bitIndex < 0)
            {
                continue;
            }
            // Project the larger number into a smaller one, effectively
            // modulo-ing by using the target bitset size as a mask
            var downSizedBitIndex = bitIndex & rightSizedBitSetSize;
            rightSizedBitSet.Set(downSizedBitIndex);
            bitIndex++;
        } while (bitIndex >= 0 && bitIndex <= _bloomSize);
    }
    else
    {
        return null;
    }

    return new FuzzySet(rightSizedBitSet, rightSizedBitSetSize, _hashFunction);
}
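// Note (illustrative sketch, not part of the original source; the method name
// and values below are made up): the "& rightSizedBitSetSize" projection above
// only behaves like a modulo because the usable bitset sizes are, by
// assumption here, all-ones masks of the form 2^n - 1. For any such mask
// m and index x, (x & m) == (x % (m + 1)).
private static void SketchMaskAsModulo()
{
    int mask = 1023;                        // 2^10 - 1, an all-ones mask
    int largeBitIndex = 5000;
    int projected = largeBitIndex & mask;   // 904
    bool sameAsModulo = projected == largeBitIndex % (mask + 1);  // true
}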
// With the block-join layout, children precede their parent in the block and
// parentBits marks each parent position, so the first set bit at or after a
// child doc is that child's parent.
internal virtual int Parent(int doc)
{
    return parentBits.NextSetBit(doc);
}
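// A sketch of that lookup with hypothetical docIDs (this method and its
// numbers are illustrative only, not taken from the original source):
private static void SketchParentLookup()
{
    // Docs 0-2 are children of doc 3; docs 4-6 are children of doc 7.
    var parentBits = new FixedBitSet(8);
    parentBits.Set(3);
    parentBits.Set(7);

    int p1 = parentBits.NextSetBit(1);  // -> 3, the parent of child doc 1
    int p5 = parentBits.NextSetBit(5);  // -> 7, the parent of child doc 5
}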
public override int NextDoc()
{
    //System.out.println("Q.nextDoc() nextChildDoc=" + nextChildDoc);

    // Loop until we hit a parentDoc that's accepted
    while (true)
    {
        if (_nextChildDoc == NO_MORE_DOCS)
        {
            //System.out.println("  end");
            return _parentDoc = NO_MORE_DOCS;
        }

        // Gather all children sharing the same parent as nextChildDoc
        _parentDoc = _parentBits.NextSetBit(_nextChildDoc);

        // Parent & child docs are supposed to be orthogonal:
        if (_nextChildDoc == _parentDoc)
        {
            throw IllegalStateException.Create("child query must only match non-parent docs, but parent docID=" + _nextChildDoc + " matched childScorer=" + _childScorer.GetType());
        }

        //System.out.println("  parentDoc=" + parentDoc);
        if (Debugging.AssertsEnabled)
        {
            Debugging.Assert(_parentDoc != -1);
        }

        //System.out.println("  nextChildDoc=" + nextChildDoc);
        if (_acceptDocs != null && !_acceptDocs.Get(_parentDoc))
        {
            // Parent doc not accepted; skip child docs until we hit a new parent doc:
            do
            {
                _nextChildDoc = _childScorer.NextDoc();
            } while (_nextChildDoc < _parentDoc);

            // Parent & child docs are supposed to be orthogonal:
            if (_nextChildDoc == _parentDoc)
            {
                throw IllegalStateException.Create("child query must only match non-parent docs, but parent docID=" + _nextChildDoc + " matched childScorer=" + _childScorer.GetType());
            }

            continue;
        }

        float totalScore = 0;
        float maxScore = float.NegativeInfinity;

        _childDocUpto = 0;
        _parentFreq = 0;
        do
        {
            //System.out.println("  c=" + nextChildDoc);
            if (_pendingChildDocs != null && _pendingChildDocs.Length == _childDocUpto)
            {
                _pendingChildDocs = ArrayUtil.Grow(_pendingChildDocs);
            }
            if (_pendingChildScores != null && _scoreMode != ScoreMode.None && _pendingChildScores.Length == _childDocUpto)
            {
                _pendingChildScores = ArrayUtil.Grow(_pendingChildScores);
            }
            if (_pendingChildDocs != null)
            {
                _pendingChildDocs[_childDocUpto] = _nextChildDoc;
            }
            if (_scoreMode != ScoreMode.None)
            {
                // TODO: specialize this into dedicated classes per-scoreMode
                float childScore = _childScorer.GetScore();
                int childFreq = _childScorer.Freq;
                if (_pendingChildScores != null)
                {
                    _pendingChildScores[_childDocUpto] = childScore;
                }
                maxScore = Math.Max(childScore, maxScore);
                totalScore += childScore;
                _parentFreq += childFreq;
            }
            _childDocUpto++;
            _nextChildDoc = _childScorer.NextDoc();
        } while (_nextChildDoc < _parentDoc);

        // Parent & child docs are supposed to be orthogonal:
        if (_nextChildDoc == _parentDoc)
        {
            throw IllegalStateException.Create("child query must only match non-parent docs, but parent docID=" + _nextChildDoc + " matched childScorer=" + _childScorer.GetType());
        }

        switch (_scoreMode)
        {
            case ScoreMode.Avg:
                _parentScore = totalScore / _childDocUpto;
                break;
            case ScoreMode.Max:
                _parentScore = maxScore;
                break;
            case ScoreMode.Total:
                _parentScore = totalScore;
                break;
            case ScoreMode.None:
                break;
        }

        //System.out.println("  return parentDoc=" + parentDoc + " childDocUpto=" + childDocUpto);
        return _parentDoc;
    }
}
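// To make the final switch concrete (hypothetical sketch; the method name and
// scores are made up, not taken from the original source): for one parent
// block whose children scored 1.0, 2.0, and 3.0, the aggregation mirrors the
// gathering loop above and yields Total=6.0, Max=3.0, Avg=2.0.
private static void SketchScoreModes()
{
    float[] childScores = { 1.0f, 2.0f, 3.0f };  // hypothetical child scores
    float total = 0f;
    float max = float.NegativeInfinity;
    foreach (float s in childScores)
    {
        total += s;                              // ScoreMode.Total -> 6.0f
        max = Math.Max(max, s);                  // ScoreMode.Max   -> 3.0f
    }
    float avg = total / childScores.Length;      // ScoreMode.Avg   -> 2.0f
}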
public void Test()
{
    RandomIndexWriter writer;
    DirectoryReader indexReader;
    int numParents = AtLeast(200);
    IndexWriterConfig cfg = NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random()));
    cfg.SetMergePolicy(NewLogMergePolicy());
    using (writer = new RandomIndexWriter(Random(), NewDirectory(), cfg))
    {
        Document parentDoc = new Document();
        NumericDocValuesField parentVal = new NumericDocValuesField("parent_val", 0L);
        parentDoc.Add(parentVal);
        StringField parent = new StringField("parent", "true", Field.Store.YES);
        parentDoc.Add(parent);
        for (int i = 0; i < numParents; ++i)
        {
            List<Document> documents = new List<Document>();
            int numChildren = Random().Next(10);
            for (int j = 0; j < numChildren; ++j)
            {
                Document childDoc = new Document();
                childDoc.Add(new NumericDocValuesField("child_val", Random().Next(5)));
                documents.Add(childDoc);
            }
            parentVal.SetInt64Value(Random().Next(50));
            documents.Add(parentDoc);
            writer.AddDocuments(documents);
        }
        writer.ForceMerge(1);
        indexReader = writer.Reader;
    }

    AtomicReader reader = GetOnlySegmentReader(indexReader);
    Filter parentsFilter = new FixedBitSetCachingWrapperFilter(new QueryWrapperFilter(new TermQuery(new Term("parent", "true"))));
    FixedBitSet parentBits = (FixedBitSet)parentsFilter.GetDocIdSet(reader.AtomicContext, null);
    NumericDocValues parentValues = reader.GetNumericDocValues("parent_val");
    NumericDocValues childValues = reader.GetNumericDocValues("child_val");

    Sort parentSort = new Sort(new SortField("parent_val", SortFieldType.INT64));
    Sort childSort = new Sort(new SortField("child_val", SortFieldType.INT64));

    Sort sort = new Sort(new SortField("custom", new BlockJoinComparerSource(parentsFilter, parentSort, childSort)));
    Sorter sorter = new Sorter(sort);
    Sorter.DocMap docMap = sorter.Sort(reader);
    assertEquals(reader.MaxDoc, docMap.Count);

    int[] children = new int[1];
    int numChildren2 = 0;
    int previousParent = -1;
    for (int i = 0; i < docMap.Count; ++i)
    {
        int oldID = docMap.NewToOld(i);
        if (parentBits.Get(oldID))
        {
            // check that we have the right children
            for (int j = 0; j < numChildren2; ++j)
            {
                assertEquals(oldID, parentBits.NextSetBit(children[j]));
            }
            // check that children are sorted
            for (int j = 1; j < numChildren2; ++j)
            {
                int doc1 = children[j - 1];
                int doc2 = children[j];
                if (childValues.Get(doc1) == childValues.Get(doc2))
                {
                    assertTrue(doc1 < doc2); // sort is stable
                }
                else
                {
                    assertTrue(childValues.Get(doc1) < childValues.Get(doc2));
                }
            }
            // check that parents are sorted
            if (previousParent != -1)
            {
                if (parentValues.Get(previousParent) == parentValues.Get(oldID))
                {
                    assertTrue(previousParent < oldID);
                }
                else
                {
                    assertTrue(parentValues.Get(previousParent) < parentValues.Get(oldID));
                }
            }
            // reset
            previousParent = oldID;
            numChildren2 = 0;
        }
        else
        {
            children = ArrayUtil.Grow(children, numChildren2 + 1);
            children[numChildren2++] = oldID;
        }
    }
    indexReader.Dispose();
    writer.w.Directory.Dispose();
}
/// <summary>
/// Used when drill downs are highly constraining vs baseQuery.
/// </summary>
private void DoDrillDownAdvanceScoring(ICollector collector, DocIdSetIterator[] disis, ICollector[] sidewaysCollectors)
{
    int maxDoc = context.Reader.MaxDoc;
    int numDims = dims.Length;

    //if (DEBUG) {
    //  System.out.println("  doDrillDownAdvanceScoring");
    //}

    // TODO: maybe a class like BS, instead of parallel arrays
    int[] filledSlots = new int[CHUNK];
    int[] docIDs = new int[CHUNK];
    float[] scores = new float[CHUNK];
    int[] missingDims = new int[CHUNK];
    int[] counts = new int[CHUNK];

    docIDs[0] = -1;
    int nextChunkStart = CHUNK;

    FixedBitSet seen = new FixedBitSet(CHUNK);

    while (true)
    {
        //if (DEBUG) {
        //  System.out.println("\ncycle nextChunkStart=" + nextChunkStart + " docIds[0]=" + docIDs[0]);
        //}

        // First dim:
        //if (DEBUG) {
        //  System.out.println("  dim0");
        //}
        DocIdSetIterator disi = disis[0];
        if (disi != null)
        {
            int docID = disi.DocID;
            while (docID < nextChunkStart)
            {
                int slot = docID & MASK;
                if (docIDs[slot] != docID)
                {
                    seen.Set(slot);
                    // Mark slot as valid:
                    //if (DEBUG) {
                    //  System.out.println("    set docID=" + docID + " id=" + context.reader().document(docID).get("id"));
                    //}
                    docIDs[slot] = docID;
                    missingDims[slot] = 1;
                    counts[slot] = 1;
                }
                docID = disi.NextDoc();
            }
        }

        // Second dim:
        //if (DEBUG) {
        //  System.out.println("  dim1");
        //}
        disi = disis[1];
        if (disi != null)
        {
            int docID = disi.DocID;
            while (docID < nextChunkStart)
            {
                int slot = docID & MASK;
                if (docIDs[slot] != docID)
                {
                    // Mark slot as valid:
                    seen.Set(slot);
                    //if (DEBUG) {
                    //  System.out.println("    set docID=" + docID + " missingDim=0 id=" + context.reader().document(docID).get("id"));
                    //}
                    docIDs[slot] = docID;
                    missingDims[slot] = 0;
                    counts[slot] = 1;
                }
                else
                {
                    // TODO: single-valued dims will always be true
                    // below; we could somehow specialize
                    if (missingDims[slot] >= 1)
                    {
                        missingDims[slot] = 2;
                        counts[slot] = 2;
                        //if (DEBUG) {
                        //  System.out.println("    set docID=" + docID + " missingDim=2 id=" + context.reader().document(docID).get("id"));
                        //}
                    }
                    else
                    {
                        counts[slot] = 1;
                        //if (DEBUG) {
                        //  System.out.println("    set docID=" + docID + " missingDim=" + missingDims[slot] + " id=" + context.reader().document(docID).get("id"));
                        //}
                    }
                }
                docID = disi.NextDoc();
            }
        }

        // After this we can "upgrade" to conjunction, because
        // any doc not seen by either dim 0 or dim 1 cannot be
        // a hit or a near miss:

        //if (DEBUG) {
        //  System.out.println("  baseScorer");
        //}

        // Fold in baseScorer, using advance:
        int filledCount = 0;
        int slot0 = 0;
        while (slot0 < CHUNK && (slot0 = seen.NextSetBit(slot0)) != -1)
        {
            int ddDocID = docIDs[slot0];
            if (Debugging.AssertsEnabled)
            {
                Debugging.Assert(ddDocID != -1);
            }

            int baseDocID = baseScorer.DocID;
            if (baseDocID < ddDocID)
            {
                baseDocID = baseScorer.Advance(ddDocID);
            }
            if (baseDocID == ddDocID)
            {
                //if (DEBUG) {
                //  System.out.println("    keep docID=" + ddDocID + " id=" + context.reader().document(ddDocID).get("id"));
                //}
                scores[slot0] = baseScorer.GetScore();
                filledSlots[filledCount++] = slot0;
                counts[slot0]++;
            }
            else
            {
                //if (DEBUG) {
                //  System.out.println("    no docID=" + ddDocID + " id=" + context.reader().document(ddDocID).get("id"));
                //}
                docIDs[slot0] = -1;

                // TODO: we could jump slot0 forward to the
                // baseDocID ... but we'd need to set docIDs for
                // intervening slots to -1
            }
            slot0++;
        }
        seen.Clear(0, CHUNK);

        if (filledCount == 0)
        {
            if (nextChunkStart >= maxDoc)
            {
                break;
            }
            nextChunkStart += CHUNK;
            continue;
        }

        // TODO: factor this out & share w/ union scorer,
        // except we start from dim=2 instead:
        for (int dim = 2; dim < numDims; dim++)
        {
            //if (DEBUG) {
            //  System.out.println("  dim=" + dim + " [" + dims[dim].dim + "]");
            //}
            disi = disis[dim];
            if (disi != null)
            {
                int docID = disi.DocID;
                while (docID < nextChunkStart)
                {
                    int slot = docID & MASK;
                    if (docIDs[slot] == docID && counts[slot] >= dim)
                    {
                        // TODO: single-valued dims will always be true
                        // below; we could somehow specialize
                        if (missingDims[slot] >= dim)
                        {
                            //if (DEBUG) {
                            //  System.out.println("    set docID=" + docID + " count=" + (dim+2));
                            //}
                            missingDims[slot] = dim + 1;
                            counts[slot] = dim + 2;
                        }
                        else
                        {
                            //if (DEBUG) {
                            //  System.out.println("    set docID=" + docID + " missing count=" + (dim+1));
                            //}
                            counts[slot] = dim + 1;
                        }
                    }
                    // TODO: sometimes use advance?
                    docID = disi.NextDoc();
                }
            }
        }

        // Collect:
        //if (DEBUG) {
        //  System.out.println("  now collect: " + filledCount + " hits");
        //}
        for (int i = 0; i < filledCount; i++)
        {
            int slot = filledSlots[i];
            collectDocID = docIDs[slot];
            collectScore = scores[slot];
            //if (DEBUG) {
            //  System.out.println("    docID=" + docIDs[slot] + " count=" + counts[slot]);
            //}
            if (counts[slot] == 1 + numDims)
            {
                CollectHit(collector, sidewaysCollectors);
            }
            else if (counts[slot] == numDims)
            {
                CollectNearMiss(sidewaysCollectors[missingDims[slot]]);
            }
        }

        if (nextChunkStart >= maxDoc)
        {
            break;
        }

        nextChunkStart += CHUNK;
    }
}
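// The slot arithmetic above depends on CHUNK being a power of two with
// MASK = CHUNK - 1. The sketch below is illustrative only (the method name
// and constants are assumptions, not taken from this class): two docIDs from
// different chunks can map to the same slot, which is why every read
// re-checks docIDs[slot] == docID before trusting counts/missingDims.
private static void SketchSlotMapping()
{
    const int CHUNK = 2048;                  // assumed power of two
    const int MASK = CHUNK - 1;

    int docID = 5000;
    int slot = docID & MASK;                 // 5000 % 2048 == 904

    int earlierDocID = 904;                  // from a previous chunk...
    int sameSlot = earlierDocID & MASK;      // ...lands in the same slot (904)
}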