private void AssertTermsSeeking(Terms leftTerms, Terms rightTerms) { TermsEnum leftEnum = null; TermsEnum rightEnum = null; // just an upper bound int numTests = AtLeast(20); Random random = Random; // collect this number of terms from the left side ISet <BytesRef> tests = new JCG.HashSet <BytesRef>(); int numPasses = 0; while (numPasses < 10 && tests.Count < numTests) { leftEnum = leftTerms.GetEnumerator(leftEnum); BytesRef term = null; while (leftEnum.MoveNext()) { term = leftEnum.Term; int code = random.Next(10); if (code == 0) { // the term tests.Add(BytesRef.DeepCopyOf(term)); } else if (code == 1) { // truncated subsequence of term term = BytesRef.DeepCopyOf(term); if (term.Length > 0) { // truncate it term.Length = random.Next(term.Length); } } else if (code == 2) { // term, but ensure a non-zero offset var newbytes = new byte[term.Length + 5]; Array.Copy(term.Bytes, term.Offset, newbytes, 5, term.Length); tests.Add(new BytesRef(newbytes, 5, term.Length)); } } numPasses++; } List <BytesRef> shuffledTests = new List <BytesRef>(tests); shuffledTests.Shuffle(Random); foreach (BytesRef b in shuffledTests) { leftEnum = leftTerms.GetEnumerator(leftEnum); rightEnum = rightTerms.GetEnumerator(rightEnum); Assert.AreEqual(leftEnum.SeekExact(b), rightEnum.SeekExact(b)); Assert.AreEqual(leftEnum.SeekExact(b), rightEnum.SeekExact(b)); SeekStatus leftStatus; SeekStatus rightStatus; leftStatus = leftEnum.SeekCeil(b); rightStatus = rightEnum.SeekCeil(b); Assert.AreEqual(leftStatus, rightStatus); if (leftStatus != SeekStatus.END) { Assert.AreEqual(leftEnum.Term, rightEnum.Term); } leftStatus = leftEnum.SeekCeil(b); rightStatus = rightEnum.SeekCeil(b); Assert.AreEqual(leftStatus, rightStatus); if (leftStatus != SeekStatus.END) { Assert.AreEqual(leftEnum.Term, rightEnum.Term); } } }
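// A minimal caller-side sketch (hypothetical names, not from the test above) of the TermsEnum seek
// contract these assertions exercise: SeekExact returns true only on an exact match, while SeekCeil
// positions the enum on the smallest term >= the target and reports FOUND, NOT_FOUND, or END. When
// END is reported, Term must not be read, which is why the test only compares Term otherwise.
using Lucene.Net.Index;
using Lucene.Net.Util;

internal static class SeekContractSketch
{
    public static void Demo(Terms terms) // any Lucene.Net.Index.Terms instance
    {
        TermsEnum te = terms.GetEnumerator();
        bool exact = te.SeekExact(new BytesRef("apple"));               // true only on an exact match
        TermsEnum.SeekStatus status = te.SeekCeil(new BytesRef("app"));
        if (status != TermsEnum.SeekStatus.END)
        {
            BytesRef current = te.Term;                                 // valid for FOUND and NOT_FOUND
        }
    }
}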
public SortingTermsEnum(TermsEnum @in, Sorter.DocMap docMap, IndexOptions indexOptions) : base(@in) { this.docMap = docMap; this.indexOptions = indexOptions; }
public AutomatonFuzzyTermsEnum(FuzzyTermsEnum outerInstance, TermsEnum tenum, CompiledAutomaton[] compiled) : base(tenum, false) { this.OuterInstance = outerInstance; if (!InstanceFieldsInitialized) { InitializeInstanceFields(); InstanceFieldsInitialized = true; } this.Matchers = new ByteRunAutomaton[compiled.Length]; for (int i = 0; i < compiled.Length; i++) { this.Matchers[i] = compiled[i].RunAutomaton; } TermRef = new BytesRef(outerInstance.Term_Renamed.Text()); }
public override TermsEnum Iterator(TermsEnum reuse) { // We have been handed something we cannot reuse (either null or the wrong class), so allocate a new object if (!(reuse is BloomFilteredTermsEnum)) return new BloomFilteredTermsEnum(_delegateTerms, reuse, _filter); var bfte = (BloomFilteredTermsEnum) reuse; // ...likewise if it was built against a different filter if (bfte.FILTER != _filter) return new BloomFilteredTermsEnum(_delegateTerms, reuse, _filter); // recycle the existing BloomFilteredTermsEnum by asking the delegate to recycle its contained TermsEnum bfte.Reset(_delegateTerms, bfte.DELEGATE_TERMS_ENUM); return bfte; }
public override TermsEnum Iterator(TermsEnum reuse) { DirectTermsEnum termsEnum; if (reuse != null && reuse is DirectTermsEnum) { termsEnum = (DirectTermsEnum) reuse; if (!termsEnum.CanReuse(terms)) { termsEnum = new DirectTermsEnum(this); } } else { termsEnum = new DirectTermsEnum(this); } termsEnum.Reset(); return termsEnum; }
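// A minimal caller-side sketch of the reuse contract behind the `reuse` parameter above: an enum
// obtained from a previous call may be handed back so the Terms implementation can recycle state
// instead of allocating. Names are illustrative, and depending on the Lucene.NET snapshot the method
// is Iterator, GetIterator, or GetEnumerator, with iteration via Next() or MoveNext().
using Lucene.Net.Index;
using Lucene.Net.Util;

internal static class TermsEnumReuseSketch
{
    public static void EnumerateAllFields(Fields fields)
    {
        TermsEnum reuse = null;
        foreach (string field in fields)
        {
            Terms terms = fields.GetTerms(field);
            if (terms == null) continue;
            reuse = terms.Iterator(reuse);      // may hand back the same instance, reset for this field
            BytesRef term;
            while ((term = reuse.Next()) != null)
            {
                // consume term ...
            }
        }
    }
}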
internal BitsFilteredTermsEnum(TermsEnum @in, LongBitSet liveTerms) : base(@in, false) { Debug.Assert(liveTerms != null); LiveTerms = liveTerms; }
public TermsEnum Iterator(TermsEnum reuse) { return new SegmentTermsEnum(); }
public override TermsEnum GetIterator(TermsEnum reuse) { return(fcsi.GetTermsEnum()); }
internal SimplePrefixTermsEnum(TestPrefixRandom.DumbPrefixQuery outerInstance, TermsEnum tenum, BytesRef prefix) : base(tenum) { this.OuterInstance = outerInstance; this.Prefix = prefix; InitialSeekTerm = new BytesRef(""); }
internal void Reset(Terms delegateTerms, TermsEnum reuseDelegate) { _delegateTerms = delegateTerms; _reuseDelegate = reuseDelegate; DELEGATE_TERMS_ENUM = null; }
private void DuellReaders(CompositeReader other, AtomicReader memIndexReader) { AtomicReader competitor = SlowCompositeReaderWrapper.Wrap(other); Fields memFields = memIndexReader.Fields; foreach (string field in competitor.Fields) { Terms memTerms = memFields.GetTerms(field); Terms iwTerms = memIndexReader.GetTerms(field); if (iwTerms == null) { assertNull(memTerms); } else { NumericDocValues normValues = competitor.GetNormValues(field); NumericDocValues memNormValues = memIndexReader.GetNormValues(field); if (normValues != null) { // mem idx always computes norms on the fly assertNotNull(memNormValues); assertEquals(normValues.Get(0), memNormValues.Get(0)); } assertNotNull(memTerms); assertEquals(iwTerms.DocCount, memTerms.DocCount); assertEquals(iwTerms.SumDocFreq, memTerms.SumDocFreq); assertEquals(iwTerms.SumTotalTermFreq, memTerms.SumTotalTermFreq); TermsEnum iwTermsIter = iwTerms.GetIterator(null); TermsEnum memTermsIter = memTerms.GetIterator(null); if (iwTerms.HasPositions) { bool offsets = iwTerms.HasOffsets && memTerms.HasOffsets; while (iwTermsIter.Next() != null) { assertNotNull(memTermsIter.Next()); assertEquals(iwTermsIter.Term, memTermsIter.Term); DocsAndPositionsEnum iwDocsAndPos = iwTermsIter.DocsAndPositions(null, null); DocsAndPositionsEnum memDocsAndPos = memTermsIter.DocsAndPositions(null, null); while (iwDocsAndPos.NextDoc() != DocsAndPositionsEnum.NO_MORE_DOCS) { assertEquals(iwDocsAndPos.DocID, memDocsAndPos.NextDoc()); assertEquals(iwDocsAndPos.Freq, memDocsAndPos.Freq); for (int i = 0; i < iwDocsAndPos.Freq; i++) { assertEquals("term: " + iwTermsIter.Term.Utf8ToString(), iwDocsAndPos.NextPosition(), memDocsAndPos.NextPosition()); if (offsets) { assertEquals(iwDocsAndPos.StartOffset, memDocsAndPos.StartOffset); assertEquals(iwDocsAndPos.EndOffset, memDocsAndPos.EndOffset); } } } } } else { while (iwTermsIter.Next() != null) { assertEquals(iwTermsIter.Term, memTermsIter.Term); DocsEnum iwDocsAndPos = iwTermsIter.Docs(null, null); DocsEnum memDocsAndPos = memTermsIter.Docs(null, null); while (iwDocsAndPos.NextDoc() != DocsAndPositionsEnum.NO_MORE_DOCS) { assertEquals(iwDocsAndPos.DocID, memDocsAndPos.NextDoc()); assertEquals(iwDocsAndPos.Freq, memDocsAndPos.Freq); } } } } } }
public BloomFilteredTermsEnum(Terms delegateTerms, TermsEnum reuseDelegate, FuzzySet filter) { _delegateTerms = delegateTerms; _reuseDelegate = reuseDelegate; FILTER = filter; }
public override TermsEnum Iterator(TermsEnum reuse) { return(new SegmentTermsEnum(this, _blockTermsReader)); }
/// <exception cref="System.IO.IOException"></exception> internal SegmentResult(int[] counts, int total, int missingCountIndex, TermsEnum tenum, int startFacetOrd, int endFacetOrd) : base(counts, total - counts[missingCountIndex ], counts[missingCountIndex], endFacetOrd == missingCountIndex + 1 ? missingCountIndex : endFacetOrd) { this.tenum = tenum; this.mergePos = startFacetOrd; if (tenum != null) { tenum.SeekExact(mergePos); mergeTerm = tenum.Term(); } }
/// <summary> /// Filters the given <see cref="TermsEnum"/> by accepting only prefix coded 32 bit /// terms with a shift value of <c>0</c>. /// <para/> /// NOTE: This was filterPrefixCodedInts() in Lucene /// </summary> /// <param name="termsEnum"> /// The terms enum to filter </param> /// <returns> A filtered <see cref="TermsEnum"/> that only returns prefix coded 32 bit /// terms with a shift value of <c>0</c>. </returns> public static TermsEnum FilterPrefixCodedInt32s(TermsEnum termsEnum) { return(new FilteredTermsEnumAnonymousInnerClassHelper2(termsEnum)); }
/// <summary> /// Build the suggest index, using up to the specified /// amount of temporary RAM while building. Note that /// the weights for the suggestions are ignored. /// </summary> public virtual void Build(IInputEnumerator enumerator, double ramBufferSizeMB) { // LUCENENET: Added guard clause for null if (enumerator is null) { throw new ArgumentNullException(nameof(enumerator)); } if (enumerator.HasPayloads) { throw new ArgumentException("this suggester doesn't support payloads"); } if (enumerator.HasContexts) { throw new ArgumentException("this suggester doesn't support contexts"); } string prefix = this.GetType().Name; var directory = OfflineSorter.DefaultTempDir(); // LUCENENET specific - using GetRandomFileName() instead of picking a random int DirectoryInfo tempIndexPath; // LUCENENET: IDE0059: Remove unnecessary value assignment while (true) { tempIndexPath = new DirectoryInfo(Path.Combine(directory.FullName, prefix + ".index." + Path.GetFileNameWithoutExtension(Path.GetRandomFileName()))); tempIndexPath.Create(); if (System.IO.Directory.Exists(tempIndexPath.FullName)) { break; } } Directory dir = FSDirectory.Open(tempIndexPath); try { #pragma warning disable 612, 618 IndexWriterConfig iwc = new IndexWriterConfig(LuceneVersion.LUCENE_CURRENT, indexAnalyzer); #pragma warning restore 612, 618 iwc.SetOpenMode(OpenMode.CREATE); iwc.SetRAMBufferSizeMB(ramBufferSizeMB); IndexWriter writer = new IndexWriter(dir, iwc); var ft = new FieldType(TextField.TYPE_NOT_STORED); // TODO: if only we had IndexOptions.TERMS_ONLY... ft.IndexOptions = IndexOptions.DOCS_AND_FREQS; ft.OmitNorms = true; ft.Freeze(); Document doc = new Document(); Field field = new Field("body", "", ft); doc.Add(field); totTokens = 0; IndexReader reader = null; bool success = false; count = 0; try { while (enumerator.MoveNext()) { BytesRef surfaceForm = enumerator.Current; field.SetStringValue(surfaceForm.Utf8ToString()); writer.AddDocument(doc); count++; } reader = DirectoryReader.Open(writer, false); Terms terms = MultiFields.GetTerms(reader, "body"); if (terms is null) { throw new ArgumentException("need at least one suggestion"); } // Move all ngrams into an FST: TermsEnum termsEnum = terms.GetEnumerator(null); Outputs <long?> outputs = PositiveInt32Outputs.Singleton; Builder <long?> builder = new Builder <long?>(FST.INPUT_TYPE.BYTE1, outputs); Int32sRef scratchInts = new Int32sRef(); while (termsEnum.MoveNext()) { BytesRef term = termsEnum.Term; int ngramCount = CountGrams(term); if (ngramCount > grams) { throw new ArgumentException("tokens must not contain separator byte; got token=" + term + " but gramCount=" + ngramCount + ", which is greater than expected max ngram size=" + grams); } if (ngramCount == 1) { totTokens += termsEnum.TotalTermFreq; } builder.Add(Lucene.Net.Util.Fst.Util.ToInt32sRef(term, scratchInts), EncodeWeight(termsEnum.TotalTermFreq)); } fst = builder.Finish(); if (fst is null) { throw new ArgumentException("need at least one suggestion"); } //System.out.println("FST: " + fst.getNodeCount() + " nodes"); /* * PrintWriter pw = new PrintWriter("/x/tmp/out.dot"); * Util.toDot(fst, pw, true, true); * pw.close(); */ success = true; } finally { if (success) { IOUtils.Dispose(writer, reader); } else { IOUtils.DisposeWhileHandlingException(writer, reader); } } } finally { try { IOUtils.Dispose(dir); } finally { // LUCENENET specific - since we are removing the entire directory anyway, // it doesn't make sense to first do a loop in order remove the files. 
// Let the System.IO.Directory.Delete() method handle that. // We also need to dispose the Directory instance first before deleting from disk. try { System.IO.Directory.Delete(tempIndexPath.FullName, true); } catch (Exception e) { throw IllegalStateException.Create("failed to remove " + tempIndexPath, e); } } } }
/// <summary> /// Merges the sorted docvalues from <code>toMerge</code>. /// <p> /// The default implementation calls <seealso cref="#addSortedField"/>, passing /// an Iterable that merges ordinals and values and filters deleted documents.</p> /// </summary> public virtual void MergeSortedField(FieldInfo fieldInfo, MergeState mergeState, IList<SortedDocValues> toMerge) { AtomicReader[] readers = mergeState.Readers.ToArray(); SortedDocValues[] dvs = toMerge.ToArray(); // step 1: iterate thru each sub and mark terms still in use var liveTerms = new TermsEnum[dvs.Length]; for (int sub = 0; sub < liveTerms.Length; sub++) { AtomicReader reader = readers[sub]; SortedDocValues dv = dvs[sub]; Bits liveDocs = reader.LiveDocs; if (liveDocs == null) { liveTerms[sub] = dv.TermsEnum(); } else { var bitset = new LongBitSet(dv.ValueCount); for (int i = 0; i < reader.MaxDoc; i++) { if (liveDocs.Get(i)) { int ord = dv.GetOrd(i); if (ord >= 0) { bitset.Set(ord); } } } liveTerms[sub] = new BitsFilteredTermsEnum(dv.TermsEnum(), bitset); } } // step 2: create ordinal map (this conceptually does the "merging") var map = new OrdinalMap(this, liveTerms); // step 3: add field AddSortedField(fieldInfo, GetMergeSortValuesEnumerable(map, dvs), // doc -> ord GetMergeSortedFieldDocToOrdEnumerable(readers, dvs, map) ); }
internal SimplePrefixTermsEnum(TestPrefixRandom.DumbPrefixQuery outerInstance, TermsEnum tenum, BytesRef prefix) : base(tenum) { this.OuterInstance = outerInstance; this.Prefix = prefix; SetInitialSeekTerm(new BytesRef("")); }
public FilteredTermsEnumAnonymousInnerClassHelper2(TermsEnum termsEnum) : base(termsEnum, false) { }
public DocIdSetAnonymousClass(IBits acceptDocs, TermsEnum termsEnum) { this.acceptDocs = acceptDocs; this.termsEnum = termsEnum; }
public override TermsEnum Iterator(TermsEnum reuse) { var termsEnum = new PreTermsEnum(OuterInstance); termsEnum.Reset(fieldInfo); return termsEnum; }
public DocIdSetAnonymousInnerClassHelper(IBits acceptDocs, TermsEnum termsEnum) { this.acceptDocs = acceptDocs; this.termsEnum = termsEnum; }
public Int32DocValuesAnonymousInnerClassHelper(JoinDocFreqValueSource outerInstance, JoinDocFreqValueSource @this, BinaryDocValues terms, TermsEnum termsEnum) : base(@this) { this.outerInstance = outerInstance; this.terms = terms; this.termsEnum = termsEnum; @ref = new BytesRef(); }
public override TermsEnum Iterator(TermsEnum reuse) { return new SegmentTermsEnum(this, _blockTermsReader); }
/// <summary> /// Filters the given <see cref="TermsEnum"/> by accepting only prefix coded 64 bit /// terms with a shift value of <c>0</c>. /// <para/> /// NOTE: This was filterPrefixCodedLongs() in Lucene /// </summary> /// <param name="termsEnum"> /// The terms enum to filter </param> /// <returns> A filtered <see cref="TermsEnum"/> that only returns prefix coded 64 bit /// terms with a shift value of <c>0</c>. </returns> public static TermsEnum FilterPrefixCodedInt64s(TermsEnum termsEnum) { return(new FilteredTermsEnumAnonymousClass(termsEnum)); }
public override TermsEnum Iterator(TermsEnum reuse) { TVTermsEnum termsEnum; if (reuse is TVTermsEnum) { termsEnum = (TVTermsEnum)reuse; if (!termsEnum.CanReuse(OuterInstance.Tvf)) { termsEnum = new TVTermsEnum(OuterInstance); } } else { termsEnum = new TVTermsEnum(OuterInstance); } termsEnum.Reset(NumTerms, TvfFPStart, StorePositions, StoreOffsets, StorePayloads); return termsEnum; }
public FilteredTermsEnumAnonymousClass2(TermsEnum termsEnum) : base(termsEnum, false) { }
/// <exception cref="System.IO.IOException"></exception> public override void SetNextReader(AtomicReaderContext context) { if (segmentFacetCounts != null) { segmentResults.AddItem(((TermGroupFacetCollector.MV.SegmentResult)CreateSegmentResult ())); } groupFieldTermsIndex = FieldCache.DEFAULT.GetTermsIndex(((AtomicReader)context.Reader ()), groupField); facetFieldDocTermOrds = FieldCache.DEFAULT.GetDocTermOrds(((AtomicReader)context. Reader()), facetField); facetFieldNumTerms = (int)facetFieldDocTermOrds.GetValueCount(); if (facetFieldNumTerms == 0) { facetOrdTermsEnum = null; } else { facetOrdTermsEnum = facetFieldDocTermOrds.TermsEnum(); } // [facetFieldNumTerms() + 1] for all possible facet values and docs not containing facet field segmentFacetCounts = new int[facetFieldNumTerms + 1]; segmentTotalCount = 0; segmentGroupedFacetHits.Clear(); foreach (GroupedFacetHit groupedFacetHit in groupedFacetHits) { int groupOrd = groupedFacetHit.groupValue == null ? -1 : groupFieldTermsIndex.LookupTerm (groupedFacetHit.groupValue); if (groupedFacetHit.groupValue != null && groupOrd < 0) { continue; } int facetOrd; if (groupedFacetHit.facetValue != null) { if (facetOrdTermsEnum == null || !facetOrdTermsEnum.SeekExact(groupedFacetHit.facetValue )) { continue; } facetOrd = (int)facetOrdTermsEnum.Ord(); } else { facetOrd = facetFieldNumTerms; } // (facetFieldDocTermOrds.numTerms() + 1) for all possible facet values and docs not containing facet field int segmentGroupedFacetsIndex = groupOrd * (facetFieldNumTerms + 1) + facetOrd; segmentGroupedFacetHits.Put(segmentGroupedFacetsIndex); } if (facetPrefix != null) { TermsEnum.SeekStatus seekStatus; if (facetOrdTermsEnum != null) { seekStatus = facetOrdTermsEnum.SeekCeil(facetPrefix); } else { seekStatus = TermsEnum.SeekStatus.END; } if (seekStatus != TermsEnum.SeekStatus.END) { startFacetOrd = (int)facetOrdTermsEnum.Ord(); } else { startFacetOrd = 0; endFacetOrd = 0; return; } BytesRef facetEndPrefix = BytesRef.DeepCopyOf(facetPrefix); facetEndPrefix.Append(UnicodeUtil.BIG_TERM); seekStatus = facetOrdTermsEnum.SeekCeil(facetEndPrefix); if (seekStatus != TermsEnum.SeekStatus.END) { endFacetOrd = (int)facetOrdTermsEnum.Ord(); } else { endFacetOrd = facetFieldNumTerms; } } else { // Don't include null... startFacetOrd = 0; endFacetOrd = facetFieldNumTerms + 1; } }
public override TermsEnum GetIterator(TermsEnum reuse) { return((_fst != null) ? new SimpleTextTermsEnum(_outerInstance, _fst, _fieldInfo.IndexOptions) : TermsEnum.EMPTY); }
/// <summary> /// checks the terms enum sequentially /// if deep is false, it does a 'shallow' test that doesnt go down to the docsenums /// </summary> public virtual void AssertTermsEnum(TermsEnum leftTermsEnum, TermsEnum rightTermsEnum, bool deep) { IBits randomBits = new RandomBits(MAXDOC, Random.NextDouble(), Random); DocsAndPositionsEnum leftPositions = null; DocsAndPositionsEnum rightPositions = null; DocsEnum leftDocs = null; DocsEnum rightDocs = null; while (leftTermsEnum.MoveNext()) { Assert.IsTrue(rightTermsEnum.MoveNext()); Assert.AreEqual(leftTermsEnum.Term, rightTermsEnum.Term); AssertTermStats(leftTermsEnum, rightTermsEnum); if (deep) { // with payloads + off AssertDocsAndPositionsEnum(leftPositions = leftTermsEnum.DocsAndPositions(null, leftPositions), rightPositions = rightTermsEnum.DocsAndPositions(null, rightPositions)); AssertDocsAndPositionsEnum(leftPositions = leftTermsEnum.DocsAndPositions(randomBits, leftPositions), rightPositions = rightTermsEnum.DocsAndPositions(randomBits, rightPositions)); AssertPositionsSkipping(leftTermsEnum.DocFreq, leftPositions = leftTermsEnum.DocsAndPositions(null, leftPositions), rightPositions = rightTermsEnum.DocsAndPositions(null, rightPositions)); AssertPositionsSkipping(leftTermsEnum.DocFreq, leftPositions = leftTermsEnum.DocsAndPositions(randomBits, leftPositions), rightPositions = rightTermsEnum.DocsAndPositions(randomBits, rightPositions)); // with payloads only AssertDocsAndPositionsEnum(leftPositions = leftTermsEnum.DocsAndPositions(null, leftPositions, DocsAndPositionsFlags.PAYLOADS), rightPositions = rightTermsEnum.DocsAndPositions(null, rightPositions, DocsAndPositionsFlags.PAYLOADS)); AssertDocsAndPositionsEnum(leftPositions = leftTermsEnum.DocsAndPositions(randomBits, leftPositions, DocsAndPositionsFlags.PAYLOADS), rightPositions = rightTermsEnum.DocsAndPositions(randomBits, rightPositions, DocsAndPositionsFlags.PAYLOADS)); AssertPositionsSkipping(leftTermsEnum.DocFreq, leftPositions = leftTermsEnum.DocsAndPositions(null, leftPositions, DocsAndPositionsFlags.PAYLOADS), rightPositions = rightTermsEnum.DocsAndPositions(null, rightPositions, DocsAndPositionsFlags.PAYLOADS)); AssertPositionsSkipping(leftTermsEnum.DocFreq, leftPositions = leftTermsEnum.DocsAndPositions(randomBits, leftPositions, DocsAndPositionsFlags.PAYLOADS), rightPositions = rightTermsEnum.DocsAndPositions(randomBits, rightPositions, DocsAndPositionsFlags.PAYLOADS)); // with offsets only AssertDocsAndPositionsEnum(leftPositions = leftTermsEnum.DocsAndPositions(null, leftPositions, DocsAndPositionsFlags.OFFSETS), rightPositions = rightTermsEnum.DocsAndPositions(null, rightPositions, DocsAndPositionsFlags.OFFSETS)); AssertDocsAndPositionsEnum(leftPositions = leftTermsEnum.DocsAndPositions(randomBits, leftPositions, DocsAndPositionsFlags.OFFSETS), rightPositions = rightTermsEnum.DocsAndPositions(randomBits, rightPositions, DocsAndPositionsFlags.OFFSETS)); AssertPositionsSkipping(leftTermsEnum.DocFreq, leftPositions = leftTermsEnum.DocsAndPositions(null, leftPositions, DocsAndPositionsFlags.OFFSETS), rightPositions = rightTermsEnum.DocsAndPositions(null, rightPositions, DocsAndPositionsFlags.OFFSETS)); AssertPositionsSkipping(leftTermsEnum.DocFreq, leftPositions = leftTermsEnum.DocsAndPositions(randomBits, leftPositions, DocsAndPositionsFlags.OFFSETS), rightPositions = rightTermsEnum.DocsAndPositions(randomBits, rightPositions, DocsAndPositionsFlags.OFFSETS)); // with positions only AssertDocsAndPositionsEnum(leftPositions = leftTermsEnum.DocsAndPositions(null, 
leftPositions, DocsAndPositionsFlags.NONE), rightPositions = rightTermsEnum.DocsAndPositions(null, rightPositions, DocsAndPositionsFlags.NONE)); AssertDocsAndPositionsEnum(leftPositions = leftTermsEnum.DocsAndPositions(randomBits, leftPositions, DocsAndPositionsFlags.NONE), rightPositions = rightTermsEnum.DocsAndPositions(randomBits, rightPositions, DocsAndPositionsFlags.NONE)); AssertPositionsSkipping(leftTermsEnum.DocFreq, leftPositions = leftTermsEnum.DocsAndPositions(null, leftPositions, DocsAndPositionsFlags.NONE), rightPositions = rightTermsEnum.DocsAndPositions(null, rightPositions, DocsAndPositionsFlags.NONE)); AssertPositionsSkipping(leftTermsEnum.DocFreq, leftPositions = leftTermsEnum.DocsAndPositions(randomBits, leftPositions, DocsAndPositionsFlags.NONE), rightPositions = rightTermsEnum.DocsAndPositions(randomBits, rightPositions, DocsAndPositionsFlags.NONE)); // with freqs: AssertDocsEnum(leftDocs = leftTermsEnum.Docs(null, leftDocs), rightDocs = rightTermsEnum.Docs(null, rightDocs)); AssertDocsEnum(leftDocs = leftTermsEnum.Docs(randomBits, leftDocs), rightDocs = rightTermsEnum.Docs(randomBits, rightDocs)); // w/o freqs: AssertDocsEnum(leftDocs = leftTermsEnum.Docs(null, leftDocs, DocsFlags.NONE), rightDocs = rightTermsEnum.Docs(null, rightDocs, DocsFlags.NONE)); AssertDocsEnum(leftDocs = leftTermsEnum.Docs(randomBits, leftDocs, DocsFlags.NONE), rightDocs = rightTermsEnum.Docs(randomBits, rightDocs, DocsFlags.NONE)); // with freqs: AssertDocsSkipping(leftTermsEnum.DocFreq, leftDocs = leftTermsEnum.Docs(null, leftDocs), rightDocs = rightTermsEnum.Docs(null, rightDocs)); AssertDocsSkipping(leftTermsEnum.DocFreq, leftDocs = leftTermsEnum.Docs(randomBits, leftDocs), rightDocs = rightTermsEnum.Docs(randomBits, rightDocs)); // w/o freqs: AssertDocsSkipping(leftTermsEnum.DocFreq, leftDocs = leftTermsEnum.Docs(null, leftDocs, DocsFlags.NONE), rightDocs = rightTermsEnum.Docs(null, rightDocs, DocsFlags.NONE)); AssertDocsSkipping(leftTermsEnum.DocFreq, leftDocs = leftTermsEnum.Docs(randomBits, leftDocs, DocsFlags.NONE), rightDocs = rightTermsEnum.Docs(randomBits, rightDocs, DocsFlags.NONE)); } } Assert.IsFalse(rightTermsEnum.MoveNext()); }
public override TermsEnum GetIterator(TermsEnum reuse) { return(docTermOrds.GetTermsEnum()); }
public override TermsEnum Iterator(TermsEnum reuse) { return new SegmentTermsEnum(this); }
public override TermsEnum GetIterator(TermsEnum reuse) { return(new MemoryTermsEnum(outerInstance.outerInstance, info)); }
/* private class IterableAnonymousInnerClassHelper3 : IEnumerable<BytesRef> { private readonly DocValuesConsumer OuterInstance; private SortedDocValues[] Dvs; private OrdinalMap Map; public IterableAnonymousInnerClassHelper3(DocValuesConsumer outerInstance, SortedDocValues[] dvs, OrdinalMap map) { this.OuterInstance = outerInstance; this.Dvs = dvs; this.Map = map; } // ord -> value public virtual IEnumerator<BytesRef> GetEnumerator() { return new IteratorAnonymousInnerClassHelper3(this); } private class IteratorAnonymousInnerClassHelper3 : IEnumerator<BytesRef> { private readonly IterableAnonymousInnerClassHelper3 OuterInstance; public IteratorAnonymousInnerClassHelper3(IterableAnonymousInnerClassHelper3 outerInstance) { this.OuterInstance = outerInstance; scratch = new BytesRef(); } internal readonly BytesRef scratch; internal int currentOrd; public virtual bool HasNext() { return currentOrd < OuterInstance.Map.ValueCount; } public virtual BytesRef Next() { if (!HasNext()) { throw new Exception(); } int segmentNumber = OuterInstance.Map.GetFirstSegmentNumber(currentOrd); int segmentOrd = (int)OuterInstance.Map.GetFirstSegmentOrd(currentOrd); OuterInstance.Dvs[segmentNumber].LookupOrd(segmentOrd, scratch); currentOrd++; return scratch; } public virtual void Remove() { throw new System.NotSupportedException(); } } } private class IterableAnonymousInnerClassHelper4 : IEnumerable<Number> { private readonly DocValuesConsumer OuterInstance; private AtomicReader[] Readers; private SortedDocValues[] Dvs; private OrdinalMap Map; public IterableAnonymousInnerClassHelper4(DocValuesConsumer outerInstance, AtomicReader[] readers, SortedDocValues[] dvs, OrdinalMap map) { this.OuterInstance = outerInstance; this.Readers = readers; this.Dvs = dvs; this.Map = map; } public virtual IEnumerator<Number> GetEnumerator() { return new IteratorAnonymousInnerClassHelper4(this); } private class IteratorAnonymousInnerClassHelper4 : IEnumerator<Number> { private readonly IterableAnonymousInnerClassHelper4 OuterInstance; public IteratorAnonymousInnerClassHelper4(IterableAnonymousInnerClassHelper4 outerInstance) { this.OuterInstance = outerInstance; readerUpto = -1; } internal int readerUpto; internal int docIDUpto; internal int nextValue; internal AtomicReader currentReader; internal Bits currentLiveDocs; internal bool nextIsSet; public virtual bool HasNext() { return nextIsSet || SetNext(); } public virtual void Remove() { throw new System.NotSupportedException(); } public virtual Number Next() { if (!HasNext()) { throw new NoSuchElementException(); } Debug.Assert(nextIsSet); nextIsSet = false; // TODO make a mutable number return nextValue; } private bool SetNext() { while (true) { if (readerUpto == OuterInstance.Readers.Length) { return false; } if (currentReader == null || docIDUpto == currentReader.MaxDoc) { readerUpto++; if (readerUpto < OuterInstance.Readers.Length) { currentReader = OuterInstance.Readers[readerUpto]; currentLiveDocs = currentReader.LiveDocs; } docIDUpto = 0; continue; } if (currentLiveDocs == null || currentLiveDocs.get(docIDUpto)) { nextIsSet = true; int segOrd = OuterInstance.Dvs[readerUpto].GetOrd(docIDUpto); nextValue = segOrd == -1 ? - 1 : (int) OuterInstance.Map.GetGlobalOrd(readerUpto, segOrd); docIDUpto++; return true; } docIDUpto++; } } } }*/ /// <summary> /// Merges the sortedset docvalues from <code>toMerge</code>. 
/// <p> /// The default implementation calls <seealso cref="#addSortedSetField"/>, passing /// an Iterable that merges ordinals and values and filters deleted documents . /// </summary> public virtual void MergeSortedSetField(FieldInfo fieldInfo, MergeState mergeState, IList<SortedSetDocValues> toMerge) { var readers = mergeState.Readers.ToArray(); var dvs = toMerge.ToArray(); // step 1: iterate thru each sub and mark terms still in use var liveTerms = new TermsEnum[dvs.Length]; for (int sub = 0; sub < liveTerms.Length; sub++) { var reader = readers[sub]; var dv = dvs[sub]; var liveDocs = reader.LiveDocs; if (liveDocs == null) { liveTerms[sub] = dv.TermsEnum(); } else { var bitset = new LongBitSet(dv.ValueCount); for (int i = 0; i < reader.MaxDoc; i++) { if (liveDocs.Get(i)) { dv.Document = i; long ord; while ((ord = dv.NextOrd()) != SortedSetDocValues.NO_MORE_ORDS) { bitset.Set(ord); } } } liveTerms[sub] = new BitsFilteredTermsEnum(dv.TermsEnum(), bitset); } } // step 2: create ordinal map (this conceptually does the "merging") var map = new OrdinalMap(this, liveTerms); // step 3: add field AddSortedSetField(fieldInfo, GetMergeSortedSetValuesEnumerable(map, dvs), // doc -> ord count GetMergeSortedSetDocToOrdCountEnumerable(readers, dvs), // ords GetMergeSortedSetOrdsEnumerable(readers, dvs, map) ); }
public virtual void Test10kPulsed() { // we always run this test with pulsing codec. Codec cp = TestUtil.AlwaysPostingsFormat(new Pulsing41PostingsFormat(1)); DirectoryInfo f = CreateTempDir("10kpulsed"); BaseDirectoryWrapper dir = NewFSDirectory(f); dir.CheckIndexOnDispose = false; // we do this ourselves explicitly RandomIndexWriter iw = new RandomIndexWriter(Random, dir, NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random)).SetCodec(cp)); Document document = new Document(); FieldType ft = new FieldType(TextField.TYPE_STORED); switch (TestUtil.NextInt32(Random, 0, 2)) { case 0: ft.IndexOptions = IndexOptions.DOCS_ONLY; break; case 1: ft.IndexOptions = IndexOptions.DOCS_AND_FREQS; break; default: ft.IndexOptions = IndexOptions.DOCS_AND_FREQS_AND_POSITIONS; break; } Field field = NewField("field", "", ft); document.Add(field); //NumberFormat df = new DecimalFormat("00000", new DecimalFormatSymbols(Locale.ROOT)); // LUCENENET specific: Use .ToString formating instead for (int i = 0; i < 10050; i++) { //field.StringValue = df.format(i); field.SetStringValue(i.ToString("00000", CultureInfo.InvariantCulture)); iw.AddDocument(document); } IndexReader ir = iw.GetReader(); iw.Dispose(); TermsEnum te = MultiFields.GetTerms(ir, "field").GetEnumerator(); DocsEnum de = null; for (int i = 0; i < 10050; i++) { //string expected = df.format(i); string expected = i.ToString("00000", CultureInfo.InvariantCulture); te.MoveNext(); assertEquals(expected, te.Term.Utf8ToString()); de = TestUtil.Docs(Random, te, null, de, DocsFlags.NONE); assertTrue(de.NextDoc() != DocIdSetIterator.NO_MORE_DOCS); assertEquals(DocIdSetIterator.NO_MORE_DOCS, de.NextDoc()); } ir.Dispose(); TestUtil.CheckIndex(dir); dir.Dispose(); }
/// <summary> /// Filters the given <seealso cref="TermsEnum"/> by accepting only prefix coded 32 bit /// terms with a shift value of <tt>0</tt>. /// </summary> /// <param name="termsEnum"> /// the terms enum to filter </param> /// <returns> a filtered <seealso cref="TermsEnum"/> that only returns prefix coded 32 bit /// terms with a shift value of <tt>0</tt>. </returns> public static TermsEnum FilterPrefixCodedInts(TermsEnum termsEnum) { return new FilteredTermsEnumAnonymousInnerClassHelper2(termsEnum); }
//public static void main( string[] args ) throws Exception { // Analyzer analyzer = new WhitespaceAnalyzer(Version.LUCENE_CURRENT); // QueryParser parser = new QueryParser(Version.LUCENE_CURRENT, "f", analyzer ); // Query query = parser.parse( "a x:b" ); // FieldQuery fieldQuery = new FieldQuery( query, true, false ); // Directory dir = new RAMDirectory(); // IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)); // Document doc = new Document(); // IndexableFieldType ft = new IndexableFieldType(TextField.TYPE_STORED); // ft.setStoreTermVectors(true); // ft.setStoreTermVectorOffsets(true); // ft.setStoreTermVectorPositions(true); // doc.add( new Field( "f", ft, "a a a b b c a b b c d e f" ) ); // doc.add( new Field( "f", ft, "b a b a f" ) ); // writer.addDocument( doc ); // writer.close(); // IndexReader reader = IndexReader.open(dir1); // new FieldTermStack( reader, 0, "f", fieldQuery ); // reader.close(); //} /// <summary> /// a constructor. /// </summary> /// <param name="reader"><see cref="IndexReader"/> of the index</param> /// <param name="docId">document id to be highlighted</param> /// <param name="fieldName">field of the document to be highlighted</param> /// <param name="fieldQuery"><see cref="FieldQuery"/> object</param> /// <exception cref="System.IO.IOException">If there is a low-level I/O error</exception> public FieldTermStack(IndexReader reader, int docId, string fieldName, FieldQuery fieldQuery) { this.fieldName = fieldName; ISet <string> termSet = fieldQuery.GetTermSet(fieldName); // just return to make null snippet if un-matched fieldName specified when fieldMatch == true if (termSet == null) { return; } Fields vectors = reader.GetTermVectors(docId); if (vectors == null) { // null snippet return; } Terms vector = vectors.GetTerms(fieldName); if (vector == null) { // null snippet return; } CharsRef spare = new CharsRef(); TermsEnum termsEnum = vector.GetIterator(null); DocsAndPositionsEnum dpEnum = null; BytesRef text; int numDocs = reader.MaxDoc; while ((text = termsEnum.Next()) != null) { UnicodeUtil.UTF8toUTF16(text, spare); string term = spare.ToString(); if (!termSet.Contains(term)) { continue; } dpEnum = termsEnum.DocsAndPositions(null, dpEnum); if (dpEnum == null) { // null snippet return; } dpEnum.NextDoc(); // For weight look here: http://lucene.apache.org/core/3_6_0/api/core/org/apache/lucene/search/DefaultSimilarity.html float weight = (float)(Math.Log(numDocs / (double)(reader.DocFreq(new Term(fieldName, text)) + 1)) + 1.0); int freq = dpEnum.Freq; for (int i = 0; i < freq; i++) { int pos = dpEnum.NextPosition(); if (dpEnum.StartOffset < 0) { return; // no offsets, null snippet } termList.Add(new TermInfo(term, dpEnum.StartOffset, dpEnum.EndOffset, pos, weight)); } } // sort by position CollectionUtil.TimSort(termList); // now look for dups at the same position, linking them together int currentPos = -1; TermInfo previous = null; TermInfo first = null; for (int i = 0; i < termList.Count;) { TermInfo current = termList[i]; if (current.Position == currentPos) { Debug.Assert(previous != null); previous.SetNext(current); previous = current; //iterator.Remove(); // LUCENENET NOTE: Remove, but don't advance the i position (since removing will advance to the next item) termList.RemoveAt(i); } else { if (previous != null) { previous.SetNext(first); } previous = first = current; currentPos = current.Position; // LUCENENET NOTE: Only increment the position if we don't do a delete. 
i++; } } if (previous != null) { previous.SetNext(first); } }
public override TermsEnum Iterator(TermsEnum reuse) { return new FSTTermsEnum(field, fst); }
public virtual void TestPhrasePrefix() { Directory indexStore = NewDirectory(); RandomIndexWriter writer = new RandomIndexWriter(Random(), indexStore, Similarity, TimeZone); Document doc1 = new Document(); Document doc2 = new Document(); Document doc3 = new Document(); Document doc4 = new Document(); Document doc5 = new Document(); doc1.Add(NewTextField("body", "blueberry pie", Field.Store.YES)); doc2.Add(NewTextField("body", "blueberry strudel", Field.Store.YES)); doc3.Add(NewTextField("body", "blueberry pizza", Field.Store.YES)); doc4.Add(NewTextField("body", "blueberry chewing gum", Field.Store.YES)); doc5.Add(NewTextField("body", "piccadilly circus", Field.Store.YES)); writer.AddDocument(doc1); writer.AddDocument(doc2); writer.AddDocument(doc3); writer.AddDocument(doc4); writer.AddDocument(doc5); IndexReader reader = writer.Reader; writer.Dispose(); IndexSearcher searcher = NewSearcher(reader); // PhrasePrefixQuery query1 = new PhrasePrefixQuery(); MultiPhraseQuery query1 = new MultiPhraseQuery(); // PhrasePrefixQuery query2 = new PhrasePrefixQuery(); MultiPhraseQuery query2 = new MultiPhraseQuery(); query1.Add(new Term("body", "blueberry")); query2.Add(new Term("body", "strawberry")); LinkedList <Term> termsWithPrefix = new LinkedList <Term>(); // this TermEnum gives "piccadilly", "pie" and "pizza". string prefix = "pi"; TermsEnum te = MultiFields.GetFields(reader).GetTerms("body").GetIterator(null); te.SeekCeil(new BytesRef(prefix)); do { string s = te.Term.Utf8ToString(); if (s.StartsWith(prefix, StringComparison.Ordinal)) { termsWithPrefix.AddLast(new Term("body", s)); } else { break; } } while (te.Next() != null); query1.Add(termsWithPrefix.ToArray(/*new Term[0]*/)); query2.Add(termsWithPrefix.ToArray(/*new Term[0]*/)); ScoreDoc[] result; result = searcher.Search(query1, null, 1000).ScoreDocs; Assert.AreEqual(2, result.Length); result = searcher.Search(query2, null, 1000).ScoreDocs; Assert.AreEqual(0, result.Length); reader.Dispose(); indexStore.Dispose(); }
public override TermsEnum Iterator(TermsEnum reuse) { // TODO: reuse return new SimpleTVTermsEnum(TERMS); }
/// <summary> /// Low level api. Returns a token stream generated from a <see cref="Terms"/>. This /// can be used to feed the highlighter with a pre-parsed token /// stream. The <see cref="Terms"/> must have offsets available. /// <para/> /// In my tests the speeds to recreate 1000 token streams using this method are: /// <list type="bullet"> /// <item><description> /// with TermVector offset only data stored - 420 milliseconds /// </description></item> /// <item><description> /// with TermVector offset AND position data stored - 271 milliseconds /// (nb timings for TermVector with position data are based on a tokenizer with contiguous /// positions - no overlaps or gaps) /// </description></item> /// <item><description> /// The cost of not using TermPositionVector to store /// pre-parsed content and using an analyzer to re-parse the original content: /// - reanalyzing the original content - 980 milliseconds /// </description></item> /// </list> /// /// The re-analyze timings will typically vary depending on - /// <list type="number"> /// <item><description> /// The complexity of the analyzer code (timings above were using a /// stemmer/lowercaser/stopword combo) /// </description></item> /// <item><description> /// The number of other fields (Lucene reads ALL fields off the disk /// when accessing just one document field - can cost dear!) /// </description></item> /// <item><description> /// Use of compression on field storage - could be faster due to compression (less disk IO) /// or slower (more CPU burn) depending on the content. /// </description></item> /// </list> /// </summary> /// <param name="tpv"></param> /// <param name="tokenPositionsGuaranteedContiguous">true if the token position numbers have no overlaps or gaps. If looking /// to eek out the last drops of performance, set to true. 
If in doubt, set to false.</param> /// <exception cref="ArgumentException">if no offsets are available</exception> public static TokenStream GetTokenStream(Terms tpv, bool tokenPositionsGuaranteedContiguous) { if (!tpv.HasOffsets) { throw new ArgumentException("Cannot create TokenStream from Terms without offsets"); } if (!tokenPositionsGuaranteedContiguous && tpv.HasPositions) { return(new TokenStreamFromTermPositionVector(tpv)); } bool hasPayloads = tpv.HasPayloads; // code to reconstruct the original sequence of Tokens TermsEnum termsEnum = tpv.GetEnumerator(); int totalTokens = 0; while (termsEnum.MoveNext()) { totalTokens += (int)termsEnum.TotalTermFreq; } Token[] tokensInOriginalOrder = new Token[totalTokens]; List <Token> unsortedTokens = null; termsEnum = tpv.GetEnumerator(); DocsAndPositionsEnum dpEnum = null; while (termsEnum.MoveNext()) { dpEnum = termsEnum.DocsAndPositions(null, dpEnum); if (dpEnum == null) { throw new ArgumentException("Required TermVector Offset information was not found"); } string term = termsEnum.Term.Utf8ToString(); dpEnum.NextDoc(); int freq = dpEnum.Freq; for (int posUpto = 0; posUpto < freq; posUpto++) { int pos = dpEnum.NextPosition(); if (dpEnum.StartOffset < 0) { throw new ArgumentException("Required TermVector Offset information was not found"); } Token token = new Token(term, dpEnum.StartOffset, dpEnum.EndOffset); if (hasPayloads) { // Must make a deep copy of the returned payload, // since D&PEnum API is allowed to re-use on every // call: token.Payload = BytesRef.DeepCopyOf(dpEnum.GetPayload()); } if (tokenPositionsGuaranteedContiguous && pos != -1) { // We have positions stored and a guarantee that the token position // information is contiguous // This may be fast BUT wont work if Tokenizers used which create >1 // token in same position or // creates jumps in position numbers - this code would fail under those // circumstances // tokens stored with positions - can use this to index straight into // sorted array tokensInOriginalOrder[pos] = token; } else { // tokens NOT stored with positions or not guaranteed contiguous - must // add to list and sort later if (unsortedTokens == null) { unsortedTokens = new List <Token>(); } unsortedTokens.Add(token); } } } // If the field has been stored without position data we must perform a sort if (unsortedTokens != null) { tokensInOriginalOrder = unsortedTokens.ToArray(); ArrayUtil.TimSort(tokensInOriginalOrder, new TokenComparer()); //tokensInOriginalOrder = tokensInOriginalOrder // .OrderBy(t => t, new TokenComparer() ) // .ToArray(); } return(new StoredTokenStream(tokensInOriginalOrder)); }
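// A hedged caller-side sketch for the method above (assumed to be the highlighter's
// TokenSources.GetTokenStream, as in Lucene): replay a stored term vector, indexed with offsets,
// as a TokenStream that can then be fed to a Highlighter. Field and variable names are illustrative.
using System;
using Lucene.Net.Analysis;
using Lucene.Net.Index;
using Lucene.Net.Search.Highlight;

internal static class TokenStreamFromVectorSketch
{
    public static TokenStream FromStoredVector(IndexReader reader, int docId)
    {
        Terms vector = reader.GetTermVector(docId, "body");
        if (vector == null || !vector.HasOffsets)
        {
            throw new InvalidOperationException("field 'body' has no term vector with offsets for this doc");
        }
        // false: positions are not guaranteed contiguous, so the safer reconstruction path is taken.
        return TokenSources.GetTokenStream(vector, false);
    }
}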
/// <summary> /// Default merge impl </summary> public virtual void Merge(MergeState mergeState, FieldInfo.IndexOptions? indexOptions, TermsEnum termsEnum) { BytesRef term; Debug.Assert(termsEnum != null); long sumTotalTermFreq = 0; long sumDocFreq = 0; long sumDFsinceLastAbortCheck = 0; FixedBitSet visitedDocs = new FixedBitSet(mergeState.SegmentInfo.DocCount); if (indexOptions == FieldInfo.IndexOptions.DOCS_ONLY) { if (DocsEnum == null) { DocsEnum = new MappingMultiDocsEnum(); } DocsEnum.MergeState = mergeState; MultiDocsEnum docsEnumIn = null; while ((term = termsEnum.Next()) != null) { // We can pass null for liveDocs, because the // mapping enum will skip the non-live docs: docsEnumIn = (MultiDocsEnum)termsEnum.Docs(null, docsEnumIn, Index.DocsEnum.FLAG_NONE); if (docsEnumIn != null) { DocsEnum.Reset(docsEnumIn); PostingsConsumer postingsConsumer = StartTerm(term); TermStats stats = postingsConsumer.Merge(mergeState, indexOptions, DocsEnum, visitedDocs); if (stats.DocFreq > 0) { FinishTerm(term, stats); sumTotalTermFreq += stats.DocFreq; sumDFsinceLastAbortCheck += stats.DocFreq; sumDocFreq += stats.DocFreq; if (sumDFsinceLastAbortCheck > 60000) { mergeState.checkAbort.Work(sumDFsinceLastAbortCheck / 5.0); sumDFsinceLastAbortCheck = 0; } } } } } else if (indexOptions == FieldInfo.IndexOptions.DOCS_AND_FREQS) { if (DocsAndFreqsEnum == null) { DocsAndFreqsEnum = new MappingMultiDocsEnum(); } DocsAndFreqsEnum.MergeState = mergeState; MultiDocsEnum docsAndFreqsEnumIn = null; while ((term = termsEnum.Next()) != null) { // We can pass null for liveDocs, because the // mapping enum will skip the non-live docs: docsAndFreqsEnumIn = (MultiDocsEnum)termsEnum.Docs(null, docsAndFreqsEnumIn); Debug.Assert(docsAndFreqsEnumIn != null); DocsAndFreqsEnum.Reset(docsAndFreqsEnumIn); PostingsConsumer postingsConsumer = StartTerm(term); TermStats stats = postingsConsumer.Merge(mergeState, indexOptions, DocsAndFreqsEnum, visitedDocs); if (stats.DocFreq > 0) { FinishTerm(term, stats); sumTotalTermFreq += stats.TotalTermFreq; sumDFsinceLastAbortCheck += stats.DocFreq; sumDocFreq += stats.DocFreq; if (sumDFsinceLastAbortCheck > 60000) { mergeState.checkAbort.Work(sumDFsinceLastAbortCheck / 5.0); sumDFsinceLastAbortCheck = 0; } } } } else if (indexOptions == FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) { if (PostingsEnum == null) { PostingsEnum = new MappingMultiDocsAndPositionsEnum(); } PostingsEnum.MergeState = mergeState; MultiDocsAndPositionsEnum postingsEnumIn = null; while ((term = termsEnum.Next()) != null) { // We can pass null for liveDocs, because the // mapping enum will skip the non-live docs: postingsEnumIn = (MultiDocsAndPositionsEnum)termsEnum.DocsAndPositions(null, postingsEnumIn, DocsAndPositionsEnum.FLAG_PAYLOADS); Debug.Assert(postingsEnumIn != null); PostingsEnum.Reset(postingsEnumIn); PostingsConsumer postingsConsumer = StartTerm(term); TermStats stats = postingsConsumer.Merge(mergeState, indexOptions, PostingsEnum, visitedDocs); if (stats.DocFreq > 0) { FinishTerm(term, stats); sumTotalTermFreq += stats.TotalTermFreq; sumDFsinceLastAbortCheck += stats.DocFreq; sumDocFreq += stats.DocFreq; if (sumDFsinceLastAbortCheck > 60000) { mergeState.checkAbort.Work(sumDFsinceLastAbortCheck / 5.0); sumDFsinceLastAbortCheck = 0; } } } } else { Debug.Assert(indexOptions == FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); if (PostingsEnum == null) { PostingsEnum = new MappingMultiDocsAndPositionsEnum(); } PostingsEnum.MergeState = mergeState; MultiDocsAndPositionsEnum 
postingsEnumIn = null; while ((term = termsEnum.Next()) != null) { // We can pass null for liveDocs, because the // mapping enum will skip the non-live docs: postingsEnumIn = (MultiDocsAndPositionsEnum)termsEnum.DocsAndPositions(null, postingsEnumIn); Debug.Assert(postingsEnumIn != null); PostingsEnum.Reset(postingsEnumIn); PostingsConsumer postingsConsumer = StartTerm(term); TermStats stats = postingsConsumer.Merge(mergeState, indexOptions, PostingsEnum, visitedDocs); if (stats.DocFreq > 0) { FinishTerm(term, stats); sumTotalTermFreq += stats.TotalTermFreq; sumDFsinceLastAbortCheck += stats.DocFreq; sumDocFreq += stats.DocFreq; if (sumDFsinceLastAbortCheck > 60000) { mergeState.checkAbort.Work(sumDFsinceLastAbortCheck / 5.0); sumDFsinceLastAbortCheck = 0; } } } } Finish(indexOptions == FieldInfo.IndexOptions.DOCS_ONLY ? -1 : sumTotalTermFreq, sumDocFreq, visitedDocs.Cardinality()); }
internal SVInnerScorer(TermsIncludingScoreQuery outerInstance, Weight weight, Bits acceptDocs, TermsEnum termsEnum, long cost) { this.outerInstance = outerInstance; _acceptDocs = acceptDocs; _termsEnum = termsEnum; _cost = cost; _doc = -1; }
internal MVInnerScorer(TermsIncludingScoreQuery outerInstance, Weight weight, Bits acceptDocs, TermsEnum termsEnum, int maxDoc, long cost) : base(outerInstance, weight, acceptDocs, termsEnum, cost) { this.outerInstance = outerInstance; alreadyEmittedDocs = new FixedBitSet(maxDoc); }
private TermsEnum Delegate() { // pull the iterator only if we really need it - // this can be a relatively heavy operation depending on the // delegate postings format and the underlying directory // (clone IndexInput) return DELEGATE_TERMS_ENUM ?? (DELEGATE_TERMS_ENUM = _delegateTerms.Iterator(_reuseDelegate)); }
internal MVInOrderScorer(TermsIncludingScoreQuery outerInstance, Weight weight, Bits acceptDocs, TermsEnum termsEnum, int maxDoc, long cost) : base(outerInstance, weight, acceptDocs, termsEnum, maxDoc, cost) { }
private void VerifyVector(TermsEnum vector, int num) { StringBuilder temp = new StringBuilder(); while (vector.Next() != null) { temp.Append(vector.Term().Utf8ToString()); } if (!English.IntToEnglish(num).Trim().Equals(temp.ToString().Trim())) { Console.WriteLine("wrong term result"); } }
public override TermsEnum Iterator(TermsEnum reuse) { // TODO: reuse return(new SimpleTVTermsEnum(TERMS)); }
public override TermsEnum Iterator(TermsEnum reuse) { return new SortingTermsEnum(@in.Iterator(reuse), docMap, indexOptions); }
public override void Run() { if (Verbose) { Console.WriteLine(Thread.CurrentThread.Name + ": launch search thread"); } while (Environment.TickCount < stopTimeMS) { try { IndexSearcher s = outerInstance.GetCurrentSearcher(); try { // Verify 1) IW is correctly setting // diagnostics, and 2) segment warming for // merged segments is actually happening: foreach (AtomicReaderContext sub in s.IndexReader.Leaves) { SegmentReader segReader = (SegmentReader)sub.Reader; IDictionary <string, string> diagnostics = segReader.SegmentInfo.Info.Diagnostics; assertNotNull(diagnostics); string source; diagnostics.TryGetValue("source", out source); assertNotNull(source); if (source.Equals("merge", StringComparison.Ordinal)) { assertTrue("sub reader " + sub + " wasn't warmed: warmed=" + outerInstance.warmed + " diagnostics=" + diagnostics + " si=" + segReader.SegmentInfo, // LUCENENET: ConditionalWeakTable doesn't have ContainsKey, so we normalize to TryGetValue !outerInstance.m_assertMergedSegmentsWarmed || outerInstance.warmed.TryGetValue(segReader.core, out BooleanRef _)); } } if (s.IndexReader.NumDocs > 0) { outerInstance.SmokeTestSearcher(s); Fields fields = MultiFields.GetFields(s.IndexReader); if (fields == null) { continue; } Terms terms = fields.GetTerms("body"); if (terms == null) { continue; } TermsEnum termsEnum = terms.GetIterator(null); int seenTermCount = 0; int shift; int trigger; if (totTermCount < 30) { shift = 0; trigger = 1; } else { trigger = totTermCount / 30; shift = Random.Next(trigger); } while (Environment.TickCount < stopTimeMS) { BytesRef term = termsEnum.Next(); if (term == null) { totTermCount.Value = seenTermCount; break; } seenTermCount++; // search 30 terms if ((seenTermCount + shift) % trigger == 0) { //if (VERBOSE) { //System.out.println(Thread.currentThread().getName() + " now search body:" + term.Utf8ToString()); //} totHits.AddAndGet(outerInstance.RunQuery(s, new TermQuery(new Term("body", term)))); } } //if (VERBOSE) { //System.out.println(Thread.currentThread().getName() + ": search done"); //} } } finally { outerInstance.ReleaseSearcher(s); } } catch (Exception t) { Console.WriteLine(Thread.CurrentThread.Name + ": hit exc"); outerInstance.m_failed.Value = (true); Console.WriteLine(t.ToString()); throw new Exception(t.ToString(), t); } } }
public TermRangeTermsEnumAnonymousInnerClassHelper(MultiTermQueryAnonymousInnerClassHelper outerInstance, TermsEnum iterator, BytesRef bref1, BytesRef bref2) : base(iterator, bref1, bref2, true, true) { this.OuterInstance = outerInstance; boostAtt = Attributes().AddAttribute<IBoostAttribute>(); }
private IDictionary <int, object> HighlightField(string field, string[] contents, BreakIterator bi, BytesRef[] terms, int[] docids, IList <AtomicReaderContext> leaves, int maxPassages, Query query) { IDictionary <int, object> highlights = new Dictionary <int, object>(); PassageFormatter fieldFormatter = GetFormatter(field); if (fieldFormatter == null) { throw new NullReferenceException("PassageFormatter cannot be null"); } // check if we should do any multiterm processing Analyzer analyzer = GetIndexAnalyzer(field); CharacterRunAutomaton[] automata = Arrays.Empty <CharacterRunAutomaton>(); if (analyzer != null) { automata = MultiTermHighlighting.ExtractAutomata(query, field); } // resize 'terms', where the last term is the multiterm matcher if (automata.Length > 0) { BytesRef[] newTerms = new BytesRef[terms.Length + 1]; System.Array.Copy(terms, 0, newTerms, 0, terms.Length); terms = newTerms; } // we are processing in increasing docid order, so we only need to reinitialize stuff on segment changes // otherwise, we will just advance() existing enums to the new document in the same segment. DocsAndPositionsEnum[] postings = null; TermsEnum termsEnum = null; int lastLeaf = -1; for (int i = 0; i < docids.Length; i++) { string content = contents[i]; if (content.Length == 0) { continue; // nothing to do } bi.SetText(content); int doc = docids[i]; int leaf = ReaderUtil.SubIndex(doc, leaves); AtomicReaderContext subContext = leaves[leaf]; AtomicReader r = subContext.AtomicReader; if (Debugging.AssertsEnabled) { Debugging.Assert(leaf >= lastLeaf); // increasing order } // if the segment has changed, we must initialize new enums. if (leaf != lastLeaf) { Terms t = r.GetTerms(field); if (t != null) { termsEnum = t.GetEnumerator(); postings = new DocsAndPositionsEnum[terms.Length]; } } if (termsEnum == null) { continue; // no terms for this field, nothing to do } // if there are multi-term matches, we have to initialize the "fake" enum for each document if (automata.Length > 0) { DocsAndPositionsEnum dp = MultiTermHighlighting.GetDocsEnum(analyzer.GetTokenStream(field, content), automata); dp.Advance(doc - subContext.DocBase); postings[terms.Length - 1] = dp; // last term is the multiterm matcher } Passage[] passages = HighlightDoc(field, terms, content.Length, bi, doc - subContext.DocBase, termsEnum, postings, maxPassages); if (passages.Length == 0) { // no passages were returned, so ask for a default summary passages = GetEmptyHighlight(field, bi, maxPassages); } if (passages.Length > 0) { highlights[doc] = fieldFormatter.Format(passages, content); } lastLeaf = leaf; } return(highlights); }
protected internal virtual void MaxEditDistanceChanged(BytesRef lastTerm, int maxEdits, bool init) { TermsEnum newEnum = GetAutomatonEnum(maxEdits, lastTerm); // instead of assert, we do a hard check in case someone uses our enum directly // assert newEnum != null; if (newEnum == null) { Debug.Assert(maxEdits > LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE); throw new System.ArgumentException("maxEdits cannot be > LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE"); } Enum = newEnum; }
// algorithm: treat sentence snippets as miniature documents // we can intersect these with the postings lists via BreakIterator.preceding(offset),s // score each sentence as norm(sentenceStartOffset) * sum(weight * tf(freq)) private Passage[] HighlightDoc(string field, BytesRef[] terms, int contentLength, BreakIterator bi, int doc, TermsEnum termsEnum, DocsAndPositionsEnum[] postings, int n) { PassageScorer scorer = GetScorer(field); if (scorer == null) { throw new NullReferenceException("PassageScorer cannot be null"); } JCG.PriorityQueue <OffsetsEnum> pq = new JCG.PriorityQueue <OffsetsEnum>(); float[] weights = new float[terms.Length]; // initialize postings for (int i = 0; i < terms.Length; i++) { DocsAndPositionsEnum de = postings[i]; int pDoc; if (de == EMPTY) { continue; } else if (de == null) { postings[i] = EMPTY; // initially if (!termsEnum.SeekExact(terms[i])) { continue; // term not found } de = postings[i] = termsEnum.DocsAndPositions(null, null, DocsAndPositionsFlags.OFFSETS); if (de == null) { // no positions available throw new ArgumentException("field '" + field + "' was indexed without offsets, cannot highlight"); } pDoc = de.Advance(doc); } else { pDoc = de.DocID; if (pDoc < doc) { pDoc = de.Advance(doc); } } if (doc == pDoc) { weights[i] = scorer.Weight(contentLength, de.Freq); de.NextPosition(); pq.Add(new OffsetsEnum(de, i)); } } pq.Add(new OffsetsEnum(EMPTY, int.MaxValue)); // a sentinel for termination JCG.PriorityQueue <Passage> passageQueue = new JCG.PriorityQueue <Passage>(n, Comparer <Passage> .Create((left, right) => { if (left.score < right.score) { return(-1); } else if (left.score > right.score) { return(1); } else { return(left.startOffset - right.startOffset); } })); Passage current = new Passage(); while (pq.TryDequeue(out OffsetsEnum off)) { DocsAndPositionsEnum dp = off.dp; int start = dp.StartOffset; if (start == -1) { throw new ArgumentException("field '" + field + "' was indexed without offsets, cannot highlight"); } int end = dp.EndOffset; // LUCENE-5166: this hit would span the content limit... however more valid // hits may exist (they are sorted by start). so we pretend like we never // saw this term, it won't cause a passage to be added to passageQueue or anything. 
if (Debugging.AssertsEnabled) { Debugging.Assert(EMPTY.StartOffset == int.MaxValue); } if (start < contentLength && end > contentLength) { continue; } if (start >= current.endOffset) { if (current.startOffset >= 0) { // finalize current current.score *= scorer.Norm(current.startOffset); // new sentence: first add 'current' to queue if (passageQueue.Count == n && current.score < passageQueue.Peek().score) { current.Reset(); // can't compete, just reset it } else { passageQueue.Enqueue(current); if (passageQueue.Count > n) { current = passageQueue.Dequeue(); current.Reset(); } else { current = new Passage(); } } } // if we exceed limit, we are done if (start >= contentLength) { Passage[] passages = passageQueue.ToArray(); foreach (Passage p in passages) { p.Sort(); } // sort in ascending order ArrayUtil.TimSort(passages, Comparer <Passage> .Create((left, right) => left.startOffset - right.startOffset)); return(passages); } // advance breakiterator if (Debugging.AssertsEnabled) { Debugging.Assert(BreakIterator.Done < 0); } current.startOffset = Math.Max(bi.Preceding(start + 1), 0); current.endOffset = Math.Min(bi.Next(), contentLength); } int tf = 0; while (true) { tf++; BytesRef term = terms[off.id]; if (term == null) { // multitermquery match, pull from payload term = off.dp.GetPayload(); if (Debugging.AssertsEnabled) { Debugging.Assert(term != null); } } current.AddMatch(start, end, term); if (off.pos == dp.Freq) { break; // removed from pq } else { off.pos++; dp.NextPosition(); start = dp.StartOffset; end = dp.EndOffset; } if (start >= current.endOffset || end > contentLength) { pq.Enqueue(off); break; } } current.score += weights[off.id] * scorer.Tf(tf, current.endOffset - current.startOffset); } // Dead code but compiler disagrees: if (Debugging.AssertsEnabled) { Debugging.Assert(false); } return(null); }
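// For orientation, HighlightField/HighlightDoc above are internals of the offsets-based (postings)
// highlighter. A hedged end-to-end sketch of how a caller typically drives it, assuming the
// Lucene.Net PostingsHighlighter API and a "body" field indexed with
// IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS:
using Lucene.Net.Search;
using Lucene.Net.Search.PostingsHighlight;

internal static class PostingsHighlightSketch
{
    public static string[] HighlightBody(IndexSearcher searcher, Query query, int topN)
    {
        TopDocs topDocs = searcher.Search(query, topN);
        var highlighter = new PostingsHighlighter();
        // One snippet string (or null) per hit, in topDocs order.
        return highlighter.Highlight("body", query, searcher, topDocs);
    }
}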
internal SimpleAutomatonTermsEnum(TestRegexpRandom2.DumbRegexpQuery outerInstance, TermsEnum tenum) : base(tenum) { this.OuterInstance = outerInstance; if (!InstanceFieldsInitialized) { InitializeInstanceFields(); InstanceFieldsInitialized = true; } InitialSeekTerm = new BytesRef(""); }
/// <exception cref="System.IO.IOException"></exception> internal SegmentResult(int[] counts, int total, TermsEnum tenum, int startFacetOrd , int endFacetOrd) : base(counts, total - counts[0], counts[0], endFacetOrd + 1) { this.tenum = tenum; this.mergePos = startFacetOrd == -1 ? 1 : startFacetOrd + 1; if (mergePos < maxTermPos) { tenum != null.SeekExact(startFacetOrd == -1 ? 0 : startFacetOrd); mergeTerm = tenum.Term(); } }