/// <summary>
/// Creates a new QueryAutoStopWordAnalyzer with stopwords calculated for the
/// given selection of fields from terms with a document frequency greater than
/// the given maxDocFreq.
/// </summary>
/// <param name="matchVersion"> Version to be used in <seealso cref="StopFilter"/> </param>
/// <param name="delegate"> Analyzer whose TokenStream will be filtered </param>
/// <param name="indexReader"> IndexReader to identify the stopwords from </param>
/// <param name="fields"> Selection of fields to calculate stopwords for </param>
/// <param name="maxDocFreq"> Document frequency terms should be above in order to be stopwords </param>
/// <exception cref="IOException"> Can be thrown while reading from the IndexReader </exception>
public QueryAutoStopWordAnalyzer(LuceneVersion matchVersion, Analyzer @delegate, IndexReader indexReader, IEnumerable<string> fields, int maxDocFreq)
    : base(@delegate.Strategy)
{
    this.matchVersion = matchVersion;
    this.@delegate = @delegate;

    foreach (string field in fields)
    {
        // Every requested field gets an entry, even when it has no terms;
        // the set is filled in place below.
        var fieldStopWords = new HashSet<string>();
        stopWordsPerField[field] = fieldStopWords;

        Terms fieldTerms = MultiFields.GetTerms(indexReader, field);
        if (fieldTerms == null)
        {
            continue;
        }

        TermsEnum termsEnum = fieldTerms.Iterator(null);
        var utf16Scratch = new CharsRef(); // reused UTF-16 scratch buffer for decoding terms
        BytesRef termBytes;
        while ((termBytes = termsEnum.Next()) != null)
        {
            // A term occurring in more than maxDocFreq documents becomes a stopword.
            if (termsEnum.DocFreq() > maxDocFreq)
            {
                UnicodeUtil.UTF8toUTF16(termBytes, utf16Scratch);
                fieldStopWords.Add(utf16Scratch.ToString());
            }
        }
    }
}
/// <summary>
/// Collects one term from the wrapped enum. Accumulates the number of documents
/// visited (via DocFreq) and aborts collection — returning false and setting
/// HasCutOff — once either the pending-term count reaches TermCountLimit or the
/// visited-doc count reaches DocCountCutoff. Otherwise the term's per-segment
/// statistics are merged into the shared TermContext array.
/// </summary>
public override bool Collect(BytesRef bytes)
{
    int pos = PendingTerms.Add(bytes);
    // Count docs BEFORE the cutoff test so the term that crosses the limit
    // still contributes to the total.
    DocVisitCount += TermsEnum.DocFreq();
    if (PendingTerms.Size() >= TermCountLimit || DocVisitCount >= DocCountCutoff)
    {
        // Too many terms or too many documents visited: stop collecting.
        HasCutOff = true;
        return (false);
    }
    TermState termState = TermsEnum.TermState();
    Debug.Assert(termState != null);
    if (pos < 0)
    {
        // Term already present: Add() returned -(slot)-1, so decode the slot
        // and register this segment's stats on the existing context.
        pos = (-pos) - 1;
        Array.TermState[pos].Register(termState, ReaderContext.Ord, TermsEnum.DocFreq(), TermsEnum.TotalTermFreq());
    }
    else
    {
        // First occurrence of this term: create a fresh per-term context.
        Array.TermState[pos] = new TermContext(TopReaderContext, termState, ReaderContext.Ord, TermsEnum.DocFreq(), TermsEnum.TotalTermFreq());
    }
    return (true);
}
/// <summary>
/// Builds a bit set over the reader's documents where a set bit means "keep".
/// All bits start set; for every term of <c>fieldName</c> that occurs in more
/// than one document, the duplicate occurrences are cleared so that only one
/// occurrence (the first or the last, per <c>keepMode</c>) remains set.
/// </summary>
private FixedBitSet FastBits(AtomicReader reader, Bits acceptDocs)
{
    FixedBitSet bits = new FixedBitSet(reader.MaxDoc);
    bits.Set(0, reader.MaxDoc); //assume all are valid
    Terms terms = reader.Fields.Terms(fieldName);
    if (terms == null)
    {
        // No terms in this field: nothing is a duplicate, keep everything.
        return (bits);
    }
    TermsEnum termsEnum = terms.Iterator(null);
    DocsEnum docs = null; // reused across terms to avoid reallocation
    while (true)
    {
        BytesRef currTerm = termsEnum.Next();
        if (currTerm == null)
        {
            break;
        }
        else
        {
            if (termsEnum.DocFreq() > 1)
            {
                // unset potential duplicates
                docs = termsEnum.Docs(acceptDocs, docs, DocsEnum.FLAG_NONE);
                int doc = docs.NextDoc();
                if (doc != DocIdSetIterator.NO_MORE_DOCS)
                {
                    if (keepMode == KeepMode.KM_USE_FIRST_OCCURRENCE)
                    {
                        // Skip past the first occurrence so it stays set;
                        // the loop below clears the remaining ones.
                        doc = docs.NextDoc();
                    }
                }
                int lastDoc = -1;
                while (true)
                {
                    lastDoc = doc;
                    bits.Clear(lastDoc);
                    doc = docs.NextDoc();
                    if (doc == DocIdSetIterator.NO_MORE_DOCS)
                    {
                        break;
                    }
                }
                // In last-occurrence mode every occurrence was cleared above,
                // so re-set the final one.
                if (keepMode == KeepMode.KM_USE_LAST_OCCURRENCE)
                {
                    // restore the last bit
                    bits.Set(lastDoc);
                }
            }
        }
    }
    return (bits);
}
/// <summary>
/// Checks that two positioned TermsEnums report identical term-level statistics
/// (document frequency, and total term frequency when both codecs record it).
/// </summary>
public virtual void AssertTermStats(TermsEnum leftTermsEnum, TermsEnum rightTermsEnum)
{
    Assert.AreEqual(leftTermsEnum.DocFreq(), rightTermsEnum.DocFreq());

    long leftTotal = leftTermsEnum.TotalTermFreq();
    long rightTotal = rightTermsEnum.TotalTermFreq();
    // A value of -1 means the statistic is unavailable; only compare when
    // both sides provide it.
    if (leftTotal != -1 && rightTotal != -1)
    {
        Assert.AreEqual(leftTotal, rightTotal);
    }
}
/// <summary>
/// Advances to the next "frequent" term of the wrapped enum. Returns a reusable
/// <c>spare</c> holding a copy of the term's bytes and records its document
/// frequency in <c>freq</c>; returns null when the enum is exhausted or absent.
/// </summary>
public BytesRef Next()
{
    if (termsEnum != null)
    {
        BytesRef next;
        while ((next = termsEnum.Next()) != null)
        {
            // Read the doc freq once and reuse it: the original called
            // DocFreq() twice per accepted term.
            int docFreq = termsEnum.DocFreq();
            if (IsFrequent(docFreq))
            {
                freq = docFreq;
                // Copy: the enum may overwrite 'next' on the following call.
                spare.CopyBytes(next);
                return (spare);
            }
        }
    }
    return (null);
}
/// <summary>
/// Feeds every term of <paramref name="termsEnum"/> into the bounded queue as a
/// TermStats entry for the given field, letting InsertWithOverflow evict
/// entries once capacity is exceeded.
/// </summary>
internal void Fill(string field, TermsEnum termsEnum)
{
    for (BytesRef term = termsEnum.Next(); term != null; term = termsEnum.Next())
    {
        InsertWithOverflow(new TermStats(field, term, termsEnum.DocFreq(), termsEnum.TotalTermFreq()));
    }
}
/// <summary>
/// For each query term, walks every leaf reader and accumulates the term's
/// per-segment statistics into <paramref name="contextArray"/> — creating a new
/// TermContext on first sight, registering into the existing one otherwise.
/// Entries stay null for terms found in no segment.
/// </summary>
public virtual void CollectTermContext(IndexReader reader, IList<AtomicReaderContext> leaves, TermContext[] contextArray, Term[] queryTerms)
{
    TermsEnum termsEnum = null; // reused across fields/segments for efficiency
    foreach (AtomicReaderContext context in leaves)
    {
        Fields fields = context.AtomicReader.Fields;
        if (fields == null)
        {
            // reader has no fields
            continue;
        }
        for (int i = 0; i < queryTerms.Length; i++)
        {
            Term term = queryTerms[i];
            TermContext termContext = contextArray[i];
            Terms terms = fields.Terms(term.Field());
            if (terms == null)
            {
                // field does not exist
                continue;
            }
            termsEnum = terms.Iterator(termsEnum);
            Debug.Assert(termsEnum != null);
            if (termsEnum == TermsEnum.EMPTY)
            {
                continue;
            }
            if (termsEnum.SeekExact(term.Bytes()))
            {
                if (termContext == null)
                {
                    // First segment containing this term: allocate its context.
                    contextArray[i] = new TermContext(reader.Context, termsEnum.TermState(), context.Ord, termsEnum.DocFreq(), termsEnum.TotalTermFreq());
                }
                else
                {
                    // Merge this segment's stats into the existing context.
                    termContext.Register(termsEnum.TermState(), context.Ord, termsEnum.DocFreq(), termsEnum.TotalTermFreq());
                }
            }
        }
    }
}
/// <summary>
/// Gathers per-segment statistics for each query term across all leaves.
/// A term's slot in <paramref name="contextArray"/> is created the first time
/// the term is found in a segment and updated on subsequent segments; terms
/// present in no segment leave their slot untouched.
/// </summary>
public virtual void CollectTermContext(IndexReader reader, IList<AtomicReaderContext> leaves, TermContext[] contextArray, Term[] queryTerms)
{
    TermsEnum termsEnum = null; // shared across iterations so Iterator() can reuse it
    foreach (AtomicReaderContext context in leaves)
    {
        Fields segmentFields = context.AtomicReader.Fields;
        if (segmentFields == null)
        {
            // Segment has no indexed fields at all.
            continue;
        }
        for (int i = 0; i < queryTerms.Length; i++)
        {
            Term term = queryTerms[i];
            Terms fieldTerms = segmentFields.Terms(term.Field);
            if (fieldTerms == null)
            {
                // Field is absent in this segment.
                continue;
            }
            termsEnum = fieldTerms.Iterator(termsEnum);
            Debug.Assert(termsEnum != null);
            if (termsEnum == TermsEnum.EMPTY || !termsEnum.SeekExact(term.Bytes))
            {
                // No terms, or this exact term not present here.
                continue;
            }
            TermContext termContext = contextArray[i];
            if (termContext == null)
            {
                contextArray[i] = new TermContext(reader.Context, termsEnum.TermState(), context.Ord, termsEnum.DocFreq(), termsEnum.TotalTermFreq());
            }
            else
            {
                termContext.Register(termsEnum.TermState(), context.Ord, termsEnum.DocFreq(), termsEnum.TotalTermFreq());
            }
        }
    }
}
/// <summary>
/// Returns the document frequency of the term stored for <paramref name="doc"/>
/// in the per-document terms source, or 0 when that term does not exist in the
/// index. IO failures are surfaced as a runtime exception tagged with the
/// function description and the doc id.
/// </summary>
public override int IntVal(int doc)
{
    try
    {
        // Load this doc's term bytes into the shared scratch ref.
        terms.Get(doc, @ref);
        return termsEnum.SeekExact(@ref) ? termsEnum.DocFreq() : 0;
    }
    catch (IOException e)
    {
        throw new Exception("caught exception in function " + outerInstance.Description + " : doc=" + doc, e);
    }
}
/// <summary>
/// Collects one term into the rewrite's term hash. New terms get a boost entry
/// and a freshly allocated TermContext; terms already seen have this segment's
/// statistics registered on their existing context. Always returns true —
/// collection never stops early, but CheckMaxClauseCount may throw when the
/// number of distinct terms grows too large.
/// </summary>
public override bool Collect(BytesRef bytes)
{
    // Add() returns the slot for a new term, or -(slot)-1 for a duplicate.
    int e = Terms.Add(bytes);
    TermState state = TermsEnum.TermState();
    Debug.Assert(state != null);
    if (e < 0)
    {
        // duplicate term: update docFreq
        int pos = (-e) - 1;
        Array.TermState[pos].Register(state, ReaderContext.Ord, TermsEnum.DocFreq(), TermsEnum.TotalTermFreq());
        Debug.Assert(Array.Boost[pos] == BoostAtt.Boost, "boost should be equal in all segment TermsEnums");
    }
    else
    {
        // new entry: we populate the entry initially
        Array.Boost[e] = BoostAtt.Boost;
        Array.TermState[e] = new TermContext(TopReaderContext, state, ReaderContext.Ord, TermsEnum.DocFreq(), TermsEnum.TotalTermFreq());
        // May throw if the distinct-term count exceeds the clause limit.
        OuterInstance.CheckMaxClauseCount(Terms.Size());
    }
    return (true);
}
/// <summary>
/// Checks the terms enums sequentially: both enums must yield the same term
/// sequence with identical term stats. If <paramref name="deep"/> is false it
/// does a 'shallow' test that doesn't go down to the docsenums; if true, every
/// postings flavor (payloads+offsets, payloads only, offsets only, positions
/// only, docs with and without freqs) is compared for each term, both with a
/// null liveDocs and with a random Bits filter, and both by full iteration and
/// by skipping. Finally asserts the right enum is also exhausted.
/// </summary>
public virtual void AssertTermsEnum(TermsEnum leftTermsEnum, TermsEnum rightTermsEnum, bool deep)
{
    BytesRef term;
    Bits randomBits = new RandomBits(MAXDOC, Random().NextDouble(), Random());
    // Enum instances are passed back into DocsAndPositions/Docs for reuse;
    // the "x = ..." assignments inside the calls keep the reuse chain going.
    DocsAndPositionsEnum leftPositions = null;
    DocsAndPositionsEnum rightPositions = null;
    DocsEnum leftDocs = null;
    DocsEnum rightDocs = null;
    while ((term = leftTermsEnum.Next()) != null)
    {
        Assert.AreEqual(term, rightTermsEnum.Next());
        AssertTermStats(leftTermsEnum, rightTermsEnum);
        if (deep)
        {
            // with payloads + off
            AssertDocsAndPositionsEnum(leftPositions = leftTermsEnum.DocsAndPositions(null, leftPositions), rightPositions = rightTermsEnum.DocsAndPositions(null, rightPositions));
            AssertDocsAndPositionsEnum(leftPositions = leftTermsEnum.DocsAndPositions(randomBits, leftPositions), rightPositions = rightTermsEnum.DocsAndPositions(randomBits, rightPositions));
            AssertPositionsSkipping(leftTermsEnum.DocFreq(), leftPositions = leftTermsEnum.DocsAndPositions(null, leftPositions), rightPositions = rightTermsEnum.DocsAndPositions(null, rightPositions));
            AssertPositionsSkipping(leftTermsEnum.DocFreq(), leftPositions = leftTermsEnum.DocsAndPositions(randomBits, leftPositions), rightPositions = rightTermsEnum.DocsAndPositions(randomBits, rightPositions));
            // with payloads only
            AssertDocsAndPositionsEnum(leftPositions = leftTermsEnum.DocsAndPositions(null, leftPositions, DocsAndPositionsEnum.FLAG_PAYLOADS), rightPositions = rightTermsEnum.DocsAndPositions(null, rightPositions, DocsAndPositionsEnum.FLAG_PAYLOADS));
            AssertDocsAndPositionsEnum(leftPositions = leftTermsEnum.DocsAndPositions(randomBits, leftPositions, DocsAndPositionsEnum.FLAG_PAYLOADS), rightPositions = rightTermsEnum.DocsAndPositions(randomBits, rightPositions, DocsAndPositionsEnum.FLAG_PAYLOADS));
            AssertPositionsSkipping(leftTermsEnum.DocFreq(), leftPositions = leftTermsEnum.DocsAndPositions(null, leftPositions, DocsAndPositionsEnum.FLAG_PAYLOADS), rightPositions = rightTermsEnum.DocsAndPositions(null, rightPositions, DocsAndPositionsEnum.FLAG_PAYLOADS));
            AssertPositionsSkipping(leftTermsEnum.DocFreq(), leftPositions = leftTermsEnum.DocsAndPositions(randomBits, leftPositions, DocsAndPositionsEnum.FLAG_PAYLOADS), rightPositions = rightTermsEnum.DocsAndPositions(randomBits, rightPositions, DocsAndPositionsEnum.FLAG_PAYLOADS));
            // with offsets only
            AssertDocsAndPositionsEnum(leftPositions = leftTermsEnum.DocsAndPositions(null, leftPositions, DocsAndPositionsEnum.FLAG_OFFSETS), rightPositions = rightTermsEnum.DocsAndPositions(null, rightPositions, DocsAndPositionsEnum.FLAG_OFFSETS));
            AssertDocsAndPositionsEnum(leftPositions = leftTermsEnum.DocsAndPositions(randomBits, leftPositions, DocsAndPositionsEnum.FLAG_OFFSETS), rightPositions = rightTermsEnum.DocsAndPositions(randomBits, rightPositions, DocsAndPositionsEnum.FLAG_OFFSETS));
            AssertPositionsSkipping(leftTermsEnum.DocFreq(), leftPositions = leftTermsEnum.DocsAndPositions(null, leftPositions, DocsAndPositionsEnum.FLAG_OFFSETS), rightPositions = rightTermsEnum.DocsAndPositions(null, rightPositions, DocsAndPositionsEnum.FLAG_OFFSETS));
            AssertPositionsSkipping(leftTermsEnum.DocFreq(), leftPositions = leftTermsEnum.DocsAndPositions(randomBits, leftPositions, DocsAndPositionsEnum.FLAG_OFFSETS), rightPositions = rightTermsEnum.DocsAndPositions(randomBits, rightPositions, DocsAndPositionsEnum.FLAG_OFFSETS));
            // with positions only
            AssertDocsAndPositionsEnum(leftPositions = leftTermsEnum.DocsAndPositions(null, leftPositions, DocsEnum.FLAG_NONE), rightPositions = rightTermsEnum.DocsAndPositions(null, rightPositions, DocsEnum.FLAG_NONE));
            AssertDocsAndPositionsEnum(leftPositions = leftTermsEnum.DocsAndPositions(randomBits, leftPositions, DocsEnum.FLAG_NONE), rightPositions = rightTermsEnum.DocsAndPositions(randomBits, rightPositions, DocsEnum.FLAG_NONE));
            AssertPositionsSkipping(leftTermsEnum.DocFreq(), leftPositions = leftTermsEnum.DocsAndPositions(null, leftPositions, DocsEnum.FLAG_NONE), rightPositions = rightTermsEnum.DocsAndPositions(null, rightPositions, DocsEnum.FLAG_NONE));
            AssertPositionsSkipping(leftTermsEnum.DocFreq(), leftPositions = leftTermsEnum.DocsAndPositions(randomBits, leftPositions, DocsEnum.FLAG_NONE), rightPositions = rightTermsEnum.DocsAndPositions(randomBits, rightPositions, DocsEnum.FLAG_NONE));
            // with freqs:
            AssertDocsEnum(leftDocs = leftTermsEnum.Docs(null, leftDocs), rightDocs = rightTermsEnum.Docs(null, rightDocs));
            AssertDocsEnum(leftDocs = leftTermsEnum.Docs(randomBits, leftDocs), rightDocs = rightTermsEnum.Docs(randomBits, rightDocs));
            // w/o freqs:
            AssertDocsEnum(leftDocs = leftTermsEnum.Docs(null, leftDocs, DocsEnum.FLAG_NONE), rightDocs = rightTermsEnum.Docs(null, rightDocs, DocsEnum.FLAG_NONE));
            AssertDocsEnum(leftDocs = leftTermsEnum.Docs(randomBits, leftDocs, DocsEnum.FLAG_NONE), rightDocs = rightTermsEnum.Docs(randomBits, rightDocs, DocsEnum.FLAG_NONE));
            // with freqs:
            AssertDocsSkipping(leftTermsEnum.DocFreq(), leftDocs = leftTermsEnum.Docs(null, leftDocs), rightDocs = rightTermsEnum.Docs(null, rightDocs));
            AssertDocsSkipping(leftTermsEnum.DocFreq(), leftDocs = leftTermsEnum.Docs(randomBits, leftDocs), rightDocs = rightTermsEnum.Docs(randomBits, rightDocs));
            // w/o freqs:
            AssertDocsSkipping(leftTermsEnum.DocFreq(), leftDocs = leftTermsEnum.Docs(null, leftDocs, DocsEnum.FLAG_NONE), rightDocs = rightTermsEnum.Docs(null, rightDocs, DocsEnum.FLAG_NONE));
            AssertDocsSkipping(leftTermsEnum.DocFreq(), leftDocs = leftTermsEnum.Docs(randomBits, leftDocs, DocsEnum.FLAG_NONE), rightDocs = rightTermsEnum.Docs(randomBits, rightDocs, DocsEnum.FLAG_NONE));
        }
    }
    // Left enum is exhausted; the right one must be too.
    Assert.IsNull(rightTermsEnum.Next());
}
/// <summary>
/// Collects one term into the bounded top-terms priority queue. Uncompetitive
/// terms (boost below the queue's worst entry, or equal boost but larger term)
/// are skipped once the queue is full. Seen terms have this segment's stats
/// registered on their existing entry; new terms are copied into the reusable
/// ScoreTerm <c>st</c> and enqueued, possibly evicting the worst entry. Always
/// returns true (collection is never aborted).
/// </summary>
public override bool Collect(BytesRef bytes)
{
    float boost = boostAtt.Boost;
    // make sure within a single seg we always collect
    // terms in order
    Debug.Assert(CompareToLastTerm(bytes));
    //System.out.println("TTR.collect term=" + bytes.utf8ToString() + " boost=" + boost + " ord=" + readerContext.ord);
    // ignore uncompetitive hits
    if (StQueue.Size() == MaxSize)
    {
        ScoreTerm t = StQueue.Top();
        if (boost < t.Boost)
        {
            // Lower boost than the current worst: cannot enter the queue.
            return (true);
        }
        if (boost == t.Boost && termComp.Compare(bytes, t.Bytes) > 0)
        {
            // Boost ties are broken by term order; larger term loses.
            return (true);
        }
    }
    ScoreTerm t2;
    TermState state = termsEnum.TermState();
    Debug.Assert(state != null);
    if (visitedTerms.TryGetValue(bytes, out t2))
    {
        // if the term is already in the PQ, only update docFreq of term in PQ
        Debug.Assert(t2.Boost == boost, "boost should be equal in all segment TermsEnums");
        t2.TermState.Register(state, ReaderContext.Ord, termsEnum.DocFreq(), termsEnum.TotalTermFreq());
    }
    else
    {
        // add new entry in PQ, we must clone the term, else it may get overwritten!
        st.Bytes.CopyBytes(bytes);
        st.Boost = boost;
        visitedTerms[st.Bytes] = st;
        Debug.Assert(st.TermState.DocFreq == 0);
        st.TermState.Register(state, ReaderContext.Ord, termsEnum.DocFreq(), termsEnum.TotalTermFreq());
        StQueue.Add(st);
        // possibly drop entries from queue
        if (StQueue.Size() > MaxSize)
        {
            // Evicted entry becomes the reusable 'st' for the next insertion.
            st = StQueue.Pop();
            visitedTerms.Remove(st.Bytes);
            st.TermState.Clear(); // reset the termstate!
        }
        else
        {
            // Queue kept 'st'; allocate a fresh reusable ScoreTerm.
            st = new ScoreTerm(termComp, new TermContext(TopReaderContext));
        }
        Debug.Assert(StQueue.Size() <= MaxSize, "the PQ size must be limited to maxSize");
        // set maxBoostAtt with values to help FuzzyTermsEnum to optimize
        if (StQueue.Size() == MaxSize)
        {
            t2 = StQueue.Top();
            maxBoostAtt.MaxNonCompetitiveBoost = t2.Boost;
            maxBoostAtt.CompetitiveTerm = t2.Bytes;
        }
    }
    return (true);
}
/// <summary>
/// Builds a phrase scorer for one segment. Looks up each phrase term's cached
/// TermState, seeks the shared TermsEnum to it, and collects positional
/// postings. Returns null when the field or any term is absent from this
/// segment (the whole phrase cannot match); throws when the field was indexed
/// without positions. Exact phrases (slop == 0) get an ExactPhraseScorer over
/// docFreq-sorted postings, otherwise a SloppyPhraseScorer.
/// </summary>
public override Scorer Scorer(AtomicReaderContext context, Bits acceptDocs)
{
    Debug.Assert(OuterInstance.Terms_Renamed.Count > 0);
    AtomicReader reader = context.AtomicReader;
    Bits liveDocs = acceptDocs;
    PostingsAndFreq[] postingsFreqs = new PostingsAndFreq[OuterInstance.Terms_Renamed.Count];
    Terms fieldTerms = reader.Terms(OuterInstance.Field);
    if (fieldTerms == null)
    {
        return (null);
    }
    // Reuse single TermsEnum below:
    TermsEnum te = fieldTerms.Iterator(null);
    for (int i = 0; i < OuterInstance.Terms_Renamed.Count; i++)
    {
        Term t = OuterInstance.Terms_Renamed[i];
        TermState state = States[i].Get(context.Ord);
        if (state == null) // term doesnt exist in this segment
        {
            Debug.Assert(TermNotInReader(reader, t), "no termstate found but term exists in reader");
            return (null);
        }
        te.SeekExact(t.Bytes, state);
        DocsAndPositionsEnum postingsEnum = te.DocsAndPositions(liveDocs, null, DocsEnum.FLAG_NONE);
        // PhraseQuery on a field that did not index
        // positions.
        if (postingsEnum == null)
        {
            Debug.Assert(te.SeekExact(t.Bytes), "termstate found but no term exists in reader");
            // term does exist, but has no positions
            throw new InvalidOperationException("field \"" + t.Field + "\" was indexed without position data; cannot run PhraseQuery (term=" + t.Text() + ")");
        }
        postingsFreqs[i] = new PostingsAndFreq(postingsEnum, te.DocFreq(), (int)OuterInstance.Positions_Renamed[i], t);
    }
    // sort by increasing docFreq order
    if (OuterInstance.slop == 0)
    {
        ArrayUtil.TimSort(postingsFreqs);
    }
    if (OuterInstance.slop == 0) // optimize exact case
    {
        ExactPhraseScorer s = new ExactPhraseScorer(this, postingsFreqs, Similarity.DoSimScorer(Stats, context));
        if (s.NoDocs)
        {
            return (null);
        }
        else
        {
            return (s);
        }
    }
    else
    {
        return (new SloppyPhraseScorer(this, postingsFreqs, OuterInstance.slop, Similarity.DoSimScorer(Stats, context)));
    }
}
/// <summary>
/// Builds a multi-phrase scorer for one segment. Each position may hold several
/// alternative terms: multi-term positions get a UnionDocsAndPositionsEnum and
/// a coarse docFreq (sum over alternatives, which can overcount docs containing
/// more than one alternative); single-term positions use the term's own
/// postings. Returns null when no alternative at some position exists in this
/// segment; throws when positions were not indexed. Exact phrases (slop == 0)
/// use an ExactPhraseScorer over docFreq-sorted postings, otherwise a
/// SloppyPhraseScorer.
/// </summary>
public override Scorer Scorer(AtomicReaderContext context, Bits acceptDocs)
{
    Debug.Assert(OuterInstance.termArrays.Count > 0);
    AtomicReader reader = (context.AtomicReader);
    Bits liveDocs = acceptDocs;
    PhraseQuery.PostingsAndFreq[] postingsFreqs = new PhraseQuery.PostingsAndFreq[OuterInstance.termArrays.Count];
    Terms fieldTerms = reader.Terms(OuterInstance.Field);
    if (fieldTerms == null)
    {
        return (null);
    }
    // Reuse single TermsEnum below:
    TermsEnum termsEnum = fieldTerms.Iterator(null);
    for (int pos = 0; pos < postingsFreqs.Length; pos++)
    {
        Term[] terms = OuterInstance.termArrays[pos];
        DocsAndPositionsEnum postingsEnum;
        int docFreq;
        if (terms.Length > 1)
        {
            postingsEnum = new UnionDocsAndPositionsEnum(liveDocs, context, terms, TermContexts, termsEnum);
            // coarse -- this overcounts since a given doc can
            // have more than one term:
            docFreq = 0;
            for (int termIdx = 0; termIdx < terms.Length; termIdx++)
            {
                Term term = terms[termIdx];
                TermState termState = TermContexts[term].Get(context.Ord);
                if (termState == null)
                {
                    // Term not in reader
                    continue;
                }
                termsEnum.SeekExact(term.Bytes(), termState);
                docFreq += termsEnum.DocFreq();
            }
            if (docFreq == 0)
            {
                // None of the terms are in this reader
                return (null);
            }
        }
        else
        {
            Term term = terms[0];
            TermState termState = TermContexts[term].Get(context.Ord);
            if (termState == null)
            {
                // Term not in reader
                return (null);
            }
            termsEnum.SeekExact(term.Bytes(), termState);
            postingsEnum = termsEnum.DocsAndPositions(liveDocs, null, DocsEnum.FLAG_NONE);
            if (postingsEnum == null)
            {
                // term does exist, but has no positions
                Debug.Assert(termsEnum.Docs(liveDocs, null, DocsEnum.FLAG_NONE) != null, "termstate found but no term exists in reader");
                throw new InvalidOperationException("field \"" + term.Field() + "\" was indexed without position data; cannot run PhraseQuery (term=" + term.Text() + ")");
            }
            docFreq = termsEnum.DocFreq();
        }
        postingsFreqs[pos] = new PhraseQuery.PostingsAndFreq(postingsEnum, docFreq, (int)OuterInstance.positions[pos], terms);
    }
    // sort by increasing docFreq order
    if (OuterInstance.slop == 0)
    {
        ArrayUtil.TimSort(postingsFreqs);
    }
    if (OuterInstance.slop == 0)
    {
        ExactPhraseScorer s = new ExactPhraseScorer(this, postingsFreqs, Similarity.DoSimScorer(Stats, context));
        if (s.NoDocs)
        {
            return (null);
        }
        else
        {
            return (s);
        }
    }
    else
    {
        return (new SloppyPhraseScorer(this, postingsFreqs, OuterInstance.slop, Similarity.DoSimScorer(Stats, context)));
    }
}