/// <summary>
/// Creates an enumeration over the terms delivered by <paramref name="terms"/> that share a prefix of
/// length <paramref name="prefixLength"/> with <paramref name="term"/> and whose fuzzy similarity is
/// greater than <paramref name="minSimilarity"/>.
/// <para/>
/// When the constructor returns, the enumeration already points to the first valid term,
/// if such a term exists.
/// </summary>
/// <param name="terms"> Delivers terms. </param>
/// <param name="atts"> <see cref="AttributeSource"/> created by the rewrite method of <see cref="MultiTermQuery"/>
///        that contains information about competitive boosts during rewrite. It is also used
///        to cache DFAs between segment transitions. </param>
/// <param name="term"> Pattern term. </param>
/// <param name="minSimilarity"> Minimum required similarity for terms from the reader. Values of 1 or more
///        must be whole numbers and are interpreted as an edit distance. Passing a fraction is deprecated. </param>
/// <param name="prefixLength"> Length of required common prefix. It is capped at the length of the term. </param>
/// <param name="transpositions"> Transpositions </param>
/// <exception cref="IOException"> if there is a low-level IO error </exception>
/// <exception cref="ArgumentException"> if <paramref name="minSimilarity"/> is at least 1 and not a whole number </exception>
/// <exception cref="ArgumentOutOfRangeException"> if <paramref name="minSimilarity"/> or
///        <paramref name="prefixLength"/> is negative </exception>
public FuzzyTermsEnum(Terms terms, AttributeSource atts, Term term, float minSimilarity, int prefixLength, bool transpositions)
{
    boostAtt = Attributes.AddAttribute<IBoostAttribute>();

    // Values >= 1 are edit counts and therefore must not carry a fractional part.
    bool isEditCount = minSimilarity >= 1.0f;
    if (isEditCount && minSimilarity != (int)minSimilarity)
    {
        throw new ArgumentException("fractional edit distances are not allowed");
    }
    if (minSimilarity < 0.0f)
    {
        // LUCENENET specific - changed from IllegalArgumentException to ArgumentOutOfRangeException (.NET convention)
        throw new ArgumentOutOfRangeException(nameof(minSimilarity), "minimumSimilarity cannot be less than 0");
    }
    if (prefixLength < 0)
    {
        // LUCENENET specific - changed from IllegalArgumentException to ArgumentOutOfRangeException (.NET convention)
        throw new ArgumentOutOfRangeException(nameof(prefixLength), "prefixLength cannot be less than 0");
    }

    this.m_terms = terms;
    this.term = term;

    // Decode the UTF-16 text into one int per code point (UTF-32) for fast comparisons.
    string text = term.Text;
    this.m_termText = new int[text.CodePointCount(0, text.Length)];
    int charIndex = 0;
    int codePointIndex = 0;
    while (charIndex < text.Length)
    {
        int codePoint = text.CodePointAt(charIndex);
        m_termText[codePointIndex++] = codePoint;
        charIndex += Character.CharCount(codePoint); // advance by the number of chars this code point occupies
    }
    this.m_termLength = m_termText.Length;
    this.dfaAtt = atts.AddAttribute<ILevenshteinAutomataAttribute>();

    // The prefix could be longer than the word. It's kind of silly though:
    // it means we must match the entire word.
    this.m_realPrefixLength = Math.Min(prefixLength, m_termLength);

    if (isEditCount)
    {
        // minSimilarity >= 1 is treated as a number of edits
        this.m_minSimilarity = 0; // just driven by number of edits
        m_maxEdits = (int)minSimilarity;
        m_raw = true;
    }
    else
    {
        this.m_minSimilarity = minSimilarity;
        // calculate the maximum k edits for this similarity
        m_maxEdits = InitialMaxDistance(this.m_minSimilarity, m_termLength);
        m_raw = false;
    }

    if (transpositions && m_maxEdits > LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE)
    {
        throw UnsupportedOperationException.Create("with transpositions enabled, distances > " + LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE + " are not supported ");
    }
    this.transpositions = transpositions;
    this.m_scaleFactor = 1.0f / (1.0f - this.m_minSimilarity);

    // Seed the competitive-boost state from the shared attribute source, then notify via BottomChanged.
    this.maxBoostAtt = atts.AddAttribute<IMaxNonCompetitiveBoostAttribute>();
    bottom = maxBoostAtt.MaxNonCompetitiveBoost;
    bottomTerm = maxBoostAtt.CompetitiveTerm;
    BottomChanged(null, true);
}
/// <summary>
/// Creates the automaton-backed enumeration over <paramref name="tenum"/>.
/// </summary>
/// <param name="outerInstance"> The owning <see cref="FuzzyTermsEnum"/>; its pattern term is read here. </param>
/// <param name="tenum"> The terms enumeration passed on to the base class. </param>
/// <param name="compiled"> Compiled automata; the run automaton of each entry is stored, in the same order. </param>
public AutomatonFuzzyTermsEnum(FuzzyTermsEnum outerInstance, TermsEnum tenum, CompiledAutomaton[] compiled)
    : base(tenum, false)
{
    this.outerInstance = outerInstance;
    boostAtt = Attributes.AddAttribute<IBoostAttribute>();

    // Collect the run automaton of every compiled automaton, keeping the original order.
    var runAutomata = new ByteRunAutomaton[compiled.Length];
    int slot = 0;
    foreach (CompiledAutomaton automaton in compiled)
    {
        runAutomata[slot++] = automaton.RunAutomaton;
    }
    this.matchers = runAutomata;

    // Byte form of the pattern term text.
    termRef = new BytesRef(outerInstance.term.Text);
}
/// <summary>
/// Switches to the given terms enumeration: records it together with its comparer,
/// lazily creates the initial <c>ScoreTerm</c>, and rebinds the boost attribute to the new enumeration.
/// </summary>
/// <param name="termsEnum"> The terms enumeration to use from now on. </param>
public override void SetNextEnum(TermsEnum termsEnum)
{
    this.termsEnum = termsEnum;
    this.termComp = termsEnum.Comparer;

    Debug.Assert(CompareToLastTerm(null));

    // The initial ScoreTerm is created lazily because the comparer is not known in the constructor.
    if (st == null)
    {
        TermContext initialContext = new TermContext(m_topReaderContext);
        st = new ScoreTerm(this.termComp, initialContext);
    }

    boostAtt = termsEnum.Attributes.AddAttribute<IBoostAttribute>();
}
/// <summary>
/// Creates the linear enumeration over the terms of <paramref name="outerInstance"/>.
/// The pattern term is split at the real prefix length: the leading part becomes the
/// initial seek term, the remainder is kept as code points for comparison.
/// <para/>
/// After calling the constructor the enumeration is already pointing to the first
/// valid term if such a term exists.
/// </summary>
/// <param name="outerInstance"> The owning enumeration that supplies the terms, the pattern code points and the prefix length. </param>
/// <exception cref="IOException">If there is a low-level I/O error.</exception>
public LinearFuzzyTermsEnum(SlowFuzzyTermsEnum outerInstance)
    : base(outerInstance.m_terms.GetEnumerator())
{
    this.outerInstance = outerInstance;
    this.boostAtt = Attributes.AddAttribute<IBoostAttribute>();

    int prefixLen = outerInstance.m_realPrefixLength;
    var patternCodePoints = outerInstance.m_termText;

    // Everything after the required prefix is the part that gets compared.
    var suffix = new int[outerInstance.m_termLength - prefixLen];
    System.Array.Copy(patternCodePoints, prefixLen, suffix, 0, suffix.Length);
    this.text = suffix;

    // The required prefix, re-encoded as a string and then as bytes.
    prefixBytesRef = new BytesRef(UnicodeUtil.NewString(patternCodePoints, 0, prefixLen));

    // Two int buffers, each one element longer than the suffix.
    int bufferLength = suffix.Length + 1;
    this.d = new int[bufferLength];
    this.p = new int[bufferLength];

    SetInitialSeekTerm(prefixBytesRef);
}
/// <summary>
/// Analyzes the query string of <paramref name="f"/> and, for every distinct token, enumerates its
/// fuzzy variants in the field's terms. The best variants per token (at most
/// <c>MAX_VARIANTS_PER_TERM</c>) are re-scored with an IDF factor and inserted into the
/// global queue <c>q</c>.
/// </summary>
/// <param name="reader"> Index reader supplying the terms, document frequencies and document count. </param>
/// <param name="f"> Field name, query string, minimum similarity and prefix length to use. </param>
private void AddTerms(IndexReader reader, FieldVals f)
{
    if (f.queryString is null)
    {
        return;
    }
    Terms terms = MultiFields.GetTerms(reader, f.fieldName);
    if (terms is null)
    {
        return;
    }

    TokenStream ts = analyzer.GetTokenStream(f.fieldName, f.queryString);
    try
    {
        ICharTermAttribute termAtt = ts.AddAttribute<ICharTermAttribute>();
        int corpusNumDocs = reader.NumDocs;
        ISet<string> processedTerms = new JCG.HashSet<string>();

        ts.Reset();
        while (ts.IncrementToken())
        {
            string term = termAtt.ToString();

            // Add returns false when the token was already seen: a single lookup
            // both tests membership and records the token.
            if (!processedTerms.Add(term))
            {
                continue;
            }

            ScoreTermQueue variantsQ = new ScoreTermQueue(MAX_VARIANTS_PER_TERM); // maxNum variants considered for any one term
            float minScore = 0;
            Term startTerm = new Term(f.fieldName, term);
            AttributeSource atts = new AttributeSource();
            IMaxNonCompetitiveBoostAttribute maxBoostAtt = atts.AddAttribute<IMaxNonCompetitiveBoostAttribute>();
#pragma warning disable 612, 618
            SlowFuzzyTermsEnum fe = new SlowFuzzyTermsEnum(terms, atts, startTerm, f.minSimilarity, f.prefixLength);
#pragma warning restore 612, 618

            // store the df so all variants use same idf
            int df = reader.DocFreq(startTerm);
            int numVariants = 0;
            // Accumulated as long: a sum of int document frequencies could silently wrap an int.
            long totalVariantDocFreqs = 0;
            IBoostAttribute boostAtt = fe.Attributes.AddAttribute<IBoostAttribute>();

            while (fe.MoveNext())
            {
                BytesRef possibleMatch = fe.Term;
                numVariants++;
                totalVariantDocFreqs += fe.DocFreq;
                float score = boostAtt.Boost;
                if (variantsQ.Count < MAX_VARIANTS_PER_TERM || score > minScore)
                {
                    ScoreTerm st = new ScoreTerm(new Term(startTerm.Field, BytesRef.DeepCopyOf(possibleMatch)), score, startTerm);
                    variantsQ.InsertWithOverflow(st);
                    minScore = variantsQ.Top.Score; // maintain minScore
                }
                // Publish the lowest retained score as the non-competitive bound once the queue is full.
                maxBoostAtt.MaxNonCompetitiveBoost = variantsQ.Count >= MAX_VARIANTS_PER_TERM
                    ? minScore
                    : float.NegativeInfinity;
            }

            if (numVariants == 0)
            {
                continue;
            }

            if (df == 0) // no direct match we can use as df for all variants
            {
                // use avg df of all variants; the average of int values always fits in an int
                df = (int)(totalVariantDocFreqs / numVariants);
            }

            // take the top variants (scored by edit distance) and reset the score
            // to include an IDF factor then add to the global queue for ranking
            // overall top query terms
            int size = variantsQ.Count;
            for (int i = 0; i < size; i++)
            {
                ScoreTerm st = variantsQ.Pop();
                st.Score = (st.Score * st.Score) * sim.Idf(df, corpusNumDocs);
                q.InsertWithOverflow(st);
            }
        }
        ts.End();
    }
    finally
    {
        IOUtils.DisposeWhileHandlingException(ts);
    }
}
/// <summary>
/// Collects spelling corrections for <paramref name="term"/> by enumerating fuzzy matches
/// in the index and keeping the best candidates in a priority queue.
/// </summary>
/// <param name="term"> The term to suggest spelling corrections for </param>
/// <param name="numSug"> The maximum number of spelling corrections </param>
/// <param name="ir"> The index reader to fetch the candidate spelling corrections from </param>
/// <param name="docfreq"> Document frequency threshold: candidates whose document frequency is not greater than this value are skipped </param>
/// <param name="editDistance"> The maximum edit distance candidates are allowed to have </param>
/// <param name="accuracy"> The minimum score a suggested spelling correction needs to have in order to be included </param>
/// <param name="spare"> a chars scratch, used when candidate terms are converted to strings </param>
/// <returns> a collection of spelling corrections sorted by <code>ScoreTerm</code>'s natural order;
///           an empty list when the field has no terms. </returns>
/// <exception cref="System.IO.IOException"> If I/O related errors occur </exception>
protected internal virtual IEnumerable<ScoreTerm> SuggestSimilar(Term term, int numSug, IndexReader ir, int docfreq, int editDistance, float accuracy, CharsRef spare)
{
    AttributeSource atts = new AttributeSource();
    IMaxNonCompetitiveBoostAttribute maxBoostAtt = atts.AddAttribute<IMaxNonCompetitiveBoostAttribute>();

    Terms terms = MultiFields.GetTerms(ir, term.Field);
    if (terms == null)
    {
        return new List<ScoreTerm>();
    }

    int requiredPrefix = Math.Max(minPrefix, editDistance - 1);
    FuzzyTermsEnum fuzzyEnum = new FuzzyTermsEnum(terms, atts, term, editDistance, requiredPrefix, true);
    var queue = new Support.PriorityQueue<ScoreTerm>();
    BytesRef queryBytes = new BytesRef(term.Text());
    ScoreTerm scratch = new ScoreTerm();
    IBoostAttribute boostAtt = fuzzyEnum.Attributes.AddAttribute<IBoostAttribute>();

    for (BytesRef candidate = fuzzyEnum.Next(); candidate != null; candidate = fuzzyEnum.Next())
    {
        float boost = boostAtt.Boost;

        // Skip uncompetitive hits, and skip an exact match of the query term itself.
        if ((queue.Count >= numSug && boost <= queue.Peek().Boost) || queryBytes.BytesEquals(candidate))
        {
            continue;
        }

        // check docFreq if required
        int candidateDocFreq = fuzzyEnum.DocFreq;
        if (candidateDocFreq <= docfreq)
        {
            continue;
        }

        string candidateText = null; // stays null for the internal measure: string creation is delayed until the end
        float score;
        if (distance == INTERNAL_LEVENSHTEIN)
        {
            // undo FuzzyTermsEnum's scale factor for a real scaled lev score
            score = boost / fuzzyEnum.ScaleFactor + fuzzyEnum.MinSimilarity;
        }
        else
        {
            UnicodeUtil.UTF8toUTF16(candidate, spare);
            candidateText = spare.ToString();
            score = distance.GetDistance(term.Text(), candidateText);
        }

        if (score < accuracy)
        {
            continue;
        }

        // Fill the scratch entry and add it to the queue.
        scratch.Term = BytesRef.DeepCopyOf(candidate);
        scratch.Boost = boost;
        scratch.Docfreq = candidateDocFreq;
        scratch.TermAsString = candidateText;
        scratch.Score = score;
        queue.Offer(scratch);

        // Possibly drop an entry from the queue; a dropped entry is reused as the next scratch.
        if (queue.Count > numSug)
        {
            scratch = queue.Poll();
        }
        else
        {
            scratch = new ScoreTerm();
        }

        // Once the queue is full, report the head's boost as the non-competitive bound.
        if (queue.Count >= numSug)
        {
            maxBoostAtt.MaxNonCompetitiveBoost = queue.Peek().Boost;
        }
        else
        {
            maxBoostAtt.MaxNonCompetitiveBoost = float.NegativeInfinity;
        }
    }

    return queue;
}
/// <summary>
/// Replaces the enumeration this instance proxies to and rebinds the boost
/// attribute to the one registered on the new enumeration.
/// </summary>
/// <param name="actualEnum"> The enumeration to proxy to from now on. </param>
protected virtual void SetEnum(TermsEnum actualEnum)
{
    this.actualEnum = actualEnum;

    // Boost values are read from the new enumeration's own attribute source.
    actualBoostAtt = actualEnum.Attributes.AddAttribute<IBoostAttribute>();
}
/// <summary>
/// Records the terms enumeration to use next and rebinds the boost attribute
/// to the one registered on that enumeration.
/// </summary>
/// <param name="termsEnum"> The terms enumeration to use from now on. </param>
public override void SetNextEnum(TermsEnum termsEnum)
{
    this.termsEnum = termsEnum;

    // Boost values are read from the new enumeration's own attribute source.
    boostAtt = termsEnum.Attributes.AddAttribute<IBoostAttribute>();
}
/// <summary>
/// Creates a term range enumeration over <paramref name="iterator"/>, passing both bounds
/// to the base class with its two boolean flags set to <c>true</c>, and registers a boost
/// attribute on this enumeration.
/// </summary>
/// <param name="outerInstance"> The owning instance. </param>
/// <param name="iterator"> The underlying terms enumeration. </param>
/// <param name="bref1"> First bound passed to the base class. </param>
/// <param name="bref2"> Second bound passed to the base class. </param>
public TermRangeTermsEnumAnonymousInnerClassHelper(MultiTermQueryAnonymousInnerClassHelper outerInstance, TermsEnum iterator, BytesRef bref1, BytesRef bref2)
    : base(iterator, bref1, bref2, true, true)
{
    this.OuterInstance = outerInstance;

    // Register the boost attribute on this enumeration's attribute source.
    boostAtt = Attributes().AddAttribute<IBoostAttribute>();
}