/// <summary>
/// Creates a linear enumerator over the terms exposed by <paramref name="outerInstance"/>.
/// <para/>
/// The elements of the outer instance's term text that follow its real prefix are copied
/// into <c>text</c>, the prefix itself is converted into a <see cref="BytesRef"/>, and that
/// prefix is registered as the initial seek term of the enumeration.
/// </summary>
/// <param name="outerInstance">
/// The enclosing <see cref="SlowFuzzyTermsEnum"/> supplying the terms, the term text,
/// the term length and the real prefix length.
/// </param>
/// <exception cref="IOException">If there is a low-level I/O error.</exception>
public LinearFuzzyTermsEnum(SlowFuzzyTermsEnum outerInstance)
    : base(outerInstance.m_terms.GetEnumerator())
{
    this.outerInstance = outerInstance;
    this.boostAtt = Attributes.AddAttribute<IBoostAttribute>();

    int prefixLength = outerInstance.m_realPrefixLength;
    int suffixLength = outerInstance.m_termLength - prefixLength;

    // Keep only the part of the term text that comes after the prefix.
    this.text = new int[suffixLength];
    System.Array.Copy(outerInstance.m_termText, prefixLength, this.text, 0, suffixLength);

    // The prefix, as bytes, is what the enumeration is asked to seek to first.
    this.prefixBytesRef = new BytesRef(UnicodeUtil.NewString(outerInstance.m_termText, 0, prefixLength));

    // Two scratch arrays, each one slot longer than the non-prefix text.
    // NOTE(review): presumably the working rows of the edit-distance computation - confirm.
    int rowLength = suffixLength + 1;
    this.d = new int[rowLength];
    this.p = new int[rowLength];

    SetInitialSeekTerm(this.prefixBytesRef);
}
/// <summary>
/// Tokenizes the query string of <paramref name="f"/> with <c>analyzer</c>, expands each
/// distinct token into its fuzzy variants found in the field's terms, and inserts the
/// best variants into the shared queue <c>q</c>.
/// <para/>
/// For every distinct token at most <c>MAX_VARIANTS_PER_TERM</c> variants are retained,
/// ranked by the boost reported by the fuzzy enumeration. Each retained variant is then
/// rescored as <c>boost * boost * sim.Idf(df, numDocs)</c>, where <c>df</c> is the doc
/// frequency of the original token, or the average doc frequency of all its variants
/// when the token itself does not occur in the index.
/// <para/>
/// Nothing is done when the field has no query string or the reader has no terms for
/// the field.
/// </summary>
/// <param name="reader">Index reader providing the terms and doc frequencies.</param>
/// <param name="f">Field name, query string, minimum similarity and prefix length to expand.</param>
private void AddTerms(IndexReader reader, FieldVals f)
{
    if (f.queryString == null)
    {
        return;
    }

    Terms terms = MultiFields.GetTerms(reader, f.fieldName);
    if (terms == null)
    {
        return;
    }

    TokenStream ts = analyzer.GetTokenStream(f.fieldName, f.queryString);
    try
    {
        ICharTermAttribute termAtt = ts.AddAttribute<ICharTermAttribute>();
        int corpusNumDocs = reader.NumDocs;
        ISet<string> processedTerms = new JCG.HashSet<string>();
        ts.Reset();
        while (ts.IncrementToken())
        {
            string term = termAtt.ToString();

            // Add returns false when the token was already seen; each token is expanded once.
            if (!processedTerms.Add(term))
            {
                continue;
            }

            ScoreTermQueue variantsQ = new ScoreTermQueue(MAX_VARIANTS_PER_TERM); //maxNum variants considered for any one term
            float minScore = 0;
            Term startTerm = new Term(f.fieldName, term);
            AttributeSource atts = new AttributeSource();
            IMaxNonCompetitiveBoostAttribute maxBoostAtt = atts.AddAttribute<IMaxNonCompetitiveBoostAttribute>();
#pragma warning disable 612, 618
            SlowFuzzyTermsEnum fe = new SlowFuzzyTermsEnum(terms, atts, startTerm, f.minSimilarity, f.prefixLength);
#pragma warning restore 612, 618

            //store the df so all variants use same idf
            int df = reader.DocFreq(startTerm);
            int numVariants = 0;

            // Accumulated as long: the sum of doc freqs over many variants can exceed
            // int.MaxValue on a large index and would otherwise wrap around silently.
            long totalVariantDocFreqs = 0;
            BytesRef possibleMatch;
            IBoostAttribute boostAtt = fe.Attributes.AddAttribute<IBoostAttribute>();
            while ((possibleMatch = fe.Next()) != null)
            {
                numVariants++;
                totalVariantDocFreqs += fe.DocFreq;
                float score = boostAtt.Boost;
                if (variantsQ.Count < MAX_VARIANTS_PER_TERM || score > minScore)
                {
                    ScoreTerm st = new ScoreTerm(new Term(startTerm.Field, BytesRef.DeepCopyOf(possibleMatch)), score, startTerm);
                    variantsQ.InsertWithOverflow(st);
                    minScore = variantsQ.Top.Score; // maintain minScore
                }

                // Once the queue is full, report the current minimum score back to the
                // enumeration through the max-non-competitive-boost attribute.
                maxBoostAtt.MaxNonCompetitiveBoost = variantsQ.Count >= MAX_VARIANTS_PER_TERM
                    ? minScore
                    : float.NegativeInfinity;
            }

            if (numVariants == 0)
            {
                continue;
            }

            // The average never exceeds the largest single doc freq, so it fits in an int.
            int avgDf = (int)(totalVariantDocFreqs / numVariants);
            if (df == 0) //no direct match we can use as df for all variants
            {
                df = avgDf; //use avg df of all variants
            }

            // take the top variants (scored by edit distance) and reset the score
            // to include an IDF factor then add to the global queue for ranking
            // overall top query terms
            int size = variantsQ.Count;
            for (int i = 0; i < size; i++)
            {
                ScoreTerm st = variantsQ.Pop();
                st.Score = (st.Score * st.Score) * sim.Idf(df, corpusNumDocs);
                q.InsertWithOverflow(st);
            }
        }
        ts.End();
    }
    finally
    {
        IOUtils.DisposeWhileHandlingException(ts);
    }
}