public override Query Rewrite(IndexReader reader)
{
    FilteredTermEnum enumerator = GetEnum(reader);
    int maxClauseCount = BooleanQuery.GetMaxClauseCount();
    ScoreTermQueue stQueue = new ScoreTermQueue(maxClauseCount);
    float minScore = 0.0f; // lowest score currently retained in the queue

    try
    {
        do
        {
            float score = 0.0f;
            Term t = enumerator.Term();
            if (t != null)
            {
                score = enumerator.Difference();
                // terms come in alphabetical order, therefore if queue is full and score
                // not bigger than minScore, we can skip
                if (stQueue.Size() < maxClauseCount || score > minScore)
                {
                    stQueue.Insert(new ScoreTerm(t, score));
                    minScore = ((ScoreTerm)stQueue.Top()).score; // maintain minScore
                }
            }
        } while (enumerator.Next());
    }
    finally
    {
        enumerator.Close();
    }

    // Build a disjunction of the retained terms, boosting each clause by its similarity score.
    BooleanQuery query = new BooleanQuery(true);
    int size = stQueue.Size();
    for (int i = 0; i < size; i++)
    {
        ScoreTerm st = (ScoreTerm)stQueue.Pop();
        TermQuery tq = new TermQuery(st.term);      // found a match
        tq.SetBoost(GetBoost() * st.score);         // set the boost
        query.Add(tq, BooleanClause.Occur.SHOULD);  // add to query
    }
    return query;
}
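// The listings above and below rely on two helper types that are not shown here:
// ScoreTerm (a term plus its similarity score) and ScoreTermQueue (a bounded priority
// queue whose Top() is the lowest-scoring retained entry). The sketch below is a
// simplified, hypothetical stand-in written for illustration only; the real classes
// derive from Lucene.Net.Util.PriorityQueue<T>, and the names here merely mirror how
// they are used in the surrounding code (the real queue also tie-breaks equal scores
// by term, which this sketch omits).
using System.Collections.Generic;
using Lucene.Net.Index; // Term

internal class ScoreTerm
{
    public readonly Term term;              // candidate (variant) term
    public float score;                     // edit-distance similarity, later multiplied by IDF
    public readonly Term fuzziedSourceTerm; // the user's original term this variant expands (may be null)

    public ScoreTerm(Term term, float score) : this(term, score, null) { }

    public ScoreTerm(Term term, float score, Term fuzziedSourceTerm)
    {
        this.term = term;
        this.score = score;
        this.fuzziedSourceTerm = fuzziedSourceTerm;
    }
}

internal class ScoreTermQueue
{
    private readonly int maxSize;
    // kept sorted ascending by score, so items[0] is the weakest retained entry
    private readonly List<ScoreTerm> items = new List<ScoreTerm>();

    public ScoreTermQueue(int maxSize) { this.maxSize = maxSize; }

    public int Size() { return items.Count; }

    public ScoreTerm Top() { return items[0]; }

    public ScoreTerm Pop()
    {
        ScoreTerm weakest = items[0];
        items.RemoveAt(0);
        return weakest;
    }

    // Insert, silently dropping the weakest entry once the queue is full.
    public void Insert(ScoreTerm st) { InsertWithOverflow(st); }

    // Insert and return whichever entry was displaced (null while the queue has room).
    public ScoreTerm InsertWithOverflow(ScoreTerm st)
    {
        int idx = items.BinarySearch(st, Comparer<ScoreTerm>.Create((a, b) => a.score.CompareTo(b.score)));
        items.Insert(idx < 0 ? ~idx : idx, st);
        if (items.Count <= maxSize)
        {
            return null;
        }
        ScoreTerm displaced = items[0];
        items.RemoveAt(0); // evict the lowest-scoring entry
        return displaced;
    }
}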
private void AddTerms(IndexReader reader, FieldVals f)
{
    if (f.queryString == null)
    {
        return;
    }

    TokenStream ts = analyzer.TokenStream(f.fieldName, new System.IO.StringReader(f.queryString));
    TermAttribute termAtt = (TermAttribute)ts.AddAttribute(typeof(TermAttribute));

    int corpusNumDocs = reader.NumDocs();
    Term internSavingTemplateTerm = new Term(f.fieldName); // optimization to avoid constructing new Term() objects
    Hashtable processedTerms = new Hashtable();

    while (ts.IncrementToken())
    {
        String term = termAtt.Term();
        if (!processedTerms.Contains(term))
        {
            processedTerms.Add(term, term);
            ScoreTermQueue variantsQ = new ScoreTermQueue(MAX_VARIANTS_PER_TERM); // maxNum variants considered for any one term
            float minScore = 0;
            Term startTerm = internSavingTemplateTerm.CreateTerm(term);
            FuzzyTermEnum fe = new FuzzyTermEnum(reader, startTerm, f.minSimilarity, f.prefixLength);
            TermEnum origEnum = reader.Terms(startTerm);
            int df = 0;
            if (startTerm.Equals(origEnum.Term()))
            {
                df = origEnum.DocFreq(); // store the df so all variants use same idf
            }

            int numVariants = 0;
            int totalVariantDocFreqs = 0;
            do
            {
                Term possibleMatch = fe.Term();
                if (possibleMatch != null)
                {
                    numVariants++;
                    totalVariantDocFreqs += fe.DocFreq();
                    float score = fe.Difference();
                    if (variantsQ.Size() < MAX_VARIANTS_PER_TERM || score > minScore)
                    {
                        ScoreTerm st = new ScoreTerm(possibleMatch, score, startTerm);
                        variantsQ.Insert(st);
                        minScore = ((ScoreTerm)variantsQ.Top()).score; // maintain minScore
                    }
                }
            } while (fe.Next());

            if (numVariants > 0)
            {
                int avgDf = totalVariantDocFreqs / numVariants;
                if (df == 0) // no direct match we can use as df for all variants
                {
                    df = avgDf; // use avg df of all variants
                }

                // take the top variants (scored by edit distance) and reset the score
                // to include an IDF factor then add to the global queue for ranking
                // overall top query terms
                int size = variantsQ.Size();
                for (int i = 0; i < size; i++)
                {
                    ScoreTerm st = (ScoreTerm)variantsQ.Pop();
                    st.score = (st.score * st.score) * sim.Idf(df, corpusNumDocs);
                    q.Insert(st);
                }
            }
        }
    }
}
private void AddTerms(IndexReader reader, FieldVals f)
{
    if (f.queryString == null)
    {
        return;
    }

    Terms terms = MultiFields.GetTerms(reader, f.fieldName);
    if (terms == null)
    {
        return;
    }

    TokenStream ts = analyzer.TokenStream(f.fieldName, f.queryString);
    try
    {
        ICharTermAttribute termAtt = ts.AddAttribute<ICharTermAttribute>();

        int corpusNumDocs = reader.NumDocs;
        HashSet<string> processedTerms = new HashSet<string>();
        ts.Reset();
        while (ts.IncrementToken())
        {
            string term = termAtt.ToString();
            if (!processedTerms.Contains(term))
            {
                processedTerms.Add(term);
                ScoreTermQueue variantsQ = new ScoreTermQueue(MAX_VARIANTS_PER_TERM); // maxNum variants considered for any one term
                float minScore = 0;
                Term startTerm = new Term(f.fieldName, term);
                AttributeSource atts = new AttributeSource();
                IMaxNonCompetitiveBoostAttribute maxBoostAtt = atts.AddAttribute<IMaxNonCompetitiveBoostAttribute>();
#pragma warning disable 612, 618
                SlowFuzzyTermsEnum fe = new SlowFuzzyTermsEnum(terms, atts, startTerm, f.minSimilarity, f.prefixLength);
#pragma warning restore 612, 618
                // store the df so all variants use same idf
                int df = reader.DocFreq(startTerm);
                int numVariants = 0;
                int totalVariantDocFreqs = 0;
                BytesRef possibleMatch;
                IBoostAttribute boostAtt = fe.Attributes().AddAttribute<IBoostAttribute>();
                while ((possibleMatch = fe.Next()) != null)
                {
                    numVariants++;
                    totalVariantDocFreqs += fe.DocFreq();
                    float score = boostAtt.Boost;
                    if (variantsQ.Size() < MAX_VARIANTS_PER_TERM || score > minScore)
                    {
                        ScoreTerm st = new ScoreTerm(new Term(startTerm.Field, BytesRef.DeepCopyOf(possibleMatch)), score, startTerm);
                        variantsQ.InsertWithOverflow(st);
                        minScore = variantsQ.Top().score; // maintain minScore
                    }
                    maxBoostAtt.MaxNonCompetitiveBoost = variantsQ.Size() >= MAX_VARIANTS_PER_TERM ? minScore : float.NegativeInfinity;
                }

                if (numVariants > 0)
                {
                    int avgDf = totalVariantDocFreqs / numVariants;
                    if (df == 0) // no direct match we can use as df for all variants
                    {
                        df = avgDf; // use avg df of all variants
                    }

                    // take the top variants (scored by edit distance) and reset the score
                    // to include an IDF factor then add to the global queue for ranking
                    // overall top query terms
                    int size = variantsQ.Size();
                    for (int i = 0; i < size; i++)
                    {
                        ScoreTerm st = variantsQ.Pop();
                        st.score = (st.score * st.score) * sim.Idf(df, corpusNumDocs);
                        q.InsertWithOverflow(st);
                    }
                }
            }
        }
        ts.End();
    }
    finally
    {
        IOUtils.CloseWhileHandlingException(ts);
    }
}
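// A minimal usage sketch for the 4.x-style code path above. It assumes the Lucene.NET
// sandbox port (Lucene.Net.Sandbox.Queries.FuzzyLikeThisQuery) keeps the Java
// signatures FuzzyLikeThisQuery(int maxNumTerms, Analyzer analyzer) and
// AddTerms(string queryString, string fieldName, float minSimilarity, int prefixLength);
// the field name, query string and parameter values are illustrative only.
using Lucene.Net.Analysis.Standard;
using Lucene.Net.Index;
using Lucene.Net.Sandbox.Queries;
using Lucene.Net.Search;
using Lucene.Net.Store;
using Lucene.Net.Util;

public static class FuzzyLikeThisExample
{
    public static TopDocs Run(Directory dir)
    {
        using (var analyzer = new StandardAnalyzer(LuceneVersion.LUCENE_48))
        using (DirectoryReader reader = DirectoryReader.Open(dir))
        {
            var searcher = new IndexSearcher(reader);

            // Up to 32 expanded terms overall; variants within similarity 0.5 of
            // "summar" that share a 2-character prefix.
            var flt = new FuzzyLikeThisQuery(32, analyzer);
            flt.AddTerms("summar", "body", 0.5f, 2);

            // During the search the query is rewritten: the fuzzy variants are collected
            // into the ScoreTermQueue and turned into a BooleanQuery of boosted
            // TermQuery clauses before scoring.
            return searcher.Search(flt, 10);
        }
    }
}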