/// <summary>
/// Expands this fuzzy query into a BooleanQuery of the best-matching terms,
/// each boosted by <c>GetBoost() * editDistanceScore</c>.
/// </summary>
public override Query Rewrite(IndexReader reader)
{
    FilteredTermEnum enumerator = GetEnum(reader);
    int maxClauseCount = BooleanQuery.GetMaxClauseCount();
    ScoreTermQueue stQueue = new ScoreTermQueue(maxClauseCount);
    try
    {
        // BUGFIX: minScore must survive across iterations. It was previously
        // declared inside the loop, resetting to 0 on every term, which made
        // the "queue full -> skip non-competitive term" check a no-op.
        float minScore = 0.0f;
        do
        {
            Term t = enumerator.Term();
            if (t != null)
            {
                float score = enumerator.Difference();
                // Terms come in alphabetical order; once the queue is full,
                // only a score beating the current minimum can get in.
                if (stQueue.Size() < maxClauseCount || score > minScore)
                {
                    stQueue.Insert(new ScoreTerm(t, score));
                    minScore = ((ScoreTerm)stQueue.Top()).score; // maintain minScore
                }
            }
        } while (enumerator.Next());
    }
    finally
    {
        enumerator.Close();
    }

    // Disable coord: clause count should not influence scoring here.
    BooleanQuery query = new BooleanQuery(true);
    int size = stQueue.Size();
    for (int i = 0; i < size; i++)
    {
        ScoreTerm st = (ScoreTerm)stQueue.Pop();
        TermQuery tq = new TermQuery(st.term);      // found a match
        tq.SetBoost(GetBoost() * st.score);         // set the boost
        query.Add(tq, BooleanClause.Occur.SHOULD);  // add to query
    }
    return (query);
}
/// <summary>
/// Rewrites the fuzzy query into a coord-free BooleanQuery over the
/// best-matching index terms. A single ScoreTerm instance is recycled
/// between queue insertions to cut down on allocations.
/// </summary>
public override Query Rewrite(IndexReader reader)
{
    FilteredTermEnum enumerator = GetEnum(reader);
    int maxClauseCount = BooleanQuery.GetMaxClauseCount();
    ScoreTermQueue stQueue = new ScoreTermQueue(maxClauseCount);
    // Recycled holder; after an insert it points at whatever entry the
    // queue evicted (the last "rejected" score).
    ScoreTerm spare = null;
    try
    {
        do
        {
            Term candidate = enumerator.Term();
            if (candidate != null)
            {
                float score = enumerator.Difference();
                if (spare == null)
                {
                    // First candidate: nothing to recycle yet.
                    spare = new ScoreTerm(candidate, score);
                    spare = (ScoreTerm)stQueue.InsertWithOverflow(spare);
                }
                else if (score >= spare.score)
                {
                    // Competitive: overwrite the recycled entry and retry the queue.
                    spare.score = score;
                    spare.term = candidate;
                    spare = (ScoreTerm)stQueue.InsertWithOverflow(spare);
                }
                // Otherwise the score cannot beat the last rejected entry,
                // so it cannot enter the queue either - skip it.
            }
        } while (enumerator.Next());
    }
    finally
    {
        enumerator.Close();
    }

    // coord disabled: the number of matching clauses should not scale the score.
    BooleanQuery result = new BooleanQuery(true);
    int kept = stQueue.Size();
    for (int i = 0; i < kept; i++)
    {
        ScoreTerm top = (ScoreTerm)stQueue.Pop();
        TermQuery clause = new TermQuery(top.term);     // found a match
        clause.SetBoost(GetBoost() * top.score);        // fold the match score into the boost
        result.Add(clause, BooleanClause.Occur.SHOULD); // add to query
    }
    return (result);
}
/// <summary>
/// Rewrites the fuzzy query into a coord-free BooleanQuery of competitive
/// terms. Terms too short for fuzzy matching short-circuit to an exact
/// TermQuery. A single ScoreTerm is reused across queue insertions.
/// </summary>
public override Query Rewrite(IndexReader reader)
{
    if (!termLongEnough)
    {
        // can only match if it's exact
        return new TermQuery(term);
    }

    FilteredTermEnum enumerator = GetEnum(reader);
    int maxClauseCount = BooleanQuery.GetMaxClauseCount();
    ScoreTermQueue stQueue = new ScoreTermQueue(maxClauseCount);
    // After each insert this holds the evicted (lowest, "rejected") entry,
    // which doubles as both a reuse buffer and a competitiveness threshold.
    ScoreTerm recycled = null;
    try
    {
        do
        {
            Term candidate = enumerator.Term();
            if (candidate != null)
            {
                float score = enumerator.Difference();
                if (recycled == null)
                {
                    recycled = new ScoreTerm(candidate, score);
                    recycled = (ScoreTerm)stQueue.InsertWithOverflow(recycled);
                }
                else if (score >= recycled.score)
                {
                    // Beats (or ties) the last rejected entry - worth offering
                    // to the queue again via the recycled instance.
                    recycled.score = score;
                    recycled.term = candidate;
                    recycled = (ScoreTerm)stQueue.InsertWithOverflow(recycled);
                }
                // else: cannot beat the last rejected score, skip.
            }
        } while (enumerator.Next());
    }
    finally
    {
        enumerator.Close();
    }

    BooleanQuery rewritten = new BooleanQuery(true); // coord disabled
    int kept = stQueue.Size();
    for (int i = 0; i < kept; i++)
    {
        ScoreTerm top = (ScoreTerm)stQueue.Pop();
        TermQuery clause = new TermQuery(top.term);        // found a match
        clause.SetBoost(GetBoost() * top.score);           // set the boost
        rewritten.Add(clause, BooleanClause.Occur.SHOULD); // add to query
    }
    return rewritten;
}
/// <summary>
/// Rewrites this fuzzy-like-this query: gathers candidate terms for every
/// registered field, groups the top-ranked variants by their source term,
/// and assembles them into a BooleanQuery. The result is cached.
/// </summary>
public override Query Rewrite(IndexReader reader)
{
    // Serve the cached rewrite if we have already done the work.
    if (rewrittenQuery != null)
    {
        return (rewrittenQuery);
    }

    // Load up the list of possible terms for each field/query-string pair.
    foreach (FieldVals f in fieldVals)
    {
        AddTerms(reader, f);
    }
    // The field list is single-use; clear it once consumed.
    fieldVals.Clear();

    BooleanQuery combined = new BooleanQuery();

    // Step 1: bucket the globally ranked ScoreTerms by the source term
    // each variant was derived from.
    Hashtable variantsBySource = new Hashtable();
    int rankedCount = q.Size();
    for (int i = 0; i < rankedCount; i++)
    {
        ScoreTerm scored = (ScoreTerm)q.Pop();
        ArrayList bucket = (ArrayList)variantsBySource[scored.fuzziedSourceTerm];
        if (bucket == null)
        {
            bucket = new ArrayList();
            variantsBySource.Add(scored.fuzziedSourceTerm, bucket);
        }
        bucket.Add(scored);
    }

    // Step 2: wrap each bucket in a zero-coord BooleanQuery so variants of
    // one source term behave as alternatives rather than stacking up.
    foreach (ArrayList bucket in variantsBySource.Values)
    {
        if (bucket.Count == 1)
        {
            // Only one selected variant - add it directly, no wrapper needed.
            ScoreTerm only = (ScoreTerm)bucket[0];
            TermQuery single = new FuzzyTermQuery(only.term, ignoreTF);
            single.SetBoost(only.score); // boost is a mix of IDF and score
            combined.Add(single, BooleanClause.Occur.SHOULD);
        }
        else
        {
            // Disable coord and IDF for these term variants.
            BooleanQuery alternatives = new BooleanQuery(true);
            foreach (ScoreTerm scored in bucket)
            {
                TermQuery variant = new FuzzyTermQuery(scored.term, ignoreTF); // found a match
                variant.SetBoost(scored.score);                                // boost from the ScoreTerm's score
                alternatives.Add(variant, BooleanClause.Occur.SHOULD);         // add to query
            }
            combined.Add(alternatives, BooleanClause.Occur.SHOULD); // add to query
        }
    }

    // TODO possible alternative step 3 - organize above booleans into a new
    // layer of field-based booleans with a minimum-should-match of NumFields-1?
    combined.SetBoost(GetBoost());
    this.rewrittenQuery = combined;
    return (combined);
}
/// <summary>
/// Tokenizes <paramref name="f"/>'s query string and, for each distinct token,
/// enumerates fuzzy variants from the index, keeps the top
/// <c>MAX_VARIANTS_PER_TERM</c> by edit-distance score, rescores each kept
/// variant as <c>score^2 * idf</c>, and inserts it into the global queue <c>q</c>.
/// </summary>
/// <param name="reader">index to draw terms and doc frequencies from</param>
/// <param name="f">field name, query string, and fuzziness settings; no-op when its query string is null</param>
private void AddTerms(IndexReader reader, FieldVals f)
{
    if (f.queryString == null)
    {
        return;
    }
    TokenStream ts = analyzer.TokenStream(f.fieldName, new System.IO.StringReader(f.queryString));
    TermAttribute termAtt = (TermAttribute)ts.AddAttribute(typeof(TermAttribute));
    int corpusNumDocs = reader.NumDocs();
    Term internSavingTemplateTerm = new Term(f.fieldName); //optimization to avoid constructing new Term() objects
    // Tokens already handled; used purely as a set to dedupe repeated tokens.
    Hashtable processedTerms = new Hashtable();
    // NOTE(review): ts, fe and origEnum below are never closed; confirm this is
    // acceptable for the analyzers/readers used with this class.
    while (ts.IncrementToken())
    {
        String term = termAtt.Term();
        if (!processedTerms.Contains(term))
        {
            processedTerms.Add(term, term);
            ScoreTermQueue variantsQ = new ScoreTermQueue(MAX_VARIANTS_PER_TERM); //maxNum variants considered for any one term
            float minScore = 0;
            Term startTerm = internSavingTemplateTerm.CreateTerm(term);
            FuzzyTermEnum fe = new FuzzyTermEnum(reader, startTerm, f.minSimilarity, f.prefixLength);
            // Separate exact-term enum: only used to read the direct match's docFreq.
            TermEnum origEnum = reader.Terms(startTerm);
            int df = 0;
            if (startTerm.Equals(origEnum.Term()))
            {
                df = origEnum.DocFreq(); //store the df so all variants use same idf
            }
            int numVariants = 0;
            int totalVariantDocFreqs = 0;
            do
            {
                Term possibleMatch = fe.Term();
                if (possibleMatch != null)
                {
                    numVariants++;
                    totalVariantDocFreqs += fe.DocFreq();
                    float score = fe.Difference();
                    // Once the queue is full, only scores above the current
                    // minimum are worth inserting.
                    if (variantsQ.Size() < MAX_VARIANTS_PER_TERM || score > minScore)
                    {
                        ScoreTerm st = new ScoreTerm(possibleMatch, score, startTerm);
                        variantsQ.Insert(st);
                        minScore = ((ScoreTerm)variantsQ.Top()).score; // maintain minScore
                    }
                }
            } while (fe.Next());
            if (numVariants > 0)
            {
                int avgDf = totalVariantDocFreqs / numVariants;
                if (df == 0) //no direct match we can use as df for all variants
                {
                    df = avgDf; //use avg df of all variants
                }
                // take the top variants (scored by edit distance) and reset the score
                // to include an IDF factor then add to the global queue for ranking
                // overall top query terms
                int size = variantsQ.Size();
                for (int i = 0; i < size; i++)
                {
                    ScoreTerm st = (ScoreTerm)variantsQ.Pop();
                    st.score = (st.score * st.score) * sim.Idf(df, corpusNumDocs);
                    q.Insert(st);
                }
            }
        }
    }
}
/// <summary>
/// Rewrites this fuzzy-like-this query: gathers candidate terms per field,
/// groups the globally ranked variants by source term, and builds a
/// BooleanQuery whose per-term variant groups score as alternatives.
/// The rewritten query is cached and reused on subsequent calls.
/// </summary>
public override Query Rewrite(IndexReader reader)
{
    if (rewrittenQuery != null)
    {
        return (rewrittenQuery);
    }
    // Load up the list of possible terms. foreach (rather than a manual
    // IEnumerator<T> loop) guarantees the enumerator is disposed, even if
    // AddTerms throws - IEnumerator<T> implements IDisposable.
    foreach (FieldVals f in fieldVals)
    {
        AddTerms(reader, f);
    }
    // Clear the single-use list of fields.
    fieldVals.Clear();

    BooleanQuery bq = new BooleanQuery();
    // Create BooleanQueries to hold the variants for each token/field pair
    // and ensure it has no coord factor.
    // Step 1: group the scored termqueries by their fuzzied source term.
    IDictionary<Term, List<ScoreTerm>> variantQueries = new Dictionary<Term, List<ScoreTerm>>();
    int size = q.Size();
    for (int i = 0; i < size; i++)
    {
        ScoreTerm st = q.Pop();
        List<ScoreTerm> l;
        if (!variantQueries.TryGetValue(st.fuzziedSourceTerm, out l) || l == null)
        {
            l = new List<ScoreTerm>();
            variantQueries[st.fuzziedSourceTerm] = l;
        }
        l.Add(st);
    }

    // Step 2: organize the sorted termqueries into zero-coord scoring boolean queries.
    foreach (List<ScoreTerm> variants in variantQueries.Values)
    {
        if (variants.Count == 1)
        {
            // Optimize where only one selected variant.
            ScoreTerm st = variants[0];
            Query tq = ignoreTF ? (Query)new ConstantScoreQuery(new TermQuery(st.term)) : new TermQuery(st.term, 1);
            tq.Boost = st.score; // set the boost to a mix of IDF and score
            bq.Add(tq, BooleanClause.Occur.SHOULD);
        }
        else
        {
            // Disable coord and IDF for these term variants.
            BooleanQuery termVariants = new BooleanQuery(true);
            foreach (ScoreTerm st in variants)
            {
                // found a match
                Query tq = ignoreTF ? (Query)new ConstantScoreQuery(new TermQuery(st.term)) : new TermQuery(st.term, 1);
                tq.Boost = st.score;                             // set the boost using the ScoreTerm's score
                termVariants.Add(tq, BooleanClause.Occur.SHOULD); // add to query
            }
            bq.Add(termVariants, BooleanClause.Occur.SHOULD); // add to query
        }
    }

    // TODO possible alternative step 3 - organize above booleans into a new layer of field-based
    // booleans with a minimum-should-match of NumFields-1?
    bq.Boost = Boost;
    this.rewrittenQuery = bq;
    return (bq);
}
/// <summary>
/// Tokenizes <paramref name="f"/>'s query string and, for each distinct token,
/// enumerates fuzzy variants via <c>SlowFuzzyTermsEnum</c>, keeps the top
/// <c>MAX_VARIANTS_PER_TERM</c> by boost, rescores each kept variant as
/// <c>score^2 * idf</c>, and pushes it onto the global queue <c>q</c>.
/// </summary>
/// <param name="reader">index to draw terms and doc frequencies from</param>
/// <param name="f">field name, query string, and fuzziness settings; no-op when its query string is null or the field has no terms</param>
private void AddTerms(IndexReader reader, FieldVals f)
{
    if (f.queryString == null)
    {
        return;
    }
    Terms terms = MultiFields.GetTerms(reader, f.fieldName);
    if (terms == null)
    {
        return;
    }
    TokenStream ts = analyzer.TokenStream(f.fieldName, f.queryString);
    try
    {
        ICharTermAttribute termAtt = ts.AddAttribute<ICharTermAttribute>();
        int corpusNumDocs = reader.NumDocs;
        // Tokens already handled, to dedupe repeated tokens in the query string.
        HashSet<string> processedTerms = new HashSet<string>();
        ts.Reset();
        while (ts.IncrementToken())
        {
            string term = termAtt.ToString();
            if (!processedTerms.Contains(term))
            {
                processedTerms.Add(term);
                // maxNum variants considered for any one term
                ScoreTermQueue variantsQ = new ScoreTermQueue(MAX_VARIANTS_PER_TERM);
                float minScore = 0;
                Term startTerm = new Term(f.fieldName, term);
                AttributeSource atts = new AttributeSource();
                // Channel for telling the enum the lowest boost still worth
                // producing, so it can prune non-competitive terms.
                IMaxNonCompetitiveBoostAttribute maxBoostAtt = atts.AddAttribute<IMaxNonCompetitiveBoostAttribute>();
#pragma warning disable 612, 618
                SlowFuzzyTermsEnum fe = new SlowFuzzyTermsEnum(terms, atts, startTerm, f.minSimilarity, f.prefixLength);
#pragma warning restore 612, 618
                //store the df so all variants use same idf
                int df = reader.DocFreq(startTerm);
                int numVariants = 0;
                int totalVariantDocFreqs = 0;
                BytesRef possibleMatch;
                // The enum reports each candidate's similarity via this attribute.
                IBoostAttribute boostAtt = fe.Attributes().AddAttribute<IBoostAttribute>();
                while ((possibleMatch = fe.Next()) != null)
                {
                    numVariants++;
                    totalVariantDocFreqs += fe.DocFreq();
                    float score = boostAtt.Boost;
                    // Once the queue is full, only scores above the current
                    // minimum are worth inserting.
                    if (variantsQ.Size() < MAX_VARIANTS_PER_TERM || score > minScore)
                    {
                        // Deep-copy the bytes: the enum reuses its BytesRef buffer.
                        ScoreTerm st = new ScoreTerm(new Term(startTerm.Field, BytesRef.DeepCopyOf(possibleMatch)), score, startTerm);
                        variantsQ.InsertWithOverflow(st);
                        minScore = variantsQ.Top().score; // maintain minScore
                    }
                    // Advertise the competitive floor once the queue is full so
                    // the enum can skip terms that cannot make the cut.
                    maxBoostAtt.MaxNonCompetitiveBoost = variantsQ.Size() >= MAX_VARIANTS_PER_TERM ? minScore : float.NegativeInfinity;
                }
                if (numVariants > 0)
                {
                    int avgDf = totalVariantDocFreqs / numVariants;
                    if (df == 0) //no direct match we can use as df for all variants
                    {
                        df = avgDf; //use avg df of all variants
                    }
                    // take the top variants (scored by edit distance) and reset the score
                    // to include an IDF factor then add to the global queue for ranking
                    // overall top query terms
                    int size = variantsQ.Size();
                    for (int i = 0; i < size; i++)
                    {
                        ScoreTerm st = variantsQ.Pop();
                        st.score = (st.score * st.score) * sim.Idf(df, corpusNumDocs);
                        q.InsertWithOverflow(st);
                    }
                }
            }
        }
        ts.End();
    }
    finally
    {
        IOUtils.CloseWhileHandlingException(ts);
    }
}
/// <summary>
/// Expands this fuzzy query into a coord-free BooleanQuery of the
/// best-matching terms, each boosted by <c>GetBoost() * editDistanceScore</c>.
/// </summary>
public override Query Rewrite(IndexReader reader)
{
    FilteredTermEnum enumerator = GetEnum(reader);
    int maxClauseCount = BooleanQuery.GetMaxClauseCount();
    ScoreTermQueue stQueue = new ScoreTermQueue(maxClauseCount);
    try
    {
        // BUGFIX: minScore was declared inside the loop, resetting to 0 on
        // every term and defeating the skip optimization below. It must track
        // the queue's minimum across the whole enumeration.
        float minScore = 0.0f;
        do
        {
            Term t = enumerator.Term();
            if (t != null)
            {
                float score = enumerator.Difference();
                // Terms come in alphabetical order; with a full queue, only a
                // score beating the current minimum can displace an entry.
                if (stQueue.Size() < maxClauseCount || score > minScore)
                {
                    stQueue.Insert(new ScoreTerm(t, score));
                    minScore = ((ScoreTerm)stQueue.Top()).score; // maintain minScore
                }
            }
        } while (enumerator.Next());
    }
    finally
    {
        enumerator.Close();
    }

    BooleanQuery query = new BooleanQuery(true); // coord disabled
    int size = stQueue.Size();
    for (int i = 0; i < size; i++)
    {
        ScoreTerm st = (ScoreTerm)stQueue.Pop();
        TermQuery tq = new TermQuery(st.term);      // found a match
        tq.SetBoost(GetBoost() * st.score);         // set the boost
        query.Add(tq, BooleanClause.Occur.SHOULD);  // add to query
    }
    return query;
}