Ejemplo n.º 1
0
        /// <summary>
        /// Rewrites this query into a <c>BooleanQuery</c> (constructed with the <c>true</c> flag)
        /// that holds one optional <c>TermQuery</c> clause per term retained from the
        /// enumeration returned by <c>GetEnum</c>.
        /// </summary>
        /// <param name="reader">Index reader handed to <c>GetEnum</c> to enumerate candidate terms.</param>
        /// <returns>
        /// The rewritten query; each clause is boosted by <c>GetBoost()</c> multiplied by the
        /// score the enumerator reported for its term.
        /// </returns>
        public override Query Rewrite(IndexReader reader)
        {
            FilteredTermEnum enumerator   = GetEnum(reader);
            int            maxClauseCount = BooleanQuery.GetMaxClauseCount();
            ScoreTermQueue stQueue        = new ScoreTermQueue(maxClauseCount);

            // Score of the queue's current top entry. It has to live across iterations:
            // when it was declared inside the loop it was reset to 0 for every term, so
            // the "queue is full and score is not bigger" shortcut below never applied.
            float minScore = 0.0f;

            try
            {
                do
                {
                    Term t = enumerator.Term();
                    if (t != null)
                    {
                        float score = enumerator.Difference();
                        // terms come in alphabetical order, therefore if queue is full and score
                        // not bigger than minScore, we can skip
                        if (stQueue.Size() < maxClauseCount || score > minScore)
                        {
                            // NOTE(review): relies on Insert coping with a full queue, exactly
                            // as the previous code did - confirm against ScoreTermQueue.
                            stQueue.Insert(new ScoreTerm(t, score));
                            minScore = ((ScoreTerm)stQueue.Top()).score; // maintain minScore
                        }
                    }
                }while (enumerator.Next());
            }
            finally
            {
                // always release the term enumeration, even if scoring threw
                enumerator.Close();
            }

            BooleanQuery query = new BooleanQuery(true);
            int          size  = stQueue.Size();

            // drain the queue: one boosted, optional clause per retained term
            for (int i = 0; i < size; i++)
            {
                ScoreTerm st = (ScoreTerm)stQueue.Pop();
                TermQuery tq = new TermQuery(st.term);              // found a match
                tq.SetBoost(GetBoost() * st.score);                 // set the boost
                query.Add(tq, BooleanClause.Occur.SHOULD);          // add to query
            }

            return(query);
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Rewrites this query into a <c>BooleanQuery</c> (constructed with the <c>true</c> flag)
        /// with one optional <c>TermQuery</c> clause per term left in a bounded
        /// <c>ScoreTermQueue</c> after walking the enumeration returned by <c>GetEnum</c>.
        /// </summary>
        /// <param name="reader">Index reader handed to <c>GetEnum</c>.</param>
        /// <returns>The rewritten query; each clause is boosted by <c>GetBoost()</c> times its term's score.</returns>
        public override Query Rewrite(IndexReader reader)
        {
            FilteredTermEnum termEnum    = GetEnum(reader);
            int              clauseLimit = BooleanQuery.GetMaxClauseCount();
            ScoreTermQueue   best        = new ScoreTermQueue(clauseLimit);

            // Whatever InsertWithOverflow handed back last time (null or an entry the
            // queue did not keep); recycled so a new ScoreTerm is not allocated per term.
            ScoreTerm spare = null;

            try
            {
                do
                {
                    Term candidate = termEnum.Term();
                    if (candidate == null)
                    {
                        continue;   // nothing at this position; advance the enumeration
                    }

                    float similarity = termEnum.Difference();
                    if (spare == null)
                    {
                        spare = new ScoreTerm(candidate, similarity);
                    }
                    else
                    {
                        // the spare carries the score of the last entry the queue gave
                        // back, so a candidate scoring below it is not worth offering
                        if (!(similarity >= spare.score))
                        {
                            continue;
                        }
                        spare.score = similarity;
                        spare.term  = candidate;
                    }

                    spare = (ScoreTerm)best.InsertWithOverflow(spare);
                }
                while (termEnum.Next());
            }
            finally
            {
                termEnum.Close();
            }

            BooleanQuery result = new BooleanQuery(true);

            // empty the queue, turning every surviving entry into a boosted clause
            for (int remaining = best.Size(); remaining > 0; remaining--)
            {
                ScoreTerm entry  = (ScoreTerm)best.Pop();
                TermQuery clause = new TermQuery(entry.term);
                clause.SetBoost(GetBoost() * entry.score);
                result.Add(clause, BooleanClause.Occur.SHOULD);
            }

            return result;
        }
Ejemplo n.º 3
0
		/// <summary>
		/// Rewrites this query. When <c>termLongEnough</c> is false the result is a plain
		/// <c>TermQuery</c> on <c>term</c>; otherwise it is a <c>BooleanQuery</c> (constructed
		/// with the <c>true</c> flag) with one optional <c>TermQuery</c> clause per term left
		/// in a bounded <c>ScoreTermQueue</c> after walking the enumeration from <c>GetEnum</c>.
		/// </summary>
		/// <param name="reader">Index reader handed to <c>GetEnum</c>.</param>
		/// <returns>The rewritten query; each boolean clause is boosted by <c>GetBoost()</c> times its term's score.</returns>
		public override Query Rewrite(IndexReader reader)
		{
			// can only match if it's exact
			if (!termLongEnough)
			{
				return new TermQuery(term);
			}
			
			FilteredTermEnum termEnum = GetEnum(reader);
			int clauseLimit = BooleanQuery.GetMaxClauseCount();
			ScoreTermQueue best = new ScoreTermQueue(clauseLimit);
			
			// Holds whatever InsertWithOverflow returned last (null or an entry the queue
			// did not keep) so the object can be reused for the next candidate.
			ScoreTerm spare = null;
			
			try
			{
				do 
				{
					Term candidate = termEnum.Term();
					if (candidate == null)
					{
						continue; // no term here; move on to Next()
					}
					
					float similarity = termEnum.Difference();
					if (spare != null)
					{
						// a candidate scoring below the last returned entry is not
						// worth offering to the queue
						if (!(similarity >= spare.score))
						{
							continue;
						}
						spare.score = similarity;
						spare.term = candidate;
					}
					else
					{
						spare = new ScoreTerm(candidate, similarity);
					}
					
					spare = (ScoreTerm) best.InsertWithOverflow(spare);
				}
				while (termEnum.Next());
			}
			finally
			{
				termEnum.Close();
			}
			
			BooleanQuery result = new BooleanQuery(true);
			int remaining = best.Size();
			while (remaining > 0)
			{
				ScoreTerm entry = (ScoreTerm) best.Pop();
				TermQuery clause = new TermQuery(entry.term);
				clause.SetBoost(GetBoost() * entry.score);
				result.Add(clause, BooleanClause.Occur.SHOULD);
				remaining--;
			}
			
			return result;
		}
Ejemplo n.º 4
0
        /// <summary>
        /// Builds the rewritten form of this query once and caches it in <c>rewrittenQuery</c>.
        /// Every pending entry in <c>fieldVals</c> is passed to <c>AddTerms</c>, the list is
        /// cleared, and the scored terms popped from <c>q</c> are grouped by their
        /// <c>fuzziedSourceTerm</c> into optional clauses of one outer <c>BooleanQuery</c>.
        /// </summary>
        /// <param name="reader">Index reader forwarded to <c>AddTerms</c>.</param>
        /// <returns>The cached query if one exists, otherwise the newly built one.</returns>
        public override Query Rewrite(IndexReader reader)
        {
            if (rewrittenQuery != null)
            {
                return rewrittenQuery;
            }

            // expand every pending field/value pair, then forget the pairs
            foreach (FieldVals pending in fieldVals)
            {
                AddTerms(reader, pending);
            }
            fieldVals.Clear();

            BooleanQuery combined = new BooleanQuery();

            // Step 1: bucket the scored terms by the source term they were derived from
            Hashtable bySourceTerm = new Hashtable();
            for (int remaining = q.Size(); remaining > 0; remaining--)
            {
                ScoreTerm scored = (ScoreTerm)q.Pop();
                ArrayList bucket = (ArrayList)bySourceTerm[scored.fuzziedSourceTerm];
                if (bucket == null)
                {
                    bucket = new ArrayList();
                    bySourceTerm.Add(scored.fuzziedSourceTerm, bucket);
                }
                bucket.Add(scored);
            }

            // Step 2: one optional clause per bucket
            foreach (ArrayList group in bySourceTerm.Values)
            {
                if (group.Count != 1)
                {
                    // several variants: wrap them in their own boolean query
                    // (disable coord and IDF for these term variants)
                    BooleanQuery variantGroup = new BooleanQuery(true);
                    foreach (ScoreTerm variant in group)
                    {
                        TermQuery variantClause = new FuzzyTermQuery(variant.term, ignoreTF);
                        variantClause.SetBoost(variant.score); // boost comes from the ScoreTerm's score
                        variantGroup.Add(variantClause, BooleanClause.Occur.SHOULD);
                    }
                    combined.Add(variantGroup, BooleanClause.Occur.SHOULD);
                    continue;
                }

                // a single variant needs no wrapper query
                ScoreTerm only       = (ScoreTerm)group[0];
                TermQuery onlyClause = new FuzzyTermQuery(only.term, ignoreTF);
                onlyClause.SetBoost(only.score); // set the boost to a mix of IDF and score
                combined.Add(onlyClause, BooleanClause.Occur.SHOULD);
            }

            //TODO possible alternative step 3 - organize above booleans into a new layer of field-based
            // booleans with a minimum-should-match of NumFields-1?
            combined.SetBoost(GetBoost());
            this.rewrittenQuery = combined;
            return combined;
        }
Ejemplo n.º 5
0
        /// <summary>
        /// Analyzes <c>f.queryString</c> with <c>analyzer</c> and, for every distinct token,
        /// enumerates fuzzy variants with a <c>FuzzyTermEnum</c>, keeps up to
        /// <c>MAX_VARIANTS_PER_TERM</c> of them in a local queue, rescores them with an IDF
        /// factor and inserts them into the shared queue <c>q</c>.
        /// </summary>
        /// <param name="reader">Index reader used for document counts and term enumeration.</param>
        /// <param name="f">Field name, query text, minimum similarity and prefix length to expand;
        /// nothing happens when its <c>queryString</c> is null.</param>
        private void AddTerms(IndexReader reader, FieldVals f)
        {
            if (f.queryString == null)
            {
                return;
            }
            TokenStream ts = analyzer.TokenStream(f.fieldName, new System.IO.StringReader(f.queryString));

            try
            {
                TermAttribute termAtt = (TermAttribute)ts.AddAttribute(typeof(TermAttribute));

                int       corpusNumDocs            = reader.NumDocs();
                Term      internSavingTemplateTerm = new Term(f.fieldName); //optimization to avoid constructing new Term() objects
                Hashtable processedTerms           = new Hashtable();

                while (ts.IncrementToken())
                {
                    String term = termAtt.Term();
                    if (!processedTerms.Contains(term))
                    {
                        processedTerms.Add(term, term);
                        ScoreTermQueue variantsQ = new ScoreTermQueue(MAX_VARIANTS_PER_TERM); //maxNum variants considered for any one term
                        float          minScore  = 0;
                        Term           startTerm = internSavingTemplateTerm.CreateTerm(term);
                        int            df                   = 0;
                        int            numVariants          = 0;
                        int            totalVariantDocFreqs = 0;

                        FuzzyTermEnum fe = new FuzzyTermEnum(reader, startTerm, f.minSimilarity, f.prefixLength);
                        try
                        {
                            // look up the exact term only to read its document frequency,
                            // then release that enumeration straight away
                            TermEnum origEnum = reader.Terms(startTerm);
                            try
                            {
                                if (startTerm.Equals(origEnum.Term()))
                                {
                                    df = origEnum.DocFreq(); //store the df so all variants use same idf
                                }
                            }
                            finally
                            {
                                origEnum.Close();
                            }

                            do
                            {
                                Term possibleMatch = fe.Term();
                                if (possibleMatch != null)
                                {
                                    numVariants++;
                                    totalVariantDocFreqs += fe.DocFreq();
                                    float score = fe.Difference();
                                    if (variantsQ.Size() < MAX_VARIANTS_PER_TERM || score > minScore)
                                    {
                                        ScoreTerm st = new ScoreTerm(possibleMatch, score, startTerm);
                                        variantsQ.Insert(st);
                                        minScore = ((ScoreTerm)variantsQ.Top()).score; // maintain minScore
                                    }
                                }
                            }while (fe.Next());
                        }
                        finally
                        {
                            // previously leaked: the fuzzy enumeration was never closed
                            fe.Close();
                        }

                        if (numVariants > 0)
                        {
                            int avgDf = totalVariantDocFreqs / numVariants;
                            if (df == 0)    //no direct match we can use as df for all variants
                            {
                                df = avgDf; //use avg df of all variants
                            }

                            // take the top variants (scored by edit distance) and reset the score
                            // to include an IDF factor then add to the global queue for ranking
                            // overall top query terms
                            int size = variantsQ.Size();
                            for (int i = 0; i < size; i++)
                            {
                                ScoreTerm st = (ScoreTerm)variantsQ.Pop();
                                st.score = (st.score * st.score) * sim.Idf(df, corpusNumDocs);
                                q.Insert(st);
                            }
                        }
                    }
                }
            }
            finally
            {
                // previously leaked: the token stream (and its StringReader) was never closed
                ts.Close();
            }
        }
Ejemplo n.º 6
0
        /// <summary>
        /// Builds the rewritten form of this query once and caches it in <c>rewrittenQuery</c>.
        /// Every pending entry in <c>fieldVals</c> is passed to <c>AddTerms</c>, the list is
        /// cleared, and the scored terms popped from <c>q</c> are grouped by their
        /// <c>fuzziedSourceTerm</c> into optional clauses of one outer <c>BooleanQuery</c>.
        /// </summary>
        /// <param name="reader">Index reader forwarded to <c>AddTerms</c>.</param>
        /// <returns>The cached query if one exists, otherwise the newly built one.</returns>
        public override Query Rewrite(IndexReader reader)
        {
            if (rewrittenQuery != null)
            {
                return(rewrittenQuery);
            }
            //load up the list of possible terms
            // (foreach rather than a hand-driven enumerator, so the enumerator is disposed)
            foreach (FieldVals f in fieldVals)
            {
                AddTerms(reader, f);
            }
            //clear the list of fields
            fieldVals.Clear();

            BooleanQuery bq = new BooleanQuery();


            //create BooleanQueries to hold the variants for each token/field pair and ensure it
            // has no coord factor
            //Step 1: sort the termqueries by term/field
            IDictionary <Term, List <ScoreTerm> > variantQueries = new Dictionary <Term, List <ScoreTerm> >();
            int size = q.Size();

            for (int i = 0; i < size; i++)
            {
                ScoreTerm st = q.Pop();
                List <ScoreTerm> l;
                if (!variantQueries.TryGetValue(st.fuzziedSourceTerm, out l) || l == null)
                {
                    l = new List <ScoreTerm>();
                    variantQueries[st.fuzziedSourceTerm] = l;
                }
                l.Add(st);
            }
            //Step 2: Organize the sorted termqueries into zero-coord scoring boolean queries
            foreach (List <ScoreTerm> variants in variantQueries.Values)
            {
                if (variants.Count == 1)
                {
                    //optimize where only one selected variant
                    ScoreTerm st = variants[0];
                    Query     tq = ignoreTF ? (Query) new ConstantScoreQuery(new TermQuery(st.term)) : new TermQuery(st.term, 1);
                    tq.Boost = st.score; // set the boost to a mix of IDF and score
                    bq.Add(tq, BooleanClause.Occur.SHOULD);
                }
                else
                {
                    BooleanQuery termVariants = new BooleanQuery(true); //disable coord and IDF for these term variants
                    foreach (ScoreTerm st in variants)
                    {
                        // found a match
                        Query tq = ignoreTF ? (Query) new ConstantScoreQuery(new TermQuery(st.term)) : new TermQuery(st.term, 1);
                        tq.Boost = st.score;                              // set the boost using the ScoreTerm's score
                        termVariants.Add(tq, BooleanClause.Occur.SHOULD); // add to query
                    }
                    bq.Add(termVariants, BooleanClause.Occur.SHOULD);     // add to query
                }
            }
            //TODO possible alternative step 3 - organize above booleans into a new layer of field-based
            // booleans with a minimum-should-match of NumFields-1?
            bq.Boost            = Boost;
            this.rewrittenQuery = bq;
            return(bq);
        }
Ejemplo n.º 7
0
        /// <summary>
        /// Analyzes <c>f.queryString</c> with <c>analyzer</c> and, for every distinct token,
        /// enumerates fuzzy variants with a <c>SlowFuzzyTermsEnum</c>, keeps up to
        /// <c>MAX_VARIANTS_PER_TERM</c> of them in a local queue, rescores them with an IDF
        /// factor and offers them to the shared queue <c>q</c>.
        /// </summary>
        /// <param name="reader">Index reader used for term lookup and document counts.</param>
        /// <param name="f">Field name, query text, minimum similarity and prefix length to expand.
        /// Nothing happens when its <c>queryString</c> is null or the field has no terms.</param>
        private void AddTerms(IndexReader reader, FieldVals f)
        {
            if (f.queryString == null)
            {
                return;
            }

            Terms fieldTerms = MultiFields.GetTerms(reader, f.fieldName);
            if (fieldTerms == null)
            {
                return;
            }

            TokenStream stream = analyzer.TokenStream(f.fieldName, f.queryString);
            try
            {
                ICharTermAttribute tokenAtt = stream.AddAttribute <ICharTermAttribute>();
                int docCount = reader.NumDocs;
                HashSet <string> seenTokens = new HashSet <string>();

                stream.Reset();
                while (stream.IncrementToken())
                {
                    string tokenText = tokenAtt.ToString();
                    if (!seenTokens.Add(tokenText))
                    {
                        continue;   // this token text was already expanded
                    }

                    //maxNum variants considered for any one term
                    ScoreTermQueue  topVariants = new ScoreTermQueue(MAX_VARIANTS_PER_TERM);
                    float           lowestKept  = 0;
                    Term            sourceTerm  = new Term(f.fieldName, tokenText);
                    AttributeSource attrs       = new AttributeSource();
                    IMaxNonCompetitiveBoostAttribute floorAtt =
                        attrs.AddAttribute <IMaxNonCompetitiveBoostAttribute>();
#pragma warning disable 612, 618
                    SlowFuzzyTermsEnum fuzzyEnum = new SlowFuzzyTermsEnum(fieldTerms, attrs, sourceTerm, f.minSimilarity, f.prefixLength);
#pragma warning restore 612, 618
                    //store the df so all variants use same idf
                    int sharedDf     = reader.DocFreq(sourceTerm);
                    int variantCount = 0;
                    int variantDfSum = 0;
                    IBoostAttribute boostAtt =
                        fuzzyEnum.Attributes().AddAttribute <IBoostAttribute>();

                    BytesRef match;
                    while ((match = fuzzyEnum.Next()) != null)
                    {
                        variantCount++;
                        variantDfSum += fuzzyEnum.DocFreq();

                        float boost = boostAtt.Boost;
                        if (topVariants.Size() < MAX_VARIANTS_PER_TERM || boost > lowestKept)
                        {
                            Term      variantTerm = new Term(sourceTerm.Field, BytesRef.DeepCopyOf(match));
                            ScoreTerm kept        = new ScoreTerm(variantTerm, boost, sourceTerm);
                            topVariants.InsertWithOverflow(kept);
                            lowestKept = topVariants.Top().score; // maintain the lowest kept score
                        }

                        // once the local queue is full, publish its lowest score through the
                        // max-non-competitive-boost attribute; until then publish -infinity
                        if (topVariants.Size() >= MAX_VARIANTS_PER_TERM)
                        {
                            floorAtt.MaxNonCompetitiveBoost = lowestKept;
                        }
                        else
                        {
                            floorAtt.MaxNonCompetitiveBoost = float.NegativeInfinity;
                        }
                    }

                    if (variantCount <= 0)
                    {
                        continue;   // no variants for this token
                    }

                    if (sharedDf == 0)
                    {
                        //no direct match we can use as df for all variants: use their average df
                        sharedDf = variantDfSum / variantCount;
                    }

                    // take the top variants (scored by edit distance) and reset the score
                    // to include an IDF factor then add to the global queue for ranking
                    // overall top query terms
                    for (int remaining = topVariants.Size(); remaining > 0; remaining--)
                    {
                        ScoreTerm rescored = topVariants.Pop();
                        rescored.score = (rescored.score * rescored.score) * sim.Idf(sharedDf, docCount);
                        q.InsertWithOverflow(rescored);
                    }
                }
                stream.End();
            }
            finally
            {
                IOUtils.CloseWhileHandlingException(stream);
            }
        }
Ejemplo n.º 8
0
		/// <summary>
		/// Rewrites this query into a <c>BooleanQuery</c> (constructed with the <c>true</c> flag)
		/// that holds one optional <c>TermQuery</c> clause per term retained from the
		/// enumeration returned by <c>GetEnum</c>.
		/// </summary>
		/// <param name="reader">Index reader handed to <c>GetEnum</c> to enumerate candidate terms.</param>
		/// <returns>
		/// The rewritten query; each clause is boosted by <c>GetBoost()</c> multiplied by the
		/// score the enumerator reported for its term.
		/// </returns>
		public override Query Rewrite(IndexReader reader)
		{
			FilteredTermEnum enumerator = GetEnum(reader);
			int maxClauseCount = BooleanQuery.GetMaxClauseCount();
			ScoreTermQueue stQueue = new ScoreTermQueue(maxClauseCount);
			
			// Score of the queue's current top entry. It has to live across iterations:
			// when it was declared inside the loop it was reset to 0 for every term, so
			// the "queue is full and score is not bigger" shortcut below never applied.
			float minScore = 0.0f;
			
			try
			{
				do 
				{
					Term t = enumerator.Term();
					if (t != null)
					{
						float score = enumerator.Difference();
						// terms come in alphabetical order, therefore if queue is full and score
						// not bigger than minScore, we can skip
						if (stQueue.Size() < maxClauseCount || score > minScore)
						{
							// NOTE(review): relies on Insert coping with a full queue, exactly
							// as the previous code did - confirm against ScoreTermQueue.
							stQueue.Insert(new ScoreTerm(t, score));
							minScore = ((ScoreTerm) stQueue.Top()).score; // maintain minScore
						}
					}
				}
				while (enumerator.Next());
			}
			finally
			{
				// always release the term enumeration, even if scoring threw
				enumerator.Close();
			}
			
			BooleanQuery query = new BooleanQuery(true);
			int size = stQueue.Size();
			// drain the queue: one boosted, optional clause per retained term
			for (int i = 0; i < size; i++)
			{
				ScoreTerm st = (ScoreTerm) stQueue.Pop();
				TermQuery tq = new TermQuery(st.term); // found a match
				tq.SetBoost(GetBoost() * st.score); // set the boost
				query.Add(tq, BooleanClause.Occur.SHOULD); // add to query
			}
			
			return query;
		}