Exemplo n.º 1
0
        public override Query Rewrite(IndexReader reader)
        {
            FilteredTermEnum enumerator   = GetEnum(reader);
            int            maxClauseCount = BooleanQuery.GetMaxClauseCount();
            ScoreTermQueue stQueue        = new ScoreTermQueue(maxClauseCount);
            ScoreTerm      reusableST     = null;

            try
            {
                do
                {
                    float score = 0.0f;
                    Term  t     = enumerator.Term();
                    if (t != null)
                    {
                        score = enumerator.Difference();
                        if (reusableST == null)
                        {
                            reusableST = new ScoreTerm(t, score);
                        }
                        else if (score >= reusableST.score)
                        {
                            // reusableST holds the last "rejected" entry, so, if
                            // this new score is not better than that, there's no
                            // need to try inserting it
                            reusableST.score = score;
                            reusableST.term  = t;
                        }
                        else
                        {
                            continue;
                        }

                        reusableST = (ScoreTerm)stQueue.InsertWithOverflow(reusableST);
                    }
                }while (enumerator.Next());
            }
            finally
            {
                enumerator.Close();
            }

            BooleanQuery query = new BooleanQuery(true);
            int          size  = stQueue.Size();

            for (int i = 0; i < size; i++)
            {
                ScoreTerm st = (ScoreTerm)stQueue.Pop();
                TermQuery tq = new TermQuery(st.term);              // found a match
                tq.SetBoost(GetBoost() * st.score);                 // set the boost
                query.Add(tq, BooleanClause.Occur.SHOULD);          // add to query
            }

            return(query);
        }
Exemplo n.º 2
0
        private void AddTerms(IndexReader reader, FieldVals f)
        {
            if (f.queryString is null)
            {
                return;
            }
            Terms terms = MultiFields.GetTerms(reader, f.fieldName);

            if (terms is null)
            {
                return;
            }
            TokenStream ts = analyzer.GetTokenStream(f.fieldName, f.queryString);

            try
            {
                ICharTermAttribute termAtt = ts.AddAttribute <ICharTermAttribute>();

                int           corpusNumDocs  = reader.NumDocs;
                ISet <string> processedTerms = new JCG.HashSet <string>();
                ts.Reset();
                while (ts.IncrementToken())
                {
                    string term = termAtt.ToString();
                    if (!processedTerms.Contains(term))
                    {
                        processedTerms.Add(term);
                        ScoreTermQueue  variantsQ = new ScoreTermQueue(MAX_VARIANTS_PER_TERM); //maxNum variants considered for any one term
                        float           minScore  = 0;
                        Term            startTerm = new Term(f.fieldName, term);
                        AttributeSource atts      = new AttributeSource();
                        IMaxNonCompetitiveBoostAttribute maxBoostAtt =
                            atts.AddAttribute <IMaxNonCompetitiveBoostAttribute>();
#pragma warning disable 612, 618
                        SlowFuzzyTermsEnum fe = new SlowFuzzyTermsEnum(terms, atts, startTerm, f.minSimilarity, f.prefixLength);
#pragma warning restore 612, 618
                        //store the df so all variants use same idf
                        int             df                   = reader.DocFreq(startTerm);
                        int             numVariants          = 0;
                        int             totalVariantDocFreqs = 0;
                        BytesRef        possibleMatch;
                        IBoostAttribute boostAtt =
                            fe.Attributes.AddAttribute <IBoostAttribute>();
                        while (fe.MoveNext())
                        {
                            possibleMatch = fe.Term;
                            numVariants++;
                            totalVariantDocFreqs += fe.DocFreq;
                            float score = boostAtt.Boost;
                            if (variantsQ.Count < MAX_VARIANTS_PER_TERM || score > minScore)
                            {
                                ScoreTerm st = new ScoreTerm(new Term(startTerm.Field, BytesRef.DeepCopyOf(possibleMatch)), score, startTerm);
                                variantsQ.InsertWithOverflow(st);
                                minScore = variantsQ.Top.Score; // maintain minScore
                            }
                            maxBoostAtt.MaxNonCompetitiveBoost = variantsQ.Count >= MAX_VARIANTS_PER_TERM ? minScore : float.NegativeInfinity;
                        }

                        if (numVariants > 0)
                        {
                            int avgDf = totalVariantDocFreqs / numVariants;
                            if (df == 0)    //no direct match we can use as df for all variants
                            {
                                df = avgDf; //use avg df of all variants
                            }

                            // take the top variants (scored by edit distance) and reset the score
                            // to include an IDF factor then add to the global queue for ranking
                            // overall top query terms
                            int size = variantsQ.Count;
                            for (int i = 0; i < size; i++)
                            {
                                ScoreTerm st = variantsQ.Pop();
                                st.Score = (st.Score * st.Score) * sim.Idf(df, corpusNumDocs);
                                q.InsertWithOverflow(st);
                            }
                        }
                    }
                }
                ts.End();
            }
            finally
            {
                IOUtils.DisposeWhileHandlingException(ts);
            }
        }
Exemplo n.º 3
0
		public override Query Rewrite(IndexReader reader)
		{
			if (!termLongEnough)
			{
				// can only match if it's exact
				return new TermQuery(term);
			}
			
			FilteredTermEnum enumerator = GetEnum(reader);
			int maxClauseCount = BooleanQuery.GetMaxClauseCount();
			ScoreTermQueue stQueue = new ScoreTermQueue(maxClauseCount);
			ScoreTerm reusableST = null;
			
			try
			{
				do 
				{
					float score = 0.0f;
					Term t = enumerator.Term();
					if (t != null)
					{
						score = enumerator.Difference();
						if (reusableST == null)
						{
							reusableST = new ScoreTerm(t, score);
						}
						else if (score >= reusableST.score)
						{
							// reusableST holds the last "rejected" entry, so, if
							// this new score is not better than that, there's no
							// need to try inserting it
							reusableST.score = score;
							reusableST.term = t;
						}
						else
						{
							continue;
						}
						
						reusableST = (ScoreTerm) stQueue.InsertWithOverflow(reusableST);
					}
				}
				while (enumerator.Next());
			}
			finally
			{
				enumerator.Close();
			}
			
			BooleanQuery query = new BooleanQuery(true);
			int size = stQueue.Size();
			for (int i = 0; i < size; i++)
			{
				ScoreTerm st = (ScoreTerm) stQueue.Pop();
				TermQuery tq = new TermQuery(st.term); // found a match
				tq.SetBoost(GetBoost() * st.score); // set the boost
				query.Add(tq, BooleanClause.Occur.SHOULD); // add to query
			}
			
			return query;
		}
Exemplo n.º 4
0
        private void AddTerms(IndexReader reader, FieldVals f)
        {
            if (f.queryString == null)
            {
                return;
            }
            TokenStream    ts      = analyzer.TokenStream(f.fieldName, new System.IO.StringReader(f.queryString));
            ITermAttribute termAtt = ts.AddAttribute <ITermAttribute>();

            int              corpusNumDocs            = reader.NumDocs();
            Term             internSavingTemplateTerm = new Term(f.fieldName); //optimization to avoid constructing new Term() objects
            HashSet <string> processedTerms           = new HashSet <string>();

            while (ts.IncrementToken())
            {
                String term = termAtt.Term;
                if (!processedTerms.Contains(term))
                {
                    processedTerms.Add(term);
                    ScoreTermQueue variantsQ = new ScoreTermQueue(MAX_VARIANTS_PER_TERM); //maxNum variants considered for any one term
                    float          minScore  = 0;
                    Term           startTerm = internSavingTemplateTerm.CreateTerm(term);
                    FuzzyTermEnum  fe        = new FuzzyTermEnum(reader, startTerm, f.minSimilarity, f.prefixLength);
                    TermEnum       origEnum  = reader.Terms(startTerm);
                    int            df        = 0;
                    if (startTerm.Equals(origEnum.Term))
                    {
                        df = origEnum.DocFreq(); //store the df so all variants use same idf
                    }
                    int numVariants          = 0;
                    int totalVariantDocFreqs = 0;
                    do
                    {
                        Term possibleMatch = fe.Term;
                        if (possibleMatch != null)
                        {
                            numVariants++;
                            totalVariantDocFreqs += fe.DocFreq();
                            float score = fe.Difference();
                            if (variantsQ.Size() < MAX_VARIANTS_PER_TERM || score > minScore)
                            {
                                ScoreTerm st = new ScoreTerm(possibleMatch, score, startTerm);
                                variantsQ.InsertWithOverflow(st);
                                minScore = variantsQ.Top().Score; // maintain minScore
                            }
                        }
                    }while (fe.Next());
                    if (numVariants > 0)
                    {
                        int avgDf = totalVariantDocFreqs / numVariants;
                        if (df == 0)    //no direct match we can use as df for all variants
                        {
                            df = avgDf; //use avg df of all variants
                        }

                        // take the top variants (scored by edit distance) and reset the score
                        // to include an IDF factor then add to the global queue for ranking
                        // overall top query terms
                        int size = variantsQ.Size();
                        for (int i = 0; i < size; i++)
                        {
                            ScoreTerm st = variantsQ.Pop();
                            st.Score = (st.Score * st.Score) * sim.Idf(df, corpusNumDocs);
                            q.InsertWithOverflow(st);
                        }
                    }
                }
            }
        }