Пример #1
0
        /// <summary>
        /// Creates the weight for <paramref name="query"/>: stores the searcher's similarity,
        /// builds a <see cref="TermContext"/> and collects term statistics for every term the
        /// query extracts, and computes the similarity weight when the query has a field.
        /// </summary>
        /// <param name="query"> The span query this weight is created for. </param>
        /// <param name="searcher"> Source of the similarity, the top-level reader context and the statistics. </param>
        public SpanWeight(SpanQuery query, IndexSearcher searcher)
        {
            m_similarity = searcher.Similarity;
            m_query = query;
            m_termContexts = new Dictionary<Term, TermContext>();

            // The extracted terms are gathered in a sorted set before being visited.
            ISet<Term> extractedTerms = new JCG.SortedSet<Term>();
            query.ExtractTerms(extractedTerms);

            IndexReaderContext topContext = searcher.TopReaderContext;
            TermStatistics[] perTermStats = new TermStatistics[extractedTerms.Count];

            int slot = 0;
            foreach (Term extracted in extractedTerms)
            {
                TermContext termState = TermContext.Build(topContext, extracted);
                perTermStats[slot++] = searcher.TermStatistics(extracted, termState);
                m_termContexts[extracted] = termState;
            }

            // A query without a field gets no similarity weight; m_stats is left untouched.
            if (query.Field == null)
            {
                return;
            }

            m_stats = m_similarity.ComputeWeight(query.Boost, searcher.CollectionStatistics(query.Field), perTermStats);
        }
Пример #2
0
        /// <summary>
        /// Produces the idf score factor for a single term, wrapped in an
        /// <see cref="Explanation"/> that also describes how it was obtained.
        ///
        /// <para/>
        /// This default implementation evaluates:
        ///
        /// <code>
        /// Idf(termStats.DocFreq, collectionStats.MaxDoc);
        /// </code>
        ///
        /// <see cref="CollectionStatistics.MaxDoc"/> is used rather than
        /// <see cref="Lucene.Net.Index.IndexReader.NumDocs"/> because it is paired with
        /// <see cref="TermStatistics.DocFreq"/>: when the document frequency is inaccurate,
        /// <see cref="CollectionStatistics.MaxDoc"/> is inaccurate in the same direction.
        /// <see cref="CollectionStatistics.MaxDoc"/> is also more efficient to compute.
        /// </summary>
        /// <param name="collectionStats"> Collection-level statistics; supplies the maximum document count. </param>
        /// <param name="termStats"> Term-level statistics; supplies the document frequency. </param>
        /// <returns> An <see cref="Explanation"/> whose value is the idf factor and whose
        ///           description lists the document frequency and maximum document count used. </returns>
        public virtual Explanation IdfExplain(CollectionStatistics collectionStats, TermStatistics termStats)
        {
            long docFreq = termStats.DocFreq;
            long maxDocs = collectionStats.MaxDoc;

            float factor = Idf(docFreq, maxDocs);
            string description = "idf(docFreq=" + docFreq + ", maxDocs=" + maxDocs + ")";

            return new Explanation(factor, description);
        }
Пример #3
0
            public PhraseWeight(PhraseQuery outerInstance, IndexSearcher searcher)
            {
                this.outerInstance = outerInstance;
                this.similarity    = searcher.Similarity;
                IndexReaderContext context = searcher.TopReaderContext;

                states = new TermContext[outerInstance.terms.Count];
                TermStatistics[] termStats = new TermStatistics[outerInstance.terms.Count];
                for (int i = 0; i < outerInstance.terms.Count; i++)
                {
                    Term term = outerInstance.terms[i];
                    states[i]    = TermContext.Build(context, term);
                    termStats[i] = searcher.TermStatistics(term, states[i]);
                }
                stats = similarity.ComputeWeight(outerInstance.Boost, searcher.CollectionStatistics(outerInstance.field), termStats);
            }
Пример #4
0
        /// <summary>
        /// Populates every member of <paramref name="stats"/> that <c>BasicStats</c> defines,
        /// using the collection-level and term-level statistics supplied.
        /// Subclasses can override this method to fill additional stats.
        /// </summary>
        /// <param name="stats"> The statistics object to populate. </param>
        /// <param name="collectionStats"> Collection-level statistics for the field. </param>
        /// <param name="termStats"> Term-level statistics for the term. </param>
        protected internal virtual void FillBasicStats(BasicStats stats, CollectionStatistics collectionStats, TermStatistics termStats)
        {
            // The field must hold at least as many positions as the term,
            // unless the field total is reported as -1.
            Debug.Assert(
                collectionStats.SumTotalTermFreq() == -1
                || collectionStats.SumTotalTermFreq() >= termStats.TotalTermFreq());

            long docCount = collectionStats.MaxDoc;
            long termDocFreq = termStats.DocFreq();

            // A reported value of -1 means the codec does not supply totalTermFreq: substitute docFreq.
            long reportedTermFreq = termStats.TotalTermFreq();
            long termTotalFreq = reportedTermFreq == -1 ? termDocFreq : reportedTermFreq;

            // A non-positive field total means the field does not exist, the codec does not
            // supply the measure, or frequencies were omitted. Fall back to docFreq and an
            // average length of 1, because negative values cause NaN/Inf for some scorers.
            long fieldTotalFreq = collectionStats.SumTotalTermFreq();
            bool fieldTotalUsable = fieldTotalFreq > 0;

            long fieldTokenCount = fieldTotalUsable ? fieldTotalFreq : termDocFreq;
            float averageLength = fieldTotalUsable ? (float)fieldTokenCount / docCount : 1;

            // TODO: add sumDocFreq for field (numberOfFieldPostings)
            stats.NumberOfDocuments = docCount;
            stats.NumberOfFieldTokens = fieldTokenCount;
            stats.AvgFieldLength = averageLength;
            stats.DocFreq = termDocFreq;
            stats.TotalTermFreq = termTotalFreq;
        }
 /// <summary>
 /// Idf explanation used for phrase queries. Both arguments are ignored: the result
 /// is always a constant factor of 1.0 with the description "Inexplicable".
 /// </summary>
 /// <param name="collectionStats"> Collection-level statistics (unused). </param>
 /// <param name="termStats"> Term-level statistics for the phrase terms (unused). </param>
 /// <returns> A new <see cref="Explanation"/> with value 1.0. </returns>
 public override Explanation IdfExplain(CollectionStatistics collectionStats, TermStatistics[] termStats)
 {
     Explanation constantIdf = new Explanation(1.0f, "Inexplicable");
     return constantIdf;
 }
Пример #6
0
 /// <summary>
 /// Fills the usual statistics through the base implementation, then stores the
 /// collection probability of the current term, as computed by the collection model.
 /// </summary>
 /// <param name="stats"> The statistics object to populate; it is cast to <c>LMStats</c>. </param>
 /// <param name="collectionStats"> Collection-level statistics, forwarded to the base implementation. </param>
 /// <param name="termStats"> Term-level statistics, forwarded to the base implementation. </param>
 protected internal override void FillBasicStats(BasicStats stats, CollectionStatistics collectionStats, TermStatistics termStats)
 {
     base.FillBasicStats(stats, collectionStats, termStats);

     // The probability is computed from the stats the base call has just filled in.
     var languageModelStats = (LMStats)stats;
     languageModelStats.CollectionProbability = collectionModel.ComputeProbability(stats);
 }
Пример #7
0
 /// <summary>
 /// Computes a score factor for a phrase.
 ///
 /// <para/>
 /// This default implementation adds up the idf factor of every term in the phrase
 /// and records each term's factor as a detail of the returned explanation.
 /// </summary>
 /// <param name="collectionStats"> Collection-level statistics; supplies the maximum document count. </param>
 /// <param name="termStats"> Term-level statistics for the terms in the phrase. </param>
 /// <returns> An <see cref="Explanation"/> whose value is the summed idf factor and whose
 ///         details hold one explanation per term. </returns>
 public virtual Explanation IdfExplain(CollectionStatistics collectionStats, TermStatistics[] termStats)
 {
     long maxDocs = collectionStats.MaxDoc();

     Explanation summary = new Explanation();
     summary.Description = "idf(), sum of:";

     float idfTotal = 0.0f;
     for (int k = 0; k < termStats.Length; k++)
     {
         long docFreq = termStats[k].DocFreq();
         float partialIdf = Idf(docFreq, maxDocs);
         string detail = "idf(docFreq=" + docFreq + ", maxDocs=" + maxDocs + ")";

         summary.AddDetail(new Explanation(partialIdf, detail));
         idfTotal += partialIdf;
     }

     summary.Value = idfTotal;
     return summary;
 }
Пример #8
0
 /// <summary>
 /// Produces the idf score factor for a single term, together with an explanation
 /// of how that factor was obtained.
 ///
 /// <para/>
 /// This default implementation evaluates:
 ///
 /// <code>
 /// Idf(termStats.DocFreq(), collectionStats.MaxDoc());
 /// </code>
 ///
 /// The collection's <c>MaxDoc()</c> is used rather than the index reader's document
 /// count because it is paired with the term's <c>DocFreq()</c>: when the document
 /// frequency is inaccurate, <c>MaxDoc()</c> is inaccurate in the same direction.
 /// <c>MaxDoc()</c> is also more efficient to compute.
 /// </summary>
 /// <param name="collectionStats"> Collection-level statistics; supplies the maximum document count. </param>
 /// <param name="termStats"> Term-level statistics; supplies the document frequency. </param>
 /// <returns> An <see cref="Explanation"/> whose value is the idf factor and whose
 ///           description lists the document frequency and maximum document count used. </returns>
 public virtual Explanation IdfExplain(CollectionStatistics collectionStats, TermStatistics termStats)
 {
     long docFreq = termStats.DocFreq();
     long maxDocs = collectionStats.MaxDoc();

     float factor = Idf(docFreq, maxDocs);
     string description = "idf(docFreq=" + docFreq + ", maxDocs=" + maxDocs + ")";

     return new Explanation(factor, description);
 }
Пример #9
0
        /// <summary>
        /// Delegates to the base implementation for the usual statistics and then records
        /// the collection probability of the current term, obtained from the collection model.
        /// </summary>
        /// <param name="stats"> The statistics object to populate; it is cast to <c>LMStats</c>. </param>
        /// <param name="collectionStats"> Collection-level statistics, forwarded to the base implementation. </param>
        /// <param name="termStats"> Term-level statistics, forwarded to the base implementation. </param>
        protected internal override void FillBasicStats(BasicStats stats, CollectionStatistics collectionStats, TermStatistics termStats)
        {
            base.FillBasicStats(stats, collectionStats, termStats);

            // The cast is evaluated before the probability is computed, so a wrong stats type fails first.
            ((LMStats)stats).CollectionProbability = m_collectionModel.ComputeProbability(stats);
        }
Пример #10
0
        /// <summary>
        /// Fills all member fields defined in <c>BasicStats</c> on <paramref name="stats"/>
        /// from the given collection-level and term-level statistics.
        /// Subclasses can override this method to fill additional stats.
        /// </summary>
        /// <param name="stats"> The statistics object to populate. </param>
        /// <param name="collectionStats"> Collection-level statistics for the field. </param>
        /// <param name="termStats"> Term-level statistics for the term. </param>
        protected internal virtual void FillBasicStats(BasicStats stats, CollectionStatistics collectionStats, TermStatistics termStats)
        {
            // #positions(field) must be >= #positions(term), unless the field total is -1
            Debug.Assert(
                collectionStats.SumTotalTermFreq() == -1 ||
                collectionStats.SumTotalTermFreq() >= termStats.TotalTermFreq());

            long documentCount = collectionStats.MaxDoc;
            long documentFrequency = termStats.DocFreq();

            long termFrequencyTotal = termStats.TotalTermFreq();
            if (termFrequencyTotal == -1)
            {
                // The codec does not supply totalTermFreq: use docFreq in its place.
                termFrequencyTotal = documentFrequency;
            }

            long fieldTokens;
            float meanFieldLength;

            long fieldFrequencyTotal = collectionStats.SumTotalTermFreq();
            if (fieldFrequencyTotal > 0)
            {
                fieldTokens = fieldFrequencyTotal;
                meanFieldLength = (float)fieldTokens / documentCount;
            }
            else
            {
                // The field does not exist, the codec does not supply these measures, or
                // frequencies were omitted for the field. Something must still be provided,
                // because negative values cause NaN/Inf for some scorers.
                fieldTokens = documentFrequency;
                meanFieldLength = 1;
            }

            // TODO: add sumDocFreq for field (numberOfFieldPostings)
            stats.NumberOfDocuments = documentCount;
            stats.NumberOfFieldTokens = fieldTokens;
            stats.AvgFieldLength = meanFieldLength;
            stats.DocFreq = documentFrequency;
            stats.TotalTermFreq = termFrequencyTotal;
        }