예제 #1
0
        /// <summary>
        /// Builds the inverse-document-frequency (idf) score factor for a single
        /// term, together with a human-readable explanation of that factor.
        /// <para>
        /// The factor is obtained from <c>Idf(docFreq, maxDoc)</c>, where
        /// <c>docFreq</c> is read from <see cref="TermStatistics.DocFreq()"/> and
        /// <c>maxDoc</c> from <see cref="CollectionStatistics.MaxDoc"/>.
        /// </para>
        /// <para>
        /// <c>MaxDoc</c> is used rather than <c>IndexReader.NumDocs</c> because it is
        /// paired with <c>DocFreq</c>: when the document frequency is inaccurate,
        /// <c>MaxDoc</c> is inaccurate too, and in the same direction. <c>MaxDoc</c>
        /// is also more efficient to compute.
        /// </para>
        /// </summary>
        /// <param name="collectionStats"> collection-level statistics </param>
        /// <param name="termStats"> term-level statistics for the term </param>
        /// <returns> an <see cref="Explanation"/> whose value is the idf score factor
        ///           and whose description lists the docFreq and maxDocs inputs. </returns>
        public virtual Explanation IdfExplain(CollectionStatistics collectionStats, TermStatistics termStats)
        {
            long docFreq = termStats.DocFreq();
            long maxDoc = collectionStats.MaxDoc;

            // Compute the factor first, then describe the inputs that produced it.
            float factor = Idf(docFreq, maxDoc);
            string description = "idf(docFreq=" + docFreq + ", maxDocs=" + maxDoc + ")";

            return new Explanation(factor, description);
        }
예제 #2
0
        /// <summary>
        /// Populates every member of <paramref name="stats"/> that is defined by
        /// <c>BasicStats</c>: number of documents, number of field tokens, average
        /// field length, document frequency and total term frequency.
        /// Subclasses can override this method to fill additional stats.
        /// <para>
        /// When the total term frequency is reported as <c>-1</c> (not supplied by the
        /// codec) the document frequency is used in its place. When the field's summed
        /// total term frequency is zero or negative, the token count falls back to the
        /// document frequency and the average field length to <c>1</c>, because
        /// negative values cause NaN/Inf for some scorers.
        /// </para>
        /// </summary>
        /// <param name="stats"> the statistics object to populate </param>
        /// <param name="collectionStats"> collection-level statistics for the field </param>
        /// <param name="termStats"> term-level statistics for the term </param>
        protected internal virtual void FillBasicStats(BasicStats stats, CollectionStatistics collectionStats, TermStatistics termStats)
        {
            // Invariant: #positions(field) must be >= #positions(term), unless the field total is unavailable (-1).
            Debug.Assert(collectionStats.SumTotalTermFreq() == -1 || collectionStats.SumTotalTermFreq() >= termStats.TotalTermFreq());

            long maxDoc = collectionStats.MaxDoc;
            long termDocFreq = termStats.DocFreq();
            long reportedTotalTermFreq = termStats.TotalTermFreq();
            long fieldTermFreqSum = collectionStats.SumTotalTermFreq();

            // Codec does not supply totalTermFreq (-1): substitute docFreq.
            long termTotalFreq = reportedTotalTermFreq == -1 ? termDocFreq : reportedTotalTermFreq;

            // A non-positive sum means the field does not exist, the codec does not
            // supply the measure, or frequencies were omitted for the field. Something
            // sane still has to be provided, since negative values cause NaN/Inf for
            // some scorers.
            bool hasFieldTotals = fieldTermFreqSum > 0;
            long fieldTokens = hasFieldTotals ? fieldTermFreqSum : termDocFreq;
            float averageLength = hasFieldTotals ? (float)fieldTokens / maxDoc : 1;

            // TODO: add sumDocFreq for field (numberOfFieldPostings)
            stats.NumberOfDocuments = maxDoc;
            stats.NumberOfFieldTokens = fieldTokens;
            stats.AvgFieldLength = averageLength;
            stats.DocFreq = termDocFreq;
            stats.TotalTermFreq = termTotalFreq;
        }
예제 #3
0
 /// <summary>
 /// Produces the idf (inverse document frequency) score factor for a simple
 /// term and wraps it in an explanation describing how it was derived.
 /// <para>
 /// The value is computed as <c>Idf(docFreq, maxDoc)</c>, with <c>docFreq</c>
 /// taken from <see cref="TermStatistics.DocFreq()"/> and <c>maxDoc</c> taken
 /// from <see cref="CollectionStatistics.MaxDoc()"/>.
 /// </para>
 /// <para>
 /// <c>MaxDoc()</c> is preferred over <c>IndexReader.NumDocs()</c> because
 /// <c>DocFreq()</c> is used alongside it: whenever the document frequency is
 /// inaccurate, <c>MaxDoc()</c> is inaccurate in the same direction. In
 /// addition, <c>MaxDoc()</c> is more efficient to compute.
 /// </para>
 /// </summary>
 /// <param name="collectionStats"> collection-level statistics </param>
 /// <param name="termStats"> term-level statistics for the term </param>
 /// <returns> an <see cref="Explanation"/> carrying the idf score factor and a
 ///           description of the docFreq and maxDocs values used. </returns>
 public virtual Explanation IdfExplain(CollectionStatistics collectionStats, TermStatistics termStats)
 {
     long termDocCount = termStats.DocFreq();
     long collectionMaxDoc = collectionStats.MaxDoc();

     // Score factor first; the description below echoes the same two inputs.
     float score = Idf(termDocCount, collectionMaxDoc);
     string summary = "idf(docFreq=" + termDocCount + ", maxDocs=" + collectionMaxDoc + ")";

     return new Explanation(score, summary);
 }
예제 #4
0
        /// <summary>
        /// Copies collection- and term-level statistics into the <c>BasicStats</c>
        /// members of <paramref name="stats"/>, substituting fallback values where a
        /// measure is missing. Subclasses can override this method to fill
        /// additional stats.
        /// <para>
        /// Fallbacks: a total term frequency of <c>-1</c> is replaced by the document
        /// frequency; a summed total term frequency that is zero or negative yields a
        /// token count equal to the document frequency and an average field length
        /// of <c>1</c>.
        /// </para>
        /// </summary>
        /// <param name="stats"> the statistics object to populate </param>
        /// <param name="collectionStats"> collection-level statistics for the field </param>
        /// <param name="termStats"> term-level statistics for the term </param>
        protected internal virtual void FillBasicStats(BasicStats stats, CollectionStatistics collectionStats, TermStatistics termStats)
        {
            // #positions(field) must be >= #positions(term); -1 marks an unavailable field total.
            Debug.Assert(collectionStats.SumTotalTermFreq() == -1 || collectionStats.SumTotalTermFreq() >= termStats.TotalTermFreq());

            long docCount = collectionStats.MaxDoc;
            long termDocFreq = termStats.DocFreq();

            // The codec may not supply totalTermFreq (-1); docFreq stands in for it.
            long termFreqTotal = termStats.TotalTermFreq();
            termFreqTotal = termFreqTotal == -1 ? termDocFreq : termFreqTotal;

            long fieldFreqSum = collectionStats.SumTotalTermFreq();

            long fieldTokenCount;
            float meanFieldLength;
            if (fieldFreqSum > 0)
            {
                // Normal case: the field reports its token total.
                fieldTokenCount = fieldFreqSum;
                meanFieldLength = (float)fieldTokenCount / docCount;
            }
            else
            {
                // Field does not exist, the codec does not supply these measures, or
                // frequencies were omitted for the field. Provide safe substitutes:
                // negative values cause NaN/Inf for some scorers.
                fieldTokenCount = termDocFreq;
                meanFieldLength = 1;
            }

            // TODO: add sumDocFreq for field (numberOfFieldPostings)
            stats.NumberOfDocuments = docCount;
            stats.NumberOfFieldTokens = fieldTokenCount;
            stats.AvgFieldLength = meanFieldLength;
            stats.DocFreq = termDocFreq;
            stats.TotalTermFreq = termFreqTotal;
        }