/// <summary>
/// Computes the idf score factor for a single term and wraps it in an
/// <see cref="Explanation"/> describing how the value was obtained.
/// <para>
/// The default implementation evaluates:
/// </para>
/// <code>
/// Idf(docFreq, maxDoc);
/// </code>
/// <para>
/// <see cref="CollectionStatistics.MaxDoc"/> is used rather than the index reader's
/// <c>numDocs</c> count because <see cref="TermStatistics.DocFreq()"/> is used as well:
/// when the latter is inaccurate, so is <see cref="CollectionStatistics.MaxDoc"/>, and in
/// the same direction. In addition, <see cref="CollectionStatistics.MaxDoc"/> is more
/// efficient to compute.
/// </para>
/// </summary>
/// <param name="collectionStats">Collection-level statistics.</param>
/// <param name="termStats">Term-level statistics for the term.</param>
/// <returns>
/// An <see cref="Explanation"/> carrying the idf score factor as its value and a
/// description of the form <c>idf(docFreq=..., maxDocs=...)</c>.
/// </returns>
public virtual Explanation IdfExplain(CollectionStatistics collectionStats, TermStatistics termStats)
{
    long docFreq = termStats.DocFreq();
    long maxDocs = collectionStats.MaxDoc;

    // Compute the factor before building the description so the call order
    // (statistics first, then Idf) stays the same as it always was.
    float idfValue = Idf(docFreq, maxDocs);
    string description = "idf(docFreq=" + docFreq + ", maxDocs=" + maxDocs + ")";

    return new Explanation(idfValue, description);
}
/// <summary>
/// Populates every member field declared on <see cref="BasicStats"/> in
/// <paramref name="stats"/>. Subclasses can override this method to fill in
/// additional statistics.
/// </summary>
/// <param name="stats">The statistics object to populate.</param>
/// <param name="collectionStats">Collection-level statistics for the field.</param>
/// <param name="termStats">Term-level statistics for the term.</param>
protected internal virtual void FillBasicStats(BasicStats stats, CollectionStatistics collectionStats, TermStatistics termStats)
{
    // Sanity check: #positions(field) must be >= #positions(term), unless the
    // field-level total is reported as -1.
    Debug.Assert(collectionStats.SumTotalTermFreq() == -1 || collectionStats.SumTotalTermFreq() >= termStats.TotalTermFreq());

    long docCount = collectionStats.MaxDoc;
    long termDocFreq = termStats.DocFreq();

    // A value of -1 means the codec does not supply totalTermFreq: substitute docFreq.
    long termTotalFreq = termStats.TotalTermFreq();
    if (termTotalFreq == -1)
    {
        termTotalFreq = termDocFreq;
    }

    // When the field-level total is missing or non-positive (field does not exist,
    // the codec does not supply the measure, or frequencies were omitted for the
    // field) we still have to provide something: negative values cause NaN/Inf
    // for some scorers. Fall back to docFreq tokens and an average length of 1.
    long fieldTotalFreq = collectionStats.SumTotalTermFreq();
    bool hasFieldTotals = fieldTotalFreq > 0;
    long fieldTokenCount = hasFieldTotals ? fieldTotalFreq : termDocFreq;
    float averageLength = hasFieldTotals ? (float)fieldTotalFreq / docCount : 1f;

    // TODO: add sumDocFreq for field (numberOfFieldPostings)
    stats.NumberOfDocuments = docCount;
    stats.NumberOfFieldTokens = fieldTokenCount;
    stats.AvgFieldLength = averageLength;
    stats.DocFreq = termDocFreq;
    stats.TotalTermFreq = termTotalFreq;
}
/// <summary>
/// Computes a score factor for a simple term and returns an explanation
/// for that score factor.
/// <para>
/// The default implementation uses:
/// </para>
/// <code>
/// Idf(docFreq, maxDoc);
/// </code>
/// <para>
/// Note that <see cref="CollectionStatistics.MaxDoc"/> is used instead of the index
/// reader's <c>numDocs</c> count because <see cref="TermStatistics.DocFreq()"/> is also
/// used, and when the latter is inaccurate, so is <see cref="CollectionStatistics.MaxDoc"/>,
/// and in the same direction. In addition, <see cref="CollectionStatistics.MaxDoc"/> is
/// more efficient to compute.
/// </para>
/// </summary>
/// <param name="collectionStats">Collection-level statistics.</param>
/// <param name="termStats">Term-level statistics for the term.</param>
/// <returns>
/// An <see cref="Explanation"/> that includes both an idf score factor
/// and an explanation for the term.
/// </returns>
public virtual Explanation IdfExplain(CollectionStatistics collectionStats, TermStatistics termStats)
{
    long df = termStats.DocFreq();
    // MaxDoc is read as a property (no call parentheses), matching how the
    // rest of the similarity code accesses CollectionStatistics.
    long max = collectionStats.MaxDoc;
    float idf = Idf(df, max);
    return new Explanation(idf, "idf(docFreq=" + df + ", maxDocs=" + max + ")");
}