public SpanWeight(SpanQuery query, IndexSearcher searcher)
{
    this.m_similarity = searcher.Similarity;
    this.m_query = query;

    m_termContexts = new Dictionary<Term, TermContext>();
    ISet<Term> terms = new JCG.SortedSet<Term>();
    query.ExtractTerms(terms);
    IndexReaderContext context = searcher.TopReaderContext;
    TermStatistics[] termStats = new TermStatistics[terms.Count];
    int i = 0;
    foreach (Term term in terms)
    {
        TermContext state = TermContext.Build(context, term);
        termStats[i] = searcher.TermStatistics(term, state);
        m_termContexts[term] = state;
        i++;
    }
    string field = query.Field;
    if (field != null)
    {
        m_stats = m_similarity.ComputeWeight(query.Boost, searcher.CollectionStatistics(field), termStats);
    }
}
/// <summary> /// Computes a score factor for a simple term and returns an explanation /// for that score factor. /// /// <para/> /// The default implementation uses: /// /// <code> /// Idf(docFreq, searcher.MaxDoc); /// </code> /// /// Note that <see cref="CollectionStatistics.MaxDoc"/> is used instead of /// <see cref="Lucene.Net.Index.IndexReader.NumDocs"/> because also /// <see cref="TermStatistics.DocFreq"/> is used, and when the latter /// is inaccurate, so is <see cref="CollectionStatistics.MaxDoc"/>, and in the same direction. /// In addition, <see cref="CollectionStatistics.MaxDoc"/> is more efficient to compute /// </summary> /// <param name="collectionStats"> Collection-level statistics </param> /// <param name="termStats"> Term-level statistics for the term </param> /// <returns> An Explain object that includes both an idf score factor /// and an explanation for the term. </returns> public virtual Explanation IdfExplain(CollectionStatistics collectionStats, TermStatistics termStats) { long df = termStats.DocFreq; long max = collectionStats.MaxDoc; float idf = Idf(df, max); return(new Explanation(idf, "idf(docFreq=" + df + ", maxDocs=" + max + ")")); }
public PhraseWeight(PhraseQuery outerInstance, IndexSearcher searcher)
{
    this.outerInstance = outerInstance;
    this.similarity = searcher.Similarity;
    IndexReaderContext context = searcher.TopReaderContext;
    states = new TermContext[outerInstance.terms.Count];
    TermStatistics[] termStats = new TermStatistics[outerInstance.terms.Count];
    for (int i = 0; i < outerInstance.terms.Count; i++)
    {
        Term term = outerInstance.terms[i];
        states[i] = TermContext.Build(context, term);
        termStats[i] = searcher.TermStatistics(term, states[i]);
    }
    stats = similarity.ComputeWeight(outerInstance.Boost, searcher.CollectionStatistics(outerInstance.field), termStats);
}
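// Both weight constructors above share the same shape: resolve each term to a
// TermContext, collect per-term TermStatistics, then pass those plus the field's
// CollectionStatistics to Similarity.ComputeWeight. A minimal sketch of that flow
// for a single term; the field "body" and term "lucene" are hypothetical:
Term term = new Term("body", "lucene");
IndexReaderContext context = searcher.TopReaderContext;
TermContext state = TermContext.Build(context, term);             // per-segment lookup state
TermStatistics termStats = searcher.TermStatistics(term, state);  // docFreq/totalTermFreq for the term
CollectionStatistics collectionStats = searcher.CollectionStatistics(term.Field);
Similarity.SimWeight weight = similarity.ComputeWeight(1.0f, collectionStats, termStats);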
/// <summary> /// Fills all member fields defined in {@code BasicStats} in {@code stats}. /// Subclasses can override this method to fill additional stats. /// </summary> protected internal virtual void FillBasicStats(BasicStats stats, CollectionStatistics collectionStats, TermStatistics termStats) { // #positions(field) must be >= #positions(term) Debug.Assert(collectionStats.SumTotalTermFreq() == -1 || collectionStats.SumTotalTermFreq() >= termStats.TotalTermFreq()); long numberOfDocuments = collectionStats.MaxDoc; long docFreq = termStats.DocFreq(); long totalTermFreq = termStats.TotalTermFreq(); // codec does not supply totalTermFreq: substitute docFreq if (totalTermFreq == -1) { totalTermFreq = docFreq; } long numberOfFieldTokens; float avgFieldLength; long sumTotalTermFreq = collectionStats.SumTotalTermFreq(); if (sumTotalTermFreq <= 0) { // field does not exist; // We have to provide something if codec doesnt supply these measures, // or if someone omitted frequencies for the field... negative values cause // NaN/Inf for some scorers. numberOfFieldTokens = docFreq; avgFieldLength = 1; } else { numberOfFieldTokens = sumTotalTermFreq; avgFieldLength = (float)numberOfFieldTokens / numberOfDocuments; } // TODO: add sumDocFreq for field (numberOfFieldPostings) stats.NumberOfDocuments = numberOfDocuments; stats.NumberOfFieldTokens = numberOfFieldTokens; stats.AvgFieldLength = avgFieldLength; stats.DocFreq = docFreq; stats.TotalTermFreq = totalTermFreq; }
// idf used for phrase queries
public override Explanation IdfExplain(CollectionStatistics collectionStats, TermStatistics[] termStats)
{
    return new Explanation(1.0f, "Inexplicable");
}
/// <summary> /// Computes the collection probability of the current term in addition to the /// usual statistics. /// </summary> protected internal override void FillBasicStats(BasicStats stats, CollectionStatistics collectionStats, TermStatistics termStats) { base.FillBasicStats(stats, collectionStats, termStats); LMStats lmStats = (LMStats)stats; lmStats.CollectionProbability = collectionModel.ComputeProbability(stats); }
/// <summary> /// Computes a score factor for a phrase. /// /// <p> /// The default implementation sums the idf factor for /// each term in the phrase. /// </summary> /// <param name="collectionStats"> collection-level statistics </param> /// <param name="termStats"> term-level statistics for the terms in the phrase </param> /// <returns> an Explain object that includes both an idf /// score factor for the phrase and an explanation /// for each term. </returns> public virtual Explanation IdfExplain(CollectionStatistics collectionStats, TermStatistics[] termStats) { long max = collectionStats.MaxDoc(); float idf = 0.0f; Explanation exp = new Explanation(); exp.Description = "idf(), sum of:"; foreach (TermStatistics stat in termStats) { long df = stat.DocFreq(); float termIdf = Idf(df, max); exp.AddDetail(new Explanation(termIdf, "idf(docFreq=" + df + ", maxDocs=" + max + ")")); idf += termIdf; } exp.Value = idf; return exp; }
/// <summary> /// Computes a score factor for a simple term and returns an explanation /// for that score factor. /// /// <p> /// The default implementation uses: /// /// <pre class="prettyprint"> /// idf(docFreq, searcher.maxDoc()); /// </pre> /// /// Note that <seealso cref="CollectionStatistics#maxDoc()"/> is used instead of /// <seealso cref="Lucene.Net.Index.IndexReader#numDocs() IndexReader#numDocs()"/> because also /// <seealso cref="TermStatistics#docFreq()"/> is used, and when the latter /// is inaccurate, so is <seealso cref="CollectionStatistics#maxDoc()"/>, and in the same direction. /// In addition, <seealso cref="CollectionStatistics#maxDoc()"/> is more efficient to compute /// </summary> /// <param name="collectionStats"> collection-level statistics </param> /// <param name="termStats"> term-level statistics for the term </param> /// <returns> an Explain object that includes both an idf score factor /// and an explanation for the term. </returns> public virtual Explanation IdfExplain(CollectionStatistics collectionStats, TermStatistics termStats) { long df = termStats.DocFreq(); long max = collectionStats.MaxDoc(); float idf = Idf(df, max); return new Explanation(idf, "idf(docFreq=" + df + ", maxDocs=" + max + ")"); }
/// <summary> /// Computes the collection probability of the current term in addition to the /// usual statistics. /// </summary> protected internal override void FillBasicStats(BasicStats stats, CollectionStatistics collectionStats, TermStatistics termStats) { base.FillBasicStats(stats, collectionStats, termStats); LMStats lmStats = (LMStats)stats; lmStats.CollectionProbability = m_collectionModel.ComputeProbability(stats); }