/// <summary>
/// Inf1 for the I(ne) model: uses the expected number of documents containing
/// the term (under a binomial assumption) instead of the raw document frequency.
/// </summary>
public override sealed float Score(BasicStats stats, float tfn)
{
    long docCount = stats.NumberOfDocuments;
    long totalFreq = stats.TotalTermFreq;
    // ne = N * (1 - ((N - 1) / N)^F): expected docs containing the term
    double expectedDocs = docCount * (1 - Math.Pow((docCount - 1) / (double)docCount, totalFreq));
    return tfn * (float)SimilarityBase.Log2((docCount + 1) / (expectedDocs + 0.5));
}
/// <summary>
/// Bose-Einstein inf1 computed via the log-factorial-style helper f().
/// Both totals are inflated (F by tfn + 1, N by F) because the approximation
/// is only stable when F is small relative to N.
/// NOTE(review): f() is defined elsewhere in this class -- confirm its
/// contract before altering this formula.
/// </summary>
public override sealed float Score(BasicStats stats, float tfn)
{
    double F = stats.TotalTermFreq + 1 + tfn;
    // approximation only holds true when F << N, so we use N += F
    double N = F + stats.NumberOfDocuments;
    return (float)(-SimilarityBase.Log2((N - 1) * Math.E) + f(N + F - 1, N + F - tfn - 2) - f(F, F - tfn));
}
/// <summary>
/// Smoothed power-law distribution score. Lambda is clamped away from 1 to
/// avoid a zero denominator in (1 - lambda).
/// </summary>
public override sealed float Score(BasicStats stats, float tfn, float lambda)
{
    if (lambda == 1f)
    {
        lambda = 0.99f;
    }
    double numerator = Math.Pow(lambda, tfn / (tfn + 1)) - lambda;
    double ratio = numerator / (1 - lambda);
    return (float)-Math.Log(ratio);
}
/// <summary>
/// The field has fewer tokens than documents (average length below 1) and
/// every matching document contains the term exactly once.
/// </summary>
public virtual void TestVerySparseDocuments()
{
    BasicStats sparseStats = CreateStats();
    sparseStats.NumberOfFieldTokens = sparseStats.NumberOfDocuments * 2 / 3;
    sparseStats.AvgFieldLength = (float)sparseStats.NumberOfFieldTokens / sparseStats.NumberOfDocuments;
    sparseStats.TotalTermFreq = sparseStats.DocFreq;
    UnitTestCore(sparseStats, FREQ, DOC_LEN);
}
/// <summary>
/// Makes 60% of all documents contain the term and scales the collection
/// term frequency by the same proportion.
/// </summary>
public virtual void TestMostDocumentsRelevant()
{
    BasicStats s = CreateStats();
    // The scale factor must be computed from the original DocFreq before
    // DocFreq is overwritten below.
    float scale = (0.6f * s.NumberOfDocuments) / s.DocFreq;
    s.TotalTermFreq = (int)(s.TotalTermFreq * scale);
    s.DocFreq = (int)(s.NumberOfDocuments * 0.6);
    UnitTestCore(s, FREQ, DOC_LEN);
}
/// <summary>
/// Every token in the field is an occurrence of the term; the core is
/// exercised twice, once with a larger average field length.
/// </summary>
public virtual void TestAllTermsRelevant()
{
    BasicStats s = CreateStats();
    s.TotalTermFreq = s.NumberOfFieldTokens;
    UnitTestCore(s, DOC_LEN, DOC_LEN);
    s.AvgFieldLength = DOC_LEN + 10;
    UnitTestCore(s, DOC_LEN, DOC_LEN);
}
/// <summary>
/// Attaches the query boost (only when it is not 1) and the smoothing
/// parameter lambda to the explanation, then delegates to the base class.
/// </summary>
protected internal override void Explain(Explanation expl, BasicStats stats, int doc, float freq, float docLen)
{
    if (stats.TotalBoost != 1.0f)
    {
        expl.AddDetail(new Explanation(stats.TotalBoost, "boost"));
    }
    expl.AddDetail(new Explanation(lambda, "lambda"));
    base.Explain(expl, stats, doc, freq, docLen);
}
/// <summary>
/// Builds an explanation holding the score plus the normalized term
/// frequency it was computed from.
/// </summary>
public override sealed Explanation Explain(BasicStats stats, float tfn)
{
    Explanation expl = new Explanation();
    expl.Value = Score(stats, tfn);
    expl.Description = this.GetType().Name + ", computed from: ";
    expl.AddDetail(new Explanation(tfn, "tfn"));
    return expl;
}
/// <summary>
/// Geometric (Bose-Einstein) approximation. As in the BE model the
/// approximation only holds when F is small relative to N, so
/// lambda = F / (N + F) is used.
/// </summary>
public override sealed float Score(BasicStats stats, float tfn)
{
    double totalFreq = stats.TotalTermFreq + 1;
    double docCount = stats.NumberOfDocuments;
    double lambda = totalFreq / (docCount + totalFreq);
    // -log(1 / (lambda + 1)) -> log(lambda + 1)
    double inf = SimilarityBase.Log2(lambda + 1) + tfn * SimilarityBase.Log2((1 + lambda) / lambda);
    return (float)inf;
}
/// <summary>
/// The whole collection holds a single token: one occurrence of the term in
/// one document.
/// </summary>
public virtual void TestOnlyOneTerm()
{
    BasicStats s = CreateStats();
    s.NumberOfFieldTokens = 1;
    s.AvgFieldLength = 1.0f / s.NumberOfDocuments;
    s.DocFreq = 1;
    s.TotalTermFreq = 1;
    UnitTestCore(s, 1, DOC_LEN);
}
/// <summary>
/// Creates one filled <c>BasicStats</c> per query term; multi-term queries
/// get the per-term stats wrapped in a <c>MultiStats</c>.
/// </summary>
public override sealed SimWeight ComputeWeight(float queryBoost, CollectionStatistics collectionStats, params TermStatistics[] termStats)
{
    BasicStats[] allStats = new BasicStats[termStats.Length];
    for (int termIndex = 0; termIndex < termStats.Length; termIndex++)
    {
        BasicStats termBasicStats = NewStats(collectionStats.Field(), queryBoost);
        FillBasicStats(termBasicStats, collectionStats, termStats[termIndex]);
        allStats[termIndex] = termBasicStats;
    }
    if (allStats.Length == 1)
    {
        return allStats[0];
    }
    return new MultiSimilarity.MultiStats(allStats) as SimWeight;
}
/// <summary>
/// Explains the lambda estimate by listing the two statistics it is derived
/// from.
/// </summary>
public override sealed Explanation Explain(BasicStats stats)
{
    Explanation expl = new Explanation();
    expl.Value = CalculateLambda(stats);
    expl.Description = this.GetType().Name + ", computed from: ";
    expl.AddDetail(new Explanation(stats.TotalTermFreq, "totalTermFreq"));
    expl.AddDetail(new Explanation(stats.NumberOfDocuments, "numberOfDocuments"));
    return expl;
}
/// <summary>
/// Builds one filled <c>BasicStats</c> per query term; only queries with more
/// than one term are wrapped in a <c>MultiStats</c>.
/// </summary>
public override sealed SimWeight ComputeWeight(float queryBoost, CollectionStatistics collectionStats, params TermStatistics[] termStats)
{
    int termCount = termStats.Length;
    BasicStats[] perTermStats = new BasicStats[termCount];
    for (int i = 0; i < termCount; i++)
    {
        perTermStats[i] = NewStats(collectionStats.Field(), queryBoost);
        FillBasicStats(perTermStats[i], collectionStats, termStats[i]);
    }
    return perTermStats.Length == 1 ? perTermStats[0] : new MultiSimilarity.MultiStats(perTermStats) as SimWeight;
}
/// <summary>
/// Returns an explanation for the normalized term frequency.
/// <p>The default normalization methods use the field length of the document
/// and the average field length to compute the normalized term frequency.
/// This method provides a generic explanation for such methods; subclasses
/// that use other statistics must override it.</p>
/// </summary>
public virtual Explanation Explain(BasicStats stats, float tf, float len)
{
    Explanation expl = new Explanation();
    expl.Value = Tfn(stats, tf, len);
    expl.Description = this.GetType().Name + ", computed from: ";
    expl.AddDetail(new Explanation(tf, "tf"));
    expl.AddDetail(new Explanation(stats.AvgFieldLength, "avgFieldLength"));
    expl.AddDetail(new Explanation(len, "len"));
    return expl;
}
/// <summary>
/// Creates the default statistics object that the specific tests modify.
/// All fields start from the shared test constants.
/// </summary>
private BasicStats CreateStats()
{
    BasicStats stats = new BasicStats("spoof", 1)
    {
        NumberOfDocuments = NUMBER_OF_DOCUMENTS,
        NumberOfFieldTokens = NUMBER_OF_FIELD_TOKENS,
        AvgFieldLength = AVG_FIELD_LENGTH,
        DocFreq = DOC_FREQ,
        TotalTermFreq = TOTAL_TERM_FREQ
    };
    return stats;
}
/// <summary>
/// Ten documents, but all 50 occurrences of the term sit in one document
/// consisting of nothing but this term.
/// </summary>
public virtual void TestAllTermsRelevantOnlyOneDocument()
{
    BasicStats s = CreateStats();
    s.NumberOfDocuments = 10;
    s.NumberOfFieldTokens = 50;
    s.AvgFieldLength = 5;
    s.DocFreq = 1;
    s.TotalTermFreq = 50;
    UnitTestCore(s, 50, 50);
}
/// <summary>
/// Returns an explanation for the score.
/// <para>Most basic models use the number of documents and the total term
/// frequency to compute Inf<sub>1</sub>; this method provides a generic
/// explanation for such models. Subclasses that use other statistics must
/// override it.</para>
/// </summary>
public virtual Explanation Explain(BasicStats stats, float tfn)
{
    Explanation expl = new Explanation();
    expl.Value = Score(stats, tfn);
    expl.Description = this.GetType().Name + ", computed from: ";
    expl.AddDetail(new Explanation(tfn, "tfn"));
    expl.AddDetail(new Explanation(stats.NumberOfDocuments, "numberOfDocuments"));
    expl.AddDetail(new Explanation(stats.TotalTermFreq, "totalTermFreq"));
    return expl;
}
/// <summary>
/// Degenerate collection: exactly one document, which also defines all the
/// field-level statistics.
/// </summary>
public virtual void TestOneDocument()
{
    BasicStats s = CreateStats();
    s.NumberOfDocuments = 1;
    s.NumberOfFieldTokens = DOC_LEN;
    s.AvgFieldLength = DOC_LEN;
    s.DocFreq = 1;
    s.TotalTermFreq = (int)FREQ;
    UnitTestCore(s, FREQ, DOC_LEN);
}
/// <summary>
/// Explains the score in the format <em>score(name-of-similarity,
/// doc=doc-id, freq=term-frequency), computed from:</em>, attaching the
/// computed score and the term-frequency explanation. Subclasses add further
/// details via the protected <c>Explain</c> overload.
/// </summary>
/// <param name="stats"> the corpus level statistics. </param>
/// <param name="doc"> the document id. </param>
/// <param name="freq"> the term frequency and its explanation. </param>
/// <param name="docLen"> the document length. </param>
/// <returns> the explanation. </returns>
public virtual Explanation Explain(BasicStats stats, int doc, Explanation freq, float docLen)
{
    Explanation expl = new Explanation();
    expl.Value = Score(stats, freq.Value, docLen);
    expl.Description = "score(" + this.GetType().Name + ", doc=" + doc + ", freq=" + freq.Value + "), computed from:";
    expl.AddDetail(freq);
    Explain(expl, stats, doc, freq.Value, docLen);
    return expl;
}
/// <summary>
/// Divergence-from-randomness D model: a KL-style divergence D times the
/// (normalized) total term frequency, plus a normalization term.
/// </summary>
public override sealed float Score(BasicStats stats, float tfn)
{
    // we have to ensure phi is always < 1 for tiny TTF values, otherwise nphi can go negative,
    // resulting in NaN. cleanest way is to unconditionally always add tfn to totalTermFreq
    // to create a 'normalized' F.
    double F = stats.TotalTermFreq + 1 + tfn;
    double phi = (double)tfn / F;
    double nphi = 1 - phi;
    double p = 1.0 / (stats.NumberOfDocuments + 1);
    // D = phi*log2(phi/p) + (1-phi)*log2((1-phi)/(1-p))
    double D = phi * SimilarityBase.Log2(phi / p) + nphi * SimilarityBase.Log2(nphi / (1 - p));
    return (float)(D * F + 0.5 * SimilarityBase.Log2(1 + 2 * Math.PI * tfn * nphi));
}
/// <summary>
/// The generic test core called by all unit test methods. It calls the
/// <seealso cref="SimilarityBase#score(BasicStats, float, float)"/> method of all
/// Similarities in <seealso cref="#sims"/> and checks if the score is valid; i.e. it
/// is a finite positive real number.
/// </summary>
private void UnitTestCore(BasicStats stats, float freq, int docLen)
{
    foreach (SimilarityBase sim in Sims)
    {
        // Round-trip the spoofed stats through ComputeWeight so each
        // similarity fills in its own derived statistics.
        BasicStats realStats = (BasicStats)sim.ComputeWeight(stats.TotalBoost, ToCollectionStats(stats), ToTermStats(stats));
        float score = sim.Score(realStats, freq, docLen);
        float explScore = sim.Explain(realStats, 1, new Explanation(freq, "freq"), docLen).Value;
        Assert.IsFalse(float.IsInfinity(score), "Score infinite: " + sim.ToString());
        Assert.IsFalse(float.IsNaN(score), "Score NaN: " + sim.ToString());
        Assert.IsTrue(score >= 0, "Score negative: " + sim.ToString());
        // Score() and Explain() must agree within epsilon.
        Assert.AreEqual(score, explScore, FLOAT_EPSILON, "score() and explain() return different values: " + sim.ToString());
    }
}
/// <summary>
/// Divergence-from-randomness D model: a KL-style divergence D times the
/// (normalized) total term frequency, plus a normalization term.
/// </summary>
public override sealed float Score(BasicStats stats, float tfn)
{
    // we have to ensure phi is always < 1 for tiny TTF values, otherwise nphi can go negative,
    // resulting in NaN. cleanest way is to unconditionally always add tfn to totalTermFreq
    // to create a 'normalized' F.
    double F = stats.TotalTermFreq + 1 + tfn;
    double phi = (double)tfn / F;
    double nphi = 1 - phi;
    double p = 1.0 / (stats.NumberOfDocuments + 1);
    // D = phi*log2(phi/p) + (1-phi)*log2((1-phi)/(1-p))
    double D = phi * SimilarityBase.Log2(phi / p) + nphi * SimilarityBase.Log2(nphi / (1 - p));
    return((float)(D * F + 0.5 * SimilarityBase.Log2(1 + 2 * Math.PI * tfn * nphi)));
}
/// <summary>
/// IB explanation: boost (when present), the tf normalization, lambda, and
/// the distribution evaluated at those two values.
/// </summary>
protected internal override void Explain(Explanation expl, BasicStats stats, int doc, float freq, float docLen)
{
    if (stats.TotalBoost != 1.0f)
    {
        expl.AddDetail(new Explanation(stats.TotalBoost, "boost"));
    }
    Explanation normalizationDetail = m_normalization.Explain(stats, freq, docLen);
    Explanation lambdaDetail = m_lambda.Explain(stats);
    expl.AddDetail(normalizationDetail);
    expl.AddDetail(lambdaDetail);
    expl.AddDetail(m_distribution.Explain(stats, normalizationDetail.Value, lambdaDetail.Value));
}
/// <summary>
/// DFR explanation: boost (when present), the tf normalization, then the
/// basic model and after effect evaluated at the normalized tf.
/// </summary>
protected internal override void Explain(Explanation expl, BasicStats stats, int doc, float freq, float docLen)
{
    if (stats.TotalBoost != 1.0f)
    {
        expl.AddDetail(new Explanation(stats.TotalBoost, "boost"));
    }
    Explanation normalizationDetail = m_normalization.Explain(stats, freq, docLen);
    float normalizedTf = normalizationDetail.Value;
    expl.AddDetail(normalizationDetail);
    expl.AddDetail(m_basicModel.Explain(stats, normalizedTf));
    expl.AddDetail(m_afterEffect.Explain(stats, normalizedTf));
}
/// <summary>
/// Dirichlet-smoothed LM explanation: boost, mu, the term weight and the
/// document-length norm, then the base class details.
/// </summary>
protected internal override void Explain(Explanation expl, BasicStats stats, int doc, float freq, float docLen)
{
    if (stats.TotalBoost != 1.0f)
    {
        expl.AddDetail(new Explanation(stats.TotalBoost, "boost"));
    }
    expl.AddDetail(new Explanation(mu, "mu"));
    Explanation termWeight = new Explanation();
    termWeight.Description = "term weight";
    termWeight.Value = (float)Math.Log(1 + freq / (mu * ((LMStats)stats).CollectionProbability));
    expl.AddDetail(termWeight);
    expl.AddDetail(new Explanation((float)Math.Log(mu / (docLen + mu)), "document norm"));
    base.Explain(expl, stats, doc, freq, docLen);
}
/// <summary>
/// Fills all member fields defined in {@code BasicStats} in {@code stats}.
/// Subclasses can override this method to fill additional stats.
/// </summary>
protected internal virtual void FillBasicStats(BasicStats stats, CollectionStatistics collectionStats, TermStatistics termStats)
{
    // #positions(field) must be >= #positions(term)
    Debug.Assert(collectionStats.SumTotalTermFreq() == -1 || collectionStats.SumTotalTermFreq() >= termStats.TotalTermFreq());
    long numberOfDocuments = collectionStats.MaxDoc;
    long docFreq = termStats.DocFreq();
    long totalTermFreq = termStats.TotalTermFreq();
    // codec does not supply totalTermFreq: substitute docFreq
    if (totalTermFreq == -1)
    {
        totalTermFreq = docFreq;
    }
    long numberOfFieldTokens;
    float avgFieldLength;
    long sumTotalTermFreq = collectionStats.SumTotalTermFreq();
    if (sumTotalTermFreq <= 0)
    {
        // field does not exist;
        // We have to provide something if codec doesnt supply these measures,
        // or if someone omitted frequencies for the field... negative values cause
        // NaN/Inf for some scorers.
        numberOfFieldTokens = docFreq;
        avgFieldLength = 1;
    }
    else
    {
        numberOfFieldTokens = sumTotalTermFreq;
        avgFieldLength = (float)numberOfFieldTokens / numberOfDocuments;
    }
    // TODO: add sumDocFreq for field (numberOfFieldPostings)
    stats.NumberOfDocuments = numberOfDocuments;
    stats.NumberOfFieldTokens = numberOfFieldTokens;
    stats.AvgFieldLength = avgFieldLength;
    stats.DocFreq = docFreq;
    stats.TotalTermFreq = totalTermFreq;
}
/// <summary>
/// Creates a scorer for the given weight: one <c>BasicSimScorer</c> per term,
/// combined by summation for multi-term (e.g. phrase) queries.
/// </summary>
public override SimScorer DoSimScorer(SimWeight stats, AtomicReaderContext context)
{
    if (stats is MultiSimilarity.MultiStats)
    {
        // a multi term query (e.g. phrase). return the summation,
        // scoring almost as if it were boolean query
        SimWeight[] termWeights = ((MultiSimilarity.MultiStats)stats).SubStats;
        SimScorer[] termScorers = new SimScorer[termWeights.Length];
        for (int i = 0; i < termScorers.Length; i++)
        {
            BasicStats termStats = (BasicStats)termWeights[i];
            termScorers[i] = new BasicSimScorer(this, termStats, context.AtomicReader.GetNormValues(termStats.field));
        }
        return new MultiSimilarity.MultiSimScorer(termScorers);
    }
    BasicStats singleStats = (BasicStats)stats;
    return new BasicSimScorer(this, singleStats, context.AtomicReader.GetNormValues(singleStats.field));
}
/// <summary>
/// Computes the collection probability of the current term in addition to
/// the usual statistics.
/// </summary>
protected internal override void FillBasicStats(BasicStats stats, CollectionStatistics collectionStats, TermStatistics termStats)
{
    base.FillBasicStats(stats, collectionStats, termStats);
    ((LMStats)stats).CollectionProbability = collectionModel.ComputeProbability(stats);
}
/// <summary>
/// Subclasses should implement this method to explain the score. {@code expl}
/// already contains the score, the name of the class and the doc id, as well
/// as the term frequency and its explanation; subclasses can add additional
/// clauses to explain details of their scoring formulae.
/// <p>The default implementation intentionally does nothing.</p>
/// </summary>
/// <param name="expl"> the explanation to extend with details. </param>
/// <param name="stats"> the corpus level statistics. </param>
/// <param name="doc"> the document id. </param>
/// <param name="freq"> the term frequency. </param>
/// <param name="docLen"> the document length. </param>
protected internal virtual void Explain(Explanation expl, BasicStats stats, int doc, float freq, float docLen)
{
}
/// <summary>
/// Implements "no normalization": the raw term frequency is returned as-is.
/// </summary>
public override float Tfn(BasicStats stats, float tf, float len)
{
    return tf;
}
/// <summary>
/// Laplace-smoothed ratio of total term occurrences to document count.
/// </summary>
public override sealed float CalculateLambda(BasicStats stats)
{
    float smoothedFreq = stats.TotalTermFreq + 1F;
    float smoothedDocs = stats.NumberOfDocuments + 1F;
    return smoothedFreq / smoothedDocs;
}
/// <summary>
/// Poisson approximation of the binomial model (P model), with Stirling's
/// formula for the factorial term.
/// </summary>
public override sealed float Score(BasicStats stats, float tfn)
{
    // lambda = (F + 1) / (N + 1): smoothed mean of the Poisson
    float lambda = (float)(stats.TotalTermFreq + 1) / (stats.NumberOfDocuments + 1);
    // tfn*log2(tfn/lambda) + (lambda + 1/(12 tfn) - tfn)*log2(e) + 0.5*log2(2 pi tfn)
    return (float)(tfn * SimilarityBase.Log2(tfn / lambda) + (lambda + 1 / (12 * tfn) - tfn) * LOG2_E + 0.5 * SimilarityBase.Log2(2 * Math.PI * tfn));
}
/// <summary>
/// Captures the owning similarity together with the per-term statistics and
/// the norms for the field.
/// </summary>
internal BasicSimScorer(SimilarityBase outerInstance, BasicStats stats, NumericDocValues norms)
{
    this.Stats = stats;
    this.Norms = norms;
    this.OuterInstance = outerInstance;
}
/// <summary>
/// Creates the default statistics object that the specific tests modify.
/// Individual tests overwrite only the fields relevant to their scenario.
/// </summary>
private BasicStats CreateStats()
{
    BasicStats defaults = new BasicStats("spoof", 1);
    defaults.NumberOfDocuments = NUMBER_OF_DOCUMENTS;
    defaults.DocFreq = DOC_FREQ;
    defaults.TotalTermFreq = TOTAL_TERM_FREQ;
    defaults.NumberOfFieldTokens = NUMBER_OF_FIELD_TOKENS;
    defaults.AvgFieldLength = AVG_FIELD_LENGTH;
    return defaults;
}
/// <summary>
/// Exposes the collection-model probability used by the scoring formula.
/// </summary>
protected internal override void Explain(Explanation expl, BasicStats stats, int doc, float freq, float docLen)
{
    float collectionProbability = collectionModel.ComputeProbability(stats);
    expl.AddDetail(new Explanation(collectionProbability, "collection probability"));
}
/// <summary>
/// Dirichlet-smoothed LM explanation: boost, mu, the term weight and the
/// document-length norm, then the base class details.
/// </summary>
protected internal override void Explain(Explanation expl, BasicStats stats, int doc, float freq, float docLen)
{
    if (stats.TotalBoost != 1.0f)
    {
        expl.AddDetail(new Explanation(stats.TotalBoost, "boost"));
    }
    expl.AddDetail(new Explanation(Mu_Renamed, "mu"));
    Explanation termWeightDetail = new Explanation();
    termWeightDetail.Description = "term weight";
    termWeightDetail.Value = (float)Math.Log(1 + freq / (Mu_Renamed * ((LMStats)stats).CollectionProbability));
    expl.AddDetail(termWeightDetail);
    expl.AddDetail(new Explanation((float)Math.Log(Mu_Renamed / (docLen + Mu_Renamed)), "document norm"));
    base.Explain(expl, stats, doc, freq, docLen);
}
/// <summary>
/// Laplace-smoothed maximum-likelihood estimate of the term's probability in
/// the whole field.
/// </summary>
public float ComputeProbability(BasicStats stats)
{
    float occurrences = stats.TotalTermFreq + 1F;
    float fieldTokens = stats.NumberOfFieldTokens + 1F;
    return occurrences / fieldTokens;
}
/// <summary>
/// Wraps the spoofed stats in the term-level statistics API.
/// </summary>
private TermStatistics ToTermStats(BasicStats stats)
{
    BytesRef spoofedTerm = new BytesRef("spoofyText");
    return new TermStatistics(spoofedTerm, stats.DocFreq, stats.TotalTermFreq);
}
/// <summary>
/// Jelinek-Mercer smoothing: interpolates the document model with the
/// collection model using the weight lambda.
/// </summary>
public override float Score(BasicStats stats, float freq, float docLen)
{
    float documentModel = (1 - Lambda_Renamed) * freq / docLen;
    float collectionMass = Lambda_Renamed * ((LMStats)stats).CollectionProbability;
    return stats.TotalBoost * (float)Math.Log(1 + documentModel / collectionMass);
}
/// <summary>
/// Returns the normalized term frequency. </summary>
/// <param name="stats"> the corpus level statistics. </param>
/// <param name="tf"> the raw term frequency in the document. </param>
/// <param name="len"> the field length. </param>
/// <returns> the normalized term frequency. </returns>
public abstract float Tfn(BasicStats stats, float tf, float len);
/// <summary>
/// DFR score: boost * basicModel(tfn) * afterEffect(tfn), where tfn is the
/// length-normalized term frequency.
/// </summary>
public override float Score(BasicStats stats, float freq, float docLen)
{
    float normalizedTf = m_normalization.Tfn(stats, freq, docLen);
    float modelScore = m_basicModel.Score(stats, normalizedTf);
    float afterEffectScore = m_afterEffect.Score(stats, normalizedTf);
    return stats.TotalBoost * modelScore * afterEffectScore;
}
/// <summary>
/// Explains the score in the format <em>score(name-of-similarity,
/// doc=doc-id, freq=term-frequency), computed from:</em>. Attaches the score
/// and the term-frequency explanation; subclasses extend the details via the
/// protected <c>Explain</c> overload.
/// </summary>
/// <param name="stats"> the corpus level statistics. </param>
/// <param name="doc"> the document id. </param>
/// <param name="freq"> the term frequency and its explanation. </param>
/// <param name="docLen"> the document length. </param>
/// <returns> the explanation. </returns>
public virtual Explanation Explain(BasicStats stats, int doc, Explanation freq, float docLen)
{
    float termFreq = freq.Value;
    Explanation expl = new Explanation();
    expl.Value = Score(stats, termFreq, docLen);
    expl.Description = "score(" + this.GetType().Name + ", doc=" + doc + ", freq=" + termFreq + "), computed from:";
    expl.AddDetail(freq);
    Explain(expl, stats, doc, termFreq, docLen);
    return expl;
}
/// <summary>
/// Log-logistic distribution score: -log(lambda / (tfn + lambda)).
/// </summary>
public override sealed float Score(BasicStats stats, float tfn, float lambda)
{
    double survival = lambda / (tfn + lambda);
    return (float)-Math.Log(survival);
}
/// <summary>
/// Wraps the spoofed stats in the collection-level statistics API;
/// -1 marks measures the (spoofed) codec does not supply.
/// </summary>
private CollectionStatistics ToCollectionStats(BasicStats stats)
{
    var fieldName = stats.Field;
    return new CollectionStatistics(fieldName, stats.NumberOfDocuments, -1, stats.NumberOfFieldTokens, -1);
}
/// <summary>
/// Log-logistic distribution score: -log(lambda / (tfn + lambda)).
/// </summary>
public override sealed float Score(BasicStats stats, float tfn, float lambda)
{
    float shifted = tfn + lambda;
    return (float)-Math.Log(lambda / shifted);
}
/// <summary>
/// Adds the boost (only when it is not 1) and the lambda interpolation weight
/// to the explanation, then delegates to the base class.
/// </summary>
protected internal override void Explain(Explanation expl, BasicStats stats, int doc, float freq, float docLen)
{
    if (stats.TotalBoost != 1.0f)
    {
        expl.AddDetail(new Explanation(stats.TotalBoost, "boost"));
    }
    expl.AddDetail(new Explanation(Lambda_Renamed, "lambda"));
    base.Explain(expl, stats, doc, freq, docLen);
}
/// <summary>
/// Length normalization: scales tf by (avgFieldLength / len)^z.
/// </summary>
public override float Tfn(BasicStats stats, float tf, float len)
{
    float lengthRatio = stats.m_avgFieldLength / len;
    return (float)(tf * Math.Pow(lengthRatio, z));
}
/// <summary>
/// Scores the document {@code doc}.
/// <p>Subclasses must apply their scoring formula in this class.</p> </summary>
/// <param name="stats"> the corpus level statistics. </param>
/// <param name="freq"> the term frequency. </param>
/// <param name="docLen"> the document length. </param>
/// <returns> the score. </returns>
public abstract float Score(BasicStats stats, float freq, float docLen);
/// <summary>
/// Length normalization: scales tf by (avgFieldLength / len)^z.
/// </summary>
public override float Tfn(BasicStats stats, float tf, float len)
{
    float ratio = stats.AvgFieldLength_Renamed / len;
    return (float)(tf * Math.Pow(ratio, z));
}
/// <summary>
/// No normalization is applied, so the explanation is a constant.
/// </summary>
public override Explanation Explain(BasicStats stats, float tf, float len)
{
    Explanation expl = new Explanation(1, "no normalization");
    return expl;
}
/// <summary>
/// Dirichlet-smoothed LM score: term weight plus document-length norm,
/// scaled by the boost; negative results are clipped to zero.
/// </summary>
public override float Score(BasicStats stats, float freq, float docLen)
{
    double termWeight = Math.Log(1 + freq / (Mu_Renamed * ((LMStats)stats).CollectionProbability));
    double documentNorm = Math.Log(Mu_Renamed / (docLen + Mu_Renamed));
    float score = stats.TotalBoost * (float)(termWeight + documentNorm);
    // Comparison (rather than Math.Max) preserves the original's behavior
    // of returning 0 for a NaN score.
    if (score > 0.0f)
    {
        return score;
    }
    return 0.0f;
}