public abstract Tf ( float freq ) : float | ||
freq | float | the frequency of a term within a document /// |
return | float |
public override float Score() { //System.out.println("scoring " + first.doc); float raw = Similarity.Tf(freq) * value_Renamed; // raw score return(norms == null?raw:raw *Similarity.DecodeNorm(norms[first.doc])); // normalize }
protected internal override bool Score(HitCollector c, int end) { Similarity similarity = GetSimilarity(); // cache sim in local float[] normDecoder = Similarity.GetNormDecoder(); while (doc < end) { // for docs in window int f = freqs[pointer]; float score = f < SCORE_CACHE_SIZE?scoreCache[f]:similarity.Tf(f) * weightValue; // cache miss score *= normDecoder[norms[doc] & 0xFF]; // normalize for field c.Collect(doc, score); // collect score if (++pointer >= pointerMax) { pointerMax = termDocs.Read(docs, freqs); // refill buffers if (pointerMax != 0) { pointer = 0; } else { termDocs.Close(); // close stream doc = System.Int32.MaxValue; // set to sentinel value return(false); } } doc = docs[pointer]; } return(true); }
public override float Score(IState state) { System.Diagnostics.Debug.Assert(doc != -1); int f = freqs[pointer]; float raw = f < SCORE_CACHE_SIZE?scoreCache[f]:Similarity.Tf(f) * weightValue; // cache miss return(norms == null?raw:raw *SIM_NORM_DECODER[norms[doc] & 0xFF]); // normalize for field }
/// <summary> Construct a <c>TermScorer</c>. /// /// </summary> /// <param name="weight">The weight of the <c>Term</c> in the query. /// </param> /// <param name="td">An iterator over the documents matching the <c>Term</c>. /// </param> /// <param name="similarity">The <c>Similarity</c> implementation to be used for score /// computations. /// </param> /// <param name="norms">The field norms of the document fields for the <c>Term</c>. /// </param> public /*internal*/ TermScorer(Weight weight, TermDocs td, Similarity similarity, byte[] norms) : base(similarity) { this.weight = weight; this.termDocs = td; this.norms = norms; this.weightValue = weight.Value; for (int i = 0; i < SCORE_CACHE_SIZE; i++) { scoreCache[i] = Similarity.Tf(i) * weightValue; } }
public virtual void TestKnownSetOfDocuments() { System.String[] termArray = new System.String[] { "eating", "chocolate", "in", "a", "computer", "lab", "grows", "old", "colored", "with", "an" }; System.String test1 = "eating chocolate in a computer lab"; //6 terms System.String test2 = "computer in a computer lab"; //5 terms System.String test3 = "a chocolate lab grows old"; //5 terms System.String test4 = "eating chocolate with a chocolate lab in an old chocolate colored computer lab"; //13 terms System.Collections.IDictionary test4Map = new System.Collections.Hashtable(); test4Map["chocolate"] = 3; test4Map["lab"] = 2; test4Map["eating"] = 1; test4Map["computer"] = 1; test4Map["with"] = 1; test4Map["a"] = 1; test4Map["colored"] = 1; test4Map["in"] = 1; test4Map["an"] = 1; test4Map["computer"] = 1; test4Map["old"] = 1; Document testDoc1 = new Document(); SetupDoc(testDoc1, test1); Document testDoc2 = new Document(); SetupDoc(testDoc2, test2); Document testDoc3 = new Document(); SetupDoc(testDoc3, test3); Document testDoc4 = new Document(); SetupDoc(testDoc4, test4); Directory dir = new RAMDirectory(); try { IndexWriter writer = new IndexWriter(dir, new SimpleAnalyzer(), true); Assert.IsTrue(writer != null); writer.AddDocument(testDoc1); writer.AddDocument(testDoc2); writer.AddDocument(testDoc3); writer.AddDocument(testDoc4); writer.Close(); IndexSearcher knownSearcher = new IndexSearcher(dir); TermEnum termEnum = knownSearcher.reader.Terms(); TermDocs termDocs = knownSearcher.reader.TermDocs(); //System.out.println("Terms: " + termEnum.size() + " Orig Len: " + termArray.length); Similarity sim = knownSearcher.GetSimilarity(); while (termEnum.Next() == true) { Term term = termEnum.Term(); //System.out.println("Term: " + term); termDocs.Seek(term); while (termDocs.Next()) { int docId = termDocs.Doc(); int freq = termDocs.Freq(); //System.out.println("Doc Id: " + docId + " freq " + freq); TermFreqVector vector = knownSearcher.reader.GetTermFreqVector(docId, "Field"); float tf = sim.Tf(freq); float idf = sim.Idf(term, knownSearcher); //float qNorm = sim.queryNorm() //This is fine since we don't have stop words float lNorm = sim.LengthNorm("Field", vector.GetTerms().Length); //float coord = sim.coord() //System.out.println("TF: " + tf + " IDF: " + idf + " LenNorm: " + lNorm); Assert.IsTrue(vector != null); System.String[] vTerms = vector.GetTerms(); int[] freqs = vector.GetTermFrequencies(); for (int i = 0; i < vTerms.Length; i++) { if (term.Text().Equals(vTerms[i]) == true) { Assert.IsTrue(freqs[i] == freq); } } } //System.out.println("--------"); } Query query = new TermQuery(new Term("Field", "chocolate")); Hits hits = knownSearcher.Search(query); //doc 3 should be the first hit b/c it is the shortest match Assert.IsTrue(hits.Length() == 3); float score = hits.Score(0); /*System.out.println("Hit 0: " + hits.id(0) + " Score: " + hits.score(0) + " String: " + hits.doc(0).toString()); * System.out.println("Explain: " + knownSearcher.explain(query, hits.id(0))); * System.out.println("Hit 1: " + hits.id(1) + " Score: " + hits.score(1) + " String: " + hits.doc(1).toString()); * System.out.println("Explain: " + knownSearcher.explain(query, hits.id(1))); * System.out.println("Hit 2: " + hits.id(2) + " Score: " + hits.score(2) + " String: " + hits.doc(2).toString()); * System.out.println("Explain: " + knownSearcher.explain(query, hits.id(2)));*/ Assert.IsTrue(testDoc3.ToString().Equals(hits.Doc(0).ToString())); Assert.IsTrue(testDoc4.ToString().Equals(hits.Doc(1).ToString())); Assert.IsTrue(testDoc1.ToString().Equals(hits.Doc(2).ToString())); TermFreqVector vector2 = knownSearcher.reader.GetTermFreqVector(hits.Id(1), "Field"); Assert.IsTrue(vector2 != null); //System.out.println("Vector: " + vector); System.String[] terms = vector2.GetTerms(); int[] freqs2 = vector2.GetTermFrequencies(); Assert.IsTrue(terms != null && terms.Length == 10); for (int i = 0; i < terms.Length; i++) { System.String term = terms[i]; //System.out.println("Term: " + term); int freq = freqs2[i]; Assert.IsTrue(test4.IndexOf(term) != -1); System.Int32 freqInt = (System.Int32)test4Map[term]; System.Object tmpFreqInt = test4Map[term]; Assert.IsTrue(tmpFreqInt != null); Assert.IsTrue(freqInt == freq); } knownSearcher.Close(); } catch (System.IO.IOException e) { System.Console.Error.WriteLine(e.StackTrace); Assert.IsTrue(false); } }
public override Explanation Explain(IndexReader reader, int doc) { ComplexExplanation result = new ComplexExplanation(); result.Description = "weight(" + Query + " in " + doc + "), product of:"; Explanation expl = new Explanation(idf, idfExp.Explain()); // explain query weight Explanation queryExpl = new Explanation(); queryExpl.Description = "queryWeight(" + Query + "), product of:"; Explanation boostExpl = new Explanation(Enclosing_Instance.Boost, "boost"); if (Enclosing_Instance.Boost != 1.0f) { queryExpl.AddDetail(boostExpl); } queryExpl.AddDetail(expl); Explanation queryNormExpl = new Explanation(queryNorm, "queryNorm"); queryExpl.AddDetail(queryNormExpl); queryExpl.Value = boostExpl.Value * expl.Value * queryNormExpl.Value; result.AddDetail(queryExpl); // explain field weight string field = Enclosing_Instance.term.Field; ComplexExplanation fieldExpl = new ComplexExplanation(); fieldExpl.Description = "fieldWeight(" + Enclosing_Instance.term + " in " + doc + "), product of:"; Explanation tfExplanation = new Explanation(); int tf = 0; TermDocs termDocs = reader.TermDocs(enclosingInstance.term); if (termDocs != null) { try { if (termDocs.SkipTo(doc) && termDocs.Doc == doc) { tf = termDocs.Freq; } } finally { termDocs.Close(); } tfExplanation.Value = similarity.Tf(tf); tfExplanation.Description = "tf(termFreq(" + enclosingInstance.term + ")=" + tf + ")"; } else { tfExplanation.Value = 0.0f; tfExplanation.Description = "no matching term"; } fieldExpl.AddDetail(tfExplanation); fieldExpl.AddDetail(expl); Explanation fieldNormExpl = new Explanation(); byte[] fieldNorms = reader.Norms(field); float fieldNorm = fieldNorms != null?Similarity.DecodeNorm(fieldNorms[doc]) : 1.0f; fieldNormExpl.Value = fieldNorm; fieldNormExpl.Description = "fieldNorm(field=" + field + ", doc=" + doc + ")"; fieldExpl.AddDetail(fieldNormExpl); fieldExpl.Match = tfExplanation.IsMatch; fieldExpl.Value = tfExplanation.Value * expl.Value * fieldNormExpl.Value; result.AddDetail(fieldExpl); bool?tempAux = fieldExpl.Match; result.Match = tempAux; // combine them result.Value = queryExpl.Value * fieldExpl.Value; if (queryExpl.Value == 1.0f) { return(fieldExpl); } return(result); }
public override Explanation Explain(IndexReader reader, int doc) { ComplexExplanation result = new ComplexExplanation(); result.Description = "weight(" + Query + " in " + doc + "), product of:"; Explanation idfExpl = new Explanation(idf, "idf(" + Query + ")"); // explain query weight Explanation queryExpl = new Explanation(); queryExpl.Description = "queryWeight(" + Query + "), product of:"; Explanation boostExpl = new Explanation(Enclosing_Instance.Boost, "boost"); if (Enclosing_Instance.Boost != 1.0f) { queryExpl.AddDetail(boostExpl); } queryExpl.AddDetail(idfExpl); Explanation queryNormExpl = new Explanation(queryNorm, "queryNorm"); queryExpl.AddDetail(queryNormExpl); queryExpl.Value = boostExpl.Value * idfExpl.Value * queryNormExpl.Value; result.AddDetail(queryExpl); // explain field weight ComplexExplanation fieldExpl = new ComplexExplanation(); fieldExpl.Description = "fieldWeight(" + Query + " in " + doc + "), product of:"; PhraseScorer scorer = (PhraseScorer)Scorer(reader, true, false); if (scorer == null) { return(new Explanation(0.0f, "no matching docs")); } Explanation tfExplanation = new Explanation(); int d = scorer.Advance(doc); float phraseFreq = (d == doc) ? scorer.CurrentFreq() : 0.0f; tfExplanation.Value = similarity.Tf(phraseFreq); tfExplanation.Description = "tf(phraseFreq=" + phraseFreq + ")"; fieldExpl.AddDetail(tfExplanation); fieldExpl.AddDetail(idfExpl); Explanation fieldNormExpl = new Explanation(); byte[] fieldNorms = reader.Norms(Enclosing_Instance.field); float fieldNorm = fieldNorms != null?Similarity.DecodeNorm(fieldNorms[doc]) : 1.0f; fieldNormExpl.Value = fieldNorm; fieldNormExpl.Description = "fieldNorm(field=" + Enclosing_Instance.field + ", doc=" + doc + ")"; fieldExpl.AddDetail(fieldNormExpl); fieldExpl.Match = tfExplanation.IsMatch; fieldExpl.Value = tfExplanation.Value * idfExpl.Value * fieldNormExpl.Value; result.AddDetail(fieldExpl); bool?tempAux = fieldExpl.Match; result.Match = tempAux; // combine them result.Value = queryExpl.Value * fieldExpl.Value; if (queryExpl.Value == 1.0f) { return(fieldExpl); } return(result); }
public virtual void TestKnownSetOfDocuments() { System.String test1 = "eating chocolate in a computer lab"; //6 terms System.String test2 = "computer in a computer lab"; //5 terms System.String test3 = "a chocolate lab grows old"; //5 terms System.String test4 = "eating chocolate with a chocolate lab in an old chocolate colored computer lab"; //13 terms System.Collections.IDictionary test4Map = new System.Collections.Hashtable(); test4Map["chocolate"] = 3; test4Map["lab"] = 2; test4Map["eating"] = 1; test4Map["computer"] = 1; test4Map["with"] = 1; test4Map["a"] = 1; test4Map["colored"] = 1; test4Map["in"] = 1; test4Map["an"] = 1; test4Map["computer"] = 1; test4Map["old"] = 1; Document testDoc1 = new Document(); SetupDoc(testDoc1, test1); Document testDoc2 = new Document(); SetupDoc(testDoc2, test2); Document testDoc3 = new Document(); SetupDoc(testDoc3, test3); Document testDoc4 = new Document(); SetupDoc(testDoc4, test4); Directory dir = new MockRAMDirectory(); try { IndexWriter writer = new IndexWriter(dir, new SimpleAnalyzer(), true, IndexWriter.MaxFieldLength.LIMITED); Assert.IsTrue(writer != null); writer.AddDocument(testDoc1); writer.AddDocument(testDoc2); writer.AddDocument(testDoc3); writer.AddDocument(testDoc4); writer.Close(); IndexSearcher knownSearcher = new IndexSearcher(dir); TermEnum termEnum = knownSearcher.reader_ForNUnit.Terms(); TermDocs termDocs = knownSearcher.reader_ForNUnit.TermDocs(); //System.out.println("Terms: " + termEnum.size() + " Orig Len: " + termArray.length); Similarity sim = knownSearcher.GetSimilarity(); while (termEnum.Next() == true) { Term term = termEnum.Term(); //System.out.println("Term: " + term); termDocs.Seek(term); while (termDocs.Next()) { int docId = termDocs.Doc(); int freq = termDocs.Freq(); //System.out.println("Doc Id: " + docId + " freq " + freq); TermFreqVector vector = knownSearcher.reader_ForNUnit.GetTermFreqVector(docId, "field"); float tf = sim.Tf(freq); float idf = sim.Idf(term, knownSearcher); //float qNorm = sim.queryNorm() //This is fine since we don't have stop words float lNorm = sim.LengthNorm("field", vector.GetTerms().Length); //float coord = sim.coord() //System.out.println("TF: " + tf + " IDF: " + idf + " LenNorm: " + lNorm); Assert.IsTrue(vector != null); System.String[] vTerms = vector.GetTerms(); int[] freqs = vector.GetTermFrequencies(); for (int i = 0; i < vTerms.Length; i++) { if (term.Text().Equals(vTerms[i])) { Assert.IsTrue(freqs[i] == freq); } } } //System.out.println("--------"); } Query query = new TermQuery(new Term("field", "chocolate")); ScoreDoc[] hits = knownSearcher.Search(query, null, 1000).scoreDocs; //doc 3 should be the first hit b/c it is the shortest match Assert.IsTrue(hits.Length == 3); float score = hits[0].score; /*System.out.println("Hit 0: " + hits.id(0) + " Score: " + hits.score(0) + " String: " + hits.doc(0).toString()); * System.out.println("Explain: " + knownSearcher.explain(query, hits.id(0))); * System.out.println("Hit 1: " + hits.id(1) + " Score: " + hits.score(1) + " String: " + hits.doc(1).toString()); * System.out.println("Explain: " + knownSearcher.explain(query, hits.id(1))); * System.out.println("Hit 2: " + hits.id(2) + " Score: " + hits.score(2) + " String: " + hits.doc(2).toString()); * System.out.println("Explain: " + knownSearcher.explain(query, hits.id(2)));*/ Assert.IsTrue(hits[0].doc == 2); Assert.IsTrue(hits[1].doc == 3); Assert.IsTrue(hits[2].doc == 0); TermFreqVector vector2 = knownSearcher.reader_ForNUnit.GetTermFreqVector(hits[1].doc, "field"); Assert.IsTrue(vector2 != null); //System.out.println("Vector: " + vector); System.String[] terms = vector2.GetTerms(); int[] freqs2 = vector2.GetTermFrequencies(); Assert.IsTrue(terms != null && terms.Length == 10); for (int i = 0; i < terms.Length; i++) { System.String term = terms[i]; //System.out.println("Term: " + term); int freq = freqs2[i]; Assert.IsTrue(test4.IndexOf(term) != -1); System.Int32 freqInt = -1; try { freqInt = (System.Int32)test4Map[term]; } catch (Exception) { Assert.IsTrue(false); } Assert.IsTrue(freqInt == freq); } SortedTermVectorMapper mapper = new SortedTermVectorMapper(new TermVectorEntryFreqSortedComparator()); knownSearcher.reader_ForNUnit.GetTermFreqVector(hits[1].doc, mapper); System.Collections.Generic.SortedDictionary <object, object> vectorEntrySet = mapper.GetTermVectorEntrySet(); Assert.IsTrue(vectorEntrySet.Count == 10, "mapper.getTermVectorEntrySet() Size: " + vectorEntrySet.Count + " is not: " + 10); TermVectorEntry last = null; foreach (TermVectorEntry tve in vectorEntrySet.Keys) { if (tve != null && last != null) { Assert.IsTrue(last.GetFrequency() >= tve.GetFrequency(), "terms are not properly sorted"); System.Int32 expectedFreq = (System.Int32)test4Map[tve.GetTerm()]; //we expect double the expectedFreq, since there are two fields with the exact same text and we are collapsing all fields Assert.IsTrue(tve.GetFrequency() == 2 * expectedFreq, "Frequency is not correct:"); } last = tve; } FieldSortedTermVectorMapper fieldMapper = new FieldSortedTermVectorMapper(new TermVectorEntryFreqSortedComparator()); knownSearcher.reader_ForNUnit.GetTermFreqVector(hits[1].doc, fieldMapper); System.Collections.IDictionary map = fieldMapper.GetFieldToTerms(); Assert.IsTrue(map.Count == 2, "map Size: " + map.Count + " is not: " + 2); vectorEntrySet = (System.Collections.Generic.SortedDictionary <Object, Object>)map["field"]; Assert.IsTrue(vectorEntrySet != null, "vectorEntrySet is null and it shouldn't be"); Assert.IsTrue(vectorEntrySet.Count == 10, "vectorEntrySet Size: " + vectorEntrySet.Count + " is not: " + 10); knownSearcher.Close(); } catch (System.IO.IOException e) { System.Console.Error.WriteLine(e.StackTrace); Assert.IsTrue(false); } }
public override Explanation Explain(IndexReader reader, int doc, IState state) { Explanation result = new Explanation(); result.Description = "weight(" + Query + " in " + doc + "), product of:"; System.Text.StringBuilder docFreqs = new System.Text.StringBuilder(); System.Text.StringBuilder query = new System.Text.StringBuilder(); query.Append('\"'); docFreqs.Append(idfExp.Explain()); for (int i = 0; i < Enclosing_Instance.terms.Count; i++) { if (i != 0) { query.Append(" "); } Term term = Enclosing_Instance.terms[i]; query.Append(term.Text); } query.Append('\"'); Explanation idfExpl = new Explanation(idf, "idf(" + Enclosing_Instance.field + ":" + docFreqs + ")"); // explain query weight Explanation queryExpl = new Explanation(); queryExpl.Description = "queryWeight(" + Query + "), product of:"; Explanation boostExpl = new Explanation(Enclosing_Instance.Boost, "boost"); if (Enclosing_Instance.Boost != 1.0f) { queryExpl.AddDetail(boostExpl); } queryExpl.AddDetail(idfExpl); Explanation queryNormExpl = new Explanation(queryNorm, "queryNorm"); queryExpl.AddDetail(queryNormExpl); queryExpl.Value = boostExpl.Value * idfExpl.Value * queryNormExpl.Value; result.AddDetail(queryExpl); // explain field weight Explanation fieldExpl = new Explanation(); fieldExpl.Description = "fieldWeight(" + Enclosing_Instance.field + ":" + query + " in " + doc + "), product of:"; PhraseScorer scorer = (PhraseScorer)Scorer(reader, true, false, state); if (scorer == null) { return(new Explanation(0.0f, "no matching docs")); } Explanation tfExplanation = new Explanation(); int d = scorer.Advance(doc, state); float phraseFreq = (d == doc) ? scorer.CurrentFreq() : 0.0f; tfExplanation.Value = similarity.Tf(phraseFreq); tfExplanation.Description = "tf(phraseFreq=" + phraseFreq + ")"; fieldExpl.AddDetail(tfExplanation); fieldExpl.AddDetail(idfExpl); Explanation fieldNormExpl = new Explanation(); byte[] fieldNorms = reader.Norms(Enclosing_Instance.field, state); float fieldNorm = fieldNorms != null?Similarity.DecodeNorm(fieldNorms[doc]) : 1.0f; fieldNormExpl.Value = fieldNorm; fieldNormExpl.Description = "fieldNorm(field=" + Enclosing_Instance.field + ", doc=" + doc + ")"; fieldExpl.AddDetail(fieldNormExpl); fieldExpl.Value = tfExplanation.Value * idfExpl.Value * fieldNormExpl.Value; result.AddDetail(fieldExpl); // combine them result.Value = queryExpl.Value * fieldExpl.Value; if (queryExpl.Value == 1.0f) { return(fieldExpl); } return(result); }
public override float Tf(float freq) { return(delegee.Tf(freq)); }