/// <summary>Create a PriorityQueue from a word->tf map.</summary>
/// <param name="words">a map of words keyed on the word (String) with Int objects as the values.</param>
private PriorityQueue CreateQueue(IDictionary words)
{
    // have collected all words in doc and their freqs
    int numDocs = ir.NumDocs();
    FreqQ res = new FreqQ(words.Count); // will order words by score

    IEnumerator it = words.Keys.GetEnumerator();
    while (it.MoveNext())
    {
        // for every word
        String word = (String)it.Current;

        int tf = ((Int)words[word]).x; // term freq in the source doc
        if (minTermFreq > 0 && tf < minTermFreq)
        {
            continue; // filter out words that don't occur enough times in the source
        }

        // go through all the fields and find the largest document frequency
        String topField = fieldNames[0];
        int docFreq = 0;
        for (int i = 0; i < fieldNames.Length; i++)
        {
            int freq = ir.DocFreq(new Term(fieldNames[i], word));
            topField = (freq > docFreq) ? fieldNames[i] : topField;
            docFreq = (freq > docFreq) ? freq : docFreq;
        }

        if (minDocFreq > 0 && docFreq < minDocFreq)
        {
            continue; // filter out words that don't occur in enough docs
        }

        if (docFreq == 0)
        {
            continue; // index update problem?
        }

        float idf = similarity.Idf(docFreq, numDocs);
        float score = tf * idf;

        // only really need 1st 3 entries, other ones are for troubleshooting
        PQRecord pqr = new PQRecord(word, topField, score, idf, docFreq, tf);
        res.Insert(pqr);
    }
    return res;
}
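The Int type that the map values are cast to above (and in the variants that follow) is not defined in these snippets. In the surrounding MoreLikeThis-style code it appears to be a small mutable counter, so term frequencies can be incremented in place while the source document is tokenized. A minimal sketch under that assumption; only the field name x is taken from the usage above, everything else is illustrative:

// Sketch only: mutable int holder used as the value type of the word -> tf map,
// so the tokenizing loop can bump counts without re-inserting boxed integers.
private class Int
{
    internal int x; // term frequency of the word in the source document

    internal Int()
    {
        x = 1; // a word is created with a count of one when first seen
    }
}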
/// <summary>Create a PriorityQueue from a word->tf map.</summary>
/// <param name="words">a map of words keyed on the word (String) with Int objects as the values.</param>
protected PriorityQueue<object[]> CreateQueue(IDictionary<string, Int> words)
{
    // have collected all words in doc and their freqs
    int numDocs = _ir.NumDocs();
    FreqQ res = new FreqQ(words.Count); // will order words by score

    var it = words.Keys.GetEnumerator();
    while (it.MoveNext())
    {
        // for every word
        System.String word = it.Current;

        int tf = words[word].X; // term freq in the source doc
        if (_minTermFreq > 0 && tf < _minTermFreq)
        {
            continue; // filter out words that don't occur enough times in the source
        }

        // go through all the fields and find the largest document frequency
        System.String topField = _fieldNames[0];
        int docFreq = 0;
        for (int i = 0; i < _fieldNames.Length; i++)
        {
            int freq = _ir.DocFreq(new Term(_fieldNames[i], word), _state);
            topField = (freq > docFreq) ? _fieldNames[i] : topField;
            docFreq = (freq > docFreq) ? freq : docFreq;
        }

        if (_minDocFreq > 0 && docFreq < _minDocFreq)
        {
            continue; // filter out words that don't occur in enough docs
        }

        if (docFreq > _maxDocfreq)
        {
            continue; // filter out words that occur in too many docs
        }

        if (docFreq == 0)
        {
            continue; // index update problem?
        }

        float idf = _similarity.Idf(docFreq, numDocs);
        float score = tf * idf;

        // only really need 1st 3 entries, other ones are for troubleshooting
        res.InsertWithOverflow(new System.Object[] { word, topField, score, idf, docFreq, tf });
    }
    return res;
}
/// <summary>
/// Create a <see cref="T:Util.PriorityQueue{object[]}"/> from a word->tf map.
/// </summary>
/// <param name="words">a map of words keyed on the word (<see cref="string"/>) with <see cref="Int32"/> objects as the values.</param>
/// <exception cref="IOException"/>
private Util.PriorityQueue<object[]> CreateQueue(IDictionary<string, Int32> words)
{
    // have collected all words in doc and their freqs
    int numDocs = ir.NumDocs;
    FreqQ res = new FreqQ(words.Count); // will order words by score

    foreach (string word in words.Keys) // for every word
    {
        int tf = words[word].x; // term freq in the source doc
        if (MinTermFreq > 0 && tf < MinTermFreq)
        {
            continue; // filter out words that don't occur enough times in the source
        }

        // go through all the fields and find the largest document frequency
        string topField = FieldNames[0];
        int docFreq = 0;
        foreach (string fieldName in FieldNames)
        {
            int freq = ir.DocFreq(new Term(fieldName, word));
            topField = (freq > docFreq) ? fieldName : topField;
            docFreq = (freq > docFreq) ? freq : docFreq;
        }

        if (MinDocFreq > 0 && docFreq < MinDocFreq)
        {
            continue; // filter out words that don't occur in enough docs
        }

        if (docFreq > MaxDocFreq)
        {
            continue; // filter out words that occur in too many docs
        }

        if (docFreq == 0)
        {
            continue; // index update problem?
        }

        float idf = Similarity.Idf(docFreq, numDocs);
        float score = tf * idf;

        // only really need 1st 3 entries, other ones are for troubleshooting
        res.InsertWithOverflow(new object[]
        {
            word,     // the word
            topField, // the top field
            score,    // overall score
            idf,      // idf
            docFreq,  // freq in all docs
            tf        // freq in the source doc
        });
    }
    return res;
}
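In every variant the ranking key is score = tf * idf, where idf comes from the configured Similarity, which is not shown here. Assuming Lucene's classic DefaultSimilarity, Idf(docFreq, numDocs) is ln(numDocs / (docFreq + 1)) + 1, which gives the following standalone worked example; the numbers are made up for illustration:

using System;

// Illustrative only: reproduces the tf*idf score the snippets compute,
// assuming the classic DefaultSimilarity idf formula.
internal static class ScoreExample
{
    private static void Main()
    {
        int numDocs = 1000; // documents in the index
        int docFreq = 9;    // documents containing the word in its top field
        int tf = 3;         // occurrences of the word in the source document

        float idf = (float)(Math.Log(numDocs / (double)(docFreq + 1)) + 1.0); // ~= 5.61
        float score = tf * idf;                                               // ~= 16.82

        Console.WriteLine($"idf={idf:F2}, score={score:F2}");
    }
}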
/// <summary>Create a PriorityQueue from a word->tf map.</summary>
/// <param name="words">a map of words keyed on the word (String) with Int objects as the values.</param>
private PriorityQueue CreateQueue(System.Collections.IDictionary words)
{
    // have collected all words in doc and their freqs
    int numDocs = ir.NumDocs();
    FreqQ res = new FreqQ(words.Count); // will order words by score

    System.Collections.IEnumerator it = words.Keys.GetEnumerator();
    while (it.MoveNext())
    {
        // for every word
        System.String word = (System.String)it.Current;

        int tf = ((Int)words[word]).x; // term freq in the source doc
        if (minTermFreq > 0 && tf < minTermFreq)
        {
            continue; // filter out words that don't occur enough times in the source
        }

        // go through all the fields and find the largest document frequency
        System.String topField = fieldNames[0];
        int docFreq = 0;
        for (int i = 0; i < fieldNames.Length; i++)
        {
            int freq = ir.DocFreq(new Term(fieldNames[i], word));
            topField = (freq > docFreq) ? fieldNames[i] : topField;
            docFreq = (freq > docFreq) ? freq : docFreq;
        }

        if (minDocFreq > 0 && docFreq < minDocFreq)
        {
            continue; // filter out words that don't occur in enough docs
        }

        if (docFreq == 0)
        {
            continue; // index update problem?
        }

        float idf = similarity.Idf(docFreq, numDocs);
        float score = tf * idf;

        // only really need 1st 3 entries, other ones are for troubleshooting
        res.Insert(new System.Object[] { word, topField, (float)score, (float)idf, (System.Int32)docFreq, (System.Int32)tf });
    }
    return res;
}
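All four variants push their entries into a FreqQ, which none of them define. In the stock MoreLikeThis implementation it is a thin PriorityQueue subclass that orders the object[] entries by the score stored at index 2, with the comparison inverted so that popping the queue yields the best-scoring words first. A minimal sketch under that assumption, written against the Util.PriorityQueue<object[]> base class the third variant references; the access modifier on LessThan differs between Lucene.Net versions:

// Sketch only: orders object[] entries { word, topField, score, idf, docFreq, tf }
// by the score at index 2. Treating the higher score as "less than" puts the
// best entry at the head, so Pop() drains the queue in descending score order.
private class FreqQ : Util.PriorityQueue<object[]>
{
    internal FreqQ(int maxSize)
        : base(maxSize)
    {
    }

    protected internal override bool LessThan(object[] aa, object[] bb)
    {
        float fa = (float)aa[2]; // overall score of entry a
        float fb = (float)bb[2]; // overall score of entry b
        return fa > fb;          // inverted: higher score sorts toward the head
    }
}

Because CreateQueue sizes the queue at words.Count and inserts at most one entry per word, the Insert/InsertWithOverflow calls never actually discard anything here; callers such as CreateQuery or RetrieveInterestingTerms are then expected to Pop() the queue and read ar[0] (the word) and ar[1] (the top field), typically stopping after some maximum number of query terms.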