/// <summary>
/// Create a <see cref="T:Util.PriorityQueue{object[]}"/> from a word->tf map.
/// </summary>
/// <param name="words"> a map of words keyed on the word (<see cref="string"/>) with <see cref="Int32"/> objects as the values. </param>
/// <returns> a priority queue ordered by overall score (tf*idf); each entry is
/// <c>{ word, topField, score, idf, docFreq, tf }</c> — only the first 3 entries are
/// really needed downstream, the rest are kept for troubleshooting. </returns>
/// <exception cref="IOException"/>
private Util.PriorityQueue<object[]> CreateQueue(IDictionary<string, Int32> words)
{
    // Have collected all words in doc and their freqs.
    int numDocs = ir.NumDocs;
    FreqQ res = new FreqQ(words.Count); // will order words by score

    // Enumerate the pairs directly instead of iterating Keys and re-indexing
    // the dictionary — avoids a second lookup per word.
    foreach (var entry in words) // for every word
    {
        string word = entry.Key;
        int tf = entry.Value.x; // term freq in the source doc

        if (MinTermFreq > 0 && tf < MinTermFreq)
        {
            continue; // filter out words that don't occur enough times in the source
        }

        // Go through all the fields and find the largest document frequency.
        string topField = FieldNames[0];
        int docFreq = 0;
        foreach (string fieldName in FieldNames)
        {
            int freq = ir.DocFreq(new Term(fieldName, word));
            if (freq > docFreq)
            {
                topField = fieldName;
                docFreq = freq;
            }
        }

        if (MinDocFreq > 0 && docFreq < MinDocFreq)
        {
            continue; // filter out words that don't occur in enough docs
        }

        if (docFreq > MaxDocFreq)
        {
            continue; // filter out words that occur in too many docs
        }

        if (docFreq == 0)
        {
            continue; // index update problem?
        }

        float idf = Similarity.Idf(docFreq, numDocs);
        float score = tf * idf;

        // Only the first 3 entries are really needed; the rest are for troubleshooting.
        res.InsertWithOverflow(new object[]
        {
            word,     // the word
            topField, // the top field
            score,    // overall score
            idf,      // idf
            docFreq,  // freq in all docs
            tf        // term freq in the source doc
        });
    }
    return res;
}