Exemple #1
0
        /// <summary>
        /// Create a <see cref="T:Util.PriorityQueue{object[]}"/> from a word-&gt;tf map.
        /// <para/>
        /// Each word that passes the filters below is scored as <c>tf * idf</c> and inserted
        /// into the queue as an <c>object[]</c> laid out as
        /// <c>{ word, topField, score, idf, docFreq, tf }</c>.
        /// A word is skipped when:
        /// <list type="bullet">
        ///     <item><description>MinTermFreq is positive and the word's term frequency is below it;</description></item>
        ///     <item><description>MinDocFreq is positive and the word's best document frequency is below it;</description></item>
        ///     <item><description>the word's best document frequency exceeds MaxDocFreq;</description></item>
        ///     <item><description>the word's document frequency is zero in every field of FieldNames.</description></item>
        /// </list>
        /// </summary>
        /// <param name="words"> a map of words keyed on the word(<see cref="string"/>) with <see cref="Int32"/> objects as the values (the term frequency is read from the value's <c>x</c> member). </param>
        /// <returns> the queue holding one <c>object[]</c> entry per retained word. </returns>
        /// <exception cref="IOException">Declared by the original contract; presumably raised by the index reader lookups.</exception>
        private Util.PriorityQueue <object[]> CreateQueue(IDictionary <string, Int32> words)
        {
            // have collected all words in doc and their freqs
            int   numDocs = ir.NumDocs;
            FreqQ res     = new FreqQ(words.Count); // sized by the number of candidate words; will order words by score

            // Enumerate the pairs directly so each word costs a single dictionary access
            // instead of a key enumeration followed by an indexer lookup.
            foreach (KeyValuePair <string, Int32> entry in words)
            {
                string word = entry.Key;
                int    tf   = entry.Value.x; // term freq in the source doc

                if (MinTermFreq > 0 && tf < MinTermFreq)
                {
                    continue; // filter out words that don't occur enough times in the source
                }

                // go through all the fields and find the largest document frequency
                // NOTE(review): FieldNames[0] assumes at least one field name is configured.
                string topField = FieldNames[0];
                int    docFreq  = 0;
                foreach (string fieldName in FieldNames)
                {
                    int freq = ir.DocFreq(new Term(fieldName, word));
                    if (freq > docFreq)
                    {
                        // strictly greater: on ties the earliest field seen is kept
                        topField = fieldName;
                        docFreq  = freq;
                    }
                }

                if (MinDocFreq > 0 && docFreq < MinDocFreq)
                {
                    continue; // filter out words that don't occur in enough docs
                }

                if (docFreq > MaxDocFreq)
                {
                    continue; // filter out words that occur in too many docs
                }

                if (docFreq == 0)
                {
                    continue; // index update problem?
                }

                float idf   = Similarity.Idf(docFreq, numDocs);
                float score = tf * idf;

                // only really need 1st 3 entries, other ones are for troubleshooting
                res.InsertWithOverflow(new object[]
                {
                    word,     // the word
                    topField, // the top field
                    score,    // overall score
                    idf,      // idf
                    docFreq,  // freq in all docs
                    tf        // term freq in the source doc
                });
            }

            return res;
        }