/// <summary>
        /// Builds a priority queue of candidate terms from a word-&gt;tf map, scoring
        /// every word that survives the filters as <c>tf * idf</c>.
        /// </summary>
        /// <remarks>
        /// A word is skipped when its term frequency is below <c>minTermFreq</c> (if that
        /// threshold is positive), when its largest per-field document frequency is below
        /// <c>minDocFreq</c> (if positive), or when no field reports a document containing it.
        /// <c>fieldNames</c> must hold at least one entry: its first element seeds the
        /// top-field choice.
        /// </remarks>
        /// <param name="words">a map of words keyed on the word(String) with Int objects as the values.
        /// </param>
        /// <returns>the queue holding one <c>PQRecord</c> per word that passed every filter.</returns>
        private PriorityQueue CreateQueue(IDictionary words)
        {
            // have collected all words in doc and their freqs
            int   numDocs = ir.NumDocs();
            FreqQ res     = new FreqQ(words.Count); // will order words by score

            // Walk the entries directly: foreach releases the enumerator when it is
            // disposable, and reading entry.Value avoids a second lookup per word.
            foreach (DictionaryEntry entry in words)
            {
                String word = (String)entry.Key;
                int    tf   = ((Int)entry.Value).x; // term freq in the source doc

                if (minTermFreq > 0 && tf < minTermFreq)
                {
                    continue; // filter out words that don't occur enough times in the source
                }

                // go through all the fields and find the largest document frequency
                String topField = fieldNames[0];
                int    docFreq  = 0;
                foreach (String fieldName in fieldNames)
                {
                    int freq = ir.DocFreq(new Term(fieldName, word));
                    if (freq > docFreq)
                    {
                        // strictly greater: on a tie the earlier field stays the top field
                        topField = fieldName;
                        docFreq  = freq;
                    }
                }

                if (minDocFreq > 0 && docFreq < minDocFreq)
                {
                    continue; // filter out words that don't occur in enough docs
                }

                if (docFreq == 0)
                {
                    continue; // index update problem?
                }

                float idf   = similarity.Idf(docFreq, numDocs);
                float score = tf * idf;

                // only really need 1st 3 entries, other ones are for troubleshooting
                res.Insert(new PQRecord(word, topField, score, idf, docFreq, tf));
            }

            return res;
        }
Exemplo n.º 2
0
        /// <summary>
        /// Builds a priority queue of candidate terms from a word-&gt;tf map, scoring
        /// every word that survives the filters as <c>tf * idf</c>.
        /// </summary>
        /// <remarks>
        /// A word is skipped when its term frequency is below <c>_minTermFreq</c> (if that
        /// threshold is positive), when its largest per-field document frequency is below
        /// <c>_minDocFreq</c> (if positive) or above <c>_maxDocfreq</c>, or when no field
        /// reports a document containing it. <c>_fieldNames</c> must hold at least one
        /// entry: its first element seeds the top-field choice.
        /// </remarks>
        /// <param name="words">a map of words keyed on the word(String) with Int objects as the values.
        /// </param>
        /// <returns>the queue; each element is an <c>object[]</c> laid out as
        /// { word, top field, score, idf, docFreq, tf }.</returns>
        protected PriorityQueue <object[]> CreateQueue(IDictionary <string, Int> words)
        {
            // have collected all words in doc and their freqs
            int   numDocs = _ir.NumDocs();
            FreqQ res     = new FreqQ(words.Count); // will order words by score

            // foreach disposes the generic enumerator (the previous manual MoveNext loop
            // never did) and the pair's Value replaces a second lookup by key.
            foreach (KeyValuePair<string, Int> entry in words)
            {
                System.String word = entry.Key;
                int           tf   = entry.Value.X; // term freq in the source doc

                if (_minTermFreq > 0 && tf < _minTermFreq)
                {
                    continue; // filter out words that don't occur enough times in the source
                }

                // go through all the fields and find the largest document frequency
                System.String topField = _fieldNames[0];
                int           docFreq  = 0;
                foreach (System.String fieldName in _fieldNames)
                {
                    int freq = _ir.DocFreq(new Term(fieldName, word), _state);
                    if (freq > docFreq)
                    {
                        // strictly greater: on a tie the earlier field stays the top field
                        topField = fieldName;
                        docFreq  = freq;
                    }
                }

                if (_minDocFreq > 0 && docFreq < _minDocFreq)
                {
                    continue; // filter out words that don't occur in enough docs
                }

                if (docFreq > _maxDocfreq)
                {
                    continue; // filter out words that occur in too many docs
                }

                if (docFreq == 0)
                {
                    continue; // index update problem?
                }

                float idf   = _similarity.Idf(docFreq, numDocs);
                float score = tf * idf;

                // only really need 1st 3 entries, other ones are for troubleshooting
                res.InsertWithOverflow(new System.Object[] { word, topField, score, idf, docFreq, tf });
            }

            return res;
        }
Exemplo n.º 3
0
        /// <summary>
        /// Create a <see cref="T:Util.PriorityQueue{object[]}"/> from a word-&gt;tf map,
        /// scoring every word that survives the filters as <c>tf * idf</c>.
        /// </summary>
        /// <remarks>
        /// A word is skipped when its term frequency is below <c>MinTermFreq</c> (if that
        /// threshold is positive), when its largest per-field document frequency is below
        /// <c>MinDocFreq</c> (if positive) or above <c>MaxDocFreq</c>, or when no field
        /// reports a document containing it. <c>FieldNames</c> must hold at least one
        /// entry: its first element seeds the top-field choice.
        /// </remarks>
        /// <param name="words"> a map of words keyed on the word(<see cref="string"/>) with <see cref="Int32"/> objects as the values. </param>
        /// <returns> the queue; each element is an <c>object[]</c> laid out as
        /// { word, top field, score, idf, docFreq, tf }. </returns>
        /// <exception cref="IOException"/>
        private Util.PriorityQueue <object[]> CreateQueue(IDictionary <string, Int32> words)
        {
            // have collected all words in doc and their freqs
            int   numDocs = ir.NumDocs;
            FreqQ res     = new FreqQ(words.Count); // will order words by score

            // Enumerate key/value pairs so the frequency comes with the word instead of
            // costing another dictionary lookup per key.
            foreach (KeyValuePair<string, Int32> entry in words)
            {
                string word = entry.Key;
                int    tf   = entry.Value.x; // term freq in the source doc

                if (MinTermFreq > 0 && tf < MinTermFreq)
                {
                    continue; // filter out words that don't occur enough times in the source
                }

                // go through all the fields and find the largest document frequency
                string topField = FieldNames[0];
                int    docFreq  = 0;
                foreach (string fieldName in FieldNames)
                {
                    int freq = ir.DocFreq(new Term(fieldName, word));
                    if (freq > docFreq)
                    {
                        // strictly greater: on a tie the earlier field stays the top field
                        topField = fieldName;
                        docFreq  = freq;
                    }
                }

                if (MinDocFreq > 0 && docFreq < MinDocFreq)
                {
                    continue; // filter out words that don't occur in enough docs
                }

                if (docFreq > MaxDocFreq)
                {
                    continue; // filter out words that occur in too many docs
                }

                if (docFreq == 0)
                {
                    continue; // index update problem?
                }

                float idf   = Similarity.Idf(docFreq, numDocs);
                float score = tf * idf;

                // only really need 1st 3 entries, other ones are for troubleshooting
                // element order: the word, the top field, overall score, idf, freq in all docs, tf
                res.InsertWithOverflow(new object[] { word, topField, score, idf, docFreq, tf });
            }

            return res;
        }
Exemplo n.º 4
0
        /// <summary>
        /// Builds a priority queue of candidate terms from a word-&gt;tf map, scoring
        /// every word that survives the filters as <c>tf * idf</c>.
        /// </summary>
        /// <remarks>
        /// A word is skipped when its term frequency is below <c>minTermFreq</c> (if that
        /// threshold is positive), when its largest per-field document frequency is below
        /// <c>minDocFreq</c> (if positive), or when no field reports a document containing it.
        /// <c>fieldNames</c> must hold at least one entry: its first element seeds the
        /// top-field choice.
        /// </remarks>
        /// <param name="words">a map of words keyed on the word(String) with Int objects as the values.
        /// </param>
        /// <returns>the queue; each element is an <c>object[]</c> laid out as
        /// { word, top field, score, idf, docFreq, tf }.</returns>
        private PriorityQueue CreateQueue(System.Collections.IDictionary words)
        {
            // have collected all words in doc and their freqs
            int numDocs = ir.NumDocs();
            FreqQ res = new FreqQ(words.Count); // will order words by score

            // Walk the entries directly: foreach releases the enumerator when it is
            // disposable, and reading entry.Value avoids a second lookup per word.
            foreach (System.Collections.DictionaryEntry entry in words)
            {
                System.String word = (System.String) entry.Key;
                int tf = ((Int) entry.Value).x; // term freq in the source doc

                if (minTermFreq > 0 && tf < minTermFreq)
                {
                    continue; // filter out words that don't occur enough times in the source
                }

                // go through all the fields and find the largest document frequency
                System.String topField = fieldNames[0];
                int docFreq = 0;
                foreach (System.String fieldName in fieldNames)
                {
                    int freq = ir.DocFreq(new Term(fieldName, word));
                    if (freq > docFreq)
                    {
                        // strictly greater: on a tie the earlier field stays the top field
                        topField = fieldName;
                        docFreq = freq;
                    }
                }

                if (minDocFreq > 0 && docFreq < minDocFreq)
                {
                    continue; // filter out words that don't occur in enough docs
                }

                if (docFreq == 0)
                {
                    continue; // index update problem?
                }

                float idf = similarity.Idf(docFreq, numDocs);
                float score = tf * idf;

                // only really need 1st 3 entries, other ones are for troubleshooting
                // (score/idf are already float and docFreq/tf already int, so no casts are needed)
                res.Insert(new System.Object[] { word, topField, score, idf, docFreq, tf });
            }

            return res;
        }