Exemplo n.º 1
0
        /// <summary>
        /// Create a PriorityQueue of scored terms from a word-&gt;tf map.
        /// </summary>
        /// <remarks>
        /// A word is skipped when its term frequency is below <c>minTermFreq</c> (if that limit is positive),
        /// or when its largest per-field document frequency is below <c>minDocFreq</c> (if that limit is
        /// positive), above <c>maxDocfreq</c>, or zero. Every remaining word is scored as <c>tf * idf</c>
        /// and inserted into the queue.
        /// </remarks>
        /// <param name="words">a map keyed on the word (string); each value exposes the word's term frequency
        /// in the source document through its <c>x</c> member.
        /// </param>
        /// <returns>the queue holding one <c>{ word, topField, score, idf, docFreq, tf }</c> array per accepted word.</returns>
        protected PriorityQueue <object[]> CreateQueue(IDictionary <string, Int> words)
        {
            // have collected all words in doc and their freqs
            int   numDocs = ir.NumDocs();
            FreqQ res     = new FreqQ(words.Count); // will order words by score

            // Enumerate key/value pairs directly: foreach disposes the enumerator for us and
            // the value arrives with the key, so no second dictionary lookup is needed.
            foreach (var entry in words)
            {
                string word = entry.Key;

                int tf = entry.Value.x; // term freq in the source doc
                if (minTermFreq > 0 && tf < minTermFreq)
                {
                    continue; // filter out words that don't occur enough times in the source
                }

                // go through all the fields and find the largest document frequency
                string topField = fieldNames[0];
                int    docFreq  = 0;
                for (int i = 0; i < fieldNames.Length; i++)
                {
                    int freq = ir.DocFreq(new Term(fieldNames[i], word));
                    if (freq > docFreq)
                    {
                        // strictly greater: on ties the earlier field stays the top field
                        topField = fieldNames[i];
                        docFreq  = freq;
                    }
                }

                if (minDocFreq > 0 && docFreq < minDocFreq)
                {
                    continue; // filter out words that don't occur in enough docs
                }

                if (docFreq > maxDocfreq)
                {
                    continue; // filter out words that occur in too many docs
                }

                if (docFreq == 0)
                {
                    continue; // index update problem?
                }

                float idf   = similarity.Idf(docFreq, numDocs);
                float score = tf * idf;

                // only really need 1st 3 entries, other ones are for troubleshooting
                res.InsertWithOverflow(new object[] { word, topField, score, idf, docFreq, tf });
            }

            return res;
        }
Exemplo n.º 2
0
        /// <summary>
        /// Builds a <see cref="T:Util.PriorityQueue{object[]}"/> of scored terms out of a word-&gt;tf map.
        /// </summary>
        /// <param name="words"> a map keyed on the word (<see cref="string"/>) whose <see cref="Int32"/> values carry the term frequency in their <c>x</c> member. </param>
        /// <exception cref="IOException"/>
        private Util.PriorityQueue <object[]> CreateQueue(IDictionary <string, Int32> words)
        {
            // all words of the source doc and their frequencies are already collected in 'words'
            int   totalDocs = ir.NumDocs;
            FreqQ queue     = new FreqQ(words.Count);

            foreach (string term in words.Keys)
            {
                // frequency of the term in the source doc
                int termFreq = words[term].x;
                if (MinTermFreq > 0 && termFreq < MinTermFreq)
                {
                    // the term does not occur often enough in the source
                    continue;
                }

                // pick the field in which the term has the largest document frequency;
                // on ties the earlier field is kept
                string bestField   = FieldNames[0];
                int    bestDocFreq = 0;
                foreach (string candidate in FieldNames)
                {
                    int candidateFreq = ir.DocFreq(new Term(candidate, term));
                    if (candidateFreq > bestDocFreq)
                    {
                        bestField   = candidate;
                        bestDocFreq = candidateFreq;
                    }
                }

                // reject terms found in too few docs, in too many docs, or in none at all
                // (the last case may point to an index update problem)
                bool tooRare = MinDocFreq > 0 && bestDocFreq < MinDocFreq;
                if (tooRare || bestDocFreq > MaxDocFreq || bestDocFreq == 0)
                {
                    continue;
                }

                float idf   = Similarity.Idf(bestDocFreq, totalDocs);
                float score = termFreq * idf;

                // entry layout: the word, the top field, overall score, idf, freq in all docs, tf;
                // only the first three are really needed, the rest helps troubleshooting
                queue.InsertWithOverflow(new object[] { term, bestField, score, idf, bestDocFreq, termFreq });
            }

            return queue;
        }
Exemplo n.º 3
0
		/// <summary>
		/// Builds a PriorityQueue of scored terms out of a word->tf map.
		/// </summary>
		/// <param name="words">a map keyed on the word (String) whose values are Int objects; the term
		/// frequency is read from their <c>x</c> member.
		/// </param>
		protected PriorityQueue CreateQueue(System.Collections.IDictionary words)
		{
			// all words of the source doc and their frequencies are already collected in 'words'
			int totalDocs = ir.NumDocs();
			FreqQ queue = new FreqQ(words.Count);

			foreach (System.String term in words.Keys)
			{
				// frequency of the term in the source doc
				int termFreq = ((Int)words[term]).x;
				if (minTermFreq > 0 && termFreq < minTermFreq)
				{
					// the term does not occur often enough in the source
					continue;
				}

				// pick the field in which the term has the largest document frequency;
				// on ties the earlier field is kept
				System.String bestField = fieldNames[0];
				int bestDocFreq = 0;
				for (int f = 0; f < fieldNames.Length; f++)
				{
					int fieldFreq = ir.DocFreq(new Term(fieldNames[f], term));
					if (fieldFreq > bestDocFreq)
					{
						bestField = fieldNames[f];
						bestDocFreq = fieldFreq;
					}
				}

				// reject terms found in too few docs or in none at all
				// (the latter may point to an index update problem)
				bool tooRare = minDocFreq > 0 && bestDocFreq < minDocFreq;
				if (tooRare || bestDocFreq == 0)
				{
					continue;
				}

				float idf = similarity.Idf(bestDocFreq, totalDocs);
				float score = termFreq * idf;

				// only the first three entries are really needed, the rest helps troubleshooting
				queue.InsertWithOverflow(new System.Object[] { term, bestField, score, idf, bestDocFreq, termFreq });
			}

			return queue;
		}