상속: Lucene.Net.Util.PriorityQueue
예제 #1
0
        /// <summary> Suggest similar words (restricted or not to a field of a user index)</summary>
        /// <param name="word">String the word you want a spell check done on
        /// </param>
        /// <param name="num_sug">int the number of suggest words
        /// </param>
        /// <param name="ir">the indexReader of the user index (can be null see field param)
        /// </param>
        /// <param name="field">String the field of the user index: if field is not null, the suggested
        /// words are restricted to the words present in this field.
        /// </param>
        /// <param name="morePopular">boolean return only the suggest words that are more frequent than the searched word
        /// (only if restricted mode = (indexReader!=null and field!=null)
        /// </param>
        /// <throws>  IOException </throws>
        /// <returns> String[] the sorted list of the suggest words with this 2 criteria:
        /// first criteria: the edit distance, second criteria (only if restricted mode): the popularity
        /// of the suggest words in the field of the user index
        /// </returns>
        public virtual System.String[] SuggestSimilar(System.String word, int num_sug, IndexReader ir, System.String field, bool morePopular)
        {
            float            min = this.minScore;
            TRStringDistance sd  = new TRStringDistance(word);
            int lengthWord       = word.Length;

            int goalFreq = (morePopular && ir != null) ? ir.DocFreq(new Term(field, word)) : 0;

            if (!morePopular && goalFreq > 0)
            {
                return(new System.String[] { word }); // return the word if it exist in the index and i don't want a more popular word
            }

            BooleanQuery query = new BooleanQuery();

            System.String[] grams;
            System.String   key;

            for (int ng = GetMin(lengthWord); ng <= GetMax(lengthWord); ng++)
            {
                key = "gram" + ng;           // form key

                grams = FormGrams(word, ng); // form word into ngrams (allow dups too)

                if (grams.Length == 0)
                {
                    continue; // hmm
                }

                if (bStart > 0)
                {
                    // should we boost prefixes?
                    Add(query, "start" + ng, grams[0], bStart); // matches start of word
                }
                if (bEnd > 0)
                {
                    // should we boost suffixes
                    Add(query, "end" + ng, grams[grams.Length - 1], bEnd); // matches end of word
                }
                for (int i = 0; i < grams.Length; i++)
                {
                    Add(query, key, grams[i]);
                }
            }

            IndexSearcher    searcher = new IndexSearcher(this.spellindex);
            Hits             hits     = searcher.Search(query);
            SuggestWordQueue sugqueue = new SuggestWordQueue(num_sug);

            int         stop    = Math.Min(hits.Length(), 10 * num_sug); // go thru more than 'maxr' matches in case the distance filter triggers
            SuggestWord sugword = new SuggestWord();

            for (int i = 0; i < stop; i++)
            {
                sugword.string_Renamed = hits.Doc(i).Get(F_WORD); // get orig word)

                if (sugword.string_Renamed.Equals(word))
                {
                    continue; // don't suggest a word for itself, that would be silly
                }

                //edit distance/normalize with the min word length
                sugword.score = 1.0f - ((float)sd.GetDistance(sugword.string_Renamed) / System.Math.Min(sugword.string_Renamed.Length, lengthWord));
                if (sugword.score < min)
                {
                    continue;
                }

                if (ir != null)
                {
                    // use the user index
                    sugword.freq = ir.DocFreq(new Term(field, sugword.string_Renamed)); // freq in the index
                    if ((morePopular && goalFreq > sugword.freq) || sugword.freq < 1)
                    {
                        // don't suggest a word that is not present in the field
                        continue;
                    }
                }
                sugqueue.Insert(sugword);
                if (sugqueue.Size() == num_sug)
                {
                    //if queue full , maintain the min score
                    min = ((SuggestWord)sugqueue.Top()).score;
                }
                sugword = new SuggestWord();
            }

            // convert to array string
            System.String[] list = new System.String[sugqueue.Size()];
            for (int i = sugqueue.Size() - 1; i >= 0; i--)
            {
                list[i] = ((SuggestWord)sugqueue.Pop()).string_Renamed;
            }

            searcher.Close();
            return(list);
        }
예제 #2
0
        /// <summary> Suggest similar words (restricted or not to a field of a user index)</summary>
        /// <param name="word">String the word you want a spell check done on
        /// </param>
        /// <param name="numSug">int the number of suggest words
        /// </param>
        /// <param name="ir">the indexReader of the user index (can be null see field param)
        /// </param>
        /// <param name="field">String the field of the user index: if field is not null, the suggested
        /// words are restricted to the words present in this field.
        /// </param>
        /// <param name="morePopular">boolean return only the suggest words that are more frequent than the searched word
        /// (only if restricted mode = (indexReader!=null and field!=null)
        /// </param>
        /// <throws>  IOException </throws>
        /// <returns> String[] the sorted list of the suggest words with this 2 criteria:
        /// first criteria: the edit distance, second criteria (only if restricted mode): the popularity
        /// of the suggest words in the field of the user index
        /// </returns>
        public virtual System.String[] SuggestSimilar(System.String word, int numSug, IndexReader ir, System.String field, bool morePopular)
        {    // obtainSearcher calls ensureOpen
            IndexSearcher indexSearcher = ObtainSearcher();

            try
            {
                float min        = this.minScore;
                int   lengthWord = word.Length;

                int freq     = (ir != null && field != null) ? ir.DocFreq(new Term(field, word)) : 0;
                int goalFreq = (morePopular && ir != null && field != null) ? freq : 0;
                // if the word exists in the real index and we don't care for word frequency, return the word itself
                if (!morePopular && freq > 0)
                {
                    return(new String[] { word });
                }

                var      query = new BooleanQuery();
                String[] grams;
                String   key;

                var alreadySeen = new HashSet <string>();
                for (var ng = GetMin(lengthWord); ng <= GetMax(lengthWord); ng++)
                {
                    key = "gram" + ng;           // form key

                    grams = FormGrams(word, ng); // form word into ngrams (allow dups too)

                    if (grams.Length == 0)
                    {
                        continue; // hmm
                    }

                    if (bStart > 0)
                    {                                               // should we boost prefixes?
                        Add(query, "start" + ng, grams[0], bStart); // matches start of word
                    }
                    if (bEnd > 0)
                    {                                                          // should we boost suffixes
                        Add(query, "end" + ng, grams[grams.Length - 1], bEnd); // matches end of word
                    }
                    for (int i = 0; i < grams.Length; i++)
                    {
                        Add(query, key, grams[i]);
                    }
                }

                int maxHits = 10 * numSug;

                //    System.out.println("Q: " + query);
                ScoreDoc[] hits = indexSearcher.Search(query, null, maxHits).ScoreDocs;
                //    System.out.println("HITS: " + hits.length());
                SuggestWordQueue sugQueue = new SuggestWordQueue(numSug);

                // go thru more than 'maxr' matches in case the distance filter triggers
                int         stop    = Math.Min(hits.Length, maxHits);
                SuggestWord sugWord = new SuggestWord();
                for (int i = 0; i < stop; i++)
                {
                    sugWord.termString = indexSearcher.Doc(hits[i].Doc).Get(F_WORD); // get orig word

                    // don't suggest a word for itself, that would be silly
                    if (sugWord.termString.Equals(word))
                    {
                        continue;
                    }

                    // edit distance
                    sugWord.score = sd.GetDistance(word, sugWord.termString);
                    if (sugWord.score < min)
                    {
                        continue;
                    }

                    if (ir != null && field != null)
                    {                                                                   // use the user index
                        sugWord.freq = ir.DocFreq(new Term(field, sugWord.termString)); // freq in the index
                        // don't suggest a word that is not present in the field
                        if ((morePopular && goalFreq > sugWord.freq) || sugWord.freq < 1)
                        {
                            continue;
                        }
                    }

                    if (alreadySeen.Add(sugWord.termString) == false) // we already seen this word, no point returning it twice
                    {
                        continue;
                    }

                    sugQueue.InsertWithOverflow(sugWord);
                    if (sugQueue.Size() == numSug)
                    {
                        // if queue full, maintain the minScore score
                        min = ((SuggestWord)sugQueue.Top()).score;
                    }
                    sugWord = new SuggestWord();
                }

                // convert to array string
                String[] list = new String[sugQueue.Size()];
                for (int i = sugQueue.Size() - 1; i >= 0; i--)
                {
                    list[i] = ((SuggestWord)sugQueue.Pop()).termString;
                }

                return(list);
            }
            finally
            {
                ReleaseSearcher(indexSearcher);
            }
        }
예제 #3
0
		/// <summary> Suggest similar words (restricted or not to a field of a user index)</summary>
		/// <param name="word">String the word you want a spell check done on
		/// </param>
		/// <param name="numSug">int the number of suggest words
		/// </param>
		/// <param name="ir">the indexReader of the user index (can be null see field param)
		/// </param>
		/// <param name="field">String the field of the user index: if field is not null, the suggested
		/// words are restricted to the words present in this field.
		/// </param>
		/// <param name="morePopular">boolean return only the suggest words that are more frequent than the searched word
		/// (only if restricted mode = (indexReader!=null and field!=null)
		/// </param>
		/// <throws>  IOException </throws>
		/// <returns> String[] the sorted list of the suggest words with this 2 criteria:
		/// first criteria: the edit distance, second criteria (only if restricted mode): the popularity
		/// of the suggest words in the field of the user index
		/// </returns>
		public virtual System.String[] SuggestSimilar(System.String word, int numSug, IndexReader ir, System.String field, bool morePopular)
		{    // obtainSearcher calls ensureOpen
			IndexSearcher indexSearcher = ObtainSearcher();
			try
			{
				float min = this.minScore;
				int lengthWord = word.Length;

				int freq = (ir != null && field != null) ? ir.DocFreq(new Term(field, word)) : 0;
				int goalFreq = (morePopular && ir != null && field != null) ? freq : 0;
				// if the word exists in the real index and we don't care for word frequency, return the word itself
				if (!morePopular && freq > 0)
				{
					return new String[] { word };
				}

				var query = new BooleanQuery();
				String[] grams;
				String key;

				var alreadySeen = new HashSet<string>();
				for (var ng = GetMin(lengthWord); ng <= GetMax(lengthWord); ng++)
				{
					key = "gram" + ng; // form key

					grams = FormGrams(word, ng); // form word into ngrams (allow dups too)

					if (grams.Length == 0)
					{
						continue; // hmm
					}

					if (bStart > 0)
					{ // should we boost prefixes?
						Add(query, "start" + ng, grams[0], bStart); // matches start of word

					}
					if (bEnd > 0)
					{ // should we boost suffixes
						Add(query, "end" + ng, grams[grams.Length - 1], bEnd); // matches end of word

					}
					for (int i = 0; i < grams.Length; i++)
					{
						Add(query, key, grams[i]);
					}
				}

				int maxHits = 10 * numSug;

				//    System.out.println("Q: " + query);
				ScoreDoc[] hits = indexSearcher.Search(query, null, maxHits).ScoreDocs;
				//    System.out.println("HITS: " + hits.length());
				SuggestWordQueue sugQueue = new SuggestWordQueue(numSug);

				// go thru more than 'maxr' matches in case the distance filter triggers
				int stop = Math.Min(hits.Length, maxHits);
				SuggestWord sugWord = new SuggestWord();
				for (int i = 0; i < stop; i++)
				{
					sugWord.termString = indexSearcher.Doc(hits[i].Doc).Get(F_WORD); // get orig word

					// don't suggest a word for itself, that would be silly
					if (sugWord.termString.Equals(word,StringComparison.OrdinalIgnoreCase))
					{
						continue;
					}

					// edit distance
					sugWord.score = sd.GetDistance(word, sugWord.termString);
					if (sugWord.score < min)
					{
						continue;
					}

					if (ir != null && field != null)
					{ // use the user index
						sugWord.freq = ir.DocFreq(new Term(field, sugWord.termString)); // freq in the index
						// don't suggest a word that is not present in the field
						if ((morePopular && goalFreq > sugWord.freq) || sugWord.freq < 1)
						{
							continue;
						}
					}

					if (alreadySeen.Add(sugWord.termString) == false) // we already seen this word, no point returning it twice
						continue;

					sugQueue.InsertWithOverflow(sugWord);
					if (sugQueue.Size() == numSug)
					{
						// if queue full, maintain the minScore score
						min = ((SuggestWord)sugQueue.Top()).score;
					}
					sugWord = new SuggestWord();
				}

				// convert to array string
				String[] list = new String[sugQueue.Size()];
				for (int i = sugQueue.Size() - 1; i >= 0; i--)
				{
					list[i] = ((SuggestWord)sugQueue.Pop()).termString;
				}

				return list;
			}
			finally
			{
				ReleaseSearcher(indexSearcher);
			}

		}
예제 #4
0
        /// <summary> Suggest similar words (restricted or not to a field of a user index)</summary>
        /// <param name="word">String the word you want a spell check done on
        /// </param>
        /// <param name="num_sug">int the number of suggest words
        /// </param>
        /// <param name="ir">the indexReader of the user index (can be null see field param)
        /// </param>
        /// <param name="field">String the field of the user index: if field is not null, the suggested
        /// words are restricted to the words present in this field.
        /// </param>
        /// <param name="morePopular">boolean return only the suggest words that are more frequent than the searched word
        /// (only if restricted mode = (indexReader!=null and field!=null)
        /// </param>
        /// <throws>  IOException </throws>
        /// <returns> String[] the sorted list of the suggest words with this 2 criteria:
        /// first criteria: the edit distance, second criteria (only if restricted mode): the popularity
        /// of the suggest words in the field of the user index
        /// </returns>
        public virtual System.String[] SuggestSimilar(System.String word, int num_sug, IndexReader ir, System.String field, bool morePopular)
        {
            float min = this.minScore;
            TRStringDistance sd = new TRStringDistance(word);
            int lengthWord = word.Length;
			
            int goalFreq = (morePopular && ir != null) ? ir.DocFreq(new Term(field, word)) : 0;
            if (!morePopular && goalFreq > 0)
            {
                return new System.String[]{word}; // return the word if it exist in the index and i don't want a more popular word
            }
			
            BooleanQuery query = new BooleanQuery();
            System.String[] grams;
            System.String key;
			
            for (int ng = GetMin(lengthWord); ng <= GetMax(lengthWord); ng++)
            {
				
                key = "gram" + ng; // form key
				
                grams = FormGrams(word, ng); // form word into ngrams (allow dups too)
				
                if (grams.Length == 0)
                {
                    continue; // hmm
                }
				
                if (bStart > 0)
                {
                    // should we boost prefixes?
                    Add(query, "start" + ng, grams[0], bStart); // matches start of word
                }
                if (bEnd > 0)
                {
                    // should we boost suffixes
                    Add(query, "end" + ng, grams[grams.Length - 1], bEnd); // matches end of word
                }
                for (int i = 0; i < grams.Length; i++)
                {
                    Add(query, key, grams[i]);
                }
            }
			
            IndexSearcher searcher = new IndexSearcher(this.spellindex);
            Hits hits = searcher.Search(query);
            SuggestWordQueue sugqueue = new SuggestWordQueue(num_sug);
			
            int stop = Math.Min(hits.Length(), 10 * num_sug); // go thru more than 'maxr' matches in case the distance filter triggers
            SuggestWord sugword = new SuggestWord();
            for (int i = 0; i < stop; i++)
            {
				
                sugword.string_Renamed = hits.Doc(i).Get(F_WORD); // get orig word)
				
                if (sugword.string_Renamed.Equals(word))
                {
                    continue; // don't suggest a word for itself, that would be silly
                }
				
                //edit distance/normalize with the min word length
                sugword.score = 1.0f - ((float) sd.GetDistance(sugword.string_Renamed) / System.Math.Min(sugword.string_Renamed.Length, lengthWord));
                if (sugword.score < min)
                {
                    continue;
                }
				
                if (ir != null)
                {
                    // use the user index
                    sugword.freq = ir.DocFreq(new Term(field, sugword.string_Renamed)); // freq in the index
                    if ((morePopular && goalFreq > sugword.freq) || sugword.freq < 1)
                    {
                        // don't suggest a word that is not present in the field
                        continue;
                    }
                }
                sugqueue.Insert(sugword);
                if (sugqueue.Size() == num_sug)
                {
                    //if queue full , maintain the min score
                    min = ((SuggestWord) sugqueue.Top()).score;
                }
                sugword = new SuggestWord();
            }
			
            // convert to array string
            System.String[] list = new System.String[sugqueue.Size()];
            for (int i = sugqueue.Size() - 1; i >= 0; i--)
            {
                list[i] = ((SuggestWord) sugqueue.Pop()).string_Renamed;
            }
			
            searcher.Close();
            return list;
        }