private string[] QueryOverSingleWord <TDistance>(SuggestionField suggestionField, string word, SuggestionOptions options, TDistance sd)
            where TDistance : IStringDistance
        {
            var min         = options.Accuracy ?? SuggestionOptions.DefaultAccuracy;
            var field       = suggestionField.Name;
            var pageSize    = options.PageSize;
            var morePopular = options.SortMode == SuggestionSortMode.Popularity;

            int lengthWord = word.Length;

            var ir = _searcher.IndexReader;

            int freq     = (ir != null && field != null) ? ir.DocFreq(new Term(FWord, word), _state) : 0;
            int goalFreq = (morePopular && ir != null && field != null) ? freq : 0;

            // if the word exists in the real index and we don't care for word frequency, return the word itself
            if (!morePopular && freq > 0)
            {
                return(new[] { word });
            }

            var query = new BooleanQuery();

            var alreadySeen = new HashSet <string>();

            int ng  = GetMin(lengthWord);
            int max = ng + 1;

            var table = GramsTable;

            for (; ng <= max; ng++)
            {
                string[] grams = FormGrams(word, ng);

                if (grams.Length == 0)
                {
                    continue; // hmm
                }

                if (BoostStart > 0)
                {
                    // should we boost prefixes?
                    Add(query, table[ng].Start, grams[0], BoostStart); // matches start of word
                }

                if (BoostEnd > 0)
                {
                    // should we boost suffixes
                    Add(query, table[ng].End, grams[grams.Length - 1], BoostEnd); // matches end of word
                }

                for (int i = 0; i < grams.Length; i++)
                {
                    Add(query, table[ng].Gram, grams[i]);
                }
            }

            int maxHits = 10 * pageSize;

            //    System.out.println("Q: " + query);
            ScoreDoc[] hits = _searcher.Search(query, null, maxHits, _state).ScoreDocs;

            //    System.out.println("HITS: " + hits.length());
            var queue = new SuggestWordQueue(pageSize);

            // go thru more than 'maxr' matches in case the distance filter triggers
            int stop = Math.Min(hits.Length, maxHits);

            var suggestedWord = new SuggestWord();

            for (int i = 0; i < stop; i++)
            {
                suggestedWord.Term = _searcher.Doc(hits[i].Doc, _state).Get(FWord, _state); // get orig word

                // don't suggest a word for itself, that would be silly
                if (suggestedWord.Term.Equals(word, StringComparison.OrdinalIgnoreCase))
                {
                    continue;
                }

                // edit distance
                suggestedWord.Score = sd.GetDistance(word, suggestedWord.Term);
                if (suggestedWord.Score < min)
                {
                    continue;
                }

                if (ir != null && field != null)
                {
                    // use the user index
                    suggestedWord.Freq = _searcher.DocFreq(new Term(FWord, suggestedWord.Term), _state); // freq in the index

                    // don't suggest a word that is not present in the field
                    if ((morePopular && goalFreq > suggestedWord.Freq) || suggestedWord.Freq < 1)
                    {
                        continue;
                    }
                }

                if (alreadySeen.Add(suggestedWord.Term) == false) // we already seen this word, no point returning it twice
                {
                    continue;
                }

                queue.InsertWithOverflow(suggestedWord);
                if (queue.Size() == pageSize)
                {
                    // if queue full, maintain the minScore score
                    min = queue.Top().Score;
                }

                suggestedWord = new SuggestWord();
            }

            int size = queue.Size();

            if (size == 0)
            {
                return(EmptyArray);
            }

            // convert to array string
            string[] list = new string[size];
            for (int i = size - 1; i >= 0; i--)
            {
                list[i] = queue.Pop().Term;
            }

            return(list);
        }
Beispiel #2
0
        /// <summary>
        /// Suggest similar words (optionally restricted to a field of an index).
        /// 
        /// <para>As the Lucene similarity that is used to fetch the most relevant n-grammed terms
        /// is not the same as the edit distance strategy used to calculate the best
        /// matching spell-checked word from the hits that Lucene found, one usually has
        /// to retrieve a couple of numSug's in order to get the true best match.
        /// 
        /// </para>
        /// <para>I.e. if numSug == 1, don't count on that suggestion being the best one.
        /// Thus, you should set this value to <b>at least</b> 5 for a good suggestion.
        /// 
        /// </para>
        /// </summary>
        /// <param name="word"> the word you want a spell check done on </param>
        /// <param name="numSug"> the number of suggested words </param>
        /// <param name="ir"> the indexReader of the user index (can be null see field param) </param>
        /// <param name="field"> the field of the user index: if field is not null, the suggested
        /// words are restricted to the words present in this field. </param>
        /// <param name="suggestMode"> 
        /// (NOTE: if indexReader==null and/or field==null, then this is overridden with SuggestMode.SUGGEST_ALWAYS) </param>
        /// <param name="accuracy"> The minimum score a suggestion must have in order to qualify for inclusion in the results </param>
        /// <exception cref="IOException"> if the underlying index throws an <seealso cref="IOException"/> </exception>
        /// <exception cref="AlreadyClosedException"> if the Spellchecker is already closed </exception>
        /// <returns> String[] the sorted list of the suggest words with these 2 criteria:
        /// first criteria: the edit distance, second criteria (only if restricted mode): the popularity
        /// of the suggest words in the field of the user index
        ///  </returns>
        public virtual string[] SuggestSimilar(string word, int numSug, IndexReader ir, string field, SuggestMode suggestMode, float accuracy)
        {
            // obtainSearcher calls ensureOpen
            IndexSearcher indexSearcher = ObtainSearcher();
            try
            {
                if (ir == null || field == null)
                {
                    suggestMode = SuggestMode.SUGGEST_ALWAYS;
                }
                if (suggestMode == SuggestMode.SUGGEST_ALWAYS)
                {
                    ir = null;
                    field = null;
                }

                int lengthWord = word.Length;

                int freq = (ir != null && field != null) ? ir.DocFreq(new Term(field, word)) : 0;
                int goalFreq = suggestMode == SuggestMode.SUGGEST_MORE_POPULAR ? freq : 0;
                // if the word exists in the real index and we don't care for word frequency, return the word itself
                if (suggestMode == SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX && freq > 0)
                {
                    return new string[] { word };
                }

                BooleanQuery query = new BooleanQuery();
                string[] grams;
                string key;

                for (int ng = GetMin(lengthWord); ng <= GetMax(lengthWord); ng++)
                {

                    key = "gram" + ng; // form key

                    grams = FormGrams(word, ng); // form word into ngrams (allow dups too)

                    if (grams.Length == 0)
                    {
                        continue; // hmm
                    }

                    if (bStart > 0) // should we boost prefixes?
                    {
                        Add(query, "start" + ng, grams[0], bStart); // matches start of word

                    }
                    if (bEnd > 0) // should we boost suffixes
                    {
                        Add(query, "end" + ng, grams[grams.Length - 1], bEnd); // matches end of word

                    }
                    for (int i = 0; i < grams.Length; i++)
                    {
                        Add(query, key, grams[i]);
                    }
                }

                int maxHits = 10 * numSug;

                //    System.out.println("Q: " + query);
                ScoreDoc[] hits = indexSearcher.Search(query, null, maxHits).ScoreDocs;
                //    System.out.println("HITS: " + hits.length());
                SuggestWordQueue sugQueue = new SuggestWordQueue(numSug, comparator);

                // go thru more than 'maxr' matches in case the distance filter triggers
                int stop = Math.Min(hits.Length, maxHits);
                SuggestWord sugWord = new SuggestWord();
                for (int i = 0; i < stop; i++)
                {

                    sugWord.@string = indexSearcher.Doc(hits[i].Doc).Get(F_WORD); // get orig word

                    // don't suggest a word for itself, that would be silly
                    if ([email protected](word))
                    {
                        continue;
                    }

                    // edit distance
                    sugWord.score = sd.GetDistance(word, sugWord.@string);
                    if (sugWord.score < accuracy)
                    {
                        continue;
                    }

                    if (ir != null && field != null) // use the user index
                    {
                        sugWord.freq = ir.DocFreq(new Term(field, sugWord.@string)); // freq in the index
                        // don't suggest a word that is not present in the field
                        if ((suggestMode == SuggestMode.SUGGEST_MORE_POPULAR && goalFreq > sugWord.freq) || sugWord.freq < 1)
                        {
                            continue;
                        }
                    }
                    sugQueue.InsertWithOverflow(sugWord);
                    if (sugQueue.Size() == numSug)
                    {
                        // if queue full, maintain the minScore score
                        accuracy = sugQueue.Top().score;
                    }
                    sugWord = new SuggestWord();
                }

                // convert to array string
                string[] list = new string[sugQueue.Size()];
                for (int i = sugQueue.Size() - 1; i >= 0; i--)
                {
                    list[i] = sugQueue.Pop().@string;
                }

                return list;
            }
            finally
            {
                ReleaseSearcher(indexSearcher);
            }
        }