/// <summary>
/// Converts a <see cref="SuggestMode"/> member into its lowercase string form.
/// </summary>
/// <param name="enumValue">The enum member to convert.</param>
/// <returns><c>"missing"</c>, <c>"popular"</c> or <c>"always"</c>.</returns>
/// <exception cref="ArgumentException">
/// Thrown when <paramref name="enumValue"/> is not one of the three handled members.
/// </exception>
public static string GetStringValue(this SuggestMode enumValue)
{
    if (enumValue == SuggestMode.Missing)
    {
        return "missing";
    }

    if (enumValue == SuggestMode.Popular)
    {
        return "popular";
    }

    if (enumValue == SuggestMode.Always)
    {
        return "always";
    }

    // Anything else is a value this mapping does not know about.
    throw new ArgumentException($"'{enumValue.ToString()}' is not a valid value for enum 'SuggestMode'");
}
/// <summary>
/// Generate suggestions by breaking the passed-in term into multiple words.
/// The scores returned are equal to the number of word breaks needed so a
/// lower score is generally preferred over a higher score.
/// <para>
/// An empty result is returned when <paramref name="maxSuggestions"/> is below 1, or when
/// <paramref name="suggestMode"/> is <see cref="SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX"/> and
/// the term already has a non-zero document frequency in <paramref name="ir"/>.
/// </para>
/// </summary>
/// <param name="term"> the term to break up </param>
/// <param name="maxSuggestions"> upper bound passed on to the break-up generator </param>
/// <param name="ir"> reader used to look up document frequencies </param>
/// <param name="suggestMode">
///          - default = <see cref="SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX"/> </param>
/// <param name="sortMethod">
///          - default = <see cref="BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY"/> </param>
/// <returns> one or more arrays of words formed by breaking up the original term </returns>
/// <exception cref="IOException"> If there is a low-level I/O error. </exception>
public virtual SuggestWord[][] SuggestWordBreaks(Term term, int maxSuggestions, IndexReader ir, SuggestMode suggestMode = SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX, BreakSuggestionSortMethod sortMethod = BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY)
{
    if (maxSuggestions < 1)
    {
        return Arrays.Empty<SuggestWord[]>();
    }

    // The queue never starts with more than ten slots.
    int initialCapacity = maxSuggestions;
    if (initialCapacity > 10)
    {
        initialCapacity = 10;
    }

    IComparer<SuggestWordArrayWrapper> ordering;
    if (sortMethod == BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY)
    {
        ordering = new LengthThenMaxFreqComparer();
    }
    else
    {
        ordering = new LengthThenSumFreqComparer();
    }

    var pending = new JCG.PriorityQueue<SuggestWordArrayWrapper>(initialCapacity, ordering);

    int termFreq = ir.DocFreq(term);
    if (suggestMode == SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX && termFreq > 0)
    {
        return Arrays.Empty<SuggestWord[]>();
    }

    // In "more popular" mode the original term's frequency (at least 1) becomes the bar.
    int requiredFreq = minSuggestionFrequency;
    if (suggestMode == SuggestMode.SUGGEST_MORE_POPULAR)
    {
        requiredFreq = termFreq == 0 ? 1 : termFreq;
    }

    GenerateBreakUpSuggestions(term, ir, 1, maxSuggestions, requiredFreq, Arrays.Empty<SuggestWord>(), pending, 0, sortMethod);

    // Drain the queue, filling the result array from the last slot to the first.
    var result = new SuggestWord[pending.Count][];
    int slot = result.Length;
    while (slot > 0)
    {
        slot--;
        result[slot] = pending.Dequeue().SuggestWords;
    }

    return result;
}
/// <summary>
/// Suggest similar words.
///
/// <para>
/// Unlike <see cref="SpellChecker"/>, the similarity used to fetch the most
/// relevant terms is an edit distance, therefore typically a low value
/// for numSug will work very well.
/// </para>
/// <para>
/// An empty array is returned without searching when the term text has fewer code points
/// than <c>minQueryLength</c>, when <paramref name="suggestMode"/> is
/// <see cref="SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX"/> and the term occurs in the index,
/// or when the term's document frequency exceeds the <c>maxQueryFrequency</c> limit.
/// </para>
/// </summary>
/// <param name="term"> Term you want to spell check on </param>
/// <param name="numSug"> the maximum number of suggested words </param>
/// <param name="ir"> IndexReader to find terms from </param>
/// <param name="suggestMode"> specifies when to return suggested words </param>
/// <param name="accuracy"> return only suggested words that match with this similarity </param>
/// <returns> sorted list of the suggested words according to the comparer </returns>
/// <exception cref="System.IO.IOException"> If there is a low-level I/O error. </exception>
public virtual SuggestWord[] SuggestSimilar(Term term, int numSug, IndexReader ir, SuggestMode suggestMode, float accuracy)
{
    CharsRef spare = new CharsRef();
    string text = term.Text();
    if (minQueryLength > 0 && text.CodePointCount(0, text.Length) < minQueryLength)
    {
        return new SuggestWord[0];
    }

    if (lowerCaseTerms)
    {
        // Lowercase culture-invariantly: the result is an index term, so it must not
        // depend on the current thread culture (e.g. Turkish dotless-i casing).
        term = new Term(term.Field, text.ToLowerInvariant());
    }

    int docfreq = ir.DocFreq(term);

    if (suggestMode == SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX && docfreq > 0)
    {
        return new SuggestWord[0];
    }

    int maxDoc = ir.MaxDoc;

    // maxQueryFrequency >= 1 is compared directly against the document frequency;
    // the fallback branch compares against maxQueryFrequency scaled by maxDoc.
    if (maxQueryFrequency >= 1f && docfreq > maxQueryFrequency)
    {
        return new SuggestWord[0];
    }
    else if (docfreq > (int)Math.Ceiling(maxQueryFrequency * maxDoc))
    {
        return new SuggestWord[0];
    }

    // Only "more popular" mode keeps the original term's frequency as a lower bound.
    if (suggestMode != SuggestMode.SUGGEST_MORE_POPULAR)
    {
        docfreq = 0;
    }

    // thresholdFrequency >= 1 is used as-is; a value in (0, 1) is scaled by maxDoc.
    if (thresholdFrequency >= 1f)
    {
        docfreq = Math.Max(docfreq, (int)thresholdFrequency);
    }
    else if (thresholdFrequency > 0f)
    {
        docfreq = Math.Max(docfreq, (int)(thresholdFrequency * maxDoc) - 1);
    }

    int inspections = numSug * maxInspections;

    // try ed=1 first, in case we get lucky
    IEnumerable<ScoreTerm> terms = SuggestSimilar(term, inspections, ir, docfreq, 1, accuracy, spare);
    if (maxEdits > 1 && terms.Count() < inspections)
    {
        // Not enough candidates at one edit: merge in the candidates found at maxEdits.
        var moreTerms = new HashSet<ScoreTerm>();
        moreTerms.AddAll(terms);
        moreTerms.AddAll(SuggestSimilar(term, inspections, ir, docfreq, maxEdits, accuracy, spare));
        terms = moreTerms;
    }

    // create the suggestword response, sort it, and trim it to size.
    var suggestions = new SuggestWord[terms.Count()];
    int index = suggestions.Length - 1;
    foreach (ScoreTerm s in terms)
    {
        SuggestWord suggestion = new SuggestWord();
        if (s.TermAsString == null)
        {
            // Decode the term bytes once and cache the string on the ScoreTerm.
            UnicodeUtil.UTF8toUTF16(s.Term, spare);
            s.TermAsString = spare.ToString();
        }
        suggestion.String = s.TermAsString;
        suggestion.Score = s.Score;
        suggestion.Freq = s.Docfreq;
        suggestions[index--] = suggestion;
    }

    ArrayUtil.TimSort(suggestions, Collections.ReverseOrder(comparer));
    if (numSug < suggestions.Length)
    {
        SuggestWord[] trimmed = new SuggestWord[numSug];
        Array.Copy(suggestions, 0, trimmed, 0, numSug);
        suggestions = trimmed;
    }

    return suggestions;
}
/// <summary>
/// Convenience overload: forwards to
/// <see cref="SuggestSimilar(Term, int, IndexReader, SuggestMode, float)"/>,
/// passing this instance's <c>accuracy</c> field as the accuracy argument.
/// </summary>
/// <param name="term"> passed through unchanged </param>
/// <param name="numSug"> passed through unchanged </param>
/// <param name="ir"> passed through unchanged </param>
/// <param name="suggestMode"> passed through unchanged </param>
/// <returns> whatever the five-argument overload returns </returns>
public virtual SuggestWord[] SuggestSimilar(Term term, int numSug, IndexReader ir, SuggestMode suggestMode)
    => SuggestSimilar(term, numSug, ir, suggestMode, accuracy);
/// <summary>
/// Stores the lowercase name of <paramref name="mode"/> in <c>_SuggestMode</c>.
/// </summary>
/// <param name="mode">The suggest mode whose enum member name is stored.</param>
/// <returns>This descriptor, so calls can be chained.</returns>
public DirectGeneratorDescriptor<T> SuggestMode(SuggestMode mode)
{
    // Invariant lowercasing: the stored name must not vary with the current thread
    // culture (under tr-TR, ToLower() would turn 'I' into a dotless 'ı').
    this._SuggestMode = Enum.GetName(typeof(SuggestMode), mode).ToLowerInvariant();
    return this;
}
/// <summary>
/// <para>
/// Generate suggestions by combining one or more of the passed-in terms into
/// single words. The returned <see cref="CombineSuggestion"/> contains both a
/// <see cref="SuggestWord"/> and also an array detailing which passed-in terms were
/// involved in creating this combination. The scores returned are equal to the
/// number of word combinations needed, also one less than the length of the
/// array <see cref="CombineSuggestion.OriginalTermIndexes"/>. Generally, a
/// suggestion with a lower score is preferred over a higher score.
/// </para>
/// <para>
/// To prevent two adjacent terms from being combined (for instance, if one is
/// mandatory and the other is prohibited), separate the two terms with
/// <see cref="WordBreakSpellChecker.SEPARATOR_TERM"/>
/// </para>
/// <para>
/// When suggestMode equals <see cref="SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX"/>, each
/// suggestion will include at least one term not in the index.
/// </para>
/// <para>
/// When suggestMode equals <see cref="SuggestMode.SUGGEST_MORE_POPULAR"/>, each
/// suggestion will have the same, or better frequency than the most-popular
/// included term.
/// </para>
/// </summary>
/// <param name="terms"> the terms to try combining, in order </param>
/// <param name="maxSuggestions"> the queue is trimmed whenever it holds more than this many entries </param>
/// <param name="ir"> reader used to look up document frequencies </param>
/// <param name="suggestMode"> controls which combinations qualify (see above) </param>
/// <returns> an array of words generated by combining original terms </returns>
/// <exception cref="IOException"> If there is a low-level I/O error. </exception>
public virtual CombineSuggestion[] SuggestWordCombinations(Term[] terms, int maxSuggestions, IndexReader ir, SuggestMode suggestMode = SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX)
{
    if (maxSuggestions < 1)
    {
        return Arrays.Empty<CombineSuggestion>();
    }

    // Per-term frequencies are only looked up when the mode actually filters on them.
    int[] termFreqs = null;
    if (suggestMode != SuggestMode.SUGGEST_ALWAYS)
    {
        termFreqs = new int[terms.Length];
        for (int t = 0; t < terms.Length; t++)
        {
            termFreqs[t] = ir.DocFreq(terms[t]);
        }
    }

    int initialCapacity = maxSuggestions;
    if (initialCapacity > 10)
    {
        initialCapacity = 10;
    }

    IComparer<CombineSuggestionWrapper> ordering = new CombinationsThenFreqComparer();
    var best = new JCG.PriorityQueue<CombineSuggestionWrapper>(initialCapacity, ordering);

    int evaluations = 0;
    for (int start = 0; start < terms.Length - 1; start++)
    {
        if (terms[start].Equals(SEPARATOR_TERM))
        {
            continue;
        }

        string leftText = terms[start].Text;
        int leftLength = leftText.CodePointCount(0, leftText.Length);
        if (leftLength > maxCombineWordLength)
        {
            continue;
        }

        // Running max/min frequency over the terms merged so far.
        int highestFreq = 0;
        int lowestFreq = int.MaxValue;
        if (termFreqs != null)
        {
            highestFreq = termFreqs[start];
            lowestFreq = termFreqs[start];
        }

        string joinedText = leftText;
        int joinedLength = leftLength;
        for (int end = start + 1; end < terms.Length && end - start <= maxChanges; end++)
        {
            if (terms[end].Equals(SEPARATOR_TERM))
            {
                break;
            }

            string rightText = terms[end].Text;
            int rightLength = rightText.CodePointCount(0, rightText.Length);
            joinedText += rightText;
            joinedLength += rightLength;
            if (joinedLength > maxCombineWordLength)
            {
                break;
            }

            if (termFreqs != null)
            {
                highestFreq = Math.Max(highestFreq, termFreqs[end]);
                lowestFreq = Math.Min(lowestFreq, termFreqs[end]);
            }

            Term joinedTerm = new Term(terms[0].Field, joinedText);
            int joinedFreq = ir.DocFreq(joinedTerm);

            bool passesPopularity = suggestMode != SuggestMode.SUGGEST_MORE_POPULAR || joinedFreq >= highestFreq;
            bool passesNotInIndex = suggestMode != SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX || lowestFreq == 0;
            if (passesPopularity && passesNotInIndex && joinedFreq >= minSuggestionFrequency)
            {
                // Indexes of the consecutive source terms start..end.
                int[] sourceIndexes = new int[end - start + 1];
                for (int k = 0; k < sourceIndexes.Length; k++)
                {
                    sourceIndexes[k] = start + k;
                }

                SuggestWord word = new SuggestWord
                {
                    Freq = joinedFreq,
                    Score = sourceIndexes.Length - 1,
                    String = joinedTerm.Text
                };

                best.Enqueue(new CombineSuggestionWrapper(new CombineSuggestion(word, sourceIndexes), sourceIndexes.Length - 1));
                if (best.Count > maxSuggestions)
                {
                    best.TryDequeue(out CombineSuggestionWrapper _);
                }
            }

            // Counted whether or not the combination qualified; hitting the cap
            // ends the current inner loop only.
            evaluations++;
            if (evaluations == maxEvaluations)
            {
                break;
            }
        }
    }

    // Drain the queue, filling the result array from the last slot to the first.
    var result = new CombineSuggestion[best.Count];
    int slot = result.Length;
    while (slot > 0)
    {
        slot--;
        result[slot] = best.Dequeue().CombineSuggestion;
    }

    return result;
}
/// <summary>
/// Suggest similar words (optionally restricted to a field of an index).
///
/// <para>As the Lucene similarity that is used to fetch the most relevant n-grammed terms
/// is not the same as the edit distance strategy used to calculate the best
/// matching spell-checked word from the hits that Lucene found, one usually has
/// to retrieve a couple of numSug's in order to get the true best match.
/// </para>
/// <para>I.e. if numSug == 1, don't count on that suggestion being the best one.
/// Thus, you should set this value to <b>at least</b> 5 for a good suggestion.
/// </para>
/// </summary>
/// <param name="word"> the word you want a spell check done on </param>
/// <param name="numSug"> the number of suggested words </param>
/// <param name="ir"> the indexReader of the user index (can be null see field param) </param>
/// <param name="field"> the field of the user index: if field is not null, the suggested
/// words are restricted to the words present in this field. </param>
/// <param name="suggestMode">
/// (NOTE: if indexReader==null and/or field==null, then this is overridden with SuggestMode.SUGGEST_ALWAYS) </param>
/// <param name="accuracy"> The minimum score a suggestion must have in order to qualify for inclusion in the results </param>
/// <exception cref="IOException"> if the underlying index throws an <see cref="IOException"/> </exception>
/// <exception cref="AlreadyClosedException"> if the Spellchecker is already closed </exception>
/// <returns> String[] the sorted list of the suggest words with these 2 criteria:
/// first criteria: the edit distance, second criteria (only if restricted mode): the popularity
/// of the suggest words in the field of the user index
/// </returns>
public virtual string[] SuggestSimilar(string word, int numSug, IndexReader ir, string field, SuggestMode suggestMode, float accuracy)
{
    // The searcher is handed back in the finally block on every exit path.
    IndexSearcher indexSearcher = ObtainSearcher();
    try
    {
        if (ir == null || field == null)
        {
            suggestMode = SuggestMode.SUGGEST_ALWAYS;
        }
        if (suggestMode == SuggestMode.SUGGEST_ALWAYS)
        {
            // Dropping both disables every user-index lookup below.
            ir = null;
            field = null;
        }

        int lengthWord = word.Length;

        int freq = (ir != null && field != null) ? ir.DocFreq(new Term(field, word)) : 0;
        int goalFreq = suggestMode == SuggestMode.SUGGEST_MORE_POPULAR ? freq : 0;
        // if the word exists in the real index and we don't care for word frequency, return the word itself
        if (suggestMode == SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX && freq > 0)
        {
            return new string[] { word };
        }

        BooleanQuery query = new BooleanQuery();
        string[] grams;
        string key;

        for (int ng = GetMin(lengthWord); ng <= GetMax(lengthWord); ng++)
        {
            key = "gram" + ng; // form key

            grams = FormGrams(word, ng); // form word into ngrams (allow dups too)

            if (grams.Length == 0)
            {
                continue; // no grams of this size for the word
            }

            if (bStart > 0) // should we boost prefixes?
            {
                Add(query, "start" + ng, grams[0], bStart); // matches start of word
            }
            if (bEnd > 0) // should we boost suffixes
            {
                Add(query, "end" + ng, grams[grams.Length - 1], bEnd); // matches end of word
            }
            for (int i = 0; i < grams.Length; i++)
            {
                Add(query, key, grams[i]);
            }
        }

        // Fetch more hits than requested because the distance filter below discards some.
        int maxHits = 10 * numSug;
        ScoreDoc[] hits = indexSearcher.Search(query, null, maxHits).ScoreDocs;
        SuggestWordQueue sugQueue = new SuggestWordQueue(numSug, comparator);

        int stop = Math.Min(hits.Length, maxHits);
        SuggestWord sugWord = new SuggestWord();
        for (int i = 0; i < stop; i++)
        {
            sugWord.@string = indexSearcher.Doc(hits[i].Doc).Get(F_WORD); // get orig word

            // don't suggest a word for itself, that would be silly
            if (sugWord.@string.Equals(word))
            {
                continue;
            }

            // edit distance
            sugWord.score = sd.GetDistance(word, sugWord.@string);
            if (sugWord.score < accuracy)
            {
                continue;
            }

            if (ir != null && field != null) // use the user index
            {
                sugWord.freq = ir.DocFreq(new Term(field, sugWord.@string)); // freq in the index
                // don't suggest a word that is not present in the field
                if ((suggestMode == SuggestMode.SUGGEST_MORE_POPULAR && goalFreq > sugWord.freq) || sugWord.freq < 1)
                {
                    continue;
                }
            }

            sugQueue.InsertWithOverflow(sugWord);
            if (sugQueue.Size() == numSug)
            {
                // if queue full, maintain the minScore score
                accuracy = sugQueue.Top().score;
            }
            // The queued instance is now owned by the queue; start a fresh one.
            sugWord = new SuggestWord();
        }

        // convert to array string, filling from the last slot to the first
        string[] list = new string[sugQueue.Size()];
        for (int i = sugQueue.Size() - 1; i >= 0; i--)
        {
            list[i] = sugQueue.Pop().@string;
        }

        return list;
    }
    finally
    {
        ReleaseSearcher(indexSearcher);
    }
}
/// <summary>
/// Stores the culture-invariant lowercase name of <paramref name="mode"/> in <c>_SuggestMode</c>.
/// </summary>
/// <param name="mode">The suggest mode whose enum member name is stored.</param>
/// <returns>This descriptor, so calls can be chained.</returns>
public TermSuggestDescriptor<T> SuggestMode(SuggestMode mode)
{
    string memberName = Enum.GetName(typeof(SuggestMode), mode);
    _SuggestMode = memberName.ToLowerInvariant();
    return this;
}
/// <summary>
/// Assigns <paramref name="mode"/> to <c>Self.SuggestMode</c>.
/// </summary>
/// <param name="mode">The suggest mode to assign.</param>
/// <returns>This descriptor, so calls can be chained.</returns>
public DirectGeneratorDescriptor<T> SuggestMode(SuggestMode mode)
{
    var target = Self;
    target.SuggestMode = mode;
    return this;
}
/// <summary>
/// Convenience overload: forwards to
/// <see cref="SuggestSimilar(string, int, IndexReader, string, SuggestMode, float)"/>,
/// passing this instance's <c>accuracy</c> field as the accuracy argument.
/// </summary>
/// <param name="word"> passed through unchanged </param>
/// <param name="numSug"> passed through unchanged </param>
/// <param name="ir"> passed through unchanged </param>
/// <param name="field"> passed through unchanged </param>
/// <param name="suggestMode"> passed through unchanged </param>
/// <returns> whatever the six-argument overload returns </returns>
public virtual string[] SuggestSimilar(string word, int numSug, IndexReader ir, string field, SuggestMode suggestMode)
    => SuggestSimilar(word, numSug, ir, field, suggestMode, accuracy);
/// <summary>
/// <para>
/// Generate suggestions by breaking the passed-in term into multiple words.
/// The scores returned are equal to the number of word breaks needed so a
/// lower score is generally preferred over a higher score.
/// </para>
/// <para>
/// An empty array is returned when <paramref name="maxSuggestions"/> is below 1, or when
/// the mode is <see cref="SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX"/> and the term has a
/// non-zero document frequency in <paramref name="ir"/>.
/// </para>
/// </summary>
/// <param name="term"> the term to break up </param>
/// <param name="maxSuggestions"> upper bound passed on to the break-up generator </param>
/// <param name="ir"> reader used to look up document frequencies </param>
/// <param name="suggestMode">
///          - default = <see cref="SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX"/> </param>
/// <param name="sortMethod">
///          - default =
///          <see cref="BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY"/> </param>
/// <returns> one or more arrays of words formed by breaking up the original term </returns>
/// <exception cref="IOException"> If there is a low-level I/O error. </exception>
public virtual SuggestWord[][] SuggestWordBreaks(Term term, int maxSuggestions, IndexReader ir, SuggestMode suggestMode, BreakSuggestionSortMethod sortMethod)
{
    if (maxSuggestions < 1)
    {
        return(new SuggestWord[0][]);
    }
    // NOTE(review): these null checks only have an effect if SuggestMode and
    // BreakSuggestionSortMethod are reference or nullable types in this port; if they
    // are plain enums the branches can never run — confirm against their declarations.
    if (suggestMode == null)
    {
        suggestMode = SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX;
    }
    if (sortMethod == null)
    {
        sortMethod = BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY;
    }
    // The queue never starts with more than ten slots.
    int queueInitialCapacity = maxSuggestions > 10 ? 10 : maxSuggestions;
    IComparer <SuggestWordArrayWrapper> queueComparator = sortMethod == BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY ? new LengthThenMaxFreqComparator(this) : new LengthThenSumFreqComparator(this);
    // NOTE(review): the variable is declared as LinkedList<> but initialised with a
    // PriorityQueue<>; this presumably relies on project-specific types — verify it
    // compiles (System.Collections.Generic.LinkedList<T> is unrelated to any priority queue).
    LinkedList <SuggestWordArrayWrapper> suggestions = new PriorityQueue <SuggestWordArrayWrapper>(queueInitialCapacity, queueComparator);

    int origFreq = ir.DocFreq(term);
    if (origFreq > 0 && suggestMode == SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX)
    {
        return(new SuggestWord[0][]);
    }

    // In "more popular" mode the original term's frequency (at least 1) becomes the bar.
    int useMinSuggestionFrequency = minSuggestionFrequency;
    if (suggestMode == SuggestMode.SUGGEST_MORE_POPULAR)
    {
        useMinSuggestionFrequency = (origFreq == 0 ? 1 : origFreq);
    }

    GenerateBreakUpSuggestions(term, ir, 1, maxSuggestions, useMinSuggestionFrequency, new SuggestWord[0], suggestions, 0, sortMethod);

    // Drain the collection, filling the result array from the last slot to the first.
    SuggestWord[][] suggestionArray = new SuggestWord[suggestions.Count][];
    for (int i = suggestions.Count - 1; i >= 0; i--)
    {
        // NOTE(review): RemoveFirst() must return the removed element for this to work;
        // the BCL LinkedList<T>.RemoveFirst() returns void — confirm which type is in use.
        suggestionArray[i] = suggestions.RemoveFirst().SuggestWords;
    }

    return(suggestionArray);
}
/// <summary>
/// Assigns <paramref name="mode"/> to <c>Self.SuggestMode</c>.
/// </summary>
/// <param name="mode">The suggest mode to assign.</param>
/// <returns>This descriptor, so calls can be chained.</returns>
public TermSuggestDescriptor<T> SuggestMode(SuggestMode mode)
{
    var target = Self;
    target.SuggestMode = mode;
    return this;
}
/// <summary>
/// Passes <c>Assign</c> a callback that sets <c>SuggestMode</c> to <paramref name="mode"/>
/// on the callback's argument.
/// </summary>
/// <param name="mode">The suggest mode to assign.</param>
/// <returns>The value returned by <c>Assign</c>.</returns>
public TermSuggesterDescriptor<T> SuggestMode(SuggestMode mode)
{
    return Assign(a => a.SuggestMode = mode);
}
/// <summary>
/// Suggest similar words (optionally restricted to a field of an index).
///
/// <para>As the Lucene similarity that is used to fetch the most relevant n-grammed terms
/// is not the same as the edit distance strategy used to calculate the best
/// matching spell-checked word from the hits that Lucene found, one usually has
/// to retrieve a couple of numSug's in order to get the true best match.
/// </para>
/// <para>I.e. if numSug == 1, don't count on that suggestion being the best one.
/// Thus, you should set this value to <b>at least</b> 5 for a good suggestion.
/// </para>
/// </summary>
/// <param name="word"> the word you want a spell check done on </param>
/// <param name="numSug"> the number of suggested words </param>
/// <param name="ir"> the indexReader of the user index (can be null see field param) </param>
/// <param name="field"> the field of the user index: if field is not null, the suggested
/// words are restricted to the words present in this field. </param>
/// <param name="suggestMode">
/// (NOTE: if indexReader==null and/or field==null, then this is overridden with SuggestMode.SUGGEST_ALWAYS) </param>
/// <param name="accuracy"> The minimum score a suggestion must have in order to qualify for inclusion in the results </param>
/// <exception cref="IOException"> if the underlying index throws an <see cref="IOException"/> </exception>
/// <exception cref="AlreadyClosedException"> if the Spellchecker is already closed </exception>
/// <returns> String[] the sorted list of the suggest words with these 2 criteria:
/// first criteria: the edit distance, second criteria (only if restricted mode): the popularity
/// of the suggest words in the field of the user index
/// </returns>
public virtual string[] SuggestSimilar(string word, int numSug, IndexReader ir, string field, SuggestMode suggestMode, float accuracy)
{
    // The searcher is handed back in the finally block on every exit path.
    IndexSearcher indexSearcher = ObtainSearcher();
    try
    {
        if (ir == null || field == null)
        {
            suggestMode = SuggestMode.SUGGEST_ALWAYS;
        }
        if (suggestMode == SuggestMode.SUGGEST_ALWAYS)
        {
            // Dropping both disables every user-index lookup below.
            ir = null;
            field = null;
        }

        int lengthWord = word.Length;

        int freq = (ir != null && field != null) ? ir.DocFreq(new Term(field, word)) : 0;
        int goalFreq = suggestMode == SuggestMode.SUGGEST_MORE_POPULAR ? freq : 0;
        // if the word exists in the real index and we don't care for word frequency, return the word itself
        if (suggestMode == SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX && freq > 0)
        {
            return new string[] { word };
        }

        BooleanQuery query = new BooleanQuery();
        string[] grams;
        string key;

        for (int ng = GetMin(lengthWord); ng <= GetMax(lengthWord); ng++)
        {
            key = "gram" + ng; // form key

            grams = FormGrams(word, ng); // form word into ngrams (allow dups too)

            if (grams.Length == 0)
            {
                continue; // no grams of this size for the word
            }

            if (bStart > 0) // should we boost prefixes?
            {
                Add(query, "start" + ng, grams[0], bStart); // matches start of word
            }
            if (bEnd > 0) // should we boost suffixes
            {
                Add(query, "end" + ng, grams[grams.Length - 1], bEnd); // matches end of word
            }
            for (int i = 0; i < grams.Length; i++)
            {
                Add(query, key, grams[i]);
            }
        }

        // Fetch more hits than requested because the distance filter below discards some.
        int maxHits = 10 * numSug;
        ScoreDoc[] hits = indexSearcher.Search(query, null, maxHits).ScoreDocs;
        SuggestWordQueue sugQueue = new SuggestWordQueue(numSug, comparator);

        int stop = Math.Min(hits.Length, maxHits);
        SuggestWord sugWord = new SuggestWord();
        for (int i = 0; i < stop; i++)
        {
            sugWord.@string = indexSearcher.Doc(hits[i].Doc).Get(F_WORD); // get orig word

            // don't suggest a word for itself, that would be silly
            if (sugWord.@string.Equals(word))
            {
                continue;
            }

            // edit distance
            sugWord.score = sd.GetDistance(word, sugWord.@string);
            if (sugWord.score < accuracy)
            {
                continue;
            }

            if (ir != null && field != null) // use the user index
            {
                sugWord.freq = ir.DocFreq(new Term(field, sugWord.@string)); // freq in the index
                // don't suggest a word that is not present in the field
                if ((suggestMode == SuggestMode.SUGGEST_MORE_POPULAR && goalFreq > sugWord.freq) || sugWord.freq < 1)
                {
                    continue;
                }
            }

            sugQueue.InsertWithOverflow(sugWord);
            if (sugQueue.Size() == numSug)
            {
                // if queue full, maintain the minScore score
                accuracy = sugQueue.Top().score;
            }
            // The queued instance is now owned by the queue; start a fresh one.
            sugWord = new SuggestWord();
        }

        // convert to array string, filling from the last slot to the first
        string[] list = new string[sugQueue.Size()];
        for (int i = sugQueue.Size() - 1; i >= 0; i--)
        {
            list[i] = sugQueue.Pop().@string;
        }

        return list;
    }
    finally
    {
        ReleaseSearcher(indexSearcher);
    }
}
/// <summary>
/// Convenience overload: forwards to
/// <see cref="SuggestSimilar(string, int, IndexReader, string, SuggestMode, float)"/>,
/// passing this instance's <c>accuracy</c> field as the accuracy argument.
/// </summary>
/// <param name="word"> passed through unchanged </param>
/// <param name="numSug"> passed through unchanged </param>
/// <param name="ir"> passed through unchanged </param>
/// <param name="field"> passed through unchanged </param>
/// <param name="suggestMode"> passed through unchanged </param>
/// <returns> whatever the six-argument overload returns </returns>
public virtual string[] SuggestSimilar(string word, int numSug, IndexReader ir, string field, SuggestMode suggestMode)
    => SuggestSimilar(word, numSug, ir, field, suggestMode, accuracy);