private IEnumerable<IWordSimilarityNode> OrderPossibilities(string input, int maxResults, ICollection<string> possibleResults)
{
    //If we only have a small number of candidates, we can sort them all.
    if (possibleResults.Count <= maxResults)
    {
        var queue = new WordQueue(possibleResults.Count);
        foreach (var possibleResult in possibleResults)
        {
            queue.Enqueue(possibleResult, StringSimilarityProvider.Similarity(input, possibleResult));
        }

        return queue.OrderByDescending(x => x.Similarity);
    }

    var max = Math.Min(possibleResults.Count, maxResults);

    //We have a large number of words to evaluate, but we only care about the top matches.
    //Instead of sorting a large number of elements at the end, we use a queue and restrict its running size to max.
    //This queue will do that faster than other container types:
    //SortedList.RemoveAt is O(n)
    //SortedDictionary/SortedSet.ElementAt is O(n)
    var likelyWordsQueue = new WordQueue(max);
    foreach (var word in possibleResults)
    {
        var jw = StringSimilarityProvider.Similarity(input, word);
        likelyWordsQueue.Enqueue(word, jw);
    }

    return likelyWordsQueue.OrderByDescending(x => x.Similarity);
}
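//WordQueue is not shown in this snippet. Below is a minimal sketch of what it could look like,
//assuming it is a fixed-capacity min-heap keyed on similarity (assumed here to be a double):
//Enqueue is O(log n), and once the queue is full the lowest-scoring word is evicted, so only the
//best `capacity` candidates are ever retained. The shape is inferred from the usage above
//(a (word, score) Enqueue and enumeration as IWordSimilarityNode); the real implementation may differ.
public class WordQueue : IEnumerable<IWordSimilarityNode>
{
    private readonly List<WordSimilarityNode> _heap;
    private readonly int _capacity;

    public WordQueue(int capacity)
    {
        _capacity = capacity;
        _heap = new List<WordSimilarityNode>(capacity);
    }

    public void Enqueue(string word, double similarity)
    {
        if (_heap.Count >= _capacity)
        {
            //Full: only keep the new word if it beats the current minimum at the heap root.
            if (_capacity == 0 || similarity <= _heap[0].Similarity)
            {
                return;
            }

            _heap[0] = new WordSimilarityNode { Word = word, Similarity = similarity };
            SiftDown(0);
            return;
        }

        //Not yet full: append, then sift up until the min-heap property is restored.
        _heap.Add(new WordSimilarityNode { Word = word, Similarity = similarity });
        var i = _heap.Count - 1;
        while (i > 0)
        {
            var parent = (i - 1) / 2;
            if (_heap[parent].Similarity <= _heap[i].Similarity)
            {
                break;
            }

            Swap(parent, i);
            i = parent;
        }
    }

    private void SiftDown(int i)
    {
        while (true)
        {
            var left = 2 * i + 1;
            var right = left + 1;
            var smallest = i;
            if (left < _heap.Count && _heap[left].Similarity < _heap[smallest].Similarity)
            {
                smallest = left;
            }
            if (right < _heap.Count && _heap[right].Similarity < _heap[smallest].Similarity)
            {
                smallest = right;
            }
            if (smallest == i)
            {
                return;
            }

            Swap(smallest, i);
            i = smallest;
        }
    }

    private void Swap(int a, int b)
    {
        var tmp = _heap[a];
        _heap[a] = _heap[b];
        _heap[b] = tmp;
    }

    public IEnumerator<IWordSimilarityNode> GetEnumerator()
    {
        foreach (var node in _heap)
        {
            yield return node;
        }
    }

    System.Collections.IEnumerator System.Collections.IEnumerable.GetEnumerator()
    {
        return GetEnumerator();
    }
}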
/// <summary>
/// Finds possible corrections or extensions for a given word.
/// </summary>
/// <param name="input">The word to spell-check.</param>
/// <param name="maxResults">The maximum number of results returned.</param>
/// <returns>Up to <paramref name="maxResults"/> words along with their similarity scores. Not sorted.</returns>
public IEnumerable<IWordSimilarityNode> PrefixLookup(string input, int maxResults)
{
    var iLen = input.Length;
    ICollection<string> possibleResults;
    if (iLen < 3)
    {
        List<int> di;
        if (!Dictionary.TryGetValue(input, out di))
        {
            return new WordSimilarityNode[0];
        }

        possibleResults = di.Select(index => Wordlist[index]).Where(word => word.Length >= iLen).ToList();
    }
    else
    {
        possibleResults = new HashSet<string>();
        var deletes = new HashSet<string> { input };

        //Arbitrary decision for the allowable number of deletes.
        //Could start smaller and increase it if a threshold is not met,
        //but then we risk the problem of a prefix with a lot of very close but uncommon words:
        //a small deletion count could miss a word with slightly more distance that is much more common.
        //I don't even know if such cases exist, though.
        Edits(input, (iLen - 1) / 3, deletes);

        foreach (var delete in deletes)
        {
            List<int> di;
            if (!Dictionary.TryGetValue(delete, out di))
            {
                continue;
            }

            foreach (var word in di.Select(index => Wordlist[index]).Where(word => word.Length >= iLen))
            {
                possibleResults.Add(word);
            }
        }
    }

    List<int> se;
    if (Dictionary.TryGetValue(input, out se) && se.Any(x => Wordlist[x] == input))
    {
        possibleResults.Add(input);
    }

    //If we only have a small number of candidates, we can score them all and return them unsorted.
    if (possibleResults.Count <= maxResults)
    {
        return possibleResults.Select(word => new WordSimilarityNode
        {
            Word = word,
            Similarity = _stringSimilarityProvider.Similarity(input, word)
        });
    }

    var max = Math.Min(possibleResults.Count, maxResults);

    //We have a large number of words to evaluate, but we only care about the top matches.
    //Instead of sorting a large number of elements at the end, we use a queue and restrict its running size to max.
    //This queue will do that faster than other container types:
    //SortedList.RemoveAt is O(n)
    //SortedDictionary/SortedSet.ElementAt is O(n)
    var likelyWordsQueue = new WordQueue(max);
    foreach (var word in possibleResults)
    {
        var jw = _stringSimilarityProvider.Similarity(input, word);
        likelyWordsQueue.Enqueue(word, jw);
    }

    return likelyWordsQueue;
}
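//Edits is also not shown here. A sketch of what it presumably does, following the SymSpell-style
//symmetric-delete approach implied above: recursively collect every variant of `word` reachable
//by up to `remainingDeletes` single-character deletions. The signature is inferred from the
//call Edits(input, (iLen - 1) / 3, deletes); the real helper may differ.
private void Edits(string word, int remainingDeletes, HashSet<string> deletes)
{
    if (remainingDeletes <= 0 || word.Length <= 1)
    {
        return;
    }

    for (var i = 0; i < word.Length; i++)
    {
        //Remove the character at position i to produce one delete-variant.
        var delete = word.Remove(i, 1);
        if (deletes.Add(delete))
        {
            //Recurse only on variants we have not seen before; each level consumes one delete.
            Edits(delete, remainingDeletes - 1, deletes);
        }
    }
}

//Hypothetical usage, assuming an instance wired up with a word list and a similarity provider:
//foreach (var node in lookup.PrefixLookup("prefi", 10))
//{
//    Console.WriteLine(node.Word + ": " + node.Similarity);
//}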