/// <summary>
/// Benchmarks every <c>StringDistance.Measure</c> value on pairs of random words
/// whose length grows from 0 up to (but not including) 16, and prints the
/// average number of stopwatch ticks per call for each measure.
/// </summary>
static void TestTime()
{
    int repeatCount = 1000;
    int finalWordLength = 16;

    for (int currentWordLength = 0; currentWordLength < finalWordLength; currentWordLength++)
    {
        Console.WriteLine("Word length: " + currentWordLength);

        foreach (StringDistance.Measure method in Enum.GetValues(typeof(StringDistance.Measure)))
        {
            string a = GetRandomWord(currentWordLength);
            string b = GetRandomWord(currentWordLength);

            // The builder may hand back null for a measure (Main guards against
            // this as well); skip such measures instead of crashing the benchmark.
            if (StringDistance.StringDistanceBuilder.GetInstance(method, a, b) == null)
            {
                continue;
            }

            Stopwatch watch = new Stopwatch();
            watch.Start();
            for (int i = 0; i < repeatCount; i++)
            {
                // Instance creation is deliberately part of the timed work.
                StringDistance measureMethod = StringDistance.StringDistanceBuilder.GetInstance(method, a, b);
                measureMethod.GetDistance();
            }
            watch.Stop();

            long ticks = watch.ElapsedTicks;
            double avgTicks = (double)ticks / repeatCount;
            Console.WriteLine("Method: " + method.ToString() + "; Ticks average: " + avgTicks);
        }

        Console.WriteLine();
    }
}
/// <summary>
/// Program entry point.
/// <para>
/// With <c>-t</c> as the first argument it runs <c>TestTime</c>, with <c>-rt</c> it runs
/// <c>TestTree</c>; in both cases it then waits for a key press and exits.
/// </para>
/// <para>
/// Otherwise it reads two strings from the console and, for every
/// <c>StringDistance.Measure</c> the builder supports, prints the method name,
/// the distance, the letter matrix and the time the distance computation took.
/// </para>
/// </summary>
/// <param name="args">Command-line arguments; only the first one is inspected.</param>
static void Main(string[] args)
{
    if (args.Length > 0 && args[0] == "-t")
    {
        TestTime();
        Console.WriteLine("TESTING COMPLETED");
        Console.ReadKey();
        return;
    }

    if (args.Length > 0 && args[0] == "-rt")
    {
        TestTree();
        Console.WriteLine("TESTING COMPLETED");
        Console.ReadKey();
        return;
    }

    // Console.ReadLine returns null at end of input (e.g. redirected stdin);
    // fall back to an empty string so the measures never receive null.
    Console.Write("Первая строка: ");
    string a = Console.ReadLine() ?? string.Empty;
    Console.Write("Вторая строка: ");
    string b = Console.ReadLine() ?? string.Empty;

    Console.WriteLine("\nРезультат: \n");

    foreach (StringDistance.Measure measure in Enum.GetValues(typeof(StringDistance.Measure)))
    {
        StringDistance distance = StringDistance.StringDistanceBuilder.GetInstance(measure, a, b);
        if (distance == null)
        {
            // No implementation available for this measure.
            continue;
        }

        System.Diagnostics.Stopwatch stopwatch = new System.Diagnostics.Stopwatch();

        // Collect before timing, presumably to reduce the chance of a GC pause
        // landing inside the measured call.
        GC.Collect();
        stopwatch.Start();
        int result = distance.GetDistance();
        stopwatch.Stop();

        LetterMatrix matrix = distance.GetLetterMatrix();

        Console.WriteLine("Метод: " + distance.MethodName);
        Console.WriteLine("Значение: " + result);
        Console.WriteLine("Матрица: ");
        Console.Write(matrix.ToString());
        Console.WriteLine("Прошло времени (тиков): " + stopwatch.ElapsedTicks);
        // ElapsedMilliseconds is a whole number of milliseconds, so dividing it by
        // 1000 reports 0 seconds for any sub-millisecond run; Elapsed.TotalSeconds
        // keeps the fractional part.
        Console.WriteLine("Прошло времени (секунд): " + stopwatch.Elapsed.TotalSeconds.ToString("0.#######"));
        Console.WriteLine("\n");
    }

    Console.ReadKey();
}
/// <summary>
/// Scores how well <paramref name="s2"/> matches the beginning of <paramref name="s1"/>,
/// delegating the prefix comparison to the wrapped distance.
/// </summary>
/// <param name="s1">The candidate string; must be at least as long as <paramref name="s2"/> to score above 0.</param>
/// <param name="s2">The string expected at the start of <paramref name="s1"/>.</param>
/// <returns>
/// 0 when <paramref name="s1"/> is shorter than <paramref name="s2"/> or when the wrapped
/// distance does not score the leading <c>s2.Length</c> characters of <paramref name="s1"/>
/// against <paramref name="s2"/> as exactly 1; 1 when both strings have the same length;
/// otherwise 1 divided by the number of extra characters in <paramref name="s1"/>.
/// </returns>
public float GetDistance(string s1, string s2)
{
    // A shorter s1 cannot start with s2 (and Substring below would throw).
    if (s1.Length < s2.Length)
    {
        return 0;
    }

    if (_parentDistance.GetDistance(s1.Substring(0, s2.Length), s2) != 1)
    {
        return 0;
    }

    // s1 is the longer (or equal) string here, so the count of extra characters
    // is s1.Length - s2.Length; the reversed subtraction produced negative scores.
    int extraChars = s1.Length - s2.Length;
    if (extraChars == 0)
    {
        return 1;
    }

    return 1 / (float)extraChars;
}
/// <summary>
/// Suggests words similar to <paramref name="word"/>, optionally restricted to the terms
/// of one field of a user index.
/// </summary>
/// <param name="word">The word to spell-check.</param>
/// <param name="numSug">The number of suggestions wanted.</param>
/// <param name="ir">Reader of the user index; may be null (see <paramref name="field"/>).</param>
/// <param name="field">
/// Field of the user index. When both this and <paramref name="ir"/> are non-null
/// ("restricted mode"), only words present in this field are suggested.
/// </param>
/// <param name="morePopular">
/// In restricted mode, return only suggestions that are at least as frequent in the field
/// as the searched word.
/// </param>
/// <exception cref="System.IO.IOException">Documented by the original as a possible failure of the index access.</exception>
/// <returns>
/// The suggested words, ordered first by edit-distance score and, in restricted mode,
/// second by their popularity in the field of the user index.
/// </returns>
public virtual System.String[] SuggestSimilar(System.String word, int numSug, IndexReader ir, System.String field, bool morePopular)
{
    // ObtainSearcher calls ensureOpen
    IndexSearcher indexSearcher = ObtainSearcher();
    try
    {
        float min = this.minScore;
        int lengthWord = word.Length;
        bool restricted = ir != null && field != null;

        int freq = 0;
        if (restricted)
        {
            freq = ir.DocFreq(new Term(field, word));
        }

        int goalFreq = (morePopular && restricted) ? freq : 0;

        // The word is in the real index and frequency does not matter: return it as is.
        if (freq > 0 && !morePopular)
        {
            return new string[] { word };
        }

        var alreadySeen = new HashSet<string>();
        var query = new BooleanQuery();

        for (var ng = GetMin(lengthWord); ng <= GetMax(lengthWord); ng++)
        {
            // Split the word into n-grams (duplicates allowed).
            string[] grams = FormGrams(word, ng);
            if (grams.Length > 0)
            {
                if (bStart > 0)
                {
                    // Boost a match on the start of the word.
                    Add(query, "start" + ng, grams[0], bStart);
                }

                if (bEnd > 0)
                {
                    // Boost a match on the end of the word.
                    Add(query, "end" + ng, grams[grams.Length - 1], bEnd);
                }

                string key = "gram" + ng;
                foreach (string gram in grams)
                {
                    Add(query, key, gram);
                }
            }
        }

        int maxHits = 10 * numSug;
        ScoreDoc[] hits = indexSearcher.Search(query, null, maxHits).ScoreDocs;
        SuggestWordQueue sugQueue = new SuggestWordQueue(numSug);

        // Look at more hits than requested suggestions in case the distance filter rejects some.
        int stop = Math.Min(hits.Length, maxHits);

        // The same instance is reused until it is actually queued.
        SuggestWord candidate = new SuggestWord();
        for (int hit = 0; hit < stop; hit++)
        {
            // Fetch the original word of this hit.
            candidate.termString = indexSearcher.Doc(hits[hit].Doc).Get(F_WORD);

            // Never suggest the word itself.
            if (candidate.termString.Equals(word))
            {
                continue;
            }

            // Edit-distance score; drop anything below the current threshold.
            candidate.score = sd.GetDistance(word, candidate.termString);
            if (candidate.score < min)
            {
                continue;
            }

            if (restricted)
            {
                // Frequency of the candidate in the user index.
                candidate.freq = ir.DocFreq(new Term(field, candidate.termString));

                // Skip words absent from the field, or less frequent than required.
                bool lessPopular = morePopular && goalFreq > candidate.freq;
                if (lessPopular || candidate.freq < 1)
                {
                    continue;
                }
            }

            // Already queued once: no point returning it twice.
            if (!alreadySeen.Add(candidate.termString))
            {
                continue;
            }

            sugQueue.InsertWithOverflow(candidate);
            if (sugQueue.Size() == numSug)
            {
                // Queue is full: raise the threshold to the score at the top of the queue.
                min = ((SuggestWord)sugQueue.Top()).score;
            }

            candidate = new SuggestWord();
        }

        // Pop the queue into the result array from the last slot to the first.
        int count = sugQueue.Size();
        string[] list = new string[count];
        for (int slot = count - 1; slot >= 0; slot--)
        {
            list[slot] = ((SuggestWord)sugQueue.Pop()).termString;
        }

        return list;
    }
    finally
    {
        ReleaseSearcher(indexSearcher);
    }
}
/// <summary>
/// Checks the scores returned by the distance under test (<c>sd</c>) for a set of
/// fixed word pairs, each within a tolerance of 0.001.
/// </summary>
public void TestGetDistance()
{
    // Assert.AreEqual takes (expected, actual, delta); keeping that order makes
    // failure messages report the right value as "expected".
    float d = sd.GetDistance("al", "al");
    Assert.AreEqual(1.0f, d, 0.001);

    d = sd.GetDistance("martha", "marhta");
    Assert.AreEqual(0.6666, d, 0.001);

    d = sd.GetDistance("jones", "johnson");
    Assert.AreEqual(0.4285, d, 0.001);

    d = sd.GetDistance("abcvwxyz", "cabvwxyz");
    Assert.AreEqual(0.75, d, 0.001);

    d = sd.GetDistance("dwayne", "duane");
    Assert.AreEqual(0.666, d, 0.001);

    d = sd.GetDistance("dixon", "dicksonx");
    Assert.AreEqual(0.5, d, 0.001);

    d = sd.GetDistance("six", "ten");
    Assert.AreEqual(0, d, 0.001);

    // Both variants are expected to score the same against "zac ephron".
    float d1 = sd.GetDistance("zac ephron", "zac efron");
    float d2 = sd.GetDistance("zac ephron", "kai ephron");
    Assert.AreEqual(d1, d2, 0.001);

    // The closer spelling must score strictly higher.
    d1 = sd.GetDistance("brittney spears", "britney spears");
    d2 = sd.GetDistance("brittney spears", "brittney startzman");
    Assert.True(d1 > d2);
}
/// <summary>
/// Suggest similar words (optionally restricted to a field of an index).
/// <para>
/// As the Lucene similarity that is used to fetch the most relevant n-grammed terms
/// is not the same as the edit distance strategy used to calculate the best
/// matching spell-checked word from the hits that Lucene found, one usually has
/// to retrieve a couple of numSug's in order to get the true best match.
/// </para>
/// <para>
/// I.e. if numSug == 1, don't count on that suggestion being the best one.
/// Thus, you should set this value to <b>at least</b> 5 for a good suggestion.
/// </para>
/// </summary>
/// <param name="word"> the word you want a spell check done on </param>
/// <param name="numSug"> the number of suggested words </param>
/// <param name="ir"> the indexReader of the user index (can be null, see the field param) </param>
/// <param name="field"> the field of the user index: if field is not null, the suggested
/// words are restricted to the words present in this field. </param>
/// <param name="suggestMode"> the suggestion mode
/// (NOTE: if indexReader==null and/or field==null, then this is overridden with SuggestMode.SUGGEST_ALWAYS) </param>
/// <param name="accuracy"> The minimum score a suggestion must have in order to qualify for inclusion in the results </param>
/// <exception cref="IOException"> if the underlying index throws an <see cref="IOException"/> </exception>
/// <exception cref="AlreadyClosedException"> if the Spellchecker is already closed </exception>
/// <returns> the sorted list of the suggested words with these 2 criteria:
/// first criteria: the edit distance, second criteria (only if restricted mode): the popularity
/// of the suggested words in the field of the user index
/// </returns>
public virtual string[] SuggestSimilar(string word, int numSug, IndexReader ir, string field, SuggestMode suggestMode, float accuracy)
{
    // obtainSearcher calls ensureOpen
    IndexSearcher indexSearcher = ObtainSearcher();
    try
    {
        // Without both a reader and a field there is nothing to restrict against.
        if (ir == null || field == null)
        {
            suggestMode = SuggestMode.SUGGEST_ALWAYS;
        }

        // SUGGEST_ALWAYS ignores the user index entirely.
        if (suggestMode == SuggestMode.SUGGEST_ALWAYS)
        {
            ir = null;
            field = null;
        }

        int lengthWord = word.Length;

        int freq = (ir != null && field != null) ? ir.DocFreq(new Term(field, word)) : 0;
        int goalFreq = suggestMode == SuggestMode.SUGGEST_MORE_POPULAR ? freq : 0;

        // if the word exists in the real index and we don't care for word frequency, return the word itself
        if (suggestMode == SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX && freq > 0)
        {
            return new string[] { word };
        }

        BooleanQuery query = new BooleanQuery();
        string[] grams;
        string key;

        for (int ng = GetMin(lengthWord); ng <= GetMax(lengthWord); ng++)
        {
            key = "gram" + ng; // form key

            grams = FormGrams(word, ng); // form word into ngrams (allow dups too)

            if (grams.Length == 0)
            {
                continue; // nothing to query for this n-gram size
            }

            if (bStart > 0) // should we boost prefixes?
            {
                Add(query, "start" + ng, grams[0], bStart); // matches start of word
            }

            if (bEnd > 0) // should we boost suffixes?
            {
                Add(query, "end" + ng, grams[grams.Length - 1], bEnd); // matches end of word
            }

            for (int i = 0; i < grams.Length; i++)
            {
                Add(query, key, grams[i]);
            }
        }

        int maxHits = 10 * numSug;

        ScoreDoc[] hits = indexSearcher.Search(query, null, maxHits).ScoreDocs;
        SuggestWordQueue sugQueue = new SuggestWordQueue(numSug, comparator);

        // go through more than numSug matches in case the distance filter triggers
        int stop = Math.Min(hits.Length, maxHits);

        // The same instance is reused until it is actually queued.
        SuggestWord sugWord = new SuggestWord();
        for (int i = 0; i < stop; i++)
        {
            sugWord.@string = indexSearcher.Doc(hits[i].Doc).Get(F_WORD); // get orig word

            // don't suggest a word for itself, that would be silly
            if (sugWord.@string.Equals(word))
            {
                continue;
            }

            // edit distance
            sugWord.score = sd.GetDistance(word, sugWord.@string);
            if (sugWord.score < accuracy)
            {
                continue;
            }

            if (ir != null && field != null) // use the user index
            {
                sugWord.freq = ir.DocFreq(new Term(field, sugWord.@string)); // freq in the index

                // don't suggest a word that is not present in the field
                if ((suggestMode == SuggestMode.SUGGEST_MORE_POPULAR && goalFreq > sugWord.freq) || sugWord.freq < 1)
                {
                    continue;
                }
            }

            sugQueue.InsertWithOverflow(sugWord);
            if (sugQueue.Size() == numSug)
            {
                // if queue full, raise the threshold to the score at the top of the queue
                accuracy = sugQueue.Top().score;
            }

            sugWord = new SuggestWord();
        }

        // convert to a string array, filling it from the last slot to the first
        string[] list = new string[sugQueue.Size()];
        for (int i = sugQueue.Size() - 1; i >= 0; i--)
        {
            list[i] = sugQueue.Pop().@string;
        }

        return list;
    }
    finally
    {
        ReleaseSearcher(indexSearcher);
    }
}
/// <summary>
/// Checks the scores returned by the distance under test (<c>sd</c>) for a set of
/// fixed word pairs: exact values for identical and unrelated words, narrow
/// open intervals for near matches, and relative ordering for longer phrases.
/// </summary>
public void TestGetDistance()
{
    // Identical words score exactly 1.
    float identical = sd.GetDistance("al", "al");
    Assert.IsTrue(identical == 1.0f);

    // Near matches: each score must lie strictly inside its interval.
    float martha = sd.GetDistance("martha", "marhta");
    Assert.IsTrue(martha > 0.961);
    Assert.IsTrue(martha < 0.962);

    float jones = sd.GetDistance("jones", "johnson");
    Assert.IsTrue(jones > 0.832);
    Assert.IsTrue(jones < 0.833);

    float shifted = sd.GetDistance("abcvwxyz", "cabvwxyz");
    Assert.IsTrue(shifted > 0.958);
    Assert.IsTrue(shifted < 0.959);

    float dwayne = sd.GetDistance("dwayne", "duane");
    Assert.IsTrue(dwayne > 0.84);
    Assert.IsTrue(dwayne < 0.841);

    float dixon = sd.GetDistance("dixon", "dicksonx");
    Assert.IsTrue(dixon > 0.813);
    Assert.IsTrue(dixon < 0.814);

    // Unrelated words score exactly 0.
    float unrelated = sd.GetDistance("fvie", "ten");
    Assert.IsTrue(unrelated == 0f);

    // The first comparison in each pair must score strictly higher than the second.
    float closer = sd.GetDistance("zac ephron", "zac efron");
    float farther = sd.GetDistance("zac ephron", "kai ephron");
    Assert.IsTrue(closer > farther);

    closer = sd.GetDistance("brittney spears", "britney spears");
    farther = sd.GetDistance("brittney spears", "brittney startzman");
    Assert.IsTrue(closer > farther);
}