/// <summary>
/// Adds "pawn" with a count of 1 to a SymSpell built with (16, 2, 7, 10) and
/// verifies that looking the word up returns no suggestion.
/// </summary>
/// <remarks>
/// NOTE(review): the fourth constructor argument (10) is presumably the count
/// threshold a word must reach before it is returned - confirm against SymSpell.
/// </remarks>
public void LookupShouldNotReturnLowCountWord()
{
    const string term = "pawn";

    var speller = new SymSpell(16, 2, 7, 10);
    speller.CreateDictionaryEntry(term, 1);

    var matches = speller.Lookup(term, SymSpell.Verbosity.Top, 0);

    Assert.AreEqual(0, matches.Count);
}
// Pre-run: executes dictionary loading and one lookup on both implementations
// so that the code has run once before any timing benchmark starts.
static void WarmUp()
{
    const string probe = "hockie";

    // current implementation
    SymSpell current = new SymSpell(16, 2, 7);
    current.LoadDictionary(DictionaryPath[0], 0, 1);
    current.Lookup(probe, SymSpell.Verbosity.All, 1);

    // original implementation
    Original.SymSpell original = new Original.SymSpell(2, 7);
    original.LoadDictionary(DictionaryPath[0], "", 0, 1);
    original.Lookup(probe, "", 1, 2);
}
/// <summary>
/// Looks up spelling suggestions for <paramref name="word"/>.
/// </summary>
/// <param name="word">The term to look up.</param>
/// <param name="verbosity">Numeric value that is cast to <c>SymSpell.Verbosity</c>.</param>
/// <param name="distance">Maximum edit distance passed to the lookup.</param>
/// <returns>
/// The suggestions returned by the lookup, or an empty list when the lookup throws.
/// </returns>
public List<SymSpell.SuggestItem> getSuggestions(string word, int verbosity, int distance)
{
    try
    {
        return symSpell.Lookup(word, (SymSpell.Verbosity)verbosity, distance);
    }
    catch (Exception e)
    {
        // The exception message must be an argument, not the format string:
        // used as the format it drops the context values and can itself throw
        // a FormatException when the message contains '{' or '}'.
        Console.WriteLine("{0} (word={1}, verbosity={2}, distance={3})", e.Message, word, verbosity, distance);
        return new List<SymSpell.SuggestItem>();
    }
}
//Load a frequency dictionary or create a frequency dictionary from a text corpus,
//then read terms from the console and print spelling suggestions until an empty line is entered.
public static void Main(string[] args)
{
    Console.Write("Creating dictionary ...");
    long memSize = GC.GetTotalMemory(true);
    Stopwatch stopWatch = Stopwatch.StartNew();

    //set parameters
    const int initialCapacity = 82765;
    const int maxEditDistance = 2;
    const int prefixLength = 7;
    var symSpell = new SymSpell(initialCapacity, maxEditDistance, prefixLength);

    //Load a frequency dictionary
    //wordfrequency_en.txt ensures high correction quality by combining two data sources:
    //Google Books Ngram data provides representative word frequencies (but contains many entries with spelling errors)
    //SCOWL — Spell Checker Oriented Word Lists which ensures genuine English vocabulary (but contained no word frequencies)
    string path = AppDomain.CurrentDomain.BaseDirectory + "../../../../SymSpell/frequency_dictionary_en_82_765.txt"; //path when targeting .NET Core 2.0 & using symspell.cs
    //string path = "../../../SymSpell/frequency_dictionary_en_82_765.txt"; //path when targeting .NET Framework & using symspell.cs
    //string path = "../../frequency_dictionary_en_82_765.txt"; //path when using symspell nuget package (frequency_dictionary_en_82_765.txt is included in nuget package)
    if (!symSpell.LoadDictionary(path, 0, 1))
    {
        Console.Error.WriteLine("\rFile not found: " + Path.GetFullPath(path));
        Console.ReadKey();
        return;
    }

    //Alternatively Create the dictionary from a text corpus (e.g. http://norvig.com/big.txt )
    //Make sure the corpus does not contain spelling errors, invalid terms and the word frequency is representative to increase the precision of the spelling correction.
    //You may use SymSpell.CreateDictionaryEntry() to update a (self learning) dictionary incrementally
    //To extend spelling correction beyond single words to phrases (e.g. correcting "unitedkingom" to "united kingdom") simply add those phrases with CreateDictionaryEntry().
    //or use https://github.com/wolfgarbe/SymSpellCompound
    //string path = "big.txt";
    //if (!symSpell.CreateDictionary(path)) Console.Error.WriteLine("File not found: " + Path.GetFullPath(path));

    stopWatch.Stop();
    long memDelta = GC.GetTotalMemory(true) - memSize;
    Console.WriteLine("\rDictionary: " + symSpell.WordCount.ToString("N0") + " words, "
        + symSpell.EntryCount.ToString("N0") + " entries, edit distance=" + symSpell.MaxDictionaryEditDistance.ToString()
        + " in " + stopWatch.Elapsed.TotalMilliseconds.ToString("0.0") + "ms "
        + (memDelta / 1024 / 1024.0).ToString("N0") + " MB");

    //warm up (result intentionally discarded)
    symSpell.Lookup("warmup", SymSpell.Verbosity.All, 1);

    string input;
    Console.WriteLine("Type a word and hit enter key to get spelling suggestions:");
    //a null line (end of input) is treated like an empty line and ends the loop
    while (!string.IsNullOrEmpty(input = (Console.ReadLine() ?? "").Trim()))
    {
        Correct(input, symSpell);
    }
}
/// <summary>
/// With "steama", "steamb" and "steamc" in the dictionary, a Top lookup of
/// "steama" must return exactly one suggestion: "steama" itself.
/// </summary>
public void LookupShouldFindExactMatch()
{
    const string query = "steama";

    var speller = new SymSpell();
    speller.CreateDictionaryEntry("steama", 4);
    speller.CreateDictionaryEntry("steamb", 6);
    speller.CreateDictionaryEntry("steamc", 2);

    var matches = speller.Lookup(query, SymSpell.Verbosity.Top, 2);

    Assert.AreEqual(1, matches.Count);
    Assert.AreEqual("steama", matches[0].term);
}
/// <summary>
/// Experiment 1: concatenates the OCR text of every JSON response file under D:\json\,
/// splits it into tokens, runs a single-word SymSpell lookup on each token and writes
/// the original/suggested pairs to an Excel file via CreateExcelFileExperimento.
/// </summary>
private static void Experimento1()
{
    string strPath = @"D:\json\";
    string[] fileEntries = Directory.GetFiles(strPath);
    StringBuilder OCROriginal = new StringBuilder();

    // NOTE(review): never used below; kept only because its constructor is not
    // visible here and could have side effects - remove if it has none.
    EditDistanceLength editDistance = new EditDistanceLength();

    //Symspell parameters
    const int initialCapacity = 82765;
    const int maxEditDistance = 5;
    const int prefixLength = 7;
    SymSpell symSpell = new SymSpell(initialCapacity, maxEditDistance, prefixLength);

    Dictionary<int, ExperimentSpell> excelMatrix = new Dictionary<int, ExperimentSpell>();

    foreach (string path in fileEntries)
    {
        // JSON is UTF-8; Encoding.Default is the ANSI code page on .NET Framework
        // and would corrupt accented characters in the OCR text.
        string jsonText = File.ReadAllText(path, Encoding.UTF8);
        var response = Google.Protobuf.JsonParser.Default.Parse<Google.Cloud.Vision.V1.AnnotateFileResponse>(jsonText);
        foreach (var respuestas in response.Responses)
        {
            var annotation = respuestas.FullTextAnnotation;
            if (annotation != null)
            {
                OCROriginal.Append(annotation.Text);
            }
        }
    }

    const string dictionaryFile = @"D:\DictionaryFiles\default.txt";
    if (!symSpell.LoadDictionary(dictionaryFile, 0, 1))
    {
        // Surface the failure; the run continues so the output file is still produced.
        Console.Error.WriteLine("File not found: " + dictionaryFile);
    }

    // ": " is protected with a "***" placeholder while the other colon forms are
    // turned into spaces, then restored; the statement order matters.
    var arrayOCROriginal = OCROriginal.ToString()
        .Replace("\n", " ")
        .Replace("{", "")
        .Replace("}", "")
        .Replace(": ", "***")
        .Replace(" : ", " ")
        .Replace(":", " ")
        .Replace("***", ": ")
        .Replace(". ", " ")
        .Replace(", ", " ")
        .Replace("-", " ")
        .Split(' ');

    int k = 0;
    foreach (string item in arrayOCROriginal)
    {
        ExperimentSpell exp1 = new ExperimentSpell();
        exp1.correction = "igual";
        exp1.original = item;
        exp1.correctionLookupCompound = item;

        List<SymSpell.SuggestItem> suggestions = symSpell.Lookup(item, SymSpell.Verbosity.Top);
        if (suggestions.Count > 0)
        {
            exp1.correction = "modificada";
            exp1.correctionLookupCompound = suggestions[0].term;
        }

        excelMatrix.Add(k++, exp1);
    }

    CreateExcelFileExperimento(excelMatrix, "1");
}
/// <summary>
/// With "steama" (4), "steamb" (6) and "steamc" (2) in the dictionary, a Top lookup
/// of "steam" must return the single entry with the highest count: "steamb" / 6.
/// </summary>
public void LookupShouldReturnMostFrequent()
{
    var speller = new SymSpell();
    speller.CreateDictionaryEntry("steama", 4);
    speller.CreateDictionaryEntry("steamb", 6);
    speller.CreateDictionaryEntry("steamc", 2);

    var matches = speller.Lookup("steam", SymSpell.Verbosity.Top, 2);

    Assert.AreEqual(1, matches.Count);
    var best = matches[0];
    Assert.AreEqual("steamb", best.term);
    Assert.AreEqual(6, best.count);
}
/// <summary>
/// Creates "hello" with a count of long.MaxValue - 10, adds 11 more, and verifies
/// the stored count ends at long.MaxValue.
/// </summary>
public void AddAdditionalCountsShouldNotOverflow()
{
    const string term = "hello";
    var speller = new SymSpell();

    speller.CreateDictionaryEntry(term, long.MaxValue - 10);
    var matches = speller.Lookup(term, SymSpell.Verbosity.Top);
    // anything other than exactly one suggestion is reported as a count of 0
    long count = matches.Count == 1 ? matches[0].count : 0;
    Assert.AreEqual(long.MaxValue - 10, count);

    speller.CreateDictionaryEntry(term, 11);
    matches = speller.Lookup(term, SymSpell.Verbosity.Top);
    count = matches.Count == 1 ? matches[0].count : 0;
    Assert.AreEqual(long.MaxValue, count);
}
/// <summary>
/// Creates "hello" with a count of 11, adds 3 more, and verifies the stored count
/// is the sum of both additions.
/// </summary>
public void AddAdditionalCountsShouldIncreaseCount()
{
    const string term = "hello";
    var speller = new SymSpell();

    speller.CreateDictionaryEntry(term, 11);
    var matches = speller.Lookup(term, SymSpell.Verbosity.Top);
    // anything other than exactly one suggestion is reported as a count of 0
    long count = matches.Count == 1 ? matches[0].count : 0;
    Assert.AreEqual(11, count);

    speller.CreateDictionaryEntry(term, 3);
    matches = speller.Lookup(term, SymSpell.Verbosity.Top);
    count = matches.Count == 1 ? matches[0].count : 0;
    Assert.AreEqual(11 + 3, count);
}
/// <summary>
/// Times repeated lookups of the query terms stored in <paramref name="path"/> and
/// prints the result count of the last round and the average milliseconds per round.
/// </summary>
/// <param name="path">Text file with one query per line; the first whitespace-separated
/// column is used and lines with fewer than two columns are skipped.</param>
/// <param name="testNumber">Maximum number of query terms to load from the file.</param>
public static void Benchmark(string path, int testNumber)
{
    int resultSum = 0;
    string[] testList = new string[testNumber];
    List<SymSpell.SuggestItem> suggestions = null;

    //load up to testNumber terms with random spelling errors
    int loaded = 0;
    using (StreamReader sr = new StreamReader(File.OpenRead(path)))
    {
        string line;
        //process a single line at a time only for memory efficiency;
        //stop once the array is full so a longer file cannot overrun it
        while (loaded < testNumber && (line = sr.ReadLine()) != null)
        {
            string[] lineParts = line.Split(null);
            if (lineParts.Length >= 2)
            {
                testList[loaded++] = lineParts[0];
            }
        }
    }

    Stopwatch stopWatch = Stopwatch.StartNew();

    //perform n rounds of Lookup of the loaded terms
    const int rounds = 10;
    for (int round = 0; round < rounds; round++)
    {
        resultSum = 0;
        //spellcheck strings; only the entries that were actually loaded are used,
        //so a shorter file never feeds null terms into Lookup
        for (int i = 0; i < loaded; i++)
        {
            suggestions = SymSpell.Lookup(testList[i], "", SymSpell.editDistanceMax);
            resultSum += suggestions.Count;
        }
    }

    stopWatch.Stop();
    Console.WriteLine(resultSum.ToString("N0") + " results in " + (stopWatch.ElapsedMilliseconds / rounds).ToString() + " ms");
}
/// <summary>
/// Splits <paramref name="input"/> on spaces, looks up the closest spelling suggestion
/// for every token, and passes tokens plus corrections to warmXcorr for every
/// non-directive knowledge pattern whose context equals <paramref name="context"/>.
/// </summary>
/// <param name="input">Space-separated text to process.</param>
/// <param name="onlyWarn">Not read by this method.</param>
/// <param name="session">Forwarded to the extern functor invoked from the callback.</param>
/// <param name="context">Only patterns with this context are processed.</param>
public void warmAttention2(string input, bool onlyWarn, string session, string context)
{
    string[] tokens_ = input.Split(' ');
    List<string> correcteds = new List<string>();
    // filled alongside the corrections but never read in this method
    List<bool> notPush = new List<bool>();

    foreach (string token in tokens_)
    {
        List<SymSpell.SuggestItem> items = corrector.Lookup(token, SymSpell.Verbosity.Closest);
        notPush.Add(false);
        // first suggestion, or an empty placeholder when there is none
        correcteds.Add(items.Count > 0 ? items[0].term : "");
    }

    float max = 0;
    for (int kInd = 0; kInd < knowledge.Count; kInd++)
    {
        var sp = knowledge[kInd];
        bool applies = !sp.isDirective && sp.context.Equals(context);
        if (!applies)
        {
            continue;
        }

        //Console.WriteLine("at pattern {0}", kInd);
        warmXcorr(sp, tokens_, correcteds.ToArray(), max, (string s, Action a, string extra) =>
        {
            //(Parlogike self, string input, List<Variable> args, char dir, bool mutate, Pattern pattern)
            if (Parlogike.externFunctors.ContainsKey(a._operator))
            {
                return (Parlogike.externFunctors[a._operator](this, s, a.arguments, 'i', false, sp, session, extra)).w;
            }

            Console.WriteLine("Operator {0} doesnt exists at line {1}", a._operator, a.line);
            return 0;
        });
    }
}
/// <summary>
/// Looks up <paramref name="input"/> with verbosity All, logs how many suggestions
/// were found, and returns them.
/// </summary>
/// <param name="input">Term to look up.</param>
/// <param name="symSpell">Dictionary instance used for the lookup.</param>
/// <returns>The list returned by the lookup.</returns>
public static List<SymSpell.SuggestItem> Correct(string input, SymSpell symSpell)
{
    //check if input term or similar terms within edit-distance are in dictionary, return results sorted by ascending edit distance, then by descending word frequency
    const SymSpell.Verbosity verbosity = SymSpell.Verbosity.All;
    List<SymSpell.SuggestItem> found = symSpell.Lookup(input, verbosity);

    //per-suggestion logging of term, distance and frequency is disabled:
    //Debug.Log(suggestion.term + " " + suggestion.distance.ToString() + " " + suggestion.count.ToString("N0"));

    if (verbosity != SymSpell.Verbosity.Top)
    {
        Debug.Log(found.Count.ToString() + " suggestions");
    }

    return found;
}
// Use this for initialization: hooks the key-pressed handler of the target keyboard
// and loads the frequency dictionary into symSpell.
void Start()
{
    if (!targetkeyboard)
    {
        targetkeyboard = KeyboardLayout.Instance;
    }

    if (!targetkeyboard)
    {
        Debug.LogError("Target Keyboard Empty");
    }
    else
    {
        targetkeyboard.KeyboardLayout_OnKeyPressed += WordPrediction_KeyPressedHandler;
    }

    Debug.Log("Creating dictionary ...");

    //set parameters
    const int initialCapacity = 82765;
    const int maxEditDistance = 2;
    const int prefixLength = 7;
    symSpell = new SymSpell(initialCapacity, maxEditDistance, prefixLength);

    //Load a frequency dictionary
    //wordfrequency_en.txt ensures high correction quality by combining two data sources:
    //Google Books Ngram data provides representative word frequencies (but contains many entries with spelling errors)
    //SCOWL — Spell Checker Oriented Word Lists which ensures genuine English vocabulary (but contained no word frequencies)
    //Path.Combine uses the platform's directory separator; hard-coded backslashes only resolve on Windows.
    string path = System.IO.Path.Combine(Application.dataPath, "SpellChecker", "Resources", "frequency_dictionary_en_82_765.txt"); //path referencing the SymSpell core project
    //string path = "../../frequency_dictionary_en_82_765.txt"; //path when using symspell nuget package (frequency_dictionary_en_82_765.txt is included in nuget package)
    if (!symSpell.LoadDictionary(path, 0, 1))
    {
        Debug.LogError("\rFile not found: " + System.IO.Path.GetFullPath(path));
    }

    //warm up (result intentionally discarded)
    symSpell.Lookup("warmup", SymSpell.Verbosity.All);
}
/// <summary>
/// Loads the English frequency dictionary, looks up every term of the noisy query
/// file with verbosity Closest, and checks that the suggestion counts add up to 4945.
/// </summary>
public void LookupShouldReplicateNoisyResults()
{
    const int editDistanceMax = 2;
    const int prefixLength = 7;
    const SymSpell.Verbosity verbosity = SymSpell.Verbosity.Closest;

    string dir = AppDomain.CurrentDomain.BaseDirectory;
    string dictionaryFile = dir + "../../../SymSpell/frequency_dictionary_en_82_765.txt"; //for spelling correction (genuine English words)
    string queryFile = dir + "../../../SymSpell.Demo/test_data/noisy_query_en_1000.txt";

    var symSpell = new SymSpell(83000, editDistanceMax, prefixLength);
    symSpell.LoadDictionary(dictionaryFile, 0, 1);

    //load 1000 terms with random spelling errors
    string[] testList = new string[1000];
    int loaded = 0;
    using (var reader = new StreamReader(File.OpenRead(queryFile)))
    {
        //process a single line at a time only for memory efficiency
        for (string line = reader.ReadLine(); line != null; line = reader.ReadLine())
        {
            string[] lineParts = line.Split(null);
            if (lineParts.Length < 2)
            {
                continue;
            }

            testList[loaded++] = lineParts[0];
        }
    }

    int resultSum = 0;
    foreach (string query in testList)
    {
        resultSum += symSpell.Lookup(query, verbosity, symSpell.MaxDictionaryEditDistance).Count;
    }

    Assert.AreEqual(4945, resultSum);
}
/// <summary>
/// Looks up <paramref name="input"/> in the static dictionary, prints the elapsed
/// milliseconds, then prints every suggestion with its distance and count.
/// </summary>
/// <param name="input">Term to look up.</param>
/// <param name="language">Language key forwarded to the lookup.</param>
public static void Correct(string input, string language)
{
    //check if input term or similar terms within edit-distance are in dictionary, return results sorted by ascending edit distance, then by descending word frequency
    Stopwatch timer = Stopwatch.StartNew();
    List<SymSpell.SuggestItem> found = SymSpell.Lookup(input, language, SymSpell.editDistanceMax);
    timer.Stop();

    Console.WriteLine(timer.ElapsedMilliseconds.ToString() + " ms");

    //display term and frequency
    foreach (var item in found)
    {
        string row = item.term + " " + item.distance.ToString() + " " + item.count.ToString("N0");
        Console.WriteLine(row);
    }

    if (SymSpell.verbose != 0)
    {
        Console.WriteLine(found.Count.ToString() + " suggestions");
    }
}
/// <summary>
/// Command-line entry point: loads or creates a dictionary, then reads terms from
/// standard input and writes corrections to standard output until an empty line or
/// end of input. Prints usage help when fewer than three arguments are given.
/// </summary>
/// <param name="args">
/// DictionaryType DictionaryPath [PrefixLength] LookupType [MaxEditDistance] [OutputStats] [Verbosity]
/// </param>
static void Main(string[] args)
{
    if (args.Length <= 2)
    {
        //PrefixLength is number
        //help
        Console.WriteLine("SymSpell.CommandLine DictionaryType DictionaryPath [PrefixLength] LookupType [MaxEditDistance] [OutputStats] [Verbosity]");
        Console.WriteLine();
        Console.WriteLine("DictionaryType=load|create");
        Console.WriteLine(" load: load dictionary from dictionary file");
        Console.WriteLine(" create: create dictionary from text corpus");
        Console.WriteLine("DictionaryPath: path to dictionary/corpus file");
        Console.WriteLine("PrefixLength: default=7 (speed/memory consumption trade-off)"); //dictionary param
        Console.WriteLine(" 5: low memory, slow lookup");
        Console.WriteLine(" 6: medium memory, medium lookup");
        Console.WriteLine(" 7: high memory, fast lookup");
        //lookup intended for correction of single word
        //lookupcompound intended for correction of multiple words, it can insert only a single space per token, faster than wordsegmentation
        //wordsegmentation intended for segmentation and correction of multiple words, it can insert multiple spaces per token, slower than lookupcompound
        Console.WriteLine("LookupType=lookup|lookupcompound|wordsegment");
        Console.WriteLine(" lookup: correct single word");
        Console.WriteLine(" lookupcompound: correct multiple-word string (supports splitting/merging)");
        Console.WriteLine(" wordsegment: word segment and correct input string");
        Console.WriteLine("MaxEditDistance: default=2 (0: no correction, word segmentation only)");
        Console.WriteLine("OutputStats=false|true");
        Console.WriteLine(" false: only corrected string");
        Console.WriteLine(" true: corrected string, edit distance, word frequency/probability");
        Console.WriteLine("Verbosity=top|closest|all"); //no effect for lookupcompound and wordsegment
        Console.WriteLine(" top: Top suggestion");
        Console.WriteLine(" closest: All suggestions of smallest edit distance found");
        Console.WriteLine(" all: All suggestions within maxEditDistance");
        Console.WriteLine();
        return;
    }

    Console.Error.Write("Creating dictionary ...");
    long memSize = GC.GetTotalMemory(true);
    Stopwatch stopWatch = Stopwatch.StartNew();

    //parameters
    int initialCapacity = 82765;
    int termIndex = 0; //column of the term in the dictionary text file
    int countIndex = 1; //column of the term frequency in the dictionary text file

    //dictionaryType: must be exactly "load" or "create". A substring test would also
    //accept "", "oad" or "." and then silently run with an empty dictionary.
    string dictionaryType = args[0].ToLowerInvariant();
    if (dictionaryType != "load" && dictionaryType != "create")
    {
        Console.Error.WriteLine("Error in parameter 1");
        return;
    }

    //dictionaryPath
    string dictionaryPath = AppDomain.CurrentDomain.BaseDirectory + args[1];

    //prefix length (optional parameter): when args[2] is a number it is the prefix
    //length and every following parameter shifts by one position
    int offset = 0;
    int prefixLength;
    if (int.TryParse(args[2], out prefixLength))
    {
        offset = 1;
    }
    else
    {
        prefixLength = 7;
    }

    //lookupType (required): exact match against the supported modes
    if (args.Length <= 2 + offset)
    {
        Console.Error.WriteLine("Error in parameter " + (3 + offset).ToString());
        return;
    }
    string lookupType = args[2 + offset].ToLowerInvariant();
    if (lookupType != "lookup" && lookupType != "lookupcompound" && lookupType != "wordsegment")
    {
        Console.Error.WriteLine("Error in parameter " + (3 + offset).ToString());
        return;
    }

    //maxEditDistance
    int maxEditDistanceDictionary = 2; //maximum edit distance per dictionary precalculation
    if (args.Length > 3 + offset)
    {
        if (!int.TryParse(args[3 + offset], out maxEditDistanceDictionary))
        {
            Console.Error.WriteLine("Error in parameter " + (4 + offset).ToString());
            return;
        }
    }

    //output stats
    bool outputStats = false; //false, true
    if (args.Length > 4 + offset)
    {
        if (!bool.TryParse(args[4 + offset], out outputStats))
        {
            Console.Error.WriteLine("Error in parameter " + (5 + offset).ToString());
            return;
        }
    }

    //verbosity
    var suggestionVerbosity = SymSpell.Verbosity.Top; //Top, Closest, All
    if (args.Length > 5 + offset)
    {
        if (!Enum.TryParse(args[5 + offset], true, out suggestionVerbosity))
        {
            Console.Error.WriteLine("Error in parameter " + (6 + offset).ToString());
            return;
        }
    }

    //create object
    var symSpell = new SymSpell(initialCapacity, maxEditDistanceDictionary, prefixLength);

    //load dictionary
    switch (dictionaryType)
    {
        case "load":
            if (!symSpell.LoadDictionary(dictionaryPath, termIndex, countIndex))
            {
                Console.Error.WriteLine("File not found!");
                return;
            }
            break;
        case "create":
            if (!symSpell.CreateDictionary(dictionaryPath))
            {
                Console.Error.WriteLine("File not found!");
                return;
            }
            break;
        default:
            break;
    }

    stopWatch.Stop();
    long memDelta = GC.GetTotalMemory(true) - memSize;

    //not to stdout, but to Console.Error: status info will alway be on console, but not redirected or piped
    Console.Error.WriteLine("\rDictionary: " + symSpell.WordCount.ToString("N0") + " words, "
        + symSpell.EntryCount.ToString("N0") + " entries, edit distance=" + symSpell.MaxDictionaryEditDistance.ToString()
        + " in " + stopWatch.Elapsed.TotalMilliseconds.ToString("0.0") + "ms "
        + (memDelta / 1024 / 1024.0).ToString("N0") + " MB");

    //warm up (result intentionally discarded)
    symSpell.Lookup("warmup", SymSpell.Verbosity.All);

    //lookup suggestions for input strings; a null line (end of input) ends the loop
    string inputTerm;
    while (!string.IsNullOrEmpty(inputTerm = (Console.ReadLine() ?? "").Trim()))
    {
        switch (lookupType)
        {
            case "lookup":
                var suggestions = symSpell.Lookup(inputTerm, suggestionVerbosity, maxEditDistanceDictionary, true);
                //display suggestions, edit distance and term frequency
                foreach (var suggestion in suggestions)
                {
                    if (outputStats)
                    {
                        Console.WriteLine(suggestion.term + " " + suggestion.distance.ToString() + " " + suggestion.count.ToString("N0"));
                    }
                    else
                    {
                        Console.WriteLine(suggestion.term);
                    }
                }
                break;
            case "lookupcompound":
                var suggestions2 = symSpell.LookupCompound(inputTerm);
                //display suggestions, edit distance and term frequency
                foreach (var suggestion in suggestions2)
                {
                    if (outputStats)
                    {
                        Console.WriteLine(suggestion.term + " " + suggestion.distance.ToString() + " " + suggestion.count.ToString("N0"));
                    }
                    else
                    {
                        Console.WriteLine(suggestion.term);
                    }
                }
                break;
            case "wordsegment":
                var suggestions3 = symSpell.WordSegmentation(inputTerm);
                //display corrected string, edit distance sum and log probability sum
                foreach (var suggestion in suggestions3)
                {
                    if (outputStats)
                    {
                        Console.WriteLine(suggestion.correctedString + " " + suggestion.distanceSum.ToString("N0") + " " + suggestion.probabilityLogSum.ToString());
                    }
                    else
                    {
                        Console.WriteLine(suggestion.correctedString);
                    }
                }
                break;
            default:
                break;
        }
    }
}
/// <summary>
/// Background-worker handler: captures the window of the "Eternal" process, runs OCR
/// on the text box of each of the 12 cards, maps the recognized text to a rank
/// (directly, or through the first spelling suggestion), renders the rankings and
/// reports timing and text results through OutputTestResults.
/// </summary>
/// <param name="sender">Event source; not used.</param>
/// <param name="e">Event arguments; not used.</param>
private void bw_DEBUG_OCR(object sender, DoWorkEventArgs e)
{
    var watch = Stopwatch.StartNew();
    string processedTextResults = "";
    string preProcessedTextResults = "";
    int hits = 0;

    Process p = Process.GetProcessesByName("Eternal").FirstOrDefault();
    if (p != null)
    {
        IntPtr windowHandle = p.MainWindowHandle;

        // Each Pix step yields a new image; the using blocks release all of them
        // once OCR is finished instead of leaking them.
        // Preprocessing variants that were tried and left disabled: adaptive Otsu /
        // Sauvola binarization, inversion, unsharp mask, select-by-size, and a
        // Bitmap conversion with the resolution set to 300 dpi.
        // difference is 20 ms between CaptureWindow and CaptureWindowPix
        using (Pix captured = CaptureWindowPix(windowHandle))
        using (Pix gray = captured.ConvertRGBToGray(0.40f, 0.34f, 0.26f))
        using (Pix img = gray.Scale(scalingFactor, scalingFactor))
        {
            // NOTE(review): the image is scaled by scalingFactor but the text boxes
            // by its integer truncation - these differ for a fractional factor; confirm.
            int boxScale = (int)scalingFactor;

            for (int i = 0; i < 12; i++)
            {
                Rect textbox_Scaled = new Rect(
                    Cards[i].TextboxBounding.X * boxScale,
                    Cards[i].TextboxBounding.Y * boxScale,
                    Cards[i].TextboxBounding.Width * boxScale,
                    Cards[i].TextboxBounding.Height * boxScale);

                using (Page processedImage = ocrEngine.Process(img, textbox_Scaled))
                {
                    DEBUG_PrintImage(processedImage.GetThresholdedImage(), "lower rez (individual)" + i, watch);

                    var text = processedImage.GetText();
                    preProcessedTextResults += text.Replace("\n", "") + Environment.NewLine;
                    text = CleanText(text);
                    processedTextResults += text + Environment.NewLine;

                    if (cardRankings.ContainsKey(text))
                    {
                        Cards[i].Rank = cardRankings[text];
                        hits++;
                    }
                    else if (!String.IsNullOrEmpty(text))
                    {
                        List<SymSpell.suggestItem> suggestions = SymSpell.Lookup(text, "", SymSpell.editDistanceMax);

                        // A suggestion counts as a hit only when it is a known card name;
                        // indexing with an unknown term would throw KeyNotFoundException.
                        if (suggestions.Count > 0 && cardRankings.ContainsKey(suggestions[0].term))
                        {
                            Cards[i].Rank = cardRankings[suggestions[0].term];
                            hits++;
                        }
                        else
                        {
                            Cards[i].Rank = "U";
                        }
                    }
                    else
                    {
                        Cards[i].Rank = string.Empty;
                    }
                }
            }
        }

        RenderRankings();
    }

    watch.Stop();
    var elapsedMs = watch.ElapsedMilliseconds;
    OutputTestResults(elapsedMs, processedTextResults, hits, preProcessedTextResults);
}
/// <summary>
/// Command-line entry point: loads or creates a dictionary, then reads single words
/// from standard input and prints suggestion, edit distance and term frequency until
/// an empty line or end of input. Prints usage help when fewer than two arguments
/// are given.
/// </summary>
/// <param name="args">load|create Path [MaxEditDistance] [Verbosity] [PrefixLength]</param>
static void Main(string[] args)
{
    if (args.Length < 2)
    {
        //help
        Console.WriteLine("SymSpell.CommandLine load Path [MaxEditDistance] [Verbosity] [PrefixLength]");
        Console.WriteLine("SymSpell.CommandLine create Path [MaxEditDistance] [Verbosity] [PrefixLength]");
        Console.WriteLine();
        Console.WriteLine("load: load dictionary from dictionary file");
        Console.WriteLine("create: create dictionary from text corpus");
        Console.WriteLine("MaxEditDistance: default=2");
        Console.WriteLine("Verbosity=Top|Closest|All (case-sensitive)");
        Console.WriteLine("PrefixLength: default=7 (5:low memory; 7:fast lookup)");
        Console.WriteLine();
        return;
    }

    Console.Error.Write("Creating dictionary ...");
    long memSize = GC.GetTotalMemory(true);
    Stopwatch stopWatch = Stopwatch.StartNew();

    //mode: anything other than load/create is rejected; previously it fell through
    //silently and every lookup ran against an empty dictionary
    string mode = args[0].ToLowerInvariant();
    if (mode != "load" && mode != "create")
    {
        Console.Error.WriteLine("Error in parameter 1");
        return;
    }

    //parameters
    int initialCapacity = 82765;
    int maxEditDistanceDictionary = 2; //maximum edit distance per dictionary precalculation
    if (args.Length > 2)
    {
        if (!int.TryParse(args[2], out maxEditDistanceDictionary))
        {
            Console.Error.WriteLine("Error in parameter 3");
            return;
        }
    }
    int maxEditDistanceLookup = maxEditDistanceDictionary; //max edit distance per lookup

    var suggestionVerbosity = SymSpell.Verbosity.Top; //Top, Closest, All
    if (args.Length > 3)
    {
        if (!Enum.TryParse(args[3], out suggestionVerbosity))
        {
            Console.Error.WriteLine("Error in parameter 4");
            return;
        }
    }

    int prefixLength = 7;
    if (args.Length > 4)
    {
        if (!int.TryParse(args[4], out prefixLength))
        {
            Console.Error.WriteLine("Error in parameter 5");
            return;
        }
    }

    string dictionaryPath = AppDomain.CurrentDomain.BaseDirectory + args[1]; // "../../../../SymSpell/frequency_dictionary_en_82_765.txt";
    int termIndex = 0; //column of the term in the dictionary text file
    int countIndex = 1; //column of the term frequency in the dictionary text file

    //create object
    var symSpell = new SymSpell(initialCapacity, maxEditDistanceDictionary, prefixLength);

    //load dictionary
    switch (mode)
    {
        case "load":
            if (!symSpell.LoadDictionary(dictionaryPath, termIndex, countIndex))
            {
                Console.Error.WriteLine("File not found!");
                return;
            }
            break;
        case "create":
            if (!symSpell.CreateDictionary(dictionaryPath))
            {
                Console.Error.WriteLine("File not found!");
                return;
            }
            break;
        default:
            break;
    }

    stopWatch.Stop();
    long memDelta = GC.GetTotalMemory(true) - memSize;

    //not to stdout, but to Console.Error: status info will alway be on console, but not redirected or piped
    Console.Error.WriteLine("\rDictionary: " + symSpell.WordCount.ToString("N0") + " words, "
        + symSpell.EntryCount.ToString("N0") + " entries, edit distance=" + symSpell.MaxDictionaryEditDistance.ToString()
        + " in " + stopWatch.Elapsed.TotalMilliseconds.ToString("0.0") + "ms "
        + (memDelta / 1024 / 1024.0).ToString("N0") + " MB");

    //warm up (result intentionally discarded)
    symSpell.Lookup("warmup", SymSpell.Verbosity.All, 1);

    //lookup suggestions for single-word input strings; a null line (end of input) ends the loop
    string inputTerm;
    while (!string.IsNullOrEmpty(inputTerm = (Console.ReadLine() ?? "").Trim()))
    {
        var suggestions = symSpell.Lookup(inputTerm, suggestionVerbosity, maxEditDistanceLookup, true);

        //display suggestions, edit distance and term frequency
        foreach (var suggestion in suggestions)
        {
            Console.WriteLine(suggestion.term + " " + suggestion.distance.ToString() + " " + suggestion.count.ToString("N0"));
        }
    }
}
private static async Task Main(string[] args) { var directory = new DirectoryInfo("temp_git"); if (directory.Exists) { NormalizeDirectoryAttributes(directory); directory.Delete(true); } void NormalizeDirectoryAttributes(DirectoryInfo directoryInfo) { foreach (var subPath in directoryInfo.GetDirectories()) { NormalizeDirectoryAttributes(subPath); } foreach (var file in directoryInfo.GetFiles()) { file.Attributes = FileAttributes.Normal; } } await Task.Delay(TimeSpan.FromSeconds(1)); var info = new ProcessStartInfo("git", "clone https://github.com/discord-csharp/MODiX temp_git"); var p = Process.Start(info); if (p == null) { throw new InvalidOperationException("process handle was null"); } p.WaitForExit(); await Task.Delay(TimeSpan.FromSeconds(2)); var extensions = new[] { ".txt", ".md", ".cs" }; var spellingInfos = new List <FileSpellingInfo>(); var spell = new SymSpell(); if (!spell.LoadDictionary(Path.Combine(Environment.CurrentDirectory, "frequency_dictionary_en_82_765.txt"), 0, 1)) { throw new InvalidOperationException(); } foreach (var file in Directory.GetFiles(Path.Combine(Environment.CurrentDirectory, "temp_git"), "*", SearchOption.AllDirectories)) { if (!extensions.Contains(Path.GetExtension(file))) { continue; } var spellingInfo = new FileSpellingInfo { Path = file }; var fileContents = File.ReadAllLines(file); for (var i = 0; i < fileContents.Length; i++) { var line = fileContents[i].Trim(); if (string.IsNullOrWhiteSpace(line)) { continue; } spellingInfo.LineMistakes.Add((i + 1, line), new List <(string, string)>()); var words = line.Split(' ', StringSplitOptions.RemoveEmptyEntries).Select(c => c.RemoveSpecialCharacters()); foreach (var word in words.Where(c => !string.IsNullOrWhiteSpace(c))) { var results = spell.Lookup(word.ToLower(), SymSpell.Verbosity.Top); if (results == null || results.Any() == false) { continue; } var suggestion = results.First(); if (suggestion.term == word.ToLower()) { continue; } spellingInfo.LineMistakes[(i + 1, 
line)].Add((word, suggestion.term));
// Benchmarks dictionary precalculation (time, memory) and lookup (time, result count)
// for the instance-based SymSpell and Original.SymSpell, over
// maxEditDistance=1..3, prefixLength=5..7, every entry of DictionaryPath and every
// SymSpell.Verbosity value, then prints the averaged totals.
static void BenchmarkPrecalculationLookup()
{
    string[] query1k = BuildQuery1K();
    // fixed single-term queries, each repeated 'repetitions' times, with their report labels
    string[] singleQueries = { "different", "hockie" };
    string[] singleLabels = { "exact", "non-exact" };

    int repetitions = 1000;
    int resultNumber = 0;
    int totalLoopCount = 0;
    long totalRepetitions = 0;
    long totalMatches = 0;
    long totalOrigMatches = 0;
    double totalLoadTime = 0, totalMem = 0, totalLookupTime = 0;
    double totalOrigLoadTime = 0, totalOrigMem = 0, totalOrigLookupTime = 0;

    Stopwatch stopWatch = new Stopwatch();

    for (int maxEditDistance = 1; maxEditDistance <= 3; maxEditDistance++)
    {
        for (int prefixLength = 5; prefixLength <= 7; prefixLength++)
        {
            //benchmark dictionary precalculation size and time
            //maxEditDistance=1/2/3; prefixLength=5/6/7; dictionary=30k/82k/500k; class=instantiated/static
            for (int i = 0; i < DictionaryPath.Length; i++)
            {
                totalLoopCount++;

                //instantiated dictionary
                long memSize = GC.GetTotalMemory(true);
                stopWatch.Restart();
                SymSpell dict = new SymSpell(DictionarySize[i], maxEditDistance, prefixLength);
                dict.LoadDictionary(DictionaryPath[i], 0, 1);
                stopWatch.Stop();
                long memDelta = GC.GetTotalMemory(true) - memSize;

                double loadSeconds = stopWatch.Elapsed.TotalSeconds;
                double loadMegabytes = memDelta / 1024.0 / 1024.0;
                totalLoadTime += loadSeconds;
                totalMem += loadMegabytes;
                Console.WriteLine("Precalculation instance " + loadSeconds.ToString("N3") + "s "
                    + loadMegabytes.ToString("N1") + "MB "
                    + dict.WordCount.ToString("N0") + " words "
                    + dict.EntryCount.ToString("N0") + " entries MaxEditDistance=" + maxEditDistance.ToString()
                    + " prefixLength=" + prefixLength.ToString()
                    + " dict=" + DictionaryName[i]);

                //static dictionary
                memSize = GC.GetTotalMemory(true);
                stopWatch.Restart();
                Original.SymSpell dictOrig = new Original.SymSpell(maxEditDistance, prefixLength);
                dictOrig.LoadDictionary(DictionaryPath[i], "", 0, 1);
                stopWatch.Stop();
                memDelta = GC.GetTotalMemory(true) - memSize;

                double origLoadSeconds = stopWatch.Elapsed.TotalSeconds;
                totalOrigLoadTime += origLoadSeconds;
                totalOrigMem += memDelta / 1024.0 / 1024.0;
                // the printed value divides by an integer 1024 first, as before
                Console.WriteLine("Precalculation static " + origLoadSeconds.ToString("N3") + "s "
                    + (memDelta / 1024 / 1024.0).ToString("N1") + "MB "
                    + dictOrig.Count.ToString("N0") + " words "
                    + dictOrig.EntryCount.ToString("N0") + " entries MaxEditDistance=" + maxEditDistance.ToString()
                    + " prefixLength=" + prefixLength.ToString()
                    + " dict=" + DictionaryName[i]);

                //benchmark lookup result number and time
                //maxEditDistance=1/2/3; prefixLength=5/6/7; dictionary=30k/82k/500k; verbosity=0/1/2; query=exact/non-exact/mix; class=instantiated/static
                foreach (SymSpell.Verbosity verbosity in Enum.GetValues(typeof(SymSpell.Verbosity)))
                {
                    double elapsedMs;

                    //exact and non-exact single-term queries
                    for (int q = 0; q < singleQueries.Length; q++)
                    {
                        string term = singleQueries[q];
                        string label = singleLabels[q];

                        //instantiated
                        stopWatch.Restart();
                        for (int round = 0; round < repetitions; round++)
                        {
                            resultNumber = dict.Lookup(term, verbosity, maxEditDistance).Count;
                        }
                        stopWatch.Stop();
                        elapsedMs = stopWatch.Elapsed.TotalMilliseconds;
                        totalLookupTime += elapsedMs;
                        totalMatches += resultNumber;
                        Console.WriteLine("Lookup instance " + resultNumber.ToString("N0") + " results "
                            + (elapsedMs / repetitions).ToString("N6") + "ms/op verbosity=" + verbosity.ToString()
                            + " query=" + label);

                        //static
                        stopWatch.Restart();
                        for (int round = 0; round < repetitions; round++)
                        {
                            resultNumber = dictOrig.Lookup(term, "", maxEditDistance, (int)verbosity).Count;
                        }
                        stopWatch.Stop();
                        elapsedMs = stopWatch.Elapsed.TotalMilliseconds;
                        totalOrigLookupTime += elapsedMs;
                        totalOrigMatches += resultNumber;
                        Console.WriteLine("Lookup static " + resultNumber.ToString("N0") + " results "
                            + (elapsedMs / repetitions).ToString("N6") + "ms/op verbosity=" + verbosity.ToString()
                            + " query=" + label);

                        Console.WriteLine();
                        totalRepetitions += repetitions;
                    }

                    //instantiated mix
                    stopWatch.Restart();
                    resultNumber = 0;
                    foreach (var word in query1k)
                    {
                        resultNumber += dict.Lookup(word, verbosity, maxEditDistance).Count;
                    }
                    stopWatch.Stop();
                    elapsedMs = stopWatch.Elapsed.TotalMilliseconds;
                    totalLookupTime += elapsedMs;
                    totalMatches += resultNumber;
                    Console.WriteLine("Lookup instance " + resultNumber.ToString("N0") + " results "
                        + (elapsedMs / query1k.Length).ToString("N6") + "ms/op verbosity=" + verbosity.ToString()
                        + " query=mix");

                    //static mix
                    stopWatch.Restart();
                    resultNumber = 0;
                    foreach (var word in query1k)
                    {
                        resultNumber += dictOrig.Lookup(word, "", maxEditDistance, (int)verbosity).Count;
                    }
                    stopWatch.Stop();
                    elapsedMs = stopWatch.Elapsed.TotalMilliseconds;
                    totalOrigLookupTime += elapsedMs;
                    totalOrigMatches += resultNumber;
                    Console.WriteLine("Lookup static " + resultNumber.ToString("N0") + " results "
                        + (elapsedMs / query1k.Length).ToString("N6") + "ms/op verbosity=" + verbosity.ToString()
                        + " query=mix");

                    Console.WriteLine();
                    totalRepetitions += query1k.Length;
                }

                Console.WriteLine();

                // drop the references before the next iteration measures memory
                dict = null;
                dictOrig = null;
            }
        }
    }

    Console.WriteLine("Average Precalculation time instance " + (totalLoadTime / totalLoopCount).ToString("N3") + "s "
        + ((totalLoadTime / totalOrigLoadTime) - 1).ToString("P1"));
    Console.WriteLine("Average Precalculation time static " + (totalOrigLoadTime / totalLoopCount).ToString("N3") + "s");
    Console.WriteLine("Average Precalculation memory instance " + (totalMem / totalLoopCount).ToString("N1") + "MB "
        + ((totalMem / totalOrigMem) - 1).ToString("P1"));
    Console.WriteLine("Average Precalculation memory static " + (totalOrigMem / totalLoopCount).ToString("N1") + "MB");
    Console.WriteLine("Average Lookup time instance " + (totalLookupTime / totalRepetitions).ToString("N3") + "ms "
        + ((totalLookupTime / totalOrigLookupTime) - 1).ToString("P1"));
    Console.WriteLine("Average Lookup time static " + (totalOrigLookupTime / totalRepetitions).ToString("N3") + "ms");
    Console.WriteLine("Total Lookup results instance " + totalMatches.ToString("N0") + " "
        + (totalMatches - totalOrigMatches) + " differences");
    Console.WriteLine("Total Lookup results static " + totalOrigMatches.ToString("N0"));
}
/// <summary>
/// Coroutine that refreshes the prediction UI for the last space-separated word of
/// <paramref name="text"/>: the first suggestion (after sorting) is shown on <c>PredictedWord</c>,
/// and up to three distinct suggestion-ending characters are written to <c>opt1</c>,
/// <c>opt2</c> and <c>opt3</c>.
/// </summary>
/// <param name="text">Text typed so far. It is split on single spaces and only the final piece is looked up.</param>
/// <returns>An enumerator that yields a single <c>null</c> after the UI has been updated.</returns>
private IEnumerator populatePrediction(string text)
{
    string[] words = text.Split(' ');
    string lastWord = words[words.Length - 1].ToLower();

    // Create the dictionary on first use. symSpell may still be null afterwards,
    // in which case the UI is left untouched.
    if (symSpell == null)
    {
        initDict();
    }

    if (symSpell != null)
    {
        Debug.Log("GFX: LastWord:" + lastWord);

        List<SymSpell.SuggestItem> suggestions = symSpell.Lookup(lastWord, SymSpell.Verbosity.All, 3);
        suggestions.Sort(); // default ordering defined by SuggestItem

        // Lookup may return no suggestions at all; only index the list when it has an entry.
        if (suggestions.Count > 0)
        {
            string best = suggestions[0].term.ToUpper();
            Text txtTag = PredictedWord.GetComponentInChildren<Text>();
            if (txtTag != null)
            {
                txtTag.text = best;
            }
            PredictedWord.name = best;
        }

        // Collect the final character of suggestions that are longer than the typed word,
        // stopping once three distinct characters have been found.
        HashSet<char> cSet = new HashSet<char>();
        for (int index = 0; index < suggestions.Count && cSet.Count < 3; index++)
        {
            string sug = suggestions[index].term;
            if (string.IsNullOrEmpty(sug))
            {
                // Nothing to index into; an empty term has no last character.
                continue;
            }

            char last = sug[sug.Length - 1];
            Debug.Log("GFX: Filter1: Sug" + sug + " last:" + last);
            if (sug.Length > lastWord.Length && last != '\0')
            {
                cSet.Add(last);
            }
        }

        // Hand the collected characters to the option objects in order.
        // The slot number only advances when the current slot object exists.
        int slot = 1;
        foreach (char c in cSet)
        {
            GameObject cGameObject = null;
            switch (slot)
            {
                case 1:
                    cGameObject = opt1;
                    break;
                case 2:
                    cGameObject = opt2;
                    break;
                case 3:
                    cGameObject = opt3;
                    break;
            }

            if (cGameObject != null)
            {
                cGameObject.name = "" + c;
                Text txtText = cGameObject.GetComponentInChildren<Text>();
                if (txtText != null)
                {
                    string upper = c.ToString().ToUpper();
                    txtText.text = "" + upper;
                    cGameObject.name = "" + upper;
                }
                slot++;
            }
        }
    }

    yield return null;
}
/// <summary>
/// Click handler that spell-corrects a user-selected text file with SymSpell and writes the
/// corrected text to a randomly named <c>.txt</c> file in the same directory as the selected file.
/// </summary>
/// <param name="sender">Event source (not used).</param>
/// <param name="e">Routed event data (not used).</param>
private void SpellCorrect_Click(object sender, RoutedEventArgs e)
{
    ConsoleManager.Show();

    const int initialCapacity = 82765 * 2;
    const int maxEditDistance = 5;
    const int prefixLength = 7;
    SymSpell symSpell = new SymSpell(initialCapacity, maxEditDistance, prefixLength);

    // Load the frequency dictionaries.
    //wordfrequency_en.txt ensures high correction quality by combining two data sources:
    //Google Books Ngram data provides representative word frequencies (but contains many entries with spelling errors)
    //SCOWL — Spell Checker Oriented Word Lists which ensures genuine English vocabulary (but contained no word frequencies)
    // NOTE(review): both paths are machine-specific absolute paths.
    string path = @"C:\Users\Emmanuel\source\repos\Project-Carl\Project-CARL\WpfApp1\frequency_dictionary_en_82_765.txt";
    string dict2 = @"C:\Users\Emmanuel\source\repos\Project-Carl\Project-CARL\WpfApp1\unigram_freq.txt";

    // A failed load is reported on the console, but processing continues afterwards.
    if (!symSpell.LoadDictionary(path, 0, 1))
    {
        Console.Error.WriteLine("\rFile not found: " + System.IO.Path.GetFullPath(path));
        Console.ReadKey();
    }
    if (!symSpell.LoadDictionary(dict2, 0, 1))
    {
        // Report the file that actually failed to load (dict2, not path).
        Console.Error.WriteLine("\rFile not found: " + System.IO.Path.GetFullPath(dict2));
        Console.ReadKey();
    }

    // Ask the user which file to correct; stop if the dialog is dismissed,
    // because there is no file to read in that case.
    string correctionFile;
    System.Windows.Forms.MessageBox.Show("Choose file to Correct");
    using (OpenFileDialog openFileDialog1 = new OpenFileDialog())
    {
        if (openFileDialog1.ShowDialog() != System.Windows.Forms.DialogResult.OK)
        {
            return;
        }
        correctionFile = openFileDialog1.FileName;
    }
    System.Windows.Forms.MessageBox.Show(correctionFile);

    // Read the raw text and tokenize it on single spaces.
    string corp = File.ReadAllText(correctionFile, Encoding.UTF8);
    string[] words = corp.Split(new string[] { " " }, StringSplitOptions.None);

    // Correct token by token; every token is followed by one space in the output.
    System.Text.StringBuilder corrected = new System.Text.StringBuilder(corp.Length + 1);
    foreach (string token in words)
    {
        string replacement = token;

        // Empty tokens come from consecutive spaces and are passed through unchanged.
        if (token.Length > 0)
        {
            List<SymSpell.SuggestItem> suggestions = symSpell.Lookup(token, SymSpell.Verbosity.Closest);

            // Keep the original token when the dictionary offers no suggestion.
            if (suggestions.Count > 0)
            {
                replacement = suggestions[0].term;
            }
        }

        corrected.Append(replacement).Append(' ');
    }

    // Save next to the selected file under a random file name.
    string fileName = System.IO.Path.GetRandomFileName() + ".txt";
    string pathString = System.IO.Path.Combine(System.IO.Path.GetDirectoryName(correctionFile), fileName);
    File.WriteAllText(pathString, corrected.ToString());
}