// Creates a SymSpell instance and loads "frequency_dictionary_en_82_765.txt" from the
// application's base directory.
//
// ErrorMsg: empty on success; otherwise explains why no instance was returned
//           (dictionary could not be loaded, or the text of the exception that was caught).
// Returns:  the loaded instance, or null on any failure.
public SymSpell CreateDictionary(out string ErrorMsg)
{
    ErrorMsg = string.Empty;
    try
    {
        const int initialCapacity = 82765;
        const int maxEditDistance = 2;
        const int prefixLength = 7;
        var symSpell = new SymSpell(initialCapacity, maxEditDistance, prefixLength);

        // Path.Combine copes with a base directory that has no trailing separator.
        string path = System.IO.Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "frequency_dictionary_en_82_765.txt");
        if (!symSpell.LoadDictionary(path, 0, 1))
        {
            // Previously this returned null with an empty message, leaving callers
            // with no hint about what went wrong.
            ErrorMsg = "Dictionary could not be loaded: " + path;
            return null;
        }

        // Run one lookup before handing the instance out; the result itself is not needed.
        symSpell.Lookup("warmup", SymSpell.Verbosity.All);
        return symSpell;
    }
    catch (Exception ex)
    {
        ErrorMsg = ex.ToString();
        return null;
    }
}
// Registers the text post-processing pipeline as a singleton: empty-line removal (LF line endings)
// followed by per-word SymSpell correction.
//
// services: the service collection that receives the CombinedProcessor singleton.
private static void AddPostProcessing(IServiceCollection services)
{
    const string dictionaryPath = "../../ru.dict";

    var symSpell = new SymSpell();
    Console.Out.WriteLine("Loading SymSpell dictionary...");
    if (symSpell.LoadDictionary(dictionaryPath, termIndex: 0, countIndex: 1))
    {
        Console.Out.WriteLine("SymSpell initialized!");
    }
    else
    {
        // The load result used to be ignored, so a missing dictionary was reported as success.
        // Registration still proceeds, but the problem is now visible.
        Console.Error.WriteLine("SymSpell dictionary could not be loaded: " + System.IO.Path.GetFullPath(dictionaryPath));
    }

    var removeEmptyLines = new RemoveEmptyLinesProcessor(new RemoveEmptyLinesOptions
    {
        NormalizeLineEndings = NormalizeLineEndingsStrategy.Lf
    });
    var perWord = new PerWordProcessor(new ITextPostProcessor[]
    {
        new SymSpellProcessor(symSpell, 1, Enumerable.Empty<string>())
    });
    var postProcessor = new CombinedProcessor(new ITextPostProcessor[] { removeEmptyLines, perWord });

    services.AddSingleton(postProcessor);
}
// Click handler: looks up spelling suggestions for the text in richTextBox1 and appends
// each suggested term to listBox1.
private void button1_Click(object sender, EventArgs e)
{
    const int initialCapacity = 82765;
    const int maxEditDistanceDictionary = 2; // maximum edit distance per dictionary precalculation
    var symSpell = new SymSpell(initialCapacity, maxEditDistanceDictionary);

    const string dictionaryPath = "../../frequency_dictionary_en_82_765.txt";
    const int termIndex = 0;  // column of the term in the dictionary text file
    const int countIndex = 1; // column of the term frequency in the dictionary text file
    if (!symSpell.LoadDictionary(dictionaryPath, termIndex, countIndex))
    {
        richTextBox1.Text = "File not found!";
        // Stop here: otherwise the error text just written would itself be used as the query.
        return;
    }

    // Invariant lower-casing keeps the query independent of the UI culture.
    string query = richTextBox1.Text.ToLowerInvariant();

    const int maxEditDistanceLookup = 1; // must be <= maxEditDistanceDictionary
    var suggestions = symSpell.Lookup(query, SymSpell.Verbosity.Closest, maxEditDistanceLookup);
    foreach (var suggestion in suggestions)
    {
        listBox1.Items.Add(suggestion.term);
    }
}
// Executes both SymSpell implementations once (load + one lookup each) so that the code
// has already run before any benchmark is timed. The lookup results are discarded.
static void WarmUp()
{
    var current = new SymSpell(16, 2, 7);
    current.LoadDictionary(DictionaryPath[0], 0, 1);
    current.Lookup("hockie", SymSpell.Verbosity.All, 1);

    var baseline = new Original.SymSpell(2, 7);
    baseline.LoadDictionary(DictionaryPath[0], "", 0, 1);
    baseline.Lookup("hockie", "", 1, 2);
}
// Word-segmentation demo: loads the English frequency dictionary, prints load statistics,
// then segments/corrects every non-empty line typed on the console.
static void Main(string[] args)
{
    // SymSpell parameters
    const int initialCapacity = 82765;
    const int maxEditDistance = 0;
    const int prefixLength = 7;
    SymSpell symSpell = new SymSpell(initialCapacity, maxEditDistance, prefixLength);

    Console.Write("Creating dictionary ...");
    long memoryBefore = GC.GetTotalMemory(true);
    Stopwatch timer = Stopwatch.StartNew();

    // Frequency dictionary combining Google Books Ngram frequencies with the SCOWL word lists.
    // When using the symspell nuget package the file is at "../../frequency_dictionary_en_82_765.txt" instead.
    // A dictionary can alternatively be created from a clean text corpus (e.g. http://norvig.com/big.txt).
    string path = AppDomain.CurrentDomain.BaseDirectory + "frequency_dictionary_en_82_765.txt";
    bool loaded = symSpell.LoadDictionary(path, 0, 1);
    if (!loaded)
    {
        Console.Error.WriteLine("\rFile not found: " + Path.GetFullPath(path));
        Console.ReadKey();
        return;
    }

    timer.Stop();
    long memDelta = GC.GetTotalMemory(true) - memoryBefore;

    string words = symSpell.WordCount.ToString("N0");
    string entries = symSpell.EntryCount.ToString("N0");
    string distance = symSpell.MaxDictionaryEditDistance.ToString();
    string elapsed = timer.Elapsed.TotalMilliseconds.ToString("0.0");
    string megabytes = (memDelta / 1024 / 1024.0).ToString("N0");
    Console.WriteLine("\rDictionary: " + words + " words, " + entries + " entries, edit distance=" + distance
        + " in " + elapsed + "ms " + megabytes + " MB");

    // warm up (result not needed)
    symSpell.WordSegmentation("isit");

    Console.WriteLine("Type in a text and hit enter to get word segmentation and correction:");
    while (true)
    {
        string input = (Console.ReadLine() ?? "").Trim();
        if (string.IsNullOrEmpty(input))
        {
            break;
        }
        Correct(input, symSpell);
    }
}
// Experiment 1: concatenates the OCR text of every Vision API response file under D:\json\,
// splits it into tokens, looks each token up in SymSpell (Verbosity.Top) and writes one row per
// token to the Excel report via CreateExcelFileExperimento.
//
// Row fields: original = the token, correctionLookupCompound = best suggestion (or the token when
// there is none), correction = "modificada" when the suggestion differs from the token, else "igual".
private static void Experimento1()
{
    string strPath = @"D:\json\";
    string dictionaryPath = @"D:\DictionaryFiles\default.txt";

    // SymSpell parameters
    const int initialCapacity = 82765;
    const int maxEditDistance = 5;
    const int prefixLength = 7;
    SymSpell symSpell = new SymSpell(initialCapacity, maxEditDistance, prefixLength);

    // Fail fast: without a dictionary every token would silently be reported as "igual".
    if (!symSpell.LoadDictionary(dictionaryPath, 0, 1))
    {
        System.Console.Error.WriteLine("Dictionary could not be loaded: " + dictionaryPath);
        return;
    }

    StringBuilder OCROriginal = new StringBuilder();
    foreach (string path in Directory.GetFiles(strPath))
    {
        string jsonText = File.ReadAllText(path, Encoding.Default);
        var response = Google.Protobuf.JsonParser.Default.Parse<Google.Cloud.Vision.V1.AnnotateFileResponse>(jsonText);
        foreach (var respuestas in response.Responses)
        {
            var annotation = respuestas.FullTextAnnotation;
            if (annotation != null)
            {
                OCROriginal.Append(annotation.Text);
            }
        }
    }

    // Normalise punctuation to spaces. ": " is protected with a temporary "***" marker so that it
    // survives the removal of the other colon forms. Empty tokens produced by consecutive
    // separators are dropped instead of being spell-checked.
    string[] arrayOCROriginal = OCROriginal.ToString()
        .Replace("\n", " ")
        .Replace("{", "")
        .Replace("}", "")
        .Replace(": ", "***")
        .Replace(" : ", " ")
        .Replace(":", " ")
        .Replace("***", ": ")
        .Replace(". ", " ")
        .Replace(", ", " ")
        .Replace("-", " ")
        .Split(new[] { ' ' }, System.StringSplitOptions.RemoveEmptyEntries);

    Dictionary<int, ExperimentSpell> excelMatrix = new Dictionary<int, ExperimentSpell>();
    int k = 0;
    foreach (string item in arrayOCROriginal)
    {
        ExperimentSpell exp1 = new ExperimentSpell();
        exp1.correction = "igual";
        exp1.original = item;
        exp1.correctionLookupCompound = item;

        List<SymSpell.SuggestItem> suggestions = symSpell.Lookup(item, SymSpell.Verbosity.Top);
        if (suggestions.Count > 0)
        {
            string best = suggestions[0].term;
            exp1.correctionLookupCompound = best;
            // A suggestion identical to the token is not a modification.
            if (best != item)
            {
                exp1.correction = "modificada";
            }
        }

        excelMatrix.Add(k++, exp1);
    }

    CreateExcelFileExperimento(excelMatrix, "1");
}
// Lookup demo: loads the English frequency dictionary, prints load statistics, then corrects
// every non-empty line read from the console.
public static void Main(string[] args)
{
    Console.Write("Creating dictionary ...");
    long memoryBefore = GC.GetTotalMemory(true);
    Stopwatch timer = Stopwatch.StartNew();

    // SymSpell parameters
    const int initialCapacity = 82765;
    const int maxEditDistance = 2;
    const int prefixLength = 7;
    var symSpell = new SymSpell(initialCapacity, maxEditDistance, prefixLength);

    // Frequency dictionary combining Google Books Ngram frequencies with the SCOWL word lists.
    // Benchmark-only alternatives live under ../../../SymSpell.Demo/test_data/ (30_000 and 500_000 entries);
    // with the symspell nuget package the file is at "../../frequency_dictionary_en_82_765.txt".
    // A dictionary can alternatively be created from a clean text corpus (e.g. http://norvig.com/big.txt)
    // with CreateDictionary, and extended incrementally with CreateDictionaryEntry.
    string path = "../../../SymSpell/frequency_dictionary_en_82_765.txt";
    bool loaded = symSpell.LoadDictionary(path, 0, 1);
    if (!loaded)
    {
        Console.Error.WriteLine("File not found: " + Path.GetFullPath(path));
    }

    timer.Stop();
    long memDelta = GC.GetTotalMemory(true) - memoryBefore;

    string words = symSpell.WordCount.ToString("N0");
    string entries = symSpell.EntryCount.ToString("N0");
    string distance = symSpell.MaxDictionaryEditDistance.ToString();
    string elapsed = timer.Elapsed.TotalMilliseconds.ToString("0.0");
    string megabytes = (memDelta / 1024 / 1024.0).ToString("N0");
    Console.WriteLine("\rDictionary: " + words + " words, " + entries + " entries, edit distance=" + distance
        + " in " + elapsed + "ms " + megabytes + " MB");

    // warm up (result not needed)
    symSpell.Lookup("warmup", SymSpell.Verbosity.All, 1);

    while (true)
    {
        string input = (Console.ReadLine() ?? "").Trim();
        if (string.IsNullOrEmpty(input))
        {
            break;
        }
        Correct(input, symSpell);
    }
}
// Creates the symSpell field and fills it from the "dataset" text asset loaded through Resources.
// Logs a message when LoadDictionary reports failure.
public void initDict()
{
    const int capacity = 20000;
    const int dictionaryEditDistance = 3; // maximum edit distance per dictionary precalculation
    symSpell = new SymSpell(capacity, dictionaryEditDistance);

    TextAsset dataset = Resources.Load<TextAsset>("dataset");

    const int termColumn = 0;  // column of the term in the dictionary text
    const int countColumn = 1; // column of the term frequency in the dictionary text
    bool loaded = symSpell.LoadDictionary(dataset, termColumn, countColumn);
    if (loaded)
    {
        return;
    }

    Debug.Log("Unable to load dictionary");
}
// Demo for the static SymSpell API: configures the global parameters, loads the English frequency
// dictionary, prints load statistics, then corrects every non-empty line read from the console.
public static void Main(string[] args)
{
    // global parameters
    SymSpell.verbose = 0;
    SymSpell.editDistanceMax = 2;
    SymSpell.lp = 7;

    Console.Write("Creating dictionary ...");
    Stopwatch timer = Stopwatch.StartNew();

    // Frequency dictionary combining Google Books Ngram frequencies with the SCOWL word lists.
    // Benchmark-only alternatives live under ../../../symspelldemo/test_data/ (30_000 and 500_000 entries);
    // with the symspell nuget package the file is at "../../frequency_dictionary_en_82_765.txt".
    // A dictionary can alternatively be created from a clean text corpus (e.g. http://norvig.com/big.txt)
    // with CreateDictionary, and extended incrementally with CreateDictionaryEntry.
    string path = "../../../symspell/frequency_dictionary_en_82_765.txt";
    bool loaded = SymSpell.LoadDictionary(path, "", 0, 1);
    if (!loaded)
    {
        Console.Error.WriteLine("File not found: " + Path.GetFullPath(path));
    }

    timer.Stop();

    string words = SymSpell.wordlist.Count.ToString("N0");
    string entries = SymSpell.dictionary.Count.ToString("N0");
    string distance = SymSpell.editDistanceMax.ToString();
    string elapsed = timer.ElapsedMilliseconds.ToString();
    string megabytes = (Process.GetCurrentProcess().PrivateMemorySize64 / 1000000).ToString("N0");
    Console.WriteLine("\rDictionary: " + words + " words, " + entries + " entries, edit distance=" + distance
        + " in " + elapsed + "ms " + megabytes + " MB");

    while (true)
    {
        string input = (Console.ReadLine() ?? "").Trim();
        if (string.IsNullOrEmpty(input))
        {
            break;
        }
        Correct(input, "");
    }
}
// Creates the sym field and loads the English frequency dictionary from
// <streamingAssetsPath>/SymSpell/. Logs an error when the load fails.
private void InitSym()
{
    const int capacity = 82765;
    const int dictionaryEditDistance = 2; // maximum edit distance per dictionary precalculation
    sym = new SymSpell(capacity, dictionaryEditDistance);

    string dictionaryFile = Path.Combine(Application.streamingAssetsPath, "SymSpell", "frequency_dictionary_en_82_765.txt");

    const int termColumn = 0;  // column of the term in the dictionary text file
    const int countColumn = 1; // column of the term frequency in the dictionary text file
    bool loaded = sym.LoadDictionary(dictionaryFile, termColumn, countColumn);
    if (loaded)
    {
        return;
    }

    Debug.LogError("Dictionary file not found! Aborting...");
}
// Creates the symSpell field and loads the English frequency dictionary from
// <Application.dataPath>/SymSpell/. Logs a message when the load fails.
public void LoadDictionary()
{
    const int initialCapacity = 82765;
    const int maxEditDistanceDictionary = 2; // maximum edit distance per dictionary precalculation
    symSpell = new SymSpell(initialCapacity, maxEditDistanceDictionary);

    // Path.Combine instead of hard-coded '\' separators, which are only valid on Windows.
    string dictionaryPath = System.IO.Path.Combine(Application.dataPath, "SymSpell", "frequency_dictionary_en_82_765.txt");

    const int termIndex = 0;  // column of the term in the dictionary text file
    const int countIndex = 1; // column of the term frequency in the dictionary text file
    if (!symSpell.LoadDictionary(dictionaryPath, termIndex, countIndex))
    {
        Debug.Log("File not found!");
    }
}
// Creates the symSpell field and loads "frequency_dictionary_en_82_765.txt" from the application's
// base directory. Prints either a "File not found" line or the ready message, never both.
public SymSpellInterface()
{
    const int initialCapacity = 82765;
    const int maxEditDistanceDictionary = 2; // maximum edit distance per dictionary precalculation
    symSpell = new SymSpell(initialCapacity, maxEditDistanceDictionary);

    // Path.Combine copes with a base directory that has no trailing separator.
    string dictionaryPath = System.IO.Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "frequency_dictionary_en_82_765.txt");

    const int termIndex = 0;  // column of the term in the dictionary text file
    const int countIndex = 1; // column of the term frequency in the dictionary text file
    if (!symSpell.LoadDictionary(dictionaryPath, termIndex, countIndex))
    {
        Console.WriteLine("File not found " + dictionaryPath);
        // Do not announce readiness after a failed load.
        return;
    }

    Console.WriteLine("SymSpellInterface was initialized. You are ready to go!");
}
// Runs compileDictionary(), then creates the spell checker and loads CompiledDictionary into it.
// Throws FileNotFoundException when the dictionary cannot be loaded.
public SpellChecker()
{
    compileDictionary();

    // spell checker parameters
    const int capacity = 82765;
    const int dictionaryEditDistance = 2;
    this.spellChecker = new SymSpell(capacity, dictionaryEditDistance);

    const int termColumn = 0;  // column of the term in the dictionary text file
    const int countColumn = 1; // column of the term frequency in the dictionary text file
    bool loaded = spellChecker.LoadDictionary(CompiledDictionary, termColumn, countColumn);
    if (loaded)
    {
        return;
    }

    throw new FileNotFoundException("Dictionary Not found!");
}
// Regression test: looks up up to 1000 noisy query terms with Verbosity.Closest and checks that
// the total number of returned suggestions is 4945.
public void LookupShouldReplicateNoisyResults()
{
    var dir = AppDomain.CurrentDomain.BaseDirectory;
    const int editDistanceMax = 2;
    const int prefixLength = 7;
    const SymSpell.Verbosity verbosity = SymSpell.Verbosity.Closest;
    var symSpell = new SymSpell(83000, editDistanceMax, prefixLength);

    string path = dir + "../../../SymSpell/frequency_dictionary_en_82_765.txt"; // for spelling correction (genuine English words)
    // Fail with a clear message instead of a misleading count mismatch when the dictionary is missing.
    Assert.IsTrue(symSpell.LoadDictionary(path, 0, 1), "Dictionary could not be loaded: " + path);

    // load up to 1000 terms with random spelling errors
    string[] testList = new string[1000];
    int termCount = 0;
    using (StreamReader sr = new StreamReader(File.OpenRead(dir + "../../../SymSpell.Demo/test_data/noisy_query_en_1000.txt")))
    {
        string line;
        // Read line by line for memory efficiency; stop once the array is full so that a longer
        // file cannot overflow it.
        while (termCount < testList.Length && (line = sr.ReadLine()) != null)
        {
            // (char[])null selects the whitespace-splitting overload unambiguously.
            string[] lineParts = line.Split((char[])null);
            if (lineParts.Length >= 2)
            {
                testList[termCount++] = lineParts[0];
            }
        }
    }

    // Only iterate over the slots that were filled; unfilled slots are null.
    int resultSum = 0;
    for (int i = 0; i < termCount; i++)
    {
        resultSum += symSpell.Lookup(testList[i], verbosity, symSpell.MaxDictionaryEditDistance).Count;
    }

    Assert.AreEqual(4945, resultSum);
}
// Initialization: hooks the key-pressed handler onto the target keyboard (falling back to
// KeyboardLayout.Instance), then creates symSpell and loads the English frequency dictionary.
void Start()
{
    if (!targetkeyboard)
    {
        targetkeyboard = KeyboardLayout.Instance;
    }

    if (!targetkeyboard)
    {
        Debug.LogError("Target Keyboard Empty");
    }
    else
    {
        targetkeyboard.KeyboardLayout_OnKeyPressed += WordPrediction_KeyPressedHandler;
    }

    Debug.Log("Creating dictionary ...");

    // SymSpell parameters
    const int initialCapacity = 82765;
    const int maxEditDistance = 2;
    const int prefixLength = 7;
    symSpell = new SymSpell(initialCapacity, maxEditDistance, prefixLength);

    // Frequency dictionary combining Google Books Ngram frequencies with the SCOWL word lists.
    // Path.Combine instead of hard-coded '\' separators, which are only valid on Windows.
    string path = System.IO.Path.Combine(Application.dataPath, "SpellChecker", "Resources", "frequency_dictionary_en_82_765.txt");
    if (!symSpell.LoadDictionary(path, 0, 1))
    {
        Debug.LogError("\rFile not found: " + System.IO.Path.GetFullPath(path));
    }

    // warm up (result not needed)
    symSpell.Lookup("warmup", SymSpell.Verbosity.All);
}
// Loads the English frequency dictionary from the application's base directory and returns
// whatever Correct() produces for the given word.
//
// word:    the term to check; passed to Correct() unchanged.
// Throws:  System.IO.FileNotFoundException when the dictionary cannot be loaded.
//
// NOTE(review): the dictionary is rebuilt on every call; consider caching the SymSpell instance
// if this is called repeatedly.
public static List<string> SymEnglishSpellChecker(string word)
{
    // SymSpell parameters
    const int initialCapacity = 82765;
    const int maxEditDistance = 2;
    const int prefixLength = 7;
    var symSpell = new SymSpell(initialCapacity, maxEditDistance, prefixLength);

    // Frequency dictionary combining Google Books Ngram frequencies with the SCOWL word lists.
    string path = AppDomain.CurrentDomain.BaseDirectory + "frequency_dictionary_en_82_765.txt";
    if (!symSpell.LoadDictionary(path, 0, 1))
    {
        // Specific exception type (still caught by existing catch (Exception) handlers), with the
        // message typo fixed and the offending path included.
        throw new System.IO.FileNotFoundException("File not found: " + path, path);
    }

    return Correct(word, symSpell);
}
// Spell-corrects a review file and writes the result next to it.
//
// args[0] is the input file. The output file has the same name with "_adj" inserted before the
// extension (e.g. "t1.txt" -> "t1_adj.txt"). Every output line is also echoed to the console.
//
// Each input line is classified as follows:
//   - a line accepted by isFormatOfDate is copied unchanged;
//   - the line directly after a date line is the score and is copied unchanged;
//   - any other line is a review line and goes through CorrectReviewLine.
static void Main(string[] args)
{
    // Validate the argument up front instead of failing with IndexOutOfRangeException later.
    if (args.Length < 1 || string.IsNullOrEmpty(args[0]))
    {
        Console.Error.WriteLine("Usage: <program> <input file>");
        return;
    }

    string inputPath = args[0];
    // Works for any extension length (or none); the old Insert(Length - 4, ...) assumed exactly
    // three extension characters and threw for names shorter than four characters.
    string outputPath = Path.ChangeExtension(inputPath, null) + "_adj" + Path.GetExtension(inputPath);

    const int initialCapacity = 82765;
    const int maxEditDistanceDictionary = 2; // maximum edit distance per dictionary precalculation
    var symSpell = new SymSpell(initialCapacity, maxEditDistanceDictionary);

    string baseDirectory = AppDomain.CurrentDomain.BaseDirectory;
    Console.WriteLine(baseDirectory);
    string dictionaryPath = baseDirectory + "../../../frequency_dictionary_en_82_765.txt";
    const int termIndex = 0;  // column of the term in the dictionary text file
    const int countIndex = 1; // column of the term frequency in the dictionary text file
    if (!symSpell.LoadDictionary(dictionaryPath, termIndex, countIndex))
    {
        Console.WriteLine("File not found!");
        // press any key to exit program
        Console.ReadKey();
        return;
    }

    // True only for the single line that directly follows a date line. The previous integer
    // counter also became 1 after a leading review line, which mislabelled the next line as a score.
    bool expectScore = false;

    using (StreamWriter sw = new StreamWriter(outputPath))
    using (StreamReader sr = new StreamReader(inputPath))
    {
        string line;
        // Process one line at a time rather than reading the whole file into one string.
        while ((line = sr.ReadLine()) != null)
        {
            string outputLine;
            if (isFormatOfDate(line))
            {
                outputLine = line;
                expectScore = true;
            }
            else if (expectScore)
            {
                outputLine = line;
                expectScore = false;
            }
            else
            {
                outputLine = CorrectReviewLine(symSpell, line);
            }

            Console.WriteLine(outputLine);
            sw.WriteLine(outputLine);
        }
    }
}

// Returns the review line with every run of spaces/English letters replaced by the corrected
// string from SymSpell word segmentation; all other characters are copied through unchanged.
private static string CorrectReviewLine(SymSpell symSpell, string line)
{
    var corrected = new System.Text.StringBuilder(line.Length);
    var pending = new System.Text.StringBuilder();

    foreach (char c in line)
    {
        if (c == ' ' || isEnglishLetter(c))
        {
            pending.Append(c);
            continue;
        }

        // A non-letter character ends the current run: correct the run, then emit the character.
        if (pending.Length > 0)
        {
            corrected.Append(symSpell.WordSegmentation(pending.ToString()).correctedString);
            pending.Length = 0;
        }
        corrected.Append(c);
    }

    // Flush the run that reaches the end of the line.
    if (pending.Length > 0)
    {
        corrected.Append(symSpell.WordSegmentation(pending.ToString()).correctedString);
    }

    return corrected.ToString();
}
// Command-line front end.
//
// Usage: SymSpell.CommandLine load|create Path [MaxEditDistance] [Verbosity] [PrefixLength]
//
// Status and error messages go to Console.Error so that they stay on the console when stdout is
// redirected or piped; suggestions go to stdout as "term distance count", one per line.
// Reads terms from stdin until an empty line or end of input.
static void Main(string[] args)
{
    if (args.Length < 2)
    {
        // help
        Console.WriteLine("SymSpell.CommandLine load Path [MaxEditDistance] [Verbosity] [PrefixLength]");
        Console.WriteLine("SymSpell.CommandLine create Path [MaxEditDistance] [Verbosity] [PrefixLength]");
        Console.WriteLine();
        Console.WriteLine("load: load dictionary from dictionary file");
        Console.WriteLine("create: create dictionary from text corpus");
        Console.WriteLine("MaxEditDistance: default=2");
        Console.WriteLine("Verbosity=Top|Closest|All (case-sensitive)");
        Console.WriteLine("PrefixLength: default=7 (5:low memory; 7:fast lookup)");
        Console.WriteLine();
        return;
    }

    Console.Error.Write("Creating dictionary ...");
    long memSize = GC.GetTotalMemory(true);
    Stopwatch stopWatch = new Stopwatch();
    stopWatch.Start();

    // parameters
    int initialCapacity = 82765;
    int maxEditDistanceDictionary = 2; // maximum edit distance per dictionary precalculation
    if (args.Length > 2 && !int.TryParse(args[2], out maxEditDistanceDictionary))
    {
        Console.Error.WriteLine("Error in parameter 3");
        return;
    }
    int maxEditDistanceLookup = maxEditDistanceDictionary; // max edit distance per lookup

    var suggestionVerbosity = SymSpell.Verbosity.Top; // Top, Closest, All
    if (args.Length > 3 && !Enum.TryParse(args[3], out suggestionVerbosity))
    {
        Console.Error.WriteLine("Error in parameter 4");
        return;
    }

    int prefixLength = 7;
    if (args.Length > 4 && !int.TryParse(args[4], out prefixLength))
    {
        Console.Error.WriteLine("Error in parameter 5");
        return;
    }

    // Path.Combine keeps relative paths relative to the base directory as before, and additionally
    // accepts absolute paths (plain concatenation produced an invalid path for those).
    string dictionaryPath = System.IO.Path.Combine(AppDomain.CurrentDomain.BaseDirectory, args[1]);
    int termIndex = 0;  // column of the term in the dictionary text file
    int countIndex = 1; // column of the term frequency in the dictionary text file

    var symSpell = new SymSpell(initialCapacity, maxEditDistanceDictionary, prefixLength);

    switch (args[0].ToLowerInvariant())
    {
        case "load":
            if (!symSpell.LoadDictionary(dictionaryPath, termIndex, countIndex))
            {
                Console.Error.WriteLine("File not found!");
                return;
            }
            break;
        case "create":
            if (!symSpell.CreateDictionary(dictionaryPath))
            {
                Console.Error.WriteLine("File not found!");
                return;
            }
            break;
        default:
            // An unknown command used to fall through and run lookups against an empty dictionary.
            Console.Error.WriteLine("\rUnknown command '" + args[0] + "': expected load or create");
            return;
    }

    stopWatch.Stop();
    long memDelta = GC.GetTotalMemory(true) - memSize;

    Console.Error.WriteLine("\rDictionary: " + symSpell.WordCount.ToString("N0") + " words, "
        + symSpell.EntryCount.ToString("N0") + " entries, edit distance=" + symSpell.MaxDictionaryEditDistance.ToString()
        + " in " + stopWatch.Elapsed.TotalMilliseconds.ToString("0.0") + "ms "
        + (memDelta / 1024 / 1024.0).ToString("N0") + " MB");

    // warm up (result not needed)
    symSpell.Lookup("warmup", SymSpell.Verbosity.All, 1);

    // lookup suggestions for single-word input strings
    string inputTerm;
    while (!string.IsNullOrEmpty(inputTerm = (Console.ReadLine() ?? "").Trim()))
    {
        var suggestions = symSpell.Lookup(inputTerm, suggestionVerbosity, maxEditDistanceLookup, true);

        // display suggestions, edit distance and term frequency
        foreach (var suggestion in suggestions)
        {
            Console.WriteLine(suggestion.term + " " + suggestion.distance.ToString() + " " + suggestion.count.ToString("N0"));
        }
    }
}
private static async Task Main(string[] args) { var directory = new DirectoryInfo("temp_git"); if (directory.Exists) { NormalizeDirectoryAttributes(directory); directory.Delete(true); } void NormalizeDirectoryAttributes(DirectoryInfo directoryInfo) { foreach (var subPath in directoryInfo.GetDirectories()) { NormalizeDirectoryAttributes(subPath); } foreach (var file in directoryInfo.GetFiles()) { file.Attributes = FileAttributes.Normal; } } await Task.Delay(TimeSpan.FromSeconds(1)); var info = new ProcessStartInfo("git", "clone https://github.com/discord-csharp/MODiX temp_git"); var p = Process.Start(info); if (p == null) { throw new InvalidOperationException("process handle was null"); } p.WaitForExit(); await Task.Delay(TimeSpan.FromSeconds(2)); var extensions = new[] { ".txt", ".md", ".cs" }; var spellingInfos = new List <FileSpellingInfo>(); var spell = new SymSpell(); if (!spell.LoadDictionary(Path.Combine(Environment.CurrentDirectory, "frequency_dictionary_en_82_765.txt"), 0, 1)) { throw new InvalidOperationException(); } foreach (var file in Directory.GetFiles(Path.Combine(Environment.CurrentDirectory, "temp_git"), "*", SearchOption.AllDirectories)) { if (!extensions.Contains(Path.GetExtension(file))) { continue; } var spellingInfo = new FileSpellingInfo { Path = file }; var fileContents = File.ReadAllLines(file); for (var i = 0; i < fileContents.Length; i++) { var line = fileContents[i].Trim(); if (string.IsNullOrWhiteSpace(line)) { continue; } spellingInfo.LineMistakes.Add((i + 1, line), new List <(string, string)>()); var words = line.Split(' ', StringSplitOptions.RemoveEmptyEntries).Select(c => c.RemoveSpecialCharacters()); foreach (var word in words.Where(c => !string.IsNullOrWhiteSpace(c))) { var results = spell.Lookup(word.ToLower(), SymSpell.Verbosity.Top); if (results == null || results.Any() == false) { continue; } var suggestion = results.First(); if (suggestion.term == word.ToLower()) { continue; } spellingInfo.LineMistakes[(i + 1, 
line)].Add((word, suggestion.term));
/// <summary>
/// Benchmarks dictionary precalculation (time and memory) and lookup (time and result
/// count) for the instance-based SymSpell against Original.SymSpell, over every
/// combination of maxEditDistance 1..3, prefixLength 5..7, each entry of DictionaryPath
/// and each SymSpell.Verbosity value, then prints the averaged totals.
/// </summary>
static void BenchmarkPrecalculationLookup()
{
    string[] mixedQueries = BuildQuery1K();
    const int repetitions = 1000;

    // Fixed single-term workloads: { term, label printed as "query=<label>" }.
    string[][] fixedQueries =
    {
        new[] { "different", "exact" },
        new[] { "hockie", "non-exact" },
    };

    int hits = 0;
    int configurationCount = 0;
    long instanceMatches = 0;
    long originalMatches = 0;
    long lookupCount = 0;
    double instanceLoadSeconds = 0, instanceMemoryMb = 0, instanceLookupMs = 0;
    double originalLoadSeconds = 0, originalMemoryMb = 0, originalLookupMs = 0;
    Stopwatch timer = new Stopwatch();

    // Prints one lookup measurement; only ever invoked after the timer has been stopped.
    Action<string, int, double, SymSpell.Verbosity, string> reportLookup =
        (implementation, results, msPerOp, level, queryKind) =>
            Console.WriteLine(string.Format(
                "Lookup {0} {1:N0} results {2:N6}ms/op verbosity={3} query={4}",
                implementation, results, msPerOp, level, queryKind));

    for (int maxEditDistance = 1; maxEditDistance <= 3; maxEditDistance++)
    {
        for (int prefixLength = 5; prefixLength <= 7; prefixLength++)
        {
            //benchmark dictionary precalculation size and time
            //maxEditDistance=1/2/3; prefixLength=5/6/7; dictionary=30k/82k/500k; class=instantiated/static
            for (int i = 0; i < DictionaryPath.Length; i++)
            {
                configurationCount++;

                //instantiated dictionary
                long memoryBefore = GC.GetTotalMemory(true);
                timer.Restart();
                SymSpell instanceDict = new SymSpell(DictionarySize[i], maxEditDistance, prefixLength);
                instanceDict.LoadDictionary(DictionaryPath[i], 0, 1);
                timer.Stop();
                long memoryDelta = GC.GetTotalMemory(true) - memoryBefore;
                instanceLoadSeconds += timer.Elapsed.TotalSeconds;
                instanceMemoryMb += memoryDelta / 1024.0 / 1024.0;
                Console.WriteLine(string.Format(
                    "Precalculation instance {0:N3}s {1:N1}MB {2} words {3} entries MaxEditDistance={4} prefixLength={5} dict={6}",
                    timer.Elapsed.TotalSeconds,
                    memoryDelta / 1024.0 / 1024.0,
                    instanceDict.WordCount.ToString("N0"),
                    instanceDict.EntryCount.ToString("N0"),
                    maxEditDistance,
                    prefixLength,
                    DictionaryName[i]));

                //static dictionary
                memoryBefore = GC.GetTotalMemory(true);
                timer.Restart();
                Original.SymSpell originalDict = new Original.SymSpell(maxEditDistance, prefixLength);
                originalDict.LoadDictionary(DictionaryPath[i], "", 0, 1);
                timer.Stop();
                memoryDelta = GC.GetTotalMemory(true) - memoryBefore;
                originalLoadSeconds += timer.Elapsed.TotalSeconds;
                originalMemoryMb += memoryDelta / 1024.0 / 1024.0;
                // The printed figure divides by the integer 1024 first (whole KB), exactly as before.
                Console.WriteLine(string.Format(
                    "Precalculation static {0:N3}s {1:N1}MB {2} words {3} entries MaxEditDistance={4} prefixLength={5} dict={6}",
                    timer.Elapsed.TotalSeconds,
                    memoryDelta / 1024 / 1024.0,
                    originalDict.Count.ToString("N0"),
                    originalDict.EntryCount.ToString("N0"),
                    maxEditDistance,
                    prefixLength,
                    DictionaryName[i]));

                //benchmark lookup result number and time
                //maxEditDistance=1/2/3; prefixLength=5/6/7; dictionary=30k/82k/500k; verbosity=0/1/2; query=exact/non-exact/mix; class=instantiated/static
                foreach (SymSpell.Verbosity verbosity in Enum.GetValues(typeof(SymSpell.Verbosity)))
                {
                    //exact and non-exact: the same term looked up `repetitions` times
                    foreach (string[] fixedQuery in fixedQueries)
                    {
                        string term = fixedQuery[0];
                        string queryKind = fixedQuery[1];

                        //instantiated
                        timer.Restart();
                        for (int round = 0; round < repetitions; round++)
                        {
                            hits = instanceDict.Lookup(term, verbosity, maxEditDistance).Count;
                        }
                        timer.Stop();
                        instanceLookupMs += timer.Elapsed.TotalMilliseconds;
                        instanceMatches += hits;
                        reportLookup("instance", hits, timer.Elapsed.TotalMilliseconds / repetitions, verbosity, queryKind);

                        //static
                        timer.Restart();
                        for (int round = 0; round < repetitions; round++)
                        {
                            hits = originalDict.Lookup(term, "", maxEditDistance, (int)verbosity).Count;
                        }
                        timer.Stop();
                        originalLookupMs += timer.Elapsed.TotalMilliseconds;
                        originalMatches += hits;
                        reportLookup("static", hits, timer.Elapsed.TotalMilliseconds / repetitions, verbosity, queryKind);

                        Console.WriteLine();
                        lookupCount += repetitions;
                    }

                    //instantiated mix: every term of the query list once, results summed
                    timer.Restart();
                    hits = 0;
                    foreach (var word in mixedQueries)
                    {
                        hits += instanceDict.Lookup(word, verbosity, maxEditDistance).Count;
                    }
                    timer.Stop();
                    instanceLookupMs += timer.Elapsed.TotalMilliseconds;
                    instanceMatches += hits;
                    reportLookup("instance", hits, timer.Elapsed.TotalMilliseconds / mixedQueries.Length, verbosity, "mix");

                    //static mix
                    timer.Restart();
                    hits = 0;
                    foreach (var word in mixedQueries)
                    {
                        hits += originalDict.Lookup(word, "", maxEditDistance, (int)verbosity).Count;
                    }
                    timer.Stop();
                    originalLookupMs += timer.Elapsed.TotalMilliseconds;
                    originalMatches += hits;
                    reportLookup("static", hits, timer.Elapsed.TotalMilliseconds / mixedQueries.Length, verbosity, "mix");

                    Console.WriteLine();
                    lookupCount += mixedQueries.Length;
                }

                Console.WriteLine();

                // Drop both references before the next GC.GetTotalMemory(true) baseline is taken.
                instanceDict = null;
                originalDict = null;
            }
        }
    }

    Console.WriteLine(string.Format("Average Precalculation time instance {0:N3}s {1:P1}",
        instanceLoadSeconds / configurationCount, (instanceLoadSeconds / originalLoadSeconds) - 1));
    Console.WriteLine(string.Format("Average Precalculation time static {0:N3}s",
        originalLoadSeconds / configurationCount));
    Console.WriteLine(string.Format("Average Precalculation memory instance {0:N1}MB {1:P1}",
        instanceMemoryMb / configurationCount, (instanceMemoryMb / originalMemoryMb) - 1));
    Console.WriteLine(string.Format("Average Precalculation memory static {0:N1}MB",
        originalMemoryMb / configurationCount));
    Console.WriteLine(string.Format("Average Lookup time instance {0:N3}ms {1:P1}",
        instanceLookupMs / lookupCount, (instanceLookupMs / originalLookupMs) - 1));
    Console.WriteLine(string.Format("Average Lookup time static {0:N3}ms",
        originalLookupMs / lookupCount));
    Console.WriteLine(string.Format("Total Lookup results instance {0:N0} {1} differences",
        instanceMatches, instanceMatches - originalMatches));
    Console.WriteLine(string.Format("Total Lookup results static {0:N0}", originalMatches));
}
/// <summary>
/// Experiment 3: concatenates the full-text OCR annotations of every JSON file in
/// D:\json\, corrects the text with SymSpell.LookupCompound, then aligns the corrected
/// tokens with the original OCR tokens and labels each pair before handing the table to
/// CreateExcelFileExperimento(..., "3").
/// </summary>
/// <remarks>
/// For each corrected token the similarity is computed against the current original
/// token, then (if that is below every threshold) the previous one, then the next one.
/// The first comparison that reaches a threshold decides the label; when none does the
/// pair is labelled "Error".
/// </remarks>
private static void Experimento3()
{
    string strPath = @"D:\json\";
    string[] fileEntries = Directory.GetFiles(strPath);
    StringBuilder OCROriginal = new StringBuilder();
    EditDistanceLength editDistance = new EditDistanceLength();

    const int initialCapacity = 82765;
    const int maxEditDistance = 5;
    const int prefixLength = 7;
    SymSpell symSpell = new SymSpell(initialCapacity, maxEditDistance, prefixLength);
    Dictionary<int, ExperimentSpell> excelMatrix = new Dictionary<int, ExperimentSpell>();

    // Gather the OCR text of every response that carries a full-text annotation.
    foreach (string path in fileEntries)
    {
        string jsonText = File.ReadAllText(path, Encoding.Default);
        var response = Google.Protobuf.JsonParser.Default.Parse<Google.Cloud.Vision.V1.AnnotateFileResponse>(jsonText);
        foreach (var respuestas in response.Responses)
        {
            var annotation = respuestas.FullTextAnnotation;
            if (annotation != null)
            {
                OCROriginal.Append(annotation.Text);
            }
        }
    }

    symSpell.LoadDictionary(@"D:\load8.txt", 0, 1);
    List<SymSpell.SuggestItem> suggestions = symSpell.LookupCompound(OCROriginal.ToString(), 2);

    // NOTE(review): the braces stripped here presumably come from SuggestItem.ToString() - confirm.
    var arraySymspell = suggestions[0].ToString()
        .Replace("\n", " ").Replace("{", "").Replace("}", "")
        .Split(' ');

    // ": " is parked as "***" so the bare ":" replacement does not destroy it, then restored.
    var arrayOCROriginal = OCROriginal.ToString()
        .Replace("\n", " ").Replace("{", "").Replace("}", "")
        .Replace(": ", "***").Replace(" : ", " ").Replace(":", " ").Replace("***", ": ")
        .Replace(". ", " ").Replace(", ", " ").Replace("-", " ")
        .Split(' ');

    int j = 0, k = 0; // j: cursor into the original tokens; k: next row key
    for (int i = 0; i < arraySymspell.Length; i++)
    {
        if (j == arrayOCROriginal.Length)
        {
            break;
        }

        string corrected = arraySymspell[i];
        ExperimentSpell exp1 = new ExperimentSpell();
        exp1.correctionLookupCompound = corrected;

        // 1) current original token
        string label = ClassifySimilarity(editDistance.CalculateSimilarity(corrected, arrayOCROriginal[j].ToLower()));
        if (label != null)
        {
            exp1.original = arrayOCROriginal[j];
            // "Espacios" leaves the cursor in place so the same original token is offered
            // to the next corrected token as well.
            if (label != "Espacios")
            {
                j++;
            }
        }
        else
        {
            // 2) previous original token (if there is one)
            label = j > 0
                ? ClassifySimilarity(editDistance.CalculateSimilarity(corrected, arrayOCROriginal[j - 1].ToLower()))
                : null;
            if (label != null)
            {
                j--;
                exp1.original = arrayOCROriginal[j];
            }
            else
            {
                // 3) next original token (if there is one)
                label = j + 1 < arrayOCROriginal.Length
                    ? ClassifySimilarity(editDistance.CalculateSimilarity(corrected, arrayOCROriginal[j + 1].ToLower()))
                    : null;
                if (label != null)
                {
                    j++;
                    exp1.original = arrayOCROriginal[j];
                }
                else
                {
                    // Nothing nearby is similar enough: record the pair as an error and move on.
                    label = "Error";
                    exp1.original = arrayOCROriginal[j];
                    j++;
                }
            }
        }

        exp1.correction = label;
        excelMatrix.Add(k++, exp1);
    }

    CreateExcelFileExperimento(excelMatrix, "3");
}

/// <summary>
/// Maps a similarity score to its label: exactly 1 is "igual", at least 0.4 is
/// "Corregida", above 0.06 is "Espacios"; anything lower yields null.
/// </summary>
private static string ClassifySimilarity(double similarity)
{
    if (similarity == 1)
    {
        return "igual";
    }
    if (similarity >= .4)
    {
        return "Corregida";
    }
    if (similarity > 0.06)
    {
        return "Espacios";
    }
    return null;
}
/// <summary>
/// Creates the SymSpell engine used by this corrector and loads its dictionary.
/// </summary>
/// <param name="path">Dictionary file; column 0 is read as the term, column 1 as its count.</param>
/// <param name="MaxEditDistance">
/// Maximum edit distance handed to SymSpell; the prefix length is set to this value plus one.
/// </param>
public WordCorrection(string path, int MaxEditDistance)
{
    const int initialCapacity = 82765;
    const int termIndex = 0;
    const int countIndex = 1;
    int prefixLength = MaxEditDistance + 1;

    symSpell = new SymSpell(initialCapacity, MaxEditDistance, prefixLength);

    // The return value of LoadDictionary is not inspected here.
    symSpell.LoadDictionary(path, termIndex, countIndex);
}
/// <summary>
/// Click handler: loads two frequency dictionaries into SymSpell, lets the user pick a
/// text file, replaces every space-separated token with its closest suggestion and writes
/// the result to a randomly named .txt file next to the chosen file.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void SpellCorrect_Click(object sender, RoutedEventArgs e)
{
    ConsoleManager.Show();

    const int initialCapacity = 82765 * 2;
    const int maxEditDistance = 5;
    const int prefixLength = 7;
    SymSpell symSpell = new SymSpell(initialCapacity, maxEditDistance, prefixLength);

    // Load a frequency dictionary
    //wordfrequency_en.txt ensures high correction quality by combining two data sources:
    //Google Books Ngram data provides representative word frequencies (but contains many entries with spelling errors)
    //SCOWL - Spell Checker Oriented Word Lists which ensures genuine English vocabulary (but contained no word frequencies)
    string path = @"C:\Users\Emmanuel\source\repos\Project-Carl\Project-CARL\WpfApp1\frequency_dictionary_en_82_765.txt";
    string dict2 = @"C:\Users\Emmanuel\source\repos\Project-Carl\Project-CARL\WpfApp1\unigram_freq.txt";

    // A failed load is reported but deliberately not fatal (the early return was disabled upstream).
    if (!symSpell.LoadDictionary(path, 0, 1))
    {
        Console.Error.WriteLine("\rFile not found: " + System.IO.Path.GetFullPath(path));
        Console.ReadKey();
    }
    if (!symSpell.LoadDictionary(dict2, 0, 1))
    {
        // Report the file that actually failed (this used to print the first dictionary's path).
        Console.Error.WriteLine("\rFile not found: " + System.IO.Path.GetFullPath(dict2));
        Console.ReadKey();
    }

    //Open textfile
    System.Windows.Forms.MessageBox.Show("Choose file to Correct");
    string correctionFile;
    using (OpenFileDialog openFileDialog1 = new OpenFileDialog())
    {
        // Nothing to correct when the dialog is cancelled; continuing would pass an
        // empty path to File.ReadAllText, which throws.
        if (openFileDialog1.ShowDialog() != System.Windows.Forms.DialogResult.OK)
        {
            return;
        }
        correctionFile = openFileDialog1.FileName;
    }
    System.Windows.Forms.MessageBox.Show(correctionFile);

    //read words into array/list
    string corp = File.ReadAllText(correctionFile, Encoding.UTF8);                     //read raw text file
    string[] words = corp.Split(new string[] { " " }, StringSplitOptions.None);        //tokenize raw text file

    //submit each word to symSpell; every emitted word is followed by a single space
    StringBuilder correctedText = new StringBuilder(corp.Length + words.Length);
    for (int i = 0; i < words.Length; i++)
    {
        List<SymSpell.SuggestItem> suggestedWord = symSpell.Lookup(words[i], SymSpell.Verbosity.Closest);

        // Keep the token unchanged when there is no suggestion, instead of letting
        // First() throw on an empty result.
        bool hasSuggestion = suggestedWord != null && suggestedWord.Count > 0;
        correctedText.Append(hasSuggestion ? suggestedWord[0].term : words[i]).Append(' ');
    }

    //save words to file
    string fileName = System.IO.Path.GetRandomFileName() + ".txt"; //random file name for our corrected text

    //save into the directory of the correction file we selected previously
    string pathString = System.IO.Path.GetDirectoryName(correctionFile);
    pathString = System.IO.Path.Combine(pathString, fileName);
    File.WriteAllText(pathString, correctedText.ToString());
}
/// <summary>
/// Produces one output line per chat line: when a name from <paramref name="finalNameTags"/>
/// occurs at index 0, 1 or 2 of the line, the name is wrapped in a bold span and passed to
/// AddCharacterColors; every line then goes through ReservedCharacterChangePass and
/// StylisticCharacterChangePass.
/// </summary>
/// <param name="chatLines">Raw chat lines to decorate.</param>
/// <param name="finalNameTags">Speaker names searched for in each line.</param>
/// <returns>A new list holding the processed lines, in input order.</returns>
private List<string> AddHTMLTags(List<string> chatLines, List<string> finalNameTags)
{
    const int initialCapacity = 82765;
    const int maxEditDistanceDictionary = 2; //maximum edit distance per dictionary precalculation
    const int termIndex = 0;                 //column of the term in the dictionary text file
    const int countIndex = 1;                //column of the term frequency in the dictionary text file
    const string boldTag = "<span style=\"font-weight: bold; color:#000000; \">";

    var spellChecker = new SymSpell(initialCapacity, maxEditDistanceDictionary);
    string dictionaryPath = AppDomain.CurrentDomain.BaseDirectory + "../../frequency_dictionary_en_82_765.txt";
    if (!spellChecker.LoadDictionary(dictionaryPath, termIndex, countIndex))
    {
        Console.WriteLine("File not found!");
    }

    var taggedLines = new List<string>();
    foreach (string rawLine in chatLines)
    {
        string taggedLine = string.Empty;

        foreach (string name in finalNameTags)
        {
            int nameIndex = rawLine.IndexOf(name);

            // Only a name found within the first three characters counts as the speaker tag.
            if (nameIndex <= -1 || nameIndex >= 3)
            {
                Console.Write(taggedLine);
                continue;
            }

            //Devildogs, ya know
            string cleanedLine = CheckDerps(rawLine, name);

            //Fix bad spelling, but only for lines that start with one of these speakers
            bool needsSpellFix = cleanedLine.StartsWith("Lady Red")
                                 || cleanedLine.StartsWith("LadyRedE")
                                 || cleanedLine.StartsWith("Carissa T")
                                 || cleanedLine.StartsWith("PrincessV");
            string spelledLine = needsSpellFix
                ? FixBadSpelling(cleanedLine, spellChecker)
                : cleanedLine;

            // Open the span in front of the name and close it right behind the name.
            taggedLine = spelledLine.Insert(nameIndex, boldTag);
            taggedLine = taggedLine.Insert(nameIndex + boldTag.Length + name.Length, "</span>");
            taggedLine = AddCharacterColors(taggedLine, name, nameIndex, boldTag);
        }

        //If not in the namelist, fall back to the untouched line
        if (taggedLine.Length < 3)
        {
            taggedLine = rawLine;
        }

        taggedLine = StylisticCharacterChangePass(ReservedCharacterChangePass(taggedLine));
        taggedLines.Add(taggedLine);
    }

    return taggedLines;
}
//Load a frequency dictionary or create a frequency dictionary from a text corpus
/// <summary>
/// Runs tests 2 to 5 in order against a single SymSpell instance: each test prints its
/// label, loads one more dictionary from the lists directory and passes one input to
/// Correct. The first dictionary that fails to load is reported, a key press is awaited
/// and the run stops.
/// </summary>
public static void Main(string[] args)
{
    //set parameters
    const int initialCapacity = 82765;
    const int maxEditDistance = 2;
    const int prefixLength = 7;
    SymSpell symSpell = new SymSpell(initialCapacity, maxEditDistance, prefixLength);

    string listsDirectory = AppDomain.CurrentDomain.BaseDirectory + "../../../../../lists/";

    // Test 1 is disabled ("Didn't manage to get it up and running"):
    // path = AppDomain.CurrentDomain.BaseDirectory + "../../../../../lists/bonjour.txt";
    // if (!SymSpellCompound.CreateDictionary(path, 0, 1)) Console.Error.WriteLine("File not found: " + Path.GetFullPath(path));
    // input = "bonjor";
    // Correct(input, symSpell);
    // Console.WriteLine();

    // Each row: { label, dictionary file inside the lists directory, input to correct }.
    // Note on Test 4: breaks with "བཀྲ་ཤས་་", breaks with "བཀྲ་ཤིན་", doesn't recognize "བཀྲ་ཤེས་", or "སཀྲ་ཤིས་"
    string[][] testCases =
    {
        new[] { "Test 2", "bonjour.1.txt", "bonjuor" },
        new[] { "Test 3", "bonjour.2.txt", "bonjur hallo" },
        new[] { "Test 4", "tib.txt", "སཀྲ་ཤིས་" },
        new[] { "Test 5", "tib.1.txt", "དཀྲ'ཤེས'" },
    };

    for (int t = 0; t < testCases.Length; t++)
    {
        string label = testCases[t][0];
        string path = listsDirectory + testCases[t][1];
        string input = testCases[t][2];

        Console.WriteLine(label);
        if (!symSpell.LoadDictionary(path, 0, 1))
        {
            Console.Error.WriteLine("\rFile not found: " + Path.GetFullPath(path));
            Console.ReadKey();
            return;
        }

        Correct(input, symSpell);

        // A blank line separates consecutive tests; none is printed after the last one.
        if (t < testCases.Length - 1)
        {
            Console.WriteLine();
        }
    }
}
/// <summary>
/// Command-line entry point: builds a SymSpell dictionary, then reads lines from standard
/// input and prints their corrections using the selected lookup type until an empty line
/// or end of input is reached.
/// </summary>
/// <param name="args">
/// DictionaryType DictionaryPath [PrefixLength] LookupType [MaxEditDistance] [OutputStats] [Verbosity].
/// args[2] is taken as PrefixLength when it parses as an integer; in that case all later
/// parameters shift by one position.
/// </param>
static void Main(string[] args)
{
    if (args.Length <= 2)
    {
        //PrefixLength is number
        //help
        Console.WriteLine("SymSpell.CommandLine DictionaryType DictionaryPath [PrefixLength] LookupType [MaxEditDistance] [OutputStats] [Verbosity]");
        Console.WriteLine();
        Console.WriteLine("DictionaryType=load|create");
        Console.WriteLine(" load: load dictionary from dictionary file");
        Console.WriteLine(" create: create dictionary from text corpus");
        Console.WriteLine("DictionaryPath: path to dictionary/corpus file");
        Console.WriteLine("PrefixLength: default=7 (speed/memory consumption trade-off)"); //dictionary param
        Console.WriteLine(" 5: low memory, slow lookup");
        Console.WriteLine(" 6: medium memory, medium lookup");
        Console.WriteLine(" 7: high memory, fast lookup");
        //lookup intended for correction of single word
        //lookupcompound intended for correction of multiple words, it can insert only a single space per token, faster than wordsegmentation
        //wordsegmentation intended for segmentation and correction of multiple words, it can insert multiple spaces per token, slower than lookupcompound
        Console.WriteLine("LookupType=lookup|lookupcompound|wordsegment");
        Console.WriteLine(" lookup: correct single word");
        Console.WriteLine(" lookupcompound: correct multiple-word string (supports splitting/merging)");
        Console.WriteLine(" wordsegment: word segment and correct input string");
        Console.WriteLine("MaxEditDistance: default=2 (0: no correction, word segmentation only)");
        Console.WriteLine("OutputStats=false|true");
        Console.WriteLine(" false: only corrected string");
        Console.WriteLine(" true: corrected string, edit distance, word frequency/probability");
        Console.WriteLine("Verbosity=top|closest|all"); //no effect for lookupcompound and wordsegment
        Console.WriteLine(" top: Top suggestion");
        Console.WriteLine(" closest: All suggestions of smallest edit distance found");
        Console.WriteLine(" all: All suggestions within maxEditDistance");
        Console.WriteLine();
        return;
    }

    Console.Error.Write("Creating dictionary ...");
    long memSize = GC.GetTotalMemory(true);
    Stopwatch stopWatch = new Stopwatch();
    stopWatch.Start();

    //parameters
    int initialCapacity = 82765;
    int termIndex = 0;  //column of the term in the dictionary text file
    int countIndex = 1; //column of the term frequency in the dictionary text file

    //dictionaryType: must match exactly. The former substring test against "load.create"
    //also accepted fragments such as "o" or "ad.cr", which then loaded no dictionary at all.
    string dictionaryType = args[0].ToLower();
    if (dictionaryType != "load" && dictionaryType != "create")
    {
        Console.Error.WriteLine("Error in parameter 1");
        return;
    }

    //dictionaryPath: relative paths stay anchored at the base directory; a rooted path is used as-is
    string dictionaryPath = System.IO.Path.Combine(AppDomain.CurrentDomain.BaseDirectory, args[1]);

    //prefix length (optional parameter)
    int offset = 0;
    int prefixLength;
    if (int.TryParse(args[2], out prefixLength))
    {
        offset = 1;
    }
    else
    {
        prefixLength = 7;
    }

    //lookupType: required and matched exactly. A missing or partially spelled value used to
    //pass validation and make the input loop print nothing.
    if (args.Length <= 2 + offset)
    {
        Console.Error.WriteLine("Error in parameter " + (3 + offset).ToString());
        return;
    }
    string lookupType = args[2 + offset].ToLower();
    if (lookupType != "lookup" && lookupType != "lookupcompound" && lookupType != "wordsegment")
    {
        Console.Error.WriteLine("Error in parameter " + (3 + offset).ToString());
        return;
    }

    //maxEditDistance
    int maxEditDistanceDictionary = 2; //maximum edit distance per dictionary precalculation
    if (args.Length > 3 + offset)
    {
        if (!int.TryParse(args[3 + offset], out maxEditDistanceDictionary))
        {
            Console.Error.WriteLine("Error in parameter " + (4 + offset).ToString());
            return;
        }
    }

    //output stats
    bool outputStats = false; //false, true
    if (args.Length > 4 + offset)
    {
        if (!bool.TryParse(args[4 + offset], out outputStats))
        {
            Console.Error.WriteLine("Error in parameter " + (5 + offset).ToString());
            return;
        }
    }

    //verbosity
    var suggestionVerbosity = SymSpell.Verbosity.Top; //Top, Closest, All
    if (args.Length > 5 + offset)
    {
        //Enum.TryParse also succeeds for numeric or comma-separated input, so additionally
        //require that the parsed value is one of the declared members.
        if (!Enum.TryParse(args[5 + offset], true, out suggestionVerbosity)
            || !Enum.IsDefined(typeof(SymSpell.Verbosity), suggestionVerbosity))
        {
            Console.Error.WriteLine("Error in parameter " + (6 + offset).ToString());
            return;
        }
    }

    //create object
    var symSpell = new SymSpell(initialCapacity, maxEditDistanceDictionary, prefixLength);

    //load dictionary
    if (dictionaryType == "load")
    {
        if (!symSpell.LoadDictionary(dictionaryPath, termIndex, countIndex))
        {
            Console.Error.WriteLine("File not found!");
            return;
        }
    }
    else
    {
        if (!symSpell.CreateDictionary(dictionaryPath))
        {
            Console.Error.WriteLine("File not found!");
            return;
        }
    }

    stopWatch.Stop();
    long memDelta = GC.GetTotalMemory(true) - memSize;

    //not to stdout, but to Console.Error: status info will always be on console, but not redirected or piped
    Console.Error.WriteLine("\rDictionary: " + symSpell.WordCount.ToString("N0") + " words, "
        + symSpell.EntryCount.ToString("N0") + " entries, edit distance=" + symSpell.MaxDictionaryEditDistance.ToString()
        + " in " + stopWatch.Elapsed.TotalMilliseconds.ToString("0.0") + "ms "
        + (memDelta / 1024 / 1024.0).ToString("N0") + " MB");

    //warm up (result intentionally discarded)
    symSpell.Lookup("warmup", SymSpell.Verbosity.All);

    //correct every non-empty input line with the selected lookup type
    string inputTerm;
    while (!string.IsNullOrEmpty(inputTerm = (Console.ReadLine() ?? "").Trim()))
    {
        switch (lookupType)
        {
            case "lookup":
                var suggestions = symSpell.Lookup(inputTerm, suggestionVerbosity, maxEditDistanceDictionary, true);
                //display suggestions, edit distance and term frequency
                foreach (var suggestion in suggestions)
                {
                    if (outputStats)
                    {
                        Console.WriteLine(suggestion.term + " " + suggestion.distance.ToString() + " " + suggestion.count.ToString("N0"));
                    }
                    else
                    {
                        Console.WriteLine(suggestion.term);
                    }
                }
                break;

            case "lookupcompound":
                var suggestions2 = symSpell.LookupCompound(inputTerm);
                //display suggestions, edit distance and term frequency
                foreach (var suggestion in suggestions2)
                {
                    if (outputStats)
                    {
                        Console.WriteLine(suggestion.term + " " + suggestion.distance.ToString() + " " + suggestion.count.ToString("N0"));
                    }
                    else
                    {
                        Console.WriteLine(suggestion.term);
                    }
                }
                break;

            case "wordsegment":
                var suggestions3 = symSpell.WordSegmentation(inputTerm);
                //display corrected string, summed edit distance and summed log probability
                foreach (var suggestion in suggestions3)
                {
                    if (outputStats)
                    {
                        Console.WriteLine(suggestion.correctedString + " " + suggestion.distanceSum.ToString("N0") + " " + suggestion.probabilityLogSum.ToString());
                    }
                    else
                    {
                        Console.WriteLine(suggestion.correctedString);
                    }
                }
                break;
        }
    }
}
/// <summary>
/// Experiment 2.1: concatenates the full-text OCR annotations of every JSON file in
/// D:\json\, corrects the text with SymSpell.LookupCompound, pairs the corrected tokens
/// with the original tokens by position (every row is labelled "igual") and hands the
/// table to CreateExcelFileExperimento(..., "2").
/// </summary>
private static void Experimento2_1()
{
    Stopwatch timer = new Stopwatch();
    string jsonDirectory = @"D:\json\";
    string[] jsonFiles = Directory.GetFiles(jsonDirectory);
    StringBuilder ocrText = new StringBuilder();

    // The next three locals are created exactly as before but are not read again in this method.
    string workbookPath = @"D:\cuantificacion\Experimentos\experimento2.xlsx";
    string connectionString = String.Format(
        @"Provider=Microsoft.ACE.OLEDB.12.0;" + "Data Source={0};Extended Properties='Excel 12.0;HDR=YES;IMEX=0'",
        workbookPath);
    EditDistanceLength editDistance = new EditDistanceLength();

    //Symspell parameters
    const int initialCapacity = 82765;
    const int maxEditDistance = 5;
    const int prefixLength = 7;
    SymSpell speller = new SymSpell(initialCapacity, maxEditDistance, prefixLength);

    Dictionary<int, ExperimentSpell> rows = new Dictionary<int, ExperimentSpell>();

    // Gather the OCR text of every response that carries a full-text annotation.
    foreach (string jsonFile in jsonFiles)
    {
        string json = File.ReadAllText(jsonFile, Encoding.Default);
        var parsed = Google.Protobuf.JsonParser.Default.Parse<Google.Cloud.Vision.V1.AnnotateFileResponse>(json);
        foreach (var answer in parsed.Responses)
        {
            var fullText = answer.FullTextAnnotation;
            if (fullText == null)
            {
                continue;
            }
            ocrText.Append(fullText.Text);
        }
    }

    // Dictionary load plus compound lookup are the timed section.
    timer.Start();
    speller.LoadDictionary(@"D:\load8.txt", 0, 1);
    List<SymSpell.SuggestItem> suggestions = speller.LookupCompound(ocrText.ToString(), 2);
    timer.Stop();

    string correctedFlat = suggestions[0].ToString().Replace("\n", " ").Replace("}", "");
    var correctedTokens = correctedFlat.Split(' ');

    // ": " is parked as "***" so the bare ":" replacement does not destroy it, then restored.
    string originalFlat = ocrText.ToString()
        .Replace("\n", " ").Replace("}", "")
        .Replace(": ", "***").Replace(" : ", " ").Replace(":", " ").Replace("***", ": ")
        .Replace(". ", " ").Replace(", ", " ").Replace("-", " ");
    var originalTokens = originalFlat.Split(' ');

    // Row key, corrected-token index and original-token index always advance together.
    for (int index = 0; index < correctedTokens.Length; index++)
    {
        ExperimentSpell row = new ExperimentSpell
        {
            correction = "igual",
            correctionLookupCompound = correctedTokens[index],
            original = index < originalTokens.Length ? originalTokens[index] : ""
        };
        rows.Add(index, row);
    }

    CreateExcelFileExperimento(rows, "2");
}