internal void ParseDictionary(Dictionary dictionary)
{
    AffixRules rules = null;
    Encoding encoding = null;

    // Parse the affix file first (if present) so flagged entries can be expanded below.
    string affixFile = dictionary.GetFile(DictionaryFileType.Affix);
    if (null != affixFile)
    {
        encoding = Utils.DetectEncoding(affixFile);
        if (null == encoding)
        {
            encoding = EncodingDetector.DetectEncoding(affixFile);
        }

        rules = this.affixParser.Parse(affixFile, encoding);
    }

    string fileName = dictionary.GetFile(DictionaryFileType.Dictionary);
    DictionaryWithFlags rawDict = this.parser.Parse(fileName, encoding);

    foreach (DictionaryItemWithFlags item in rawDict)
    {
        if (null == item.Flags)
        {
            dictionary.Add(item.Word);
        }
        else
        {
            // Expand the flagged entry into all word forms allowed by the affix rules.
            dictionary.AddRange(rules.GetPossibleWords(item));
        }
    }
}
public MatrixGeneratorTest()
{
    DictionaryManager manager = new DictionaryManager(@"C:\dev\git\Pspell\SpellCheckerConsole\bin\Debug\dictionaries");
    Dictionary enUs = manager.GetDictionary("en_US");

    this.alphabetWithSpace = enUs.GetAlphabetForErrorModel(true).ToCharArray();
    this.alphabet = enUs.GetAlphabetForErrorModel().ToCharArray();
    Array.Sort<char>(this.alphabetWithSpace);
    Array.Sort<char>(this.alphabet);

    ErrorListParser parser = new ErrorListParser("test_errors.txt");
    testData = parser.Parse();
}
public Dictionary(DictionaryLoader loader, string name, string path, char[] alphabet,
    char[] specialChars = null, string wordBoundaryRegex = null,
    Dictionary<char, List<char>> accentPairs = null)
{
    Name = name;
    Alphabet = alphabet;
    specialCharsInsideWord = specialChars;
    this.path = path;
    this.wordBoundaryRegex = wordBoundaryRegex;
    this.accentPairs = accentPairs;
    this.loader = loader;

    this.AlphabetWithSpecialCharsCache = this.GetAlphabetForErrorModel(true).ToCharArray();
}
public LanguageModelEvaluation EvaluateCandidates(MisspelledWord word, Dictionary<string, double> candidates)
{
    foundInNgrams = false;

    // Score every candidate against the left context first.
    List<string> leftContext = word.GetLeftContext();
    NgramType type = this.dictionary.GetHighestAvailableNgramCollection(leftContext.Count);
    Dictionary<string, double> probability = new Dictionary<string, double>();
    string[] lcArray = this.GetLeftContext(leftContext, type);
    NgramEvaluation evaluation;

    foreach (KeyValuePair<string, double> option in candidates)
    {
        // For a two-word candidate only its first word can follow the left context.
        lcArray[leftContext.Count - 1] = option.Key.Contains(' ') ? option.Key.Split(space).First() : option.Key;
        evaluation = this.dictionary.GetNgramCollection(type).GetProbability(lcArray);
        probability.Add(option.Key, evaluation.Probability);

        if (!foundInNgrams && evaluation.Occurence > 0)
        {
            foundInNgrams = true;
        }
    }

    // Combine with the right context unless both models degrade to unigrams,
    // in which case the second pass would only repeat the unigram lookup.
    List<string> rightContext = word.GetRightContext();
    NgramType secType = this.dictionary.GetHighestAvailableNgramCollection(rightContext.Count);

    if (!(type == NgramType.Unigram && secType == NgramType.Unigram))
    {
        string[] rcArray = this.GetRightContext(rightContext, secType);
        foreach (KeyValuePair<string, double> option in candidates)
        {
            rcArray[0] = option.Key.Contains(' ') ? option.Key.Split(space).Last() : option.Key;
            evaluation = this.dictionary.GetNgramCollection(secType).GetProbability(rcArray);
            probability[option.Key] *= evaluation.Probability;

            if (!foundInNgrams && evaluation.Occurence > 0)
            {
                foundInNgrams = true;
            }
        }
    }

    return new LanguageModelEvaluation(probability, foundInNgrams);
}
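// Illustrative sketch only (not project code): why EvaluateCandidates above multiplies the
// left-context and right-context n-gram probabilities per candidate. A candidate that fits
// only one side of the context is pushed down; the numbers below are made up.
using System;
using System.Collections.Generic;

static class ContextCombinationSketch
{
    static void Main()
    {
        var leftContext  = new Dictionary<string, double> { { "acres", 0.020 }, { "cares", 0.025 } };
        var rightContext = new Dictionary<string, double> { { "acres", 0.030 }, { "cares", 0.004 } };

        foreach (var option in leftContext)
        {
            double combined = option.Value * rightContext[option.Key];
            Console.WriteLine($"{option.Key}: {combined}");
        }
        // "cares" wins on the left context alone, but "acres" wins once both sides are combined.
    }
}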
public static void Write(string file, Dictionary dictionary)
{
    using (FileStream fStream = new FileStream(file, FileMode.Create, FileAccess.Write))
    {
        using (StreamWriter writer = new StreamWriter(fStream, Encoding.UTF8))
        {
            foreach (string word in dictionary)
            {
                writer.WriteLine(word);
            }
        }
    }
}
internal void ParseConfusionMatrixes(Dictionary dictionary)
{
    DictionaryFileType[] matrixes = new DictionaryFileType[]
    {
        DictionaryFileType.DeletetionsMatrix,
        DictionaryFileType.InsertionsMatrix,
        DictionaryFileType.SubstitutionsMatrix,
        DictionaryFileType.TranspositionsMatrix
    };

    foreach (DictionaryFileType type in matrixes)
    {
        string file = dictionary.GetFile(type);
        if (null != file)
        {
            ConfusionMatrix matrix = this.matrixParser.ParseMatrix(file);
            dictionary.AddConfusionMatrix(ConvertFileTypeToEditOperation(type), matrix);
        }
    }
}
public DictionaryGenerator(Dictionary dictionary, string directory, string outputDirectory)
{
    this.dictionary = dictionary;
    this.outputDirectory = outputDirectory;
    this.directory = directory;

    this.errorModel = new MPSpell.Correction.ErrorModel(dictionary);
    this.languageModel = new LanguageModel(dictionary);

    int initValue = 1;
    char[] alphabetWithSpace = dictionary.GetAlphabetForErrorModel(true).ToCharArray();
    char[] alphabet = dictionary.GetAlphabetForErrorModel().ToCharArray();

    insGen = new InsertionsMatrixGenerator(alphabetWithSpace, initValue);
    delGen = new DeletionsMatrixGenerator(alphabetWithSpace, initValue);
    subGen = new SubstitutionsMatrixGenerator(alphabet, initValue);
    trnGen = new TranspositionsMatrixGenerator(alphabet, initValue);

    charCounter = new CharFrequencyCounter(alphabetWithSpace.ToStringArray());
    twoCharCounter = new TwoCharFrequencyCounter(alphabetWithSpace.ToStringArray());
}
public void SetValue(char rowKey, char columnKey, int value)
{
    if (matrix.ContainsKey(rowKey))
    {
        if (matrix[rowKey].ContainsKey(columnKey))
        {
            matrix[rowKey][columnKey] = value;
        }
        else
        {
            matrix[rowKey].Add(columnKey, value);
        }
    }
    else
    {
        Dictionary<char, int> innerDict = new Dictionary<char, int>();
        innerDict.Add(columnKey, value);
        matrix.Add(rowKey, innerDict);
    }
}
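// Illustrative sketch only (not project code): the nested Dictionary<char, Dictionary<char, int>>
// layout that SetValue above maintains, and how a cell is read back. The counts are made up.
using System;
using System.Collections.Generic;

static class ConfusionMatrixSketch
{
    static void Main()
    {
        var matrix = new Dictionary<char, Dictionary<char, int>>();

        // matrix[row][column] = count, e.g. how often 'e' was typed in place of 'a'.
        matrix['a'] = new Dictionary<char, int> { { 'e', 12 }, { 'o', 3 } };
        matrix['e'] = new Dictionary<char, int> { { 'a', 9 } };

        Dictionary<char, int> row;
        int count;
        if (matrix.TryGetValue('a', out row) && row.TryGetValue('e', out count))
        {
            Console.WriteLine(count);   // 12
        }
    }
}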
public DictionaryTest()
{
    dictionary = new Dictionary(
        new DictionaryLoader(new DefaultDictionaryFileParser()),
        "en_US",
        "gen",
        "abcdefghijklmnopqrstuvwxyz".ToCharArray()
    );
    dictionary.Add("acres");

    Dictionary<char, List<char>> accentPairs = new Dictionary<char, List<char>>();
    accentPairs.Add('a', new List<char>() { 'á' });
    accentPairs.Add('e', new List<char>() { 'é', 'ě' });
    accentPairs.Add('i', new List<char>() { 'í' });
    accentPairs.Add('o', new List<char>() { 'ó' });
    accentPairs.Add('u', new List<char>() { 'ú', 'ů' });
    accentPairs.Add('y', new List<char>() { 'ý' });
    accentPairs.Add('c', new List<char>() { 'č' });
    accentPairs.Add('d', new List<char>() { 'ď' });
    accentPairs.Add('n', new List<char>() { 'ň' });
    accentPairs.Add('r', new List<char>() { 'ř' });
    accentPairs.Add('s', new List<char>() { 'š' });
    accentPairs.Add('t', new List<char>() { 'ť' });
    accentPairs.Add('z', new List<char>() { 'ž' });

    csCZ = new Dictionary(
        new DictionaryLoader(new DefaultDictionaryFileParser()),
        "cs_CZ",
        "gen",
        "abcdefghijklmnopqrstuvwxyzáéíóúýčďěňřšťžů".ToCharArray(),
        null,
        "[a-záéíóúýčďěňřšťžů]+",
        accentPairs
    );
    csCZ.Add("večeře");
    csCZ.Add("véčera");
    csCZ.Add("věci");
}
public Dictionary<string, double> GeneratePossibleWords(string word)
{
    Dictionary<string, double> result = new Dictionary<string, double>();

    // substitutions
    for (int i = 0; i < word.Length; i++)
    {
        foreach (string charItem in alphabet)
        {
            string edited = String.Copy(word).Remove(i, 1).Insert(i, charItem);
            if (dictionary.FindWord(edited))
            {
                double prop = this.CalculateProbability(EditOperation.Substitution, word[i], charItem[0]);
                if (!result.ContainsKey(edited))
                {
                    result.Add(edited, prop);
                }
                else if (prop > result[edited])
                {
                    result[edited] = prop;
                }
            }
        }
    }

    // deletions
    for (int i = 0; i < word.Length; i++)
    {
        string edited = String.Copy(word).Remove(i, 1);
        if (dictionary.FindWord(edited))
        {
            char prev = (i - 1) < 0 ? ' ' : word[i - 1];
            double prop = this.CalculateProbability(EditOperation.Deletion, prev, word[i]);
            if (!result.ContainsKey(edited))
            {
                result.Add(edited, prop);
            }
            else if (prop > result[edited])
            {
                result[edited] = prop;
            }
        }
    }

    bool found = false;

    // insertions (inserting a space splits the word; every part must be a known word)
    for (int i = 0; i <= word.Length; i++)
    {
        foreach (string item in alphabetWithSpace)
        {
            string edited = String.Copy(word).Insert(i, item);
            if (item == " ")
            {
                string tr = edited.Trim();
                if (tr != word)
                {
                    string[] parts = tr.Split(space);
                    foreach (string part in parts)
                    {
                        if (dictionary.FindWord(part))
                        {
                            found = true;
                        }
                        else
                        {
                            found = false;
                            break;
                        }
                    }
                }
            }

            if (found || dictionary.FindWord(edited))
            {
                char prev = (i - 1) < 0 ? ' ' : word[i - 1];
                double prop = this.CalculateProbability(EditOperation.Insertion, prev, item[0]);
                if (!result.ContainsKey(edited))
                {
                    result.Add(edited, prop);
                }
                else if (prop > result[edited])
                {
                    result[edited] = prop;
                }
                found = false;
            }
        }
    }

    // transpositions
    for (int i = 0; i < word.Length - 1; i++)
    {
        string newString = String.Copy(word);
        string charItem = newString[i].ToString();
        string edited = newString.Remove(i, 1).Insert(i + 1, charItem);
        if (dictionary.FindWord(edited))
        {
            double prop = this.CalculateProbability(EditOperation.Transposition, word[i], word[i + 1]);
            if (!result.ContainsKey(edited))
            {
                result.Add(edited, prop);
            }
            else if (prop > result[edited])
            {
                result[edited] = prop;
            }
        }
    }

    return result;
}
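// Illustrative sketch only (not project code): the space-insertion branch above accepts a
// split candidate only when every part of the split is a known word. The wordlist is made up.
using System;
using System.Collections.Generic;
using System.Linq;

static class SplitCandidateSketch
{
    static readonly HashSet<string> Words = new HashSet<string> { "in", "the" };

    static bool IsValidSplit(string candidate)
    {
        string[] parts = candidate.Trim().Split(' ');
        return parts.All(part => Words.Contains(part));
    }

    static void Main()
    {
        Console.WriteLine(IsValidSplit("in the"));   // True  -> "inthe" can be corrected to "in the"
        Console.WriteLine(IsValidSplit("int he"));   // False -> "int" and "he" are not in the wordlist
    }
}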
public StringChecker(string text, Dictionary dictionary, int contextSize = 2)
    : base(dictionary, contextSize)
{
    reader = new StringReader(text);
    contextLeft = contextSize;
}
internal void ParseFrequences(Dictionary dictionary)
{
    FrequencyVector<string> oneChrFrq = this.frequencyParser.ParseFrequency(dictionary.GetFile(DictionaryFileType.OneCharFrequences));
    dictionary.AddFrequencyVector(FrequencyVectorType.OneChar, oneChrFrq);

    FrequencyVector<string> twoChrFrq = this.frequencyParser.ParseFrequency(dictionary.GetFile(DictionaryFileType.TwoCharFrequences));
    dictionary.AddFrequencyVector(FrequencyVectorType.TwoChar, twoChrFrq);
}
protected Dictionary<char, List<char>> ParsePairs(string[] pairs)
{
    Dictionary<char, List<char>> result = new Dictionary<char, List<char>>();

    foreach (string pair in pairs)
    {
        // Each pair has the form "base-accented", e.g. "e-ě".
        string[] data = pair.Split(new char[] { '-' }, StringSplitOptions.RemoveEmptyEntries);
        if (result.ContainsKey(data[0][0]))
        {
            result[data[0][0]].Add(data[1][0]);
        }
        else
        {
            result.Add(data[0][0], new List<char>() { data[1][0] });
        }
    }

    return result;
}
protected Dictionary GetDictionary(FileInfo dictionaryXml, string path)
{
    XmlDocument xml = new XmlDocument();
    xml.Load(dictionaryXml.FullName);

    XmlNodeList root = xml.GetElementsByTagName("Dictionary");
    XmlNode node = null;
    if (root.Count > 0)
    {
        node = root.Item(0);
    }

    Dictionary dictionary = null;
    if (null != node)
    {
        DictionaryLoader loader = this.CreateDefaultLoader();
        XmlElement el = node as XmlElement;

        string name = node.Attributes["locale"].Value;
        char[] alphabet = node.Attributes["alphabet"].Value.ToCharArray();
        char[] specialChars = null;
        Dictionary<char, List<char>> accentPairs = null;
        string regex = null;

        if (el.HasAttribute("allowedSpecialChars"))
        {
            specialChars = node.Attributes["allowedSpecialChars"].Value.ToCharArray();
        }
        if (el.HasAttribute("wordBoundaryRegex"))
        {
            regex = node.Attributes["wordBoundaryRegex"].Value;
        }
        if (el.HasAttribute("accentPairs"))
        {
            string[] pairs = node.Attributes["accentPairs"].Value.Split(new char[] { ',' }, StringSplitOptions.RemoveEmptyEntries);
            accentPairs = this.ParsePairs(pairs);
        }

        dictionary = new Dictionary(loader, name, path, alphabet, specialChars, regex, accentPairs);

        foreach (XmlNode file in node.ChildNodes)
        {
            DictionaryFileType type = DictionaryFileType.Unknown;
            switch (file.Attributes["type"].Value)
            {
                case "LineDictionary":
                    type = DictionaryFileType.LineDictionary;
                    break;
                case "Dictionary":
                    type = DictionaryFileType.Dictionary;
                    break;
                case "Affix":
                    type = DictionaryFileType.Affix;
                    break;
                case "OneCharFrequences":
                    type = DictionaryFileType.OneCharFrequences;
                    break;
                case "TwoCharFrequences":
                    type = DictionaryFileType.TwoCharFrequences;
                    break;
                case "DeletionsMatrix":
                    type = DictionaryFileType.DeletetionsMatrix;
                    break;
                case "InsertionsMatrix":
                    type = DictionaryFileType.InsertionsMatrix;
                    break;
                case "TranspositionsMatrix":
                    type = DictionaryFileType.TranspositionsMatrix;
                    break;
                case "SubstitutionsMatrix":
                    type = DictionaryFileType.SubstitutionsMatrix;
                    break;
                case "UnigramFrequences":
                    type = DictionaryFileType.UnigramFrequences;
                    break;
                case "DigramFrequences":
                    type = DictionaryFileType.DigramFrequences;
                    break;
                case "TrigramFrequences":
                    type = DictionaryFileType.TrigramFrequences;
                    break;
            }

            dictionary.AddFile(type, file.InnerText.Trim());
        }
    }

    return dictionary;
}
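// Hypothetical example of a dictionary definition file in the shape GetDictionary above expects.
// The attribute names and "type" values follow the parsing code; the locale, file names and the
// child element name "File" are illustrative assumptions, not taken from real data files.
static class DictionaryDefinitionSample
{
    public const string Xml = @"
<Dictionary locale=""cs_CZ""
            alphabet=""abcdefghijklmnopqrstuvwxyzáéíóúýčďěňřšťžů""
            wordBoundaryRegex=""[a-záéíóúýčďěňřšťžů]+""
            accentPairs=""a-á,e-é,e-ě,c-č"">
  <File type=""Dictionary"">cs_CZ.dic</File>
  <File type=""Affix"">cs_CZ.aff</File>
  <File type=""UnigramFrequences"">cs_CZ_unigrams.txt</File>
  <File type=""DigramFrequences"">cs_CZ_digrams.txt</File>
  <File type=""SubstitutionsMatrix"">cs_CZ_sub.txt</File>
</Dictionary>";
}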
// todo move
internal void ParseSimpleDictionary(Dictionary dictionary)
{
    string file = dictionary.GetFile(DictionaryFileType.LineDictionary);
    if (null != file)
    {
        Encoding enc = EncodingDetector.DetectEncoding(file);
        using (StreamReader reader = new StreamReader(file, enc))
        {
            while (!reader.EndOfStream)
            {
                dictionary.Add(reader.ReadLine());
            }
        }
    }
}
internal void ParseNgrams(Dictionary dictionary)
{
    DictionaryFileType[] files = new DictionaryFileType[]
    {
        DictionaryFileType.UnigramFrequences,
        DictionaryFileType.DigramFrequences,
        DictionaryFileType.TrigramFrequences
    };

    foreach (DictionaryFileType type in files)
    {
        string file = dictionary.GetFile(type);
        if (null != file)
        {
            NgramCollection collection = this.ngramParser.ParseNgrams(file);

            NgramType ngramType;
            switch (type)
            {
                case DictionaryFileType.DigramFrequences:
                    ngramType = NgramType.Digram;
                    break;
                case DictionaryFileType.TrigramFrequences:
                    ngramType = NgramType.Trigram;
                    break;
                default:
                    ngramType = NgramType.Unigram;
                    break;
            }

            dictionary.AddNgramCollection(ngramType, collection);
        }
    }
}
public LanguageModel(Dictionary dict)
{
    dictionary = dict;
}
public AccentModel(Dictionary dictionary)
{
    this.dictionary = dictionary;
    this.accentPairs = dictionary.GetAccentPairs();
}
public FolderCorrector(Dictionary dictionary, string[] sourceFiles, string resultDirectory = null, string reportDirectory = null)
{
    this.OnlySelectedFiles = true;
    this.FilesToProcess = this.GetFileInfo(sourceFiles);
    this.PrepareProject(dictionary, resultDirectory, reportDirectory, false);
}
public FolderCorrector(Dictionary dictionary, string sourceDirectory, string resultDirectory = null, string reportDirectory = null, bool preserveSubfolders = true)
{
    this.SourceDirectory = sourceDirectory;

    // prepare files and folders
    this.FilesToProcess = this.AnalyzeDir(new DirectoryInfo(sourceDirectory));
    this.PrepareProject(dictionary, resultDirectory, reportDirectory, preserveSubfolders);
}
private void PrepareProject(Dictionary dictionary, string resultDirectory, string reportDirectory, bool preserveSubfolders)
{
    this.ExportContext = false;
    this.ResultDirectory = resultDirectory;
    this.ReportDirectory = reportDirectory;
    this.dictionary = dictionary;

    // setup models
    this.languageModel = new LanguageModel(dictionary);
    this.errorModel = new ErrorModel(dictionary);
    this.accentModel = dictionary.IsAccentModelAvailable() ? new AccentModel(dictionary) : null;

    // setup corrector
    this.corrector = new Corrector(errorModel, languageModel, accentModel);

    this.ThreadsAvailable = this.ScaleThreads();
    this.filesGroups = this.DivadeIntoGroups(this.ThreadsAvailable);
    this.ThreadsUsed = this.FilesToProcess.Count > 1 ? filesGroups.Length : 1;

    // other settings
    PreserveSubfolders = preserveSubfolders;
}
public LanguageModelEvaluation(Dictionary<string, double> evaluation, bool foundInNgrams)
{
    Probabilities = evaluation;
    FoundInNgrams = foundInNgrams;
}
public Checker(Dictionary dictionary, int contextSize)
{
    this.tokenizer = new Tokenizer(dictionary);
}