Пример #1
0
        /// <summary>
        /// Parses the affix file (when present) and the main dictionary file of
        /// <paramref name="dictionary"/> and adds the resulting words to it.
        /// </summary>
        /// <param name="dictionary">Dictionary that supplies the file paths and receives the words.</param>
        internal void ParseDictionary(Dictionary dictionary)
        {
            AffixRules rules = null;
            Encoding encoding = null;

            string affixFile = dictionary.GetFile(DictionaryFileType.Affix);
            if (null != affixFile)
            {
                // Try the first detector, then fall back to EncodingDetector.
                encoding = Utils.DetectEncoding(affixFile);
                if (null == encoding)
                {
                    encoding = EncodingDetector.DetectEncoding(affixFile);
                }
                rules = this.affixParser.Parse(affixFile, encoding);
            }

            string fileName = dictionary.GetFile(DictionaryFileType.Dictionary);
            if (null == fileName)
            {
                // Nothing to load; the other Parse* loaders skip missing files the same way.
                return;
            }

            // NOTE(review): encoding is still null here when no affix file exists - confirm
            // that the parser copes with a null encoding.
            DictionaryWithFlags rawDict = this.parser.Parse(fileName, encoding);

            foreach (DictionaryItemWithFlags item in rawDict)
            {
                if (null == item.Flags || null == rules)
                {
                    // Without flags, or without affix rules to expand them, only the
                    // base word can be added.
                    dictionary.Add(item.Word);
                }
                else
                {
                    dictionary.AddRange(rules.GetPossibleWords(item));
                }
            }
        }
Пример #2
0
        /// <summary>
        /// Builds the test fixture: loads the "en_US" dictionary, stores its two
        /// error-model alphabets in sorted order and parses the error list.
        /// </summary>
        public MatrixGeneratorTest()
        {
            // NOTE(review): the dictionaries directory is an absolute, machine-specific path.
            Dictionary enUs = new DictionaryManager(@"C:\dev\git\Pspell\SpellCheckerConsole\bin\Debug\dictionaries").GetDictionary("en_US");

            char[] withSpace = enUs.GetAlphabetForErrorModel(true).ToCharArray();
            char[] plain = enUs.GetAlphabetForErrorModel().ToCharArray();

            Array.Sort(withSpace);
            Array.Sort(plain);

            this.alphabetWithSpace = withSpace;
            this.alphabet = plain;

            testData = new ErrorListParser("test_errors.txt").Parse();
        }
Пример #3
0
        /// <summary>
        /// Creates a dictionary description and caches the error-model alphabet.
        /// </summary>
        /// <param name="loader">Loader stored for later use.</param>
        /// <param name="name">Value assigned to <c>Name</c>.</param>
        /// <param name="path">Path stored with the dictionary.</param>
        /// <param name="alphabet">Value assigned to <c>Alphabet</c>.</param>
        /// <param name="specialChars">Optional special characters stored as <c>specialCharsInsideWord</c>.</param>
        /// <param name="wordBoundaryRegex">Optional word-boundary regular expression.</param>
        /// <param name="accentPairs">Optional accent pairs.</param>
        public Dictionary(DictionaryLoader loader, string name, string path, char[] alphabet, char[] specialChars = null, string wordBoundaryRegex = null, Dictionary<char, List<char>> accentPairs = null)
        {
            this.loader = loader;
            this.path = path;

            Name = name;
            Alphabet = alphabet;
            specialCharsInsideWord = specialChars;

            this.accentPairs = accentPairs;
            this.wordBoundaryRegex = wordBoundaryRegex;

            // Must run last: the call happens on a fully assigned instance.
            char[] cache = this.GetAlphabetForErrorModel(true).ToCharArray();
            this.AlphabetWithSpecialCharsCache = cache;
        }
Пример #4
0
        /// <summary>
        /// Scores every candidate against the n-gram collections, using the words to the
        /// left and to the right of the misspelled word.
        /// </summary>
        /// <param name="word">Misspelled word that supplies the left and right context.</param>
        /// <param name="candidates">Candidate corrections; only the keys are used here.</param>
        /// <returns>
        /// The probability computed for each candidate, together with a flag telling whether
        /// any evaluation reported a positive occurrence count.
        /// </returns>
        public LanguageModelEvaluation EvaluateCandidates(MisspelledWord word, Dictionary<string, double> candidates)
        {
            foundInNgrams = false;
            List<string> leftContext = word.GetLeftContext();

            NgramType type = this.dictionary.GetHighestAvailableNgramCollection(leftContext.Count);

            Dictionary<string, double> probability = new Dictionary<string, double>();
            string[] lcArray = this.GetLeftContext(leftContext, type);
            NgramEvaluation evaluation;
            foreach (KeyValuePair<string, double> option in candidates)
            {
                // For a candidate made of several words only its first word follows the left context.
                // NOTE(review): assumes lcArray has a slot at leftContext.Count - 1 - confirm against GetLeftContext.
                lcArray[leftContext.Count - 1] = option.Key.Contains(' ') ? option.Key.Split(space).First() : option.Key;

                evaluation = this.dictionary.GetNgramCollection(type).GetProbability(lcArray);
                probability.Add(option.Key, evaluation.Probability);

                if (!foundInNgrams && evaluation.Occurence > 0)
                {
                    foundInNgrams = true;
                }
            }

            List<string> rightContext = word.GetRightContext();
            NgramType secType = this.dictionary.GetHighestAvailableNgramCollection(rightContext.Count);

            // Skip the right-hand pass only when BOTH sides resolve to unigrams. The previous
            // condition tested "type" twice, so a usable right context was ignored whenever
            // the left side was a unigram.
            if (type != NgramType.Unigram || secType != NgramType.Unigram)
            {
                string[] rcArray = this.GetRightContext(rightContext, secType);
                foreach (KeyValuePair<string, double> option in candidates)
                {
                    // For a candidate made of several words only its last word precedes the right context.
                    rcArray[0] = option.Key.Contains(' ') ? option.Key.Split(space).Last() : option.Key;

                    evaluation = this.dictionary.GetNgramCollection(secType).GetProbability(rcArray);
                    probability[option.Key] *= evaluation.Probability;

                    if (!foundInNgrams && evaluation.Occurence > 0)
                    {
                        foundInNgrams = true;
                    }
                }
            }

            return new LanguageModelEvaluation(probability, foundInNgrams);
        }
Пример #5
0
 /// <summary>
 /// Opens <paramref name="file"/> with <c>FileMode.Create</c> and a UTF-8 writer,
 /// then closes it again without writing any words.
 /// </summary>
 /// <param name="file">Path of the file to create.</param>
 /// <param name="dictionary">Currently unused.</param>
 public static void Write(string file, Dictionary dictionary)
 {
     using (FileStream output = new FileStream(file, FileMode.Create, FileAccess.Write))
     using (StreamWriter writer = new StreamWriter(output, Encoding.UTF8))
     {
         // The export loop (one writer.WriteLine(word) per word in the dictionary,
         // followed by writer.Close()) is disabled, so nothing is written here.
     }
 }
Пример #6
0
        /// <summary>
        /// Parses each confusion-matrix file registered in <paramref name="dictionary"/>
        /// and attaches the result to it under the matching edit operation.
        /// </summary>
        /// <param name="dictionary">Dictionary that supplies the files and receives the matrices.</param>
        internal void ParseConfusionMatrixes(Dictionary dictionary)
        {
            DictionaryFileType[] matrixTypes =
            {
                DictionaryFileType.DeletetionsMatrix,
                DictionaryFileType.InsertionsMatrix,
                DictionaryFileType.SubstitutionsMatrix,
                DictionaryFileType.TranspositionsMatrix
            };

            for (int index = 0; index < matrixTypes.Length; index++)
            {
                DictionaryFileType matrixType = matrixTypes[index];
                string matrixFile = dictionary.GetFile(matrixType);
                if (matrixFile == null)
                {
                    // No file registered for this matrix type.
                    continue;
                }

                ConfusionMatrix parsed = this.matrixParser.ParseMatrix(matrixFile);
                dictionary.AddConfusionMatrix(ConvertFileTypeToEditOperation(matrixType), parsed);
            }
        }
Пример #7
0
        /// <summary>
        /// Stores the dictionary and directories, creates the error and language models,
        /// and sets up the four matrix generators and the two character-frequency counters.
        /// </summary>
        /// <param name="dictionary">Dictionary the models and alphabets are taken from.</param>
        /// <param name="directory">Directory stored in <c>directory</c>.</param>
        /// <param name="outputDirectory">Directory stored in <c>outputDirectory</c>.</param>
        public DictionaryGenerator(Dictionary dictionary, string directory, string outputDirectory)
        {
            this.dictionary = dictionary;
            this.directory = directory;
            this.outputDirectory = outputDirectory;

            this.errorModel = new MPSpell.Correction.ErrorModel(dictionary);
            this.languageModel = new LanguageModel(dictionary);

            // Initial value passed to every matrix generator.
            const int initValue = 1;

            char[] withSpace = dictionary.GetAlphabetForErrorModel(true).ToCharArray();
            char[] plain = dictionary.GetAlphabetForErrorModel().ToCharArray();

            // Insertions and deletions use the alphabet requested with "true";
            // substitutions and transpositions use the default one.
            insGen = new InsertionsMatrixGenerator(withSpace, initValue);
            delGen = new DeletionsMatrixGenerator(withSpace, initValue);
            subGen = new SubstitutionsMatrixGenerator(plain, initValue);
            trnGen = new TranspositionsMatrixGenerator(plain, initValue);

            charCounter = new CharFrequencyCounter(withSpace.ToStringArray());
            twoCharCounter = new TwoCharFrequencyCounter(withSpace.ToStringArray());
        }
Пример #8
0
        /// <summary>
        /// Stores <paramref name="value"/> in the cell addressed by
        /// <paramref name="rowKey"/> and <paramref name="columnKey"/>, creating the row
        /// and/or the cell when they do not exist yet.
        /// </summary>
        /// <param name="rowKey">Key of the row.</param>
        /// <param name="columnKey">Key of the column inside the row.</param>
        /// <param name="value">Value to store; replaces any previous value.</param>
        public void SetValue(char rowKey, char columnKey, int value)
        {
            if (!matrix.ContainsKey(rowKey))
            {
                // First value for this row: create its column map.
                matrix.Add(rowKey, new Dictionary<char, int>());
            }

            // The indexer setter adds a missing column and overwrites an existing one.
            matrix[rowKey][columnKey] = value;
        }
Пример #9
0
        /// <summary>
        /// Builds two fixtures: an "en_US" dictionary containing one word and a "cs_CZ"
        /// dictionary with accent pairs, a word-boundary regex and three words.
        /// </summary>
        public DictionaryTest()
        {
            DictionaryLoader englishLoader = new DictionaryLoader(new DefaultDictionaryFileParser());
            dictionary = new Dictionary(englishLoader, "en_US", "gen", "abcdefghijklmnopqrstuvwxyz".ToCharArray());
            dictionary.Add("acres");

            // Base letter -> accented variants.
            Dictionary<char, List<char>> accentPairs = new Dictionary<char, List<char>>
            {
                { 'a', new List<char> { 'á' } },
                { 'e', new List<char> { 'é', 'ě' } },
                { 'i', new List<char> { 'í' } },
                { 'o', new List<char> { 'ó' } },
                { 'u', new List<char> { 'ú', 'ů' } },
                { 'y', new List<char> { 'ý' } },
                { 'c', new List<char> { 'č' } },
                { 'd', new List<char> { 'ď' } },
                { 'n', new List<char> { 'ň' } },
                { 'r', new List<char> { 'ř' } },
                { 's', new List<char> { 'š' } },
                { 't', new List<char> { 'ť' } },
                { 'z', new List<char> { 'ž' } }
            };

            DictionaryLoader czechLoader = new DictionaryLoader(new DefaultDictionaryFileParser());
            char[] czechAlphabet = "abcdefghijklmnopqrstuvwxyzáéíóúýčďěňřšťžů".ToCharArray();
            csCZ = new Dictionary(czechLoader, "cs_CZ", "gen", czechAlphabet, null, "[a-záéíóúýčďěňřšťžů]+", accentPairs);

            foreach (string czechWord in new string[] { "večeře", "véčera", "věci" })
            {
                csCZ.Add(czechWord);
            }
        }
Пример #10
0
        /// <summary>
        /// Generates every dictionary word reachable from <paramref name="word"/> by one
        /// substitution, deletion, insertion or transposition, together with the
        /// probability computed for that edit.
        /// </summary>
        /// <param name="word">Word to generate candidates for.</param>
        /// <returns>
        /// Candidate strings mapped to their probability. When several edits produce the
        /// same candidate, the highest probability is kept.
        /// </returns>
        public Dictionary<string, double> GeneratePossibleWords(string word)
        {
            Dictionary<string, double> result = new Dictionary<string, double>();

            // substitution: replace the character at i with each alphabet entry
            for (int i = 0; i < word.Length; i++)
            {
                foreach (string charItem in alphabet)
                {
                    string edited = word.Remove(i, 1).Insert(i, charItem);
                    if (dictionary.FindWord(edited))
                    {
                        double prop = this.CalculateProbability(EditOperation.Substitution, word[i], charItem[0]);
                        KeepBestCandidate(result, edited, prop);
                    }
                }
            }

            // deletions: drop the character at i
            for (int i = 0; i < word.Length; i++)
            {
                string edited = word.Remove(i, 1);
                if (dictionary.FindWord(edited))
                {
                    // The character before the deleted one; a space stands for the word start.
                    // Previously word[i] was used here, i.e. the deleted character itself.
                    char prev = i == 0 ? ' ' : word[i - 1];
                    double prop = this.CalculateProbability(EditOperation.Deletion, prev, word[i]);
                    KeepBestCandidate(result, edited, prop);
                }
            }

            // insertions: insert each alphabet entry (including the space) at every position
            for (int i = 0; i <= word.Length; i++)
            {
                foreach (string item in alphabetWithSpace)
                {
                    string edited = word.Insert(i, item);

                    // An inserted space is accepted when it splits the word into parts
                    // that are all dictionary words.
                    bool allPartsFound = false;
                    if (item == " ")
                    {
                        string tr = edited.Trim();
                        if (tr != word)
                        {
                            string[] parts = tr.Split(space);
                            foreach (string part in parts)
                            {
                                if (dictionary.FindWord(part))
                                {
                                    allPartsFound = true;
                                }
                                else
                                {
                                    allPartsFound = false;
                                    break;
                                }
                            }
                        }
                    }

                    if (allPartsFound || dictionary.FindWord(edited))
                    {
                        char prev = i == 0 ? ' ' : word[i - 1];
                        double prop = this.CalculateProbability(EditOperation.Insertion, prev, item[0]);
                        KeepBestCandidate(result, edited, prop);
                    }
                }
            }

            // transposition: swap the characters at i and i + 1
            for (int i = 0; i < word.Length - 1; i++)
            {
                string charItem = word[i].ToString();
                string edited = word.Remove(i, 1).Insert(i + 1, charItem);
                if (dictionary.FindWord(edited))
                {
                    double prop = this.CalculateProbability(EditOperation.Transposition, word[i], word[i + 1]);
                    KeepBestCandidate(result, edited, prop);
                }
            }

            return result;
        }

        /// <summary>
        /// Adds <paramref name="candidate"/> to <paramref name="candidates"/>, or raises its
        /// stored probability when <paramref name="probability"/> is higher.
        /// </summary>
        private static void KeepBestCandidate(Dictionary<string, double> candidates, string candidate, double probability)
        {
            double known;
            if (!candidates.TryGetValue(candidate, out known) || probability > known)
            {
                candidates[candidate] = probability;
            }
        }
Пример #11
0
 /// <summary>
 /// Creates a checker that reads its input from an in-memory string.
 /// </summary>
 /// <param name="text">Text wrapped in a <c>StringReader</c>.</param>
 /// <param name="dictionary">Dictionary forwarded to the base constructor.</param>
 /// <param name="contextSize">Context size forwarded to the base constructor and stored in <c>contextLeft</c>.</param>
 public StringChecker(string text, Dictionary dictionary, int contextSize = 2)
     : base(dictionary, contextSize)
 {
     contextLeft = contextSize;
     reader = new StringReader(text);
 }
Пример #12
0
        /// <summary>
        /// Parses the one-character and two-character frequency files of
        /// <paramref name="dictionary"/> and registers both vectors on it.
        /// </summary>
        /// <param name="dictionary">Dictionary that supplies the files and receives the vectors.</param>
        internal void ParseFrequences(Dictionary dictionary)
        {
            // NOTE(review): unlike the matrix and n-gram loaders, a missing file (null path)
            // is passed straight to the parser here - confirm that this is intended.
            string oneCharFile = dictionary.GetFile(DictionaryFileType.OneCharFrequences);
            FrequencyVector<string> oneCharVector = this.frequencyParser.ParseFrequency(oneCharFile);
            dictionary.AddFrequencyVector(FrequencyVectorType.OneChar, oneCharVector);

            string twoCharFile = dictionary.GetFile(DictionaryFileType.TwoCharFrequences);
            FrequencyVector<string> twoCharVector = this.frequencyParser.ParseFrequency(twoCharFile);
            dictionary.AddFrequencyVector(FrequencyVectorType.TwoChar, twoCharVector);
        }
Пример #13
0
        /// <summary>
        /// Converts entries of the form "x-y" into a lookup from the first character of
        /// the left part to the first characters of the right parts, in input order.
        /// </summary>
        /// <param name="pairs">Entries to parse; each is split on '-' with empty parts removed.</param>
        /// <returns>A map from each left character to the list of its right characters.</returns>
        /// <exception cref="FormatException">
        /// An entry does not contain two non-empty parts separated by '-'.
        /// </exception>
        protected Dictionary<char, List<char>> ParsePairs(string[] pairs)
        {
            char[] separator = new char[] { '-' };
            Dictionary<char, List<char>> result = new Dictionary<char, List<char>>();

            foreach (string pair in pairs)
            {
                string[] data = pair.Split(separator, StringSplitOptions.RemoveEmptyEntries);
                if (data.Length < 2)
                {
                    // Report the offending entry instead of failing with an index error.
                    throw new FormatException("Invalid pair '" + pair + "': expected two parts separated by '-'.");
                }

                char key = data[0][0];
                List<char> values;
                if (!result.TryGetValue(key, out values))
                {
                    values = new List<char>();
                    result.Add(key, values);
                }

                values.Add(data[1][0]);
            }

            return result;
        }
Пример #14
0
        /// <summary>
        /// Builds a <c>Dictionary</c> from the first "Dictionary" element of the given
        /// XML description file and registers every file listed beneath it.
        /// </summary>
        /// <param name="dictionaryXml">XML file describing the dictionary.</param>
        /// <param name="path">Path passed on to the created dictionary.</param>
        /// <returns>The created dictionary, or <c>null</c> when the XML has no "Dictionary" element.</returns>
        protected Dictionary GetDictionary(FileInfo dictionaryXml, string path)
        {
            XmlDocument xml = new XmlDocument();
            xml.Load(dictionaryXml.FullName);

            XmlNodeList roots = xml.GetElementsByTagName("Dictionary");
            if (roots.Count == 0)
            {
                return null;
            }

            // GetElementsByTagName only yields elements, so the cast cannot fail.
            XmlElement root = (XmlElement)roots.Item(0);

            DictionaryLoader loader = this.CreateDefaultLoader();

            // Required attributes.
            string name = root.Attributes["locale"].Value;
            char[] alphabet = root.Attributes["alphabet"].Value.ToCharArray();

            // Optional attributes stay null when absent.
            char[] specialChars = null;
            Dictionary<char, List<char>> accentPairs = null;
            string regex = null;
            if (root.HasAttribute("allowedSpecialChars"))
            {
                specialChars = root.GetAttribute("allowedSpecialChars").ToCharArray();
            }
            if (root.HasAttribute("wordBoundaryRegex"))
            {
                regex = root.GetAttribute("wordBoundaryRegex");
            }
            if (root.HasAttribute("accentPairs"))
            {
                string[] pairs = root.GetAttribute("accentPairs").Split(new char[] { ',' }, StringSplitOptions.RemoveEmptyEntries);
                accentPairs = this.ParsePairs(pairs);
            }

            Dictionary dictionary = new Dictionary(loader, name, path, alphabet, specialChars, regex, accentPairs);

            foreach (XmlNode child in root.ChildNodes)
            {
                // Comments and other non-element nodes have no attributes (Attributes is
                // null for them), so they are skipped instead of causing a NullReferenceException.
                XmlElement fileElement = child as XmlElement;
                if (fileElement == null)
                {
                    continue;
                }

                // GetAttribute returns an empty string for a missing "type" attribute,
                // which maps to DictionaryFileType.Unknown like any unrecognized value.
                DictionaryFileType type = ParseDictionaryFileType(fileElement.GetAttribute("type"));
                dictionary.AddFile(type, fileElement.InnerText.Trim());
            }

            return dictionary;
        }

        /// <summary>
        /// Maps the value of a "type" attribute to its <c>DictionaryFileType</c>;
        /// unrecognized values map to <c>DictionaryFileType.Unknown</c>.
        /// </summary>
        private static DictionaryFileType ParseDictionaryFileType(string typeName)
        {
            switch (typeName)
            {
                case "LineDictionary":
                    return DictionaryFileType.LineDictionary;

                case "Dictionary":
                    return DictionaryFileType.Dictionary;

                case "Affix":
                    return DictionaryFileType.Affix;

                case "OneCharFrequences":
                    return DictionaryFileType.OneCharFrequences;

                case "TwoCharFrequences":
                    return DictionaryFileType.TwoCharFrequences;

                case "DeletionsMatrix":
                    return DictionaryFileType.DeletetionsMatrix;

                case "InsertionsMatrix":
                    return DictionaryFileType.InsertionsMatrix;

                case "TranspositionsMatrix":
                    return DictionaryFileType.TranspositionsMatrix;

                case "SubstitutionsMatrix":
                    return DictionaryFileType.SubstitutionsMatrix;

                case "UnigramFrequences":
                    return DictionaryFileType.UnigramFrequences;

                case "DigramFrequences":
                    return DictionaryFileType.DigramFrequences;

                case "TrigramFrequences":
                    return DictionaryFileType.TrigramFrequences;

                default:
                    return DictionaryFileType.Unknown;
            }
        }
Пример #15
0
 // TODO: move
 /// <summary>
 /// Reads the line dictionary file of <paramref name="dictionary"/>, if one is
 /// registered, and adds every line to the dictionary as a word.
 /// </summary>
 /// <param name="dictionary">Dictionary that supplies the file and receives the words.</param>
 internal void ParseSimpleDictionary(Dictionary dictionary)
 {
     string lineFile = dictionary.GetFile(DictionaryFileType.LineDictionary);
     if (lineFile == null)
     {
         return;
     }

     Encoding detected = EncodingDetector.DetectEncoding(lineFile);
     using (StreamReader input = new StreamReader(lineFile, detected))
     {
         // ReadLine returns null once the end of the stream is reached.
         string line;
         while ((line = input.ReadLine()) != null)
         {
             dictionary.Add(line);
         }
     }
 }
Пример #16
0
        /// <summary>
        /// Parses the unigram, digram and trigram frequency files registered in
        /// <paramref name="dictionary"/> and attaches each collection to it.
        /// </summary>
        /// <param name="dictionary">Dictionary that supplies the files and receives the collections.</param>
        internal void ParseNgrams(Dictionary dictionary)
        {
            DictionaryFileType[] frequencyFiles =
            {
                DictionaryFileType.UnigramFrequences,
                DictionaryFileType.DigramFrequences,
                DictionaryFileType.TrigramFrequences
            };

            foreach (DictionaryFileType fileType in frequencyFiles)
            {
                string ngramFile = dictionary.GetFile(fileType);
                if (ngramFile == null)
                {
                    // No file registered for this n-gram order.
                    continue;
                }

                NgramCollection parsed = this.ngramParser.ParseNgrams(ngramFile);

                // Anything that is neither digram nor trigram is treated as unigram.
                NgramType order;
                if (fileType == DictionaryFileType.DigramFrequences)
                {
                    order = NgramType.Digram;
                }
                else if (fileType == DictionaryFileType.TrigramFrequences)
                {
                    order = NgramType.Trigram;
                }
                else
                {
                    order = NgramType.Unigram;
                }

                dictionary.AddNgramCollection(order, parsed);
            }
        }
Пример #17
0
 /// <summary>
 /// Creates a language model that works on the given dictionary.
 /// </summary>
 /// <param name="dict">Dictionary stored in the <c>dictionary</c> member.</param>
 public LanguageModel(Dictionary dict)
 {
     dictionary = dict;
 }
Пример #18
0
 /// <summary>
 /// Creates an accent model for the given dictionary and takes over its accent pairs.
 /// </summary>
 /// <param name="dictionary">Dictionary to store; its <c>GetAccentPairs()</c> result is stored as well.</param>
 public AccentModel(Dictionary dictionary)
 {
     var pairs = dictionary.GetAccentPairs();

     this.dictionary = dictionary;
     this.accentPairs = pairs;
 }
Пример #19
0
        /// <summary>
        /// Creates a corrector that processes an explicit list of files.
        /// </summary>
        /// <param name="dictionary">Dictionary passed on to <c>PrepareProject</c>.</param>
        /// <param name="sourceFiles">Files to process, converted through <c>GetFileInfo</c>.</param>
        /// <param name="resultDirectory">Optional result directory.</param>
        /// <param name="reportDirectory">Optional report directory.</param>
        public FolderCorrector(Dictionary dictionary, string[] sourceFiles, string resultDirectory = null, string reportDirectory = null)
        {
            OnlySelectedFiles = true;
            FilesToProcess = GetFileInfo(sourceFiles);

            // Subfolders are never preserved for an explicit file list.
            PrepareProject(dictionary, resultDirectory, reportDirectory, preserveSubfolders: false);
        }
Пример #20
0
        /// <summary>
        /// Creates a corrector that processes the files found by <c>AnalyzeDir</c>
        /// in a source directory.
        /// </summary>
        /// <param name="dictionary">Dictionary passed on to <c>PrepareProject</c>.</param>
        /// <param name="sourceDirectory">Directory to analyze.</param>
        /// <param name="resultDirectory">Optional result directory.</param>
        /// <param name="reportDirectory">Optional report directory.</param>
        /// <param name="preserveSubfolders">Passed on to <c>PrepareProject</c>.</param>
        public FolderCorrector(Dictionary dictionary, string sourceDirectory, string resultDirectory = null, string reportDirectory = null, bool preserveSubfolders = true)
        {
            SourceDirectory = sourceDirectory;

            // Collect the files and folders to work on.
            DirectoryInfo sourceInfo = new DirectoryInfo(sourceDirectory);
            FilesToProcess = AnalyzeDir(sourceInfo);

            PrepareProject(dictionary, resultDirectory, reportDirectory, preserveSubfolders);
        }
Пример #21
0
        /// <summary>
        /// Shared constructor logic: stores the settings, creates the models and the
        /// corrector, and splits the files to process into groups.
        /// </summary>
        /// <param name="dictionary">Dictionary used by all models.</param>
        /// <param name="resultDirectory">Value assigned to <c>ResultDirectory</c>.</param>
        /// <param name="reportDirectory">Value assigned to <c>ReportDirectory</c>.</param>
        /// <param name="preserveSubfolders">Value assigned to <c>PreserveSubfolders</c>.</param>
        private void PrepareProject(Dictionary dictionary, string resultDirectory, string reportDirectory, bool preserveSubfolders)
        {
            ExportContext = false;
            ResultDirectory = resultDirectory;
            ReportDirectory = reportDirectory;

            this.dictionary = dictionary;

            // Models: the accent model exists only when the dictionary reports it as available.
            this.languageModel = new LanguageModel(dictionary);
            this.errorModel = new ErrorModel(dictionary);
            if (dictionary.IsAccentModelAvailable())
            {
                this.accentModel = new AccentModel(dictionary);
            }
            else
            {
                this.accentModel = null;
            }

            // Corrector built from the three models.
            this.corrector = new Corrector(this.errorModel, this.languageModel, this.accentModel);

            // Threading: a single file (or none) always counts as one thread in use.
            ThreadsAvailable = ScaleThreads();
            this.filesGroups = DivadeIntoGroups(ThreadsAvailable);
            if (FilesToProcess.Count > 1)
            {
                ThreadsUsed = this.filesGroups.Length;
            }
            else
            {
                ThreadsUsed = 1;
            }

            // Remaining settings.
            PreserveSubfolders = preserveSubfolders;
        }
Пример #22
0
 /// <summary>
 /// Creates the result of a language-model evaluation.
 /// </summary>
 /// <param name="evalutation">Value assigned to <c>Probabilities</c>.</param>
 /// <param name="foundInNgrams">Value assigned to <c>FoundInNgrams</c>.</param>
 public LanguageModelEvaluation(Dictionary<string, double> evalutation, bool foundInNgrams)
 {
     FoundInNgrams = foundInNgrams;
     Probabilities = evalutation;
 }
Пример #23
0
 /// <summary>
 /// Creates a checker with a tokenizer built for the given dictionary.
 /// </summary>
 /// <param name="dictionary">Dictionary passed to the <c>Tokenizer</c>.</param>
 /// <param name="contextSize">Not used by this constructor.</param>
 public Checker(Dictionary dictionary, int contextSize)
 {
     tokenizer = new Tokenizer(dictionary);
 }