Exemple #1
0
        /// <summary>
        /// Re-serializes model files found in <c>basePath</c>: each file is read with
        /// <c>ReadFromPriorBinary</c> and written back out with <c>WriteToBinary</c> under the
        /// name <c>model_v{modelVersion}_{level}.at</c>, where <c>level</c> is the first digit
        /// that appears in the source file name. At most the first three matching files are processed.
        /// </summary>
        /// <param name="modelVersion">Selects the model class: 1 = Model1, 2 = Model2, 3 = Model2French.</param>
        /// <param name="filePattern">Search pattern handed to <c>Directory.GetFiles</c>.</param>
        /// <exception cref="ArgumentOutOfRangeException">
        /// <paramref name="modelVersion"/> is not 1, 2 or 3 and at least one file matched.
        /// </exception>
        private static void ConvertToModel(int modelVersion, string filePattern)
        {
            string[] modelFiles = Directory.GetFiles(basePath, filePattern);

            // Loop bound is invariant; compute it once.
            int fileCount = Math.Min(modelFiles.Length, 3);

            for (int i = 0; i < fileCount; i++)
            {
                ILanguageModel model;
                switch (modelVersion)
                {
                case 1:
                    model = new Model1();
                    break;

                case 2:
                    model = new Model2();
                    break;

                case 3:
                    model = new Model2French();
                    break;

                default:
                    // Previously fell through with model == null and crashed with a
                    // NullReferenceException only after the input file had been opened.
                    throw new ArgumentOutOfRangeException("modelVersion", modelVersion, "Unsupported model version; expected 1, 2 or 3.");
                }

                using (BinaryReader br = new BinaryReader(File.OpenRead(modelFiles[i])))
                {
                    model.ReadFromPriorBinary(br);
                }

                // The first digit character in the file name (without extension) is used as the level.
                int modelLevel = Convert.ToInt32(Path.GetFileNameWithoutExtension(modelFiles[i]).Where(c => Char.IsDigit(c)).First().ToString());
                using (BinaryWriter bw = new BinaryWriter(File.Create(Path.Combine(basePath, "model_v" + modelVersion + "_" + modelLevel + ".at"))))
                {
                    model.WriteToBinary(bw);
                }
            }
        }
Exemple #2
0
        /// <summary>
        /// Loads the 1- to 3-gram models that exist on disk for the given converter and runs
        /// <c>TestUserQuery</c> on them.
        /// </summary>
        /// <param name="converter">
        /// Selects the model file pattern: <c>g_ModelFile</c> when it is a <c>VietConverter</c>,
        /// otherwise <c>g_FrenchModelFile</c>.
        /// </param>
        /// <param name="modelVersion">Model version forwarded to <c>ModelFactory.LoadModel</c>.</param>
        public static void Test(AccentConverter converter, int modelVersion)
        {
            string modelFile = (converter is VietConverter) ? g_ModelFile : g_FrenchModelFile;

            // Five slots are kept as in the original; only indices 0..2 are ever filled here,
            // the rest stay null.
            List <ILanguageModel> models = new ILanguageModel[5].ToList();

            for (int n = 1; n <= 3; n++)
            {
                string fileName = String.Format(modelFile, n);
                if (!File.Exists(fileName))
                {
                    continue;
                }

                Console.WriteLine("Loading {0}-gram model...", n);

                Clocker.Tick();

                // Load exactly the file whose existence was just checked. The previous code
                // indexed a Directory.GetFiles result by n - 1, which relies on an unspecified
                // listing order and can go out of range when a lower-order file is missing.
                models[n - 1] = ModelFactory.LoadModel(fileName, modelVersion);

                Clocker.Tock();
            }

            TestUserQuery(models, converter);
        }
Exemple #3
0
        /// <summary>
        /// Creates a corrector from the supplied models.
        /// </summary>
        /// <param name="errorModel">Error model stored on the instance.</param>
        /// <param name="languageModel">Language model stored on the instance.</param>
        /// <param name="accentModel">Optional accent model; defaults to <c>null</c>.</param>
        /// <param name="skipCandidatesMissingInNgrams">Flag stored on the instance; defaults to <c>false</c>.</param>
        public Corrector(IErrorModel errorModel, ILanguageModel languageModel, IAccentModel accentModel = null, bool skipCandidatesMissingInNgrams = false)
        {
            // Behavioural switch first, then the models it applies to.
            this.skipCandidatesMissingInNgrams = skipCandidatesMissingInNgrams;

            this.accentModel = accentModel;
            this.languageModel = languageModel;
            this.errorModel = errorModel;
        }
        /// <summary>
        /// Computes the perplexity of <paramref name="model"/> over <paramref name="testCorpus"/>
        /// using trigram probabilities <c>model.P(w-2, w-1, w)</c> in log base 2.
        /// </summary>
        /// <param name="model">Model queried through <c>P(previousPrevious, previous, word)</c>.</param>
        /// <param name="trainingCorpus">
        /// Words absent from <c>trainingCorpus.UniqueWords</c> are replaced by <c>Constants.Unknown</c>
        /// before being scored.
        /// </param>
        /// <param name="testCorpus">Sentences to score; <c>TotalWordCount</c> is the normalizer.</param>
        /// <param name="testStats">Receives sentence/word/unknown counters gathered during the pass.</param>
        /// <returns><c>2 ^ -(sum of log2 probabilities / testCorpus.TotalWordCount)</c>.</returns>
        /// <exception cref="InvalidOperationException">
        /// The log-probability of a sentence, or the running corpus total, became infinite.
        /// </exception>
        public static double CalculatePerplexity(ILanguageModel model, CorpusParsingResult trainingCorpus, StringParsingResult testCorpus, out TestStats testStats)
        {
            testStats = new TestStats
            {
                UniqueUnksFound = new HashSet<string>(),
                UniqueWordsFound = new HashSet<string>()
            };

            double logSumOfCorpus = 0;
            for (int k = 0; k < testCorpus.Sentences.Count; k++)
            {
                Sentence sentence = testCorpus.Sentences[k];
                double logOfSentence = 0;

                // Every sentence starts with two start markers as trigram history.
                string previousWord = Constants.Start;
                string previousPreviousWord = Constants.Start;

                testStats.TotalSentencesFound++;
                for (int i = 0; i < sentence.Words.Length; i++)
                {
                    string originalWord = sentence.Words[i];
                    string calculatedWord = originalWord;
                    if (!trainingCorpus.UniqueWords.ContainsKey(originalWord))
                    {
                        // Out-of-vocabulary: score as the unknown token, but remember the real word.
                        calculatedWord = Constants.Unknown;
                        testStats.TotalUnksFound++;
                        testStats.UniqueUnksFound.Add(originalWord);
                    }
                    testStats.TotalWordsFound++;
                    testStats.UniqueWordsFound.Add(calculatedWord);

                    double modelP = model.P(previousPreviousWord, previousWord, calculatedWord);
                    logOfSentence += Math.Log(modelP, 2);

                    previousPreviousWord = previousWord;
                    previousWord = calculatedWord;
                }

                if (Double.IsInfinity(logOfSentence))
                {
                    // Typically means the model returned probability 0 for some word.
                    throw new InvalidOperationException(String.Format(
                        "Log-probability of test sentence {0} is infinite; the model returned an unusable probability.", k));
                }
                logSumOfCorpus += logOfSentence;
                if (Double.IsInfinity(logSumOfCorpus))
                {
                    throw new InvalidOperationException(String.Format(
                        "Accumulated corpus log-probability became infinite at test sentence {0}.", k));
                }

                // Progress output only for Problem1Model, every 100th sentence.
                if (model is Problem1Model && k % 100 == 0)
                {
                    Console.WriteLine("Now at sentence {0}/{1}", k, testCorpus.Sentences.Count);
                }
            }

            double sum = logSumOfCorpus / testCorpus.TotalWordCount;
            return Math.Pow(2, -1*sum);
        }
Exemple #5
0
        /// <summary>
        /// Creates a model for <paramref name="version"/> via <c>ModelFactory.CreateModelByVersion</c>
        /// and fills it from the binary file at <paramref name="file"/>.
        /// </summary>
        /// <param name="file">Path of the binary model file to read.</param>
        /// <param name="version">Model version passed to the factory.</param>
        /// <returns>The populated model.</returns>
        public static ILanguageModel LoadModel(string file, int version)
        {
            ILanguageModel loaded = ModelFactory.CreateModelByVersion(version);

            using (var reader = new BinaryReader(File.OpenRead(file)))
            {
                loaded.ReadFromBinary(reader);
            }

            return loaded;
        }
Exemple #6
0
        /// <summary>
        /// Runs <c>Perplexity.CalculatePerplexity</c>, prints the perplexity and the collected
        /// test statistics to the console, and returns the perplexity.
        /// </summary>
        /// <param name="model">Model to evaluate.</param>
        /// <param name="trainingCorpus">Training corpus forwarded to the calculation.</param>
        /// <param name="testCorpus">Test corpus forwarded to the calculation.</param>
        /// <returns>The computed perplexity.</returns>
        private static double CalculatePerplexityWrapper(ILanguageModel model, CorpusParsingResult trainingCorpus, StringParsingResult testCorpus)
        {
            Perplexity.TestStats stats;
            double result = Perplexity.CalculatePerplexity(model, trainingCorpus, testCorpus, out stats);

            // Report: value first, then the statistics block.
            Console.WriteLine("{0}\tPerplexity", result);
            Console.WriteLine("Test stats:");
            Console.WriteLine(stats.ToString());

            return result;
        }
Exemple #7
0
        /// <summary>
        /// Accumulates weighted accent scores for the word at index <paramref name="iW"/> into
        /// <paramref name="accScoreMap"/>, using every window of <paramref name="n"/> consecutive
        /// words that contains that word.
        /// </summary>
        /// <param name="words">Input words.</param>
        /// <param name="iW">Index of the word being scored.</param>
        /// <param name="converter">Converter forwarded to <c>Accentify</c>.</param>
        /// <param name="weight">Multiplier applied to each relative frequency.</param>
        /// <param name="model">Model queried through <c>GetAccents(segment)</c>.</param>
        /// <param name="n">Window size in words; segments are built for sizes 1, 2 and 3.</param>
        /// <param name="accScoreMap">Map from accented word to accumulated score; updated in place.</param>
        private static void ComputeAccentScore(string[] words, int iW,
                                               AccentConverter converter, double weight,
                                               ILanguageModel model, int n, Dictionary <string, double> accScoreMap)
        {
            int span = n - 1;

            // Window start positions such that the window still covers iW and stays in bounds.
            int firstStart = Math.Max(iW - span, 0);
            int lastStart  = Math.Min(iW + span, words.Length - 1) - span;

            for (int start = firstStart; start <= lastStart; start++)
            {
                string segment;
                switch (span)
                {
                case 2:
                    segment = String.Format("{0} {1} {2}", words[start], words[start + 1], words[start + 2]);
                    break;

                case 1:
                    segment = String.Format("{0} {1}", words[start], words[start + 1]);
                    break;

                default:
                    segment = words[start];
                    break;
                }

                Dictionary <string, int> accentCounts = model.GetAccents(segment);
                if (accentCounts == null)
                {
                    continue;
                }

                double totalCount = accentCounts.Sum(kv => kv.Value);

                foreach (KeyValuePair <string, int> entry in accentCounts)
                {
                    string accentedSegment = Accentify(segment, entry.Key, converter);
                    if (accentedSegment == null)
                    {
                        continue;
                    }

                    // Empty separator array => split on whitespace.
                    string[] accentedWords = accentedSegment.Split(new char[0]);

                    // Position of the target word inside this window.
                    string target = accentedWords[iW - start];
                    double contribution = (entry.Value / totalCount) * weight;

                    // Missing keys start from 0 (TryGetValue leaves the default).
                    double running;
                    accScoreMap.TryGetValue(target, out running);
                    accScoreMap[target] = running + contribution;
                }
            }
        }
Exemple #8
0
        /// <summary>
        /// Loads every model file in <c>basePath</c> matching <paramref name="modelFilePattern"/>
        /// and runs <c>Trainer.TestTestingSet</c> and <c>Trainer.TestUserQuery</c> on them with a
        /// <c>VietConverter</c>.
        /// </summary>
        /// <param name="modelVersion">Model version passed to <c>ModelFactory.CreateModelByVersion</c>.</param>
        /// <param name="modelFilePattern">Search pattern handed to <c>Directory.GetFiles</c>.</param>
        static void EvaluateModels(int modelVersion, string modelFilePattern)
        {
            var loadedModels = new List <ILanguageModel>();

            foreach (string path in Directory.GetFiles(basePath, modelFilePattern))
            {
                using (var reader = new BinaryReader(File.OpenRead(path)))
                {
                    ILanguageModel current = ModelFactory.CreateModelByVersion(modelVersion);
                    current.ReadFromBinary(reader);
                    loadedModels.Add(current);
                }
            }

            var vietConverter = new Utility.VietConverter();

            Trainer.TestTestingSet(loadedModels, vietConverter);
            Trainer.TestUserQuery(loadedModels, vietConverter);
        }
Exemple #9
0
        /// <summary>
        /// Sets up the generator: stores the dictionary and directories, builds the error and
        /// language models from the dictionary, and creates the four edit-matrix generators and
        /// the two character-frequency counters.
        /// </summary>
        /// <param name="dictionary">Dictionary supplying the models and the error-model alphabet.</param>
        /// <param name="directory">Stored as the instance's <c>directory</c>.</param>
        /// <param name="outputDirectory">Stored as the instance's <c>outputDirectory</c>.</param>
        public DictionaryGenerator(Dictionary dictionary, string directory, string outputDirectory)
        {
            this.dictionary = dictionary;
            this.outputDirectory = outputDirectory;
            this.directory = directory;
            errorModel = new MPSpell.Correction.ErrorModel(dictionary);
            languageModel = new LanguageModel(dictionary);

            // Every matrix cell starts from this value.
            int seed = 1;

            // Two alphabets: one obtained with the 'true' argument (used for insertions,
            // deletions and the counters) and one with the default argument.
            char[] spacedLetters = dictionary.GetAlphabetForErrorModel(true).ToCharArray();
            char[] plainLetters = dictionary.GetAlphabetForErrorModel().ToCharArray();

            insGen = new InsertionsMatrixGenerator(spacedLetters, seed);
            delGen = new DeletionsMatrixGenerator(spacedLetters, seed);
            subGen = new SubstitutionsMatrixGenerator(plainLetters, seed);
            trnGen = new TranspositionsMatrixGenerator(plainLetters, seed);

            charCounter = new CharFrequencyCounter(spacedLetters.ToStringArray());
            twoCharCounter = new TwoCharFrequencyCounter(spacedLetters.ToStringArray());
        }
Exemple #10
0
        /// <summary>
        /// Computes a perplexity figure for <paramref name="lm"/> over <paramref name="testSet"/>:
        /// the product of inverse n-gram probabilities is accumulated, its natural log is divided
        /// by the number of sentences, and <c>e</c> is raised to that value.
        /// </summary>
        /// <param name="lm">Model queried through <c>CalculateProbability</c>.</param>
        /// <param name="testSet">Sentences to evaluate.</param>
        /// <param name="ngramSize">Size passed to <c>NGramUtils.GetNGrams</c>.</param>
        /// <returns>
        /// The perplexity, or <c>double.PositiveInfinity</c> when the log of the product is
        /// infinite or NaN.
        /// </returns>
        public static double GetPerplexity(ILanguageModel lm, IList <StringList> testSet, int ngramSize)
        {
            var product = new BigDecimal(1d);

            foreach (var sentence in testSet)
            {
                foreach (var ngram in NGramUtils.GetNGrams(sentence, ngramSize))
                {
                    var probability = lm.CalculateProbability(ngram);
                    var inverse = new BigDecimal(1d).divide(new BigDecimal(probability), MathContext.DECIMAL128);
                    product = product.multiply(inverse);
                }
            }

            var logOfProduct = Math.Log(product.doubleValue());

            if (double.IsInfinity(logOfProduct) || double.IsNaN(logOfProduct))
            {
                return(double.PositiveInfinity); // over/underflow -> too high perplexity
            }

            var sentenceCount = new BigDecimal(testSet.Count);
            var meanLog = new BigDecimal(logOfProduct).divide(sentenceCount, MathContext.DECIMAL128).doubleValue();

            return(Math.Pow(Math.E, meanLog));
        }
        /// <summary>
        /// For every valid two-word history drawn from <c>_twoDogSentencesCorpus.UniqueWords</c>,
        /// sums <c>model.P(w-2, w-1, w)</c> over all words except the start marker and, when
        /// <paramref name="testIfWellDefined"/> is set, asserts that the sum is 1.
        /// </summary>
        /// <param name="model">Model under test.</param>
        /// <param name="testIfWellDefined">When false the sums are only written to the debug output.</param>
        private void TestWellDefinedProbability(ILanguageModel model, bool testIfWellDefined)
        {
            // Verify the function for P is well defined for trigrams that exist
            foreach (var wordminus2 in _twoDogSentencesCorpus.UniqueWords.Keys)
            {
                if (wordminus2 == Constants.Stop)
                {
                    continue;
                }

                foreach (var wordminus1 in _twoDogSentencesCorpus.UniqueWords.Keys)
                {
                    // Skip histories ending in the stop marker, and a start marker that
                    // follows a non-start word.
                    if (wordminus1 == Constants.Stop || (wordminus2 != Constants.Start && wordminus1 == Constants.Start))
                    {
                        continue;
                    }

                    double total = 0;
                    foreach (var word in _twoDogSentencesCorpus.UniqueWords.Keys.Where(w => w != Constants.Start))
                    {
                        double pml = model.P(wordminus2, wordminus1, word);
                        if (pml > 0)
                        {
                            total += pml;
                        }
                    }
                    Debug.WriteLine("Next! Sum was {0}", total);
                    if (testIfWellDefined)
                    {
                        // A sum of doubles is subject to rounding; compare with a tolerance
                        // instead of exact equality.
                        total.Should().BeApproximately(1, 1e-9);
                    }
                }
            }
        }
Exemple #12
0
        /// <summary>
        /// Initializes the project state: output directories, the dictionary and its models,
        /// the corrector, the threading/grouping figures, and the subfolder setting.
        /// </summary>
        /// <param name="dictionary">Dictionary used to build the language, error and (if available) accent models.</param>
        /// <param name="resultDirectory">Assigned to <c>ResultDirectory</c>.</param>
        /// <param name="reportDirectory">Assigned to <c>ReportDirectory</c>.</param>
        /// <param name="preserveSubfolders">Assigned to <c>PreserveSubfolders</c>.</param>
        private void PrepareProject(Dictionary dictionary, string resultDirectory, string reportDirectory, bool preserveSubfolders)
        {
            ExportContext = false;
            ResultDirectory = resultDirectory;
            ReportDirectory = reportDirectory;

            this.dictionary = dictionary;

            // Models: the accent model is optional and only built when the dictionary offers one.
            languageModel = new LanguageModel(dictionary);
            errorModel = new ErrorModel(dictionary);
            if (dictionary.IsAccentModelAvailable())
            {
                accentModel = new AccentModel(dictionary);
            }
            else
            {
                accentModel = null;
            }

            // Corrector built on top of the models above.
            corrector = new Corrector(errorModel, languageModel, accentModel);

            // Threading: scale first, then split the files into that many groups.
            ThreadsAvailable = ScaleThreads();
            filesGroups = DivadeIntoGroups(ThreadsAvailable);
            if (FilesToProcess.Count > 1)
            {
                ThreadsUsed = filesGroups.Length;
            }
            else
            {
                ThreadsUsed = 1;
            }

            // Remaining settings.
            this.PreserveSubfolders = preserveSubfolders;
        }
Exemple #13
0
 /// <summary>
 /// Creates an instance wired to the given spell checker and models.
 /// </summary>
 /// <param name="spellchecker">Assigned to <c>SpellChecker</c>.</param>
 /// <param name="languageModel">Assigned to <c>LanguageModel</c>.</param>
 /// <param name="nextWordModel">Assigned to <c>NextWordModel</c>.</param>
 public Auger(IPrefixLookup spellchecker, ILanguageModel languageModel, INextWordModel nextWordModel)
 {
     this.SpellChecker = spellchecker;
     this.LanguageModel = languageModel;
     this.NextWordModel = nextWordModel;
 }
Exemple #14
0
        /// <summary>
        /// Builds one model file per n-gram order in [<paramref name="minGram"/>, <paramref name="maxGram"/>]
        /// from the parsed training data. Orders whose output file already exists are skipped;
        /// if none remain the method returns without loading anything.
        /// </summary>
        /// <param name="outModelFilePattern">Format string taking the n-gram order; yields the output file path.</param>
        /// <param name="inputTrainingFiles">Training inputs handed to <c>TextParser.ParseData</c>.</param>
        /// <param name="modelVersion">Model version passed to <c>ModelFactory.CreateModelByVersion</c>.</param>
        /// <param name="minGram">Smallest n-gram order to build.</param>
        /// <param name="maxGram">Largest n-gram order to build.</param>
        /// <param name="converter">Converts each word to its raw form; also forwarded to <c>ExtractAccents</c>.</param>
        /// <param name="learnKnownWordsOnly">
        /// When true, an n-gram is only added if the raw form of every word in it is a key of the
        /// dictionary read from <c>DataManager.DictionaryFile</c>.
        /// </param>
        public static void Train(string outModelFilePattern, TupleList <string, string> inputTrainingFiles, int modelVersion, int minGram, int maxGram, AccentConverter converter, bool learnKnownWordsOnly = true)
        {
            // Collect the orders that still need to be built.
            List <int> pendingOrders = new List <int>();
            for (int order = minGram; order <= maxGram; order++)
            {
                bool alreadyBuilt = File.Exists(String.Format(outModelFilePattern, order));
                if (!alreadyBuilt)
                {
                    pendingOrders.Add(order);
                }
            }

            if (pendingOrders.Count == 0)
            {
                return;
            }

            // The dictionary of raw words is only needed when filtering on known words.
            Dictionary <string, int> knownWords = null;
            if (learnKnownWordsOnly)
            {
                knownWords = LazyTrainer.ReadDictionary(DataManager.DictionaryFile);
            }

            // Segments of words parsed from the training data.
            List <List <string> > segments = TextParser.ParseData(inputTrainingFiles);

            // Reused across all n-grams; cleared before each one.
            StringBuilder rawText = new StringBuilder();
            StringBuilder accentedText = new StringBuilder();

            foreach (int order in pendingOrders)
            {
                int lastOffset = order - 1;
                Console.WriteLine("Building {0}-gram ...", order);

                Clocker.Tick();

                using (BinaryWriter output = new BinaryWriter(File.Create(String.Format(outModelFilePattern, order))))
                {
                    ILanguageModel gramModel = ModelFactory.CreateModelByVersion(modelVersion);

                    foreach (List <string> segment in segments)
                    {
                        for (int start = 0; start < segment.Count - lastOffset; start++)
                        {
                            rawText.Clear();
                            accentedText.Clear();

                            bool accepted = true;
                            for (int offset = 0; offset <= lastOffset; offset++)
                            {
                                string accentedWord = segment[start + offset];
                                string rawWord = converter.Convert(accentedWord);

                                // Reject the whole n-gram as soon as one word is unknown.
                                if (learnKnownWordsOnly && !knownWords.ContainsKey(rawWord))
                                {
                                    accepted = false;
                                    break;
                                }

                                // Accented words are concatenated directly; raw words are space-separated.
                                accentedText.Append(accentedWord);
                                rawText.Append(rawWord);
                                if (offset < lastOffset)
                                {
                                    rawText.Append(" ");
                                }
                            }

                            if (!accepted)
                            {
                                continue;
                            }

                            string accents = ExtractAccents(accentedText.ToString(), converter);
                            gramModel.Add(rawText.ToString(), accents);
                        }
                    }

                    gramModel.WriteToBinary(output);
                }

                Clocker.Tock();
            }
        }