/// <summary>
/// Converts prior-format model files matching <paramref name="filePattern"/>
/// under <c>basePath</c> into the current binary format, writing each result
/// as "model_v{version}_{level}.at".
/// </summary>
/// <param name="modelVersion">Model format version: 1, 2, or 3 (French).</param>
/// <param name="filePattern">Search pattern for the prior-format files.</param>
/// <exception cref="ArgumentOutOfRangeException">
/// Thrown when <paramref name="modelVersion"/> is not 1, 2, or 3.
/// </exception>
private static void ConvertToModel(int modelVersion, string filePattern)
{
    string[] modelFiles = Directory.GetFiles(basePath, filePattern);

    // NOTE(review): only the first three matches are converted — presumably
    // the 1/2/3-gram model files; confirm this cap is intentional.
    for (int i = 0; i < Math.Min(modelFiles.Length, 3); i++)
    {
        ILanguageModel model;
        switch (modelVersion)
        {
            case 1: model = new Model1(); break;
            case 2: model = new Model2(); break;
            case 3: model = new Model2French(); break;
            default:
                // FIX: an unrecognized version previously left model null and
                // crashed below with a NullReferenceException; fail fast instead.
                throw new ArgumentOutOfRangeException("modelVersion", modelVersion,
                    "Supported model versions are 1, 2, and 3.");
        }

        using (BinaryReader br = new BinaryReader(File.OpenRead(modelFiles[i])))
        {
            model.ReadFromPriorBinary(br);
        }

        // The model level (n-gram order) is the first digit occurring in the
        // source file name, e.g. "model2.bin" -> level 2.
        int modelLevel = Convert.ToInt32(
            Path.GetFileNameWithoutExtension(modelFiles[i]).Where(c => Char.IsDigit(c)).First().ToString());

        string outPath = Path.Combine(basePath, "model_v" + modelVersion + "_" + modelLevel + ".at");
        using (BinaryWriter bw = new BinaryWriter(File.Create(outPath)))
        {
            model.WriteToBinary(bw);
        }
    }
}
/// <summary>
/// Loads the 1- to 3-gram models for the converter's language (Vietnamese or
/// French) and runs the interactive user-query test against them.
/// </summary>
/// <param name="converter">Determines which model file pattern is used.</param>
/// <param name="modelVersion">Model format version passed to the factory.</param>
public static void Test(AccentConverter converter, int modelVersion)
{
    string modelFile = (converter is VietConverter) ? g_ModelFile : g_FrenchModelFile;

    // Slot n-1 holds the n-gram model; missing gram files leave a null slot.
    List<ILanguageModel> models = new ILanguageModel[5].ToList();
    for (int n = 1; n <= 3; n++)
    {
        string fileName = String.Format(modelFile, n);
        if (!File.Exists(fileName))
        {
            continue;
        }
        Console.WriteLine("Loading {0}-gram model...", n);
        Clocker.Tick();
        // FIX: previously loaded Directory.GetFiles(...)[n - 1], whose ordering
        // is not guaranteed and whose indices shift when a gram file is missing;
        // load the exact file whose existence was just verified instead.
        models[n - 1] = ModelFactory.LoadModel(fileName, modelVersion);
        Clocker.Tock();
    }
    TestUserQuery(models, converter);
}
/// <summary>
/// Creates a corrector backed by the given error and language models, with an
/// optional accent model and an optional flag to skip candidates that never
/// occur in the n-gram data.
/// </summary>
public Corrector(IErrorModel errorModel, ILanguageModel languageModel, IAccentModel accentModel = null, bool skipCandidatesMissingInNgrams = false)
{
    // Plain dependency capture; no validation is performed here.
    this.skipCandidatesMissingInNgrams = skipCandidatesMissingInNgrams;
    this.accentModel = accentModel;
    this.languageModel = languageModel;
    this.errorModel = errorModel;
}
/// <summary>
/// Computes the trigram perplexity 2^(-avg log2 P) of <paramref name="model"/>
/// over <paramref name="testCorpus"/>, mapping words unseen in
/// <paramref name="trainingCorpus"/> to the unknown token, and collecting
/// word/sentence/unknown counts into <paramref name="testStats"/>.
/// </summary>
/// <exception cref="InvalidOperationException">
/// Thrown when any sentence (or the running corpus total) has an infinite
/// log-probability, i.e. the model assigned some trigram probability zero.
/// </exception>
public static double CalculatePerplexity(ILanguageModel model, CorpusParsingResult trainingCorpus, StringParsingResult testCorpus, out TestStats testStats)
{
    testStats = new TestStats
    {
        UniqueUnksFound = new HashSet<string>(),
        UniqueWordsFound = new HashSet<string>()
    };

    double corpusLogProb = 0;
    for (int sentenceIndex = 0; sentenceIndex < testCorpus.Sentences.Count; sentenceIndex++)
    {
        Sentence sentence = testCorpus.Sentences[sentenceIndex];
        testStats.TotalSentencesFound++;

        // Trigram history starts as (START, START) for every sentence.
        string wMinus2 = Constants.Start;
        string wMinus1 = Constants.Start;
        double sentenceLogProb = 0;

        foreach (string rawWord in sentence.Words)
        {
            string token = rawWord;
            if (!trainingCorpus.UniqueWords.ContainsKey(rawWord))
            {
                // Out-of-vocabulary word: score it as the unknown token, but
                // remember the original surface form in the stats.
                token = Constants.Unknown;
                testStats.TotalUnksFound++;
                testStats.UniqueUnksFound.Add(rawWord);
            }
            testStats.TotalWordsFound++;
            testStats.UniqueWordsFound.Add(token);

            sentenceLogProb += Math.Log(model.P(wMinus2, wMinus1, token), 2);

            // Slide the trigram history window forward.
            wMinus2 = wMinus1;
            wMinus1 = token;
        }

        if (Double.IsInfinity(sentenceLogProb))
        {
            throw new InvalidOperationException();
        }

        corpusLogProb += sentenceLogProb;
        if (Double.IsInfinity(corpusLogProb))
        {
            throw new InvalidOperationException();
        }

        // The slow model prints progress every 100 sentences.
        if (model is Problem1Model && sentenceIndex % 100 == 0)
        {
            Console.WriteLine("Now at sentence {0}/{1}", sentenceIndex, testCorpus.Sentences.Count);
        }
    }

    double averageLogProb = corpusLogProb / testCorpus.TotalWordCount;
    return Math.Pow(2, -1 * averageLogProb);
}
/// <summary>
/// Deserializes a language model of the given <paramref name="version"/> from
/// the binary file at <paramref name="file"/>.
/// </summary>
public static ILanguageModel LoadModel(string file, int version)
{
    ILanguageModel model = ModelFactory.CreateModelByVersion(version);
    using (var reader = new BinaryReader(File.OpenRead(file)))
    {
        model.ReadFromBinary(reader);
    }
    return model;
}
/// <summary>
/// Runs <see cref="Perplexity.CalculatePerplexity"/>, prints the perplexity
/// and the collected test statistics to the console, and returns the value.
/// </summary>
private static double CalculatePerplexityWrapper(ILanguageModel model, CorpusParsingResult trainingCorpus, StringParsingResult testCorpus)
{
    Perplexity.TestStats stats;
    double perplexity = Perplexity.CalculatePerplexity(model, trainingCorpus, testCorpus, out stats);

    Console.WriteLine("{0}\tPerplexity", perplexity);
    Console.WriteLine("Test stats:");
    Console.WriteLine(stats.ToString());

    return perplexity;
}
/// <summary>
/// Accumulates weighted accent-candidate scores for the word at index
/// <paramref name="iW"/> into <paramref name="accScoreMap"/>, scanning every
/// n-gram window of the model that contains that word.
/// </summary>
/// <param name="words">Accent-stripped (raw) words of the sentence.</param>
/// <param name="iW">Index of the word being scored.</param>
/// <param name="converter">Converter used to re-apply accent patterns to a raw segment.</param>
/// <param name="weight">Weight of this model's vote in the combined score.</param>
/// <param name="model">Accent n-gram model queried per segment.</param>
/// <param name="n">Order of the model; only 1-3 produce segments here.</param>
/// <param name="accScoreMap">Accented-word -> accumulated score map (mutated in place).</param>
private static void ComputeAccentScore(string[] words, int iW, AccentConverter converter, double weight, ILanguageModel model, int n, Dictionary <string, double> accScoreMap)
{
    int g = n - 1;
    // compute accent probability for this word
    // Clamp the window so every (g+1)-word segment starting at jW stays in
    // bounds and still covers index iW.
    int g3Start = Math.Max(iW - g, 0);
    int g3End = Math.Min(iW + g, words.Length - 1);
    for (int jW = g3Start; jW <= g3End - g; jW++)
    {
        // Build the raw segment of g+1 consecutive words starting at jW.
        string segment = (g == 2) ? String.Format("{0} {1} {2}", words[jW], words[jW + 1], words[jW + 2])
                       : (g == 1) ? String.Format("{0} {1}", words[jW], words[jW + 1])
                       : words[jW];
        Dictionary <string, int> accentsCountMap = model.GetAccents(segment);
        if (accentsCountMap == null)
        {
            // Segment never observed by this model: it contributes no votes.
            continue;
        }
        // Total observations of this raw segment, used to normalize each
        // accent pattern's count into a probability.
        double count = accentsCountMap.Sum(item => item.Value);
        foreach (string accents in accentsCountMap.Keys)
        {
            // Re-apply the accent pattern; null means the pattern does not fit.
            string accSegment = Accentify(segment, accents, converter);
            if (accSegment != null)
            {
                // Split on whitespace and pick the accented form of word iW
                // (offset iW - jW inside the segment).
                string[] accWords = accSegment.Split(new char[0]);
                string accentedWord = accWords[iW - jW];
                double accScore = (accentsCountMap[accents] / count) * weight;
                if (!accScoreMap.ContainsKey(accentedWord))
                {
                    accScoreMap.Add(accentedWord, 0);
                }
                accScoreMap[accentedWord] += accScore;
            }
        }
    }
}
/// <summary>
/// Loads every model file matching <paramref name="modelFilePattern"/> under
/// <c>basePath</c> as version <paramref name="modelVersion"/>, then runs the
/// testing-set and interactive user-query evaluations over the loaded models.
/// </summary>
static void EvaluateModels(int modelVersion, string modelFilePattern)
{
    string[] modelFiles = Directory.GetFiles(basePath, modelFilePattern);

    var models = new List<ILanguageModel>();
    foreach (string path in modelFiles)
    {
        ILanguageModel model = ModelFactory.CreateModelByVersion(modelVersion);
        using (var reader = new BinaryReader(File.OpenRead(path)))
        {
            model.ReadFromBinary(reader);
        }
        models.Add(model);
    }

    var converter = new Utility.VietConverter();
    Trainer.TestTestingSet(models, converter);
    Trainer.TestUserQuery(models, converter);
}
/// <summary>
/// Wires up the dictionary-driven error/language models, the four confusion
/// matrix generators, and the character-frequency counters used when
/// generating spell-checker data files.
/// </summary>
public DictionaryGenerator(Dictionary dictionary, string directory, string outputDirectory)
{
    this.dictionary = dictionary;
    this.directory = directory;
    this.outputDirectory = outputDirectory;

    this.errorModel = new MPSpell.Correction.ErrorModel(dictionary);
    this.languageModel = new LanguageModel(dictionary);

    // Matrix cells start at 1 rather than 0 — presumably add-one smoothing;
    // confirm before changing.
    const int seed = 1;
    // Insertions/deletions may involve the space character; substitutions and
    // transpositions use the plain alphabet.
    char[] lettersWithSpace = dictionary.GetAlphabetForErrorModel(true).ToCharArray();
    char[] letters = dictionary.GetAlphabetForErrorModel().ToCharArray();

    insGen = new InsertionsMatrixGenerator(lettersWithSpace, seed);
    delGen = new DeletionsMatrixGenerator(lettersWithSpace, seed);
    subGen = new SubstitutionsMatrixGenerator(letters, seed);
    trnGen = new TranspositionsMatrixGenerator(letters, seed);

    charCounter = new CharFrequencyCounter(lettersWithSpace.ToStringArray());
    twoCharCounter = new TwoCharFrequencyCounter(lettersWithSpace.ToStringArray());
}
/// <summary>
/// Computes the perplexity of <paramref name="lm"/> over
/// <paramref name="testSet"/> as the exponentiated average log of the product
/// of 1/P(ngram) across all n-grams of size <paramref name="ngramSize"/>.
/// </summary>
/// <returns>
/// The perplexity, or <see cref="double.PositiveInfinity"/> when converting
/// the running product to double over/underflows.
/// </returns>
public static double GetPerplexity(ILanguageModel lm, IList <StringList> testSet, int ngramSize)
{
    var perplexity = new BigDecimal(1d);
    foreach (var sentence in testSet)
    {
        foreach (var ngram in NGramUtils.GetNGrams(sentence, ngramSize))
        {
            var ngramProbability = lm.CalculateProbability(ngram);
            // Keep the product in BigDecimal so tiny probabilities do not
            // underflow a double's range while multiplying.
            perplexity = perplexity.multiply(new BigDecimal(1d).divide(new BigDecimal(ngramProbability), MathContext.DECIMAL128));
        }
    }
    // NOTE(review): doubleValue() here can still overflow to infinity even
    // though the product was kept in BigDecimal — hence the guard below.
    var p = Math.Log(perplexity.doubleValue());
    if (double.IsInfinity(p) || double.IsNaN(p))
    {
        return(double.PositiveInfinity); // over/underflow -> too high perplexity
    }
    var log = new BigDecimal(p);
    // Exponentiates ln(product) divided by testSet.Count — the SENTENCE count,
    // not the n-gram count; TODO confirm this normalization is intended.
    return(Math.Pow(Math.E, log.divide(new BigDecimal(testSet.Count), MathContext.DECIMAL128).doubleValue()));
}
// Verifies that P(w | w-2, w-1) is a proper probability distribution: for
// every admissible trigram context in the two-dog-sentences corpus, the
// positive probabilities over the vocabulary must sum to 1. The sum is only
// asserted when testIfWellDefined is true; otherwise it is just logged.
private void TestWellDefinedProbability(ILanguageModel model, bool testIfWellDefined)
{
    var vocabulary = _twoDogSentencesCorpus.UniqueWords.Keys;

    foreach (var context2 in vocabulary)
    {
        // STOP can never appear in the history.
        if (context2 == Constants.Stop)
        {
            continue;
        }
        foreach (var context1 in vocabulary)
        {
            // STOP is likewise excluded, and START may only follow START.
            if (context1 == Constants.Stop)
            {
                continue;
            }
            if (context1 == Constants.Start && context2 != Constants.Start)
            {
                continue;
            }

            double total = vocabulary
                .Where(w => w != Constants.Start)
                .Select(w => model.P(context2, context1, w))
                .Where(p => p > 0)
                .Sum();

            Debug.WriteLine("Next! Sum was {0}", total);
            if (testIfWellDefined)
            {
                total.Should().Be(1);
            }
        }
    }
}
/// <summary>
/// Initializes output directories, the correction models, the corrector, and
/// the threading layout for a project run.
/// </summary>
private void PrepareProject(Dictionary dictionary, string resultDirectory, string reportDirectory, bool preserveSubfolders)
{
    this.ExportContext = false;
    this.ResultDirectory = resultDirectory;
    this.ReportDirectory = reportDirectory;
    this.dictionary = dictionary;

    // setup models — the accent model is optional and only built when the
    // dictionary ships one.
    this.languageModel = new LanguageModel(dictionary);
    this.errorModel = new ErrorModel(dictionary);
    if (dictionary.IsAccentModelAvailable())
    {
        this.accentModel = new AccentModel(dictionary);
    }
    else
    {
        this.accentModel = null;
    }

    // setup corrector
    this.corrector = new Corrector(errorModel, languageModel, accentModel);

    // Scale the thread count first, then split the files into that many
    // groups; a single file always runs on one thread.
    this.ThreadsAvailable = this.ScaleThreads();
    this.filesGroups = this.DivadeIntoGroups(this.ThreadsAvailable); // (sic: "Divade" is the existing project spelling)
    this.ThreadsUsed = this.FilesToProcess.Count > 1 ? filesGroups.Length : 1;

    // other settings
    PreserveSubfolders = preserveSubfolders;
}
/// <summary>
/// Bundles the spell-check lookup, language model, and next-word model that
/// together drive suggestion generation.
/// </summary>
public Auger(IPrefixLookup spellchecker, ILanguageModel languageModel, INextWordModel nextWordModel)
{
    // Straight dependency capture; no validation here.
    NextWordModel = nextWordModel;
    LanguageModel = languageModel;
    SpellChecker = spellchecker;
}
/// <summary>
/// Trains accent n-gram models for every order in [minGram, maxGram] whose
/// output file does not already exist, and writes each model to
/// String.Format(outModelFilePattern, n).
/// </summary>
/// <param name="outModelFilePattern">Format pattern for output file paths; {0} is the gram order.</param>
/// <param name="inputTrainingFiles">Training corpus files parsed into word segments.</param>
/// <param name="modelVersion">Model format version passed to the factory.</param>
/// <param name="minGram">Smallest n-gram order to build.</param>
/// <param name="maxGram">Largest n-gram order to build.</param>
/// <param name="converter">Strips accents from words (accented -> raw form).</param>
/// <param name="learnKnownWordsOnly">
/// When true, an n-gram is only learned if every raw word in it appears in the
/// dictionary file.
/// </param>
public static void Train(string outModelFilePattern, TupleList <string, string> inputTrainingFiles, int modelVersion, int minGram, int maxGram, AccentConverter converter, bool learnKnownWordsOnly = true)
{
    // Collect only the orders whose model file is missing; existing files are
    // treated as already trained and skipped.
    List <int> grams = new List <int>();
    for (int n = minGram; n <= maxGram; n++)
    {
        if (!File.Exists(String.Format(outModelFilePattern, n)))
        {
            grams.Add(n);
        }
    }
    if (grams.Count == 0)
    {
        return;
    }
    // Load dictionary of raw words
    Dictionary <string, int> dictionary = learnKnownWordsOnly ? LazyTrainer.ReadDictionary(DataManager.DictionaryFile) : null;
    // Load segments from training data
    List <List <string> > segments = TextParser.ParseData(inputTrainingFiles);
    // Reused across all n-grams to avoid churn: sbRaw holds the space-joined
    // accent-stripped words, sbAcc the concatenated accented words.
    StringBuilder sbRaw = new StringBuilder();
    StringBuilder sbAcc = new StringBuilder();
    foreach (int n in grams)
    {
        int iG = n - 1; // iG = highest word offset inside one n-gram window
        Console.WriteLine("Building {0}-gram ...", iG + 1);
        Clocker.Tick();
        using (BinaryWriter bwModel = new BinaryWriter(File.Create(String.Format(outModelFilePattern, iG + 1))))
        {
            ILanguageModel igGram = ModelFactory.CreateModelByVersion(modelVersion);
            for (int iS = 0; iS < segments.Count; iS++)
            {
                List <string> words = segments[iS];
                // Slide an n-word window across the segment.
                for (int i = 0; i < words.Count - iG; i++)
                {
                    sbRaw.Clear();
                    sbAcc.Clear();
                    bool shouldProceed = true;
                    if (learnKnownWordsOnly)
                    {
                        // Abort this window as soon as any raw word is not in
                        // the dictionary.
                        for (int g = 0; g <= iG; g++)
                        {
                            string accWord = words[i + g];
                            string rawWord = converter.Convert(accWord);
                            if (!dictionary.ContainsKey(rawWord))
                            {
                                shouldProceed = false;
                                break;
                            }
                            sbAcc.Append(accWord);
                            sbRaw.Append(rawWord);
                            if (g < iG)
                            {
                                sbRaw.Append(" ");
                            }
                        }
                    }
                    else
                    {
                        // Unfiltered path: learn every window.
                        for (int g = 0; g <= iG; g++)
                        {
                            sbAcc.Append(words[i + g]);
                            sbRaw.Append(converter.Convert(words[i + g]));
                            if (g < iG)
                            {
                                sbRaw.Append(" ");
                            }
                        }
                    }
                    if (shouldProceed)
                    {
                        // Store the raw segment keyed with its accent pattern.
                        string accents = ExtractAccents(sbAcc.ToString(), converter);
                        igGram.Add(sbRaw.ToString(), accents);
                    }
                }
            }
            igGram.WriteToBinary(bwModel);
        }
        Clocker.Tock();
    }
}