/// <summary>
/// Splits the corpus at <paramref name="corpusPath"/> into training, validation and
/// test partitions (80/10/10 arguments passed to SplitCorpus), trains
/// <paramref name="lm"/> on the training partition and reports to the console the
/// perplexity and unknown-word count for the validation and test partitions.
/// </summary>
/// <param name="corpusPath">Path of the corpus handed to SplitCorpus.</param>
/// <param name="lm">Language model that is trained and then evaluated.</param>
private static void Analyze(string corpusPath, LanguageModel lm)
{
    Console.WriteLine("Corpus path: {0}", corpusPath);
    Console.WriteLine();

    Console.WriteLine("Splitting corpus.");
    List<List<string>> partitions = SplitCorpus(corpusPath, 80, 10, 10);

    // Partition sizes: index 0 = training, 1 = validation, 2 = test.
    Console.WriteLine("Splitted corpus as follow:");
    List<string> trainingSet = partitions[0];
    Console.WriteLine("Training: {0}", trainingSet.Count);
    List<string> validationSet = partitions[1];
    Console.WriteLine("Validate: {0}", validationSet.Count);
    List<string> testSet = partitions[2];
    Console.WriteLine("Test: {0}", testSet.Count);

    Console.WriteLine("Training model.");
    lm.Train(trainingSet);

    Console.WriteLine("Calculate Perplextiy with validate set.");
    PerplexityCalculator calculator = new PerplexityCalculator(lm);

    // Validation pass.
    int unknownCount = 0;
    double validationPerplexity = calculator.GetPerplexity(validationSet, out unknownCount);
    Console.WriteLine("Found {0} unknown words:", unknownCount);
    Console.WriteLine("Perplexity of validation is {0}", validationPerplexity);

    // Test pass, using the same calculator.
    double testPerplexity = calculator.GetPerplexity(testSet, out unknownCount);
    Console.WriteLine("Found {0} unknown words:", unknownCount);
    Console.WriteLine("Perplexity of testing is {0}", testPerplexity);
}
/// <summary>
/// Tries a grid of per-order weight combinations on <paramref name="languageModel"/>
/// and returns the combination(s) that produced the lowest perplexity on
/// <paramref name="corpus"/>.
/// </summary>
/// <remarks>
/// The weights tuned are <c>Settings.BackOffBetaPerOrder</c> when the model is an
/// <c>ExampleBackOffModelWithDiscounting</c>, otherwise
/// <c>Settings.LinearInterpolationLambdaPerOrder</c>. Each candidate sets every order
/// to <c>i / rangeOfTrial</c> and one order to <c>k / rangeOfTrial</c>; only candidates
/// whose weights sum to exactly 1 are evaluated. The settings dictionary is modified
/// in place and is left holding the last values tried, not the optimum.
/// <paramref name="corpus"/> is passed to the perplexity calculator once per evaluated
/// candidate.
/// </remarks>
/// <param name="rangeOfTrial">Denominator of the grid; steps 1 .. rangeOfTrial - 1 are tried.</param>
/// <param name="languageModel">Model whose per-order weights are tuned.</param>
/// <param name="corpus">Sentences used to measure perplexity.</param>
/// <returns>All evaluated combinations that share the minimum perplexity.</returns>
/// <exception cref="ArgumentNullException"><paramref name="languageModel"/> is null.</exception>
/// <exception cref="InvalidOperationException">
/// No candidate combination summed to 1, so nothing was evaluated.
/// </exception>
public static List<DoubleCombination> GetOptimumCombination(int rangeOfTrial, LanguageModel languageModel, IEnumerable<string> corpus)
{
    if (languageModel == null)
    {
        throw new ArgumentNullException("languageModel");
    }

    Dictionary<double, List<DoubleCombination>> perplexityResults = new Dictionary<double, List<DoubleCombination>>();
    Dictionary<int, double> orderValues = languageModel is ExampleBackOffModelWithDiscounting
        ? languageModel.Settings.BackOffBetaPerOrder
        : languageModel.Settings.LinearInterpolationLambdaPerOrder;

    int order = languageModel.Settings.NGramOrder;

    for (int i = 1; i < rangeOfTrial; i++)
    {
        for (int j = 1; j <= order; j++)
        {
            // Initialize round: every order starts at i / rangeOfTrial.
            for (int k = 1; k <= order; k++)
            {
                orderValues[k] = i / (double)rangeOfTrial;
            }

            // Set each of the values to try for order j.
            for (int k = 1; k < rangeOfTrial; k++)
            {
                orderValues[j] = k / (double)rangeOfTrial;

                // The weights are (order - 1) copies of i / rangeOfTrial plus one
                // k / rangeOfTrial. Comparing the integer numerators decides
                // "sum == 1" exactly; summing the doubles and testing != 1 can
                // reject valid combinations because of rounding error.
                int numeratorSum = (i * (order - 1)) + k;
                if (numeratorSum != rangeOfTrial)
                {
                    continue;
                }

                languageModel.ClearCacheForDifferentSettings();
                PerplexityCalculator calculator = new PerplexityCalculator(languageModel);
                int unkWords = 0;
                double perplexity = calculator.GetPerplexity(corpus, out unkWords);

                List<DoubleCombination> bucket;
                if (!perplexityResults.TryGetValue(perplexity, out bucket))
                {
                    bucket = new List<DoubleCombination>();
                    perplexityResults[perplexity] = bucket;
                }

                // Snapshot the current weights; orderValues is keyed from 1,
                // the combination is indexed from 0.
                DoubleCombination configuration = new DoubleCombination(languageModel.Settings.NGramOrder);
                for (int l = 0; l < configuration.NOrder; l++)
                {
                    configuration[l] = orderValues[l + 1];
                }

                bucket.Add(configuration);
            }
        }
    }

    if (perplexityResults.Count == 0)
    {
        // Same exception type Keys.Min() would raise on an empty sequence,
        // but with a message that explains the cause.
        throw new InvalidOperationException(
            "No weight combination summing to 1 could be formed for the given rangeOfTrial and n-gram order.");
    }

    return perplexityResults[perplexityResults.Keys.Min()];
}