Example 1
        private static void Analyze(string corpusPath, LanguageModel lm)
        {
            Console.WriteLine("Corpus path: {0}", corpusPath);
            Console.WriteLine();

            Console.WriteLine("Splitting corpus.");
            List<List<string>> splitCorpus = SplitCorpus(corpusPath, 80, 10, 10);
            Console.WriteLine("Split corpus as follows:");
            Console.WriteLine("Training: {0}", splitCorpus[0].Count);
            Console.WriteLine("Validate: {0}", splitCorpus[1].Count);
            Console.WriteLine("Test: {0}", splitCorpus[2].Count);

            Console.WriteLine("Training model.");
            lm.Train(splitCorpus[0]);

            Console.WriteLine("Calculating perplexity with validation set.");
            PerplexityCalculator perplexityCalculator = new PerplexityCalculator(lm);
            int unkWords = 0;
            double perplexity = perplexityCalculator.GetPerplexity(splitCorpus[1], out unkWords);
            Console.WriteLine("Found {0} unknown words.", unkWords);
            Console.WriteLine("Perplexity of validation is {0}", perplexity);

            Console.WriteLine("Calculating perplexity with test set.");
            perplexity = perplexityCalculator.GetPerplexity(splitCorpus[2], out unkWords);
            Console.WriteLine("Found {0} unknown words.", unkWords);
            Console.WriteLine("Perplexity of testing is {0}", perplexity);
        }
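
The `SplitCorpus` helper called above is not shown on this page. Below is a minimal sketch of what it might look like, assuming the corpus file holds one sentence per line and that the three percentages add up to 100; the signature is taken from the call above, while the body is only an illustration, not the original implementation.

        private static List<List<string>> SplitCorpus(string corpusPath, int trainPercent, int validatePercent, int testPercent)
        {
            // Read the corpus as one sentence per line (assumption).
            List<string> lines = new List<string>(System.IO.File.ReadAllLines(corpusPath));

            // Size of the first two partitions, derived from the requested percentages.
            int trainCount = lines.Count * trainPercent / 100;
            int validateCount = lines.Count * validatePercent / 100;

            // Remaining lines go to the test partition.
            return new List<List<string>>
            {
                lines.GetRange(0, trainCount),
                lines.GetRange(trainCount, validateCount),
                lines.GetRange(trainCount + validateCount, lines.Count - trainCount - validateCount)
            };
        }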
Example 2
        public static List<DoubleCombination> GetOptimumCombination(int rangeOfTrial, LanguageModel languageModel, IEnumerable<string> corpus)
        {
            if (languageModel == null) throw new ArgumentNullException(nameof(languageModel));

            Dictionary<double, List<DoubleCombination>> perplexityResults = new Dictionary<double, List<DoubleCombination>>();
            Dictionary<int, double> orderValues = languageModel is ExampleBackOffModelWithDiscounting
                                                ? languageModel.Settings.BackOffBetaPerOrder
                                                : languageModel.Settings.LinearInterpolationLambdaPerOrder;

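            // Coarse grid search: initialize every order's weight to i / rangeOfTrial,
            // then sweep the weight of a single order j over k / rangeOfTrial, keeping
            // only the combinations whose weights sum to 1.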
            for (int i = 1; i < rangeOfTrial; i++)
            {
                for (int j = 1; j < languageModel.Settings.NGramOrder + 1; j++)
                {
                    // Initialize round
                    for (int k = 1; k < languageModel.Settings.NGramOrder + 1; k++)
                    {
                        orderValues[k] = i / (double)rangeOfTrial;
                    }

                    // Set each of the values to try
                    for (int k = 1; k < rangeOfTrial; k++)
                    {
                        orderValues[j] = k / (double)rangeOfTrial;
                        double sumValues = 0;
                        for (int l = 1; l < languageModel.Settings.NGramOrder + 1; l++)
                        {
                            sumValues += orderValues[l];
                        }

                        // Compare against 1 with a tolerance to avoid floating-point equality issues.
                        if (Math.Abs(sumValues - 1.0) > 1e-9)
                            continue;

                        languageModel.ClearCacheForDifferentSettings();
                        PerplexityCalculator calculator = new PerplexityCalculator(languageModel);
                        int unkWords = 0;
                        double perplexity = calculator.GetPerplexity(corpus, out unkWords);
                        if (!perplexityResults.ContainsKey(perplexity))
                        {
                            perplexityResults[perplexity] = new List<DoubleCombination>();
                        }

                        DoubleCombination configuration = new DoubleCombination(languageModel.Settings.NGramOrder);
                        for (int l = 0; l < configuration.NOrder; l++)
                        {
                            configuration[l] = orderValues[l + 1];
                        }
                        perplexityResults[perplexity].Add(configuration);
                    }
                }
            }

            return perplexityResults[perplexityResults.Keys.Min()];
        }
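
One possible way to use `GetOptimumCombination` is to run it against a held-out set and write the best-scoring weights back into the model's settings. The sketch below reuses only the members that appear in the example above (`Settings.NGramOrder`, `BackOffBetaPerOrder`, `LinearInterpolationLambdaPerOrder`, `ClearCacheForDifferentSettings`, and the `DoubleCombination` indexer); the wrapper method itself is hypothetical.

        private static void ApplyBestWeights(LanguageModel languageModel, IEnumerable<string> heldOutCorpus)
        {
            // Sweep weights in steps of 1/10 (0.1, 0.2, ..., 0.9).
            List<DoubleCombination> best = GetOptimumCombination(10, languageModel, heldOutCorpus);

            // Pick the first of the best-scoring combinations and copy its weights
            // back into the per-order settings the model reads from.
            DoubleCombination chosen = best[0];
            Dictionary<int, double> orderValues = languageModel is ExampleBackOffModelWithDiscounting
                                                ? languageModel.Settings.BackOffBetaPerOrder
                                                : languageModel.Settings.LinearInterpolationLambdaPerOrder;
            for (int order = 1; order <= languageModel.Settings.NGramOrder; order++)
            {
                orderValues[order] = chosen[order - 1];
            }

            // Invalidate any cached probabilities computed with the old weights.
            languageModel.ClearCacheForDifferentSettings();
        }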