예제 #1
0
 /// <summary>
 /// Creates the model together with its three component models (unigram, bigram
 /// and trigram), each configured with the same <paramref name="smoother"/>.
 /// </summary>
 /// <param name="l1">Value stored in <c>L1</c> (presumably an interpolation weight; confirm against the scoring code).</param>
 /// <param name="l2">Value stored in <c>L2</c> (presumably an interpolation weight; confirm against the scoring code).</param>
 /// <param name="l3">Value stored in <c>L3</c> (presumably an interpolation weight; confirm against the scoring code).</param>
 /// <param name="smoother">Smoother assigned to this model and to every component model.</param>
 public TrigramWithLinearInterpolationLanguageModel(double l1, double l2, double l3, ISmoother smoother)
 {
     // Component models are built and published one at a time, lowest order first.
     var unigramModel = new UnigramLanguageModel { Smoother = smoother };
     UnigramLM = unigramModel;

     var bigramModel = new BigramLanguageModel { Smoother = smoother };
     BigramLM = bigramModel;

     var trigramModel = new TrigramLanguageModel { Smoother = smoother };
     TrigramLM = trigramModel;

     // The composite model keeps a reference to the same smoother instance.
     Smoother = smoother;

     L1 = l1;
     L2 = l2;
     L3 = l3;
 }
예제 #2
0
        /// <summary>
        /// Builds a <see cref="LanguageModelHyperparameters"/> instance from a
        /// space-separated argument string.
        /// </summary>
        /// <remarks>
        /// The whole string is lower-cased (invariant culture) before it is split, so
        /// flags and values are matched case-insensitively. Recognised flags:
        /// <list type="bullet">
        /// <item><description><c>-smoothingtechnique</c>: <c>ml</c>, <c>addk</c>, <c>jm</c>, <c>dirichlet</c>, <c>ad</c> or <c>ts</c>.</description></item>
        /// <item><description><c>-lm</c>: <c>unigram</c>, <c>bigram</c>, <c>trigram</c> or <c>trigramwithlinearinterpolation</c>.</description></item>
        /// <item><description><c>-l1</c>, <c>-l2</c>, <c>-l3</c>: numeric parameters; they default to 0.0 in the returned object when absent.</description></item>
        /// <item><description><c>-unkratio</c>: numeric, defaults to 0.1 when absent.</description></item>
        /// <item><description><c>-ignorecase</c>: presence switch, takes no value.</description></item>
        /// </list>
        /// <c>-l1</c> doubles as the single parameter of the <c>addk</c>, <c>jm</c>,
        /// <c>dirichlet</c> and <c>ad</c> smoothers; <c>ts</c> reads both <c>-l1</c> and <c>-l2</c>.
        /// An unrecognised or missing smoothing technique leaves the smoother <c>null</c>, and an
        /// unrecognised or missing <c>-lm</c> value maps every category to a <c>null</c> model.
        /// Numbers are parsed with the invariant culture, so the decimal separator is always '.'.
        /// </remarks>
        /// <param name="args">The raw argument string, tokens separated by spaces.</param>
        /// <returns>The hyperparameters described by <paramref name="args"/>.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="args"/> is <c>null</c>.</exception>
        /// <exception cref="ArgumentException">
        /// A numeric flag needed by the selected smoother or model is absent, or a numeric
        /// flag is the last token and therefore has no value.
        /// </exception>
        /// <exception cref="FormatException">A numeric flag's value is not a valid number.</exception>
        public static LanguageModelHyperparameters GenerateFromArguments(string args)
        {
            if (args == null)
            {
                throw new ArgumentNullException(nameof(args));
            }

            // Invariant lower-casing keeps flag matching independent of the machine's locale.
            var splittedArgs = args.ToLowerInvariant().Split(' ', StringSplitOptions.RemoveEmptyEntries);

            // True when the flag occurs anywhere in the token list.
            bool HasFlag(string flag)
            {
                return Array.IndexOf(splittedArgs, flag) >= 0;
            }

            // Token that follows the flag, or null when the flag is absent or is the last token.
            string ValueOf(string flag)
            {
                var flagIndex = Array.IndexOf(splittedArgs, flag);
                var valueIndex = flagIndex + 1;
                return flagIndex >= 0 && valueIndex < splittedArgs.Length ? splittedArgs[valueIndex] : null;
            }

            // Numeric value of a flag that must be present.
            double RequiredNumber(string flag)
            {
                var rawValue = ValueOf(flag);
                if (rawValue == null)
                {
                    throw new ArgumentException($"Missing value for required argument '{flag}'.", nameof(args));
                }

                return Double.Parse(rawValue, System.Globalization.CultureInfo.InvariantCulture);
            }

            // Numeric value of an optional flag; the fallback applies only when the flag is absent.
            double NumberOrDefault(string flag, double fallback)
            {
                return HasFlag(flag) ? RequiredNumber(flag) : fallback;
            }

            // Smoothers
            // Create the collection-level unigram model with no smoothing (max-likelihood) used in some smoothing techniques
            INGramLanguageModel collectionLevelLanguageModel = new UnigramLanguageModel {
                Smoother = new MaxLikelihoodSmoother()
            };
            ISmoother smoother = null;

            switch (ValueOf("-smoothingtechnique"))
            {
            case "ml":
                smoother = new MaxLikelihoodSmoother();
                break;

            case "addk":
                smoother = new AddKSmoother {
                    K = RequiredNumber("-l1")
                };
                break;

            case "jm":
                smoother = new JelinekMercerSmoother {
                    CollectionLevelLanguageModel = collectionLevelLanguageModel,
                    L = RequiredNumber("-l1")
                };
                break;

            case "dirichlet":
                smoother = new DirichletSmoother {
                    CollectionLevelLanguageModel = collectionLevelLanguageModel,
                    M = RequiredNumber("-l1")
                };
                break;

            case "ad":
                smoother = new AbsoluteDiscountSmoother {
                    CollectionLevelLanguageModel = collectionLevelLanguageModel,
                    D = RequiredNumber("-l1")
                };
                break;

            case "ts":
                smoother = new TwoStageSmoother {
                    CollectionLevelLanguageModel = collectionLevelLanguageModel,
                    L = RequiredNumber("-l1"),
                    M = RequiredNumber("-l2")
                };
                break;
            }

            // LM
            // The model name is the same for every category, so look it up once.
            var languageModelName = ValueOf("-lm");

            var nGramLanguageModels = Corpus.CategoriesMap.ToDictionary(cdp => cdp.Key, cdp =>
            {
                // Each category gets its own model instance; the smoother instance is shared.
                INGramLanguageModel modelToUse = null;
                switch (languageModelName)
                {
                case "unigram":
                    modelToUse = new UnigramLanguageModel {
                        Smoother = smoother
                    };
                    break;

                case "bigram":
                    modelToUse = new BigramLanguageModel {
                        Smoother = smoother
                    };
                    break;

                case "trigram":
                    modelToUse = new TrigramLanguageModel {
                        Smoother = smoother
                    };
                    break;

                case "trigramwithlinearinterpolation":
                    modelToUse = new TrigramWithLinearInterpolationLanguageModel(
                        RequiredNumber("-l1"),
                        RequiredNumber("-l2"),
                        RequiredNumber("-l3"),
                        smoother);
                    break;
                }

                return modelToUse;
            });

            return new LanguageModelHyperparameters
            {
                CategoryNGramLanguageModelsMap = nGramLanguageModels,
                UnkRatio = NumberOrDefault("-unkratio", 0.1),
                IgnoreCase = HasFlag("-ignorecase"),
                L1 = NumberOrDefault("-l1", 0.0),
                L2 = NumberOrDefault("-l2", 0.0),
                L3 = NumberOrDefault("-l3", 0.0),
                CollectionLevelLanguageModel = collectionLevelLanguageModel,
            };
        }