Example #1
 // Merges another NGram of the same order into this one by taking the union
 // of vocabularies and merging the underlying root nodes; a model of a
 // different order is ignored.
 public void Merge(NGram<TSymbol> toBeMerged)
 {
     if (_n != toBeMerged.GetN())
     {
         return;
     }

     _vocabulary.UnionWith(toBeMerged._vocabulary);
     rootNode.Merge(toBeMerged.rootNode);
 }
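A hypothetical usage of Merge, assuming the NGram(corpus, n) constructor seen in the later examples; corpusPartA and corpusPartB are illustrative names for two shards of the same training data:

 // Sketch only: combine counts from two models of the same order trained
 // on different parts of a corpus (both List<List<string>>).
 var first = new NGram<string>(corpusPartA, 2);
 var second = new NGram<string>(corpusPartB, 2);
 first.Merge(second); // first now holds the merged vocabulary and counts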
Example #2
        /**
         * <summary>Wrapper function to set the N-gram probabilities with Good-Turing smoothing. N_1 / \sum_{r=1}^{\infty} r N_r
         * is the out-of-vocabulary probability, matching the sum computed below.</summary>
         * <param name="nGram">N-Gram for which the probabilities will be set.</param>
         * <param name="level">Level for which the N-Gram probabilities will be set. Probabilities for different levels of the
         *              N-gram can be set with this function. If level = 1, the N-Gram is treated as a unigram; if level = 2,
         *              as a bigram, etc.</param>
         */
        public override void SetProbabilities(NGram<TSymbol> nGram, int level)
        {
            var countsOfCounts = nGram.CalculateCountsOfCounts(level);
            var n = LinearRegressionOnCountsOfCounts(countsOfCounts);
            double sum = 0;

            // sum = \sum_r r * N_r, i.e. the total number of observed N-grams.
            for (var r = 1; r < countsOfCounts.Length; r++)
            {
                sum += countsOfCounts[r] * r;
            }

            nGram.SetAdjustedProbability(n, level, n[1] / sum);
        }
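LinearRegressionOnCountsOfCounts is not shown in this example. A minimal sketch of what such a helper might compute, assuming the log-log least-squares fit log N_r = a + b log r used in Gale and Sampson's Simple Good-Turing; the library's actual implementation may differ:

        // Sketch only: fits log(N_r) = a + b*log(r) over the nonzero
        // counts-of-counts and returns smoothed values exp(a + b*log(r)),
        // which are positive for every r. Requires System for Math.
        private static double[] LinearRegressionOnCountsOfCounts(int[] countsOfCounts)
        {
            double sumX = 0, sumY = 0, sumXy = 0, sumXx = 0;
            var points = 0;
            for (var r = 1; r < countsOfCounts.Length; r++)
            {
                if (countsOfCounts[r] == 0)
                {
                    continue; // only observed frequencies enter the fit
                }
                var x = Math.Log(r);
                var y = Math.Log(countsOfCounts[r]);
                sumX += x;
                sumY += y;
                sumXy += x * y;
                sumXx += x * x;
                points++;
            }
            var b = (points * sumXy - sumX * sumY) / (points * sumXx - sumX * sumX);
            var a = (sumY - b * sumX) / points;
            var smoothed = new double[countsOfCounts.Length];
            for (var r = 1; r < smoothed.Length; r++)
            {
                smoothed[r] = Math.Exp(a + b * Math.Log(r)); // smoothed N_r
            }
            return smoothed;
        }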
        /**
         * <summary>Wrapper function to learn the parameter (delta) in additive smoothing. The function first creates K NGrams
         * with the train folds of the corpus, then optimizes delta with respect to the test folds of the corpus.</summary>
         * <param name="corpus">Train corpus used to optimize the delta parameter.</param>
         * <param name="n">N in N-Gram.</param>
         */
        protected override void LearnParameters(List<List<TSymbol>> corpus, int n)
        {
            const int k = 10;
            var nGrams = new NGram<TSymbol>[k];
            var kFoldCrossValidation = new KFoldCrossValidation<List<TSymbol>>(corpus, k, 0);

            // Train one N-gram per fold on the remaining k - 1 folds.
            for (var i = 0; i < k; i++)
            {
                nGrams[i] = new NGram<TSymbol>(kFoldCrossValidation.GetTrainFold(i), n);
            }

            _delta = LearnBestDelta(nGrams, kFoldCrossValidation, 0.1);
        }
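LearnBestDelta is defined elsewhere in the class. A plausible sketch, under the assumption that it grid-searches delta and scores each candidate by perplexity on the held-out folds; GetPerplexity, GetTestFold, and the exact search strategy are assumptions here, not confirmed by this example:

        // Sketch only: exhaustive grid search over delta in steps of lowerBound,
        // scoring by total perplexity on the K test folds.
        private double LearnBestDelta(NGram<TSymbol>[] nGrams,
            KFoldCrossValidation<List<TSymbol>> kFoldCrossValidation, double lowerBound)
        {
            var bestDelta = lowerBound;
            var bestPerplexity = double.MaxValue;
            for (var delta = lowerBound; delta <= 1.0; delta += lowerBound)
            {
                double perplexity = 0;
                for (var i = 0; i < nGrams.Length; i++)
                {
                    nGrams[i].SetProbabilityWithPseudoCount(delta, nGrams[i].GetN());
                    perplexity += nGrams[i].GetPerplexity(kFoldCrossValidation.GetTestFold(i));
                }
                if (perplexity < bestPerplexity)
                {
                    bestPerplexity = perplexity;
                    bestDelta = delta;
                }
            }
            return bestDelta;
        }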
Example #4
        public NGramModel(int n, string language)
        {
            if (n <= 1)
            {
                throw new ArgumentException("n must be larger than 1", nameof(n));
            }

            this.language = language;
            // Keep both the order-n and the order-(n-1) models.
            this.n1gram = new NGram(n, language);
            this.n2gram = new NGram(n - 1, language);
        }
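A short hypothetical usage of this constructor; the language code is an example value, and the rationale in the comment is an assumption about why the lower-order model is kept:

        // Hypothetical usage: a trigram model also keeps the underlying bigram,
        // presumably so that (n-1)-gram counts can serve as denominators when
        // computing conditional probabilities.
        var model = new NGramModel(3, "en");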
        /**
         * <summary>Wrapper function to set the N-gram probabilities with interpolated smoothing.</summary>
         * <param name="nGram">N-Gram for which the probabilities will be set.</param>
         * <param name="level">Level for which the N-Gram probabilities will be set. Probabilities for different levels of the
         *              N-gram can be set with this function. If level = 1, the N-Gram is treated as a unigram; if level = 2,
         *              as a bigram, etc.</param>
         */
        public override void SetProbabilities(NGram<TSymbol> nGram, int level)
        {
            // Interpolated smoothing sets every order, so the level argument is
            // not used: compute probabilities for orders 2..N, then for unigrams.
            for (var j = 2; j <= nGram.GetN(); j++)
            {
                nGram.CalculateNGramProbabilities(_simpleSmoothing, j);
            }

            nGram.CalculateNGramProbabilities(_simpleSmoothing, 1);
            switch (nGram.GetN())
            {
                case 2:
                    nGram.SetLambda(_lambda1);
                    break;
                case 3:
                    nGram.SetLambda(_lambda1, _lambda2);
                    break;
            }
        }
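For reference, a hypothetical illustration of how the lambdas set here would be used at query time for a trigram model; GetProbability and its overloads are assumptions about the API, not confirmed by this example:

        // Sketch only: linear interpolation of trigram, bigram and unigram estimates.
        // P(w3 | w1 w2) = lambda1 * P3(w3 | w1 w2) + lambda2 * P2(w3 | w2)
        //                 + (1 - lambda1 - lambda2) * P1(w3)
        double InterpolatedProbability(NGram<TSymbol> nGram, TSymbol w1, TSymbol w2, TSymbol w3)
        {
            return _lambda1 * nGram.GetProbability(w1, w2, w3)
                   + _lambda2 * nGram.GetProbability(w2, w3)
                   + (1 - _lambda1 - _lambda2) * nGram.GetProbability(w3);
        }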
        /**
         * <summary>Wrapper function to learn the parameters (lambda1 and lambda2) in interpolated smoothing. The function
         * first creates K NGrams with the train folds of the corpus, then optimizes the lambdas with respect to the test
         * folds, depending on the given N.</summary>
         * <param name="corpus">Train corpus used to optimize the lambda parameters.</param>
         * <param name="n">N in N-Gram.</param>
         */
        protected override void LearnParameters(List<List<TSymbol>> corpus, int n)
        {
            if (n <= 1)
            {
                return;
            }

            const int k = 10;
            var nGrams = new NGram<TSymbol>[k];
            var kFoldCrossValidation = new KFoldCrossValidation<List<TSymbol>>(corpus, k, 0);

            for (var i = 0; i < k; i++)
            {
                nGrams[i] = new NGram<TSymbol>(kFoldCrossValidation.GetTrainFold(i), n);
                // Precompute probabilities at every order for each fold model.
                for (var j = 2; j <= n; j++)
                {
                    nGrams[i].CalculateNGramProbabilities(_simpleSmoothing, j);
                }

                nGrams[i].CalculateNGramProbabilities(_simpleSmoothing, 1);
            }

            if (n == 2)
            {
                _lambda1 = LearnBestLambda(nGrams, kFoldCrossValidation, 0.1);
            }
            else if (n == 3)
            {
                var bestLambdas = LearnBestLambdas(nGrams, kFoldCrossValidation, 0.1, 0.1);
                _lambda1 = bestLambdas[0];
                _lambda2 = bestLambdas[1];
            }
        }
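LearnBestLambda and LearnBestLambdas are not shown. Analogously to the delta search sketched earlier, a plausible sketch for the two-lambda case is a grid search over valid (lambda1, lambda2) pairs, again with held-out perplexity as the assumed scoring function; GetPerplexity and GetTestFold remain assumptions:

        // Sketch only: search the simplex lambda1 + lambda2 < 1 in fixed steps.
        private double[] LearnBestLambdas(NGram<TSymbol>[] nGrams,
            KFoldCrossValidation<List<TSymbol>> kFoldCrossValidation,
            double lowerBound1, double lowerBound2)
        {
            var best = new[] { lowerBound1, lowerBound2 };
            var bestPerplexity = double.MaxValue;
            for (var l1 = lowerBound1; l1 < 1.0; l1 += lowerBound1)
            {
                for (var l2 = lowerBound2; l1 + l2 < 1.0; l2 += lowerBound2)
                {
                    double perplexity = 0;
                    for (var i = 0; i < nGrams.Length; i++)
                    {
                        nGrams[i].SetLambda(l1, l2);
                        perplexity += nGrams[i].GetPerplexity(kFoldCrossValidation.GetTestFold(i));
                    }
                    if (perplexity < bestPerplexity)
                    {
                        bestPerplexity = perplexity;
                        best = new[] { l1, l2 };
                    }
                }
            }
            return best;
        }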
Example #7
 /**
  * <summary>Wrapper function to set the N-gram probabilities with no smoothing, replacing words that are not in the
  * set of non-rare words with an unknown symbol.</summary>
  * <param name="nGram">N-Gram for which the probabilities will be set.</param>
  * <param name="level">Level for which the N-Gram probabilities will be set. Probabilities for different levels of the
  *              N-gram can be set with this function. If level = 1, the N-Gram is treated as a unigram; if level = 2,
  *              as a bigram, etc.</param>
  */
 public override void SetProbabilities(NGram<Symbol> nGram, int level)
 {
     // Build the dictionary of non-rare words, map everything else to unknown,
     // then fall back to the base no-smoothing probability computation.
     _dictionary = nGram.ConstructDictionaryWithNonRareWords(level, _probability);
     nGram.ReplaceUnknownWords(_dictionary);
     base.SetProbabilities(nGram, level);
 }
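ConstructDictionaryWithNonRareWords belongs to the NGram class and is not shown here. The underlying idea, sketched on a raw corpus as a self-contained assumption about its contract (keep the symbols whose relative frequency reaches the threshold):

 // Sketch only: collect words whose relative frequency is at least
 // `probability`; everything else would later be mapped to an unknown symbol.
 static HashSet<string> NonRareWords(List<List<string>> corpus, double probability)
 {
     var counts = new Dictionary<string, int>();
     var total = 0;
     foreach (var sentence in corpus)
     {
         foreach (var word in sentence)
         {
             counts[word] = counts.TryGetValue(word, out var c) ? c + 1 : 1;
             total++;
         }
     }
     var nonRare = new HashSet<string>();
     foreach (var pair in counts)
     {
         if ((double)pair.Value / total >= probability)
         {
             nonRare.Add(pair.Key);
         }
     }
     return nonRare;
 }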
Example #8
 /**
  * <summary>Wrapper function to learn parameters of the smoothing method and set the N-gram probabilities.</summary>
  *
  * <param name="corpus">Train corpus used to optimize parameters of the smoothing method.</param>
  * <param name="nGram">N-Gram for which the probabilities will be set.</param>
  */
 public void Train(List<List<TSymbol>> corpus, NGram<TSymbol> nGram)
 {
     LearnParameters(corpus, nGram.GetN());
     SetProbabilities(nGram);
 }
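A hypothetical end-to-end usage of Train, assuming the NGram(corpus, n) constructor from the earlier examples; the concrete class name AdditiveSmoothing is an assumption chosen for illustration:

 // Hypothetical usage sketch.
 var corpus = new List<List<string>>
 {
     new List<string> { "<s>", "the", "cat", "sat", "</s>" },
     new List<string> { "<s>", "the", "dog", "sat", "</s>" }
 };
 var nGram = new NGram<string>(corpus, 2);
 var smoothing = new AdditiveSmoothing<string>();
 smoothing.Train(corpus, nGram); // learns delta, then sets probabilities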
 /**
  * <summary>Wrapper function to set the N-gram probabilities with no smoothing, replacing words that are not in the
  * dictionary (a HashSet) with an unknown symbol.</summary>
  * <param name="nGram">N-Gram for which the probabilities will be set.</param>
  * <param name="level">Level for which the N-Gram probabilities will be set. Probabilities for different levels of the
  *              N-gram can be set with this function. If level = 1, the N-Gram is treated as a unigram; if level = 2,
  *              as a bigram, etc.</param>
  */
 protected new void SetProbabilities(NGram<TSymbol> nGram, int level)
 {
     nGram.ReplaceUnknownWords(_dictionary);
     base.SetProbabilities(nGram, level);
 }
 /**
  * <summary>Wrapper function to set the N-gram probabilities with additive smoothing.</summary>
  * <param name="nGram">N-Gram for which the probabilities will be set.</param>
  * <param name="level">Level for which the N-Gram probabilities will be set. Probabilities for different levels of the
  *              N-gram can be set with this function. If level = 1, the N-Gram is treated as a unigram; if level = 2,
  *              as a bigram, etc.</param>
  */
 public override void SetProbabilities(NGram<TSymbol> nGram, int level)
 {
     nGram.SetProbabilityWithPseudoCount(_delta, level);
 }
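The pseudo-count rule applied by SetProbabilityWithPseudoCount corresponds to the standard additive (Laplace/Lidstone) estimate. As a standalone sketch of the formula, with names chosen here purely for illustration:

 // P(w | history) = (count(history, w) + delta) / (count(history) + delta * V),
 // where V is the vocabulary size.
 static double AdditiveProbability(int jointCount, int historyCount, double delta, int vocabularySize)
 {
     return (jointCount + delta) / (historyCount + delta * vocabularySize);
 }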