Example #1
        public override void TrainLanguageModel(Corpus trainingCorpus)
        {
            // Process the corpus sentence by sentence so that bigrams never wrap across
            // sentence boundaries, i.e. we never count (STOP, w) bigrams
            foreach (var sentence in trainingCorpus.AllTokenizedSentences)
            {
                // Initialize x_{-1} to START
                var v = "<s>";

                // Accumulate the counts c(v) and c(v, w) for every token in the sentence
                foreach (var w in sentence)
                {
                    Unigram vUnigram = new Unigram {
                        w = v
                    };
                    Bigram vwBigram = new Bigram {
                        v = v, w = w
                    };

                    // Increment the unigram count c(v); TryGetValue leaves vCount at 0
                    // when the key is missing, so a new entry starts at 1 as expected
                    this.NGramCounts.TryGetValue(vUnigram.GetComparisonKey(), out int vCount);
                    vCount++;
                    this.NGramCounts[vUnigram.GetComparisonKey()] = vCount;

                    // Same for the bigram count c(v, w), while also noting whether this
                    // bigram is new so the number of unique n-grams can be tracked
                    var isNewNgram = !this.NGramCounts.TryGetValue(vwBigram.GetComparisonKey(), out int vwCount);
                    vwCount++;
                    this.NGramCounts[vwBigram.GetComparisonKey()] = vwCount;

                    if (isNewNgram)
                    {
                        this.UniqueNGramsCount++;
                    }

                    // The current token becomes the previous token for the next bigram
                    v = w;
                }
            }
        }
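
Both examples rely on Unigram and Bigram helper types with a GetComparisonKey method that are not shown here. A minimal sketch of what they could look like, assuming a simple string key (the field names follow the usage above, but the key format itself is an assumption):

    // Sketch of the n-gram helper types assumed by the examples.
    // The key format is an assumption; any encoding that keeps unigram
    // and bigram keys distinct works as a dictionary key.
    public class Unigram
    {
        public string w;

        public string GetComparisonKey() => "1|" + this.w;
    }

    public class Bigram
    {
        public string v;
        public string w;

        public string GetComparisonKey() => "2|" + this.v + "|" + this.w;
    }
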
Example #2
        public override double ComputeWordProbability(string u, string v, string w)
        {
            Unigram vUnigram = new Unigram {
                w = v
            };
            Bigram vwBigram = new Bigram {
                v = v, w = w
            };

            // Look up c(v) and c(v, w); missing keys leave the counts at 0
            this.NGramCounts.TryGetValue(vUnigram.GetComparisonKey(), out int vCount);
            this.NGramCounts.TryGetValue(vwBigram.GetComparisonKey(), out int vwCount);

            // q(w|v) = c(v, w)/c(v)
            // q(w|v)_{addK} = (c(v, w) + k)/(c(v) + k|V*|)
            double qWv = this.Smoother.ComputeSmoothedWordProbability(u, v, w, vwCount, vCount, this.UniqueNGramsCount);

            return qWv;
        }
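
The Smoother used above is also not shown. Based on the add-k formula in the comment, a smoother could look roughly like the sketch below; the class name and constructor are assumptions, and only the formula q(w|v)_{addK} = (c(v, w) + k)/(c(v) + k|V*|) comes from the comment in ComputeWordProbability.

    // Sketch of an add-k smoother matching the call in ComputeWordProbability.
    // The class name and constructor are assumptions; the formula is taken
    // from the comment above: q(w|v)_addK = (c(v, w) + k) / (c(v) + k * |V*|).
    public class AddKSmoother
    {
        private readonly double k;

        public AddKSmoother(double k)
        {
            this.k = k;
        }

        public double ComputeSmoothedWordProbability(
            string u, string v, string w,
            int vwCount, int vCount, int uniqueNGramsCount)
        {
            // With k = 0 this reduces to the unsmoothed estimate c(v, w) / c(v)
            return (vwCount + this.k) / (vCount + this.k * uniqueNGramsCount);
        }
    }

Note that the training code tracks UniqueNGramsCount as the number of distinct bigrams seen, so that is the quantity passed in as |V*| here; a different smoother could use the vocabulary size instead.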