Beispiel #1
0
        public override double ComputeWordProbability(string u, string v, string w)
        {
            Bigram uvBigram = new Bigram {
                v = u, w = v
            };
            Trigram uvwTrigram = new Trigram {
                u = u, v = v, w = w
            };;

            // Compute word probability given the previous two
            this.NGramCounts.TryGetValue(uvBigram.GetComparisonKey(), out int uvCount);
            this.NGramCounts.TryGetValue(uvwTrigram.GetComparisonKey(), out int uvwCount);

            // q(w|u, v) = c(u, v, w)/c(u, v)
            // q(w|u, v)_{addK} = (c(u, v, w) + k)/(c(u, v) + k|V*|)
            double qWuv = this.Smoother.ComputeSmoothedWordProbability(u, v, w, uvwCount, uvCount, this.UniqueNGramsCount);

            return(qWuv);
        }
Beispiel #2
0
        public override void TrainLanguageModel(Corpus trainingCorpus)
        {
            // We need to process sentece by sentence to avoid wrapping sentences, ie. counting (STOP, v, w) trigrams
            foreach (var sentence in trainingCorpus.AllTokenizedSentences)
            {
                // Initialize x_{-1}, x_{-2} to START
                var u = "<s>";
                var v = "<s>";

                // We now need to store all counts of c(u, v, w) and c(v, w)
                foreach (var w in sentence)
                {
                    Bigram uvBigram = new Bigram {
                        v = u, w = v
                    };
                    Trigram uvwTrigram = new Trigram {
                        u = u, v = v, w = w
                    };

                    // +1 to current count, current will be 0 if not found, thus starting at 1 as expected
                    this.NGramCounts.TryGetValue(uvBigram.GetComparisonKey(), out int uvCount);
                    uvCount++;
                    this.NGramCounts[uvBigram.GetComparisonKey()] = uvCount;

                    var isNewNgram = !this.NGramCounts.TryGetValue(uvwTrigram.GetComparisonKey(), out int uvwCount);
                    uvwCount++;
                    this.NGramCounts[uvwTrigram.GetComparisonKey()] = uvwCount;

                    if (isNewNgram)
                    {
                        this.UniqueNGramsCount++;
                    }

                    // Replace previous tokens
                    u = v;
                    v = w;
                }
            }
        }