/// <summary>
        /// Fills the emission-probability tables for the words of a test corpus.
        /// </summary>
        /// <remarks>
        /// Two passes are made over <paramref name="testWords"/>:
        /// first, words whose first character is uppercase are looked up (with their original casing)
        /// in <c>WordCapitalizedTagsEmissionFrequence</c>; then every word is looked up in lowercase
        /// in <c>WordTagsEmissionFrequence</c>. Whenever a word has a frequency entry but no
        /// probability entry yet, a probability entry is computed and appended to the matching
        /// probabilities list.
        /// </remarks>
        /// <param name="testWords">Tokens of the test corpus; only their <c>word</c> member is read.</param>
        private void calculateEmissionTestCorpus(List <Tokenizer.WordTag> testWords)
        {
            // Pass 1: capitalized words, matched with their original casing.
            foreach (var tw in testWords)
            {
                // A null or empty word has no first character to inspect; indexing it would throw.
                if (string.IsNullOrEmpty(tw.word) || !char.IsUpper(tw.word[0]))
                {
                    continue;
                }

                string sWord = tw.word;
                PartOfSpeechModel.EmissionModel wmFind = WordCapitalizedTagsEmissionFrequence.Find(x => x.Word == sWord);
                EmissionProbabilisticModel      wFind  = WordCapitalizedTagsEmissionProbabilities.Find(x => x.Word == sWord);

                // Only words that have counts and have not been converted yet.
                if (wmFind != null && wFind == null)
                {
                    this.WordCapitalizedTagsEmissionProbabilities.Add(this.BuildTestEmissionModel(wmFind));
                }
            }

            // Pass 2: all words, matched in lowercase.
            foreach (var tw in testWords)
            {
                // A null word cannot be lowercased.
                if (tw.word == null)
                {
                    continue;
                }

                string sWord = tw.word.ToLower();

                PartOfSpeechModel.EmissionModel wmFind = WordTagsEmissionFrequence.Find(x => x.Word == sWord);
                EmissionProbabilisticModel      wFind  = WordTagsEmissionProbabilities.Find(x => x.Word == sWord);

                // Only words that have counts and have not been converted yet.
                if (wmFind != null && wFind == null)
                {
                    this.WordTagsEmissionProbabilities.Add(this.BuildTestEmissionModel(wmFind));
                }
            }
        }

        /// <summary>
        /// Converts the per-tag counts of one word into per-tag emission probabilities.
        /// </summary>
        /// <param name="counts">Frequency entry of the word (its <c>Word</c> and <c>TagFreq</c> are read).</param>
        /// <returns>A new probability entry carrying the same word and one value per tag.</returns>
        private EmissionProbabilisticModel BuildTestEmissionModel(PartOfSpeechModel.EmissionModel counts)
        {
            EmissionProbabilisticModel epModel = new EmissionProbabilisticModel();
            epModel.Word = counts.Word;
            foreach (var tf in counts.TagFreq)
            {
                // C(ti): total count of the tag; FirstOrDefault yields 0 when the tag is absent.
                int cti = this.UnigramFrequence.FirstOrDefault(x => x.Key == tf.Key).Value;

                // Emission probability: p(wi/ti) = C(ti, wi) / C(ti).
                // With C(ti) == 0 the division would produce Infinity/NaN, which is not a
                // probability, so such a tag gets 0 instead.
                double pwiti = cti == 0 ? 0.0 : (double)tf.Value / cti;
                epModel.TagFreq.Add(tf.Key, pwiti);
            }

            return epModel;
        }
Exemplo n.º 2
0
        /// <summary>
        /// Calculates smoothed emission probabilities for the suffixes and prefixes of
        /// capitalized and uncapitalized training words.
        /// </summary>
        /// <remarks>
        /// For every entry, each tag count <c>c</c> becomes
        /// <c>(c + smoothing) / (sum + smoothing * size)</c>, where <c>sum</c> is the total of the
        /// entry's tag counts and <c>size</c> is <paramref name="suffSize"/> for suffix lists and
        /// <paramref name="prefSize"/> for prefix lists. Results are appended, in input order, to
        /// <c>SuffixCapitalizedWordEmissionProbabilities</c>, <c>PrefixCapitalizedWordEmissionProbabilities</c>,
        /// <c>SuffixEmissionProbabilities</c> and <c>PrefixEmissionProbabilities</c> respectively.
        /// </remarks>
        /// <param name="capitalSuff">Tag counts per suffix of capitalized words.</param>
        /// <param name="capitalPref">Tag counts per prefix of capitalized words.</param>
        /// <param name="suffxem">Tag counts per suffix of uncapitalized words.</param>
        /// <param name="preffxem">Tag counts per prefix of uncapitalized words.</param>
        /// <param name="smoothing">Constant added to every tag count (additive smoothing).</param>
        /// <param name="prefSize">Factor multiplied by <paramref name="smoothing"/> in the denominator for prefix entries.</param>
        /// <param name="suffSize">Factor multiplied by <paramref name="smoothing"/> in the denominator for suffix entries.</param>
        private void CalculateSuffixPrefixProbabilities(
            List <EmissionModel> capitalSuff,
            List <EmissionModel> capitalPref,
            List <EmissionModel> suffxem,
            List <EmissionModel> preffxem,
            int smoothing,
            int prefSize,
            int suffSize)
        {
            foreach (var sfx in capitalSuff)
            {
                this.SuffixCapitalizedWordEmissionProbabilities.Add(ToSmoothedEmissionModel(sfx, smoothing, suffSize));
            }

            foreach (var pfx in capitalPref)
            {
                this.PrefixCapitalizedWordEmissionProbabilities.Add(ToSmoothedEmissionModel(pfx, smoothing, prefSize));
            }

            foreach (var sfx in suffxem)
            {
                this.SuffixEmissionProbabilities.Add(ToSmoothedEmissionModel(sfx, smoothing, suffSize));
            }

            foreach (var pfx in preffxem)
            {
                this.PrefixEmissionProbabilities.Add(ToSmoothedEmissionModel(pfx, smoothing, prefSize));
            }
        }

        /// <summary>
        /// Converts the tag counts of one affix entry into smoothed tag probabilities.
        /// </summary>
        /// <param name="counts">Affix entry whose <c>Word</c> and <c>TagFreq</c> are read.</param>
        /// <param name="smoothing">Constant added to every tag count.</param>
        /// <param name="affixSize">Factor multiplied by <paramref name="smoothing"/> in the denominator.</param>
        /// <returns>A new probability entry with the same <c>Word</c> and one value per tag.</returns>
        private static EmissionProbabilisticModel ToSmoothedEmissionModel(EmissionModel counts, int smoothing, int affixSize)
        {
            var tagSum = counts.TagFreq.Sum(x => x.Value);

            // The denominator is the same for every tag of this entry, so compute it once.
            var denominator = tagSum + (smoothing * affixSize);

            Dictionary <string, double> tgfreq = new Dictionary <string, double>();
            foreach (var tg in counts.TagFreq)
            {
                tgfreq.Add(tg.Key, (double)(tg.Value + smoothing) / denominator);
            }

            var em = new EmissionProbabilisticModel();
            em.Word    = counts.Word;
            em.TagFreq = tgfreq;
            return em;
        }