Exemple #1
0
        /// <summary>
        /// Rebuilds the per-word tag frequency tables from the supplied word/tag lists.
        /// <c>WordCapitalizedTagsEmissionFrequence</c> is filled from <paramref name="capitalizedWords"/>
        /// and <c>WordTagsEmissionFrequence</c> from <paramref name="uncapitalizedWords"/>. Both lists
        /// are replaced with fresh instances on every call.
        /// </summary>
        /// <param name="uncapitalizedWords">
        /// Word/tag pairs counted into <c>WordTagsEmissionFrequence</c>. Each pair's tag is also
        /// passed to <c>AddTagToUnigramOccurences</c> exactly once.
        /// </param>
        /// <param name="capitalizedWords">
        /// Word/tag pairs counted into <c>WordCapitalizedTagsEmissionFrequence</c>. These do not
        /// touch the unigram counts.
        /// </param>
        private void CalculateEmissionForWordTags(List <Tokenizer.WordTag> uncapitalizedWords, List <Tokenizer.WordTag> capitalizedWords)
        {
            this.WordCapitalizedTagsEmissionFrequence = new List <EmissionModel>();
            this.WordTagsEmissionFrequence            = new List <EmissionModel>();

            // Local word -> model indexes replace a linear List.Find per token, so the
            // pass is linear in the corpus size; the lists keep first-seen order.
            var capitalizedIndex = new Dictionary <string, EmissionModel>();
            foreach (var w in capitalizedWords)
            {
                CountWordTagEmission(this.WordCapitalizedTagsEmissionFrequence, capitalizedIndex, w);
            }

            var uncapitalizedIndex = new Dictionary <string, EmissionModel>();
            foreach (var w in uncapitalizedWords)
            {
                CountWordTagEmission(this.WordTagsEmissionFrequence, uncapitalizedIndex, w);

                // Only this pass feeds the unigram tag counts (once per token).
                // NOTE(review): presumably the capitalized list is a subset that is already
                // represented here, so counting it again would double-count -- confirm with caller.
                this.AddTagToUnigramOccurences(w.tag);
            }
        }

        /// <summary>
        /// Increments the count of <c>entry.tag</c> for <c>entry.word</c> in <paramref name="models"/>,
        /// creating the word's model on first sight.
        /// </summary>
        /// <param name="models">Frequency list that receives new word models (appended in first-seen order).</param>
        /// <param name="index">Lookup of the non-null words already present in <paramref name="models"/>.</param>
        /// <param name="entry">The word/tag pair to count.</param>
        private static void CountWordTagEmission(List <EmissionModel> models, Dictionary <string, EmissionModel> index, Tokenizer.WordTag entry)
        {
            string        word = entry.word;
            EmissionModel model;

            if (word == null)
            {
                // Dictionary keys cannot be null; keep supporting a null word via a list search.
                model = models.Find(x => x.Word == null);
            }
            else
            {
                index.TryGetValue(word, out model);
            }

            bool isNewWord = model == null;
            if (isNewWord)
            {
                model      = new EmissionModel();
                model.Word = word;
            }

            // A single hashed lookup instead of scanning the tag table; a missing tag
            // leaves the count at 0, so the first occurrence is stored as 1.
            int count;
            model.TagFreq.TryGetValue(entry.tag, out count);
            model.TagFreq[entry.tag] = count + 1;

            // Register the model only after its first tag has been counted.
            if (isNewWord)
            {
                models.Add(model);
                if (word != null)
                {
                    index.Add(word, model);
                }
            }
        }
        /// <summary>
        /// Computes emission probabilities for the words of a test corpus that have frequency
        /// data but no probability entry yet.
        /// The first pass handles words starting with an upper-case letter, looked up verbatim in
        /// <c>WordCapitalizedTagsEmissionFrequence</c> and stored in
        /// <c>WordCapitalizedTagsEmissionProbabilities</c>. The second pass handles every word in
        /// lower-cased form, looked up in <c>WordTagsEmissionFrequence</c> and stored in
        /// <c>WordTagsEmissionProbabilities</c>.
        /// </summary>
        /// <param name="testWords">
        /// Word/tag pairs of the test corpus; only the word is read. Tokens with a null word are
        /// skipped, and tokens with an empty word are skipped in the capitalized pass.
        /// </param>
        private void calculateEmissionTestCorpus(List <Tokenizer.WordTag> testWords)
        {
            // Pass 1: capitalized words, matched exactly as written.
            foreach (var tw in testWords)
            {
                // The emptiness check prevents tw.word[0] from throwing on an empty or null word.
                if (string.IsNullOrEmpty(tw.word) || !char.IsUpper(tw.word[0]))
                {
                    continue;
                }

                string sWord = tw.word;
                PartOfSpeechModel.EmissionModel wmFind = WordCapitalizedTagsEmissionFrequence.Find(x => x.Word == sWord);
                if (wmFind == null)
                {
                    continue; // word has no frequency data
                }

                if (WordCapitalizedTagsEmissionProbabilities.Find(x => x.Word == sWord) == null)
                {
                    this.WordCapitalizedTagsEmissionProbabilities.Add(this.BuildEmissionProbabilities(wmFind));
                }
            }

            // Pass 2: every word, matched in lower-cased form.
            foreach (var tw in testWords)
            {
                if (tw.word == null)
                {
                    continue; // nothing to look up; also avoids a NullReferenceException on ToLower()
                }

                string sWord = tw.word.ToLower();

                PartOfSpeechModel.EmissionModel wmFind = WordTagsEmissionFrequence.Find(x => x.Word == sWord);
                if (wmFind == null)
                {
                    continue; // word has no frequency data
                }

                if (WordTagsEmissionProbabilities.Find(x => x.Word == sWord) == null)
                {
                    this.WordTagsEmissionProbabilities.Add(this.BuildEmissionProbabilities(wmFind));
                }
            }
        }

        /// <summary>
        /// Converts the tag counts of one word into emission probabilities using the counts in
        /// <c>UnigramFrequence</c>.
        /// </summary>
        /// <param name="frequencies">Frequency model of the word; its <c>Word</c> and <c>TagFreq</c> are read.</param>
        /// <returns>A new probability model for the same word, with one entry per tag in <c>TagFreq</c>.</returns>
        private EmissionProbabilisticModel BuildEmissionProbabilities(PartOfSpeechModel.EmissionModel frequencies)
        {
            EmissionProbabilisticModel epModel = new EmissionProbabilisticModel();
            epModel.Word = frequencies.Word;

            foreach (var tf in frequencies.TagFreq)
            {
                // NOTE(review): assuming UnigramFrequence holds value-type key/value pairs, a tag
                // missing from it yields a count of 0 here and the division below produces
                // Infinity -- confirm every tag seen in TagFreq is also counted as a unigram.
                int    cti   = this.UnigramFrequence.FirstOrDefault(x => x.Key == tf.Key).Value;
                double pwiti = (double)tf.Value / cti; // Emission probability: p(wi/ti) = C(ti, wi) / C(ti)
                epModel.TagFreq.Add(tf.Key, pwiti);
            }

            return epModel;
        }