/// <summary>
/// Rebuilds the per-word tag frequency tables from the supplied word/tag pairs.
/// </summary>
/// <remarks>
/// <c>WordCapitalizedTagsEmissionFrequence</c> and <c>WordTagsEmissionFrequence</c> are both
/// replaced with fresh lists. Every entry of <paramref name="uncapitalizedWords"/> is additionally
/// reported to <c>AddTagToUnigramOccurences</c>; entries of <paramref name="capitalizedWords"/> are not.
/// </remarks>
/// <param name="uncapitalizedWords">Pairs counted into <c>WordTagsEmissionFrequence</c>.</param>
/// <param name="capitalizedWords">Pairs counted into <c>WordCapitalizedTagsEmissionFrequence</c>.</param>
private void CalculateEmissionForWordTags(List<Tokenizer.WordTag> uncapitalizedWords, List<Tokenizer.WordTag> capitalizedWords)
{
    this.WordCapitalizedTagsEmissionFrequence = new List<EmissionModel>();
    this.WordTagsEmissionFrequence = new List<EmissionModel>();

    // The lists start empty, so a local word -> model index stays in sync with them
    // and replaces a linear List.Find per processed word.
    var capitalizedByWord = new Dictionary<string, EmissionModel>();
    foreach (var entry in capitalizedWords)
    {
        IncrementEmissionCount(this.WordCapitalizedTagsEmissionFrequence, capitalizedByWord, entry);
    }

    var uncapitalizedByWord = new Dictionary<string, EmissionModel>();
    foreach (var entry in uncapitalizedWords)
    {
        // Only the uncapitalized pass feeds the unigram tag counts, once per entry.
        this.AddTagToUnigramOccurences(entry.tag);
        IncrementEmissionCount(this.WordTagsEmissionFrequence, uncapitalizedByWord, entry);
    }
}

/// <summary>
/// Records one occurrence of <c>entry.tag</c> for <c>entry.word</c>, creating the word's
/// model (and appending it to <paramref name="models"/>) the first time the word is seen.
/// </summary>
/// <param name="models">List that owns the models, in first-seen order.</param>
/// <param name="modelsByWord">Index over <paramref name="models"/> keyed by word.</param>
/// <param name="entry">The word/tag pair to count.</param>
private static void IncrementEmissionCount(List<EmissionModel> models, Dictionary<string, EmissionModel> modelsByWord, Tokenizer.WordTag entry)
{
    EmissionModel model;
    if (entry.word == null)
    {
        // Dictionary keys cannot be null; keep the list search for this corner case.
        model = models.Find(x => x.Word == null);
    }
    else
    {
        modelsByWord.TryGetValue(entry.word, out model);
    }

    if (model == null)
    {
        model = new EmissionModel();
        model.Word = entry.word;
        model.TagFreq.Add(entry.tag, 1);
        models.Add(model);
        if (entry.word != null)
        {
            modelsByWord.Add(entry.word, model);
        }

        return;
    }

    int count;
    if (model.TagFreq.TryGetValue(entry.tag, out count))
    {
        model.TagFreq[entry.tag] = count + 1;
    }
    else
    {
        model.TagFreq.Add(entry.tag, 1);
    }
}
/// <summary>
/// Converts stored tag frequencies into emission probabilities for the words of a test corpus.
/// </summary>
/// <remarks>
/// Two passes are made over <paramref name="testWords"/>. The first looks up words whose first
/// character is upper-case, verbatim, in <c>WordCapitalizedTagsEmissionFrequence</c> and appends
/// results to <c>WordCapitalizedTagsEmissionProbabilities</c>. The second lower-cases every word
/// with <c>ToLower()</c>, looks it up in <c>WordTagsEmissionFrequence</c> and appends results to
/// <c>WordTagsEmissionProbabilities</c>. A word gets a probability model only if it has a frequency
/// model and no probability model yet. Null words are skipped in both passes; empty words are
/// skipped in the first pass.
/// </remarks>
/// <param name="testWords">Word/tag pairs of the test corpus; only the words are read.</param>
private void calculateEmissionTestCorpus(List<Tokenizer.WordTag> testWords)
{
    foreach (var tw in testWords)
    {
        // A null or empty word has no first character to inspect.
        if (string.IsNullOrEmpty(tw.word) || !char.IsUpper(tw.word[0]))
        {
            continue;
        }

        this.AddEmissionProbabilitiesForWord(
            tw.word,
            this.WordCapitalizedTagsEmissionFrequence,
            this.WordCapitalizedTagsEmissionProbabilities);
    }

    foreach (var tw in testWords)
    {
        if (tw.word == null)
        {
            continue;
        }

        this.AddEmissionProbabilitiesForWord(
            tw.word.ToLower(),
            this.WordTagsEmissionFrequence,
            this.WordTagsEmissionProbabilities);
    }
}

/// <summary>
/// Appends a probability model for <paramref name="word"/> to <paramref name="probabilities"/>
/// when <paramref name="frequencies"/> has a model for it and <paramref name="probabilities"/> does not.
/// </summary>
/// <param name="word">Word to look up, compared with <c>==</c> against each model's <c>Word</c>.</param>
/// <param name="frequencies">Frequency models to read from.</param>
/// <param name="probabilities">Probability models to search and append to.</param>
private void AddEmissionProbabilitiesForWord(string word, List<PartOfSpeechModel.EmissionModel> frequencies, List<EmissionProbabilisticModel> probabilities)
{
    PartOfSpeechModel.EmissionModel frequencyModel = frequencies.Find(x => x.Word == word);
    if (frequencyModel == null)
    {
        return;
    }

    // Only pay for the second search once the word is known to have frequencies.
    if (probabilities.Find(x => x.Word == word) != null)
    {
        return;
    }

    EmissionProbabilisticModel probabilityModel = new EmissionProbabilisticModel();
    probabilityModel.Word = frequencyModel.Word;
    foreach (var tagCount in frequencyModel.TagFreq)
    {
        // NOTE(review): when the tag is absent from UnigramFrequence, FirstOrDefault presumably
        // yields a zero count and the quotient below is non-finite - confirm every tag is present.
        int tagTotal = this.UnigramFrequence.FirstOrDefault(x => x.Key == tagCount.Key).Value;

        // Emission probability: p(wi/ti) = C(ti, wi) / C(ti)
        double emission = (double)tagCount.Value / tagTotal;
        probabilityModel.TagFreq.Add(tagCount.Key, emission);
    }

    probabilities.Add(probabilityModel);
}