/// <summary>
/// Computes emission probabilities for the words of a test corpus and stores them
/// in the capitalized and lower-cased probability lists.
/// </summary>
/// <remarks>
/// Two passes are made over <paramref name="testWords"/>:
/// the first handles words whose first character is upper case, looked up verbatim in
/// <c>WordCapitalizedTagsEmissionFrequence</c>; the second handles every word, looked up
/// in lower case in <c>WordTagsEmissionFrequence</c>. A word is converted only when it has
/// recorded counts and no probability entry yet, so repeated words are processed once.
/// </remarks>
/// <param name="testWords">Word/tag pairs of the test corpus.</param>
private void calculateEmissionTestCorpus(List<Tokenizer.WordTag> testWords)
{
    // Pass 1: capitalized words, matched with their original casing.
    foreach (var tw in testWords)
    {
        // An empty or null word has no first character; indexing it would throw.
        if (string.IsNullOrEmpty(tw.word) || !char.IsUpper(tw.word[0]))
        {
            continue;
        }

        string sWord = tw.word;
        PartOfSpeechModel.EmissionModel wmFind = WordCapitalizedTagsEmissionFrequence.Find(x => x.Word == sWord);
        EmissionProbabilisticModel wFind = WordCapitalizedTagsEmissionProbabilities.Find(x => x.Word == sWord);
        if (wmFind != null && wFind == null)
        {
            this.WordCapitalizedTagsEmissionProbabilities.Add(this.BuildEmissionProbabilities(wmFind));
        }
    }

    // Pass 2: all words, matched in lower case.
    foreach (var tw in testWords)
    {
        // A null word cannot be lower-cased; skip it instead of failing the whole run.
        if (tw.word == null)
        {
            continue;
        }

        string sWord = tw.word.ToLower();
        PartOfSpeechModel.EmissionModel wmFind = WordTagsEmissionFrequence.Find(x => x.Word == sWord);
        EmissionProbabilisticModel wFind = WordTagsEmissionProbabilities.Find(x => x.Word == sWord);
        if (wmFind != null && wFind == null)
        {
            this.WordTagsEmissionProbabilities.Add(this.BuildEmissionProbabilities(wmFind));
        }
    }
}

/// <summary>
/// Converts the per-tag counts of one word into emission probabilities.
/// </summary>
/// <param name="wordCounts">The word together with its count for each tag.</param>
/// <returns>A new model carrying the same word and one probability per tag.</returns>
private EmissionProbabilisticModel BuildEmissionProbabilities(PartOfSpeechModel.EmissionModel wordCounts)
{
    EmissionProbabilisticModel epModel = new EmissionProbabilisticModel();
    epModel.Word = wordCounts.Word;
    foreach (var tf in wordCounts.TagFreq)
    {
        // C(ti): total count of the tag, taken from the unigram frequencies.
        // NOTE(review): if the tag is missing from UnigramFrequence the outcome depends on
        // the element type (a default KeyValuePair would give a count of 0 and therefore a
        // non-finite probability) - confirm every emitted tag is always present there.
        int cti = this.UnigramFrequence.FirstOrDefault(x => x.Key == tf.Key).Value;

        // Emission probability: p(wi/ti) = C(ti, wi) / C(ti)
        double pwiti = (double)tf.Value / cti;
        epModel.TagFreq.Add(tf.Key, pwiti);
    }

    return epModel;
}
/// <summary>
/// Calculates smoothed emission probabilities for the suffixes and prefixes of
/// capitalized and uncapitalized training words.
/// </summary>
/// <remarks>
/// For every affix, each tag probability is
/// <c>(count + smoothing) / (sum of the affix's tag counts + smoothing * size)</c>,
/// where <c>size</c> is <paramref name="suffSize"/> for the suffix lists and
/// <paramref name="prefSize"/> for the prefix lists. Results are appended to the
/// corresponding probability lists of this instance; existing entries are not cleared.
/// </remarks>
/// <param name="capitalSuff">Suffix tag counts; results go to <c>SuffixCapitalizedWordEmissionProbabilities</c>.</param>
/// <param name="capitalPref">Prefix tag counts; results go to <c>PrefixCapitalizedWordEmissionProbabilities</c>.</param>
/// <param name="suffxem">Suffix tag counts; results go to <c>SuffixEmissionProbabilities</c>.</param>
/// <param name="preffxem">Prefix tag counts; results go to <c>PrefixEmissionProbabilities</c>.</param>
/// <param name="smoothing">Constant added to every tag count (additive smoothing).</param>
/// <param name="prefSize">Multiplier of <paramref name="smoothing"/> in the denominator for prefix lists.</param>
/// <param name="suffSize">Multiplier of <paramref name="smoothing"/> in the denominator for suffix lists.</param>
private void CalculateSuffixPrefixProbabilities(
    List<EmissionModel> capitalSuff,
    List<EmissionModel> capitalPref,
    List<EmissionModel> suffxem,
    List<EmissionModel> preffxem,
    int smoothing,
    int prefSize,
    int suffSize)
{
    foreach (var sfx in capitalSuff)
    {
        this.SuffixCapitalizedWordEmissionProbabilities.Add(BuildSmoothedAffixModel(sfx, smoothing, suffSize));
    }

    foreach (var pfx in capitalPref)
    {
        this.PrefixCapitalizedWordEmissionProbabilities.Add(BuildSmoothedAffixModel(pfx, smoothing, prefSize));
    }

    foreach (var sfx in suffxem)
    {
        this.SuffixEmissionProbabilities.Add(BuildSmoothedAffixModel(sfx, smoothing, suffSize));
    }

    foreach (var pfx in preffxem)
    {
        this.PrefixEmissionProbabilities.Add(BuildSmoothedAffixModel(pfx, smoothing, prefSize));
    }
}

/// <summary>
/// Turns the tag counts of a single affix into smoothed tag probabilities.
/// </summary>
/// <param name="affix">The affix text together with its count for each tag.</param>
/// <param name="smoothing">Constant added to every tag count.</param>
/// <param name="size">Multiplier of <paramref name="smoothing"/> in the denominator.</param>
/// <returns>A new model carrying the affix text and one probability per tag.</returns>
private static EmissionProbabilisticModel BuildSmoothedAffixModel(EmissionModel affix, int smoothing, int size)
{
    // Total number of observations of this affix across all of its tags.
    var tagSum = affix.TagFreq.Sum(x => x.Value);

    Dictionary<string, double> tgfreq = new Dictionary<string, double>();
    foreach (var tg in affix.TagFreq)
    {
        // The cast makes this a floating-point division, so a zero denominator yields
        // a non-finite value rather than throwing.
        tgfreq.Add(tg.Key, (double)(tg.Value + smoothing) / (tagSum + (smoothing * size)));
    }

    var em = new EmissionProbabilisticModel();
    em.Word = affix.Word;
    em.TagFreq = tgfreq;
    return em;
}