/// <summary>
/// Compute the interpolation (back-off) weight from the discount constants and
/// the current n-gram occurrence statistics.
/// NOTE(review): m_min1/m_min2/m_min3 appear to be counts of n-grams seen at
/// least once/twice/three times — confirm against NgramStat.count().
/// </summary>
public double get_lower_weight(NgramStat stat)
{
    double seenOnce = stat.m_min1;
    double seenTwice = stat.m_min2;
    double seenThrice = stat.m_min3;

    // Mass reserved by each discount tier, normalized by the total count.
    double reserved = m_d1 * (seenOnce - seenTwice)
                    + m_d2 * (seenTwice - seenThrice)
                    + m_d3plus * seenThrice;
    return reserved / (double)stat.m_total;
}
/// <summary>
/// Compute the probability (and, when interpolating, the shared back-off
/// weight) of every n-gram with the same prefix context, writing results to
/// the output stream.
/// </summary>
/// <param name="context">n-gram prefix context (space-separated words)</param>
/// <param name="vec_ngram">all n-grams of the same order sharing this context</param>
/// <param name="discount">discount constants for the current n-gram order</param>
/// <param name="interpolate">whether to compute an interpolation weight</param>
/// <param name="sw">stream the "ngram\tprob weight" lines are written to</param>
static void calc_ngram(string context, List<Ngram> vec_ngram, Discount discount, bool interpolate, StreamWriter sw)
{
    // Order index into g_min_count: number of words in the prefix context.
    int order = context.Split().Length;

    NgramStat contextStat = new NgramStat();
    contextStat.reset();

    // Pass 1: emit unigrams immediately; accumulate occurrence statistics
    // over the higher-order n-grams that share this context.
    foreach (Ngram entry in vec_ngram)
    {
        if (entry.word == null)
        {
            // A null word marks a unigram whose text is the context itself.
            if (entry.occur >= g_min_count[0])
            {
                calc_unigram(context, entry.occur, g_uni_stat, g_discounts[0], sw);
            }
        }
        else
        {
            contextStat.count(entry.occur);
        }
    }

    double lowerWeight = 0.0;
    if (interpolate)
    {
        lowerWeight = discount.get_lower_weight(contextStat);
    }

    // Pass 2: write the discounted probability of each n-gram that clears the
    // per-order minimum-count threshold.
    foreach (Ngram entry in vec_ngram)
    {
        if (entry.word == null)
        {
            continue; // unigram — already handled in pass 1
        }
        if (entry.occur >= g_min_count[order])
        {
            double prob = discount.get_discount(entry.occur) / (double)contextStat.m_total;
            sw.WriteLine("{0} {1}\t{2} {3}", context, entry.word, prob, lowerWeight);
        }
    }
}
public static Discount[] g_discounts = new Discount[LM_ORDER]; ///< discounts parameter

/// <summary>
/// Load the unigram statistics and the per-order discount constants from file.
/// Expected format: first line "label\ttotal min1 min2 min3", then one
/// non-empty line of "label\td1 d2 d3plus" per n-gram order.
/// </summary>
/// <param name="file">file name</param>
/// <param name="g_uni_stat">receives the unigram statistics from the first line</param>
/// <param name="discounts">receives one Discount per subsequent non-empty line</param>
/// <returns>true if at least one set of discount constants was loaded</returns>
static bool LoadStatFile(string file, NgramStat g_uni_stat, Discount[] discounts)
{
    // using guarantees the reader is closed even when parsing throws
    // (the original leaked the file handle on a malformed line).
    using (StreamReader sr = new StreamReader(file))
    {
        string strLine = sr.ReadLine();
        if (strLine == null)
        {
            // Empty file: nothing to load (the original threw NullReferenceException here).
            return false;
        }

        // The stat file is machine-generated, so parse with the invariant
        // culture: in comma-decimal locales double.Parse("0.5") would
        // otherwise be read as 5 (thousands separator).
        System.Globalization.CultureInfo inv = System.Globalization.CultureInfo.InvariantCulture;

        int pIndex = strLine.IndexOf('\t');
        string[] items = strLine.Substring(pIndex + 1).Split();
        g_uni_stat.m_total = long.Parse(items[0], inv);
        g_uni_stat.m_min1 = long.Parse(items[1], inv);
        g_uni_stat.m_min2 = long.Parse(items[2], inv);
        g_uni_stat.m_min3 = long.Parse(items[3], inv);
        Console.WriteLine("Total:{0} Min1:{1} Min2:{2} Min3:{3}",
            g_uni_stat.m_total, g_uni_stat.m_min1, g_uni_stat.m_min2, g_uni_stat.m_min3);

        int order = 0;
        while (sr.EndOfStream == false)
        {
            strLine = sr.ReadLine();
            if (strLine.Length == 0)
            {
                continue;
            }
            if (order >= discounts.Length)
            {
                // More discount lines than supported orders: stop instead of
                // indexing past the end of the array.
                break;
            }
            pIndex = strLine.IndexOf('\t');
            items = strLine.Substring(pIndex + 1).Split();
            discounts[order] = new Discount();
            discounts[order].m_d1 = double.Parse(items[0], inv);
            discounts[order].m_d2 = double.Parse(items[1], inv);
            discounts[order].m_d3plus = double.Parse(items[2], inv);
            order++;
        }
        return (order > 0);
    }
}
const int LM_MINSCORE = -99; ///< minimal score for unknown word

/// <summary>
/// Compute the probability of one unigram and write it as "word\tprob 0".
/// The sentence-start token is emitted with the floor probability 10^LM_MINSCORE
/// (it is never predicted), and unigrams with zero occurrences are skipped.
/// </summary>
static void calc_unigram(string word, long occur, NgramStat stat, Discount discount, StreamWriter sw)
{
    if (word == LM_BOS)
    {
        // Floor probability for <s>: 10^LM_MINSCORE via exp(LM_MINSCORE * ln 10).
        double exponent = (double)LM_MINSCORE * Math.Log(10.0);
        sw.WriteLine("{0}\t{1} 0", word, Math.Exp(exponent));
        return;
    }

    if (occur <= 0)
    {
        return; // skip 1-grams that never occurred
    }

    double prob = discount.get_discount(occur) / (double)stat.m_total;
    // Redistribute the discounted mass so that sum(prob_unigram) = 1.
    // NOTE(review): divides by m_min1 — presumably the number of once-seen
    // types; confirm against NgramStat.count().
    prob += discount.get_lower_weight(stat) / (double)stat.m_min1;
    sw.WriteLine("{0}\t{1} 0", word, prob);
}