/// <summary>
/// Computes the discounted probability and lower-order interpolation weight for
/// every n-gram sharing the same prefix context, and writes them to the output.
/// </summary>
/// <param name="context">The n-gram prefix context (space-separated words).</param>
/// <param name="vec_ngram">All n-grams (of the same order) sharing this context.</param>
/// <param name="discount">Discount constants for the current n-gram order.</param>
/// <param name="interpolate">Whether to interpolate with the lower-order model.</param>
/// <param name="sw">Writer receiving "context word\tprob lower_weight" lines.</param>
static void calc_ngram(string context, List<Ngram> vec_ngram, Discount discount, bool interpolate, StreamWriter sw)
{
    int order = context.Split().Length;

    // First pass: accumulate occurrence statistics over every non-unigram
    // entry with this context; unigrams are emitted immediately instead.
    NgramStat cur_stat = new NgramStat();
    cur_stat.reset();
    foreach (Ngram entry in vec_ngram)
    {
        if (entry.word == null)
        {
            // A null word marks a unigram; output it if it meets the cutoff.
            if (entry.occur >= g_min_count[0])
            {
                calc_unigram(context, entry.occur, g_uni_stat, g_discounts[0], sw);
            }
        }
        else
        {
            cur_stat.count(entry.occur);
        }
    }

    // Weight assigned to the lower-order model; zero when not interpolating.
    double lower_weight = interpolate ? discount.get_lower_weight(cur_stat) : 0.0;

    // Second pass: emit the discounted probability of each qualifying n-gram.
    foreach (Ngram entry in vec_ngram)
    {
        if (entry.word == null)
        {
            continue; // unigrams were already handled in the first pass
        }
        if (entry.occur >= g_min_count[order])
        {
            double prob = discount.get_discount(entry.occur) / (double)cur_stat.m_total;
            sw.WriteLine("{0} {1}\t{2} {3}", context, entry.word, prob, lower_weight);
        }
    }
}
/// <summary>
/// Entry point: reads n-gram count lines, regroups them by prefix context,
/// then computes discounted probabilities and lower-order weights per group.
/// Usage: kn_lower1 [input file] [output file] [option list]
///   -i        enable interpolation with the lower-order model
///   -f:FILE   load unigram statistics and discount constants from FILE
///   -c:LIST   comma-separated per-order minimum count cutoffs
/// </summary>
static void Main(string[] args)
{
    if (args.Length < 2)
    {
        Console.WriteLine("kn_lower1 [input file] [output file] [option list]");
        return;
    }

    string strLine = null;
    string[] items = null;
    bool interpolate = false;

    // Parse optional switches (everything after the two file arguments).
    for (int i = 2; i < args.Length; i++)
    {
        string[] optPair = args[i].Split(':');
        if (optPair[0] == "-i")
        {
            interpolate = true;
        }
        else if (optPair[0] == "-f")
        {
            // Guard: "-f" with no ":file" part would otherwise crash below.
            if (optPair.Length < 2)
            {
                Console.WriteLine("error: option {0} requires a value", args[i]);
                return;
            }
            if (!LoadStatFile(optPair[1], g_uni_stat, g_discounts))
            {
                Console.WriteLine("error: failed to load file {0}", optPair[1]);
                return;
            }
        }
        else if (optPair[0] == "-c")
        {
            // Guard: "-c" with no ":list" part would otherwise crash below.
            if (optPair.Length < 2)
            {
                Console.WriteLine("error: option {0} requires a value", args[i]);
                return;
            }
            items = optPair[1].Split(',');
            for (int j = 0; j < items.Length; j++)
            {
                g_min_count[j] = long.Parse(items[j]);
            }
        }
    }

    SortedDictionary<string, List<Ngram>> kv = new SortedDictionary<string, List<Ngram>>();

    // using blocks guarantee the streams are closed even if an exception
    // is thrown mid-way (the originals leaked on any parse/IO failure).
    using (StreamReader sr = new StreamReader(args[0]))
    using (StreamWriter sw = new StreamWriter(args[1]))
    {
        while ((strLine = sr.ReadLine()) != null)
        {
            //if input is 1-gram, just keep the same
            //if input is n-gram (such as "a b c\t3"), then output "a b\tc\t3";
            if (strLine.Contains(" "))
            {
                //This is not unigram, so we convert "word1 word2 ... wordN \t frequency" to
                //"word1 word2 ... \t wordN \t frequency"
                items = strLine.Split(' ');
                strLine = String.Join(" ", items, 0, items.Length - 1);
                strLine = strLine.Trim() + "\t" + items[items.Length - 1];
            }

            items = strLine.Split('\t');
            string strContext = items[0];

            Ngram v = new Ngram();
            if (items.Length == 2)
            {
                //This is unigram: no predicted word, only a count
                v.word = null;
                v.occur = long.Parse(items[1]);
            }
            else if (items.Length == 3)
            {
                //This is not unigram
                v.word = items[1];
                v.occur = long.Parse(items[2]);
            }
            else
            {
                // Malformed line: skip it instead of silently adding a
                // default-initialized Ngram (the original did the latter).
                Console.WriteLine("warning: skipping malformed line: {0}", strLine);
                continue;
            }

            // Single dictionary lookup instead of ContainsKey + Add + indexer.
            List<Ngram> group;
            if (!kv.TryGetValue(strContext, out group))
            {
                group = new List<Ngram>();
                kv.Add(strContext, group);
            }
            group.Add(v);
        }

        foreach (KeyValuePair<string, List<Ngram>> pair in kv)
        {
            int order = pair.Key.Split().Length;
            if (g_discounts[order] == null)
            {
                // Message fixed: was "Invalidate {0}-gram".
                Console.WriteLine("Error: Invalid {0}-gram: {1}", order, pair.Key);
                continue;
            }
            calc_ngram(pair.Key, pair.Value, g_discounts[order], interpolate, sw);
        }
    }
}