Esempio n. 1
0
        /// <summary>
        /// Processes every n-gram entry that shares the prefix <paramref name="context"/>:
        /// computes a probability for each entry plus one lower-order weight for the
        /// context, and writes the results to <paramref name="sw"/>.
        /// </summary>
        /// <param name="context">The n-gram prefix shared by all entries in <paramref name="vec_ngram"/>.</param>
        /// <param name="vec_ngram">
        /// Entries grouped under the context. An entry whose <c>word</c> is null stands for
        /// the context itself as a unigram and is forwarded to <c>calc_unigram</c>.
        /// </param>
        /// <param name="discount">Discount object used for the non-unigram entries.</param>
        /// <param name="interpolate">
        /// When true the lower-order weight comes from <c>discount.get_lower_weight</c>;
        /// otherwise it is written as 0.0.
        /// </param>
        /// <param name="sw">
        /// Destination writer; each emitted line has the form
        /// "context word&lt;TAB&gt;prob lower_weight".
        /// </param>
        static void calc_ngram(string context, List<Ngram> vec_ngram, Discount discount,
                               bool interpolate, StreamWriter sw)
        {
            // Number of whitespace-separated tokens in the context; used below as the
            // index into g_min_count for the non-unigram entries.
            int order = context.Split().Length;

            NgramStat contextStat = new NgramStat();
            contextStat.reset();

            // Pass 1: accumulate occurrence statistics of the real n-grams, and hand the
            // unigram entry (if any) over to calc_unigram right away.
            foreach (Ngram entry in vec_ngram)
            {
                if (entry.word != null)
                {
                    contextStat.count(entry.occur);
                    continue;
                }

                // Unigram entry: only emitted when it reaches the unigram minimum count.
                if (entry.occur >= g_min_count[0])
                {
                    calc_unigram(context, entry.occur, g_uni_stat, g_discounts[0], sw);
                }
            }

            // The lower-order weight is shared by every n-gram under this context.
            double lower_weight = 0.0;
            if (interpolate)
            {
                lower_weight = discount.get_lower_weight(contextStat);
            }

            // Pass 2: emit one line per n-gram that reaches the minimum count for this
            // order. Unigram entries were already handled in pass 1.
            foreach (Ngram entry in vec_ngram)
            {
                if (entry.word != null && entry.occur >= g_min_count[order])
                {
                    // Discounted count divided by the statistics' m_total value.
                    double prob = discount.get_discount(entry.occur) / (double)contextStat.m_total;
                    sw.WriteLine("{0} {1}\t{2} {3}", context, entry.word, prob, lower_weight);
                }
            }
        }
Esempio n. 2
0
        /// <summary>
        /// Command-line entry point: reads n-gram counts from the input file, groups them
        /// by context and writes the output of <c>calc_ngram</c> for every context to the
        /// output file.
        /// </summary>
        /// <param name="args">
        /// <c>args[0]</c> is the input file and <c>args[1]</c> the output file. Remaining
        /// arguments are options of the form <c>name[:value]</c>:
        /// <c>-i</c> enables interpolation, <c>-f:&lt;file&gt;</c> loads a statistics file via
        /// <c>LoadStatFile</c>, and <c>-c:&lt;n0,n1,...&gt;</c> sets the values of
        /// <c>g_min_count</c> starting at index 0.
        /// </param>
        static void Main(string[] args)
        {
            if (args.Length < 2)
            {
                Console.WriteLine("kn_lower1 [input file] [output file] [option list]");
                return;
            }

            bool interpolate = false;

            for (int i = 2; i < args.Length; i++)
            {
                // Split on the FIRST ':' only, so that an option value may itself contain
                // colons (for example a Windows path: "-f:C:\data\stat.txt").
                string[] optPair = args[i].Split(new char[] { ':' }, 2);
                string optName = optPair[0];

                if (optName == "-i")
                {
                    interpolate = true;
                }
                else if (optName == "-f" || optName == "-c")
                {
                    // Both options need a value; report it instead of failing with an
                    // IndexOutOfRangeException.
                    if (optPair.Length < 2)
                    {
                        Console.WriteLine("error: option {0} requires a value ({0}:<value>)", optName);
                        return;
                    }

                    if (optName == "-f")
                    {
                        if (!LoadStatFile(optPair[1], g_uni_stat, g_discounts))
                        {
                            Console.WriteLine("error: failed to load file {0}", optPair[1]);
                            return;
                        }
                    }
                    else
                    {
                        string[] minCounts = optPair[1].Split(',');
                        for (int j = 0; j < minCounts.Length; j++)
                        {
                            g_min_count[j] = long.Parse(minCounts[j]);
                        }
                    }
                }
            }

            SortedDictionary<string, List<Ngram>> kv = new SortedDictionary<string, List<Ngram>>();

            // 'using' guarantees both files are closed even when parsing or writing throws.
            using (StreamReader sr = new StreamReader(args[0]))
            using (StreamWriter sw = new StreamWriter(args[1]))
            {
                string rawLine;
                while ((rawLine = sr.ReadLine()) != null)
                {
                    // Blank lines carry no n-gram; skip them instead of recording a
                    // zero-count entry under an empty context.
                    if (rawLine.Trim().Length == 0)
                    {
                        continue;
                    }

                    string strLine = rawLine;

                    //if input is 1-gram, just keep the same
                    //if input is n-gram (such as "a b c\t3"), then output "a b\tc\t3";
                    if (strLine.Contains(" "))
                    {
                        //This is not unigram, so we convert "word1 word2 ... wordN \t frequency" to
                        //"word1 word2 ... \t wordN \t frequency"
                        string[] tokens = strLine.Split(' ');
                        strLine = String.Join(" ", tokens, 0, tokens.Length - 1);
                        strLine = strLine.Trim() + "\t" + tokens[tokens.Length - 1];
                    }

                    string[] fields = strLine.Split('\t');
                    if (fields.Length != 2 && fields.Length != 3)
                    {
                        // Neither "word\tcount" nor "context\tword\tcount".
                        Console.WriteLine("Error: malformed input line skipped: {0}", rawLine);
                        continue;
                    }

                    string strContext = fields[0];

                    Ngram v = new Ngram();
                    if (fields.Length == 2)
                    {
                        //This is unigram
                        v.word  = null;
                        v.occur = long.Parse(fields[1]);
                    }
                    else
                    {
                        //This is not unigram
                        v.word  = fields[1];
                        v.occur = long.Parse(fields[2]);
                    }

                    // Single lookup instead of ContainsKey followed by the indexer.
                    List<Ngram> bucket;
                    if (!kv.TryGetValue(strContext, out bucket))
                    {
                        bucket = new List<Ngram>();
                        kv.Add(strContext, bucket);
                    }
                    bucket.Add(v);
                }

                foreach (KeyValuePair<string, List<Ngram>> pair in kv)
                {
                    // Number of whitespace-separated tokens in the context selects the
                    // discount object used for this group.
                    int order = pair.Key.Split().Length;
                    if (g_discounts[order] == null)
                    {
                        Console.WriteLine("Error: Invalidate {0}-gram: {1}", order, pair.Key);
                        continue;
                    }

                    calc_ngram(pair.Key, pair.Value, g_discounts[order], interpolate, sw);
                }
            }
        }