Beispiel #1
0
        /**
         * @brief Compute the interpolation weight assigned to the lower-order model.
         *
         * The result is
         *   (m_d1 * (min1 - min2) + m_d2 * (min2 - min3) + m_d3plus * min3) / total
         * where min1, min2, min3 and total are read from @p stat.
         *
         * @param stat, occurrence statistics supplying m_total, m_min1, m_min2 and m_min3
         * @return the weight as a double; no guard is applied when m_total is zero
         **/
        public double get_lower_weight(NgramStat stat)
        {
            // Widen every statistic to double before doing any arithmetic.
            double n1 = stat.m_min1;
            double n2 = stat.m_min2;
            double n3 = stat.m_min3;
            double denominator = stat.m_total;

            // Discount mass, one term per discount constant.
            double numerator = m_d1 * (n1 - n2) + m_d2 * (n2 - n3) + m_d3plus * n3;

            return numerator / denominator;
        }
Beispiel #2
0
        /// <summary>
        /// Computes the probability and lower-order weight for every n-gram that
        /// shares one context and writes one line per qualifying n-gram to the output.
        /// </summary>
        /// <param name="context">n-gram prefix context; the number of whitespace-separated
        /// tokens in it selects the entry of g_min_count used as the occurrence threshold</param>
        /// <param name="vec_ngram">all n-grams for this context; an entry whose word is null
        /// is treated as the unigram record of the context itself</param>
        /// <param name="discount">discount constants applied to the n-grams of this context</param>
        /// <param name="interpolate">when false, the lower weight written out is 0.0</param>
        /// <param name="sw">writer receiving the output lines</param>
        static void calc_ngram(string context, List <Ngram> vec_ngram, Discount discount,
                               bool interpolate, StreamWriter sw)
        {
            int       contextLength = context.Split().Length;
            NgramStat contextStat   = new NgramStat();

            contextStat.reset();

            // Pass 1: emit the unigram record (if any) and accumulate the occurrence
            // statistics of all real continuations of this context.
            foreach (Ngram entry in vec_ngram)
            {
                if (entry.word != null)
                {
                    contextStat.count(entry.occur);
                    continue;
                }

                // A null word marks the unigram record.
                if (entry.occur >= g_min_count[0])
                {
                    calc_unigram(context, entry.occur, g_uni_stat, g_discounts[0], sw);
                }
            }

            double lower_weight;
            if (interpolate)
            {
                lower_weight = discount.get_lower_weight(contextStat);
            }
            else
            {
                lower_weight = 0.0;
            }

            // Pass 2: write the discounted probability of each continuation that
            // meets the minimum-occurrence threshold.
            foreach (Ngram entry in vec_ngram)
            {
                // Unigram records were already handled in pass 1.
                if (entry.word == null)
                {
                    continue;
                }

                if (entry.occur < g_min_count[contextLength])
                {
                    continue;
                }

                double prob = discount.get_discount(entry.occur) / (double)contextStat.m_total;
                sw.WriteLine("{0} {1}\t{2} {3}", context, entry.word, prob, lower_weight);
            }
        }
Beispiel #3
0
        /// Discount parameters, one array slot per n-gram order (LM_ORDER slots in total).
        /// NOTE(review): index 0 appears to hold the unigram discounts - confirm against callers.
        public static Discount[] g_discounts = new Discount[LM_ORDER];

        /**
         * Load the unigram statistics and discount constants from file.
         *
         * The first line supplies four integers (total, min1, min2, min3) that are
         * stored into g_uni_stat. Every following non-empty line supplies three
         * floating point values (d1, d2, d3plus) that fill the next slot of
         * discounts. On each line everything up to and including the first tab is
         * ignored; a line without a tab is parsed from its beginning.
         *
         * @param file, file name
         * @param g_uni_stat, unigram statistics to fill in
         * @param discounts, n-gram discount constants to fill in, one slot per line
         *
         * @return true when at least one discount line was loaded; false when the
         *         file is empty or contains no discount lines
         */
        static bool LoadStatFile(string file, NgramStat g_uni_stat, Discount[] discounts)
        {
            // The using block closes the reader even when parsing throws part-way through.
            using (StreamReader sr = new StreamReader(file))
            {
                string strLine = sr.ReadLine();
                if (strLine == null)
                {
                    // Empty file: there is no statistics line to parse.
                    return(false);
                }

                int pIndex = strLine.IndexOf('\t');

                string[] items = strLine.Substring(pIndex + 1).Split();
                g_uni_stat.m_total = long.Parse(items[0]);
                g_uni_stat.m_min1  = long.Parse(items[1]);
                g_uni_stat.m_min2  = long.Parse(items[2]);
                g_uni_stat.m_min3  = long.Parse(items[3]);

                Console.WriteLine("Total:{0} Min1:{1} Min2:{2} Min3:{3}", g_uni_stat.m_total, g_uni_stat.m_min1, g_uni_stat.m_min2, g_uni_stat.m_min3);

                int order = 0;

                // ReadLine returns null once the end of the stream is reached.
                while ((strLine = sr.ReadLine()) != null)
                {
                    if (strLine.Length == 0)
                    {
                        continue;
                    }

                    pIndex = strLine.IndexOf('\t');
                    items  = strLine.Substring(pIndex + 1).Split();

                    discounts[order]          = new Discount();
                    discounts[order].m_d1     = double.Parse(items[0]);
                    discounts[order].m_d2     = double.Parse(items[1]);
                    discounts[order].m_d3plus = double.Parse(items[2]);
                    order++;
                }

                return(order > 0);
            }
        }
Beispiel #4
0
        /// Minimal score for an unknown word. calc_unigram uses it as a base-10
        /// exponent: the value written for LM_BOS is exp(LM_MINSCORE * ln 10).
        const int LM_MINSCORE = -99;

        // Compute the probability of a single unigram and write it out as
        // "<word>\t<prob> 0". LM_BOS is special-cased: it always gets the floor
        // value exp(LM_MINSCORE * ln 10) instead of a count-based probability.
        // Words whose occurrence count is not positive produce no output.
        static void calc_unigram(string word, long occur, NgramStat stat, Discount discount, StreamWriter sw)
        {
            if (word == LM_BOS)
            {
                // Effectively zero, but written as a tiny positive number.
                sw.WriteLine("{0}\t{1} 0", word, Math.Exp((double)LM_MINSCORE * Math.Log(10.0)));
                return;
            }

            // Nothing to emit for a unigram that was never seen.
            if (occur <= 0)
            {
                return;
            }

            // Discounted relative frequency of the word.
            double discounted = discount.get_discount(occur) / (double)stat.m_total;

            // Share of the lower weight added to each unigram so that the
            // unigram probabilities sum to 1.
            double redistributed = discount.get_lower_weight(stat) / (double)stat.m_min1;

            sw.WriteLine("{0}\t{1} 0", word, discounted + redistributed);
        }