Пример #1
0
        // possibility = WC(word) / total_word_count
        // Returns the count of inWord divided by the total word count of the map.
        //   inWord    - the word to look up
        //   wordWcMap - supplies the total count via GetTotalWc()
        //   caseFlag  - forwarded unchanged to GetWc
        // There is no check for a zero total here; the result is whatever
        // double division produces in that case.
        private static double GetWordPossOverTotalWc(string inWord, WordWcMap wordWcMap, bool caseFlag)
        {
            // Both values are widened to double so the division is not integral.
            double wordCount  = GetWc(inWord, wordWcMap, caseFlag);
            double corpusSize = wordWcMap.GetTotalWc();

            return wordCount / corpusSize;
        }
Пример #2
0
        // should be the same as GetUnigramFreqScore
        // score range is between 0.0 ~ 1.0
        // not used because it is no good
        // Log-ratio score for inWord:
        //   log(wc / totalWc) / log(maxWc / totalWc)
        // where wc comes from GetWc and totalWc from wordWcMap.GetTotalWc().
        //   inWord    - the word to score
        //   maxWc     - the count used in the denominator's ratio
        //   wordWcMap - supplies the word count and the total count
        // Returns 0.0 when the word has no positive count. This mirrors the
        // zero-frequency guard in GetUnigramFreqScore: Math.Log(0) is negative
        // infinity, which would otherwise make the score infinite.
        private static double GetWordScore2(string inWord, double maxWc, WordWcMap wordWcMap)
        {
            double wc      = 1.0d * GetWc(inWord, wordWcMap);
            double totalWc = 1.0d * wordWcMap.GetTotalWc();

            if (wc <= 0.0d)
            {
                return 0.0d;                // to avoid infinity / NaN from Math.Log
            }

            return Math.Log(wc / totalWc) / Math.Log(maxWc / totalWc);
        }
Пример #3
0
        // Returns the count of inWord (from GetWc) divided by the total word
        // count reported by wordWcMap.GetTotalWc().
        // There is no check for a zero total here.
        private static double GetScoreByPeter(string inWord, WordWcMap wordWcMap)
        {
            // The total is read first, then the word count; both are held as
            // double so the quotient is a floating-point ratio.
            double corpusTotal = wordWcMap.GetTotalWc();
            double wordCount   = GetWc(inWord, wordWcMap);

            return wordCount / corpusTotal;
        }
Пример #4
0
        // org code from baseline, TBM, From Ensemble
        // Scores inWord by unigram frequency:
        //   log(freq / totalWc) / log(maxWc / totalWc)
        // freq is the count stored for inWord in the map returned by
        // wordWcMap.GetWordWcMap(). When the whole string has no count and it
        // can be broken into parts (by the "[ ]" split, or by peeling off a
        // trailing "'s"), the smallest count among the non-empty parts is used
        // instead. Returns 0.0 when the resulting frequency is 0.
        // NOTE(review): there is no guard for maxWc == totalWc, where the
        // denominator Math.Log(1.0) is 0 - confirm whether that can occur.
        public static double GetUnigramFreqScore(string inWord, WordWcMap wordWcMap)
        {
            Dictionary<string, int> wWcMap = wordWcMap.GetWordWcMap();

            // Single lookup; TryGetValue leaves freq at 0 when the word is absent.
            int freq;
            wWcMap.TryGetValue(inWord, out freq);

            IList<string> spls    = inWord.Split("[ ]", true).ToList();
            bool          isSplit = spls.Count >= 2;

            // A single token ending in "'s" is treated as two parts: stem + "'s".
            if (!isSplit && inWord.EndsWith("'s", StringComparison.Ordinal))
            {
                spls = new List<string>
                {
                    inWord.Substring(0, inWord.Length - 2),
                    "'s"
                };
                isSplit = true;
            }

            // use the min. wc of split word in the multiword's case
            if (freq == 0 && isSplit)
            {
                int  min       = int.MaxValue;
                bool anyUsable = false;

                foreach (string spl in spls)
                {
                    if (String.IsNullOrEmpty(spl))
                    {
                        continue;
                    }

                    int splFreq;
                    wWcMap.TryGetValue(spl, out splFreq);
                    if (splFreq < 0)
                    {
                        continue;           // negative counts are ignored
                    }

                    anyUsable = true;
                    if (splFreq < min)
                    {
                        min = splFreq;
                    }
                }

                // If no part was examined, min is still the int.MaxValue
                // sentinel; it must not be scored as a real frequency.
                freq = anyUsable ? min : 0;
            }

            if (freq == 0)
            {
                return 0.0;                 // to avoid infinity
            }

            long maxWc   = wordWcMap.GetMaxWc();
            long totalWc = wordWcMap.GetTotalWc();

            return Math.Log(1.0 * freq / totalWc) / Math.Log(1.0 * maxWc / totalWc);
        }