Пример #1
0
        // single word score, multiwords is 0.0
        /// <summary>
        /// Scores a word as its word count divided by the maximum word count
        /// held by <paramref name="wordWcMap"/>.
        /// </summary>
        /// <param name="inWord">The word to score; passed straight to <c>GetWc</c>.</param>
        /// <param name="wordWcMap">Word-count map supplying both the word count and the maximum count.</param>
        /// <returns>The ratio <c>wc / maxWc</c> as a double.</returns>
        private static double GetScoreDev1(string inWord, WordWcMap wordWcMap)
        {
            // Widen both operands to double up front so the division is floating point.
            double denominator = wordWcMap.GetMaxWc();
            double numerator   = GetWc(inWord, wordWcMap);

            return numerator / denominator;
        }
Пример #2
0
        // private method
        // possibility = WC(word) / max_word_count
        /// <summary>
        /// Returns the word count of <paramref name="inWord"/> divided by the
        /// maximum word count in <paramref name="wordWcMap"/>.
        /// </summary>
        /// <param name="inWord">The word to look up.</param>
        /// <param name="wordWcMap">Word-count map supplying the counts.</param>
        /// <param name="caseFlag">
        /// Forwarded unchanged to <c>GetWc</c>; presumably controls case handling
        /// of the lookup (TODO confirm against <c>GetWc</c>).
        /// </param>
        /// <returns>The ratio <c>wc / maxWc</c> as a double.</returns>
        private static double GetWordPossOverMaxWc(string inWord, WordWcMap wordWcMap, bool caseFlag)
        {
            // Implicit widening to double replaces the explicit "1.0d *" factor.
            double wordCount = GetWc(inWord, wordWcMap, caseFlag);
            double maxCount  = wordWcMap.GetMaxWc();

            return wordCount / maxCount;
        }
Пример #3
0
        // org code from baseline, TBM
        /// <summary>
        /// Computes a log-scaled corpus frequency score,
        /// <c>log(freq) / log(maxWc)</c>, for a word or multiword.
        /// </summary>
        /// <remarks>
        /// The frequency is looked up directly in the word-count map. When that
        /// lookup yields 0 and the input is a multiword (contains spaces) or ends
        /// with <c>'s</c>, the minimum frequency of its non-empty parts is used
        /// instead.
        /// </remarks>
        /// <param name="inWord">The word or space-separated multiword to score.</param>
        /// <param name="wordWcMap">Word-count map supplying per-word counts and the maximum count.</param>
        /// <returns>
        /// 0.0 when the resolved frequency is 0; otherwise <c>log(freq) / log(maxWc)</c>.
        /// </returns>
        public static double GetCorpusFreqScore(string inWord, WordWcMap wordWcMap)
        {
            Dictionary <string, int> wWcMap = wordWcMap.GetWordWcMap();

            // Single lookup; TryGetValue leaves freq at 0 when the word is absent.
            int freq;
            wWcMap.TryGetValue(inWord, out freq);

            // Check if inWord is a multiword. The previous call, Split("[ ]"), passed a
            // regex character class (this code looks ported from Java), but .NET's
            // String.Split treats a string separator literally, so input was never
            // split on spaces. Trailing spaces are trimmed first so that a trailing
            // separator alone does not make a single word count as a multiword.
            IList <string> spls    = inWord.TrimEnd(' ').Split(' ').ToList();
            bool           isSplit = spls.Count >= 2;

            if (!isSplit && inWord.EndsWith("'s", StringComparison.Ordinal))
            {
                // Possessive: treat "XXX's" as the parts "XXX" and "'s".
                // Known limitation: every XXX's gets the same score whenever
                // XXX is more frequent than 's, because the minimum is used.
                spls = new List <string>
                {
                    inWord.Substring(0, inWord.Length - 2),
                    "'s"
                };
                isSplit = true;
            }

            // Use the min. wc of the split words in the multiword's case.
            if (freq == 0 && isSplit)
            {
                bool found = false;
                int  min   = 0;
                foreach (string spl in spls)
                {
                    if (String.IsNullOrEmpty(spl))
                    {
                        continue;
                    }
                    int splFreq;
                    wWcMap.TryGetValue(spl, out splFreq);
                    // Negative counts are ignored, as before.
                    if (splFreq >= 0 && (!found || splFreq < min))
                    {
                        min   = splFreq;
                        found = true;
                    }
                }
                // Previously an int.MaxValue sentinel leaked into freq when no part
                // produced a usable count; freq now stays 0 in that case.
                freq = found ? min : 0;
            }
            if (freq == 0)
            {
                return(0.0);
            }
            // NOTE(review): if maxWc is 1, Math.Log(maxWc) is 0 and the result is
            // NaN or infinity - confirm whether GetMaxWc() can return 1.
            long maxWc = wordWcMap.GetMaxWc();

            return Math.Log(freq) / Math.Log(maxWc);
        }
Пример #4
0
        // org code from baseline, TBM, From Ensemble
        /// <summary>
        /// Computes a unigram frequency score,
        /// <c>log(freq / totalWc) / log(maxWc / totalWc)</c>, for a word or multiword.
        /// </summary>
        /// <remarks>
        /// The frequency is looked up directly in the word-count map. When that
        /// lookup yields 0 and the input splits into two or more parts, or ends
        /// with <c>'s</c>, the minimum frequency of its non-empty parts is used
        /// instead.
        /// </remarks>
        /// <param name="inWord">The word or multiword to score.</param>
        /// <param name="wordWcMap">Word-count map supplying per-word, maximum and total counts.</param>
        /// <returns>
        /// 0.0 when the resolved frequency is 0 (avoids log(0)); otherwise the
        /// ratio of logs described above.
        /// </returns>
        public static double GetUnigramFreqScore(string inWord, WordWcMap wordWcMap)
        {
            Dictionary <string, int> wWcMap = wordWcMap.GetWordWcMap();

            // Single lookup; TryGetValue leaves freq at 0 when the word is absent.
            int freq;
            wWcMap.TryGetValue(inWord, out freq);

            // NOTE(review): Split(string, bool) is not a framework overload; presumably
            // a project extension doing a regex split on "[ ]" - verify.
            IList <string> spls    = inWord.Split("[ ]", true).ToList();
            bool           isSplit = spls.Count >= 2;

            if (!isSplit && inWord.EndsWith("'s", StringComparison.Ordinal))
            {
                // Possessive: treat "XXX's" as the parts "XXX" and "'s".
                spls = new List <string>
                {
                    inWord.Substring(0, inWord.Length - 2),
                    "'s"
                };
                isSplit = true;
            }

            // Use the min. wc of the split words in the multiword's case.
            if (freq == 0 && isSplit)
            {
                bool found = false;
                int  min   = 0;
                foreach (string spl in spls)
                {
                    if (String.IsNullOrEmpty(spl))
                    {
                        continue;
                    }
                    int splFreq;
                    wWcMap.TryGetValue(spl, out splFreq);
                    // Negative counts are ignored, as before.
                    if (splFreq >= 0 && (!found || splFreq < min))
                    {
                        min   = splFreq;
                        found = true;
                    }
                }
                // Previously an int.MaxValue sentinel leaked into freq when no part
                // produced a usable count; freq now stays 0 in that case.
                freq = found ? min : 0;
            }
            if (freq == 0)
            {
                return(0.0);                // to avoid infinity
            }
            long maxWc   = wordWcMap.GetMaxWc();
            long totalWc = wordWcMap.GetTotalWc();

            // NOTE(review): when maxWc equals totalWc the denominator is log(1) = 0,
            // giving NaN or infinity - confirm whether that can occur.
            double numerator   = Math.Log(1.0 * freq / totalWc);
            double denominator = Math.Log(1.0 * maxWc / totalWc);

            return numerator / denominator;
        }
Пример #5
0
        // get score for single word and multiwords (for split cases)
        // 1). multiword: score = avg. score of allwords
        // 2). single word: score =  log(adjust WC) / log (adjust Max. WC).
        /// <summary>
        /// Scores a term as the arithmetic mean of the per-word scores of the
        /// words it is broken into.
        /// </summary>
        /// <param name="inWord">The word or multiword to score.</param>
        /// <param name="wordWcMap">Word-count map used for the maximum count and per-word scoring.</param>
        /// <returns>
        /// The average of <c>GetWordScore</c> over the word list, or 0.0 when
        /// the word list is empty.
        /// </returns>
        public static double GetAdjustScoreAvg(string inWord, WordWcMap wordWcMap)
        {
            // Break the term into words; normFlag is false (per the original
            // note: "don't use punctuation for determiner").
            List <string> tokens     = TermUtil.ToWordList(inWord, false);
            long          tokenCount = tokens.Count;
            double        adjMaxWc   = GetAdjustedWc(wordWcMap.GetMaxWc());

            if (tokenCount <= 0)
            {
                return 0.0;
            }

            // Sum the individual word scores, then average them.
            double sum = 0.0;
            for (int i = 0; i < tokenCount; i++)
            {
                sum += GetWordScore(tokens[i], adjMaxWc, wordWcMap);
            }
            return sum / tokenCount;
        }
Пример #6
0
        /// <summary>
        /// Scores a term as the minimum of the per-word scores of the words it
        /// is broken into.
        /// </summary>
        /// <param name="inWord">The word or multiword to score.</param>
        /// <param name="wordWcMap">Word-count map used for the maximum count and per-word scoring.</param>
        /// <returns>
        /// The smallest <c>GetWordScore</c> value over the word list, or 0.0 when
        /// no word produced a score below <c>int.MaxValue</c> (including an empty
        /// word list).
        /// </returns>
        public static double GetAdjustScoreMin(string inWord, WordWcMap wordWcMap)
        {
            // Break the term into words; normFlag is false (per the original
            // note: "don't use punctuation for determiner").
            List <string> tokens     = TermUtil.ToWordList(inWord, false);
            int           tokenCount = tokens.Count;
            double        adjMaxWc   = GetAdjustedWc(wordWcMap.GetMaxWc());

            // int.MaxValue doubles as the "no score seen yet" marker.
            double lowest = int.MaxValue;

            for (int i = 0; i < tokenCount; i++)
            {
                double candidate = GetWordScore(tokens[i], adjMaxWc, wordWcMap);
                // Plain comparison (not Math.Min) so a NaN candidate is ignored.
                if (candidate < lowest)
                {
                    lowest = candidate;
                }
            }
            return (lowest < int.MaxValue) ? lowest : 0.0;
        }