Пример #1
0
        // org code from baseline, TBM
        /// <summary>
        /// Scores <paramref name="inWord"/> by its corpus word count, computed as
        /// log(freq) / log(maxWc).
        /// </summary>
        /// <remarks>
        /// When the word itself has no count and it is either a space-separated
        /// multiword or ends in "'s", the smallest count found among its parts is
        /// used as the count of the whole word.
        /// </remarks>
        /// <param name="inWord">Word (or multiword) to score.</param>
        /// <param name="wordWcMap">Provides the word-count map and the maximum word count.</param>
        /// <returns>0.0 when the effective count is 0; otherwise log(freq) / log(maxWc).</returns>
        public static double GetCorpusFreqScore(string inWord, WordWcMap wordWcMap)
        {
            Dictionary <string, int> wWcMap = wordWcMap.GetWordWcMap();

            // Single lookup; TryGetValue leaves freq at 0 when the word is absent.
            int freq;
            wWcMap.TryGetValue(inWord, out freq);

            // Check if inWord is a multiword. Split on a plain space: string.Split(string)
            // treats "[ ]" as literal text, not as a regex character class. Trailing
            // spaces are trimmed first so they do not produce empty trailing tokens.
            IList <string> spls    = new List <string>(inWord.TrimEnd(' ').Split(' '));
            bool           isSplit = spls.Count >= 2;

            // Possessive handling. Known limitation carried over from the baseline:
            // every "XXX's" gets the same score whenever XXX is more frequent than "'s".
            if (!isSplit && inWord.EndsWith("'s", StringComparison.Ordinal))
            {
                spls = new List <string>
                {
                    inWord.Substring(0, inWord.Length - 2),
                    "'s"
                };
                isSplit = true;
            }

            // Use the min. wc of the split words in the multiword's case.
            if (freq == 0 && isSplit)
            {
                int  min      = int.MaxValue;
                bool anyCount = false;

                foreach (string spl in spls)
                {
                    if (String.IsNullOrEmpty(spl))
                    {
                        continue;
                    }

                    int splFreq;
                    wWcMap.TryGetValue(spl, out splFreq);

                    if (splFreq >= 0)
                    {
                        anyCount = true;
                        if (splFreq < min)
                        {
                            min = splFreq;
                        }
                    }
                }

                // Without a usable part the int.MaxValue sentinel must not leak
                // into the score; treat the word as unseen instead.
                freq = anyCount ? min : 0;
            }

            // Math.Log(0) is negative infinity, so unseen words score 0.
            if (freq == 0)
            {
                return 0.0;
            }

            long maxWc = wordWcMap.GetMaxWc();

            return Math.Log(freq) / Math.Log(maxWc);
        }
Пример #2
0
        // org code from baseline, TBM, From Ensemble
        /// <summary>
        /// Scores <paramref name="inWord"/> by its unigram frequency, computed as
        /// log(freq / totalWc) / log(maxWc / totalWc).
        /// </summary>
        /// <remarks>
        /// When the word itself has no count and it is either a multiword or ends
        /// in "'s", the smallest count found among its parts is used as the count
        /// of the whole word.
        /// </remarks>
        /// <param name="inWord">Word (or multiword) to score.</param>
        /// <param name="wordWcMap">Provides the word-count map, the maximum word count and the total word count.</param>
        /// <returns>0.0 when the effective count is 0; otherwise the log ratio above.</returns>
        public static double GetUnigramFreqScore(string inWord, WordWcMap wordWcMap)
        {
            Dictionary <string, int> wWcMap = wordWcMap.GetWordWcMap();

            // Single lookup; TryGetValue leaves freq at 0 when the word is absent.
            int freq;
            wWcMap.TryGetValue(inWord, out freq);

            // NOTE(review): Split(string, bool) is a project helper; presumably a regex
            // split on spaces that trims trailing empty tokens - confirm.
            IList <string> spls    = inWord.Split("[ ]", true).ToList();
            bool           isSplit = spls.Count >= 2;

            // Treat a possessive as the two parts "XXX" and "'s".
            if (!isSplit && inWord.EndsWith("'s", StringComparison.Ordinal))
            {
                spls = new List <string>
                {
                    inWord.Substring(0, inWord.Length - 2),
                    "'s"
                };
                isSplit = true;
            }

            // Use the min. wc of the split words in the multiword's case.
            if (freq == 0 && isSplit)
            {
                int  min      = int.MaxValue;
                bool anyCount = false;

                foreach (string spl in spls)
                {
                    if (String.IsNullOrEmpty(spl))
                    {
                        continue;
                    }

                    int splFreq;
                    wWcMap.TryGetValue(spl, out splFreq);

                    if (splFreq >= 0)
                    {
                        anyCount = true;
                        if (splFreq < min)
                        {
                            min = splFreq;
                        }
                    }
                }

                // Without a usable part the int.MaxValue sentinel must not leak
                // into the score; treat the word as unseen instead.
                freq = anyCount ? min : 0;
            }

            if (freq == 0)
            {
                return 0.0;                // to avoid infinity
            }

            long maxWc   = wordWcMap.GetMaxWc();
            long totalWc = wordWcMap.GetTotalWc();

            // 1.0 * forces floating-point division before taking the logs.
            return Math.Log(1.0 * freq / totalWc) / Math.Log(1.0 * maxWc / totalWc);
        }
Пример #3
0
        /// <summary>
        /// Looks up the word count of <paramref name="inWord"/> in the word-count map.
        /// </summary>
        /// <param name="inWord">Word to look up.</param>
        /// <param name="wordWcMap">Provides the word-count map.</param>
        /// <param name="caseFlag">
        /// When false, the word is lowercased before the lookup (case is ignored);
        /// when true, the word is looked up exactly as given.
        /// </param>
        /// <returns>The stored count, or 0 when the word is not in the map.</returns>
        private static int GetWc(string inWord, WordWcMap wordWcMap, bool caseFlag)
        {
            // The keys of wWcMap are lowercased in the Beta version.
            // NOTE(review): ToLower() is culture-sensitive; confirm it matches how
            // the map keys were lowercased.
            string key = caseFlag ? inWord : inWord.ToLower();

            Dictionary <string, int> wWcMap = wordWcMap.GetWordWcMap();

            // Single lookup; TryGetValue leaves wc at 0 when the key is absent.
            int wc;
            wWcMap.TryGetValue(key, out wc);

            return wc;
        }