// org code from baseline, TBM
/// <summary>
/// Scores a word by its corpus frequency as log(freq) / log(maxWc), where freq is the
/// count found in the dictionary returned by <c>wordWcMap.GetWordWcMap()</c> and maxWc
/// is the value returned by <c>wordWcMap.GetMaxWc()</c>.
/// </summary>
/// <remarks>
/// When the word itself has no count and it breaks into two or more pieces (or ends
/// with "'s"), the smallest non-negative count among the non-empty pieces is used as
/// the frequency of the whole word.
/// </remarks>
/// <param name="inWord">Word (or multiword) to score.</param>
/// <param name="wordWcMap">Provides the word-count dictionary and the maximum word count.</param>
/// <returns>0.0 when the resulting frequency is 0; otherwise log(freq) / log(maxWc).</returns>
public static double GetCorpusFreqScore(string inWord, WordWcMap wordWcMap)
{
    Dictionary<string, int> wWcMap = wordWcMap.GetWordWcMap();

    // Single lookup; TryGetValue leaves freq at 0 when the word is absent.
    int freq;
    wWcMap.TryGetValue(inWord, out freq);

    // check if inWord is a multiword
    // NOTE(review): if this call binds to the BCL string.Split(string) overload, "[ ]" is a
    // literal separator rather than a regex for a single space. GetUnigramFreqScore calls
    // Split("[ ]", true) instead - confirm which overload is intended here.
    IList<string> spls = inWord.Split("[ ]").ToList();
    bool isSplit = spls.Count >= 2;

    if (!isSplit && inWord.EndsWith("'s", StringComparison.Ordinal))
    {
        // Possessive handling is known to be imprecise: every "XXX's" gets the same
        // score whenever XXX is more frequent than "'s".
        spls = new List<string>();
        spls.Add(inWord.Substring(0, inWord.Length - 2));
        spls.Add("'s");
        isSplit = true;
    }

    // use the min. wc of split word in the multiword's case
    if (freq == 0 && isSplit)
    {
        int min = int.MaxValue;
        bool found = false;
        foreach (string spl in spls)
        {
            if (String.IsNullOrEmpty(spl))
            {
                continue;
            }

            int splFreq;
            wWcMap.TryGetValue(spl, out splFreq);

            // use the min. freq of each word as the freq of the multiwords
            if (splFreq >= 0)
            {
                min = Math.Min(min, splFreq);
                found = true;
            }
        }

        // Only adopt the minimum when a piece was actually counted; otherwise the
        // int.MaxValue sentinel would be scored as if it were a real frequency.
        if (found)
        {
            freq = min;
        }
    }

    if (freq == 0)
    {
        return 0.0;
    }

    // NOTE(review): Math.Log(maxWc) is 0 when maxWc is 1, which makes the division
    // produce NaN or Infinity - confirm GetMaxWc() is always greater than 1.
    long maxWc = wordWcMap.GetMaxWc();
    double score = Math.Log(freq) / Math.Log(maxWc);
    return score;
}
// org code from baseline, TBM, From Ensemble
/// <summary>
/// Scores a word by its unigram frequency as
/// log(freq / totalWc) / log(maxWc / totalWc), where freq is the count found in the
/// dictionary returned by <c>wordWcMap.GetWordWcMap()</c>, and maxWc / totalWc come
/// from <c>wordWcMap.GetMaxWc()</c> / <c>wordWcMap.GetTotalWc()</c>.
/// </summary>
/// <remarks>
/// When the word itself has no count and it breaks into two or more pieces (or ends
/// with "'s"), the smallest non-negative count among the non-empty pieces is used as
/// the frequency of the whole word.
/// </remarks>
/// <param name="inWord">Word (or multiword) to score.</param>
/// <param name="wordWcMap">Provides the word-count dictionary, the maximum count and the total count.</param>
/// <returns>0.0 when the resulting frequency is 0; otherwise the log ratio described above.</returns>
public static double GetUnigramFreqScore(string inWord, WordWcMap wordWcMap)
{
    Dictionary<string, int> wWcMap = wordWcMap.GetWordWcMap();

    // Single lookup; TryGetValue leaves freq at 0 when the word is absent.
    int freq;
    wWcMap.TryGetValue(inWord, out freq);

    IList<string> spls = inWord.Split("[ ]", true).ToList();
    bool isSplit = spls.Count >= 2;

    if (!isSplit && inWord.EndsWith("'s", StringComparison.Ordinal))
    {
        // Treat a possessive as the stem plus the "'s" suffix.
        spls = new List<string>();
        spls.Add(inWord.Substring(0, inWord.Length - 2));
        spls.Add("'s");
        isSplit = true;
    }

    // use the min. wc of split word in the multiword's case
    if (freq == 0 && isSplit)
    {
        int min = int.MaxValue;
        bool found = false;
        foreach (string spl in spls)
        {
            if (String.IsNullOrEmpty(spl))
            {
                continue;
            }

            int splFreq;
            wWcMap.TryGetValue(spl, out splFreq);

            if (splFreq >= 0)
            {
                min = Math.Min(min, splFreq);
                found = true;
            }
        }

        // Only adopt the minimum when a piece was actually counted; otherwise the
        // int.MaxValue sentinel would be scored as if it were a real frequency.
        if (found)
        {
            freq = min;
        }
    }

    if (freq == 0)
    {
        return 0.0; // to avoid infinity
    }

    // NOTE(review): the denominator is 0 when maxWc equals totalWc, and both logs are
    // undefined when totalWc is 0 - confirm the map guarantees 0 < maxWc < totalWc.
    long maxWc = wordWcMap.GetMaxWc();
    long totalWc = wordWcMap.GetTotalWc();
    double score = Math.Log(1.0 * freq / totalWc) / Math.Log(1.0 * maxWc / totalWc);
    return score;
}
/// <summary>
/// Looks up the word count of <paramref name="inWord"/> in the dictionary returned by
/// <c>wordWcMap.GetWordWcMap()</c>.
/// </summary>
/// <param name="inWord">Word to look up.</param>
/// <param name="wordWcMap">Provides the word-count dictionary.</param>
/// <param name="caseFlag">
/// When false, the word is lowercased before the lookup; when true, it is looked up as given.
/// </param>
/// <returns>The stored count, or 0 when the word is not in the dictionary.</returns>
private static int GetWc(string inWord, WordWcMap wordWcMap, bool caseFlag)
{
    // ignore case
    // The lowercased form is used as a dictionary key, so use the culture-invariant
    // conversion: the result must not depend on the current thread culture.
    string inWordLc = caseFlag ? inWord : inWord.ToLowerInvariant();

    // the key of wWcMap are lowercased in the Beta version
    Dictionary<string, int> wWcMap = wordWcMap.GetWordWcMap();

    // Single lookup; TryGetValue leaves wc at 0 when the word is absent.
    int wc;
    wWcMap.TryGetValue(inWordLc, out wc);
    return wc;
}