// Development scoring variant for a single word:
//   score = WC(inWord) / max word count in the corpus map.
// Original note: multiwords score 0.0 (presumably because GetWc finds no
// count for them -- confirm against GetWc).
//
// inWord:    the word to score.
// wordWcMap: corpus word-count map supplying the counts and the max count.
// returns:   the count of inWord divided by the largest count in the map.
private static double GetScoreDev1(string inWord, WordWcMap wordWcMap)
{
    long largestCount = wordWcMap.GetMaxWc();
    double wordCount = GetWc(inWord, wordWcMap);

    // Multiply by 1.0 to promote the long denominator to double.
    double denominator = 1.0 * largestCount;
    return wordCount / denominator;
}
// Private helper.
// possibility = WC(word) / max word count
//
// inWord:    the word to score.
// wordWcMap: corpus word-count map supplying the counts and the max count.
// caseFlag:  forwarded unchanged to GetWc.
// returns:   the count of inWord divided by the largest count in the map.
private static double GetWordPossOverMaxWc(string inWord, WordWcMap wordWcMap, bool caseFlag)
{
    // Both operands are promoted to double before dividing.
    double wordCount = 1.0d * GetWc(inWord, wordWcMap, caseFlag);
    double largestCount = 1.0d * wordWcMap.GetMaxWc();
    return wordCount / largestCount;
}
// Baseline corpus-frequency score (original note: "org code from baseline, TBM"):
//   score = log(freq) / log(max word count)
//
// freq is the count stored for inWord. When inWord itself has no count and it
// can be split -- either on spaces (multiword) or into stem + "'s"
// (possessive) -- freq falls back to the smallest count among its parts.
//
// inWord:    the word or multiword to score.
// wordWcMap: corpus word-count map supplying the counts and the max count.
// returns:   0.0 when the effective frequency is 0, otherwise the log ratio.
//
// NOTE(review): when GetMaxWc() returns 1 the denominator is log(1) = 0 and
// the result is Infinity/NaN; this is unchanged from the original.
public static double GetCorpusFreqScore(string inWord, WordWcMap wordWcMap)
{
    Dictionary<string, int> wWcMap = wordWcMap.GetWordWcMap();

    // Single lookup; a word missing from the map counts as frequency 0.
    int freq;
    if (!wWcMap.TryGetValue(inWord, out freq))
    {
        freq = 0;
    }

    // Split on single spaces. The Java original used the regex "[ ]", which
    // string.Split(string) would treat as a literal three-character separator,
    // so multiwords were never detected. Trailing spaces are trimmed first so
    // that "word " is not mistaken for a multiword (regex split semantics with
    // trailing empty strings removed).
    string[] spls = inWord.TrimEnd(' ').Split(' ');
    bool isSplit = spls.Length >= 2;

    if (!isSplit && inWord.EndsWith("'s", StringComparison.Ordinal))
    {
        // Possessive: score as stem + "'s".
        // Known limitation carried over from the original: every "XXX's" gets
        // the same score whenever XXX is more frequent than "'s", because the
        // minimum of the two counts is used.
        spls = new[] { inWord.Substring(0, inWord.Length - 2), "'s" };
        isSplit = true;
    }

    // Use the minimum count of the parts as the count of the whole term.
    if (freq == 0 && isSplit)
    {
        int min = int.MaxValue;
        bool found = false;
        foreach (string spl in spls)
        {
            // Consecutive or leading spaces produce empty parts; ignore them.
            if (string.IsNullOrEmpty(spl))
            {
                continue;
            }

            int splFreq;
            if (!wWcMap.TryGetValue(spl, out splFreq))
            {
                splFreq = 0;
            }

            if (splFreq >= 0 && splFreq < min)
            {
                min = splFreq;
                found = true;
            }
        }

        // If no part contributed a count, treat the term as unseen instead of
        // letting the int.MaxValue sentinel become the frequency.
        freq = found ? min : 0;
    }

    if (freq == 0)
    {
        return 0.0;
    }

    long maxWc = wordWcMap.GetMaxWc();
    return Math.Log(freq) / Math.Log(maxWc);
}
// Unigram-frequency score (original note: "org code from baseline, TBM,
// From Ensemble"):
//   score = log(freq / total word count) / log(max word count / total word count)
//
// freq is the count stored for inWord. When that is 0 and the word can be
// split -- by the Split("[ ]", true) helper, or into stem + "'s" for a
// possessive -- the smallest count among the parts is used instead.
//
// inWord:    the word or multiword to score.
// wordWcMap: corpus word-count map supplying counts, max count and total count.
// returns:   0.0 when the effective frequency is 0 (avoids log(0)),
//            otherwise the log ratio above.
public static double GetUnigramFreqScore(string inWord, WordWcMap wordWcMap)
{
    Dictionary<string, int> countsByWord = wordWcMap.GetWordWcMap();

    int wordFreq = 0;
    if (countsByWord.ContainsKey(inWord))
    {
        wordFreq = countsByWord.GetValueOrNull(inWord);
    }

    // NOTE(review): Split(string, bool) is a project helper not visible here;
    // presumably a regex split on spaces that drops trailing empties -- confirm.
    IList<string> parts = inWord.Split("[ ]", true).ToList();
    bool hasParts = parts.Count >= 2;

    if (!hasParts && inWord.EndsWith("'s", StringComparison.Ordinal))
    {
        // Possessive: treat as stem + "'s".
        string stem = inWord.Substring(0, inWord.Length - 2);
        parts = new List<string> { stem, "'s" };
        hasParts = true;
    }

    // For an unseen term that has parts, take the lowest part count.
    if (wordFreq == 0 && hasParts)
    {
        int lowest = int.MaxValue;
        for (int i = 0; i < parts.Count; i++)
        {
            string part = parts[i];
            if (String.IsNullOrEmpty(part))
            {
                continue;
            }

            int partFreq = 0;
            if (countsByWord.ContainsKey(part))
            {
                partFreq = countsByWord.GetValueOrNull(part);
            }

            bool isNewLow = partFreq >= 0 && partFreq < lowest;
            if (isNewLow)
            {
                lowest = partFreq;
            }
        }

        wordFreq = lowest;
    }

    if (wordFreq == 0)
    {
        // Avoid log(0), which would yield infinity.
        return 0.0;
    }

    long maxWc = wordWcMap.GetMaxWc();
    long totalWc = wordWcMap.GetTotalWc();
    double numerator = Math.Log(1.0 * wordFreq / totalWc);
    double denominator = Math.Log(1.0 * maxWc / totalWc);
    return numerator / denominator;
}
// Score for a single word or a multiword (used for split cases):
//   1) multiword:   average of the per-word scores;
//   2) single word: the score of that one word (original note:
//      log(adjusted WC) / log(adjusted max WC), computed by GetWordScore).
//
// inWord:    the term to score; it is broken into words by TermUtil.ToWordList.
// wordWcMap: corpus word-count map supplying the counts and the max count.
// returns:   the mean per-word score, or 0.0 when the word list is empty.
public static double GetAdjustScoreAvg(string inWord, WordWcMap wordWcMap)
{
    // normFlag = false (original note: don't use punctuation for determiner).
    List<string> words = TermUtil.ToWordList(inWord, false);
    long wordCount = words.Count;
    double adjustedMaxWc = GetAdjustedWc(wordWcMap.GetMaxWc());

    // Accumulate the per-word scores.
    double scoreSum = 0.0;
    for (int i = 0; i < words.Count; i++)
    {
        scoreSum += GetWordScore(words[i], adjustedMaxWc, wordWcMap);
    }

    if (wordCount > 0)
    {
        return scoreSum / wordCount;
    }

    return 0.0;
}
// Score for a single word or a multiword (used for split cases): the MINIMUM
// of the per-word scores returned by GetWordScore.
//
// inWord:    the term to score; it is broken into words by TermUtil.ToWordList.
// wordWcMap: corpus word-count map supplying the counts and the max count.
// returns:   the smallest per-word score, or 0.0 when no word produced a
//            score below positive infinity (e.g. the word list is empty).
public static double GetAdjustScoreMin(string inWord, WordWcMap wordWcMap)
{
    // Original note: don't use punctuation for determiner.
    bool normFlag = false;
    List<string> wordList = TermUtil.ToWordList(inWord, normFlag);
    double maxWc = GetAdjustedWc(wordWcMap.GetMaxWc());

    // Positive infinity is the sentinel for "no score seen yet". The earlier
    // int.MaxValue sentinel would have reported any true minimum at or above
    // 2147483647 as 0.0.
    double minScore = double.PositiveInfinity;
    foreach (string word in wordList)
    {
        double curScore = GetWordScore(word, maxWc, wordWcMap);
        if (curScore < minScore)
        {
            minScore = curScore;
        }
    }

    return double.IsPositiveInfinity(minScore) ? 0.0 : minScore;
}