// Development scorer that also handles multiwords:
// the score of a multiword is the average of the GetScoreDev1() scores
// of its words. Returns 0.0 unless the summed score is strictly positive.
// Other per-word scorers tried during development (Church, Crowell, Peter,
// unigram frequency, raw WC) are not used here.
private static double GetScoreDev2(string inWord, WordWcMap wordWcMap)
{
    // split the (possibly multiword) input; don't use punctuation for determiner
    bool normFlag = false;
    List<string> words = TermUtil.ToWordList(inWord, normFlag);
    int wordCount = words.Count;

    // sum the per-word scores
    double sum = 0.0;
    for (int i = 0; i < wordCount; i++)
    {
        sum += GetScoreDev1(words[i], wordWcMap);
    }

    // only a strictly positive total is averaged; anything else scores 0.0
    return (sum > 0.0) ? (sum / wordCount) : 0.0;
}
// Heuristic rule for real-word split candidates.
// Counts the short words among the split words of inTerm (the candidate)
// and rejects the candidate when there are too many of them:
// - a word is short when its length <= the configured short split word length
// - the candidate passes only when the number of short words is less than
//   the configured maximum (default is 2, per the original notes)
// This rule was added to filter out: some -> so me
// filter out: another -> a not her (shortSplitWordNo = 3)
// filter out: anyone -> any one (shortSplitWordNo = 2)
// 1. keep: away -> a way (shortSplitWordNo = 1)
// 2. filter out: soon -> so on (shortSplitWordNo = 2)
// 3. filter out: anyway -> any way (shortSplitWordNo = 2)
private static bool CheckShortSplitWords(string inTerm, CSpellApi cSpellApi)
{
    // configuration
    int shortLength = cSpellApi.GetCanRwShortSplitWordLength();
    int maxShortCount = cSpellApi.GetCanRwMaxShortSplitWordNo();

    // count the short words in the candidate
    List<string> words = TermUtil.ToWordList(inTerm);
    int shortCount = 0;
    for (int i = 0; i < words.Count; i++)
    {
        if (words[i].Length <= shortLength)
        {
            shortCount += 1;
        }
    }

    // valid only while the short word count stays below the limit
    return shortCount < maxShortCount;
}
// Verify that every split word of inTerm is a valid split word.
// inTerm is the term to be split; it is expected to be a coreTerm.
// Returns false as soon as one split word fails IsValidSplitWord(),
// true otherwise (including when there are no split words).
//
// Digits and units are intentionally not accepted separately here because:
// 1. they are handled in ND
// 2. some units are Aa, such as ng, and cause noise [FP]
//    - seing => se i ng, no good
public static bool IsValidSplitWords(string inTerm, CSpellApi cSpellApi)
{
    List<string> splitWords = TermUtil.ToWordList(inTerm);
    foreach (string candidate in splitWords)
    {
        // one invalid split word invalidates the whole split
        if (!IsValidSplitWord(candidate, cSpellApi))
        {
            return false;
        }
    }
    return true;
}
// Get the word vector of inTerm from the given word2Vec (Om) matrix.
// The term is split into words and the result of GetAvgWordVecForList()
// over those words is returned, so a multiword gets the averaged vector.
// TBD: take care of possessive
private static DoubleVec GetWordVecForTerm(string inTerm, Word2Vec w2vOm)
{
    return GetAvgWordVecForList(TermUtil.ToWordList(inTerm), w2vOm);
}
// private method
// Test driver for merge and split: prints the context scores of the merged
// word, of the split words as one term, and the average of the scores of
// each split word taken on its own. Every score is printed twice:
// with the context limited to the window radius | with all of inText.
private static void Test(string inText, int tarPos, int tarSize, int radius, string mergedWord, string splitWords, Word2Vec w2vIm, Word2Vec w2vOm)
{
    // 0. tokenize inText and drop the space tokens
    TextObj text = new TextObj(inText);
    List<TokenObj> allTokens = text.GetTokenList();
    List<TokenObj> tokens = TextObj.GetNonSpaceTokenObjList(allTokens);
    Console.WriteLine("==========================================");
    Console.WriteLine("-- inTextList: [" + inText + "]");
    bool skipWord = true;
    bool debug = false;

    // 1.a context vector within the window radius
    DoubleVec windowContext = Word2VecContext.GetContextVec(tarPos, tarSize, tokens, w2vIm, radius, skipWord, debug);
    // 1.b context vector from all of inText
    DoubleVec fullContext = Word2VecContext.GetContextVec(tarPos, tarSize, tokens, w2vIm, skipWord, debug);

    // 1.c merged word
    ContextScore mergedWindow = new ContextScore(mergedWord, windowContext, w2vOm);
    ContextScore mergedFull = new ContextScore(mergedWord, fullContext, w2vOm);
    Console.WriteLine(mergedWindow.ToString() + "|" + string.Format("{0,1:F8}", mergedFull.GetScore()));

    // 2. split words, scored as a single term
    ContextScore splitWindow = new ContextScore(splitWords, windowContext, w2vOm);
    ContextScore splitFull = new ContextScore(splitWords, fullContext, w2vOm);
    Console.WriteLine(splitWindow.ToString() + "|" + string.Format("{0,1:F8}", splitFull.GetScore()));

    // 3. average score over the single split words
    // each single word gets its own context, taken at its own position
    List<string> singleWords = TermUtil.ToWordList(splitWords);
    double windowSum = 0.0d; // radius
    double fullSum = 0.0d; // all inText
    for (int offset = 0; offset < singleWords.Count; offset++)
    {
        string singleWord = singleWords[offset];

        // window radius
        DoubleVec singleWindowContext = Word2VecContext.GetContextVec(tarPos + offset, 1, tokens, w2vIm, radius, skipWord, debug);
        ContextScore singleWindow = new ContextScore(singleWord, singleWindowContext, w2vOm);
        windowSum += singleWindow.GetScore();

        // all text
        DoubleVec singleFullContext = Word2VecContext.GetContextVec(tarPos + offset, 1, tokens, w2vIm, skipWord, debug);
        ContextScore singleFull = new ContextScore(singleWord, singleFullContext, w2vOm);
        fullSum += singleFull.GetScore();
    }

    int singleWordCount = singleWords.Count;
    double windowAvg = windowSum / singleWordCount; // window
    double fullAvg = fullSum / singleWordCount; // all text
    Console.WriteLine("Avg. Single Word|" + string.Format("{0,1:F8}", windowAvg) + "|" + string.Format("{0,1:F8}", fullAvg));
}
// Heuristic rule for real-word one-to-one correction.
// Every word of inTerm (the candidate) must have a word vector in
// word2VecOm; returns false at the first word that has none.
private static bool Check1To1Words(string inTerm, Word2Vec word2VecOm)
{
    List<string> words = TermUtil.ToWordList(inTerm);
    for (int i = 0; i < words.Count; i++)
    {
        if (!word2VecOm.HasWordVec(words[i]))
        {
            return false;
        }
    }
    return true;
}
// Average CWOB score of the words in inTerm against contextVec.
// The scores of all words that have a word vector in w2vOm are summed and
// the sum is divided by the total number of words in inTerm, so words
// without a word vector count as 0 in the average.
// Returns 0.0 when inTerm yields no words (instead of NaN from 0.0 / 0).
// NOTE: this method is to be deleted because it has the same result as
// GetScore() (per the original author's note).
public static double GetScore2(string inTerm, DoubleVec contextVec, Word2Vec w2vOm)
{
    List<string> inWordList = TermUtil.ToWordList(inTerm);
    int count = inWordList.Count;

    // nothing to average: avoid the floating-point 0.0 / 0 => NaN result
    if (count == 0)
    {
        return 0.0d;
    }

    // add the scores first, then calculate the avg.
    double score = 0.0d;
    foreach (string word in inWordList)
    {
        DoubleVec wordVec = w2vOm.GetWordVec(word);
        if (wordVec != null)
        {
            score += GetCwobScore(wordVec, contextVec);
        }
    }
    return score / count;
}
// Get the score for a single word or for multiwords (split cases):
// the result is the average of GetWordScore() over all words of inWord.
// Per the original notes, the score of a single word is
// log(adjusted WC) / log(adjusted max. WC); that part lives in GetWordScore().
// Returns 0.0 when inWord yields no words.
public static double GetAdjustScoreAvg(string inWord, WordWcMap wordWcMap)
{
    // split the (possibly multiword) input; don't use punctuation for determiner
    bool normFlag = false;
    List<string> words = TermUtil.ToWordList(inWord, normFlag);
    long wordCount = words.Count;
    double maxWc = GetAdjustedWc(wordWcMap.GetMaxWc());

    // sum the per-word scores
    double sum = 0.0;
    for (int i = 0; i < words.Count; i++)
    {
        sum += GetWordScore(words[i], maxWc, wordWcMap);
    }

    // no words, no average
    if (wordCount <= 0)
    {
        return 0.0;
    }
    return sum / wordCount;
}
// Check all split words of inTerm: the split is rejected as soon as one
// split word fails IsValidSplitWord().
// NOTE(review): the original notes list digit (pure number), unit, and word
// in the split word dictionary (English + ProperNoun, not Aa) as acceptable
// split words; what is actually accepted is decided by IsValidSplitWord().
private static bool CheckSplitWords(string inTerm, CSpellApi cSpellApi)
{
    // convert to word list
    List<string> splitWords = TermUtil.ToWordList(inTerm);

    // every split word must pass the check
    for (int i = 0; i < splitWords.Count; i++)
    {
        if (!IsValidSplitWord(splitWords[i], cSpellApi))
        {
            return false;
        }
    }
    return true;
}
// Get the score for a single word or for multiwords (split cases):
// the result is the minimum of GetWordScore() over all words of inWord.
// Returns 0.0 when no word produced a score below the int.MaxValue
// sentinel (e.g. inWord yields no words).
public static double GetAdjustScoreMin(string inWord, WordWcMap wordWcMap)
{
    // split the (possibly multiword) input; don't use punctuation for determiner
    bool normFlag = false;
    List<string> wordList = TermUtil.ToWordList(inWord, normFlag);
    double maxWc = GetAdjustedWc(wordWcMap.GetMaxWc());

    // use the min. score of all words for the multiwords
    double minScore = int.MaxValue;
    foreach (string word in wordList)
    {
        double curScore = GetWordScore(word, maxWc, wordWcMap);
        if (curScore < minScore)
        {
            minScore = curScore;
        }
    }

    // sentinel untouched => nothing was scored => 0.0
    return (minScore < int.MaxValue) ? minScore : 0.0;
}