// Test driver for ContextScore: loads the two word2Vec matrices from their
// data files and runs the "on set" vs. "onset" comparison.
// No command-line arguments are accepted.
//
// TBD - cases still to be tested for split and merge:
//   non-word split: knowabout -> know about, hotflahes -> hot flashes, alot -> a lot
//   non-word merge: stiff n ess -> stiffness
//   real-word:      whatever vs. what ever, onset vs. on set
public static void MainTest(string[] args)
{
    // any argument at all triggers the usage message
    if (args.Length > 0)
    {
        Console.WriteLine("Usage: java ContextScore");
        Environment.Exit(0);
    }

    // Alternative set-up (kept for reference): build a CSpellApi from
    // "../data/Config/cSpell.properties" and take the matrices from
    // GetWord2VecIm() / GetWord2VecOm().
    bool verbose = true;
    Word2Vec inputMatrix = new Word2Vec("../data/Context/syn0.data", verbose);
    Word2Vec outputMatrix = new Word2Vec("../data/Context/syn1n.data", verbose);

    //Tests(inputMatrix, outputMatrix);
    TestOnSet(inputMatrix, outputMatrix);
}
// Unit test driver for Word2Vec.
// Loads a word2Vec data file - the single command-line argument if given,
// otherwise "../data/Context/syn1n.data" - and prints its dimension, its
// word count and the result of HasWordVec() for a few sample words.
// More than one argument prints the usage on stderr and exits with code 1.
public static void MainTest(string[] args)
{
    //String inFile = "../data/Context/word2Vec.data";
    string inFile = "../data/Context/syn1n.data";
    switch (args.Length)
    {
        case 0:
            // keep the default file
            break;
        case 1:
            inFile = args[0];
            break;
        default:
            Console.Error.WriteLine("Usage: java Word2Vec <inFile>");
            Environment.Exit(1);
            break;
    }

    try
    {
        Word2Vec word2Vec = new Word2Vec(inFile);
        Console.WriteLine("Dimension: " + word2Vec.GetDimension());
        Console.WriteLine("Word No: " + word2Vec.GetWordNo());
        Console.WriteLine("Word size in WrodVec: " + word2Vec.GetWordVecMap().Keys.Count);

        // sample look-ups, printed in this fixed order
        string[] sampleWords = { "man", "king", "ago", "a", "ia", "m", "xyxy" };
        foreach (string sampleWord in sampleWords)
        {
            Console.WriteLine("HasWordVec(" + sampleWord + "): " + word2Vec.HasWordVec(sampleWord));
        }
    }
    catch (Exception e)
    {
        // report the failure on stdout: message first, then the stack trace
        Console.WriteLine(e.ToString());
        Console.Write(e.StackTrace);
    }
}
// Compares the context scores of "onset" and "on set" inside a fixed sample
// sentence and prints both scores.
//   w2vIm: word2Vec matrix used to build the context vector
//   w2vOm: word2Vec matrix used to score the two candidates
private static void TestOnSet(Word2Vec w2vIm, Word2Vec w2vOm)
{
    string sampleText = "He was diagnosed early on set dementia 3 years ago.";
    TextObj sampleTextObj = new TextObj(sampleText);
    List<TokenObj> allTokens = sampleTextObj.GetTokenList();
    // the context is computed on the token list without space tokens
    List<TokenObj> tokens = TextObj.GetNonSpaceTokenObjList(allTokens);

    Console.WriteLine("==========================================");
    Console.WriteLine("-- inTextList: [" + sampleText + "]");

    // target is "on set": it starts at non-space token 4 and spans 2 tokens
    int targetStart = 4;
    int targetTokenNo = 2;
    int windowRadius = 2;
    bool skipWordsWithoutVec = true;
    bool debug = false;

    // one context vector, shared by both candidates
    DoubleVec contextVec = Word2VecContext.GetContextVec(targetStart, targetTokenNo,
        tokens, w2vIm, windowRadius, skipWordsWithoutVec, debug);

    string mergedTerm = "onset";
    ContextScore mergedScore = new ContextScore(mergedTerm, contextVec, w2vOm);
    string splitTerm = "on set";
    ContextScore splitScore = new ContextScore(splitTerm, contextVec, w2vOm);

    Console.WriteLine("- [" + mergedTerm + "]: " + mergedScore.ToString());
    Console.WriteLine("- [" + splitTerm + "]: " + splitScore.ToString());
}
// Prints one result row for a candidate term in the form
//   term|GetScore|GetScore2|GetSimilarityScore
// with each score formatted to 8 decimals.
//   inTerm:     candidate term
//   contextVec: context vector the candidate is scored against
//   w2vIm:      matrix used for the similarity score
//   w2vOm:      matrix used for the two GetScore variants
private static void Test(string inTerm, DoubleVec contextVec, Word2Vec w2vIm, Word2Vec w2vOm)
{
    // evaluated in the same order in which they are printed
    double[] scores =
    {
        GetScore(inTerm, contextVec, w2vOm),
        GetScore2(inTerm, contextVec, w2vOm),
        GetSimilarityScore(inTerm, contextVec, w2vIm)
    };

    string row = inTerm;
    foreach (double score in scores)
    {
        row = row + "|" + string.Format("{0,1:F8}", score);
    }
    Console.WriteLine(row);
}
// public constructor
/// <summary>
/// Creates the context score of a term: stores the term and the score that
/// Word2VecScore.GetScore() computes for it against the given context vector.
/// </summary>
/// <param name="inTerm"> target token or candidate (can be multiword) </param>
/// <param name="contextVec"> wordVec of context from IM </param>
/// <param name="word2Vec"> word2Vec matrix of OM </param>
public ContextScore(string inTerm, DoubleVec contextVec, Word2Vec word2Vec)
{
    this.term_ = inTerm;

    // score of the term against the context, computed by Word2VecScore
    // TBD: use 2-3 gram, e.g. NgramScore.GetScore(word, ngram)
    this.score_ = Word2VecScore.GetScore(inTerm, contextVec, word2Vec);
}
// Picks the top ranked candidate for inStr with the ranking mode configured
// in cSpellApi, and prints the score details of that mode via ScoreDetailByMode.
//   inStr:             the original string
//   candidates:        candidate strings; when empty, inStr is returned
//   cSpellApi:         source of the rank mode and all ranking settings
//   debugFlag:         passed on to the score detail printing
//   tarPos:            target position, starting from 0, not counting space tokens
//   nonSpaceTokenList: the text as tokens, without space tokens
// Returns inStr unchanged when there are no candidates or when the rank mode
// is none of the modes handled below.
public static string GetTopRankStr(string inStr, HashSet<string> candidates, CSpellApi cSpellApi, bool debugFlag, int tarPos, List<TokenObj> nonSpaceTokenList)
{
    // read every setting up front
    int mode = cSpellApi.GetRankMode();
    double edDistFac = cSpellApi.GetOrthoScoreEdDistFac();
    double phoneticFac = cSpellApi.GetOrthoScorePhoneticFac();
    double overlapFac = cSpellApi.GetOrthoScoreOverlapFac();
    WordWcMap wcMap = cSpellApi.GetWordWcMap();
    int maxCandNo = cSpellApi.GetCanMaxCandNo();
    Word2Vec inputMatrix = cSpellApi.GetWord2VecIm();
    Word2Vec outputMatrix = cSpellApi.GetWord2VecOm();
    int radius = cSpellApi.GetNw1To1ContextRadius();
    bool skipWord = cSpellApi.GetWord2VecSkipWord();
    double rangeFac = cSpellApi.GetRankNwS1RankRangeFac();
    double minOScore = cSpellApi.GetRankNwS1MinOScore();
    // only for one-to-one or split, no merge here
    int tarSize = 1;

    // nothing to rank: keep the original string
    if (candidates.Count <= 0)
    {
        return (inStr);
    }

    string best = inStr;
    if (mode == CSpellApi.RANK_MODE_ORTHOGRAPHIC)
    {
        best = RankByOrthographic.GetTopRankStr(inStr, candidates,
            edDistFac, phoneticFac, overlapFac);
        ScoreDetailByMode.PrintOrthographicScore(inStr, candidates, maxCandNo,
            edDistFac, phoneticFac, overlapFac, debugFlag);
    }
    else if (mode == CSpellApi.RANK_MODE_FREQUENCY)
    {
        best = RankByFrequency.GetTopRankStr(candidates, wcMap);
        ScoreDetailByMode.PrintFrequencyScore(candidates, wcMap, maxCandNo, debugFlag);
    }
    else if (mode == CSpellApi.RANK_MODE_CONTEXT)
    {
        best = RankByContext.GetTopRankStr(inStr, candidates, tarPos, tarSize,
            nonSpaceTokenList, inputMatrix, outputMatrix, skipWord, radius);
        ScoreDetailByMode.PrintContextScore(candidates, tarPos, tarSize,
            nonSpaceTokenList, inputMatrix, outputMatrix, skipWord, radius,
            maxCandNo, debugFlag);
    }
    else if (mode == CSpellApi.RANK_MODE_NOISY_CHANNEL)
    {
        best = RankByNoisyChannel.GetTopRankStr(inStr, candidates, wcMap,
            edDistFac, phoneticFac, overlapFac);
        ScoreDetailByMode.PrintNoisyChannelScore(inStr, candidates, wcMap,
            maxCandNo, edDistFac, phoneticFac, overlapFac, debugFlag);
    }
    else if (mode == CSpellApi.RANK_MODE_ENSEMBLE)
    {
        best = RankByEnsemble.GetTopRankStr(inStr, candidates, wcMap, tarPos,
            tarSize, nonSpaceTokenList, inputMatrix, outputMatrix, skipWord,
            radius, rangeFac, edDistFac, phoneticFac, overlapFac);
        // ensemble prints the same basic scores as the CSpell mode
        ScoreDetailByMode.PrintCSpellScore(inStr, candidates, wcMap, maxCandNo,
            tarPos, tarSize, nonSpaceTokenList, inputMatrix, outputMatrix,
            skipWord, radius, edDistFac, phoneticFac, overlapFac, debugFlag);
    }
    else if (mode == CSpellApi.RANK_MODE_CSPELL)
    {
        best = RankByCSpellNonWord.GetTopRankStr(inStr, candidates, wcMap,
            tarPos, tarSize, nonSpaceTokenList, inputMatrix, outputMatrix,
            skipWord, radius, rangeFac, minOScore, edDistFac, phoneticFac,
            overlapFac);
        ScoreDetailByMode.PrintCSpellScore(inStr, candidates, wcMap, maxCandNo,
            tarPos, tarSize, nonSpaceTokenList, inputMatrix, outputMatrix,
            skipWord, radius, edDistFac, phoneticFac, overlapFac, debugFlag);
    }
    return (best);
}
// Runs Test() on a fixed set of (misspelling, candidate) pairs.
// Not completed with contextScore: w2vOm is accepted but not used yet.
private static void Tests(WordWcMap wordWcMap, Word2Vec w2vOm)
{
    // placeholder list, currently not used
    List<string> testStrList = new List<string>();

    // { input string, candidate }
    string[][] samplePairs =
    {
        new string[] { "spel", "spell" },
        new string[] { "spel", "speil" },
        new string[] { "spelld", "spell" },
        new string[] { "spelld", "spelled" }
    };
    foreach (string[] samplePair in samplePairs)
    {
        Test(samplePair[0], samplePair[1], wordWcMap);
    }
}
// Similarity score between a term and its context (approach taken from the
// ensemble paper), using the word2Vec input matrix (syn0).
//   inTerm:     candidate term (can be multiword)
//   contextVec: wordVec of the context
//   word2VecIm: word2Vec input matrix - syn0
// The term vector comes from GetWordVecForTerm(); the score is whatever the
// (termVec, contextVec) overload of GetSimilarityScore() returns.
public static double GetSimilarityScore(string inTerm, DoubleVec contextVec, Word2Vec word2VecIm)
{
    DoubleVec vecOfTerm = GetWordVecForTerm(inTerm, word2VecIm);
    return (GetSimilarityScore(vecOfTerm, contextVec));
}
// Returns the wordVec of a term: the term is split into words with
// TermUtil.ToWordList() and the vectors of those words are averaged by
// GetAvgWordVecForList(), so a multiword term gets one averaged vector.
// The matrix passed as w2vOm is whatever the caller supplies.
// TBD: take care of possessive
private static DoubleVec GetWordVecForTerm(string inTerm, Word2Vec w2vOm)
{
    return (GetAvgWordVecForList(TermUtil.ToWordList(inTerm), w2vOm));
}
// public method
// Scores a candidate term against a context with the CBOW model
// (predict the target word = H X OM).
//   inTerm:     candidate (can be multiword)
//   contextVec: wordVec of context (the hidden layer)
//   w2vOm:      word2Vec output matrix - syn1neg
// The candidate vector is the averaged vector from GetWordVecForTerm();
// the score is what GetCwobScore() returns for (termVec, contextVec).
public static double GetScore(string inTerm, DoubleVec contextVec, Word2Vec w2vOm)
{
    DoubleVec vecOfTerm = GetWordVecForTerm(inTerm, w2vOm);
    return (GetCwobScore(vecOfTerm, contextVec));
}
// Ranks the merge candidates by context score: reads the word2Vec settings
// from cSpellApi and delegates to RankNonWordMergeByContext.GetTopRankMergeObj().
//   candidates:        merge candidates
//   cSpellApi:         source of the word2Vec matrices and settings
//   tarPos:            not used by this method
//   nonSpaceTokenList: the text as tokens, without space tokens
//   debugFlag:         passed on to the ranking
private static MergeObj GetTopRankMergeObjByContext(HashSet<MergeObj> candidates, CSpellApi cSpellApi, int tarPos, List<TokenObj> nonSpaceTokenList, bool debugFlag)
{
    Word2Vec inputMatrix = cSpellApi.GetWord2VecIm();
    Word2Vec outputMatrix = cSpellApi.GetWord2VecOm();
    int mergeRadius = cSpellApi.GetNwMergeContextRadius();
    bool skipWord = cSpellApi.GetWord2VecSkipWord();
    // read as in the original set-up; the ranking call below does not take it
    int maxCandNo = cSpellApi.GetCanMaxCandNo();

    return (RankNonWordMergeByContext.GetTopRankMergeObj(candidates,
        nonSpaceTokenList, inputMatrix, outputMatrix, skipWord, mergeRadius,
        debugFlag));
}
// Prints one cell of the radius table, without a newline, in the form
//   GetScore-GetScore2|
// with both scores formatted to 4 decimals.
// The similarity score is computed as well but is not part of the output.
private static void TestWin(string inTerm, DoubleVec contextVec, Word2Vec w2vIm, Word2Vec w2vOm)
{
    double firstScore = GetScore(inTerm, contextVec, w2vOm);
    double secondScore = GetScore2(inTerm, contextVec, w2vOm);
    // computed but not printed
    double similarity = GetSimilarityScore(inTerm, contextVec, w2vIm);

    string cell = string.Format("{0,1:F4}", firstScore);
    cell += "-";
    cell += string.Format("{0,1:F4}", secondScore);
    cell += "|";
    Console.Write(cell);
}
// Heuristic rule for real-word one-to-one correction:
// every word of inTerm (the candidate) must have a wordVec.
// Returns true when all words from TermUtil.ToWordList(inTerm) are known to
// word2VecOm (also true when that list is empty), false as soon as one is not.
private static bool Check1To1Words(string inTerm, Word2Vec word2VecOm)
{
    foreach (string candWord in TermUtil.ToWordList(inTerm))
    {
        // the first word without a vector decides the result
        if (word2VecOm.HasWordVec(candWord) == false)
        {
            return (false);
        }
    }
    return (true);
}
// Test driver for Word2VecContext: loads the two word2Vec matrices from
// their data files and runs Tests(). No command-line arguments are accepted.
public static void MainTest(string[] args)
{
    // any argument at all triggers the usage message
    if (args.Length > 0)
    {
        Console.WriteLine("Usage: java Word2VecContext");
        Environment.Exit(0);
    }

    bool verbose = true;
    Word2Vec inputMatrix = new Word2Vec("../data/Context/syn0.data", verbose);
    Word2Vec outputMatrix = new Word2Vec("../data/Context/syn1n.data", verbose);
    Tests(inputMatrix, outputMatrix);
}
// Test driver for CSpellScore: loads the word count map and one word2Vec
// matrix from their data files and runs Tests().
// No command-line arguments are accepted.
public static void MainTest(string[] args)
{
    // any argument at all triggers the usage message
    if (args.Length > 0)
    {
        Console.WriteLine("Usage: java CSpellScore");
        Environment.Exit(0);
    }

    bool verbose = true;
    WordWcMap wcMap = new WordWcMap("../data/Frequency/wcWord.data", verbose);
    Word2Vec outputMatrix = new Word2Vec("../data/Context/syn1n.data", verbose);
    Tests(wcMap, outputMatrix);
}
// Ranks real-word split candidates by context: reads the split settings from
// cSpellApi and delegates to RankRealWordSplitByContext.GetTopRankStr(),
// which also receives maxCandNo and debugFlag for its detail print.
//   inStr:             the original string
//   candidates:        split candidates
//   tarPos:            target position in nonSpaceTokenList
//   nonSpaceTokenList: the text as tokens, without space tokens
private static string GetTopRankStrByContext(string inStr, HashSet<string> candidates, CSpellApi cSpellApi, bool debugFlag, int tarPos, List<TokenObj> nonSpaceTokenList)
{
    Word2Vec inputMatrix = cSpellApi.GetWord2VecIm();
    Word2Vec outputMatrix = cSpellApi.GetWord2VecOm();
    //WordWcMap wordWcMap = cSpellApi.GetWordWcMap();
    int splitRadius = cSpellApi.GetRwSplitContextRadius();
    bool skipWord = cSpellApi.GetWord2VecSkipWord();
    int maxCandNo = cSpellApi.GetCanMaxCandNo();
    double splitCFactor = cSpellApi.GetRankRwSplitCFac();
    int shortWordLength = cSpellApi.GetCanRwShortSplitWordLength();
    int maxShortWordNo = cSpellApi.GetCanRwMaxShortSplitWordNo();
    // only for split, the target size = 1
    int tarSize = 1;

    return (RankRealWordSplitByContext.GetTopRankStr(inStr, candidates, tarPos,
        tarSize, nonSpaceTokenList, inputMatrix, outputMatrix, skipWord,
        splitRadius, shortWordLength, maxShortWordNo, splitCFactor, maxCandNo,
        debugFlag));
}
// Alternative to GetScore(): scores every word of the term against the
// context first and averages the scores afterwards.
// This method is to be deleted because it has the same result as GetScore().
//   inTerm:     candidate (can be multiword)
//   contextVec: wordVec of context
//   w2vOm:      word2Vec matrix used to look up the word vectors
// Words without a wordVec add nothing to the sum but still count in the
// average. Returns 0.0 when the term yields no words at all; dividing by a
// zero word count would otherwise produce NaN.
public static double GetScore2(string inTerm, DoubleVec contextVec, Word2Vec w2vOm)
{
    List<string> inWordList = TermUtil.ToWordList(inTerm);

    // guard the average against an empty word list (0.0 / 0 is NaN)
    int wordNo = inWordList.Count;
    if (wordNo == 0)
    {
        return (0.0d);
    }

    // add the scores first, then calculate the avg.
    double scoreSum = 0.0d;
    foreach (string word in inWordList)
    {
        DoubleVec wordVec = w2vOm.GetWordVec(word);
        if (wordVec != null)
        {
            scoreSum += GetCwobScore(wordVec, contextVec);
        }
    }
    return (scoreSum / wordNo);
}
// Average wordVec for a list of words.
//   wordList: the words to average
//   word2Vec: matrix used to look up each word's vector
// Starts from a new DoubleVec of the matrix dimension, adds the vector of
// every word that has one, and divides by the number of ALL words in the
// list (words without a vector are counted too). For an empty list the
// newly created vector is returned without dividing.
public static DoubleVec GetAvgWordVecForList(IList<string> wordList, Word2Vec word2Vec)
{
    DoubleVec avgVec = new DoubleVec(word2Vec.GetDimension());

    int wordNo = 0;
    foreach (string curWord in wordList)
    {
        DoubleVec curVec = word2Vec.GetWordVec(curWord);
        if (curVec != null)
        {
            avgVec.Add(curVec);
        }
        wordNo++;
    }

    // nothing to average for an empty list
    if (wordNo == 0)
    {
        return (avgVec);
    }
    avgVec.Divide(wordNo);
    return (avgVec);
}
// Context of the target taken from the whole token list (no window radius).
// Delegates to the full GetContext() overload with allContext = true;
// the radius passed there is 0 because it is not needed in that mode.
public static List<string> GetContext(int tarPos, int tarSize, List<TokenObj> nonSpaceTokenList, Word2Vec w2vIm, bool word2VecSkipWord, bool debugFlag)
{
    const int unusedRadius = 0;
    const bool useAllContext = true;
    return (GetContext(tarPos, tarSize, nonSpaceTokenList, w2vIm, unusedRadius,
        word2VecSkipWord, debugFlag, useAllContext));
}
// Context vector built from the whole token list (no window radius):
// the context words come from the radius-less GetContext() overload and are
// averaged with Word2VecScore.GetAvgWordVecForList() on w2vIm.
public static DoubleVec GetContextVec(int tarPos, int tarSize, List<TokenObj> nonSpaceTokenList, Word2Vec w2vIm, bool word2VecSkipWord, bool debugFlag)
{
    List<string> contextWords = GetContext(tarPos, tarSize, nonSpaceTokenList,
        w2vIm, word2VecSkipWord, debugFlag);
    return (Word2VecScore.GetAvgWordVecForList(contextWords, w2vIm));
}
// Prints the context found for every non-space token of a sample text,
// in three passes:
//   1. window of radius 3, words without wordVec are kept (no skip)
//   2. window of radius 3, words without wordVec are skipped
//   3. whole text as context, words without wordVec are skipped
// Passes 1 and 2 print "tarPos|index|token: [context]", where tarPos is the
// position among the non-space tokens and index the position among all
// tokens; pass 3 prints "tarPos|token: [context]".
// w2vOm is accepted but not used here.
private static void Tests(Word2Vec w2vIm, Word2Vec w2vOm)
{
    string inText = "... last 10 years #$% was dianosed test123 yahoo.com early on set deminita 3 year ago.";
    List<TokenObj> inTextList = TextObj.TextToTokenList(inText);
    Console.WriteLine("======= Word2VecContext ======================");
    Console.WriteLine(" - inText: [" + inText + "]");
    string inStr = String.Join("|", inTextList.Select(obj => obj.GetTokenStr()));
    Console.WriteLine(" - inTextList (" + inTextList.Count + "): [" + inStr + "]");

    int tarPos = 0;     // position among the non-space tokens
    int tarSize = 1;
    int index = 0;      // position among all tokens, space tokens included
    int radius = 3;
    bool debugFlag = false;

    Console.WriteLine("------ Test GetContext (no skip), radius=3 ...");
    // remove space token from the list
    List<TokenObj> nonSpaceTokenList = TextObj.GetNonSpaceTokenObjList(inTextList);
    foreach (TokenObj tokenObj in inTextList)
    {
        // not the space token
        if (tokenObj.IsSpaceToken() == false)
        {
            string tokenStr = tokenObj.GetTokenStr();
            // word2VecSkipWord = false (no skip)
            List<string> contextList = GetContext(tarPos, tarSize, nonSpaceTokenList, w2vIm, radius, false, debugFlag);
            string contextStr = String.Join("|", contextList);
            Console.WriteLine(tarPos + "|" + index + "|" + tokenStr + ": [" + contextStr + "]");
            tarPos++;
        }
        index++;
    }

    Console.WriteLine("------ Test GetContext (skip) , radius=3 ...");
    Console.WriteLine(" - inText: [" + inText + "]");
    // restart both counters; without resetting index the second pass would
    // print indexes that continue from the end of the first pass
    tarPos = 0;
    index = 0;
    foreach (TokenObj tokenObj in inTextList)
    {
        // not the space token
        if (tokenObj.IsSpaceToken() == false)
        {
            string tokenStr = tokenObj.GetTokenStr();
            // word2VecSkipWord = true (skip)
            List<string> contextList2 = GetContext(tarPos, tarSize, nonSpaceTokenList, w2vIm, radius, true, debugFlag);
            string contextStr2 = String.Join("|", contextList2);
            Console.WriteLine(tarPos + "|" + index + "|" + tokenStr + ": [" + contextStr2 + "]");
            tarPos++;
        }
        index++;
    }

    Console.WriteLine("------ Test GetContext (skip) , all ...");
    Console.WriteLine(" - inText: [" + inText + "]");
    tarPos = 0;
    // this pass iterates the non-space tokens directly
    foreach (TokenObj tokenObj in nonSpaceTokenList)
    {
        string tokenStr = tokenObj.GetTokenStr();
        // word2VecSkipWord = true (skip), whole text as context
        List<string> contextList3 = GetContext(tarPos, tarSize, nonSpaceTokenList, w2vIm, true, debugFlag);
        string contextStr3 = String.Join("|", contextList3);
        Console.WriteLine(tarPos + "|" + tokenStr + ": [" + contextStr3 + "]");
        tarPos++;
    }
}
// private method
// Empty placeholder: takes both word2Vec matrices and does nothing.
private static void Test(Word2Vec w2vIm, Word2Vec w2vOm)
{
}
// Get context: the words before and after the target, in text order.
//   tarPos:            target word position
//   tarSize:           no. of tokens for target word (merge should be > 1)
//   nonSpaceTokenList: the words of the text, no empty space token
//   w2vIm:             context must use word2Vec input matrix
//   radius:            number of tokens before / after the target
//   word2VecSkipWord:  skip word if the word does not have wordVec
//   allContext:        when true the radius is ignored and every eligible
//                      word on both sides is taken
// A skipped word does not count toward the radius.
private static List<string> GetContextForTar(int tarPos, int tarSize, List<string> nonSpaceTokenList, Word2Vec w2vIm, int radius, bool word2VecSkipWord, bool allContext)
{
    // 1. words before the target: collected walking backward, nearest first
    List<string> contextWords = new List<string>();
    int takenNo = 0;
    int backPos = tarPos - 1;
    while (backPos >= 0)
    {
        string prevWord = nonSpaceTokenList[backPos];
        backPos--;
        // words without wordVec are ignored when skipping is on
        if (word2VecSkipWord && (w2vIm.HasWordVec(prevWord) == false))
        {
            continue;
        }
        takenNo++;
        if ((allContext == false) && (takenNo > radius))
        {
            break;
        }
        contextWords.Add(prevWord);
    }
    // put the leading context back into text order
    contextWords.Reverse();

    // 2. words after the target; the target could be multiwords
    takenNo = 0;
    int wordNo = nonSpaceTokenList.Count;
    for (int fwdPos = tarPos + tarSize; fwdPos < wordNo; fwdPos++)
    {
        string nextWord = nonSpaceTokenList[fwdPos];
        if (word2VecSkipWord && (w2vIm.HasWordVec(nextWord) == false))
        {
            continue;
        }
        takenNo++;
        if ((allContext == false) && (takenNo > radius))
        {
            break;
        }
        contextWords.Add(nextWord);
    }
    return (contextWords);
}
// Decides whether the top one-to-one candidate may replace the original
// real word, by comparing the context score of the candidate with the
// context score of the original string in the same context.
// Returns false when there is no top candidate or it equals inStr.
// Rules, with org = score of inStr and top = score of the candidate:
//   org < 0, top > 0 : accept if (top - org) > 0.085 and org > -0.085
//   org < 0, top < 0 : accept if top > org * rw1To1Factor
//   org > 0          : accept if top * rw1To1Factor > org
//   anything else (e.g. org == 0, no word2Vec information): reject
private static bool CheckRealWord1To1Rules(ContextScore topContextScore, string inStr, int tarPos, int tarSize, List<TokenObj> nonSpaceTokenList, Word2Vec word2VecIm, Word2Vec word2VecOm, bool word2VecSkipWord, int contextRadius, double rw1To1Factor, bool debugFlag)
{
    // no topCand found, or the top candidate is the original string
    if ((topContextScore == null) || (topContextScore.GetTerm().Equals(inStr)))
    {
        return (false);
    }

    // 1. context score of the original string before one-to-one
    DoubleVec contextVec = Word2VecContext.GetContextVec(tarPos, tarSize,
        nonSpaceTokenList, word2VecIm, contextRadius, word2VecSkipWord, debugFlag);
    ContextScore orgContextScore = new ContextScore(inStr, contextVec, word2VecOm);

    DebugPrint.Println("--- Real-Word One-To-One Context Score Detail: ---", debugFlag);
    DebugPrint.Println("- Score - orgTerm: " + orgContextScore.ToString(), debugFlag);
    DebugPrint.Println("- Score - top 1-to-1: " + topContextScore.ToString(), debugFlag);
    DebugPrint.Println("- rw1To1Factor: " + rw1To1Factor, debugFlag);

    // 2. score rules for one-to-one
    double org = orgContextScore.GetScore();
    double top = topContextScore.GetScore();

    if (org < 0.0d)
    {
        if (top > 0.0d)
        {
            // org is negative and top is positive: extra word2Vec rule on
            // real-word (helped from 0.6812 to 0.6877)
            return (((top - org) > 0.085) && (org > -0.085));
        }
        // org is negative and top is negative too: top must be better
        return ((top < 0.0d) && (top > org * rw1To1Factor));
    }
    if (org > 0.0d)
    {
        // org is positive: the weighted top score must beat it
        return (top * rw1To1Factor > org);
    }
    // org is 0.0d (no word2Vec information): no one-to-one correction
    return (false);
}
// Returns the best ranked merge candidate using the word2Vec context score,
// or null if no candidate is found to correct.
//   candidates:        merge candidates
//   nonSpaceTokenList: the text as tokens, without space tokens
//   word2VecIm:        matrix used to build the context vector
//   word2VecOm:        matrix used to score the original words
//   word2VecSkipWord:  passed on to the context computation
//   contextRadius:     window radius of the context
//   rwMergeFactor:     passed to IsTopCandValid() for the final validation
//   debugFlag:         passed on to the helpers
// Steps: score all candidates, take the first entry of the score list, map
// its term back to its MergeObj, then validate it with IsTopCandValid()
// against the context score of the original (un-merged) words.
public static MergeObj GetTopRankMergeObj(HashSet<MergeObj> candidates, List<TokenObj> nonSpaceTokenList, Word2Vec word2VecIm, Word2Vec word2VecOm, bool word2VecSkipWord, int contextRadius, double rwMergeFactor, bool debugFlag)
{
    if (candidates.Count <= 0)
    {
        return (null);
    }

    // 1. find sorted score list for each candidates
    List<ContextScore> candScoreList = GetCandidateScoreList(candidates,
        nonSpaceTokenList, word2VecIm, word2VecOm, word2VecSkipWord,
        contextRadius, debugFlag);

    // 2. the 0 element is taken as the top rank (the list is expected to be
    // sorted with the highest score first)
    if (candScoreList.Count <= 0)
    {
        return (null);
    }
    ContextScore topContextScore = candScoreList[0];
    if (topContextScore == null)
    {
        return (null);
    }

    // 3. convert back from the top rank str to its MergeObj
    // key: coreMergeWord, value: MergeObj
    Dictionary<string, MergeObj> candStrMergeObjMap = new Dictionary<string, MergeObj>();
    foreach (MergeObj mergeObj in candidates)
    {
        candStrMergeObjMap[mergeObj.GetCoreMergeWord()] = mergeObj;
    }
    string topRankStr = topContextScore.GetTerm();
    MergeObj topRankMergeObj = candStrMergeObjMap.GetValueOrNull(topRankStr);
    // the top term is expected to be one of the core merge words; if the
    // look-up still fails there is nothing to correct (and nothing to
    // dereference below)
    if (topRankMergeObj == null)
    {
        return (null);
    }

    // 4. compare the top rank merge to the original string b4 merge
    // 4.1 wordVec for context; tarSize is the total token No of the
    // orgMergeWords
    int tarPos = topRankMergeObj.GetStartPos();
    int tarSize = topRankMergeObj.GetEndPos() - topRankMergeObj.GetStartPos() + 1;
    DoubleVec contextVec = Word2VecContext.GetContextVec(tarPos, tarSize,
        nonSpaceTokenList, word2VecIm, contextRadius, word2VecSkipWord, debugFlag);

    // 4.2 context score of the original words before merge
    string orgMergeWord = topRankMergeObj.GetOrgMergeWord();
    ContextScore orgContextScore = new ContextScore(orgMergeWord, contextVec, word2VecOm);

    // 4.3 no correction if the score is not good enough
    if (IsTopCandValid(orgContextScore, topContextScore, rwMergeFactor, debugFlag) == false)
    {
        return (null);
    }
    return (topRankMergeObj);
}
// Prints score tables for a few hand-picked cases:
//   1. diagnosed|diagnose|dianosed  - window of radius 2, then whole text
//   2. know about|know|about        - window of radius 2, then whole text
//   3. onset|on set, "on", "set"    - window of radius 3, then whole text
//   4. every token of a sample text scored against its own context for
//      window radius 1 to 9
// Each row is printed by Test() (cases 1-3) or TestWin() (case 4).
//   w2vIm: matrix used for the context vectors and the similarity score
//   w2vOm: matrix used for the GetScore variants
private static void Tests(Word2Vec w2vIm, Word2Vec w2vOm)
{
    // ----- case 1: diagnosed|diagnose|dianosed -----
    string inText = "for the last 10 years was dianosed\n early on set deminita 3 years ago";
    List<TokenObj> inTextList = TextObj.TextToTokenList(inText);
    // remove space token from the list
    List<TokenObj> nonSpaceTokenList = TextObj.GetNonSpaceTokenObjList(inTextList);
    List<string> testStrList = new List<string>();
    testStrList.Add("diagnosed");
    testStrList.Add("diagnose");
    testStrList.Add("dianosed");

    // init context: target is non-space token 6 ("dianosed")
    int tarPos = 6;
    int tarSize = 1;
    int radius = 2;
    bool word2VecSkipWord = true;
    bool debugFlag = false;
    DoubleVec contextVec = Word2VecContext.GetContextVec(tarPos, tarSize, nonSpaceTokenList, w2vIm, radius, word2VecSkipWord, debugFlag);
    Console.WriteLine("===== Test diagnosed|diagnose|dianosed (window-2) =====");
    Console.WriteLine("inText: [" + inText + "]");
    Console.WriteLine("============================================");
    Console.WriteLine("Candidates|CBOW score|CBOW score 2|Similarity score");
    Console.WriteLine("============================================");
    foreach (string testStr in testStrList)
    {
        Test(testStr, contextVec, w2vIm, w2vOm);
    }
    Console.WriteLine("===== Test diagnosed|diagnose|dianosed (whole text) =====");
    contextVec = Word2VecContext.GetContextVec(tarPos, tarSize, nonSpaceTokenList, w2vIm, word2VecSkipWord, debugFlag);
    foreach (string testStr in testStrList)
    {
        Test(testStr, contextVec, w2vIm, w2vOm);
    }

    // ----- case 2: know about|know|about -----
    string inText1 = "Not all doctors know about this syndrome.";
    List<TokenObj> inTextList1 = TextObj.TextToTokenList(inText1);
    // remove space token from the list
    List<TokenObj> nonSpaceTokenList1 = TextObj.GetNonSpaceTokenObjList(inTextList1);
    Console.WriteLine("===== Test know about|know|about (window) =====");
    List<string> testStrList1 = new List<string>();
    testStrList1.Add("know about");
    testStrList1.Add("know");
    testStrList1.Add("about");

    // "know about": tokens 3-4; window first, then whole text
    tarPos = 3;
    tarSize = 2;
    radius = 2;
    contextVec = Word2VecContext.GetContextVec(tarPos, tarSize, nonSpaceTokenList1, w2vIm, radius, word2VecSkipWord, debugFlag);
    Test(testStrList1[0], contextVec, w2vIm, w2vOm);
    contextVec = Word2VecContext.GetContextVec(tarPos, tarSize, nonSpaceTokenList1, w2vIm, word2VecSkipWord, debugFlag);
    Test(testStrList1[0], contextVec, w2vIm, w2vOm);

    // "know": token 3
    tarPos = 3;
    tarSize = 1;
    contextVec = Word2VecContext.GetContextVec(tarPos, tarSize, nonSpaceTokenList1, w2vIm, radius, word2VecSkipWord, debugFlag);
    Test(testStrList1[1], contextVec, w2vIm, w2vOm);
    contextVec = Word2VecContext.GetContextVec(tarPos, tarSize, nonSpaceTokenList1, w2vIm, word2VecSkipWord, debugFlag);
    Test(testStrList1[1], contextVec, w2vIm, w2vOm);

    // "about": token 4
    tarPos = 4;
    tarSize = 1;
    contextVec = Word2VecContext.GetContextVec(tarPos, tarSize, nonSpaceTokenList1, w2vIm, radius, word2VecSkipWord, debugFlag);
    Test(testStrList1[2], contextVec, w2vIm, w2vOm);
    contextVec = Word2VecContext.GetContextVec(tarPos, tarSize, nonSpaceTokenList1, w2vIm, word2VecSkipWord, debugFlag);
    Test(testStrList1[2], contextVec, w2vIm, w2vOm);

    // ----- case 3: onset|on set -----
    string inText2 = "for the last 10 years was diagnosed early on set dementia 3 years ago.";
    List<TokenObj> inTextList2 = TextObj.TextToTokenList(inText2);
    // remove space token from the list
    List<TokenObj> nonSpaceTokenList2 = TextObj.GetNonSpaceTokenObjList(inTextList2);
    List<string> testStrList2 = new List<string>();
    testStrList2.Add("onset");
    testStrList2.Add("on set");
    Console.WriteLine("===== Test onset|on set (window-3) =====");
    // print the text this case actually runs on
    Console.WriteLine("inText: [" + inText2 + "]");

    // "on set": tokens 8-9
    tarPos = 8;
    tarSize = 2;
    radius = 3;
    contextVec = Word2VecContext.GetContextVec(tarPos, tarSize, nonSpaceTokenList2, w2vIm, radius, word2VecSkipWord, debugFlag);
    foreach (string testStr in testStrList2)
    {
        Test(testStr, contextVec, w2vIm, w2vOm);
    }

    // "on": token 8
    tarPos = 8;
    tarSize = 1;
    radius = 3;
    contextVec = Word2VecContext.GetContextVec(tarPos, tarSize, nonSpaceTokenList2, w2vIm, radius, word2VecSkipWord, debugFlag);
    Test("on", contextVec, w2vIm, w2vOm);

    // "set": token 9
    tarPos = 9;
    contextVec = Word2VecContext.GetContextVec(tarPos, tarSize, nonSpaceTokenList2, w2vIm, radius, word2VecSkipWord, debugFlag);
    Test("set", contextVec, w2vIm, w2vOm);

    Console.WriteLine("===== Test onset|on set (whole text) =====");
    // NOTE(review): tarPos/tarSize are still 9/1 from the "set" row here,
    // not 8/2 as in the window test above - confirm this is intended
    contextVec = Word2VecContext.GetContextVec(tarPos, tarSize, nonSpaceTokenList2, w2vIm, word2VecSkipWord, debugFlag);
    foreach (string testStr in testStrList2)
    {
        Test(testStr, contextVec, w2vIm, w2vOm);
    }

    // ----- case 4: every token against its context, radius 1-9 -----
    Console.WriteLine("===== Go through each tokens with diff radius 1-9) =====");
    Console.WriteLine("tarPos|tarWord|r=1|r=2|r=3|r=4|r=5|r=6|r=7|r=8|r=9");
    //String inText3 = "Broken bones can not sleep at night!";
    string inText3 = "not xyxy all doctors know about this syndrome.";
    List<TokenObj> inTextList3 = TextObj.TextToTokenList(inText3);
    // remove space token from the list
    List<TokenObj> nonSpaceTokenList3 = TextObj.GetNonSpaceTokenObjList(inTextList3);
    tarPos = 0;
    tarSize = 1;
    foreach (TokenObj tokenObj in nonSpaceTokenList3)
    {
        string tokenStr = tokenObj.GetTokenStr();
        string inStr = Word2VecContext.NormWordForWord2Vec(tokenStr);
        Console.Write(tarPos + "|" + tokenStr + "|");
        // print out all radius
        for (int r = 1; r < 10; r++)
        {
            // the context must come from the same non-space token list that
            // tarPos indexes into
            contextVec = Word2VecContext.GetContextVec(tarPos, tarSize, nonSpaceTokenList3, w2vIm, r, word2VecSkipWord, debugFlag);
            TestWin(inStr, contextVec, w2vIm, w2vOm);
        }
        Console.WriteLine("");
        tarPos++;
    }
}
// Returns the best ranked split candidate using the word2Vec context score,
// or the original inStr when there is nothing to rank or the top candidate
// does not pass IsTopCandValid().
//   inStr:                the original string
//   candidates:           split candidates
//   tarPos, tarSize:      position and token count of the target
//   nonSpaceTokenList:    the text as tokens, without space tokens
//   word2VecIm:           matrix used to build the context vector
//   word2VecOm:           matrix used to score the original string
//   word2VecSkipWord:     passed on to the context computation
//   contextRadius:        window radius of the context
//   shortSplitWordLength: not used in this method
//   maxShortSplitWordNo:  not used in this method
//   rwSplitFactor:        passed to IsTopCandValid()
//   maxCandNo:            max. number of candidates in the debug print
//   debugFlag:            turns the debug print on
public static string GetTopRankStr(string inStr, HashSet<string> candidates, int tarPos, int tarSize, List<TokenObj> nonSpaceTokenList, Word2Vec word2VecIm, Word2Vec word2VecOm, bool word2VecSkipWord, int contextRadius, int shortSplitWordLength, int maxShortSplitWordNo, double rwSplitFactor, int maxCandNo, bool debugFlag)
{
    // init: no correction unless a valid top candidate is found
    string topRankStr = inStr;
    if (candidates.Count <= 0)
    {
        return (topRankStr);
    }

    // 1. sorted score list for each candidates
    // This ranking can be improved if n-gram model (frequency) is used
    List<ContextScore> candScoreList = RankByContext.GetCandidateScoreList(
        candidates, tarPos, tarSize, nonSpaceTokenList, word2VecIm, word2VecOm,
        word2VecSkipWord, contextRadius, debugFlag);
    // nothing was scored: keep the original string instead of indexing an
    // empty list
    if (candScoreList.Count <= 0)
    {
        return (topRankStr);
    }
    // 1.1 get the top rank candidate
    ContextScore topContextScore = candScoreList[0];

    // 2. validate the top rank
    // 2.1 wordVec for context
    DoubleVec contextVec = Word2VecContext.GetContextVec(tarPos, tarSize,
        nonSpaceTokenList, word2VecIm, contextRadius, word2VecSkipWord, debugFlag);
    // 2.2 context score of the original words before split
    ContextScore orgContextScore = new ContextScore(inStr, contextVec, word2VecOm);
    // 2.3 compare the top rank split to the original string b4 split
    if (IsTopCandValid(inStr, orgContextScore, topContextScore, rwSplitFactor, debugFlag) == true)
    {
        // correction: the top candidate's score is good enough
        topRankStr = topContextScore.GetTerm();
    }

    // debug print
    if (debugFlag == true)
    {
        // print focus token (original)
        DebugPrint.PrintCScore(orgContextScore.ToString(), debugFlag);
        // print candidates, ordered by the comparator, at most maxCandNo
        ContextScoreComparator<ContextScore> csc = new ContextScoreComparator<ContextScore>();
        var list = candScoreList.OrderBy(x => x, csc).Take(maxCandNo).Select(obj => obj.ToString()).ToList();
        foreach (var item in list)
        {
            DebugPrint.PrintCScore(item, debugFlag);
        }
    }
    return (topRankStr);
}
// Context of the target limited to a window of the given radius.
// Delegates to the full GetContext() overload with allContext = false.
public static List<string> GetContext(int tarPos, int tarSize, List<TokenObj> nonSpaceTokenList, Word2Vec w2vIm, int radius, bool word2VecSkipWord, bool debugFlag)
{
    const bool useAllContext = false;
    return (GetContext(tarPos, tarSize, nonSpaceTokenList, w2vIm, radius,
        word2VecSkipWord, debugFlag, useAllContext));
}
// Worker behind the public GetContext() overloads.
// Converts every token to its normalized string with NormWordForWord2Vec()
// (such as [NUM], [URL], [EMAIL]), finds the context of the target with
// GetContextForTar() on those strings, prints it through DebugPrint and
// returns it. A null token list is treated as an empty text.
// TBD: the normalization should be done in pre-correction, preProcess
private static List<string> GetContext(int tarPos, int tarSize, List<TokenObj> nonSpaceTokenList, Word2Vec w2vIm, int radius, bool word2VecSkipWord, bool debugFlag, bool allContext)
{
    // the normalized string is what is used as key in w2v
    List<string> normWords = new List<string>();
    if (nonSpaceTokenList != null)
    {
        for (int i = 0; i < nonSpaceTokenList.Count; i++)
        {
            string rawStr = nonSpaceTokenList[i].GetTokenStr();
            normWords.Add(NormWordForWord2Vec(rawStr));
        }
    }

    List<string> context = GetContextForTar(tarPos, tarSize, normWords, w2vIm,
        radius, word2VecSkipWord, allContext);
    DebugPrint.PrintContext(context, debugFlag);
    return (context);
}
// Returns the best ranked real-word one-to-one candidate, or the original
// inStr when no candidate qualifies.
//   inStr:             the original string
//   candidates:        one-to-one candidates
//   cSpellApi:         source of the matrices, the word count map and settings
//   tarPos:            target position in nonSpaceTokenList
//   nonSpaceTokenList: the text as tokens, without space tokens
//   debugFlag:         turns the score print on
// The first entry of the CSpell score list is only considered when it also
// holds the maximum frequency, token (edit), phonetic and overlap score of
// the list; it is then validated with IsTopCandValid() against the context
// score and frequency score of the original string.
public static string GetTopRankStr(string inStr, HashSet<string> candidates, CSpellApi cSpellApi, int tarPos, List<TokenObj> nonSpaceTokenList, bool debugFlag)
{
    // init
    WordWcMap wcMap = cSpellApi.GetWordWcMap();
    Word2Vec inputMatrix = cSpellApi.GetWord2VecIm();
    Word2Vec outputMatrix = cSpellApi.GetWord2VecOm();
    int radius = cSpellApi.GetRw1To1ContextRadius();
    bool skipWord = cSpellApi.GetWord2VecSkipWord();
    int maxCandNo = cSpellApi.GetCanMaxCandNo();
    double edDistFac = cSpellApi.GetOrthoScoreEdDistFac();
    double phoneticFac = cSpellApi.GetOrthoScorePhoneticFac();
    double overlapFac = cSpellApi.GetOrthoScoreOverlapFac();
    // only for one-to-one, no merge here
    int tarSize = 1;
    // use cSpell top candidates (top sort)
    int topNo = 1;

    // candidates are scored against the lower-cased input
    string lowerInStr = inStr.ToLower();
    List<CSpellScore> scoreList = RankByCSpellRealWord1To1.GetCandidateScoreList(
        lowerInStr, candidates, wcMap, tarPos, tarSize, nonSpaceTokenList,
        inputMatrix, outputMatrix, skipWord, radius, edDistFac, phoneticFac,
        overlapFac, debugFlag);

    // nothing was scored: no correction
    if (scoreList.Count <= 0)
    {
        return (inStr);
    }

    string result = inStr;
    CSpellScore top = scoreList[0];
    double topFrequency = top.GetFScore().GetScore();
    double topToken = top.GetOScore().GetTokenScore();
    double topPhonetic = top.GetOScore().GetPhoneticScore();
    double topOverlap = top.GetOScore().GetOverlapScore();
    ContextScore orgContextScore = null;

    // the top rank must have the top score in frequency, edit, phonetic
    // and overlap
    bool topInAllScores = (topFrequency == CSpellScore.GetMaxFScore(scoreList))
        && (topToken == CSpellScore.GetMaxEScore(scoreList))
        && (topPhonetic == CSpellScore.GetMaxPScore(scoreList))
        && (topOverlap == CSpellScore.GetMaxOScore(scoreList));
    if (topInAllScores)
    {
        ContextScore topContextScore = top.GetCScore();
        // context score and frequency score of the original string
        DoubleVec contextVec = Word2VecContext.GetContextVec(tarPos, tarSize,
            nonSpaceTokenList, inputMatrix, radius, skipWord, debugFlag);
        orgContextScore = new ContextScore(inStr, contextVec, outputMatrix);
        FrequencyScore orgFrequencyScore = new FrequencyScore(inStr, wcMap);
        // correct only when the top candidate passes the validation
        // (an analysis print of the whole score list used to live here)
        if (IsTopCandValid(inStr, orgContextScore, top, orgFrequencyScore, cSpellApi, debugFlag) == true)
        {
            result = top.GetCandStr();
        }
    }

    // debug print
    if (debugFlag == true)
    {
        // print focus token (original)
        string focusLine = "No score for focus (" + inStr + ")";
        if (orgContextScore != null)
        {
            focusLine = orgContextScore.ToString();
        }
        DebugPrint.PrintScore(focusLine, debugFlag);

        // print candidates, at most maxCandNo
        List<string> candLines = scoreList.Take(maxCandNo).Select(obj => obj.ToString()).ToList();
        foreach (string candLine in candLines)
        {
            DebugPrint.PrintScore(candLine, debugFlag);
        }
    }
    return (result);
}