// Orders two CSpellScore objects by a weighted ensemble score, from high to low.
// The ensemble is 0.6 * orthographic + 0.25 * frequency + 0.15 * context score.
// Returns 1 when o2 has the higher ensemble score, -1 when o1 has the higher one;
// otherwise falls back to comparing the candidate strings (cand2 against cand1).
private int compareByEnsemble(CSpellScore o1, CSpellScore o2)
{
    double ensemble1 = 0.6 * o1.GetOScore().GetScore()
        + 0.25 * o1.GetFScore().GetScore()
        + 0.15 * o1.GetCScore().GetScore();
    double ensemble2 = 0.6 * o2.GetOScore().GetScore()
        + 0.25 * o2.GetFScore().GetScore()
        + 0.15 * o2.GetCScore().GetScore();

    // 1. ensemble score, from high to low
    if (ensemble2 > ensemble1)
    {
        return 1;
    }
    if (ensemble2 < ensemble1)
    {
        return -1;
    }

    // 2. tie: order by the candidate strings
    string firstCand = o1.GetCandStr();
    string secondCand = o2.GetCandStr();
    return secondCand.CompareTo(firstCand);
}
// Check the context-score rules for a real-word merge correction.
// Returns true when the top candidate's context score is good enough,
// relative to the original (un-merged) term's context score, to merge.
// debugFlag is accepted but not used by this method.
private static bool IsTopCandValid(ContextScore orgContextScore, ContextScore topContextScore, double rwMergeFactor, bool debugFlag)
{
    double orgScore = orgContextScore.GetScore();
    double topScore = topContextScore.GetScore();

    if (orgScore < 0.0d)
    {
        // 2.2a merge if the org score is negative and the top score is positive
        // 2.2b merge if both are negative and the top score is better than
        //      orgScore * rwMergeFactor (needed for higher recall and F1)
        return (topScore > 0.0d)
            || ((topScore < 0.0d) && (topScore > orgScore * rwMergeFactor));
    }

    if (orgScore > 0.0d)
    {
        // 2.3a merge if the org score is positive and the scaled top score beats it
        return topScore * rwMergeFactor > orgScore;
    }

    // 2.1 no merge correction if orgScore is 0.0d (no word2Vec information)
    return false;
}
// Test driver: prints the context scores of "onset" and "on set" for a fixed
// sample sentence, using a context window around the target tokens.
private static void TestOnSet(Word2Vec w2vIm, Word2Vec w2vOm)
{
    const string sentence = "He was diagnosed early on set dementia 3 years ago.";
    const int targetPos = 4;
    const int targetSize = 2; // "on set" has 2 tokens
    const int windowRadius = 2;
    const bool skipWord = true;
    const bool debug = false;

    // tokenize and remove space tokens from the list
    List<TokenObj> allTokens = new TextObj(sentence).GetTokenList();
    List<TokenObj> tokens = TextObj.GetNonSpaceTokenObjList(allTokens);

    Console.WriteLine("==========================================");
    Console.WriteLine("-- inTextList: [" + sentence + "]");

    // context with window radius
    DoubleVec contextVec = Word2VecContext.GetContextVec(targetPos, targetSize, tokens, w2vIm, windowRadius, skipWord, debug);

    string merged = "onset";
    ContextScore mergedScore = new ContextScore(merged, contextVec, w2vOm);
    string split = "on set";
    ContextScore splitScore = new ContextScore(split, contextVec, w2vOm);

    Console.WriteLine("- [" + merged + "]: " + mergedScore.ToString());
    Console.WriteLine("- [" + split + "]: " + splitScore.ToString());
}
// Further validation of the top real-word 1-to-1 candidate by ratio, distance,
// and minimum thresholds on both the context score and the frequency score.
// All thresholds are read from cSpellApi.
private static bool IsTopCandValidByScores(ContextScore orgContextScore, FrequencyScore orgFreqScore, ContextScore topContextScore, CSpellScore topCSpellScore, CSpellApi cSpellApi)
{
    // thresholds
    double csFactor = cSpellApi.GetRankRw1To1CandCsFac();
    double wordMinCs = cSpellApi.GetRankRw1To1WordMinCs();
    double candMinCs = cSpellApi.GetRankRw1To1CandMinCs();
    double csDist = cSpellApi.GetRankRw1To1CandCsDist();
    double fsFactor = cSpellApi.GetRankRw1To1CandFsFac();
    double candMinFs = cSpellApi.GetRankRw1To1CandMinFs();
    double fsDist = cSpellApi.GetRankRw1To1CandFsDist();

    double orgScore = orgContextScore.GetScore();
    double topScore = topContextScore.GetScore();

    // context score check:
    // 1. the topScore is big enough to cover the orgScore
    // 2. the distance is > a value for confidence
    bool contextOk = ((topScore / -orgScore) > csFactor)
        && (orgScore > wordMinCs)
        && (topScore > candMinCs)
        && ((topScore - orgScore) > csDist);
    if (!contextOk)
    {
        return false;
    }

    // frequency check:
    // 1. cand has better frequency (by ratio and minimum)
    // 2. or the difference is within a range
    double orgFScore = orgFreqScore.GetScore();
    double topFScore = topCSpellScore.GetFScore().GetScore();
    return ((topFScore / orgFScore) > fsFactor)
        && (topFScore > candMinFs)
        && ((topFScore > orgFScore) || ((orgFScore - topFScore) < fsDist));
}
/// <summary>
/// Compares two ContextScore objects. Ordering is by score from high to low;
/// when neither score is greater, the terms are compared (term of o2 against
/// term of o1).
/// </summary>
/// <param name="o1"> first ContextScore to be compared </param>
/// <param name="o2"> second ContextScore to be compared </param>
/// <returns> 1 if the score of o2 is greater than the score of o1, -1 if it is
/// smaller, otherwise the result of comparing the term of o2 to the term of o1.
/// </returns>
public virtual int Compare(ContextScore o1, ContextScore o2)
{
    // 1. compare total score first, from high to low
    double firstScore = o1.GetScore();
    double secondScore = o2.GetScore();
    if (secondScore > firstScore)
    {
        return 1;
    }
    if (secondScore < firstScore)
    {
        return -1;
    }

    // 2. tie: order by the terms
    string firstTerm = o1.GetTerm();
    string secondTerm = o2.GetTerm();
    return secondTerm.CompareTo(firstTerm);
}
// Constructor: stores the word and candidate strings, then builds the
// orthographic, frequency, noisy-channel, and context scores for the pair.
public CSpellScore(string wordStr, string candStr, WordWcMap wordWcMap, DoubleVec contextVec, Word2Vec word2Vec, double wf1, double wf2, double wf3)
{
    this.wordStr_ = wordStr;
    this.candStr_ = candStr;

    // calculate the component scores
    this.oScore_ = new OrthographicScore(this.wordStr_, this.candStr_, wf1, wf2, wf3);
    this.fScore_ = new FrequencyScore(this.candStr_, wordWcMap);
    this.nScore_ = new NoisyChannelScore(this.wordStr_, this.candStr_, wordWcMap, wf1, wf2, wf3);
    this.cScore_ = new ContextScore(this.candStr_, contextVec, word2Vec);
}
// Return the best ranked MergeObj from candidates using the word2Vec context score.
// nonSpaceTokenList is the token list with space tokens removed.
// Returns null if no candidate is found to correct: candidates is empty, no
// score is available, the top ranked term cannot be mapped back to a MergeObj,
// or the top candidate fails the merge score rules (IsTopCandValid).
public static MergeObj GetTopRankMergeObj(HashSet<MergeObj> candidates, List<TokenObj> nonSpaceTokenList, Word2Vec word2VecIm, Word2Vec word2VecOm, bool word2VecSkipWord, int contextRadius, double rwMergeFactor, bool debugFlag)
{
    if (candidates.Count == 0)
    {
        return null;
    }

    // 1. find the sorted score list for the candidates
    List<ContextScore> candScoreList = GetCandidateScoreList(candidates, nonSpaceTokenList, word2VecIm, word2VecOm, word2VecSkipWord, contextRadius, debugFlag);

    // 2. find the top ranked score
    // the 0 element has the highest score because the list is sorted
    if (candScoreList.Count == 0)
    {
        return null;
    }
    ContextScore topContextScore = candScoreList[0];
    if (topContextScore == null)
    {
        return null;
    }

    // 3. find the mergeObj from the top ranked str
    // 3.1 key: coreMergeWord, value: MergeObj
    Dictionary<string, MergeObj> candStrMergeObjMap = new Dictionary<string, MergeObj>();
    foreach (MergeObj mergeObj in candidates)
    {
        candStrMergeObjMap[mergeObj.GetCoreMergeWord()] = mergeObj;
    }

    // 3.2 convert back from the top rank str to its MergeObj
    string topRankStr = topContextScore.GetTerm();
    MergeObj topRankMergeObj = candStrMergeObjMap.GetValueOrNull(topRankStr);
    if (topRankMergeObj == null)
    {
        // the lookup missed: nothing to correct, and nothing safe to dereference
        return null;
    }

    // 4. compare the top rank merge to the original string before merge
    // 4.1 wordVec for context; tarSize is the total token No of the orgMergeWords
    int tarPos = topRankMergeObj.GetStartPos();
    int tarSize = topRankMergeObj.GetEndPos() - topRankMergeObj.GetStartPos() + 1;
    DoubleVec contextVec = Word2VecContext.GetContextVec(tarPos, tarSize, nonSpaceTokenList, word2VecIm, contextRadius, word2VecSkipWord, debugFlag);

    // 4.2 context score for the original words before merge
    string orgMergeWord = topRankMergeObj.GetOrgMergeWord();
    ContextScore orgContextScore = new ContextScore(orgMergeWord, contextVec, word2VecOm);

    // 4.3 validate the top merge candidate; null if the score is not good enough
    if (!IsTopCandValid(orgContextScore, topContextScore, rwMergeFactor, debugFlag))
    {
        return null;
    }
    return topRankMergeObj;
}
// Test driver for merge and split: prints context scores for the merged word,
// the split words, and the average of the single split words. Each score is
// printed for a window-radius context and for the whole-text context.
private static void Test(string inText, int tarPos, int tarSize, int radius, string mergedWord, string splitWords, Word2Vec w2vIm, Word2Vec w2vOm)
{
    const bool skipWord = true;
    const bool debug = false;

    // 0. process the inText and remove space tokens from the list
    List<TokenObj> allTokens = new TextObj(inText).GetTokenList();
    List<TokenObj> tokens = TextObj.GetNonSpaceTokenObjList(allTokens);
    Console.WriteLine("==========================================");
    Console.WriteLine("-- inTextList: [" + inText + "]");

    // 1.a context with window radius
    DoubleVec windowVec = Word2VecContext.GetContextVec(tarPos, tarSize, tokens, w2vIm, radius, skipWord, debug);
    // 1.b context with all inText
    DoubleVec fullVec = Word2VecContext.GetContextVec(tarPos, tarSize, tokens, w2vIm, skipWord, debug);

    // 1.c merged word
    ContextScore mergedWindow = new ContextScore(mergedWord, windowVec, w2vOm);
    ContextScore mergedFull = new ContextScore(mergedWord, fullVec, w2vOm);
    Console.WriteLine(mergedWindow.ToString() + "|" + string.Format("{0,1:F8}", mergedFull.GetScore()));

    // 2. split words
    ContextScore splitWindow = new ContextScore(splitWords, windowVec, w2vOm);
    ContextScore splitFull = new ContextScore(splitWords, fullVec, w2vOm);
    Console.WriteLine(splitWindow.ToString() + "|" + string.Format("{0,1:F8}", splitFull.GetScore()));

    // 3. use the avg. score on single words
    // this uses a different context for each single word
    List<string> words = TermUtil.ToWordList(splitWords);
    double windowSum = 0.0d; // radius
    double fullSum = 0.0d;   // all inText
    int offset = 0;
    while (offset < words.Count)
    {
        string word = words[offset];

        // window radius
        DoubleVec wordWindowVec = Word2VecContext.GetContextVec(tarPos + offset, 1, tokens, w2vIm, radius, skipWord, debug);
        windowSum += new ContextScore(word, wordWindowVec, w2vOm).GetScore();

        // all text
        DoubleVec wordFullVec = Word2VecContext.GetContextVec(tarPos + offset, 1, tokens, w2vIm, skipWord, debug);
        fullSum += new ContextScore(word, wordFullVec, w2vOm).GetScore();

        offset++;
    }

    double windowAvg = windowSum / offset; // window
    double fullAvg = fullSum / offset;     // all text
    Console.WriteLine("Avg. Single Word|" + string.Format("{0,1:F8}", windowAvg) + "|" + string.Format("{0,1:F8}", fullAvg));
}
// Check the context-score rules for a real-word one-to-one correction.
// Returns false when there is no top candidate or the top candidate's term
// equals inStr; otherwise compares the top candidate's context score against
// the context score computed for the original inStr.
private static bool CheckRealWord1To1Rules(ContextScore topContextScore, string inStr, int tarPos, int tarSize, List<TokenObj> nonSpaceTokenList, Word2Vec word2VecIm, Word2Vec word2VecOm, bool word2VecSkipWord, int contextRadius, double rw1To1Factor, bool debugFlag)
{
    // no topCand found, or it is the same as the input
    if (topContextScore == null)
    {
        return false;
    }
    if (topContextScore.GetTerm().Equals(inStr))
    {
        return false;
    }

    // 1. context score of the original inStr before one-to-one
    DoubleVec contextVec = Word2VecContext.GetContextVec(tarPos, tarSize, nonSpaceTokenList, word2VecIm, contextRadius, word2VecSkipWord, debugFlag);
    ContextScore orgCs = new ContextScore(inStr, contextVec, word2VecOm);

    DebugPrint.Println("--- Real-Word One-To-One Context Score Detail: ---", debugFlag);
    DebugPrint.Println("- Score - orgTerm: " + orgCs.ToString(), debugFlag);
    DebugPrint.Println("- Score - top 1-to-1: " + topContextScore.ToString(), debugFlag);
    DebugPrint.Println("- rw1To1Factor: " + rw1To1Factor, debugFlag);

    // 2. score rules for one-to-one
    double orgScore = orgCs.GetScore();
    double topScore = topContextScore.GetScore();

    if (orgScore < 0.0d)
    {
        if (topScore > 0.0d)
        {
            // 2.2a org score negative, top score positive: also require a
            // minimum distance and an org score that is not too negative
            return ((topScore - orgScore) > 0.085) && (orgScore > -0.085);
        }
        // 2.2b org score negative and top score negative but better
        return (topScore < 0.0d) && (topScore > orgScore * rw1To1Factor);
    }

    if (orgScore > 0.0d)
    {
        // 2.3a org score positive and the scaled top score beats it
        return topScore * rw1To1Factor > orgScore;
    }

    // 2.1 no one-to-one correction if orgScore is 0.0d (no word2Vec information)
    return false;
}
// Return the set of context scores for the given candidate strings.
// The result is not sorted, because it is a set.
// tarPos: starting position of the target token
// tarSize: token size of the target token (single word = 1)
// contextRadius: window radius
public static HashSet<ContextScore> GetCandidateScoreSet(HashSet<string> candidates, int tarPos, int tarSize, List<TokenObj> nonSpaceTokenList, Word2Vec word2VecIm, Word2Vec word2VecOm, bool word2VecSkipWord, int contextRadius, bool debugFlag)
{
    // 1. one shared context vector for all candidates, using the input matrix
    DoubleVec sharedContext = Word2VecContext.GetContextVec(tarPos, tarSize, nonSpaceTokenList, word2VecIm, contextRadius, word2VecSkipWord, debugFlag);

    // 2. context score for each candidate, using the output matrix
    HashSet<ContextScore> scores = new HashSet<ContextScore>();
    foreach (string candidate in candidates)
    {
        scores.Add(new ContextScore(candidate, sharedContext, word2VecOm));
    }
    return scores;
}
// Compare by combination, in priority order O, N, F, C: the first score kind
// whose values differ between o1 and o2 decides the result through its own
// comparator. When all four are equal, the candidate strings are compared.
private int compareByCombo(CSpellScore o1, CSpellScore o2)
{
    OrthographicScore ortho1 = o1.GetOScore();
    OrthographicScore ortho2 = o2.GetOScore();
    NoisyChannelScore noisy1 = o1.GetNScore();
    NoisyChannelScore noisy2 = o2.GetNScore();
    FrequencyScore freq1 = o1.GetFScore();
    FrequencyScore freq2 = o2.GetFScore();
    ContextScore context1 = o1.GetCScore();
    ContextScore context2 = o2.GetCScore();

    // 1. orthographic score
    if (ortho1.GetScore() != ortho2.GetScore())
    {
        return new OrthographicScoreComparator<OrthographicScore>().Compare(ortho1, ortho2);
    }

    // 2. noisy channel score
    if (noisy1.GetScore() != noisy2.GetScore())
    {
        return new NoisyChannelScoreComparator<NoisyChannelScore>().Compare(noisy1, noisy2);
    }

    // 3. pure frequency score
    if (freq1.GetScore() != freq2.GetScore())
    {
        return new FrequencyScoreComparator<FrequencyScore>().Compare(freq1, freq2);
    }

    // 4. context score
    if (context1.GetScore() != context2.GetScore())
    {
        return new ContextScoreComparator<ContextScore>().Compare(context1, context2);
    }

    // 5. all scores equal: order by the candidate strings
    string firstCand = o1.GetCandStr();
    string secondCand = o2.GetCandStr();
    return secondCand.CompareTo(firstCand);
}
// Return the set of context scores for the given merge candidates.
// The result is not sorted, because it is a set.
// Each merge candidate has its own context: its target position is its start
// position and its target size is (end position - start position + 1).
public static HashSet<ContextScore> GetCandidateScoreSet(HashSet<MergeObj> candidates, List<TokenObj> nonSpaceTokenList, Word2Vec word2VecIm, Word2Vec word2VecOm, bool word2VecSkipWord, int contextRadius, bool debugFlag)
{
    HashSet<ContextScore> scores = new HashSet<ContextScore>();
    foreach (MergeObj candidate in candidates)
    {
        // 1. context vector for this merge, using the input matrix
        int startPos = candidate.GetStartPos();
        int tokenSpan = candidate.GetEndPos() - candidate.GetStartPos() + 1;
        DoubleVec mergeContext = Word2VecContext.GetContextVec(startPos, tokenSpan, nonSpaceTokenList, word2VecIm, contextRadius, word2VecSkipWord, debugFlag);

        // 2. context score of the merged word, using the output matrix
        string mergedWord = candidate.GetCoreMergeWord();
        scores.Add(new ContextScore(mergedWord, mergeContext, word2VecOm));
    }
    return scores;
}
// Use the context and frequency scores to validate the top ranked candidate
// for a real-word one-to-one correction.
// Returns false when the top candidate has no context score or its term
// equals inStr.
private static bool IsTopCandValid(string inStr, ContextScore orgContextScore, CSpellScore topCSpellScore, FrequencyScore orgFreqScore, CSpellApi cSpellApi, bool debugFlag)
{
    ContextScore topContextScore = topCSpellScore.GetCScore();

    // no topCand found, or it is the same as the input
    if (topContextScore == null)
    {
        return false;
    }
    if (topContextScore.GetTerm().Equals(inStr))
    {
        return false;
    }

    // score rules for one-to-one
    double orgScore = orgContextScore.GetScore();
    double topScore = topContextScore.GetScore();
    double cFactor = cSpellApi.GetRankRw1To1CFac();

    if (orgScore < 0.0d)
    {
        if (topScore > 0.0d)
        {
            // 2.2a org score negative, top score positive:
            // further check by ratio, dist, and min. on CScore and FScore
            return IsTopCandValidByScores(orgContextScore, orgFreqScore, topContextScore, topCSpellScore, cSpellApi);
        }
        // 2.2b org score negative and top score negative but better
        return (topScore < 0.0d) && (topScore > orgScore * cFactor);
    }

    if (orgScore > 0.0d)
    {
        // 2.3a org score positive and the scaled top score beats it
        return topScore * cFactor > orgScore;
    }

    // 2.1 no 1-to-1 correction if orgScore is 0.0d (no word2Vec information)
    return false;
}
// Return the candidate with the highest context score.
// A candidate replaces the current best only when its score is strictly
// greater than the best so far, starting from 0.0d; if none qualifies,
// inStr is returned.
// Per the original note, this method is replaced by GetTopRankStr, which
// sorts by comparator.
public static string GetTopRankStrByScore(string inStr, HashSet<string> candidates, int tarPos, int tarSize, List<TokenObj> nonSpaceTokenList, Word2Vec word2VecIm, Word2Vec word2VecOm, bool word2VecSkipWord, int contextRadius, bool debugFlag)
{
    // 1. get the context vector for the target
    DoubleVec contextVec = Word2VecContext.GetContextVec(tarPos, tarSize, nonSpaceTokenList, word2VecIm, contextRadius, word2VecSkipWord, debugFlag);

    // 2. linear scan for the best positive score
    string bestStr = inStr;
    double bestScore = 0.0d;
    foreach (string candidate in candidates)
    {
        double candScore = new ContextScore(candidate, contextVec, word2VecOm).GetScore();
        if (candScore > bestScore)
        {
            bestScore = candScore;
            bestStr = candidate;
        }
    }
    return bestStr;
}
// Compare two CSpellScore objects by their context scores (through
// ContextScoreComparator). When the context scores are equal, the candidate
// strings are compared instead.
private int compareByContext(CSpellScore o1, CSpellScore o2)
{
    ContextScore context1 = o1.GetCScore();
    ContextScore context2 = o2.GetCScore();

    // 1. context score
    if (context1.GetScore() != context2.GetScore())
    {
        return new ContextScoreComparator<ContextScore>().Compare(context1, context2);
    }

    // 2. equal context scores: order by the candidate strings
    string firstCand = o1.GetCandStr();
    string secondCand = o2.GetCandStr();
    return secondCand.CompareTo(firstCand);
}
// Return the best ranked str from candidates using the word2Vec context score.
// nonSpaceTokenList is the token list with space tokens removed.
// Returns the original inStr when there are no candidates, when no candidate
// score is available, or when the top candidate fails the split score rules
// (IsTopCandValid).
// shortSplitWordLength and maxShortSplitWordNo are accepted but not used here.
// maxCandNo limits how many candidates are printed when debugFlag is true.
public static string GetTopRankStr(string inStr, HashSet<string> candidates, int tarPos, int tarSize, List<TokenObj> nonSpaceTokenList, Word2Vec word2VecIm, Word2Vec word2VecOm, bool word2VecSkipWord, int contextRadius, int shortSplitWordLength, int maxShortSplitWordNo, double rwSplitFactor, int maxCandNo, bool debugFlag)
{
    // init: no correction by default
    string topRankStr = inStr;

    // find the correction str
    if (candidates.Count > 0)
    {
        // 1. sorted score list for the candidates
        // This ranking can be improved if an n-gram model (frequency) is used
        List<ContextScore> candScoreList = RankByContext.GetCandidateScoreList(candidates, tarPos, tarSize, nonSpaceTokenList, word2VecIm, word2VecOm, word2VecSkipWord, contextRadius, debugFlag);

        // 1.1 get the top rank candidate; null when the list is empty so that
        // IsTopCandValid rejects it instead of the indexer throwing
        ContextScore topContextScore = (candScoreList.Count > 0) ? candScoreList[0] : null;

        // 2. validate the top rank
        // 2.1 wordVec for context
        DoubleVec contextVec = Word2VecContext.GetContextVec(tarPos, tarSize, nonSpaceTokenList, word2VecIm, contextRadius, word2VecSkipWord, debugFlag);
        // 2.2 context score for the original words before split
        ContextScore orgContextScore = new ContextScore(inStr, contextVec, word2VecOm);

        // 2.3 compare the top rank split to the original string before split;
        // correct only if the top score is good enough
        if (IsTopCandValid(inStr, orgContextScore, topContextScore, rwSplitFactor, debugFlag))
        {
            topRankStr = topContextScore.GetTerm();
        }

        // debug print
        if (debugFlag)
        {
            // print focus token (original)
            DebugPrint.PrintCScore(orgContextScore.ToString(), debugFlag);

            // print up to maxCandNo candidates, ordered by the comparator
            ContextScoreComparator<ContextScore> csc = new ContextScoreComparator<ContextScore>();
            var list = candScoreList.OrderBy(x => x, csc).Take(maxCandNo).Select(obj => obj.ToString()).ToList();
            foreach (var item in list)
            {
                DebugPrint.PrintCScore(item, debugFlag);
            }
        }
    }

    return topRankStr;
}
// Check the context-score rules for a real-word split correction.
// Returns false when there is no top candidate or its term equals inStr.
// debugFlag is accepted but not used by this method.
private static bool IsTopCandValid(string inStr, ContextScore orgContextScore, ContextScore topContextScore, double rwSplitFactor, bool debugFlag)
{
    // no topCand found, or it is the same as the input
    if (topContextScore == null)
    {
        return false;
    }
    if (topContextScore.GetTerm().Equals(inStr))
    {
        return false;
    }

    // score rules for split
    double orgScore = orgContextScore.GetScore();
    double topScore = topContextScore.GetScore();

    if (orgScore < 0.0d)
    {
        // 2.2a split if the org score is negative and the top score is positive
        // 2.2b split if both are negative and the top score is better than
        //      orgScore * rwSplitFactor (noted as saved for future usage)
        return (topScore > 0.0d)
            || ((topScore < 0.0d) && (topScore > orgScore * rwSplitFactor));
    }

    if (orgScore > 0.0d)
    {
        // 2.3a org score positive and the scaled top score beats it
        // (noted as saved for future usage)
        return topScore * rwSplitFactor > orgScore;
    }

    // 2.1 no split correction if orgScore is 0.0d (no word2Vec information)
    return false;
}
// Return the merge candidate with the highest context score.
// A candidate replaces the current best only when its score is strictly
// greater than the best so far, starting from 0.0d; if none qualifies,
// null is returned.
// Per the original note, this method is replaced by the comparator-sorted
// ranking.
public static MergeObj GetTopRankMergeObjByScore(HashSet<MergeObj> candidates, List<TokenObj> nonSpaceTokenList, Word2Vec word2VecIm, Word2Vec word2VecOm, bool word2VecSkipWord, int contextRadius, bool debugFlag)
{
    MergeObj best = null;
    double bestScore = 0.0d;

    foreach (MergeObj candidate in candidates)
    {
        // 1. context vector for this merge
        int startPos = candidate.GetStartPos();
        int tokenSpan = candidate.GetEndPos() - candidate.GetStartPos() + 1;
        DoubleVec mergeContext = Word2VecContext.GetContextVec(startPos, tokenSpan, nonSpaceTokenList, word2VecIm, contextRadius, word2VecSkipWord, debugFlag);

        // 2. context score of the merged word, using the output matrix
        string mergedWord = candidate.GetCoreMergeWord();
        double candScore = new ContextScore(mergedWord, mergeContext, word2VecOm).GetScore();

        // keep it only if it beats the best so far
        if (candScore > bestScore)
        {
            bestScore = candScore;
            best = candidate;
        }
    }

    return best;
}
// Return the best ranked str from candidates for a real-word one-to-one
// correction, using the cSpell scores (frequency, orthographic, context).
// nonSpaceTokenList is the token list with space tokens removed.
// Returns the original inStr unless the top ranked candidate has the maximum
// frequency, token (edit), phonetic, and overlap scores among all candidates
// AND passes the context/frequency rules in IsTopCandValid.
public static string GetTopRankStr(string inStr, HashSet<string> candidates, CSpellApi cSpellApi, int tarPos, List<TokenObj> nonSpaceTokenList, bool debugFlag)
{
    // init
    WordWcMap wordWcMap = cSpellApi.GetWordWcMap();
    Word2Vec word2VecIm = cSpellApi.GetWord2VecIm();
    Word2Vec word2VecOm = cSpellApi.GetWord2VecOm();
    int contextRadius = cSpellApi.GetRw1To1ContextRadius();
    bool word2VecSkipWord = cSpellApi.GetWord2VecSkipWord();
    int maxCandNo = cSpellApi.GetCanMaxCandNo();
    double wf1 = cSpellApi.GetOrthoScoreEdDistFac();
    double wf2 = cSpellApi.GetOrthoScorePhoneticFac();
    double wf3 = cSpellApi.GetOrthoScoreOverlapFac();
    int tarSize = 1; // only for one-to-one, no merge here
    string topRankStr = inStr;

    // lowercase with the invariant culture so the result does not depend on
    // the current thread culture (e.g. Turkish dotless i)
    string inStrLc = inStr.ToLowerInvariant();
    List<CSpellScore> cSpellScoreList = RankByCSpellRealWord1To1.GetCandidateScoreList(inStrLc, candidates, wordWcMap, tarPos, tarSize, nonSpaceTokenList, word2VecIm, word2VecOm, word2VecSkipWord, contextRadius, wf1, wf2, wf3, debugFlag);

    // find the correction str and correct
    if (cSpellScoreList.Count > 0)
    {
        // the 0 element is treated as the top ranked candidate
        CSpellScore topScore = cSpellScoreList[0];
        double topFScore = topScore.GetFScore().GetScore();         // frequency
        double topTScore = topScore.GetOScore().GetTokenScore();    // token
        double topPScore = topScore.GetOScore().GetPhoneticScore(); // phonetic
        double topOScore = topScore.GetOScore().GetOverlapScore();  // overlap
        ContextScore orgContextScore = null;

        // the top rank must hold the max frequency, edit, phonetic, and
        // overlap scores of the whole list
        if ((topFScore == CSpellScore.GetMaxFScore(cSpellScoreList))
            && (topTScore == CSpellScore.GetMaxEScore(cSpellScoreList))
            && (topPScore == CSpellScore.GetMaxPScore(cSpellScoreList))
            && (topOScore == CSpellScore.GetMaxOScore(cSpellScoreList)))
        {
            // 1.1 wordVec for context
            DoubleVec contextVec = Word2VecContext.GetContextVec(tarPos, tarSize, nonSpaceTokenList, word2VecIm, contextRadius, word2VecSkipWord, debugFlag);
            // 1.2 context and frequency scores for the original word before one-to-one
            orgContextScore = new ContextScore(inStr, contextVec, word2VecOm);
            FrequencyScore orgFScore = new FrequencyScore(inStr, wordWcMap);

            // correct only if the top candidate passes the score rules
            if (IsTopCandValid(inStr, orgContextScore, topScore, orgFScore, cSpellApi, debugFlag))
            {
                topRankStr = topScore.GetCandStr();

                // debug print for analysis (disabled; kept from the Java original):
                // System.out.println("======= cSpellScoreList.size(): "
                //     + cSpellScoreList.size() + " ========");
                // System.out.println(inStr
                //     + "," + String.format("%1.8f", orgFScore.GetScore())
                //     + "," + String.format("%1.8f", orgContextScore.GetScore()));
                // System.out.println(CSpellScore.GetScoreHeader());
                // for(CSpellScore cSpellScore: cSpellScoreList)
                // {
                //     System.out.println(cSpellScore.ToString(","));
                // }
            }
        }

        // debug print
        if (debugFlag)
        {
            // print focus token (original)
            if (orgContextScore != null)
            {
                DebugPrint.PrintScore(orgContextScore.ToString(), debugFlag);
            }
            else
            {
                DebugPrint.PrintScore("No score for focus (" + inStr + ")", debugFlag);
            }

            // print up to maxCandNo candidates
            var list = cSpellScoreList.Take(maxCandNo).Select(obj => obj.ToString()).ToList();
            foreach (var item in list)
            {
                DebugPrint.PrintScore(item, debugFlag);
            }
        }
    }

    return topRankStr;
}