// Returns the best ranked MergeObj among the candidates, or null when no
// candidate should be used for correction.
//
// Steps:
//   1. Score all candidates with GetCandidateScoreList and take the first
//      entry of the returned list as the top-ranked term (the list is
//      expected to be sorted best-first by that helper).
//   2. Map the top-ranked term back to its MergeObj through the candidate's
//      core merge word.
//   3. Score the original (un-merged) words against the same context and
//      let IsTopCandValid decide whether the merge is good enough.
//
// Parameters:
//   candidates        - merge candidates to rank; null or empty yields null
//   nonSpaceTokenList - token list forwarded to the scoring helpers
//                       (presumably the MergeObj start/end positions index
//                       into this list - confirm against the callers)
//   word2VecIm        - Word2Vec model used to build the context vector
//   word2VecOm        - Word2Vec model used to score a term against the
//                       context vector
//   word2VecSkipWord  - forwarded unchanged to the context/scoring helpers
//   contextRadius     - forwarded unchanged to the context/scoring helpers
//   rwMergeFactor     - forwarded to IsTopCandValid for the final check
//   debugFlag         - forwarded to the helpers
//
// Returns: the validated top-ranked MergeObj, or null if there are no
// candidates, no scores, the top-ranked term cannot be mapped back to a
// candidate, or the candidate fails validation.
public static MergeObj GetTopRankMergeObj(HashSet<MergeObj> candidates, List<TokenObj> nonSpaceTokenList, Word2Vec word2VecIm, Word2Vec word2VecOm, bool word2VecSkipWord, int contextRadius, double rwMergeFactor, bool debugFlag)
{
    // nothing to rank
    if (candidates == null || candidates.Count == 0)
    {
        return null;
    }

    // 1. find the sorted score list for the candidates
    List<ContextScore> candScoreList = GetCandidateScoreList(candidates, nonSpaceTokenList, word2VecIm, word2VecOm, word2VecSkipWord, contextRadius, debugFlag);

    // 2. the element at index 0 is taken as the highest score
    if (candScoreList.Count == 0)
    {
        return null;
    }
    ContextScore topContextScore = candScoreList[0];
    if (topContextScore == null)
    {
        return null;
    }

    // 3.1 index the candidates by their core merge word
    //     (a later candidate with the same word replaces an earlier one)
    Dictionary<string, MergeObj> candStrMergeObjMap = new Dictionary<string, MergeObj>();
    foreach (MergeObj mergeObj in candidates)
    {
        string mergeWord = mergeObj.GetCoreMergeWord();
        candStrMergeObjMap[mergeWord] = mergeObj;
    }

    // 3.2 convert the top ranked term back to its MergeObj.
    //     The lookup can fail (null term, or a term that is not a core merge
    //     word of any candidate); bail out instead of dereferencing null.
    string topRankStr = topContextScore.GetTerm();
    if (topRankStr == null)
    {
        return null;
    }
    MergeObj topRankMergeObj;
    if (!candStrMergeObjMap.TryGetValue(topRankStr, out topRankMergeObj) || topRankMergeObj == null)
    {
        return null;
    }

    // 4. compare the top ranked merge with the original words before merge
    // 4.1 context vector around the merged span;
    //     tarSize is the total token count of the original merge words
    int tarPos = topRankMergeObj.GetStartPos();
    int tarSize = topRankMergeObj.GetEndPos() - tarPos + 1;
    DoubleVec contextVec = Word2VecContext.GetContextVec(tarPos, tarSize, nonSpaceTokenList, word2VecIm, contextRadius, word2VecSkipWord, debugFlag);

    // 4.2 score of the original words before merge, in the same context
    string orgMergeWord = topRankMergeObj.GetOrgMergeWord();
    ContextScore orgContextScore = new ContextScore(orgMergeWord, contextVec, word2VecOm);

    // 4.3 reject the candidate when its score is not good enough for correction
    if (!IsTopCandValid(orgContextScore, topContextScore, rwMergeFactor, debugFlag))
    {
        return null;
    }
    return topRankMergeObj;
}
// Cleans up a list of merge objects by comparing each entry with the one
// that immediately follows it:
//   1. contain: when the next entry starts at the same position and ends
//      later, the current (shorter) entry is dropped;
//   2. overlap: when the next entry starts strictly inside the current one,
//      the current entry is kept and the next one is dropped.
// An entry that has been marked for dropping by rule 2 is never emitted,
// even if it overlaps its own successor (chain A-B-C: B is dropped, C is
// then judged on its own).
//
// This is a quick fix for window = 2. The permanent fix should be a
// real-time update on each merge.
//
// NOTE(review): only adjacent entries are compared, so the input is
// presumably ordered by start position - confirm against the caller.
// NOTE(review): the overlap test uses startPos2 < endPos1; if end positions
// are inclusive, two entries sharing exactly the end token are not treated
// as overlapping - confirm whether that is intended.
//
// Returns a new list; the input list is not modified. A null input yields
// an empty list.
private static List<MergeObj> CleanUpMergeObjList(List<MergeObj> mergeObjList)
{
    List<MergeObj> outMergeObjList = new List<MergeObj>();
    if (mergeObjList == null)
    {
        return outMergeObjList;
    }

    // true when the entry being visited was already ruled out because it
    // overlaps an entry that has been kept
    bool skipNext = false;
    int lastIndex = mergeObjList.Count - 1;
    for (int i = 0; i <= lastIndex; i++)
    {
        MergeObj mergeObj1 = mergeObjList[i];

        // the last entry has no successor: keep it unless it was ruled out
        if (i == lastIndex)
        {
            if (!skipNext)
            {
                outMergeObjList.Add(mergeObj1);
            }
            break;
        }

        MergeObj mergeObj2 = mergeObjList[i + 1];    // next mergeObj
        int startPos1 = mergeObj1.GetStartPos();
        int startPos2 = mergeObj2.GetStartPos();
        int endPos1 = mergeObj1.GetEndPos();
        int endPos2 = mergeObj2.GetEndPos();

        // mergeObj2 contains mergeObj1: drop mergeObj1.
        // A pending skip is left untouched so it also applies to mergeObj2,
        // which starts at the same position as the entry being dropped.
        if ((startPos1 == startPos2) && (endPos1 < endPos2))
        {
            continue;
        }

        // mergeObj1 was ruled out by an earlier overlap: drop it and start
        // fresh with the next entry, whether or not the two overlap.
        if (skipNext)
        {
            skipNext = false;
            continue;
        }

        // keep mergeObj1; if mergeObj2 overlaps it, rule mergeObj2 out
        outMergeObjList.Add(mergeObj1);
        skipNext = (startPos2 > startPos1) && (startPos2 < endPos1);
    }
    return outMergeObjList;
}