// Picks the top-ranked correction for inStr out of candidates, using the
// rank mode configured on cSpellApi, and prints the matching score details
// through ScoreDetailByMode (debugFlag is forwarded to those printers).
//
// inStr:             the original string; returned unchanged when candidates
//                    is empty or the rank mode matches none of the cases.
// candidates:        candidate correction strings.
// cSpellApi:         source of the rank mode and all ranking options.
// tarPos:            target position, starting from 0, not counting empty
//                    space tokens.
// nonSpaceTokenList: token list handed to the context-based rankers.
public static string GetTopRankStr(string inStr, HashSet<string> candidates,
    CSpellApi cSpellApi, bool debugFlag, int tarPos,
    List<TokenObj> nonSpaceTokenList)
{
    // read every ranking option from the API up front
    int mode = cSpellApi.GetRankMode();
    double edDistFac = cSpellApi.GetOrthoScoreEdDistFac();
    double phoneticFac = cSpellApi.GetOrthoScorePhoneticFac();
    double overlapFac = cSpellApi.GetOrthoScoreOverlapFac();
    WordWcMap wcMap = cSpellApi.GetWordWcMap();
    int candLimit = cSpellApi.GetCanMaxCandNo();
    Word2Vec imVec = cSpellApi.GetWord2VecIm();
    Word2Vec omVec = cSpellApi.GetWord2VecOm();
    int radius = cSpellApi.GetNw1To1ContextRadius();
    bool skipWord = cSpellApi.GetWord2VecSkipWord();
    double rankRangeFac = cSpellApi.GetRankNwS1RankRangeFac();
    double minOScore = cSpellApi.GetRankNwS1MinOScore();

    // only for one-to-one or split, no merge here
    int targetSize = 1;

    // nothing to rank: keep the original string
    if (candidates.Count == 0)
    {
        return inStr;
    }

    string best = inStr;
    switch (mode)
    {
        case CSpellApi.RANK_MODE_ORTHOGRAPHIC:
            best = RankByOrthographic.GetTopRankStr(inStr, candidates,
                edDistFac, phoneticFac, overlapFac);
            ScoreDetailByMode.PrintOrthographicScore(inStr, candidates,
                candLimit, edDistFac, phoneticFac, overlapFac, debugFlag);
            break;

        case CSpellApi.RANK_MODE_FREQUENCY:
            best = RankByFrequency.GetTopRankStr(candidates, wcMap);
            ScoreDetailByMode.PrintFrequencyScore(candidates, wcMap,
                candLimit, debugFlag);
            break;

        case CSpellApi.RANK_MODE_CONTEXT:
            best = RankByContext.GetTopRankStr(inStr, candidates, tarPos,
                targetSize, nonSpaceTokenList, imVec, omVec, skipWord,
                radius);
            ScoreDetailByMode.PrintContextScore(candidates, tarPos,
                targetSize, nonSpaceTokenList, imVec, omVec, skipWord,
                radius, candLimit, debugFlag);
            break;

        case CSpellApi.RANK_MODE_NOISY_CHANNEL:
            best = RankByNoisyChannel.GetTopRankStr(inStr, candidates,
                wcMap, edDistFac, phoneticFac, overlapFac);
            ScoreDetailByMode.PrintNoisyChannelScore(inStr, candidates,
                wcMap, candLimit, edDistFac, phoneticFac, overlapFac,
                debugFlag);
            break;

        case CSpellApi.RANK_MODE_ENSEMBLE:
            best = RankByEnsemble.GetTopRankStr(inStr, candidates, wcMap,
                tarPos, targetSize, nonSpaceTokenList, imVec, omVec,
                skipWord, radius, rankRangeFac, edDistFac, phoneticFac,
                overlapFac);
            // ensemble uses the same basic score as CSpell
            ScoreDetailByMode.PrintCSpellScore(inStr, candidates, wcMap,
                candLimit, tarPos, targetSize, nonSpaceTokenList, imVec,
                omVec, skipWord, radius, edDistFac, phoneticFac,
                overlapFac, debugFlag);
            break;

        case CSpellApi.RANK_MODE_CSPELL:
            best = RankByCSpellNonWord.GetTopRankStr(inStr, candidates,
                wcMap, tarPos, targetSize, nonSpaceTokenList, imVec, omVec,
                skipWord, radius, rankRangeFac, minOScore, edDistFac,
                phoneticFac, overlapFac);
            ScoreDetailByMode.PrintCSpellScore(inStr, candidates, wcMap,
                candLimit, tarPos, targetSize, nonSpaceTokenList, imVec,
                omVec, skipWord, radius, edDistFac, phoneticFac,
                overlapFac, debugFlag);
            break;

        default:
            // unrecognized rank mode: fall through with the original string
            break;
    }

    return best;
}
// TBD: this file should be deleted by moving each method to
// the associated ranking class.

// public method
// Prints the context score of up to maxCandNo candidates, ordered by
// ContextScoreComparator. Does nothing unless debugFlag is true.
// The scores come from RankByContext.GetCandidateScoreSet, which receives
// candSet, tarPos, tarSize, inTextList, both word2Vec objects,
// word2VecSkipWord and contextRadius unchanged.
public static void PrintContextScore(HashSet<string> candSet, int tarPos,
    int tarSize, List<TokenObj> inTextList, Word2Vec word2VecIm,
    Word2Vec word2VecOm, bool word2VecSkipWord, int contextRadius,
    int maxCandNo, bool debugFlag)
{
    // debug output only
    if (!debugFlag)
    {
        return;
    }

    ContextScoreComparator<ContextScore> comparator =
        new ContextScoreComparator<ContextScore>();
    HashSet<ContextScore> scoreSet = RankByContext.GetCandidateScoreSet(
        candSet, tarPos, tarSize, inTextList, word2VecIm, word2VecOm,
        word2VecSkipWord, contextRadius, debugFlag);

    // order the scores, keep the first maxCandNo, render them to text
    IEnumerable<ContextScore> ranked =
        scoreSet.OrderBy(score => score, comparator);
    List<string> lines = ranked
        .Take(maxCandNo)
        .Select(score => score.ToString())
        .ToList();

    foreach (string line in lines)
    {
        DebugPrint.PrintCScore(line, debugFlag);
    }
}
// Returns the best-ranked string from candidates using word2Vec context
// scores. The top candidate is returned only when IsTopCandValid accepts
// it against the score of the original inStr; otherwise, or when
// candidates is empty, inStr is returned unchanged.
//
// NOTE(review): an earlier comment described the token list as
// "inTokenList, includes space token, is not coreTerm.Lc", while the
// parameter is named nonSpaceTokenList - confirm against the callers.
// NOTE(review): an earlier comment stated that the original inStr is
// returned if no candidate has score > 0.0d; that rule would live in
// IsTopCandValid - confirm there.
//
// shortSplitWordLength, maxShortSplitWordNo: not read by this method.
// rwSplitFactor: forwarded to IsTopCandValid.
// maxCandNo:     maximum number of candidate scores printed in debug mode.
public static string GetTopRankStr(string inStr, HashSet<string> candidates,
    int tarPos, int tarSize, List<TokenObj> nonSpaceTokenList,
    Word2Vec word2VecIm, Word2Vec word2VecOm, bool word2VecSkipWord,
    int contextRadius, int shortSplitWordLength, int maxShortSplitWordNo,
    double rwSplitFactor, int maxCandNo, bool debugFlag)
{
    // no candidates: nothing to correct
    if (candidates.Count == 0)
    {
        return inStr;
    }

    // 1. score list for the candidates; the first entry is taken as the
    //    top-ranked one (the list is expected to arrive sorted).
    //    This ranking can be improved if an n-gram model (frequency) is used.
    List<ContextScore> rankedScores = RankByContext.GetCandidateScoreList(
        candidates, tarPos, tarSize, nonSpaceTokenList, word2VecIm,
        word2VecOm, word2VecSkipWord, contextRadius, debugFlag);
    ContextScore bestScore = rankedScores[0];

    // 2. validate the top rank
    // 2.1 word vector for the context
    DoubleVec contextVec = Word2VecContext.GetContextVec(tarPos, tarSize,
        nonSpaceTokenList, word2VecIm, contextRadius, word2VecSkipWord,
        debugFlag);
    // 2.2 context score of the original words before the split
    ContextScore originalScore =
        new ContextScore(inStr, contextVec, word2VecOm);
    // 2.3 compare the top-ranked split to the original string before the
    //     split; correct only when the top candidate is judged valid
    bool accepted = IsTopCandValid(inStr, originalScore, bestScore,
        rwSplitFactor, debugFlag);
    string result = accepted ? bestScore.GetTerm() : inStr;

    // debug print
    if (debugFlag)
    {
        // focus token (original)
        DebugPrint.PrintCScore(originalScore.ToString(), debugFlag);

        // candidates, ordered by the comparator and capped at maxCandNo
        ContextScoreComparator<ContextScore> comparator =
            new ContextScoreComparator<ContextScore>();
        List<string> lines = rankedScores
            .OrderBy(score => score, comparator)
            .Take(maxCandNo)
            .Select(score => score.ToString())
            .ToList();
        foreach (string line in lines)
        {
            DebugPrint.PrintCScore(line, debugFlag);
        }
    }

    return result;
}