// Picks the best MergeObj out of candidates by ranking their core merge words
// with RankByFrequency (word counts come from cSpellApi.GetWordWcMap()).
// Returns null when candidates is empty.
// tarPos: start from 0, not include empty space token.
// Note: tarPos and nonSpaceTokenList are not read by this method.
private static MergeObj GetTopRankMergeObjByFrequency(HashSet<MergeObj> candidates, CSpellApi cSpellApi, bool debugFlag, int tarPos, List<TokenObj> nonSpaceTokenList)
{
    // settings pulled from the API object
    WordWcMap wordWcMap = cSpellApi.GetWordWcMap();
    int maxCandNo = cSpellApi.GetCanMaxCandNo();

    // no candidate, nothing to rank
    if (candidates.Count <= 0)
    {
        return null;
    }

    // 1. index every candidate by its core merge word
    //    (if two candidates share a word, the one enumerated later wins)
    Dictionary<string, MergeObj> mergeObjByWord = new Dictionary<string, MergeObj>();
    foreach (MergeObj candidate in candidates)
    {
        mergeObjByWord[candidate.GetCoreMergeWord()] = candidate;
    }
    HashSet<string> wordSet = new HashSet<string>(mergeObjByWord.Keys);

    // 2. rank the words by frequency
    string bestWord = RankByFrequency.GetTopRankStr(wordSet, wordWcMap);

    // 3. map the winning word back to its MergeObj
    //    bestWord is not expected to be null here because candidates is not empty
    MergeObj bestMergeObj = null;
    if (bestWord != null)
    {
        bestMergeObj = mergeObjByWord.GetValueOrNull(bestWord);
    }

    // 4. print out frequency score detail
    ScoreDetailByMode.PrintFrequencyScore(wordSet, wordWcMap, maxCandNo, debugFlag);

    return bestMergeObj;
}
// Returns the top ranked candidate for inStr, using the ranking strategy
// selected by cSpellApi.GetRankMode(), and prints the matching score detail.
// inStr itself is returned when candidates is empty or when the rank mode
// matches none of the handled modes.
// tarPos: start from 0, not include empty space token.
public static string GetTopRankStr(string inStr, HashSet<string> candidates, CSpellApi cSpellApi, bool debugFlag, int tarPos, List<TokenObj> nonSpaceTokenList)
{
    // configuration, read once from the API object
    int rankMode = cSpellApi.GetRankMode();
    double edDistWeight = cSpellApi.GetOrthoScoreEdDistFac();
    double phoneticWeight = cSpellApi.GetOrthoScorePhoneticFac();
    double overlapWeight = cSpellApi.GetOrthoScoreOverlapFac();
    WordWcMap wordWcMap = cSpellApi.GetWordWcMap();
    int maxCandNo = cSpellApi.GetCanMaxCandNo();
    Word2Vec word2VecIm = cSpellApi.GetWord2VecIm();
    Word2Vec word2VecOm = cSpellApi.GetWord2VecOm();
    int contextRadius = cSpellApi.GetNw1To1ContextRadius();
    bool word2VecSkipWord = cSpellApi.GetWord2VecSkipWord();
    double rangeFactor = cSpellApi.GetRankNwS1RankRangeFac();
    double nwS1MinOScore = cSpellApi.GetRankNwS1MinOScore();
    // only for one-to-one or split, no merge here
    int tarSize = 1;

    // without candidates the input is handed back untouched
    if (candidates.Count <= 0)
    {
        return inStr;
    }

    // default result when no mode below applies
    string best = inStr;
    switch (rankMode)
    {
        case CSpellApi.RANK_MODE_ORTHOGRAPHIC:
            best = RankByOrthographic.GetTopRankStr(inStr, candidates,
                edDistWeight, phoneticWeight, overlapWeight);
            ScoreDetailByMode.PrintOrthographicScore(inStr, candidates, maxCandNo,
                edDistWeight, phoneticWeight, overlapWeight, debugFlag);
            break;

        case CSpellApi.RANK_MODE_FREQUENCY:
            best = RankByFrequency.GetTopRankStr(candidates, wordWcMap);
            ScoreDetailByMode.PrintFrequencyScore(candidates, wordWcMap, maxCandNo, debugFlag);
            break;

        case CSpellApi.RANK_MODE_CONTEXT:
            best = RankByContext.GetTopRankStr(inStr, candidates, tarPos, tarSize,
                nonSpaceTokenList, word2VecIm, word2VecOm, word2VecSkipWord, contextRadius);
            ScoreDetailByMode.PrintContextScore(candidates, tarPos, tarSize,
                nonSpaceTokenList, word2VecIm, word2VecOm, word2VecSkipWord,
                contextRadius, maxCandNo, debugFlag);
            break;

        case CSpellApi.RANK_MODE_NOISY_CHANNEL:
            best = RankByNoisyChannel.GetTopRankStr(inStr, candidates, wordWcMap,
                edDistWeight, phoneticWeight, overlapWeight);
            ScoreDetailByMode.PrintNoisyChannelScore(inStr, candidates, wordWcMap, maxCandNo,
                edDistWeight, phoneticWeight, overlapWeight, debugFlag);
            break;

        case CSpellApi.RANK_MODE_ENSEMBLE:
            best = RankByEnsemble.GetTopRankStr(inStr, candidates, wordWcMap, tarPos, tarSize,
                nonSpaceTokenList, word2VecIm, word2VecOm, word2VecSkipWord, contextRadius,
                rangeFactor, edDistWeight, phoneticWeight, overlapWeight);
            // ensemble uses the same basic score as CSpell
            ScoreDetailByMode.PrintCSpellScore(inStr, candidates, wordWcMap, maxCandNo, tarPos,
                tarSize, nonSpaceTokenList, word2VecIm, word2VecOm, word2VecSkipWord,
                contextRadius, edDistWeight, phoneticWeight, overlapWeight, debugFlag);
            break;

        case CSpellApi.RANK_MODE_CSPELL:
            best = RankByCSpellNonWord.GetTopRankStr(inStr, candidates, wordWcMap, tarPos,
                tarSize, nonSpaceTokenList, word2VecIm, word2VecOm, word2VecSkipWord,
                contextRadius, rangeFactor, nwS1MinOScore,
                edDistWeight, phoneticWeight, overlapWeight);
            ScoreDetailByMode.PrintCSpellScore(inStr, candidates, wordWcMap, maxCandNo, tarPos,
                tarSize, nonSpaceTokenList, word2VecIm, word2VecOm, word2VecSkipWord,
                contextRadius, edDistWeight, phoneticWeight, overlapWeight, debugFlag);
            break;
    }

    return best;
}
// Debug helper: when debugFlag is set, builds the frequency score set for
// candSet, orders it with FrequencyScoreComparator and prints at most
// maxCandNo entries through DebugPrint.PrintFScore. Does nothing otherwise.
public static void PrintFrequencyScore(HashSet<string> candSet, WordWcMap wordWcMap, int maxCandNo, bool debugFlag)
{
    // scores are only computed when debugging is on
    if (!debugFlag)
    {
        return;
    }

    FrequencyScoreComparator<FrequencyScore> comparator = new FrequencyScoreComparator<FrequencyScore>();
    HashSet<FrequencyScore> scores = RankByFrequency.GetCandidateScoreSet(candSet, wordWcMap);

    // render every line first, so printing starts only after all
    // selected scores have been converted to text
    List<string> lines = new List<string>();
    foreach (FrequencyScore score in scores.OrderBy(s => s, comparator).Take(maxCandNo))
    {
        lines.Add(score.ToString());
    }

    foreach (string line in lines)
    {
        DebugPrint.PrintFScore(line, debugFlag);
    }
}