// TBD: this file should be deleted by moving each method to
// the associated ranking class.

// public method

// Debug-only helper: builds the context score set for the candidates
// (via RankByContext.GetCandidateScoreSet), orders it with
// ContextScoreComparator, and prints at most maxCandNo entries through
// DebugPrint.PrintCScore. Nothing is computed or printed when debugFlag
// is false.
//
//   candSet           candidate strings to score
//   tarPos, tarSize   target position/size, forwarded to the scorer
//   inTextList        token list, forwarded to the scorer
//   word2VecIm/Om     word2Vec matrices, forwarded to the scorer
//   word2VecSkipWord  forwarded to the scorer
//   contextRadius     forwarded to the scorer
//   maxCandNo         maximum number of score lines to print
//   debugFlag         master switch for this method
public static void PrintContextScore(HashSet<string> candSet, int tarPos, int tarSize, List<TokenObj> inTextList, Word2Vec word2VecIm, Word2Vec word2VecOm, bool word2VecSkipWord, int contextRadius, int maxCandNo, bool debugFlag)
{
    // Guard clause: this method only exists for debug output.
    if (!debugFlag)
    {
        return;
    }

    ContextScoreComparator<ContextScore> ranker = new ContextScoreComparator<ContextScore>();
    HashSet<ContextScore> scoredCandidates = RankByContext.GetCandidateScoreSet(
        candSet, tarPos, tarSize, inTextList, word2VecIm, word2VecOm,
        word2VecSkipWord, contextRadius, debugFlag);

    // Materialize every line first, then print, so all ToString() calls
    // happen before any output is produced.
    List<string> scoreLines = scoredCandidates
        .OrderBy(score => score, ranker)
        .Take(maxCandNo)
        .Select(score => score.ToString())
        .ToList();

    for (int i = 0; i < scoreLines.Count; i++)
    {
        DebugPrint.PrintCScore(scoreLines[i], debugFlag);
    }
}
// Returns the candidates' ContextScore objects as a list sorted with
// ContextScoreComparator (per the original note: by score, higher first).
// Each score is also handed to DebugPrint.PrintCScore together with
// debugFlag.
//
//   candidates         merge candidates to score
//   nonSpaceTokenList  token list, forwarded to GetCandidateScoreSet
//   word2VecIm/Om      word2Vec matrices, forwarded to GetCandidateScoreSet
//   word2VecSkipWord   forwarded to GetCandidateScoreSet
//   contextRadius      forwarded to GetCandidateScoreSet
//   debugFlag          forwarded to the scorer and to the debug printer
public static List<ContextScore> GetCandidateScoreList(HashSet<MergeObj> candidates, List<TokenObj> nonSpaceTokenList, Word2Vec word2VecIm, Word2Vec word2VecOm, bool word2VecSkipWord, int contextRadius, bool debugFlag)
{
    // Score every candidate, copying the resulting set into a sortable list.
    List<ContextScore> rankedScores = new List<ContextScore>(
        GetCandidateScoreSet(candidates, nonSpaceTokenList, word2VecIm,
            word2VecOm, word2VecSkipWord, contextRadius, debugFlag));

    // In-place sort using the shared comparator.
    rankedScores.Sort(new ContextScoreComparator<ContextScore>());

    // Print detail for each entry, in sorted order.
    rankedScores.ForEach(score => DebugPrint.PrintCScore(score.ToString(), debugFlag));

    return rankedScores;
}
// Returns the best-ranked string among the candidates using word2Vec
// context scores, or the original inStr when there are no candidates or
// when IsTopCandValid rejects the top candidate.
//
// Original notes, kept for reference (NOTE(review): confirm against
// IsTopCandValid, which is not visible here):
//   - the token list is not coreTerm.Lc
//   - inStr is returned if no candidate has score > 0.0d
//
//   inStr                 original string (before split)
//   candidates            candidate replacement strings
//   tarPos, tarSize       target position/size in nonSpaceTokenList
//   nonSpaceTokenList     token list used to build the context
//   word2VecIm/Om         word2Vec matrices (context vector / scoring)
//   word2VecSkipWord      forwarded to the context/scoring helpers
//   contextRadius         forwarded to the context/scoring helpers
//   shortSplitWordLength  not used by this method
//   maxShortSplitWordNo   not used by this method
//   rwSplitFactor         forwarded to IsTopCandValid
//   maxCandNo             maximum number of candidate lines to debug-print
//   debugFlag             enables debug printing
public static string GetTopRankStr(string inStr, HashSet<string> candidates, int tarPos, int tarSize, List<TokenObj> nonSpaceTokenList, Word2Vec word2VecIm, Word2Vec word2VecOm, bool word2VecSkipWord, int contextRadius, int shortSplitWordLength, int maxShortSplitWordNo, double rwSplitFactor, int maxCandNo, bool debugFlag)
{
    // Nothing to rank: keep the original string.
    if (candidates.Count <= 0)
    {
        return inStr;
    }

    // 1. Score list for the candidates; the first entry is taken as the
    //    top-ranked one.
    //    This ranking can be improved if an n-gram model (frequency) is used.
    List<ContextScore> rankedScores = RankByContext.GetCandidateScoreList(
        candidates, tarPos, tarSize, nonSpaceTokenList, word2VecIm,
        word2VecOm, word2VecSkipWord, contextRadius, debugFlag);
    ContextScore bestScore = rankedScores[0];

    // 2. Validate the top rank against the original string.
    // 2.1 word vector for the surrounding context
    DoubleVec contextVec = Word2VecContext.GetContextVec(
        tarPos, tarSize, nonSpaceTokenList, word2VecIm, contextRadius,
        word2VecSkipWord, debugFlag);
    // 2.2 context score of the original words before the split
    ContextScore originalScore = new ContextScore(inStr, contextVec, word2VecOm);
    // 2.3 accept the top candidate only if it passes validation;
    //     otherwise fall back to the original string (no correction)
    string chosen = IsTopCandValid(inStr, originalScore, bestScore, rwSplitFactor, debugFlag)
        ? bestScore.GetTerm()
        : inStr;

    // Debug print: the original (focus) token first, then the candidates.
    if (debugFlag)
    {
        DebugPrint.PrintCScore(originalScore.ToString(), debugFlag);

        ContextScoreComparator<ContextScore> ranker = new ContextScoreComparator<ContextScore>();
        List<string> scoreLines = rankedScores
            .OrderBy(score => score, ranker)
            .Take(maxCandNo)
            .Select(score => score.ToString())
            .ToList();
        for (int i = 0; i < scoreLines.Count; i++)
        {
            DebugPrint.PrintCScore(scoreLines[i], debugFlag);
        }
    }

    return chosen;
}