Пример #1
0
        // return the best ranked str from candidates using word2Vec score
        // inTokenList, includes space token, is not coreTerm.Lc
        // return the original inStr if no candidate has score > 0.0d
        /// <summary>
        /// Chooses the replacement for <paramref name="inStr"/> among <paramref name="candidates"/>.
        /// The first entry of the list returned by <c>RankByContext.GetCandidateScoreList</c> is taken as
        /// the top candidate; it is returned only when <c>IsTopCandValid</c> accepts it against the
        /// context score of the original string. In every other case <paramref name="inStr"/> is returned.
        /// </summary>
        /// <param name="inStr">The original string; also the fallback return value.</param>
        /// <param name="candidates">Candidate replacement strings. Null or empty yields <paramref name="inStr"/>.</param>
        /// <param name="tarPos">Forwarded to the scoring and context helpers (presumably the target position in <paramref name="nonSpaceTokenList"/> - confirm against callers).</param>
        /// <param name="tarSize">Forwarded to the scoring and context helpers (presumably the number of target tokens - confirm against callers).</param>
        /// <param name="nonSpaceTokenList">Token list forwarded to the scoring and context helpers.</param>
        /// <param name="word2VecIm">Word2Vec model used for the context vector and candidate scoring.</param>
        /// <param name="word2VecOm">Word2Vec model used for candidate scoring and for scoring the original string.</param>
        /// <param name="word2VecSkipWord">Forwarded to the scoring and context helpers.</param>
        /// <param name="contextRadius">Forwarded to the scoring and context helpers.</param>
        /// <param name="shortSplitWordLength">Not used by this method; kept for signature compatibility.</param>
        /// <param name="maxShortSplitWordNo">Not used by this method; kept for signature compatibility.</param>
        /// <param name="rwSplitFactor">Forwarded to <c>IsTopCandValid</c>.</param>
        /// <param name="maxCandNo">Maximum number of candidate scores printed when <paramref name="debugFlag"/> is set.</param>
        /// <param name="debugFlag">When true, prints the original score and the candidate scores via <c>DebugPrint.PrintCScore</c>.</param>
        /// <returns>The accepted top candidate term, or <paramref name="inStr"/> when there is nothing to accept.</returns>
        public static string GetTopRankStr(string inStr, HashSet <string> candidates, int tarPos, int tarSize, List <TokenObj> nonSpaceTokenList, Word2Vec word2VecIm, Word2Vec word2VecOm, bool word2VecSkipWord, int contextRadius, int shortSplitWordLength, int maxShortSplitWordNo, double rwSplitFactor, int maxCandNo, bool debugFlag)
        {
            // Nothing to rank: keep the original string.
            if (candidates == null || candidates.Count == 0)
            {
                return inStr;
            }

            // 1. score list for the candidates
            // This ranking can be improved if an n-gram model (frequency) is used
            List <ContextScore> candScoreList = RankByContext.GetCandidateScoreList(candidates, tarPos, tarSize, nonSpaceTokenList, word2VecIm, word2VecOm, word2VecSkipWord, contextRadius, debugFlag);

            // Guard the [0] access below: without any scored candidate there is
            // nothing to compare against, so the original string stands.
            if (candScoreList == null || candScoreList.Count == 0)
            {
                return inStr;
            }

            // 1.1 the first entry is treated as the top-ranked candidate
            ContextScore topContextScore = candScoreList[0];

            // 2. validate the top rank
            // 2.1 word vector for the context
            DoubleVec contextVec = Word2VecContext.GetContextVec(tarPos, tarSize, nonSpaceTokenList, word2VecIm, contextRadius, word2VecSkipWord, debugFlag);
            // 2.2 context score of the original string (before any correction)
            ContextScore orgContextScore = new ContextScore(inStr, contextVec, word2VecOm);

            // 2.3 apply the correction only when the top candidate is accepted
            // against the original; otherwise the original string is kept
            string topRankStr = inStr;
            if (IsTopCandValid(inStr, orgContextScore, topContextScore, rwSplitFactor, debugFlag))
            {
                topRankStr = topContextScore.GetTerm();
            }

            // debug print
            if (debugFlag)
            {
                // print focus token (original)
                DebugPrint.PrintCScore(orgContextScore.ToString(), debugFlag);

                // print at most maxCandNo candidates, ordered by the score comparator
                ContextScoreComparator <ContextScore> csc = new ContextScoreComparator <ContextScore>();
                foreach (string item in candScoreList.OrderBy(x => x, csc).Take(maxCandNo).Select(obj => obj.ToString()))
                {
                    DebugPrint.PrintCScore(item, debugFlag);
                }
            }

            return topRankStr;
        }