Exemplo n.º 1
0
        /// <summary>
        /// Orders two candidates by a weighted blend of their orthographic,
        /// frequency and context scores (weights 0.6 / 0.25 / 0.15), highest
        /// blended score first. Equal blended scores fall back to comparing
        /// the candidate strings.
        /// </summary>
        /// <param name="o1">first candidate score</param>
        /// <param name="o2">second candidate score</param>
        /// <returns>
        /// -1 when o1 has the higher blended score, 1 when o2 has the higher
        /// blended score; otherwise the result of comparing o2's candidate
        /// string to o1's candidate string.
        /// </returns>
        private int compareByEnsemble(CSpellScore o1, CSpellScore o2)
        {
            // fixed ensemble weights
            const double orthoWeight   = 0.6;
            const double freqWeight    = 0.25;
            const double contextWeight = 0.15;

            OrthographicScore ortho1   = o1.GetOScore();
            OrthographicScore ortho2   = o2.GetOScore();
            FrequencyScore    freq1    = o1.GetFScore();
            FrequencyScore    freq2    = o2.GetFScore();
            ContextScore      context1 = o1.GetCScore();
            ContextScore      context2 = o2.GetCScore();

            double blended1 = orthoWeight * ortho1.GetScore() + freqWeight * freq1.GetScore() + contextWeight * context1.GetScore();
            double blended2 = orthoWeight * ortho2.GetScore() + freqWeight * freq2.GetScore() + contextWeight * context2.GetScore();

            // 1. blended score, from high to low
            // SCR-2: use a fixed number to ensure result is not 0.
            if (blended2 > blended1)
            {
                return 1;
            }
            if (blended2 < blended1)
            {
                return -1;
            }

            // 2. tie: fall back to the candidate strings (o2 compared to o1)
            // NOTE(review): string.CompareTo uses the current culture - confirm
            // that a culture-dependent tie-break is intended.
            string candStr1 = o1.GetCandStr();
            string candStr2 = o2.GetCandStr();
            return candStr2.CompareTo(candStr1);
        }
        /// <summary>
        /// Score rule for a real-word merge correction: decides whether the
        /// top-ranked merge candidate's context score is good enough, relative
        /// to the context score of the original (unmerged) words.
        /// </summary>
        /// <param name="orgContextScore">context score of the original words</param>
        /// <param name="topContextScore">context score of the top candidate</param>
        /// <param name="rwMergeFactor">factor applied when both scores have the same sign</param>
        /// <param name="debugFlag">not used by this method</param>
        /// <returns>true when the candidate passes one of the rules below</returns>
        private static bool IsTopCandValid(ContextScore orgContextScore, ContextScore topContextScore, double rwMergeFactor, bool debugFlag)
        {
            double orgScore = orgContextScore.GetScore();
            double topScore = topContextScore.GetScore();

            if (orgScore < 0.0d)
            {
                // 2.2a negative original, positive candidate: accept
                // 2.2b both negative: accept when the candidate is above
                //      orgScore * rwMergeFactor (needed for higher recall and F1)
                return (topScore > 0.0d)
                       || ((topScore < 0.0d) && (topScore > orgScore * rwMergeFactor));
            }
            if (orgScore > 0.0d)
            {
                // 2.3a positive original: accept when the scaled candidate
                //      score exceeds the original score
                return topScore * rwMergeFactor > orgScore;
            }
            // 2.1 original score is not above or below 0.0d (e.g. exactly 0.0d):
            //     no merge correction
            return false;
        }
Exemplo n.º 3
0
        /// <summary>
        /// Demo routine: for a fixed sample sentence, prints the context
        /// scores of "onset" and "on set" against the context around the
        /// two-token target starting at position 4.
        /// </summary>
        /// <param name="w2vIm">Word2Vec passed to GetContextVec to build the context vector</param>
        /// <param name="w2vOm">Word2Vec passed to ContextScore to score each term</param>
        private static void TestOnSet(Word2Vec w2vIm, Word2Vec w2vOm)
        {
            const string sampleText = "He was diagnosed early on set dementia 3 years ago.";
            const string mergedTerm = "onset";
            const string splitTerm  = "on set";
            const int    targetPos  = 4;
            const int    targetSize = 2;     // "on set" has 2 tokens
            const int    windowSize = 2;     // context window radius
            const bool   skipWord   = true;
            const bool   debug      = false;

            // tokenize, then drop the space tokens
            TextObj         sampleObj = new TextObj(sampleText);
            List<TokenObj>  allTokens = sampleObj.GetTokenList();
            List<TokenObj>  tokens    = TextObj.GetNonSpaceTokenObjList(allTokens);

            Console.WriteLine("==========================================");
            Console.WriteLine("-- inTextList: [" + sampleText + "]");

            // one context vector (window radius), shared by both terms
            DoubleVec    context     = Word2VecContext.GetContextVec(targetPos, targetSize, tokens, w2vIm, windowSize, skipWord, debug);
            ContextScore mergedScore = new ContextScore(mergedTerm, context, w2vOm);
            ContextScore splitScore  = new ContextScore(splitTerm, context, w2vOm);

            Console.WriteLine("- [" + mergedTerm + "]: " + mergedScore.ToString());
            Console.WriteLine("- [" + splitTerm + "]: " + splitScore.ToString());
        }
        // private methods
        /// <summary>
        /// Extra real-word 1-to-1 check: the top candidate must pass a set of
        /// context-score thresholds and then a set of frequency-score
        /// thresholds, all read from <paramref name="cSpellApi"/>.
        /// </summary>
        /// <param name="orgContextScore">context score of the original word</param>
        /// <param name="orgFreqScore">frequency score of the original word</param>
        /// <param name="topContextScore">context score of the top candidate</param>
        /// <param name="topCSpellScore">combined score of the top candidate (its frequency score is used)</param>
        /// <param name="cSpellApi">source of the ranking thresholds</param>
        /// <returns>true only when both the context and the frequency checks pass</returns>
        private static bool IsTopCandValidByScores(ContextScore orgContextScore, FrequencyScore orgFreqScore, ContextScore topContextScore, CSpellScore topCSpellScore, CSpellApi cSpellApi)
        {
            // thresholds
            double csRatioMin = cSpellApi.GetRankRw1To1CandCsFac();
            double orgCsMin   = cSpellApi.GetRankRw1To1WordMinCs();
            double candCsMin  = cSpellApi.GetRankRw1To1CandMinCs();
            double csGapMin   = cSpellApi.GetRankRw1To1CandCsDist();
            double fsRatioMin = cSpellApi.GetRankRw1To1CandFsFac();
            double candFsMin  = cSpellApi.GetRankRw1To1CandMinFs();
            double fsGapMax   = cSpellApi.GetRankRw1To1CandFsDist();

            double orgCs = orgContextScore.GetScore();
            double topCs = topContextScore.GetScore();

            // context score:
            // 1. the candidate score is big enough, relative to -orgCs
            // 2. both scores are above their minimums
            // 3. the distance between them is large enough for confidence
            // (original tuning note: 609|796|0.6920)
            bool contextOk = ((topCs / -orgCs) > csRatioMin)
                             && (orgCs > orgCsMin)
                             && (topCs > candCsMin)
                             && ((topCs - orgCs) > csGapMin);
            if (!contextOk)
            {
                return false;
            }

            // frequency score, read only once the context check has passed:
            // 1. the candidate/original ratio is above the factor
            // 2. the candidate is above its minimum
            // 3. the candidate is more frequent, or not less frequent by
            //    more than the allowed distance
            double orgFs = orgFreqScore.GetScore();
            double topFs = topCSpellScore.GetFScore().GetScore();
            bool   withinFreqRange = (topFs > orgFs) || ((orgFs - topFs) < fsGapMax);

            return ((topFs / orgFs) > fsRatioMin) && (topFs > candFsMin) && withinFreqRange;
        }
        /// <summary>
        /// Compares two ContextScore objects so that the higher score sorts
        /// first. When neither score is greater than the other, the terms
        /// are compared instead (o2's term against o1's term).
        /// </summary>
        /// <param name="o1">first object to be compared</param>
        /// <param name="o2">second object to be compared</param>
        /// <returns>
        /// -1 when o1 has the higher score, 1 when o2 has the higher score;
        /// otherwise the result of comparing o2's term to o1's term.
        /// </returns>
        public virtual int Compare(ContextScore o1, ContextScore o2)
        {
            // 1. compare total score first, from high to low
            double left  = o1.GetScore();
            double right = o2.GetScore();

            // SCR-2: use a fixed number to ensure result is not 0.
            if (right > left)
            {
                return 1;
            }
            if (right < left)
            {
                return -1;
            }

            // 2. tie: order by term
            string leftTerm  = o1.GetTerm();
            string rightTerm = o2.GetTerm();
            return rightTerm.CompareTo(leftTerm);
        }
Exemplo n.º 6
0
 /// <summary>
 /// Stores the word and candidate strings and builds the orthographic,
 /// frequency, noisy-channel and context scores for the candidate.
 /// (An earlier comment called this constructor private; it is public.)
 /// </summary>
 /// <param name="wordStr">the input word</param>
 /// <param name="candStr">the candidate for that word</param>
 /// <param name="wordWcMap">passed to the frequency and noisy-channel scores</param>
 /// <param name="contextVec">context vector passed to the context score</param>
 /// <param name="word2Vec">Word2Vec passed to the context score</param>
 /// <param name="wf1">first weight passed to the orthographic and noisy-channel scores</param>
 /// <param name="wf2">second weight passed to the orthographic and noisy-channel scores</param>
 /// <param name="wf3">third weight passed to the orthographic and noisy-channel scores</param>
 public CSpellScore(string wordStr, string candStr, WordWcMap wordWcMap, DoubleVec contextVec, Word2Vec word2Vec, double wf1, double wf2, double wf3)
 {
     // keep the raw strings
     wordStr_ = wordStr;
     candStr_ = candStr;

     // component scores, built in the order O, F, N, C
     oScore_ = new OrthographicScore(wordStr, candStr, wf1, wf2, wf3);
     fScore_ = new FrequencyScore(candStr, wordWcMap);
     nScore_ = new NoisyChannelScore(wordStr, candStr, wordWcMap, wf1, wf2, wf3);
     cScore_ = new ContextScore(candStr, contextVec, word2Vec);
 }
        /// <summary>
        /// Returns the best-ranked merge candidate by context score, or null
        /// when there is no candidate or the best one does not pass the merge
        /// score rule (IsTopCandValid) against the original, unmerged words.
        /// </summary>
        /// <param name="candidates">merge candidates to rank</param>
        /// <param name="nonSpaceTokenList">token list without space tokens</param>
        /// <param name="word2VecIm">Word2Vec used to build context vectors</param>
        /// <param name="word2VecOm">Word2Vec used to score terms</param>
        /// <param name="word2VecSkipWord">forwarded to GetContextVec</param>
        /// <param name="contextRadius">context window radius</param>
        /// <param name="rwMergeFactor">factor forwarded to the merge score rule</param>
        /// <param name="debugFlag">forwarded to the helpers</param>
        /// <returns>the accepted MergeObj, or null when no correction should be made</returns>
        public static MergeObj GetTopRankMergeObj(HashSet <MergeObj> candidates, List <TokenObj> nonSpaceTokenList, Word2Vec word2VecIm, Word2Vec word2VecOm, bool word2VecSkipWord, int contextRadius, double rwMergeFactor, bool debugFlag)
        {
            if (candidates.Count == 0)
            {
                return null;
            }

            // 1. score list for the candidates; element 0 is treated as the
            //    highest score (the list is expected to come back sorted)
            List <ContextScore> candScoreList = GetCandidateScoreList(candidates, nonSpaceTokenList, word2VecIm, word2VecOm, word2VecSkipWord, contextRadius, debugFlag);
            if (candScoreList.Count == 0)
            {
                return null;
            }
            ContextScore topContextScore = candScoreList[0];
            if (topContextScore == null)
            {
                return null;
            }

            // 2. map each core merge word back to its MergeObj
            //    (a later candidate with the same word replaces an earlier one)
            Dictionary <string, MergeObj> candStrMergeObjMap = new Dictionary <string, MergeObj>();
            foreach (MergeObj mergeObj in candidates)
            {
                candStrMergeObjMap[mergeObj.GetCoreMergeWord()] = mergeObj;
            }

            // 3. convert the top-ranked term back to its MergeObj
            string   topRankStr      = topContextScore.GetTerm();
            MergeObj topRankMergeObj = candStrMergeObjMap.GetValueOrNull(topRankStr);
            if (topRankMergeObj == null)
            {
                // the top term has no matching candidate: nothing to correct,
                // and dereferencing it below would throw
                return null;
            }

            // 4. score the original words (before the merge) in the same context
            int tarPos = topRankMergeObj.GetStartPos();
            // tarSize is the total token count of the original merge words
            int          tarSize         = topRankMergeObj.GetEndPos() - topRankMergeObj.GetStartPos() + 1;
            DoubleVec    contextVec      = Word2VecContext.GetContextVec(tarPos, tarSize, nonSpaceTokenList, word2VecIm, contextRadius, word2VecSkipWord, debugFlag);
            string       orgMergeWord    = topRankMergeObj.GetOrgMergeWord();
            ContextScore orgContextScore = new ContextScore(orgMergeWord, contextVec, word2VecOm);

            // 5. keep the candidate only if its score is good enough
            if (!IsTopCandValid(orgContextScore, topContextScore, rwMergeFactor, debugFlag))
            {
                return null;
            }
            return topRankMergeObj;
        }
Exemplo n.º 8
0
        // private method
        /// <summary>
        /// Demo routine for merge and split: prints the context score of the
        /// merged word, then of the split words as one term, then the average
        /// score of the split words taken one by one. Each score is computed
        /// twice: with a context window of <paramref name="radius"/>, and with
        /// the GetContextVec overload that takes no radius (described by the
        /// original comments as "all inText").
        /// </summary>
        /// <param name="inText">sample text</param>
        /// <param name="tarPos">start position of the target in the non-space token list</param>
        /// <param name="tarSize">number of tokens in the target</param>
        /// <param name="radius">context window radius</param>
        /// <param name="mergedWord">merged form of the target</param>
        /// <param name="splitWords">split form of the target</param>
        /// <param name="w2vIm">Word2Vec used to build context vectors</param>
        /// <param name="w2vOm">Word2Vec used to score terms</param>
        private static void Test(string inText, int tarPos, int tarSize, int radius, string mergedWord, string splitWords, Word2Vec w2vIm, Word2Vec w2vOm)
        {
            const bool skipWord = true;
            const bool debug    = false;

            // 0. tokenize the text and drop the space tokens
            TextObj        parsed    = new TextObj(inText);
            List<TokenObj> allTokens = parsed.GetTokenList();
            List<TokenObj> tokens    = TextObj.GetNonSpaceTokenObjList(allTokens);

            Console.WriteLine("==========================================");
            Console.WriteLine("-- inTextList: [" + inText + "]");

            // 1. context vectors: (a) window radius, (b) no-radius overload
            DoubleVec windowContext = Word2VecContext.GetContextVec(tarPos, tarSize, tokens, w2vIm, radius, skipWord, debug);
            DoubleVec fullContext   = Word2VecContext.GetContextVec(tarPos, tarSize, tokens, w2vIm, skipWord, debug);

            // 1.c merged word
            ContextScore mergedWindow = new ContextScore(mergedWord, windowContext, w2vOm);
            ContextScore mergedFull   = new ContextScore(mergedWord, fullContext, w2vOm);
            Console.WriteLine(mergedWindow.ToString() + "|" + string.Format("{0,1:F8}", mergedFull.GetScore()));

            // 2. split words, scored as one term
            ContextScore splitWindow = new ContextScore(splitWords, windowContext, w2vOm);
            ContextScore splitFull   = new ContextScore(splitWords, fullContext, w2vOm);
            Console.WriteLine(splitWindow.ToString() + "|" + string.Format("{0,1:F8}", splitFull.GetScore()));

            // 3. average score over the single words; each word gets its own
            //    context, positioned at tarPos + its offset
            List<string> words     = TermUtil.ToWordList(splitWords);
            double       windowSum = 0.0d;
            double       fullSum   = 0.0d;
            for (int offset = 0; offset < words.Count; offset++)
            {
                string word = words[offset];

                // window radius
                DoubleVec wordWindowContext = Word2VecContext.GetContextVec(tarPos + offset, 1, tokens, w2vIm, radius, skipWord, debug);
                windowSum += new ContextScore(word, wordWindowContext, w2vOm).GetScore();

                // no-radius overload
                DoubleVec wordFullContext = Word2VecContext.GetContextVec(tarPos + offset, 1, tokens, w2vIm, skipWord, debug);
                fullSum += new ContextScore(word, wordFullContext, w2vOm).GetScore();
            }

            int    wordNo    = words.Count;
            double windowAvg = windowSum / wordNo;
            double fullAvg   = fullSum / wordNo;
            Console.WriteLine("Avg. Single Word|" + string.Format("{0,1:F8}", windowAvg) + "|" + string.Format("{0,1:F8}", fullAvg));
        }
        /// <summary>
        /// Real-word one-to-one rule: scores the original string in its
        /// context and decides whether the top candidate's context score is
        /// good enough to replace it.
        /// </summary>
        /// <param name="topContextScore">context score of the top candidate; may be null</param>
        /// <param name="inStr">the original string</param>
        /// <param name="tarPos">start position of the target token</param>
        /// <param name="tarSize">token size of the target</param>
        /// <param name="nonSpaceTokenList">token list without space tokens</param>
        /// <param name="word2VecIm">Word2Vec used to build the context vector</param>
        /// <param name="word2VecOm">Word2Vec used to score the original string</param>
        /// <param name="word2VecSkipWord">forwarded to GetContextVec</param>
        /// <param name="contextRadius">context window radius</param>
        /// <param name="rw1To1Factor">factor applied when both scores have the same sign</param>
        /// <param name="debugFlag">enables the debug print-out</param>
        /// <returns>
        /// false when there is no top candidate or it equals
        /// <paramref name="inStr"/>; otherwise the outcome of the score rules.
        /// </returns>
        private static bool CheckRealWord1To1Rules(ContextScore topContextScore, string inStr, int tarPos, int tarSize, List <TokenObj> nonSpaceTokenList, Word2Vec word2VecIm, Word2Vec word2VecOm, bool word2VecSkipWord, int contextRadius, double rw1To1Factor, bool debugFlag)
        {
            // fixed bound used by rule 2.2a (original note: help from 0.6812 to 0.6877)
            const double scoreBound = 0.085;

            // no top candidate, or the candidate is the input itself
            if ((topContextScore == null) || (topContextScore.GetTerm().Equals(inStr)))
            {
                return false;
            }

            // 1. context score of the original string, before one-to-one
            DoubleVec    orgContext = Word2VecContext.GetContextVec(tarPos, tarSize, nonSpaceTokenList, word2VecIm, contextRadius, word2VecSkipWord, debugFlag);
            ContextScore orgCs      = new ContextScore(inStr, orgContext, word2VecOm);

            DebugPrint.Println("--- Real-Word One-To-One Context Score Detail: ---", debugFlag);
            DebugPrint.Println("- Score - orgTerm: " + orgCs.ToString(), debugFlag);
            DebugPrint.Println("- Score - top 1-to-1: " + topContextScore.ToString(), debugFlag);
            DebugPrint.Println("- rw1To1Factor: " + rw1To1Factor, debugFlag);

            // 2. score rules for one-to-one
            double orgScore = orgCs.GetScore();
            double topScore = topContextScore.GetScore();

            if (orgScore < 0.0d)
            {
                if (topScore > 0.0d)
                {
                    // 2.2a negative original, positive candidate: the gap must
                    //      exceed the bound and the original must be above -bound
                    return ((topScore - orgScore) > scoreBound) && (orgScore > -scoreBound);
                }
                // 2.2b both negative: candidate must be above orgScore * factor
                return (topScore < 0.0d) && (topScore > orgScore * rw1To1Factor);
            }
            if (orgScore > 0.0d)
            {
                // 2.3a positive original: scaled candidate must exceed it
                return topScore * rw1To1Factor > orgScore;
            }
            // 2.1 original score neither above nor below 0.0d: no correction
            return false;
        }
Exemplo n.º 10
0
        /// <summary>
        /// Builds a ContextScore for every candidate string. All candidates
        /// share one context vector, computed once for the target position.
        /// The result is a set and therefore not sorted.
        /// </summary>
        /// <param name="candidates">candidate strings</param>
        /// <param name="tarPos">starting position of the target token</param>
        /// <param name="tarSize">token size of the target token (single word = 1)</param>
        /// <param name="nonSpaceTokenList">token list without space tokens</param>
        /// <param name="word2VecIm">Word2Vec used to build the context vector (input matrix)</param>
        /// <param name="word2VecOm">Word2Vec used to score each candidate (output matrix)</param>
        /// <param name="word2VecSkipWord">forwarded to GetContextVec</param>
        /// <param name="contextRadius">context window radius</param>
        /// <param name="debugFlag">forwarded to GetContextVec</param>
        /// <returns>the set of context scores, one per candidate added</returns>
        public static HashSet <ContextScore> GetCandidateScoreSet(HashSet <string> candidates, int tarPos, int tarSize, List <TokenObj> nonSpaceTokenList, Word2Vec word2VecIm, Word2Vec word2VecOm, bool word2VecSkipWord, int contextRadius, bool debugFlag)
        {
            // 1. shared context vector for the target
            DoubleVec sharedContext = Word2VecContext.GetContextVec(tarPos, tarSize, nonSpaceTokenList, word2VecIm, contextRadius, word2VecSkipWord, debugFlag);

            // 2. one context score per candidate
            HashSet<ContextScore> scored = new HashSet<ContextScore>();
            foreach (string candidate in candidates)
            {
                scored.Add(new ContextScore(candidate, sharedContext, word2VecOm));
            }
            return scored;
        }
Exemplo n.º 11
0
        /// <summary>
        /// Compares two candidates by a cascade of scores: orthographic,
        /// then noisy channel, then frequency, then context. The first score
        /// that differs decides, using that score's own comparator. If all
        /// four are equal the candidate strings are compared.
        /// </summary>
        /// <param name="o1">first candidate score</param>
        /// <param name="o2">second candidate score</param>
        /// <returns>
        /// the result of the deciding comparator, or of comparing o2's
        /// candidate string to o1's when every score is equal
        /// </returns>
        private int compareByCombo(CSpellScore o1, CSpellScore o2)
        {
            OrthographicScore ortho1   = o1.GetOScore();
            OrthographicScore ortho2   = o2.GetOScore();
            NoisyChannelScore noisy1   = o1.GetNScore();
            NoisyChannelScore noisy2   = o2.GetNScore();
            FrequencyScore    freq1    = o1.GetFScore();
            FrequencyScore    freq2    = o2.GetFScore();
            ContextScore      context1 = o1.GetCScore();
            ContextScore      context2 = o2.GetCScore();

            // 1. orthographic score, best
            if (ortho1.GetScore() != ortho2.GetScore())
            {
                return new OrthographicScoreComparator<OrthographicScore>().Compare(ortho1, ortho2);
            }
            // 2. noisy channel score, 2nd best
            if (noisy1.GetScore() != noisy2.GetScore())
            {
                return new NoisyChannelScoreComparator<NoisyChannelScore>().Compare(noisy1, noisy2);
            }
            // 3. pure frequency score, 3rd best
            if (freq1.GetScore() != freq2.GetScore())
            {
                return new FrequencyScoreComparator<FrequencyScore>().Compare(freq1, freq2);
            }
            // 4. context score, last
            if (context1.GetScore() != context2.GetScore())
            {
                return new ContextScoreComparator<ContextScore>().Compare(context1, context2);
            }

            // 5. all scores equal: fall back to the candidate strings
            string candStr1 = o1.GetCandStr();
            string candStr2 = o2.GetCandStr();
            return candStr2.CompareTo(candStr1);
        }
        /// <summary>
        /// Builds a ContextScore for every merge candidate. Each candidate
        /// has its own target span, so a separate context vector is computed
        /// per candidate. The result is a set and therefore not sorted.
        /// </summary>
        /// <param name="candidates">merge candidates</param>
        /// <param name="nonSpaceTokenList">token list without space tokens</param>
        /// <param name="word2VecIm">Word2Vec used to build context vectors (input matrix)</param>
        /// <param name="word2VecOm">Word2Vec used to score each merge word (output matrix)</param>
        /// <param name="word2VecSkipWord">forwarded to GetContextVec</param>
        /// <param name="contextRadius">context window radius</param>
        /// <param name="debugFlag">forwarded to GetContextVec</param>
        /// <returns>the set of context scores, one per candidate added</returns>
        public static HashSet <ContextScore> GetCandidateScoreSet(HashSet <MergeObj> candidates, List <TokenObj> nonSpaceTokenList, Word2Vec word2VecIm, Word2Vec word2VecOm, bool word2VecSkipWord, int contextRadius, bool debugFlag)
        {
            HashSet<ContextScore> scored = new HashSet<ContextScore>();

            foreach (MergeObj candidate in candidates)
            {
                // 1. context for this candidate's span: start position and
                //    token count (end - start + 1)
                int       startPos  = candidate.GetStartPos();
                int       tokenNo   = candidate.GetEndPos() - candidate.GetStartPos() + 1;
                DoubleVec spanContext = Word2VecContext.GetContextVec(startPos, tokenNo, nonSpaceTokenList, word2VecIm, contextRadius, word2VecSkipWord, debugFlag);

                // 2. score the core merge word against that context
                string coreWord = candidate.GetCoreMergeWord();
                scored.Add(new ContextScore(coreWord, spanContext, word2VecOm));
            }
            return scored;
        }
        /// <summary>
        /// Uses context and frequency scores to validate the top-ranked
        /// candidate of a real-word one-to-one correction.
        /// </summary>
        /// <param name="inStr">the original string</param>
        /// <param name="orgContextScore">context score of the original string</param>
        /// <param name="topCSpellScore">combined score of the top candidate</param>
        /// <param name="orgFreqScore">frequency score of the original string</param>
        /// <param name="cSpellApi">source of the ranking factors</param>
        /// <param name="debugFlag">not used by this method</param>
        /// <returns>
        /// false when the candidate has no context score or its term equals
        /// <paramref name="inStr"/>; otherwise the outcome of the score rules.
        /// </returns>
        private static bool IsTopCandValid(string inStr, ContextScore orgContextScore, CSpellScore topCSpellScore, FrequencyScore orgFreqScore, CSpellApi cSpellApi, bool debugFlag)
        {
            ContextScore topCs = topCSpellScore.GetCScore();

            // no top candidate, or the candidate is the input itself
            if ((topCs == null) || (topCs.GetTerm().Equals(inStr)))
            {
                return false;
            }

            double orgScore = orgContextScore.GetScore();
            double topScore = topCs.GetScore();
            double cFactor  = cSpellApi.GetRankRw1To1CFac();

            if (orgScore < 0.0d)
            {
                if (topScore > 0.0d)
                {
                    // 2.2a negative original, positive candidate: further
                    //      check by ratio, distance and minimum on the
                    //      context and frequency scores
                    return IsTopCandValidByScores(orgContextScore, orgFreqScore, topCs, topCSpellScore, cSpellApi);
                }
                // 2.2b both negative: candidate must be above orgScore * factor
                return (topScore < 0.0d) && (topScore > orgScore * cFactor);
            }
            if (orgScore > 0.0d)
            {
                // 2.3a positive original: scaled candidate must exceed it
                return topScore * cFactor > orgScore;
            }
            // 2.1 original score neither above nor below 0.0d: no 1-to-1 correction
            return false;
        }
Exemplo n.º 14
0
        /// <summary>
        /// Returns the candidate with the highest context score, provided
        /// that score is above 0.0d; otherwise returns <paramref name="inStr"/>.
        /// (Per the original comment, this method has been replaced by
        /// GetTopRankStr, which sorts with a comparator.)
        /// </summary>
        /// <param name="inStr">the original string, used as the fallback result</param>
        /// <param name="candidates">candidate strings</param>
        /// <param name="tarPos">starting position of the target token</param>
        /// <param name="tarSize">token size of the target token</param>
        /// <param name="nonSpaceTokenList">token list without space tokens</param>
        /// <param name="word2VecIm">Word2Vec used to build the context vector</param>
        /// <param name="word2VecOm">Word2Vec used to score each candidate</param>
        /// <param name="word2VecSkipWord">forwarded to GetContextVec</param>
        /// <param name="contextRadius">context window radius</param>
        /// <param name="debugFlag">forwarded to GetContextVec</param>
        /// <returns>the best candidate, or <paramref name="inStr"/> when none scores above 0.0d</returns>
        public static string GetTopRankStrByScore(string inStr, HashSet <string> candidates, int tarPos, int tarSize, List <TokenObj> nonSpaceTokenList, Word2Vec word2VecIm, Word2Vec word2VecOm, bool word2VecSkipWord, int contextRadius, bool debugFlag)
        {
            // 1. one context vector, shared by all candidates
            DoubleVec sharedContext = Word2VecContext.GetContextVec(tarPos, tarSize, nonSpaceTokenList, word2VecIm, contextRadius, word2VecSkipWord, debugFlag);

            // 2. running best; starting at 0.0d means only positive scores can win
            string best      = inStr;
            double bestScore = 0.0d;

            foreach (string candidate in candidates)
            {
                double candScore = new ContextScore(candidate, sharedContext, word2VecOm).GetScore();
                if (candScore > bestScore)
                {
                    bestScore = candScore;
                    best      = candidate;
                }
            }
            return best;
        }
Exemplo n.º 15
0
        /// <summary>
        /// Compares two candidates by context score only, using
        /// ContextScoreComparator when the scores differ and the candidate
        /// strings when they are equal.
        /// </summary>
        /// <param name="o1">first candidate score</param>
        /// <param name="o2">second candidate score</param>
        /// <returns>
        /// the comparator's result when the context scores differ; otherwise
        /// the result of comparing o2's candidate string to o1's
        /// </returns>
        private int compareByContext(CSpellScore o1, CSpellScore o2)
        {
            ContextScore context1 = o1.GetCScore();
            ContextScore context2 = o2.GetCScore();

            // 1. context score decides when it differs
            if (context1.GetScore() != context2.GetScore())
            {
                return new ContextScoreComparator<ContextScore>().Compare(context1, context2);
            }

            // 2. tie: fall back to the candidate strings
            string candStr1 = o1.GetCandStr();
            string candStr2 = o2.GetCandStr();
            return candStr2.CompareTo(candStr1);
        }
Exemplo n.º 16
0
        /// <summary>
        /// Returns the best-ranked candidate by context score when it passes
        /// the split score rule (IsTopCandValid) against the original string;
        /// otherwise returns <paramref name="inStr"/> unchanged.
        /// </summary>
        /// <param name="inStr">the original string, also the fallback result</param>
        /// <param name="candidates">candidate strings</param>
        /// <param name="tarPos">starting position of the target token</param>
        /// <param name="tarSize">token size of the target token</param>
        /// <param name="nonSpaceTokenList">token list without space tokens</param>
        /// <param name="word2VecIm">Word2Vec used to build the context vector</param>
        /// <param name="word2VecOm">Word2Vec used to score terms</param>
        /// <param name="word2VecSkipWord">forwarded to GetContextVec</param>
        /// <param name="contextRadius">context window radius</param>
        /// <param name="shortSplitWordLength">not used by this method</param>
        /// <param name="maxShortSplitWordNo">not used by this method</param>
        /// <param name="rwSplitFactor">factor forwarded to the split score rule</param>
        /// <param name="maxCandNo">maximum number of candidates shown in the debug print-out</param>
        /// <param name="debugFlag">enables the debug print-out</param>
        /// <returns>the accepted candidate term, or <paramref name="inStr"/></returns>
        public static string GetTopRankStr(string inStr, HashSet <string> candidates, int tarPos, int tarSize, List <TokenObj> nonSpaceTokenList, Word2Vec word2VecIm, Word2Vec word2VecOm, bool word2VecSkipWord, int contextRadius, int shortSplitWordLength, int maxShortSplitWordNo, double rwSplitFactor, int maxCandNo, bool debugFlag)
        {
            // default: no correction
            string topRankStr = inStr;

            if (candidates.Count > 0)
            {
                // 1. score list for the candidates; element 0 is treated as
                //    the top-ranked candidate
                // This ranking can be improved if n-gram model (frequency) is used
                List <ContextScore> candScoreList = RankByContext.GetCandidateScoreList(candidates, tarPos, tarSize, nonSpaceTokenList, word2VecIm, word2VecOm, word2VecSkipWord, contextRadius, debugFlag);
                // 1.1 guard the indexer: an empty list yields no top candidate
                //     instead of an ArgumentOutOfRangeException. IsTopCandValid
                //     is expected to reject a null top candidate.
                ContextScore topContextScore = (candScoreList.Count > 0) ? candScoreList[0] : null;

                // 2. score the original string (before split) in the same context
                DoubleVec    contextVec      = Word2VecContext.GetContextVec(tarPos, tarSize, nonSpaceTokenList, word2VecIm, contextRadius, word2VecSkipWord, debugFlag);
                ContextScore orgContextScore = new ContextScore(inStr, contextVec, word2VecOm);

                // 3. correct only if the top candidate's score is good enough
                if (IsTopCandValid(inStr, orgContextScore, topContextScore, rwSplitFactor, debugFlag))
                {
                    topRankStr = topContextScore.GetTerm();
                }

                // debug print
                if (debugFlag)
                {
                    // focus token (original)
                    DebugPrint.PrintCScore(orgContextScore.ToString(), debugFlag);
                    // candidates, ordered by the comparator and capped at maxCandNo
                    ContextScoreComparator <ContextScore> csc = new ContextScoreComparator <ContextScore>();
                    var list = candScoreList.OrderBy(x => x, csc).Take(maxCandNo).Select(obj => obj.ToString()).ToList();
                    foreach (var item in list)
                    {
                        DebugPrint.PrintCScore(item, debugFlag);
                    }
                }
            }
            return topRankStr;
        }
Exemplo n.º 17
0
        // private methods
        /// <summary>
        /// Score rule for a split correction: decides whether the top-ranked
        /// candidate's context score is good enough relative to the context
        /// score of the original string.
        /// </summary>
        /// <param name="inStr">the original string</param>
        /// <param name="orgContextScore">context score of the original string</param>
        /// <param name="topContextScore">context score of the top candidate; may be null</param>
        /// <param name="rwSplitFactor">factor applied when both scores have the same sign</param>
        /// <param name="debugFlag">not used by this method</param>
        /// <returns>
        /// false when there is no top candidate or its term equals
        /// <paramref name="inStr"/>; otherwise the outcome of the score rules.
        /// </returns>
        private static bool IsTopCandValid(string inStr, ContextScore orgContextScore, ContextScore topContextScore, double rwSplitFactor, bool debugFlag)
        {
            // no top candidate, or the candidate is the input itself
            if ((topContextScore == null) || (topContextScore.GetTerm().Equals(inStr)))
            {
                return false;
            }

            double orgScore = orgContextScore.GetScore();
            double topScore = topContextScore.GetScore();

            if (orgScore < 0.0d)
            {
                // 2.2a negative original, positive candidate: accept
                // 2.2b both negative: accept when the candidate is above
                //      orgScore * rwSplitFactor
                //      (original note: not used for now, saved for future usage)
                return (topScore > 0.0d)
                       || ((topScore < 0.0d) && (topScore > orgScore * rwSplitFactor));
            }
            if (orgScore > 0.0d)
            {
                // 2.3a positive original: scaled candidate must exceed it
                //      (original note: not used for now, saved for future usage)
                return topScore * rwSplitFactor > orgScore;
            }
            // 2.1 original score neither above nor below 0.0d: no split correction
            return false;
        }
        /// <summary>
        /// Returns the merge candidate whose core merge word has the highest
        /// context score, provided that score is above 0.0d; otherwise null.
        /// (Per the original comment, this method has been replaced by a
        /// comparator-sorted version.)
        /// </summary>
        /// <param name="candidates">merge candidates</param>
        /// <param name="nonSpaceTokenList">token list without space tokens</param>
        /// <param name="word2VecIm">Word2Vec used to build context vectors</param>
        /// <param name="word2VecOm">Word2Vec used to score each merge word (output matrix)</param>
        /// <param name="word2VecSkipWord">forwarded to GetContextVec</param>
        /// <param name="contextRadius">context window radius</param>
        /// <param name="debugFlag">forwarded to GetContextVec</param>
        /// <returns>the best candidate, or null when none scores above 0.0d</returns>
        public static MergeObj GetTopRankMergeObjByScore(HashSet <MergeObj> candidates, List <TokenObj> nonSpaceTokenList, Word2Vec word2VecIm, Word2Vec word2VecOm, bool word2VecSkipWord, int contextRadius, bool debugFlag)
        {
            // running best; starting at 0.0d means only positive scores can win
            MergeObj best      = null;
            double   bestScore = 0.0d;

            foreach (MergeObj candidate in candidates)
            {
                // 1. context for this candidate's span: start position and
                //    token count (end - start + 1)
                int       startPos    = candidate.GetStartPos();
                int       tokenNo     = candidate.GetEndPos() - candidate.GetStartPos() + 1;
                DoubleVec spanContext = Word2VecContext.GetContextVec(startPos, tokenNo, nonSpaceTokenList, word2VecIm, contextRadius, word2VecSkipWord, debugFlag);

                // 2. score the core merge word against that context
                string coreWord  = candidate.GetCoreMergeWord();
                double candScore = new ContextScore(coreWord, spanContext, word2VecOm).GetScore();

                if (candScore > bestScore)
                {
                    bestScore = candScore;
                    best      = candidate;
                }
            }
            return best;
        }
        // Chooses the replacement string for a real-word one-to-one correction.
        // Candidates are scored by RankByCSpellRealWord1To1.GetCandidateScoreList,
        // which is called with the lower-cased inStr. The first entry of that list
        // (presumably the top-ranked one; ordering is decided by the callee) is
        // accepted only when:
        //   - its frequency, token, phonetic and overlap scores each equal the
        //     maximum of that score over the whole list, and
        //   - IsTopCandValid approves it against the context score and frequency
        //     score computed for the original inStr.
        // Returns the accepted candidate string, otherwise inStr unchanged
        // (including when the score list is empty).
        // nonSpaceTokenList and tarPos locate the focus token for the context
        // vector. When debugFlag is set and the list is not empty, the focus
        // score and at most GetCanMaxCandNo() candidate scores are passed to
        // DebugPrint.PrintScore.
        public static string GetTopRankStr(string inStr, HashSet <string> candidates, CSpellApi cSpellApi, int tarPos, List <TokenObj> nonSpaceTokenList, bool debugFlag)
        {
            // settings and resources pulled from the api object
            WordWcMap wcMap           = cSpellApi.GetWordWcMap();
            Word2Vec  inputMatrix     = cSpellApi.GetWord2VecIm();
            Word2Vec  outputMatrix    = cSpellApi.GetWord2VecOm();
            int       radius          = cSpellApi.GetRw1To1ContextRadius();
            bool      skipWord        = cSpellApi.GetWord2VecSkipWord();
            int       debugCandLimit  = cSpellApi.GetCanMaxCandNo();
            double    editDistFactor  = cSpellApi.GetOrthoScoreEdDistFac();
            double    phoneticFactor  = cSpellApi.GetOrthoScorePhoneticFac();
            double    overlapFactor   = cSpellApi.GetOrthoScoreOverlapFac();
            const int tarSize         = 1; // one-to-one only, no merge here

            List <CSpellScore> scoreList = RankByCSpellRealWord1To1.GetCandidateScoreList(inStr.ToLower(), candidates, wcMap, tarPos, tarSize, nonSpaceTokenList, inputMatrix, outputMatrix, skipWord, radius, editDistFactor, phoneticFactor, overlapFactor, debugFlag);

            // nothing was scored: keep the original string, no debug output
            if (scoreList.Count == 0)
            {
                return inStr;
            }

            CSpellScore best         = scoreList[0];
            double      bestFreq     = best.GetFScore().GetScore();
            double      bestToken    = best.GetOScore().GetTokenScore();
            double      bestPhonetic = best.GetOScore().GetPhoneticScore();
            double      bestOverlap  = best.GetOScore().GetOverlapScore();

            // the first entry must hold the list maximum for all four scores;
            // evaluated left to right with short-circuiting
            bool holdsEveryMax = (bestFreq == CSpellScore.GetMaxFScore(scoreList))
                                 && (bestToken == CSpellScore.GetMaxEScore(scoreList))
                                 && (bestPhonetic == CSpellScore.GetMaxPScore(scoreList))
                                 && (bestOverlap == CSpellScore.GetMaxOScore(scoreList));

            string       chosen          = inStr;
            ContextScore orgContextScore = null;

            if (holdsEveryMax)
            {
                // fetched here but not consumed further in this method
                ContextScore bestContextScore = best.GetCScore();
                // context vector around the focus token
                DoubleVec contextVec = Word2VecContext.GetContextVec(tarPos, tarSize, nonSpaceTokenList, inputMatrix, radius, skipWord, debugFlag);
                // scores of the original (uncorrected) word
                orgContextScore = new ContextScore(inStr, contextVec, outputMatrix);
                FrequencyScore orgFreqScore = new FrequencyScore(inStr, wcMap);

                // correct only when the validity check accepts the candidate
                if (IsTopCandValid(inStr, orgContextScore, best, orgFreqScore, cSpellApi, debugFlag))
                {
                    chosen = best.GetCandStr();
                }
            }

            if (debugFlag)
            {
                // focus token (original) line
                string focusLine = (orgContextScore != null)
                    ? orgContextScore.ToString()
                    : "No score for focus (" + inStr + ")";
                DebugPrint.PrintScore(focusLine, debugFlag);

                // candidate lines, materialized before any of them is printed
                List <string> candLines = scoreList.Take(debugCandLimit).Select(entry => entry.ToString()).ToList();
                foreach (string candLine in candLines)
                {
                    DebugPrint.PrintScore(candLine, debugFlag);
                }
            }
            return chosen;
        }