// public method
        /// <summary>
        /// The core method to correct a word by following steps:
        /// <ul>
        /// <li>detect if real-word for merge</li>
        /// <li>get candidates
        ///     <ul>
        ///     <li>get candidates from merge.</li>
        ///     </ul>
        /// </li>
        /// <li>Rank candidates
        ///     <ul>
        ///     <li>context</li>
        ///     <li>frequency (TBD)</li>
        ///     </ul>
        /// </li>
        /// <li>Update information</li>
        /// </ul>
        /// </summary>
        /// <param name="tarPos">    the position of target tokenObj </param>
        /// <param name="nonSpaceTokenList"> token list without space tokens </param>
        /// <param name="cSpellApi"> for all dictionary and Word2Vec data </param>
        /// <param name="debugFlag"> boolean flag for debug print
        /// </param>
        /// <returns>    the corrected merged word in MergeObj if the target token
        ///             matches real-word merged rules.
        ///             Otherwise, a null of MergeObj is returned. </returns>
        // returns null (no merge) when the target token does not qualify for a real-word merge
        public static MergeObj GetCorrectTerm(int tarPos, List <TokenObj> nonSpaceTokenList, CSpellApi cSpellApi, bool debugFlag)
        {
            // Look up the focus token and its surface string.
            // The raw token string is used as-is (no core-term stripping), which
            // keeps real-word merge less aggressive.
            TokenObj focusToken = nonSpaceTokenList[tarPos];
            string   focusWord  = focusToken.GetTokenStr();

            // Guard: only tokens with an empty process history that the
            // real-word merge detector flags are candidates for merging.
            // A null result means "no merge".
            bool hasHistory = focusToken.GetProcHist().Count != 0;
            if (hasHistory || !RealWordMergeDetector.IsDetect(focusWord, cSpellApi, debugFlag))
            {
                return null;
            }

            // Record the detection on the API object.
            cSpellApi.UpdateDetectNo();

            // TBD, should take care of possessive xxx's here
            // Collect the merge candidates around the target position ...
            HashSet <MergeObj> candidates = RealWordMergeCandidates.GetCandidates(tarPos, nonSpaceTokenList, cSpellApi);

            // ... and hand them to the ranker; whatever it selects is the result.
            return RankRealWordMergeByMode.GetTopRankMergeObj(candidates, cSpellApi, debugFlag, tarPos, nonSpaceTokenList);
        }
Exemple #2
0
        // public method
        /// <summary>
        /// The core method to correct a word by following steps:
        /// <ul>
        /// <li>Convert inToken to coreTerm</li>
        /// <li>detect if real-word</li>
        /// <li>get candidates
        ///     <ul>
        ///     <li>get candidates from one-to-one.</li>
        ///     </ul>
        /// </li>
        /// <li>Rank candidates
        ///     <ul>
        ///     <li>context</li>
        ///     </ul>
        /// </li>
        /// <li>Update information</li>
        /// </ul>
        /// </summary>
        /// <param name="inTokenObj">    the input tokenObj (single word) </param>
        /// <param name="cSpellApi"> CSpell Api object </param>
        /// <param name="debugFlag"> flag for debug print </param>
        /// <param name="tarPos"> the position for target token </param>
        /// <param name="nonSpaceTokenList"> token list without space token(s)
        /// </param>
        /// <returns>    the corrected word in tokenObj if suggested word found.
        ///             Otherwise, the original input token is returned. </returns>
        // return the original term if no good correction is found
        public static TokenObj GetCorrectTerm(TokenObj inTokenObj, CSpellApi cSpellApi, bool debugFlag, int tarPos, List <TokenObj> nonSpaceTokenList)
        {
            // get inWord from inTokenObj and init outTokenObj from the input token;
            // outTokenObj is returned as constructed when no correction applies
            string   inWord      = inTokenObj.GetTokenStr();
            TokenObj outTokenObj = new TokenObj(inTokenObj);
            // 1. convert a word to coreTerm (no leading/ending space, punc, digit)
            int         ctType      = CoreTermUtil.CT_TYPE_SPACE_PUNC_DIGIT;
            CoreTermObj coreTermObj = new CoreTermObj(inWord, ctType);
            string      coreStr     = coreTermObj.GetCoreTerm();

            // 2. real-word detection and correction
            // only tokens with an empty process history are considered, and only
            // when the 1-to-1 detector flags the word/coreTerm
            if ((inTokenObj.GetProcHist().Count == 0) && RealWord1To1Detector.IsDetect(inWord, coreStr, cSpellApi, debugFlag))
            {
                cSpellApi.UpdateDetectNo();
                // TBD, should take care of possessive xxx's here
                // 3. get 1-to-1 candidates set from correction
                // TBD. realWordFlag to use metaphone ...
                // this process is very slow, 7 min., needs to be improved
                HashSet <string> candSet = RealWord1To1Candidates.GetCandidates(coreStr, cSpellApi);
                // 4. Ranking: get top ranked candidate as the corrected term
                // (context is passed via tarPos and nonSpaceTokenList)
                string topRankStr = RankRealWord1To1ByCSpell.GetTopRankStr(coreStr, candSet, cSpellApi, tarPos, nonSpaceTokenList, debugFlag);
                // 5. update coreTerm and convert back to the full token string
                coreTermObj.SetCoreTerm(topRankStr);
                string outWord = coreTermObj.ToString();
                // 6. update info only if there is a real-word correction;
                // a change in letter case alone is not counted as a correction
                if (!inWord.Equals(outWord, StringComparison.OrdinalIgnoreCase))
                {
                    cSpellApi.UpdateCorrectNo();
                    outTokenObj.SetTokenStr(outWord);
                    outTokenObj.AddProcToHist(TokenObj.HIST_RW_1);                     // 1-to-1
                    DebugPrint.PrintCorrect("RW", "RealWord1To1Corrector", inWord, outWord, debugFlag);
                }
            }
            return outTokenObj;
        }
        // public method
        // the input mergeObjList is in the same order of index as inTokenList
        // TBD: has bug: "imple ment ation" => implementimplementation
        public static List <TokenObj> CorrectTokenListByMerge(List <TokenObj> inTokenList, List <MergeObj> mergeObjList, string procHistStr, bool debugFlag, CSpellApi cSpellApi)
        {
            // Normalise the merge list first. Per the original note this step is
            // meant to remove contained/overlapping merges; CleanUpMergeObjList
            // itself is defined elsewhere.
            List <MergeObj> cleanedMerges = CleanUpMergeObjList(mergeObjList);
            List <TokenObj> resultTokens  = new List <TokenObj>();

            // Index of the next input token that has not been emitted yet.
            int nextPos = 0;

            foreach (MergeObj merge in cleanedMerges)
            {
                int firstPos = merge.GetStartIndex();
                int lastPos  = merge.GetEndIndex();

                // Pass through every token that sits before this merge span.
                while (nextPos < firstPos)
                {
                    resultTokens.Add(inTokenList[nextPos]);
                    nextPos++;
                }

                // Build the single token that replaces the whole span.
                string   mergedWord    = merge.GetMergeWord();
                string   originalWords = merge.GetOrgMergeWord();
                string   focusWord     = merge.GetTarWord();
                TokenObj mergedToken   = new TokenObj(originalWords, mergedWord);

                // Carry process history over from every token in the span.
                for (int pos = firstPos; pos <= lastPos; pos++)
                {
                    if (pos != merge.GetTarIndex())
                    {
                        // Context token: re-tag each of its history entries with
                        // the token string it came from.
                        TokenObj neighbor = inTokenList[pos];
                        foreach (string histEntry in neighbor.GetProcHist())
                        {
                            mergedToken.AddProcToHist(histEntry + TokenObj.MERGE_START_STR + neighbor.GetTokenStr() + TokenObj.MERGE_END_STR);
                        }
                        continue;
                    }

                    // Focus token of the merge: count the correction and record
                    // the caller-supplied history tag together with the target word.
                    cSpellApi.UpdateCorrectNo();
                    mergedToken.AddProcToHist(procHistStr + TokenObj.MERGE_START_STR + focusWord + TokenObj.MERGE_END_STR);
                    DebugPrint.PrintCorrect(procHistStr, "MergeCorrector (" + focusWord + ")", originalWords, mergedWord, debugFlag);
                }

                resultTokens.Add(mergedToken);
                // Resume right after the merged span.
                nextPos = lastPos + 1;
            }

            // Emit whatever follows the last merge.
            while (nextPos < inTokenList.Count)
            {
                resultTokens.Add(inTokenList[nextPos]);
                nextPos++;
            }
            return resultTokens;
        }
        // public method
        /// <summary>
        /// The core method to correct a word by following steps:
        /// <ul>
        /// <li>Convert inToken to coreTerm</li>
        /// <li>detect if real-word</li>
        /// <li>get split candidates</li>
        /// <li>Rank candidates
        ///     <ul>
        ///     <li>context</li>
        ///     </ul>
        /// </li>
        /// <li>Update information</li>
        /// </ul>
        /// </summary>
        /// <param name="inTokenObj">    the input tokenObj (single word) </param>
        /// <param name="cSpellApi"> cSpell API object </param>
        /// <param name="debugFlag"> flag for debug print </param>
        /// <param name="tarPos"> position of the target token to be split </param>
        /// <param name="nonSpaceTokenList"> the token list without space tokens
        /// </param>
        /// <returns>    the split words in tokenObj.  </returns>
        // return the original term if no good correction is found
        public static TokenObj GetCorrectTerm(TokenObj inTokenObj, CSpellApi cSpellApi, bool debugFlag, int tarPos, List <TokenObj> nonSpaceTokenList)
        {
            // get inWord from inTokenObj and init outTokenObj from the input token;
            // outTokenObj is returned as constructed when no split applies
            string   inWord      = inTokenObj.GetTokenStr();
            TokenObj outTokenObj = new TokenObj(inTokenObj);
            // 1. convert a word to coreTerm (no leading/ending space, punc, digit)
            int         ctType      = CoreTermUtil.CT_TYPE_SPACE_PUNC_DIGIT;
            CoreTermObj coreTermObj = new CoreTermObj(inWord, ctType);
            string      coreStr     = coreTermObj.GetCoreTerm();

            // 2. real-word split detection and correction
            // only tokens with an empty process history are considered, and only
            // when the split detector flags the word/coreTerm
            if ((inTokenObj.GetProcHist().Count == 0) && RealWordSplitDetector.IsDetect(inWord, coreStr, cSpellApi, debugFlag))
            {
                cSpellApi.UpdateDetectNo();
                // TBD, should take care of possessive xxx's here
                // 3. get split candidates set, bounded by the configured max split number
                int maxSplitNo            = cSpellApi.GetCanRwMaxSplitNo();
                HashSet <string> splitSet = RealWordSplitCandidates.GetCandidates(coreStr, cSpellApi, maxSplitNo);
                // 4. Ranking: get top ranked candidate as the corrected term
                // (context is passed via tarPos and nonSpaceTokenList)
                string topRankStr = RankRealWordSplitByMode.GetTopRankStr(coreStr, splitSet, cSpellApi, debugFlag, tarPos, nonSpaceTokenList);
                // 5. update coreTerm and convert back to the full token string
                coreTermObj.SetCoreTerm(topRankStr);
                string outWord = coreTermObj.ToString();
                // 6. update info only if the word actually changed (exact comparison)
                if (!inWord.Equals(outWord))
                {
                    cSpellApi.UpdateCorrectNo();
                    outTokenObj.SetTokenStr(outWord);
                    outTokenObj.AddProcToHist(TokenObj.HIST_RW_S);                     //split
                    DebugPrint.PrintCorrect("RW", "RealWordSplitCorrector", inWord, outWord, debugFlag);
                }
            }
            return outTokenObj;
        }