/// <summary>
        /// This method handles a leading digit by adding a space after the digit.
        /// It is designed to work on the input of a single word.
        /// </summary>
        /// <param name="inWord">  the input token (single word)
        /// </param>
        /// <returns>   the corrected word, does not change the case,
        ///           the original input token is returned if no mapping is found. </returns>
        public static string Process(string inWord)
        {
            // Wrap the token in a core-term object (original example: "hereditary2)")
            // using the space/punctuation core-term type.
            CoreTermObj coreTermObj = new CoreTermObj(inWord, CoreTermUtil.CT_TYPE_SPACE_PUNC);
            string      coreTerm    = coreTermObj.GetCoreTerm();

            // Run the class-level digit pattern (defined elsewhere) on the core term.
            var digitMatch = patternED_.Match(coreTerm);

            // No pattern match: nothing to split, return the input untouched.
            if (!digitMatch.Success)
            {
                return inWord;
            }

            // Tokens made only of digits/punctuation must not be split.
            if (DigitPuncTokenUtil.IsDigitPunc(coreTerm))
            {
                return inWord;
            }

            // Known exceptions are left as they are; only non-exceptions are split.
            if (IsException(coreTerm))
            {
                return inWord;
            }

            // Re-assemble the core term as group1 + group2 + space + group3 and
            // render the whole token back from the core-term object.
            string splitCoreTerm = matcherGroups(digitMatch);
            coreTermObj.SetCoreTerm(splitCoreTerm);
            return coreTermObj.ToString();

            // Local helper: joins the three captured groups with a single space
            // inserted before the third one.
            string matcherGroups(System.Text.RegularExpressions.Match m)
            {
                return m.Groups[1].Value + m.Groups[2].Value + GlobalVars.SPACE_STR + m.Groups[3].Value;
            }
        }
Пример #2
0
        // public method
        /// <summary>
        /// The core method to correct a word by following steps:
        /// <ul>
        /// <li>Convert inToken to coreTerm
        /// <li>detect if real-word
        /// <li>get candidates
        ///     <ul>
        ///     <li>get candidates from one-to-one.
        ///     </ul>
        /// <li>Rank candidates
        ///     <ul>
        ///     <li>context
        ///     </ul>
        /// <li>Update information
        ///
        /// </ul>
        /// </summary>
        /// <param name="inTokenObj">    the input tokenObj (single word) </param>
        /// <param name="cSpellApi"> CSpell Api object </param>
        /// <param name="debugFlag"> flag for debug print </param>
        /// <param name="tarPos"> the position for target token </param>
        /// <param name="nonSpaceTokenList"> token list without space token(s)
        /// </param>
        /// <returns>    the corrected word in tokenObj if suggested word found.
        ///             Otherwise, the original input token is returned. </returns>
        // return the original term if no good corrections are found
        public static TokenObj GetCorrectTerm(TokenObj inTokenObj, CSpellApi cSpellApi, bool debugFlag, int tarPos, List <TokenObj> nonSpaceTokenList)
        {
            // NOTE(review): funcMode is never read in this method; the call is
            // retained so the sequence of calls on cSpellApi is unchanged.
            int funcMode = cSpellApi.GetFuncMode();

            // The result starts out as a copy of the input token.
            string   inWord      = inTokenObj.GetTokenStr();
            TokenObj outTokenObj = new TokenObj(inTokenObj);

            // Step 1: reduce the word to its core term (space/punc/digit type).
            CoreTermObj coreTermObj = new CoreTermObj(inWord, CoreTermUtil.CT_TYPE_SPACE_PUNC_DIGIT);
            string      coreStr     = coreTermObj.GetCoreTerm();

            // Step 2: only tokens with an empty process history are considered ...
            if (inTokenObj.GetProcHist().Count != 0)
            {
                return outTokenObj;
            }
            // ... and only when the real-word 1-to-1 detector flags them.
            if (!RealWord1To1Detector.IsDetect(inWord, coreStr, cSpellApi, debugFlag))
            {
                return outTokenObj;
            }
            cSpellApi.UpdateDetectNo();

            // Step 3: collect the 1-to-1 candidates for the core term.
            // TBD (from original): possessives (xxx's) and metaphone-based
            // real-word candidates are not handled; this step was noted as slow.
            HashSet <string> candidates = RealWord1To1Candidates.GetCandidates(coreStr, cSpellApi);

            // Step 4: rank the candidates (context is passed via tarPos and
            // nonSpaceTokenList) and take the top one.
            string bestCandidate = RankRealWord1To1ByCSpell.GetTopRankStr(coreStr, candidates, cSpellApi, tarPos, nonSpaceTokenList, debugFlag);

            // Step 5: put the top candidate back into the core-term object and
            // render the full token string.
            coreTermObj.SetCoreTerm(bestCandidate);
            string outWord = coreTermObj.ToString();

            // Step 6: record a correction only when the result differs from the
            // input, ignoring case.
            bool unchanged = inWord.Equals(outWord, StringComparison.OrdinalIgnoreCase);
            if (!unchanged)
            {
                cSpellApi.UpdateCorrectNo();
                outTokenObj.SetTokenStr(outWord);
                outTokenObj.AddProcToHist(TokenObj.HIST_RW_1);   // 1-to-1
                DebugPrint.PrintCorrect("RW", "RealWord1To1Corrector", inWord, outWord, debugFlag);
            }
            return outTokenObj;
        }
        /// <summary>
        /// This method splits the input word by adding a space after ending
        /// punctuation.  The input must be single word (no space).
        /// The process method splits the inWord by adding space(s) after endingPunc.
        /// Current algorithm can only handle max. up to 3 endingPuncs.
        /// One in each component of coreTermObj: coreTerm, prefix, and suffix.
        /// - prefix: leading str with punc|space|number
        /// - coreterm: = the original str - prefix - suffix
        /// - suffix: ending str with punc|space|number
        /// This can be improved by using recursive algorithm in the coreTerm.
        /// For example: "ankle,before.The" in 15737.txt will be split twice in
        /// recursive algorithm.
        /// </summary>
        /// <param name="inWord">  the input token (single word)
        /// </param>
        /// <returns>   the split word. </returns>
        public static string Process(string inWord)
        {
            // Skip tokens that do not pass the qualification check; they are
            // returned unchanged.
            if (!IsQualified(inWord))
            {
                return inWord;
            }

            // 0. Convert to a core-term object (prefix / coreTerm / suffix).
            CoreTermObj cto       = new CoreTermObj(inWord, CoreTermUtil.CT_TYPE_SPACE_PUNC_DIGIT);
            bool        splitFlag = false;

            // 1. Core term: if it contains an ending punctuation, replace it with
            //    the split string produced for the last one found.
            string inCoreTerm     = cto.GetCoreTerm();
            string lastEndingPunc = FindLastEndingPunc(inCoreTerm);
            if (lastEndingPunc != null)
            {
                cto.SetCoreTerm(EndingPunc.GetSplitStr(inCoreTerm, lastEndingPunc));
                splitFlag = true;
            }

            // 2. Prefix: when it is non-empty and ends with an ending punctuation,
            //    append a space to it.
            string prefix = cto.GetPrefix();
            if ((prefix.Length != 0) && EndsWithEndingPunc(prefix))
            {
                cto.SetPrefix(prefix + GlobalVars.SPACE_STR);
                splitFlag = true;
            }

            // 3. Suffix: split only when it contains an ending punctuation but is
            //    not made purely of ending punctuation.
            string suffix = cto.GetSuffix();
            if ((suffix.Length != 0) && ContainsEndingPunc(suffix) && !IsPureEndingPunc(suffix))
            {
                string lastSuffixPunc = FindLastEndingPunc(suffix);
                if (lastSuffixPunc != null)
                {
                    cto.SetSuffix(EndingPunc.GetSplitStr(suffix, lastSuffixPunc));
                    splitFlag = true;
                }
            }

            // Render the token from the core-term object only if something
            // changed; otherwise hand back the original input.
            return splitFlag ? cto.ToString() : inWord;
        }
        // broader matcher
        private static bool IsQualified(string inWord)
        {
            // The e-mail and URL checks run on the core term rather than the raw word.
            CoreTermObj coreTermObj = new CoreTermObj(inWord, CoreTermUtil.CT_TYPE_SPACE_PUNC_DIGIT);
            string      coreTerm    = coreTermObj.GetCoreTerm();

            // The word must contain an ending punctuation to be a candidate.
            if (!ContainsEndingPunc(inWord))
            {
                return false;
            }
            // E-mail addresses are never split.
            if (InternetTokenUtil.IsEmail(coreTerm))
            {
                return false;
            }
            // URLs are never split.
            if (InternetTokenUtil.IsUrl(coreTerm))
            {
                return false;
            }
            // Finally, skip words that are digit/punctuation tokens.
            return !DigitPuncTokenUtil.IsDigitPunc(inWord);
        }
        // broader matcher
        // Broader matcher: a word qualifies exactly when ContainsLeadingPunc
        // reports true for it.
        // The previous version also built a CoreTermObj and fetched its core term
        // (a leftover from the URL/e-mail variant of this check) but never used
        // either value; that dead work has been removed.
        private static bool IsQualified(string inWord)
        {
            return ContainsLeadingPunc(inWord);
        }
        // public method
        /// <summary>
        /// The core method to correct a word by following steps:
        /// <ul>
        /// <li>Convert inToken to coreTerm
        /// <li>detect if real-word
        /// <li>get split candidates
        /// <li>Rank candidates
        ///     <ul>
        ///     <li>context
        ///     </ul>
        /// <li>Update information
        ///
        /// </ul>
        /// </summary>
        /// <param name="inTokenObj">    the input tokenObj (single word) </param>
        /// <param name="cSpellApi"> cSpell API object </param>
        /// <param name="debugFlag"> flag for debug print </param>
        /// <param name="tarPos"> position of the target token to be split </param>
        /// <param name="nonSpaceTokenList"> the token list without space tokens
        /// </param>
        /// <returns>    the split words in tokenObj.  </returns>
        // return the original term if no good corrections are found
        public static TokenObj GetCorrectTerm(TokenObj inTokenObj, CSpellApi cSpellApi, bool debugFlag, int tarPos, List <TokenObj> nonSpaceTokenList)
        {
            // NOTE(review): funcMode is never read in this method; the call is
            // retained so the sequence of calls on cSpellApi is unchanged.
            int funcMode = cSpellApi.GetFuncMode();

            // The result starts out as a copy of the input token.
            string   inWord      = inTokenObj.GetTokenStr();
            TokenObj outTokenObj = new TokenObj(inTokenObj);

            // Step 1: reduce the word to its core term (space/punc/digit type).
            CoreTermObj coreTermObj = new CoreTermObj(inWord, CoreTermUtil.CT_TYPE_SPACE_PUNC_DIGIT);
            string      coreStr     = coreTermObj.GetCoreTerm();

            // Step 2: real-word split detection. Tokens that already carry a
            // process history are skipped, as are tokens the detector rejects.
            if (inTokenObj.GetProcHist().Count != 0)
            {
                return outTokenObj;
            }
            if (!RealWordSplitDetector.IsDetect(inWord, coreStr, cSpellApi, debugFlag))
            {
                return outTokenObj;
            }
            cSpellApi.UpdateDetectNo();

            // Step 3: gather split candidates, bounded by the configured maximum
            // number of splits. TBD (from original): possessives (xxx's).
            int splitLimit = cSpellApi.GetCanRwMaxSplitNo();
            HashSet <string> splitCandidates = RealWordSplitCandidates.GetCandidates(coreStr, cSpellApi, splitLimit);

            // Step 4: rank the candidates (context is passed via tarPos and
            // nonSpaceTokenList) and take the top one.
            string bestCandidate = RankRealWordSplitByMode.GetTopRankStr(coreStr, splitCandidates, cSpellApi, debugFlag, tarPos, nonSpaceTokenList);

            // Step 5: put the top candidate back into the core-term object and
            // render the full token string.
            coreTermObj.SetCoreTerm(bestCandidate);
            string outWord = coreTermObj.ToString();

            // Step 6: record a correction only when the rendered word differs
            // from the input.
            bool unchanged = inWord.Equals(outWord);
            if (!unchanged)
            {
                cSpellApi.UpdateCorrectNo();
                outTokenObj.SetTokenStr(outWord);
                outTokenObj.AddProcToHist(TokenObj.HIST_RW_S);   // split
                DebugPrint.PrintCorrect("RW", "RealWordSplitCorrector", inWord, outWord, debugFlag);
            }
            return outTokenObj;
        }
Пример #7
0
        /// <summary>
        /// This method uses context scores to find the correct term.
        /// </summary>
        /// <param name="inTokenObj">    the input tokenObj (single word) </param>
        /// <param name="cSpellApi"> CSpell Api object </param>
        /// <param name="debugFlag"> flag for debug print </param>
        /// <param name="tarPos"> position for target token </param>
        /// <param name="nonSpaceTokenList"> token list without space token(s)
        /// </param>
        /// <returns>    the corrected word in tokenObj if the coreTerm is OOV
        ///             and suggested word found. Otherwise, the original input token
        ///             is returned. </returns>
        public static TokenObj GetCorrectTerm(TokenObj inTokenObj, CSpellApi cSpellApi, bool debugFlag, int tarPos, List <TokenObj> nonSpaceTokenList)
        {
            // The function mode selects which candidate sources are used below.
            int funcMode = cSpellApi.GetFuncMode();

            // The result starts out as a copy of the input token.
            string   inWord      = inTokenObj.GetTokenStr();
            TokenObj outTokenObj = new TokenObj(inTokenObj);

            // 1. Reduce the word to its core term (space/punc/digit type).
            CoreTermObj coreTermObj = new CoreTermObj(inWord, CoreTermUtil.CT_TYPE_SPACE_PUNC_DIGIT);
            string      coreStr     = coreTermObj.GetCoreTerm();

            // 2. Non-word detection: nothing to do unless the detector flags the
            //    core term. TBD (from original): separate 1-to-1 and split.
            if (!NonWordDetector.IsDetect(inWord, coreStr, cSpellApi, debugFlag))
            {
                return outTokenObj;
            }
            cSpellApi.UpdateDetectNo();

            // 3.1 1-to-1 candidates (no split).
            //     TBD (from original): possessives (xxx's) are not handled here.
            HashSet <string> candSet = NonWord1To1Candidates.GetCandidates(coreStr, cSpellApi);

            // 3.2 Split candidates are computed in every mode except NW_1.
            if (funcMode != CSpellApi.FUNC_MODE_NW_1)
            {
                int maxSplitNo            = cSpellApi.GetCanNwMaxSplitNo();
                HashSet <string> splitSet = NonWordSplitCandidates.GetCandidates(coreStr, cSpellApi, maxSplitNo);

                if (funcMode == CSpellApi.FUNC_MODE_NW_S)
                {
                    // Split-only mode: the split candidates replace the 1-to-1 set.
                    candSet = new HashSet <string>(splitSet);
                }
                else
                {
                    // Other modes: merge the split candidates into the 1-to-1 set.
                    // UnionWith is the HashSet<T> equivalent of Java's addAll.
                    candSet.UnionWith(splitSet);
                }
            }

            // 4. Rank the candidates (context is passed via tarPos and
            //    nonSpaceTokenList) and take the top one.
            string topRankStr = RankNonWordByMode.GetTopRankStr(coreStr, candSet, cSpellApi, debugFlag, tarPos, nonSpaceTokenList);

            // 5. Put the top candidate back into the core-term object and render
            //    the full token string.
            coreTermObj.SetCoreTerm(topRankStr);
            string outWord = coreTermObj.ToString();

            // 6. Record a correction only when the rendered word differs from
            //    the input.
            if (!inWord.Equals(outWord))
            {
                outTokenObj.SetTokenStr(outWord);

                // A multiword result is logged as a split, anything else as a
                // 1-to-1 correction; the correction counter is bumped either way.
                bool isSplit = TermUtil.IsMultiword(outWord);
                cSpellApi.UpdateCorrectNo();
                if (isSplit)
                {
                    outTokenObj.AddProcToHist(TokenObj.HIST_NW_S);   // split
                    DebugPrint.PrintCorrect("NW", "NonWordCorrector-Split", inWord, outWord, debugFlag);
                }
                else
                {
                    outTokenObj.AddProcToHist(TokenObj.HIST_NW_1);   // 1-to-1
                    DebugPrint.PrintCorrect("NW", "NonWordCorrector-1To1", inWord, outWord, debugFlag);
                }
            }
            return outTokenObj;
        }