/// <summary>
/// Handles a leading digit in a single-word token by adding a space after
/// the digit part. It is designed to work on a single word.
/// The word is reduced to its core term (type <c>CT_TYPE_SPACE_PUNC</c>),
/// the core term is matched against <c>patternED_</c>, and on a split the
/// result is rebuilt through <c>CoreTermObj.ToString()</c>.
/// </summary>
/// <param name="inWord"> the input token (single word) </param>
/// <returns>
/// The word with a space inserted between match groups 2 and 3 of the core
/// term; the original input is returned unchanged when the core term does not
/// match, is a digit/punctuation token, or is a listed exception.
/// The case of the input is not changed by this method.
/// </returns>
public static string Process(string inWord)
{
    // Work on the core term so surrounding spaces/punctuation are set aside.
    CoreTermObj coreTermObj = new CoreTermObj(inWord, CoreTermUtil.CT_TYPE_SPACE_PUNC);
    string coreTerm = coreTermObj.GetCoreTerm();

    // No split when: the pattern does not match, the core term is a pure
    // digit/punctuation token, or the core term is an exception.
    var match = patternED_.Match(coreTerm);
    if (!match.Success
        || DigitPuncTokenUtil.IsDigitPunc(coreTerm)
        || IsException(coreTerm))
    {
        return inWord;
    }

    // Re-assemble the core term with a space between group 2 and group 3.
    string head = match.Groups[1].Value + match.Groups[2].Value;
    string tail = match.Groups[3].Value;
    coreTermObj.SetCoreTerm(head + GlobalVars.SPACE_STR + tail);

    return coreTermObj.ToString();
}
// public method
/// <summary>
/// Real-word 1-to-1 correction of a single token:
/// <ul>
/// <li>convert the token string to its core term</li>
/// <li>detect whether it is a real-word correction target</li>
/// <li>get the 1-to-1 candidate set</li>
/// <li>rank the candidates using context</li>
/// <li>update the output token and the API counters</li>
/// </ul>
/// Only tokens whose process history is empty are considered.
/// </summary>
/// <param name="inTokenObj"> the input tokenObj (single word) </param>
/// <param name="cSpellApi"> CSpell Api object </param>
/// <param name="debugFlag"> flag for debug print </param>
/// <param name="tarPos"> the position of the target token </param>
/// <param name="nonSpaceTokenList"> token list without space token(s) </param>
/// <returns>
/// A copy of the input token; its string is replaced and <c>HIST_RW_1</c> is
/// added to its history when the top-ranked candidate differs from the input
/// (compared ignoring case). Otherwise the copy carries the original string.
/// </returns>
public static TokenObj GetCorrectTerm(TokenObj inTokenObj, CSpellApi cSpellApi, bool debugFlag, int tarPos, List<TokenObj> nonSpaceTokenList)
{
    // The value is not consulted below; the call is kept as-is.
    int funcMode = cSpellApi.GetFuncMode();

    // Source word and the token that will be returned.
    string srcWord = inTokenObj.GetTokenStr();
    TokenObj resultToken = new TokenObj(inTokenObj);

    // 1. core term (no leading/ending space, punc, digit)
    CoreTermObj core = new CoreTermObj(srcWord, CoreTermUtil.CT_TYPE_SPACE_PUNC_DIGIT);
    string coreStr = core.GetCoreTerm();

    // 2. real-word detection: skip tokens already processed or not detected
    bool notYetProcessed = inTokenObj.GetProcHist().Count == 0;
    if (!notYetProcessed
        || !RealWord1To1Detector.IsDetect(srcWord, coreStr, cSpellApi, debugFlag))
    {
        return resultToken;
    }
    cSpellApi.UpdateDetectNo();

    // 3. 1-to-1 candidates
    // TBD: possessive xxx's handling; realWordFlag to use metaphone.
    // NOTE: this step was reported as very slow (7 min.) and needs improvement.
    HashSet<string> candidates = RealWord1To1Candidates.GetCandidates(coreStr, cSpellApi);

    // 4. ranking with context: pick the top-ranked candidate
    string best = RankRealWord1To1ByCSpell.GetTopRankStr(coreStr, candidates, cSpellApi, tarPos, nonSpaceTokenList, debugFlag);

    // 5. put the chosen term back into the core-term object and rebuild the word
    core.SetCoreTerm(best);
    string fixedWord = core.ToString();

    // 6. nothing to record when the word is unchanged (case-insensitive)
    if (srcWord.Equals(fixedWord, StringComparison.OrdinalIgnoreCase))
    {
        return resultToken;
    }

    cSpellApi.UpdateCorrectNo();
    resultToken.SetTokenStr(fixedWord);
    resultToken.AddProcToHist(TokenObj.HIST_RW_1); // 1-to-1
    DebugPrint.PrintCorrect("RW", "RealWord1To1Corrector", srcWord, fixedWord, debugFlag);
    return resultToken;
}
/// <summary>
/// Splits a single word (no space) by adding a space after ending
/// punctuation. At most three splits are made, one in each component of the
/// core-term object:
/// - prefix: a space is appended when the prefix ends with an ending punctuation
/// - core term: split after its last ending punctuation
/// - suffix: split after its last ending punctuation, unless the suffix
///   consists purely of ending punctuation
/// This could be improved with a recursive algorithm on the core term; e.g.
/// "ankle,before.The" would then be split twice.
/// </summary>
/// <param name="inWord"> the input token (single word) </param>
/// <returns>
/// The split word rebuilt from the core-term object, or the original input
/// when the word is not qualified or no component was changed.
/// </returns>
public static string Process(string inWord)
{
    // Quick exit for words that cannot be split.
    if (!IsQualified(inWord))
    {
        return inWord;
    }

    // 0. decompose into prefix / core term / suffix
    CoreTermObj parts = new CoreTermObj(inWord, CoreTermUtil.CT_TYPE_SPACE_PUNC_DIGIT);
    bool changed = false;

    // 1. core term: add a space after the last ending punctuation
    string coreTerm = parts.GetCoreTerm();
    string corePunc = FindLastEndingPunc(coreTerm);
    if (corePunc != null)
    {
        parts.SetCoreTerm(EndingPunc.GetSplitStr(coreTerm, corePunc));
        changed = true;
    }

    // 2. prefix: append a space when it ends with an ending punctuation
    string prefix = parts.GetPrefix();
    if (prefix.Length > 0 && EndsWithEndingPunc(prefix))
    {
        parts.SetPrefix(prefix + GlobalVars.SPACE_STR);
        changed = true;
    }

    // 3. suffix: split after its last ending punctuation, but leave a
    //    suffix made only of ending punctuation alone
    string suffix = parts.GetSuffix();
    if (suffix.Length > 0
        && ContainsEndingPunc(suffix)
        && !IsPureEndingPunc(suffix))
    {
        string suffixPunc = FindLastEndingPunc(suffix);
        if (suffixPunc != null)
        {
            parts.SetSuffix(EndingPunc.GetSplitStr(suffix, suffixPunc));
            changed = true;
        }
    }

    // Rebuild the word only when at least one component was modified.
    return changed ? parts.ToString() : inWord;
}
// broader matcher
/// <summary>
/// Pre-check used before splitting on ending punctuation.
/// A word qualifies when it contains an ending punctuation, its core term is
/// neither an e-mail nor a URL, and the word is not a digit/punctuation token.
/// </summary>
/// <param name="inWord"> the input token (single word) </param>
/// <returns> true if the word passes all of the checks above </returns>
private static bool IsQualified(string inWord)
{
    // The e-mail and URL checks run on the core term, not on the raw word.
    CoreTermObj coreTermObj = new CoreTermObj(inWord, CoreTermUtil.CT_TYPE_SPACE_PUNC_DIGIT);
    string coreTerm = coreTermObj.GetCoreTerm();

    return ContainsEndingPunc(inWord)
        && !InternetTokenUtil.IsEmail(coreTerm)
        && !InternetTokenUtil.IsUrl(coreTerm)
        && !DigitPuncTokenUtil.IsDigitPunc(inWord); // skip if digitPunc
}
// broader matcher
/// <summary>
/// Pre-check used before splitting on leading punctuation: a word qualifies
/// when it contains a leading punctuation.
/// </summary>
/// <param name="inWord"> the input token (single word) </param>
/// <returns> true if <c>ContainsLeadingPunc</c> accepts the word </returns>
private static bool IsQualified(string inWord)
{
    // Only the raw word decides the result; no core term is needed here.
    return ContainsLeadingPunc(inWord);
}
// public method
/// <summary>
/// Real-word split correction of a single token:
/// <ul>
/// <li>convert the token string to its core term</li>
/// <li>detect whether it is a real-word split target</li>
/// <li>get the split candidate set</li>
/// <li>rank the candidates using context</li>
/// <li>update the output token and the API counters</li>
/// </ul>
/// Only tokens whose process history is empty are considered.
/// </summary>
/// <param name="inTokenObj"> the input tokenObj (single word) </param>
/// <param name="cSpellApi"> cSpell API object </param>
/// <param name="debugFlag"> flag for debug print </param>
/// <param name="tarPos"> position of the target token to be split </param>
/// <param name="nonSpaceTokenList"> the token list without space tokens </param>
/// <returns>
/// A copy of the input token; its string is replaced and <c>HIST_RW_S</c> is
/// added to its history when the top-ranked candidate differs from the input
/// (case-sensitive comparison). Otherwise the copy carries the original string.
/// </returns>
public static TokenObj GetCorrectTerm(TokenObj inTokenObj, CSpellApi cSpellApi, bool debugFlag, int tarPos, List<TokenObj> nonSpaceTokenList)
{
    // The value is not consulted below; the call is kept as-is.
    int funcMode = cSpellApi.GetFuncMode();

    // Source word and the token that will be returned.
    string srcWord = inTokenObj.GetTokenStr();
    TokenObj resultToken = new TokenObj(inTokenObj);

    // 1. core term (no leading/ending space, punc, digit)
    CoreTermObj core = new CoreTermObj(srcWord, CoreTermUtil.CT_TYPE_SPACE_PUNC_DIGIT);
    string coreStr = core.GetCoreTerm();

    // 2. real-word detection: skip tokens already processed or not detected
    bool notYetProcessed = inTokenObj.GetProcHist().Count == 0;
    if (!notYetProcessed
        || !RealWordSplitDetector.IsDetect(srcWord, coreStr, cSpellApi, debugFlag))
    {
        return resultToken;
    }
    cSpellApi.UpdateDetectNo();

    // 3. split candidates, limited by the configured max. split number
    // TBD: possessive xxx's handling.
    int splitLimit = cSpellApi.GetCanRwMaxSplitNo();
    HashSet<string> splitCandidates = RealWordSplitCandidates.GetCandidates(coreStr, cSpellApi, splitLimit);

    // 4. ranking with context: pick the top-ranked candidate
    string best = RankRealWordSplitByMode.GetTopRankStr(coreStr, splitCandidates, cSpellApi, debugFlag, tarPos, nonSpaceTokenList);

    // 5. put the chosen term back into the core-term object and rebuild the word
    core.SetCoreTerm(best);
    string fixedWord = core.ToString();

    // 6. nothing to record when the word is unchanged
    if (srcWord.Equals(fixedWord))
    {
        return resultToken;
    }

    cSpellApi.UpdateCorrectNo();
    resultToken.SetTokenStr(fixedWord);
    resultToken.AddProcToHist(TokenObj.HIST_RW_S); // split
    DebugPrint.PrintCorrect("RW", "RealWordSplitCorrector", srcWord, fixedWord, debugFlag);
    return resultToken;
}
/// <summary>
/// Non-word correction of a single token, using context scores to pick the
/// correct term:
/// <ul>
/// <li>convert the token string to its core term</li>
/// <li>detect whether the core term is a non-word spelling error</li>
/// <li>collect 1-to-1 and/or split candidates, depending on the function mode</li>
/// <li>rank the candidates using context</li>
/// <li>update the output token and the API counters</li>
/// </ul>
/// </summary>
/// <param name="inTokenObj"> the input tokenObj (single word) </param>
/// <param name="cSpellApi"> CSpell Api object </param>
/// <param name="debugFlag"> flag for debug print </param>
/// <param name="tarPos"> position of the target token </param>
/// <param name="nonSpaceTokenList"> token list without space token(s) </param>
/// <returns>
/// A copy of the input token. When the detector fires and the top-ranked
/// candidate differs from the input word, the copy carries the corrected
/// string and <c>HIST_NW_S</c> (multiword result) or <c>HIST_NW_1</c>
/// (single-word result) in its history. Otherwise it carries the original
/// string.
/// </returns>
public static TokenObj GetCorrectTerm(TokenObj inTokenObj, CSpellApi cSpellApi, bool debugFlag, int tarPos, List<TokenObj> nonSpaceTokenList)
{
    // init
    int funcMode = cSpellApi.GetFuncMode();

    // get inWord from inTokenObj and init outTokenObj
    string inWord = inTokenObj.GetTokenStr();
    TokenObj outTokenObj = new TokenObj(inTokenObj);

    // 1. convert a word to coreTerm (no leading/ending space, punc, digit)
    CoreTermObj coreTermObj = new CoreTermObj(inWord, CoreTermUtil.CT_TYPE_SPACE_PUNC_DIGIT);
    string coreStr = coreTermObj.GetCoreTerm();

    // 2. non-word detection: nothing to do when the core term is not flagged
    // TBD: need to separate 1-to-1 and split
    if (!NonWordDetector.IsDetect(inWord, coreStr, cSpellApi, debugFlag))
    {
        return outTokenObj;
    }
    cSpellApi.UpdateDetectNo();

    // TBD, should take care of possessive xxx's here
    // 3.1 get 1-to-1 candidates set from correction, no split
    HashSet<string> candSet = NonWord1To1Candidates.GetCandidates(coreStr, cSpellApi);

    if (funcMode != CSpellApi.FUNC_MODE_NW_1)
    {
        // 3.2 get candidates from split
        int maxSplitNo = cSpellApi.GetCanNwMaxSplitNo();
        HashSet<string> splitSet = NonWordSplitCandidates.GetCandidates(coreStr, cSpellApi, maxSplitNo);

        if (funcMode == CSpellApi.FUNC_MODE_NW_S)
        {
            // 3.3 split-only mode: the split candidates replace the 1-to-1 set
            candSet = new HashSet<string>(splitSet);
        }
        else
        {
            // 3.4 otherwise merge the split candidates into the 1-to-1 set
            // (HashSet<T> has no addAll; UnionWith is the set-union operation)
            candSet.UnionWith(splitSet);
        }
    }

    // 4. Ranking: get top ranked candidate as corrected term, using context
    string topRankStr = RankNonWordByMode.GetTopRankStr(coreStr, candSet, cSpellApi, debugFlag, tarPos, nonSpaceTokenList);

    // 5. update coreTerm and convert back to the full word
    coreTermObj.SetCoreTerm(topRankStr);
    string outWord = coreTermObj.ToString();

    // 6. update info if the word was changed
    if (inWord.Equals(outWord) == false)
    {
        outTokenObj.SetTokenStr(outWord);

        // A multiword result means the correction was a split.
        bool isSplit = TermUtil.IsMultiword(outWord);
        cSpellApi.UpdateCorrectNo();
        if (isSplit)
        {
            outTokenObj.AddProcToHist(TokenObj.HIST_NW_S); // split
            DebugPrint.PrintCorrect("NW", "NonWordCorrector-Split", inWord, outWord, debugFlag);
        }
        else
        {
            outTokenObj.AddProcToHist(TokenObj.HIST_NW_1); // 1-to-1
            DebugPrint.PrintCorrect("NW", "NonWordCorrector-1To1", inWord, outWord, debugFlag);
        }
    }

    return outTokenObj;
}