/// <summary>
/// Attempts a real-word merge correction for the token at <paramref name="tarPos"/>.
/// <list type="bullet">
/// <item><description>Detect whether the target word qualifies for real-word merge.</description></item>
/// <item><description>Collect merge candidates around the target position.</description></item>
/// <item><description>Rank the candidates (context / frequency, depending on the ranker's mode)
/// and return the top one.</description></item>
/// </list>
/// </summary>
/// <param name="tarPos">Position of the target token inside <paramref name="nonSpaceTokenList"/>.</param>
/// <param name="nonSpaceTokenList">Token list without space tokens.</param>
/// <param name="cSpellApi">CSpell API object; passed to the detector, candidate generator and ranker,
/// and its detection counter is incremented when the target is detected.</param>
/// <param name="debugFlag">Flag for debug print.</param>
/// <returns>
/// The top-ranked <c>MergeObj</c> as returned by the ranker when the target token has an empty
/// process history and is detected for real-word merge; otherwise <c>null</c> (no merge).
/// </returns>
public static MergeObj GetCorrectTerm(int tarPos, List<TokenObj> nonSpaceTokenList, CSpellApi cSpellApi, bool debugFlag)
{
    // The raw token string is used as-is: no core-term stripping is applied for
    // real-word merge, which keeps this correction less aggressive.
    TokenObj focusToken = nonSpaceTokenList[tarPos];
    string focusWord = focusToken.GetTokenStr();

    // Tokens that already carry process history are never merged again.
    if (focusToken.GetProcHist().Count != 0)
    {
        return null;
    }

    // Not a real-word merge target: nothing to do.
    if (!RealWordMergeDetector.IsDetect(focusWord, cSpellApi, debugFlag))
    {
        return null;
    }

    cSpellApi.UpdateDetectNo();

    // TBD (original author): possessives such as "xxx's" are not specially handled here.
    // Original author note: hyphenated merges are not used for real-word merge
    // (not visible in this call - presumably handled inside the candidate generator).
    HashSet<MergeObj> candidates = RealWordMergeCandidates.GetCandidates(tarPos, nonSpaceTokenList, cSpellApi);

    // Ranking picks the top candidate; per the original notes it relies on frequency
    // and/or context scores (no orthographic score).
    return RankRealWordMergeByMode.GetTopRankMergeObj(candidates, cSpellApi, debugFlag, tarPos, nonSpaceTokenList);
}
/// <summary>
/// Attempts a real-word one-to-one correction of a single token.
/// <list type="bullet">
/// <item><description>Convert the input word to its core term.</description></item>
/// <item><description>Detect whether the core term is a real-word correction target.</description></item>
/// <item><description>Collect one-to-one candidates.</description></item>
/// <item><description>Rank the candidates using context and take the top one.</description></item>
/// <item><description>Update the output token and the API counters when the word changed.</description></item>
/// </list>
/// </summary>
/// <param name="inTokenObj">The input token (single word).</param>
/// <param name="cSpellApi">CSpell API object.</param>
/// <param name="debugFlag">Flag for debug print.</param>
/// <param name="tarPos">Position of the target token in <paramref name="nonSpaceTokenList"/>.</param>
/// <param name="nonSpaceTokenList">Token list without space tokens.</param>
/// <returns>
/// A new <c>TokenObj</c> built from <paramref name="inTokenObj"/>. Its token string and process
/// history are updated only when a correction that differs from the input (ignoring case) is found.
/// </returns>
public static TokenObj GetCorrectTerm(TokenObj inTokenObj, CSpellApi cSpellApi, bool debugFlag, int tarPos, List<TokenObj> nonSpaceTokenList)
{
    // Start from a TokenObj built from the input; it is returned unchanged when no correction applies.
    string inWord = inTokenObj.GetTokenStr();
    TokenObj outTokenObj = new TokenObj(inTokenObj);

    // 1. Convert the word to its core term (per the original notes: no leading/ending
    //    space, punctuation or digit).
    int ctType = CoreTermUtil.CT_TYPE_SPACE_PUNC_DIGIT;
    CoreTermObj coreTermObj = new CoreTermObj(inWord, ctType);
    string coreStr = coreTermObj.GetCoreTerm();

    // 2. Only tokens with an empty process history that are detected as real-word
    //    one-to-one targets are corrected.
    if (inTokenObj.GetProcHist().Count != 0
        || !RealWord1To1Detector.IsDetect(inWord, coreStr, cSpellApi, debugFlag))
    {
        return outTokenObj;
    }

    cSpellApi.UpdateDetectNo();

    // 3. Get the one-to-one candidate set.
    // TBD (original author): possessives such as "xxx's" are not specially handled here.
    // NOTE (original author): candidate generation was reported as very slow (about 7 min.)
    // and needs improvement.
    HashSet<string> candSet = RealWord1To1Candidates.GetCandidates(coreStr, cSpellApi);

    // 4. Ranking: take the top-ranked candidate, using context.
    string topRankStr = RankRealWord1To1ByCSpell.GetTopRankStr(coreStr, candSet, cSpellApi, tarPos, nonSpaceTokenList, debugFlag);

    // 5. Put the corrected core term back and rebuild the full word.
    coreTermObj.SetCoreTerm(topRankStr);
    string outWord = coreTermObj.ToString();

    // 6. Record the correction only when the word actually changed; a case-only
    //    difference does not count as a correction.
    if (!inWord.Equals(outWord, StringComparison.OrdinalIgnoreCase))
    {
        cSpellApi.UpdateCorrectNo();
        outTokenObj.SetTokenStr(outWord);
        outTokenObj.AddProcToHist(TokenObj.HIST_RW_1); // 1-to-1
        DebugPrint.PrintCorrect("RW", "RealWord1To1Corrector", inWord, outWord, debugFlag);
    }

    return outTokenObj;
}
/// <summary>
/// Builds a new token list in which each span covered by a merge object is replaced by a
/// single merged token; all other tokens are copied through unchanged.
/// </summary>
/// <remarks>
/// Per the original author's note, <paramref name="mergeObjList"/> is expected to be in the same
/// index order as <paramref name="inTokenList"/>. The list is first passed through
/// <c>CleanUpMergeObjList</c> to remove contained and overlapping merges. As a safety net, any merge
/// whose start index lies before the current write position (i.e. it overlaps a merge that was
/// already applied) is skipped, so the earlier merge wins. Without this guard an overlapping merge
/// appended a second merged token for the same input tokens
/// (e.g. "imple ment ation" => "implementimplementation").
/// </remarks>
/// <param name="inTokenList">The input token list the merge indexes refer to.</param>
/// <param name="mergeObjList">Merge objects to apply.</param>
/// <param name="procHistStr">Process-history prefix recorded on the merged token; also passed
/// to the debug print.</param>
/// <param name="debugFlag">Flag for debug print.</param>
/// <param name="cSpellApi">CSpell API object; its correction counter is incremented once per
/// applied merge whose target index lies inside the merged span.</param>
/// <returns>A new token list with the merges applied.</returns>
public static List<TokenObj> CorrectTokenListByMerge(List<TokenObj> inTokenList, List<MergeObj> mergeObjList, string procHistStr, bool debugFlag, CSpellApi cSpellApi)
{
    // 0. Unify the mergeObjList to remove contained and overlapping merges.
    List<MergeObj> mergeObjListC = CleanUpMergeObjList(mergeObjList);
    List<TokenObj> outTokenList = new List<TokenObj>();

    // 1. Go through all merge objects; curIndex is the next input index not yet emitted.
    int curIndex = 0;
    foreach (MergeObj mergeObj in mergeObjListC)
    {
        int startIndex = mergeObj.GetStartIndex();
        int endIndex = mergeObj.GetEndIndex();

        // Skip a merge that overlaps tokens already consumed by a previous merge;
        // applying it would duplicate text and drop the tokens in between.
        if (startIndex < curIndex)
        {
            continue;
        }

        // 1.1 Copy the tokens that precede the merge span.
        for (int i = curIndex; i < startIndex; i++)
        {
            outTokenList.Add(inTokenList[i]);
        }

        // 1.2 Build the merged token.
        string mergeWord = mergeObj.GetMergeWord();
        string orgMergeWord = mergeObj.GetOrgMergeWord();
        string tarWord = mergeObj.GetTarWord();
        int tarIndex = mergeObj.GetTarIndex();
        TokenObj mergeTokenObj = new TokenObj(orgMergeWord, mergeWord);

        // 1.3 Update the process history of the merged token.
        for (int i = startIndex; i <= endIndex; i++)
        {
            if (i == tarIndex)
            {
                // Merge focus token: count the correction and record this merge.
                cSpellApi.UpdateCorrectNo();
                mergeTokenObj.AddProcToHist(procHistStr + TokenObj.MERGE_START_STR + tarWord + TokenObj.MERGE_END_STR);
                DebugPrint.PrintCorrect(procHistStr, "MergeCorrector (" + tarWord + ")", orgMergeWord, mergeWord, debugFlag);
            }
            else
            {
                // Context token: carry over its existing history, tagged with its own token string.
                TokenObj contextToken = inTokenList[i];
                foreach (string procHist in contextToken.GetProcHist())
                {
                    mergeTokenObj.AddProcToHist(procHist + TokenObj.MERGE_START_STR + contextToken.GetTokenStr() + TokenObj.MERGE_END_STR);
                }
            }
        }

        outTokenList.Add(mergeTokenObj);
        curIndex = endIndex + 1;
    }

    // 2. Copy the tokens after the last applied merge.
    for (int i = curIndex; i < inTokenList.Count; i++)
    {
        outTokenList.Add(inTokenList[i]);
    }

    return outTokenList;
}
/// <summary>
/// Attempts a real-word split correction of a single token.
/// <list type="bullet">
/// <item><description>Convert the input word to its core term.</description></item>
/// <item><description>Detect whether the core term is a real-word split target.</description></item>
/// <item><description>Collect split candidates.</description></item>
/// <item><description>Rank the candidates using context and take the top one.</description></item>
/// <item><description>Update the output token and the API counters when the word changed.</description></item>
/// </list>
/// </summary>
/// <param name="inTokenObj">The input token (single word).</param>
/// <param name="cSpellApi">CSpell API object.</param>
/// <param name="debugFlag">Flag for debug print.</param>
/// <param name="tarPos">Position of the target token to be split.</param>
/// <param name="nonSpaceTokenList">Token list without space tokens.</param>
/// <returns>
/// A new <c>TokenObj</c> built from <paramref name="inTokenObj"/>. Its token string and process
/// history are updated only when the rebuilt word differs from the input word.
/// </returns>
public static TokenObj GetCorrectTerm(TokenObj inTokenObj, CSpellApi cSpellApi, bool debugFlag, int tarPos, List<TokenObj> nonSpaceTokenList)
{
    // Start from a TokenObj built from the input; it is returned unchanged when no correction applies.
    string inWord = inTokenObj.GetTokenStr();
    TokenObj outTokenObj = new TokenObj(inTokenObj);

    // 1. Convert the word to its core term (per the original notes: no leading/ending
    //    space, punctuation or digit).
    int ctType = CoreTermUtil.CT_TYPE_SPACE_PUNC_DIGIT;
    CoreTermObj coreTermObj = new CoreTermObj(inWord, ctType);
    string coreStr = coreTermObj.GetCoreTerm();

    // 2. Real-word split detection: only tokens with an empty process history that the
    //    split detector flags are corrected.
    if (inTokenObj.GetProcHist().Count != 0
        || !RealWordSplitDetector.IsDetect(inWord, coreStr, cSpellApi, debugFlag))
    {
        return outTokenObj;
    }

    cSpellApi.UpdateDetectNo();

    // 3. Get the split candidate set, limited by the configured maximum split number.
    // TBD (original author): possessives such as "xxx's" are not specially handled here.
    int maxSplitNo = cSpellApi.GetCanRwMaxSplitNo();
    HashSet<string> splitSet = RealWordSplitCandidates.GetCandidates(coreStr, cSpellApi, maxSplitNo);

    // 4. Ranking: take the top-ranked candidate, using context.
    string topRankStr = RankRealWordSplitByMode.GetTopRankStr(coreStr, splitSet, cSpellApi, debugFlag, tarPos, nonSpaceTokenList);

    // 5. Put the corrected core term back and rebuild the full word.
    coreTermObj.SetCoreTerm(topRankStr);
    string outWord = coreTermObj.ToString();

    // 6. Record the correction only when the word actually changed (case-sensitive comparison).
    if (!inWord.Equals(outWord))
    {
        cSpellApi.UpdateCorrectNo();
        outTokenObj.SetTokenStr(outWord);
        outTokenObj.AddProcToHist(TokenObj.HIST_RW_S); // split
        DebugPrint.PrintCorrect("RW", "RealWordSplitCorrector", inWord, outWord, debugFlag);
    }

    return outTokenObj;
}