/// <summary>
/// Appends the correction result carried by <paramref name="inToken"/> to
/// <paramref name="inList"/>, choosing one of three actions from the token string:
/// <list type="number">
/// <item><description>null or empty string: nothing is added (the token is dropped);</description></item>
/// <item><description>not a multiword (per <c>TermUtil.IsMultiword</c>): handled as a
/// 1-to-1 correction via <c>Add1To1Correction</c>;</description></item>
/// <item><description>multiword: handled as a split via <c>AddSplitCorrection</c>.</description></item>
/// </list>
/// </summary>
/// <param name="inList"> the token list that receives the result </param>
/// <param name="inToken"> the token to be merged, kept, or split </param>
public static void AddSplit1To1Correction(List<TokenObj> inList, TokenObj inToken)
{
    string text = inToken.GetTokenStr();

    // 1. merge (delete): an empty token contributes nothing to the list
    if (string.IsNullOrEmpty(text))
    {
        return;
    }

    // 3. split: a multiword token is expanded into several tokens
    if (TermUtil.IsMultiword(text))
    {
        AddSplitCorrection(inList, inToken);
        return;
    }

    // 2. 1-to-1: a single-word token is kept as one token
    Add1To1Correction(inList, inToken);
}
/// <summary>
/// This method uses context scores to find the correct term for a non-word.
/// The input word is reduced to its core term, checked by the non-word
/// detector, and, when detected, replaced by the top-ranked candidate
/// (1-to-1 and/or split candidates, depending on the function mode).
/// </summary>
/// <param name="inTokenObj"> the input tokenObj (single word) </param>
/// <param name="cSpellApi"> CSpell Api object </param>
/// <param name="debugFlag"> flag for debug print </param>
/// <param name="tarPos"> position for target token </param>
/// <param name="nonSpaceTokenList"> token list without space token(s)
/// </param>
/// <returns> a new tokenObj copied from the input; its string is the
/// corrected word (and its history is updated) if the coreTerm is detected
/// as a non-word and the top-ranked candidate changes the word.
/// Otherwise, the copy keeps the original input string. </returns>
public static TokenObj GetCorrectTerm(TokenObj inTokenObj, CSpellApi cSpellApi, bool debugFlag, int tarPos, List<TokenObj> nonSpaceTokenList)
{
    // init
    int funcMode = cSpellApi.GetFuncMode();

    // get inWord from inTokenObj and init outTokenObj
    string inWord = inTokenObj.GetTokenStr();
    TokenObj outTokenObj = new TokenObj(inTokenObj);

    // 1. convert a word to coreTerm (no leading/ending space, punc, digit)
    int ctType = CoreTermUtil.CT_TYPE_SPACE_PUNC_DIGIT;
    CoreTermObj coreTermObj = new CoreTermObj(inWord, ctType);
    string coreStr = coreTermObj.GetCoreTerm();

    // 2. non-word detection: nothing to correct if the core term is not
    // flagged as a non-word
    // TBD: need to separate 1-to-1 and split detection
    if (!NonWordDetector.IsDetect(inWord, coreStr, cSpellApi, debugFlag))
    {
        return outTokenObj;
    }

    cSpellApi.UpdateDetectNo();

    // TBD: should take care of possessive xxx's here
    // 3.1 get 1-to-1 candidates set from correction, no split
    HashSet<string> candSet = NonWord1To1Candidates.GetCandidates(coreStr, cSpellApi);

    if (funcMode != CSpellApi.FUNC_MODE_NW_1)
    {
        // 3.2 get candidates from split
        int maxSplitNo = cSpellApi.GetCanNwMaxSplitNo();
        HashSet<string> splitSet = NonWordSplitCandidates.GetCandidates(coreStr, cSpellApi, maxSplitNo);

        if (funcMode == CSpellApi.FUNC_MODE_NW_S)
        {
            // 3.3 split-only mode: split candidates replace the 1-to-1 candidates
            candSet = new HashSet<string>(splitSet);
        }
        else
        {
            // 3.4 combined mode: add split candidates to the 1-to-1 candidates.
            // UnionWith is the in-place set union on HashSet<T>.
            candSet.UnionWith(splitSet);
        }
    }

    // 4. Ranking: get the top ranked candidate, using context
    string topRankStr = RankNonWordByMode.GetTopRankStr(coreStr, candSet, cSpellApi, debugFlag, tarPos, nonSpaceTokenList);

    // 5. update coreTerm and convert back to the full word
    coreTermObj.SetCoreTerm(topRankStr);
    string outWord = coreTermObj.ToString();

    // 6. update info only if the word actually changed
    if (!inWord.Equals(outWord))
    {
        outTokenObj.SetTokenStr(outWord);

        // a multiword result means the correction was a split,
        // otherwise it was a 1-to-1 replacement
        bool isSplit = TermUtil.IsMultiword(outWord);
        cSpellApi.UpdateCorrectNo();

        if (isSplit)
        {
            outTokenObj.AddProcToHist(TokenObj.HIST_NW_S);
            DebugPrint.PrintCorrect("NW", "NonWordCorrector-Split", inWord, outWord, debugFlag);
        }
        else
        {
            outTokenObj.AddProcToHist(TokenObj.HIST_NW_1);
            DebugPrint.PrintCorrect("NW", "NonWordCorrector-1To1", inWord, outWord, debugFlag);
        }
    }

    return outTokenObj;
}