// public method
// process
public static List<TokenObj> Process(List<TokenObj> inTokenList, CSpellApi cSpellApi, bool debugFlag)
{
    DebugPrint.PrintProcess("5. RealWord-Merge", debugFlag);
    DebugPrint.PrintInText(TextObj.TokenListToText(inTokenList), debugFlag);
    // pre-process
    // update Pos for the inTokenList
    TextObj.UpdateIndexPos(inTokenList);
    // 1. remove space tokens and convert to a non-space-token list
    List<TokenObj> nonSpaceTokenList = TextObj.GetNonSpaceTokenObjList(inTokenList);
    // 2. process: go through each token for detection and correction
    // to find merge corrections (mergeObjList)
    int index = 0;
    List<MergeObj> mergeObjList = new List<MergeObj>();
    int maxLegitTokenLength = cSpellApi.GetMaxLegitTokenLength();
    while (index < inTokenList.Count)
    {
        TokenObj curTokenObj = inTokenList[index];
        // update the tarPos
        // SCR-3, use legit token
        if (curTokenObj.IsLegitToken(maxLegitTokenLength) == true)
        {
            int tarPos = inTokenList[index].GetPos();
            // the correct term is the highest-ranked candidate
            MergeObj mergeObj = RealWordMergeCorrector.GetCorrectTerm(tarPos, nonSpaceTokenList, cSpellApi, debugFlag);
            if (mergeObj == null) // no merge correction
            {
                index++;
            }
            else // has merge correction
            {
                mergeObjList.Add(mergeObj);
                // jump to the token after the end token; this ensures no overlapping merges
                index = mergeObj.GetEndIndex() + 1;
            }
        }
        else // space token
        {
            // update index
            index++;
        }
    }
    // update the output for merges over the whole inTokenList;
    // this has to be done after the loop because a merge might
    // involve a previous token.
    // update the tokenObj up to the merge, then go to the next token;
    // update the operation info also
    List<TokenObj> outTokenList = MergeCorrector.CorrectTokenListByMerge(inTokenList, mergeObjList, TokenObj.HIST_RW_M, debugFlag, cSpellApi);
    return outTokenList;
}
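// A minimal usage sketch, not taken from the source: it assumes this method lives in a
// class named RealWordMergeHandler, that CSpellApi can be constructed from a configuration
// file path, and that TextObj exposes a TextToTokenList(string) tokenization helper. Those
// three names are assumptions for illustration; only Process(...), TokenListToText(...), and
// the CSpellApi/TokenObj types appear in the code above.
public static class RealWordMergeUsageSketch
{
    public static string CorrectMerges(string inText, string configFile)
    {
        // hypothetical construction of the API from a configuration file
        CSpellApi cSpellApi = new CSpellApi(configFile);
        // hypothetical tokenization helper; the method above only shows the
        // inverse direction, TextObj.TokenListToText(...)
        List<TokenObj> inTokenList = TextObj.TextToTokenList(inText);
        // run real-word merge detection and correction (step 5 of the pipeline)
        List<TokenObj> outTokenList = Process(inTokenList, cSpellApi, false);
        return TextObj.TokenListToText(outTokenList);
    }
}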
public static List<TokenObj> Process(List<TokenObj> inTokenList, int ndMaxSplitNo, Dictionary<string, string> infExpMap, bool debugFlag)
{
    DebugPrint.PrintProcess("1. NonDictionary", debugFlag);
    DebugPrint.PrintInText(TextObj.TokenListToText(inTokenList), debugFlag);
    // process each tokenObj
    List<TokenObj> outTokenList = new List<TokenObj>(inTokenList
        .Select(token => XmlHtmlHandler.Process(token, debugFlag))
        .Select(token => EndingPuncSplitter.Process(token, ndMaxSplitNo, debugFlag))
        .SelectMany(token => TextObj.FlatTokenToArrayList(token))
        .Select(token => LeadingPuncSplitter.Process(token, ndMaxSplitNo, debugFlag))
        .SelectMany(token => TextObj.FlatTokenToArrayList(token))
        .Select(token => LeadingDigitSplitter.Process(token, debugFlag))
        .SelectMany(token => TextObj.FlatTokenToArrayList(token))
        .Select(token => EndingDigitSplitter.Process(token, debugFlag))
        .SelectMany(token => TextObj.FlatTokenToArrayList(token))
        .Select(token => InformalExpHandler.Process(token, infExpMap, debugFlag))
        .ToList());
    /*
     * List<TokenObj> outTokenList2 = inTokenList
     *     .Select(token => XmlHtmlHandler.Process(token, debugFlag))
     *     .Select(token => EndingPuncSplitter.Process(token, ndMaxSplitNo, debugFlag)).ToList();
     * foreach (var item in outTokenList2)
     * {
     *     Console.WriteLine(item.GetTokenStr());
     * }
     *
     * List<TokenObj> outTokenList3 = inTokenList
     *     .Select(token => XmlHtmlHandler.Process(token, debugFlag))
     *     .Select(token => EndingPuncSplitter.Process(token, ndMaxSplitNo, debugFlag))
     *     .SelectMany(token => TextObj.FlatTokenToArrayList(token)).ToList();
     * foreach (var item in outTokenList3)
     * {
     *     Console.WriteLine(item.GetTokenStr());
     * }
     *
     * List<TokenObj> outTokenList = inTokenList
     *     .Select(token => XmlHtmlHandler.Process(token, debugFlag))
     *     .Select(token => EndingPuncSplitter.Process(token, ndMaxSplitNo, debugFlag))
     *     .Select(token => LeadingPuncSplitter.Process(token, ndMaxSplitNo, debugFlag))
     *     .Select(token => LeadingDigitSplitter.Process(token, debugFlag))
     *     .Select(token => EndingDigitSplitter.Process(token, debugFlag))
     *     .Select(token => InformalExpHandler.Process(token, infExpMap, debugFlag)).ToList();
     */
    return outTokenList;
}
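// The pipeline above alternates Select (one token in, one token out) with
// SelectMany (flatten a token whose string was split into several tokens back into
// the stream). Below is a minimal, self-contained sketch of that pattern using plain
// strings instead of TokenObj; the toy splitter is illustrative only and is not part
// of cSpell.
using System;
using System.Collections.Generic;
using System.Linq;

public static class SplitFlattenSketch
{
    // toy "splitter": detaches a trailing period, loosely mimicking EndingPuncSplitter
    private static IEnumerable<string> SplitEndingPeriod(string token)
        => token.EndsWith(".") && token.Length > 1
            ? new[] { token.Substring(0, token.Length - 1), "." }
            : new[] { token };

    public static void Main()
    {
        var inTokens = new List<string> { "hello", "world." };
        // Select alone would yield nested sequences; SelectMany flattens each split
        // result back into the token stream, which is the role played above by
        // SelectMany(token => TextObj.FlatTokenToArrayList(token))
        List<string> outTokens = inTokens.SelectMany(SplitEndingPeriod).ToList();
        Console.WriteLine(string.Join(" | ", outTokens));
        // prints: hello | world | .
    }
}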
// public method
// Use: foreach loop, the latest implementation
// (the original for-loop implementation is to be deleted)
// the core of spell correction, including split corrections
// inTokenList is the whole text
public static List<TokenObj> Process(List<TokenObj> inTokenList, CSpellApi cSpellApi, bool debugFlag)
{
    DebugPrint.PrintProcess("3-4. NonWord-Split & 1To1", debugFlag);
    DebugPrint.PrintInText(TextObj.TokenListToText(inTokenList), debugFlag);
    // init the output TokenList
    List<TokenObj> outTokenList = new List<TokenObj>();
    // process: go through each token for detection and correction
    // for the 1-to-1 and split corrections
    int tarPos = 0; // position of the target token in the non-space token list
    // remove space tokens from the list
    List<TokenObj> nonSpaceTokenList = TextObj.GetNonSpaceTokenObjList(inTokenList);
    // use the inTokenList to keep the same space tokens
    TokenObj outTokenObj = null;
    int maxLegitTokenLength = cSpellApi.GetMaxLegitTokenLength();
    foreach (TokenObj tokenObj in inTokenList)
    {
        // no-context version:
        // TokenObj outTokenObj = SpellCorrector.GetCorrectTerm(tokenObj,
        //     cSpellApi, debugFlag);
        //
        // skip empty/space tokens and long tokens
        // SCR-3, use legit token
        if (tokenObj.IsLegitToken(maxLegitTokenLength) == true)
        {
            // the correct term is the highest-ranked candidate
            outTokenObj = NonWordCorrector.GetCorrectTerm(tokenObj, cSpellApi, debugFlag, tarPos, nonSpaceTokenList);
            // tarPos is used by the context module
            tarPos++;
        }
        else
        {
            outTokenObj = tokenObj;
        }
        // add the corrected tokenObj to the output token list;
        // flatten (FlatMap) because the correction might be a split
        Split1To1Corrector.AddSplit1To1Correction(outTokenList, outTokenObj);
    }
    return outTokenList;
}
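// Split1To1Corrector.AddSplit1To1Correction(...) appends the corrected token to the
// output list, expanding it into several tokens when the correction was a split
// (e.g. "ofthe" -> "of the"). Below is a minimal, self-contained sketch of that
// append-with-possible-split behaviour using plain strings; the helper names here are
// illustrative assumptions and are not cSpell's.
using System;
using System.Collections.Generic;

public static class AddSplitSketch
{
    // appends a corrected token to the output list, expanding split corrections
    private static void AddCorrection(List<string> outTokens, string corrected)
    {
        // a split correction contains a space; emit one token per piece so the
        // output list stays flat, as the corrected TokenObj list does above
        foreach (string piece in corrected.Split(' '))
        {
            outTokens.Add(piece);
        }
    }

    public static void Main()
    {
        var outTokens = new List<string>();
        AddCorrection(outTokens, "diagnosis");  // 1-to-1 correction
        AddCorrection(outTokens, "of the");     // split correction
        Console.WriteLine(string.Join(" | ", outTokens));
        // prints: diagnosis | of | the
    }
}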