// public method
        // process
        /// <summary>
        /// Runs the "5. RealWord-Merge" step: scans the token list for merge
        /// corrections and applies all of them in a single pass at the end.
        /// </summary>
        /// <param name="inTokenList">All tokens of the text; their positions are refreshed via <c>TextObj.UpdateIndexPos</c>.</param>
        /// <param name="cSpellApi">API object supplying the maximum legit token length and passed on to the correctors.</param>
        /// <param name="debugFlag">Forwarded to the debug printers and the correctors.</param>
        /// <returns>The token list returned by <c>MergeCorrector.CorrectTokenListByMerge</c>.</returns>
        public static List <TokenObj> Process(List <TokenObj> inTokenList, CSpellApi cSpellApi, bool debugFlag)
        {
            DebugPrint.PrintProcess("5. RealWord-Merge", debugFlag);
            DebugPrint.PrintInText(TextObj.TokenListToText(inTokenList), debugFlag);

            // Pre-process: refresh the position stored on each token, then build
            // the non-space token list that the corrector uses.
            TextObj.UpdateIndexPos(inTokenList);
            List <TokenObj> nonSpaceTokens = TextObj.GetNonSpaceTokenObjList(inTokenList);

            // Collect merge corrections while walking the full token list.
            List <MergeObj> pendingMerges = new List <MergeObj>();
            int             lengthLimit   = cSpellApi.GetMaxLegitTokenLength();
            int             cursor        = 0;

            while (cursor < inTokenList.Count)
            {
                TokenObj candidate = inTokenList[cursor];

                // SCR-3: only legit tokens are examined; anything else is stepped over.
                if (!candidate.IsLegitToken(lengthLimit))
                {
                    cursor++;
                    continue;
                }

                // The corrector is addressed by the token's stored position.
                MergeObj merge = RealWordMergeCorrector.GetCorrectTerm(candidate.GetPos(), nonSpaceTokens, cSpellApi, debugFlag);
                if (merge == null)
                {
                    // No merge correction for this token.
                    cursor++;
                    continue;
                }

                pendingMerges.Add(merge);
                // Resume right after the merge's end token so merges never overlap.
                // NOTE(review): the loop only makes progress if GetEndIndex() + 1 is
                // greater than the current cursor - confirm against MergeObj.
                cursor = merge.GetEndIndex() + 1;
            }

            // Apply the merges only after the scan, because a merge may involve
            // tokens that precede the one that triggered it.
            return MergeCorrector.CorrectTokenListByMerge(inTokenList, pendingMerges, TokenObj.HIST_RW_M, debugFlag, cSpellApi);
        }
Пример #2
0
        /// <summary>
        /// Runs the "1. NonDictionary" step: sends every input token through the
        /// XML/HTML handler, the ending/leading punctuation splitters, the
        /// leading/ending digit splitters and finally the informal-expression handler.
        /// </summary>
        /// <param name="inTokenList">Tokens to process; enumerated once, in order.</param>
        /// <param name="ndMaxSplitNo">Passed to the ending and leading punctuation splitters.</param>
        /// <param name="infExpMap">Passed to <c>InformalExpHandler.Process</c>.</param>
        /// <param name="debugFlag">Forwarded to the debug printers and to every handler.</param>
        /// <returns>A new list holding the processed tokens.</returns>
        public static List <TokenObj> Process(List <TokenObj> inTokenList, int ndMaxSplitNo, Dictionary <string, string> infExpMap, bool debugFlag)
        {
            DebugPrint.PrintProcess("1. NonDictionary", debugFlag);
            DebugPrint.PrintInText(TextObj.TokenListToText(inTokenList), debugFlag);

            // After each splitter the result is flattened with
            // TextObj.FlatTokenToArrayList before it reaches the next stage.
            // ToList() already materializes a fresh List, so no extra copy is made.
            List <TokenObj> outTokenList = inTokenList
                                           .Select(token => XmlHtmlHandler.Process(token, debugFlag))
                                           .Select(token => EndingPuncSplitter.Process(token, ndMaxSplitNo, debugFlag))
                                           .SelectMany(token => TextObj.FlatTokenToArrayList(token))
                                           .Select(token => LeadingPuncSplitter.Process(token, ndMaxSplitNo, debugFlag))
                                           .SelectMany(token => TextObj.FlatTokenToArrayList(token))
                                           .Select(token => LeadingDigitSplitter.Process(token, debugFlag))
                                           .SelectMany(token => TextObj.FlatTokenToArrayList(token))
                                           .Select(token => EndingDigitSplitter.Process(token, debugFlag))
                                           .SelectMany(token => TextObj.FlatTokenToArrayList(token))
                                           .Select(token => InformalExpHandler.Process(token, infExpMap, debugFlag))
                                           .ToList();

            return outTokenList;
        }
Пример #3
0
        // public method
        // Use: for loop, the latest and greatest implementation
        // original implementation with for loop, To be deleted
        // the core of spell-correction, include split
        // inTokenList is the whole text
        /// <summary>
        /// Runs the "3-4. NonWord-Split &amp; 1To1" step: asks the non-word corrector
        /// for a replacement of every legit token and appends the results to a new list.
        /// </summary>
        /// <param name="inTokenList">All tokens of the text, including space tokens.</param>
        /// <param name="cSpellApi">API object supplying the maximum legit token length and passed on to the corrector.</param>
        /// <param name="debugFlag">Forwarded to the debug printers and the corrector.</param>
        /// <returns>A new list filled through <c>Split1To1Corrector.AddSplit1To1Correction</c>.</returns>
        public static List <TokenObj> Process(List <TokenObj> inTokenList, CSpellApi cSpellApi, bool debugFlag)
        {
            DebugPrint.PrintProcess("3-4. NonWord-Split & 1To1", debugFlag);
            DebugPrint.PrintInText(TextObj.TokenListToText(inTokenList), debugFlag);

            List <TokenObj> correctedTokens = new List <TokenObj>();
            // Context list without space tokens; the loop itself walks inTokenList
            // so that the original space tokens are carried over unchanged.
            List <TokenObj> contextTokens = TextObj.GetNonSpaceTokenObjList(inTokenList);
            int             lengthLimit   = cSpellApi.GetMaxLegitTokenLength();
            // Position handed to the corrector for its context module.
            // NOTE(review): this counter advances only for legit tokens, while
            // contextTokens is described as holding every non-space token; if an
            // over-long non-space token is not legit, the two may drift apart - confirm.
            int contextPos = 0;

            foreach (TokenObj current in inTokenList)
            {
                // Context-free alternative that was considered earlier:
                //   TokenObj outTokenObj = SpellCorrector.GetCorrectTerm(tokenObj,
                //      cSpellApi, debugFlag);

                // Non-legit tokens (SCR-3: space tokens and long tokens) pass through as-is.
                TokenObj replacement = current;
                if (current.IsLegitToken(lengthLimit))
                {
                    // The corrector returns the highest ranked candidate.
                    replacement = NonWordCorrector.GetCorrectTerm(current, cSpellApi, debugFlag, contextPos, contextTokens);
                    contextPos++;
                }

                // Appended through the helper because a correction might be a split.
                Split1To1Corrector.AddSplit1To1Correction(correctedTokens, replacement);
            }

            return correctedTokens;
        }