/// <summary>
/// Console test routine: tokenizes a fixed sample text, passes every token through
/// XmlHtmlHandler.Process and then LeadingPuncSplitter.Process, and prints the
/// rebuilt text followed by the history string of each output token.
/// </summary>
private static void TestProcessText()
        {
            // init
            Console.WriteLine("----- Test Process Text: -----");
            // The double quote inside the sample (5'8") has to be escaped; left
            // unescaped it terminates the literal early and the method does not compile.
            string inText = "Head rolling & rock(5'8\").";
            int maxRecursive = 5;

            // test process: tokenize, then apply both handlers to each token in order
            List<TokenObj> inTokenList = TextObj.TextToTokenList(inText);
            List<TokenObj> outTokenList = inTokenList
                .Select(token => XmlHtmlHandler.Process(token))
                .Select(token => LeadingPuncSplitter.Process(token, maxRecursive))
                .ToList();

            // result
            string outText = TextObj.TokenListToText(outTokenList);

            // print out
            Console.WriteLine("--------- LeadingPuncSplitter( ) Test -----------");
            Console.WriteLine("In: [" + inText + "]");
            Console.WriteLine("Out: [" + outText + "]");
            Console.WriteLine("----- Details -----------");
            int index = 0;

            foreach (TokenObj tokenObj in outTokenList)
            {
                Console.WriteLine(index + "|" + tokenObj.ToHistString());
                index++;
            }
        }
        /// <summary>
        /// Console test routine: tokenizes a fixed sample text and runs the tokens through
        /// XmlHtmlHandler, LeadingPuncSplitter, LeadingDigitSplitter and InformalExpHandler
        /// (flattening after each splitter), then prints the rebuilt text and the history
        /// string of each output token.
        /// </summary>
        /// <param name="informalExpMap">Map passed unchanged to InformalExpHandler.Process.</param>
        private static void TestProcess(Dictionary<string, string> informalExpMap)
        {
            Console.WriteLine("----- Test Process Text: -----");
            string sampleText = "u rolling &amp;amp; pls(12years).";

            // Build the pipeline stage by stage; nothing executes until ToList() below.
            IEnumerable<TokenObj> afterXmlHtml = TextObj.TextToTokenList(sampleText)
                .Select(t => XmlHtmlHandler.Process(t));
            IEnumerable<TokenObj> afterLeadingPunc = afterXmlHtml
                .Select(t => LeadingPuncSplitter.Process(t))
                .SelectMany(t => TextObj.FlatTokenToArrayList(t));
            IEnumerable<TokenObj> afterLeadingDigit = afterLeadingPunc
                .Select(t => LeadingDigitSplitter.Process(t))
                .SelectMany(t => TextObj.FlatTokenToArrayList(t));
            List<TokenObj> resultTokens = afterLeadingDigit
                .Select(t => InformalExpHandler.Process(t, informalExpMap))
                .ToList();

            string resultText = TextObj.TokenListToText(resultTokens);

            // Report
            Console.WriteLine("--------- LeadingDigitSplitter( ) Test ----------");
            Console.WriteLine($"In: [{sampleText}]");
            Console.WriteLine($"Out: [{resultText}]");
            Console.WriteLine("----- Details -----------");
            for (int i = 0; i < resultTokens.Count; i++)
            {
                Console.WriteLine(i + "|" + resultTokens[i].ToHistString());
            }
        }
        // private methods
        /// <summary>
        /// Console test routine: builds a CSpellApi from <paramref name="configFile"/>, runs
        /// ProcNdCorrector.Process over a fixed sample text with debugging off, and prints the
        /// rebuilt text plus the history string of each output token.
        /// </summary>
        /// <param name="configFile">Configuration file handed to the CSpellApi constructor.</param>
        private static void Test(string configFile)
        {
            Console.WriteLine("----- Test Pre-Correction Text: -----");
            string sampleText = "We  cant theredve hell.Plz u r good123. ";

            // Settings are read from the API object before any token work starts.
            CSpellApi api = new CSpellApi(configFile);
            int maxSplitNo = api.GetCanNdMaxSplitNo();
            Dictionary<string, string> informalMap = api.GetInformalExpressionMap();

            // Text -> tokens -> corrected tokens -> text
            List<TokenObj> sourceTokens = TextObj.TextToTokenList(sampleText);
            List<TokenObj> correctedTokens = ProcNdCorrector.Process(sourceTokens, maxSplitNo, informalMap, false);
            string correctedText = TextObj.TokenListToText(correctedTokens);

            // Report
            Console.WriteLine("--------------------");
            Console.WriteLine($"In: [{sampleText}]");
            Console.WriteLine($"Out: [{correctedText}]");
            Console.WriteLine("----- Details -----------");
            for (int i = 0; i < correctedTokens.Count; i++)
            {
                Console.WriteLine(i + "|" + correctedTokens[i].ToHistString());
            }
        }
        // use TextObj (instead of TextIoObj)
        /// <summary>
        /// String-in / string-out wrapper: runs the text overload of Process with the given
        /// arguments and converts the resulting token list back to text.
        /// </summary>
        /// <returns>The text produced by TextObj.TokenListToText for the processed tokens.</returns>
        public static string ProcessByStr(string inText, int maxSpRecursiveNo, Dictionary<string, string> infExpMap, bool debugFlag)
        {
            List<TokenObj> processedTokens = Process(inText, maxSpRecursiveNo, infExpMap, debugFlag);

            return TextObj.TokenListToText(processedTokens);
        }
示例#5
0
        /// <summary>
        /// Tokenizes <paramref name="inText"/>, runs the tokens through ProcessByTokenObj, and
        /// returns both the rebuilt text and the processed token list.
        /// </summary>
        /// <returns>A tuple of (output text, output token list).</returns>
        public static (string, List<TokenObj>) ProcessToStrExt(string inText, CSpellApi cSpellApi, bool debugFlag)
        {
            List<TokenObj> processedTokens = ProcessByTokenObj(TextObj.TextToTokenList(inText), cSpellApi, debugFlag);

            return (TextObj.TokenListToText(processedTokens), processedTokens);
        }
        // public method
        // process
        /// <summary>
        /// Real-word merge step: walks <paramref name="inTokenList"/>, asks
        /// RealWordMergeCorrector for a merge at every legit token, collects the merges found,
        /// and applies them in one pass through MergeCorrector.CorrectTokenListByMerge.
        /// </summary>
        /// <param name="inTokenList">Tokens of the text; their positions are refreshed via TextObj.UpdateIndexPos.</param>
        /// <param name="cSpellApi">Supplies the max legit token length and is forwarded to the correctors.</param>
        /// <param name="debugFlag">Forwarded to the debug printers and correctors.</param>
        /// <returns>The token list returned by MergeCorrector.CorrectTokenListByMerge.</returns>
        public static List<TokenObj> Process(List<TokenObj> inTokenList, CSpellApi cSpellApi, bool debugFlag)
        {
            DebugPrint.PrintProcess("5. RealWord-Merge", debugFlag);
            DebugPrint.PrintInText(TextObj.TokenListToText(inTokenList), debugFlag);

            // Pre-process: refresh the position stored on every input token,
            // then build the list without space tokens used for context lookups.
            TextObj.UpdateIndexPos(inTokenList);
            List<TokenObj> nonSpaceTokens = TextObj.GetNonSpaceTokenObjList(inTokenList);

            var foundMerges = new List<MergeObj>();
            int maxLegitTokenLength = cSpellApi.GetMaxLegitTokenLength();

            // Detection pass: only collect merges here. The list is rewritten after the
            // loop because a merge can also involve tokens before the current one.
            int cursor = 0;
            while (cursor < inTokenList.Count)
            {
                TokenObj current = inTokenList[cursor];

                // SCR-3: tokens that are not legit (e.g. space tokens) are skipped.
                if (!current.IsLegitToken(maxLegitTokenLength))
                {
                    cursor++;
                    continue;
                }

                // The correct term is the highest ranked candidate.
                MergeObj merge = RealWordMergeCorrector.GetCorrectTerm(current.GetPos(), nonSpaceTokens, cSpellApi, debugFlag);
                if (merge == null)
                {
                    // no merge correction at this token
                    cursor++;
                    continue;
                }

                foundMerges.Add(merge);
                // Resume after the merge's end token so that merges never overlap.
                // NOTE(review): assumes GetEndIndex() is an index into inTokenList that is
                // not smaller than cursor — confirm against MergeObj.
                cursor = merge.GetEndIndex() + 1;
            }

            // Apply all merges to the whole list; operation info is updated there too.
            return MergeCorrector.CorrectTokenListByMerge(inTokenList, foundMerges, TokenObj.HIST_RW_M, debugFlag, cSpellApi);
        }
        /// <summary>
        /// Applies Process repeatedly: first to <paramref name="inWord"/> as a whole, then token
        /// by token on each intermediate result, until the text stops changing or
        /// <paramref name="maxProcess"/> further rounds have been run.
        /// </summary>
        /// <param name="inWord">Text to process.</param>
        /// <param name="maxProcess">Upper bound on the number of re-processing rounds after the first call.</param>
        /// <returns>The text after the last round.</returns>
        public static string Process(string inWord, int maxProcess)
        {
            string previous = inWord;
            string current = Process(inWord);

            // The budget check comes first, so Equals is not evaluated once the budget is used up.
            for (int roundsLeft = maxProcess; roundsLeft > 0 && !current.Equals(previous); roundsLeft--)
            {
                previous = current;

                // Re-tokenize the last result and process every token on its own.
                List<TokenObj> tokens = TextObj.TextToTokenList(previous);
                List<TokenObj> processed = tokens.Select(token => Process(token)).ToList();

                current = TextObj.TokenListToText(processed);
            }

            return current;
        }
        /// <summary>
        /// Non-dictionary step: sends every token through XmlHtmlHandler, EndingPuncSplitter,
        /// LeadingPuncSplitter, LeadingDigitSplitter, EndingDigitSplitter and
        /// InformalExpHandler, in that order, flattening after each splitter.
        /// </summary>
        /// <param name="inTokenList">Tokens to process.</param>
        /// <param name="ndMaxSplitNo">Limit passed to the two punctuation splitters.</param>
        /// <param name="infExpMap">Map passed to InformalExpHandler.Process.</param>
        /// <param name="debugFlag">Forwarded to the debug printers and to every handler.</param>
        /// <returns>A new list holding the processed tokens.</returns>
        public static List<TokenObj> Process(List<TokenObj> inTokenList, int ndMaxSplitNo, Dictionary<string, string> infExpMap, bool debugFlag)
        {
            DebugPrint.PrintProcess("1. NonDictionary", debugFlag);
            DebugPrint.PrintInText(TextObj.TokenListToText(inTokenList), debugFlag);

            // The stages below are lazy; tokens stream through all of them when ToList() runs.
            // Each splitter is followed by FlatTokenToArrayList + SelectMany.
            // NOTE(review): presumably a splitter result can stand for several tokens that
            // FlatTokenToArrayList expands — confirm against TextObj.
            IEnumerable<TokenObj> xmlHtmlStage = inTokenList
                .Select(t => XmlHtmlHandler.Process(t, debugFlag));

            IEnumerable<TokenObj> endingPuncStage = xmlHtmlStage
                .Select(t => EndingPuncSplitter.Process(t, ndMaxSplitNo, debugFlag))
                .SelectMany(t => TextObj.FlatTokenToArrayList(t));

            IEnumerable<TokenObj> leadingPuncStage = endingPuncStage
                .Select(t => LeadingPuncSplitter.Process(t, ndMaxSplitNo, debugFlag))
                .SelectMany(t => TextObj.FlatTokenToArrayList(t));

            IEnumerable<TokenObj> leadingDigitStage = leadingPuncStage
                .Select(t => LeadingDigitSplitter.Process(t, debugFlag))
                .SelectMany(t => TextObj.FlatTokenToArrayList(t));

            IEnumerable<TokenObj> endingDigitStage = leadingDigitStage
                .Select(t => EndingDigitSplitter.Process(t, debugFlag))
                .SelectMany(t => TextObj.FlatTokenToArrayList(t));

            return endingDigitStage
                .Select(t => InformalExpHandler.Process(t, infExpMap, debugFlag))
                .ToList();
        }
示例#9
0
        // public method
        // Use: for loop, the latest and greatest implementation
        // original implementation with for loop, To be deleted
        // the core of spell-correction, include split
        // inTokenList is the whole text
        /// <summary>
        /// Non-word split and 1-to-1 step: asks NonWordCorrector for a correction of every
        /// legit token, passes all other tokens through unchanged, and appends each result
        /// to the output via Split1To1Corrector.AddSplit1To1Correction.
        /// </summary>
        /// <param name="inTokenList">Tokens of the whole text.</param>
        /// <param name="cSpellApi">Supplies the max legit token length and is forwarded to the corrector.</param>
        /// <param name="debugFlag">Forwarded to the debug printers and the corrector.</param>
        /// <returns>A new list holding the output tokens.</returns>
        public static List<TokenObj> Process(List<TokenObj> inTokenList, CSpellApi cSpellApi, bool debugFlag)
        {
            DebugPrint.PrintProcess("3-4. NonWord-Split & 1To1", debugFlag);
            DebugPrint.PrintInText(TextObj.TokenListToText(inTokenList), debugFlag);

            var correctedTokens = new List<TokenObj>();

            // Counter handed to the corrector as the target position for its context
            // module; it advances only on legit tokens.
            int contextPos = 0;
            // List with the space tokens removed, passed to the corrector alongside contextPos.
            List<TokenObj> nonSpaceTokens = TextObj.GetNonSpaceTokenObjList(inTokenList);
            int maxLegitTokenLength = cSpellApi.GetMaxLegitTokenLength();

            // Iterate over the full input list so the space tokens are kept as they are.
            foreach (TokenObj token in inTokenList)
            {
                // Default: pass the token through (empty/space tokens and long tokens).
                TokenObj corrected = token;

                // SCR-3: only legit tokens are sent to the corrector.
                // (A context-free alternative would be SpellCorrector.GetCorrectTerm(token, cSpellApi, debugFlag).)
                if (token.IsLegitToken(maxLegitTokenLength))
                {
                    // The correct term is the highest ranked candidate.
                    corrected = NonWordCorrector.GetCorrectTerm(token, cSpellApi, debugFlag, contextPos, nonSpaceTokens);
                    contextPos++;
                }

                // Flat-map style add, because a correction might be a split.
                Split1To1Corrector.AddSplit1To1Correction(correctedTokens, corrected);
            }

            return correctedTokens;
        }
        /// <summary>
        /// Console test routine: runs Process on a fixed all-lowercase sample with debugging
        /// off and prints the input, the output and the operation details.
        /// </summary>
        /// <param name="cSpellApi">API object forwarded to Process.</param>
        private static void TestProcess(CSpellApi cSpellApi)
        {
            // Sample is all lower case.
            string sampleText = "hotflashes and knowaboutare not forr playsure.";
            List<TokenObj> sourceTokens = TextObj.TextToTokenList(sampleText);

            List<TokenObj> correctedTokens = Process(sourceTokens, cSpellApi, false);
            string correctedText = TextObj.TokenListToText(correctedTokens);

            // Report
            Console.WriteLine("------ GetCorrection by Process( ) ------");
            Console.WriteLine($"In: [{sampleText}]");
            Console.WriteLine($"Out: [{correctedText}]");
            Console.WriteLine("----- Details -----------");
            string operationDetails = TextObj.TokenListToOperationDetailStr(correctedTokens);
            Console.WriteLine(operationDetails);
        }
示例#11
0
        /// <summary>
        /// Console test routine: runs Process on a fixed sample containing words broken up
        /// by spaces, with debugging off, and prints the input, the output and the
        /// operation details.
        /// </summary>
        /// <param name="cSpellApi">API object forwarded to Process.</param>
        private static void TestProcess(CSpellApi cSpellApi)
        {
            string sampleText = "She had problems dur ing her pregnancies. That is a dis appoint ment. Good!";
            List<TokenObj> sourceTokens = TextObj.TextToTokenList(sampleText);

            List<TokenObj> correctedTokens = Process(sourceTokens, cSpellApi, false);
            string correctedText = TextObj.TokenListToText(correctedTokens);

            // Report
            Console.WriteLine("------ GetCorrection by Process( ) ------");
            Console.WriteLine($"In: [{sampleText}]");
            Console.WriteLine($"Out: [{correctedText}]");
            Console.WriteLine("----- Details -----------");
            string operationDetails = TextObj.TokenListToOperationDetailStr(correctedTokens);
            Console.WriteLine(operationDetails);
        }
示例#12
0
        // private method
        /// <summary>
        /// Console test routine: runs ProcessByTokenObj on a fixed all-lowercase sample meant to
        /// exercise non-word, one-to-one, split and merge correction, with debugging off, and
        /// prints the input, the output and the operation details.
        /// </summary>
        /// <param name="cSpellApi">API object forwarded to ProcessByTokenObj.</param>
        private static void TestProcess(CSpellApi cSpellApi)
        {
            string sampleText = "hotflashes and knowaboutare not forr playsure dur ing my disa ppoint ment.";
            List<TokenObj> sourceTokens = TextObj.TextToTokenList(sampleText);

            List<TokenObj> correctedTokens = ProcessByTokenObj(sourceTokens, cSpellApi, false);
            string correctedText = TextObj.TokenListToText(correctedTokens);

            // Report
            Console.WriteLine("------ GetCorrection by Process( ) ------");
            Console.WriteLine($"In: [{sampleText}]");
            Console.WriteLine($"Out: [{correctedText}]");
            Console.WriteLine("----- Details -----------");
            string operationDetails = TextObj.TokenListToOperationDetailStr(correctedTokens);
            Console.WriteLine(operationDetails);
        }
示例#13
0
        // private methods
        /// <summary>
        /// Console test routine: runs ProcessByTokenObj on a fixed sample text and prints the
        /// input, the output and the ToString() form of each output token.
        /// </summary>
        /// <param name="cSpellApi">API object forwarded to ProcessByTokenObj.</param>
        private static void Test(CSpellApi cSpellApi)
        {
            Console.WriteLine("----- Test Pre-Correction Text: -----");
            string sampleText = "We cant spel ACHindex 987Pfimbria dianosed.";

            // Text -> tokens -> corrected tokens -> text
            List<TokenObj> sourceTokens = TextObj.TextToTokenList(sampleText);
            List<TokenObj> correctedTokens = ProcessByTokenObj(sourceTokens, cSpellApi);
            string correctedText = TextObj.TokenListToText(correctedTokens);

            // Report
            Console.WriteLine("--------------------");
            Console.WriteLine($"In: [{sampleText}]");
            Console.WriteLine($"Out: [{correctedText}]");
            Console.WriteLine("----- Details -----------");
            for (int i = 0; i < correctedTokens.Count; i++)
            {
                Console.WriteLine(i + "|" + correctedTokens[i].ToString());
            }
        }