/// <summary>
/// Demo driver: sends a sample sentence through XmlHtmlHandler, LeadingPuncSplitter,
/// LeadingDigitSplitter and InformalExpHandler (in that order), then prints the
/// input text, the output text and the history string of every output token.
/// </summary>
/// <param name="informalExpMap"> map handed to InformalExpHandler.Process for each token </param>
private static void TestProcess(Dictionary <string, string> informalExpMap)
        {
            Console.WriteLine("----- Test Process Text: -----");
            string sampleText = "u rolling &amp;amp; pls(12years).";

            // Each stage is a deferred LINQ query; nothing runs until the final list is built.
            var decoded = TextObj.TextToTokenList(sampleText)
                          .Select(t => XmlHtmlHandler.Process(t));
            var puncSplit = decoded
                            .Select(t => LeadingPuncSplitter.Process(t))
                            .SelectMany(t => TextObj.FlatTokenToArrayList(t));
            var digitSplit = puncSplit
                             .Select(t => LeadingDigitSplitter.Process(t))
                             .SelectMany(t => TextObj.FlatTokenToArrayList(t));
            var expanded = digitSplit
                           .Select(t => InformalExpHandler.Process(t, informalExpMap));

            List <TokenObj> processed = new List <TokenObj>(expanded);
            string processedText = TextObj.TokenListToText(processed);

            // report
            Console.WriteLine("--------- LeadingDigitSplitter( ) Test ----------");
            Console.WriteLine("In: [" + sampleText + "]");
            Console.WriteLine("Out: [" + processedText + "]");
            Console.WriteLine("----- Details -----------");
            for (int i = 0; i < processed.Count; i++)
            {
                Console.WriteLine(i + "|" + processed[i].ToHistString());
            }
        }
        // private methods
        /// <summary>
        /// Demo driver: loads a CSpellApi from the given configuration file, runs
        /// ProcNdCorrector.Process on a sample sentence and prints the input text, the
        /// output text and the history string of every output token.
        /// </summary>
        /// <param name="configFile"> path passed to the CSpellApi constructor </param>
        private static void Test(string configFile)
        {
            const bool debugFlag = false;

            Console.WriteLine("----- Test Pre-Correction Text: -----");
            string sampleText = "We  cant theredve hell.Plz u r good123. ";

            // settings read from the api
            CSpellApi api = new CSpellApi(configFile);
            int maxSplitNo = api.GetCanNdMaxSplitNo();
            Dictionary <string, string> informalMap = api.GetInformalExpressionMap();

            // text -> tokens -> corrected tokens -> text
            List <TokenObj> corrected = ProcNdCorrector.Process(
                TextObj.TextToTokenList(sampleText), maxSplitNo, informalMap, debugFlag);
            string correctedText = TextObj.TokenListToText(corrected);

            // report
            Console.WriteLine("--------------------");
            Console.WriteLine("In: [" + sampleText + "]");
            Console.WriteLine("Out: [" + correctedText + "]");
            Console.WriteLine("----- Details -----------");
            for (int i = 0; i < corrected.Count; i++)
            {
                Console.WriteLine(i + "|" + corrected[i].ToHistString());
            }
        }
示例#3
0
        // public methods
        // private methods
        /// <summary>
        /// Demo driver: builds a hand-written MergeObj for the sample sentence, prints it,
        /// then prints what GetNonMergeTerm returns for it on the non-space token list.
        /// </summary>
        private static void Test()
        {
            // positions in the full token list (spaces included)
            const int startIndex = 4;
            const int tarIndex   = 6;
            const int endIndex   = 6;
            // positions in the non-space token list
            const int startPos = 2;
            const int tarPos   = 3;
            const int endPos   = 3;
            // total no of merged tokens
            const int mergeNo = 1;

            // words involved in the merge
            const string tarWord       = "gnosed";      // target term
            const string orgMergeWord  = "dia gnosed";  // original words before the merge
            const string mergeWord     = "diagnosed.";  // suggested merged term
            const string coreMergeWord = "diagnosed";   // core of the suggested merged term

            MergeObj merge = new MergeObj(tarWord, orgMergeWord, mergeWord, coreMergeWord, mergeNo,
                                          startIndex, tarIndex, endIndex, startPos, tarPos, endPos);

            string sampleText = "He is dia gnosed last week.";
            List <TokenObj> nonSpaceTokens =
                TextObj.GetNonSpaceTokenObjList(TextObj.TextToTokenList(sampleText));

            Console.WriteLine("------ Merge Obj -------");
            Console.WriteLine(merge.ToString());
            Console.WriteLine("------ Non Merge Term -------");
            string nonMergeTerm = GetNonMergeTerm(merge, nonSpaceTokens);

            Console.WriteLine("- inText: [" + sampleText + "]");
            Console.WriteLine("- nonMergeTerm: [" + nonMergeTerm + "]");
        }
        /// <summary>
        /// Convenience overload of the pre-correction process with debug printing turned off.
        /// </summary>
        /// <param name="inText"> input text </param>
        /// <param name="maxSpRecursiveNo"> forwarded unchanged to the four-argument overload </param>
        /// <param name="infExpMap"> forwarded unchanged to the four-argument overload </param>
        /// <returns> the token list produced by the four-argument overload </returns>
        public static List <TokenObj> Process(string inText, int maxSpRecursiveNo, Dictionary <string, string> infExpMap)
        {
            // The four-argument overload tokenises inText itself, so no token list is built here.
            bool debugFlag = false;

            return(Process(inText, maxSpRecursiveNo, infExpMap, debugFlag));
        }
        /// <summary>
        /// Demo driver: runs a sample sentence through XmlHtmlHandler and then
        /// LeadingPuncSplitter (with a recursion limit of 5) and prints the input text,
        /// the output text and the history string of every output token.
        /// </summary>
        private static void TestProcessText()
        {
            const int maxRecursive = 5;

            Console.WriteLine("----- Test Process Text: -----");
            string sampleText = "Head rolling &amp;amp; rock(5'8&quot;).";

            // deferred pipeline: decode XML/HTML first, then split leading punctuation
            var pipeline = TextObj.TextToTokenList(sampleText)
                           .Select(t => XmlHtmlHandler.Process(t))
                           .Select(t => LeadingPuncSplitter.Process(t, maxRecursive));
            List <TokenObj> processed = new List <TokenObj>(pipeline);
            string processedText = TextObj.TokenListToText(processed);

            // report
            Console.WriteLine("--------- LeadingPuncSplitter( ) Test -----------");
            Console.WriteLine("In: [" + sampleText + "]");
            Console.WriteLine("Out: [" + processedText + "]");
            Console.WriteLine("----- Details -----------");
            for (int i = 0; i < processed.Count; i++)
            {
                Console.WriteLine(i + "|" + processed[i].ToHistString());
            }
        }
        /// <summary>
        /// Core of the pre-correction api: tokenises the text and hands the tokens to
        /// ProcNdCorrector.Process.
        /// </summary>
        /// <param name="inText"> input text </param>
        /// <param name="maxSpRecursiveNo"> forwarded unchanged to ProcNdCorrector.Process </param>
        /// <param name="infExpMap"> forwarded unchanged to ProcNdCorrector.Process </param>
        /// <param name="debugFlag"> forwarded unchanged to ProcNdCorrector.Process </param>
        /// <returns> the token list returned by ProcNdCorrector.Process </returns>
        public static List <TokenObj> Process(string inText, int maxSpRecursiveNo, Dictionary <string, string> infExpMap, bool debugFlag)
        {
            return ProcNdCorrector.Process(
                TextObj.TextToTokenList(inText),   // 1. input text -> tokens
                maxSpRecursiveNo,                  // 2. settings for the per-token processing
                infExpMap,
                debugFlag);
        }
示例#7
0
        /// <summary>
        /// cSpell correction process that returns the result as a list of TokenObj.
        /// The current funcMode_ and rankMode_ are written to the debug output, then the
        /// correction is delegated to CorrectionApi.ProcessByTokenObj with this instance.
        /// </summary>
        /// <param name="inText">   input text to be corrected </param>
        /// <param name="debugFlag"> boolean flag for debug print </param>
        /// <returns>  the list of TokenObj returned by CorrectionApi.ProcessByTokenObj </returns>
        public virtual List <TokenObj> ProcessToTokenObj(string inText, bool debugFlag)
        {
            string banner = "====== SpellApi.Process( ), funcMode: " + funcMode_
                            + ", rankMode: " + rankMode_ + " ======";
            DebugPrint.Println(banner, debugFlag);

            // tokenise, then run the correction on the tokens
            List <TokenObj> tokens = TextObj.TextToTokenList(inText);

            return CorrectionApi.ProcessByTokenObj(tokens, this, debugFlag);
        }
示例#8
0
        /// <summary>
        /// Corrects a text and returns both the corrected text and the corrected tokens.
        /// </summary>
        /// <param name="inText"> input text </param>
        /// <param name="cSpellApi"> api object forwarded to ProcessByTokenObj </param>
        /// <param name="debugFlag"> forwarded to ProcessByTokenObj </param>
        /// <returns> tuple of (text rebuilt from the output tokens, output tokens) </returns>
        public static (string, List <TokenObj>) ProcessToStrExt(string inText, CSpellApi cSpellApi, bool debugFlag)
        {
            // text -> tokens -> corrected tokens
            List <TokenObj> corrected = ProcessByTokenObj(TextObj.TextToTokenList(inText), cSpellApi, debugFlag);

            // corrected tokens -> text, returned together with the tokens
            return (TextObj.TokenListToText(corrected), corrected);
        }
        /// <summary>
        /// Applies Process repeatedly until the text stops changing or the pass budget
        /// is used up.
        /// </summary>
        /// <param name="inWord"> input text </param>
        /// <param name="maxProcess"> maximum number of additional passes after the first one </param>
        /// <returns> the text produced by the last pass </returns>
        public static string Process(string inWord, int maxProcess)
        {
            string previous = inWord;
            string current  = Process(inWord);

            // one extra pass per iteration, stopping as soon as a pass changes nothing
            for (; maxProcess > 0 && !current.Equals(previous); maxProcess--)
            {
                previous = current;
                // re-tokenise the last result and process every token again
                var reprocessed = TextObj.TextToTokenList(previous).Select(tok => Process(tok));
                current = TextObj.TokenListToText(new List <TokenObj>(reprocessed));
            }
            return current;
        }
示例#10
0
        /// <summary>
        /// Demo driver: asks NonWordMergeCorrector.GetCorrectTerm for the merge at the first
        /// non-space token of a sample sentence and prints the result.
        /// </summary>
        /// <param name="cSpellApi"> api object forwarded to GetCorrectTerm </param>
        private static void TestGetCorrectTerm(CSpellApi cSpellApi)
        {
            const int  firstPos  = 0;
            const bool debugFlag = false;
            string sampleText = "Dur ing my absent.";

            // tokenise and drop the space tokens
            List <TokenObj> nonSpaceTokens =
                TextObj.GetNonSpaceTokenObjList(TextObj.TextToTokenList(sampleText));
            MergeObj merge = NonWordMergeCorrector.GetCorrectTerm(firstPos, nonSpaceTokens, cSpellApi, debugFlag);

            // report
            Console.WriteLine("--------- GetCorrectTerm( ) -----------");
            Console.WriteLine("In: [" + sampleText + "]");
            Console.WriteLine("In nonSpaceTokenList: [" + nonSpaceTokens.Count + "]");
            Console.WriteLine("Out MergeObj: [" + merge.ToString() + "]");
        }
        /// <summary>
        /// Demo driver: runs Process on a sample sentence and prints the input text, the
        /// output text and the operation-detail string of the output tokens.
        /// </summary>
        /// <param name="cSpellApi"> api object forwarded to Process </param>
        private static void TestProcess(CSpellApi cSpellApi)
        {
            const bool debugFlag = false;
            string sampleText = "hotflashes and knowaboutare not forr playsure.";

            // text -> tokens -> corrected tokens -> text
            List <TokenObj> corrected = Process(TextObj.TextToTokenList(sampleText), cSpellApi, debugFlag);
            string correctedText = TextObj.TokenListToText(corrected);

            // report, ending with the per-token operation details
            Console.WriteLine("------ GetCorrection by Process( ) ------");
            Console.WriteLine("In: [" + sampleText + "]");
            Console.WriteLine("Out: [" + correctedText + "]");
            Console.WriteLine("----- Details -----------");
            Console.WriteLine(TextObj.TokenListToOperationDetailStr(corrected));
        }
示例#12
0
        /// <summary>
        /// Demo driver: runs Process on a sample text containing words broken up by spaces
        /// and prints the input text, the output text and the operation-detail string of
        /// the output tokens.
        /// </summary>
        /// <param name="cSpellApi"> api object forwarded to Process </param>
        private static void TestProcess(CSpellApi cSpellApi)
        {
            string sampleText = "She had problems dur ing her pregnancies. That is a dis appoint ment. Good!";
            List <TokenObj> tokens = TextObj.TextToTokenList(sampleText);

            // debug printing is switched off for the demo
            List <TokenObj> corrected = Process(tokens, cSpellApi, false);
            string correctedText = TextObj.TokenListToText(corrected);
            string[] report =
            {
                "------ GetCorrection by Process( ) ------",
                "In: [" + sampleText + "]",
                "Out: [" + correctedText + "]",
                "----- Details -----------"
            };

            foreach (string line in report)
            {
                Console.WriteLine(line);
            }
            // operation details of every output token
            Console.WriteLine(TextObj.TokenListToOperationDetailStr(corrected));
        }
示例#13
0
        // private method
        /// <summary>
        /// Demo driver: runs ProcessByTokenObj on a sample sentence meant to exercise
        /// non-word, one-to-one, split and merge correction, then prints the input text,
        /// the output text and the operation-detail string of the output tokens.
        /// </summary>
        /// <param name="cSpellApi"> api object forwarded to ProcessByTokenObj </param>
        private static void TestProcess(CSpellApi cSpellApi)
        {
            const bool debugFlag = false;
            string sampleText = "hotflashes and knowaboutare not forr playsure dur ing my disa ppoint ment.";

            // text -> tokens -> corrected tokens
            List <TokenObj> sourceTokens = TextObj.TextToTokenList(sampleText);
            List <TokenObj> corrected    = ProcessByTokenObj(sourceTokens, cSpellApi, debugFlag);

            // corrected tokens -> text
            string correctedText = TextObj.TokenListToText(corrected);

            // report, ending with the per-token operation details
            Console.WriteLine("------ GetCorrection by Process( ) ------");
            Console.WriteLine("In: [" + sampleText + "]");
            Console.WriteLine("Out: [" + correctedText + "]");
            Console.WriteLine("----- Details -----------");
            Console.WriteLine(TextObj.TokenListToOperationDetailStr(corrected));
        }
示例#14
0
        // private methods
        /// <summary>
        /// Demo driver: runs the two-argument ProcessByTokenObj on a sample sentence and
        /// prints the input text, the output text and the string form of every output token.
        /// </summary>
        /// <param name="cSpellApi"> api object forwarded to ProcessByTokenObj </param>
        private static void Test(CSpellApi cSpellApi)
        {
            Console.WriteLine("----- Test Pre-Correction Text: -----");
            string sampleText = "We cant spel ACHindex 987Pfimbria dianosed.";

            // text -> tokens -> corrected tokens -> text
            List <TokenObj> corrected = ProcessByTokenObj(TextObj.TextToTokenList(sampleText), cSpellApi);
            string correctedText = TextObj.TokenListToText(corrected);

            // report
            Console.WriteLine("--------------------");
            Console.WriteLine("In: [" + sampleText + "]");
            Console.WriteLine("Out: [" + correctedText + "]");
            Console.WriteLine("----- Details -----------");
            for (int i = 0; i < corrected.Count; i++)
            {
                Console.WriteLine(i + "|" + corrected[i].ToString());
            }
        }
示例#15
0
        // private methods
        /// <summary>
        /// Demo driver for the merge candidates: loads a CSpellApi from a fixed configuration
        /// path, prints the tokens of a sample sentence, then prints every MergeObj that
        /// GetCandidates returns for target position 4 of the non-space token list.
        /// </summary>
        private static void Test()
        {
            const int tarPos = 4;

            // init cSpellApi
            CSpellApi api = new CSpellApi("../data/Config/cSpell.properties");

            Console.WriteLine("===== Unit Test of MergeCandidates =====");
            // Alternative sample texts used previously:
            //   "He was dia gnosed  early onset deminita 3 year ago."
            //   "I have seven live births with no problems dur ing my pregnancies. That is a dis appoint ment" (from 73.txt)
            string sampleText = "That is a disa ppoint ment.";
            List <TokenObj> allTokens = TextObj.TextToTokenList(sampleText);
            string joinedTokens = String.Join("|", allTokens.Select(tok => tok.GetTokenStr()));

            Console.WriteLine(" - inTextList (" + allTokens.Count + "): [" + joinedTokens + "]");
            Console.WriteLine("-------------------------");
            allTokens.ForEach(tok => Console.WriteLine(tok.ToString()));

            Console.WriteLine("-------------------------");
            Console.WriteLine("- tarPos: " + tarPos);
            Console.WriteLine("- maxMergeNo: " + api.GetCanNwMaxMergeNo());
            Console.WriteLine("------ merge set -------");

            // candidates are looked up on the token list with the space tokens removed
            List <TokenObj> nonSpaceTokens = TextObj.GetNonSpaceTokenObjList(allTokens);
            HashSet <MergeObj> candidates = GetCandidates(tarPos, nonSpaceTokens, api);

            // print out
            foreach (MergeObj candidate in candidates)
            {
                Console.WriteLine(candidate.ToString());
            }
        }
示例#16
0
        /// <summary>
        /// Demo driver for GetContext: prints the context of every non-space token of a
        /// sample sentence three times: radius 3 without skipping, radius 3 with skipping,
        /// and with skipping through the overload that takes no radius.
        /// </summary>
        /// <param name="w2vIm"> word2Vec object forwarded to GetContext </param>
        /// <param name="w2vOm"> not used by this driver </param>
        private static void Tests(Word2Vec w2vIm, Word2Vec w2vOm)
        {
            string          inText     = "... last 10 years #$% was dianosed test123 yahoo.com early on set deminita 3 year ago.";
            List <TokenObj> inTextList = TextObj.TextToTokenList(inText);

            Console.WriteLine("======= Word2VecContext ======================");
            Console.WriteLine(" - inText: [" + inText + "]");
            string inStr = String.Join("|", inTextList.Select(obj => obj.GetTokenStr()));

            Console.WriteLine(" - inTextList (" + inTextList.Count + "): [" + inStr + "]");

            int  tarSize   = 1;
            int  radius    = 3;
            bool debugFlag = false;

            Console.WriteLine("------ Test GetContext (no skip), radius=3 ...");
            // remove space token from the list
            List <TokenObj> nonSpaceTokenList = TextObj.GetNonSpaceTokenObjList(inTextList);

            // Prints "tarPos|index|token: [context]" for every non-space token, where
            // tarPos counts non-space tokens and index counts all tokens. Both counters
            // start at 0 on every call so each pass reports positions within inTextList.
            void PrintWindowContexts(bool word2VecSkipWord)
            {
                int tarPos = 0;
                int index  = 0;

                foreach (TokenObj tokenObj in inTextList)
                {
                    // not the space token
                    if (tokenObj.IsSpaceToken() == false)
                    {
                        string        tokenStr    = tokenObj.GetTokenStr();
                        List <string> contextList = GetContext(tarPos, tarSize, nonSpaceTokenList, w2vIm, radius, word2VecSkipWord, debugFlag);
                        string        contextStr  = String.Join("|", contextList);
                        Console.WriteLine(tarPos + "|" + index + "|" + tokenStr + ": [" + contextStr + "]");
                        tarPos++;
                    }
                    index++;
                }
            }

            // word2VecSkipWord = false (no skip)
            PrintWindowContexts(false);

            Console.WriteLine("------ Test GetContext (skip) , radius=3 ...");
            Console.WriteLine(" - inText: [" + inText + "]");
            // word2VecSkipWord = true (skip)
            PrintWindowContexts(true);

            Console.WriteLine("------ Test GetContext (skip) , all ...");
            Console.WriteLine(" - inText: [" + inText + "]");
            // the list holds no space tokens, so the loop position is the target position
            for (int tarPos = 0; tarPos < nonSpaceTokenList.Count; tarPos++)
            {
                string tokenStr = nonSpaceTokenList[tarPos].GetTokenStr();
                // word2VecSkipWord = true (skip), overload without a radius
                List <string> contextList = GetContext(tarPos, tarSize, nonSpaceTokenList, w2vIm, true, debugFlag);
                string        contextStr  = String.Join("|", contextList);
                Console.WriteLine(tarPos + "|" + tokenStr + ": [" + contextStr + "]");
            }
        }
示例#17
0
        /// <summary>
        /// Demo driver for the context scores. It builds context vectors with
        /// Word2VecContext.GetContextVec for several sample sentences and passes them,
        /// together with candidate strings, to Test / TestWin:
        ///   1. diagnosed|diagnose|dianosed at position 6 (radius 2, then no radius),
        ///   2. know about|know|about (radius 2, then no radius),
        ///   3. onset|on set, on, set (radius 3, then no radius),
        ///   4. every token of a last sentence with radius 1 to 9.
        /// </summary>
        /// <param name="w2vIm"> word2Vec object used to build the context vectors; also forwarded to Test / TestWin </param>
        /// <param name="w2vOm"> word2Vec object forwarded to Test / TestWin </param>
        private static void Tests(Word2Vec w2vIm, Word2Vec w2vOm)
        {
            bool word2VecSkipWord = true;
            bool debugFlag        = false;

            // ---- 1. diagnosed|diagnose|dianosed ----
            string          inText     = "for the last 10 years    was dianosed\n early on set deminita 3 years ago";
            List <TokenObj> inTextList = TextObj.TextToTokenList(inText);
            // remove space token from the list
            List <TokenObj> nonSpaceTokenList = TextObj.GetNonSpaceTokenObjList(inTextList);
            List <string>   testStrList       = new List <string> { "diagnosed", "diagnose", "dianosed" };

            // init context
            int       tarPos     = 6;
            int       tarSize    = 1;
            int       radius     = 2;
            DoubleVec contextVec = Word2VecContext.GetContextVec(tarPos, tarSize, nonSpaceTokenList, w2vIm, radius, word2VecSkipWord, debugFlag);

            Console.WriteLine("===== Test diagnosed|diagnose|dianosed (window-2) =====");
            Console.WriteLine("inText: [" + inText + "]");
            Console.WriteLine("============================================");
            Console.WriteLine("Candidates|CBOW score|CBOW score 2|Similarity score");
            Console.WriteLine("============================================");
            foreach (string testStr in testStrList)
            {
                Test(testStr, contextVec, w2vIm, w2vOm);
            }
            Console.WriteLine("===== Test diagnosed|diagnose|dianosed (whole text) =====");
            contextVec = Word2VecContext.GetContextVec(tarPos, tarSize, nonSpaceTokenList, w2vIm, word2VecSkipWord, debugFlag);
            foreach (string testStr in testStrList)
            {
                Test(testStr, contextVec, w2vIm, w2vOm);
            }

            // ---- 2. know about|know|about ----
            string          inText1     = "Not all doctors know about this syndrome.";
            List <TokenObj> inTextList1 = TextObj.TextToTokenList(inText1);
            // remove space token from the list
            List <TokenObj> nonSpaceTokenList1 = TextObj.GetNonSpaceTokenObjList(inTextList1);

            Console.WriteLine("===== Test know about|know|about (window) =====");
            List <string> testStrList1 = new List <string> { "know about", "know", "about" };

            // "know about": two tokens starting at position 3
            tarPos     = 3;
            tarSize    = 2;
            radius     = 2;
            contextVec = Word2VecContext.GetContextVec(tarPos, tarSize, nonSpaceTokenList1, w2vIm, radius, word2VecSkipWord, debugFlag);
            Test(testStrList1[0], contextVec, w2vIm, w2vOm);
            contextVec = Word2VecContext.GetContextVec(tarPos, tarSize, nonSpaceTokenList1, w2vIm, word2VecSkipWord, debugFlag);
            Test(testStrList1[0], contextVec, w2vIm, w2vOm);
            // "know": one token at position 3
            tarPos     = 3;
            tarSize    = 1;
            contextVec = Word2VecContext.GetContextVec(tarPos, tarSize, nonSpaceTokenList1, w2vIm, radius, word2VecSkipWord, debugFlag);
            Test(testStrList1[1], contextVec, w2vIm, w2vOm);
            contextVec = Word2VecContext.GetContextVec(tarPos, tarSize, nonSpaceTokenList1, w2vIm, word2VecSkipWord, debugFlag);
            Test(testStrList1[1], contextVec, w2vIm, w2vOm);
            // "about": one token at position 4
            tarPos     = 4;
            tarSize    = 1;
            contextVec = Word2VecContext.GetContextVec(tarPos, tarSize, nonSpaceTokenList1, w2vIm, radius, word2VecSkipWord, debugFlag);
            Test(testStrList1[2], contextVec, w2vIm, w2vOm);
            contextVec = Word2VecContext.GetContextVec(tarPos, tarSize, nonSpaceTokenList1, w2vIm, word2VecSkipWord, debugFlag);
            Test(testStrList1[2], contextVec, w2vIm, w2vOm);

            // ---- 3. onset|on set ----
            string          inText2     = "for the last   10 years was diagnosed early on set dementia 3 years ago.";
            List <TokenObj> inTextList2 = TextObj.TextToTokenList(inText2);
            // remove space token from the list
            List <TokenObj> nonSpaceTokenList2 = TextObj.GetNonSpaceTokenObjList(inTextList2);
            List <string>   testStrList2       = new List <string> { "onset", "on set" };

            Console.WriteLine("===== Test onset|on set (window-3) =====");
            // this section works on inText2, so that is the text to report
            Console.WriteLine("inText: [" + inText2 + "]");
            // "on set": two tokens starting at position 8
            tarPos     = 8;
            tarSize    = 2;
            radius     = 3;
            contextVec = Word2VecContext.GetContextVec(tarPos, tarSize, nonSpaceTokenList2, w2vIm, radius, word2VecSkipWord, debugFlag);
            foreach (string testStr in testStrList2)
            {
                Test(testStr, contextVec, w2vIm, w2vOm);
            }
            // "on": one token at position 8
            tarPos     = 8;
            tarSize    = 1;
            radius     = 3;
            contextVec = Word2VecContext.GetContextVec(tarPos, tarSize, nonSpaceTokenList2, w2vIm, radius, word2VecSkipWord, debugFlag);
            Test("on", contextVec, w2vIm, w2vOm);
            // "set": one token at position 9
            tarPos     = 9;
            contextVec = Word2VecContext.GetContextVec(tarPos, tarSize, nonSpaceTokenList2, w2vIm, radius, word2VecSkipWord, debugFlag);
            Test("set", contextVec, w2vIm, w2vOm);
            Console.WriteLine("===== Test onset|on set (whole text) =====");
            // Back to the two-token target "on set": without this the whole-text context
            // would be built around the single token "set" left over from the previous test.
            tarPos     = 8;
            tarSize    = 2;
            contextVec = Word2VecContext.GetContextVec(tarPos, tarSize, nonSpaceTokenList2, w2vIm, word2VecSkipWord, debugFlag);
            foreach (string testStr in testStrList2)
            {
                Test(testStr, contextVec, w2vIm, w2vOm);
            }

            // ---- 4. every token with radius 1 to 9 ----
            Console.WriteLine("===== Go through each tokens with diff radius 1-9) =====");
            Console.WriteLine("tarPos|tarWord|r=1|r=2|r=3|r=4|r=5|r=6|r=7|r=8|r=9");
            // Alternative sample text used previously: "Broken bones can not sleep at night!"
            string          inText3     = "not xyxy all doctors know about this syndrome.";
            List <TokenObj> inTextList3 = TextObj.TextToTokenList(inText3);
            // remove space token from the list
            List <TokenObj> nonSpaceTokenList3 = TextObj.GetNonSpaceTokenObjList(inTextList3);

            tarPos  = 0;
            tarSize = 1;
            foreach (TokenObj tokenObj in nonSpaceTokenList3)
            {
                string tokenStr = tokenObj.GetTokenStr();
                string inStr    = Word2VecContext.NormWordForWord2Vec(tokenStr);
                Console.Write(tarPos + "|" + tokenStr + "|");
                // print out all radius
                for (int r = 1; r < 10; r++)
                {
                    // tarPos indexes nonSpaceTokenList3, so the context must come from that list
                    contextVec = Word2VecContext.GetContextVec(tarPos, tarSize, nonSpaceTokenList3, w2vIm, r, word2VecSkipWord, debugFlag);
                    TestWin(inStr, contextVec, w2vIm, w2vOm);
                }
                Console.WriteLine("");
                tarPos++;
            }
        }