コード例 #1
0
        // private methods
        /// <summary>
        /// Interactive test loop: reads lines from standard input until end of
        /// input and, for each line, prints the number of non-word 1-to-1
        /// candidates and the top ranked suggestion. When requested, the ranked
        /// candidate score list is printed as well.
        /// </summary>
        /// <param name="detailFlag">when true, also print the ranked suggestion list</param>
        /// <param name="limitNo">maximum number of suggestions printed in the detail list</param>
        /// <returns>0 on success, -1 if an exception was caught</returns>
        private static int RunTest(bool detailFlag, long limitNo)
        {
            // init dic
            string    configFile = "../data/Config/cSpell.properties";
            CSpellApi cSpellApi  = new CSpellApi(configFile);
            WordWcMap wordWcMap  = cSpellApi.GetWordWcMap();
            double    wf1        = cSpellApi.GetOrthoScoreEdDistFac();
            double    wf2        = cSpellApi.GetOrthoScorePhoneticFac();
            double    wf3        = cSpellApi.GetOrthoScoreOverlapFac();

            cSpellApi.SetRankMode(CSpellApi.RANK_MODE_NOISY_CHANNEL);

            int returnValue = 0;
            NoisyChannelScoreComparator<NoisyChannelScore> ncsc = new NoisyChannelScoreComparator<NoisyChannelScore>();

            // Enumerable.Take needs an int. Clamp instead of casting blindly so a
            // limit outside the int range cannot wrap around to an unrelated count.
            int maxItems = (int)Math.Max(0L, Math.Min(limitNo, (long)int.MaxValue));

            try
            {
                // 'using' guarantees the reader is released even if the loop throws
                using (StreamReader stdInput = new StreamReader(Console.OpenStandardInput()))
                {
                    Console.WriteLine("- Please input a text (type \"Ctl-d\" to quit) > ");
                    string inText;
                    while ((inText = stdInput.ReadLine()) != null)
                    {
                        // get all possible candidates
                        HashSet<string> candSet = NonWord1To1Candidates.GetCandidates(inText, cSpellApi);
                        Console.WriteLine("-- canSet.size(): " + candSet.Count);

                        // get final suggestion
                        string topRankStr = GetTopRankStr(inText, candSet, wordWcMap, wf1, wf2, wf3);
                        Console.WriteLine("- top tank str: " + topRankStr);

                        if (!detailFlag)
                        {
                            continue;
                        }

                        // print details: candidates ordered by the noisy channel comparator
                        HashSet<NoisyChannelScore> candScoreSet = GetCandidateScoreSet(inText, candSet, wordWcMap, wf1, wf2, wf3);
                        Console.WriteLine("------ Suggestion List ------");
                        foreach (string item in candScoreSet.OrderBy(x => x, ncsc).Take(maxItems).Select(obj => obj.ToString()))
                        {
                            Console.WriteLine(item);
                        }
                    }
                }
            }
            catch (Exception e)
            {
                // test driver contract: report the failure and signal it through the return code
                Console.Error.WriteLine(e.Message);
                returnValue = -1;
            }
            return returnValue;
        }
コード例 #2
0
        /// <summary>
        /// Checks whether <paramref name="str2"/> is among the candidates generated
        /// for the lower-cased <paramref name="str1"/>, updates the running
        /// counters, and prints one pipe-delimited result line.
        /// </summary>
        /// <param name="str1">the input word</param>
        /// <param name="str2">the candidate expected for the input word</param>
        /// <param name="cSpellApi">cSpell API object</param>
        private static void TestTpStr(string str1, string str2, CSpellApi cSpellApi)
        {
            bool isCandidate = GetCandidates(str1.ToLower(), cSpellApi).Contains(str2);

            if (isCandidate)
            {
                totalTpNo_++;
            }
            totalTpStrNo_++;

            // distance details appended to the report line
            var editDist        = EditDistance.GetDistanceForRealWord(str1, str2);
            var soundexDetail   = RefinedSoundex.GetDistanceDetailStr(str1, str2);
            var metaphoneDetail = Metaphone2.GetDistanceDetailStr(str1, str2, 10);

            // columns: hit | hits so far | pairs so far | input | expected | distances
            Console.WriteLine(isCandidate + "|" + totalTpNo_ + "|" + totalTpStrNo_ + "|" + str1 + "|" + str2 + "|" + editDist + "|" + soundexDetail + "|" + metaphoneDetail);
        }
コード例 #3
0
        // public method
        /// <summary>
        /// Real-word merge step (labelled "5. RealWord-Merge" in the debug output).
        /// Walks the token list, asks <c>RealWordMergeCorrector.GetCorrectTerm</c>
        /// for a merge at every legit token, and applies all collected merges in a
        /// single pass at the end.
        /// </summary>
        /// <param name="inTokenList">tokens of the whole text</param>
        /// <param name="cSpellApi">cSpell API object</param>
        /// <param name="debugFlag">flag for debug print</param>
        /// <returns>the token list returned by <c>MergeCorrector.CorrectTokenListByMerge</c></returns>
        public static List <TokenObj> Process(List <TokenObj> inTokenList, CSpellApi cSpellApi, bool debugFlag)
        {
            DebugPrint.PrintProcess("5. RealWord-Merge", debugFlag);
            DebugPrint.PrintInText(TextObj.TokenListToText(inTokenList), debugFlag);

            // pre-process: refresh the positions, then build the space-free view
            TextObj.UpdateIndexPos(inTokenList);
            List<TokenObj> nonSpaceTokens = TextObj.GetNonSpaceTokenObjList(inTokenList);

            List<MergeObj> merges = new List<MergeObj>();
            int legitLengthLimit  = cSpellApi.GetMaxLegitTokenLength();

            // detection: collect merge corrections token by token
            int cursor = 0;
            while (cursor < inTokenList.Count)
            {
                TokenObj current = inTokenList[cursor];

                // SCR-3: only legit tokens are considered; anything else is stepped over
                if (!current.IsLegitToken(legitLengthLimit))
                {
                    cursor++;
                    continue;
                }

                // the correct term is the highest ranked candidate
                MergeObj merge = RealWordMergeCorrector.GetCorrectTerm(current.GetPos(), nonSpaceTokens, cSpellApi, debugFlag);
                if (merge != null)
                {
                    merges.Add(merge);
                    // resume right after the merged span so merges never overlap
                    cursor = merge.GetEndIndex() + 1;
                }
                else
                {
                    cursor++;
                }
            }

            // correction: applied after the loop because a merge might involve a
            // previous token; the operation info is updated as well
            return MergeCorrector.CorrectTokenListByMerge(inTokenList, merges, TokenObj.HIST_RW_M, debugFlag, cSpellApi);
        }
コード例 #4
0
        /// <summary>
        /// Reports whether <paramref name="cand"/> is generated as a candidate for
        /// <paramref name="inWord"/>. Every call bumps <c>totalNo_</c>; a hit also
        /// bumps <c>totalCandNo_</c> and prints the pair.
        /// </summary>
        /// <returns>true when the candidate set contains <paramref name="cand"/></returns>
        private static bool TestCand(string inWord, string cand, CSpellApi cSpellApi)
        {
            bool found = GetCandidates(inWord, cSpellApi).Contains(cand);

            totalNo_++;
            if (!found)
            {
                return false;
            }

            totalCandNo_++;
            Console.WriteLine("{0}, {1}", inWord, cand);
            return true;
        }
コード例 #5
0
        // public method
        /// <summary>
        /// Picks the top ranked merge candidate. Ranking is delegated to
        /// <c>GetTopRankMergeObjByCSpell</c> (the combined score); the
        /// frequency-only (<c>GetTopRankMergeObjByFrequency</c>) and context-only
        /// (<c>GetTopRankMergeObjByContext</c>) rankers are the alternatives that
        /// are currently not used here.
        /// </summary>
        /// <param name="candidates">merge candidates to rank</param>
        /// <param name="cSpellApi">cSpell API object</param>
        /// <param name="tarPos">position of the target token</param>
        /// <param name="nonSpaceTokenList">the token list without space tokens</param>
        /// <param name="debugFlag">flag for debug print</param>
        /// <returns>whatever <c>GetTopRankMergeObjByCSpell</c> returns</returns>
        public static MergeObj GetTopRankMergeObj(HashSet <MergeObj> candidates, CSpellApi cSpellApi, int tarPos, List <TokenObj> nonSpaceTokenList, bool debugFlag)
        {
            // use combination
            return GetTopRankMergeObjByCSpell(candidates, cSpellApi, tarPos, nonSpaceTokenList, debugFlag);
        }
コード例 #6
0
        /// <summary>
        /// Test driver: takes no arguments. Prints the usage line to stderr and
        /// exits with code 1 if any argument is given; otherwise runs
        /// <c>TestTestSet</c> against the default configuration.
        /// </summary>
        /// <param name="args">command line arguments (must be empty)</param>
        public static void MainTest(string[] args)
        {
            bool hasArgs = args.Length > 0;
            if (hasArgs)
            {
                Console.Error.WriteLine("*** Usage: java RealWord1To1Candidates");
                Environment.Exit(1);
            }

            // init
            CSpellApi cSpellApi = new CSpellApi("../data/Config/cSpell.properties");

            // Tests(cSpellApi) and TestDists() are the other available drivers;
            // only the candidate rule test for TP and FP is run
            TestTestSet(cSpellApi);
        }
コード例 #7
0
        /// <summary>
        /// Prints, for a few sample words, whether the dictionary returned by
        /// <c>cSpellApi.GetPnDic()</c> reports them as dictionary words.
        /// </summary>
        private static void TestPnDic(CSpellApi cSpellApi)
        {
            RootDictionary pnDic = cSpellApi.GetPnDic();

            // same word in two casings
            string[] samples = { "hu", "Hu" };

            foreach (string sample in samples)
            {
                bool known = pnDic.IsDicWord(sample);
                Console.WriteLine("-- pnDic(" + sample + "): " + known);
            }
        }
コード例 #8
0
        /// <summary>
        /// Test driver. Accepts an optional configuration file path, as the usage
        /// line advertises; with no argument the default configuration is used.
        /// More than one argument prints the usage line and exits.
        /// </summary>
        /// <param name="args">command line arguments: empty, or a single config file path</param>
        public static void MainTest(string[] args)
        {
            string configFile = "../data/Config/cSpell.properties";

            if (args.Length == 1)
            {
                // honor the <configFile> argument promised by the usage text
                configFile = args[0];
            }
            else if (args.Length > 0)
            {
                Console.WriteLine("Usage: java SpellCorrection <configFile>");
                Environment.Exit(0);
            }

            // init
            CSpellApi cSpellApi = new CSpellApi(configFile);

            // test
            TestProcess(cSpellApi);
        }
コード例 #9
0
        /// <summary>
        /// Runs <c>NonWordCorrector.GetCorrectTerm</c> on a single lower-case
        /// sample token and prints the input and output strings.
        /// </summary>
        private static void TestGetCorrectTerm(CSpellApi cSpellApi)
        {
            // sample input, all lowerCase
            const string inText = "hotflashes";

            // correct the single token and read back its string
            TokenObj corrected = NonWordCorrector.GetCorrectTerm(new TokenObj(inText), cSpellApi);
            string   outText   = corrected.GetTokenStr();

            // report
            Console.WriteLine("--------- GetCorrectTerm( ) -----------");
            Console.WriteLine("In: [" + inText + "]");
            Console.WriteLine("Out: [" + outText + "]");
        }
コード例 #10
0
        /// <summary>
        /// Prints, for a few sample words, whether the dictionary returned by
        /// <c>cSpellApi.GetSplitWordDic()</c> reports them as dictionary words.
        /// </summary>
        private static void TestSplitDic(CSpellApi cSpellApi)
        {
            RootDictionary splitWordDic = cSpellApi.GetSplitWordDic();

            // sample words to look up
            string[] samples = { "do", "i", "ng", "ilove" };

            foreach (string sample in samples)
            {
                bool known = splitWordDic.IsDicWord(sample);
                Console.WriteLine("-- SplitDic(" + sample + "): " + known);
            }
        }
コード例 #11
0
        // real-word candidate has more restriction than non-word
        // TBD, need to organize the code; the check should be done in the ranking
        /// <summary>
        /// Core check for real-word 1-to-1 candidates. A candidate is accepted when:
        /// 1. it is in the suggestion dictionary, has a word vector, meets the
        ///    configured minimum length and word count, and is not an inflectional
        ///    variant of the input word; and
        /// 2. it either sounds the same as the input (both phonetic distances are
        ///    0), or is phonetically and orthographically close (thresholds fixed
        ///    from empirical tests, not configurable).
        /// </summary>
        /// <param name="inWord">the input word (lower-cased internally)</param>
        /// <param name="cand">the candidate to validate</param>
        /// <param name="cSpellApi">cSpell API object supplying dictionaries and thresholds</param>
        /// <returns>true if <paramref name="cand"/> is a valid real-word 1-to-1 candidate</returns>
        private static bool IsValid1To1Cand(string inWord, string cand, CSpellApi cSpellApi)
        {
            RootDictionary suggestDic = cSpellApi.GetSuggestDic();
            Word2Vec       word2VecOm = cSpellApi.GetWord2VecOm();
            WordWcMap      wordWcMap  = cSpellApi.GetWordWcMap();

            int    rw1To1CandMinWc     = cSpellApi.GetCanRw1To1CandMinWc();
            int    rw1To1CandMinLength = cSpellApi.GetCanRw1To1CandMinLength();
            string inWordLc            = inWord.ToLower();

            // 1. check suggDic and inflVars (same order as before, still short-circuiting)
            bool eligible = suggestDic.IsDicWord(cand)
                            && word2VecOm.HasWordVec(cand)
                            && (cand.Length >= rw1To1CandMinLength)
                            && (WordCountScore.GetWc(cand, wordWcMap) >= rw1To1CandMinWc)
                            && !InflVarsUtil.IsInflectionVar(inWordLc, cand);             // not inflVars
            if (!eligible)
            {
                return false;
            }

            // 2. more restriction for real-word candidates: phonetic distances first
            int pmDist = Metaphone2.GetDistance(inWordLc, cand);
            int prDist = RefinedSoundex.GetDistance(inWordLc, cand);

            // if they sound the same, no orthographic check is needed
            if ((pmDist == 0) && (prDist == 0))
            {
                return true;
            }
            // otherwise at least one of the phonetic codes must match exactly
            if ((pmDist != 0) && (prDist != 0))
            {
                return false;
            }

            // if they sound similar, the orthography must also be similar;
            // the distances are computed only now, when they can affect the result
            int leadDist   = GetLeadCharDist(inWordLc, cand);
            int endDist    = GetEndCharDist(inWordLc, cand);
            int lengthDist = GetLengthDist(inWordLc, cand);
            int totalDist1 = leadDist + endDist + lengthDist + pmDist + prDist;
            if (totalDist1 >= 3)
            {
                return false;
            }

            int editDist   = EditDistance.GetDistanceForRealWord(inWordLc, cand);
            int totalDist2 = editDist + pmDist + prDist;
            return totalDist2 < 4;
        }
コード例 #12
0
        // TBD: this is the bottleneck because so many real-words call this;
        // needs to speed up
        /// <summary>
        /// Returns the real-word 1-to-1 candidates for <paramref name="inWord"/>.
        /// Results are kept in <c>candMap_</c> under the lower-cased word. On a
        /// miss, the edit-distance variants from
        /// <c>CandidatesUtil1To1.GetCandidatesByEd</c> (insert, remove, replace,
        /// switch chars; no space, so no split) are filtered through
        /// <c>IsValid1To1Cand</c> and the surviving set is stored.
        /// </summary>
        /// <param name="inWord">the word to find candidates for</param>
        /// <param name="cSpellApi">cSpell API object supplying the configured limits</param>
        /// <returns>the stored or freshly computed candidate set</returns>
        public static HashSet <string> GetCandidates(string inWord, CSpellApi cSpellApi)
        {
            int    wordMaxLength = cSpellApi.GetCanRw1To1WordMaxLength();
            string cacheKey      = inWord.ToLower();

            // 1. serve from memory when possible to speed up running time
            HashSet<string> cached = candMap_.GetValueOrNull(cacheKey);
            if (cached != null)
            {
                return cached;
            }

            // 2. generate on the fly: all edit-distance variants (noted in the
            //    original as the slowest part), keeping only the valid ones
            HashSet<string> validCands = new HashSet<string>();
            foreach (string edVariant in CandidatesUtil1To1.GetCandidatesByEd(inWord, wordMaxLength))
            {
                if (IsValid1To1Cand(cacheKey, edVariant, cSpellApi))
                {
                    validCands.Add(edVariant);
                }
            }

            // 3. remember the result
            // TBD, need to set the maxKeyNo for candMap_: the key count must stay
            // <= 2**31-1, and too many keys can cause slow performance or a crash
            if (candMap_.ContainsKey(cacheKey))
            {
                return validCands;
            }
            candMap_[cacheKey] = validCands;

            // warn once per 100 keys after the configured size is exceeded
            int keyLimit = cSpellApi.GetCanRw1To1CandMaxKeySize();
            int keyCount = candMap_.Keys.Count;
            if ((keyCount > keyLimit) && ((keyCount % 100) == 0))
            {
                Console.Error.WriteLine("** [email protected]: the size of key in RW-1To1-Cand-HashMap is too big (" + keyCount + " > " + keyLimit + "). Please rerun the cSpell and increase the max. hash key size in the cSpell config (must < 2,147,483,647).");
            }
            return validCands;
        }
コード例 #13
0
        /// <summary>
        /// Runs <c>TestCand</c> over a fixed list of (input word, expected
        /// candidate) pairs, then prints the totals as
        /// <c>totalNo_|totalCandNo_</c>. Repeated pairs are intentional: each
        /// call counts toward the totals.
        /// </summary>
        private static void Tests(CSpellApi cSpellApi)
        {
            // { input word, expected candidate }, in execution order
            string[][] pairs =
            {
                new[] { "too", "to" },
                new[] { "then", "than" },
                new[] { "thing", "think" },
                new[] { "sisters", "sisters'" },
                new[] { "know", "now" },
                new[] { "tried", "tired" },
                new[] { "specially", "especially" },
                new[] { "law", "lat" },
                new[] { "domestic", "damaged" },
                new[] { "Weather", "whether" },
                new[] { "there", "their" },
                new[] { "then", "than" },
                new[] { "fine", "find" },
                new[] { "bowl", "bowel" },
                new[] { "off", "of" },
                new[] { "Dies", "Does" },
                new[] { "descended", "undescended" },
                new[] { "effect", "affect" },
                new[] { "pregnancy", "pregnant" },
                new[] { "leave", "live" },
                new[] { "affects", "effects" },
                new[] { "their", "there" },
                new[] { "you", "your" },
                new[] { "medical", "medicine" },
                new[] { "medical", "medicine" },
                new[] { "swollen", "swelling" },
                new[] { "swollen", "swelling" },
                new[] { "well", "swell" },
                new[] { "FRIENDS", "friend's" },
                new[] { "access", "excess" },
                new[] { "where", "were" },
                new[] { "spot", "stop" },
                new[] { "weather", "whether" },
                new[] { "were", "we're" },
                new[] { "small", "smell" },
                new[] { "bond", "bone" },
                new[] { "then", "than" },
                new[] { "leave", "live" },
                new[] { "meningitidis", "meningitis" },
            };

            foreach (string[] pair in pairs)
            {
                TestCand(pair[0], pair[1], cSpellApi);
            }
            Console.WriteLine(totalNo_ + "|" + totalCandNo_);
        }
コード例 #14
0
        /// <summary>
        /// Test driver: takes no arguments. Prints the usage line and exits with
        /// code 0 if any argument is given; otherwise runs the split-dictionary
        /// and pn-dictionary tests against the default configuration.
        /// </summary>
        /// <param name="args">command line arguments (must be empty)</param>
        public static void MainTest(string[] args)
        {
            const string configFile = "../data/Config/cSpell.properties";

            bool hasArgs = args.Length > 0;
            if (hasArgs)
            {
                Console.WriteLine("Usage: java BasicDictionary");
                Environment.Exit(0);
            }

            // init
            CSpellApi api = new CSpellApi(configFile);

            // test cases and print out (Test() is the other available driver)
            TestSplitDic(api);
            TestPnDic(api);
        }
コード例 #15
0
        // public method
        /// <summary>
        /// Picks the top ranked merge candidate. Ranking is delegated to
        /// <c>GetTopRankMergeObjByContext</c>: per the original note, only the
        /// Word2Vec score has been tested so far. The frequency
        /// (<c>GetTopRankMergeObjByFrequency</c>) and combined
        /// (<c>GetTopRankMergeObjByCSpell</c>) rankers are the untested
        /// alternatives.
        /// </summary>
        /// <param name="candidates">merge candidates to rank</param>
        /// <param name="cSpellApi">cSpell API object</param>
        /// <param name="debugFlag">flag for debug print</param>
        /// <param name="tarPos">position of the target token</param>
        /// <param name="nonSpaceTokenList">the token list without space tokens</param>
        /// <returns>whatever <c>GetTopRankMergeObjByContext</c> returns</returns>
        public static MergeObj GetTopRankMergeObj(HashSet <MergeObj> candidates, CSpellApi cSpellApi, bool debugFlag, int tarPos, List <TokenObj> nonSpaceTokenList)
        {
            // use context score for merge
            return GetTopRankMergeObjByContext(candidates, cSpellApi, debugFlag, tarPos, nonSpaceTokenList);
        }
コード例 #16
0
        // public method
        /// <summary>
        /// Gets the non-word 1-to-1 candidates by edit distance:
        /// 1. generate all combinations from insert, remove, replace, and switch
        ///    chars (no space, so no split);
        /// 2. keep the ones accepted by <c>IsValid1To1Cand</c>.
        /// </summary>
        /// <param name="inWord">the word to find candidates for</param>
        /// <param name="cSpellApi">cSpell API object supplying the max word length</param>
        /// <returns>a new set with the valid candidates</returns>
        public static HashSet <string> GetCandidates(string inWord, CSpellApi cSpellApi)
        {
            int wordMaxLength = cSpellApi.GetCanNw1To1WordMaxLength();

            // every edit-distance possibility
            HashSet<string> edVariants = CandidatesUtil1To1.GetCandidatesByEd(inWord, wordMaxLength);

            // drop the ones that are not valid one-to-one candidate words
            HashSet<string> validCands = new HashSet<string>();
            foreach (string edVariant in edVariants)
            {
                if (!IsValid1To1Cand(inWord, edVariant, cSpellApi))
                {
                    continue;
                }
                validCands.Add(edVariant);
            }
            return validCands;
        }
コード例 #17
0
        /// <summary>
        /// Runs <c>NonWordMergeCorrector.GetCorrectTerm</c> at position 0 of a
        /// sample text and prints the input, the number of non-space tokens, and
        /// the resulting merge object.
        /// </summary>
        private static void TestGetCorrectTerm(CSpellApi cSpellApi)
        {
            // init
            string          inText      = "Dur ing my absent.";
            bool            debugFlag   = false;
            List <TokenObj> inTokenList = TextObj.TextToTokenList(inText);
            // 1. convert to the non-empty token list
            List <TokenObj> nonSpaceTokenList = TextObj.GetNonSpaceTokenObjList(inTokenList);
            // result
            int      tarPos   = 0;
            MergeObj mergeObj = NonWordMergeCorrector.GetCorrectTerm(tarPos, nonSpaceTokenList, cSpellApi, debugFlag);

            // a merge corrector signals "no merge correction" with null (the merge
            // Process loop checks for it), so do not dereference blindly
            string mergeStr = (mergeObj == null) ? "null" : mergeObj.ToString();

            // print out
            Console.WriteLine("--------- GetCorrectTerm( ) -----------");
            Console.WriteLine("In: [" + inText + "]");
            Console.WriteLine("In nonSpaceTokenList: [" + nonSpaceTokenList.Count + "]");
            Console.WriteLine("Out MergeObj: [" + mergeStr + "]");
        }
コード例 #18
0
        /// <summary>
        /// Decides whether <paramref name="inWord"/> qualifies for real-word
        /// merge: it must be a valid word in the dictionary returned by
        /// <c>GetSplitWordDic()</c> and must not be a real-word exception (such as
        /// url, email, digit, ...). An exception is never treated as a misspelling.
        /// </summary>
        /// <param name="inWord">the word to check</param>
        /// <param name="cSpellApi">cSpell API object supplying the dictionaries</param>
        /// <param name="debugFlag">when true, the detection details are printed</param>
        /// <returns>true if the word is in the dictionary and is not an exception</returns>
        private static bool IsRealWordMerge(string inWord, CSpellApi cSpellApi, bool debugFlag)
        {
            RootDictionary mergeDic = cSpellApi.GetSplitWordDic();             // merge Dic
            RootDictionary unitDic  = cSpellApi.GetUnitDic();

            // the exception check only runs for words known to the dictionary
            bool isMerge = false;
            if (mergeDic.IsValidWord(inWord))
            {
                isMerge = !IsRealWordExceptions(inWord, unitDic);
            }

            if (debugFlag)
            {
                // both flags are evaluated again so the report shows each one separately
                DebugPrint.PrintRwMergeDetect(inWord, isMerge, mergeDic.IsValidWord(inWord), IsRealWordExceptions(inWord, unitDic), debugFlag);
            }
            return isMerge;
        }
コード例 #19
0
        /// <summary>
        /// Public test driver. Accepts an optional configuration file path; with
        /// no argument the default configuration is used. More than one argument
        /// prints the usage line to stderr and exits with code 1.
        /// </summary>
        /// <param name="args">command line arguments: empty, or a single config file path</param>
        public static void MainTest(string[] args)
        {
            string configFile = "../data/Config/cSpell.properties";

            if (args.Length == 1)
            {
                configFile = args[0];
            }
            // 'else' is required: without it a single, valid <config> argument
            // fell through to the usage/exit branch and could never be used
            else if (args.Length > 0)
            {
                Console.Error.WriteLine("Usage: java NonWordDetector <config>");
                Environment.Exit(1);
            }
            // init, read in from config
            CSpellApi cSpellApi = new CSpellApi(configFile);

            // Test
            Tests(cSpellApi);
        }
コード例 #20
0
        /// <summary>
        /// Decides whether <paramref name="inWord"/> is a non-word: it must not be
        /// a valid word in the check dictionary and must not be a non-word
        /// exception (such as url, email, digit, ...). An exception is never
        /// treated as a misspelling.
        /// </summary>
        /// <param name="inWord">the word to check</param>
        /// <param name="cSpellApi">cSpell API object supplying the dictionaries</param>
        /// <param name="debugFlag">when true, the detection details are printed</param>
        /// <returns>true if the word is not in the dictionary and is not an exception</returns>
        public static bool IsNonWord(string inWord, CSpellApi cSpellApi, bool debugFlag)
        {
            RootDictionary checkDic = cSpellApi.GetCheckDic();
            RootDictionary unitDic  = cSpellApi.GetUnitDic();

            // the exception check only runs for words unknown to the dictionary
            bool isNonWord = false;
            if (!checkDic.IsValidWord(inWord))
            {
                isNonWord = !IsNonWordExceptions(inWord, unitDic);
            }

            if (debugFlag)
            {
                // both flags are evaluated again so the report shows each one separately
                DebugPrint.PrintNwDetect(inWord, isNonWord, checkDic.IsValidWord(inWord), IsNonWordExceptions(inWord, unitDic), debugFlag);
            }
            return isNonWord;
        }
コード例 #21
0
        // public method
        /// <summary>
        /// Non-word split and 1-to-1 correction step (labelled
        /// "3-4. NonWord-Split &amp; 1To1" in the debug output). Each legit token is
        /// corrected by <c>NonWordCorrector.GetCorrectTerm</c>; every other token
        /// (space tokens, overly long tokens) is passed through unchanged so the
        /// original spacing is kept.
        /// </summary>
        /// <param name="inTokenList">tokens of the whole text</param>
        /// <param name="cSpellApi">cSpell API object</param>
        /// <param name="debugFlag">flag for debug print</param>
        /// <returns>a new token list holding the corrected tokens</returns>
        public static List <TokenObj> Process(List <TokenObj> inTokenList, CSpellApi cSpellApi, bool debugFlag)
        {
            DebugPrint.PrintProcess("3-4. NonWord-Split & 1To1", debugFlag);
            DebugPrint.PrintInText(TextObj.TokenListToText(inTokenList), debugFlag);

            List<TokenObj> corrected        = new List<TokenObj>();
            List<TokenObj> nonSpaceTokens   = TextObj.GetNonSpaceTokenObjList(inTokenList);
            int            legitLengthLimit = cSpellApi.GetMaxLegitTokenLength();

            // counts the legit tokens handled so far; passed to the corrector as
            // the target position for the context module
            int tarPos = 0;

            foreach (TokenObj tokenObj in inTokenList)
            {
                // SCR-3: skip empty space tokens and long tokens, keeping them as-is
                if (!tokenObj.IsLegitToken(legitLengthLimit))
                {
                    Split1To1Corrector.AddSplit1To1Correction(corrected, tokenObj);
                    continue;
                }

                // the correct term is the highest ranked candidate (context-aware;
                // the no-context overload takes only tokenObj, cSpellApi, debugFlag)
                TokenObj fixedToken = NonWordCorrector.GetCorrectTerm(tokenObj, cSpellApi, debugFlag, tarPos, nonSpaceTokens);
                tarPos++;

                // added through the helper because a correction might be a split
                Split1To1Corrector.AddSplit1To1Correction(corrected, fixedToken);
            }
            return corrected;
        }
コード例 #22
0
        /// <summary>
        /// Test driver. Accepts an optional configuration file path; with no
        /// argument the default configuration is used. More than one argument
        /// prints the usage line and exits with code 0.
        /// </summary>
        /// <param name="args">command line arguments: empty, or a single config file path</param>
        public static void MainTest(string[] args)
        {
            string configFile = "../data/Config/cSpell.properties";

            switch (args.Length)
            {
                case 0:
                    // keep the default configuration
                    break;

                case 1:
                    configFile = args[0];
                    break;

                default:
                    Console.WriteLine("Usage: java RealWordCorrector <configFile>");
                    Environment.Exit(0);
                    break;
            }

            // init and test
            CSpellApi api = new CSpellApi(configFile);
            TestSplit(api);
        }
コード例 #23
0
        /// <summary>
        /// Runs <c>Process</c> on a lower-case sample sentence and prints the
        /// input text, the corrected text, and the per-token operation details.
        /// </summary>
        private static void TestProcess(CSpellApi cSpellApi)
        {
            // sample input, all lowerCase
            const string sampleText = "hotflashes and knowaboutare not forr playsure.";
            const bool   verbose    = false;

            // tokenize, correct, and convert back to text
            List<TokenObj> sampleTokens    = TextObj.TextToTokenList(sampleText);
            List<TokenObj> correctedTokens = Process(sampleTokens, cSpellApi, verbose);
            string         correctedText   = TextObj.TokenListToText(correctedTokens);

            // report
            Console.WriteLine("------ GetCorrection by Process( ) ------");
            Console.WriteLine("In: [" + sampleText + "]");
            Console.WriteLine("Out: [" + correctedText + "]");
            Console.WriteLine("----- Details -----------");
            Console.WriteLine(TextObj.TokenListToOperationDetailStr(correctedTokens));
        }
コード例 #24
0
        // private method
        /// <summary>
        /// Runs <c>ProcessByTokenObj</c> on a lower-case sample sentence covering
        /// non-word, one-to-one, split, and merge corrections, then prints the
        /// input text, the corrected text, and the per-token operation details.
        /// </summary>
        private static void TestProcess(CSpellApi cSpellApi)
        {
            // sample input, all lowerCase
            const string sampleText = "hotflashes and knowaboutare not forr playsure dur ing my disa ppoint ment.";
            const bool   verbose    = false;

            // the process works on a token list, not on raw text
            List<TokenObj> sampleTokens    = TextObj.TextToTokenList(sampleText);
            List<TokenObj> correctedTokens = ProcessByTokenObj(sampleTokens, cSpellApi, verbose);
            string         correctedText   = TextObj.TokenListToText(correctedTokens);

            // report
            Console.WriteLine("------ GetCorrection by Process( ) ------");
            Console.WriteLine("In: [" + sampleText + "]");
            Console.WriteLine("Out: [" + correctedText + "]");
            Console.WriteLine("----- Details -----------");
            Console.WriteLine(TextObj.TokenListToOperationDetailStr(correctedTokens));
        }
コード例 #25
0
        /// <summary>
        /// Runs <c>Process</c> on a multi-sentence sample containing split-up
        /// words, then prints the input text, the corrected text, and the
        /// per-token operation details.
        /// </summary>
        private static void TestProcess(CSpellApi cSpellApi)
        {
            // sample input
            const string sampleText = "She had problems dur ing her pregnancies. That is a dis appoint ment. Good!";
            const bool   verbose    = false;

            // the process works on a token list, not on raw text
            List<TokenObj> sampleTokens    = TextObj.TextToTokenList(sampleText);
            List<TokenObj> correctedTokens = Process(sampleTokens, cSpellApi, verbose);
            string         correctedText   = TextObj.TokenListToText(correctedTokens);

            // report
            Console.WriteLine("------ GetCorrection by Process( ) ------");
            Console.WriteLine("In: [" + sampleText + "]");
            Console.WriteLine("Out: [" + correctedText + "]");
            Console.WriteLine("----- Details -----------");
            Console.WriteLine(TextObj.TokenListToOperationDetailStr(correctedTokens));
        }
コード例 #26
0
        // public method
        /// <summary>
        /// The core method to correct a word by real-word split, in these steps:
        /// <ul>
        /// <li>Convert inToken to coreTerm</li>
        /// <li>Detect if real-word (only for tokens with an empty process history)</li>
        /// <li>Get split candidates</li>
        /// <li>Rank candidates
        ///     <ul>
        ///     <li>context</li>
        ///     </ul>
        /// </li>
        /// <li>Update information</li>
        /// </ul>
        /// </summary>
        /// <param name="inTokenObj"> the input tokenObj (single word) </param>
        /// <param name="cSpellApi"> cSpell API object </param>
        /// <param name="debugFlag"> flag for debug print </param>
        /// <param name="tarPos"> position of the target token to be split </param>
        /// <param name="nonSpaceTokenList"> the token list without space tokens </param>
        /// <returns>
        /// a copy of the input tokenObj; when a correction is found, the copy
        /// holds the split words and the split operation is added to its history.
        /// If no good correction is found, the copy keeps the original term.
        /// </returns>
        public static TokenObj GetCorrectTerm(TokenObj inTokenObj, CSpellApi cSpellApi, bool debugFlag, int tarPos, List <TokenObj> nonSpaceTokenList)
        {
            // get inWord from inTokenObj and init outTokenObj
            string   inWord      = inTokenObj.GetTokenStr();
            TokenObj outTokenObj = new TokenObj(inTokenObj);
            // 1. convert a word to coreTerm (no leading/ending space, punc, digit)
            int         ctType      = CoreTermUtil.CT_TYPE_SPACE_PUNC_DIGIT;
            CoreTermObj coreTermObj = new CoreTermObj(inWord, ctType);
            string      coreStr     = coreTermObj.GetCoreTerm();

            // 2. real-word split detection: only tokens that have no previous
            // correction in their history are candidates
            bool untouched = inTokenObj.GetProcHist().Count == 0;
            if (!untouched || !RealWordSplitDetector.IsDetect(inWord, coreStr, cSpellApi, debugFlag))
            {
                return outTokenObj;
            }

            cSpellApi.UpdateDetectNo();
            // TBD, should take care of possessive xxx's here
            // 3. get split candidates set from correction
            int maxSplitNo            = cSpellApi.GetCanRwMaxSplitNo();
            HashSet <string> splitSet = RealWordSplitCandidates.GetCandidates(coreStr, cSpellApi, maxSplitNo);
            // 4. Ranking: get top ranked candidates as corrected terms
            // in case of using context
            string topRankStr = RankRealWordSplitByMode.GetTopRankStr(coreStr, splitSet, cSpellApi, debugFlag, tarPos, nonSpaceTokenList);
            // 5. update coreTerm and convert back to the full token string
            coreTermObj.SetCoreTerm(topRankStr);
            string outWord = coreTermObj.ToString();
            // 6. update info if there is a real-word correction
            if (!inWord.Equals(outWord))
            {
                cSpellApi.UpdateCorrectNo();
                outTokenObj.SetTokenStr(outWord);
                outTokenObj.AddProcToHist(TokenObj.HIST_RW_S);                     //split
                DebugPrint.PrintCorrect("RW", "RealWordSplitCorrector", inWord, outWord, debugFlag);
            }
            return outTokenObj;
        }
コード例 #27
0
        // Checks every word of a split term.
        // The term is converted to a word list with TermUtil.ToWordList and each
        // word is tested with IsValidSplitWord; the scan stops at the first word
        // that fails. Returns true when no word fails (including an empty list).
        private static bool CheckSplitWords(string inTerm, CSpellApi cSpellApi)
        {
            List <string> words = TermUtil.ToWordList(inTerm);

            foreach (string word in words)
            {
                // a single invalid word invalidates the whole split
                if (!IsValidSplitWord(word, cSpellApi))
                {
                    return(false);
                }
            }
            return(true);
        }
コード例 #28
0
        // public method
        // Builds the set of split candidates for inWord, filtered by dictionary.
        // Per the original author's notes: inWord must be a coreTerm, and Abb/Acr
        // terms are meant to be excluded to reduce noise such as 'a', 'ab', etc.
        //
        // Pass 1 keeps the splits found in the multiword dictionary (GetMwDic).
        // Pass 2 runs only when pass 1 found nothing and keeps the splits
        // accepted by IsValidSplitCand.
        public static HashSet <string> GetCandidates(string inWord, CSpellApi cSpellApi, int maxSplitNo)
        {
            RootDictionary multiwordDic = cSpellApi.GetMwDic();

            // all split combinations of inWord, bounded by maxSplitNo
            HashSet <string> allSplits = CandidatesUtilSplit.GetSplitSet(inWord, maxSplitNo);
            HashSet <string> accepted  = new HashSet <string>();

            // Pass 1: whole split term is a known multiword.
            // TBD (from original author): this finds "perse" -> "per se", but "perse"
            // is a valid word in eng_medical.dic, so cSpell can't correct it.
            // The dictionary needs refinement later.
            foreach (string splitTerm in allSplits)
            {
                if (multiwordDic.IsDicWord(splitTerm))
                {
                    accepted.Add(splitTerm);
                }
            }
            if (accepted.Count != 0)
            {
                return(accepted);
            }

            // Pass 2: no multiword matched, fall back to the split-candidate check
            foreach (string splitTerm in allSplits)
            {
                if (IsValidSplitCand(splitTerm, cSpellApi))
                {
                    accepted.Add(splitTerm);
                }
            }
            return(accepted);
        }
コード例 #29
0
        // Decides whether a single word is acceptable as part of a real-word split.
        // Original author's rationale: Aa (abbreviation/acronym) entries are not
        // wanted as valid split words because they add too much noise (less precision).
        // TBD ... re-organize
        //
        // A word is accepted only when ALL of the following hold, checked in this order:
        //   1. it is in the split-word dictionary (GetSplitWordDic)
        //   2. it has a word vector (GetWord2VecOm)
        //   3. its word count is at least GetCanRwSplitCandMinWc()
        //   4. it is not in the unit dictionary        (e.g. mg -> ...)
        //   5. it is not in the proper-noun dictionary (e.g. human -> Hu man,
        //      children -> child ren, where Hu / ren are proper nouns)
        private static bool IsValidSplitWord(string inWord, CSpellApi cSpellApi)
        {
            // fetch every resource up front, in the same order as before
            RootDictionary splitDic   = cSpellApi.GetSplitWordDic();
            WordWcMap      wcMap      = cSpellApi.GetWordWcMap();
            Word2Vec       wordVecs   = cSpellApi.GetWord2VecOm();
            RootDictionary unitsDic   = cSpellApi.GetUnitDic();
            RootDictionary properDic  = cSpellApi.GetPnDic();
            int            minWc      = cSpellApi.GetCanRwSplitCandMinWc();

            if (!splitDic.IsDicWord(inWord))
            {
                return(false);
            }
            if (!wordVecs.HasWordVec(inWord))
            {
                return(false);
            }

            bool frequentEnough = WordCountScore.GetWc(inWord, wcMap) >= minWc;
            if (!frequentEnough)
            {
                return(false);
            }
            if (unitsDic.IsDicWord(inWord))
            {
                return(false);
            }
            return(!properDic.IsDicWord(inWord));
        }
コード例 #30
0
        // private methods
        // Console demo for merge candidates: tokenizes a fixed sample sentence,
        // prints the tokens, then prints every MergeObj returned by GetCandidates
        // for a fixed target position in the non-space token list.
        private static void Test()
        {
            // target position examined in the non-space token list
            const int tarPos = 4;

            // init cSpellApi from the default configuration file
            CSpellApi cSpellApi = new CSpellApi("../data/Config/cSpell.properties");

            Console.WriteLine("===== Unit Test of MergeCandidates =====");

            // Alternative sample inputs used previously:
            //   "He was dia gnosed  early onset deminita 3 year ago."
            //   (from 73.txt) "I have seven live births with no problems dur ing my pregnancies. That is a dis appoint ment"
            string          sampleText = "That is a disa ppoint ment.";
            List <TokenObj> tokens     = TextObj.TextToTokenList(sampleText);
            string          joined     = String.Join("|", tokens.Select(t => t.GetTokenStr()));

            // token overview followed by one line per token
            Console.WriteLine(" - inTextList (" + tokens.Count + "): [" + joined + "]");
            Console.WriteLine("-------------------------");
            foreach (TokenObj token in tokens)
            {
                Console.WriteLine(token.ToString());
            }

            // parameters of the merge
            Console.WriteLine("-------------------------");
            Console.WriteLine("- tarPos: " + tarPos);
            Console.WriteLine("- maxMergeNo: " + cSpellApi.GetCanNwMaxMergeNo());
            Console.WriteLine("------ merge set -------");

            // candidates are computed on the token list without space tokens
            List <TokenObj>    nonSpaceTokens = TextObj.GetNonSpaceTokenObjList(tokens);
            HashSet <MergeObj> candidates     = GetCandidates(tarPos, nonSpaceTokens, cSpellApi);

            foreach (MergeObj candidate in candidates)
            {
                Console.WriteLine(candidate.ToString());
            }
        }