/// <summary>
        /// Ad-hoc console check: runs the context-aware GetCorrectTerm overload on one
        /// token of a fixed sample sentence and prints the input token and the result.
        /// </summary>
        /// <param name="cSpellApi"> CSpell Api object handed through to GetCorrectTerm </param>
        private static void TestSplit(CSpellApi cSpellApi)
        {
            // sample sentence taken from 13864.txt; an alternative case from 10349.txt is
            // "sounding in my ear every time for along time." with target position 7
            string sentence = "I donate my self to be apart of this study.";

            // tokenize the sentence and build the non-space token list used as context
            List<TokenObj> allTokens  = new TextObj(sentence).GetTokenList();
            List<TokenObj> wordTokens = TextObj.GetNonSpaceTokenObjList(allTokens);

            int      targetIndex = 6;
            TokenObj target      = wordTokens[targetIndex];
            bool     debugFlag   = false;

            // NOTE(review): the header text says "One-To-One" although this method is
            // named TestSplit - looks like a copy/paste leftover, confirm before changing
            Console.WriteLine("====== Real-Word One-To-One Correction Test =====");
            Console.WriteLine("-- inTextList: [" + sentence + "]");
            Console.WriteLine("-- tarPos: [" + targetIndex + "]");
            Console.WriteLine("-- inTokenObj: [" + target.ToString() + "]");

            // run the correction for the selected token
            TokenObj corrected = GetCorrectTerm(target, cSpellApi, debugFlag, targetIndex, wordTokens);

            // report the outcome
            Console.WriteLine("--------- GetCorrectTermStr( ) -----------");
            Console.WriteLine("-- outTokenObj: [" + corrected.ToString() + "]");
        }
Beispiel #2
0
        /// <summary>
        /// Ad-hoc console check: runs the context-aware GetCorrectTerm overload on one
        /// token of a fixed sample sentence and prints the input token and the result.
        /// </summary>
        /// <param name="cSpellApi"> CSpell Api object handed through to GetCorrectTerm </param>
        private static void Test1To1(CSpellApi cSpellApi)
        {
            // sample sentence (51.txt); other variants that were tried:
            //   "You'd thing that this is good."
            //   "The doctor thing that this is good."
            string sentence = "you would thing that is good.";

            // tokenize the sentence and build the non-space token list used as context
            List<TokenObj> allTokens  = new TextObj(sentence).GetTokenList();
            List<TokenObj> wordTokens = TextObj.GetNonSpaceTokenObjList(allTokens);

            int      targetIndex = 2;
            TokenObj target      = wordTokens[targetIndex];
            bool     debugFlag   = false;

            Console.WriteLine("====== Real-Word One-To-One Correction Test =====");
            Console.WriteLine("-- inTextList: [" + sentence + "]");
            Console.WriteLine("-- tarPos: [" + targetIndex + "]");
            Console.WriteLine("-- inTokenObj: [" + target.ToString() + "]");

            // run the correction for the selected token
            TokenObj corrected = GetCorrectTerm(target, cSpellApi, debugFlag, targetIndex, wordTokens);

            // report the outcome
            Console.WriteLine("--------- GetCorrectTermStr( ) -----------");
            Console.WriteLine("-- outTokenObj: [" + corrected.ToString() + "]");
        }
Beispiel #3
0
        /// <summary>
        /// Context-free convenience overload: forwards to the five-argument
        /// GetCorrectTerm with target position 0 and a null non-space token list.
        /// </summary>
        /// <param name="inTokenObj"> the input tokenObj (single word) </param>
        /// <param name="cSpellApi"> CSpell Api object </param>
        /// <param name="debugFlag"> flag for debug print </param>
        /// <returns> the TokenObj produced by the five-argument overload for these
        ///           arguments </returns>
        public static TokenObj GetCorrectTerm(TokenObj inTokenObj, CSpellApi cSpellApi, bool debugFlag)
        {
            // no context is supplied: position stays at 0 and there is no token list
            List<TokenObj> noContextTokens = null;
            int            noContextPos    = 0;

            return GetCorrectTerm(inTokenObj, cSpellApi, debugFlag, noContextPos, noContextTokens);
        }
        /// <summary>
        /// Looks for a real-word merge correction for the token at
        /// <paramref name="tarPos"/>:
        /// <list type="number">
        /// <item><description>detect whether the target word qualifies for a real-word merge</description></item>
        /// <item><description>collect merge candidates</description></item>
        /// <item><description>rank the candidates and take the top one</description></item>
        /// </list>
        /// Detection is only attempted for tokens whose process history is empty.
        /// </summary>
        /// <param name="tarPos"> the position of the target tokenObj in
        ///     <paramref name="nonSpaceTokenList"/> </param>
        /// <param name="nonSpaceTokenList"> token list without space tokens </param>
        /// <param name="cSpellApi"> CSpell Api object; its detect counter is updated
        ///     when the detector fires </param>
        /// <param name="debugFlag"> boolean flag for debug print </param>
        /// <returns> the top-ranked MergeObj returned by the ranker, or null when the
        ///     token already has process history or the detector does not fire </returns>
        public static MergeObj GetCorrectTerm(int tarPos, List<TokenObj> nonSpaceTokenList, CSpellApi cSpellApi, bool debugFlag)
        {
            TokenObj target = nonSpaceTokenList[tarPos];
            // the raw token string is used on purpose (no core-term stripping) to keep
            // real-word merge less aggressive
            string targetWord = target.GetTokenStr();

            // tokens that were already processed are left alone
            if (target.GetProcHist().Count != 0)
            {
                return null;
            }
            // nothing to do unless the detector flags the word
            if (!RealWordMergeDetector.IsDetect(targetWord, cSpellApi, debugFlag))
            {
                return null;
            }

            cSpellApi.UpdateDetectNo();
            // TBD, should take care of possessive xxx's here
            HashSet<MergeObj> candidates = RealWordMergeCandidates.GetCandidates(tarPos, nonSpaceTokenList, cSpellApi);

            // ranking picks the top candidate (may itself be null = no merge)
            return RankRealWordMergeByMode.GetTopRankMergeObj(candidates, cSpellApi, debugFlag, tarPos, nonSpaceTokenList);
        }
        /// <summary>
        /// Appends a (possibly corrected) token to <paramref name="inList"/>, choosing
        /// one of three operations based on the token string:
        /// <list type="number">
        /// <item><description>null or empty string: nothing is added</description></item>
        /// <item><description>not a multiword: delegated to Add1To1Correction</description></item>
        /// <item><description>multiword: delegated to AddSplitCorrection</description></item>
        /// </list>
        /// </summary>
        /// <param name="inList"> the list that receives the token(s) </param>
        /// <param name="inToken"> the token to add </param>
        public static void AddSplit1To1Correction(List<TokenObj> inList, TokenObj inToken)
        {
            string text = inToken.GetTokenStr();

            // 1. an empty token contributes nothing to the list
            if (string.IsNullOrEmpty(text))
            {
                return;
            }

            if (TermUtil.IsMultiword(text))
            {
                // 3. multiword string: split into several tokens
                AddSplitCorrection(inList, inToken);
            }
            else
            {
                // 2. single word: 1-to-1 correction
                Add1To1Correction(inList, inToken);
            }
        }
Beispiel #6
0
        /// <summary>
        /// String convenience wrapper: wraps <paramref name="inWord"/> in a TokenObj,
        /// runs GetCorrectTerm on it and returns the resulting token string.
        /// </summary>
        /// <param name="inWord"> the word to correct </param>
        /// <param name="cSpellApi"> CSpell Api object </param>
        /// <returns> the token string of the TokenObj returned by GetCorrectTerm </returns>
        private static string GetCorrectTermStr(string inWord, CSpellApi cSpellApi)
        {
            TokenObj corrected = GetCorrectTerm(new TokenObj(inWord), cSpellApi);

            return corrected.GetTokenStr();
        }
Beispiel #7
0
        /// <summary>
        /// Real-word 1-to-1 correction of a single token:
        /// <list type="number">
        /// <item><description>convert the token string to its core term</description></item>
        /// <item><description>detect whether it is a real-word correction target</description></item>
        /// <item><description>collect 1-to-1 candidates</description></item>
        /// <item><description>rank the candidates (context) and take the top one</description></item>
        /// <item><description>update counters and the output token</description></item>
        /// </list>
        /// Detection is only attempted for tokens whose process history is empty.
        /// </summary>
        /// <param name="inTokenObj"> the input tokenObj (single word) </param>
        /// <param name="cSpellApi"> CSpell Api object </param>
        /// <param name="debugFlag"> flag for debug print </param>
        /// <param name="tarPos"> the position of the target token </param>
        /// <param name="nonSpaceTokenList"> token list without space token(s) </param>
        /// <returns> a new TokenObj built from the input; it carries the corrected word
        ///     and a HIST_RW_1 history entry when a correction differing from the input
        ///     (ignoring case) was found, otherwise it is unchanged </returns>
        public static TokenObj GetCorrectTerm(TokenObj inTokenObj, CSpellApi cSpellApi, bool debugFlag, int tarPos, List<TokenObj> nonSpaceTokenList)
        {
            // value is not used below; the call is retained as-is
            int funcMode = cSpellApi.GetFuncMode();

            string   original = inTokenObj.GetTokenStr();
            TokenObj result   = new TokenObj(inTokenObj);

            // 1. core term = word without leading/ending space, punctuation, digit
            int         coreTermType = CoreTermUtil.CT_TYPE_SPACE_PUNC_DIGIT;
            CoreTermObj coreTerm     = new CoreTermObj(original, coreTermType);
            string      core         = coreTerm.GetCoreTerm();

            // 2. only unprocessed tokens that the detector flags are corrected
            if (inTokenObj.GetProcHist().Count != 0)
            {
                return result;
            }
            if (!RealWord1To1Detector.IsDetect(original, core, cSpellApi, debugFlag))
            {
                return result;
            }

            cSpellApi.UpdateDetectNo();
            // TBD, should take care of possessive xxx's here
            // 3. 1-to-1 candidates
            // TBD: realWordFlag to use metaphone; this step is very slow (7 min.)
            HashSet<string> candidates = RealWord1To1Candidates.GetCandidates(core, cSpellApi);

            // 4. ranking with context: top candidate becomes the corrected core term
            string best = RankRealWord1To1ByCSpell.GetTopRankStr(core, candidates, cSpellApi, tarPos, nonSpaceTokenList, debugFlag);

            // 5. put the corrected core term back into the full word
            coreTerm.SetCoreTerm(best);
            string corrected = coreTerm.ToString();

            // 6. record the correction only when the word really changed
            if (!original.Equals(corrected, StringComparison.OrdinalIgnoreCase))
            {
                cSpellApi.UpdateCorrectNo();
                result.SetTokenStr(corrected);
                result.AddProcToHist(TokenObj.HIST_RW_1);   // 1-to-1
                DebugPrint.PrintCorrect("RW", "RealWord1To1Corrector", original, corrected, debugFlag);
            }
            return result;
        }
        /// <summary>
        /// Builds a new token list in which every index range described by a MergeObj
        /// is replaced by one merged token; all other tokens are carried over as-is.
        /// The merge list is first passed through CleanUpMergeObjList and is expected
        /// to be in the same index order as <paramref name="inTokenList"/>.
        /// </summary>
        /// <remarks> TBD: has bug: "imple ment ation" => implementimplementation </remarks>
        /// <param name="inTokenList"> the original token list </param>
        /// <param name="mergeObjList"> merge corrections to apply </param>
        /// <param name="procHistStr"> history tag recorded for the merge focus token </param>
        /// <param name="debugFlag"> flag for debug print </param>
        /// <param name="cSpellApi"> CSpell Api object; its correct counter is updated
        ///     for each merge focus token </param>
        /// <returns> the token list with merges applied </returns>
        public static List<TokenObj> CorrectTokenListByMerge(List<TokenObj> inTokenList, List<MergeObj> mergeObjList, string procHistStr, bool debugFlag, CSpellApi cSpellApi)
        {
            // 0. unify the merge list (remove contained / overlapping merges)
            List<MergeObj> cleanedMerges = CleanUpMergeObjList(mergeObjList);
            List<TokenObj> result        = new List<TokenObj>();

            // index of the first input token not yet written to the result
            int nextUnread = 0;

            foreach (MergeObj merge in cleanedMerges)
            {
                int first = merge.GetStartIndex();
                int last  = merge.GetEndIndex();

                // 1. carry over the tokens in front of this merge
                while (nextUnread < first)
                {
                    result.Add(inTokenList[nextUnread]);
                    nextUnread++;
                }

                // 2. build the merged token
                string   merged      = merge.GetMergeWord();
                string   orgSpan     = merge.GetOrgMergeWord();
                string   focusWord   = merge.GetTarWord();
                TokenObj mergedToken = new TokenObj(orgSpan, merged);

                // collect process history from every token inside the merged range
                for (int pos = first; pos <= last; pos++)
                {
                    if (pos != merge.GetTarIndex())
                    {
                        // context token: inherit its history, tagged with its own text
                        TokenObj context = inTokenList[pos];
                        foreach (string entry in context.GetProcHist())
                        {
                            mergedToken.AddProcToHist(entry + TokenObj.MERGE_START_STR + context.GetTokenStr() + TokenObj.MERGE_END_STR);
                        }
                        continue;
                    }

                    // merge focus token: count the correction and record the merge
                    cSpellApi.UpdateCorrectNo();
                    mergedToken.AddProcToHist(procHistStr + TokenObj.MERGE_START_STR + focusWord + TokenObj.MERGE_END_STR);
                    DebugPrint.PrintCorrect(procHistStr, "MergeCorrector (" + focusWord + ")", orgSpan, merged, debugFlag);
                }

                result.Add(mergedToken);
                nextUnread = last + 1;
            }

            // 3. carry over whatever follows the last merge
            for (int pos = nextUnread; pos < inTokenList.Count; pos++)
            {
                result.Add(inTokenList[pos]);
            }
            return result;
        }
        /// <summary>
        /// Splits the token string of <paramref name="inToken"/> with
        /// TextObj.patternStrSpace_ and appends one new TokenObj per piece (each built
        /// from <paramref name="inToken"/> and the piece) to <paramref name="inList"/>.
        /// </summary>
        /// <param name="inList"> the list that receives the split tokens </param>
        /// <param name="inToken"> the token whose string is split </param>
        private static void AddSplitCorrection(List<TokenObj> inList, TokenObj inToken)
        {
            string[] pieces = inToken.GetTokenStr().Split(TextObj.patternStrSpace_, true);

            // materialize all piece tokens first, then append them in one go
            List<TokenObj> pieceTokens = pieces.Select(piece => new TokenObj(inToken, piece)).ToList();

            inList.AddRange(pieceTokens);
        }
        /// <summary>
        /// Real-word merge step: walks the token list, asks
        /// RealWordMergeCorrector.GetCorrectTerm for a merge at every legit token and
        /// finally applies all collected merges via
        /// MergeCorrector.CorrectTokenListByMerge.
        /// </summary>
        /// <param name="inTokenList"> the whole text as a token list; its index
        ///     positions are updated through TextObj.UpdateIndexPos </param>
        /// <param name="cSpellApi"> CSpell Api object </param>
        /// <param name="debugFlag"> flag for debug print </param>
        /// <returns> the token list with real-word merges applied </returns>
        public static List<TokenObj> Process(List<TokenObj> inTokenList, CSpellApi cSpellApi, bool debugFlag)
        {
            DebugPrint.PrintProcess("5. RealWord-Merge", debugFlag);
            DebugPrint.PrintInText(TextObj.TokenListToText(inTokenList), debugFlag);

            // pre-process: refresh positions, then build the non-space token list
            TextObj.UpdateIndexPos(inTokenList);
            List<TokenObj> nonSpaceTokenList = TextObj.GetNonSpaceTokenObjList(inTokenList);

            List<MergeObj> merges              = new List<MergeObj>();
            int            maxLegitTokenLength = cSpellApi.GetMaxLegitTokenLength();

            // detection pass: collect merges without modifying the list yet
            for (int cursor = 0; cursor < inTokenList.Count;)
            {
                TokenObj current = inTokenList[cursor];
                MergeObj merge   = null;

                // SCR-3, use legit token; other tokens never produce a merge
                if (current.IsLegitToken(maxLegitTokenLength))
                {
                    // the top-ranked candidate is the correction
                    merge = RealWordMergeCorrector.GetCorrectTerm(current.GetPos(), nonSpaceTokenList, cSpellApi, debugFlag);
                }

                if (merge == null)
                {
                    // no merge here: move to the next token
                    cursor++;
                    continue;
                }

                merges.Add(merge);
                // resume after the merge's end token, this ensures no overlap merge
                cursor = merge.GetEndIndex() + 1;
            }

            // the list is rewritten only after the scan because a merge may involve a
            // token in front of the one that triggered it
            return MergeCorrector.CorrectTokenListByMerge(inTokenList, merges, TokenObj.HIST_RW_M, debugFlag, cSpellApi);
        }
Beispiel #11
0
        /// <summary>
        /// Builds a token string: fills a TokenObj with the given values, the current
        /// local time and a random 32-hex-digit string, serializes it to JSON and
        /// encrypts the JSON with DES.EncryptDES.
        /// </summary>
        /// <param name="uid"> value stored in the token's uid member </param>
        /// <param name="psw"> value stored in the token's psw member </param>
        /// <param name="type"> value stored in the token's type member </param>
        /// <returns> the encrypted token string </returns>
        public static string MakeToken(string uid, string psw, int type)
        {
            // NOTE(review): psw is embedded in the token payload - confirm this is intended
            TokenObj payload = new TokenObj
            {
                uid       = uid,
                psw       = psw,
                type      = type,
                timestamp = DateTime.Now.ToString("yyyy-MM-dd HH:mm:ss"),
                // "N" format = hex digits without hyphens
                randomstr = Guid.NewGuid().ToString("N")
            };

            return DES.EncryptDES(JsonConvert.SerializeObject(payload));
        }
        /// <summary>
        /// Token-level wrapper around the string overload of Process.
        /// </summary>
        /// <param name="inTokenObj"> the input token </param>
        /// <returns> a new TokenObj built from the input; when processing changed the
        ///     string it carries the new string and a HIST_ND_S_E_P history entry </returns>
        public static TokenObj Process(TokenObj inTokenObj)
        {
            string   before = inTokenObj.GetTokenStr();
            string   after  = Process(before);
            TokenObj result = new TokenObj(inTokenObj);

            // record the operation only when the string actually changed
            if (!before.Equals(after))
            {
                result.SetTokenStr(after);
                result.AddProcToHist(TokenObj.HIST_ND_S_E_P);
            }
            return result;
        }
Beispiel #13
0
        /// <summary>
        /// Token-level wrapper around ProcessWord with the informal expression map.
        /// </summary>
        /// <param name="inTokenObj"> the input token </param>
        /// <param name="informalExpMap"> map handed to ProcessWord </param>
        /// <param name="debugFlag"> flag for debug print </param>
        /// <returns> a new TokenObj built from the input; when processing changed the
        ///     string it carries the new string and a HIST_ND_INFORMAL_EXP history
        ///     entry </returns>
        public static TokenObj Process(TokenObj inTokenObj, Dictionary<string, string> informalExpMap, bool debugFlag)
        {
            string   before = inTokenObj.GetTokenStr();
            string   after  = ProcessWord(before, informalExpMap);
            TokenObj result = new TokenObj(inTokenObj);

            // record the operation only when the string actually changed
            if (!before.Equals(after))
            {
                result.SetTokenStr(after);
                result.AddProcToHist(TokenObj.HIST_ND_INFORMAL_EXP);
                DebugPrint.PrintCorrect("ND", "InformalExpHandler", before, after, debugFlag);
            }
            return result;
        }
        /// <summary>
        /// Token-level wrapper around the (string, int) overload of Process.
        /// </summary>
        /// <param name="inTokenObj"> the input token </param>
        /// <param name="maxProcess"> limit handed to the string overload </param>
        /// <param name="debugFlag"> flag for debug print </param>
        /// <returns> a new TokenObj built from the input; when processing changed the
        ///     string it carries the new string and a HIST_ND_S_E_P history entry </returns>
        public static TokenObj Process(TokenObj inTokenObj, int maxProcess, bool debugFlag)
        {
            string   before = inTokenObj.GetTokenStr();
            string   after  = Process(before, maxProcess);
            TokenObj result = new TokenObj(inTokenObj);

            // record the operation only when the string actually changed
            if (!before.Equals(after))
            {
                result.SetTokenStr(after);
                result.AddProcToHist(TokenObj.HIST_ND_S_E_P);
                DebugPrint.PrintCorrect("ND", "EndingPuncSplitter", before, after, debugFlag);
            }
            return result;
        }
Beispiel #15
0
        /// <summary>
        /// Ad-hoc console check: runs NonWordCorrector.GetCorrectTerm on a fixed
        /// lower-case word and prints the input and the resulting token string.
        /// </summary>
        /// <param name="cSpellApi"> CSpell Api object handed to the corrector </param>
        private static void TestGetCorrectTerm(CSpellApi cSpellApi)
        {
            // test input, all lower case
            string word = "hotflashes";

            // run the corrector and pull out the resulting string
            TokenObj corrected     = NonWordCorrector.GetCorrectTerm(new TokenObj(word), cSpellApi);
            string   correctedWord = corrected.GetTokenStr();

            // report
            Console.WriteLine("--------- GetCorrectTerm( ) -----------");
            Console.WriteLine("In: [" + word + "]");
            Console.WriteLine("Out: [" + correctedWord + "]");
        }
        /// <summary>
        /// Token-level wrapper around ProcessWord (XML/HTML handling).
        /// </summary>
        /// <param name="inTokenObj"> the input token </param>
        /// <param name="debugFlag"> flag for debug print </param>
        /// <returns> a new TokenObj built from the input; when processing changed the
        ///     string it carries the new string and a HIST_ND_XML_HTML history entry </returns>
        public static TokenObj Process(TokenObj inTokenObj, bool debugFlag)
        {
            string   before = inTokenObj.GetTokenStr();
            string   after  = ProcessWord(before);
            TokenObj result = new TokenObj(inTokenObj);

            // record the operation only when the string actually changed
            if (!before.Equals(after))
            {
                result.SetTokenStr(after);
                result.AddProcToHist(TokenObj.HIST_ND_XML_HTML);
                DebugPrint.PrintCorrect("ND", "XmlHtmlHandler", before, after, debugFlag);
            }
            return result;
        }
Beispiel #17
0
 /// <summary>
 /// Checks the supplied credentials against the "Users" collection of the LiteDB
 /// file "mydb.db" in the application base directory and, on success, returns a
 /// token string that expires 20 seconds from now.
 /// </summary>
 /// <param name="user"> dynamic payload; its "user" and "password" members are read </param>
 /// <returns> JSON object with a "token" member; the token is the empty string when
 ///     the payload is null, the user is unknown or the password does not validate </returns>
 public JsonResult<object> Login(dynamic user)
 {
     // a missing payload is treated as a failed login instead of failing on the
     // first member access
     if (user == null)
     {
         return Json<object>(new { token = "" });
     }

     // Path.Combine inserts the correct separator whether or not BaseDirectory
     // already ends with one (and avoids a hard-coded backslash)
     string dbPath = System.IO.Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "mydb.db");

     using (var db = new LiteDB.LiteDatabase(dbPath))
     {
         string password = user.password;
         string userid   = user.user;
         var    users    = db.GetCollection<User>("Users");
         var    first    = users.Find(o => o.name == userid).FirstOrDefault();

         // the user must exist and the password must match the stored hash
         if (first != null && Actions.IsValide(password, first.pwdhash))
         {
             var to = new TokenObj
             {
                 userid  = userid,
                 Expires = DateTime.Now.AddSeconds(20)
             };
             return Json<object>(new { token = to.ToTokenString() });
         }
     }
     return Json<object>(new { token = "" });
 }
Beispiel #18
0
        /// <summary>
        /// Non-word split and 1-to-1 correction step over the whole text: every legit
        /// token is passed to NonWordCorrector.GetCorrectTerm (with its position in the
        /// non-space token list as context), all other tokens are kept as they are, and
        /// each result is appended through Split1To1Corrector.AddSplit1To1Correction.
        /// </summary>
        /// <param name="inTokenList"> the whole text as a token list </param>
        /// <param name="cSpellApi"> CSpell Api object </param>
        /// <param name="debugFlag"> flag for debug print </param>
        /// <returns> the output token list after correction </returns>
        public static List<TokenObj> Process(List<TokenObj> inTokenList, CSpellApi cSpellApi, bool debugFlag)
        {
            DebugPrint.PrintProcess("3-4. NonWord-Split & 1To1", debugFlag);
            DebugPrint.PrintInText(TextObj.TokenListToText(inTokenList), debugFlag);

            List<TokenObj> corrected = new List<TokenObj>();

            // counts the legit tokens seen so far; passed to the corrector as the
            // target position for the context module
            int wordPos = 0;

            // context list without space tokens; iteration itself still uses
            // inTokenList so the original space tokens are kept
            List<TokenObj> nonSpaceTokenList   = TextObj.GetNonSpaceTokenObjList(inTokenList);
            int            maxLegitTokenLength = cSpellApi.GetMaxLegitTokenLength();

            foreach (TokenObj token in inTokenList)
            {
                // tokens that are not legit (SCR-3) pass through untouched
                TokenObj outToken = token;

                if (token.IsLegitToken(maxLegitTokenLength))
                {
                    // the highest ranked candidate is the correction
                    outToken = NonWordCorrector.GetCorrectTerm(token, cSpellApi, debugFlag, wordPos, nonSpaceTokenList);
                    wordPos++;
                }

                // a correction may be a split, so the add step can append several tokens
                Split1To1Corrector.AddSplit1To1Correction(corrected, outToken);
            }
            return corrected;
        }
        /// <summary>
        /// Real-word split correction of a single token:
        /// <list type="number">
        /// <item><description>convert the token string to its core term</description></item>
        /// <item><description>detect whether it is a real-word split target</description></item>
        /// <item><description>collect split candidates</description></item>
        /// <item><description>rank the candidates (context) and take the top one</description></item>
        /// <item><description>update counters and the output token</description></item>
        /// </list>
        /// Detection is only attempted for tokens whose process history is empty.
        /// </summary>
        /// <param name="inTokenObj"> the input tokenObj (single word) </param>
        /// <param name="cSpellApi"> cSpell API object </param>
        /// <param name="debugFlag"> flag for debug print </param>
        /// <param name="tarPos"> position of the target token to be split </param>
        /// <param name="nonSpaceTokenList"> the token list without space tokens </param>
        /// <returns> a new TokenObj built from the input; it carries the split words
        ///     and a HIST_RW_S history entry when a result differing from the input was
        ///     found, otherwise it is unchanged </returns>
        public static TokenObj GetCorrectTerm(TokenObj inTokenObj, CSpellApi cSpellApi, bool debugFlag, int tarPos, List<TokenObj> nonSpaceTokenList)
        {
            // value is not used below; the call is retained as-is
            int funcMode = cSpellApi.GetFuncMode();

            string   original = inTokenObj.GetTokenStr();
            TokenObj result   = new TokenObj(inTokenObj);

            // 1. core term = word without leading/ending space, punctuation, digit
            int         coreTermType = CoreTermUtil.CT_TYPE_SPACE_PUNC_DIGIT;
            CoreTermObj coreTerm     = new CoreTermObj(original, coreTermType);
            string      core         = coreTerm.GetCoreTerm();

            // 2. real-word split detection: only unprocessed tokens that the detector
            //    flags are corrected
            if (inTokenObj.GetProcHist().Count != 0)
            {
                return result;
            }
            if (!RealWordSplitDetector.IsDetect(original, core, cSpellApi, debugFlag))
            {
                return result;
            }

            cSpellApi.UpdateDetectNo();
            // TBD, should take care of possessive xxx's here
            // 3. split candidates
            int             maxSplitNo = cSpellApi.GetCanRwMaxSplitNo();
            HashSet<string> candidates = RealWordSplitCandidates.GetCandidates(core, cSpellApi, maxSplitNo);

            // 4. ranking with context: top candidate becomes the corrected core term
            string best = RankRealWordSplitByMode.GetTopRankStr(core, candidates, cSpellApi, debugFlag, tarPos, nonSpaceTokenList);

            // 5. put the corrected core term back into the full word
            coreTerm.SetCoreTerm(best);
            string corrected = coreTerm.ToString();

            // 6. record the correction only when the word really changed
            if (!original.Equals(corrected))
            {
                cSpellApi.UpdateCorrectNo();
                result.SetTokenStr(corrected);
                result.AddProcToHist(TokenObj.HIST_RW_S);   // split
                DebugPrint.PrintCorrect("RW", "RealWordSplitCorrector", original, corrected, debugFlag);
            }
            return result;
        }
Beispiel #20
0
        /// <summary>
        /// Looks for a non-word merge correction for the token at
        /// <paramref name="tarPos"/>:
        /// <list type="number">
        /// <item><description>strip ending punctuation/space from the target word and lower-case it</description></item>
        /// <item><description>detect whether the word is a non-word merge target</description></item>
        /// <item><description>collect merge candidates</description></item>
        /// <item><description>rank the candidates and take the top one</description></item>
        /// </list>
        /// </summary>
        /// <param name="tarPos"> position of the target token in
        ///     <paramref name="nonSpaceTokenList"/> </param>
        /// <param name="nonSpaceTokenList"> token list without space token(s) </param>
        /// <param name="cSpellApi"> CSpell Api object; its detect counter is updated
        ///     when the detector fires </param>
        /// <param name="debugFlag"> flag for debug print </param>
        /// <returns> the top-ranked MergeObj returned by the ranker, or null when the
        ///     detector does not fire </returns>
        public static MergeObj GetCorrectTerm(int tarPos, List<TokenObj> nonSpaceTokenList, CSpellApi cSpellApi, bool debugFlag)
        {
            // get tarWord from tarTokenObj
            TokenObj tarTokenObj = nonSpaceTokenList[tarPos];
            string   tarWord     = tarTokenObj.GetTokenStr();

            // 1. only remove ending punctuation for coreTerm.
            // Lower-casing is culture-invariant so the key handed to the detector does
            // not depend on the machine's current culture (e.g. "I" must not become a
            // dotless i under a Turkish culture).
            string coreStr = TermUtil.StripEndPuncSpace(tarWord).ToLowerInvariant();

            // 2. non-word detection: no merge (null) unless the detector fires
            if (!NonWordMergeDetector.IsDetect(tarWord, coreStr, cSpellApi, debugFlag))
            {
                return null;
            }

            cSpellApi.UpdateDetectNo();
            // 3. get candidates from merge
            HashSet<MergeObj> mergeSet = NonWordMergeCandidates.GetCandidates(tarPos, nonSpaceTokenList, cSpellApi);

            // 4. ranking: the top ranked candidate is the correction
            return RankNonWordMergeByMode.GetTopRankMergeObj(mergeSet, cSpellApi, tarPos, nonSpaceTokenList, debugFlag);
        }
Beispiel #21
0
        /// <summary>
        /// Decrypts and validates a token string.
        /// The token is accepted when it matches the token cached for its uid, or
        /// when its uid/psw pair is found in the [guser] table (in which case the
        /// cache entry for that uid is refreshed with this token).
        /// </summary>
        /// <param name="Token"> the encrypted token string </param>
        /// <param name="code"> receives a MessageCode value describing the outcome:
        /// SUCCESS, ERROR_TOKEN_VALIDATE, ERROR_TOKEN_TIMEOUT or ERROR_EXECUTE_SQL </param>
        /// <returns> the decoded TokenObj on success; null on any failure </returns>
        public static TokenObj CheckToken(string Token, out int code)
        {
            code = MessageCode.UNKONWN;
            int      token_last_day = 99999;   // maximum accepted token age, in whole days
            TokenObj tokenObj       = null;

            try
            {
                string token = DES.DecryptDES(Token);
                tokenObj = JsonConvert.DeserializeObject <TokenObj>(token);
            }
            catch (Exception)
            {
                code = MessageCode.ERROR_TOKEN_VALIDATE;
                return(null);
            }

            // Deserialization can succeed yet yield no object; treat that as an invalid token.
            if (tokenObj == null)
            {
                code = MessageCode.ERROR_TOKEN_VALIDATE;
                return(null);
            }

            // Check whether the token has expired.
            // A missing or malformed timestamp makes the token invalid instead of throwing.
            DateTime token_date;
            if (!DateTime.TryParse(tokenObj.timestamp, out token_date))
            {
                code = MessageCode.ERROR_TOKEN_VALIDATE;
                return(null);
            }

            TimeSpan sp = DateTime.Now - token_date;
            if (sp.Days > token_last_day)
            {
                // Stop here: an expired token must not fall through and end up as SUCCESS.
                code = MessageCode.ERROR_TOKEN_TIMEOUT;
                return(null);
            }

            // Check whether the token is valid.
            // Fast path: the exact same token string is already cached for this uid.
            string oldToken;
            if (tokenCache.TryGetValue(tokenObj.uid, out oldToken) && oldToken == Token)
            {
                code = MessageCode.SUCCESS;
                return(tokenObj);
            }

            // Slow path: look the uid/psw pair up in the database.
            SGoodsDB db = new SGoodsDB();
            DataSet  ds;
            try
            {
                ds = db.ExeQuery("select uid,psw from [guser] where uid=@uid and psw=@psw",
                                 new SqlParameter("uid", tokenObj.uid),
                                 new SqlParameter("psw", tokenObj.psw));
            }
            finally
            {
                // Close the connection even when the query throws.
                db.Close();
            }

            if (ds == null)
            {
                code = MessageCode.ERROR_EXECUTE_SQL;
                return(null);
            }
            if (ds.Tables.Count == 0 || ds.Tables[0].Rows.Count == 0)
            {
                code = MessageCode.ERROR_TOKEN_VALIDATE;
                return(null);
            }

            // Remember the token so the next check for this uid can skip the database.
            tokenCache[tokenObj.uid] = Token;

            code = MessageCode.SUCCESS;
            return(tokenObj);
        }
 // private method
 /// <summary>
 /// Appends the given token to the end of the given list.
 /// </summary>
 /// <param name="inList"> the list that receives the token </param>
 /// <param name="inToken"> the token to append </param>
 private static void Add1To1Correction(List <TokenObj> inList, TokenObj inToken) => inList.Add(inToken);
Beispiel #23
0
        /// <summary>
        /// This method uses context scores to find the correct term.
        /// When the core term of the input token is detected as a non-word,
        /// 1-to-1 and/or split candidates are gathered (depending on the function
        /// mode of cSpellApi), ranked with the target position and token list as
        /// context, and the top-ranked string replaces the core term.
        /// </summary>
        /// <param name="inTokenObj">    the input tokenObj (single word) </param>
        /// <param name="cSpellApi"> CSpell Api object </param>
        /// <param name="debugFlag"> flag for debug print </param>
        /// <param name="tarPos"> position for target token </param>
        /// <param name="nonSpaceTokenList"> token list without space token(s)
        /// </param>
        /// <returns>    a copy of the input token; its token string and process
        ///             history are updated when a correction different from the
        ///             input word is produced, otherwise the copy is unchanged. </returns>
        public static TokenObj GetCorrectTerm(TokenObj inTokenObj, CSpellApi cSpellApi, bool debugFlag, int tarPos, List <TokenObj> nonSpaceTokenList)
        {
            // init
            int funcMode = cSpellApi.GetFuncMode();

            // get inWord from inTokenObj and init outTokenObj (a copy, so the input is not modified here)
            string   inWord      = inTokenObj.GetTokenStr();
            TokenObj outTokenObj = new TokenObj(inTokenObj);
            // 1. convert a word to coreTerm (no leading/ending space, punc, digit)
            int         ctType      = CoreTermUtil.CT_TYPE_SPACE_PUNC_DIGIT;
            CoreTermObj coreTermObj = new CoreTermObj(inWord, ctType);
            string      coreStr     = coreTermObj.GetCoreTerm();

            // 2. non-word detection and correction
            // TBD .. need to separate 1-to-1 and split
            if (NonWordDetector.IsDetect(inWord, coreStr, cSpellApi, debugFlag))
            {
                cSpellApi.UpdateDetectNo();
                // TBD, should take care of possessive xxx's here
                // 3.1 get 1-to-1 candidates set from correction, no split
                HashSet <string> candSet = NonWord1To1Candidates.GetCandidates(coreStr, cSpellApi);
                if (funcMode != CSpellApi.FUNC_MODE_NW_1)
                {
                    // 3.2 get candidates from split
                    int maxSplitNo            = cSpellApi.GetCanNwMaxSplitNo();
                    HashSet <string> splitSet = NonWordSplitCandidates.GetCandidates(coreStr, cSpellApi, maxSplitNo);
                    if (funcMode == CSpellApi.FUNC_MODE_NW_S)
                    {
                        // 3.3 split-only mode: the split candidates replace the 1-to-1 candidates
                        candSet = new HashSet <string>(splitSet);
                    }
                    else
                    {
                        // 3.4 otherwise add the split candidates to the 1-to-1 candidates.
                        // UnionWith is the HashSet<T> set-union; HashSet<T> has no addAll member.
                        candSet.UnionWith(splitSet);
                    }
                }
                // 4. Ranking: get top ranked candidate as corrected term, using context
                string topRankStr = RankNonWordByMode.GetTopRankStr(coreStr, candSet, cSpellApi, debugFlag, tarPos, nonSpaceTokenList);
                // 5. update coreTerm and convert back to the full word
                coreTermObj.SetCoreTerm(topRankStr);
                string outWord = coreTermObj.ToString();
                // 6. update info only if the word actually changed
                if (!inWord.Equals(outWord))
                {
                    outTokenObj.SetTokenStr(outWord);
                    // a multiword result is recorded as a split, anything else as a 1-to-1 correction
                    bool isSplit = TermUtil.IsMultiword(outWord);
                    cSpellApi.UpdateCorrectNo();
                    outTokenObj.AddProcToHist(isSplit ? TokenObj.HIST_NW_S : TokenObj.HIST_NW_1);
                    DebugPrint.PrintCorrect("NW", isSplit ? "NonWordCorrector-Split" : "NonWordCorrector-1To1", inWord, outWord, debugFlag);
                }
            }
            return(outTokenObj);
        }
Beispiel #24
0
        // public method
        /// <summary>
        /// The core method to correct a word. Its documented steps are:
        /// <list type="bullet">
        /// <item>Convert inToken to coreTerm</item>
        /// <item>Detect if misspelled (OOV) - non-word</item>
        /// <item>Get candidates from 1To1 and from split</item>
        /// <item>Rank candidates</item>
        /// <item>Update information</item>
        /// </list>
        /// This method does not use context scores.
        /// It is a convenience overload: the work is delegated to the overload
        /// that also takes a debug flag, with debug printing switched off.
        /// </summary>
        /// <param name="inTokenObj">    the input tokenObj (single word) </param>
        /// <param name="cSpellApi"> CSpell Api object </param>
        /// <returns>    the corrected word in tokenObj if the coreTerm is OOV
        ///             and suggested word found. Otherwise, the original input token
        ///             is returned. </returns>
        public static TokenObj GetCorrectTerm(TokenObj inTokenObj, CSpellApi cSpellApi)
        {
            // debug printing is off for this overload
            return GetCorrectTerm(inTokenObj, cSpellApi, false);
        }
        /// <summary>
        /// Recursively process a token (per the original note on this method).
        /// Convenience overload: forwards to the overload that also takes a debug
        /// flag, with debug printing switched off.
        /// </summary>
        /// <param name="inTokenObj"> the input tokenObj </param>
        /// <param name="maxProcess"> passed through unchanged; presumably the cap on
        /// the number of recursive passes - confirm against the three-argument overload </param>
        /// <returns> whatever the three-argument overload returns </returns>
        public static TokenObj Process(TokenObj inTokenObj, int maxProcess)
        {
            return Process(inTokenObj, maxProcess, false);
        }
Beispiel #26
0
        // protected method
        /// <summary>
        /// Builds merge candidates for the target token using a fixed window of
        /// (mergeNo + 1) consecutive tokens. The window is shifted so that it starts
        /// at every position from max(0, tarPos - mergeNo) up to tarPos.
        /// A window yields no candidate when:
        /// <list type="bullet">
        /// <item>it runs past the end of the token list;</item>
        /// <item>any token in it is a digit, punctuation, digit-punctuation, URL or e-mail token;</item>
        /// <item>the original (space-joined) text of the window is a word in mwDic;</item>
        /// <item>shortWordMerge is false and the window holds more than MAX_SHORT_WORD_NO short words.</item>
        /// </list>
        /// Otherwise the directly concatenated form is passed to AddMergeObj and,
        /// when mergeWithHyphen is set, the hyphen-joined form as well.
        /// </summary>
        /// <param name="tarPos"> position of the target token in nonSpaceTextList </param>
        /// <param name="nonSpaceTextList"> token list without space token(s) </param>
        /// <param name="mergeNo"> number of merges; the window holds mergeNo + 1 tokens </param>
        /// <param name="mergeWithHyphen"> also propose the hyphen-joined form </param>
        /// <param name="shortWordMerge"> when true, the short-word limit is not applied </param>
        /// <param name="suggestDic"> forwarded to AddMergeObj </param>
        /// <param name="aADic"> forwarded to AddMergeObj </param>
        /// <param name="mwDic"> dictionary used to reject windows whose original text is already a multiword </param>
        /// <returns> the set of merge objects collected by AddMergeObj (possibly empty) </returns>
        protected internal static HashSet <MergeObj> GetMergeSetByMergeNo(int tarPos, List <TokenObj> nonSpaceTextList, int mergeNo, bool mergeWithHyphen, bool shortWordMerge, RootDictionary suggestDic, RootDictionary aADic, RootDictionary mwDic)
        {
            HashSet <MergeObj> mergeSet = new HashSet <MergeObj>();

            // first window start, clamped to the beginning of the list
            int firstWindowStart = tarPos - mergeNo;
            if (firstWindowStart <= 0)
            {
                firstWindowStart = 0;
            }
            int tokenCount = nonSpaceTextList.Count;

            // facts about the target token, shared by every window
            int    tarIndex = nonSpaceTextList[tarPos].GetIndex();
            string tarWord  = nonSpaceTextList[tarPos].GetTokenStr();
            int    endIndex = 0;

            // slide the window: one iteration per possible start position
            for (int windowStart = firstWindowStart; windowStart <= tarPos; windowStart++)
            {
                int startIndex = nonSpaceTextList[windowStart].GetIndex();
                int windowEnd  = windowStart + mergeNo;

                string joinedWord     = "";   // tokens concatenated with nothing in between
                string hyphenatedWord = "";   // tokens joined with the hyphen string
                string originalText   = "";   // tokens joined with the space string (text before merge)
                int    shortWordNo    = 0;
                bool   windowComplete = true;

                for (int offset = 0; offset <= mergeNo; offset++)
                {
                    int curPos = windowStart + offset;
                    // ran off the end of the list: this window cannot be completed
                    if (curPos >= tokenCount)
                    {
                        windowComplete = false;
                        break;
                    }

                    TokenObj curTokenObj = nonSpaceTextList[curPos];
                    string   tokenStr    = curTokenObj.GetTokenStr();

                    // tokens of these kinds are never merged; give up on this window
                    bool isMergeException = DigitPuncTokenUtil.IsDigit(tokenStr)
                                            || DigitPuncTokenUtil.IsPunc(tokenStr)
                                            || DigitPuncTokenUtil.IsDigitPunc(tokenStr)
                                            || InternetTokenUtil.IsUrl(tokenStr)
                                            || InternetTokenUtil.IsEmail(tokenStr);
                    if (isMergeException)
                    {
                        windowComplete = false;
                        break;
                    }

                    if (offset == 0)
                    {
                        // first token of the window: no separator in front of it
                        joinedWord     = tokenStr;
                        hyphenatedWord = tokenStr;
                        originalText   = tokenStr;
                    }
                    else
                    {
                        joinedWord     += tokenStr;
                        hyphenatedWord += GlobalVars.HYPHEN_STR + tokenStr;
                        originalText   += GlobalVars.SPACE_STR + tokenStr;
                    }
                    shortWordNo = UpdateShortWordNo(tokenStr, SHORT_WORD_LENGTH, shortWordNo);
                    endIndex    = curTokenObj.GetIndex();
                }

                // the fixed-size window must have been filled completely
                if (!windowComplete)
                {
                    continue;
                }
                // the original text must not already be a multiword, such as "non clinical"
                if (mwDic.IsDicWord(originalText))
                {
                    continue;
                }
                // short-word limit, waived when shortWordMerge is set
                if (!shortWordMerge && shortWordNo > MAX_SHORT_WORD_NO)
                {
                    continue;
                }

                AddMergeObj(tarWord, originalText, joinedWord, mergeNo, startIndex, tarIndex, endIndex, windowStart, tarPos, windowEnd, mergeSet, suggestDic, aADic);
                if (mergeWithHyphen)
                {
                    AddMergeObj(tarWord, originalText, hyphenatedWord, mergeNo, startIndex, tarIndex, endIndex, windowStart, tarPos, windowEnd, mergeSet, suggestDic, aADic);
                }
            }
            return mergeSet;
        }
Beispiel #27
0
        // public method
        /// <summary>
        /// A method to process mapping from informal expression to corrected word.
        /// The lowercase of inWord is used as key for the mapping.
        /// Convenience overload: forwards to the overload that also takes a debug
        /// flag, with debug printing switched off.
        /// </summary>
        /// <param name="inTokenObj">    the input tokenObj (single word) </param>
        /// <param name="informalExpMap"> the map of informal expression </param>
        /// <returns>    the mapped corrected word (lowercase only) if mapping found,
        ///             otherwise, the original input token is returned. </returns>
        public static TokenObj Process(TokenObj inTokenObj, Dictionary <string, string> informalExpMap)
        {
            return Process(inTokenObj, informalExpMap, false);
        }