/// <summary>
/// Ad-hoc console driver for the split correction: tokenizes a fixed sample
/// sentence, runs the context-aware <c>GetCorrectTerm</c> on one target
/// token and prints the input and the result.
/// </summary>
/// <param name="cSpellApi"> CSpell Api object handed to GetCorrectTerm </param>
private static void TestSplit(CSpellApi cSpellApi)
{
    // setup test case (13864.txt).
    // Alternative case (10349.txt), with tarPos = 7:
    //   "sounding in my ear every time for along time."
    string inText = "I donate my self to be apart of this study.";
    TextObj textObj = new TextObj(inText);
    List<TokenObj> inTextList = textObj.GetTokenList();
    List<TokenObj> nonSpaceTokenList = TextObj.GetNonSpaceTokenObjList(inTextList);

    // index into the non-space token list; presumably the token "apart"
    // (depends on how TextObj tokenizes) — verify when changing inText
    int tarPos = 6;
    TokenObj inTokenObj = nonSpaceTokenList[tarPos];
    bool debugFlag = false;

    // The banner used to read "One-To-One", a copy-paste leftover; this
    // driver exercises the split correction.
    Console.WriteLine("====== Real-Word Split Correction Test =====");
    Console.WriteLine("-- inTextList: [" + inText + "]");
    Console.WriteLine("-- tarPos: [" + tarPos + "]");
    Console.WriteLine("-- inTokenObj: [" + inTokenObj.ToString() + "]");

    // get the correct term
    TokenObj outTokenObj = GetCorrectTerm(inTokenObj, cSpellApi, debugFlag, tarPos, nonSpaceTokenList);

    // print out
    Console.WriteLine("--------- GetCorrectTermStr( ) -----------");
    Console.WriteLine("-- outTokenObj: [" + outTokenObj.ToString() + "]");
}
/// <summary>
/// Ad-hoc console driver: tokenizes a fixed sample sentence, runs the
/// context-aware <c>GetCorrectTerm</c> on the token at position 2 of the
/// non-space token list and prints the input and the result.
/// </summary>
/// <param name="cSpellApi"> CSpell Api object handed to GetCorrectTerm </param>
private static void Test1To1(CSpellApi cSpellApi)
{
    // sample from 51.txt; other variants that were tried:
    //   "You'd thing that this is good."
    //   "The doctor thing that this is good."
    string inText = "you would thing that is good.";
    int tarPos = 2;
    bool debugFlag = false;

    // tokenize, then drop the space tokens
    List<TokenObj> allTokens = new TextObj(inText).GetTokenList();
    List<TokenObj> nonSpaceTokenList = TextObj.GetNonSpaceTokenObjList(allTokens);
    TokenObj target = nonSpaceTokenList[tarPos];

    // echo the test setup
    Console.WriteLine("====== Real-Word One-To-One Correction Test =====");
    Console.WriteLine("-- inTextList: [" + inText + "]");
    Console.WriteLine("-- tarPos: [" + tarPos + "]");
    Console.WriteLine("-- inTokenObj: [" + target.ToString() + "]");

    // run the corrector
    TokenObj corrected = GetCorrectTerm(target, cSpellApi, debugFlag, tarPos, nonSpaceTokenList);

    // report the outcome
    Console.WriteLine("--------- GetCorrectTermStr( ) -----------");
    Console.WriteLine("-- outTokenObj: [" + corrected.ToString() + "]");
}
/// <summary>
/// Context-free convenience overload: forwards to the five-argument
/// <c>GetCorrectTerm</c> with target position 0 and a null token list.
/// </summary>
/// <param name="inTokenObj"> the input tokenObj (single word) </param>
/// <param name="cSpellApi"> CSpell Api object </param>
/// <param name="debugFlag"> flag for debug print </param>
/// <returns> whatever the five-argument overload returns for these
/// arguments. </returns>
public static TokenObj GetCorrectTerm(TokenObj inTokenObj, CSpellApi cSpellApi, bool debugFlag)
{
    // placeholders used when no context is available
    int noContextPos = 0;
    List<TokenObj> noContextTokens = null;

    return GetCorrectTerm(inTokenObj, cSpellApi, debugFlag, noContextPos, noContextTokens);
}
/// <summary>
/// Real-word merge correction for one target token:
/// <list type="number">
/// <item>skip the token if it already has process history or the
/// real-word merge detector does not flag it,</item>
/// <item>collect merge candidates around the target position,</item>
/// <item>return the top-ranked candidate.</item>
/// </list>
/// No core-term stripping is applied to the target word here.
/// </summary>
/// <param name="tarPos"> the position of the target tokenObj </param>
/// <param name="nonSpaceTokenList"> token list without space tokens </param>
/// <param name="cSpellApi"> CSpell Api object, passed to the detector,
/// candidate generator and ranker </param>
/// <param name="debugFlag"> boolean flag for debug print </param>
/// <returns> the MergeObj chosen by the ranker when the token is detected;
/// otherwise null (no merge). </returns>
public static MergeObj GetCorrectTerm(int tarPos, List<TokenObj> nonSpaceTokenList, CSpellApi cSpellApi, bool debugFlag)
{
    TokenObj target = nonSpaceTokenList[tarPos];
    string targetWord = target.GetTokenStr();

    // tokens already touched by an earlier step are left alone
    if (target.GetProcHist().Count != 0)
    {
        return null;
    }

    // not a real-word merge case: no merge
    if (!RealWordMergeDetector.IsDetect(targetWord, cSpellApi, debugFlag))
    {
        return null;
    }

    cSpellApi.UpdateDetectNo();
    // TBD, should take care of possessive xxx's here

    // candidates from merge
    HashSet<MergeObj> candidates = RealWordMergeCandidates.GetCandidates(tarPos, nonSpaceTokenList, cSpellApi);

    // ranking: the top-ranked candidate is the correction
    return RankRealWordMergeByMode.GetTopRankMergeObj(candidates, cSpellApi, debugFlag, tarPos, nonSpaceTokenList);
}
/// <summary>
/// Appends a (possibly corrected) token to the output list in one of three
/// ways, depending on its string:
/// <list type="bullet">
/// <item>null or empty string: nothing is added,</item>
/// <item>not a multiword: added through <c>Add1To1Correction</c>,</item>
/// <item>multiword: added through <c>AddSplitCorrection</c>.</item>
/// </list>
/// </summary>
/// <param name="inList"> the list that receives the token(s) </param>
/// <param name="inToken"> the token to add </param>
public static void AddSplit1To1Correction(List<TokenObj> inList, TokenObj inToken)
{
    string text = inToken.GetTokenStr();

    // 1. empty token: drop it
    if (string.IsNullOrEmpty(text))
    {
        return;
    }

    if (TermUtil.IsMultiword(text))
    {
        // 3. multiword: split
        AddSplitCorrection(inList, inToken);
    }
    else
    {
        // 2. single word: 1-to-1
        Add1To1Correction(inList, inToken);
    }
}
/// <summary>
/// String-in/string-out convenience wrapper: wraps the word in a TokenObj,
/// runs the two-argument <c>GetCorrectTerm</c> and returns the token string
/// of the result.
/// </summary>
/// <param name="inWord"> the input word </param>
/// <param name="cSpellApi"> CSpell Api object </param>
/// <returns> the token string of the token returned by GetCorrectTerm. </returns>
private static string GetCorrectTermStr(string inWord, CSpellApi cSpellApi)
{
    TokenObj corrected = GetCorrectTerm(new TokenObj(inWord), cSpellApi);
    return corrected.GetTokenStr();
}
/// <summary>
/// The core method to correct a word by following steps:
/// <list type="number">
/// <item>Convert inToken to coreTerm</item>
/// <item>Detect if real-word (only for tokens without process history)</item>
/// <item>Get candidates from one-to-one</item>
/// <item>Rank candidates (context)</item>
/// <item>Update information</item>
/// </list>
/// </summary>
/// <param name="inTokenObj"> the input tokenObj (single word) </param>
/// <param name="cSpellApi"> CSpell Api object </param>
/// <param name="debugFlag"> flag for debug print </param>
/// <param name="tarPos"> the position for target token </param>
/// <param name="nonSpaceTokenList"> token list without space token(s) </param>
/// <returns> a copy of the input token; its string and process history are
/// updated when the top-ranked suggestion differs from the input word
/// (compared case-insensitively). </returns>
public static TokenObj GetCorrectTerm(TokenObj inTokenObj, CSpellApi cSpellApi, bool debugFlag, int tarPos, List<TokenObj> nonSpaceTokenList)
{
    // get inWord from inTokenObj and init outTokenObj
    string inWord = inTokenObj.GetTokenStr();
    TokenObj outTokenObj = new TokenObj(inTokenObj);

    // 1. convert a word to coreTerm (no leading/ending space, punc, digit)
    int ctType = CoreTermUtil.CT_TYPE_SPACE_PUNC_DIGIT;
    CoreTermObj coreTermObj = new CoreTermObj(inWord, ctType);
    string coreStr = coreTermObj.GetCoreTerm();

    // 2. real-word detection and correction
    // only tokens that no earlier step has touched are considered
    if ((inTokenObj.GetProcHist().Count == 0)
        && RealWord1To1Detector.IsDetect(inWord, coreStr, cSpellApi, debugFlag))
    {
        cSpellApi.UpdateDetectNo();
        // TBD, should take care of possessive xxx's here

        // 3. get 1-to-1 candidates set from correction
        // TBD. realWordFlag to use metaphone ...
        // this process is very slow (7 min. noted), needs to be improved
        HashSet<string> candSet = RealWord1To1Candidates.GetCandidates(coreStr, cSpellApi);

        // (A development-only print-out of total real-word / candidate
        // counts used to live here as commented-out code.)

        // 4. Ranking: get top ranked candidate as corrected term, using context
        string topRankStr = RankRealWord1To1ByCSpell.GetTopRankStr(coreStr, candSet, cSpellApi, tarPos, nonSpaceTokenList, debugFlag);

        // 5. update coreTerm and convert back to the full word
        coreTermObj.SetCoreTerm(topRankStr);
        string outWord = coreTermObj.ToString();

        // 6. update info if there is a real-word correction;
        // a pure case difference does not count as a correction
        if (!inWord.Equals(outWord, StringComparison.OrdinalIgnoreCase))
        {
            cSpellApi.UpdateCorrectNo();
            outTokenObj.SetTokenStr(outWord);
            outTokenObj.AddProcToHist(TokenObj.HIST_RW_1); // 1-to-1
            DebugPrint.PrintCorrect("RW", "RealWord1To1Corrector", inWord, outWord, debugFlag);
        }
    }

    return outTokenObj;
}
/// <summary>
/// Builds a new token list in which every span covered by a merge object
/// is replaced by a single merged token; all other tokens are copied
/// through unchanged.
/// The merge list is first passed through <c>CleanUpMergeObjList</c> and is
/// expected to be in the same index order as <paramref name="inTokenList"/>.
/// TBD: has bug: "imple ment ation" => implementimplementation
/// </summary>
/// <param name="inTokenList"> the whole input token list </param>
/// <param name="mergeObjList"> merge objects to apply </param>
/// <param name="procHistStr"> process-history tag recorded for the merge;
/// also passed to the debug print </param>
/// <param name="debugFlag"> flag for debug print </param>
/// <param name="cSpellApi"> CSpell Api object; its correction counter is
/// updated once per merge focus token </param>
/// <returns> the token list after applying the merges. </returns>
public static List<TokenObj> CorrectTokenListByMerge(List<TokenObj> inTokenList, List<MergeObj> mergeObjList, string procHistStr, bool debugFlag, CSpellApi cSpellApi)
{
    // 0. unify the merge list to remove contain and overlap
    List<MergeObj> cleanedMerges = CleanUpMergeObjList(mergeObjList);
    List<TokenObj> result = new List<TokenObj>();

    // index of the first input token not yet written to the result
    int nextUnconsumed = 0;

    foreach (MergeObj merge in cleanedMerges)
    {
        int spanFirst = merge.GetStartIndex();
        int spanLast = merge.GetEndIndex();

        // 1. copy the tokens that precede this merge span
        for (int k = nextUnconsumed; k < spanFirst; k++)
        {
            result.Add(inTokenList[k]);
        }

        // 2. create the merged token
        string mergedWord = merge.GetMergeWord();
        string originalWords = merge.GetOrgMergeWord();
        string focusWord = merge.GetTarWord();
        TokenObj mergedToken = new TokenObj(originalWords, mergedWord);

        // 3. carry process history over from every token in the span
        for (int k = spanFirst; k <= spanLast; k++)
        {
            if (k != merge.GetTarIndex())
            {
                // context token: keep each of its history entries,
                // annotated with the token it came from
                TokenObj neighbour = inTokenList[k];
                foreach (string entry in neighbour.GetProcHist())
                {
                    mergedToken.AddProcToHist(entry + TokenObj.MERGE_START_STR + neighbour.GetTokenStr() + TokenObj.MERGE_END_STR);
                }
                continue;
            }

            // merge focus token: count and record the correction
            cSpellApi.UpdateCorrectNo();
            mergedToken.AddProcToHist(procHistStr + TokenObj.MERGE_START_STR + focusWord + TokenObj.MERGE_END_STR);
            DebugPrint.PrintCorrect(procHistStr, "MergeCorrector (" + focusWord + ")", originalWords, mergedWord, debugFlag);
        }

        result.Add(mergedToken);
        nextUnconsumed = spanLast + 1;
    }

    // 4. copy the tokens after the last merge span
    for (int k = nextUnconsumed; k < inTokenList.Count; k++)
    {
        result.Add(inTokenList[k]);
    }

    return result;
}
/// <summary>
/// Splits the token string on <c>TextObj.patternStrSpace_</c> and appends
/// one new TokenObj per resulting piece to <paramref name="inList"/>.
/// Each piece is built with <c>new TokenObj(inToken, piece)</c>.
/// </summary>
/// <param name="inList"> the list that receives the split tokens </param>
/// <param name="inToken"> the token whose string is split </param>
private static void AddSplitCorrection(List<TokenObj> inList, TokenObj inToken)
{
    // keep token and delimiters
    string tokenStr = inToken.GetTokenStr();
    string[] tokenArray = tokenStr.Split(TextObj.patternStrSpace_, true);

    // Materialize once before appending so that nothing is added if
    // creating one of the tokens throws. The earlier version allocated an
    // empty list that was immediately discarded and copied the result twice.
    List<TokenObj> splitTokens = tokenArray.Select(token => new TokenObj(inToken, token)).ToList();
    inList.AddRange(splitTokens);
}
/// <summary>
/// Real-word merge step over a whole token list: walks the tokens, asks
/// <c>RealWordMergeCorrector.GetCorrectTerm</c> for a merge at each legit
/// token, collects the merges and then applies them all at once with
/// <c>MergeCorrector.CorrectTokenListByMerge</c>.
/// The merges are applied after the scan because a merge may involve
/// tokens that precede the current one.
/// </summary>
/// <param name="inTokenList"> the whole input token list </param>
/// <param name="cSpellApi"> CSpell Api object </param>
/// <param name="debugFlag"> flag for debug print </param>
/// <returns> the token list after the collected merges are applied. </returns>
public static List<TokenObj> Process(List<TokenObj> inTokenList, CSpellApi cSpellApi, bool debugFlag)
{
    DebugPrint.PrintProcess("5. RealWord-Merge", debugFlag);
    DebugPrint.PrintInText(TextObj.TokenListToText(inTokenList), debugFlag);

    // pre-process: refresh index/position info on the input tokens
    TextObj.UpdateIndexPos(inTokenList);

    // context list: the input without space tokens
    List<TokenObj> nonSpaceTokenList = TextObj.GetNonSpaceTokenObjList(inTokenList);

    List<MergeObj> merges = new List<MergeObj>();
    int maxLegitTokenLength = cSpellApi.GetMaxLegitTokenLength();

    int cursor = 0;
    while (cursor < inTokenList.Count)
    {
        TokenObj current = inTokenList[cursor];

        // SCR-3: only legit tokens are candidates for a merge
        MergeObj found = null;
        if (current.IsLegitToken(maxLegitTokenLength))
        {
            // the correct term is the highest-ranked candidate
            found = RealWordMergeCorrector.GetCorrectTerm(current.GetPos(), nonSpaceTokenList, cSpellApi, debugFlag);
        }

        if (found == null)
        {
            // non-legit token or no merge correction: move on by one
            cursor++;
            continue;
        }

        merges.Add(found);
        // resume after the merge's end token so merges never overlap
        cursor = found.GetEndIndex() + 1;
    }

    // apply every merge to the whole list (also updates operation info)
    return MergeCorrector.CorrectTokenListByMerge(inTokenList, merges, TokenObj.HIST_RW_M, debugFlag, cSpellApi);
}
/// <summary>
/// Builds a token string: fills a TokenObj with the given values, the
/// current local time and a random 32-character string, serializes it to
/// JSON and returns the result of <c>DES.EncryptDES</c> on that JSON.
/// </summary>
/// <param name="uid"> value stored in the token's uid field </param>
/// <param name="psw"> value stored in the token's psw field </param>
/// <param name="type"> value stored in the token's type field </param>
/// <returns> the encrypted token string. </returns>
public static string MakeToken(string uid, string psw, int type)
{
    TokenObj payload = new TokenObj
    {
        uid = uid,
        psw = psw,
        type = type,
        // local time, fixed "yyyy-MM-dd HH:mm:ss" layout
        timestamp = DateTime.Now.ToString("yyyy-MM-dd HH:mm:ss"),
        // GUID with the dashes stripped
        randomstr = Guid.NewGuid().ToString().Replace("-", "")
    };

    string json = JsonConvert.SerializeObject(payload);
    return DES.EncryptDES(json);
}
/// <summary>
/// Runs the string-based <c>Process</c> on the token's string and returns
/// a copy of the token. When the processed string differs from the
/// original, the copy gets the new string and a
/// <c>TokenObj.HIST_ND_S_E_P</c> history entry.
/// </summary>
/// <param name="inTokenObj"> the input tokenObj </param>
/// <returns> a copy of the input token, updated if the string changed. </returns>
public static TokenObj Process(TokenObj inTokenObj)
{
    string before = inTokenObj.GetTokenStr();
    string after = Process(before);
    TokenObj result = new TokenObj(inTokenObj);

    // nothing changed: return the plain copy
    if (before.Equals(after))
    {
        return result;
    }

    result.SetTokenStr(after);
    result.AddProcToHist(TokenObj.HIST_ND_S_E_P);
    return result;
}
/// <summary>
/// Runs <c>ProcessWord</c> with the informal-expression map on the token's
/// string and returns a copy of the token. When the result differs from
/// the original string, the copy gets the new string and a
/// <c>TokenObj.HIST_ND_INFORMAL_EXP</c> history entry, and the change is
/// reported through <c>DebugPrint.PrintCorrect</c>.
/// </summary>
/// <param name="inTokenObj"> the input tokenObj </param>
/// <param name="informalExpMap"> the map of informal expressions </param>
/// <param name="debugFlag"> flag for debug print </param>
/// <returns> a copy of the input token, updated if the string changed. </returns>
public static TokenObj Process(TokenObj inTokenObj, Dictionary<string, string> informalExpMap, bool debugFlag)
{
    string before = inTokenObj.GetTokenStr();
    string after = ProcessWord(before, informalExpMap);
    TokenObj result = new TokenObj(inTokenObj);

    // nothing changed: return the plain copy
    if (before.Equals(after))
    {
        return result;
    }

    result.SetTokenStr(after);
    result.AddProcToHist(TokenObj.HIST_ND_INFORMAL_EXP);
    DebugPrint.PrintCorrect("ND", "InformalExpHandler", before, after, debugFlag);
    return result;
}
/// <summary>
/// Runs the string-based <c>Process(string, int)</c> on the token's string
/// and returns a copy of the token. When the result differs from the
/// original string, the copy gets the new string and a
/// <c>TokenObj.HIST_ND_S_E_P</c> history entry, and the change is reported
/// through <c>DebugPrint.PrintCorrect</c>.
/// </summary>
/// <param name="inTokenObj"> the input tokenObj </param>
/// <param name="maxProcess"> passed through to the string-based Process </param>
/// <param name="debugFlag"> flag for debug print </param>
/// <returns> a copy of the input token, updated if the string changed. </returns>
public static TokenObj Process(TokenObj inTokenObj, int maxProcess, bool debugFlag)
{
    string before = inTokenObj.GetTokenStr();
    string after = Process(before, maxProcess);
    TokenObj result = new TokenObj(inTokenObj);

    // nothing changed: return the plain copy
    if (before.Equals(after))
    {
        return result;
    }

    result.SetTokenStr(after);
    result.AddProcToHist(TokenObj.HIST_ND_S_E_P);
    DebugPrint.PrintCorrect("ND", "EndingPuncSplitter", before, after, debugFlag);
    return result;
}
/// <summary>
/// Ad-hoc console driver: runs the two-argument
/// <c>NonWordCorrector.GetCorrectTerm</c> on a fixed all-lowercase sample
/// word and prints the input and the output strings.
/// </summary>
/// <param name="cSpellApi"> CSpell Api object handed to GetCorrectTerm </param>
private static void TestGetCorrectTerm(CSpellApi cSpellApi)
{
    // sample input, all lowercase
    string inText = "hotflashes";

    // run the corrector and pull the resulting string
    TokenObj corrected = NonWordCorrector.GetCorrectTerm(new TokenObj(inText), cSpellApi);
    string outText = corrected.GetTokenStr();

    // report
    Console.WriteLine("--------- GetCorrectTerm( ) -----------");
    Console.WriteLine("In: [" + inText + "]");
    Console.WriteLine("Out: [" + outText + "]");
}
/// <summary>
/// Runs <c>ProcessWord</c> on the token's string and returns a copy of the
/// token. When the result differs from the original string, the copy gets
/// the new string and a <c>TokenObj.HIST_ND_XML_HTML</c> history entry,
/// and the change is reported through <c>DebugPrint.PrintCorrect</c>.
/// </summary>
/// <param name="inTokenObj"> the input tokenObj </param>
/// <param name="debugFlag"> flag for debug print </param>
/// <returns> a copy of the input token, updated if the string changed. </returns>
public static TokenObj Process(TokenObj inTokenObj, bool debugFlag)
{
    string before = inTokenObj.GetTokenStr();
    string after = ProcessWord(before);
    TokenObj result = new TokenObj(inTokenObj);

    // nothing changed: return the plain copy
    if (before.Equals(after))
    {
        return result;
    }

    result.SetTokenStr(after);
    result.AddProcToHist(TokenObj.HIST_ND_XML_HTML);
    DebugPrint.PrintCorrect("ND", "XmlHtmlHandler", before, after, debugFlag);
    return result;
}
/// <summary>
/// Login endpoint: looks the user up by name in the "Users" collection of
/// the LiteDB file "mydb.db" in the application base directory and checks
/// the supplied password with <c>Actions.IsValide</c>.
/// </summary>
/// <param name="user"> request payload; its <c>user</c> and
/// <c>password</c> members are read </param>
/// <returns> a JSON object with a <c>token</c> member: a token string
/// expiring 20 seconds from now on success, or an empty string when the
/// payload is missing, the user is unknown or the password check fails.
/// </returns>
public JsonResult<object> Login(dynamic user)
{
    // A missing request body used to crash on the dynamic member access
    // below; treat it like any other failed login.
    if ((object)user == null)
    {
        return Json<object>(new { token = "" });
    }

    string password = user.password;
    string userid = user.user;

    // Path.Combine copes with a base directory that already ends in a
    // separator and avoids a hard-coded backslash.
    string dbPath = System.IO.Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "mydb.db");

    using (var db = new LiteDB.LiteDatabase(dbPath))
    {
        var users = db.GetCollection<User>("Users");
        var first = users.Find(o => o.name == userid).FirstOrDefault();

        if (first != null && Actions.IsValide(password, first.pwdhash))
        {
            // short-lived token: valid for 20 seconds
            var to = new TokenObj { userid = userid, Expires = DateTime.Now.AddSeconds(20) };
            return Json<object>(new { token = to.ToTokenString() });
        }
    }

    return Json<object>(new { token = "" });
}
/// <summary>
/// Non-word split and 1-to-1 correction over a whole token list.
/// Every legit token is sent to the context-aware
/// <c>NonWordCorrector.GetCorrectTerm</c>; all other tokens are kept as
/// they are. Each resulting token is appended through
/// <c>Split1To1Corrector.AddSplit1To1Correction</c>, which may add zero,
/// one or several tokens.
/// </summary>
/// <param name="inTokenList"> the whole text as a token list </param>
/// <param name="cSpellApi"> CSpell Api object </param>
/// <param name="debugFlag"> flag for debug print </param>
/// <returns> the corrected token list. </returns>
public static List<TokenObj> Process(List<TokenObj> inTokenList, CSpellApi cSpellApi, bool debugFlag)
{
    DebugPrint.PrintProcess("3-4. NonWord-Split & 1To1", debugFlag);
    DebugPrint.PrintInText(TextObj.TokenListToText(inTokenList), debugFlag);

    List<TokenObj> corrected = new List<TokenObj>();

    // context for the ranker: the input without space tokens
    List<TokenObj> nonSpaceTokenList = TextObj.GetNonSpaceTokenObjList(inTokenList);
    int maxLegitTokenLength = cSpellApi.GetMaxLegitTokenLength();

    // position passed to the context module; advanced once per legit token
    int contextPos = 0;

    // iterate the full list so the original space tokens are kept
    foreach (TokenObj current in inTokenList)
    {
        // SCR-3: non-legit tokens pass through untouched
        TokenObj replacement = current;

        if (current.IsLegitToken(maxLegitTokenLength))
        {
            // the correct term is the highest-ranked candidate
            replacement = NonWordCorrector.GetCorrectTerm(current, cSpellApi, debugFlag, contextPos, nonSpaceTokenList);
            contextPos++;
        }

        // may add several tokens when the correction is a split
        Split1To1Corrector.AddSplit1To1Correction(corrected, replacement);
    }

    return corrected;
}
/// <summary>
/// The core method to correct a word by following steps:
/// <list type="number">
/// <item>Convert inToken to coreTerm</item>
/// <item>Detect if real-word split applies (only for tokens without
/// process history)</item>
/// <item>Get split candidates</item>
/// <item>Rank candidates (context)</item>
/// <item>Update information</item>
/// </list>
/// </summary>
/// <param name="inTokenObj"> the input tokenObj (single word) </param>
/// <param name="cSpellApi"> cSpell API object </param>
/// <param name="debugFlag"> flag for debug print </param>
/// <param name="tarPos"> position of the target token to be split </param>
/// <param name="nonSpaceTokenList"> the token list without space tokens </param>
/// <returns> a copy of the input token; its string and process history are
/// updated when the top-ranked split differs from the input word. </returns>
public static TokenObj GetCorrectTerm(TokenObj inTokenObj, CSpellApi cSpellApi, bool debugFlag, int tarPos, List<TokenObj> nonSpaceTokenList)
{
    // get inWord from inTokenObj and init outTokenObj
    string inWord = inTokenObj.GetTokenStr();
    TokenObj outTokenObj = new TokenObj(inTokenObj);

    // 1. convert a word to coreTerm (no leading/ending space, punc, digit)
    int ctType = CoreTermUtil.CT_TYPE_SPACE_PUNC_DIGIT;
    CoreTermObj coreTermObj = new CoreTermObj(inWord, ctType);
    string coreStr = coreTermObj.GetCoreTerm();

    // 2. real-word split detection and correction
    // only tokens that no earlier step has touched are considered
    if ((inTokenObj.GetProcHist().Count == 0)
        && RealWordSplitDetector.IsDetect(inWord, coreStr, cSpellApi, debugFlag))
    {
        cSpellApi.UpdateDetectNo();
        // TBD, should take care of possessive xxx's here

        // 3. get split candidates set
        int maxSplitNo = cSpellApi.GetCanRwMaxSplitNo();
        HashSet<string> splitSet = RealWordSplitCandidates.GetCandidates(coreStr, cSpellApi, maxSplitNo);

        // 4. Ranking: get top ranked candidate as corrected term, using context
        string topRankStr = RankRealWordSplitByMode.GetTopRankStr(coreStr, splitSet, cSpellApi, debugFlag, tarPos, nonSpaceTokenList);

        // 5. update coreTerm and convert back to the full word
        coreTermObj.SetCoreTerm(topRankStr);
        string outWord = coreTermObj.ToString();

        // 6. update info if there is a real-word correction
        if (!inWord.Equals(outWord))
        {
            cSpellApi.UpdateCorrectNo();
            outTokenObj.SetTokenStr(outWord);
            outTokenObj.AddProcToHist(TokenObj.HIST_RW_S); // split
            DebugPrint.PrintCorrect("RW", "RealWordSplitCorrector", inWord, outWord, debugFlag);
        }
    }

    return outTokenObj;
}
/// <summary>
/// Non-word merge correction for one target token:
/// <list type="number">
/// <item>strip ending punctuation/space from the target word and
/// lowercase it,</item>
/// <item>ask the non-word merge detector whether the token qualifies,</item>
/// <item>collect merge candidates around the target position,</item>
/// <item>return the top-ranked candidate.</item>
/// </list>
/// </summary>
/// <param name="tarPos"> position of target token </param>
/// <param name="nonSpaceTokenList"> token list without space token(s) </param>
/// <param name="cSpellApi"> CSpell Api object </param>
/// <param name="debugFlag"> flag for debug print </param>
/// <returns> the MergeObj chosen by the ranker when the token is detected;
/// otherwise null (no merge). </returns>
public static MergeObj GetCorrectTerm(int tarPos, List<TokenObj> nonSpaceTokenList, CSpellApi cSpellApi, bool debugFlag)
{
    TokenObj target = nonSpaceTokenList[tarPos];
    string targetWord = target.GetTokenStr();

    // 1. core term: only ending punctuation/space is removed, then lowercased
    string strippedLower = TermUtil.StripEndPuncSpace(targetWord).ToLower();

    // 2. not detected as a non-word merge case: no merge
    if (!NonWordMergeDetector.IsDetect(targetWord, strippedLower, cSpellApi, debugFlag))
    {
        return null;
    }

    cSpellApi.UpdateDetectNo();

    // 3. candidates from merge
    HashSet<MergeObj> candidates = NonWordMergeCandidates.GetCandidates(tarPos, nonSpaceTokenList, cSpellApi);

    // 4. ranking: the top-ranked candidate is the correction
    return RankNonWordMergeByMode.GetTopRankMergeObj(candidates, cSpellApi, tarPos, nonSpaceTokenList, debugFlag);
}
/// <summary>
/// Validates an encrypted token string.
/// Steps: decrypt and deserialize the token, reject it if it is older than
/// the allowed number of days, accept it if it equals the cached token for
/// its uid, otherwise verify uid/psw against the [guser] table and cache
/// the token on success.
/// </summary>
/// <param name="Token"> the encrypted token string </param>
/// <param name="code"> receives a <c>MessageCode</c> value describing the
/// outcome </param>
/// <returns> the decoded TokenObj when the token is accepted
/// (<paramref name="code"/> is SUCCESS); otherwise null. </returns>
public static TokenObj CheckToken(string Token, out int code)
{
    code = MessageCode.UNKONWN;
    int token_last_day = 99999; // maximum token age in days
    TokenObj tokenObj = null;

    try
    {
        string token = DES.DecryptDES(Token);
        tokenObj = JsonConvert.DeserializeObject<TokenObj>(token);
    }
    catch (Exception)
    {
        // anything that fails to decrypt or deserialize is an invalid token
        code = MessageCode.ERROR_TOKEN_VALIDATE;
        return null;
    }

    // deserialization can succeed without producing an object;
    // treat that as an invalid token instead of dereferencing null
    if (tokenObj == null)
    {
        code = MessageCode.ERROR_TOKEN_VALIDATE;
        return null;
    }

    // check whether the token has expired
    DateTime token_date;
    if (!DateTime.TryParse(tokenObj.timestamp, out token_date))
    {
        // a missing or malformed timestamp used to throw out of this method
        code = MessageCode.ERROR_TOKEN_VALIDATE;
        return null;
    }

    TimeSpan sp = DateTime.Now - token_date;
    if (sp.Days > token_last_day)
    {
        // The timeout code used to be set without returning, so it was
        // overwritten below and expired tokens were still accepted.
        code = MessageCode.ERROR_TOKEN_TIMEOUT;
        return null;
    }

    // check whether the token is legitimate: fast path through the cache
    if (tokenCache.Keys.Contains(tokenObj.uid))
    {
        string oldToken = tokenCache[tokenObj.uid];
        if (oldToken == Token)
        {
            code = MessageCode.SUCCESS;
            return tokenObj;
        }
    }

    // slow path: verify the credentials carried by the token against the DB
    // NOTE(review): the token carries psw and it is compared directly in
    // SQL — confirm this is a hash and not a plain-text password.
    SGoodsDB db = new SGoodsDB();
    DataSet ds;
    try
    {
        ds = db.ExeQuery("select uid,psw from [guser] where uid=@uid and psw=@psw", new SqlParameter("uid", tokenObj.uid), new SqlParameter("psw", tokenObj.psw));
    }
    finally
    {
        // close the connection even when the query throws
        db.Close();
    }

    if (ds == null)
    {
        code = MessageCode.ERROR_EXECUTE_SQL;
        return null;
    }

    if (ds.Tables[0].Rows.Count == 0)
    {
        code = MessageCode.ERROR_TOKEN_VALIDATE;
        return null;
    }

    // remember the verified token for this uid
    tokenCache[tokenObj.uid] = Token;
    code = MessageCode.SUCCESS;
    return tokenObj;
}
/// <summary>
/// 1-to-1 case: appends the token, unchanged, to the end of the list.
/// </summary>
/// <param name="inList"> the list that receives the token </param>
/// <param name="inToken"> the token to append </param>
private static void Add1To1Correction(List<TokenObj> inList, TokenObj inToken)
{
    inList.Add(inToken);
}
/// <summary>
/// Non-word correction that uses context scores to find the correct term:
/// <list type="number">
/// <item>Convert inToken to coreTerm</item>
/// <item>Detect whether it is a non-word</item>
/// <item>Get 1-to-1 candidates and, depending on the function mode,
/// split candidates</item>
/// <item>Rank candidates</item>
/// <item>Update information</item>
/// </list>
/// </summary>
/// <param name="inTokenObj"> the input tokenObj (single word) </param>
/// <param name="cSpellApi"> CSpell Api object </param>
/// <param name="debugFlag"> flag for debug print </param>
/// <param name="tarPos"> position for target token </param>
/// <param name="nonSpaceTokenList"> token list without space token(s) </param>
/// <returns> a copy of the input token; its string and process history are
/// updated when the top-ranked suggestion differs from the input word.
/// </returns>
public static TokenObj GetCorrectTerm(TokenObj inTokenObj, CSpellApi cSpellApi, bool debugFlag, int tarPos, List<TokenObj> nonSpaceTokenList)
{
    // init
    int funcMode = cSpellApi.GetFuncMode();

    // get inWord from inTokenObj and init outTokenObj
    string inWord = inTokenObj.GetTokenStr();
    TokenObj outTokenObj = new TokenObj(inTokenObj);

    // 1. convert a word to coreTerm (no leading/ending space, punc, digit)
    int ctType = CoreTermUtil.CT_TYPE_SPACE_PUNC_DIGIT;
    CoreTermObj coreTermObj = new CoreTermObj(inWord, ctType);
    string coreStr = coreTermObj.GetCoreTerm();

    // 2. non-word detection and correction
    // TBD .. need to separate 1-to-1 and split
    if (NonWordDetector.IsDetect(inWord, coreStr, cSpellApi, debugFlag))
    {
        cSpellApi.UpdateDetectNo();
        // TBD, should take care of possessive xxx's here

        // 3.1 get 1-to-1 candidates set from correction, no split
        HashSet<string> candSet = NonWord1To1Candidates.GetCandidates(coreStr, cSpellApi);

        // 3.2 every mode except the 1-to-1-only mode also considers splits
        if (funcMode != CSpellApi.FUNC_MODE_NW_1)
        {
            int maxSplitNo = cSpellApi.GetCanNwMaxSplitNo();
            HashSet<string> splitSet = NonWordSplitCandidates.GetCandidates(coreStr, cSpellApi, maxSplitNo);

            if (funcMode == CSpellApi.FUNC_MODE_NW_S)
            {
                // 3.3 split-only mode: split candidates replace the 1-to-1 set
                candSet = new HashSet<string>(splitSet);
            }
            else
            {
                // 3.4 otherwise add the split candidates to the 1-to-1 set.
                // HashSet<T> has no addAll (a Java leftover); UnionWith is
                // the .NET equivalent.
                candSet.UnionWith(splitSet);
            }
        }

        // 4. Ranking: get top ranked candidate as corrected term, using context
        string topRankStr = RankNonWordByMode.GetTopRankStr(coreStr, candSet, cSpellApi, debugFlag, tarPos, nonSpaceTokenList);

        // 5. update coreTerm and convert back to the full word
        coreTermObj.SetCoreTerm(topRankStr);
        string outWord = coreTermObj.ToString();

        // 6. update info if there is a correction
        if (!inWord.Equals(outWord))
        {
            outTokenObj.SetTokenStr(outWord);

            // a multiword result means the correction was a split
            bool isSplit = TermUtil.IsMultiword(outWord);
            cSpellApi.UpdateCorrectNo();

            if (isSplit)
            {
                outTokenObj.AddProcToHist(TokenObj.HIST_NW_S); // split
                DebugPrint.PrintCorrect("NW", "NonWordCorrector-Split", inWord, outWord, debugFlag);
            }
            else
            {
                outTokenObj.AddProcToHist(TokenObj.HIST_NW_1); // 1-to-1
                DebugPrint.PrintCorrect("NW", "NonWordCorrector-1To1", inWord, outWord, debugFlag);
            }
        }
    }

    return outTokenObj;
}
/// <summary>
/// Convenience overload without debug output: forwards to the
/// three-argument <c>GetCorrectTerm</c> with the debug flag set to false.
/// </summary>
/// <param name="inTokenObj"> the input tokenObj (single word) </param>
/// <param name="cSpellApi"> CSpell Api object </param>
/// <returns> whatever the three-argument overload returns for these
/// arguments. </returns>
public static TokenObj GetCorrectTerm(TokenObj inTokenObj, CSpellApi cSpellApi)
{
    // debug printing is off for this overload
    bool quiet = false;
    TokenObj corrected = GetCorrectTerm(inTokenObj, cSpellApi, quiet);
    return corrected;
}
/// <summary>
/// Convenience overload without debug output: forwards to
/// <c>Process(TokenObj, int, bool)</c> with the debug flag set to false.
/// </summary>
/// <param name="inTokenObj"> the input tokenObj </param>
/// <param name="maxProcess"> passed through unchanged </param>
/// <returns> whatever the three-argument overload returns. </returns>
public static TokenObj Process(TokenObj inTokenObj, int maxProcess)
{
    // debug printing is off for this overload
    bool quiet = false;
    TokenObj processed = Process(inTokenObj, maxProcess, quiet);
    return processed;
}
/// <summary>
/// Collects merge candidates for the target token using a fixed window of
/// <c>mergeNo + 1</c> consecutive tokens. The window is shifted so that it
/// starts anywhere from <c>tarPos - mergeNo</c> (clamped to 0) up to
/// <c>tarPos</c>. For each complete window the tokens are joined without a
/// separator (and optionally with a hyphen) and handed to
/// <c>AddMergeObj</c>.
/// A window is skipped when it runs past the end of the list, contains a
/// digit / punctuation / digit-punctuation / URL / e-mail token, when the
/// space-joined original is a word in <paramref name="mwDic"/>, or when
/// short-word merging is off and the window has more than
/// <c>MAX_SHORT_WORD_NO</c> short words.
/// </summary>
/// <param name="tarPos"> position of the target token in the list </param>
/// <param name="nonSpaceTextList"> token list without space tokens </param>
/// <param name="mergeNo"> number of merges; the window holds mergeNo + 1
/// tokens </param>
/// <param name="mergeWithHyphen"> also add the hyphen-joined form </param>
/// <param name="shortWordMerge"> when true the short-word limit is not
/// applied </param>
/// <param name="suggestDic"> passed through to AddMergeObj </param>
/// <param name="aADic"> passed through to AddMergeObj </param>
/// <param name="mwDic"> dictionary used to reject windows whose original
/// text is already a known multiword </param>
/// <returns> the set of merge objects that were added. </returns>
protected internal static HashSet<MergeObj> GetMergeSetByMergeNo(int tarPos, List<TokenObj> nonSpaceTextList, int mergeNo, bool mergeWithHyphen, bool shortWordMerge, RootDictionary suggestDic, RootDictionary aADic, RootDictionary mwDic)
{
    HashSet<MergeObj> found = new HashSet<MergeObj>();

    // leftmost window start, clamped to the beginning of the list
    int firstWindowStart = tarPos - mergeNo;
    if (firstWindowStart < 0)
    {
        firstWindowStart = 0;
    }

    int tokenCount = nonSpaceTextList.Count;
    int tarIndex = nonSpaceTextList[tarPos].GetIndex();
    string tarWord = nonSpaceTextList[tarPos].GetTokenStr();

    // index of the last token merged so far; set inside the window loop
    int lastIndex = 0;

    // slide the window over every start position that still covers tarPos
    for (int windowStart = firstWindowStart; windowStart <= tarPos; windowStart++)
    {
        string joined = "";      // tokens concatenated without a separator
        string hyphenated = "";  // tokens joined by hyphen
        string original = "";    // tokens joined by space (text before merge)
        int firstIndex = nonSpaceTextList[windowStart].GetIndex();
        int windowEnd = windowStart + mergeNo;
        int shortWords = 0;
        bool windowComplete = true;

        for (int offset = 0; offset <= mergeNo; offset++)
        {
            int pos = windowStart + offset;

            // window runs off the end of the text
            if (pos >= tokenCount)
            {
                windowComplete = false;
                break;
            }

            TokenObj current = nonSpaceTextList[pos];
            string text = current.GetTokenStr();

            // digits, punctuation, URLs and e-mails are never merged
            bool unmergeable = DigitPuncTokenUtil.IsDigit(text)
                || DigitPuncTokenUtil.IsPunc(text)
                || DigitPuncTokenUtil.IsDigitPunc(text)
                || InternetTokenUtil.IsUrl(text)
                || InternetTokenUtil.IsEmail(text);
            if (unmergeable)
            {
                windowComplete = false;
                break;
            }

            if (offset == 0)
            {
                // first token: no separator in front
                joined = text;
                hyphenated = text;
                original = text;
            }
            else
            {
                joined += text;
                hyphenated += GlobalVars.HYPHEN_STR + text;
                original += GlobalVars.SPACE_STR + text;
            }

            shortWords = UpdateShortWordNo(text, SHORT_WORD_LENGTH, shortWords);
            lastIndex = current.GetIndex();
        }

        // the fixed window must be complete for merging
        if (!windowComplete)
        {
            continue;
        }

        // the original text can't already be a multiword, e.g. "non clinical"
        if (mwDic.IsDicWord(original))
        {
            continue;
        }

        // short-word limit, unless short-word merging is allowed
        if (!shortWordMerge && (shortWords > MAX_SHORT_WORD_NO))
        {
            continue;
        }

        AddMergeObj(tarWord, original, joined, mergeNo, firstIndex, tarIndex, lastIndex, windowStart, tarPos, windowEnd, found, suggestDic, aADic);

        // optionally add the hyphen-joined form as a candidate too
        if (mergeWithHyphen)
        {
            AddMergeObj(tarWord, original, hyphenated, mergeNo, firstIndex, tarIndex, lastIndex, windowStart, tarPos, windowEnd, found, suggestDic, aADic);
        }
    }

    return found;
}
/// <summary>
/// Convenience overload without debug output: forwards to
/// <c>Process(TokenObj, Dictionary, bool)</c> with the debug flag set to
/// false. That overload maps an informal expression to its corrected word.
/// </summary>
/// <param name="inTokenObj"> the input tokenObj (single word) </param>
/// <param name="informalExpMap"> the map of informal expressions </param>
/// <returns> whatever the three-argument overload returns. </returns>
public static TokenObj Process(TokenObj inTokenObj, Dictionary<string, string> informalExpMap)
{
    // debug printing is off for this overload
    bool quiet = false;
    TokenObj processed = Process(inTokenObj, informalExpMap, quiet);
    return processed;
}