// private methods
/// <summary>
/// Interactive test driver: reads one input string per line from standard input,
/// prints the number of 1-to-1 non-word candidates and the top ranked suggestion,
/// and (optionally) the ranked candidate score list.
/// </summary>
/// <param name="detailFlag"> when true, also print the ranked suggestion list </param>
/// <param name="limitNo"> maximum number of suggestions printed in detail mode;
/// values outside the int range are clamped (negative values print nothing) </param>
/// <returns> 0 on success, -1 if an exception was caught while reading/processing </returns>
private static int RunTest(bool detailFlag, long limitNo)
{
    // init dic
    string configFile = "../data/Config/cSpell.properties";
    CSpellApi cSpellApi = new CSpellApi(configFile);
    WordWcMap wordWcMap = cSpellApi.GetWordWcMap();
    double wf1 = cSpellApi.GetOrthoScoreEdDistFac();
    double wf2 = cSpellApi.GetOrthoScorePhoneticFac();
    double wf3 = cSpellApi.GetOrthoScoreOverlapFac();
    cSpellApi.SetRankMode(CSpellApi.RANK_MODE_NOISY_CHANNEL);

    // provide cmdLine interface
    int returnValue = 0;
    NoisyChannelScoreComparator<NoisyChannelScore> ncsc = new NoisyChannelScoreComparator<NoisyChannelScore>();

    // Enumerable.Take needs an int: clamp instead of letting an unchecked
    // (int) cast wrap a large long into an arbitrary (possibly negative) value
    int maxSuggestionNo = (int)Math.Max(0L, Math.Min(limitNo, (long)int.MaxValue));

    try
    {
        // dispose the reader even when processing throws
        using (StreamReader stdInput = new StreamReader(Console.OpenStandardInput()))
        {
            Console.WriteLine("- Please input a text (type \"Ctl-d\" to quit) > ");
            string inText;
            while ((inText = stdInput.ReadLine()) != null)
            {
                // ---------------------------------
                // Get spell correction on the input
                // ---------------------------------
                // get all possible candidates
                HashSet<string> candSet = NonWord1To1Candidates.GetCandidates(inText, cSpellApi);
                Console.WriteLine("-- canSet.size(): " + candSet.Count);

                // get final suggestion
                string topRankStr = GetTopRankStr(inText, candSet, wordWcMap, wf1, wf2, wf3);
                Console.WriteLine("- top tank str: " + topRankStr);

                // print details
                if (detailFlag)
                {
                    HashSet<NoisyChannelScore> candScoreSet = GetCandidateScoreSet(inText, candSet, wordWcMap, wf1, wf2, wf3);
                    Console.WriteLine("------ Suggestion List ------");
                    foreach (NoisyChannelScore score in candScoreSet.OrderBy(x => x, ncsc).Take(maxSuggestionNo))
                    {
                        Console.WriteLine(score.ToString());
                    }
                }
            }
        }
    }
    catch (Exception e)
    {
        // same handling the original applied in both of its nested catch blocks
        Console.Error.WriteLine(e.Message);
        returnValue = -1;
    }
    return returnValue;
}
/// <summary>
/// Checks whether <paramref name="str2"/> is among the candidates generated for the
/// lower-cased <paramref name="str1"/>, updates the running counters, and prints one
/// pipe-delimited result line with the distance details of the pair.
/// </summary>
private static void TestTpStr(string str1, string str2, CSpellApi cSpellApi)
{
    bool isCandidate = GetCandidates(str1.ToLower(), cSpellApi).Contains(str2);
    if (isCandidate)
    {
        totalTpNo_++;
    }
    totalTpStrNo_++;

    // counters first, then the word pair, then the distance details
    string countPart = isCandidate + "|" + totalTpNo_ + "|" + totalTpStrNo_;
    string wordPart = str1 + "|" + str2;
    string distPart = EditDistance.GetDistanceForRealWord(str1, str2)
        + "|" + RefinedSoundex.GetDistanceDetailStr(str1, str2)
        + "|" + Metaphone2.GetDistanceDetailStr(str1, str2, 10);
    Console.WriteLine(countPart + "|" + wordPart + "|" + distPart);
}
// public method
/// <summary>
/// Real-word merge step: walks the token list, asks RealWordMergeCorrector for a
/// merge correction at each legit token, then applies all collected merges to the
/// whole token list in one pass.
/// </summary>
/// <param name="inTokenList"> token list of the whole text </param>
/// <param name="cSpellApi"> cSpell API object </param>
/// <param name="debugFlag"> flag for debug print </param>
/// <returns> the token list after merge corrections are applied </returns>
public static List<TokenObj> Process(List<TokenObj> inTokenList, CSpellApi cSpellApi, bool debugFlag)
{
    DebugPrint.PrintProcess("5. RealWord-Merge", debugFlag);
    DebugPrint.PrintInText(TextObj.TokenListToText(inTokenList), debugFlag);

    // pre-process: refresh the position info stored on each token
    TextObj.UpdateIndexPos(inTokenList);

    // 1. build the list that holds only the non-space tokens
    List<TokenObj> nonSpaceTokenList = TextObj.GetNonSpaceTokenObjList(inTokenList);

    // 2. collect merge corrections token by token
    List<MergeObj> mergeObjList = new List<MergeObj>();
    int maxLegitTokenLength = cSpellApi.GetMaxLegitTokenLength();
    int cursor = 0;
    while (cursor < inTokenList.Count)
    {
        TokenObj token = inTokenList[cursor];

        // SCR-3: only legit tokens are considered, anything else is stepped over
        if (!token.IsLegitToken(maxLegitTokenLength))
        {
            cursor++;
            continue;
        }

        // the correct term is the highest ranked candidate
        MergeObj merge = RealWordMergeCorrector.GetCorrectTerm(token.GetPos(), nonSpaceTokenList, cSpellApi, debugFlag);
        if (merge == null)
        {
            // no merge correction at this token
            cursor++;
            continue;
        }

        mergeObjList.Add(merge);
        // resume after the merge's end token so that merges never overlap
        cursor = merge.GetEndIndex() + 1;
    }

    // 3. apply the merges after the loop, because a merge may involve tokens
    //    before the current one; operation history is updated as well
    return MergeCorrector.CorrectTokenListByMerge(inTokenList, mergeObjList, TokenObj.HIST_RW_M, debugFlag, cSpellApi);
}
/// <summary>
/// Reports whether <paramref name="cand"/> is generated as a candidate for
/// <paramref name="inWord"/>; counts every call in totalNo_ and every hit in
/// totalCandNo_, printing the pair on a hit.
/// </summary>
private static bool TestCand(string inWord, string cand, CSpellApi cSpellApi)
{
    bool found = GetCandidates(inWord, cSpellApi).Contains(cand);
    totalNo_++;
    if (!found)
    {
        return false;
    }

    totalCandNo_++;
    Console.WriteLine(inWord + ", " + cand);
    return true;
}
// public method
/// <summary>
/// Returns the top ranked merge candidate for the target position, delegating to
/// the combined (cSpell) ranking.
/// </summary>
/// <remarks>
/// Frequency-only (GetTopRankMergeObjByFrequency) and context-only
/// (GetTopRankMergeObjByContext) rankings were previously tried here and are
/// currently disabled.
/// </remarks>
public static MergeObj GetTopRankMergeObj(HashSet<MergeObj> candidates, CSpellApi cSpellApi, int tarPos, List<TokenObj> nonSpaceTokenList, bool debugFlag)
{
    return GetTopRankMergeObjByCSpell(candidates, cSpellApi, tarPos, nonSpaceTokenList, debugFlag);
}
// test driver
/// <summary>
/// Runs the candidate test set with the default configuration; takes no arguments.
/// </summary>
public static void MainTest(string[] args)
{
    bool hasUnexpectedArgs = args.Length > 0;
    if (hasUnexpectedArgs)
    {
        Console.Error.WriteLine("*** Usage: java RealWord1To1Candidates");
        Environment.Exit(1);
    }

    // init
    CSpellApi cSpellApi = new CSpellApi("../data/Config/cSpell.properties");

    // other drivers, currently disabled: Tests(cSpellApi), TestDists()
    // test candidate rule for TP and FP
    TestTestSet(cSpellApi);
}
/// <summary>
/// Prints, for a couple of sample words, whether the dictionary returned by
/// GetPnDic() contains them.
/// </summary>
private static void TestPnDic(CSpellApi cSpellApi)
{
    RootDictionary pnDic = cSpellApi.GetPnDic();

    // test words: same letters, different case
    string[] probeWords = { "hu", "Hu" };
    foreach (string probe in probeWords)
    {
        bool inDic = pnDic.IsDicWord(probe);
        Console.WriteLine("-- pnDic(" + probe + "): " + inDic);
    }
}
// test driver
/// <summary>
/// Runs TestProcess with the default configuration, or with the configuration
/// file given as the single optional argument.
/// </summary>
/// <param name="args"> empty, or exactly one element: the path of the config file </param>
public static void MainTest(string[] args)
{
    string configFile = "../data/Config/cSpell.properties";
    if (args.Length == 1)
    {
        // the usage text advertises <configFile>; honor it instead of
        // rejecting every argument
        configFile = args[0];
    }
    else if (args.Length > 1)
    {
        Console.WriteLine("Usage: java SpellCorrection <configFile>");
        Environment.Exit(0);
    }

    // init
    CSpellApi cSpellApi = new CSpellApi(configFile);

    // test
    TestProcess(cSpellApi);
}
/// <summary>
/// Runs NonWordCorrector.GetCorrectTerm on a single sample token and prints the
/// input and the corrected output.
/// </summary>
private static void TestGetCorrectTerm(CSpellApi cSpellApi)
{
    // sample input, all lower case
    string inText = "hotflashes";

    // correct the single token and read back its text
    TokenObj corrected = NonWordCorrector.GetCorrectTerm(new TokenObj(inText), cSpellApi);
    string outText = corrected.GetTokenStr();

    // print out
    Console.WriteLine("--------- GetCorrectTerm( ) -----------");
    Console.WriteLine("In: [" + inText + "]");
    Console.WriteLine("Out: [" + outText + "]");
}
/// <summary>
/// Prints, for a few sample words, whether the split-word dictionary contains them.
/// </summary>
private static void TestSplitDic(CSpellApi cSpellApi)
{
    RootDictionary splitWordDic = cSpellApi.GetSplitWordDic();

    // test words
    string[] probeWords = { "do", "i", "ng", "ilove" };
    foreach (string probe in probeWords)
    {
        bool inDic = splitWordDic.IsDicWord(probe);
        Console.WriteLine("-- SplitDic(" + probe + "): " + inDic);
    }
}
// real-word candidate has more restriction than non-word
// TBD, need to organize the code ...
// the check should be done in the ranking
/// <summary>
/// Core check for real-word 1-to-1 candidates. A candidate is accepted when it
/// 1. passes the dictionary gate: in the suggestion dictionary, has a word
///    vector, meets the configured minimum length and word count, and is not an
///    inflectional variant of the (lower-cased) input word; and
/// 2. either has both phonetic distances (Metaphone2, RefinedSoundex) equal to 0,
///    or is close both phonetically and orthographically (fixed thresholds from
///    empirical tests, not configurable).
/// </summary>
/// <param name="inWord"> the input word; it is lower-cased before comparison </param>
/// <param name="cand"> the candidate to validate </param>
/// <param name="cSpellApi"> cSpell API object supplying dictionaries and thresholds </param>
/// <returns> true if the candidate is a valid real-word 1-to-1 candidate </returns>
private static bool IsValid1To1Cand(string inWord, string cand, CSpellApi cSpellApi)
{
    RootDictionary suggestDic = cSpellApi.GetSuggestDic();
    Word2Vec word2VecOm = cSpellApi.GetWord2VecOm();
    WordWcMap wordWcMap = cSpellApi.GetWordWcMap();
    int rw1To1CandMinWc = cSpellApi.GetCanRw1To1CandMinWc();
    int rw1To1CandMinLength = cSpellApi.GetCanRw1To1CandMinLength();
    string inWordLc = inWord.ToLower();
    int candLen = cand.Length;

    // 1. dictionary gate (short-circuits in this order)
    bool passesDicGate = suggestDic.IsDicWord(cand)
        && word2VecOm.HasWordVec(cand)
        && (candLen >= rw1To1CandMinLength)
        && (WordCountScore.GetWc(cand, wordWcMap) >= rw1To1CandMinWc)
        && !InflVarsUtil.IsInflectionVar(inWordLc, cand);
    if (!passesDicGate)
    {
        return false;
    }

    // 2. more restriction for real-word candidates: phonetic + orthographic
    int pmDist = Metaphone2.GetDistance(inWordLc, cand);
    int prDist = RefinedSoundex.GetDistance(inWordLc, cand);
    int leadDist = GetLeadCharDist(inWordLc, cand);
    int endDist = GetEndCharDist(inWordLc, cand);
    int lengthDist = GetLengthDist(inWordLc, cand);
    int totalDist1 = leadDist + endDist + lengthDist + pmDist + prDist;
    int editDist = EditDistance.GetDistanceForRealWord(inWordLc, cand);
    int totalDist2 = editDist + pmDist + prDist;

    // if they sound the same
    if ((pmDist == 0) && (prDist == 0))
    {
        return true;
    }

    // if they sound similar (at least one phonetic distance is 0) and the
    // orthography is also similar
    return (totalDist1 < 3) && (totalDist2 < 4) && (pmDist * prDist == 0);
}
// TBD... this is the bottle neck because so many real-words call this
// needs to speed up
//
// public method
/// <summary>
/// Gets the real-word 1-to-1 candidates of a word:
/// 1. look the lower-cased word up in the in-memory candMap_ cache;
/// 2. on a miss, generate all edit-distance variants (insert, remove, replace,
///    switch chars; no space, so no split), keep those accepted by
///    IsValid1To1Cand, and store the result in the cache.
/// </summary>
/// <param name="inWord"> the input word </param>
/// <param name="cSpellApi"> cSpell API object </param>
/// <returns> the set of valid candidates (possibly empty) </returns>
public static HashSet<string> GetCandidates(string inWord, CSpellApi cSpellApi)
{
    int maxLength = cSpellApi.GetCanRw1To1WordMaxLength();
    string cacheKey = inWord.ToLower();

    // 1. served from memory when this word was seen before
    HashSet<string> cached = candMap_.GetValueOrNull(cacheKey);
    if (cached != null)
    {
        return cached;
    }

    // 2. generate on the fly
    // NOTE(review): variants are generated from inWord as given, while the
    // validation and the cache key use the lower-cased form — confirm that
    // GetCandidatesByEd is case-insensitive.
    HashSet<string> validCands = new HashSet<string>();
    // 2.1 all edit-distance variants (noted bottle neck for real-word: 7 min.)
    // 2.2 keep only the valid 1-to-1 candidates (noted bottle neck: 2 min.)
    foreach (string variant in CandidatesUtil1To1.GetCandidatesByEd(inWord, maxLength))
    {
        if (IsValid1To1Cand(cacheKey, variant, cSpellApi))
        {
            validCands.Add(variant);
        }
    }

    // 3. remember the result to speed up later calls
    // TBD, need to set the maxKeyNo for candMap_: the key count must stay
    // <= 2**31-1 = 2,147,483,647; slow performance and crash could happen
    // if there are too many keys
    if (!candMap_.ContainsKey(cacheKey))
    {
        candMap_[cacheKey] = validCands;

        // warn (only on every 100th key) once the configured limit is exceeded;
        // suggested limit: < 1,500,000,000 for performance
        int maxHashKeySize = cSpellApi.GetCanRw1To1CandMaxKeySize();
        int hashKeySize = candMap_.Keys.Count;
        bool overLimit = hashKeySize > maxHashKeySize;
        if (overLimit && ((hashKeySize % 100) == 0))
        {
            // NOTE(review): the "[email protected]" prefix looks like an
            // e-mail-obfuscation artifact of an original "...@Class.Method" tag — confirm.
            Console.Error.WriteLine("** [email protected]: the size of key in RW-1To1-Cand-HashMap is too big (" + hashKeySize + " > " + maxHashKeySize + "). Please rerun the cSpell and increase the max. hash key size in the cSpell config (must < 2,147,483,647).");
        }
    }
    return validCands;
}
/// <summary>
/// Runs TestCand over a fixed list of (input word, expected candidate) pairs and
/// prints "total|hits" from the running counters.
/// </summary>
/// <param name="cSpellApi"> cSpell API object </param>
private static void Tests(CSpellApi cSpellApi)
{
    // {input word, expected candidate}; order and repeated pairs are kept as in
    // the original call list because every call is counted in the totals
    string[,] wordPairs =
    {
        { "too", "to" },
        { "then", "than" },
        { "thing", "think" },
        { "sisters", "sisters'" },
        { "know", "now" },
        { "tried", "tired" },
        { "specially", "especially" },
        { "law", "lat" },
        { "domestic", "damaged" },
        { "Weather", "whether" },
        { "there", "their" },
        { "then", "than" },
        { "fine", "find" },
        { "bowl", "bowel" },
        { "off", "of" },
        { "Dies", "Does" },
        { "descended", "undescended" },
        { "effect", "affect" },
        { "pregnancy", "pregnant" },
        { "leave", "live" },
        { "affects", "effects" },
        { "their", "there" },
        { "you", "your" },
        { "medical", "medicine" },
        { "medical", "medicine" },
        { "swollen", "swelling" },
        { "swollen", "swelling" },
        { "well", "swell" },
        { "FRIENDS", "friend's" },
        { "access", "excess" },
        { "where", "were" },
        { "spot", "stop" },
        { "weather", "whether" },
        { "were", "we're" },
        { "small", "smell" },
        { "bond", "bone" },
        { "then", "than" },
        { "leave", "live" },
        { "meningitidis", "meningitis" },
    };

    for (int i = 0; i < wordPairs.GetLength(0); i++)
    {
        TestCand(wordPairs[i, 0], wordPairs[i, 1], cSpellApi);
    }
    Console.WriteLine(totalNo_ + "|" + totalCandNo_);
}
// test driver
/// <summary>
/// Runs the split-word and pn dictionary checks with the default configuration;
/// takes no arguments.
/// </summary>
public static void MainTest(string[] args)
{
    string configFile = "../data/Config/cSpell.properties";
    bool hasUnexpectedArgs = args.Length > 0;
    if (hasUnexpectedArgs)
    {
        Console.WriteLine("Usage: java BasicDictionary");
        Environment.Exit(0);
    }

    // init
    CSpellApi cSpellApi = new CSpellApi(configFile);

    // test cases and print out (Test() is currently disabled)
    TestSplitDic(cSpellApi);
    TestPnDic(cSpellApi);
}
// public method
/// <summary>
/// Returns the top ranked merge candidate for the target position, delegating to
/// the context-based ranking.
/// </summary>
/// <remarks>
/// Per the original author's note, only the Word2Vec (context) score has been
/// tested so far; the frequency-based (GetTopRankMergeObjByFrequency) and combined
/// (GetTopRankMergeObjByCSpell) rankings are not used here yet.
/// </remarks>
public static MergeObj GetTopRankMergeObj(HashSet<MergeObj> candidates, CSpellApi cSpellApi, bool debugFlag, int tarPos, List<TokenObj> nonSpaceTokenList)
{
    return GetTopRankMergeObjByContext(candidates, cSpellApi, debugFlag, tarPos, nonSpaceTokenList);
}
// public method
/// <summary>
/// Gets the non-word 1-to-1 candidates of a word by edit distance:
/// 1. generate all combinations from insert, remove, replace and switch of chars
///    (no space is inserted, so no split);
/// 2. keep only those accepted by IsValid1To1Cand.
/// </summary>
/// <param name="inWord"> the input word </param>
/// <param name="cSpellApi"> cSpell API object </param>
/// <returns> the set of valid candidates (possibly empty) </returns>
public static HashSet<string> GetCandidates(string inWord, CSpellApi cSpellApi)
{
    int maxLength = cSpellApi.GetCanNw1To1WordMaxLength();

    // every edit-distance variant of the input word
    HashSet<string> variants = CandidatesUtil1To1.GetCandidatesByEd(inWord, maxLength);

    // drop the variants that are not valid one-to-one candidate words
    HashSet<string> accepted = new HashSet<string>();
    foreach (string variant in variants)
    {
        if (!IsValid1To1Cand(inWord, variant, cSpellApi))
        {
            continue;
        }
        accepted.Add(variant);
    }
    return accepted;
}
/// <summary>
/// Runs NonWordMergeCorrector.GetCorrectTerm on the first non-space token of a
/// sample text and prints the input, the non-space token count and the resulting
/// merge object.
/// </summary>
/// <param name="cSpellApi"> cSpell API object </param>
private static void TestGetCorrectTerm(CSpellApi cSpellApi)
{
    // init
    string inText = "Dur ing my absent.";
    bool debugFlag = false;
    List<TokenObj> inTokenList = TextObj.TextToTokenList(inText);

    // 1. convert to the non-space token list
    List<TokenObj> nonSpaceTokenList = TextObj.GetNonSpaceTokenObjList(inTokenList);

    // 2. merge correction at the first token
    int tarPos = 0;
    MergeObj mergeObj = NonWordMergeCorrector.GetCorrectTerm(tarPos, nonSpaceTokenList, cSpellApi, debugFlag);

    // NOTE(review): presumably the corrector returns null when no merge is found —
    // confirm; the guard keeps the driver from crashing on a null result.
    string mergeStr = (mergeObj == null) ? "null" : mergeObj.ToString();

    // print out
    Console.WriteLine("--------- GetCorrectTerm( ) -----------");
    Console.WriteLine("In: [" + inText + "]");
    Console.WriteLine("In nonSpaceTokenList: [" + nonSpaceTokenList.Count + "]");
    Console.WriteLine("Out MergeObj: [" + mergeStr + "]");
}
// check dic and exception
/// <summary>
/// Decides whether a word qualifies for real-word merge detection. It must be:
/// 1. a valid word in the split-word (merge) dictionary, and
/// 2. not a real-word exception (checked by IsRealWordExceptions with the unit
///    dictionary).
/// In debug mode both component flags are evaluated again and printed.
/// </summary>
private static bool IsRealWordMerge(string inWord, CSpellApi cSpellApi, bool debugFlag)
{
    // init
    RootDictionary checkDic = cSpellApi.GetSplitWordDic(); // merge Dic
    RootDictionary unitDic = cSpellApi.GetUnitDic();

    // the exception check only runs when the word is known in the dictionary
    bool realWordMergeFlag = false;
    if (checkDic.IsValidWord(inWord))
    {
        realWordMergeFlag = !IsRealWordExceptions(inWord, unitDic);
    }

    if (debugFlag)
    {
        DebugPrint.PrintRwMergeDetect(inWord, realWordMergeFlag, checkDic.IsValidWord(inWord), IsRealWordExceptions(inWord, unitDic), debugFlag);
    }
    return realWordMergeFlag;
}
// public test driver
/// <summary>
/// Runs Tests with the default configuration, or with the configuration file given
/// as the single optional argument.
/// </summary>
/// <param name="args"> empty, or exactly one element: the path of the config file </param>
public static void MainTest(string[] args)
{
    string configFile = "../data/Config/cSpell.properties";
    if (args.Length == 1)
    {
        configFile = args[0];
    }
    else if (args.Length > 1)
    {
        // previously this check was not an "else" branch, so a single (valid)
        // config argument also ended in the usage message and exit
        Console.Error.WriteLine("Usage: java NonWordDetector <config>");
        Environment.Exit(1);
    }

    // init, read in from config
    CSpellApi cSpellApi = new CSpellApi(configFile);

    // Test
    Tests(cSpellApi);
}
/// <summary>
/// Decides whether a word is a non-word. It must be:
/// 1. not a valid word in the check dictionary, and
/// 2. not a non-word exception (checked by IsNonWordExceptions with the unit
///    dictionary); an exception is not treated as a misspelling even if it is
///    unknown to the dictionary.
/// In debug mode both component flags are evaluated again and printed.
/// </summary>
/// <param name="inWord"> the word to check </param>
/// <param name="cSpellApi"> cSpell API object </param>
/// <param name="debugFlag"> flag for debug print </param>
/// <returns> true if the word is detected as a non-word </returns>
public static bool IsNonWord(string inWord, CSpellApi cSpellApi, bool debugFlag)
{
    // init
    RootDictionary checkDic = cSpellApi.GetCheckDic();
    RootDictionary unitDic = cSpellApi.GetUnitDic();

    // the exception check only runs when the word is unknown to the dictionary
    bool nonWordFlag = false;
    if (!checkDic.IsValidWord(inWord))
    {
        nonWordFlag = !IsNonWordExceptions(inWord, unitDic);
    }

    if (debugFlag)
    {
        DebugPrint.PrintNwDetect(inWord, nonWordFlag, checkDic.IsValidWord(inWord), IsNonWordExceptions(inWord, unitDic), debugFlag);
    }
    return nonWordFlag;
}
// public method
/// <summary>
/// Non-word split and 1-to-1 correction over the whole text. Each legit token is
/// corrected by NonWordCorrector.GetCorrectTerm (with its position among the
/// non-space tokens as context); every other token is passed through unchanged.
/// </summary>
/// <param name="inTokenList"> token list of the whole text </param>
/// <param name="cSpellApi"> cSpell API object </param>
/// <param name="debugFlag"> flag for debug print </param>
/// <returns> the output token list after split and 1-to-1 corrections </returns>
public static List<TokenObj> Process(List<TokenObj> inTokenList, CSpellApi cSpellApi, bool debugFlag)
{
    DebugPrint.PrintProcess("3-4. NonWord-Split & 1To1", debugFlag);
    DebugPrint.PrintInText(TextObj.TokenListToText(inTokenList), debugFlag);

    List<TokenObj> outTokenList = new List<TokenObj>();

    // context list: the same tokens without the space tokens
    List<TokenObj> nonSpaceTokenList = TextObj.GetNonSpaceTokenObjList(inTokenList);
    int maxLegitTokenLength = cSpellApi.GetMaxLegitTokenLength();

    // position of the current legit token, passed to the context module
    int contextPos = 0;

    // iterate the full list so the original space tokens are kept
    foreach (TokenObj tokenObj in inTokenList)
    {
        // non-legit tokens (space tokens, long tokens; SCR-3) pass through as is
        TokenObj corrected = tokenObj;
        if (tokenObj.IsLegitToken(maxLegitTokenLength))
        {
            // the correct term is the highest ranked candidate
            corrected = NonWordCorrector.GetCorrectTerm(tokenObj, cSpellApi, debugFlag, contextPos, nonSpaceTokenList);
            contextPos++;
        }

        // a correction may be a split, so it is added through the split-aware helper
        Split1To1Corrector.AddSplit1To1Correction(outTokenList, corrected);
    }
    return outTokenList;
}
// test driver
/// <summary>
/// Runs TestSplit with the default configuration, or with the configuration file
/// given as the single optional argument; more than one argument prints the usage.
/// </summary>
public static void MainTest(string[] args)
{
    if (args.Length > 1)
    {
        Console.WriteLine("Usage: java RealWordCorrector <configFile>");
        Environment.Exit(0);
    }
    string configFile = (args.Length == 1) ? args[0] : "../data/Config/cSpell.properties";

    // init
    CSpellApi cSpellApi = new CSpellApi(configFile);

    // test
    TestSplit(cSpellApi);
}
/// <summary>
/// Runs Process on a sample text and prints the input, the corrected output and
/// the per-token operation details.
/// </summary>
private static void TestProcess(CSpellApi cSpellApi)
{
    // sample input, all lower case
    string inText = "hotflashes and knowaboutare not forr playsure.";
    bool debugFlag = false;

    // tokenize, process, and convert back to text
    List<TokenObj> corrected = Process(TextObj.TextToTokenList(inText), cSpellApi, debugFlag);
    string outText = TextObj.TokenListToText(corrected);

    // print out
    Console.WriteLine("------ GetCorrection by Process( ) ------");
    Console.WriteLine("In: [" + inText + "]");
    Console.WriteLine("Out: [" + outText + "]");
    Console.WriteLine("----- Details -----------");
    // print out operation details
    Console.WriteLine(TextObj.TokenListToOperationDetailStr(corrected));
}
// private method
/// <summary>
/// Runs ProcessByTokenObj on a sample text meant to exercise non-word, one-to-one,
/// split and merge correction, then prints the input, the corrected output and
/// the per-token operation details.
/// </summary>
private static void TestProcess(CSpellApi cSpellApi)
{
    // sample input, all lower case
    string inText = "hotflashes and knowaboutare not forr playsure dur ing my disa ppoint ment.";
    bool debugFlag = false;

    // the process works on a token list, not on the raw text
    List<TokenObj> corrected = ProcessByTokenObj(TextObj.TextToTokenList(inText), cSpellApi, debugFlag);
    string outText = TextObj.TokenListToText(corrected);

    // print out
    Console.WriteLine("------ GetCorrection by Process( ) ------");
    Console.WriteLine("In: [" + inText + "]");
    Console.WriteLine("Out: [" + outText + "]");
    Console.WriteLine("----- Details -----------");
    // print out operation details
    Console.WriteLine(TextObj.TokenListToOperationDetailStr(corrected));
}
/// <summary>
/// Runs Process on a multi-sentence sample text containing split-up words and
/// prints the input, the corrected output and the per-token operation details.
/// </summary>
private static void TestProcess(CSpellApi cSpellApi)
{
    // sample input
    string inText = "She had problems dur ing her pregnancies. That is a dis appoint ment. Good!";
    bool debugFlag = false;

    // the process works on a token list, not on the raw text
    List<TokenObj> corrected = Process(TextObj.TextToTokenList(inText), cSpellApi, debugFlag);
    string outText = TextObj.TokenListToText(corrected);

    // print out
    Console.WriteLine("------ GetCorrection by Process( ) ------");
    Console.WriteLine("In: [" + inText + "]");
    Console.WriteLine("Out: [" + outText + "]");
    Console.WriteLine("----- Details -----------");
    // print out operation details
    Console.WriteLine(TextObj.TokenListToOperationDetailStr(corrected));
}
// public method
/// <summary>
/// The core method to correct a word by real-word split, in the following steps:
/// <list type="bullet">
/// <item><description>convert the input token to its core term</description></item>
/// <item><description>detect whether it is a real-word split case</description></item>
/// <item><description>get the split candidates</description></item>
/// <item><description>rank the candidates (context)</description></item>
/// <item><description>update the token and the correction information</description></item>
/// </list>
/// The returned token carries the original text if no correction is made.
/// </summary>
/// <param name="inTokenObj"> the input tokenObj (single word) </param>
/// <param name="cSpellApi"> cSpell API object </param>
/// <param name="debugFlag"> flag for debug print </param>
/// <param name="tarPos"> position of the target token to be split </param>
/// <param name="nonSpaceTokenList"> the token list without space tokens </param>
/// <returns> the split words in tokenObj </returns>
public static TokenObj GetCorrectTerm(TokenObj inTokenObj, CSpellApi cSpellApi, bool debugFlag, int tarPos, List<TokenObj> nonSpaceTokenList)
{
    // init (the function mode is read as in the original; its value is not used here)
    int funcMode = cSpellApi.GetFuncMode();

    // the result starts out as a copy of the input token
    string inWord = inTokenObj.GetTokenStr();
    TokenObj outTokenObj = new TokenObj(inTokenObj);

    // 1. convert the word to its coreTerm (no leading/ending space, punc, digit)
    CoreTermObj coreTermObj = new CoreTermObj(inWord, CoreTermUtil.CT_TYPE_SPACE_PUNC_DIGIT);
    string coreStr = coreTermObj.GetCoreTerm();

    // 2. detection: only tokens with an empty process history are examined,
    //    and the detector is only asked for those
    bool hasNoHistory = inTokenObj.GetProcHist().Count == 0;
    if (!hasNoHistory || !RealWordSplitDetector.IsDetect(inWord, coreStr, cSpellApi, debugFlag))
    {
        return outTokenObj;
    }
    cSpellApi.UpdateDetectNo();
    // TBD, should take care of possessive xxx's here

    // 3. get the split candidates set
    int maxSplitNo = cSpellApi.GetCanRwMaxSplitNo();
    HashSet<string> splitSet = RealWordSplitCandidates.GetCandidates(coreStr, cSpellApi, maxSplitNo);

    // 4. ranking: the top ranked candidate (using context) is the corrected term
    string topRankStr = RankRealWordSplitByMode.GetTopRankStr(coreStr, splitSet, cSpellApi, debugFlag, tarPos, nonSpaceTokenList);

    // 5. update the coreTerm and convert back to the full word
    coreTermObj.SetCoreTerm(topRankStr);
    string outWord = coreTermObj.ToString();

    // 6. update info if there is a real-word correction
    if (!inWord.Equals(outWord))
    {
        cSpellApi.UpdateCorrectNo();
        outTokenObj.SetTokenStr(outWord);
        outTokenObj.AddProcToHist(TokenObj.HIST_RW_S); // split
        DebugPrint.PrintCorrect("RW", "RealWordSplitCorrector", inWord, outWord, debugFlag);
    }
    return outTokenObj;
}
// check all split words
/// <summary>
/// Returns true only if every word of the split term is accepted by
/// IsValidSplitWord; stops at the first rejected word. A term that yields no
/// words is accepted.
/// </summary>
private static bool CheckSplitWords(string inTerm, CSpellApi cSpellApi)
{
    // convert to word list and check each split word in order
    foreach (string word in TermUtil.ToWordList(inTerm))
    {
        if (!IsValidSplitWord(word, cSpellApi))
        {
            // one invalid split word invalidates the whole split
            return false;
        }
    }
    return true;
}
// public method
/// <summary>
/// Gets the split candidates of a core term, filtered by dictionary:
/// 1. generate all split combinations by spaces (bounded by maxSplitNo);
/// 2. keep the splits that are entries of the multiword dictionary;
/// 3. only if step 2 found nothing, keep the splits accepted by IsValidSplitCand.
/// </summary>
/// <param name="inWord"> the input word; must be a coreTerm </param>
/// <param name="cSpellApi"> cSpell API object </param>
/// <param name="maxSplitNo"> the maximum number of splits </param>
/// <returns> the set of split candidates (possibly empty) </returns>
public static HashSet<string> GetCandidates(string inWord, CSpellApi cSpellApi, int maxSplitNo)
{
    // init from cSpellApi
    RootDictionary mwDic = cSpellApi.GetMwDic();

    // 1. all possible split combinations
    HashSet<string> allSplits = CandidatesUtilSplit.GetSplitSet(inWord, maxSplitNo);
    HashSet<string> accepted = new HashSet<string>();

    // 2. multiwords: the whole split term must be in the multiword dictionary
    // TBD: this will find "perse" to "per se", however, "perse" is a valid word
    // in eng_medical.dic so cSpell can't correct it. The dictionary needs to be
    // refined later.
    foreach (string splitTerm in allSplits)
    {
        if (mwDic.IsDicWord(splitTerm))
        {
            accepted.Add(splitTerm);
        }
    }
    if (accepted.Count != 0)
    {
        return accepted;
    }

    // 3. no multiword found: accept a split when all of its words are valid
    //    (Acr/Abb are excluded to eliminate noise such as 'a', 'ab', etc.)
    foreach (string splitTerm in allSplits)
    {
        if (IsValidSplitCand(splitTerm, cSpellApi))
        {
            accepted.Add(splitTerm);
        }
    }
    return accepted;
}
// for the split, we don't want Aa as a valid word
// because it will cause too much noise (less precision)
// TBD ... re-organize
/// <summary>
/// Checks one word of a real-word split candidate. The word must, in this order:
/// 1. be in the split-word dictionary (the original notes it uses LexiconNoAa),
/// 2. have a word vector,
/// 3. reach the configured minimum word count,
/// 4. not be a unit (e.g. mg),
/// 5. not be a proper noun (e.g. human -> Hu man, children -> child ren).
/// </summary>
private static bool IsValidSplitWord(string inWord, CSpellApi cSpellApi)
{
    RootDictionary splitWordDic = cSpellApi.GetSplitWordDic();
    WordWcMap wordWcMap = cSpellApi.GetWordWcMap();
    Word2Vec word2VecOm = cSpellApi.GetWord2VecOm();
    RootDictionary unitDic = cSpellApi.GetUnitDic();
    RootDictionary pnDic = cSpellApi.GetPnDic();
    int rwSplitCandMinWc = cSpellApi.GetCanRwSplitCandMinWc();

    // each failed requirement rejects the word without evaluating the later ones
    if (!splitWordDic.IsDicWord(inWord))
    {
        return false;
    }
    if (!word2VecOm.HasWordVec(inWord))
    {
        return false;
    }
    if (WordCountScore.GetWc(inWord, wordWcMap) < rwSplitCandMinWc)
    {
        return false;
    }
    if (unitDic.IsDicWord(inWord))
    {
        return false;
    }
    return !pnDic.IsDicWord(inWord);
}
// private methods
/// <summary>
/// Unit test of the merge candidates: tokenizes a sample text, prints the tokens,
/// then prints the merge candidates found for a fixed target position in the
/// non-space token list.
/// </summary>
private static void Test()
{
    // init cSpellApi
    CSpellApi cSpellApi = new CSpellApi("../data/Config/cSpell.properties");
    Console.WriteLine("===== Unit Test of MergeCandidates =====");

    // other sample texts used before:
    //   "He was dia gnosed early onset deminita 3 year ago."
    //   (from 73.txt) "I have seven live births with no problems dur ing my pregnancies. That is a dis appoint ment"
    string inText = "That is a disa ppoint ment.";
    List<TokenObj> inTextList = TextObj.TextToTokenList(inText);

    // all token strings on one line, then one token per line
    IEnumerable<string> tokenStrs = inTextList.Select(token => token.GetTokenStr());
    Console.WriteLine(" - inTextList (" + inTextList.Count + "): [" + String.Join("|", tokenStrs) + "]");
    Console.WriteLine("-------------------------");
    foreach (TokenObj token in inTextList)
    {
        Console.WriteLine(token.ToString());
    }

    int tarPos = 4;
    Console.WriteLine("-------------------------");
    Console.WriteLine("- tarPos: " + tarPos);
    Console.WriteLine("- maxMergeNo: " + cSpellApi.GetCanNwMaxMergeNo());
    Console.WriteLine("------ merge set -------");

    // pre-process: convert to the non-space token list
    List<TokenObj> nonSpaceTextList = TextObj.GetNonSpaceTokenObjList(inTextList);

    // candidates for the specified target position
    foreach (MergeObj candidate in GetCandidates(tarPos, nonSpaceTextList, cSpellApi))
    {
        Console.WriteLine(candidate.ToString());
    }
}