// Decides whether inWord qualifies as a real word for real-word split
// detection. All of the following must hold:
//   1. the check dictionary accepts the word (IsValidWord)
//   2. it is not a real-word exception (IsRealWordExceptions with the unit
//      dictionary; per the original notes: url, email, digit, ...)
//      => an exception is never corrected, even if it is a non-word
//   3. its length is at least the configured minimum length
//   4. it has a word2Vec entry
//   5. its word count is at least the configured minimum word count
// When debugFlag is set, every criterion is re-evaluated on its own and
// passed to DebugPrint.PrintRwSplitDetect together with the final verdict.
public static bool IsRealWord(string inWord, CSpellApi cSpellApi, bool debugFlag) {
    // resources and thresholds supplied by the API object
    RootDictionary dictionary = cSpellApi.GetCheckDic();
    RootDictionary units = cSpellApi.GetUnitDic();
    WordWcMap wcMap = cSpellApi.GetWordWcMap();
    Word2Vec vectors = cSpellApi.GetWord2VecOm();
    int wordLength = inWord.Length;
    // TBD, change method name
    int minLength = cSpellApi.GetDetectorRwSplitWordMinLength();
    int minWc = cSpellApi.GetDetectorRwSplitWordMinWc();

    // The criteria are tested left to right; the first failing one stops
    // the evaluation, so later lookups are skipped.
    bool isRealWord = false;
    if (dictionary.IsValidWord(inWord)) {
        isRealWord = !IsRealWordExceptions(inWord, units)
            && wordLength >= minLength
            && vectors.HasWordVec(inWord)
            && WordCountScore.GetWc(inWord, wcMap) >= minWc;
    }

    if (debugFlag) {
        // No short-circuit here: each criterion is reported individually.
        bool inDictionary = dictionary.IsValidWord(inWord);
        bool isException = IsRealWordExceptions(inWord, units);
        bool longEnough = wordLength >= minLength;
        bool hasVector = vectors.HasWordVec(inWord);
        bool frequentEnough = WordCountScore.GetWc(inWord, wcMap) >= minWc;
        DebugPrint.PrintRwSplitDetect(inWord, isRealWord, inDictionary,
            isException, longEnough, hasVector, frequentEnough, debugFlag);
    }

    return isRealWord;
}
// private methods

// A merge candidate is valid when its core merged word
//   1. has a word2Vec entry, and
//   2. has a word count of at least the configured merge-candidate minimum.
private static bool IsValidMergeCand(MergeObj mergeObj, CSpellApi cSpellApi) {
    WordWcMap wcMap = cSpellApi.GetWordWcMap();
    Word2Vec vectors = cSpellApi.GetWord2VecOm();
    string mergedCore = mergeObj.GetCoreMergeWord();
    int minWc = cSpellApi.GetCanRwMergeCandMinWc();

    // The word count is only looked up when a word vector exists.
    if (!vectors.HasWordVec(mergedCore)) {
        return false;
    }
    return WordCountScore.GetWc(mergedCore, wcMap) >= minWc;
}
// For splits we do not want Aa entries to count as valid words, because
// they cause too much noise (lower precision).
// TBD ... re-organize
//
// A real-word split candidate word must:
//   1. be in the split-word dictionary (per the original notes this is
//      LexiconNoAa: no Aa with a small length; e.g. "cel" is an overlap,
//      it is both aa and not-aa)
//   2. have a word2Vec entry
//   3. have a word count of at least the configured minimum
//   4. not be a unit (mg -> ...)
//   5. not be a proper noun (human -> Hu man, where Hu is pn;
//      children -> child ren, where ren is pn)
private static bool IsValidSplitWord(string inWord, CSpellApi cSpellApi) {
    RootDictionary splitDic = cSpellApi.GetSplitWordDic();
    WordWcMap wcMap = cSpellApi.GetWordWcMap();
    Word2Vec vectors = cSpellApi.GetWord2VecOm();
    RootDictionary units = cSpellApi.GetUnitDic();
    RootDictionary properNouns = cSpellApi.GetPnDic();
    int minWc = cSpellApi.GetCanRwSplitCandMinWc();

    // Each rule is a guard; the first one that fails rejects the word and
    // the remaining lookups are skipped.
    if (!splitDic.IsDicWord(inWord)) {
        return false;
    }
    if (!vectors.HasWordVec(inWord)) {
        return false;
    }
    if (WordCountScore.GetWc(inWord, wcMap) < minWc) {
        return false;
    }
    if (units.IsDicWord(inWord)) {
        return false;
    }
    return !properNouns.IsDicWord(inWord);
}
// Real-word candidates have more restrictions than non-word candidates.
// TBD, need to organize the code ...
// the check should be done in the ranking
// Core process for real-word candidates.
//
// Returns true when cand is an acceptable 1-to-1 replacement for inWord.
//
// Eligibility (all required):
//   - cand is in the suggestion dictionary
//   - cand has a word2Vec entry
//   - cand length >= configured minimum candidate length
//   - cand word count >= configured minimum candidate word count
//   - cand is not an inflectional variant of the lowercased inWord
//
// Similarity (one required), with pmDist = Metaphone2 distance and
// prDist = RefinedSoundex distance between lowercased inWord and cand:
//   - they sound the same: pmDist == 0 and prDist == 0
//   - they sound similar and look similar:
//       lead + end + length + pmDist + prDist < 3, and
//       editDist + pmDist + prDist < 4, and
//       at least one of pmDist / prDist is 0
//     (thresholds fixed from empirical tests, not configurable)
private static bool IsValid1To1Cand(string inWord, string cand, CSpellApi cSpellApi) {
    RootDictionary suggestDic = cSpellApi.GetSuggestDic();
    Word2Vec word2VecOm = cSpellApi.GetWord2VecOm();
    WordWcMap wordWcMap = cSpellApi.GetWordWcMap();
    int rw1To1CandMinWc = cSpellApi.GetCanRw1To1CandMinWc();
    int rw1To1CandMinLength = cSpellApi.GetCanRw1To1CandMinLength();

    // Lowercase with the invariant culture: a culture-sensitive ToLower()
    // maps 'I' to dotless 'ı' under e.g. tr-TR, which would skew every
    // phonetic and edit-distance comparison below.
    string inWordLc = inWord.ToLowerInvariant();
    int candLen = cand.Length;

    // 1. eligibility: suggDic, word2Vec, length, WC, and not inflVars
    bool eligible = suggestDic.IsDicWord(cand)
        && word2VecOm.HasWordVec(cand)
        && candLen >= rw1To1CandMinLength
        && WordCountScore.GetWc(cand, wordWcMap) >= rw1To1CandMinWc
        && !InflVarsUtil.IsInflectionVar(inWordLc, cand);
    if (!eligible) {
        return false;
    }

    // 2. phonetic distances
    int pmDist = Metaphone2.GetDistance(inWordLc, cand);
    int prDist = RefinedSoundex.GetDistance(inWordLc, cand);

    // if they sound the same, no orthographic check is needed
    if ((pmDist == 0) && (prDist == 0)) {
        return true;
    }

    // 3. orthographic distances, only needed for the "similar" rule
    int leadDist = GetLeadCharDist(inWordLc, cand);
    int endDist = GetEndCharDist(inWordLc, cand);
    int lengthDist = GetLengthDist(inWordLc, cand);
    int totalDist1 = leadDist + endDist + lengthDist + pmDist + prDist;
    int editDist = EditDistance.GetDistanceForRealWord(inWordLc, cand);
    int totalDist2 = editDist + pmDist + prDist;

    // if they sound similar and the orthography is also similar;
    // at least one phonetic code must match exactly
    bool onePhoneticMatch = (pmDist == 0) || (prDist == 0);
    return (totalDist1 < 3) && (totalDist2 < 4) && onePhoneticMatch;
}