/// <summary>
/// Tells whether <paramref name="inWord"/> is an entry of the supplied dictionary.
/// </summary>
/// <param name="inWord">Word to look up; handed to the dictionary unchanged.</param>
/// <param name="pnDic">
/// Dictionary to query. The original notes describe it as the Lexicon
/// proper-noun list with case-sensitive lookup; the case handling itself is
/// decided by the dictionary instance and is not visible here.
/// </param>
/// <returns>The result of <c>pnDic.IsDicWord(inWord)</c>.</returns>
private static bool IsProperNoun(string inWord, RootDictionary pnDic)
{
    return pnDic.IsDicWord(inWord);
}
/// <summary>
/// Tells whether <paramref name="inWord"/> is an entry of the supplied dictionary.
/// </summary>
/// <param name="inWord">Word to look up; handed to the dictionary unchanged.</param>
/// <param name="aaDic">
/// Dictionary to query. The original notes describe it as the Lexicon
/// abbreviation/acronym list with case-sensitive lookup; the case handling
/// itself is decided by the dictionary instance and is not visible here.
/// </param>
/// <returns>The result of <c>aaDic.IsDicWord(inWord)</c>.</returns>
private static bool IsAbbAcr(string inWord, RootDictionary aaDic)
{
    return aaDic.IsDicWord(inWord);
}
/// <summary>
/// Tells whether <paramref name="inWord"/> is an entry of the supplied dictionary.
/// </summary>
/// <param name="inWord">Word to look up; handed to the dictionary unchanged.</param>
/// <param name="svDic">
/// Dictionary to query. The original notes describe it as the Lexicon spVar
/// list with case-sensitive lookup; the case handling itself is decided by
/// the dictionary instance and is not visible here.
/// </param>
/// <returns>The result of <c>svDic.IsDicWord(inWord)</c>.</returns>
private static bool IsSpVar(string inWord, RootDictionary svDic)
{
    return svDic.IsDicWord(inWord);
}
/// <summary>
/// Decides whether <paramref name="inWord"/> is acceptable as one piece of a
/// split candidate.
/// </summary>
/// <remarks>
/// For splitting, abbreviations/acronyms (Aa) are not wanted as valid words
/// because they add too much noise (lower precision). Per the original notes
/// the split-word dictionary is the Lexicon without Aa entries, so a single
/// lookup is sufficient.
/// </remarks>
/// <param name="inWord">Word to validate; handed to the dictionary unchanged.</param>
/// <param name="cSpellApi">Supplies the split-word dictionary via <c>GetSplitWordDic()</c>.</param>
/// <returns>The result of looking <paramref name="inWord"/> up in the split-word dictionary.</returns>
private static bool IsValidSplitWord(string inWord, CSpellApi cSpellApi)
{
    RootDictionary splitWordDic = cSpellApi.GetSplitWordDic();

    // An extra filter that rejected short Aa words (length below a configured
    // minimum) used to follow this lookup. It was disabled on purpose: the Aa
    // exclusion is already part of splitWordDic, and the extra filter reduced
    // both recall and precision (e.g. ofcourse => incourse). The empty branch
    // that only held the disabled code has been removed.
    return splitWordDic.IsDicWord(inWord);
}
/// <summary>
/// Tells whether <paramref name="inWord"/> is an entry of the supplied
/// abbreviation/acronym dictionary.
/// </summary>
/// <remarks>
/// The original notes state that this check should be case sensitive but
/// that an "aggressive" match was implemented. Nothing in this method alters
/// the case of the word, so any such behaviour must come from the dictionary
/// instance passed in - NOTE(review): confirm how aaDic is constructed.
/// </remarks>
/// <param name="inWord">Word to look up; handed to the dictionary unchanged.</param>
/// <param name="aaDic">Dictionary to query.</param>
/// <returns>The result of <c>aaDic.IsDicWord(inWord)</c>.</returns>
private static bool IsAbbAcr(string inWord, RootDictionary aaDic)
{
    return aaDic.IsDicWord(inWord);
}
/// <summary>
/// Checks whether <paramref name="inWord"/> is a proper noun, i.e. an entry
/// of the supplied proper-noun dictionary.
/// </summary>
/// <remarks>
/// The original notes state that this check should be case sensitive but
/// that an "aggressive" match ignoring case was implemented. Nothing in this
/// method alters the case of the word, so any such behaviour must come from
/// the dictionary instance passed in - NOTE(review): confirm how pnDic is
/// constructed.
/// </remarks>
/// <param name="inWord">Word to look up; handed to the dictionary unchanged.</param>
/// <param name="pnDic">Dictionary to query.</param>
/// <returns>The result of <c>pnDic.IsDicWord(inWord)</c>.</returns>
private static bool IsProperNoun(string inWord, RootDictionary pnDic)
{
    return pnDic.IsDicWord(inWord);
}
/// <summary>
/// Decides whether <paramref name="cand"/> is an acceptable 1-to-1
/// replacement candidate by looking it up in the suggestion dictionary.
/// </summary>
/// <param name="inWord">
/// The word being corrected. It is not consulted by this variant; the
/// parameter is kept so the signature stays unchanged for callers.
/// </param>
/// <param name="cand">Candidate word to validate.</param>
/// <param name="cSpellApi">Supplies the suggestion dictionary via <c>GetSuggestDic()</c>.</param>
/// <returns>The result of looking <paramref name="cand"/> up in the suggestion dictionary.</returns>
private static bool IsValid1To1Cand(string inWord, string cand, CSpellApi cSpellApi)
{
    // Non-word case: membership in the suggestion dictionary is the only
    // requirement (the real-word case applies additional phonetic checks).
    return cSpellApi.GetSuggestDic().IsDicWord(cand);
}
/// <summary>
/// Manual smoke test: prints, for a couple of fixed probe words, whether the
/// proper-noun dictionary obtained from <paramref name="cSpellApi"/> contains
/// them.
/// </summary>
/// <param name="cSpellApi">Supplies the dictionary via <c>GetPnDic()</c>.</param>
private static void TestPnDic(CSpellApi cSpellApi)
{
    RootDictionary properNounDic = cSpellApi.GetPnDic();

    // Same word in two casings, to make the dictionary's case handling visible.
    string[] probes = { "hu", "Hu" };
    for (int i = 0; i < probes.Length; i++)
    {
        string probe = probes[i];
        Console.WriteLine("-- pnDic(" + probe + "): " + properNounDic.IsDicWord(probe));
    }
}
/// <summary>
/// Adds a <c>MergeObj</c> for <paramref name="mergeWord"/> to
/// <paramref name="mergeSet"/> when the merged word qualifies as a candidate.
/// </summary>
/// <remarks>
/// The merged word is first reduced to its core string by
/// <c>TermUtil.StripEndPuncSpace</c>. It qualifies when that core string is
/// in <paramref name="suggestDic"/> and is NOT in <paramref name="aADic"/>;
/// the original notes explain the latter as "no merge for Aa, because Aa is
/// short enough". When the word does not qualify, nothing is added.
/// </remarks>
/// <param name="tarWord">Target word; forwarded to the <c>MergeObj</c> constructor.</param>
/// <param name="orgMergeWord">Original (pre-merge) text; forwarded to the constructor.</param>
/// <param name="mergeWord">Merged word to evaluate.</param>
/// <param name="mergeNo">Merge count; forwarded to the constructor.</param>
/// <param name="startIndex">Forwarded to the constructor.</param>
/// <param name="tarIndex">Forwarded to the constructor.</param>
/// <param name="endIndex">Forwarded to the constructor.</param>
/// <param name="startPos">Forwarded to the constructor.</param>
/// <param name="tarPos">Forwarded to the constructor.</param>
/// <param name="endPos">Forwarded to the constructor.</param>
/// <param name="mergeSet">Set that receives the new object.</param>
/// <param name="suggestDic">Dictionary the core string must be in.</param>
/// <param name="aADic">Dictionary the core string must not be in.</param>
private static void AddMergeObj(string tarWord, string orgMergeWord, string mergeWord, int mergeNo, int startIndex, int tarIndex, int endIndex, int startPos, int tarPos, int endPos, HashSet<MergeObj> mergeSet, RootDictionary suggestDic, RootDictionary aADic)
{
    // 1. Convert the merged word to its core term.
    string coreStr = TermUtil.StripEndPuncSpace(mergeWord);

    // 2. Accept only core terms that are suggestions and are not Aa.
    //    The Aa dictionary is consulted only when the first test passes.
    bool qualifies = suggestDic.IsDicWord(coreStr) && !aADic.IsDicWord(coreStr);
    if (!qualifies)
    {
        return;
    }

    mergeSet.Add(new MergeObj(tarWord, orgMergeWord, mergeWord, coreStr, mergeNo, startIndex, tarIndex, endIndex, startPos, tarPos, endPos));
}
/// <summary>
/// Manual smoke test: prints, for a few fixed probe words, whether the
/// split-word dictionary obtained from <paramref name="cSpellApi"/> contains
/// them.
/// </summary>
/// <param name="cSpellApi">Supplies the dictionary via <c>GetSplitWordDic()</c>.</param>
private static void TestSplitDic(CSpellApi cSpellApi)
{
    RootDictionary splitDic = cSpellApi.GetSplitWordDic();

    string[] probes = { "do", "i", "ng", "ilove" };
    for (int i = 0; i < probes.Length; i++)
    {
        string probe = probes[i];
        Console.WriteLine("-- SplitDic(" + probe + "): " + splitDic.IsDicWord(probe));
    }
}
/// <summary>
/// Core filter for real-word 1-to-1 candidates: decides whether
/// <paramref name="cand"/> may replace <paramref name="inWord"/>.
/// </summary>
/// <remarks>
/// Real-word candidates are restricted more tightly than non-word ones.
/// TBD (from the original notes): this check arguably belongs in the ranking
/// step and the code needs reorganising.
///
/// A candidate passes when BOTH of the following hold:
/// 1. Eligibility - it is in the suggestion dictionary, has a word vector,
///    is at least the configured minimum length, has at least the configured
///    minimum word count, and is not an inflectional variant of the
///    lower-cased input word.
/// 2. Similarity - either both phonetic distances (Metaphone2 and
///    RefinedSoundex) are 0, or at least one of them is 0 while the combined
///    orthographic/phonetic distances stay under fixed thresholds.
/// </remarks>
/// <param name="inWord">Word being corrected; lower-cased before comparison.</param>
/// <param name="cand">Candidate replacement; used as given.</param>
/// <param name="cSpellApi">Supplies the dictionaries, word vectors, word counts and thresholds.</param>
/// <returns><c>true</c> when the candidate passes both stages; otherwise <c>false</c>.</returns>
private static bool IsValid1To1Cand(string inWord, string cand, CSpellApi cSpellApi)
{
    RootDictionary suggestDic = cSpellApi.GetSuggestDic();
    Word2Vec word2VecOm = cSpellApi.GetWord2VecOm();
    WordWcMap wordWcMap = cSpellApi.GetWordWcMap();
    int rw1To1CandMinWc = cSpellApi.GetCanRw1To1CandMinWc();
    int rw1To1CandMinLength = cSpellApi.GetCanRw1To1CandMinLength();

    // Culture-invariant lower-casing: the result must not depend on the
    // machine locale (e.g. under tr-TR, "I" would otherwise become a dotless
    // i and no longer match dictionary entries).
    string inWordLc = inWord.ToLowerInvariant();

    // 1. Eligibility: suggestion dictionary, word vector, length, word count,
    //    and not an inflectional variant. Evaluated left to right, stopping
    //    at the first failing test.
    bool eligible = suggestDic.IsDicWord(cand)
        && word2VecOm.HasWordVec(cand)
        && (cand.Length >= rw1To1CandMinLength)
        && (WordCountScore.GetWc(cand, wordWcMap) >= rw1To1CandMinWc)
        && !InflVarsUtil.IsInflectionVar(inWordLc, cand);
    if (!eligible)
    {
        return false;
    }

    // 2. Similarity: phonetic and orthographic distances.
    int pmDist = Metaphone2.GetDistance(inWordLc, cand);
    int prDist = RefinedSoundex.GetDistance(inWordLc, cand);
    int leadDist = GetLeadCharDist(inWordLc, cand);
    int endDist = GetEndCharDist(inWordLc, cand);
    int lengthDist = GetLengthDist(inWordLc, cand);
    int totalDist1 = leadDist + endDist + lengthDist + pmDist + prDist;
    int editDist = EditDistance.GetDistanceForRealWord(inWordLc, cand);
    int totalDist2 = editDist + pmDist + prDist;

    // They sound the same under both phonetic codings.
    bool soundsSame = (pmDist == 0) && (prDist == 0);

    // They sound similar (one phonetic distance is 0) and are also close
    // orthographically. Thresholds come from empirical tests and are
    // intentionally not configurable.
    bool soundsAndLooksSimilar = (totalDist1 < 3) && (totalDist2 < 4) && (pmDist * prDist == 0);

    return soundsSame || soundsAndLooksSimilar;
}
/// <summary>
/// Produces the split candidates for <paramref name="inWord"/>, filtered by
/// dictionary.
/// </summary>
/// <remarks>
/// Per the original notes, <paramref name="inWord"/> must be a coreTerm.
///
/// Steps:
/// 1. Generate every split of the word allowed by <paramref name="maxSplitNo"/>
///    via <c>CandidatesUtilSplit.GetSplitSet</c>.
/// 2. Keep the splits that are entries of the multiword dictionary.
///    TBD (from the original notes): this finds "perse" -> "per se", but
///    "perse" is itself a valid word in eng_medical.dic, so it cannot be
///    corrected; the dictionary needs refining.
/// 3. Only when step 2 kept nothing, keep the splits accepted by
///    <c>IsValidSplitCand</c> instead. Per the original notes, Acr/Abb are
///    excluded there to eliminate noise such as 'a', 'ab', etc.
/// </remarks>
/// <param name="inWord">Word to split.</param>
/// <param name="cSpellApi">Supplies the multiword dictionary and is forwarded to <c>IsValidSplitCand</c>.</param>
/// <param name="maxSplitNo">Split limit forwarded to <c>GetSplitSet</c>.</param>
/// <returns>The accepted splits; empty when none qualify.</returns>
public static HashSet<string> GetCandidates(string inWord, CSpellApi cSpellApi, int maxSplitNo)
{
    RootDictionary multiwordDic = cSpellApi.GetMwDic();
    HashSet<string> allSplits = CandidatesUtilSplit.GetSplitSet(inWord, maxSplitNo);
    HashSet<string> accepted = new HashSet<string>();

    // Multiword pass: the whole split string must be a known multiword.
    foreach (string candidate in allSplits)
    {
        if (multiwordDic.IsDicWord(candidate))
        {
            accepted.Add(candidate);
        }
    }
    if (accepted.Count != 0)
    {
        return accepted;
    }

    // Fallback pass: validate each split on its own merits.
    foreach (string candidate in allSplits)
    {
        if (IsValidSplitCand(candidate, cSpellApi))
        {
            accepted.Add(candidate);
        }
    }
    return accepted;
}
/// <summary>
/// Decides whether <paramref name="inWord"/> is acceptable as one piece of a
/// real-word split candidate.
/// </summary>
/// <remarks>
/// For splitting, abbreviations/acronyms (Aa) are not wanted as valid words
/// because they add too much noise (lower precision). TBD (from the original
/// notes): re-organise.
///
/// The word must pass every test below, checked in this order; evaluation
/// stops at the first failure:
/// 1. it is in the split-word dictionary (described in the original notes as
///    the Lexicon without Aa);
/// 2. it has a word vector;
/// 3. its word count reaches the configured minimum;
/// 4. it is not a unit (e.g. mg);
/// 5. it is not a proper noun (e.g. human -> "Hu man", children -> "child ren").
/// </remarks>
/// <param name="inWord">Word to validate.</param>
/// <param name="cSpellApi">Supplies the dictionaries, word vectors, word counts and threshold.</param>
/// <returns><c>true</c> when all five tests pass; otherwise <c>false</c>.</returns>
private static bool IsValidSplitWord(string inWord, CSpellApi cSpellApi)
{
    RootDictionary splitWordDic = cSpellApi.GetSplitWordDic();
    WordWcMap wcMap = cSpellApi.GetWordWcMap();
    Word2Vec vectors = cSpellApi.GetWord2VecOm();
    RootDictionary unitDic = cSpellApi.GetUnitDic();
    RootDictionary properNounDic = cSpellApi.GetPnDic();
    int minWordCount = cSpellApi.GetCanRwSplitCandMinWc();

    if (!splitWordDic.IsDicWord(inWord))
    {
        return false;
    }
    if (!vectors.HasWordVec(inWord))
    {
        return false;
    }
    if (!(WordCountScore.GetWc(inWord, wcMap) >= minWordCount))
    {
        return false;
    }
    if (unitDic.IsDicWord(inWord))
    {
        return false;
    }
    return !properNounDic.IsDicWord(inWord);
}
/// <summary>
/// Builds the merge candidates that involve the token at
/// <paramref name="tarPos"/>, using a fixed window of
/// <paramref name="mergeNo"/> + 1 consecutive tokens that is shifted across
/// every start position from <c>max(tarPos - mergeNo, 0)</c> up to
/// <paramref name="tarPos"/>.
/// </summary>
/// <remarks>
/// For each window the tokens are joined three ways: with nothing between
/// them (merge by removing spaces), with <c>GlobalVars.HYPHEN_STR</c>, and
/// with <c>GlobalVars.SPACE_STR</c> (the original, pre-merge text).
/// A window is abandoned when it runs past the end of the list or contains a
/// token classified as digit, punctuation, digit-punctuation, URL or e-mail.
/// A completed window is offered to <c>AddMergeObj</c> only when its original
/// text is not a multiword in <paramref name="mwDic"/> and the short-word
/// rule allows it.
/// </remarks>
/// <param name="tarPos">Position of the target token in <paramref name="nonSpaceTextList"/>.</param>
/// <param name="nonSpaceTextList">Token list to merge over.</param>
/// <param name="mergeNo">Number of merges, i.e. window size minus one.</param>
/// <param name="mergeWithHyphen">When true, the hyphen-joined form is offered as well.</param>
/// <param name="shortWordMerge">When true, the short-word limit is not enforced.</param>
/// <param name="suggestDic">Forwarded to <c>AddMergeObj</c>.</param>
/// <param name="aADic">Forwarded to <c>AddMergeObj</c>.</param>
/// <param name="mwDic">Multiword dictionary used to reject windows whose original text is already a multiword.</param>
/// <returns>The set of merge objects collected over all windows.</returns>
protected internal static HashSet<MergeObj> GetMergeSetByMergeNo(int tarPos, List<TokenObj> nonSpaceTextList, int mergeNo, bool mergeWithHyphen, bool shortWordMerge, RootDictionary suggestDic, RootDictionary aADic, RootDictionary mwDic)
{
    HashSet<MergeObj> result = new HashSet<MergeObj>();

    // First window start, clamped to the beginning of the list.
    int firstWindowStart = tarPos - mergeNo;
    if (firstWindowStart <= 0)
    {
        firstWindowStart = 0;
    }
    int tokenCount = nonSpaceTextList.Count;

    // Target token data, constant across all windows.
    TokenObj targetToken = nonSpaceTextList[tarPos];
    int tarIndex = targetToken.GetIndex();
    string tarWord = targetToken.GetTokenStr();

    // Shift the window: every start position whose window can contain tarPos.
    for (int windowStart = firstWindowStart; windowStart <= tarPos; windowStart++)
    {
        string joinedNoSpace = "";
        string joinedHyphen = "";
        string joinedOriginal = ""; // the original text before merging
        int startIndex = nonSpaceTextList[windowStart].GetIndex();
        int endIndex = 0;
        int windowEnd = windowStart + mergeNo;
        int shortWordNo = 0;
        bool windowComplete = true;

        for (int offset = 0; offset <= mergeNo; offset++)
        {
            int pos = windowStart + offset;

            // Ran off the end of the token list: the window cannot be filled.
            if (pos >= tokenCount)
            {
                windowComplete = false;
                break;
            }

            TokenObj token = nonSpaceTextList[pos];
            string text = token.GetTokenStr();

            // Tokens of these kinds are never combined with their neighbours.
            bool unmergeable = DigitPuncTokenUtil.IsDigit(text)
                || DigitPuncTokenUtil.IsPunc(text)
                || DigitPuncTokenUtil.IsDigitPunc(text)
                || InternetTokenUtil.IsUrl(text)
                || InternetTokenUtil.IsEmail(text);
            if (unmergeable)
            {
                windowComplete = false;
                break;
            }

            // The first token of a window takes no separator in front of it.
            if (offset == 0)
            {
                joinedNoSpace = text;
                joinedHyphen = text;
                joinedOriginal = text;
            }
            else
            {
                joinedNoSpace += text;
                joinedHyphen += GlobalVars.HYPHEN_STR + text;
                joinedOriginal += GlobalVars.SPACE_STR + text;
            }
            shortWordNo = UpdateShortWordNo(text, SHORT_WORD_LENGTH, shortWordNo);
            endIndex = token.GetIndex();
        }

        // The fixed window must be completely filled.
        if (!windowComplete)
        {
            continue;
        }
        // The original text must not already be a multiword, e.g. "non clinical".
        if (mwDic.IsDicWord(joinedOriginal))
        {
            continue;
        }
        // Short-word rule (applies to real-word merges).
        if (!shortWordMerge && (shortWordNo > MAX_SHORT_WORD_NO))
        {
            continue;
        }

        AddMergeObj(tarWord, joinedOriginal, joinedNoSpace, mergeNo, startIndex, tarIndex, endIndex, windowStart, tarPos, windowEnd, result, suggestDic, aADic);
        if (mergeWithHyphen)
        {
            AddMergeObj(tarWord, joinedOriginal, joinedHyphen, mergeNo, startIndex, tarIndex, endIndex, windowStart, tarPos, windowEnd, result, suggestDic, aADic);
        }
    }

    return result;
}
/// <summary>
/// Manual console test of <c>BasicDictionary</c>: loads three dictionaries
/// from fixed relative paths, prints their sizes, prints lookup results for a
/// fixed word list, then adds one word to each dictionary and prints the
/// sizes and lookup results again.
/// </summary>
private static void Test()
{
    Console.WriteLine("===== Unit Test of BasicDictionary =====");

    // Baseline dictionary, filled from a colon-separated list of files.
    bool caseFlag = false;
    RootDictionary baselineDic = DictionaryFactory.GetDictionary(DictionaryFactory.DIC_BASIC, caseFlag);
    Console.WriteLine("------- Words from Baseline 11 dicitoaries -------");
    string baselinePaths = "../data/Dictionary/eng_medical.dic:../data/Dictionary/center.dic:../data/Dictionary/centre.dic:../data/Dictionary/color.dic:../data/Dictionary/colour.dic:../data/Dictionary/ise.dic:../data/Dictionary/ize.dic:../data/Dictionary/labeled.dic:../data/Dictionary/labelled.dic:../data/Dictionary/yse.dic:../data/Dictionary/yze.dic";
    string[] baselineFiles = baselinePaths.Split(":", true);
    foreach (string baselineFile in baselineFiles)
    {
        baselineDic.AddDictionary(baselineFile);
        Console.WriteLine("- Dic0 File: " + baselineFile);
        Console.WriteLine("- Dic0 size: " + baselineDic.GetSize());
    }

    // Lexicon element-word dictionary.
    Console.WriteLine("------- Lexicon element words -------");
    string elementWordFile = "../data/Dictionary/lexiconDic.data.ewLc";
    RootDictionary elementWordDic = new BasicDictionary(elementWordFile);
    Console.WriteLine("- Dic1 File: " + elementWordFile);
    Console.WriteLine("- Dic1 size: " + elementWordDic.GetSize());

    // Lexicon word dictionary, extended with a second file.
    Console.WriteLine("------- Lexicon words --------");
    string lexiconFile = "../data/Dictionary/lexiconDic.data";
    int fieldNo = 1;
    RootDictionary lexiconDic = new BasicDictionary(lexiconFile, fieldNo);
    Console.WriteLine("- Dic2 File: " + lexiconFile);
    Console.WriteLine("- Dic2 size: " + lexiconDic.GetSize());
    string numberFile = "../data/Dictionary/NRVAR.1.uSort.data";
    lexiconDic.AddDictionary(numberFile);
    Console.WriteLine("- Dic2 File: " + numberFile);
    Console.WriteLine("- Dic2 size: " + lexiconDic.GetSize());
    Console.WriteLine("----------------------");

    // Probe words: casing variants, spelling variants, possessives,
    // parenthetic plurals, slash/hyphen compounds, and a few odd ones.
    List<string> probes = new List<string>
    {
        "test", "Test", "TEST",
        "liter", "litre",
        "odor", "odour",
        "iodise", "iodize",
        "beveled", "bevelled",
        "hemolyse", "hemolyze",
        "ella",
        "centillionths",
        "Down's", "Downs'",
        "spot(s)", "fetus(es)", "box(es)", "waltz(es)", "mtach(es)",
        "splash(es)", "fly(ies)", "extremity(ies)",
        "CASE/TEST", "John's/Chris's", "50mg/100mg", "case/test",
        "neck-lesion", "day-night",
        "pneumonoultramicroscopicsilicovolcanoconiosis",
        "Walmart",
        "test321"
    };
    Console.WriteLine("input|baseline|L-element|Lexicon|L-RealWord");
    foreach (string probe in probes)
    {
        Console.WriteLine("- IsDicWord(" + probe + "): " + baselineDic.IsDicWord(probe) + ", " + elementWordDic.IsDicWord(probe) + ", " + lexiconDic.IsDicWord(probe) + ", " + lexiconDic.IsValidWord(probe));
    }

    // Add one word to every dictionary and show the effect.
    string addedWord = "test321";
    Console.WriteLine("------ Add [" + addedWord + "] to dictionary ------");
    baselineDic.AddWord(addedWord);
    elementWordDic.AddWord(addedWord);
    lexiconDic.AddWord(addedWord);
    Console.WriteLine("- Dic0 size: " + baselineDic.GetSize());
    Console.WriteLine("- Dic1 size: " + elementWordDic.GetSize());
    Console.WriteLine("- Dic2 size: " + lexiconDic.GetSize());
    Console.WriteLine("- IsInDic(" + addedWord + "): " + baselineDic.IsDicWord(addedWord) + ", " + elementWordDic.IsDicWord(addedWord) + ", " + lexiconDic.IsDicWord(addedWord));
    Console.WriteLine("===== End of Unit Test =====");
}