// Converts a word into the token form used for word2Vec.
//
// The word is first reduced to its core term (CT_TYPE_SPACE_PUNC, with the
// lowercase flag set). The core term is then classified, in this order:
//   - URL                      -> PAT_URL
//   - e-mail address           -> PAT_EMAIL
//   - punctuation only         -> "" (the token is dropped)
//   - digits and/or punctuation -> PAT_NUM
//   - anything else            -> the core term itself
//
// TBD: the test-set special case "[CONTACT]" is intentionally not mapped to
//      PAT_EMAIL here (it could also be a telephone number, i.e. PAT_NUM);
//      it is better to sync the formats ([CONTACT], [NUM], ...) in PreProcess.
// TBD: make sure the core-term step does not strip the above patterns.
// TBD: take care of xxx's.
public static string NormWordForWord2Vec(string inWord)
{
    // 1. core term
    string coreTerm = CoreTermUtil.GetCoreTerm(inWord, CoreTermUtil.CT_TYPE_SPACE_PUNC, true);

    // 2. replace recognized patterns: [URL], [EMAIL], punctuation, [NUM]
    if (InternetTokenUtil.IsUrl(coreTerm))
    {
        return PAT_URL;
    }
    if (InternetTokenUtil.IsEmail(coreTerm))
    {
        return PAT_EMAIL;
    }
    if (DigitPuncTokenUtil.IsPunc(coreTerm))
    {
        // pure punctuation is removed; this must be tested before the
        // digit/punctuation check below
        return "";
    }
    if (DigitPuncTokenUtil.IsDigitPunc(coreTerm))
    {
        return PAT_NUM;
    }

    // no pattern matched: keep the core term
    return coreTerm;
}
// Tells whether a token is a valid exception: something acceptable in text
// even though it is not in the dictionary. Returns true when the token is
// any of: digit, punctuation, digit/punctuation (no letter), URL, e-mail,
// empty string, or a measurement (checked against unitDic).
//
// Abbreviations, acronyms and proper nouns are not checked here: adding
// them did not change the F1 score in testing.
// TBD: remove pnDic, aaDic
private static bool IsNonWordExceptions(string inWord, RootDictionary unitDic)
{
    // The checks run in the same order as a short-circuit OR chain.
    if (DigitPuncTokenUtil.IsDigit(inWord))
    {
        return true;
    }
    if (DigitPuncTokenUtil.IsPunc(inWord))
    {
        return true;
    }
    if (DigitPuncTokenUtil.IsDigitPunc(inWord))
    {
        return true;
    }
    if (InternetTokenUtil.IsUrl(inWord))
    {
        return true;
    }
    if (InternetTokenUtil.IsEmail(inWord))
    {
        return true;
    }
    if (IsEmptyString(inWord))
    {
        return true;
    }
    return MeasurementTokenUtil.IsMeasurements(inWord, unitDic);
}
// Builds the set of merge candidates for the token at tarPos, using a fixed
// window of (mergeNo + 1) consecutive non-space tokens.
//
// The window is shifted so that it starts at every position from
// max(tarPos - mergeNo, 0) up to tarPos. For each window three strings are
// assembled from its tokens:
//   - tokens concatenated directly (merge by removing the spaces),
//   - tokens joined by GlobalVars.HYPHEN_STR (merge by hyphen),
//   - tokens joined by GlobalVars.SPACE_STR (the original, unmerged text).
//
// A window is discarded when:
//   - it runs past the end of nonSpaceTextList,
//   - any of its tokens is a digit, punctuation, digit/punctuation, URL or
//     e-mail token,
//   - the original (space joined) text is a word in mwDic, i.e. already a
//     multiword such as "non clinical",
//   - shortWordMerge is false and the short-word count accumulated by
//     UpdateShortWordNo exceeds MAX_SHORT_WORD_NO.
//
// Surviving windows are handed to AddMergeObj (which receives the result
// set, suggestDic and aADic); the hyphenated form is passed as well when
// mergeWithHyphen is true.
protected internal static HashSet<MergeObj> GetMergeSetByMergeNo(int tarPos,
    List<TokenObj> nonSpaceTextList, int mergeNo, bool mergeWithHyphen,
    bool shortWordMerge, RootDictionary suggestDic, RootDictionary aADic,
    RootDictionary mwDic)
{
    HashSet<MergeObj> candidates = new HashSet<MergeObj>();

    // first window start position, clamped to the beginning of the list
    int firstWindowStart = tarPos - mergeNo;
    if (firstWindowStart <= 0)
    {
        firstWindowStart = 0;
    }
    int tokenCount = nonSpaceTextList.Count;

    // properties of the target token, shared by every candidate
    int tarIndex = nonSpaceTextList[tarPos].GetIndex();
    string tarWord = nonSpaceTextList[tarPos].GetTokenStr();

    // shift the window one token at a time until it starts at the target
    for (int windowStart = firstWindowStart; windowStart <= tarPos; windowStart++)
    {
        int startIndex = nonSpaceTextList[windowStart].GetIndex();
        int windowEnd = windowStart + mergeNo;

        string joinedWord = "";     // tokens merged by removing the spaces
        string hyphenWord = "";     // tokens merged with hyphens
        string originalWord = "";   // tokens as they appear, space separated
        int shortWordCount = 0;
        int endIndex = 0;
        bool windowComplete = true;

        // assemble the merged forms over the fixed-size window
        for (int offset = 0; offset <= mergeNo; offset++)
        {
            int pos = windowStart + offset;
            if (pos >= tokenCount)
            {
                // the window runs past the end of the text list
                windowComplete = false;
                break;
            }

            TokenObj token = nonSpaceTextList[pos];
            string text = token.GetTokenStr();

            // never merge across digit, punctuation, URL or e-mail tokens
            bool unmergeable = DigitPuncTokenUtil.IsDigit(text)
                || DigitPuncTokenUtil.IsPunc(text)
                || DigitPuncTokenUtil.IsDigitPunc(text)
                || InternetTokenUtil.IsUrl(text)
                || InternetTokenUtil.IsEmail(text);
            if (unmergeable)
            {
                windowComplete = false;
                break;
            }

            if (offset == 0)
            {
                // the first token gets no leading "-" or " "
                joinedWord = text;
                hyphenWord = text;
                originalWord = text;
            }
            else
            {
                joinedWord += text;
                hyphenWord += GlobalVars.HYPHEN_STR + text;
                originalWord += GlobalVars.SPACE_STR + text;
            }
            shortWordCount = UpdateShortWordNo(text, SHORT_WORD_LENGTH, shortWordCount);
            endIndex = token.GetIndex();
        }

        // the fixed window must have been filled completely
        if (!windowComplete)
        {
            continue;
        }
        // the original text must not already be a multiword
        if (mwDic.IsDicWord(originalWord))
        {
            continue;
        }
        // too many short words, and short-word merging is not allowed
        if (!shortWordMerge && (shortWordCount > MAX_SHORT_WORD_NO))
        {
            continue;
        }

        AddMergeObj(tarWord, originalWord, joinedWord, mergeNo, startIndex,
            tarIndex, endIndex, windowStart, tarPos, windowEnd, candidates,
            suggestDic, aADic);
        if (mergeWithHyphen)
        {
            // also offer the hyphenated merge as a candidate
            AddMergeObj(tarWord, originalWord, hyphenWord, mergeNo, startIndex,
                tarIndex, endIndex, windowStart, tarPos, windowEnd, candidates,
                suggestDic, aADic);
        }
    }
    return candidates;
}
// Broader matcher: decides whether a word qualifies for processing.
// A word qualifies when it contains ending punctuation (ContainsEndingPunc)
// and is none of the following:
//   - an e-mail address or URL, tested on the word's core term
//     (CT_TYPE_SPACE_PUNC_DIGIT),
//   - a digit/punctuation token, tested on the word itself.
private static bool IsQualified(string inWord)
{
    // the core term is used for the URL and e-mail tests
    CoreTermObj coreTermObj = new CoreTermObj(inWord, CoreTermUtil.CT_TYPE_SPACE_PUNC_DIGIT);
    string coreTerm = coreTermObj.GetCoreTerm();

    return ContainsEndingPunc(inWord)
        && !InternetTokenUtil.IsEmail(coreTerm)
        && !InternetTokenUtil.IsUrl(coreTerm)
        && !DigitPuncTokenUtil.IsDigitPunc(inWord);   // skip if digitPunc
}