示例#1
0
        /// <summary>
        /// Normalizes a word for word2vec use: obtains its core term (requested with
        /// the lowercase flag set) and maps URL, e-mail, punctuation-only and
        /// digit/punctuation tokens to their pattern replacements.
        /// </summary>
        /// <param name="inWord">The raw input word.</param>
        /// <returns>
        /// PAT_URL, PAT_EMAIL or PAT_NUM when the core term matches the respective
        /// test, an empty string when it is punctuation only, otherwise the core
        /// term itself.
        /// </returns>
        public static string NormWordForWord2Vec(string inWord)
        {
            // Step 1: core term, stripping with CT_TYPE_SPACE_PUNC, lowercase flag on.
            string coreTerm = CoreTermUtil.GetCoreTerm(inWord, CoreTermUtil.CT_TYPE_SPACE_PUNC, true);

            // Step 2: pattern substitution. The checks run in a fixed order and the
            // first match wins: URL, e-mail, pure punctuation, digit/punctuation.
            if (InternetTokenUtil.IsUrl(coreTerm))
            {
                return PAT_URL;
            }

            if (InternetTokenUtil.IsEmail(coreTerm))
            {
                return PAT_EMAIL;
            }

            if (DigitPuncTokenUtil.IsPunc(coreTerm))
            {
                // Punctuation-only tokens are dropped entirely.
                return "";
            }

            if (DigitPuncTokenUtil.IsDigitPunc(coreTerm))
            {
                // Checked after the punctuation test so pure punctuation never
                // ends up tagged as a number.
                return PAT_NUM;
            }

            // TBD (test-set special case): map the "[CONTACT]" token to PAT_EMAIL
            // (it could also be a telephone number, i.e. PAT_NUM). Not implemented,
            // because it is better to sync the formats ([CONTACT], [NUM], ...) in
            // PreProcess. Also make sure the core-term step does not strip such
            // patterns.
            // Step 3 (TBD): take care of possessives such as xxx's.
            return coreTerm;
        }
        // TBD: remove pnDic, aaDic
        // Valid exceptions: tokens that are valid but are not in the dictionary,
        // such as digits, punctuation, digit/punctuation tokens (no letters),
        // URLs, e-mail addresses, measurements, and units.
        // Abbreviations, acronyms, proper nouns: did not change the F1 score after testing.
        /// <summary>
        /// Tells whether a token is an accepted non-word exception: a digit,
        /// punctuation, digit/punctuation, URL, e-mail, empty string or measurement.
        /// </summary>
        /// <param name="inWord">The token to test.</param>
        /// <param name="unitDic">Unit dictionary passed to the measurement check.</param>
        /// <returns>true if any of the exception tests matches; otherwise false.</returns>
        private static bool IsNonWordExceptions(string inWord, RootDictionary unitDic)
        {
            // The tests run in a fixed order and stop at the first match.
            if (DigitPuncTokenUtil.IsDigit(inWord))
            {
                return true;
            }

            if (DigitPuncTokenUtil.IsPunc(inWord))
            {
                return true;
            }

            if (DigitPuncTokenUtil.IsDigitPunc(inWord))
            {
                return true;
            }

            if (InternetTokenUtil.IsUrl(inWord))
            {
                return true;
            }

            if (InternetTokenUtil.IsEmail(inWord))
            {
                return true;
            }

            if (IsEmptyString(inWord))
            {
                return true;
            }

            // Last resort: measurement lookup against the unit dictionary.
            return MeasurementTokenUtil.IsMeasurements(inWord, unitDic);
        }
示例#3
0
        // protected method
        // get merge word by merge no, including shift window, fixed window size
        /// <summary>
        /// Collects merge candidates around a target token by sliding a fixed-size
        /// window of (mergeNo + 1) consecutive tokens over every start position from
        /// (tarPos - mergeNo), clamped to 0, up to tarPos.
        /// </summary>
        /// <param name="tarPos">Position of the target token in nonSpaceTextList.</param>
        /// <param name="nonSpaceTextList">The token list the windows are taken from.</param>
        /// <param name="mergeNo">Number of merges; each window holds mergeNo + 1 tokens.</param>
        /// <param name="mergeWithHyphen">When true, the hyphen-joined form is also submitted.</param>
        /// <param name="shortWordMerge">When true, the short-word count limit is not applied.</param>
        /// <param name="suggestDic">Dictionary forwarded to AddMergeObj.</param>
        /// <param name="aADic">Dictionary forwarded to AddMergeObj.</param>
        /// <param name="mwDic">Dictionary used to reject windows whose space-joined original text is a dictionary word.</param>
        /// <returns>The set that AddMergeObj was asked to populate (possibly empty).</returns>
        protected internal static HashSet <MergeObj> GetMergeSetByMergeNo(int tarPos, List <TokenObj> nonSpaceTextList, int mergeNo, bool mergeWithHyphen, bool shortWordMerge, RootDictionary suggestDic, RootDictionary aADic, RootDictionary mwDic)
        {
            HashSet <MergeObj> candidates = new HashSet <MergeObj>();

            // First window start: mergeNo tokens before the target, never below 0.
            int firstWindowStart = tarPos - mergeNo;
            if (firstWindowStart <= 0)
            {
                firstWindowStart = 0;
            }

            int tokenCount = nonSpaceTextList.Count;

            // Target token data handed to every AddMergeObj call.
            int    targetIndex = nonSpaceTextList[tarPos].GetIndex();
            string targetWord  = nonSpaceTextList[tarPos].GetTokenStr();

            // Shift the window one token at a time until it starts at the target.
            for (int windowStart = firstWindowStart; windowStart <= tarPos; windowStart++)
            {
                int windowStartIndex = nonSpaceTextList[windowStart].GetIndex();
                int windowEndPos     = windowStart + mergeNo;
                int windowEndIndex   = 0;

                string joinedNoSpace = "";            // tokens concatenated directly
                string joinedHyphen  = "";            // tokens joined with a hyphen
                string joinedSpace   = "";            // the original text before the merge
                int    shortWordCount = 0;
                bool   isFirstToken   = true;
                bool   windowComplete = true;

                for (int offset = 0; offset <= mergeNo; offset++)
                {
                    int curPos = windowStart + offset;

                    // The window runs past the end of the list: it cannot be completed.
                    if (curPos >= tokenCount)
                    {
                        windowComplete = false;
                        break;
                    }

                    TokenObj curToken = nonSpaceTextList[curPos];
                    string   tokenStr = curToken.GetTokenStr();

                    // Digits, punctuation, digit/punctuation, URLs and e-mails are
                    // never merged; one such token invalidates the whole window.
                    // (A measurement check is intentionally not part of this test.)
                    if (DigitPuncTokenUtil.IsDigit(tokenStr)
                        || DigitPuncTokenUtil.IsPunc(tokenStr)
                        || DigitPuncTokenUtil.IsDigitPunc(tokenStr)
                        || InternetTokenUtil.IsUrl(tokenStr)
                        || InternetTokenUtil.IsEmail(tokenStr))
                    {
                        windowComplete = false;
                        break;
                    }

                    if (isFirstToken)
                    {
                        // No separator in front of the first token.
                        joinedNoSpace = tokenStr;
                        joinedHyphen  = tokenStr;
                        joinedSpace   = tokenStr;
                        isFirstToken  = false;
                    }
                    else
                    {
                        joinedNoSpace += tokenStr;
                        joinedHyphen  += GlobalVars.HYPHEN_STR + tokenStr;
                        joinedSpace   += GlobalVars.SPACE_STR + tokenStr;
                    }

                    shortWordCount = UpdateShortWordNo(tokenStr, SHORT_WORD_LENGTH, shortWordCount);
                    windowEndIndex = curToken.GetIndex();
                }

                // Only a fully filled window qualifies.
                if (!windowComplete)
                {
                    continue;
                }

                // The original (pre-merge) text must not be a multiword itself,
                // e.g. "non clinical".
                if (mwDic.IsDicWord(joinedSpace))
                {
                    continue;
                }

                // Unless short-word merging is enabled, skip windows with too many
                // short words.
                if (!shortWordMerge && (shortWordCount > MAX_SHORT_WORD_NO))
                {
                    continue;
                }

                AddMergeObj(targetWord, joinedSpace, joinedNoSpace, mergeNo, windowStartIndex, targetIndex, windowEndIndex, windowStart, tarPos, windowEndPos, candidates, suggestDic, aADic);

                // Optionally submit the hyphenated variant as well.
                if (mergeWithHyphen)
                {
                    AddMergeObj(targetWord, joinedSpace, joinedHyphen, mergeNo, windowStartIndex, targetIndex, windowEndIndex, windowStart, tarPos, windowEndPos, candidates, suggestDic, aADic);
                }
            }

            return candidates;
        }
        // broader matcher
        /// <summary>
        /// Broad matcher: a word qualifies when ContainsEndingPunc accepts it, its
        /// core term is neither an e-mail nor a URL, and the word itself is not a
        /// digit/punctuation token.
        /// </summary>
        /// <param name="inWord">The word to test.</param>
        /// <returns>true if the word passes all four checks; otherwise false.</returns>
        private static bool IsQualified(string inWord)
        {
            // The URL and e-mail tests run on the core term
            // (CT_TYPE_SPACE_PUNC_DIGIT), not on the raw word.
            CoreTermObj coreTermObj = new CoreTermObj(inWord, CoreTermUtil.CT_TYPE_SPACE_PUNC_DIGIT);
            string      coreTerm    = coreTermObj.GetCoreTerm();

            if (!ContainsEndingPunc(inWord))
            {
                return false;
            }

            if (InternetTokenUtil.IsEmail(coreTerm))
            {
                return false;
            }

            if (InternetTokenUtil.IsUrl(coreTerm))
            {
                return false;
            }

            // Digit/punctuation tokens are skipped.
            return !DigitPuncTokenUtil.IsDigitPunc(inWord);
        }