Beispiel #1
0
        // protected method
        // get merge word by merge no, including shift window, fixed window size
        /// <summary>
        /// Collects merge candidates around a target token using a fixed-size
        /// sliding window of <c>mergeNo + 1</c> consecutive tokens.
        /// The window start is shifted from <c>tarPos - mergeNo</c> (clamped to 0)
        /// up to <c>tarPos</c>. For each window the tokens are
        /// concatenated directly, joined with <c>GlobalVars.HYPHEN_STR</c>, and
        /// joined with <c>GlobalVars.SPACE_STR</c> (the original, un-merged text).
        /// A window is dropped when it runs past the end of the list, when any
        /// token is flagged by the digit/punctuation/URL/e-mail checks, when the
        /// original text is found in <paramref name="mwDic"/>, or when short-word
        /// merging is disabled and the short-word count exceeds
        /// <c>MAX_SHORT_WORD_NO</c>. Surviving windows are handed to
        /// <c>AddMergeObj</c>, which populates the returned set.
        /// </summary>
        /// <param name="tarPos">Position of the target token in <paramref name="nonSpaceTextList"/>.</param>
        /// <param name="nonSpaceTextList">Token list the windows are taken from.</param>
        /// <param name="mergeNo">Number of merges; each window spans <c>mergeNo + 1</c> tokens.</param>
        /// <param name="mergeWithHyphen">When true, the hyphen-joined form is also passed to <c>AddMergeObj</c>.</param>
        /// <param name="shortWordMerge">When true, the short-word count limit is not applied.</param>
        /// <param name="suggestDic">Dictionary forwarded to <c>AddMergeObj</c>.</param>
        /// <param name="aADic">Dictionary forwarded to <c>AddMergeObj</c>.</param>
        /// <param name="mwDic">Dictionary used to reject windows whose original text is already a known entry (e.g. "non clinical").</param>
        /// <returns>The set filled by the <c>AddMergeObj</c> calls; empty when no window qualifies.</returns>
        protected internal static HashSet <MergeObj> GetMergeSetByMergeNo(int tarPos, List <TokenObj> nonSpaceTextList, int mergeNo, bool mergeWithHyphen, bool shortWordMerge, RootDictionary suggestDic, RootDictionary aADic, RootDictionary mwDic)
        {
            HashSet <MergeObj> candidates = new HashSet <MergeObj>();

            // left-most window start, clamped to the beginning of the list
            int firstWindowPos = tarPos - mergeNo;
            if (firstWindowPos < 0)
            {
                firstWindowPos = 0;
            }
            int tokenCount = nonSpaceTextList.Count;

            // target token data is the same for every window
            TokenObj targetToken = nonSpaceTextList[tarPos];
            int      targetIndex = targetToken.GetIndex();
            string   targetWord  = targetToken.GetTokenStr();

            // slide the window so that it always contains the target token
            for (int windowStart = firstWindowPos; windowStart <= tarPos; windowStart++)
            {
                int    windowStartIndex = nonSpaceTextList[windowStart].GetIndex();
                int    windowEnd        = windowStart + mergeNo;
                int    windowEndIndex   = 0;
                int    shortWordCount   = 0;
                string joinedWord       = "";            // tokens concatenated directly
                string hyphenatedWord   = "";            // tokens joined by hyphen
                string originalWord     = "";            // tokens joined by space (before merge)
                bool   windowComplete   = true;

                for (int offset = 0; offset <= mergeNo; offset++)
                {
                    int pos = windowStart + offset;
                    // window runs past the end of the token list
                    if (pos >= tokenCount)
                    {
                        windowComplete = false;
                        break;
                    }

                    TokenObj token = nonSpaceTextList[pos];
                    string   text  = token.GetTokenStr();

                    // digits, punctuation, URLs and e-mails are never merged
                    bool excluded = DigitPuncTokenUtil.IsDigit(text)
                                    || DigitPuncTokenUtil.IsPunc(text)
                                    || DigitPuncTokenUtil.IsDigitPunc(text)
                                    || InternetTokenUtil.IsUrl(text)
                                    || InternetTokenUtil.IsEmail(text);
                    if (excluded)
                    {
                        windowComplete = false;
                        break;
                    }

                    if (offset == 0)
                    {
                        // no separator in front of the first token
                        joinedWord     = text;
                        hyphenatedWord = text;
                        originalWord   = text;
                    }
                    else
                    {
                        joinedWord     += text;
                        hyphenatedWord += GlobalVars.HYPHEN_STR + text;
                        originalWord   += GlobalVars.SPACE_STR + text;
                    }
                    shortWordCount = UpdateShortWordNo(text, SHORT_WORD_LENGTH, shortWordCount);
                    windowEndIndex = token.GetIndex();
                }

                // the full fixed-size window must have been consumed
                if (!windowComplete)
                {
                    continue;
                }
                // the original (un-merged) text can't already be a multiword
                if (mwDic.IsDicWord(originalWord))
                {
                    continue;
                }
                // short-word limit applies only when short-word merging is off
                if (!shortWordMerge && (shortWordCount > MAX_SHORT_WORD_NO))
                {
                    continue;
                }

                AddMergeObj(targetWord, originalWord, joinedWord, mergeNo, windowStartIndex, targetIndex, windowEndIndex, windowStart, tarPos, windowEnd, candidates, suggestDic, aADic);
                if (mergeWithHyphen)
                {
                    // also offer the hyphen-joined form as a candidate
                    AddMergeObj(targetWord, originalWord, hyphenatedWord, mergeNo, windowStartIndex, targetIndex, windowEndIndex, windowStart, tarPos, windowEnd, candidates, suggestDic, aADic);
                }
            }
            return candidates;
        }