private static bool IsProperNoun(string inWord, RootDictionary pnDic)
        {
            // A word is treated as a proper noun exactly when the supplied
            // proper-noun dictionary reports it as an entry. The original note
            // describes this lookup as case sensitive; the actual case handling
            // is whatever pnDic.IsDicWord applies.
            return pnDic.IsDicWord(inWord);
        }
        // Reports whether inWord is an entry of the abbreviation/acronym
        // dictionary passed in as aaDic. The original note describes the
        // lookup as case sensitive; case handling is delegated to the
        // dictionary itself.
        private static bool IsAbbAcr(string inWord, RootDictionary aaDic)
        {
            return aaDic.IsDicWord(inWord);
        }
        // Reports whether inWord is an entry of the spelling-variant (spVar)
        // dictionary passed in as svDic. The original note describes the
        // lookup as case sensitive; case handling is delegated to the
        // dictionary itself.
        private static bool IsSpVar(string inWord, RootDictionary svDic)
        {
            return svDic.IsDicWord(inWord);
        }
        // for the split, we don't want Aa as a valid word
        // because it will cause too much noise (less precision)
        // Returns true when inWord is an entry of the split-word dictionary
        // obtained from cSpellApi.GetSplitWordDic().
        //
        // No additional abbreviation/acronym (Aa) filtering happens here.
        // According to the original notes the split-word dictionary is already
        // built without Aa entries, and a former length-based Aa filter was
        // retired because it hurt both recall and precision
        // (e.g. "ofcourse" => "incourse").
        private static bool IsValidSplitWord(string inWord, CSpellApi cSpellApi)
        {
            RootDictionary splitWordDic = cSpellApi.GetSplitWordDic();

            return splitWordDic.IsDicWord(inWord);
        }
        // Looks inWord up in the abbreviation/acronym dictionary aaDic and
        // returns the lookup result unchanged.
        // NOTE(review): the original remark says the match should be case
        // sensitive but is implemented as an aggressive match; the actual
        // case handling lives in aaDic.IsDicWord - confirm there.
        private static bool IsAbbAcr(string inWord, RootDictionary aaDic)
        {
            bool inAaDic = aaDic.IsDicWord(inWord);
            return inAaDic;
        }
        // check if it is a properNoun
        // Looks inWord up in the proper-noun dictionary pnDic and returns the
        // lookup result unchanged.
        // NOTE(review): the original remark says the match should be case
        // sensitive but is implemented as an aggressive, case-ignoring match;
        // the actual case handling lives in pnDic.IsDicWord - confirm there.
        private static bool IsProperNoun(string inWord, RootDictionary pnDic)
        {
            bool inPnDic = pnDic.IsDicWord(inWord);
            return inPnDic;
        }
        // Accepts cand as a 1-to-1 candidate when it is an entry of the
        // suggestion dictionary returned by cSpellApi.GetSuggestDic().
        // inWord is part of the signature but is not consulted by this check.
        private static bool IsValid1To1Cand(string inWord, string cand, CSpellApi cSpellApi)
        {
            return cSpellApi.GetSuggestDic().IsDicWord(cand);
        }
        // Console check of the proper-noun dictionary: prints, for two sample
        // words that differ only in letter case, whether the dictionary from
        // cSpellApi.GetPnDic() contains them.
        private static void TestPnDic(CSpellApi cSpellApi)
        {
            RootDictionary pnDic = cSpellApi.GetPnDic();

            // same word in lower case and capitalized
            string[] samples = { "hu", "Hu" };

            for (int i = 0; i < samples.Length; i++)
            {
                string sample = samples[i];
                Console.WriteLine("-- pnDic(" + sample + "): " + pnDic.IsDicWord(sample));
            }
        }
Esempio n. 9
0
        // Adds a MergeObj describing mergeWord to mergeSet when the merged
        // word qualifies as a merge candidate; otherwise leaves mergeSet
        // untouched.
        //
        // The word qualifies when its core string (mergeWord with trailing
        // punctuation/space stripped by TermUtil.StripEndPuncSpace) is in
        // suggestDic and is NOT in aADic. Abbreviations/acronyms are excluded
        // on the assumption that they are short enough not to need merging.
        //
        // All remaining parameters are passed through unchanged to the
        // MergeObj constructor.
        private static void AddMergeObj(string tarWord, string orgMergeWord, string mergeWord, int mergeNo, int startIndex, int tarIndex, int endIndex, int startPos, int tarPos, int endPos, HashSet <MergeObj> mergeSet, RootDictionary suggestDic, RootDictionary aADic)
        {
            // only the end punctuation is taken care of for the core string
            string coreStr = TermUtil.StripEndPuncSpace(mergeWord);

            if (suggestDic.IsDicWord(coreStr) && !aADic.IsDicWord(coreStr))
            {
                mergeSet.Add(new MergeObj(tarWord, orgMergeWord, mergeWord, coreStr, mergeNo, startIndex, tarIndex, endIndex, startPos, tarPos, endPos));
            }
        }
Esempio n. 10
0
        // Console check of the split-word dictionary: prints, for a handful
        // of short sample strings, whether the dictionary from
        // cSpellApi.GetSplitWordDic() contains them.
        private static void TestSplitDic(CSpellApi cSpellApi)
        {
            RootDictionary splitWordDic = cSpellApi.GetSplitWordDic();

            string[] samples = { "do", "i", "ng", "ilove" };

            for (int i = 0; i < samples.Length; i++)
            {
                string sample = samples[i];
                Console.WriteLine("-- SplitDic(" + sample + "): " + splitWordDic.IsDicWord(sample));
            }
        }
        // real-word candidate has more restriction than non-word
        // TBD, need to organize the code ...
        // the check should be done in the ranking
        // Core process for real-word candidates
        // Decides whether cand is an acceptable 1-to-1 replacement candidate
        // for the real word inWord.
        //
        // Step 1 - eligibility. cand must satisfy all of:
        //   - it is in the suggestion dictionary,
        //   - it has a word vector in the Word2Vec model,
        //   - its length is at least GetCanRw1To1CandMinLength(),
        //   - its word count is at least GetCanRw1To1CandMinWc(),
        //   - it is not an inflectional variant of inWord.
        // Step 2 - similarity. An eligible cand is accepted when either
        //   - both phonetic distances (Metaphone2, RefinedSoundex) are 0, or
        //   - lead + end + length + both phonetic distances < 3, and
        //     edit + both phonetic distances < 4, and at least one phonetic
        //     distance is 0.
        // The thresholds 3 and 4 come from empirical tests and are not
        // configurable.
        //
        // inWord is lower-cased before comparison; cand is used as given.
        private static bool IsValid1To1Cand(string inWord, string cand, CSpellApi cSpellApi)
        {
            RootDictionary suggestDic = cSpellApi.GetSuggestDic();
            Word2Vec       word2VecOm = cSpellApi.GetWord2VecOm();
            WordWcMap      wordWcMap  = cSpellApi.GetWordWcMap();
            int rw1To1CandMinWc     = cSpellApi.GetCanRw1To1CandMinWc();
            int rw1To1CandMinLength = cSpellApi.GetCanRw1To1CandMinLength();
            // Invariant casing: the result feeds dictionary/phonetic lookups and
            // must not depend on the current thread culture (e.g. Turkish I).
            string inWordLc = inWord.ToLowerInvariant();
            int    candLen  = cand.Length;

            // 1. check suggDic, word vector, length, word count and inflVars
            bool eligible = suggestDic.IsDicWord(cand)
                            && word2VecOm.HasWordVec(cand)
                            && (candLen >= rw1To1CandMinLength)
                            && (WordCountScore.GetWc(cand, wordWcMap) >= rw1To1CandMinWc)
                            && !InflVarsUtil.IsInflectionVar(inWordLc, cand);
            if (!eligible)
            {
                return false;
            }

            // 2. more restriction for real-word candidates
            int pmDist     = Metaphone2.GetDistance(inWordLc, cand);
            int prDist     = RefinedSoundex.GetDistance(inWordLc, cand);
            int leadDist   = GetLeadCharDist(inWordLc, cand);
            int endDist    = GetEndCharDist(inWordLc, cand);
            int lengthDist = GetLengthDist(inWordLc, cand);
            int editDist   = EditDistance.GetDistanceForRealWord(inWordLc, cand);
            int totalDist1 = leadDist + endDist + lengthDist + pmDist + prDist;
            int totalDist2 = editDist + pmDist + prDist;

            // they sound the same
            if ((pmDist == 0) && (prDist == 0))
            {
                return true;
            }

            // they sound similar and the orthography is also similar
            bool oneSoundMatches = (pmDist == 0) || (prDist == 0);
            return (totalDist1 < 3) && (totalDist2 < 4) && oneSoundMatches;
        }
        // public method
        // filter out with dictionary
        // Use the no-Abb/Acr dictionary to exclude terms that are abb/acr
        // The inWord must be a coreTerm.
        // Builds the set of split candidates for inWord.
        //
        // All splits produced by CandidatesUtilSplit.GetSplitSet(inWord,
        // maxSplitNo) are examined in two passes:
        //   pass 1 keeps every split that is an entry of the multiword
        //          dictionary (cSpellApi.GetMwDic());
        //   pass 2 runs only when pass 1 found nothing and keeps every split
        //          accepted by IsValidSplitCand.
        // Returns an empty set when neither pass accepts a split.
        //
        // TBD (from the original notes): pass 1 can find "per se" for "perse",
        // but "perse" is itself a valid word in eng_medical.dic, so cSpell
        // cannot correct it until the dictionary is refined.
        public static HashSet <string> GetCandidates(string inWord, CSpellApi cSpellApi, int maxSplitNo)
        {
            RootDictionary multiwordDic = cSpellApi.GetMwDic();
            HashSet <string> allSplits = CandidatesUtilSplit.GetSplitSet(inWord, maxSplitNo);
            HashSet <string> accepted = new HashSet <string>();

            // pass 1: the whole split string is a known multiword
            foreach (string phrase in allSplits)
            {
                if (multiwordDic.IsDicWord(phrase))
                {
                    accepted.Add(phrase);
                }
            }
            if (accepted.Count != 0)
            {
                return accepted;
            }

            // pass 2: no multiword found, validate each split on its own
            foreach (string phrase in allSplits)
            {
                if (IsValidSplitCand(phrase, cSpellApi))
                {
                    accepted.Add(phrase);
                }
            }
            return accepted;
        }
        // for the split, we don't want Aa as a valid word
        // because it will cause too much noise (less precision)
        // TBD ... re-organize
        // Decides whether inWord may serve as one piece of a real-word split
        // candidate. Every one of the following must hold, checked in this
        // order:
        //   1. inWord is in the split-word dictionary,
        //   2. inWord has a word vector in the Word2Vec model,
        //   3. its word count is at least GetCanRwSplitCandMinWc(),
        //   4. it is not in the unit dictionary (e.g. mg),
        //   5. it is not in the proper-noun dictionary
        //      (e.g. human -> Hu man, children -> child ren).
        private static bool IsValidSplitWord(string inWord, CSpellApi cSpellApi)
        {
            RootDictionary splitDic    = cSpellApi.GetSplitWordDic();
            WordWcMap      wcMap       = cSpellApi.GetWordWcMap();
            Word2Vec       vectors     = cSpellApi.GetWord2VecOm();
            RootDictionary units       = cSpellApi.GetUnitDic();
            RootDictionary properNouns = cSpellApi.GetPnDic();
            int            minWc       = cSpellApi.GetCanRwSplitCandMinWc();

            if (!splitDic.IsDicWord(inWord))
            {
                return false;
            }
            if (!vectors.HasWordVec(inWord))
            {
                return false;
            }
            if (WordCountScore.GetWc(inWord, wcMap) < minWc)
            {
                return false;
            }
            if (units.IsDicWord(inWord))
            {
                return false;
            }
            return !properNouns.IsDicWord(inWord);
        }
Esempio n. 14
0
        // protected method
        // get merge word by merge no, including shift window, fixed window size
        // Collects merge candidates around the token at tarPos using a fixed
        // window of (mergeNo + 1) consecutive tokens.
        //
        // The window is shifted so that it starts at every position from
        // max(tarPos - mergeNo, 0) up to tarPos. For each window the tokens are
        // joined three ways: directly (merge by removing spaces), with
        // GlobalVars.HYPHEN_STR, and with GlobalVars.SPACE_STR (the original
        // text before the merge).
        //
        // A window is discarded when
        //   - it runs past the end of nonSpaceTextList,
        //   - any token in it is a digit, punctuation, digit-punctuation, URL
        //     or e-mail token,
        //   - the original (space-joined) text is already in mwDic, or
        //   - shortWordMerge is false and the short-word counter maintained by
        //     UpdateShortWordNo exceeds MAX_SHORT_WORD_NO.
        // For a surviving window AddMergeObj is called with the directly
        // joined word and, when mergeWithHyphen is true, also with the
        // hyphenated word.
        protected internal static HashSet <MergeObj> GetMergeSetByMergeNo(int tarPos, List <TokenObj> nonSpaceTextList, int mergeNo, bool mergeWithHyphen, bool shortWordMerge, RootDictionary suggestDic, RootDictionary aADic, RootDictionary mwDic)
        {
            HashSet <MergeObj> mergeSet = new HashSet <MergeObj>();

            // first window start, clamped to the beginning of the list
            int firstWinStart = tarPos - mergeNo;
            if (firstWinStart < 0)
            {
                firstWinStart = 0;
            }
            int tokenCount = nonSpaceTextList.Count;

            TokenObj tarToken = nonSpaceTextList[tarPos];
            int      tarIndex = tarToken.GetIndex();
            string   tarWord  = tarToken.GetTokenStr();

            // shift the window one position at a time
            for (int winStart = firstWinStart; winStart <= tarPos; winStart++)
            {
                int winEnd     = winStart + mergeNo;
                int startIndex = nonSpaceTextList[winStart].GetIndex();
                int endIndex   = 0;

                string joined      = "";    // tokens glued together
                string hyphenated  = "";    // tokens joined by hyphen
                string original    = "";    // tokens joined by space (before merge)
                int    shortWordNo = 0;
                bool   windowComplete = true;

                for (int curPos = winStart; curPos <= winEnd; curPos++)
                {
                    // window runs past the end of the text list
                    if (curPos >= tokenCount)
                    {
                        windowComplete = false;
                        break;
                    }

                    TokenObj curToken = nonSpaceTextList[curPos];
                    string   tokenStr = curToken.GetTokenStr();

                    // never merge across digits, punctuation, URLs or e-mails
                    if (DigitPuncTokenUtil.IsDigit(tokenStr) || DigitPuncTokenUtil.IsPunc(tokenStr) || DigitPuncTokenUtil.IsDigitPunc(tokenStr) || InternetTokenUtil.IsUrl(tokenStr) || InternetTokenUtil.IsEmail(tokenStr))
                    {
                        windowComplete = false;
                        break;
                    }

                    if (curPos == winStart)
                    {
                        // no "-" or " " in front of the first token
                        joined     = tokenStr;
                        hyphenated = tokenStr;
                        original   = tokenStr;
                    }
                    else
                    {
                        joined     += tokenStr;
                        hyphenated += GlobalVars.HYPHEN_STR + tokenStr;
                        original   += GlobalVars.SPACE_STR + tokenStr;
                    }
                    shortWordNo = UpdateShortWordNo(tokenStr, SHORT_WORD_LENGTH, shortWordNo);
                    endIndex    = curToken.GetIndex();
                }

                // the fixed window must be completely filled
                if (!windowComplete)
                {
                    continue;
                }
                // the original text must not already be a multiword,
                // such as "non clinical"
                if (mwDic.IsDicWord(original))
                {
                    continue;
                }
                // short word restriction, lifted when shortWordMerge is set
                if (!shortWordMerge && (shortWordNo > MAX_SHORT_WORD_NO))
                {
                    continue;
                }

                AddMergeObj(tarWord, original, joined, mergeNo, startIndex, tarIndex, endIndex, winStart, tarPos, winEnd, mergeSet, suggestDic, aADic);
                if (mergeWithHyphen)
                {
                    AddMergeObj(tarWord, original, hyphenated, mergeNo, startIndex, tarIndex, endIndex, winStart, tarPos, winEnd, mergeSet, suggestDic, aADic);
                }
            }
            return mergeSet;
        }
Esempio n. 15
0
        // Console smoke test for BasicDictionary.
        //
        // Builds three dictionaries - dic0 from the baseline .dic files, dic1
        // from the lexicon element-word file, dic2 from the lexicon file plus
        // the number-variant file - printing file names and sizes on the way.
        // It then prints the lookup results of a fixed word list against all
        // three, adds "test321" to each dictionary and prints sizes and lookup
        // results again. Nothing is asserted; the output is meant to be read.
        private static void Test()
        {
            Console.WriteLine("===== Unit Test of BasicDictionary =====");
            RootDictionary dic0 = DictionaryFactory.GetDictionary(DictionaryFactory.DIC_BASIC, false);

            // dic0: baseline dictionary, loaded file by file
            Console.WriteLine("------- Words from Baseline 11 dicitoaries -------");
            string dicStrs = "../data/Dictionary/eng_medical.dic:../data/Dictionary/center.dic:../data/Dictionary/centre.dic:../data/Dictionary/color.dic:../data/Dictionary/colour.dic:../data/Dictionary/ise.dic:../data/Dictionary/ize.dic:../data/Dictionary/labeled.dic:../data/Dictionary/labelled.dic:../data/Dictionary/yse.dic:../data/Dictionary/yze.dic";

            // NOTE(review): Split(string, bool) is not a BCL overload;
            // presumably a project-provided extension helper - confirm.
            string[] dicFiles = dicStrs.Split(":", true);

            foreach (string dicFile in dicFiles)
            {
                dic0.AddDictionary(dicFile);
                Console.WriteLine("- Dic0 File: " + dicFile);
                Console.WriteLine("- Dic0 size: " + dic0.GetSize());
            }

            // dic1: lexicon element words
            Console.WriteLine("------- Lexicon element words -------");
            string         lexDicEwStr = "../data/Dictionary/lexiconDic.data.ewLc";
            RootDictionary dic1        = new BasicDictionary(lexDicEwStr);
            Console.WriteLine("- Dic1 File: " + lexDicEwStr);
            Console.WriteLine("- Dic1 size: " + dic1.GetSize());

            // dic2: lexicon words, then the number-variant file on top
            Console.WriteLine("------- Lexicon words --------");
            string         lexDicStr      = "../data/Dictionary/lexiconDic.data";
            int            lexiconFieldNo = 1;
            RootDictionary dic2           = new BasicDictionary(lexDicStr, lexiconFieldNo);
            Console.WriteLine("- Dic2 File: " + lexDicStr);
            Console.WriteLine("- Dic2 size: " + dic2.GetSize());

            string numDicStr = "../data/Dictionary/NRVAR.1.uSort.data";
            dic2.AddDictionary(numDicStr);
            Console.WriteLine("- Dic2 File: " + numDicStr);
            Console.WriteLine("- Dic2 size: " + dic2.GetSize());
            Console.WriteLine("----------------------");

            // words to look up in every dictionary
            List <string> wordList = new List <string>
            {
                "test",
                "Test",
                "TEST",
                "liter",
                "litre",
                "odor",
                "odour",
                "iodise",
                "iodize",
                "beveled",
                "bevelled",
                "hemolyse",
                "hemolyze",
                "ella",
                "centillionths",
                "Down's",
                "Downs'",
                "spot(s)",
                "fetus(es)",
                "box(es)",
                "waltz(es)",
                "mtach(es)",
                "splash(es)",
                "fly(ies)",
                "extremity(ies)",
                "CASE/TEST",
                "John's/Chris's",
                "50mg/100mg",
                "case/test",
                "neck-lesion",
                "day-night",
                "pneumonoultramicroscopicsilicovolcanoconiosis",
                "Walmart",
                "test321"
            };

            Console.WriteLine("input|baseline|L-element|Lexicon|L-RealWord");
            foreach (string entry in wordList)
            {
                Console.WriteLine("- IsDicWord(" + entry + "): " + dic0.IsDicWord(entry) + ", " + dic1.IsDicWord(entry) + ", " + dic2.IsDicWord(entry) + ", " + dic2.IsValidWord(entry));
            }

            // add one new word everywhere and show the effect
            string word = "test321";
            Console.WriteLine("------ Add [" + word + "] to dictionary ------");
            dic0.AddWord(word);
            dic1.AddWord(word);
            dic2.AddWord(word);
            Console.WriteLine("- Dic0 size: " + dic0.GetSize());
            Console.WriteLine("- Dic1 size: " + dic1.GetSize());
            Console.WriteLine("- Dic2 size: " + dic2.GetSize());
            Console.WriteLine("- IsInDic(" + word + "): " + dic0.IsDicWord(word) + ", " + dic1.IsDicWord(word) + ", " + dic2.IsDicWord(word));
            Console.WriteLine("===== End of Unit Test =====");
        }