Ejemplo n.º 1
0
        // include multiwords, multiwords = avg. score
        // Scores inWord as the mean of the GetScoreDev1 scores of its words.
        // Returns 0.0 unless the summed score is strictly positive.
        private static double GetScoreDev2(string inWord, WordWcMap wordWcMap)
        {
            // normFlag is passed as false (don't use punctuation for determiner)
            List <string> words     = TermUtil.ToWordList(inWord, false);
            int           wordCount = words.Count;
            double        sum       = 0.0;

            // accumulate the per-word scores in list order
            for (int i = 0; i < wordCount; i++)
            {
                sum += GetScoreDev1(words[i], wordWcMap);
            }
            // a non-positive sum (which includes the empty list) yields 0.0
            return (sum > 0.0) ? (sum / wordCount) : 0.0;
        }
        // These are heuristic rules for real-word split
        // check the total no. of short words among the split words in inTerm (candidate)
        // the short word length is configurable, such as 2 or 3
        // the total no. of short split words must be less than a number, default is 2
        // This rule is added to filter out: some -> so me,
        // filter out: another -> a not her (shortSplitWordNo = 3)
        // filter out: anyone -> any one (shortSplitWordNo = 2)
        // 1. keep: away -> a way (shortSplitWordNo = 1)
        // 2. filter out: soon -> so on (shortSplitWordNo = 2)
        // 3. filter out: anyway -> any way (shortSplitWordNo = 2)
        // Returns true when inTerm holds fewer short words than the configured maximum.
        // A word is "short" when its length is at most the configured short-word length.
        private static bool CheckShortSplitWords(string inTerm, CSpellApi cSpellApi)
        {
            // both thresholds are read from cSpellApi
            int shortLen = cSpellApi.GetCanRwShortSplitWordLength();
            int shortMax = cSpellApi.GetCanRwMaxShortSplitWordNo();
            List <string> words = TermUtil.ToWordList(inTerm);

            // count the words whose length does not exceed shortLen
            int shortCount = 0;
            for (int i = 0; i < words.Count; i++)
            {
                if (words[i].Length <= shortLen)
                {
                    shortCount++;
                }
            }
            // the candidate is rejected once the count reaches the maximum
            return(shortCount < shortMax);
        }
        // check all split words from a term to verify it is a valid split
        // inTerm is the term to be split
        // the inTerm is a coreTerm
        // Returns true only if every word of inTerm passes IsValidSplitWord;
        // stops at the first word that fails.
        public static bool IsValidSplitWords(string inTerm, CSpellApi cSpellApi)
        {
            // Digit and unit checks are deliberately not applied here:
            // 1. they are handled in ND
            // 2. some units are Aa (such as ng) and cause noise [FP],
            //    e.g. seing => se i ng, no good
            foreach (string candidate in TermUtil.ToWordList(inTerm))
            {
                if (!IsValidSplitWord(candidate, cSpellApi))
                {
                    return(false);
                }
            }
            return(true);
        }
Ejemplo n.º 4
0
        // Use Avg. word2Vec Om for each word in the inTerm
        // Converts inTerm to a word list and returns the result of
        // GetAvgWordVecForList for that list and w2vOm.
        private static DoubleVec GetWordVecForTerm(string inTerm, Word2Vec w2vOm)
        {
            // a multiword term is represented by the averaged vector of its words
            // TBD: take care of possessive
            return(GetAvgWordVecForList(TermUtil.ToWordList(inTerm), w2vOm));
        }
Ejemplo n.º 5
0
        // private method
        // Test merge and Split
        // Prints context scores for a merged word and for its split form, each computed
        // against a windowed context (radius) and an all-text context, followed by the
        // average of the per-word scores of the split words.
        private static void Test(string inText, int tarPos, int tarSize, int radius, string mergedWord, string splitWords, Word2Vec w2vIm, Word2Vec w2vOm)
        {
            const string scoreFormat = "{0,1:F8}";
            const bool   skipWord    = true;
            const bool   debug       = false;

            // 0. tokenize inText and drop the space tokens
            List <TokenObj> tokens = TextObj.GetNonSpaceTokenObjList(new TextObj(inText).GetTokenList());

            Console.WriteLine("==========================================");
            Console.WriteLine("-- inTextList: [" + inText + "]");

            // 1. context vectors: windowed (radius) and all of inText
            DoubleVec windowCtx = Word2VecContext.GetContextVec(tarPos, tarSize, tokens, w2vIm, radius, skipWord, debug);
            DoubleVec fullCtx   = Word2VecContext.GetContextVec(tarPos, tarSize, tokens, w2vIm, skipWord, debug);

            // 1.c merged word against both contexts
            ContextScore mergedWindow = new ContextScore(mergedWord, windowCtx, w2vOm);
            ContextScore mergedFull   = new ContextScore(mergedWord, fullCtx, w2vOm);
            Console.WriteLine(mergedWindow.ToString() + "|" + string.Format(scoreFormat, mergedFull.GetScore()));

            // 2. split words (as one term) against both contexts
            ContextScore splitWindow = new ContextScore(splitWords, windowCtx, w2vOm);
            ContextScore splitFull   = new ContextScore(splitWords, fullCtx, w2vOm);
            Console.WriteLine(splitWindow.ToString() + "|" + string.Format(scoreFormat, splitFull.GetScore()));

            // 3. average score over the single split words;
            // each word gets its own context, taken at tarPos + its offset with size 1
            List <string> singleWords = TermUtil.ToWordList(splitWords);
            int           wordNo      = singleWords.Count;
            double        windowSum   = 0.0d;  // radius
            double        fullSum     = 0.0d;  // all inText

            for (int offset = 0; offset < wordNo; offset++)
            {
                string single = singleWords[offset];
                // window radius
                DoubleVec    singleWindowCtx = Word2VecContext.GetContextVec(tarPos + offset, 1, tokens, w2vIm, radius, skipWord, debug);
                ContextScore singleWindow    = new ContextScore(single, singleWindowCtx, w2vOm);
                windowSum += singleWindow.GetScore();
                // all text
                DoubleVec    singleFullCtx = Word2VecContext.GetContextVec(tarPos + offset, 1, tokens, w2vIm, skipWord, debug);
                ContextScore singleFull    = new ContextScore(single, singleFullCtx, w2vOm);
                fullSum += singleFull.GetScore();
            }
            double windowAvg = windowSum / wordNo;     // window
            double fullAvg   = fullSum / wordNo;       // all text
            Console.WriteLine("Avg. Single Word|" + string.Format(scoreFormat, windowAvg) + "|" + string.Format(scoreFormat, fullAvg));
        }
        // These are heuristic rules for real-word one-to-one correction
        // check if all one-to-one words in inTerm (candidate)
        // 1. must have wordVec.
        // Returns true when word2VecOm reports a word vector for every word of inTerm;
        // evaluation stops at the first word without one.
        private static bool Check1To1Words(string inTerm, Word2Vec word2VecOm)
        {
            List <string> words = TermUtil.ToWordList(inTerm);

            // TrueForAll stops at the first failing word, like a loop with break
            return(words.TrueForAll(word => word2VecOm.HasWordVec(word)));
        }
Ejemplo n.º 7
0
        // this method is to be deleted because it has same result as GetScore()
        // Averages the GetCwobScore of each word of inTerm against contextVec.
        // Words for which w2vOm has no vector add nothing to the sum but are still
        // counted in the divisor.
        // Returns 0.0 when inTerm yields no words (dividing 0.0 by a zero count
        // would otherwise return NaN).
        public static double GetScore2(string inTerm, DoubleVec contextVec, Word2Vec w2vOm)
        {
            List <string> inWordList = TermUtil.ToWordList(inTerm);
            int           count      = inWordList.Count;

            // nothing to average: avoid 0.0 / 0 => NaN
            if (count == 0)
            {
                return(0.0d);
            }

            double score = 0.0d;
            foreach (string word in inWordList)
            {
                DoubleVec wordVec = w2vOm.GetWordVec(word);
                // words without a vector are skipped in the sum only
                if (wordVec != null)
                {
                    score += GetCwobScore(wordVec, contextVec);
                }
            }
            // add score first, then calculate the avg. over all words
            return(score / count);
        }
Ejemplo n.º 8
0
        // get score for single word and multiwords (for split cases)
        // 1). multiword: score = avg. score of all words
        // 2). single word: score =  log(adjust WC) / log (adjust Max. WC).
        // Returns the mean GetWordScore over the words of inWord, or 0.0 when
        // inWord yields no words.
        public static double GetAdjustScoreAvg(string inWord, WordWcMap wordWcMap)
        {
            // normFlag is passed as false (don't use punctuation for determiner)
            List <string> words  = TermUtil.ToWordList(inWord, false);
            int           wordNo = words.Count;
            // the adjusted max. word count is shared by every per-word score
            double adjustedMaxWc = GetAdjustedWc(wordWcMap.GetMaxWc());

            if (wordNo == 0)
            {
                return(0.0);
            }

            double sum = 0.0;
            for (int i = 0; i < wordNo; i++)
            {
                sum += GetWordScore(words[i], adjustedMaxWc, wordWcMap);
            }
            return(sum / wordNo);
        }
        // check all split words
        // Returns true only when every word of inTerm passes IsValidSplitWord.
        private static bool CheckSplitWords(string inTerm, CSpellApi cSpellApi)
        {
            List <string> words = TermUtil.ToWordList(inTerm);

            // split words can be:
            // 1. digit (pure number)
            // 2. unit
            // 3. word in the split word dictionary: English + ProperNoun (not Aa)
            // the first word that fails the check makes the whole split invalid
            for (int i = 0; i < words.Count; i++)
            {
                if (!IsValidSplitWord(words[i], cSpellApi))
                {
                    return(false);
                }
            }
            return(true);
        }
Ejemplo n.º 10
0
        // Scores inWord by the lowest GetWordScore among its words.
        // Returns 0.0 when no word produced a comparable score (e.g. inWord yields
        // no words).
        public static double GetAdjustScoreMin(string inWord, WordWcMap wordWcMap)
        {
            // check multiword case for split
            bool          normFlag = false;    // don't use punctuation for determiner
            List <string> wordList = TermUtil.ToWordList(inWord, normFlag);
            double        maxWc    = GetAdjustedWc(wordWcMap.GetMaxWc());
            // use the minimum score for the multiwords;
            // +Infinity marks "no minimum found yet", so no finite score is mistaken
            // for the sentinel
            double minScore = double.PositiveInfinity;

            foreach (string word in wordList)
            {
                double curScore = GetWordScore(word, maxWc, wordWcMap);
                // a NaN score never compares less-than, so it is ignored
                if (curScore < minScore)
                {
                    minScore = curScore;
                }
            }
            // still at the sentinel: nothing lowered it, fall back to 0.0
            return(double.IsPositiveInfinity(minScore) ? 0.0 : minScore);
        }