Ejemplo n.º 1
        // Scores a term that may be a multiword: the term is split into words
        // with TermUtil.ToWordList and the score is the average of the per-word
        // GetScoreDev1 scores (multiwords = avg. score).
        // inWord:    the term to score
        // wordWcMap: word-count map handed through to GetScoreDev1
        // NOTE(review): this listing shows no braces and no return statement;
        // statement nesting is inferred from indentation only - confirm against
        // the full file.
        private static double GetScoreDev2(string inWord, WordWcMap wordWcMap)
            // check multiword case for split
            bool          normFlag   = false;  // don't use punctuation for determiner
            List <string> wordList   = TermUtil.ToWordList(inWord, normFlag);
            double        score      = 0.0;
            double        totalScore = 0.0;
            int           totalWords = wordList.Count;

            //double maxWc = GetAdjustedWc(wordWcMap.GetMaxWc());
            // use the average score for the multiwords
            foreach (string word in wordList)
                // alternative per-word scorers that were tried and left disabled:
                //double curScore = GetScoreByChurch(word, wordWcMap);
                //double curScore = GetScoreByCrowell(word, wordWcMap);
                //double curScore = GetScoreByPeter(word, wordWcMap);
                //double curScore = GetUnigramFreqScore(word, wordWcMap);
                //double curScore = GetWc(word, wordWcMap);
                double curScore = GetScoreDev1(word, wordWcMap);
                totalScore += curScore;
            // only average when the sum is positive; an empty word list leaves
            // totalScore at 0.0, so the division by totalWords is skipped
            if (totalScore > 0.0)
                score = totalScore / totalWords;
        // Checks that every word of a split term is a valid split word, which
        // is what makes the split itself valid.
        // inTerm is the (already split) term to be checked
        // the inTerm is a coreTerm
        // validFlag starts true and is cleared as soon as one word fails
        // IsValidSplitWord.
        // NOTE(review): braces and the return statement are not visible in this
        // listing; nesting is inferred from indentation only.
        public static bool IsValidSplitWords(string inTerm, CSpellApi cSpellApi)
            //RootDictionary unitDic = cSpellApi.GetUnitDic();
            List <string> splitWordList = TermUtil.ToWordList(inTerm);
            bool          validFlag     = true;

            // go through all split words, they can be:
            // 1. digit (pure number)
            // 2. unit
            // 3. word in the split word dictionary: English + ProperNoun (not Aa)
            // if any splitWord is not above, the split is false
            // NOTE(review): per the disabled code below, only case 3 is
            // actually checked now; the digit and unit checks were removed.
            // NOTE(review): the block comment below has no visible terminator
            // in this listing - confirm against the full file.
            foreach (string splitWord in splitWordList)
                /* remove unit and digit beacuse:
                 * 1. they are handled in ND
                 * 2. some unit are Aa, such as ng, cause noise [FP]
                 * - seing => se i ng, no good
                 * if((DigitPuncTokenUtil.IsDigit(splitWord) == false) // digit
                 * && (unitDic.IsDicWord(splitWord) == false) // unit
                 * && (IsValidSplitWord(splitWord, cSpellApi) == false))// split word
                if (IsValidSplitWord(splitWord, cSpellApi) == false)
                    validFlag = false;
        // Heuristic rule for real-word split:
        // count the short words among the split words of inTerm (the candidate).
        // A word is "short" when its length is <= the configured short split
        // word length (such as 2 or 3).
        // The number of short split words must be less than a configured
        // maximum (default is 2); otherwise the candidate is rejected.
        // This rule is added to filter out: some -> so me,
        // filter out: another -> a not her (shortSplitWordNo = 3)
        // filter out: anyone -> any one (shortSplitWordNo = 2)
        // 1. keep: away -> a way (shortSplitWordNo = 1)
        // 2. filter out: soon -> so on (shortSplitWordNo = 2)
        // 3. filter out: anyway -> any way (shortSplitWordNo = 2)
        // NOTE(review): braces and the return statement are not visible in this
        // listing; nesting is inferred from indentation only.
        private static bool CheckShortSplitWords(string inTerm, CSpellApi cSpellApi)
            // init: both thresholds come from the CSpellApi configuration
            int shortSplitWordLength = cSpellApi.GetCanRwShortSplitWordLength();
            int maxShortSplitWordNo  = cSpellApi.GetCanRwMaxShortSplitWordNo();
            // convert to word list
            List <string> wordList         = TermUtil.ToWordList(inTerm);
            bool          flag             = true;
            int           shortSplitWordNo = 0;   // total no. of short split words

            foreach (string word in wordList)
                // find short word
                // NOTE(review): the statement guarded by this test (presumably
                // the shortSplitWordNo increment) is not visible in this
                // listing - confirm against the full file.
                if (word.Length <= shortSplitWordLength)
            // reject when the number of short split words reaches the maximum
            if (shortSplitWordNo >= maxShortSplitWordNo)
                flag = false;
        // Adds the correction result held in inToken to inList.
        // 3 operations, depending on the token string:
        // 1. merge (delete): add nothing if the str is null or empty (length = 0)
        // 2. 1-to-1: hand the tokenObj to Add1To1Correction if str is a single word
        // 3. split: hand the tokenObj to AddSplitCorrection if the str contains space
        // NOTE(review): braces and the final "else" line are not visible in this
        // listing; nesting is inferred from indentation and the step comments.
        public static void AddSplit1To1Correction(List <TokenObj> inList, TokenObj inToken)
            string tokenStr = inToken.GetTokenStr();

            // 1. do not add to the list if the token is empty
            if ((string.ReferenceEquals(tokenStr, null)) || (tokenStr.Length == 0))
                // do nothing
            // 2. keep the same tokenObj if there is no change
            // 1-to-1 correction
            else if (TermUtil.IsMultiword(tokenStr) == false)
                Add1To1Correction(inList, inToken);
                // TB Deleted
            // 3. split a tokenObj to an arrayList if the str has space
                AddSplitCorrection(inList, inToken);

                // NOTE(review): the block comment below (old Java
                // implementation, marked to be deleted) has no visible
                // terminator in this listing.
                /* TB deleted
                 * ArrayList<TokenObj> tempTokenList = new ArrayList<TokenObj>();
                 * // keep token and delimiters
                 * String[] tokenArray = tokenStr.split(TextObj.patternStrSpace_);
                 * tempTokenList = new ArrayList<TokenObj>(Arrays.stream(tokenArray)
                 *  .map(token -> new TokenObj(inToken, token))
                 *  .collect(Collectors.toList()));
                 * inList.addAll(tempTokenList);
Ejemplo n.º 5
        /// <summary>
        /// Compare two objects o1 and o2.  Both objects o1 and o2 are
        /// NoisyChannelScore.  The compare algorithm, in order of precedence:
        /// 1. fewer words in the candidate string ranks first;
        /// 2. higher noisy channel score ranks first;
        /// 3. orthographic scores, via OrthographicScoreComparator, if they differ;
        /// 4. frequency scores, via FrequencyScoreComparator, if they differ;
        /// 5. otherwise the candidate strings, as cand2.CompareTo(cand1).
        /// </summary>
        /// <param name="o1">  first object to be compared </param>
        /// <param name="o2">  second object to be compared
        /// </param>
        /// <returns>  a negative integer, 0, or positive integer to represent the
        ///          object o1 is less, equals, or greater than object o2. </returns>
        // NOTE(review): braces, one "else" line and the return statement are not
        // visible in this listing; nesting is inferred from indentation only.
        public virtual int Compare(NoisyChannelScore o1, NoisyChannelScore o2)
            // 1. compare how many words for the candidates
            // for now, we assume less word is better,
            // i.e. whatever is better than "what ever"
            int    @out    = 0;
            string cand1   = ((NoisyChannelScore)o1).GetCandStr();
            string cand2   = ((NoisyChannelScore)o2).GetCandStr();
            int    wordNo1 = TermUtil.GetWordNo(cand1);
            int    wordNo2 = TermUtil.GetWordNo(cand2);

            if (wordNo1 != wordNo2)
                @out = wordNo1 - wordNo2;                 // less wordNo has higher rank
                // 2. compare noisy Channel score (same word no)
                double score1 = ((NoisyChannelScore)o1).GetScore();
                double score2 = ((NoisyChannelScore)o2).GetScore();
                // SCR-2: use a fixed number to ensure result is not 0.
                if (score2 > score1)
                    // from high to low: the higher score sorts first
                    @out = 1;
                else if (score2 < score1)
                    @out = -1;
                    // 3. compare by orthographic score (noisy channel scores tie)
                    OrthographicScore oScore1 = ((NoisyChannelScore)o1).GetOScore();
                    OrthographicScore oScore2 = ((NoisyChannelScore)o2).GetOScore();
                    if (oScore1.GetScore() != oScore2.GetScore())
                        OrthographicScoreComparator <OrthographicScore> osc = new OrthographicScoreComparator <OrthographicScore>();
                        @out = osc.Compare(oScore1, oScore2);
                    else                         // 4. compare by frequency score
                        FrequencyScore fScore1 = ((NoisyChannelScore)o1).GetFScore();
                        FrequencyScore fScore2 = ((NoisyChannelScore)o2).GetFScore();
                        if (fScore1.GetScore() != fScore2.GetScore())
                            FrequencyScoreComparator <FrequencyScore> fsc = new FrequencyScoreComparator <FrequencyScore>();
                            @out = fsc.Compare(fScore1, fScore2);
                        else                             // 5. alphabetic order (cand2 compared against cand1)
                            @out = cand2.CompareTo(cand1);
Ejemplo n.º 6
        // Builds one word vector for inTerm from the w2vOm word2Vec object:
        // inTerm is split with TermUtil.ToWordList and the word list is passed
        // to GetAvgWordVecForList (avg. word vector over the words of inTerm).
        // NOTE(review): braces and the return statement are not visible in this
        // listing.
        private static DoubleVec GetWordVecForTerm(string inTerm, Word2Vec w2vOm)
            List <string> inWordList = TermUtil.ToWordList(inTerm);
            // avg. the wordVec if inTerm is a multiword
            DoubleVec outWordVec = GetAvgWordVecForList(inWordList, w2vOm);

            // TBD: take care of possessive
Ejemplo n.º 7
        // private method
        // Test merge and Split: prints context scores for a merged word and for
        // its split form, each against two contexts (window of given radius, and
        // all of inText), followed by the per-word average for the split words.
        // inText:     text that is tokenized to provide the context
        // tarPos:     position of the target in the non-space token list
        // tarSize:    number of tokens taken by the target
        // radius:     context window radius
        // mergedWord: the merged candidate to score
        // splitWords: the split candidate to score
        // w2vIm:      word2Vec object passed to Word2VecContext.GetContextVec
        // w2vOm:      word2Vec object passed to ContextScore
        // NOTE(review): braces are not visible in this listing; nesting is
        // inferred from indentation only.
        private static void Test(string inText, int tarPos, int tarSize, int radius, string mergedWord, string splitWords, Word2Vec w2vIm, Word2Vec w2vOm)
            // 0. process the inText
            TextObj         textObj    = new TextObj(inText);
            List <TokenObj> inTextList = textObj.GetTokenList();
            // remove space token from the list
            List <TokenObj> nonSpaceTokenList = TextObj.GetNonSpaceTokenObjList(inTextList);

            Console.WriteLine("-- inTextList: [" + inText + "]");
            bool word2VecSkipWord = true;
            bool debugFlag        = false;
            // 1.a context with window radius
            DoubleVec contextVec = Word2VecContext.GetContextVec(tarPos, tarSize, nonSpaceTokenList, w2vIm, radius, word2VecSkipWord, debugFlag);
            // 1.b context with all inText
            DoubleVec contextVecA = Word2VecContext.GetContextVec(tarPos, tarSize, nonSpaceTokenList, w2vIm, word2VecSkipWord, debugFlag);
            // 1.c get score1 (merged word) for both contexts
            ContextScore score1  = new ContextScore(mergedWord, contextVec, w2vOm);
            ContextScore score1a = new ContextScore(mergedWord, contextVecA, w2vOm);

            Console.WriteLine(score1.ToString() + "|" + string.Format("{0,1:F8}", score1a.GetScore()));
            // 2. split words, scored as one term for both contexts
            ContextScore score2  = new ContextScore(splitWords, contextVec, w2vOm);
            ContextScore score2a = new ContextScore(splitWords, contextVecA, w2vOm);

            Console.WriteLine(score2.ToString() + "|" + string.Format("{0,1:F8}", score2a.GetScore()));
            // 3. Use avg. score on single words
            // This method use different context for each single word
            List <string> splitWordList = TermUtil.ToWordList(splitWords);
            int           index         = 0;
            double        scoreSAvg     = 0.0d;  // radius
            double        scoreSAAvg    = 0.0d;  // all inText

            //debugFlag = false;
            foreach (string splitWord in splitWordList)
                // window radius
                DoubleVec    contextVecS = Word2VecContext.GetContextVec(tarPos + index, 1, nonSpaceTokenList, w2vIm, radius, word2VecSkipWord, debugFlag);
                ContextScore scoreS      = new ContextScore(splitWord, contextVecS, w2vOm);
                //System.out.println("-- " + scoreS.ToString());
                scoreSAvg += scoreS.GetScore();
                // all text
                DoubleVec    contextVecSA = Word2VecContext.GetContextVec(tarPos + index, 1, nonSpaceTokenList, w2vIm, word2VecSkipWord, debugFlag);
                ContextScore scoreSA      = new ContextScore(splitWord, contextVecSA, w2vOm);
                //System.out.println("-- " + scoreSA.ToString());
                scoreSAAvg += scoreSA.GetScore();
            // turn the sums into averages over the split words
            // NOTE(review): no increment of index is visible in this listing, so
            // as shown index is still 0 here and these are divisions by zero
            // (non-finite double results) - confirm against the full file.
            scoreSAvg  = scoreSAvg / index;            // window
            scoreSAAvg = scoreSAAvg / index;           // all text
            Console.WriteLine("Avg. Single Word|" + string.Format("{0,1:F8}", scoreSAvg) + "|" + string.Format("{0,1:F8}", scoreSAAvg));
Ejemplo n.º 8
        // Get a possible split by replacing every occurrence of puncChar
        // (e.g. a hyphen) in inWord with a space; the result is then passed
        // through TermUtil.Trim.
        // NOTE(review): braces and the return statement are not visible in this
        // listing; nesting is inferred from indentation only.
        protected internal static string GetSplitByPunc(string inWord, char puncChar)
            // work on a char copy so characters can be replaced in place
            char[] temp = inWord.ToCharArray();
            for (int i = 0; i < temp.Length; i++)
                if (temp[i] == puncChar)
                    temp[i] = ' ';
            string splitStr = TermUtil.Trim(new string(temp));

        // Heuristic rule for real-word one-to-one correction:
        // check that all one-to-one words in inTerm (the candidate)
        // 1. have a wordVec in word2VecOm.
        // flag starts true and is cleared when any word has no word vector.
        // NOTE(review): braces and the return statement are not visible in this
        // listing; nesting is inferred from indentation only.
        private static bool Check1To1Words(string inTerm, Word2Vec word2VecOm)
            List <string> wordList = TermUtil.ToWordList(inTerm);
            bool          flag     = true;

            foreach (string word in wordList)
                if (word2VecOm.HasWordVec(word) == false)
                    flag = false;
Ejemplo n.º 10
        // get all possible split combinations made by inserting 1 space
        // lowercase only (inWord is lowercased with ToLower() first)
        // not include duplicates (results are collected in a HashSet)
        // This is the core split process by space
        // NOTE(review): braces, the statements that follow the trailing
        // comments in the loop, and the return statement are not visible in
        // this listing - confirm against the full file.
        protected internal static HashSet <string> GetSplitSetBy1Space(string inWord)
            HashSet <string> splitSet = new HashSet <string>();
            string           word     = inWord.ToLower();

            // Insert space inside the word, not on either ends:
            // i runs over the interior positions 1 .. word.Length - 1
            for (int i = 1; i < word.Length; i++)
                // Insert space for split
                string insertWord = word.Substring(0, i) + GlobalVars.SPACE_STR + word.Substring(i);
                // remove multiple spaces
                // needed when inserting a space to a space
                // Use this to convert "a  b" to "a b"
Ejemplo n.º 11
        // Builds a MergeObj for a merged-word candidate when the core string of
        // mergeWord is a word in suggestDic and is not a word in aADic.
        // NOTE(review): braces and any statement that stores mergeObj (mergeSet
        // is not used in the visible lines) are not visible in this listing.
        private static void AddMergeObj(string tarWord, string orgMergeWord, string mergeWord, int mergeNo, int startIndex, int tarIndex, int endIndex, int startPos, int tarPos, int endPos, HashSet <MergeObj> mergeSet, RootDictionary suggestDic, RootDictionary aADic)
            // 1. convert merged word to coreTerm
            // NOTE(review): ctType and lcFlag are not used in the visible lines
            int  ctType = CoreTermUtil.CT_TYPE_SPACE_PUNC;
            bool lcFlag = true;
            // only take care of the end punctuation for the coreTerm
            string coreStr = TermUtil.StripEndPuncSpace(mergeWord);

            // 2. check if the coreStr of mergeWord is in suggest Dic
            // the merge word is not a Aa, assuming no merge for Aa
            // because Aa is short enough
            if ((suggestDic.IsDicWord(coreStr) == true) && (aADic.IsDicWord(coreStr) == false))
                MergeObj mergeObj = new MergeObj(tarWord, orgMergeWord, mergeWord, coreStr, mergeNo, startIndex, tarIndex, endIndex, startPos, tarPos, endPos);
Ejemplo n.º 12
        // this method is to be deleted because it has same result as GetScore()
        // Sums GetCwobScore(wordVec, contextVec) over the words of inTerm that
        // have a word vector in w2vOm, then divides the sum by count.
        // NOTE(review): braces and the return statement are not visible in this
        // listing; nesting is inferred from indentation only.
        public static double GetScore2(string inTerm, DoubleVec contextVec, Word2Vec w2vOm)
            List <string> inWordList = TermUtil.ToWordList(inTerm);
            double        score      = 0.0d;
            int           count      = 0;

            foreach (string word in inWordList)
                DoubleVec wordVec = w2vOm.GetWordVec(word);
                // words without a word vector contribute nothing to the sum
                if (wordVec != null)
                    score += GetCwobScore(wordVec, contextVec);
            // add score first, then calculate the avg.
            // NOTE(review): no increment of count is visible in this listing, so
            // as shown count is still 0 here and this is a division by zero
            // (non-finite double result) - confirm against the full file.
            score = score / count;
Ejemplo n.º 13
        // get score for single word and multiwords (for split cases)
        // 1). multiword: score = avg. score of all words
        // 2). single word: score =  log(adjust WC) / log (adjust Max. WC).
        // NOTE(review): the formula in 2) is from the original comment; the
        // per-word computation lives in GetWordScore, which is not shown here.
        // NOTE(review): braces and the return statement are not visible in this
        // listing; nesting is inferred from indentation only.
        public static double GetAdjustScoreAvg(string inWord, WordWcMap wordWcMap)
            // check multiword case for split
            bool          normFlag   = false;  // don't use punctuation for determiner
            List <string> wordList   = TermUtil.ToWordList(inWord, normFlag);
            double        score      = 0.0;
            double        totalScore = 0.0;
            long          totalWords = wordList.Count;
            // adjusted max. word count, computed once and shared by every word
            double        maxWc      = GetAdjustedWc(wordWcMap.GetMaxWc());

            // use the average score for the multiwords
            foreach (string word in wordList)
                totalScore += GetWordScore(word, maxWc, wordWcMap);
            // an empty word list keeps the score at 0.0 instead of dividing by zero
            if (totalWords > 0)
                score = totalScore / totalWords;
        // check all split words of inTerm: flag starts true and is cleared when
        // any split word fails IsValidSplitWord.
        // NOTE(review): braces and the return statement are not visible in this
        // listing; nesting is inferred from indentation only.
        private static bool CheckSplitWords(string inTerm, CSpellApi cSpellApi)
            // convert to word list
            List <string> splitWordList = TermUtil.ToWordList(inTerm);
            // go through all split words, they can be:
            // 1. digit (pure number)
            // 2. unit
            // 3. word in the split word dictionary: English + ProperNoun (not Aa)
            // if any splitWord is not above, the split is false
            // NOTE(review): the visible code only calls IsValidSplitWord; whether
            // that helper covers cases 1 and 2 cannot be told from here.
            bool flag = true;

            foreach (string splitWord in splitWordList)
                // check each split word
                if (IsValidSplitWord(splitWord, cSpellApi) == false)
                    flag = false;
Ejemplo n.º 15
        // public method
        /// <summary>
        /// The core method to correct a word by merge, in the following steps:
        /// <ul>
        /// <li>Convert inToken to removeEndPuncStr</li>
        /// <li>detect if misspell (OOV) - non-word, exclude Aa</li>
        /// <li>get candidates
        ///     <ul>
        ///     <li>get candidates from merge.</li>
        ///     </ul>
        /// </li>
        /// <li>Rank candidates
        ///     <ul>
        ///     <li>orthographic</li>
        ///     <li>frequency</li>
        ///     <li>context</li>
        ///     </ul>
        /// </li>
        /// <li>Update information</li>
        /// </ul>
        /// </summary>
        /// <param name="tarPos">    position of target token </param>
        /// <param name="nonSpaceTokenList"> token list without space token(s) </param>
        /// <param name="cSpellApi"> CSpell Api object </param>
        /// <param name="debugFlag"> flag for debug print
        /// </param>
        /// <returns>    the corrected merged word in MergeObj if the token is OOV
        ///             and suggested merged word found.
        ///             Otherwise, a null of MergeObj is returned. </returns>
        // outMergeObj is initialized to null (no merge) and is only assigned
        // when the merge detector fires.
        // NOTE(review): the ranking list in the summary names orthographic, but
        // the in-body comment at step 4.1 says it is not used - confirm.
        // NOTE(review): braces and the return statement are not visible in this
        // listing; nesting is inferred from indentation only.
        public static MergeObj GetCorrectTerm(int tarPos, List <TokenObj> nonSpaceTokenList, CSpellApi cSpellApi, bool debugFlag)
            // get tarWord from tarTokenObj and init outTokenObj
            TokenObj tarTokenObj = nonSpaceTokenList[tarPos];
            string   tarWord     = tarTokenObj.GetTokenStr();
            MergeObj outMergeObj = null;             // no merge if it is null
            // 1. only remove ending punctuation for coreTerm (result is lowercased)
            string coreStr = TermUtil.StripEndPuncSpace(tarWord).ToLower();

            // 2. non-word correction
            // check if tarWord and removeEndPuncStr is OOV
            if (NonWordMergeDetector.IsDetect(tarWord, coreStr, cSpellApi, debugFlag) == true)
                // 3. get candidates from merge
                HashSet <MergeObj> mergeSet = NonWordMergeCandidates.GetCandidates(tarPos, nonSpaceTokenList, cSpellApi);
                // 4. Ranking: get top ranked candidates as corrected terms
                // 4.1 just use frequency or context, no orthoGraphic
                // in case of using context
                outMergeObj = RankNonWordMergeByMode.GetTopRankMergeObj(mergeSet, cSpellApi, tarPos, nonSpaceTokenList, debugFlag);
Ejemplo n.º 16
        // Scores a term that may be a multiword with the minimum of the
        // per-word GetWordScore values (the weakest word decides the score).
        // NOTE(review): braces and the return statement are not visible in this
        // listing; nesting is inferred from indentation only.
        public static double GetAdjustScoreMin(string inWord, WordWcMap wordWcMap)
            // check multiword case for split
            bool          normFlag   = false;  // don't use punctuation for determiner
            List <string> wordList   = TermUtil.ToWordList(inWord, normFlag);
            double        score      = 0.0;
            // NOTE(review): totalScore and totalWords are not used in the
            // visible lines
            double        totalScore = 0.0;
            int           totalWords = wordList.Count;
            double        maxWc      = GetAdjustedWc(wordWcMap.GetMaxWc());
            // use the minimum score for the multiwords;
            // int.MaxValue serves as the "no word seen yet" sentinel
            // NOTE(review): the sentinel is an int constant stored in a double -
            // confirm word scores can never reach it
            double minScore = int.MaxValue;

            foreach (string word in wordList)
                double curScore = GetWordScore(word, maxWc, wordWcMap);
                minScore = (curScore < minScore ? curScore : minScore);
            // score stays 0.0 when no word lowered the sentinel
            // (e.g. an empty word list)
            if (minScore < int.MaxValue)
                score = minScore;
Ejemplo n.º 17
        /// <summary>
        /// Compare two objects o1 and o2.  Both objects o1 and o2 are
        /// FrequencyScore.  The compare algorithm, in order of precedence:
        /// 1. fewer words in the word string ranks first;
        /// 2. higher score ranks first;
        /// 3. otherwise the word strings, as word2.CompareTo(word1).
        /// </summary>
        /// <param name="o1">  first object to be compared </param>
        /// <param name="o2">  second object to be compared
        /// </param>
        /// <returns>  a negative integer, 0, or positive integer to represent the
        ///          object o1 is less, equals, or greater than object o2. </returns>
        // NOTE(review): braces and the return statement are not visible in this
        // listing; nesting is inferred from indentation only.
        public virtual int Compare(FrequencyScore o1, FrequencyScore o2)
            // 1. compare how many words
            // for now, we assume less word is better,
            // i.e. whatever is better than "what ever"
            int    @out    = 0;
            string word1   = ((FrequencyScore)o1).GetWord();
            string word2   = ((FrequencyScore)o2).GetWord();
            int    wordNo1 = TermUtil.GetWordNo(word1);
            int    wordNo2 = TermUtil.GetWordNo(word2);

            if (wordNo1 != wordNo2)
                @out = wordNo1 - wordNo2; // less wordNo has higher rank
            else                          // same word no
                                          // 2. compare total score first
                double score1 = ((FrequencyScore)o1).GetScore();
                double score2 = ((FrequencyScore)o2).GetScore();
                // SCR-2: use a fixed number to ensure result is not 0.
                if (score2 > score1)
                    // from high to low: the higher score sorts first
                    @out = 1;
                else if (score2 < score1)
                    @out = -1;
                else                     // 3. alphabetic order of word (word2 compared against word1)
                    @out = word2.CompareTo(word1);
Ejemplo n.º 18
        /// <summary>
        /// This method uses context scores to find the correct term for a
        /// non-word: it collects 1-to-1 and/or split candidates (depending on
        /// the function mode) and ranks them with RankNonWordByMode.
        /// </summary>
        /// <param name="inTokenObj">    the input tokenObj (single word) </param>
        /// <param name="cSpellApi"> CSpell Api object </param>
        /// <param name="debugFlag"> flag for debug print </param>
        /// <param name="tarPos"> position for target token </param>
        /// <param name="nonSpaceTokenList"> token list without space token(s)
        /// </param>
        /// <returns>    the corrected word in tokenObj if the coreTerm is OOV
        ///             and suggested word found. Otherwise, the original input token
        ///             is returned. </returns>
        // NOTE(review): braces, several statements (e.g. whatever follows the
        // "3.4 add split candidates" else, and the step that applies topRankStr
        // to coreTermObj) and the return statement are not visible in this
        // listing, and the method may continue past the last visible line -
        // confirm against the full file.
        public static TokenObj GetCorrectTerm(TokenObj inTokenObj, CSpellApi cSpellApi, bool debugFlag, int tarPos, List <TokenObj> nonSpaceTokenList)
            // init
            int funcMode = cSpellApi.GetFuncMode();

            // get inWord from inTokenObj and init outTokenObj (a new TokenObj
            // constructed from inTokenObj)
            string   inWord      = inTokenObj.GetTokenStr();
            TokenObj outTokenObj = new TokenObj(inTokenObj);
            // 1. convert a word to coreTerm (no leading/ending space, punc, digit)
            int         ctType      = CoreTermUtil.CT_TYPE_SPACE_PUNC_DIGIT;
            CoreTermObj coreTermObj = new CoreTermObj(inWord, ctType);
            string      coreStr     = coreTermObj.GetCoreTerm();

            // 2. non-word detection and correction
            // check if the coreTerm is spelling errors - non-word
            //!NonWordDetector.IsValidWord(inWord, coreStr, cSpellApi, debugFlag);
            // TBD .. need to separate 1-to-1 and split
            if (NonWordDetector.IsDetect(inWord, coreStr, cSpellApi, debugFlag) == true)
                // TBD, should take care of possessive xxx's here
                // 3.1 get 1-to-1 candidates set from correction, no split
                HashSet <string> candSet = NonWord1To1Candidates.GetCandidates(coreStr, cSpellApi);
                // add split
                // TBD ...
                // split candidates are skipped only in the 1-to-1-only mode
                if (funcMode != CSpellApi.FUNC_MODE_NW_1)
                    // 3.2 get candidates from split
                    int maxSplitNo            = cSpellApi.GetCanNwMaxSplitNo();
                    HashSet <string> splitSet = NonWordSplitCandidates.GetCandidates(coreStr, cSpellApi, maxSplitNo);
                    // 3.3 split-only mode: replace the candidates with the split set
                    if (funcMode == CSpellApi.FUNC_MODE_NW_S)
                        candSet = new HashSet <string>(splitSet);
                    else                         // 3.4 add split candidates
                // 4. Ranking: get top ranked candidates as corrected terms
                // 4.1 from orthoGraphic

                 * // not used context
                 * String topRankStr = RankByMode.GetTopRankStr(coreStr, candSet,
                 *  cSpellApi, debugFlag);
                // in case of using context
                string topRankStr = RankNonWordByMode.GetTopRankStr(coreStr, candSet, cSpellApi, debugFlag, tarPos, nonSpaceTokenList);
                // 5 update coreTerm and convert back to tokenObj
                string outWord = coreTermObj.ToString();
                // 6. update info if there is a process (the word changed)
                if (inWord.Equals(outWord) == false)
                    // a multiword result means the correction was a split
                    if (TermUtil.IsMultiword(outWord) == true)
                        outTokenObj.AddProcToHist(TokenObj.HIST_NW_S);                         //split
                        DebugPrint.PrintCorrect("NW", "NonWordCorrector-Split", inWord, outWord, debugFlag);
                    else                         // 1To1 correct
                        DebugPrint.PrintCorrect("NW", "NonWordCorrector-1To1", inWord, outWord, debugFlag);