示例#1
0
        // Manual test driver for ContextScore.
        // Accepts no command-line arguments: if any are given it prints a usage line and exits.
        public static void MainTest(string[] args)
        {
            if (args.Length > 0)
            {
                Console.WriteLine("Usage: java ContextScore");
                Environment.Exit(0);
            }

            // TBD: cover real-word and non-word terms for the split and merge cases:
            //   non-word split: knowabout -> know about, hotflahes -> hot flashes, alot -> a lot
            //   non-word merge: stiff n ess -> stiffness
            //   real-word:      whatever vs. what ever, onset vs. on set
            //
            // Alternative setup through the API (reference only, not executed):
            //   String configFile = "../data/Config/cSpell.properties";
            //   CSpellApi cSpellApi = new CSpellApi(configFile);
            //   Word2Vec w2vIm = cSpellApi.GetWord2VecIm();
            //   Word2Vec w2vOm = cSpellApi.GetWord2VecOm();
            const bool verbose = true;

            // syn0.data is loaded as the input matrix (IM), syn1n.data as the output matrix (OM)
            Word2Vec inputMatrix  = new Word2Vec("../data/Context/syn0.data", verbose);
            Word2Vec outputMatrix = new Word2Vec("../data/Context/syn1n.data", verbose);

            // Tests(inputMatrix, outputMatrix) is currently disabled; only the "on set" case runs.
            TestOnSet(inputMatrix, outputMatrix);
        }
示例#2
0
        // Unit test driver for Word2Vec.
        // With one argument, that argument is used as the data file path; with none, a
        // built-in default is used; with more than one, usage is printed to stderr and
        // the process exits with code 1.
        public static void MainTest(string[] args)
        {
            // default data file (an earlier default was "../data/Context/word2Vec.data")
            string dataPath = "../data/Context/syn1n.data";

            switch (args.Length)
            {
            case 0:
                break;

            case 1:
                dataPath = args[0];
                break;

            default:
                Console.Error.WriteLine("Usage: java Word2Vec <inFile>");
                Environment.Exit(1);
                break;
            }

            // load the model and print a few basic facts plus some membership probes
            try
            {
                Word2Vec model = new Word2Vec(dataPath);
                Console.WriteLine("Dimension: " + model.GetDimension());
                Console.WriteLine("Word No: " + model.GetWordNo());
                Console.WriteLine("Word size in WrodVec: " + model.GetWordVecMap().Keys.Count);
                foreach (string probe in new[] { "man", "king", "ago", "a", "ia", "m", "xyxy" })
                {
                    Console.WriteLine("HasWordVec(" + probe + "): " + model.HasWordVec(probe));
                }
            }
            catch (Exception e)
            {
                // report the failure on stdout instead of letting it propagate
                Console.WriteLine(e.ToString());
                Console.Write(e.StackTrace);
            }
        }
示例#3
0
        // Prints the context scores of "onset" and "on set" for a fixed sample sentence.
        // w2vIm is used to build the context vector; w2vOm is used to score the candidates.
        private static void TestOnSet(Word2Vec w2vIm, Word2Vec w2vOm)
        {
            string sentence = "He was diagnosed early on set dementia 3 years ago.";

            // tokenize the sentence, then drop the space tokens
            List<TokenObj> allTokens = new TextObj(sentence).GetTokenList();
            List<TokenObj> tokens    = TextObj.GetNonSpaceTokenObjList(allTokens);

            Console.WriteLine("==========================================");
            Console.WriteLine("-- inTextList: [" + sentence + "]");

            // target starts at non-space token 4 and spans 2 tokens ("on set");
            // context window radius 2, word2VecSkipWord = true, debug output off
            DoubleVec context = Word2VecContext.GetContextVec(4, 2, tokens, w2vIm, 2, true, false);

            // score both spellings against the same context before printing anything
            string       merged      = "onset";
            string       split       = "on set";
            ContextScore mergedScore = new ContextScore(merged, context, w2vOm);
            ContextScore splitScore  = new ContextScore(split, context, w2vOm);

            Console.WriteLine("- [" + merged + "]: " + mergedScore.ToString());
            Console.WriteLine("- [" + split + "]: " + splitScore.ToString());
        }
        // Prints one result row for inTerm: term|GetScore|GetScore2|GetSimilarityScore,
        // each score formatted with 8 decimal places.
        // GetScore and GetScore2 use w2vOm; GetSimilarityScore uses w2vIm.
        private static void Test(string inTerm, DoubleVec contextVec, Word2Vec w2vIm, Word2Vec w2vOm)
        {
            double cbow    = GetScore(inTerm, contextVec, w2vOm);
            double cbowAlt = GetScore2(inTerm, contextVec, w2vOm);
            double cosine  = GetSimilarityScore(inTerm, contextVec, w2vIm);

            string row = string.Format("{0}|{1,1:F8}|{2,1:F8}|{3,1:F8}", inTerm, cbow, cbowAlt, cosine);

            Console.WriteLine(row);
        }
示例#5
0
 // public constructor
 /// <summary>
 /// Creates the context score of a term: stores the term and the score that
 /// Word2VecScore.GetScore computes for it against the given context vector.
 /// </summary>
 /// <param name="inTerm"> target token or candidate (can be multiword) </param>
 /// <param name="contextVec"> wordVec of the context, built from the input matrix (IM) </param>
 /// <param name="word2Vec"> word2Vec output matrix (OM) </param>
 public ContextScore(string inTerm, DoubleVec contextVec, Word2Vec word2Vec)
 {
     this.term_ = inTerm;

     // Score comes from Word2VecScore; an n-gram (2-3 gram) based score is still TBD.
     double contextScore = Word2VecScore.GetScore(inTerm, contextVec, word2Vec);
     this.score_ = contextScore;
 }
示例#6
0
        // Selects the top-ranked candidate for inStr using the rank mode configured in
        // cSpellApi, and hands the matching score detail to ScoreDetailByMode for printing.
        // Returns inStr unchanged when candidates is empty or when the rank mode matches
        // none of the cases below.
        // tarPos: starts from 0 and does not count empty-space tokens.
        public static string GetTopRankStr(string inStr, HashSet<string> candidates, CSpellApi cSpellApi, bool debugFlag, int tarPos, List<TokenObj> nonSpaceTokenList)
        {
            // read all settings up front
            int       mode        = cSpellApi.GetRankMode();
            double    edDistFac   = cSpellApi.GetOrthoScoreEdDistFac();
            double    phoneticFac = cSpellApi.GetOrthoScorePhoneticFac();
            double    overlapFac  = cSpellApi.GetOrthoScoreOverlapFac();
            WordWcMap wcMap       = cSpellApi.GetWordWcMap();
            int       maxCands    = cSpellApi.GetCanMaxCandNo();
            Word2Vec  im          = cSpellApi.GetWord2VecIm();
            Word2Vec  om          = cSpellApi.GetWord2VecOm();
            int       radius      = cSpellApi.GetNw1To1ContextRadius();
            bool      skipWord    = cSpellApi.GetWord2VecSkipWord();
            double    rangeFac    = cSpellApi.GetRankNwS1RankRangeFac();
            double    minOScore   = cSpellApi.GetRankNwS1MinOScore();
            const int targetSize  = 1;   // only for one-to-one or split, no merge here

            // nothing to rank: keep the input string
            if (candidates.Count == 0)
            {
                return inStr;
            }

            string best;
            switch (mode)
            {
            case CSpellApi.RANK_MODE_ORTHOGRAPHIC:
                best = RankByOrthographic.GetTopRankStr(inStr, candidates, edDistFac, phoneticFac, overlapFac);
                ScoreDetailByMode.PrintOrthographicScore(inStr, candidates, maxCands, edDistFac, phoneticFac, overlapFac, debugFlag);
                break;

            case CSpellApi.RANK_MODE_FREQUENCY:
                best = RankByFrequency.GetTopRankStr(candidates, wcMap);
                ScoreDetailByMode.PrintFrequencyScore(candidates, wcMap, maxCands, debugFlag);
                break;

            case CSpellApi.RANK_MODE_CONTEXT:
                best = RankByContext.GetTopRankStr(inStr, candidates, tarPos, targetSize, nonSpaceTokenList, im, om, skipWord, radius);
                ScoreDetailByMode.PrintContextScore(candidates, tarPos, targetSize, nonSpaceTokenList, im, om, skipWord, radius, maxCands, debugFlag);
                break;

            case CSpellApi.RANK_MODE_NOISY_CHANNEL:
                best = RankByNoisyChannel.GetTopRankStr(inStr, candidates, wcMap, edDistFac, phoneticFac, overlapFac);
                ScoreDetailByMode.PrintNoisyChannelScore(inStr, candidates, wcMap, maxCands, edDistFac, phoneticFac, overlapFac, debugFlag);
                break;

            case CSpellApi.RANK_MODE_ENSEMBLE:
                best = RankByEnsemble.GetTopRankStr(inStr, candidates, wcMap, tarPos, targetSize, nonSpaceTokenList, im, om, skipWord, radius, rangeFac, edDistFac, phoneticFac, overlapFac);
                // ensemble prints the same basic score detail as the CSpell mode
                ScoreDetailByMode.PrintCSpellScore(inStr, candidates, wcMap, maxCands, tarPos, targetSize, nonSpaceTokenList, im, om, skipWord, radius, edDistFac, phoneticFac, overlapFac, debugFlag);
                break;

            case CSpellApi.RANK_MODE_CSPELL:
                best = RankByCSpellNonWord.GetTopRankStr(inStr, candidates, wcMap, tarPos, targetSize, nonSpaceTokenList, im, om, skipWord, radius, rangeFac, minOScore, edDistFac, phoneticFac, overlapFac);
                ScoreDetailByMode.PrintCSpellScore(inStr, candidates, wcMap, maxCands, tarPos, targetSize, nonSpaceTokenList, im, om, skipWord, radius, edDistFac, phoneticFac, overlapFac, debugFlag);
                break;

            default:
                // unrecognized rank mode: no ranking is done
                best = inStr;
                break;
            }
            return best;
        }
示例#7
0
        // Runs the fixed (input, candidate) test pairs through Test().
        // Not completed with contextScore: w2vOm is accepted but not used yet.
        private static void Tests(WordWcMap wordWcMap, Word2Vec w2vOm)
        {
            // (the unused local test-string list was removed; it was never filled or read)
            Test("spel", "spell", wordWcMap);
            Test("spel", "speil", wordWcMap);
            Test("spelld", "spell", wordWcMap);
            Test("spelld", "spelled", wordWcMap);
        }
        // Similarity score of a term against a context vector, following the ensemble paper.
        // inTerm: candidate (can be multiword)
        // contextVec: wordVec of context
        // word2VecIm: word2Vec input matrix - syn0
        // The term's averaged word vector is taken from word2VecIm and passed, together
        // with contextVec, to the vector-based GetSimilarityScore overload.
        public static double GetSimilarityScore(string inTerm, DoubleVec contextVec, Word2Vec word2VecIm)
        {
            return GetSimilarityScore(GetWordVecForTerm(inTerm, word2VecIm), contextVec);
        }
        // Word vector for a term: the average of the word vectors of its words, looked up
        // in the given matrix. Despite the parameter name, callers in this file pass both
        // the output matrix (GetScore) and the input matrix (GetSimilarityScore).
        // TBD: take care of possessives.
        private static DoubleVec GetWordVecForTerm(string inTerm, Word2Vec w2vOm)
        {
            // split the term into words; a multiword term is averaged over its words
            List<string> words = TermUtil.ToWordList(inTerm);

            return GetAvgWordVecForList(words, w2vOm);
        }
示例#10
0
        // public method
        // CBOW-style score of a candidate term for a given context (hidden layer x OM).
        // inTerm: candidate (can be multiword)
        // contextVec: wordVec of context
        // w2vOm: word2Vec output matrix - syn1neg
        // The term's averaged word vector from w2vOm is combined with contextVec
        // by GetCwobScore.
        public static double GetScore(string inTerm, DoubleVec contextVec, Word2Vec w2vOm)
        {
            return GetCwobScore(GetWordVecForTerm(inTerm, w2vOm), contextVec);
        }
示例#11
0
        // Ranks non-word merge candidates by context score and returns the top one.
        // Reads both word2Vec matrices, the non-word merge context radius and the
        // skip-word flag from cSpellApi, then delegates to
        // RankNonWordMergeByContext.GetTopRankMergeObj.
        // tarPos is accepted for signature compatibility but is not used here.
        private static MergeObj GetTopRankMergeObjByContext(HashSet<MergeObj> candidates, CSpellApi cSpellApi, int tarPos, List<TokenObj> nonSpaceTokenList, bool debugFlag)
        {
            // init (the unused maxCandNo lookup was removed; its value was never read)
            Word2Vec word2VecIm       = cSpellApi.GetWord2VecIm();
            Word2Vec word2VecOm       = cSpellApi.GetWord2VecOm();
            int      contextRadius    = cSpellApi.GetNwMergeContextRadius();
            bool     word2VecSkipWord = cSpellApi.GetWord2VecSkipWord();

            return RankNonWordMergeByContext.GetTopRankMergeObj(candidates, nonSpaceTokenList, word2VecIm, word2VecOm, word2VecSkipWord, contextRadius, debugFlag);
        }
示例#12
0
        // Writes "GetScore-GetScore2|" for inTerm (4 decimal places each) without a newline.
        private static void TestWin(string inTerm, DoubleVec contextVec, Word2Vec w2vIm, Word2Vec w2vOm)
        {
            double cbow    = GetScore(inTerm, contextVec, w2vOm);
            double cbowAlt = GetScore2(inTerm, contextVec, w2vOm);
            // Still computed, but only the disabled long-form output
            // (term|CBOW score|similarity score, 8 decimals) ever printed it.
            double cosine  = GetSimilarityScore(inTerm, contextVec, w2vIm);

            Console.Write(string.Format("{0,1:F4}-{1,1:F4}|", cbow, cbowAlt));
        }
        // Heuristic rule for real-word one-to-one correction:
        // every word of inTerm (the candidate) must have a word vector in word2VecOm.
        // Returns true when all words have one (also when the term yields no words),
        // false as soon as a word without a word vector is found.
        private static bool Check1To1Words(string inTerm, Word2Vec word2VecOm)
        {
            foreach (string word in TermUtil.ToWordList(inTerm))
            {
                if (!word2VecOm.HasWordVec(word))
                {
                    return false;
                }
            }
            return true;
        }
示例#14
0
        // Manual test driver for Word2VecContext.
        // Accepts no command-line arguments: if any are given it prints a usage line and exits.
        public static void MainTest(string[] args)
        {
            if (args.Length > 0)
            {
                Console.WriteLine("Usage: java Word2VecContext");
                Environment.Exit(0);
            }

            // load both matrices verbosely, then run the context tests
            const bool verbose = true;
            Word2Vec inputMatrix  = new Word2Vec("../data/Context/syn0.data", verbose);
            Word2Vec outputMatrix = new Word2Vec("../data/Context/syn1n.data", verbose);

            Tests(inputMatrix, outputMatrix);
        }
示例#15
0
        // Manual test driver for CSpellScore.
        // Accepts no command-line arguments: if any are given it prints a usage line and exits.
        public static void MainTest(string[] args)
        {
            if (args.Length > 0)
            {
                Console.WriteLine("Usage: java CSpellScore");
                Environment.Exit(0);
            }

            // load the word-count map first, then the word2Vec output matrix, both verbosely
            const bool verbose = true;
            WordWcMap wordCounts   = new WordWcMap("../data/Frequency/wcWord.data", verbose);
            Word2Vec  outputMatrix = new Word2Vec("../data/Context/syn1n.data", verbose);

            Tests(wordCounts, outputMatrix);
        }
        // Ranks real-word split candidates for inStr by context score and returns the top
        // string. All tuning values are read from cSpellApi; the ranking itself (including
        // the detail print) is done by RankRealWordSplitByContext.GetTopRankStr.
        private static string GetTopRankStrByContext(string inStr, HashSet<string> candidates, CSpellApi cSpellApi, bool debugFlag, int tarPos, List<TokenObj> nonSpaceTokenList)
        {
            // word2Vec matrices
            Word2Vec im = cSpellApi.GetWord2VecIm();
            Word2Vec om = cSpellApi.GetWord2VecOm();

            // context and candidate settings
            int    radius          = cSpellApi.GetRwSplitContextRadius();
            bool   skipWord        = cSpellApi.GetWord2VecSkipWord();
            int    maxCands        = cSpellApi.GetCanMaxCandNo();
            double splitCFactor    = cSpellApi.GetRankRwSplitCFac();
            int    shortWordLength = cSpellApi.GetCanRwShortSplitWordLength();
            int    maxShortWordNo  = cSpellApi.GetCanRwMaxShortSplitWordNo();

            // only for split, so the target is a single token
            const int targetSize = 1;

            return RankRealWordSplitByContext.GetTopRankStr(inStr, candidates, tarPos, targetSize, nonSpaceTokenList, im, om, skipWord, radius, shortWordLength, maxShortWordNo, splitCFactor, maxCands, debugFlag);
        }
示例#17
0
        // Alternative to GetScore(): scores each word of inTerm against contextVec with
        // GetCwobScore and averages the per-word scores (sum first, then divide).
        // Marked for deletion in the original notes because it gives the same result as GetScore().
        // inTerm: candidate (can be multiword)
        // contextVec: wordVec of context
        // w2vOm: word2Vec output matrix
        // Words without a word vector add nothing to the sum but still count in the average.
        // Returns 0.0 when inTerm yields no words.
        public static double GetScore2(string inTerm, DoubleVec contextVec, Word2Vec w2vOm)
        {
            List<string> inWordList = TermUtil.ToWordList(inTerm);

            // Guard against 0.0 / 0 (NaN) for a term with no words, mirroring the
            // zero-count guard in GetAvgWordVecForList.
            if (inWordList.Count == 0)
            {
                return(0.0d);
            }

            double score = 0.0d;
            int    count = 0;

            foreach (string word in inWordList)
            {
                DoubleVec wordVec = w2vOm.GetWordVec(word);
                if (wordVec != null)
                {
                    score += GetCwobScore(wordVec, contextVec);
                }
                count++;
            }
            // add score first, then calculate the avg.
            return(score / count);
        }
示例#18
0
        // Average word vector for a list of words.
        // Starts from a new DoubleVec of the model's dimension, adds the vector of every
        // word that has one, and divides by the total number of words in the list
        // (words without a vector are included in that divisor).
        // An empty list returns the freshly created vector undivided.
        public static DoubleVec GetAvgWordVecForList(IList<string> wordList, Word2Vec word2Vec)
        {
            DoubleVec sum    = new DoubleVec(word2Vec.GetDimension());
            int       wordNo = 0;

            foreach (string word in wordList)
            {
                wordNo++;
                DoubleVec vec = word2Vec.GetWordVec(word);
                if (vec == null)
                {
                    // no word vector for this word: it only counts toward the divisor
                    continue;
                }
                sum.Add(vec);
            }

            // turn the sum into the average
            if (wordNo > 0)
            {
                sum.Divide(wordNo);
            }
            return sum;
        }
示例#19
0
        // Context words for a target with no window limit: forwards to the radius-based
        // overload with allContext = true. The radius passed along (0) is a placeholder,
        // since the radius is not needed when all context is requested.
        public static List<string> GetContext(int tarPos, int tarSize, List<TokenObj> nonSpaceTokenList, Word2Vec w2vIm, bool word2VecSkipWord, bool debugFlag)
        {
            const int  unusedRadius  = 0;
            const bool useAllContext = true;

            return GetContext(tarPos, tarSize, nonSpaceTokenList, w2vIm, unusedRadius, word2VecSkipWord, debugFlag, useAllContext);
        }
示例#20
0
        // Context vector built from the whole token list (no window radius):
        // collects the context words via GetContext, then averages their word vectors
        // from w2vIm with Word2VecScore.GetAvgWordVecForList.
        public static DoubleVec GetContextVec(int tarPos, int tarSize, List<TokenObj> nonSpaceTokenList, Word2Vec w2vIm, bool word2VecSkipWord, bool debugFlag)
        {
            List<string> contextWords = GetContext(tarPos, tarSize, nonSpaceTokenList, w2vIm, word2VecSkipWord, debugFlag);

            return Word2VecScore.GetAvgWordVecForList(contextWords, w2vIm);
        }
示例#21
0
        // Prints, for every non-space token of a fixed sample text, the context words that
        // GetContext returns in three set-ups:
        //   1. radius = 3, word2VecSkipWord = false (no skip)
        //   2. radius = 3, word2VecSkipWord = true  (skip)
        //   3. all context, word2VecSkipWord = true (skip)
        // Rows of the first two passes are "tarPos|index|token: [context]", where tarPos
        // counts non-space tokens and index counts all tokens (space tokens included).
        // w2vOm is accepted but not used here.
        private static void Tests(Word2Vec w2vIm, Word2Vec w2vOm)
        {
            string          inText     = "... last 10 years #$% was dianosed test123 yahoo.com early on set deminita 3 year ago.";
            List <TokenObj> inTextList = TextObj.TextToTokenList(inText);

            Console.WriteLine("======= Word2VecContext ======================");
            Console.WriteLine(" - inText: [" + inText + "]");
            string inStr = String.Join("|", inTextList.Select(obj => obj.GetTokenStr()));

            Console.WriteLine(" - inTextList (" + inTextList.Count + "): [" + inStr + "]");

            int  tarPos    = 0;
            int  tarSize   = 1;
            int  index     = 0;
            int  radius    = 3;
            bool debugFlag = false;

            Console.WriteLine("------ Test GetContext (no skip), radius=3 ...");
            // remove space token from the list
            List <TokenObj> nonSpaceTokenList = TextObj.GetNonSpaceTokenObjList(inTextList);

            foreach (TokenObj tokenObj in inTextList)
            {
                // not the space token
                if (tokenObj.IsSpaceToken() == false)
                {
                    string tokenStr = tokenObj.GetTokenStr();
                    // word2VecSkipWord = false (no skip)
                    List <string> contextList = GetContext(tarPos, tarSize, nonSpaceTokenList, w2vIm, radius, false, debugFlag);
                    string        contextStr  = String.Join("|", contextList);
                    Console.WriteLine(tarPos + "|" + index + "|" + tokenStr + ": [" + contextStr + "]");
                    tarPos++;
                }
                index++;
            }
            Console.WriteLine("------ Test GetContext (skip) , radius=3 ...");
            Console.WriteLine(" - inText: [" + inText + "]");
            // restart both counters; index used to keep counting from the first pass,
            // so the printed token index of this pass was offset by the token count
            tarPos = 0;
            index  = 0;
            foreach (TokenObj tokenObj in inTextList)
            {
                // not the space token
                if (tokenObj.IsSpaceToken() == false)
                {
                    string tokenStr = tokenObj.GetTokenStr();
                    // word2VecSkipWord = true (skip)
                    List <string> contextList2 = GetContext(tarPos, tarSize, nonSpaceTokenList, w2vIm, radius, true, debugFlag);
                    string        contextStr2  = String.Join("|", contextList2);
                    Console.WriteLine(tarPos + "|" + index + "|" + tokenStr + ": [" + contextStr2 + "]");
                    tarPos++;
                }
                index++;
            }
            Console.WriteLine("------ Test GetContext (skip) , all ...");
            Console.WriteLine(" - inText: [" + inText + "]");
            tarPos = 0;
            // iterate the non-space tokens directly, so tarPos is simply the loop position
            foreach (TokenObj tokenObj in nonSpaceTokenList)
            {
                string tokenStr = tokenObj.GetTokenStr();
                // word2VecSkipWord = true (skip), no radius: all context
                List <string> contextList3 = GetContext(tarPos, tarSize, nonSpaceTokenList, w2vIm, true, debugFlag);
                string        contextStr3  = String.Join("|", contextList3);
                Console.WriteLine(tarPos + "|" + tokenStr + ": [" + contextStr3 + "]");
                tarPos++;
            }
        }
示例#22
0
 // private method
 // Placeholder test hook: the body is empty, so neither matrix is used yet.
 private static void Test(Word2Vec w2vIm, Word2Vec w2vOm)
 {
 }
示例#23
0
        // Collects the context words around a target, in text order.
        // tarPos: target word position
        // tarSize: no. of tokens for the target word (a merge should be > 1)
        // nonSpaceTokenList: token strings, no empty-space tokens
        // w2vIm: context must use the word2Vec input matrix
        // radius: max number of accepted tokens before / after the target
        // word2VecSkipWord: when true, words without a wordVec are skipped and do not
        //                   count toward the radius
        // allContext: when true, the radius is ignored and every accepted token is kept
        private static List<string> GetContextForTar(int tarPos, int tarSize, List<string> nonSpaceTokenList, Word2Vec w2vIm, int radius, bool word2VecSkipWord, bool allContext)
        {
            List<string> context = new List<string>();

            // walk left from the token just before the target
            int accepted = 0;
            for (int pos = tarPos - 1; pos >= 0; pos--)
            {
                string token = nonSpaceTokenList[pos];
                if (word2VecSkipWord && !w2vIm.HasWordVec(token))
                {
                    continue;
                }
                accepted++;
                if (!allContext && (accepted > radius))
                {
                    break;
                }
                // prepend so the left context stays in text order
                context.Insert(0, token);
            }

            // walk right from the first token after the target (target could be multiword)
            accepted = 0;
            for (int pos = tarPos + tarSize; pos < nonSpaceTokenList.Count; pos++)
            {
                string token = nonSpaceTokenList[pos];
                if (word2VecSkipWord && !w2vIm.HasWordVec(token))
                {
                    continue;
                }
                accepted++;
                if (!allContext && (accepted > radius))
                {
                    break;
                }
                context.Add(token);
            }

            return context;
        }
        // Decides whether the top one-to-one candidate should replace the original real word.
        // Returns false when there is no top candidate or it equals inStr. Otherwise the
        // original string is scored against the same context and compared to the candidate:
        //   - original score < 0, candidate score > 0:
        //       accept when (candidate - original) > 0.085 and original > -0.085
        //   - original score < 0, candidate score < 0:
        //       accept when candidate > original * rw1To1Factor
        //   - original score > 0:
        //       accept when candidate * rw1To1Factor > original
        //   - anything else (including an original score of exactly 0.0): reject
        private static bool CheckRealWord1To1Rules(ContextScore topContextScore, string inStr, int tarPos, int tarSize, List<TokenObj> nonSpaceTokenList, Word2Vec word2VecIm, Word2Vec word2VecOm, bool word2VecSkipWord, int contextRadius, double rw1To1Factor, bool debugFlag)
        {
            // no candidate, or the candidate is the original string
            if (topContextScore == null)
            {
                return false;
            }
            if (topContextScore.GetTerm().Equals(inStr))
            {
                return false;
            }

            // context score of the original string, before any one-to-one correction
            DoubleVec    context       = Word2VecContext.GetContextVec(tarPos, tarSize, nonSpaceTokenList, word2VecIm, contextRadius, word2VecSkipWord, debugFlag);
            ContextScore originalScore = new ContextScore(inStr, context, word2VecOm);

            DebugPrint.Println("--- Real-Word One-To-One Context Score Detail: ---", debugFlag);
            DebugPrint.Println("- Score - orgTerm: " + originalScore.ToString(), debugFlag);
            DebugPrint.Println("- Score - top 1-to-1: " + topContextScore.ToString(), debugFlag);
            DebugPrint.Println("- rw1To1Factor: " + rw1To1Factor, debugFlag);

            double org = originalScore.GetScore();
            double top = topContextScore.GetScore();

            if (org < 0.0d)
            {
                if (top > 0.0d)
                {
                    // extra word2Vec rule for real words (original note: helped from 0.6812 to 0.6877)
                    return ((top - org) > 0.085) && (org > -0.085);
                }
                // both negative: the candidate must beat the scaled original score
                return (top < 0.0d) && (top > org * rw1To1Factor);
            }
            if (org > 0.0d)
            {
                // positive original: the scaled candidate score must exceed it
                return top * rw1To1Factor > org;
            }
            // original score is 0.0: no word2Vec information, so no correction
            return false;
        }
        // Returns the best-ranked merge candidate using the word2Vec context score,
        // or null if no candidate is found to correct.
        // nonSpaceTokenList: tokens of the text without space tokens
        // word2VecIm / word2VecOm: used for the context vector / for scoring, respectively
        // rwMergeFactor: passed to IsTopCandValid when comparing the top candidate with
        //                the original (un-merged) words
        // Null is returned when candidates is empty, no candidate score is available, the
        // top-scored term cannot be mapped back to a MergeObj, or IsTopCandValid rejects it.
        public static MergeObj GetTopRankMergeObj(HashSet <MergeObj> candidates, List <TokenObj> nonSpaceTokenList, Word2Vec word2VecIm, Word2Vec word2VecOm, bool word2VecSkipWord, int contextRadius, double rwMergeFactor, bool debugFlag)
        {
            if (candidates.Count == 0)
            {
                return(null);
            }
            // 1. score list for the candidates; per the original notes it is sorted,
            //    so element 0 has the highest score
            List <ContextScore> candScoreList = GetCandidateScoreList(candidates, nonSpaceTokenList, word2VecIm, word2VecOm, word2VecSkipWord, contextRadius, debugFlag);
            if (candScoreList.Count == 0)
            {
                return(null);
            }
            // 2. the top ranked score
            ContextScore topContextScore = candScoreList[0];
            if (topContextScore == null)
            {
                return(null);
            }
            // 3. map the top ranked term back to its MergeObj
            // key: coreMergeWord, value: MergeObj (a later candidate with the same word wins)
            Dictionary <string, MergeObj> candStrMergeObjMap = new Dictionary <string, MergeObj>();
            foreach (MergeObj mergeObj in candidates)
            {
                candStrMergeObjMap[mergeObj.GetCoreMergeWord()] = mergeObj;
            }
            MergeObj topRankMergeObj = candStrMergeObjMap.GetValueOrNull(topContextScore.GetTerm());
            // the lookup can come back empty; treat that as "no correction" instead of
            // dereferencing null below
            if (topRankMergeObj == null)
            {
                return(null);
            }
            // 4. compare the top rank merge to the original words before the merge
            // 4.1 wordVec for the context around the original words
            int tarPos = topRankMergeObj.GetStartPos();
            // tarSize is the total token No of the orgMergeWords
            int       tarSize    = topRankMergeObj.GetEndPos() - topRankMergeObj.GetStartPos() + 1;
            DoubleVec contextVec = Word2VecContext.GetContextVec(tarPos, tarSize, nonSpaceTokenList, word2VecIm, contextRadius, word2VecSkipWord, debugFlag);
            // 4.2 context score of the original words before the merge
            string       orgMergeWord    = topRankMergeObj.GetOrgMergeWord();
            ContextScore orgContextScore = new ContextScore(orgMergeWord, contextVec, word2VecOm);
            // 4.3 reject the merge if its score is not good enough for correction
            if (IsTopCandValid(orgContextScore, topContextScore, rwMergeFactor, debugFlag) == false)
            {
                return(null);
            }
            return(topRankMergeObj);
        }
示例#26
0
        /// <summary>
        /// Manual test driver. Prints the scores of several candidate strings for a
        /// few hard-coded sentences, once with a fixed-radius context window and once
        /// with the whole text as context, and finally walks every token of a
        /// sentence with radius 1 to 9.
        /// </summary>
        /// <param name="w2vIm">Word2Vec matrix used to build the context vectors.</param>
        /// <param name="w2vOm">Word2Vec matrix handed, together with <paramref name="w2vIm"/>, to the scoring helpers.</param>
        private static void Tests(Word2Vec w2vIm, Word2Vec w2vOm)
        {
            bool word2VecSkipWord = true;
            bool debugFlag        = false;

            // ----- 1. diagnosed|diagnose|dianosed -----
            string         inText            = "for the last 10 years    was dianosed\n early on set deminita 3 years ago";
            List<TokenObj> inTextList        = TextObj.TextToTokenList(inText);
            // remove space token from the list
            List<TokenObj> nonSpaceTokenList = TextObj.GetNonSpaceTokenObjList(inTextList);
            List<string>   testStrList       = new List<string> { "diagnosed", "diagnose", "dianosed" };
            // target token: position 6 of the non-space list (expected to be "dianosed")
            int       tarPos     = 6;
            int       tarSize    = 1;
            int       radius     = 2;
            DoubleVec contextVec = Word2VecContext.GetContextVec(tarPos, tarSize, nonSpaceTokenList, w2vIm, radius, word2VecSkipWord, debugFlag);

            Console.WriteLine("===== Test diagnosed|diagnose|dianosed (window-2) =====");
            Console.WriteLine("inText: [" + inText + "]");
            Console.WriteLine("============================================");
            Console.WriteLine("Candidates|CBOW score|CBOW score 2|Similarity score");
            Console.WriteLine("============================================");
            foreach (string testStr in testStrList)
            {
                Test(testStr, contextVec, w2vIm, w2vOm);
            }
            Console.WriteLine("===== Test diagnosed|diagnose|dianosed (whole text) =====");
            // overload without radius: context is not limited to a window
            contextVec = Word2VecContext.GetContextVec(tarPos, tarSize, nonSpaceTokenList, w2vIm, word2VecSkipWord, debugFlag);
            foreach (string testStr in testStrList)
            {
                Test(testStr, contextVec, w2vIm, w2vOm);
            }

            // ----- 2. know about|know|about -----
            string         inText1            = "Not all doctors know about this syndrome.";
            List<TokenObj> inTextList1        = TextObj.TextToTokenList(inText1);
            // remove space token from the list
            List<TokenObj> nonSpaceTokenList1 = TextObj.GetNonSpaceTokenObjList(inTextList1);

            Console.WriteLine("===== Test know about|know|about (window) =====");
            List<string> testStrList1 = new List<string> { "know about", "know", "about" };
            // target position/size for each candidate above, in the same order
            int[] tarPosList1  = { 3, 3, 4 };
            int[] tarSizeList1 = { 2, 1, 1 };
            radius = 2;
            for (int i = 0; i < testStrList1.Count; i++)
            {
                tarPos  = tarPosList1[i];
                tarSize = tarSizeList1[i];
                // window context first, then whole-text context
                contextVec = Word2VecContext.GetContextVec(tarPos, tarSize, nonSpaceTokenList1, w2vIm, radius, word2VecSkipWord, debugFlag);
                Test(testStrList1[i], contextVec, w2vIm, w2vOm);
                contextVec = Word2VecContext.GetContextVec(tarPos, tarSize, nonSpaceTokenList1, w2vIm, word2VecSkipWord, debugFlag);
                Test(testStrList1[i], contextVec, w2vIm, w2vOm);
            }

            // ----- 3. onset|on set -----
            string         inText2            = "for the last   10 years was diagnosed early on set dementia 3 years ago.";
            List<TokenObj> inTextList2        = TextObj.TextToTokenList(inText2);
            // remove space token from the list
            List<TokenObj> nonSpaceTokenList2 = TextObj.GetNonSpaceTokenObjList(inTextList2);
            List<string>   testStrList2       = new List<string> { "onset", "on set" };

            Console.WriteLine("===== Test onset|on set (window-3) =====");
            // fix: report the text that is actually scored in this section
            Console.WriteLine("inText: [" + inText2 + "]");
            tarPos     = 8;
            tarSize    = 2;
            radius     = 3;
            contextVec = Word2VecContext.GetContextVec(tarPos, tarSize, nonSpaceTokenList2, w2vIm, radius, word2VecSkipWord, debugFlag);
            foreach (string testStr in testStrList2)
            {
                Test(testStr, contextVec, w2vIm, w2vOm);
            }
            // the two halves of the split, scored individually
            tarSize    = 1;
            contextVec = Word2VecContext.GetContextVec(tarPos, tarSize, nonSpaceTokenList2, w2vIm, radius, word2VecSkipWord, debugFlag);
            Test("on", contextVec, w2vIm, w2vOm);
            tarPos     = 9;
            contextVec = Word2VecContext.GetContextVec(tarPos, tarSize, nonSpaceTokenList2, w2vIm, radius, word2VecSkipWord, debugFlag);
            Test("set", contextVec, w2vIm, w2vOm);

            Console.WriteLine("===== Test onset|on set (whole text) =====");
            // fix: restore the two-token target used by the window test above;
            // previously the stale target of the "set" test (pos 9, size 1) leaked in
            tarPos     = 8;
            tarSize    = 2;
            contextVec = Word2VecContext.GetContextVec(tarPos, tarSize, nonSpaceTokenList2, w2vIm, word2VecSkipWord, debugFlag);
            foreach (string testStr in testStrList2)
            {
                Test(testStr, contextVec, w2vIm, w2vOm);
            }

            // ----- 4. every token, radius 1..9 -----
            Console.WriteLine("===== Go through each tokens with diff radius 1-9) =====");
            Console.WriteLine("tarPos|tarWord|r=1|r=2|r=3|r=4|r=5|r=6|r=7|r=8|r=9");
            //String inText3 = "Broken bones can not sleep at night!";
            string         inText3            = "not xyxy all doctors know about this syndrome.";
            List<TokenObj> inTextList3        = TextObj.TextToTokenList(inText3);
            // remove space token from the list
            List<TokenObj> nonSpaceTokenList3 = TextObj.GetNonSpaceTokenObjList(inTextList3);

            tarSize = 1;
            for (tarPos = 0; tarPos < nonSpaceTokenList3.Count; tarPos++)
            {
                string tokenStr = nonSpaceTokenList3[tarPos].GetTokenStr();
                string inStr    = Word2VecContext.NormWordForWord2Vec(tokenStr);
                Console.Write(tarPos + "|" + tokenStr + "|");
                // print out all radius
                for (int r = 1; r < 10; r++)
                {
                    // fix: the context must come from the list being walked
                    // (tarPos indexes nonSpaceTokenList3, not inTextList2)
                    contextVec = Word2VecContext.GetContextVec(tarPos, tarSize, nonSpaceTokenList3, w2vIm, r, word2VecSkipWord, debugFlag);
                    TestWin(inStr, contextVec, w2vIm, w2vOm);
                }
                Console.WriteLine("");
            }
        }
示例#27
0
        /// <summary>
        /// Chooses the string to use in place of <paramref name="inStr"/>: the first
        /// entry of the context-ranked candidate list when it passes
        /// <c>IsTopCandValid</c> against the original string, otherwise
        /// <paramref name="inStr"/> itself.
        /// </summary>
        /// <param name="inStr">Original string; also the fallback return value.</param>
        /// <param name="candidates">Candidate replacements; when empty, <paramref name="inStr"/> is returned unchanged.</param>
        /// <param name="tarPos">Target position in <paramref name="nonSpaceTokenList"/>.</param>
        /// <param name="tarSize">Number of tokens covered by the target.</param>
        /// <param name="nonSpaceTokenList">Token list used to build the context.</param>
        /// <param name="word2VecIm">Word2Vec matrix used for the context vector.</param>
        /// <param name="word2VecOm">Word2Vec matrix used for scoring.</param>
        /// <param name="word2VecSkipWord">Forwarded to the context/ranking helpers.</param>
        /// <param name="contextRadius">Context window radius.</param>
        /// <param name="shortSplitWordLength">Not read by this method.</param>
        /// <param name="maxShortSplitWordNo">Not read by this method.</param>
        /// <param name="rwSplitFactor">Factor forwarded to <c>IsTopCandValid</c>.</param>
        /// <param name="maxCandNo">Maximum number of candidates listed in debug output.</param>
        /// <param name="debugFlag">When true, prints the original and candidate scores.</param>
        /// <returns>The accepted top candidate term, or <paramref name="inStr"/>.</returns>
        public static string GetTopRankStr(string inStr, HashSet <string> candidates, int tarPos, int tarSize, List <TokenObj> nonSpaceTokenList, Word2Vec word2VecIm, Word2Vec word2VecOm, bool word2VecSkipWord, int contextRadius, int shortSplitWordLength, int maxShortSplitWordNo, double rwSplitFactor, int maxCandNo, bool debugFlag)
        {
            // nothing to rank: keep the original string
            if (candidates.Count <= 0)
            {
                return(inStr);
            }

            // 1. score every candidate against the context and take the first entry
            // (this ranking could be improved with an n-gram frequency model)
            List<ContextScore> rankedScores = RankByContext.GetCandidateScoreList(candidates, tarPos, tarSize, nonSpaceTokenList, word2VecIm, word2VecOm, word2VecSkipWord, contextRadius, debugFlag);
            ContextScore bestScore = rankedScores[0];

            // 2. score the original (pre-split) string against the same context
            DoubleVec    contextVec = Word2VecContext.GetContextVec(tarPos, tarSize, nonSpaceTokenList, word2VecIm, contextRadius, word2VecSkipWord, debugFlag);
            ContextScore inStrScore = new ContextScore(inStr, contextVec, word2VecOm);

            // 3. correct only when the top candidate is validated against the original
            bool   acceptTop = IsTopCandValid(inStr, inStrScore, bestScore, rwSplitFactor, debugFlag);
            string result    = acceptTop ? bestScore.GetTerm() : inStr;

            if (debugFlag)
            {
                // focus token (original) first, then up to maxCandNo sorted candidates
                DebugPrint.PrintCScore(inStrScore.ToString(), debugFlag);
                ContextScoreComparator<ContextScore> comparator = new ContextScoreComparator<ContextScore>();
                List<string> candLines = rankedScores
                    .OrderBy(score => score, comparator)
                    .Take(maxCandNo)
                    .Select(score => score.ToString())
                    .ToList();
                foreach (string candLine in candLines)
                {
                    DebugPrint.PrintCScore(candLine, debugFlag);
                }
            }
            return(result);
        }
示例#28
0
        /// <summary>
        /// Returns the context word list for a target, delegating to the private
        /// overload with <c>allContext</c> switched off.
        /// </summary>
        public static List <string> GetContext(int tarPos, int tarSize, List <TokenObj> nonSpaceTokenList, Word2Vec w2vIm, int radius, bool word2VecSkipWord, bool debugFlag)
        {
            return(GetContext(tarPos, tarSize, nonSpaceTokenList, w2vIm, radius, word2VecSkipWord, debugFlag, allContext: false));
        }
示例#29
0
        /// <summary>
        /// Normalizes every token string of <paramref name="nonSpaceTokenList"/> with
        /// <c>NormWordForWord2Vec</c>, asks <c>GetContextForTar</c> for the context of
        /// the target, prints it through <c>DebugPrint.PrintContext</c> and returns it.
        /// </summary>
        /// <remarks>A null token list is treated as an empty one.</remarks>
        private static List <string> GetContext(int tarPos, int tarSize, List <TokenObj> nonSpaceTokenList, Word2Vec w2vIm, int radius, bool word2VecSkipWord, bool debugFlag, bool allContext)
        {
            // TokenObj -> normalized string (such as [NUM], [URL], [EMAIL])
            // TBD, should be done in pre-correction, preProcess
            List<string> normWords = (nonSpaceTokenList == null)
                ? new List<string>()
                : nonSpaceTokenList.ConvertAll<string>(token => NormWordForWord2Vec(token.GetTokenStr()));

            // the context is looked up by normalized string (the normalized string is the key in w2v)
            List<string> context = GetContextForTar(tarPos, tarSize, normWords, w2vIm, radius, word2VecSkipWord, allContext);
            DebugPrint.PrintContext(context, debugFlag);
            return(context);
        }
        /// <summary>
        /// Real-word one-to-one variant: returns the top cSpell-ranked candidate for
        /// <paramref name="inStr"/> when it (a) holds the maximum frequency, token,
        /// phonetic and overlap score among all candidates and (b) passes
        /// <c>IsTopCandValid</c> against the original string; otherwise returns
        /// <paramref name="inStr"/>.
        /// </summary>
        /// <param name="inStr">Original string; also the fallback return value.</param>
        /// <param name="candidates">Candidate replacements to be scored.</param>
        /// <param name="cSpellApi">Source of the word-count map, Word2Vec matrices and scoring configuration.</param>
        /// <param name="tarPos">Target position in <paramref name="nonSpaceTokenList"/>.</param>
        /// <param name="nonSpaceTokenList">Token list used to build the context.</param>
        /// <param name="debugFlag">When true, prints the original and candidate scores.</param>
        /// <returns>The accepted top candidate string, or <paramref name="inStr"/>.</returns>
        public static string GetTopRankStr(string inStr, HashSet <string> candidates, CSpellApi cSpellApi, int tarPos, List <TokenObj> nonSpaceTokenList, bool debugFlag)
        {
            // init
            WordWcMap wordWcMap        = cSpellApi.GetWordWcMap();
            Word2Vec  word2VecIm       = cSpellApi.GetWord2VecIm();
            Word2Vec  word2VecOm       = cSpellApi.GetWord2VecOm();
            int       contextRadius    = cSpellApi.GetRw1To1ContextRadius();
            bool      word2VecSkipWord = cSpellApi.GetWord2VecSkipWord();
            int       maxCandNo        = cSpellApi.GetCanMaxCandNo();
            double    wf1              = cSpellApi.GetOrthoScoreEdDistFac();
            double    wf2              = cSpellApi.GetOrthoScorePhoneticFac();
            double    wf3              = cSpellApi.GetOrthoScoreOverlapFac();
            int       tarSize          = 1; // only for one-to-one, no merge here
            string    topRankStr       = inStr;

            // Lowercase with the invariant culture: the result is used as a lookup
            // key, so it must not change with the current culture (e.g. tr-TR maps
            // 'I' to a dotless 'ı' under culture-sensitive ToLower()).
            string             inStrLc         = inStr.ToLowerInvariant();
            List <CSpellScore> cSpellScoreList = RankByCSpellRealWord1To1.GetCandidateScoreList(inStrLc, candidates, wordWcMap, tarPos, tarSize, nonSpaceTokenList, word2VecIm, word2VecOm, word2VecSkipWord, contextRadius, wf1, wf2, wf3, debugFlag);

            // no scored candidate: keep the original string
            if (cSpellScoreList.Count == 0)
            {
                return(topRankStr);
            }

            CSpellScore  topScore        = cSpellScoreList[0];
            double       topFScore       = topScore.GetFScore().GetScore();         // frequency
            double       topTScore       = topScore.GetOScore().GetTokenScore();    // token
            double       topPScore       = topScore.GetOScore().GetPhoneticScore(); // phonetic
            double       topOScore       = topScore.GetOScore().GetOverlapScore();  // overlap
            ContextScore orgContextScore = null;

            // The top-ranked candidate must also hold the maximum frequency, token,
            // phonetic and overlap score of the whole list. Exact equality is
            // intended here: each maximum is taken over the same score list.
            bool topHasAllMaxScores = (topFScore == CSpellScore.GetMaxFScore(cSpellScoreList))
                                      && (topTScore == CSpellScore.GetMaxEScore(cSpellScoreList))
                                      && (topPScore == CSpellScore.GetMaxPScore(cSpellScoreList))
                                      && (topOScore == CSpellScore.GetMaxOScore(cSpellScoreList));
            if (topHasAllMaxScores)
            {
                // 1.1 wordVec for context
                DoubleVec contextVec = Word2VecContext.GetContextVec(tarPos, tarSize, nonSpaceTokenList, word2VecIm, contextRadius, word2VecSkipWord, debugFlag);
                // 1.2 context and frequency score of the original word before one-to-one
                orgContextScore = new ContextScore(inStr, contextVec, word2VecOm);
                FrequencyScore orgFScore = new FrequencyScore(inStr, wordWcMap);
                // correct only when the top candidate is validated against the original
                if (IsTopCandValid(inStr, orgContextScore, topScore, orgFScore, cSpellApi, debugFlag))
                {
                    topRankStr = topScore.GetCandStr();
                }
            }

            // debug print
            if (debugFlag)
            {
                // print focus token (original); it has no score when the
                // max-score check above failed
                if (orgContextScore != null)
                {
                    DebugPrint.PrintScore(orgContextScore.ToString(), debugFlag);
                }
                else
                {
                    DebugPrint.PrintScore("No score for focus (" + inStr + ")", debugFlag);
                }
                // print candidates
                List<string> candLines = cSpellScoreList.Take(maxCandNo).Select(obj => obj.ToString()).ToList();
                foreach (string candLine in candLines)
                {
                    DebugPrint.PrintScore(candLine, debugFlag);
                }
            }
            return(topRankStr);
        }