Beispiel #1
0
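        // Get the context tokens surrounding a target word:
        // tarPos: position of the target word in nonSpaceTokenList
        // tarSize: number of tokens in the target word (> 1 for a merged, multiword target)
        // nonSpaceTokenList: token list of the text; must not contain empty space tokens
        // w2vIm: context must use the word2Vec input matrix
        // radius: number of context tokens to take before / after the target
        // word2VecSkipWord: if true, skip any word that does not have a wordVec
        // allContext: if true, radius is ignored and all tokens on both sides are used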
        private static List <string> GetContextForTar(int tarPos, int tarSize, List <string> nonSpaceTokenList, Word2Vec w2vIm, int radius, bool word2VecSkipWord, bool allContext)
        {
            // Result: the qualifying tokens before the target followed by the
            // qualifying tokens after it, in text order. The target tokens
            // themselves (tarPos .. tarPos + tarSize - 1) are never included.
            List <string> outContextList = new List <string>();

            // 1. Context before the target token.
            // Walk backwards from the target and append; the list is reversed
            // once after the loop. This yields the same order as inserting at
            // index 0 each time, without shifting the list on every insertion.
            int tokenNo = 0;

            for (int i = tarPos - 1; i >= 0; i--)
            {
                string inWord = nonSpaceTokenList[i];
                // When word2VecSkipWord is set, words without a wordVec are
                // skipped and do not count toward the radius.
                if (word2VecSkipWord && !w2vIm.HasWordVec(inWord))
                {
                    continue;
                }
                tokenNo++;
                // Stop once the radius is exhausted, unless all context is requested.
                if (!allContext && (tokenNo > radius))
                {
                    break;
                }
                outContextList.Add(inWord);
            }
            // Only left-context tokens are in the list here; restore text order.
            outContextList.Reverse();

            // 2. Context after the target token.
            int endPos = tarPos + tarSize;             // target could be multiwords

            tokenNo = 0;
            for (int i = endPos; i < nonSpaceTokenList.Count; i++)
            {
                string inWord = nonSpaceTokenList[i];
                if (word2VecSkipWord && !w2vIm.HasWordVec(inWord))
                {
                    continue;
                }
                tokenNo++;
                if (!allContext && (tokenNo > radius))
                {
                    break;
                }
                outContextList.Add(inWord);
            }

            return(outContextList);
        }
Beispiel #2
0
        // unit test driver
        public static void MainTest(string[] args)
        {
            // More than one argument is a usage error: report it and exit.
            if (args.Length > 1)
            {
                Console.Error.WriteLine("Usage: java Word2Vec <inFile>");
                Environment.Exit(1);
            }

            // A single argument overrides the default data file.
            // (Alternative data file: "../data/Context/word2Vec.data")
            string inFile = (args.Length == 1) ? args[0] : "../data/Context/syn1n.data";

            // Load the file and print a few basic properties and lookups.
            try
            {
                Word2Vec model = new Word2Vec(inFile);
                Console.WriteLine("Dimension: " + model.GetDimension());
                Console.WriteLine("Word No: " + model.GetWordNo());
                Console.WriteLine("Word size in WrodVec: " + model.GetWordVecMap().Keys.Count);

                // Probe words, printed in this fixed order.
                string[] probes = { "man", "king", "ago", "a", "ia", "m", "xyxy" };
                foreach (string probe in probes)
                {
                    Console.WriteLine("HasWordVec(" + probe + "): " + model.HasWordVec(probe));
                }
            }
            catch (Exception ex)
            {
                // Print the exception text, then its stack trace, to standard output.
                Console.WriteLine(ex.ToString());
                Console.Write(ex.StackTrace);
            }
        }
        // Heuristic rule for real-word one-to-one correction:
        // check that every one-to-one word in inTerm (the candidate)
        // 1. has a wordVec.
        private static bool Check1To1Words(string inTerm, Word2Vec word2VecOm)
        {
            // Split the term into words and fail on the first word that
            // has no word vector in word2VecOm.
            foreach (string token in TermUtil.ToWordList(inTerm))
            {
                if (!word2VecOm.HasWordVec(token))
                {
                    return false;
                }
            }

            // Every word has a word vector (also the result for an empty word list).
            return true;
        }