예제 #1
0
        /// <summary>
        /// Entry point: configures a Word2Vec trainer, trains the model, and then runs an
        /// interactive console loop that prints the search results for every entered line.
        /// </summary>
        /// <param name="args">Command-line arguments; forwarded to <c>_getBuilder</c>.</param>
        static void Main(string[] args)
        {
            if (args.Length == 0)
            {
                // NOTE(review): nothing here stops execution after ShowMan(); unless ShowMan()
                // terminates the process itself, the training below still runs - confirm intent.
                ShowMan();
            }

            string outputFileName;
            Word2VecBuilder builder = _getBuilder(args, out outputFileName);

            // Hard-coded settings applied after the builder was created from args.
            // NOTE(review): presumably these replace any values taken from the command line -
            // verify against the builder implementation.
            builder.WithDebug(2).WithThreads(8).WithSize(300).WithIter(5);
            builder.WithTrainFile(@"D:\text8_upd.txt");

            // The file name returned by _getBuilder is discarded in favour of a fixed path.
            outputFileName = @"D:\out.txt";
            builder.WithOutputFile(outputFileName);

            Word2Vec.Net.Word2Vec word2Vec = builder.Build();
            word2Vec.TrainModel();

            // The search helper is constructed from the same file the trainer was told to write.
            var distance = new Distance(outputFileName);
            while (true)
            {
                Console.WriteLine("Distance: Enter word or sentence (EXIT to break): ");
                string text = Console.ReadLine();

                // ReadLine returns null at end of input. The exit keyword is compared with an
                // ordinal, case-insensitive comparison so that it does not depend on the current
                // culture (e.g. Turkish lower-casing of 'I' would break a ToLower()-based check).
                if (text == null || string.Equals(text, "exit", StringComparison.OrdinalIgnoreCase))
                {
                    break;
                }

                var result = distance.Search(text);
                Console.WriteLine("\n Word Cosine distance\n------------------------------------------------------------------------");

                // Entries without a word are skipped.
                foreach (var bestWord in result.Where(x => !string.IsNullOrEmpty(x.Word)))
                {
                    Console.WriteLine("{0}\t\t{1}", bestWord.Word, bestWord.Distance);
                }
                Console.WriteLine();
            }
        }
예제 #2
0
        /// <summary>
        /// Entry point: prints usage when called without arguments; otherwise configures a
        /// Word2Vec trainer from the command-line options, trains the model, and runs an
        /// interactive console loop that prints the search results for every entered line.
        /// </summary>
        /// <param name="args">
        /// Command-line options in the form <c>-name value</c> (see the usage text below).
        /// </param>
        static void Main(string[] args)
        {
            if (args.Length == 0)
            {
                Console.WriteLine("WORD VECTOR estimation toolkit v 0.1c\n");
                Console.WriteLine("Options:");
                Console.WriteLine("Parameters for training:");
                Console.WriteLine("\t-train <file>");
                Console.WriteLine("\t\tUse text data from <file> to train the model");
                Console.WriteLine("\t-output <file>");
                Console.WriteLine("\t\tUse <file> to save the resulting word vectors / word clusters");
                Console.WriteLine("\t-size <int>");
                Console.WriteLine("\t\tSet size of word vectors; default is 100");
                Console.WriteLine("\t-window <int>");
                Console.WriteLine("\t\tSet max skip length between words; default is 5");
                Console.WriteLine("\t-sample <float>");
                Console.WriteLine(
                    "\t\tSet threshold for occurrence of words. Those that appear with higher frequency in the training data");
                Console.WriteLine("\t\twill be randomly down-sampled; default is 1e-3, useful range is (0, 1e-5)");
                Console.WriteLine("\t-hs <int>");
                Console.WriteLine("\t\tUse Hierarchical Softmax; default is 0 (not used)");
                Console.WriteLine("\t-negative <int>");
                Console.WriteLine(
                    "\t\tNumber of negative examples; default is 5, common values are 3 - 10 (0 = not used)");
                Console.WriteLine("\t-threads <int>");
                Console.WriteLine("\t\tUse <int> threads (default 12)");
                Console.WriteLine("\t-iter <int>");
                Console.WriteLine("\t\tRun more training iterations (default 5)");
                Console.WriteLine("\t-min-count <int>");
                Console.WriteLine("\t\tThis will discard words that appear less than <int> times; default is 5");
                Console.WriteLine("\t-alpha <float>");
                Console.WriteLine("\t\tSet the starting learning rate; default is 0.025 for skip-gram and 0.05 for CBOW");
                Console.WriteLine("\t-classes <int>");
                Console.WriteLine(
                    "\t\tOutput word classes rather than word vectors; default number of classes is 0 (vectors are written)");
                Console.WriteLine("\t-debug <int>");
                Console.WriteLine("\t\tSet the debug mode (default = 2 = more info during training)");
                Console.WriteLine("\t-binary <int>");
                Console.WriteLine("\t\tSave the resulting vectors in binary moded; default is 0 (off)");
                Console.WriteLine("\t-save-vocab <file>");
                Console.WriteLine("\t\tThe vocabulary will be saved to <file>");
                Console.WriteLine("\t-read-vocab <file>");
                Console.WriteLine("\t\tThe vocabulary will be read from <file>, not constructed from the training data");
                Console.WriteLine("\t-cbow <int>");
                Console.WriteLine("\t\tUse the continuous bag of words model; default is 1 (use 0 for skip-gram model)");
                Console.WriteLine("Examples:");
                Console.WriteLine(
                    "./word2vec -train data.txt -output vec.txt -size 200 -window 5 -sample 1e-4 -negative 5 -hs 0 -binary 0 -cbow 1 -iter 3");

                return;
            }

            // Command-line numbers are machine-readable ("0.025", "1e-4"), so they are parsed with
            // the invariant culture. With the current culture, a locale that uses ',' as the decimal
            // separator would misread or reject values such as "-alpha 0.025".
            var invariant = System.Globalization.CultureInfo.InvariantCulture;

            int i;
            var builder = Word2VecBuilder.Create();
            string outputFileName = String.Empty;

            // Each option is looked up with ArgPos; a result greater than -1 is used as the index
            // of the option name, and the option's value is read from the following element.
            if ((i = ArgPos("-train", args)) > -1)
            {
                builder.WithTrainFile(args[i + 1]);
            }
            if ((i = ArgPos("-output", args)) > -1)
            {
                outputFileName = args[i + 1];
                builder.WithOutputFile(outputFileName);
            }

            if ((i = ArgPos("-size", args)) > -1)
            {
                builder.WithSize(int.Parse(args[i + 1], invariant));
            }
            if ((i = ArgPos("-save-vocab", args)) > -1)
            {
                builder.WithSaveVocubFile(args[i + 1]);
            }
            if ((i = ArgPos("-read-vocab", args)) > -1)
            {
                builder.WithReadVocubFile(args[i + 1]);
            }
            if ((i = ArgPos("-debug", args)) > -1)
            {
                builder.WithDebug(int.Parse(args[i + 1], invariant));
            }
            if ((i = ArgPos("-binary", args)) > -1)
            {
                builder.WithBinary(int.Parse(args[i + 1], invariant));
            }
            if ((i = ArgPos("-cbow", args)) > -1)
            {
                builder.WithCBow(int.Parse(args[i + 1], invariant));
            }
            if ((i = ArgPos("-alpha", args)) > -1)
            {
                builder.WithAlpha(float.Parse(args[i + 1], invariant));
            }

            if ((i = ArgPos("-window", args)) > -1)
            {
                builder.WithWindow(int.Parse(args[i + 1], invariant));
            }
            if ((i = ArgPos("-sample", args)) > -1)
            {
                builder.WithSample(float.Parse(args[i + 1], invariant));
            }
            if ((i = ArgPos("-hs", args)) > -1)
            {
                builder.WithHs(int.Parse(args[i + 1], invariant));
            }
            if ((i = ArgPos("-negative", args)) > -1)
            {
                builder.WithNegative(int.Parse(args[i + 1], invariant));
            }
            if ((i = ArgPos("-threads", args)) > -1)
            {
                builder.WithThreads(int.Parse(args[i + 1], invariant));
            }
            if ((i = ArgPos("-iter", args)) > -1)
            {
                builder.WithIter(int.Parse(args[i + 1], invariant));
            }
            if ((i = ArgPos("-min-count", args)) > -1)
            {
                builder.WithMinCount(int.Parse(args[i + 1], invariant));
            }
            if ((i = ArgPos("-classes", args)) > -1)
            {
                builder.WithClasses(int.Parse(args[i + 1], invariant));
            }

            Word2Vec.Net.Word2Vec word2Vec = builder.Build();
            word2Vec.TrainModel();

            // The search helper below is constructed from the output file name. When -output was
            // not given the name is still empty, so there is nothing to hand to it.
            if (string.IsNullOrEmpty(outputFileName))
            {
                Console.WriteLine("No -output file was specified; skipping the interactive distance search.");
                return;
            }

            var distance = new Distance(outputFileName);
            while (true)
            {
                Console.WriteLine("Distance: Enter word or sentence (EXIT to break): ");
                string text = Console.ReadLine();

                // ReadLine returns null at end of input. The exit keyword is compared with an
                // ordinal, case-insensitive comparison so that it does not depend on the current
                // culture (e.g. Turkish lower-casing of 'I' would break a ToLower()-based check).
                if (text == null || string.Equals(text, "exit", StringComparison.OrdinalIgnoreCase))
                {
                    break;
                }

                var result = distance.Search(text);
                Console.WriteLine("\n                                              Word       Cosine distance\n------------------------------------------------------------------------");

                // Entries without a word are skipped.
                foreach (var bestWord in result.Where(x => !string.IsNullOrEmpty(x.Word)))
                {
                    Console.WriteLine("{0}\t\t{1}", bestWord.Word, bestWord.Distance);
                }
                Console.WriteLine();
            }
        }