Example #1
 private void Button_Click(object sender, RoutedEventArgs e)
 {
     if (t != null && !t.IsCompleted)
     {
         warnings.Content = "Training in progress, please wait for it to finish";
         return;
     }
     Word2Vec.Net.Word2Vec word2Vec = null;
     try
     {
         word2Vec = Word2VecBuilder.Create()
                    .WithTrainFile(trainSet.Text)                          //Use text data to train the model
                    .WithOutputFile(@"MODELS\" + outputName.Text + ".bin") //Use to save the resulting word vectors / word clusters
                    .WithSize(int.Parse(vectorSizeTB.Text))                //Set size of word vectors; default is 100
                    //.WithSaveVocubFile()                                 //The vocabulary will be saved to <file>
                    .WithDebug(2)                                          //Set the debug mode (default = 2 = more info during training)
                    .WithBinary(1)                                         //Save the resulting vectors in binary mode; default is 0 (off)
                    .WithCBow(modelCB.SelectedIndex)                       //Use the continuous bag of words model; default is 1 (use 0 for skip-gram model)
                    .WithAlpha(float.Parse(learningRateTB.Text))           //Set the starting learning rate; default is 0.025 for skip-gram and 0.05 for CBOW
                    .WithWindow(7)                                         //Set max skip length between words; default is 5
                    .WithSample((float)1e-3)                               //Set threshold for occurrence of words. Those that appear with higher frequency in the training data will be randomly down-sampled; default is 1e-3, useful range is (0, 1e-5)
                    .WithHs(0)                                             //Use Hierarchical Softmax; default is 0 (not used)
                    .WithNegative(int.Parse(negtb.Text))                   //Number of negative examples; default is 5, common values are 3 - 10 (0 = not used)
                    .WithThreads(int.Parse(threadsCB.Text))                //Use <int> threads (default 12)
                    .WithIter(int.Parse(iterTB.Text))                      //Run more training iterations (default 5)
                    .WithMinCount(30)                                      //This will discard words that appear less than <int> times; default is 5
                    .WithClasses(0)                                        //Output word classes rather than word vectors; default number of classes is 0 (vectors are written)
                    .Build();
     }
     catch (Exception e1)
     {
         warnings.Content = "Invalid input: " + e1.Message;
         return;
     }
     warnings.Content = "Training started.";
     t = Task.Run(word2Vec.TrainModel);
 }
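
Example #1 only starts the background task and returns; the trained model appears under MODELS\ once TrainModel has finished. To query the result from the same application, a continuation can load the saved file with the same Distance class that Example #2 uses below. The lines that follow are a sketch only, not part of the original handler: they would be appended right after the task is started, the query word "king" is an arbitrary placeholder, and System.Linq is assumed to be imported for Where.

     // Sketch (not in the original handler): query the model once training completes.
     // The output path is captured on the UI thread because the continuation may run on a thread-pool thread.
     string modelPath = @"MODELS\" + outputName.Text + ".bin";
     t.ContinueWith(_ =>
     {
         var distance = new Distance(modelPath);   // same Word2Vec.Net query API as in Example #2
         foreach (var match in distance.Search("king").Where(x => !string.IsNullOrEmpty(x.Word)))
         {
             Console.WriteLine("{0}\t{1}", match.Word, match.Distance);
         }
     });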
Example #2
        static void Main(string[] args)
        {
            if (args.Length == 0)
            {
                Console.WriteLine("WORD VECTOR estimation toolkit v 0.1c\n");
                Console.WriteLine("Options:");
                Console.WriteLine("Parameters for training:");
                Console.WriteLine("\t-train <file>");
                Console.WriteLine("\t\tUse text data from <file> to train the model");
                Console.WriteLine("\t-output <file>");
                Console.WriteLine("\t\tUse <file> to save the resulting word vectors / word clusters");
                Console.WriteLine("\t-size <int>");
                Console.WriteLine("\t\tSet size of word vectors; default is 100");
                Console.WriteLine("\t-window <int>");
                Console.WriteLine("\t\tSet max skip length between words; default is 5");
                Console.WriteLine("\t-sample <float>");
                Console.WriteLine(
                    "\t\tSet threshold for occurrence of words. Those that appear with higher frequency in the training data");
                Console.WriteLine("\t\twill be randomly down-sampled; default is 1e-3, useful range is (0, 1e-5)");
                Console.WriteLine("\t-hs <int>");
                Console.WriteLine("\t\tUse Hierarchical Softmax; default is 0 (not used)");
                Console.WriteLine("\t-negative <int>");
                Console.WriteLine(
                    "\t\tNumber of negative examples; default is 5, common values are 3 - 10 (0 = not used)");
                Console.WriteLine("\t-threads <int>");
                Console.WriteLine("\t\tUse <int> threads (default 12)");
                Console.WriteLine("\t-iter <int>");
                Console.WriteLine("\t\tRun more training iterations (default 5)");
                Console.WriteLine("\t-min-count <int>");
                Console.WriteLine("\t\tThis will discard words that appear less than <int> times; default is 5");
                Console.WriteLine("\t-alpha <float>");
                Console.WriteLine("\t\tSet the starting learning rate; default is 0.025 for skip-gram and 0.05 for CBOW");
                Console.WriteLine("\t-classes <int>");
                Console.WriteLine(
                    "\t\tOutput word classes rather than word vectors; default number of classes is 0 (vectors are written)");
                Console.WriteLine("\t-debug <int>");
                Console.WriteLine("\t\tSet the debug mode (default = 2 = more info during training)");
                Console.WriteLine("\t-binary <int>");
                Console.WriteLine("\t\tSave the resulting vectors in binary mode; default is 0 (off)");
                Console.WriteLine("\t-save-vocab <file>");
                Console.WriteLine("\t\tThe vocabulary will be saved to <file>");
                Console.WriteLine("\t-read-vocab <file>");
                Console.WriteLine("\t\tThe vocabulary will be read from <file>, not constructed from the training data");
                Console.WriteLine("\t-cbow <int>");
                Console.WriteLine("\t\tUse the continuous bag of words model; default is 1 (use 0 for skip-gram model)");
                Console.WriteLine("Examples:");
                Console.WriteLine(
                    "./word2vec -train data.txt -output vec.txt -size 200 -window 5 -sample 1e-4 -negative 5 -hs 0 -binary 0 -cbow 1 -iter 3");
                Console.ReadLine();
                return;
            }
            // Map each recognized command-line flag onto the corresponding builder option (defaults match the usage text above).
            int    i;
            var    builder        = Word2VecBuilder.Create();
            string outputFileName = String.Empty;

            if ((i = ArgPos("-train", args)) > -1)
            {
                builder.WithTrainFile(args[i + 1]);
            }
            if ((i = ArgPos("-output", args)) > -1)
            {
                outputFileName = args[i + 1];
                builder.WithOutputFile(outputFileName);
            }

            if ((i = ArgPos("-size", args)) > -1)
            {
                builder.WithSize(int.Parse(args[i + 1]));
            }
            if ((i = ArgPos("-save-vocab", args)) > -1)
            {
                builder.WithSaveVocubFile(args[i + 1]);
            }
            if ((i = ArgPos("-read-vocab", args)) > -1)
            {
                builder.WithReadVocubFile(args[i + 1]);
            }
            if ((i = ArgPos("-debug", args)) > -1)
            {
                builder.WithDebug(int.Parse(args[i + 1]));
            }
            if ((i = ArgPos("-binary", args)) > -1)
            {
                builder.WithBinary(int.Parse(args[i + 1]));
            }
            if ((i = ArgPos("-cbow", args)) > -1)
            {
                builder.WithCBow(int.Parse(args[i + 1]));
            }
            if ((i = ArgPos("-alpha", args)) > -1)
            {
                builder.WithAlpha(float.Parse(args[i + 1]));
            }

            if ((i = ArgPos("-window", args)) > -1)
            {
                builder.WithWindow(int.Parse(args[i + 1]));
            }
            if ((i = ArgPos("-sample", args)) > -1)
            {
                builder.WithSample(float.Parse(args[i + 1]));
            }
            if ((i = ArgPos("-hs", args)) > -1)
            {
                builder.WithHs(int.Parse(args[i + 1]));
            }
            if ((i = ArgPos("-negative", args)) > -1)
            {
                builder.WithNegative(int.Parse(args[i + 1]));
            }
            if ((i = ArgPos("-threads", args)) > -1)
            {
                builder.WithThreads(int.Parse(args[i + 1]));
            }
            if ((i = ArgPos("-iter", args)) > -1)
            {
                builder.WithIter(int.Parse(args[i + 1]));
            }
            if ((i = ArgPos("-min-count", args)) > -1)
            {
                builder.WithMinCount(int.Parse(args[i + 1]));
            }
            if ((i = ArgPos("-classes", args)) > -1)
            {
                builder.WithClasses(int.Parse(args[i + 1]));
            }
            Word2Vec.Net.Word2Vec word2Vec = builder.Build();
            word2Vec.TrainModel();

            var distance = new Distance(outputFileName);   // load the word vectors that TrainModel just wrote

            while (true)
            {
                Console.WriteLine("Distance: Enter word or sentence (EXIT to break): ");
                string text = Console.ReadLine();
                if (text == null || text.ToLower().Equals("exit"))
                {
                    break;
                }
                var result = distance.Search(text);
                Console.WriteLine("\n                                              Word       Cosine distance\n------------------------------------------------------------------------");
                foreach (var bestWord in result.Where(x => !string.IsNullOrEmpty(x.Word)))
                {
                    Console.WriteLine("{0}\t\t{1}", bestWord.Word, bestWord.Distance);
                }
                Console.WriteLine();
            }
        }
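
Example #2 depends on an ArgPos helper that is not included in the snippet. Judging from how Main uses it, it returns the position of a flag inside args, or -1 when the flag is absent, which matches the ArgPos function of the original C word2vec tool. A minimal sketch under that assumption:

        // Assumed helper for Example #2: returns the position of a flag in args, or -1 if it is missing.
        // Modeled on the ArgPos function of the original word2vec tool; not part of the snippet above.
        private static int ArgPos(string flag, string[] args)
        {
            for (int i = 0; i < args.Length; i++)
            {
                if (args[i] != flag)
                    continue;
                if (i == args.Length - 1)
                    Console.WriteLine("Argument missing for {0}", flag);   // flag given without a value
                return i;
            }
            return -1;   // flag not present on the command line
        }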