Example #1
        public static void TrainSpellingModel()
        {
            string trainfile      = AppDomain.CurrentDomain.BaseDirectory + "\\wordvec\\spelling_train_data.txt";
            string outputFileName = AppDomain.CurrentDomain.BaseDirectory + "\\wordvec\\spelling_model.bin";
            var    word2Vec       = Word2VecBuilder.Create()
                                    .WithTrainFile(trainfile)                                                                 // Use text data to train the model;
                                    .WithOutputFile(outputFileName)                                                           //Use to save the resulting word vectors / word clusters
                                    .WithSize(200)                                                                            //Set size of word vectors; default is 100
                                    .WithSaveVocubFile(AppDomain.CurrentDomain.BaseDirectory + "/wordvec/noisy_spelling.txt") //The vocabulary will be saved to <file>
                                    .WithDebug(2)                                                                             //Set the debug mode (default = 2 = more info during training)
                                    .WithBinary(1)                                                                             //Save the resulting vectors in binary mode; default is 0 (off)
                                    .WithCBow(1)                                                                              //Use the continuous bag of words model; default is 1 (use 0 for skip-gram model)
                                    .WithAlpha(0.05f)                                                                         //Set the starting learning rate; default is 0.025 for skip-gram and 0.05 for CBOW
                                    .WithWindow(10)                                                                           //Set max skip length between words; default is 5
                                    .WithSample((float)1e-3)                                                                   //Set threshold for occurrence of words. Those that appear with higher frequency in the training data will be randomly down-sampled; default is 1e-3, useful range is (0, 1e-5)
                                    .WithHs(0)                                                                                //Use Hierarchical Softmax; default is 0 (not used)
                                    .WithNegative(5)                                                                          //Number of negative examples; default is 5, common values are 3 - 10 (0 = not used)
                                    .WithThreads(8)                                                                           //Use <int> threads (default 12)
                                    .WithIter(5)                                                                              //Run more training iterations (default 5)
                                    .WithMinCount(1)                                                                          //This will discard words that appear less than <int> times; default is 5
                                    .WithClasses(0)                                                                           //Output word classes rather than word vectors; default number of classes is 0 (vectors are written)
                                    .Build();

            word2Vec.TrainModel();
        }
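A possible follow-up to Example #1 (not part of the original sample): once spelling_model.bin has been written, the vectors can be queried with the library's Distance class, used the same way as in Example #6 below. The method name QuerySpellingModel is hypothetical; the path is assumed to match the outputFileName above.

        public static void QuerySpellingModel(string word)
        {
            // Assumed to be the same file that TrainSpellingModel wrote above.
            string modelFile = AppDomain.CurrentDomain.BaseDirectory + "\\wordvec\\spelling_model.bin";

            var distance = new Distance(modelFile); // load the trained vectors
            var result   = distance.Search(word);   // nearest words by cosine distance

            // Print the non-empty matches, as in Example #6.
            foreach (var bestWord in result.Where(x => !string.IsNullOrEmpty(x.Word)))
            {
                Console.WriteLine("{0}\t{1}", bestWord.Word, bestWord.Distance);
            }
        }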
Example #2
        public void TrainModel()
        {
            string trainFile  = Configuration.TrainFile;
            string outputFile = Configuration.OutputFile;
            string vocabFile  = Configuration.VocabularyFile;

            if (!File.Exists(outputFile) && !File.Exists(vocabFile) && File.Exists(trainFile))
            {
                var word2Vec = Word2VecBuilder.Create()
                               .WithTrainFile(trainFile)
                               .WithOutputFile(outputFile)
                               .WithSize(200)
                               .WithSaveVocubFile(vocabFile)
                               .Build();

                word2Vec.TrainModel();
            }
        }
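Example #2 reads its paths from a Configuration object that is not shown in the sample. Below is a minimal sketch of the members that code relies on, assuming static string properties; the real class in that project may differ, and the default paths are placeholders.

        public static class Configuration
        {
            // Placeholder paths: TrainModel above only runs when TrainFile exists
            // and neither OutputFile nor VocabularyFile has been written yet.
            public static string TrainFile      { get; set; } = "wordvec/train_data.txt";
            public static string OutputFile     { get; set; } = "wordvec/model.bin";
            public static string VocabularyFile { get; set; } = "wordvec/vocab.txt";
        }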
Example #3
        public void TestAppInit()
        {
            var word2Vec = Word2VecBuilder.Create()
                           .WithTrainFile("TestData/faust1.txt") // Use text data to train the model;
                           .WithOutputFile("faust1.bin")         //Use to save the resulting word vectors / word clusters
                           .WithSize(200)                        //Set size of word vectors; default is 100
                           .WithSaveVocubFile("faust1.dic")      //The vocabulary will be saved to <file>
                           .WithCBow(1)                          //Use the continuous bag of words model; default is 1 (use 0 for skip-gram model)
                           .WithAlpha(0.05f)                     //Set the starting learning rate; default is 0.025 for skip-gram and 0.05 for CBOW
                           .WithWindow(7)                        //Set max skip length between words; default is 5
                           .WithSample((float)1e-3)              //Set threshold for occurrence of words. Those that appear with higher frequency in the training data will be randomly down-sampled; default is 1e-3, useful range is (0, 1e-5)
                           .WithHs(0)                            //Use Hierarchical Softmax; default is 0 (not used)
                           .WithNegative(5)                      //Number of negative examples; default is 5, common values are 3 - 10 (0 = not used)
                           .WithThreads(5)                       //Use <int> threads (default 12)
                           .WithIter(5)                          //Run more training iterations (default 5)
                           .WithMinCount(5)                      //This will discard words that appear less than <int> times; default is 5
                           .WithClasses(0)                       //Output word classes rather than word vectors; default number of classes is 0 (vectors are written)
                           .Build();

            word2Vec.TrainModel();
        }
Example #4
        /// <summary>
        /// Trains a Word2Vec model from the given text file.
        /// </summary>
        /// <author>
        /// Builds on the Word2Vec.Net-master libraries written by GitHub user Eabdullin.
        /// GitHub: https://github.com/eabdullin/Word2Vec.Net
        /// </author>
        /// <param name="trainfile">Training text file, located in the SourceTXT folder</param>
        /// <param name="sizeOfVectors">Dimensionality of the word vectors (default 100)</param>
        /// <param name="minCount">Words occurring fewer than this many times are discarded (default 5)</param>
        /// <param name="iterations">Number of training iterations (default 5)</param>
        public static void TrainModel(string trainfile, int sizeOfVectors = 100, int minCount = 5, int iterations = 5)
        {
            string outputFileName = DirectoryManager.GetSpecifiedDirectory("Models") + DirectoryManager.sep + Path.GetFileName(trainfile);


            var word2Vec = Word2VecBuilder.Create()
                           .WithTrainFile(trainfile)       // Use text data to train the model;
                           .WithOutputFile(outputFileName) //Use to save the resulting word vectors / word clusters
                           .WithSize(sizeOfVectors)        //Set size of word vectors; default is 100
                           .WithDebug(2)                   //Set the debug mode (default = 2 = more info during training)
                           .WithCBow(1)                    //Use the continuous bag of words model; default is 1 (use 0 for skip-gram model)
                           .WithAlpha(0.05f)               //Set the starting learning rate; default is 0.025 for skip-gram and 0.05 for CBOW
                           .WithSample((float)1e-3)        //Set threshold for occurrence of words.
                           .WithHs(0)                      //Use Hierarchical Softmax; default is 0 (not used)
                           .WithNegative(5)                //Number of negative examples; default is 5, common values are 3 - 10 (0 = not used)
                           .WithThreads(12)                //Use <int> threads (default 12)
                           .WithIter(iterations)           //Run more training iterations (default 5)
                           .WithMinCount(minCount)         //This will discard words that appear less than <int> times; default is 5
                           .Build();

            word2Vec.TrainModel();
        }
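A hedged call-site sketch for the method above; the corpus file name is a placeholder, and the named arguments simply override the defaults declared in the signature.

            // Hypothetical invocation: 200-dimensional vectors, keep words seen at least
            // 3 times, run 10 training iterations. The file name is an assumption.
            TrainModel("SourceTXT/corpus.txt", sizeOfVectors: 200, minCount: 3, iterations: 10);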
Example #5
 private void Button_Click(object sender, RoutedEventArgs e)
 {
     if (t != null && !t.IsCompleted)
     {
         warnings.Content = "Training in progress, please wait for it to finish";
         return;
     }
     Word2Vec.Net.Word2Vec word2Vec = null;
     try
     {
         word2Vec = Word2VecBuilder.Create()
                    .WithTrainFile(trainSet.Text)                          // Use text data to train the model;
                    .WithOutputFile(@"MODELS\" + outputName.Text + ".bin") //Use to save the resulting word vectors / word clusters
                    .WithSize(int.Parse(vectorSizeTB.Text))                //Set size of word vectors; default is 100
                                                                           //.WithSaveVocubFile()//The vocabulary will be saved to <file>
                    .WithDebug(2)                                          //Set the debug mode (default = 2 = more info during training)
                    .WithBinary(1)                                         //Save the resulting vectors in binary mode; default is 0 (off)
                    .WithCBow(modelCB.SelectedIndex)                       //Use the continuous bag of words model; default is 1 (use 0 for skip-gram model)
                    .WithAlpha(float.Parse(learningRateTB.Text))           //Set the starting learning rate; default is 0.025 for skip-gram and 0.05 for CBOW
                    .WithWindow(7)                                         //Set max skip length between words; default is 5
                    .WithSample((float)1e-3)                               //Set threshold for occurrence of words. Those that appear with higher frequency in the training data will be randomly down-sampled; default is 1e-3, useful range is (0, 1e-5)
                    .WithHs(0)                                             //Use Hierarchical Softmax; default is 0 (not used)
                    .WithNegative(int.Parse(negtb.Text))                   //Number of negative examples; default is 5, common values are 3 - 10 (0 = not used)
                    .WithThreads(int.Parse(threadsCB.Text))                //Use <int> threads (default 12)
                    .WithIter(int.Parse(iterTB.Text))                      //Run more training iterations (default 5)
                    .WithMinCount(30)                                      //This will discard words that appear less than <int> times; default is 5
                    .WithClasses(0)                                        //Output word classes rather than word vectors; default number of classes is 0 (vectors are written)
                    .Build();
     }
     catch (Exception)
     {
         warnings.Content = "Wrong input";
         return;
     }
     warnings.Content = "Training started.";
     t = new Task(word2Vec.TrainModel);
     t.Start();
 }
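The handler above starts training on a background Task but never reports completion. One possible variant of its last two lines (an addition, not part of the original): schedule a continuation on the WPF UI thread so the warnings label is updated when training finishes.

     // Sketch replacing "t = new Task(word2Vec.TrainModel); t.Start();" above.
     t = new Task(word2Vec.TrainModel);
     t.ContinueWith(_ => warnings.Content = "Training finished.",
                    TaskScheduler.FromCurrentSynchronizationContext());
     t.Start();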
Example #6
        static void Main(string[] args)
        {
            if (args.Length == 0)
            {
                Console.WriteLine("WORD VECTOR estimation toolkit v 0.1c\n");
                Console.WriteLine("Options:");
                Console.WriteLine("Parameters for training:");
                Console.WriteLine("\t-train <file>");
                Console.WriteLine("\t\tUse text data from <file> to train the model");
                Console.WriteLine("\t-output <file>");
                Console.WriteLine("\t\tUse <file> to save the resulting word vectors / word clusters");
                Console.WriteLine("\t-size <int>");
                Console.WriteLine("\t\tSet size of word vectors; default is 100");
                Console.WriteLine("\t-window <int>");
                Console.WriteLine("\t\tSet max skip length between words; default is 5");
                Console.WriteLine("\t-sample <float>");
                Console.WriteLine(
                    "\t\tSet threshold for occurrence of words. Those that appear with higher frequency in the training data");
                Console.WriteLine("\t\twill be randomly down-sampled; default is 1e-3, useful range is (0, 1e-5)");
                Console.WriteLine("\t-hs <int>");
                Console.WriteLine("\t\tUse Hierarchical Softmax; default is 0 (not used)");
                Console.WriteLine("\t-negative <int>");
                Console.WriteLine(
                    "\t\tNumber of negative examples; default is 5, common values are 3 - 10 (0 = not used)");
                Console.WriteLine("\t-threads <int>");
                Console.WriteLine("\t\tUse <int> threads (default 12)");
                Console.WriteLine("\t-iter <int>");
                Console.WriteLine("\t\tRun more training iterations (default 5)");
                Console.WriteLine("\t-min-count <int>");
                Console.WriteLine("\t\tThis will discard words that appear less than <int> times; default is 5");
                Console.WriteLine("\t-alpha <float>");
                Console.WriteLine("\t\tSet the starting learning rate; default is 0.025 for skip-gram and 0.05 for CBOW");
                Console.WriteLine("\t-classes <int>");
                Console.WriteLine(
                    "\t\tOutput word classes rather than word vectors; default number of classes is 0 (vectors are written)");
                Console.WriteLine("\t-debug <int>");
                Console.WriteLine("\t\tSet the debug mode (default = 2 = more info during training)");
                Console.WriteLine("\t-binary <int>");
                Console.WriteLine("\t\tSave the resulting vectors in binary moded; default is 0 (off)");
                Console.WriteLine("\t-save-vocab <file>");
                Console.WriteLine("\t\tThe vocabulary will be saved to <file>");
                Console.WriteLine("\t-read-vocab <file>");
                Console.WriteLine("\t\tThe vocabulary will be read from <file>, not constructed from the training data");
                Console.WriteLine("\t-cbow <int>");
                Console.WriteLine("\t\tUse the continuous bag of words model; default is 1 (use 0 for skip-gram model)");
                Console.WriteLine("Examples:");
                Console.WriteLine(
                    "./word2vec -train data.txt -output vec.txt -size 200 -window 5 -sample 1e-4 -negative 5 -hs 0 -binary 0 -cbow 1 -iter 3");
                Console.ReadLine();
                return;
            }
            int    i;
            var    builder        = Word2VecBuilder.Create();
            string outputFileName = String.Empty;

            if ((i = ArgPos("-train", args)) > -1)
            {
                builder.WithTrainFile(args[i + 1]);
            }
            if ((i = ArgPos("-output", args)) > -1)
            {
                outputFileName = args[i + 1];
                builder.WithOutputFile(outputFileName);
            }

            if ((i = ArgPos("-size", args)) > -1)
            {
                builder.WithSize(int.Parse(args[i + 1]));
            }
            if ((i = ArgPos("-save-vocab", args)) > -1)
            {
                builder.WithSaveVocubFile(args[i + 1]);
            }
            if ((i = ArgPos("-read-vocab", args)) > -1)
            {
                builder.WithReadVocubFile(args[i + 1]);
            }
            if ((i = ArgPos("-debug", args)) > -1)
            {
                builder.WithDebug(int.Parse(args[i + 1]));
            }
            if ((i = ArgPos("-binary", args)) > -1)
            {
                builder.WithBinary(int.Parse(args[i + 1]));
            }
            if ((i = ArgPos("-cbow", args)) > -1)
            {
                builder.WithCBow(int.Parse(args[i + 1]));
            }
            if ((i = ArgPos("-alpha", args)) > -1)
            {
                builder.WithAlpha(float.Parse(args[i + 1]));
            }

            if ((i = ArgPos("-window", args)) > -1)
            {
                builder.WithWindow(int.Parse(args[i + 1]));
            }
            if ((i = ArgPos("-sample", args)) > -1)
            {
                builder.WithSample(float.Parse(args[i + 1]));
            }
            if ((i = ArgPos("-hs", args)) > -1)
            {
                builder.WithHs(int.Parse(args[i + 1]));
            }
            if ((i = ArgPos("-negative", args)) > -1)
            {
                builder.WithNegative(int.Parse(args[i + 1]));
            }
            if ((i = ArgPos("-threads", args)) > -1)
            {
                builder.WithThreads(int.Parse(args[i + 1]));
            }
            if ((i = ArgPos("-iter", args)) > -1)
            {
                builder.WithIter(int.Parse(args[i + 1]));
            }
            if ((i = ArgPos("-min-count", args)) > -1)
            {
                builder.WithMinCount(int.Parse(args[i + 1]));
            }
            if ((i = ArgPos("-classes", args)) > -1)
            {
                builder.WithClasses(int.Parse(args[i + 1]));
            }
            Word2Vec.Net.Word2Vec word2Vec = builder.Build();
            word2Vec.TrainModel();

            var distance = new Distance(outputFileName);

            while (true)
            {
                Console.WriteLine("Distance: Enter word or sentence (EXIT to break): ");
                string text = Console.ReadLine();
                if (text == null || text.ToLower().Equals("exit"))
                {
                    break;
                }
                var result = distance.Search(text);
                Console.WriteLine("\n                                              Word       Cosine distance\n------------------------------------------------------------------------");
                foreach (var bestWord in result.Where(x => !string.IsNullOrEmpty(x.Word)))
                {
                    Console.WriteLine("{0}\t\t{1}", bestWord.Word, bestWord.Distance);
                }
                Console.WriteLine();
            }
        }