예제 #1
0
 /// <summary>
 /// Creates the <c>WordAnalogy</c> and <c>Distance</c> instances from the file
 /// named by <c>_fileName</c> and stores them in <c>_analogy</c> and <c>_distance</c>.
 /// The analogy instance is created with <c>MinimumDistance</c> set to 0.0.
 /// </summary>
 private void SetFileName()
 {
     var sourceFile = _fileName;

     // Finish configuring the analogy instance before it is stored in the field.
     var configuredAnalogy = new WordAnalogy(sourceFile) { MinimumDistance = 0.0 };
     _analogy = configuredAnalogy;

     _distance = new Distance(sourceFile);
 }
예제 #2
0
        /// <summary>
        /// Runs an analogy search for "Natur" and "Gott" against "faust1.bin",
        /// asserts that a result object is returned, and writes the key of the
        /// highest-valued entry to the console.
        /// </summary>
        public void TestWordAnalogy()
        {
            var searchTerms = new[] { "Natur", "Gott" };
            var wordAnalogy = new WordAnalogy("faust1.bin") { MinimumDistance = 0 };

            var matches = wordAnalogy.Search(searchTerms);
            Assert.IsNotNull(matches);

            // Rank by value, highest first, and keep at most ten entries.
            var ranked = matches.OrderByDescending(entry => entry.Value);
            var topTen = ranked.Take(10).ToArray();

            Console.WriteLine(topTen.First().Key);
        }
예제 #3
0
 /// <summary>
 /// Creates the <c>WordAnalogy</c> and <c>Distance</c> instances from the file
 /// named by <c>_fileName</c> and stores them in <c>_analogy</c> and <c>_distance</c>.
 /// </summary>
 private void SetFileName()
 {
     var sourceFile = _fileName;

     // The analogy instance is created first, then the distance instance.
     _analogy = new WordAnalogy(sourceFile);
     _distance = new Distance(sourceFile);
 }
예제 #4
0
        /// <summary>
        /// Trains a Word2Vec model from "Corpus.txt" with "Vectors.bin" as the output file,
        /// reports the training time, then loads "Vectors.bin" into the <c>distance</c> and
        /// <c>wordAnalogy</c> searchers and benchmarks repeated searches, printing
        /// min / max / average / median timings per batch.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            // -train <file> Use text data from <file> to train the model
            string train = "Corpus.txt";

            // -output <file> Use <file> to save the resulting word vectors / word clusters
            string output = "Vectors.bin";

            // -save-vocab <file> The vocabulary will be saved to <file>
            string savevocab = "";

            // -read-vocab <file> The vocabulary will be read from <file>, not constructed from the training data
            string readvocab = "";

            // -size <int> Set size of word vectors; default is 100
            int size = 100;

            // -debug <int> Set the debug mode (default = 2 = more info during training)
            int debug = 2;

            // -binary <int> Save the resulting vectors in binary mode; default is 0 (off)
            int binary = 1;

            // -cbow <int> Use the continuous bag of words model; default is 1 (use 0 for skip-gram model)
            int cbow = 1;

            // -alpha <float> Set the starting learning rate; default is 0.025 for skip-gram and 0.05 for CBOW
            float alpha = 0.05f;

            // -sample <float> Set threshold for occurrence of words. Those that appear with higher frequency in the training data
            float sample = 1e-4f;

            // -hs <int> Use Hierarchical Softmax; default is 0 (not used)
            int hs = 0;

            // -negative <int> Number of negative examples; default is 5, common values are 3 - 10 (0 = not used)
            int negative = 25;

            // -threads <int> Use <int> threads (default 12)
            int threads = 12;

            // -iter <int> Run more training iterations (default 5)
            long iter = 15;

            // -min-count <int> This will discard words that appear less than <int> times; default is 5
            int mincount = 5;

            // -classes <int> Output word classes rather than word vectors; default number of classes is 0 (vectors are written)
            long classes = 0;

            // -window <int> Set max skip length between words; default is 5
            int window = 12;

            Word2Vec word2Vec = new Word2Vec(train, output, savevocab, readvocab, size, debug, binary, cbow, alpha, sample, hs, negative, threads, iter, mincount, classes, window);

            var totalTime = Stopwatch.StartNew();
            var highRes   = Stopwatch.IsHighResolution;

            word2Vec.TrainModel();

            totalTime.Stop();

            var trainingTime = totalTime.ElapsedMilliseconds;

            Console.WriteLine("Training took {0}ms", trainingTime);

            path        = @"Vectors.bin";
            distance    = new Distance(path);
            wordAnalogy = new WordAnalogy(path);

            string[] wordList = new string[] { "paris france madrid" };

            // The first pass over the word list is timed on its own, separately from
            // the repeated "steady state" batches below.
            var searchTime = Stopwatch.StartNew();

            foreach (string word in wordList)
            {
                distance.Search(word);
                wordAnalogy.Search(word);
            }

            searchTime.Stop();
            var firstSearchTime = searchTime.ElapsedMilliseconds;

            Console.WriteLine("Search took {0}ms", firstSearchTime);

            int outerN = 5;

            for (int outer = 0; outer < outerN; outer++)
            {
                foreach (string word in wordList)
                {
                    // Odd run count so that index runsPerBatch / 2 of the sorted samples is the true median.
                    int    runsPerBatch = 11;
                    long[] searchTimes  = new long[runsPerBatch];

                    Console.WriteLine($"Batch {outer}, searching {word}: running {runsPerBatch} searches");

                    for (int inner = 0; inner < runsPerBatch; inner++)
                    {
                        searchTime.Restart();
                        distance.Search(word);
                        wordAnalogy.Search(word);
                        searchTime.Stop();

                        // Samples are raw stopwatch ticks when a high-resolution timer is
                        // available, otherwise whole milliseconds.
                        searchTimes[inner] = highRes ? searchTime.ElapsedTicks : searchTime.ElapsedMilliseconds;
                    }

                    // One sort yields min, max and median; the sum is order-independent.
                    Array.Sort(searchTimes);
                    long minSearchTime    = searchTimes[0];
                    long maxSearchTime    = searchTimes[runsPerBatch - 1];
                    long medianSearchTime = searchTimes[runsPerBatch / 2];
                    long sumOfTimes       = searchTimes.Sum();

                    if (highRes)
                    {
                        // Convert ticks to milliseconds in floating point. Doing this in long
                        // arithmetic truncates to whole milliseconds, which made the min/max
                        // lines always print "x.00" despite the F2 format.
                        double minSearch     = 1000.0 * minSearchTime / Stopwatch.Frequency;
                        double maxSearch     = 1000.0 * maxSearchTime / Stopwatch.Frequency;
                        double averageSearch = 1000.0 * sumOfTimes / runsPerBatch / Stopwatch.Frequency;
                        double medianSearch  = 1000.0 * medianSearchTime / Stopwatch.Frequency;
                        Console.WriteLine("Steadystate min search time: {0:F2}ms", minSearch);
                        Console.WriteLine("Steadystate max search time: {0:F2}ms", maxSearch);
                        Console.WriteLine("Steadystate average search time: {0:F2}ms", averageSearch);
                        Console.WriteLine("Steadystate median search time: {0:F2}ms", medianSearch);
                    }
                    else
                    {
                        // Samples are already whole milliseconds; the average uses integer division.
                        long averageSearch = sumOfTimes / runsPerBatch;
                        Console.WriteLine("Steadystate min search time: {0}ms", minSearchTime);
                        Console.WriteLine("Steadystate max search time: {0}ms", maxSearchTime);
                        Console.WriteLine("Steadystate average search time: {0}ms", averageSearch);
                        Console.WriteLine("Steadystate median search time: {0}ms", medianSearchTime);
                    }

                    Console.WriteLine("");
                }
            }
        }
예제 #5
0
        /// <summary>
        /// Optionally trains a Word2Vec model from "Corpus.txt" with "Vectors.bin" as the
        /// output file (the user is asked "Train Model: Y/N"), then loads "Vectors.bin" and
        /// repeatedly prompts for input, running a distance search and an analogy search
        /// for each entry. The loop ends when console input is exhausted.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            // -train <file> Use text data from <file> to train the model
            string train = "Corpus.txt";

            // -output <file> Use <file> to save the resulting word vectors / word clusters
            string output = "Vectors.bin";

            // -save-vocab <file> The vocabulary will be saved to <file>
            string savevocab = "";

            // -read-vocab <file> The vocabulary will be read from <file>, not constructed from the training data
            string readvocab = "";

            // -size <int> Set size of word vectors; default is 100
            int size = 100;

            // -debug <int> Set the debug mode (default = 2 = more info during training)
            int debug = 2;

            // -binary <int> Save the resulting vectors in binary mode; default is 0 (off)
            int binary = 1;

            // -cbow <int> Use the continuous bag of words model; default is 1 (use 0 for skip-gram model)
            int cbow = 1;

            // -alpha <float> Set the starting learning rate; default is 0.025 for skip-gram and 0.05 for CBOW
            float alpha = 0.05f;

            // -sample <float> Set threshold for occurrence of words. Those that appear with higher frequency in the training data
            float sample = 1e-4f;

            // -hs <int> Use Hierarchical Softmax; default is 0 (not used)
            int hs = 0;

            // -negative <int> Number of negative examples; default is 5, common values are 3 - 10 (0 = not used)
            int negative = 25;

            // -threads <int> Use <int> threads (default 12)
            int threads = 12;

            // -iter <int> Run more training iterations (default 5)
            long iter = 15;

            // -min-count <int> This will discard words that appear less than <int> times; default is 5
            int mincount = 5;

            // -classes <int> Output word classes rather than word vectors; default number of classes is 0 (vectors are written)
            long classes = 0;

            // -window <int> Set max skip length between words; default is 5
            int window = 12;

            Word2Vec.Word2Vec word2Vec = new Word2Vec.Word2Vec(train, output, savevocab, readvocab, size, debug, binary, cbow, alpha, sample, hs, negative, threads, iter, mincount, classes, window);


            Console.WriteLine("Train Model: Y/N");

            // Console.ReadLine returns null once input is exhausted. The static Equals
            // tolerates null (treated as "no") and compares without culture rules.
            string usr = Console.ReadLine();

            if (string.Equals(usr, "y", StringComparison.OrdinalIgnoreCase))
            {
                word2Vec.TrainModel();
            }


            path        = @"Vectors.bin";
            distance    = new Distance(path);
            wordAnalogy = new WordAnalogy(path);


            while (true)
            {
                Console.Clear();

                Console.WriteLine("{0}\nPlease enter Word: ", new string('-', 50));

                string input = Console.ReadLine();

                // End of input: stop instead of looping forever with a null query.
                if (input == null)
                {
                    break;
                }

                Console.Clear();

                // Return values are discarded here; presumably Search prints its own
                // results (second argument 10 looks like a result count) - TODO confirm.
                distance.Search(input, 10);

                wordAnalogy.Search(input, 10);

                // Wait for Enter so the output stays visible before the next Clear.
                Console.ReadLine();
            }
        }