Esempio n. 1
0
 // Checks that GetCount reports the expected number of occurrences for a few
 // sample words in both the large corpus and the small (simple) corpus fixtures.
 public void TestGetCount()
 {
     // Sample words and their expected occurrence counts in `corpus`.
     string[] corpusWords = {"mustafa", "kemal", "atatürk"};
     int[] corpusExpected = {309, 109, 122};
     for (var i = 0; i < corpusWords.Length; i++)
     {
         Assert.AreEqual(corpusExpected[i], corpus.GetCount(new Word(corpusWords[i])));
     }

     // Sample words and their expected occurrence counts in `simpleCorpus`.
     string[] simpleWords = {"ali", "gitti", "at"};
     int[] simpleExpected = {4, 3, 4};
     for (var i = 0; i < simpleWords.Length; i++)
     {
         Assert.AreEqual(simpleExpected[i], simpleCorpus.GetCount(new Word(simpleWords[i])));
     }
 }
        /**
         * <summary>Builds a <see cref="Vocabulary"/> from the given corpus. One <see cref="VocabularyWord"/> is created
         * for every entry returned by the corpus word list, carrying the word's name and its count in the corpus.
         * The entries are then sorted with the default <see cref="VocabularyWord"/> ordering (presumably by number of
         * occurrences; confirm against VocabularyWord's comparison), the unigram table is created, and the Huffman
         * tree is constructed. As a last step the entries are re-sorted with a <see cref="TurkishWordComparator"/>.</summary>
         * <param name="corpus">Corpus used to train word vectors using Word2Vec algorithm.</param>
         */
        public Vocabulary(Corpus.Corpus corpus)
        {
            var corpusWords = corpus.GetWordList();

            _vocabulary = new List<VocabularyWord>();
            foreach (var corpusWord in corpusWords)
            {
                var name = corpusWord.GetName();
                var occurrences = corpus.GetCount(corpusWord);
                var entry = new VocabularyWord(name, occurrences);
                _vocabulary.Add(entry);
            }

            // The unigram table and the Huffman tree are built while the list is in its default order.
            _vocabulary.Sort();
            CreateUniGramTable();
            ConstructHuffmanTree();

            // The final order of the list is the one defined by TurkishWordComparator.
            var turkishOrder = new TurkishWordComparator();
            _vocabulary.Sort(turkishOrder);
        }