Пример #1
0
 /// <summary>
 /// Checks <c>Contains</c> on both corpora: first with one hard-coded word,
 /// then with the name of every word returned by the corpus's own
 /// <c>GetWordList()</c>.
 /// </summary>
 public void TestContains()
 {
     // Main corpus: a fixed probe word, then every listed word.
     Assert.True(corpus.Contains("atatürk"));
     foreach (Word listedWord in corpus.GetWordList())
     {
         var listedName = listedWord.GetName();
         Assert.True(corpus.Contains(listedName));
     }

     // Simple corpus: same two checks.
     Assert.True(simpleCorpus.Contains("mehmet"));
     foreach (Word listedWord in simpleCorpus.GetWordList())
     {
         var listedName = listedWord.GetName();
         Assert.True(simpleCorpus.Contains(listedName));
     }
 }
Пример #2
0
        /**
         * <summary>Builds a <see cref="Vocabulary"/> from the given corpus. One <see cref="VocabularyWord"/> is created
         * for every entry of the corpus word list, carrying the word's name and its count in the corpus. The entries
         * are then sorted with their default ordering (by occurrence, per the original documentation), the unigram
         * table and the Huffman tree are built in that order, and finally the entries are re-sorted with a
         * <c>TurkishWordComparator</c>.</summary>
         * <param name="corpus">Corpus used to train word vectors using Word2Vec algorithm.</param>
         */
        public Vocabulary(Corpus.Corpus corpus)
        {
            _vocabulary = new List<VocabularyWord>();

            foreach (var corpusWord in corpus.GetWordList())
            {
                var name = corpusWord.GetName();
                var occurrences = corpus.GetCount(corpusWord);
                _vocabulary.Add(new VocabularyWord(name, occurrences));
            }

            // The table and tree are built while the list is in its default order;
            // the comparator-based sort is applied only afterwards.
            _vocabulary.Sort();
            CreateUniGramTable();
            ConstructHuffmanTree();

            _vocabulary.Sort(new TurkishWordComparator());
        }