public void TestGetCount()
{
    // Words looked up in the `corpus` fixture, paired by index with the
    // occurrence counts that GetCount is expected to report for them.
    string[] corpusWords = { "mustafa", "kemal", "atatürk" };
    int[] corpusCounts = { 309, 109, 122 };
    for (var i = 0; i < corpusWords.Length; i++)
    {
        Assert.AreEqual(corpusCounts[i], corpus.GetCount(new Word(corpusWords[i])));
    }

    // Same check against the `simpleCorpus` fixture.
    string[] simpleWords = { "ali", "gitti", "at" };
    int[] simpleCounts = { 4, 3, 4 };
    for (var i = 0; i < simpleWords.Length; i++)
    {
        Assert.AreEqual(simpleCounts[i], simpleCorpus.GetCount(new Word(simpleWords[i])));
    }
}
/**
 * <summary>Builds a <see cref="Vocabulary"/> from the given corpus. One <see cref="VocabularyWord"/> is
 * created for every word returned by the corpus word list, carrying the word's name and its count in the
 * corpus. The list is then sorted with the default ordering of <see cref="VocabularyWord"/> (by number of
 * occurrences, according to the original documentation), the unigram table is created, and the Huffman
 * tree is constructed. Finally the list is re-sorted using a <see cref="TurkishWordComparator"/>.</summary>
 * <param name="corpus">Corpus used to train word vectors using Word2Vec algorithm.</param>
 */
public Vocabulary(Corpus.Corpus corpus)
{
    _vocabulary = new List<VocabularyWord>();
    foreach (var corpusWord in corpus.GetWordList())
    {
        var name = corpusWord.GetName();
        var occurrences = corpus.GetCount(corpusWord);
        _vocabulary.Add(new VocabularyWord(name, occurrences));
    }

    // The unigram table and the Huffman tree are built while the list is in
    // its default order; the comparator-based order is applied only afterwards.
    _vocabulary.Sort();
    CreateUniGramTable();
    ConstructHuffmanTree();

    var turkishOrder = new TurkishWordComparator();
    _vocabulary.Sort(turkishOrder);
}