public void TestContains()
{
    // Full corpus: a known word must be present, and every word the corpus
    // reports in its word list must also be found by Contains.
    bool hasKnownWord = corpus.Contains("atatürk");
    Assert.True(hasKnownWord);
    foreach (Word listed in corpus.GetWordList())
    {
        string listedName = listed.GetName();
        Assert.True(corpus.Contains(listedName));
    }

    // Simple corpus: the same two checks with its own known word.
    bool simpleHasKnownWord = simpleCorpus.Contains("mehmet");
    Assert.True(simpleHasKnownWord);
    foreach (Word listed in simpleCorpus.GetWordList())
    {
        string listedName = listed.GetName();
        Assert.True(simpleCorpus.Contains(listedName));
    }
}
/**
 * <summary>Builds a <see cref="Vocabulary"/> from a corpus. One <see cref="VocabularyWord"/> is created
 * for every word in the corpus word list, carrying the word's name and its count in the corpus. The
 * entries are then sorted (by occurrence, per the ordering defined by <see cref="VocabularyWord"/>),
 * the unigram table is created, and the Huffman tree is constructed. As a last step the entries are
 * re-sorted with a <see cref="TurkishWordComparator"/>.</summary>
 * <param name="corpus">Corpus used to train word vectors using Word2Vec algorithm.</param>
 */
public Vocabulary(Corpus.Corpus corpus)
{
    var distinctWords = corpus.GetWordList();
    _vocabulary = new List<VocabularyWord>();
    foreach (var corpusWord in distinctWords)
    {
        var name = corpusWord.GetName();
        var occurrences = corpus.GetCount(corpusWord);
        var entry = new VocabularyWord(name, occurrences);
        _vocabulary.Add(entry);
    }

    // The unigram table and Huffman tree are built while the list is in its
    // default (occurrence-based) order, so this sort must come first.
    _vocabulary.Sort();
    CreateUniGramTable();
    ConstructHuffmanTree();

    // Final ordering of the vocabulary uses the Turkish word comparator.
    _vocabulary.Sort(new TurkishWordComparator());
}