/// <summary>
/// Loads a pre-trained model and appends every term it contains that is not
/// already present in <c>word2id</c> to the vocabulary.
/// </summary>
/// <remarks>
/// Side effects: sets <c>layer1_size</c> to the model's <c>vectorSize</c>, and grows
/// <c>word2id</c>, <c>vocab</c> and <c>vocab_size</c>. Added entries get <c>cnt = 0</c>
/// and <c>source = WORD_SOURCE.PRETRAINED_MODEL</c>.
/// </remarks>
/// <param name="strModelFileName">Path of the pre-trained model file passed to <c>Decoder.LoadModel</c>.</param>
void LoadVocabFromPreTrainModel(string strModelFileName)
{
    Decoder preTrainedModel = new Decoder();
    // NOTE(review): the meaning of the second LoadModel argument is not visible here - confirm against Decoder.
    preTrainedModel.LoadModel(strModelFileName, false);

    // The vector size used for training is taken over from the pre-trained model.
    layer1_size = preTrainedModel.vectorSize;

    // Report the model *file name*; the original passed the Decoder object itself,
    // which only prints whatever Decoder.ToString() returns.
    Console.WriteLine("Apply the following options from pr-trained model file {0}", strModelFileName);
    Console.WriteLine("Vector Size: {0}", layer1_size);

    string[] allTerms = preTrainedModel.GetAllTerms();
    foreach (string strTerm in allTerms)
    {
        // Terms already registered in word2id keep their existing entry.
        if (word2id.ContainsKey(strTerm))
        {
            continue;
        }

        vocab_word word = new vocab_word();
        word.word = strTerm;
        word.cnt = 0;
        word.source = WORD_SOURCE.PRETRAINED_MODEL;

        // The new word's id is its index in vocab, i.e. the current vocab_size.
        word2id.Add(word.word, vocab_size);
        vocab.Add(word);
        vocab_size++;
    }
}
/// <summary>
/// Re-counts word occurrences in the training file: resets <c>train_words</c> and the
/// <c>cnt</c> of every vocabulary entry, then scans the file token by token.
/// </summary>
/// <remarks>
/// For each whitespace-separated token:
/// - if <c>SearchVocab</c> returns -1 and <c>onlyUpdateCorpusWord == 0</c>, the token is
///   added to the vocabulary with <c>source = WORD_SOURCE.CORPUS</c>;
/// - if the token has a non-negative id, its <c>cnt</c> is incremented;
/// - <c>train_words</c> is incremented too, unless the entry's source is
///   <c>WORD_SOURCE.PRETRAINED_MODEL</c> and <c>onlyUpdateCorpusWord == 1</c>.
/// </remarks>
/// <param name="train_file">Path of the training corpus to read.</param>
private void GetTrainWordSize(string train_file)
{
    // 'using' guarantees the reader is closed even if an exception is thrown
    // while reading or updating the vocabulary (the original leaked it in that case).
    using (StreamReader fin = new StreamReader(train_file))
    {
        train_words = 0;
        foreach (vocab_word vw in vocab)
        {
            vw.cnt = 0;
        }

        string strLine = null;
        while ((strLine = fin.ReadLine()) != null)
        {
            // NOTE(review): Split() keeps empty entries, so a blank line or repeated
            // whitespace yields "" tokens that are treated as words - confirm this is intended.
            string[] items = strLine.Trim().Split();
            foreach (string item in items)
            {
                int wordId = SearchVocab(item);
                if (wordId == -1 && onlyUpdateCorpusWord == 0)
                {
                    // Unknown token: register it as a corpus word; its id is the current vocab_size.
                    wordId = vocab_size;
                    word2id.Add(item, wordId);

                    vocab_word voc_word = new vocab_word();
                    voc_word.word = item;
                    voc_word.cnt = 0;
                    voc_word.source = WORD_SOURCE.CORPUS;
                    vocab.Add(voc_word);
                    vocab_size++;
                }

                if (wordId < 0)
                {
                    // Unknown token that was not added: it contributes to no count.
                    continue;
                }

                vocab[wordId].cnt++;

                // Pre-trained-only words still get their cnt bumped above, but are
                // excluded from train_words when only corpus words are being updated.
                if (vocab[wordId].source == WORD_SOURCE.PRETRAINED_MODEL && onlyUpdateCorpusWord == 1)
                {
                    continue;
                }

                train_words++;
                if (debug_mode > 0 && train_words % 1000000 == 0)
                {
                    Console.Write("{0}M... ", train_words / 1000000);
                }
            }
        }
    }
}
/// <summary>
/// Builds the vocabulary from the training file: every whitespace-separated token
/// increments <c>train_words</c>, and is either added to the vocabulary with
/// <c>cnt = 1</c> and <c>source = WORD_SOURCE.CORPUS</c>, or has its existing <c>cnt</c> incremented.
/// </summary>
/// <remarks>
/// Resets <c>vocab_size</c> to 0 before reading. <c>word2id</c>, <c>vocab</c> and
/// <c>train_words</c> are not reset here.
/// NOTE(review): presumably the caller supplies empty <c>word2id</c>/<c>vocab</c> - confirm.
/// </remarks>
/// <param name="train_file">Path of the training corpus to read.</param>
public void LearnVocabFromTrainFile(string train_file)
{
    // 'using' guarantees the reader is closed even if an exception is thrown
    // part-way through the file (the original leaked it in that case).
    using (StreamReader fin = new StreamReader(train_file))
    {
        vocab_size = 0;

        string strLine = null;
        while ((strLine = fin.ReadLine()) != null)
        {
            // NOTE(review): Split() keeps empty entries, so a blank line or repeated
            // whitespace yields "" tokens that are counted as words - confirm this is intended.
            string[] items = strLine.Trim().Split();
            foreach (string word in items)
            {
                train_words++;
                if (debug_mode > 0 && train_words % 1000000 == 0)
                {
                    Console.Write("{0}M... ", train_words / 1000000);
                }

                int wordId = SearchVocab(word);
                if (wordId == -1)
                {
                    // First occurrence: register the word; its id is the current vocab_size.
                    word2id.Add(word, vocab_size);

                    vocab_word voc_word = new vocab_word();
                    voc_word.word = word;
                    voc_word.cnt = 1;
                    voc_word.source = WORD_SOURCE.CORPUS;
                    vocab.Add(voc_word);
                    vocab_size++;
                }
                else
                {
                    vocab[wordId].cnt++;
                }
            }
        }
    }
}
/// <summary>
/// Replaces the current vocabulary with the words listed in a vocabulary file.
/// </summary>
/// <remarks>
/// <c>word2id</c> and <c>vocab</c> are re-created and <c>vocab_size</c> is reset to 0. Each line is
/// split on tab and only the first field is used as the word; entries get
/// <c>source = WORD_SOURCE.CORPUS</c> and their <c>cnt</c> is left at its default.
/// A word appearing twice makes <c>word2id.Add</c> throw <see cref="ArgumentException"/>.
/// </remarks>
/// <param name="vocab_file">Path of the vocabulary file, one entry per line.</param>
public void LoadVocabFromFile(string vocab_file)
{
    // 'using' guarantees the reader is closed even if an exception is thrown
    // (e.g. a duplicate word); the original leaked it in that case.
    using (StreamReader sr = new StreamReader(vocab_file))
    {
        word2id = new Dictionary<string, int>();
        vocab = new List<vocab_word>();
        vocab_size = 0;

        string strLine = null;
        while ((strLine = sr.ReadLine()) != null)
        {
            // Only the first tab-separated column is read; any further columns are ignored.
            string[] items = strLine.Split('\t');

            vocab_word word = new vocab_word();
            word.word = items[0];
            word.source = WORD_SOURCE.CORPUS;

            // The word's id is its index in vocab, i.e. the current vocab_size.
            word2id.Add(word.word, vocab_size);
            vocab.Add(word);
            vocab_size++;
        }
    }
}