Beispiel #1
0
        /// <summary>
        /// Loads a pre-trained model and appends its terms to the vocabulary.
        /// </summary>
        /// <remarks>
        /// Sets <c>layer1_size</c> to the vector size reported by the loaded model.
        /// Terms already present in <c>word2id</c> are left untouched; every other term
        /// is appended with a count of 0 and source <c>WORD_SOURCE.PRETRAINED_MODEL</c>.
        /// </remarks>
        /// <param name="strModelFileName">Path of the model file handed to <c>Decoder.LoadModel</c>.</param>
        void LoadVocabFromPreTrainModel(string strModelFileName)
        {
            Decoder preTrainedModel = new Decoder();
            preTrainedModel.LoadModel(strModelFileName, false);

            layer1_size = preTrainedModel.vectorSize;

            // Report the file the options come from (not the Decoder instance itself).
            Console.WriteLine("Apply the following options from pre-trained model file {0}", strModelFileName);
            Console.WriteLine("Vector Size: {0}", layer1_size);

            foreach (string strTerm in preTrainedModel.GetAllTerms())
            {
                // Terms already added from the corpus or a given dictionary keep their existing entry.
                if (word2id.ContainsKey(strTerm))
                {
                    continue;
                }

                vocab_word word = new vocab_word();
                word.word   = strTerm;
                word.cnt    = 0;
                word.source = WORD_SOURCE.PRETRAINED_MODEL;

                // The new word's id is its index in the vocab list.
                word2id.Add(word.word, vocab_size);
                vocab.Add(word);
                vocab_size++;
            }
        }
Beispiel #2
0
        /// <summary>
        /// Re-counts word occurrences in the training file and recomputes <c>train_words</c>.
        /// </summary>
        /// <remarks>
        /// All existing vocabulary counts are reset to 0 before the file is scanned.
        /// When <c>onlyUpdateCorpusWord</c> is 0, words not found by <c>SearchVocab</c> are
        /// appended to the vocabulary with source <c>WORD_SOURCE.CORPUS</c>.
        /// When <c>onlyUpdateCorpusWord</c> is 1, words whose source is
        /// <c>WORD_SOURCE.PRETRAINED_MODEL</c> still have their count incremented but do not
        /// contribute to <c>train_words</c>.
        /// </remarks>
        /// <param name="train_file">Path of the whitespace-tokenized training corpus.</param>
        private void GetTrainWordSize(string train_file)
        {
            // The using statement guarantees the reader is closed even if a lookup or insert throws.
            using (StreamReader fin = new StreamReader(train_file))
            {
                string strLine = null;

                train_words = 0;

                foreach (vocab_word vw in vocab)
                {
                    vw.cnt = 0;
                }

                while ((strLine = fin.ReadLine()) != null)
                {
                    // Tokenize the line on whitespace.
                    // NOTE(review): Split() keeps empty entries, so a blank line or consecutive
                    // whitespace yields empty-string tokens that are looked up like any other
                    // word - confirm this is intended.
                    strLine = strLine.Trim();
                    string[] items = strLine.Split();
                    foreach (string item in items)
                    {
                        // -1 is treated as "not in the vocabulary".
                        int wordId = SearchVocab(item);

                        if (wordId == -1 && onlyUpdateCorpusWord == 0)
                        {
                            word2id.Add(item, vocab_size);
                            wordId = vocab_size;

                            vocab_word voc_word = new vocab_word();
                            voc_word.word   = item;
                            voc_word.cnt    = 0;
                            voc_word.source = WORD_SOURCE.CORPUS;

                            vocab.Add(voc_word);
                            vocab_size++;
                        }

                        if (wordId >= 0)
                        {
                            vocab[wordId].cnt++;

                            // Pre-trained words are counted above but excluded from the training word total.
                            if (vocab[wordId].source == WORD_SOURCE.PRETRAINED_MODEL && onlyUpdateCorpusWord == 1)
                            {
                                continue;
                            }

                            train_words++;
                            if (debug_mode > 0 && train_words % 1000000 == 0)
                            {
                                Console.Write("{0}M... ", train_words / 1000000);
                            }
                        }
                    }
                }
            }
        }
Beispiel #3
0
        /// <summary>
        /// Builds the vocabulary from a whitespace-tokenized training file.
        /// </summary>
        /// <remarks>
        /// Every token increments <c>train_words</c>. Tokens not found by <c>SearchVocab</c>
        /// are appended to the vocabulary with a count of 1 and source
        /// <c>WORD_SOURCE.CORPUS</c>; known tokens have their count incremented.
        /// NOTE(review): <c>vocab_size</c> is reset to 0 here, but <c>vocab</c>,
        /// <c>word2id</c> and <c>train_words</c> are not cleared - presumably the caller
        /// starts from empty collections; confirm.
        /// </remarks>
        /// <param name="train_file">Path of the training corpus.</param>
        public void LearnVocabFromTrainFile(string train_file)
        {
            // The using statement guarantees the reader is closed even if a lookup or insert throws.
            using (StreamReader fin = new StreamReader(train_file))
            {
                string strLine = null;

                vocab_size = 0;

                while ((strLine = fin.ReadLine()) != null)
                {
                    // Tokenize the line on whitespace.
                    // NOTE(review): Split() keeps empty entries, so a blank line or consecutive
                    // whitespace yields empty-string tokens that are counted like any other
                    // word - confirm this is intended.
                    strLine = strLine.Trim();
                    string[] items = strLine.Split();

                    foreach (string word in items)
                    {
                        train_words++;
                        if (debug_mode > 0 && train_words % 1000000 == 0)
                        {
                            Console.Write("{0}M... ", train_words / 1000000);
                        }

                        // -1 is treated as "not in the vocabulary".
                        int wordId = SearchVocab(word);
                        if (wordId == -1)
                        {
                            word2id.Add(word, vocab_size);

                            vocab_word voc_word = new vocab_word();
                            voc_word.word   = word;
                            voc_word.cnt    = 1;
                            voc_word.source = WORD_SOURCE.CORPUS;

                            vocab.Add(voc_word);
                            vocab_size++;
                        }
                        else
                        {
                            vocab[wordId].cnt++;
                        }
                    }
                }
            }
        }
Beispiel #4
0
        /// <summary>
        /// Replaces the current vocabulary with the terms listed in a vocabulary file.
        /// </summary>
        /// <remarks>
        /// <c>word2id</c>, <c>vocab</c> and <c>vocab_size</c> are re-initialized. Each line is
        /// split on tab characters and only the first field is used as the term; any other
        /// fields are ignored. Every term is tagged with source <c>WORD_SOURCE.CORPUS</c>.
        /// </remarks>
        /// <param name="vocab_file">Path of the tab-separated vocabulary file.</param>
        /// <exception cref="ArgumentException">
        /// Thrown by <c>Dictionary.Add</c> when the same term appears on more than one line.
        /// </exception>
        public void LoadVocabFromFile(string vocab_file)
        {
            // The using statement guarantees the reader is closed even if a duplicate term makes Add throw.
            using (StreamReader sr = new StreamReader(vocab_file))
            {
                string strLine = null;

                word2id    = new Dictionary <string, int>();
                vocab      = new List <vocab_word>();
                vocab_size = 0;

                while ((strLine = sr.ReadLine()) != null)
                {
                    string[]   items = strLine.Split('\t');
                    vocab_word word  = new vocab_word();
                    word.word   = items[0];
                    word.source = WORD_SOURCE.CORPUS;

                    // The word's id is its index in the vocab list.
                    word2id.Add(word.word, vocab_size);
                    vocab.Add(word);

                    vocab_size++;
                }
            }
        }