Exemple #1
0
        /// <summary>
        /// Initializes the vocabulary from a pair of vocabulary files, one for the
        /// source side and one for the target side.
        /// </summary>
        /// <remarks>
        /// Each line of a file contributes one word: the text before the first tab
        /// character. Words for which <c>ParallelCorpus.IsPreDefinedToken</c> returns
        /// true are skipped; every other word is assigned consecutive ids starting at 3,
        /// in file order, independently for each side.
        /// </remarks>
        /// <param name="srcVocabFilePath">Path of the source-side vocabulary file.</param>
        /// <param name="tgtVocabFilePath">Path of the target-side vocabulary file.</param>
        public Vocab(string srcVocabFilePath, string tgtVocabFilePath)
        {
            Logger.WriteLine("Loading vocabulary files...");
            string[] srcLines = File.ReadAllLines(srcVocabFilePath);
            string[] tgtLines = File.ReadAllLines(tgtVocabFilePath);

            CreateIndex();

            // Ids below 3 are never handed out here; presumably CreateIndex() reserves
            // them for the pre-defined tokens (TODO confirm against CreateIndex).
            int nextSrcId = 3;
            for (int i = 0; i < srcLines.Length; i++)
            {
                // Only the first tab-separated column is the word; the rest is ignored.
                string token = srcLines[i].Split('\t')[0];
                if (ParallelCorpus.IsPreDefinedToken(token))
                {
                    continue;
                }

                m_srcVocab.Add(token);
                SrcWordToIndex[token] = nextSrcId;
                m_srcIndexToWord[nextSrcId] = token;
                nextSrcId++;
            }

            // The target side is numbered independently, again starting at 3.
            int nextTgtId = 3;
            for (int i = 0; i < tgtLines.Length; i++)
            {
                string token = tgtLines[i].Split('\t')[0];
                if (ParallelCorpus.IsPreDefinedToken(token))
                {
                    continue;
                }

                m_tgtVocab.Add(token);
                TgtWordToIndex[token] = nextTgtId;
                m_tgtIndexToWord[nextTgtId] = token;
                nextTgtId++;
            }
        }
Exemple #2
0
        /// <summary>
        /// Builds the source and target vocabularies by counting the tokens of a
        /// training corpus.
        /// </summary>
        /// <remarks>
        /// A token is added to a side's vocabulary when it occurs at least
        /// <paramref name="minFreq"/> times on that side and
        /// <c>ParallelCorpus.IsPreDefinedToken</c> returns false for it. Ids are assigned
        /// consecutively starting at 3, independently for each side, in the enumeration
        /// order of the counting dictionary.
        /// </remarks>
        /// <param name="trainCorpus">Corpus whose sentence-pair batches are enumerated to count tokens.</param>
        /// <param name="minFreq">Minimum number of occurrences a token needs in order to be kept.</param>
        public Vocab(ParallelCorpus trainCorpus, int minFreq = 1)
        {
            Logger.WriteLine("Building vocabulary from given training corpus.");

            // Per-side occurrence counts of every token seen in the corpus.
            Dictionary<string, int> srcCounts = new Dictionary<string, int>();
            Dictionary<string, int> tgtCounts = new Dictionary<string, int>();

            CreateIndex();

            foreach (SntPairBatch sntPairBatch in trainCorpus)
            {
                foreach (SntPair sntPair in sntPairBatch.SntPairs)
                {
                    AccumulateTokenCounts(sntPair.SrcSnt, srcCounts);
                    AccumulateTokenCounts(sntPair.TgtSnt, tgtCounts);
                }
            }

            // Ids below 3 are never handed out here; presumably CreateIndex() reserves
            // them for the pre-defined tokens (TODO confirm against CreateIndex).
            int nextId = 3;
            foreach (KeyValuePair<string, int> entry in srcCounts)
            {
                if (entry.Value < minFreq || ParallelCorpus.IsPreDefinedToken(entry.Key))
                {
                    continue;
                }

                SrcWordToIndex[entry.Key] = nextId;
                m_srcIndexToWord[nextId] = entry.Key;
                m_srcVocab.Add(entry.Key);
                nextId++;
            }

            // NOTE: the logged value is the next unused id, i.e. one past the largest id assigned.
            Logger.WriteLine($"Source language Max term id = '{nextId}'");

            // The target side is numbered independently, again starting at 3.
            nextId = 3;
            foreach (KeyValuePair<string, int> entry in tgtCounts)
            {
                if (entry.Value < minFreq || ParallelCorpus.IsPreDefinedToken(entry.Key))
                {
                    continue;
                }

                TgtWordToIndex[entry.Key] = nextId;
                m_tgtIndexToWord[nextId] = entry.Key;
                m_tgtVocab.Add(entry.Key);
                nextId++;
            }

            // NOTE: as above, this is the next unused id rather than the largest assigned one.
            Logger.WriteLine($"Target language Max term id = '{nextId}'");
        }

        /// <summary>
        /// Adds one occurrence to <paramref name="counts"/> for every element of
        /// <paramref name="tokens"/>, inserting tokens that are not present yet.
        /// </summary>
        /// <param name="tokens">Tokens of a single sentence.</param>
        /// <param name="counts">Running token-to-occurrence-count table that is updated in place.</param>
        private static void AccumulateTokenCounts(string[] tokens, Dictionary<string, int> counts)
        {
            for (int i = 0; i < tokens.Length; i++)
            {
                string token = tokens[i];

                // A single TryGetValue replaces the former Keys.Contains + indexer pair;
                // on a miss 'seen' is 0, so a new token is stored with count 1.
                int seen;
                counts.TryGetValue(token, out seen);
                counts[token] = seen + 1;
            }
        }