示例#1
0
        /// <summary>
        /// Builds the vocabularies from the training corpus by counting the tokens of every
        /// sentence-pair batch this corpus enumerates and then generating vocabularies from those counts.
        /// </summary>
        /// <param name="srcVocabSize">Source-side size forwarded to <c>CorpusBatch.GenerateVocabs</c>.</param>
        /// <param name="tgtVocabSize">Target-side size forwarded to <c>CorpusBatch.GenerateVocabs</c>.</param>
        /// <param name="sharedVocab">
        /// When true, <c>CorpusBatch.MergeTokensCountSrcTgt(0, 1)</c> is called before the vocabularies
        /// are generated, and both sizes are required to be equal.
        /// </param>
        /// <returns>A tuple ordered as (source vocab, target vocab, classification vocab).</returns>
        /// <exception cref="ArgumentException">
        /// Thrown when <paramref name="sharedVocab"/> is true but the two vocabulary sizes differ.
        /// </exception>
        public (Vocab, Vocab, Vocab) BuildVocabs(int srcVocabSize = 45000, int tgtVocabSize = 45000, bool sharedVocab = false)
        {
            // Guard: a shared vocabulary only makes sense when both sides ask for the same size.
            bool sizesDiffer = srcVocabSize != tgtVocabSize;
            if (sharedVocab && sizesDiffer)
            {
                throw new ArgumentException($"Vocab size must be equal if sharedVocab is true. Src Vocab Size = '{srcVocabSize}', Tgt Vocab Size = '{tgtVocabSize}'");
            }

            // Pass 1: accumulate token counts over every batch of the corpus.
            foreach (var batch in this)
            {
                CorpusBatch.CountSntPairTokens(batch.SntPairs);
            }

            // Pass 2: post-process the accumulated counts before vocabularies are generated.
            CorpusBatch.ReduceSrcTokensToSingleGroup();
            if (sharedVocab)
            {
                // NOTE(review): presumably merges source group 0 with target group 1 - confirm against CorpusBatch.
                CorpusBatch.MergeTokensCountSrcTgt(0, 1);
            }

            var (sourceVocabs, targetVocabs) = CorpusBatch.GenerateVocabs(srcVocabSize, tgtVocabSize);

            // Target-side group 0 is returned as the classification vocab, group 1 as the target vocab.
            Vocab source = sourceVocabs[0];
            Vocab classification = targetVocabs[0];
            Vocab target = targetVocabs[1];

            return (source, target, classification);
        }
示例#2
0
        /// <summary>
        /// Builds the vocabularies from the training corpus by counting the tokens of every
        /// sentence-pair batch this corpus enumerates and then generating vocabularies from those counts.
        /// </summary>
        /// <param name="srcVocabSize">Source-side size forwarded to <c>CorpusBatch.GenerateVocabs</c>.</param>
        /// <param name="tgtVocabSize">Target-side size forwarded to <c>CorpusBatch.GenerateVocabs</c>.</param>
        /// <returns>
        /// A tuple holding the first generated source vocabulary and the complete list of
        /// generated target vocabularies.
        /// </returns>
        public (Vocab, List <Vocab>) BuildVocabs(int srcVocabSize = 45000, int tgtVocabSize = 45000)
        {
            // Accumulate token counts over every batch of the corpus.
            foreach (var batch in this)
            {
                CorpusBatch.CountSntPairTokens(batch.SntPairs);
            }

            // Post-process the accumulated source-side counts before generating vocabularies.
            CorpusBatch.ReduceSrcTokensToSingleGroup();

            var (sourceVocabs, targetVocabs) = CorpusBatch.GenerateVocabs(srcVocabSize, tgtVocabSize);

            // Only the first source vocabulary is exposed; the target vocabularies are returned as-is.
            Vocab source = sourceVocabs[0];
            return (source, targetVocabs);
        }