/// <summary>
/// Builds the source, target and classification vocabularies from the training corpus.
/// Token counts are collected from every batch enumerated by this instance before
/// the vocabularies are generated.
/// </summary>
/// <param name="srcVocabSize">Source vocabulary size passed to <c>CorpusBatch.GenerateVocabs</c>.</param>
/// <param name="tgtVocabSize">Target vocabulary size passed to <c>CorpusBatch.GenerateVocabs</c>.</param>
/// <param name="sharedVocab">
/// When <c>true</c>, both sizes must be equal and <c>CorpusBatch.MergeTokensCountSrcTgt(0, 1)</c>
/// is called before the vocabularies are generated.
/// </param>
/// <returns>A tuple ordered as (source vocab, target vocab, classification vocab).</returns>
/// <exception cref="ArgumentException">
/// Thrown when <paramref name="sharedVocab"/> is <c>true</c> and the two sizes differ.
/// </exception>
public (Vocab, Vocab, Vocab) BuildVocabs(int srcVocabSize = 45000, int tgtVocabSize = 45000, bool sharedVocab = false)
{
    // Reject inconsistent arguments before touching any token counts.
    bool sizesDiffer = srcVocabSize != tgtVocabSize;
    if (sharedVocab && sizesDiffer)
    {
        throw new ArgumentException($"Vocab size must be equal if sharedVocab is true. Src Vocab Size = '{srcVocabSize}', Tgt Vocab Size = '{tgtVocabSize}'");
    }

    // Accumulate token counts over the whole corpus.
    foreach (var batch in this)
    {
        CorpusBatch.CountSntPairTokens(batch.SntPairs);
    }

    CorpusBatch.ReduceSrcTokensToSingleGroup();

    if (sharedVocab)
    {
        // NOTE(review): presumably merges source group 0 with target group 1 counts — confirm in CorpusBatch.
        CorpusBatch.MergeTokensCountSrcTgt(0, 1);
    }

    var (sourceVocabList, targetVocabList) = CorpusBatch.GenerateVocabs(srcVocabSize, tgtVocabSize);

    // On the target side, entry 0 is used as the classification vocab and entry 1 as the target vocab.
    Vocab source = sourceVocabList[0];
    Vocab classification = targetVocabList[0];
    Vocab target = targetVocabList[1];

    return (source, target, classification);
}
/// <summary>
/// Builds the source vocabulary and the list of target vocabularies from the training corpus.
/// Token counts are collected from every batch enumerated by this instance before
/// the vocabularies are generated.
/// </summary>
/// <param name="srcVocabSize">Source vocabulary size passed to <c>CorpusBatch.GenerateVocabs</c>.</param>
/// <param name="tgtVocabSize">Target vocabulary size passed to <c>CorpusBatch.GenerateVocabs</c>.</param>
/// <returns>
/// A tuple holding the first generated source vocab and the complete list of generated target vocabs.
/// </returns>
public (Vocab, List<Vocab>) BuildVocabs(int srcVocabSize = 45000, int tgtVocabSize = 45000)
{
    // Accumulate token counts over the whole corpus.
    foreach (var batch in this)
    {
        CorpusBatch.CountSntPairTokens(batch.SntPairs);
    }

    CorpusBatch.ReduceSrcTokensToSingleGroup();

    var (sourceVocabList, targetVocabList) = CorpusBatch.GenerateVocabs(srcVocabSize, tgtVocabSize);

    // Only the first source vocab is returned; the target list is handed back unchanged.
    var source = sourceVocabList[0];
    return (source, targetVocabList);
}