/// <summary>
/// Builds a parallel corpus from the files found directly inside a directory.
/// Files whose names end with <c>.{srcLangName}.snt</c> are treated as source files and
/// files ending with <c>.{tgtLangName}.snt</c> as target files (suffix match is
/// case-insensitive). A source file and a target file are paired when the part of the
/// path preceding the suffix is identical.
/// </summary>
/// <param name="corpusFilePath">Directory to scan (top level only); also stored as <c>CorpusName</c>.</param>
/// <param name="srcLangName">Language tag used to build the source file suffix.</param>
/// <param name="tgtLangName">Language tag used to build the target file suffix.</param>
/// <param name="batchSize">Stored in <c>m_batchSize</c> for later use.</param>
/// <param name="shuffleBlockSize">Stored in <c>m_blockSize</c> for later use; defaults to -1.</param>
/// <param name="maxSrcSentLength">Stored in <c>m_maxSrcSentLength</c>; defaults to 32.</param>
/// <param name="maxTgtSentLength">Stored in <c>m_maxTgtSentLength</c>; defaults to 32.</param>
/// <param name="shuffleEnums">Stored in <c>m_shuffleEnums</c> for later use.</param>
/// <param name="tooLongSequence">Stored in <c>m_tooLongSequence</c> for later use.</param>
/// <exception cref="KeyNotFoundException">
/// A source file was found that has no target file with the same key.
/// </exception>
/// <exception cref="ArgumentException">
/// Two files on the same side map to the same key (raised by <c>Dictionary.Add</c>).
/// </exception>
public ParallelCorpus(string corpusFilePath, string srcLangName, string tgtLangName, int batchSize, int shuffleBlockSize = -1, int maxSrcSentLength = 32, int maxTgtSentLength = 32, ShuffleEnums shuffleEnums = ShuffleEnums.Random, TooLongSequence tooLongSequence = TooLongSequence.Ignore)
{
    Logger.WriteLine($"Loading parallel corpus from '{corpusFilePath}' for source side '{srcLangName}' and target side '{tgtLangName}' MaxSrcSentLength = '{maxSrcSentLength}', MaxTgtSentLength = '{maxTgtSentLength}', aggregateSrcLengthForShuffle = '{shuffleEnums}', TooLongSequence = '{tooLongSequence}'");

    m_batchSize = batchSize;
    m_blockSize = shuffleBlockSize;
    m_maxSrcSentLength = maxSrcSentLength;
    m_maxTgtSentLength = maxTgtSentLength;
    m_tooLongSequence = tooLongSequence;
    m_shuffleEnums = shuffleEnums;
    CorpusName = corpusFilePath;

    m_srcFileList = new List<string>();
    m_tgtFileList = new List<string>();

    string[] files = Directory.GetFiles(corpusFilePath, "*.*", SearchOption.TopDirectoryOnly);

    // Key = full path with the language suffix stripped; value = the full file path.
    Dictionary<string, string> srcKey2FileName = new Dictionary<string, string>();
    Dictionary<string, string> tgtKey2FileName = new Dictionary<string, string>();

    string srcSuffix = $".{srcLangName}.snt";
    string tgtSuffix = $".{tgtLangName}.snt";

    foreach (string file in files)
    {
        // Deliberately two independent checks (not else-if): when both language names
        // are equal, the same file is registered on both sides.
        if (file.EndsWith(srcSuffix, StringComparison.InvariantCultureIgnoreCase))
        {
            string srcKey = file.Substring(0, file.Length - srcSuffix.Length);
            srcKey2FileName.Add(srcKey, file);

            Logger.WriteLine($"Add source file '{file}' to key '{srcKey}'");
        }

        if (file.EndsWith(tgtSuffix, StringComparison.InvariantCultureIgnoreCase))
        {
            string tgtKey = file.Substring(0, file.Length - tgtSuffix.Length);
            tgtKey2FileName.Add(tgtKey, file);

            Logger.WriteLine($"Add target file '{file}' to key '{tgtKey}'");
        }
    }

    // Pair every source file with the target file sharing its key. Target files that
    // have no source counterpart are not added to either list.
    foreach (var pair in srcKey2FileName)
    {
        string tgtFile;
        if (!tgtKey2FileName.TryGetValue(pair.Key, out tgtFile))
        {
            // Same exception type the dictionary indexer would raise, but with a message
            // that tells the user which file is missing its counterpart.
            throw new KeyNotFoundException($"Source file '{pair.Value}' has no matching target file '{pair.Key}{tgtSuffix}' in '{corpusFilePath}'.");
        }

        m_srcFileList.Add(pair.Value);
        m_tgtFileList.Add(tgtFile);
    }
}
/// <summary>
/// Creates a sequence-to-sequence classification corpus. This constructor performs no
/// work of its own: every argument is handed unchanged to the base-class constructor.
/// </summary>
/// <param name="corpusFilePath">Passed through to the base constructor.</param>
/// <param name="srcLangName">Passed through to the base constructor.</param>
/// <param name="tgtLangName">Passed through to the base constructor.</param>
/// <param name="batchSize">Passed through to the base constructor.</param>
/// <param name="shuffleBlockSize">Passed through to the base constructor; defaults to -1.</param>
/// <param name="maxSrcSentLength">Passed through to the base constructor; defaults to 32.</param>
/// <param name="maxTgtSentLength">Passed through to the base constructor; defaults to 32.</param>
/// <param name="shuffleEnums">Passed through to the base constructor by name.</param>
/// <param name="tooLongSequence">Passed through to the base constructor by name.</param>
public Seq2SeqClassificationCorpus(
    string corpusFilePath,
    string srcLangName,
    string tgtLangName,
    int batchSize,
    int shuffleBlockSize = -1,
    int maxSrcSentLength = 32,
    int maxTgtSentLength = 32,
    ShuffleEnums shuffleEnums = ShuffleEnums.Random,
    TooLongSequence tooLongSequence = TooLongSequence.Ignore)
    : base(
        corpusFilePath,
        srcLangName,
        tgtLangName,
        batchSize,
        shuffleBlockSize,
        maxSrcSentLength,
        maxTgtSentLength,
        shuffleEnums: shuffleEnums,
        tooLongSequence: tooLongSequence)
{
    // Intentionally empty: all initialization happens in the base constructor.
}