/// <summary>
/// Tokenized sequences are read from the source and target language file. The sequences are
/// distributed into different bucketSequenceLengths according to their sequence length.
/// By default the data is padded to the bucket length and the target sequence is prepended with the go symbol id.
/// </summary>
/// <param name="sourceLanguage">Path to the source-language file; one whitespace-separated sequence of integer token ids per line.</param>
/// <param name="targetLanguage">Path to the target-language file, line-aligned with the source file.</param>
/// <param name="bucketSequenceLengths">(source length, target length) pairs defining the available buckets.</param>
/// <returns>A <see cref="BucketedData"/> containing every aligned sentence pair that was read.</returns>
public static BucketedData BucketTokenizedData(string sourceLanguage, string targetLanguage, IEnumerable<Tuple<int, int>> bucketSequenceLengths)
{
    var bucketedData = new BucketedData(bucketSequenceLengths);
    using (var file1 = new StreamReader(sourceLanguage, Encoding.UTF8, true))
    using (var file2 = new StreamReader(targetLanguage, Encoding.UTF8, true))
    {
        var counter = 0;
        string line1, line2;
        // Stops at the end of the shorter file; any trailing unmatched lines in the
        // longer file are silently ignored (files are assumed line-aligned).
        while ((line1 = file1.ReadLine()) != null && (line2 = file2.ReadLine()) != null)
        {
            // Split on any whitespace. RemoveEmptyEntries guards against consecutive
            // separators (double spaces, tab + space) and leading/trailing whitespace,
            // which previously produced empty tokens and made int.Parse throw.
            var source = line1.Split((char[])null, StringSplitOptions.RemoveEmptyEntries)
                              .Select(int.Parse)
                              .ToArray();
            var target = line2.Split((char[])null, StringSplitOptions.RemoveEmptyEntries)
                              .Select(int.Parse)
                              .ToArray();

            counter++;
            if (counter % 100000 == 0)
            {
                // Periodic progress report for large corpora.
                Console.WriteLine($"PrepareForTraining {sourceLanguage} {targetLanguage} : line {counter}");
            }

            bucketedData.Add(source, target);
        }
    }

    return bucketedData;
}
/// <summary>
/// Creates a batcher over already-bucketed data, precomputing the cumulative
/// probability of each bucket (proportional to its share of all data points)
/// so buckets can later be sampled by weight.
/// </summary>
/// <param name="data">The bucketed training data to draw batches from.</param>
/// <param name="batchSize">Number of sequence pairs per batch.</param>
/// <param name="random">Random source used for sampling.</param>
public BucketedDataBatcher(BucketedData data, int batchSize, Random random)
{
    Random = random;
    Data = data;
    BatchSize = batchSize;

    var bucketSizes = data.BucketSizes;
    var totalPoints = (double)data.NumDataPoints;
    CumulativeProbabilities = new double[data.NumBuckets];

    // Running sum of per-bucket probabilities; the last entry approaches 1.0.
    var runningSum = 0.0;
    for (var bucket = 0; bucket < data.NumBuckets; ++bucket)
    {
        runningSum += bucketSizes[bucket] / totalPoints;
        CumulativeProbabilities[bucket] = runningSum;
    }
}