Example #1
        /// <summary>
        /// Tokenized sequences are read from the source and target language files. The sequences are
        /// distributed into the buckets defined by bucketSequenceLengths according to their sequence lengths.
        /// By default the data is padded to the bucket length and the target sequence is prepended with the go symbol id.
        /// </summary>
        /// <param name="sourceLanguage">Path to the tokenized source language file, one whitespace-separated sequence of token ids per line.</param>
        /// <param name="targetLanguage">Path to the tokenized target language file, one whitespace-separated sequence of token ids per line.</param>
        /// <param name="bucketSequenceLengths">Pairs of maximum (source, target) sequence lengths, one pair per bucket.</param>
        /// <returns>The sequences grouped into buckets as a BucketedData instance.</returns>
        public static BucketedData BucketTokenizedData(string sourceLanguage, string targetLanguage, IEnumerable <Tuple <int, int> > bucketSequenceLengths)
        {
            var bucketedData = new BucketedData(bucketSequenceLengths);

            using (var file1 = new StreamReader(sourceLanguage, Encoding.UTF8, true))
                using (var file2 = new StreamReader(targetLanguage, Encoding.UTF8, true))
                {
                    var    counter = 0;
                    string line1, line2;
                    while ((line1 = file1.ReadLine()) != null && (line2 = file2.ReadLine()) != null)
                    {
                        // Split(null) splits on whitespace; each token is a numeric token id.
                        var source = line1.Trim().Split(null).Select(int.Parse).ToArray();
                        var target = line2.Trim().Split(null).Select(int.Parse).ToArray();

                        counter++;
                        if (counter % 100000 == 0)
                        {
                            Console.WriteLine($"PrepareForTraining {sourceLanguage} {targetLanguage} : line {counter}");
                        }

                        bucketedData.Add(source, target);
                    }
                }
            return bucketedData;
        }
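
For orientation, a minimal call to BucketTokenizedData might look like the sketch below. The file names and bucket length pairs are illustrative assumptions, not taken from the original example; each input file is expected to hold one whitespace-separated sequence of token ids per line.

        // Hypothetical usage sketch; file names and bucket sizes are assumptions.
        var bucketLengths = new[]
        {
            Tuple.Create(10, 15),   // (max source length, max target length) per bucket
            Tuple.Create(20, 25),
            Tuple.Create(40, 50)
        };
        var bucketed = BucketTokenizedData("train.source.ids", "train.target.ids", bucketLengths);
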
        public BucketedDataBatcher(BucketedData data, int batchSize, Random random)
        {
            Random    = random;
            Data      = data;
            BatchSize = batchSize;
            var sizes = data.BucketSizes;
            var total = (double)data.NumDataPoints;

            // Build a cumulative distribution over the buckets, weighted by the
            // number of data points in each bucket, so that a bucket can later be
            // drawn with probability proportional to its size.
            CumulativeProbabilities    = new double[data.NumBuckets];
            CumulativeProbabilities[0] = sizes[0] / total;
            for (var i = 1; i < data.NumBuckets; ++i)
            {
                CumulativeProbabilities[i] = CumulativeProbabilities[i - 1] + sizes[i] / total;
            }
        }
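
The cumulative array built in the constructor lends itself to size-proportional bucket sampling. A minimal sketch is given below, assuming a helper named SampleBucketIndex that is not part of the original listing.

        // Hedged sketch: draw a bucket index with probability proportional to the
        // number of data points in that bucket, using the cumulative distribution
        // computed in the constructor. The method name is an assumption.
        public int SampleBucketIndex()
        {
            var r = Random.NextDouble();
            for (var i = 0; i < CumulativeProbabilities.Length; ++i)
            {
                if (r < CumulativeProbabilities[i])
                {
                    return i;
                }
            }
            return CumulativeProbabilities.Length - 1;   // guard against floating point rounding
        }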