Example #1
        private static void train(int epoch, Tensor train_data, TransformerModel model, Loss criterion, int bptt, int ntokens, torch.optim.Optimizer optimizer)
        {
            model.train();

            using (var d = torch.NewDisposeScope()) {
                var total_loss = 0.0f;

                var batch        = 0;
                var log_interval = 200;

                var src_mask = model.GenerateSquareSubsequentMask(bptt);

                var tdlen = train_data.shape[0];

                for (int i = 0; i < tdlen - 1; batch++, i += bptt)
                {
                    var (data, targets) = GetBatch(train_data, i, bptt);
                    optimizer.zero_grad();

                    if (data.shape[0] != bptt)
                    {
                        src_mask = model.GenerateSquareSubsequentMask(data.shape[0]);
                    }

                    using (var output = model.forward(data, src_mask)) {
                        var loss = criterion(output.view(-1, ntokens), targets);
                        loss.backward();
                        torch.nn.utils.clip_grad_norm_(model.parameters().ToArray(), 0.5);
                        optimizer.step();

                        total_loss += loss.to(torch.CPU).item<float>();
                    }

                    if (batch % log_interval == 0 && batch > 0)
                    {
                        var cur_loss = total_loss / log_interval;
                        Console.WriteLine($"epoch: {epoch} | batch: {batch} / {tdlen / bptt} | loss: {cur_loss:0.00}");
                        total_loss = 0;
                    }

                    d.DisposeEverythingBut(src_mask);
                }
            }
        }
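Both train overloads call a GetBatch helper that this listing does not include. Below is a minimal sketch against the newer TorchSharp API (Tensor), assuming the usual word-language-model slicing; the signature and the use of narrow are assumptions, not taken from the example. The second overload that follows targets the older TorchTensor surface, where a matching overload of the same shape would be needed.

        // Sketch of a GetBatch helper (assumed): returns a window of up to bptt rows
        // plus the same tokens shifted by one position, flattened for the loss.
        private static (Tensor data, Tensor targets) GetBatch(Tensor source, int index, int bptt)
        {
            // The last window may be shorter than bptt, which is why train() regenerates src_mask.
            var len = Math.Min(bptt, source.shape[0] - 1 - index);
            var data = source.narrow(0, index, len);
            var targets = source.narrow(0, index + 1, len).reshape(-1);
            return (data, targets);
        }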
        private static void train(int epoch, TorchTensor train_data, TransformerModel model, Loss criterion, int bptt, int ntokens, Optimizer optimizer)
        {
            model.Train();

            var total_loss = 0.0f;

            var src_mask = model.GenerateSquareSubsequentMask(bptt);

            var batch        = 0;
            var log_interval = 200;

            var tdlen = train_data.shape[0];

            for (int i = 0; i < tdlen - 1; batch++, i += bptt)
            {
                var (data, targets) = GetBatch(train_data, i, bptt);
                optimizer.zero_grad();

                if (data.shape[0] != bptt)
                {
                    src_mask.Dispose();
                    src_mask = model.GenerateSquareSubsequentMask(data.shape[0]);
                }

                // In this older API surface, tensors must be disposed explicitly;
                // the orphaned block in the original appears to be a stripped using statement.
                using (var output = model.forward(data, src_mask))
                using (var loss = criterion(output.view(-1, ntokens), targets))
                {
                    loss.backward();
                    model.parameters().clip_grad_norm(0.5);
                    optimizer.step();

                    total_loss += loss.to(Device.CPU).DataItem<float>();
                }

                GC.Collect();

                if (batch % log_interval == 0 && batch > 0)
                {
                    var cur_loss = total_loss / log_interval;
                    Console.WriteLine($"epoch: {epoch} | batch: {batch} / {tdlen/bptt} | loss: {cur_loss:0.00}");
                    total_loss = 0;
                }
            }
        }
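Main below also calls an evaluate method that is not part of this listing. A hedged sketch mirroring the first train overload is shown here; the signature, the no_grad usage, and the per-batch weighting are assumptions (lr and optimizer are unused and only kept so the call in Main matches).

        // Sketch of an evaluate helper (assumed): same batching as train(),
        // but with gradients disabled and no optimizer step.
        private static double evaluate(Tensor eval_data, TransformerModel model, Loss criterion, double lr, int bptt, int ntokens, torch.optim.Optimizer optimizer)
        {
            model.eval();

            using (var d = torch.NewDisposeScope())
            using (torch.no_grad())
            {
                var total_loss = 0.0;
                var src_mask = model.GenerateSquareSubsequentMask(bptt);

                for (int i = 0; i < eval_data.shape[0] - 1; i += bptt)
                {
                    var (data, targets) = GetBatch(eval_data, i, bptt);

                    if (data.shape[0] != bptt)
                    {
                        src_mask = model.GenerateSquareSubsequentMask(data.shape[0]);
                    }

                    var output = model.forward(data, src_mask);
                    var loss = criterion(output.view(-1, ntokens), targets);

                    // Weight each batch by its length so the short tail batch does not skew the average.
                    total_loss += data.shape[0] * loss.to(torch.CPU).item<float>();

                    d.DisposeEverythingBut(src_mask);
                }

                return total_loss / eval_data.shape[0];
            }
        }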
        static void Main(string[] args)
        {
            Torch.SetSeed(1);

            var cwd = Environment.CurrentDirectory;

            var device = Torch.IsCudaAvailable() ? Device.CUDA : Device.CPU;

            Console.WriteLine($"Running SequenceToSequence on {device.Type.ToString()}");

            var vocab_iter = TorchText.Datasets.WikiText2("train", _dataLocation);
            var tokenizer  = TorchText.Data.Utils.get_tokenizer("basic_english");

            var counter = new TorchText.Vocab.Counter<string>();

            foreach (var item in vocab_iter)
            {
                counter.update(tokenizer(item));
            }

            var vocab = new TorchText.Vocab.Vocab(counter);

            var (train_iter, valid_iter, test_iter) = TorchText.Datasets.WikiText2(_dataLocation);

            var train_data = Batchify(ProcessInput(train_iter, tokenizer, vocab), batch_size).to(device);
            var valid_data = Batchify(ProcessInput(valid_iter, tokenizer, vocab), eval_batch_size).to(device);
            var test_data  = Batchify(ProcessInput(test_iter, tokenizer, vocab), eval_batch_size).to(device);

            var bptt = 32;

            var (data, targets) = GetBatch(train_data, 0, bptt);

            var ntokens = vocab.Count;

            var model     = new TransformerModel(ntokens, emsize, nhead, nhid, nlayers, dropout).to(device);
            var loss      = cross_entropy_loss();
            var lr        = 2.50;
            var optimizer = NN.Optimizer.SGD(model.parameters(), lr);
            var scheduler = NN.Optimizer.StepLR(optimizer, 1, 0.95, last_epoch: 15);

            var totalTime = new Stopwatch();

            totalTime.Start();

            foreach (var epoch in Enumerable.Range(1, epochs))
            {
                var sw = new Stopwatch();
                sw.Start();

                train(epoch, train_data, model, loss, bptt, ntokens, optimizer);

                var val_loss = evaluate(valid_data, model, loss, lr, bptt, ntokens, optimizer);
                sw.Stop();

                Console.WriteLine($"\nEnd of epoch: {epoch} | lr: {scheduler.LearningRate:0.00} | time: {sw.Elapsed.TotalSeconds:0.0}s | loss: {val_loss:0.00}\n");
                scheduler.step();
            }

            var tst_loss = evaluate(test_data, model, loss, lr, bptt, ntokens, optimizer);

            totalTime.Stop();

            Console.WriteLine($"\nEnd of training | time: {totalTime.Elapsed.TotalSeconds:0.0}s | loss: {tst_loss:0.00}\n");
        }
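The listing also leaves out the hyperparameter fields (emsize, nhead, nhid, nlayers, dropout, epochs, batch_size, eval_batch_size, _dataLocation) and the Batchify / ProcessInput helpers that Main uses. A minimal sketch with placeholder values follows the usual flat-token-stream layout; every name, value, and signature here is an assumption rather than part of the original example.

        // Assumed hyperparameters; the example references these fields but does not show them.
        private static readonly string _dataLocation = "path/to/wikitext-2"; // placeholder path
        private const int emsize = 200, nhid = 200, nlayers = 2, nhead = 2;
        private const double dropout = 0.2;
        private const int batch_size = 64, eval_batch_size = 32, epochs = 10;

        // Sketch of ProcessInput (assumed): tokenize each line and map tokens to vocabulary indices.
        private static Tensor ProcessInput(IEnumerable<string> iter, Func<string, IEnumerable<string>> tokenizer, TorchText.Vocab.Vocab vocab)
        {
            var ids = new List<long>();
            foreach (var line in iter)
            {
                foreach (var token in tokenizer(line))
                {
                    ids.Add(vocab[token]);
                }
            }
            return torch.tensor(ids.ToArray());
        }

        // Sketch of Batchify (assumed): trim the flat token stream to a whole number of
        // columns and reshape it to (sequence_length, batch_size).
        private static Tensor Batchify(Tensor data, int batchSize)
        {
            var nbatch = data.shape[0] / batchSize;
            return data.narrow(0, 0, nbatch * batchSize).view(batchSize, -1).t().contiguous();
        }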