// Training pass using the DisposeScope pattern: every tensor created inside
// the scope is disposed at the end of each iteration, except the cached mask.
private static void train(int epoch, Tensor train_data, TransformerModel model, Loss criterion, int bptt, int ntokens, torch.optim.Optimizer optimizer)
{
    model.train();

    using (var d = torch.NewDisposeScope()) {

        var total_loss = 0.0f;
        var batch = 0;
        var log_interval = 200;

        var src_mask = model.GenerateSquareSubsequentMask(bptt);

        var tdlen = train_data.shape[0];

        for (int i = 0; i < tdlen - 1; batch++, i += bptt) {

            var (data, targets) = GetBatch(train_data, i, bptt);
            optimizer.zero_grad();

            // The last batch may be shorter than bptt, so the mask must be rebuilt.
            if (data.shape[0] != bptt) {
                src_mask = model.GenerateSquareSubsequentMask(data.shape[0]);
            }

            using (var output = model.forward(data, src_mask)) {
                var loss = criterion(output.view(-1, ntokens), targets);
                loss.backward();
                torch.nn.utils.clip_grad_norm_(model.parameters().ToArray(), 0.5);
                optimizer.step();

                total_loss += loss.to(torch.CPU).item<float>();
            }

            if (batch % log_interval == 0 && batch > 0) {
                var cur_loss = total_loss / log_interval;
                Console.WriteLine($"epoch: {epoch} | batch: {batch} / {tdlen / bptt} | loss: {cur_loss:0.00}");
                total_loss = 0;
            }

            // Free every intermediate tensor created this iteration, keeping only the mask.
            d.DisposeEverythingBut(src_mask);
        }
    }
}
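// train pulls its minibatches from a GetBatch helper that is not shown in
// this excerpt. The sketch below is an assumption of its shape, written
// against the current TorchSharp API (Tensor, TensorIndex), not the exact
// code: it slices out up to bptt time steps as the input, and the same slice
// shifted one token ahead, flattened, as the cross-entropy target.
private static (Tensor, Tensor) GetBatch(Tensor source, int index, int bptt)
{
    // Clamp the slice so the final batch at the end of the data can be shorter.
    var len = Math.Min(bptt, source.shape[0] - 1 - index);
    var data = source[TensorIndex.Slice(index, index + len)];
    // Targets are the inputs shifted one position ahead.
    var targets = source[TensorIndex.Slice(index + 1, index + 1 + len)].reshape(-1);
    return (data, targets);
}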
// Older-API variant of the same training pass (TorchTensor, Device, DataItem).
// It manages native memory with explicit Dispose/using plus a GC.Collect per
// iteration instead of a dispose scope.
private static void train(int epoch, TorchTensor train_data, TransformerModel model, Loss criterion, int bptt, int ntokens, Optimizer optimizer)
{
    model.Train();

    var total_loss = 0.0f;

    var src_mask = model.GenerateSquareSubsequentMask(bptt);

    var batch = 0;
    var log_interval = 200;

    var tdlen = train_data.shape[0];

    for (int i = 0; i < tdlen - 1; batch++, i += bptt) {

        var (data, targets) = GetBatch(train_data, i, bptt);
        optimizer.zero_grad();

        // Dispose the old mask before replacing it for a short final batch.
        if (data.shape[0] != bptt) {
            src_mask.Dispose();
            src_mask = model.GenerateSquareSubsequentMask(data.shape[0]);
        }

        using (var output = model.forward(data, src_mask))
        using (var loss = criterion(output.view(-1, ntokens), targets)) {
            loss.backward();
            model.parameters().clip_grad_norm(0.5);
            optimizer.step();

            total_loss += loss.to(Device.CPU).DataItem<float>();
        }

        // Reclaim any remaining native tensor memory from this iteration.
        GC.Collect();

        if (batch % log_interval == 0 && batch > 0) {
            var cur_loss = total_loss / log_interval;
            Console.WriteLine($"epoch: {epoch} | batch: {batch} / {tdlen / bptt} | loss: {cur_loss:0.00}");
            total_loss = 0;
        }
    }
}
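// Main (below) also calls an evaluate helper that is not part of this
// excerpt. The following is a minimal sketch, not the exact implementation:
// it mirrors the DisposeScope train variant above, matches the call site's
// signature (lr and optimizer are accepted but unused in a pure evaluation
// pass), and assumes the current TorchSharp API.
private static double evaluate(Tensor eval_data, TransformerModel model, Loss criterion, double lr, int bptt, int ntokens, torch.optim.Optimizer optimizer)
{
    model.eval();

    // No gradients are needed when only measuring the loss.
    using (torch.no_grad())
    using (var d = torch.NewDisposeScope()) {

        var total_loss = 0.0f;
        var src_mask = model.GenerateSquareSubsequentMask(bptt);

        for (int i = 0; i < eval_data.shape[0] - 1; i += bptt) {

            var (data, targets) = GetBatch(eval_data, i, bptt);

            if (data.shape[0] != bptt) {
                src_mask = model.GenerateSquareSubsequentMask(data.shape[0]);
            }

            using (var output = model.forward(data, src_mask)) {
                var loss = criterion(output.view(-1, ntokens), targets);
                // Weight by batch length so the short tail batch counts proportionally.
                total_loss += data.shape[0] * loss.to(torch.CPU).item<float>();
            }

            d.DisposeEverythingBut(src_mask);
        }

        return total_loss / eval_data.shape[0];
    }
}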
static void Main(string[] args)
{
    Torch.SetSeed(1);

    var cwd = Environment.CurrentDirectory;

    var device = Torch.IsCudaAvailable() ? Device.CUDA : Device.CPU;
    Console.WriteLine($"Running SequenceToSequence on {device.Type.ToString()}");

    // Build the vocabulary from the training split of WikiText2.
    var vocab_iter = TorchText.Datasets.WikiText2("train", _dataLocation);
    var tokenizer = TorchText.Data.Utils.get_tokenizer("basic_english");

    var counter = new TorchText.Vocab.Counter<string>();
    foreach (var item in vocab_iter) {
        counter.update(tokenizer(item));
    }

    var vocab = new TorchText.Vocab.Vocab(counter);

    // Tokenize all three splits and lay them out batch-major on the target device.
    var (train_iter, valid_iter, test_iter) = TorchText.Datasets.WikiText2(_dataLocation);

    var train_data = Batchify(ProcessInput(train_iter, tokenizer, vocab), batch_size).to(device);
    var valid_data = Batchify(ProcessInput(valid_iter, tokenizer, vocab), eval_batch_size).to(device);
    var test_data = Batchify(ProcessInput(test_iter, tokenizer, vocab), eval_batch_size).to(device);

    var bptt = 32;

    var (data, targets) = GetBatch(train_data, 0, bptt);

    var ntokens = vocab.Count;

    var model = new TransformerModel(ntokens, emsize, nhead, nhid, nlayers, dropout).to(device);
    var loss = cross_entropy_loss();

    var lr = 2.50;
    var optimizer = NN.Optimizer.SGD(model.parameters(), lr);
    var scheduler = NN.Optimizer.StepLR(optimizer, 1, 0.95, last_epoch: 15);

    var totalTime = new Stopwatch();
    totalTime.Start();

    foreach (var epoch in Enumerable.Range(1, epochs)) {

        var sw = new Stopwatch();
        sw.Start();

        train(epoch, train_data, model, loss, bptt, ntokens, optimizer);

        var val_loss = evaluate(valid_data, model, loss, lr, bptt, ntokens, optimizer);
        sw.Stop();

        Console.WriteLine($"\nEnd of epoch: {epoch} | lr: {scheduler.LearningRate:0.00} | time: {sw.Elapsed.TotalSeconds:0.0}s | loss: {val_loss:0.00}\n");
        scheduler.step();
    }

    var tst_loss = evaluate(test_data, model, loss, lr, bptt, ntokens, optimizer);
    totalTime.Stop();

    Console.WriteLine($"\nEnd of training | time: {totalTime.Elapsed.TotalSeconds:0.0}s | loss: {tst_loss:0.00}\n");
}
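// Main also depends on Batchify and ProcessInput helpers that are not part of
// this excerpt. The sketches below are assumptions, written against the
// current TorchSharp API and assuming System.Linq plus the usual TorchSharp
// usings are in scope; they are not the exact code behind the calls above.

// Trim the token stream so it divides evenly, then lay it out as
// (sequence_length, batch_size) columns, the shape the transformer expects.
private static Tensor Batchify(Tensor data, int batch_size)
{
    var nbatch = data.shape[0] / batch_size;
    return data.narrow(0, 0, nbatch * batch_size).view(batch_size, -1).t().contiguous();
}

// Tokenize each line, map tokens to vocabulary ids (this assumes Vocab exposes
// an integer indexer by token), and concatenate everything into one flat 1-D
// tensor of ids, skipping empty lines.
private static Tensor ProcessInput(IEnumerable<string> iter, Func<string, IEnumerable<string>> tokenizer, TorchText.Vocab.Vocab vocab)
{
    var lines = new List<Tensor>();
    foreach (var line in iter) {
        var ids = tokenizer(line).Select(token => (long)vocab[token]).ToArray();
        if (ids.Length > 0) {
            lines.Add(torch.tensor(ids));
        }
    }
    return torch.cat(lines, 0);
}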