/// <summary>
/// Runs the model over the evaluation data and returns the average per-element loss.
/// </summary>
/// <param name="eval_data">Batched evaluation data; dimension 0 is the time axis.</param>
/// <param name="model">The transformer model; switched to inference mode here.</param>
/// <param name="criterion">Loss function applied to the flattened logits.</param>
/// <param name="lr">Unused; kept for signature compatibility with existing callers.</param>
/// <param name="bptt">Sequence length per batch (back-prop-through-time window).</param>
/// <param name="ntokens">Vocabulary size, used to reshape the output logits.</param>
/// <param name="optimizer">Unused during evaluation; kept for signature compatibility.</param>
/// <returns>Total loss weighted by slice length, divided by the data length.</returns>
private static double evaluate(TorchTensor eval_data, TransformerModel model, Loss criterion, double lr, int bptt, int ntokens, Optimizer optimizer)
{
    model.Eval();

    var total_loss = 0.0f;
    var src_mask = model.GenerateSquareSubsequentMask(bptt);
    var batch = 0;

    try {
        for (int i = 0; i < eval_data.shape[0] - 1; batch++, i += bptt) {
            var (data, targets) = GetBatch(eval_data, i, bptt);

            // The final slice may be shorter than bptt; regenerate the mask to match.
            if (data.shape[0] != bptt) {
                src_mask.Dispose();
                src_mask = model.GenerateSquareSubsequentMask(data.shape[0]);
            }

            // Fix: the original leaked output/loss native tensors every iteration,
            // relying solely on GC.Collect() finalization to reclaim them.
            using (var output = model.forward(data, src_mask))
            using (var loss = criterion(output.view(-1, ntokens), targets)) {
                // Weight by slice length so a short final batch doesn't skew the average.
                total_loss += data.shape[0] * loss.to(Device.CPU).DataItem<float>();
            }

            data.Dispose();
            targets.Dispose();

            // Intermediate native tensors (view/to results) are only reclaimed by
            // finalizers in this API version; collect to keep memory bounded.
            GC.Collect();
        }
    } finally {
        // Fix: the mask was previously leaked on method exit.
        src_mask.Dispose();
    }

    return total_loss / eval_data.shape[0];
}
/// <summary>
/// Runs the model over the evaluation data and returns the average per-element loss.
/// Per-iteration tensors are released via a dispose scope; the attention mask is
/// kept alive across iterations for reuse.
/// </summary>
/// <param name="eval_data">Batched evaluation data; dimension 0 is the time axis.</param>
/// <param name="model">The transformer model; switched to inference mode here.</param>
/// <param name="criterion">Loss function applied to the flattened logits.</param>
/// <param name="bptt">Sequence length per batch (back-prop-through-time window).</param>
/// <param name="ntokens">Vocabulary size, used to reshape the output logits.</param>
/// <param name="optimizer">Unused during evaluation; kept for signature compatibility.</param>
/// <returns>Total loss weighted by slice length, divided by the data length.</returns>
private static double evaluate(Tensor eval_data, TransformerModel model, Loss criterion, int bptt, int ntokens, torch.optim.Optimizer optimizer)
{
    model.eval();

    // Fix: evaluation does not need gradients; disabling autograd avoids
    // building the computation graph and cuts memory use per batch.
    using (torch.no_grad())
    using (var d = torch.NewDisposeScope()) {
        var src_mask = model.GenerateSquareSubsequentMask(bptt);
        var total_loss = 0.0f;
        var batch = 0;

        for (int i = 0; i < eval_data.shape[0] - 1; batch++, i += bptt) {
            var (data, targets) = GetBatch(eval_data, i, bptt);

            // The final slice may be shorter than bptt; regenerate the mask to match.
            // The superseded mask stays in the scope and is disposed below.
            if (data.shape[0] != bptt) {
                src_mask = model.GenerateSquareSubsequentMask(data.shape[0]);
            }

            using (var output = model.forward(data, src_mask)) {
                var loss = criterion(output.view(-1, ntokens), targets);
                // Weight by slice length so a short final batch doesn't skew the average.
                total_loss += data.shape[0] * loss.to(torch.CPU).item<float>();
            }

            data.Dispose();
            targets.Dispose();

            // Release every per-iteration tensor except the reusable mask.
            d.DisposeEverythingBut(src_mask);
        }

        return total_loss / eval_data.shape[0];
    }
}
/// <summary>
/// Runs one training epoch over <paramref name="train_data"/>, logging the
/// running loss every 200 batches. Per-iteration tensors are freed through a
/// dispose scope; the attention mask survives across iterations for reuse.
/// </summary>
/// <param name="epoch">Epoch number, used only for log output.</param>
/// <param name="train_data">Batched training data; dimension 0 is the time axis.</param>
/// <param name="model">The transformer model; switched to training mode here.</param>
/// <param name="criterion">Loss function applied to the flattened logits.</param>
/// <param name="bptt">Sequence length per batch (back-prop-through-time window).</param>
/// <param name="ntokens">Vocabulary size, used to reshape the output logits.</param>
/// <param name="optimizer">Optimizer stepped once per batch.</param>
private static void train(int epoch, Tensor train_data, TransformerModel model, Loss criterion, int bptt, int ntokens, torch.optim.Optimizer optimizer)
{
    model.train();

    using (var scope = torch.NewDisposeScope()) {
        const int logInterval = 200;
        var runningLoss = 0.0f;
        var mask = model.GenerateSquareSubsequentMask(bptt);
        var dataLen = train_data.shape[0];

        var batchIndex = 0;
        var offset = 0;
        while (offset < dataLen - 1) {
            var (input, labels) = GetBatch(train_data, offset, bptt);
            optimizer.zero_grad();

            // A short final slice needs a matching (smaller) attention mask;
            // the superseded mask is reclaimed by the scope below.
            if (input.shape[0] != bptt) {
                mask = model.GenerateSquareSubsequentMask(input.shape[0]);
            }

            using (var logits = model.forward(input, mask)) {
                var loss = criterion(logits.view(-1, ntokens), labels);
                loss.backward();
                // Clip gradients before stepping to keep training stable.
                torch.nn.utils.clip_grad_norm_(model.parameters().ToArray(), 0.5);
                optimizer.step();
                runningLoss += loss.to(torch.CPU).item<float>();
            }

            if (batchIndex % logInterval == 0 && batchIndex > 0) {
                var cur_loss = runningLoss / logInterval;
                Console.WriteLine($"epoch: {epoch} | batch: {batchIndex} / {dataLen / bptt} | loss: {cur_loss:0.00}");
                runningLoss = 0;
            }

            // Free all per-iteration tensors except the reusable mask.
            scope.DisposeEverythingBut(mask);

            batchIndex++;
            offset += bptt;
        }
    }
}
/// <summary>
/// Runs one training epoch over <paramref name="train_data"/>, logging the
/// running loss every 200 batches.
/// </summary>
/// <param name="epoch">Epoch number, used only for log output.</param>
/// <param name="train_data">Batched training data; dimension 0 is the time axis.</param>
/// <param name="model">The transformer model; switched to training mode here.</param>
/// <param name="criterion">Loss function applied to the flattened logits.</param>
/// <param name="bptt">Sequence length per batch (back-prop-through-time window).</param>
/// <param name="ntokens">Vocabulary size, used to reshape the output logits.</param>
/// <param name="optimizer">Optimizer stepped once per batch.</param>
private static void train(int epoch, TorchTensor train_data, TransformerModel model, Loss criterion, int bptt, int ntokens, Optimizer optimizer)
{
    model.Train();

    var total_loss = 0.0f;
    var src_mask = model.GenerateSquareSubsequentMask(bptt);
    var batch = 0;
    var log_interval = 200;
    var tdlen = train_data.shape[0];

    try {
        for (int i = 0; i < tdlen - 1; batch++, i += bptt) {
            var (data, targets) = GetBatch(train_data, i, bptt);
            optimizer.zero_grad();

            // The final slice may be shorter than bptt; regenerate the mask to match.
            if (data.shape[0] != bptt) {
                src_mask.Dispose();
                src_mask = model.GenerateSquareSubsequentMask(data.shape[0]);
            }

            // Fix: the original leaked output/loss (and data/targets) native tensors,
            // relying solely on GC.Collect() finalization to reclaim them.
            using (var output = model.forward(data, src_mask))
            using (var loss = criterion(output.view(-1, ntokens), targets)) {
                loss.backward();
                // Clip gradients before stepping to keep training stable.
                model.parameters().clip_grad_norm(0.5);
                optimizer.step();
                total_loss += loss.to(Device.CPU).DataItem<float>();
            }

            data.Dispose();
            targets.Dispose();

            // Intermediate native tensors (view/to results, gradients) are only
            // reclaimed by finalizers in this API version; collect to bound memory.
            GC.Collect();

            if (batch % log_interval == 0 && batch > 0) {
                var cur_loss = total_loss / log_interval;
                Console.WriteLine($"epoch: {epoch} | batch: {batch} / {tdlen/bptt} | loss: {cur_loss:0.00}");
                total_loss = 0;
            }
        }
    } finally {
        // Fix: the mask was previously leaked on method exit.
        src_mask.Dispose();
    }
}