/// <summary>
/// Trains the network on a batch of transitions: first builds one input and one
/// (target, output index) pair per transition, then issues one <c>_net.SGD</c>
/// call per pair, in batch order.
/// </summary>
/// <remarks>
/// The target for a transition is its reward when the next state is terminal,
/// otherwise reward + <c>Discount</c> * <c>QMax(NextState)</c>.
/// </remarks>
/// <param name="batch">Transitions to train on; each element yields exactly one SGD step.</param>
public override void TrainModel(List<SARS> batch)
{
    int count = batch.Count;
    var inputs = new StatePair[count];
    var targets = new TargetIndexPair[count];

    // Phase 1: compute every target before any weight update is applied.
    // NOTE(review): if QMax reads from _net (not visible here), this ordering keeps
    // all targets based on the pre-update network; do not merge the two loops
    // without confirming.
    for (int k = 0; k < count; k++)
    {
        var transition = batch[k];
        inputs[k] = transition.State.Features;

        // No bootstrapped term when the next state is terminal.
        float target = transition.NextState.IsTerminal
            ? transition.Reward
            : transition.Reward + Discount * QMax(transition.NextState);

        // _amap translates the action id into the index stored alongside the target.
        targets[k] = new TargetIndexPair(target, _amap[transition.Action.ActionId]);
    }

    // Phase 2: one SGD call per prepared sample.
    for (int k = 0; k < count; k++)
    {
        _net.SGD(inputs[k], targets[k]);
    }
}
/// <summary>
/// Performs one training step for a single sample: clears <c>_loss</c>, records the
/// difference between <c>p.Target</c> and the computed output at <c>p.Index</c>,
/// then pushes that loss through the output, split, and the two backward visitors.
/// Sets <see cref="IsOutputFromTraining"/> to <c>true</c> when done.
/// </summary>
/// <param name="input">Input passed to <c>Compute</c>.</param>
/// <param name="p">Target value and the output index it applies to.</param>
public void SGD(StatePair input, TargetIndexPair p)
{
    // Reset the loss first, then fill in only the entry for the selected output.
    _loss.Clear();

    // NOTE(review): the `true` argument presumably selects training mode; confirm against Compute.
    var outputs = Compute(input, true);
    var error = p.Target - outputs[p.Index];

    // NOTE(review): At(index, value) appears to store `value` at `index`; confirm.
    _loss.At(p.Index, error);

    // Propagate the loss back through the output stage, then split it into two parts.
    var outputGradient = _outback.Visit(_loss, _params);
    var split = _split.Visit(outputGradient, _params);

    // The left part is unflattened and handed to _backprop; the right part goes to
    // _hiddenBackprop. The order of these two calls is kept as-is.
    var unflattened = _unflatten.Visit(split.left, _params);
    _backprop.BackPropagation(unflattened, _params);
    _hiddenBackprop.Visit(split.right, _params);

    IsOutputFromTraining = true;
}