/// <summary>
/// Calculate the loss and initial gradients.
/// </summary>
/// <param name="sender">Specifies the MemoryLoss layer firing the event.</param>
/// <param name="e">Specifies the arguments with the Bottom(s) flowing into the MemoryLoss layer and the loss value to be filled out.</param>
/// <remarks>
/// The initial gradient is calculated such that it encourages the action that was taken to be taken again.
///
/// When using a Sigmoid, the gradient = (action=0) ? 1 - Aprob : 0 - Aprob.
/// When using a Softmax, the gradient = the SoftmaxCrossEntropyLoss backward.
///
/// @see [CS231n Convolutional Neural Networks for Visual Recognition](http://cs231n.github.io/neural-networks-2/#losses) by Karpathy, Stanford University
///
/// Regardless of the gradient used, the gradient is then modulated by multiplying it with the discounted rewards.
/// </remarks>
private void memLoss_OnGetLoss(object sender, MemoryLossLayerGetLossArgs<T> e)
{
    if (m_bSkipLoss)
        return;

    int nCount = m_blobPolicyGradient.count();
    long hActionOneHot = m_blobActionOneHot.gpu_data;
    long hPolicyGrad = m_blobPolicyGradient.mutable_gpu_data;
    long hDiscountedR = m_blobDiscountedR.gpu_data;
    double dfLoss;
    Blob<T> blobOriginalBottom = e.Bottom[0];
    int nDataSize = e.Bottom[0].count(1);
    bool bUsingEndData = false;

    // When using a recurrent model and receiving data with more than one sequence,
    // copy the data and only use the last item in the sequence.
    if (m_nRecurrentSequenceLength > 1)
    {
        if (e.Bottom[0].num > 1)
        {
            m_blobAprobLogit.CopyFrom(e.Bottom[0], false, true);
            m_blobAprobLogit.CopyFrom(e.Bottom[0], true);

            List<int> rgShape = e.Bottom[0].shape();
            rgShape[0] = 1;
            e.Bottom[0].Reshape(rgShape);
            e.Bottom[0].CopyFrom(m_blobAprobLogit, (m_blobAprobLogit.num - 1) * nDataSize, 0, nDataSize, true, true);
            bUsingEndData = true;
        }
    }

    long hBottomDiff = e.Bottom[0].mutable_gpu_diff;

    // Calculate the initial gradients (the policy gradient initially just contains the action probabilities).
    if (m_softmax != null)
    {
        BlobCollection<T> colBottom = new BlobCollection<T>();
        BlobCollection<T> colTop = new BlobCollection<T>();

        colBottom.Add(e.Bottom[0]);         // aprob logit
        colBottom.Add(m_blobActionOneHot);  // action one-hot vectors
        colTop.Add(m_blobLoss);
        colTop.Add(m_blobPolicyGradient);

        if (!m_bSoftmaxCeSetup)
        {
            m_softmaxCe.Setup(colBottom, colTop);
            m_bSoftmaxCeSetup = true;
        }

        dfLoss = m_softmaxCe.Forward(colBottom, colTop);
        m_softmaxCe.Backward(colTop, new List<bool>() { true, false }, colBottom);
        hPolicyGrad = colBottom[0].gpu_diff;
    }
    else
    {
        // Calculate (a=0) ? 1-aprob : 0-aprob
        m_mycaffe.Cuda.add_scalar(nCount, -1.0, hActionOneHot);              // invert the one-hot
        m_mycaffe.Cuda.abs(nCount, hActionOneHot, hActionOneHot);
        m_mycaffe.Cuda.mul_scalar(nCount, -1.0, hPolicyGrad);                // negate Aprob
        m_mycaffe.Cuda.add(nCount, hActionOneHot, hPolicyGrad, hPolicyGrad); // gradient = ((a=0) ? 1 : 0) - Aprob

        dfLoss = Utility.ConvertVal<T>(m_blobPolicyGradient.sumsq_data());

        m_mycaffe.Cuda.mul_scalar(nCount, -1.0, hPolicyGrad);                // negate, because ApplyUpdate subtracts the gradients
    }

    // Modulate the gradient with the advantage (the PG magic happens right here).
    m_mycaffe.Cuda.mul(nCount, hPolicyGrad, hDiscountedR, hPolicyGrad);

    e.Loss = dfLoss;
    e.EnableLossUpdate = false; // apply gradients to the bottom directly.

    if (hPolicyGrad != hBottomDiff)
        m_mycaffe.Cuda.copy(nCount, hPolicyGrad, hBottomDiff);

    // When using a recurrent model with more than one sequence of data, only
    // copy the diff to the last item in the sequence and zero out the rest of the sequence.
    if (m_nRecurrentSequenceLength > 1 && bUsingEndData)
    {
        m_blobAprobLogit.SetDiff(0);
        m_blobAprobLogit.CopyFrom(e.Bottom[0], 0, (m_blobAprobLogit.num - 1) * nDataSize, nDataSize, false, true);
        e.Bottom[0].CopyFrom(m_blobAprobLogit, false, true);
        e.Bottom[0].CopyFrom(m_blobAprobLogit, true);
    }
}
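// The following is a hypothetical CPU reference sketch, not part of the trainer above: it mirrors
// the sigmoid (else) branch using plain arrays so the arithmetic is easy to follow.  The method
// name and the 'aprob', 'actionOneHot' and 'discountedR' inputs are illustrative assumptions; the
// real code performs the same steps with CUDA kernels on the GPU memory handles.
private static double[] ComputeSigmoidPolicyGradientSketch(double[] aprob, double[] actionOneHot, double[] discountedR)
{
    double[] rgGrad = new double[aprob.Length];

    for (int i = 0; i < aprob.Length; i++)
    {
        // (action = 0) ? 1 - Aprob : 0 - Aprob, i.e. (1 - onehot) - Aprob.
        double dfGrad = (1.0 - actionOneHot[i]) - aprob[i];

        // Negate because the solver's ApplyUpdate subtracts the gradient, then
        // modulate with the discounted reward (the 'PG magic' step above).
        rgGrad[i] = -dfGrad * discountedR[i];
    }

    return rgGrad;
}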
/// <summary>
/// Calculate the loss and initial gradients.
/// </summary>
/// <param name="sender">Specifies the MemoryLoss layer firing the event.</param>
/// <param name="e">Specifies the arguments with the Bottom(s) flowing into the MemoryLoss layer and the loss value to be filled out.</param>
/// <remarks>
/// The initial gradient is calculated such that it encourages the action that was taken to be taken again.
///
/// When using a Sigmoid, the gradient = (action=0) ? 1 - Aprob : 0 - Aprob.
/// When using a Softmax, the gradient = the SoftmaxCrossEntropyLoss backward.
///
/// @see [CS231n Convolutional Neural Networks for Visual Recognition](http://cs231n.github.io/neural-networks-2/#losses) by Karpathy, Stanford University
///
/// Regardless of the gradient used, the gradient is then modulated by multiplying it with the discounted rewards.
/// </remarks>
private void memLoss_OnGetLoss(object sender, MemoryLossLayerGetLossArgs<T> e)
{
    if (m_bSkipLoss)
        return;

    int nCount = m_blobPolicyGradient.count();
    long hActionOneHot = m_blobActionOneHot.gpu_data;
    long hPolicyGrad = m_blobPolicyGradient.mutable_gpu_data;
    long hBottomDiff = e.Bottom[0].mutable_gpu_diff;
    long hDiscountedR = m_blobDiscountedR.gpu_data;
    double dfLoss;

    // Calculate the initial gradients (the policy gradient initially just contains the action probabilities).
    if (m_softmax != null)
    {
        BlobCollection<T> colBottom = new BlobCollection<T>();
        BlobCollection<T> colTop = new BlobCollection<T>();

        colBottom.Add(e.Bottom[0]);         // aprob logit
        colBottom.Add(m_blobActionOneHot);  // action one-hot vectors
        colTop.Add(m_blobLoss);
        colTop.Add(m_blobPolicyGradient);

        if (!m_bSoftmaxCeSetup)
        {
            m_softmaxCe.Setup(colBottom, colTop);
            m_bSoftmaxCeSetup = true;
        }

        dfLoss = m_softmaxCe.Forward(colBottom, colTop);
        m_softmaxCe.Backward(colTop, new List<bool>() { true, false }, colBottom);
        hPolicyGrad = colBottom[0].gpu_diff;
    }
    else
    {
        // Calculate (a=0) ? 1-aprob : 0-aprob
        m_mycaffe.Cuda.add_scalar(nCount, -1.0, hActionOneHot);              // invert the one-hot
        m_mycaffe.Cuda.abs(nCount, hActionOneHot, hActionOneHot);
        m_mycaffe.Cuda.mul_scalar(nCount, -1.0, hPolicyGrad);                // negate Aprob
        m_mycaffe.Cuda.add(nCount, hActionOneHot, hPolicyGrad, hPolicyGrad); // gradient = ((a=0) ? 1 : 0) - Aprob

        dfLoss = Utility.ConvertVal<T>(m_blobPolicyGradient.sumsq_data());

        m_mycaffe.Cuda.mul_scalar(nCount, -1.0, hPolicyGrad);                // negate, because ApplyUpdate subtracts the gradients
    }

    // Modulate the gradient with the advantage (the PG magic happens right here).
    m_mycaffe.Cuda.mul(nCount, hPolicyGrad, hDiscountedR, hPolicyGrad);

    e.Loss = dfLoss;
    e.EnableLossUpdate = false; // apply gradients to the bottom directly.

    if (hPolicyGrad != hBottomDiff)
        m_mycaffe.Cuda.copy(nCount, hPolicyGrad, hBottomDiff);
}
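// The following is a hypothetical CPU reference sketch, not part of the trainer above: it shows what
// the SoftmaxCrossEntropyLoss backward pass amounts to for a single sample.  The method name and the
// 'logits', 'actionOneHot' and 'discountedR' inputs are illustrative assumptions (System.Math is
// assumed available); the real code delegates to m_softmaxCe and runs on the GPU.
private static double[] ComputeSoftmaxPolicyGradientSketch(double[] logits, double[] actionOneHot, double discountedR)
{
    int nCount = logits.Length;
    double[] rgProb = new double[nCount];
    double dfMax = double.MinValue;
    double dfSum = 0;

    // Numerically stable softmax over the action logits.
    for (int i = 0; i < nCount; i++)
        dfMax = Math.Max(dfMax, logits[i]);

    for (int i = 0; i < nCount; i++)
    {
        rgProb[i] = Math.Exp(logits[i] - dfMax);
        dfSum += rgProb[i];
    }

    double[] rgGrad = new double[nCount];

    for (int i = 0; i < nCount; i++)
    {
        rgProb[i] /= dfSum;
        // Cross-entropy backward: softmax(logit) - onehot, then modulate with the discounted reward.
        rgGrad[i] = (rgProb[i] - actionOneHot[i]) * discountedR;
    }

    return rgGrad;
}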