Example #1
        /// <summary>
        /// Calculate the loss and initial gradients.
        /// </summary>
        /// <param name="sender">Specifies the MemoryLoss layer firing the event.</param>
        /// <param name="e">Specifies the arguments with the Bottom(s) flowing into the MemoryLoss layer and the loss value to be filled out.</param>
        /// <remarks>
        /// The initial gradient is calculated such that it encourages the action that was taken to be taken.
        ///
        /// When using a Sigmoid, the gradient = (action=0) ? 1 - Aprob : 0 - Aprob.
        /// When using a Softmax, the gradient = the SoftmaxCrossEntropyLoss backward.
        ///
        /// @see [CS231n Convolutional Neural Networks for Visual Recognition](http://cs231n.github.io/neural-networks-2/#losses) by Karpathy, Stanford University
        ///
        /// Regardless of the gradient used, the gradient is then modulated by multiplying it with the discounted rewards.
        /// </remarks>
        private void memLoss_OnGetLoss(object sender, MemoryLossLayerGetLossArgs<T> e)
        {
            if (m_bSkipLoss)
            {
                return;
            }

            int      nCount        = m_blobPolicyGradient.count();
            long     hActionOneHot = m_blobActionOneHot.gpu_data;
            long     hPolicyGrad   = m_blobPolicyGradient.mutable_gpu_data;
            long     hDiscountedR  = m_blobDiscountedR.gpu_data;
            double   dfLoss;
            Blob<T>  blobOriginalBottom = e.Bottom[0];
            int      nDataSize          = e.Bottom[0].count(1);
            bool     bUsingEndData      = false;

            // When using a recurrent model and receiving data with more than one sequence,
            // copy the data and use only the last entry in the sequence.
            if (m_nRecurrentSequenceLength > 1)
            {
                if (e.Bottom[0].num > 1)
                {
                    // Save the full sequence (data and diff) so the bottom can be restored below.
                    m_blobAprobLogit.CopyFrom(e.Bottom[0], false, true);
                    m_blobAprobLogit.CopyFrom(e.Bottom[0], true);

                    List<int> rgShape = e.Bottom[0].shape();
                    rgShape[0] = 1;
                    e.Bottom[0].Reshape(rgShape);
                    e.Bottom[0].CopyFrom(m_blobAprobLogit, (m_blobAprobLogit.num - 1) * nDataSize, 0, nDataSize, true, true);
                    bUsingEndData = true;
                }
            }

            long hBottomDiff = e.Bottom[0].mutable_gpu_diff;

            // Calculate the initial gradients (policy grad initially just contains the action probabilities)
            if (m_softmax != null)
            {
                BlobCollection<T> colBottom = new BlobCollection<T>();
                BlobCollection<T> colTop    = new BlobCollection<T>();

                colBottom.Add(e.Bottom[0]);             // aprob logit
                colBottom.Add(m_blobActionOneHot);      // action one-hot vectors
                colTop.Add(m_blobLoss);
                colTop.Add(m_blobPolicyGradient);

                if (!m_bSoftmaxCeSetup)
                {
                    m_softmaxCe.Setup(colBottom, colTop);
                    m_bSoftmaxCeSetup = true;
                }

                dfLoss = m_softmaxCe.Forward(colBottom, colTop);
                m_softmaxCe.Backward(colTop, new List<bool>() { true, false }, colBottom);
                hPolicyGrad = colBottom[0].gpu_diff;
            }
            else
            {
                // Calculate (a=0) ? 1-aprob : 0-aprob
                m_mycaffe.Cuda.add_scalar(nCount, -1.0, hActionOneHot);              // invert one hot
                m_mycaffe.Cuda.abs(nCount, hActionOneHot, hActionOneHot);
                m_mycaffe.Cuda.mul_scalar(nCount, -1.0, hPolicyGrad);                // negate Aprob
                m_mycaffe.Cuda.add(nCount, hActionOneHot, hPolicyGrad, hPolicyGrad); // gradient = ((a=0)?1:0) - Aprob
                dfLoss = Utility.ConvertVal<T>(m_blobPolicyGradient.sumsq_data());

                m_mycaffe.Cuda.mul_scalar(nCount, -1.0, hPolicyGrad); // negate because ApplyUpdate subtracts the gradients
            }

            // Modulate the gradient with the advantage (PG magic happens right here.)
            m_mycaffe.Cuda.mul(nCount, hPolicyGrad, hDiscountedR, hPolicyGrad);

            e.Loss             = dfLoss;
            e.EnableLossUpdate = false; // apply gradients to bottom directly.

            if (hPolicyGrad != hBottomDiff)
            {
                m_mycaffe.Cuda.copy(nCount, hPolicyGrad, hBottomDiff);
            }

            // When using a recurrent model with more than one sequence of data, copy the
            // diff only to the last entry in the sequence and zero out the rest.
            if (m_nRecurrentSequenceLength > 1 && bUsingEndData)
            {
                m_blobAprobLogit.SetDiff(0);
                m_blobAprobLogit.CopyFrom(e.Bottom[0], 0, (m_blobAprobLogit.num - 1) * nDataSize, nDataSize, false, true);
                e.Bottom[0].CopyFrom(m_blobAprobLogit, false, true);
                e.Bottom[0].CopyFrom(m_blobAprobLogit, true);
            }
        }
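
For reference, the following is a minimal CPU-side sketch of the sigmoid-path rule described in the remarks above; it is not MyCaffe API, and the class and method names are hypothetical. The gradient is ((action == 0) ? 1 : 0) - Aprob, negated because the solver's ApplyUpdate subtracts gradients, and then modulated by the discounted reward.

// Hypothetical helper illustrating the sigmoid-path policy gradient rule above.
public static class PolicyGradientSketch
{
    // rgAprob[i]       - predicted action probability at step i (sigmoid output).
    // rgAction[i]      - action actually taken at step i (0 or 1).
    // rgDiscountedR[i] - discounted reward assigned to step i.
    public static double[] ComputeModulatedGrad(double[] rgAprob, int[] rgAction, double[] rgDiscountedR)
    {
        double[] rgGrad = new double[rgAprob.Length];

        for (int i = 0; i < rgAprob.Length; i++)
        {
            // gradient = ((action == 0) ? 1 : 0) - Aprob, which pushes the
            // predicted probability toward the action that was taken.
            double dfGrad = ((rgAction[i] == 0) ? 1.0 : 0.0) - rgAprob[i];

            // Negate because ApplyUpdate subtracts the gradient, then modulate
            // it with the discounted reward for that step.
            rgGrad[i] = -dfGrad * rgDiscountedR[i];
        }

        return rgGrad;
    }
}
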
Example #2
        /// <summary>
        /// Calculate the loss and initial gradients.
        /// </summary>
        /// <param name="sender">Specifies the MemoryLoss layer firing the event.</param>
        /// <param name="e">Specifies the arguments with the Bottom(s) flowing into the MemoryLoss layer and the loss value to be filled out.</param>
        /// <remarks>
        /// The initial gradient is calculated such that it encourages the action that was taken to be taken.
        ///
        /// When using a Sigmoid, the gradient = (action=0) ? 1 - Aprob : 0 - Aprob.
        /// When using a Softmax, the gradient = the SoftmaxCrossEntropyLoss backward.
        ///
        /// @see [CS231n Convolutional Neural Networks for Visual Recognition](http://cs231n.github.io/neural-networks-2/#losses) by Karpathy, Stanford University
        ///
        /// Regardless of the gradient used, the gradient is then modulated by multiplying it with the discounted rewards.
        /// </remarks>
        private void memLoss_OnGetLoss(object sender, MemoryLossLayerGetLossArgs<T> e)
        {
            if (m_bSkipLoss)
            {
                return;
            }

            int    nCount        = m_blobPolicyGradient.count();
            long   hActionOneHot = m_blobActionOneHot.gpu_data;
            long   hPolicyGrad   = m_blobPolicyGradient.mutable_gpu_data;
            long   hBottomDiff   = e.Bottom[0].mutable_gpu_diff;
            long   hDiscountedR  = m_blobDiscountedR.gpu_data;
            double dfLoss;

            // Calculate the initial gradients (policy grad initially just contains the action probabilities)
            if (m_softmax != null)
            {
                BlobCollection<T> colBottom = new BlobCollection<T>();
                BlobCollection<T> colTop    = new BlobCollection<T>();

                colBottom.Add(e.Bottom[0]);             // aprob logit
                colBottom.Add(m_blobActionOneHot);      // action one-hot vectors
                colTop.Add(m_blobLoss);
                colTop.Add(m_blobPolicyGradient);

                if (!m_bSoftmaxCeSetup)
                {
                    m_softmaxCe.Setup(colBottom, colTop);
                    m_bSoftmaxCeSetup = true;
                }

                dfLoss = m_softmaxCe.Forward(colBottom, colTop);
                m_softmaxCe.Backward(colTop, new List<bool>() { true, false }, colBottom);
                hPolicyGrad = colBottom[0].gpu_diff;
            }
            else
            {
                // Calculate (a=0) ? 1-aprob : 0-aprob
                m_mycaffe.Cuda.add_scalar(nCount, -1.0, hActionOneHot);              // invert one hot
                m_mycaffe.Cuda.abs(nCount, hActionOneHot, hActionOneHot);
                m_mycaffe.Cuda.mul_scalar(nCount, -1.0, hPolicyGrad);                // negate Aprob
                m_mycaffe.Cuda.add(nCount, hActionOneHot, hPolicyGrad, hPolicyGrad); // gradient = ((a=0)?1:0) - Aprob
                dfLoss = Utility.ConvertVal<T>(m_blobPolicyGradient.sumsq_data());

                m_mycaffe.Cuda.mul_scalar(nCount, -1.0, hPolicyGrad); // negate because ApplyUpdate subtracts the gradients
            }

            // Modulate the gradient with the advantage (PG magic happens right here.)
            m_mycaffe.Cuda.mul(nCount, hPolicyGrad, hDiscountedR, hPolicyGrad);

            e.Loss             = dfLoss;
            e.EnableLossUpdate = false; // apply gradients to bottom directly.

            if (hPolicyGrad != hBottomDiff)
            {
                m_mycaffe.Cuda.copy(nCount, hPolicyGrad, hBottomDiff);
            }
        }
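
Similarly, here is a minimal CPU-side sketch of the softmax path; it is not MyCaffe API and the names are hypothetical. For a softmax combined with cross-entropy, the backward pass with respect to the logits works out to softmax(logits) - oneHot, which the handler above then modulates with the discounted reward.

using System;

// Hypothetical helper illustrating the softmax-path gradient described above.
public static class SoftmaxPolicyGradientSketch
{
    // rgLogits       - raw action scores for one step (the 'aprob logit' bottom).
    // rgActionOneHot - one-hot encoding of the action taken at that step.
    // dfDiscountedR  - discounted reward assigned to that step.
    public static double[] ComputeModulatedGrad(double[] rgLogits, double[] rgActionOneHot, double dfDiscountedR)
    {
        int nCount = rgLogits.Length;
        double[] rgProb = new double[nCount];
        double[] rgGrad = new double[nCount];

        // Numerically stable softmax over the action logits.
        double dfMax = rgLogits[0];
        for (int i = 1; i < nCount; i++)
            dfMax = Math.Max(dfMax, rgLogits[i]);

        double dfSum = 0;
        for (int i = 0; i < nCount; i++)
        {
            rgProb[i] = Math.Exp(rgLogits[i] - dfMax);
            dfSum += rgProb[i];
        }

        for (int i = 0; i < nCount; i++)
        {
            rgProb[i] /= dfSum;

            // Softmax cross-entropy backward (prob - oneHot), modulated by the
            // discounted reward as in the handler above.
            rgGrad[i] = (rgProb[i] - rgActionOneHot[i]) * dfDiscountedR;
        }

        return rgGrad;
    }
}
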