Example #1
        /////////////
        public void Backpropagate(DErrorsList dErr_wrt_dXn /* in */,
                                  DErrorsList dErr_wrt_dXnm1 /* out */,
                                  NNNeuronOutputs thisLayerOutput, // memorized values of this layer's output
                                  NNNeuronOutputs prevLayerOutput, // memorized values of previous layer's output
                                  double etaLearningRate)
        {
            // nomenclature (repeated from NeuralNetwork class):
            //
            // Err is output error of the entire neural net
            // Xn is the output vector on the n-th layer
            // Xnm1 is the output vector of the previous layer
            // Wn is the vector of weights of the n-th layer
            // Yn is the activation value of the n-th layer, i.e., the weighted sum of inputs BEFORE the squashing function is applied
            // F is the squashing function: Xn = F(Yn)
            // F' is the derivative of the squashing function
            //   Conveniently, for F = tanh, then F'(Yn) = 1 - Xn^2, i.e., the derivative can be calculated from the output, without knowledge of the input
            try
            {
                int         ii, jj;
                uint        kk;
                int         nIndex;
                double      output;
                DErrorsList dErr_wrt_dYn = new DErrorsList(m_Neurons.Count);
                //
                //	std::vector< double > dErr_wrt_dWn( m_Weights.size(), 0.0 );  // important to initialize to zero
                //////////////////////////////////////////////////
                //
                ///// DESIGN TRADEOFF: REVIEW !!
                // We would prefer (for ease of coding) to use STL vector for the array "dErr_wrt_dWn", which is the
                // differential of the current pattern's error wrt weights in the layer.  However, for layers with
                // many weights, such as fully-connected layers, there are also many weights.  The STL vector
                // class's allocator is remarkably stupid when allocating large memory chunks, and causes a remarkable
                // number of page faults, with a consequent slowing of the application's overall execution time.

                // To fix this, I tried using a plain-old C array, by new'ing the needed space from the heap, and
                // delete[]'ing it at the end of the function.  However, this caused the same number of page-fault
                // errors, and did not improve performance.

                // So I tried a plain-old C array allocated on the stack (i.e., not the heap).  Of course I could not
                // write a statement like
                //    double dErr_wrt_dWn[ m_Weights.size() ];
                // since the compiler insists upon a compile-time known constant value for the size of the array.
                // To avoid this requirement, the original C++ code used the _alloca function to allocate memory on the stack.
                // The downside of that is excessive stack usage, and there might be stack overflow problems.  That's why
                // this comment is labeled "REVIEW".  In this C# port, the managed double[] allocated below serves the same purpose.
                double[] dErr_wrt_dWn = new double[m_Weights.Count];
                for (ii = 0; ii < m_Weights.Count; ii++)
                {
                    dErr_wrt_dWn[ii] = 0.0;
                }

                bool bMemorized = (thisLayerOutput != null) && (prevLayerOutput != null);
                // calculate dErr_wrt_dYn = F'(Yn) * dErr_wrt_Xn

                for (ii = 0; ii < m_Neurons.Count; ii++)
                {
                    if (bMemorized)
                    {
                        output = thisLayerOutput[ii];
                    }
                    else
                    {
                        output = m_Neurons[ii].output;
                    }

                    dErr_wrt_dYn.Add(m_sigmoid.DSIGMOID(output) * dErr_wrt_dXn[ii]);
                }

                // calculate dErr_wrt_Wn = Xnm1 * dErr_wrt_Yn
                // For each neuron in this layer, go through the list of connections from the prior layer, and
                // update the differential for the corresponding weight

                ii = 0;
                foreach (NNNeuron nit in m_Neurons)
                {
                    foreach (NNConnection cit in nit.m_Connections)
                    {
                        kk = cit.NeuronIndex;
                        if (kk == 0xffffffff)
                        {
                            output = 1.0;  // this is the bias weight
                        }
                        else
                        {
                            if (bMemorized)
                            {
                                output = prevLayerOutput[(int)kk];
                            }
                            else
                            {
                                output = m_pPrevLayer.m_Neurons[(int)kk].output;
                            }
                        }
                        dErr_wrt_dWn[cit.WeightIndex] += dErr_wrt_dYn[ii] * output;
                    }

                    ii++;
                }
                // calculate dErr_wrt_Xnm1 = Wn * dErr_wrt_dYn, which is needed as the input value of
                // dErr_wrt_Xn for backpropagation of the next (i.e., previous) layer
                // For each neuron in this layer

                ii = 0;
                foreach (NNNeuron nit in m_Neurons)
                {
                    foreach (NNConnection cit in nit.m_Connections)
                    {
                        kk = cit.NeuronIndex;
                        if (kk != 0xffffffff)
                        {
                            // we exclude ULONG_MAX, which signifies the phantom bias neuron with
                            // constant output of "1", since we cannot train the bias neuron

                            nIndex = (int)kk;
                            dErr_wrt_dXnm1[nIndex] += dErr_wrt_dYn[ii] * m_Weights[(int)cit.WeightIndex].value;
                        }
                    }

                    ii++;  // ii tracks the neuron iterator
                }
                // finally, update the weights of this layer's neurons using dErr_wrt_dW and the learning rate eta.
                // Use an atomic compare-and-exchange operation, since another thread might be in
                // the process of backpropagation and the weights might have shifted slightly
                const double dMicron = 0.10;
                double       epsilon, divisor;
                double       oldValue;
                double       newValue;
                for (jj = 0; jj < m_Weights.Count; ++jj)
                {
                    divisor = m_Weights[jj].diagHessian + dMicron;

                    // the following code has been rendered unnecessary, since the value of the Hessian has been
                    // verified when it was created, so as to ensure that it is strictly
                    // zero-positive.  Thus, it is impossible for the diagHessian to be less than zero,
                    // and it is impossible for the divisor to be less than dMicron

                    /*
                     * if ( divisor < dMicron )
                     * {
                     * // it should not be possible to reach here, since everything in the second derivative equations
                     * // is strictly zero-positive, and thus "divisor" should definitely be as large as MICRON.
                     *
                     * ASSERT( divisor >= dMicron );
                     * divisor = 1.0 ;  // this will limit the size of the update to the same as the size of global eta
                     * }
                     */
                    epsilon  = etaLearningRate / divisor;
                    oldValue = m_Weights[jj].value;
                    newValue = oldValue - epsilon * dErr_wrt_dWn[jj];
                    while (oldValue != Interlocked.CompareExchange(
                               ref m_Weights[jj].value, newValue, oldValue))
                    {
                        // another thread must have modified the weight.

                        // Obtain its new value, adjust it, and try again

                        oldValue = m_Weights[jj].value;
                        newValue = oldValue - epsilon * dErr_wrt_dWn[jj];
                    }
                }
            }
            catch (Exception)
            {
                // swallow the exception and abandon this backpropagation pass
                return;
            }
        }
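
The derivative trick in the nomenclature comment above (for F = tanh, F'(Yn) = 1 - Xn^2) is what lets m_sigmoid.DSIGMOID(output) work on the memorized output alone, without access to Yn. Below is a minimal sketch of such a squashing-function helper, assuming a plain tanh activation; the project's actual SigmoidFunction may use a scaled tanh, so the class name and formulas are illustrative assumptions, not the real implementation.

        // Minimal sketch of the squashing function assumed by the comments above (plain tanh);
        // the class name and formulas are assumptions, not the project's actual m_sigmoid.
        public class TanhSigmoid
        {
            // F(Yn) = tanh(Yn)
            public double SIGMOID(double y)
            {
                return System.Math.Tanh(y);
            }

            // F'(Yn) expressed through the memorized output Xn = F(Yn):
            // for tanh, F'(Yn) = 1 - Xn^2, so the pre-activation Yn is not needed
            public double DSIGMOID(double x)
            {
                return 1.0 - x * x;
            }
        }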
Example #2
        public void Backpropagate(double[] actualOutput, double[] desiredOutput, int count, NNNeuronOutputsList pMemorizedNeuronOutputs)
        {
            // backpropagates through the neural net

            if (m_Layers.Count < 2)        // there must be at least two layers in the net
            {
                return;
            }
            if ((actualOutput == null) || (desiredOutput == null) || (count >= 256))
            {
                return;
            }


            // check if it's time for a weight sanity check

            m_cBackprops++;

            if ((m_cBackprops % 10000) == 0)
            {
                // every 10000 backprops

                PeriodicWeightSanityCheck();
            }


            // proceed from the last layer to the first, iteratively
            // We calculate the last layer separately, and first, since it provides the needed derivative
            // (i.e., dErr_wrt_dXnm1) for the previous layers

            // nomenclature:
            //
            // Err is output error of the entire neural net
            // Xn is the output vector on the n-th layer
            // Xnm1 is the output vector of the previous layer
            // Wn is the vector of weights of the n-th layer
            // Yn is the activation value of the n-th layer, i.e., the weighted sum of inputs BEFORE the squashing function is applied
            // F is the squashing function: Xn = F(Yn)
            // F' is the derivative of the squashing function
            //   Conveniently, for F = tanh, then F'(Yn) = 1 - Xn^2, i.e., the derivative can be calculated from the output, without knowledge of the input

            int iSize           = m_Layers.Count;
            var dErr_wrt_dXlast = new DErrorsList(m_Layers[m_Layers.Count - 1].m_Neurons.Count);
            var differentials   = new List <DErrorsList>(iSize);

            int ii;

            // start the process by calculating dErr_wrt_dXn for the last layer.
            // for the standard MSE Err function (i.e., 0.5*sumof( (actual-target)^2 )), this differential is simply
            // the difference between the actual output and the target

            for (ii = 0; ii < m_Layers[m_Layers.Count - 1].m_Neurons.Count; ++ii)
            {
                dErr_wrt_dXlast.Add(actualOutput[ii] - desiredOutput[ii]);
            }


            // store Xlast and reserve memory for the remaining vectors stored in differentials



            for (ii = 0; ii < iSize - 1; ii++)
            {
                var m_differential = new DErrorsList(m_Layers[ii].m_Neurons.Count);
                for (int kk = 0; kk < m_Layers[ii].m_Neurons.Count; kk++)
                {
                    m_differential.Add(0.0);
                }
                differentials.Add(m_differential);
            }
            differentials.Add(dErr_wrt_dXlast);          // last one
            // now iterate through all layers including the last but excluding the first, and ask each of
            // them to backpropagate error and adjust their weights, and to return the differential
            // dErr_wrt_dXnm1 for use as the input value of dErr_wrt_dXn for the next iterated layer

            bool bMemorized = (pMemorizedNeuronOutputs != null);

            for (int jj = iSize - 1; jj > 0; jj--)
            {
                if (bMemorized)
                {
                    m_Layers[jj].Backpropagate(differentials[jj], differentials[jj - 1],
                                               pMemorizedNeuronOutputs[jj], pMemorizedNeuronOutputs[jj - 1], m_etaLearningRate);
                }
                else
                {
                    m_Layers[jj].Backpropagate(differentials[jj], differentials[jj - 1],
                                               null, null, m_etaLearningRate);
                }
            }


            differentials.Clear();
        }
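
For orientation, the sketch below shows how a single online-training step might drive the Backpropagate method above: a forward pass that memorizes every layer's outputs, followed by the backward pass. The Calculate call and the way the memorized outputs are collected are assumptions about the surrounding project, not its confirmed API.

        // Hypothetical training-step sketch; Calculate() and the way NNNeuronOutputsList is
        // filled are assumptions, not the project's confirmed API.
        void TrainOnePattern(NeuralNetwork net, double[] inputVector, double[] targetOutput)
        {
            var actualOutput     = new double[targetOutput.Length];
            var memorizedOutputs = new NNNeuronOutputsList();   // assumed to be filled per layer by the forward pass

            // forward pass: compute the outputs and memorize each layer's activations
            net.Calculate(inputVector, actualOutput, memorizedOutputs);

            // backward pass: propagates dErr_wrt_dXn layer by layer and updates the weights in place
            net.Backpropagate(actualOutput, targetOutput, targetOutput.Length, memorizedOutputs);
        }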
Example #3
        public void BackpropagateSecondDerivatives(DErrorsList d2Err_wrt_dXn /* in */,
                                                   DErrorsList d2Err_wrt_dXnm1 /* out */)
        {
            // nomenclature (repeated from NeuralNetwork class)
            // NOTE: even though we are addressing SECOND derivatives (and not first derivatives),
            // we use nearly the same notation as if they were first derivatives, since otherwise the
            // ASCII look would be confusing.  We add one "2" but not two "2's", such as "d2Err_wrt_dXn",
            // to give a gentle emphasis that we are using second derivatives
            //
            // Err is output error of the entire neural net
            // Xn is the output vector on the n-th layer
            // Xnm1 is the output vector of the previous layer
            // Wn is the vector of weights of the n-th layer
            // Yn is the activation value of the n-th layer, i.e., the weighted sum of inputs BEFORE the squashing function is applied
            // F is the squashing function: Xn = F(Yn)
            // F' is the derivative of the squashing function
            //   Conveniently, for F = tanh, then F'(Yn) = 1 - Xn^2, i.e., the derivative can be calculated from the output, without knowledge of the input

            int    ii, jj;
            uint   kk;
            int    nIndex;
            double output;
            double dTemp;

            var d2Err_wrt_dYn = new DErrorsList(m_Neurons.Count);

            //
            // std::vector< double > d2Err_wrt_dWn( m_Weights.size(), 0.0 );  // important to initialize to zero
            //////////////////////////////////////////////////
            //
            ///// DESIGN TRADEOFF: REVIEW !!
            //
            // Note that the reasoning of this comment is identical to that in the NNLayer::Backpropagate()
            // function, from which this BackpropagateSecondDerivatives() function is derived
            //
            // We would prefer (for ease of coding) to use STL vector for the array "d2Err_wrt_dWn", which is the
            // second differential of the current pattern's error wrt weights in the layer.  However, for layers with
            // many weights, such as fully-connected layers, there are also many weights.  The STL vector
            // class's allocator is remarkably stupid when allocating large memory chunks, and causes a remarkable
            // number of page faults, with a consequent slowing of the application's overall execution time.

            // To fix this, I tried using a plain-old C array, by new'ing the needed space from the heap, and
            // delete[]'ing it at the end of the function.  However, this caused the same number of page-fault
            // errors, and did not improve performance.

            // So I tried a plain-old C array allocated on the stack (i.e., not the heap).  Of course I could not
            // write a statement like
            //    double d2Err_wrt_dWn[ m_Weights.size() ];
            // since the compiler insists upon a compile-time known constant value for the size of the array.
            // To avoid this requirement, the original C++ code used the _alloca function to allocate memory on the stack.
            // The downside of that is excessive stack usage, and there might be stack overflow problems.  That's why
            // this comment is labeled "REVIEW".  In this C# port, the managed double[] allocated below serves the same purpose.

            double[] d2Err_wrt_dWn = new double[m_Weights.Count];
            for (ii = 0; ii < m_Weights.Count; ii++)
            {
                d2Err_wrt_dWn[ii] = 0.0;
            }
            // calculate d2Err_wrt_dYn = ( F'(Yn) )^2 * dErr_wrt_Xn (where dErr_wrt_Xn is actually a second derivative )

            for (ii = 0; ii < m_Neurons.Count; ii++)
            {
                output = m_Neurons[ii].output;
                dTemp  = m_sigmoid.DSIGMOID(output);
                d2Err_wrt_dYn.Add(d2Err_wrt_dXn[ii] * dTemp * dTemp);
            }
            // calculate d2Err_wrt_Wn = ( Xnm1 )^2 * d2Err_wrt_Yn (where d2Err_wrt_Yn is actually a second derivative)
            // For each neuron in this layer, go through the list of connections from the prior layer, and
            // update the differential for the corresponding weight

            ii = 0;
            foreach (NNNeuron nit in m_Neurons)
            {
                foreach (NNConnection cit in nit.m_Connections)
                {
                    try
                    {
                        kk = (uint)cit.NeuronIndex;
                        if (kk == 0xffffffff)
                        {
                            output = 1.0;  // this is the bias connection; implied neuron output of "1"
                        }
                        else
                        {
                            output = m_pPrevLayer.m_Neurons[(int)kk].output;
                        }

                        ////////////	ASSERT( (*cit).WeightIndex < d2Err_wrt_dWn.size() );  // after changing d2Err_wrt_dWn to a C-style array, this size() check no longer applies
                        // accumulate, since several connections may share the same weight index (e.g., in convolutional layers)
                        d2Err_wrt_dWn[cit.WeightIndex] += d2Err_wrt_dYn[ii] * output * output;
                    }
                    catch (Exception)
                    {
                        // ignore this connection and continue with the next one
                    }
                }

                ii++;
            }
            // calculate d2Err_wrt_Xnm1 = ( Wn )^2 * d2Err_wrt_dYn (where d2Err_wrt_dYn is a second derivative not a first).
            // d2Err_wrt_Xnm1 is needed as the input value of
            // d2Err_wrt_Xn for backpropagation of second derivatives for the next (i.e., previous spatially) layer
            // For each neuron in this layer

            ii = 0;
            foreach (NNNeuron nit in m_Neurons)
            {
                foreach (NNConnection cit in nit.m_Connections)
                {
                    try
                    {
                        kk = cit.NeuronIndex;
                        if (kk != 0xffffffff)
                        {
                            // we exclude ULONG_MAX, which signifies the phantom bias neuron with
                            // constant output of "1", since we cannot train the bias neuron

                            nIndex = (int)kk;
                            dTemp  = m_Weights[(int)cit.WeightIndex].value;
                            d2Err_wrt_dXnm1[nIndex] += d2Err_wrt_dYn[ii] * dTemp * dTemp;
                        }
                    }
                    catch (Exception)
                    {
                        // abandon the second-derivative pass for this layer
                        return;
                    }
                }

                ii++;  // ii tracks the neuron iterator
            }
            double oldValue, newValue;

            // finally, update the diagonal Hessians for the weights of this layer using d2Err_wrt_dW.
            // By design, this function (and its iteration over many patterns, approximately 500) is called while a
            // single thread has locked the neural network, so there is no possibility that another
            // thread might change the value of the Hessian.  The original comment mentions an atomic
            // compare-and-exchange here; this port simply accumulates the values, relying on that lock.

            for (jj = 0; jj < m_Weights.Count; jj++)
            {
                oldValue = m_Weights[jj].diagHessian;
                newValue = oldValue + d2Err_wrt_dWn[jj];
                m_Weights[jj].diagHessian = newValue;
            }
        }
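
The diagonal Hessian accumulated above is what the weight-update loop in Example #1 divides by: epsilon = etaLearningRate / (diagHessian + dMicron), a stochastic diagonal Levenberg-Marquardt style per-weight step size. The helper below is purely illustrative and not part of the project.

        // Illustrative only: the per-weight step size implied by the update loop in Example #1.
        // dMicron keeps the divisor away from zero when the curvature estimate is tiny.
        static double EffectiveStepSize(double etaLearningRate, double diagHessian, double dMicron = 0.10)
        {
            // diagHessian is accumulated from non-negative terms, so the divisor is always at least dMicron
            return etaLearningRate / (diagHessian + dMicron);
        }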
Example #4
        public void BackpropagateSecondDervatives(double[] actualOutputVector, double[] targetOutputVector, uint count)
        {
            // calculates the second derivatives (for the diagonal Hessian) and backpropagates
            // them through the neural net


            if (m_Layers.Count < 2)                    // there must be at least two layers in the net
            {
                return;
            }

            if ((actualOutputVector == null) || (targetOutputVector == null) || (count >= 256))
            {
                return;
            }

            // we use nearly the same nomenclature as above (e.g., "dErr_wrt_dXnm1") even though everything here
            // is actually second derivatives and not first derivatives, since otherwise the ASCII would
            // become too confusing.  To emphasize that these are second derivatives, we insert a "2"
            // such as "d2Err_wrt_dXnm1".  We don't insert the second "2" that's conventional for designating
            // second derivatives.

            int iSize            = m_Layers.Count;
            int neuronCount      = m_Layers[m_Layers.Count - 1].m_Neurons.Count;
            var d2Err_wrt_dXlast = new DErrorsList(neuronCount);
            var differentials    = new List <DErrorsList>(iSize);


            // start the process by calculating the second derivative d2Err_wrt_dXn for the last layer.
            // for the standard MSE Err function (i.e., 0.5*sumof( (actual-target)^2 )), this differential is
            // exactly one

            var lit = m_Layers.Last();      // point to last layer

            for (int ii = 0; ii < lit.m_Neurons.Count; ii++)
            {
                d2Err_wrt_dXlast.Add(1.0);
            }

            // store Xlast and reserve memory for the remaining vectors stored in differentials


            for (int ii = 0; ii < iSize - 1; ii++)
            {
                var m_differential = new DErrorsList(m_Layers[ii].m_Neurons.Count);
                for (int kk = 0; kk < m_Layers[ii].m_Neurons.Count; kk++)
                {
                    m_differential.Add(0.0);
                }
                differentials.Add(m_differential);
            }

            differentials.Add(d2Err_wrt_dXlast);          // last one

            // now iterate through all layers including the last but excluding the first, starting from
            // the last, and ask each of
            // them to backpropagate the second derivative and accumulate the diagonal Hessian, and also to
            // return the second derivative
            // d2Err_wrt_dXnm1 for use as the input value of d2Err_wrt_dXn for the next iterated layer (which
            // is the previous layer spatially)

            for (int ii = iSize - 1; ii > 0; ii--)
            {
                m_Layers[ii].BackpropagateSecondDerivatives(differentials[ii], differentials[ii - 1]);
            }

            differentials.Clear();
        }
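
Example #3's comments mention that this second-derivative pass is run over roughly 500 patterns while a single thread holds the network lock. The sketch below shows what such a Hessian-estimation pass could look like; EraseHessian, Calculate, DivideHessianBy and the output size of 10 are hypothetical names and assumptions, not the project's confirmed API.

        // Hypothetical sketch: estimate the diagonal Hessian before training starts.
        // EraseHessian(), Calculate() and DivideHessianBy() are assumed helpers.
        void EstimateDiagonalHessian(NeuralNetwork net, double[][] samplePatterns)
        {
            net.EraseHessian();                              // zero every weight's diagHessian

            foreach (var pattern in samplePatterns)          // e.g., ~500 randomly chosen training patterns
            {
                var actualOutput = new double[10];           // assumed output size (10 classes)
                net.Calculate(pattern, actualOutput);        // forward pass

                // the target vector does not matter here: for the MSE error the second
                // derivative at the output layer is identically 1.0 (see d2Err_wrt_dXlast above)
                net.BackpropagateSecondDervatives(actualOutput, actualOutput, (uint)actualOutput.Length);
            }

            net.DivideHessianBy(samplePatterns.Length);      // average the accumulated curvature
        }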