/////////////
public void Backpropagate(DErrorsList dErr_wrt_dXn /* in */,
                          DErrorsList dErr_wrt_dXnm1 /* out */,
                          NNNeuronOutputs thisLayerOutput,  // memorized values of this layer's output
                          NNNeuronOutputs prevLayerOutput,  // memorized values of previous layer's output
                          double etaLearningRate)
{
    // nomenclature (repeated from NeuralNetwork class):
    //
    // Err is the output error of the entire neural net
    // Xn is the output vector of the n-th layer
    // Xnm1 is the output vector of the previous layer
    // Wn is the vector of weights of the n-th layer
    // Yn is the activation value of the n-th layer, i.e., the weighted sum of inputs BEFORE
    //    the squashing function is applied
    // F is the squashing function: Xn = F(Yn)
    // F' is the derivative of the squashing function
    //    Conveniently, for F = tanh, F'(Yn) = 1 - Xn^2, i.e., the derivative can be calculated
    //    from the output, without knowledge of the input

    try
    {
        int ii, jj;
        uint kk;
        int nIndex;
        double output;
        DErrorsList dErr_wrt_dYn = new DErrorsList(m_Neurons.Count);

        //
        // std::vector< double > dErr_wrt_dWn( m_Weights.size(), 0.0 );  // important to initialize to zero

        //////////////////////////////////////////////////
        //
        ///// DESIGN TRADEOFF: REVIEW !!
        //
        // We would prefer (for ease of coding) to use an STL vector for the array "dErr_wrt_dWn", which is
        // the differential of the current pattern's error with respect to the weights in the layer. However,
        // layers with many connections, such as fully-connected layers, have a very large number of weights.
        // The STL vector class's allocator is remarkably stupid when allocating large memory chunks, and causes
        // a remarkable number of page faults, with a consequent slowing of the application's overall execution time.
        //
        // To fix this, I tried using a plain-old C array, by new'ing the needed space from the heap and
        // delete[]'ing it at the end of the function. However, this caused the same number of page-fault
        // errors, and did not improve performance.
        //
        // So I tried a plain-old C array allocated on the stack (i.e., not the heap). Of course I could not
        // write a statement like
        //    double dErr_wrt_dWn[ m_Weights.size() ];
        // since the compiler insists upon a compile-time known constant value for the size of the array.
        // To avoid this requirement, I used the _alloca function to allocate memory on the stack.
        // The downside of this is excessive stack usage, and there might be stack overflow problems.
        // That's why this comment is labeled "REVIEW".

        double[] dErr_wrt_dWn = new double[m_Weights.Count];
        for (ii = 0; ii < m_Weights.Count; ii++)
        {
            dErr_wrt_dWn[ii] = 0.0;
        }

        bool bMemorized = (thisLayerOutput != null) && (prevLayerOutput != null);

        // calculate dErr_wrt_dYn = F'(Yn) * dErr_wrt_dXn
        for (ii = 0; ii < m_Neurons.Count; ii++)
        {
            if (bMemorized != false)
            {
                output = thisLayerOutput[ii];
            }
            else
            {
                output = m_Neurons[ii].output;
            }

            dErr_wrt_dYn.Add(m_sigmoid.DSIGMOID(output) * dErr_wrt_dXn[ii]);
        }

        // calculate dErr_wrt_dWn = Xnm1 * dErr_wrt_dYn
        // For each neuron in this layer, go through the list of connections from the prior layer, and
        // update the differential for the corresponding weight
        ii = 0;
        foreach (NNNeuron nit in m_Neurons)
        {
            foreach (NNConnection cit in nit.m_Connections)
            {
                kk = cit.NeuronIndex;
                if (kk == 0xffffffff)
                {
                    output = 1.0;  // this is the bias weight
                }
                else
                {
                    if (bMemorized != false)
                    {
                        output = prevLayerOutput[(int)kk];
                    }
                    else
                    {
                        output = m_pPrevLayer.m_Neurons[(int)kk].output;
                    }
                }

                dErr_wrt_dWn[cit.WeightIndex] += dErr_wrt_dYn[ii] * output;
            }

            ii++;
        }

        // calculate dErr_wrt_dXnm1 = Wn * dErr_wrt_dYn, which is needed as the input value of
        // dErr_wrt_dXn for backpropagation of the next (i.e., previous) layer
        // For each neuron in this layer
        ii = 0;
        foreach (NNNeuron nit in m_Neurons)
        {
            foreach (NNConnection cit in nit.m_Connections)
            {
                kk = cit.NeuronIndex;
                if (kk != 0xffffffff)
                {
                    // we exclude ULONG_MAX, which signifies the phantom bias neuron with
                    // constant output of "1", since we cannot train the bias neuron
                    nIndex = (int)kk;
                    dErr_wrt_dXnm1[nIndex] += dErr_wrt_dYn[ii] * m_Weights[(int)cit.WeightIndex].value;
                }
            }

            ii++;  // ii tracks the neuron iterator
        }

        // finally, update the weights of this layer using dErr_wrt_dWn and the learning rate eta.
        // Use an atomic compare-and-exchange operation, since another thread might be in
        // the process of backpropagation and the weights might have shifted slightly
        const double dMicron = 0.10;
        double epsilon, divisor;
        double oldValue;
        double newValue;

        for (jj = 0; jj < m_Weights.Count; ++jj)
        {
            divisor = m_Weights[jj].diagHessian + dMicron;

            // the following code has been rendered unnecessary, since the value of the Hessian has been
            // verified when it was created, so as to ensure that it is strictly
            // zero-positive. Thus, it is impossible for the diagHessian to be less than zero,
            // and it is impossible for the divisor to be less than dMicron
            /*
             * if ( divisor < dMicron )
             * {
             *     // it should not be possible to reach here, since everything in the second derivative equations
             *     // is strictly zero-positive, and thus "divisor" should definitely be as large as MICRON.
             *
             *     ASSERT( divisor >= dMicron );
             *     divisor = 1.0;  // this will limit the size of the update to the same as the size of global eta
             * }
             */

            epsilon = etaLearningRate / divisor;
            oldValue = m_Weights[jj].value;
            newValue = oldValue - epsilon * dErr_wrt_dWn[jj];

            while (oldValue != Interlocked.CompareExchange(
                       ref (m_Weights[jj].value), (double)newValue, (double)oldValue))
            {
                // another thread must have modified the weight.
                // Obtain its new value, adjust it, and try again
                oldValue = m_Weights[jj].value;
                newValue = oldValue - epsilon * dErr_wrt_dWn[jj];
            }
        }
    }
    catch (Exception ex)
    {
        return;
    }
}
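Collecting the steps above, the quantities computed are, in the notation of the comments (this is only a restatement of the code above, not additional behavior):

    dErr_wrt_dYn[i]    = F'(Yn[i]) * dErr_wrt_dXn[i]          // for tanh: (1 - Xn[i]^2) * dErr_wrt_dXn[i]
    dErr_wrt_dWn[w]   += dErr_wrt_dYn[i] * Xnm1[k]            // accumulated over every connection (neuron i, prior-layer neuron k) using weight w
    dErr_wrt_dXnm1[k] += dErr_wrt_dYn[i] * Wn[w]              // handed back as dErr_wrt_dXn of the previous layer
    Wn[w]             -= (etaLearningRate / (diagHessian[w] + dMicron)) * dErr_wrt_dWn[w]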
public void Backpropagate(double[] actualOutput,
                          double[] desiredOutput,
                          int count,
                          NNNeuronOutputsList pMemorizedNeuronOutputs)
{
    // backpropagates through the neural net

    if ((m_Layers.Count >= 2) == false)  // there must be at least two layers in the net
    {
        return;
    }

    if ((actualOutput == null) || (desiredOutput == null) || (count >= 256))
    {
        return;
    }

    // check if it's time for a weight sanity check
    m_cBackprops++;

    if ((m_cBackprops % 10000) == 0)
    {
        // every 10000 backprops
        PeriodicWeightSanityCheck();
    }

    // proceed from the last layer to the first, iteratively.
    // We calculate the last layer separately, and first, since it provides the needed derivative
    // (i.e., dErr_wrt_dXnm1) for the previous layers

    // nomenclature:
    //
    // Err is the output error of the entire neural net
    // Xn is the output vector of the n-th layer
    // Xnm1 is the output vector of the previous layer
    // Wn is the vector of weights of the n-th layer
    // Yn is the activation value of the n-th layer, i.e., the weighted sum of inputs BEFORE
    //    the squashing function is applied
    // F is the squashing function: Xn = F(Yn)
    // F' is the derivative of the squashing function
    //    Conveniently, for F = tanh, F'(Yn) = 1 - Xn^2, i.e., the derivative can be calculated
    //    from the output, without knowledge of the input

    int iSize = m_Layers.Count;
    var dErr_wrt_dXlast = new DErrorsList(m_Layers[m_Layers.Count - 1].m_Neurons.Count);
    var differentials = new List<DErrorsList>(iSize);
    int ii;

    // start the process by calculating dErr_wrt_dXn for the last layer.
    // for the standard MSE Err function (i.e., 0.5 * sumof( (actual-target)^2 )), this differential is
    // simply the difference between the actual and the target
    for (ii = 0; ii < m_Layers[m_Layers.Count - 1].m_Neurons.Count; ++ii)
    {
        dErr_wrt_dXlast.Add(actualOutput[ii] - desiredOutput[ii]);
    }

    // store Xlast and reserve memory for the remaining vectors stored in differentials
    for (ii = 0; ii < iSize - 1; ii++)
    {
        var m_differential = new DErrorsList(m_Layers[ii].m_Neurons.Count);
        for (int kk = 0; kk < m_Layers[ii].m_Neurons.Count; kk++)
        {
            m_differential.Add(0.0);
        }

        differentials.Add(m_differential);
    }

    differentials.Add(dErr_wrt_dXlast);  // last one

    // now iterate through all layers including the last but excluding the first, and ask each of
    // them to backpropagate error and adjust their weights, and to return the differential
    // dErr_wrt_dXnm1 for use as the input value of dErr_wrt_dXn for the next iterated layer
    bool bMemorized = (pMemorizedNeuronOutputs != null);

    for (int jj = iSize - 1; jj > 0; jj--)
    {
        if (bMemorized != false)
        {
            m_Layers[jj].Backpropagate(differentials[jj], differentials[jj - 1],
                                       pMemorizedNeuronOutputs[jj], pMemorizedNeuronOutputs[jj - 1],
                                       m_etaLearningRate);
        }
        else
        {
            m_Layers[jj].Backpropagate(differentials[jj], differentials[jj - 1],
                                       null, null, m_etaLearningRate);
        }
    }

    differentials.Clear();
}
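A minimal sketch of how a single online training step might drive this method is shown below. Only the Backpropagate call itself is taken from the listing above; the variable names (nn, correctDigit), the tanh-style target encoding, and the elided forward pass are assumptions about the surrounding code, not part of this listing:

    // hypothetical caller; the 10-class targets and the forward pass are assumptions
    double[] actualOutput = new double[10];
    double[] desiredOutput = new double[10];

    for (int i = 0; i < 10; i++)
    {
        desiredOutput[i] = -1.0;         // assumed "off" target for tanh outputs
    }
    desiredOutput[correctDigit] = 1.0;   // assumed "on" target for the labeled class

    // ... run the forward pass here so that actualOutput holds the net's outputs ...

    // pass null for the memorized outputs; each layer then reads its neurons' stored output values instead
    nn.Backpropagate(actualOutput, desiredOutput, 10, null);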
public void BackpropagateSecondDerivatives(DErrorsList d2Err_wrt_dXn /* in */,
                                           DErrorsList d2Err_wrt_dXnm1 /* out */)
{
    // nomenclature (repeated from NeuralNetwork class)
    // NOTE: even though we are addressing SECOND derivatives (and not first derivatives),
    // we use nearly the same notation as if there were first derivatives, since otherwise the
    // ASCII look would be confusing. We add one "2" but not two "2's", such as "d2Err_wrt_dXn",
    // to give a gentle emphasis that we are using second derivatives
    //
    // Err is the output error of the entire neural net
    // Xn is the output vector of the n-th layer
    // Xnm1 is the output vector of the previous layer
    // Wn is the vector of weights of the n-th layer
    // Yn is the activation value of the n-th layer, i.e., the weighted sum of inputs BEFORE
    //    the squashing function is applied
    // F is the squashing function: Xn = F(Yn)
    // F' is the derivative of the squashing function
    //    Conveniently, for F = tanh, F'(Yn) = 1 - Xn^2, i.e., the derivative can be calculated
    //    from the output, without knowledge of the input

    int ii, jj;
    uint kk;
    int nIndex;
    double output;
    double dTemp;

    var d2Err_wrt_dYn = new DErrorsList(m_Neurons.Count);

    //
    // std::vector< double > d2Err_wrt_dWn( m_Weights.size(), 0.0 );  // important to initialize to zero

    //////////////////////////////////////////////////
    //
    ///// DESIGN TRADEOFF: REVIEW !!
    //
    // Note that the reasoning of this comment is identical to that in the NNLayer::Backpropagate()
    // function, from which this BackpropagateSecondDerivatives() function is derived.
    //
    // We would prefer (for ease of coding) to use an STL vector for the array "d2Err_wrt_dWn", which is
    // the second differential of the current pattern's error with respect to the weights in the layer.
    // However, layers with many connections, such as fully-connected layers, have a very large number of
    // weights. The STL vector class's allocator is remarkably stupid when allocating large memory chunks,
    // and causes a remarkable number of page faults, with a consequent slowing of the application's overall
    // execution time.
    //
    // To fix this, I tried using a plain-old C array, by new'ing the needed space from the heap and
    // delete[]'ing it at the end of the function. However, this caused the same number of page-fault
    // errors, and did not improve performance.
    //
    // So I tried a plain-old C array allocated on the stack (i.e., not the heap). Of course I could not
    // write a statement like
    //    double d2Err_wrt_dWn[ m_Weights.size() ];
    // since the compiler insists upon a compile-time known constant value for the size of the array.
    // To avoid this requirement, I used the _alloca function to allocate memory on the stack.
    // The downside of this is excessive stack usage, and there might be stack overflow problems.
    // That's why this comment is labeled "REVIEW".

    double[] d2Err_wrt_dWn = new double[m_Weights.Count];
    for (ii = 0; ii < m_Weights.Count; ii++)
    {
        d2Err_wrt_dWn[ii] = 0.0;
    }

    // calculate d2Err_wrt_dYn = ( F'(Yn) )^2 * d2Err_wrt_dXn (where d2Err_wrt_dXn is a second derivative)
    for (ii = 0; ii < m_Neurons.Count; ii++)
    {
        output = m_Neurons[ii].output;
        dTemp = m_sigmoid.DSIGMOID(output);
        d2Err_wrt_dYn.Add(d2Err_wrt_dXn[ii] * dTemp * dTemp);
    }

    // calculate d2Err_wrt_dWn = ( Xnm1 )^2 * d2Err_wrt_dYn (where d2Err_wrt_dYn is a second derivative)
    // For each neuron in this layer, go through the list of connections from the prior layer, and
    // update the differential for the corresponding weight
    ii = 0;
    foreach (NNNeuron nit in m_Neurons)
    {
        foreach (NNConnection cit in nit.m_Connections)
        {
            try
            {
                kk = (uint)cit.NeuronIndex;
                if (kk == 0xffffffff)
                {
                    output = 1.0;  // this is the bias connection; implied neuron output of "1"
                }
                else
                {
                    output = m_pPrevLayer.m_Neurons[(int)kk].output;
                }

                //////////// ASSERT( (*cit).WeightIndex < d2Err_wrt_dWn.size() );
                // after changing d2Err_wrt_dWn to a C-style array, the size() function won't work

                //d2Err_wrt_dWn[cit.WeightIndex] += d2Err_wrt_dYn[ii] * output * output;
                d2Err_wrt_dWn[cit.WeightIndex] = d2Err_wrt_dYn[ii] * output * output;
            }
            catch (Exception ex)
            {
            }
        }

        ii++;
    }

    // calculate d2Err_wrt_dXnm1 = ( Wn )^2 * d2Err_wrt_dYn (where d2Err_wrt_dYn is a second derivative, not a first).
    // d2Err_wrt_dXnm1 is needed as the input value of d2Err_wrt_dXn for backpropagation of second
    // derivatives for the next (i.e., previous spatially) layer
    // For each neuron in this layer
    ii = 0;
    foreach (NNNeuron nit in m_Neurons)
    {
        foreach (NNConnection cit in nit.m_Connections)
        {
            try
            {
                kk = cit.NeuronIndex;
                if (kk != 0xffffffff)
                {
                    // we exclude ULONG_MAX, which signifies the phantom bias neuron with
                    // constant output of "1", since we cannot train the bias neuron
                    nIndex = (int)kk;
                    dTemp = m_Weights[(int)cit.WeightIndex].value;
                    d2Err_wrt_dXnm1[nIndex] += d2Err_wrt_dYn[ii] * dTemp * dTemp;
                }
            }
            catch (Exception ex)
            {
                return;
            }
        }

        ii++;  // ii tracks the neuron iterator
    }

    double oldValue, newValue;

    // finally, update the diagonal Hessians for the weights of this layer using d2Err_wrt_dWn.
    // By design, this function (and its iteration over many (approx 500) patterns) is called while a
    // single thread has locked the neural network, so there is no possibility that another thread
    // might change the value of the Hessian while we accumulate into it; a simple read-add-write is
    // therefore sufficient here (unlike the weight update in Backpropagate(), which uses an atomic
    // compare-and-exchange)
    for (jj = 0; jj < m_Weights.Count; jj++)
    {
        oldValue = m_Weights[jj].diagHessian;
        newValue = oldValue + d2Err_wrt_dWn[jj];
        m_Weights[jj].diagHessian = newValue;
    }
}
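In the same notation, the second-derivative relations computed above are (again only a restatement of the code; note that the active line assigns d2Err_wrt_dWn for each connection rather than accumulating it, the += variant being left commented out):

    d2Err_wrt_dYn[i]    = ( F'(Yn[i]) )^2 * d2Err_wrt_dXn[i]
    d2Err_wrt_dWn[w]    = d2Err_wrt_dYn[i] * ( Xnm1[k] )^2
    d2Err_wrt_dXnm1[k] += d2Err_wrt_dYn[i] * ( Wn[w] )^2
    diagHessian[w]     += d2Err_wrt_dWn[w]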
public void BackpropagateSecondDervatives(double[] actualOutputVector,
                                          double[] targetOutputVector,
                                          uint count)
{
    // calculates the second derivatives (for the diagonal Hessian) and backpropagates
    // them through the neural net

    if (m_Layers.Count < 2)  // there must be at least two layers in the net
    {
        return;
    }

    if ((actualOutputVector == null) || (targetOutputVector == null) || (count >= 256))
    {
        return;
    }

    // we use nearly the same nomenclature as above (e.g., "dErr_wrt_dXnm1") even though everything here
    // is actually second derivatives and not first derivatives, since otherwise the ASCII would
    // become too confusing. To emphasize that these are second derivatives, we insert a "2",
    // such as "d2Err_wrt_dXnm1". We don't insert the second "2" that's conventional for designating
    // second derivatives

    int iSize = m_Layers.Count;
    int neuronCount = m_Layers[m_Layers.Count - 1].m_Neurons.Count;
    var d2Err_wrt_dXlast = new DErrorsList(neuronCount);
    var differentials = new List<DErrorsList>(iSize);

    // start the process by calculating the second derivative d2Err_wrt_dXn for the last layer.
    // for the standard MSE Err function (i.e., 0.5 * sumof( (actual-target)^2 )), this differential is
    // exactly one
    var lit = m_Layers.Last();  // point to the last layer

    for (int ii = 0; ii < lit.m_Neurons.Count; ii++)
    {
        d2Err_wrt_dXlast.Add(1.0);
    }

    // store Xlast and reserve memory for the remaining vectors stored in differentials
    for (int ii = 0; ii < iSize - 1; ii++)
    {
        var m_differential = new DErrorsList(m_Layers[ii].m_Neurons.Count);
        for (int kk = 0; kk < m_Layers[ii].m_Neurons.Count; kk++)
        {
            m_differential.Add(0.0);
        }

        differentials.Add(m_differential);
    }

    differentials.Add(d2Err_wrt_dXlast);  // last one

    // now iterate through all layers including the last but excluding the first, starting from the last,
    // and ask each of them to backpropagate the second derivative and accumulate the diagonal Hessian,
    // and also to return the second derivative d2Err_wrt_dXnm1 for use as the input value of
    // d2Err_wrt_dXn for the next iterated layer (which is the previous layer spatially)
    for (int ii = iSize - 1; ii > 0; ii--)
    {
        m_Layers[ii].BackpropagateSecondDerivatives(differentials[ii], differentials[ii - 1]);
    }

    differentials.Clear();
}
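The diagonal Hessian accumulated by this pass is what divides the learning rate in Backpropagate() (epsilon = etaLearningRate / (diagHessian + dMicron)). A rough sketch of how the pass might be driven over the "approx 500 patterns" mentioned in the layer-level comments follows; the reset and averaging of diagHessian, the pattern loop, and the forward-pass call are assumptions not shown in this listing:

    // hypothetical Hessian-estimation pass; only BackpropagateSecondDervatives itself comes from the listing above
    uint numPatterns = 500;  // the layer-level comment mentions "approx 500 patterns"

    // ... zero each weight's diagHessian here (helper not shown in this listing) ...

    for (uint p = 0; p < numPatterns; p++)
    {
        // ... forward pass on a randomly chosen training pattern fills actualOutput and targetOutput ...
        nn.BackpropagateSecondDervatives(actualOutput, targetOutput, 10);
    }

    // ... divide each accumulated diagHessian by numPatterns so it holds an average (assumed) ...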