private void CalculateGradientForSingleTrainingExample(Network network, IErrorFunction errorFunction, ref List <List <NeuronData> > intermediateResults, float[] trainingInput, float[] trainingDesiredOutput)
        {
            // Runs one forward pass for a single training example, then backpropagates,
            // accumulating weight/bias gradients per neuron into intermediateResults
            // (one List<NeuronData> per layer, last entry = output layer).
            var layerActivations = new List <float[]>();
            var layerZValues     = new List <float[]>();

            // Forward pass; 'false' keeps the working cache (no flush).
            network.Compute(this, trainingInput, ref layerActivations, ref layerZValues, false);

            // Backward pass starts at the output layer. The delta list is filled
            // here and consumed/updated layer by layer on the way back.
            var outputLayerGradient = intermediateResults[intermediateResults.Count - 1];
            var deltas              = new List <float>();

            CalculateOutputLayerGradient(network, errorFunction, ref outputLayerGradient, ref deltas, layerActivations, trainingInput, layerZValues, trainingDesiredOutput);

            // Walk the hidden layers from back to front; layer 0 is fed by the
            // raw training input, every other layer by its predecessor's activations.
            for (int layer = network.layers.Count - 2; layer >= 0; --layer)
            {
                var hiddenLayerGradient = intermediateResults[layer];
                var inputsToLayer       = layer == 0 ? trainingInput : layerActivations[layer - 1];
                CalculateHiddenLayerGradient(network, layer, ref hiddenLayerGradient, ref deltas, inputsToLayer, layerZValues);
            }
        }
        private void CalculateOutputLayerGradient(Network network, IErrorFunction errorFunction, ref List <NeuronData> gradientData, ref List <float> delta_k_vector, List <float[]> activations, float[] trainingInput, List <float[]> zValues, float[] desiredOutput)
        {
            // Accumulates dE/dw and dE/db for every neuron in the output layer into
            // gradientData, and records each neuron's delta_k into delta_k_vector so
            // the hidden-layer pass can reuse them.
            var outputLayer = network.layers.Last();
            int neuronCount = outputLayer.GetNeuronCount();
            int weightCount = outputLayer.GetWeightsPerNeuron();

            // Inputs feeding the output layer: for a single-layer network that is
            // the raw training input, otherwise the second-to-last activation row.
            var inputsToLayer = activations.Count <= 1 ? trainingInput : activations[activations.Count - 2];

            // Last rows are loop-invariant; read them once up front.
            float[] outputRow = activations[activations.Count - 1];
            float[] zRow      = zValues[zValues.Count - 1];

            for (int neuron = 0; neuron < neuronCount; neuron++)
            {
                float delta_k = errorFunction.CalculateDelta(zRow[neuron], outputRow[neuron], desiredOutput[neuron], network.activationFunction);

                // NOTE(review): mutating through this local only updates the stored
                // element if NeuronData is a reference type — confirm it is a class.
                var accumulator = gradientData[neuron];
                //Assert(gradientData[i].weights.Length == prevActivations.Length);
                for (int w = 0; w < weightCount; w++)
                {
                    accumulator.weights[w] += delta_k * inputsToLayer[w];
                }
                accumulator.bias += delta_k;

                delta_k_vector.Add(delta_k);
            }
        }