        int m_iterationCounter; // number of update steps performed so far

        /// <summary>
        /// Neural net optimizer for controlling the weight updates in neural net learning.
        /// Uses mini-batch stochastic gradient descent.
        /// Several different optimization methods are available through the constructor.
        /// </summary>
        /// <param name="learningRate">Controls the step size when updating the weights. (Default is 0.01)</param>
        /// <param name="batchSize">Batch size for mini-batch stochastic gradient descent. (Default is 128)</param>
        /// <param name="l1decay">L1 regularization term. (Default is 0, so no regularization)</param>
        /// <param name="l2decay">L2 regularization term. (Default is 0, so no regularization)</param>
        /// <param name="optimizerMethod">The method used for optimization (Default is RMSProp)</param>
        /// <param name="momentum">Momentum for gradient update. Should be between 0 and 1. (Default is 0.9)</param>
        /// <param name="rho">Squared gradient moving average decay factor (Default is 0.95)</param>
        /// <param name="beta1">Exponential decay rate for estimates of first moment vector, should be in range 0 to 1 (Default is 0.9)</param>
        /// <param name="beta2">Exponential decay rate for estimates of second moment vector, should be in range 0 to 1 (Default is 0.999)</param>
        public NeuralNetOptimizer(
            double learningRate,
            int batchSize,
            double l1decay = 0,
            double l2decay = 0,
            OptimizerMethod optimizerMethod = OptimizerMethod.RMSProp,
            double momentum = 0.9,
            double rho      = 0.95,
            double beta1    = 0.9,
            double beta2    = 0.999)
        {
            if (learningRate <= 0)
            {
                throw new ArgumentOutOfRangeException(nameof(learningRate), "learningRate must be larger than 0. Was: " + learningRate);
            }
            if (batchSize <= 0)
            {
                throw new ArgumentOutOfRangeException(nameof(batchSize), "batchSize must be larger than 0. Was: " + batchSize);
            }
            if (l1decay < 0)
            {
                throw new ArgumentOutOfRangeException(nameof(l1decay), "l1decay must be non-negative. Was: " + l1decay);
            }
            if (l2decay < 0)
            {
                throw new ArgumentOutOfRangeException(nameof(l2decay), "l2decay must be non-negative. Was: " + l2decay);
            }
            if (momentum <= 0)
            {
                throw new ArgumentOutOfRangeException(nameof(momentum), "momentum must be larger than 0. Was: " + momentum);
            }
            if (rho <= 0)
            {
                throw new ArgumentOutOfRangeException(nameof(rho), "rho must be larger than 0. Was: " + rho);
            }
            if (beta1 <= 0)
            {
                throw new ArgumentOutOfRangeException(nameof(beta1), "beta1 must be larger than 0. Was: " + beta1);
            }
            if (beta2 <= 0)
            {
                throw new ArgumentOutOfRangeException(nameof(beta2), "beta2 must be larger than 0. Was: " + beta2);
            }

            m_learningRate     = (float)learningRate;
            m_learningRateInit = (float)learningRate;
            m_batchSize        = batchSize;
            m_l1Decay          = (float)l1decay;
            m_l2Decay          = (float)l2decay;

            m_optimizerMethod = optimizerMethod;
            m_momentum        = (float)momentum;
            m_rho             = (float)rho;
            m_beta1           = (float)beta1;
            m_beta2           = (float)beta2;
        }

        /// <summary>
        /// ClassificationNeuralNet learner using mini-batch gradient descent.
        /// Several optimization methods are available through the constructor.
        /// </summary>
        /// <param name="net">The neural net to learn</param>
        /// <param name="loss">The loss measured and shown between each iteration</param>
        /// <param name="learningRate">Controls the step size when updating the weights. (Default is 0.001)</param>
        /// <param name="iterations">The maximum number of iterations before termination. (Default is 100)</param>
        /// <param name="batchSize">Batch size for mini-batch stochastic gradient descent. (Default is 128)</param>
        /// <param name="l1decay">L1 reguralization term. (Default is 0, so no reguralization)</param>
        /// <param name="l2decay">L2 reguralization term. (Default is 0, so no reguralization)</param>
        /// <param name="optimizerMethod">The method used for optimization (Default is RMSProp)</param>
        /// <param name="momentum">Momentum for gradient update. Should be between 0 and 1. (Defualt is 0.9)</param>
        /// <param name="rho">Squared gradient moving average decay factor (Default is 0.95)</param>
        /// <param name="beta1">Exponential decay rate for estimates of first moment vector, should be in range 0 to 1 (Default is 0.9)</param>
        /// <param name="beta2">Exponential decay rate for estimates of second moment vector, should be in range 0 to 1 (Default is 0.999)</param>
        public ClassificationNeuralNetLearner(
            NeuralNet net,
            ILoss loss,
            double learningRate             = 0.001,
            int iterations                  = 100,
            int batchSize                   = 128,
            double l1decay                  = 0,
            double l2decay                  = 0,
            OptimizerMethod optimizerMethod = OptimizerMethod.RMSProp,
            double momentum                 = 0.9,
            double rho   = 0.95,
            double beta1 = 0.9,
            double beta2 = 0.999)
        {
            if (net == null)
            {
                throw new ArgumentNullException(nameof(net));
            }
            if (!(net.Layers.Last() is IClassificationLayer))
            {
                throw new ArgumentException("Last layer must be a classification layer type. Was: " + net.Layers.Last().GetType().Name);
            }

            m_learner = new NeuralNetLearner(net, new OneOfNTargetEncoder(), loss,
                                             learningRate, iterations, batchSize, l1decay, l2decay, optimizerMethod, momentum, rho, beta1, beta2);
        }

        /// <summary>
        /// Neural net learner. Controls the learning process using mini-batch gradient descent.
        /// </summary>
        /// <param name="net">The neural net to learn</param>
        /// <param name="targetEncoder">Controls how the training targets should be decoded.
        /// This is different depending on if the net should be used for regression or classification.</param>
        /// <param name="loss">The loss measured and shown between each iteration</param>
        /// <param name="learningRate">Controls the step size when updating the weights. (Default is 0.001)</param>
        /// <param name="iterations">The maximum number of iterations before termination. (Default is 100)</param>
        /// <param name="batchSize">Batch size for mini-batch stochastic gradient descent. (Default is 128)</param>
        /// <param name="l1decay">L1 regularization term. (Default is 0, so no regularization)</param>
        /// <param name="l2decay">L2 regularization term. (Default is 0, so no regularization)</param>
        /// <param name="optimizerMethod">The method used for optimization (Default is RMSProp)</param>
        /// <param name="momentum">Momentum for gradient update. Should be between 0 and 1. (Default is 0.9)</param>
        /// <param name="rho">Squared gradient moving average decay factor (Default is 0.95)</param>
        /// <param name="beta1">Exponential decay rate for estimates of first moment vector, should be in range 0 to 1 (Default is 0.9)</param>
        /// <param name="beta2">Exponential decay rate for estimates of second moment vector, should be in range 0 to 1 (Default is 0.999)</param>
        public NeuralNetLearner(
            NeuralNet net, ITargetEncoder targetEncoder,
            ILoss loss,
            double learningRate             = 0.001,
            int iterations                  = 100,
            int batchSize                   = 128,
            double l1decay                  = 0,
            double l2decay                  = 0,
            OptimizerMethod optimizerMethod = OptimizerMethod.RMSProp,
            double momentum                 = 0.9,
            double rho   = 0.95,
            double beta1 = 0.9,
            double beta2 = 0.999)
        {
            m_net           = net ?? throw new ArgumentNullException(nameof(net));
            m_targetEncoder = targetEncoder ?? throw new ArgumentNullException(nameof(targetEncoder));
            m_loss          = loss ?? throw new ArgumentNullException(nameof(loss));
            if (learningRate <= 0)
            {
                throw new ArgumentOutOfRangeException(nameof(learningRate), "learningRate must be larger than 0. Was: " + learningRate);
            }
            if (iterations <= 0)
            {
                throw new ArgumentOutOfRangeException(nameof(iterations), "iterations must be larger than 0. Was: " + iterations);
            }
            if (batchSize <= 0)
            {
                throw new ArgumentOutOfRangeException(nameof(batchSize), "batchSize must be larger than 0. Was: " + batchSize);
            }
            if (l1decay < 0)
            {
                throw new ArgumentOutOfRangeException(nameof(l1decay), "l1decay must be non-negative. Was: " + l1decay);
            }
            if (l2decay < 0)
            {
                throw new ArgumentOutOfRangeException(nameof(l2decay), "l2decay must be non-negative. Was: " + l2decay);
            }
            if (momentum <= 0)
            {
                throw new ArgumentOutOfRangeException(nameof(momentum), "momentum must be larger than 0. Was: " + momentum);
            }
            if (rho <= 0)
            {
                throw new ArgumentOutOfRangeException(nameof(rho), "rho must be larger than 0. Was: " + rho);
            }
            if (beta1 <= 0)
            {
                throw new ArgumentOutOfRangeException(nameof(beta1), "beta1 must be larger than 0. Was: " + beta1);
            }
            if (beta2 <= 0)
            {
                throw new ArgumentOutOfRangeException(nameof(beta2), "beta2 must be larger than 0. Was: " + beta2);
            }

            m_learningRate = learningRate;
            m_iterations   = iterations;
            m_momentum     = momentum;
            m_batchSize    = batchSize;
            m_random       = new Random(232); // fixed seed for reproducible training runs

            m_optimizer = new NeuralNetOptimizer(learningRate, batchSize,
                                                 l1decay, l2decay, optimizerMethod, momentum, rho, beta1, beta2);

            SetupLinerAlgebraProvider();
        }