        // this magic number was arrived at with relation to the CoNLL benchmark, and tinkering
        private const double alpha = 0.1;

        public override bool UpdateWeights(ConcatVector weights, ConcatVector gradient, double logLikelihood, AbstractBatchOptimizer.OptimizationState optimizationState, bool quiet)
        {
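            // The optimization state carries the AdaGrad accumulator, plus the last step and log-likelihood needed for backtracking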
            BacktrackingAdaGradOptimizer.AdaGradOptimizationState s = (BacktrackingAdaGradOptimizer.AdaGradOptimizationState)optimizationState;
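            // How much the log-likelihood moved since the last accepted update; a negative value means the last step overshot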
            double logLikelihoodChange = logLikelihood - s.lastLogLikelihood;

            if (logLikelihoodChange == 0)
            {
                if (!quiet)
                {
                    log.Info("\tlogLikelihood improvement = 0: quitting");
                }
                return true;
            }
            else
            {
                // Check if we should backtrack
                if (logLikelihoodChange < 0)
                {
                    // If we should, move the weights back by half, and cut the lastDerivative by half
                    s.lastDerivative.MapInPlace((x) => x * 0.5);
                    weights.AddVectorInPlace(s.lastDerivative, -1.0);
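                    // Note: s.lastLogLikelihood is not updated here, so the next iteration is still compared against the last accepted step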
                    if (!quiet)
                    {
                        log.Info("\tBACKTRACK...");
                    }
                    // if the lastDerivative norm falls below a threshold, it means we've converged
                    if (s.lastDerivative.DotProduct(s.lastDerivative) < 1.0e-10)
                    {
                        if (!quiet)
                        {
                            log.Info("\tBacktracking derivative norm " + s.lastDerivative.DotProduct(s.lastDerivative) + " < 1.0e-9: quitting");
                        }
                        return true;
                    }
                }
                else
                {
                    // Apply AdaGrad
                    // Accumulate the squared gradient (the per-component AdaGrad history)
                    ConcatVector squared = gradient.DeepClone();
                    squared.MapInPlace((x) => x * x);
                    s.adagradAccumulator.AddVectorInPlace(squared, 1.0);
                    // Scale each component by alpha / sqrt(accumulated squared gradient),
                    // falling back to plain alpha for components that have never received a gradient
                    ConcatVector sqrt = s.adagradAccumulator.DeepClone();
                    sqrt.MapInPlace((x) => x == 0 ? alpha : alpha / System.Math.Sqrt(x));
                    gradient.ElementwiseProductInPlace(sqrt);
                    weights.AddVectorInPlace(gradient, 1.0);
                    // Setup for backtracking, in case necessary
                    s.lastDerivative    = gradient;
                    s.lastLogLikelihood = logLikelihood;
                    if (!quiet)
                    {
                        log.Info("\tLL: " + logLikelihood);
                    }
                }
            }
            return false;
        }