// This magic number was tuned against the CoNLL benchmark, plus some manual tinkering.
        /// <summary>
        /// Performs one AdaGrad weight update, with backtracking when the log-likelihood got worse.
        /// </summary>
        /// <param name="weights">The current weight vector; mutated in place.</param>
        /// <param name="gradient">The gradient for this step; mutated in place by the AdaGrad rescaling.</param>
        /// <param name="logLikelihood">The log-likelihood achieved by <paramref name="weights"/>.</param>
        /// <param name="optimizationState">Must be a <see cref="BacktrackingAdaGradOptimizer.AdaGradOptimizationState"/>.</param>
        /// <param name="quiet">If true, suppresses progress logging.</param>
        /// <returns>True if optimization has converged and the caller should stop; false otherwise.</returns>
        public override bool UpdateWeights(ConcatVector weights, ConcatVector gradient, double logLikelihood, AbstractBatchOptimizer.OptimizationState optimizationState, bool quiet)
        {
            BacktrackingAdaGradOptimizer.AdaGradOptimizationState s = (BacktrackingAdaGradOptimizer.AdaGradOptimizationState)optimizationState;
            double logLikelihoodChange = logLikelihood - s.lastLogLikelihood;

            // An exactly-zero improvement means the last step changed nothing measurable: converged.
            if (logLikelihoodChange == 0)
            {
                if (!quiet)
                {
                    log.Info("\tlogLikelihood improvement = 0: quitting");
                }
                return true;
            }

            if (logLikelihoodChange < 0)
            {
                // The last step made things worse, so backtrack.
                // NOTE(review): MapInPlace(null) looks like a lambda lost in the Java->C# conversion —
                // the original presumably halves lastDerivative before undoing it (d -> d / 2). TODO confirm.
                s.lastDerivative.MapInPlace(null);
                // Undo (part of) the previous step by subtracting the (reduced) last derivative.
                weights.AddVectorInPlace(s.lastDerivative, -1.0);
                if (!quiet)
                {
                    log.Info("\tBACKTRACK...");
                }
                // If the backtracking step's squared norm falls below this threshold, we've converged:
                // repeated halving has shrunk the correction to effectively nothing.
                double backtrackNormSquared = s.lastDerivative.DotProduct(s.lastDerivative);
                if (backtrackNormSquared < 1.0e-10)
                {
                    if (!quiet)
                    {
                        // Message threshold now matches the actual comparison above (was incorrectly "1.0e-9").
                        log.Info("\tBacktracking derivative norm " + backtrackNormSquared + " < 1.0e-10: quitting");
                    }
                    return true;
                }
            }
            else
            {
                // The last step improved the log-likelihood: take a regular AdaGrad step.
                ConcatVector squared = gradient.DeepClone();
                // NOTE(review): another lost lambda — the original squares each element (d -> d * d). TODO confirm.
                squared.MapInPlace(null);
                // Accumulate squared gradients for the per-component AdaGrad learning rates.
                s.adagradAccumulator.AddVectorInPlace(squared, 1.0);
                ConcatVector sqrt = s.adagradAccumulator.DeepClone();
                // NOTE(review): lost lambda — presumably the per-component AdaGrad scale
                // (d -> scale / (epsilon + Math.Sqrt(d))). TODO confirm.
                sqrt.MapInPlace(null);
                gradient.ElementwiseProductInPlace(sqrt);
                weights.AddVectorInPlace(gradient, 1.0);
                // Remember this step so a subsequent backtrack can undo it.
                s.lastDerivative    = gradient;
                s.lastLogLikelihood = logLikelihood;
                if (!quiet)
                {
                    log.Info("\tLL: " + logLikelihood);
                }
            }
            return false;
        }
            /// <summary>
            /// Main optimization loop: repeatedly computes the (optionally multithreaded) log-likelihood
            /// and gradient over the dataset, applies L2 regularization and constraints, and delegates the
            /// actual weight update to the enclosing optimizer until convergence or external termination.
            /// </summary>
            public virtual void Run()
            {
                // Multithreading stuff
                int numThreads = Math.Max(1, Runtime.GetRuntime().AvailableProcessors());

                // BUGFIX: the original allocated a non-generic IList[] and cast it to IList<T>[],
                // which throws InvalidCastException at runtime in .NET. Allocate the generic array directly.
                IList<T>[] queues = new IList<T>[numThreads];
                Random r = new Random();

                // Allocate work to make estimated cost of work per thread as even as possible:
                // greedily assign each datum to the queue with the lowest estimated total cost so far.
                if (this.useThreads)
                {
                    for (int i = 0; i < numThreads; i++)
                    {
                        queues[i] = new List<T>();
                    }
                    int[] queueEstimatedTotalCost = new int[numThreads];
                    foreach (T datum in this.dataset)
                    {
                        int datumEstimatedCost = this.EstimateRelativeRuntime(datum);
                        int minCostQueue = 0;
                        for (int i_1 = 0; i_1 < numThreads; i_1++)
                        {
                            if (queueEstimatedTotalCost[i_1] < queueEstimatedTotalCost[minCostQueue])
                            {
                                minCostQueue = i_1;
                            }
                        }
                        queueEstimatedTotalCost[minCostQueue] += datumEstimatedCost;
                        queues[minCostQueue].Add(datum);
                    }
                }
                while (!this.isFinished)
                {
                    // Collect log-likelihood and derivatives for this pass over the dataset.
                    long startTime = Runtime.CurrentTimeMillis();
                    long threadWaiting = 0;
                    ConcatVector derivative = this.weights.NewEmptyClone();
                    double logLikelihood = 0.0;
                    if (this.useThreads)
                    {
                        // Fan the queues out to one worker thread each.
                        AbstractBatchOptimizer.GradientWorker[] workers = new AbstractBatchOptimizer.GradientWorker[numThreads];
                        Thread[] threads = new Thread[numThreads];
                        for (int i = 0; i < workers.Length; i++)
                        {
                            workers[i] = new AbstractBatchOptimizer.GradientWorker(this, i, numThreads, queues[i], this.fn, this.weights);
                            threads[i] = new Thread(workers[i]);
                            workers[i].jvmThreadId = threads[i].GetId();
                            threads[i].Start();
                        }
                        // Wall-clock finish times, for logging how long threads spent waiting on each other.
                        long minFinishTime = long.MaxValue;
                        long maxFinishTime = long.MinValue;
                        // CPU times, for re-balancing work between the slowest and fastest workers.
                        long minCPUTime = long.MaxValue;
                        long maxCPUTime = long.MinValue;
                        int slowestWorker = 0;
                        int fastestWorker = 0;
                        for (int i_1 = 0; i_1 < workers.Length; i_1++)
                        {
                            try
                            {
                                threads[i_1].Join();
                            }
                            catch (Exception e)
                            {
                                throw new RuntimeInterruptedException(e);
                            }
                            // Reduce each worker's partial results into the global totals.
                            logLikelihood += workers[i_1].localLogLikelihood;
                            derivative.AddVectorInPlace(workers[i_1].localDerivative, 1.0);
                            if (workers[i_1].finishedAtTime < minFinishTime)
                            {
                                minFinishTime = workers[i_1].finishedAtTime;
                            }
                            if (workers[i_1].finishedAtTime > maxFinishTime)
                            {
                                maxFinishTime = workers[i_1].finishedAtTime;
                            }
                            if (workers[i_1].cpuTimeRequired < minCPUTime)
                            {
                                fastestWorker = i_1;
                                minCPUTime = workers[i_1].cpuTimeRequired;
                            }
                            if (workers[i_1].cpuTimeRequired > maxCPUTime)
                            {
                                slowestWorker = i_1;
                                maxCPUTime = workers[i_1].cpuTimeRequired;
                            }
                        }
                        threadWaiting = maxFinishTime - minFinishTime;
                        // Try to reallocate work dynamically to minimize waiting on subsequent rounds.
                        // Guard against maxCPUTime == 0: the original divided unconditionally, producing
                        // NaN and an undefined (int)Math.Floor(NaN) when all workers reported zero CPU time.
                        if (maxCPUTime > 0)
                        {
                            // The fraction of the slowest worker's time that the fastest worker spent idle;
                            // move half that fraction of the slow queue's items to the fast queue.
                            double waitingPercentage = (double)(maxCPUTime - minCPUTime) / (double)maxCPUTime;
                            int needTransferItems = (int)Math.Floor(queues[slowestWorker].Count * waitingPercentage * 0.5);
                            for (int i_2 = 0; i_2 < needTransferItems; i_2++)
                            {
                                int toTransfer = r.NextInt(queues[slowestWorker].Count);
                                T datum = queues[slowestWorker][toTransfer];
                                // BUGFIX: the original called Remove(toTransfer), which in .NET removes by
                                // *value* (IList<T>.Remove(T)); the Java original removed by *index*.
                                queues[slowestWorker].RemoveAt(toTransfer);
                                queues[fastestWorker].Add(datum);
                            }
                        }
                        // Check for user interrupt
                        if (this.isFinished)
                        {
                            return;
                        }
                    }
                    else
                    {
                        // Single-threaded path: accumulate over the dataset directly.
                        foreach (T datum in this.dataset)
                        {
                            System.Diagnostics.Debug.Assert((datum != null));
                            logLikelihood += this.fn.GetSummaryForInstance(datum, this.weights, derivative);
                            // Check for user interrupt
                            if (this.isFinished)
                            {
                                return;
                            }
                        }
                    }
                    // Normalize the log-likelihood by dataset size.
                    logLikelihood /= this.dataset.Length;
                    // NOTE(review): MapInPlace(null) looks like a lambda lost in the Java->C# conversion —
                    // presumably it divides each derivative component by dataset.Length. TODO confirm.
                    derivative.MapInPlace(null);
                    long gradientComputationTime = Runtime.CurrentTimeMillis() - startTime;
                    // L2 regularization: penalize the squared weight norm and adjust the gradient accordingly.
                    logLikelihood = logLikelihood - (this.l2regularization * this.weights.DotProduct(this.weights));
                    derivative.AddVectorInPlace(this.weights, -2 * this.l2regularization);
                    // Zero out the derivative on the components we're holding fixed.
                    foreach (AbstractBatchOptimizer.Constraint constraint in this._enclosing.constraints)
                    {
                        constraint.ApplyToDerivative(derivative);
                    }
                    // If our derivative is sufficiently small, we've converged.
                    double derivativeNorm = derivative.DotProduct(derivative);
                    if (derivativeNorm < this.convergenceDerivativeNorm)
                    {
                        if (!this.quiet)
                        {
                            AbstractBatchOptimizer.log.Info("Derivative norm " + derivativeNorm + " < " + this.convergenceDerivativeNorm + ": quitting");
                        }
                        break;
                    }
                    // Do the actual computation
                    if (!this.quiet)
                    {
                        AbstractBatchOptimizer.log.Info("[" + gradientComputationTime + " ms, threads waiting " + threadWaiting + " ms]");
                    }
                    bool converged = this._enclosing.UpdateWeights(this.weights, derivative, logLikelihood, this.optimizationState, this.quiet);
                    // Apply constraints to the weights vector
                    foreach (AbstractBatchOptimizer.Constraint constraint_1 in this._enclosing.constraints)
                    {
                        constraint_1.ApplyToWeights(this.weights);
                    }
                    if (converged)
                    {
                        break;
                    }
                }
                // Wake any threads blocked waiting for natural termination, then mark ourselves done.
                lock (this.naturalTerminationBarrier)
                {
                    Sharpen.Runtime.NotifyAll(this.naturalTerminationBarrier);
                }
                this.isFinished = true;
            }