// todo: The current version has variables for a 2 model version and arrays for an n-model version.  Unify.
 /// <summary>
 /// <inheritDoc/>
 ///
 /// </summary>
 public virtual double[] ScoresOf(int[] sequence, int pos)
 {
     if (models != null)
     {
         double[] dist = ArrayMath.Multiply(models[0].ScoresOf(sequence, pos), wts[0]);
         for (int i = 1; i < models.Length; i++)
         {
             double[] dist_i = models[i].ScoresOf(sequence, pos);
             ArrayMath.AddMultInPlace(dist, dist_i, wts[i]);
         }
         return(dist);
     }
     double[] dist1  = model1.ScoresOf(sequence, pos);
     double[] dist2  = model2.ScoresOf(sequence, pos);
     double[] dist_1 = new double[dist1.Length];
     for (int i_1 = 0; i_1 < dist1.Length; i_1++)
     {
         dist_1[i_1] = model1Wt * dist1[i_1] + model2Wt * dist2[i_1];
     }
     return(dist_1);
 }
        public virtual void ExecuteOneTrainingBatch(IList <Tree> trainingBatch, IdentityHashMap <Tree, byte[]> compressedParses, double[] sumGradSquare)
        {
            Timing convertTiming = new Timing();

            convertTiming.Doing("Converting trees");
            IdentityHashMap <Tree, IList <Tree> > topParses = CacheParseHypotheses.ConvertToTrees(trainingBatch, compressedParses, op.trainOptions.trainingThreads);

            convertTiming.Done();
            DVParserCostAndGradient gcFunc = new DVParserCostAndGradient(trainingBatch, topParses, dvModel, op);

            double[] theta = dvModel.ParamsToVector();
            switch (Minimizer)
            {
            case (1):
            {
                //maxFuncIter = 10;
                // 1: QNMinimizer, 2: SGD
                QNMinimizer qn = new QNMinimizer(op.trainOptions.qnEstimates, true);
                qn.UseMinPackSearch();
                qn.UseDiagonalScaling();
                qn.TerminateOnAverageImprovement(true);
                qn.TerminateOnNumericalZero(true);
                qn.TerminateOnRelativeNorm(true);
                theta = qn.Minimize(gcFunc, op.trainOptions.qnTolerance, theta, op.trainOptions.qnIterationsPerBatch);
                break;
            }

            case 2:
            {
                //Minimizer smd = new SGDMinimizer();       double tol = 1e-4;      theta = smd.minimize(gcFunc,tol,theta,op.trainOptions.qnIterationsPerBatch);
                double lastCost  = 0;
                double currCost  = 0;
                bool   firstTime = true;
                for (int i = 0; i < op.trainOptions.qnIterationsPerBatch; i++)
                {
                    //gcFunc.calculate(theta);
                    double[] grad = gcFunc.DerivativeAt(theta);
                    currCost = gcFunc.ValueAt(theta);
                    log.Info("batch cost: " + currCost);
                    //          if(!firstTime){
                    //              if(currCost > lastCost){
                    //                  System.out.println("HOW IS FUNCTION VALUE INCREASING????!!! ... still updating theta");
                    //              }
                    //              if(Math.abs(currCost - lastCost) < 0.0001){
                    //                  System.out.println("function value is not decreasing. stop");
                    //              }
                    //          }else{
                    //              firstTime = false;
                    //          }
                    lastCost = currCost;
                    ArrayMath.AddMultInPlace(theta, grad, -1 * op.trainOptions.learningRate);
                }
                break;
            }

            case 3:
            {
                // AdaGrad
                double eps      = 1e-3;
                double currCost = 0;
                for (int i = 0; i < op.trainOptions.qnIterationsPerBatch; i++)
                {
                    double[] gradf = gcFunc.DerivativeAt(theta);
                    currCost = gcFunc.ValueAt(theta);
                    log.Info("batch cost: " + currCost);
                    for (int feature = 0; feature < gradf.Length; feature++)
                    {
                        sumGradSquare[feature] = sumGradSquare[feature] + gradf[feature] * gradf[feature];
                        theta[feature]         = theta[feature] - (op.trainOptions.learningRate * gradf[feature] / (System.Math.Sqrt(sumGradSquare[feature]) + eps));
                    }
                }
                break;
            }

            default:
            {
                throw new ArgumentException("Unsupported minimizer " + Minimizer);
            }
            }
            dvModel.VectorToParams(theta);
        }