public virtual void ExecuteOneTrainingBatch(IList <Tree> trainingBatch, IdentityHashMap <Tree, byte[]> compressedParses, double[] sumGradSquare)
        {
            Timing convertTiming = new Timing();

            convertTiming.Doing("Converting trees");
            IdentityHashMap <Tree, IList <Tree> > topParses = CacheParseHypotheses.ConvertToTrees(trainingBatch, compressedParses, op.trainOptions.trainingThreads);

            convertTiming.Done();
            DVParserCostAndGradient gcFunc = new DVParserCostAndGradient(trainingBatch, topParses, dvModel, op);

            double[] theta = dvModel.ParamsToVector();
            switch (Minimizer)
            {
            case (1):
            {
                //maxFuncIter = 10;
                // 1: QNMinimizer, 2: SGD
                QNMinimizer qn = new QNMinimizer(op.trainOptions.qnEstimates, true);
                qn.UseMinPackSearch();
                qn.UseDiagonalScaling();
                qn.TerminateOnAverageImprovement(true);
                qn.TerminateOnNumericalZero(true);
                qn.TerminateOnRelativeNorm(true);
                theta = qn.Minimize(gcFunc, op.trainOptions.qnTolerance, theta, op.trainOptions.qnIterationsPerBatch);
                break;
            }

            case 2:
            {
                //Minimizer smd = new SGDMinimizer();       double tol = 1e-4;      theta = smd.minimize(gcFunc,tol,theta,op.trainOptions.qnIterationsPerBatch);
                double lastCost  = 0;
                double currCost  = 0;
                bool   firstTime = true;
                for (int i = 0; i < op.trainOptions.qnIterationsPerBatch; i++)
                {
                    //gcFunc.calculate(theta);
                    double[] grad = gcFunc.DerivativeAt(theta);
                    currCost = gcFunc.ValueAt(theta);
                    log.Info("batch cost: " + currCost);
                    //          if(!firstTime){
                    //              if(currCost > lastCost){
                    //                  System.out.println("HOW IS FUNCTION VALUE INCREASING????!!! ... still updating theta");
                    //              }
                    //              if(Math.abs(currCost - lastCost) < 0.0001){
                    //                  System.out.println("function value is not decreasing. stop");
                    //              }
                    //          }else{
                    //              firstTime = false;
                    //          }
                    lastCost = currCost;
                    ArrayMath.AddMultInPlace(theta, grad, -1 * op.trainOptions.learningRate);
                }
                break;
            }

            case 3:
            {
                // AdaGrad
                double eps      = 1e-3;
                double currCost = 0;
                for (int i = 0; i < op.trainOptions.qnIterationsPerBatch; i++)
                {
                    double[] gradf = gcFunc.DerivativeAt(theta);
                    currCost = gcFunc.ValueAt(theta);
                    log.Info("batch cost: " + currCost);
                    for (int feature = 0; feature < gradf.Length; feature++)
                    {
                        sumGradSquare[feature] = sumGradSquare[feature] + gradf[feature] * gradf[feature];
                        theta[feature]         = theta[feature] - (op.trainOptions.learningRate * gradf[feature] / (System.Math.Sqrt(sumGradSquare[feature]) + eps));
                    }
                }
                break;
            }

            default:
            {
                throw new ArgumentException("Unsupported minimizer " + Minimizer);
            }
            }
            dvModel.VectorToParams(theta);
        }
Esempio n. 2
0
        // fill value & derivative
        protected internal override void Calculate(double[] theta)
        {
            dvModel.VectorToParams(theta);
            double localValue = 0.0;

            double[] localDerivative = new double[theta.Length];
            TwoDimensionalMap <string, string, SimpleMatrix> binaryW_dfsG;
            TwoDimensionalMap <string, string, SimpleMatrix> binaryW_dfsB;

            binaryW_dfsG = TwoDimensionalMap.TreeMap();
            binaryW_dfsB = TwoDimensionalMap.TreeMap();
            TwoDimensionalMap <string, string, SimpleMatrix> binaryScoreDerivativesG;
            TwoDimensionalMap <string, string, SimpleMatrix> binaryScoreDerivativesB;

            binaryScoreDerivativesG = TwoDimensionalMap.TreeMap();
            binaryScoreDerivativesB = TwoDimensionalMap.TreeMap();
            IDictionary <string, SimpleMatrix> unaryW_dfsG;
            IDictionary <string, SimpleMatrix> unaryW_dfsB;

            unaryW_dfsG = new SortedDictionary <string, SimpleMatrix>();
            unaryW_dfsB = new SortedDictionary <string, SimpleMatrix>();
            IDictionary <string, SimpleMatrix> unaryScoreDerivativesG;
            IDictionary <string, SimpleMatrix> unaryScoreDerivativesB;

            unaryScoreDerivativesG = new SortedDictionary <string, SimpleMatrix>();
            unaryScoreDerivativesB = new SortedDictionary <string, SimpleMatrix>();
            IDictionary <string, SimpleMatrix> wordVectorDerivativesG = new SortedDictionary <string, SimpleMatrix>();
            IDictionary <string, SimpleMatrix> wordVectorDerivativesB = new SortedDictionary <string, SimpleMatrix>();

            foreach (TwoDimensionalMap.Entry <string, string, SimpleMatrix> entry in dvModel.binaryTransform)
            {
                int numRows = entry.GetValue().NumRows();
                int numCols = entry.GetValue().NumCols();
                binaryW_dfsG.Put(entry.GetFirstKey(), entry.GetSecondKey(), new SimpleMatrix(numRows, numCols));
                binaryW_dfsB.Put(entry.GetFirstKey(), entry.GetSecondKey(), new SimpleMatrix(numRows, numCols));
                binaryScoreDerivativesG.Put(entry.GetFirstKey(), entry.GetSecondKey(), new SimpleMatrix(1, numRows));
                binaryScoreDerivativesB.Put(entry.GetFirstKey(), entry.GetSecondKey(), new SimpleMatrix(1, numRows));
            }
            foreach (KeyValuePair <string, SimpleMatrix> entry_1 in dvModel.unaryTransform)
            {
                int numRows = entry_1.Value.NumRows();
                int numCols = entry_1.Value.NumCols();
                unaryW_dfsG[entry_1.Key]            = new SimpleMatrix(numRows, numCols);
                unaryW_dfsB[entry_1.Key]            = new SimpleMatrix(numRows, numCols);
                unaryScoreDerivativesG[entry_1.Key] = new SimpleMatrix(1, numRows);
                unaryScoreDerivativesB[entry_1.Key] = new SimpleMatrix(1, numRows);
            }
            if (op.trainOptions.trainWordVectors)
            {
                foreach (KeyValuePair <string, SimpleMatrix> entry_2 in dvModel.wordVectors)
                {
                    int numRows = entry_2.Value.NumRows();
                    int numCols = entry_2.Value.NumCols();
                    wordVectorDerivativesG[entry_2.Key] = new SimpleMatrix(numRows, numCols);
                    wordVectorDerivativesB[entry_2.Key] = new SimpleMatrix(numRows, numCols);
                }
            }
            // Some optimization methods prints out a line without an end, so our
            // debugging statements are misaligned
            Timing scoreTiming = new Timing();

            scoreTiming.Doing("Scoring trees");
            int treeNum = 0;
            MulticoreWrapper <Tree, Pair <DeepTree, DeepTree> > wrapper = new MulticoreWrapper <Tree, Pair <DeepTree, DeepTree> >(op.trainOptions.trainingThreads, new DVParserCostAndGradient.ScoringProcessor(this));

            foreach (Tree tree in trainingBatch)
            {
                wrapper.Put(tree);
            }
            wrapper.Join();
            scoreTiming.Done();
            while (wrapper.Peek())
            {
                Pair <DeepTree, DeepTree> result = wrapper.Poll();
                DeepTree      goldTree           = result.first;
                DeepTree      bestTree           = result.second;
                StringBuilder treeDebugLine      = new StringBuilder();
                Formatter     formatter          = new Formatter(treeDebugLine);
                bool          isDone             = (Math.Abs(bestTree.GetScore() - goldTree.GetScore()) <= 0.00001 || goldTree.GetScore() > bestTree.GetScore());
                string        done = isDone ? "done" : string.Empty;
                formatter.Format("Tree %6d Highest tree: %12.4f Correct tree: %12.4f %s", treeNum, bestTree.GetScore(), goldTree.GetScore(), done);
                log.Info(treeDebugLine.ToString());
                if (!isDone)
                {
                    // if the gold tree is better than the best hypothesis tree by
                    // a large enough margin, then the score difference will be 0
                    // and we ignore the tree
                    double valueDelta = bestTree.GetScore() - goldTree.GetScore();
                    //double valueDelta = Math.max(0.0, - scoreGold + bestScore);
                    localValue += valueDelta;
                    // get the context words for this tree - should be the same
                    // for either goldTree or bestTree
                    IList <string> words = GetContextWords(goldTree.GetTree());
                    // The derivatives affected by this tree are only based on the
                    // nodes present in this tree, eg not all matrix derivatives
                    // will be affected by this tree
                    BackpropDerivative(goldTree.GetTree(), words, goldTree.GetVectors(), binaryW_dfsG, unaryW_dfsG, binaryScoreDerivativesG, unaryScoreDerivativesG, wordVectorDerivativesG);
                    BackpropDerivative(bestTree.GetTree(), words, bestTree.GetVectors(), binaryW_dfsB, unaryW_dfsB, binaryScoreDerivativesB, unaryScoreDerivativesB, wordVectorDerivativesB);
                }
                ++treeNum;
            }
            double[] localDerivativeGood;
            double[] localDerivativeB;
            if (op.trainOptions.trainWordVectors)
            {
                localDerivativeGood = NeuralUtils.ParamsToVector(theta.Length, binaryW_dfsG.ValueIterator(), unaryW_dfsG.Values.GetEnumerator(), binaryScoreDerivativesG.ValueIterator(), unaryScoreDerivativesG.Values.GetEnumerator(), wordVectorDerivativesG.Values
                                                                 .GetEnumerator());
                localDerivativeB = NeuralUtils.ParamsToVector(theta.Length, binaryW_dfsB.ValueIterator(), unaryW_dfsB.Values.GetEnumerator(), binaryScoreDerivativesB.ValueIterator(), unaryScoreDerivativesB.Values.GetEnumerator(), wordVectorDerivativesB.Values
                                                              .GetEnumerator());
            }
            else
            {
                localDerivativeGood = NeuralUtils.ParamsToVector(theta.Length, binaryW_dfsG.ValueIterator(), unaryW_dfsG.Values.GetEnumerator(), binaryScoreDerivativesG.ValueIterator(), unaryScoreDerivativesG.Values.GetEnumerator());
                localDerivativeB    = NeuralUtils.ParamsToVector(theta.Length, binaryW_dfsB.ValueIterator(), unaryW_dfsB.Values.GetEnumerator(), binaryScoreDerivativesB.ValueIterator(), unaryScoreDerivativesB.Values.GetEnumerator());
            }
            // correct - highest
            for (int i = 0; i < localDerivativeGood.Length; i++)
            {
                localDerivative[i] = localDerivativeB[i] - localDerivativeGood[i];
            }
            // TODO: this is where we would combine multiple costs if we had parallelized the calculation
            value      = localValue;
            derivative = localDerivative;
            // normalizing by training batch size
            value = (1.0 / trainingBatch.Count) * value;
            ArrayMath.MultiplyInPlace(derivative, (1.0 / trainingBatch.Count));
            // add regularization to cost:
            double[] currentParams = dvModel.ParamsToVector();
            double   regCost       = 0;

            foreach (double currentParam in currentParams)
            {
                regCost += currentParam * currentParam;
            }
            regCost = op.trainOptions.regCost * 0.5 * regCost;
            value  += regCost;
            // add regularization to gradient
            ArrayMath.MultiplyInPlace(currentParams, op.trainOptions.regCost);
            ArrayMath.PairwiseAddInPlace(derivative, currentParams);
        }