public virtual double[] ParamsToVector()
        {
            int totalSize = TotalParamSize();

            return NeuralUtils.ParamsToVector(totalSize, binaryTransform.ValueIterator(), binaryClassification.ValueIterator(),
                                              SimpleTensor.IteratorSimpleMatrix(binaryTensors.ValueIterator()),
                                              unaryClassification.Values.GetEnumerator(), wordVectors.Values.GetEnumerator());
        }
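        // A minimal sketch (not the actual NeuralUtils implementation) of the
        // flattening contract ParamsToVector relies on: every matrix from each
        // enumerator is copied into one contiguous vector in a fixed order, so
        // that a matching VectorToParams can read the values back in the same
        // order. Plain double[] rows stand in for SimpleMatrix contents here;
        // FlattenParams and its parameter names are illustrative only.
        public static double[] FlattenParams(int totalSize, params IEnumerator<double[]>[] matrixSets)
        {
            double[] vector = new double[totalSize];
            int index = 0;
            foreach (IEnumerator<double[]> matrices in matrixSets)
            {
                while (matrices.MoveNext())
                {
                    double[] flat = matrices.Current;
                    Array.Copy(flat, 0, vector, index, flat.Length);
                    index += flat.Length;
                }
            }
            if (index != totalSize)
            {
                // the declared size and the enumerators must agree exactly
                throw new InvalidOperationException("Expected " + totalSize + " parameters, copied " + index);
            }
            return vector;
        }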
        public virtual double[] ParamsToVector()
        {
            int totalSize = TotalParamSize();

            if (op.trainOptions.trainWordVectors)
            {
                return NeuralUtils.ParamsToVector(totalSize, binaryTransform.ValueIterator(), unaryTransform.Values.GetEnumerator(),
                                                  binaryScore.ValueIterator(), unaryScore.Values.GetEnumerator(),
                                                  wordVectors.Values.GetEnumerator());
            }
            else
            {
                return NeuralUtils.ParamsToVector(totalSize, binaryTransform.ValueIterator(), unaryTransform.Values.GetEnumerator(),
                                                  binaryScore.ValueIterator(), unaryScore.Values.GetEnumerator());
            }
        }
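        // Hedged companion sketch: whatever TotalParamSize() returns must count
        // exactly the matrices the branches above flatten, including the word
        // vectors only when trainWordVectors is set. A helper along these lines
        // (CountElements is hypothetical; NumRows/NumCols are the SimpleMatrix
        // accessors used elsewhere in this listing) makes that correspondence
        // explicit, e.g. CountElements(binaryTransform.ValueIterator()) + ...
        private static int CountElements(IEnumerator<SimpleMatrix> matrices)
        {
            int size = 0;
            while (matrices.MoveNext())
            {
                size += matrices.Current.NumRows() * matrices.Current.NumCols();
            }
            return size;
        }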
        protected internal override void Calculate(double[] theta)
        {
            model.VectorToParams(theta);
            SentimentCostAndGradient.ModelDerivatives derivatives;
            if (model.op.trainOptions.nThreads == 1)
            {
                derivatives = ScoreDerivatives(trainingBatch);
            }
            else
            {
                // TODO: because some addition operations happen in different
                // orders now, this results in slightly different values, which
                // over time add up to significantly different models even when
                // given the same random seed.  Probably not a big deal.
                // To be more specific, for trees T1, T2, T3, ... Tn,
                // when using one thread, we sum the derivatives T1 + T2 ...
                // When using multiple threads, we first sum T1 + ... + Tk,
                // then sum Tk+1 + ... + T2k, etc, for split size k.
                // The splits are then summed in order.
                // This different sum order results in slightly different numbers.
                MulticoreWrapper <IList <Tree>, SentimentCostAndGradient.ModelDerivatives> wrapper =
                    new MulticoreWrapper <IList <Tree>, SentimentCostAndGradient.ModelDerivatives>(model.op.trainOptions.nThreads,
                                                                                                   new SentimentCostAndGradient.ScoringProcessor(this));
                // use wrapper.nThreads in case the number of threads was automatically changed
                foreach (IList <Tree> chunk in CollectionUtils.PartitionIntoFolds(trainingBatch, wrapper.NThreads()))
                {
                    wrapper.Put(chunk);
                }
                wrapper.Join();
                derivatives = new SentimentCostAndGradient.ModelDerivatives(model);
                while (wrapper.Peek())
                {
                    SentimentCostAndGradient.ModelDerivatives batchDerivatives = wrapper.Poll();
                    derivatives.Add(batchDerivatives);
                }
            }
            // scale the error by the number of sentences so that the
            // regularization isn't drowned out for large training batches
            double scale = (1.0 / trainingBatch.Count);

            value      = derivatives.error * scale;
            value     += ScaleAndRegularize(derivatives.binaryTD, model.binaryTransform, scale, model.op.trainOptions.regTransformMatrix, false);
            value     += ScaleAndRegularize(derivatives.binaryCD, model.binaryClassification, scale, model.op.trainOptions.regClassification, true);
            value     += ScaleAndRegularizeTensor(derivatives.binaryTensorTD, model.binaryTensors, scale, model.op.trainOptions.regTransformTensor);
            value     += ScaleAndRegularize(derivatives.unaryCD, model.unaryClassification, scale, model.op.trainOptions.regClassification, false, true);
            value     += ScaleAndRegularize(derivatives.wordVectorD, model.wordVectors, scale, model.op.trainOptions.regWordVector, true, false);
            derivative = NeuralUtils.ParamsToVector(theta.Length, derivatives.binaryTD.ValueIterator(), derivatives.binaryCD.ValueIterator(),
                                                    SimpleTensor.IteratorSimpleMatrix(derivatives.binaryTensorTD.ValueIterator()),
                                                    derivatives.unaryCD.Values.GetEnumerator(), derivatives.wordVectorD.Values.GetEnumerator());
        }
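        // Hedged usage sketch: Calculate fills the cached 'value' (cost) and
        // 'derivative' (gradient) fields, which makes a central-difference
        // gradient check straightforward. 'costAt' is a hypothetical wrapper
        // that runs Calculate for a given theta and returns the resulting
        // value; nothing below is part of the actual training code.
        private static double[] NumericGradient(Func<double[], double> costAt, double[] theta, double eps)
        {
            double[] grad = new double[theta.Length];
            for (int i = 0; i < theta.Length; i++)
            {
                double saved = theta[i];
                theta[i] = saved + eps;
                double up = costAt(theta);
                theta[i] = saved - eps;
                double down = costAt(theta);
                theta[i] = saved;
                grad[i] = (up - down) / (2.0 * eps);  // central difference
            }
            return grad;
        }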
        // fill value & derivative
        protected internal override void Calculate(double[] theta)
        {
            dvModel.VectorToParams(theta);
            double localValue = 0.0;

            double[] localDerivative = new double[theta.Length];
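
            // Two parallel sets of accumulators are built below: the "G" maps
            // collect derivatives from the gold trees, the "B" maps from the
            // best (highest-scoring) hypothesis trees; their difference gives
            // the margin-style gradient computed at the end of this method.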
            TwoDimensionalMap <string, string, SimpleMatrix> binaryW_dfsG;
            TwoDimensionalMap <string, string, SimpleMatrix> binaryW_dfsB;

            binaryW_dfsG = TwoDimensionalMap.TreeMap();
            binaryW_dfsB = TwoDimensionalMap.TreeMap();
            TwoDimensionalMap <string, string, SimpleMatrix> binaryScoreDerivativesG;
            TwoDimensionalMap <string, string, SimpleMatrix> binaryScoreDerivativesB;

            binaryScoreDerivativesG = TwoDimensionalMap.TreeMap();
            binaryScoreDerivativesB = TwoDimensionalMap.TreeMap();
            IDictionary <string, SimpleMatrix> unaryW_dfsG;
            IDictionary <string, SimpleMatrix> unaryW_dfsB;

            unaryW_dfsG = new SortedDictionary <string, SimpleMatrix>();
            unaryW_dfsB = new SortedDictionary <string, SimpleMatrix>();
            IDictionary <string, SimpleMatrix> unaryScoreDerivativesG;
            IDictionary <string, SimpleMatrix> unaryScoreDerivativesB;

            unaryScoreDerivativesG = new SortedDictionary <string, SimpleMatrix>();
            unaryScoreDerivativesB = new SortedDictionary <string, SimpleMatrix>();
            IDictionary <string, SimpleMatrix> wordVectorDerivativesG = new SortedDictionary <string, SimpleMatrix>();
            IDictionary <string, SimpleMatrix> wordVectorDerivativesB = new SortedDictionary <string, SimpleMatrix>();

            foreach (TwoDimensionalMap.Entry <string, string, SimpleMatrix> entry in dvModel.binaryTransform)
            {
                int numRows = entry.GetValue().NumRows();
                int numCols = entry.GetValue().NumCols();
                binaryW_dfsG.Put(entry.GetFirstKey(), entry.GetSecondKey(), new SimpleMatrix(numRows, numCols));
                binaryW_dfsB.Put(entry.GetFirstKey(), entry.GetSecondKey(), new SimpleMatrix(numRows, numCols));
                binaryScoreDerivativesG.Put(entry.GetFirstKey(), entry.GetSecondKey(), new SimpleMatrix(1, numRows));
                binaryScoreDerivativesB.Put(entry.GetFirstKey(), entry.GetSecondKey(), new SimpleMatrix(1, numRows));
            }
            foreach (KeyValuePair <string, SimpleMatrix> entry_1 in dvModel.unaryTransform)
            {
                int numRows = entry_1.Value.NumRows();
                int numCols = entry_1.Value.NumCols();
                unaryW_dfsG[entry_1.Key]            = new SimpleMatrix(numRows, numCols);
                unaryW_dfsB[entry_1.Key]            = new SimpleMatrix(numRows, numCols);
                unaryScoreDerivativesG[entry_1.Key] = new SimpleMatrix(1, numRows);
                unaryScoreDerivativesB[entry_1.Key] = new SimpleMatrix(1, numRows);
            }
            if (op.trainOptions.trainWordVectors)
            {
                foreach (KeyValuePair <string, SimpleMatrix> entry_2 in dvModel.wordVectors)
                {
                    int numRows = entry_2.Value.NumRows();
                    int numCols = entry_2.Value.NumCols();
                    wordVectorDerivativesG[entry_2.Key] = new SimpleMatrix(numRows, numCols);
                    wordVectorDerivativesB[entry_2.Key] = new SimpleMatrix(numRows, numCols);
                }
            }
            // Some optimization methods print a line without a trailing
            // newline, so our debugging statements can come out misaligned
            Timing scoreTiming = new Timing();

            scoreTiming.Doing("Scoring trees");
            int treeNum = 0;
            MulticoreWrapper <Tree, Pair <DeepTree, DeepTree> > wrapper = new MulticoreWrapper <Tree, Pair <DeepTree, DeepTree> >(op.trainOptions.trainingThreads, new DVParserCostAndGradient.ScoringProcessor(this));

            foreach (Tree tree in trainingBatch)
            {
                wrapper.Put(tree);
            }
            wrapper.Join();
            scoreTiming.Done();
            while (wrapper.Peek())
            {
                Pair <DeepTree, DeepTree> result = wrapper.Poll();
                DeepTree      goldTree           = result.first;
                DeepTree      bestTree           = result.second;
                StringBuilder treeDebugLine      = new StringBuilder();
                Formatter     formatter          = new Formatter(treeDebugLine);
                bool          isDone             = (Math.Abs(bestTree.GetScore() - goldTree.GetScore()) <= 0.00001 || goldTree.GetScore() > bestTree.GetScore());
                string        done = isDone ? "done" : string.Empty;
                formatter.Format("Tree %6d Highest tree: %12.4f Correct tree: %12.4f %s", treeNum, bestTree.GetScore(), goldTree.GetScore(), done);
                log.Info(treeDebugLine.ToString());
                if (!isDone)
                {
                    // if the gold tree is better than the best hypothesis tree by
                    // a large enough margin, then the score difference will be 0
                    // and we ignore the tree
                    double valueDelta = bestTree.GetScore() - goldTree.GetScore();
                    //double valueDelta = Math.Max(0.0, -scoreGold + bestScore);
                    localValue += valueDelta;
                    // get the context words for this tree - should be the same
                    // for either goldTree or bestTree
                    IList <string> words = GetContextWords(goldTree.GetTree());
                    // The derivatives affected by this tree are only based on the
                    // nodes present in this tree, eg not all matrix derivatives
                    // will be affected by this tree
                    BackpropDerivative(goldTree.GetTree(), words, goldTree.GetVectors(), binaryW_dfsG, unaryW_dfsG, binaryScoreDerivativesG, unaryScoreDerivativesG, wordVectorDerivativesG);
                    BackpropDerivative(bestTree.GetTree(), words, bestTree.GetVectors(), binaryW_dfsB, unaryW_dfsB, binaryScoreDerivativesB, unaryScoreDerivativesB, wordVectorDerivativesB);
                }
                ++treeNum;
            }
            double[] localDerivativeGood;
            double[] localDerivativeB;
            if (op.trainOptions.trainWordVectors)
            {
                localDerivativeGood = NeuralUtils.ParamsToVector(theta.Length, binaryW_dfsG.ValueIterator(), unaryW_dfsG.Values.GetEnumerator(),
                                                                 binaryScoreDerivativesG.ValueIterator(), unaryScoreDerivativesG.Values.GetEnumerator(),
                                                                 wordVectorDerivativesG.Values.GetEnumerator());
                localDerivativeB = NeuralUtils.ParamsToVector(theta.Length, binaryW_dfsB.ValueIterator(), unaryW_dfsB.Values.GetEnumerator(),
                                                              binaryScoreDerivativesB.ValueIterator(), unaryScoreDerivativesB.Values.GetEnumerator(),
                                                              wordVectorDerivativesB.Values.GetEnumerator());
            }
            else
            {
                localDerivativeGood = NeuralUtils.ParamsToVector(theta.Length, binaryW_dfsG.ValueIterator(), unaryW_dfsG.Values.GetEnumerator(), binaryScoreDerivativesG.ValueIterator(), unaryScoreDerivativesG.Values.GetEnumerator());
                localDerivativeB    = NeuralUtils.ParamsToVector(theta.Length, binaryW_dfsB.ValueIterator(), unaryW_dfsB.Values.GetEnumerator(), binaryScoreDerivativesB.ValueIterator(), unaryScoreDerivativesB.Values.GetEnumerator());
            }
            // margin gradient: best (highest-scoring) tree derivatives minus
            // gold (correct) tree derivatives
            for (int i = 0; i < localDerivativeGood.Length; i++)
            {
                localDerivative[i] = localDerivativeB[i] - localDerivativeGood[i];
            }
            // TODO: this is where we would combine multiple costs if we had parallelized the calculation
            value      = localValue;
            derivative = localDerivative;
            // normalizing by training batch size
            value = (1.0 / trainingBatch.Count) * value;
            ArrayMath.MultiplyInPlace(derivative, (1.0 / trainingBatch.Count));
            // add regularization to cost:
            double[] currentParams = dvModel.ParamsToVector();
            double   regCost       = 0;

            foreach (double currentParam in currentParams)
            {
                regCost += currentParam * currentParam;
            }
            regCost = op.trainOptions.regCost * 0.5 * regCost;
            value  += regCost;
            // add regularization to gradient
            ArrayMath.MultiplyInPlace(currentParams, op.trainOptions.regCost);
            ArrayMath.PairwiseAddInPlace(derivative, currentParams);
        }
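        // Isolated hedged sketch of the L2 regularization step above: the cost
        // gains (regCost / 2) * ||theta||^2 and the gradient gains
        // regCost * theta. AddL2Regularization and its names are illustrative
        // only, not part of the parser code.
        private static double AddL2Regularization(double[] parameters, double[] gradient, double lambda)
        {
            double penalty = 0.0;
            for (int i = 0; i < parameters.Length; i++)
            {
                penalty += parameters[i] * parameters[i];
                gradient[i] += lambda * parameters[i];  // d/dtheta of (lambda/2) * theta^2
            }
            return 0.5 * lambda * penalty;
        }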