public virtual double[] ParamsToVector() { int totalSize = TotalParamSize(); return(NeuralUtils.ParamsToVector(totalSize, binaryTransform.ValueIterator(), binaryClassification.ValueIterator(), SimpleTensor.IteratorSimpleMatrix(binaryTensors.ValueIterator()), unaryClassification.Values.GetEnumerator(), wordVectors.Values .GetEnumerator())); }
public virtual double[] ParamsToVector() { int totalSize = TotalParamSize(); if (op.trainOptions.trainWordVectors) { return(NeuralUtils.ParamsToVector(totalSize, binaryTransform.ValueIterator(), unaryTransform.Values.GetEnumerator(), binaryScore.ValueIterator(), unaryScore.Values.GetEnumerator(), wordVectors.Values.GetEnumerator())); } else { return(NeuralUtils.ParamsToVector(totalSize, binaryTransform.ValueIterator(), unaryTransform.Values.GetEnumerator(), binaryScore.ValueIterator(), unaryScore.Values.GetEnumerator())); } }
protected internal override void Calculate(double[] theta) { model.VectorToParams(theta); SentimentCostAndGradient.ModelDerivatives derivatives; if (model.op.trainOptions.nThreads == 1) { derivatives = ScoreDerivatives(trainingBatch); } else { // TODO: because some addition operations happen in different // orders now, this results in slightly different values, which // over time add up to significantly different models even when // given the same random seed. Probably not a big deal. // To be more specific, for trees T1, T2, T3, ... Tn, // when using one thread, we sum the derivatives T1 + T2 ... // When using multiple threads, we first sum T1 + ... + Tk, // then sum Tk+1 + ... + T2k, etc, for split size k. // The splits are then summed in order. // This different sum order results in slightly different numbers. MulticoreWrapper <IList <Tree>, SentimentCostAndGradient.ModelDerivatives> wrapper = new MulticoreWrapper <IList <Tree>, SentimentCostAndGradient.ModelDerivatives>(model.op.trainOptions.nThreads, new SentimentCostAndGradient.ScoringProcessor(this )); // use wrapper.nThreads in case the number of threads was automatically changed foreach (IList <Tree> chunk in CollectionUtils.PartitionIntoFolds(trainingBatch, wrapper.NThreads())) { wrapper.Put(chunk); } wrapper.Join(); derivatives = new SentimentCostAndGradient.ModelDerivatives(model); while (wrapper.Peek()) { SentimentCostAndGradient.ModelDerivatives batchDerivatives = wrapper.Poll(); derivatives.Add(batchDerivatives); } } // scale the error by the number of sentences so that the // regularization isn't drowned out for large training batchs double scale = (1.0 / trainingBatch.Count); value = derivatives.error * scale; value += ScaleAndRegularize(derivatives.binaryTD, model.binaryTransform, scale, model.op.trainOptions.regTransformMatrix, false); value += ScaleAndRegularize(derivatives.binaryCD, model.binaryClassification, scale, model.op.trainOptions.regClassification, true); value += ScaleAndRegularizeTensor(derivatives.binaryTensorTD, model.binaryTensors, scale, model.op.trainOptions.regTransformTensor); value += ScaleAndRegularize(derivatives.unaryCD, model.unaryClassification, scale, model.op.trainOptions.regClassification, false, true); value += ScaleAndRegularize(derivatives.wordVectorD, model.wordVectors, scale, model.op.trainOptions.regWordVector, true, false); derivative = NeuralUtils.ParamsToVector(theta.Length, derivatives.binaryTD.ValueIterator(), derivatives.binaryCD.ValueIterator(), SimpleTensor.IteratorSimpleMatrix(derivatives.binaryTensorTD.ValueIterator()), derivatives.unaryCD.Values.GetEnumerator (), derivatives.wordVectorD.Values.GetEnumerator()); }
// fill value & derivative protected internal override void Calculate(double[] theta) { dvModel.VectorToParams(theta); double localValue = 0.0; double[] localDerivative = new double[theta.Length]; TwoDimensionalMap <string, string, SimpleMatrix> binaryW_dfsG; TwoDimensionalMap <string, string, SimpleMatrix> binaryW_dfsB; binaryW_dfsG = TwoDimensionalMap.TreeMap(); binaryW_dfsB = TwoDimensionalMap.TreeMap(); TwoDimensionalMap <string, string, SimpleMatrix> binaryScoreDerivativesG; TwoDimensionalMap <string, string, SimpleMatrix> binaryScoreDerivativesB; binaryScoreDerivativesG = TwoDimensionalMap.TreeMap(); binaryScoreDerivativesB = TwoDimensionalMap.TreeMap(); IDictionary <string, SimpleMatrix> unaryW_dfsG; IDictionary <string, SimpleMatrix> unaryW_dfsB; unaryW_dfsG = new SortedDictionary <string, SimpleMatrix>(); unaryW_dfsB = new SortedDictionary <string, SimpleMatrix>(); IDictionary <string, SimpleMatrix> unaryScoreDerivativesG; IDictionary <string, SimpleMatrix> unaryScoreDerivativesB; unaryScoreDerivativesG = new SortedDictionary <string, SimpleMatrix>(); unaryScoreDerivativesB = new SortedDictionary <string, SimpleMatrix>(); IDictionary <string, SimpleMatrix> wordVectorDerivativesG = new SortedDictionary <string, SimpleMatrix>(); IDictionary <string, SimpleMatrix> wordVectorDerivativesB = new SortedDictionary <string, SimpleMatrix>(); foreach (TwoDimensionalMap.Entry <string, string, SimpleMatrix> entry in dvModel.binaryTransform) { int numRows = entry.GetValue().NumRows(); int numCols = entry.GetValue().NumCols(); binaryW_dfsG.Put(entry.GetFirstKey(), entry.GetSecondKey(), new SimpleMatrix(numRows, numCols)); binaryW_dfsB.Put(entry.GetFirstKey(), entry.GetSecondKey(), new SimpleMatrix(numRows, numCols)); binaryScoreDerivativesG.Put(entry.GetFirstKey(), entry.GetSecondKey(), new SimpleMatrix(1, numRows)); binaryScoreDerivativesB.Put(entry.GetFirstKey(), entry.GetSecondKey(), new SimpleMatrix(1, numRows)); } foreach (KeyValuePair <string, SimpleMatrix> entry_1 in dvModel.unaryTransform) { int numRows = entry_1.Value.NumRows(); int numCols = entry_1.Value.NumCols(); unaryW_dfsG[entry_1.Key] = new SimpleMatrix(numRows, numCols); unaryW_dfsB[entry_1.Key] = new SimpleMatrix(numRows, numCols); unaryScoreDerivativesG[entry_1.Key] = new SimpleMatrix(1, numRows); unaryScoreDerivativesB[entry_1.Key] = new SimpleMatrix(1, numRows); } if (op.trainOptions.trainWordVectors) { foreach (KeyValuePair <string, SimpleMatrix> entry_2 in dvModel.wordVectors) { int numRows = entry_2.Value.NumRows(); int numCols = entry_2.Value.NumCols(); wordVectorDerivativesG[entry_2.Key] = new SimpleMatrix(numRows, numCols); wordVectorDerivativesB[entry_2.Key] = new SimpleMatrix(numRows, numCols); } } // Some optimization methods prints out a line without an end, so our // debugging statements are misaligned Timing scoreTiming = new Timing(); scoreTiming.Doing("Scoring trees"); int treeNum = 0; MulticoreWrapper <Tree, Pair <DeepTree, DeepTree> > wrapper = new MulticoreWrapper <Tree, Pair <DeepTree, DeepTree> >(op.trainOptions.trainingThreads, new DVParserCostAndGradient.ScoringProcessor(this)); foreach (Tree tree in trainingBatch) { wrapper.Put(tree); } wrapper.Join(); scoreTiming.Done(); while (wrapper.Peek()) { Pair <DeepTree, DeepTree> result = wrapper.Poll(); DeepTree goldTree = result.first; DeepTree bestTree = result.second; StringBuilder treeDebugLine = new StringBuilder(); Formatter formatter = new Formatter(treeDebugLine); bool isDone = (Math.Abs(bestTree.GetScore() - goldTree.GetScore()) <= 0.00001 || goldTree.GetScore() > bestTree.GetScore()); string done = isDone ? "done" : string.Empty; formatter.Format("Tree %6d Highest tree: %12.4f Correct tree: %12.4f %s", treeNum, bestTree.GetScore(), goldTree.GetScore(), done); log.Info(treeDebugLine.ToString()); if (!isDone) { // if the gold tree is better than the best hypothesis tree by // a large enough margin, then the score difference will be 0 // and we ignore the tree double valueDelta = bestTree.GetScore() - goldTree.GetScore(); //double valueDelta = Math.max(0.0, - scoreGold + bestScore); localValue += valueDelta; // get the context words for this tree - should be the same // for either goldTree or bestTree IList <string> words = GetContextWords(goldTree.GetTree()); // The derivatives affected by this tree are only based on the // nodes present in this tree, eg not all matrix derivatives // will be affected by this tree BackpropDerivative(goldTree.GetTree(), words, goldTree.GetVectors(), binaryW_dfsG, unaryW_dfsG, binaryScoreDerivativesG, unaryScoreDerivativesG, wordVectorDerivativesG); BackpropDerivative(bestTree.GetTree(), words, bestTree.GetVectors(), binaryW_dfsB, unaryW_dfsB, binaryScoreDerivativesB, unaryScoreDerivativesB, wordVectorDerivativesB); } ++treeNum; } double[] localDerivativeGood; double[] localDerivativeB; if (op.trainOptions.trainWordVectors) { localDerivativeGood = NeuralUtils.ParamsToVector(theta.Length, binaryW_dfsG.ValueIterator(), unaryW_dfsG.Values.GetEnumerator(), binaryScoreDerivativesG.ValueIterator(), unaryScoreDerivativesG.Values.GetEnumerator(), wordVectorDerivativesG.Values .GetEnumerator()); localDerivativeB = NeuralUtils.ParamsToVector(theta.Length, binaryW_dfsB.ValueIterator(), unaryW_dfsB.Values.GetEnumerator(), binaryScoreDerivativesB.ValueIterator(), unaryScoreDerivativesB.Values.GetEnumerator(), wordVectorDerivativesB.Values .GetEnumerator()); } else { localDerivativeGood = NeuralUtils.ParamsToVector(theta.Length, binaryW_dfsG.ValueIterator(), unaryW_dfsG.Values.GetEnumerator(), binaryScoreDerivativesG.ValueIterator(), unaryScoreDerivativesG.Values.GetEnumerator()); localDerivativeB = NeuralUtils.ParamsToVector(theta.Length, binaryW_dfsB.ValueIterator(), unaryW_dfsB.Values.GetEnumerator(), binaryScoreDerivativesB.ValueIterator(), unaryScoreDerivativesB.Values.GetEnumerator()); } // correct - highest for (int i = 0; i < localDerivativeGood.Length; i++) { localDerivative[i] = localDerivativeB[i] - localDerivativeGood[i]; } // TODO: this is where we would combine multiple costs if we had parallelized the calculation value = localValue; derivative = localDerivative; // normalizing by training batch size value = (1.0 / trainingBatch.Count) * value; ArrayMath.MultiplyInPlace(derivative, (1.0 / trainingBatch.Count)); // add regularization to cost: double[] currentParams = dvModel.ParamsToVector(); double regCost = 0; foreach (double currentParam in currentParams) { regCost += currentParam * currentParam; } regCost = op.trainOptions.regCost * 0.5 * regCost; value += regCost; // add regularization to gradient ArrayMath.MultiplyInPlace(currentParams, op.trainOptions.regCost); ArrayMath.PairwiseAddInPlace(derivative, currentParams); }