/// <summary>
/// Merge the given <c>Cost</c> data with the data in this instance.
/// </summary>
/// <param name="otherCost">Cost data (e.g. from another mini-batch partition) to accumulate into this instance</param>
public virtual void Merge(Classifier.Cost otherCost)
{
    this.cost += otherCost.GetCost();
    this.percentCorrect += otherCost.GetPercentCorrect();
    ArrayMath.AddInPlace(this.gradW1, otherCost.GetGradW1());
    ArrayMath.PairwiseAddInPlace(this.gradb1, otherCost.GetGradb1());
    ArrayMath.AddInPlace(this.gradW2, otherCost.GetGradW2());
    ArrayMath.AddInPlace(this.gradE, otherCost.GetGradE());
}
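// Usage sketch (hypothetical names): partial Cost objects produced on
// separate worker threads are combined into a single result by Merge.
// After the call, costA holds the summed cost, percent-correct, and
// gradient matrices of both partitions.
//
//   Classifier.Cost costA = resultFromPartition1;
//   Classifier.Cost costB = resultFromPartition2;
//   costA.Merge(costB);
//   double combined = costA.GetCost();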
/// <summary>
/// Determine the total cost on the dataset associated with this
/// classifier using the current learned parameters.
/// </summary>
/// <remarks>
/// Determine the total cost on the dataset associated with this
/// classifier using the current learned parameters. This cost is
/// evaluated using mini-batch adaptive gradient descent.
/// This method launches multiple threads, each of which evaluates
/// training cost on a partition of the mini-batch.
/// </remarks>
/// <param name="batchSize">Number of training examples sampled for this mini-batch</param>
/// <param name="regParameter">Regularization parameter (lambda)</param>
/// <param name="dropOutProb">
/// Drop-out probability. Hidden-layer units in the
/// neural network will be randomly turned off
/// while training a particular example with this
/// probability.
/// </param>
/// <returns>
/// A <see cref="Cost"/> object which describes the total cost of the given
/// weights, and includes gradients to be used for further training
/// </returns>
public virtual Classifier.Cost ComputeCostFunction(int batchSize, double regParameter, double dropOutProb)
{
    ValidateTraining();
    IList<Example> examples = Edu.Stanford.Nlp.Parser.Nndep.Util.GetRandomSubList(dataset.examples, batchSize);
    // Redo precomputations for only those features which are triggered
    // by examples in this mini-batch.
    ICollection<int> toPreCompute = GetToPreCompute(examples);
    PreCompute(toPreCompute);
    // Set up parameters for feedforward
    Classifier.FeedforwardParams @params = new Classifier.FeedforwardParams(batchSize, dropOutProb);
    // Zero out saved-embedding gradients. The previous empty-array initialization
    // looked like a conversion artifact; this allocation assumes the preMap and
    // config.hiddenSize fields of this class, as in the original implementation.
    gradSaved = new double[preMap.Count][];
    for (int i = 0; i < gradSaved.Length; ++i)
    {
        gradSaved[i] = new double[config.hiddenSize];
    }
    int numChunks = config.trainingThreads;
    IList<IList<Example>> chunks = CollectionUtils.PartitionIntoFolds(examples, numChunks);
    // Submit chunks for processing on separate threads
    foreach (ICollection<Example> chunk in chunks)
    {
        jobHandler.Put(new Pair<ICollection<Example>, Classifier.FeedforwardParams>(chunk, @params));
    }
    jobHandler.Join(false);
    // Join costs from each chunk
    Classifier.Cost cost = null;
    while (jobHandler.Peek())
    {
        Classifier.Cost otherCost = jobHandler.Poll();
        if (cost == null)
        {
            cost = otherCost;
        }
        else
        {
            cost.Merge(otherCost);
        }
    }
    if (cost == null)
    {
        return null;
    }
    // Backpropagate gradients on saved pre-computed values to actual
    // embeddings
    cost.BackpropSaved(toPreCompute);
    cost.AddL2Regularization(regParameter);
    return cost;
}
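// Usage sketch (not part of this class): a minimal training loop pairing
// ComputeCostFunction with TakeAdaGradientStep. The classifier variable and
// all hyperparameter values below are illustrative assumptions only.
//
//   for (int iter = 0; iter < numIterations; ++iter)
//   {
//       Classifier.Cost cost = classifier.ComputeCostFunction(batchSize: 10000, regParameter: 1e-8, dropOutProb: 0.5);
//       if (cost == null)
//       {
//           continue;
//       }
//       classifier.TakeAdaGradientStep(cost, adaAlpha: 0.01, adaEps: 1e-6);
//   }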
/// <summary>
/// Update classifier weights using the given training cost
/// information.
/// </summary>
/// <param name="cost">
/// Cost information as returned by
/// <see cref="ComputeCostFunction(int, double, double)"/>.
/// </param>
/// <param name="adaAlpha">Global AdaGrad learning rate</param>
/// <param name="adaEps">
/// Epsilon value for numerical stability in AdaGrad's
/// division
/// </param>
public virtual void TakeAdaGradientStep(Classifier.Cost cost, double adaAlpha, double adaEps)
{
    ValidateTraining();
    double[][] gradW1 = cost.GetGradW1();
    double[][] gradW2 = cost.GetGradW2();
    double[][] gradE = cost.GetGradE();
    double[] gradb1 = cost.GetGradb1();
    for (int i = 0; i < W1.Length; ++i)
    {
        for (int j = 0; j < W1[i].Length; ++j)
        {
            eg2W1[i][j] += gradW1[i][j] * gradW1[i][j];
            W1[i][j] -= adaAlpha * gradW1[i][j] / System.Math.Sqrt(eg2W1[i][j] + adaEps);
        }
    }
    for (int i = 0; i < b1.Length; ++i)
    {
        eg2b1[i] += gradb1[i] * gradb1[i];
        b1[i] -= adaAlpha * gradb1[i] / System.Math.Sqrt(eg2b1[i] + adaEps);
    }
    for (int i = 0; i < W2.Length; ++i)
    {
        for (int j = 0; j < W2[i].Length; ++j)
        {
            eg2W2[i][j] += gradW2[i][j] * gradW2[i][j];
            W2[i][j] -= adaAlpha * gradW2[i][j] / System.Math.Sqrt(eg2W2[i][j] + adaEps);
        }
    }
    if (config.doWordEmbeddingGradUpdate)
    {
        for (int i = 0; i < E.Length; ++i)
        {
            for (int j = 0; j < E[i].Length; ++j)
            {
                eg2E[i][j] += gradE[i][j] * gradE[i][j];
                E[i][j] -= adaAlpha * gradE[i][j] / System.Math.Sqrt(eg2E[i][j] + adaEps);
            }
        }
    }
}
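// The AdaGrad update applied above, written for a single parameter theta with
// gradient g (each weight matrix keeps its own squared-gradient accumulator
// eg2, updated element-wise):
//
//   eg2   += g * g;
//   theta -= adaAlpha * g / Math.Sqrt(eg2 + adaEps);
//
// Parameters that receive large or frequent gradients accumulate a larger eg2
// and therefore take smaller effective steps over time.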