/// <summary>
/// Determine the total cost on the dataset associated with this
/// classifier using the current learned parameters.
/// </summary>
/// <remarks>
/// Determine the total cost on the dataset associated with this
/// classifier using the current learned parameters. This cost is
/// evaluated using mini-batch adaptive gradient descent.
/// This method launches multiple threads, each of which evaluates
/// training cost on a partition of the mini-batch.
/// </remarks>
/// <param name="batchSize">Number of examples to sample from the dataset for this mini-batch</param>
/// <param name="regParameter">Regularization parameter (lambda)</param>
/// <param name="dropOutProb">
/// Drop-out probability. Hidden-layer units in the
/// neural network will be randomly turned off
/// while training a particular example with this
/// probability.
/// </param>
/// <returns>
/// A <see cref="Cost"/> object which describes the total cost of the given
/// weights, and includes gradients to be used for further training
/// </returns>
public virtual Classifier.Cost ComputeCostFunction(int batchSize, double regParameter, double dropOutProb)
{
    ValidateTraining();
    IList<Example> examples = Edu.Stanford.Nlp.Parser.Nndep.Util.GetRandomSubList(dataset.examples, batchSize);
    // Redo precomputations for only those features which are triggered
    // by examples in this mini-batch.
    ICollection<int> toPreCompute = GetToPreCompute(examples);
    PreCompute(toPreCompute);
    // Set up parameters for feedforward
    Classifier.FeedforwardParams @params = new Classifier.FeedforwardParams(batchSize, dropOutProb);
    // Zero out saved-embedding gradients: one row per precomputed feature,
    // one column per hidden unit
    gradSaved = new double[preMap.Count][];
    for (int i = 0; i < preMap.Count; ++i)
    {
        gradSaved[i] = new double[config.hiddenSize];
    }
    int numChunks = config.trainingThreads;
    IList<IList<Example>> chunks = CollectionUtils.PartitionIntoFolds(examples, numChunks);
    // Submit chunks for processing on separate threads
    foreach (ICollection<Example> chunk in chunks)
    {
        jobHandler.Put(new Pair<ICollection<Example>, Classifier.FeedforwardParams>(chunk, @params));
    }
    jobHandler.Join(false);
    // Join costs from each chunk
    Classifier.Cost cost = null;
    while (jobHandler.Peek())
    {
        Classifier.Cost otherCost = jobHandler.Poll();
        if (cost == null)
        {
            cost = otherCost;
        }
        else
        {
            cost.Merge(otherCost);
        }
    }
    if (cost == null)
    {
        return null;
    }
    // Backpropagate gradients on saved pre-computed values to actual
    // embeddings
    cost.BackpropSaved(toPreCompute);
    cost.AddL2Regularization(regParameter);
    return cost;
}
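// A minimal usage sketch for the method above, kept as a comment: a training loop
// would typically compute a cost for each mini-batch and then apply an
// adaptive-gradient update. The driver variable names, the config fields, and the
// method TakeAdaGradientStep below are assumptions for illustration, not part of
// this file.
//
//   for (int iter = 0; iter < config.maxIter; ++iter)
//   {
//       Classifier.Cost cost = classifier.ComputeCostFunction(config.batchSize, config.regParameter, config.dropProb);
//       classifier.TakeAdaGradientStep(cost, config.adaAlpha, config.adaEps);
//   }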
public virtual Classifier.Cost Process(Pair<ICollection<Example>, Classifier.FeedforwardParams> input)
{
    ICollection<Example> examples = input.First();
    Classifier.FeedforwardParams @params = input.Second();
    // We can't fix the seed used with ThreadLocalRandom
    // TODO: Is this a serious problem?
    ThreadLocalRandom random = ThreadLocalRandom.Current();
    // Allocate gradient accumulators matching the shapes of W1, b1, W2, and E
    this.gradW1 = new double[this._enclosing.W1.Length][];
    for (int i = 0; i < this.gradW1.Length; ++i)
    {
        this.gradW1[i] = new double[this._enclosing.W1[i].Length];
    }
    this.gradb1 = new double[this._enclosing.b1.Length];
    this.gradW2 = new double[this._enclosing.W2.Length][];
    for (int i = 0; i < this.gradW2.Length; ++i)
    {
        this.gradW2[i] = new double[this._enclosing.W2[i].Length];
    }
    this.gradE = new double[this._enclosing.E.Length][];
    for (int i = 0; i < this.gradE.Length; ++i)
    {
        this.gradE[i] = new double[this._enclosing.E[i].Length];
    }
    double cost = 0.0;
    double correct = 0.0;
    foreach (Example ex in examples)
    {
        IList<int> feature = ex.GetFeature();
        IList<int> label = ex.GetLabel();
        double[] scores = new double[this._enclosing.numLabels];
        double[] hidden = new double[this._enclosing.config.hiddenSize];
        double[] hidden3 = new double[this._enclosing.config.hiddenSize];
        // Run dropout: randomly drop some hidden-layer units. `ls`
        // contains the indices of those units which are still active
        // (each unit is kept with probability 1 - dropOutProb)
        int[] ls = IIntStream.Range(0, this._enclosing.config.hiddenSize).Filter(n => random.NextDouble() > @params.GetDropOutProb()).ToArray();
        int offset = 0;
        for (int j = 0; j < this._enclosing.config.numTokens; ++j)
        {
            int tok = feature[j];
            int index = tok * this._enclosing.config.numTokens + j;
            if (this._enclosing.preMap.Contains(index))
            {
                // Unit activations for this input feature value have been
                // precomputed
                int id = this._enclosing.preMap[index];
                // Only extract activations for those nodes which are still
                // activated (`ls`)
                foreach (int nodeIndex in ls)
                {
                    hidden[nodeIndex] += this._enclosing.saved[id][nodeIndex];
                }
            }
            else
            {
                foreach (int nodeIndex in ls)
                {
                    for (int k = 0; k < this._enclosing.config.embeddingSize; ++k)
                    {
                        hidden[nodeIndex] += this._enclosing.W1[nodeIndex][offset + k] * this._enclosing.E[tok][k];
                    }
                }
            }
            offset += this._enclosing.config.embeddingSize;
        }
        // Add bias term and apply activation function
        foreach (int nodeIndex_1 in ls)
        {
            hidden[nodeIndex_1] += this._enclosing.b1[nodeIndex_1];
            hidden3[nodeIndex_1] = Math.Pow(hidden[nodeIndex_1], 3);
        }
        // Feed forward to softmax layer (no activation yet)
        int optLabel = -1;
        for (int i = 0; i < this._enclosing.numLabels; ++i)
        {
            if (label[i] >= 0)
            {
                foreach (int nodeIndex in ls)
                {
                    scores[i] += this._enclosing.W2[i][nodeIndex] * hidden3[nodeIndex];
                }
                if (optLabel < 0 || scores[i] > scores[optLabel])
                {
                    optLabel = i;
                }
            }
        }
        double sum1 = 0.0;
        double sum2 = 0.0;
        double maxScore = scores[optLabel];
        for (int i_1 = 0; i_1 < this._enclosing.numLabels; ++i_1)
        {
            if (label[i_1] >= 0)
            {
                scores[i_1] = Math.Exp(scores[i_1] - maxScore);
                if (label[i_1] == 1)
                {
                    sum1 += scores[i_1];
                }
                sum2 += scores[i_1];
            }
        }
        cost += (Math.Log(sum2) - Math.Log(sum1)) / @params.GetBatchSize();
        if (label[optLabel] == 1)
        {
            correct += 1.0 / @params.GetBatchSize();
        }
        double[] gradHidden3 = new double[this._enclosing.config.hiddenSize];
        for (int i_2 = 0; i_2 < this._enclosing.numLabels; ++i_2)
        {
            if (label[i_2] >= 0)
            {
                double delta = -(label[i_2] - scores[i_2] / sum2) / @params.GetBatchSize();
                foreach (int nodeIndex in ls)
                {
                    this.gradW2[i_2][nodeIndex] += delta * hidden3[nodeIndex];
                    gradHidden3[nodeIndex] += delta * this._enclosing.W2[i_2][nodeIndex];
                }
            }
        }
        double[] gradHidden = new double[this._enclosing.config.hiddenSize];
        foreach (int nodeIndex_2 in ls)
        {
            gradHidden[nodeIndex_2] = gradHidden3[nodeIndex_2] * 3 * hidden[nodeIndex_2] * hidden[nodeIndex_2];
            this.gradb1[nodeIndex_2] += gradHidden[nodeIndex_2];
        }
        offset = 0;
        for (int j_1 = 0; j_1 < this._enclosing.config.numTokens; ++j_1)
        {
            int tok = feature[j_1];
            int index = tok * this._enclosing.config.numTokens + j_1;
            if (this._enclosing.preMap.Contains(index))
            {
                int id = this._enclosing.preMap[index];
                foreach (int nodeIndex in ls)
                {
                    this._enclosing.gradSaved[id][nodeIndex] += gradHidden[nodeIndex];
                }
            }
            else
            {
                foreach (int nodeIndex in ls)
                {
                    for (int k = 0; k < this._enclosing.config.embeddingSize; ++k)
                    {
                        this.gradW1[nodeIndex][offset + k] += gradHidden[nodeIndex] * this._enclosing.E[tok][k];
                        this.gradE[tok][k] += gradHidden[nodeIndex] * this._enclosing.W1[nodeIndex][offset + k];
                    }
                }
            }
            offset += this._enclosing.config.embeddingSize;
        }
    }
    return new Classifier.Cost(this, cost, correct, this.gradW1, this.gradb1, this.gradW2, this.gradE);
}
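// A worked sketch of the per-example math used in Process above, on toy arrays with
// dropout and precomputation omitted. All names and values here are hypothetical,
// for illustration only.
//
//   // Cube activation and masked softmax cross-entropy for one example.
//   // label[i] == -1 marks an inapplicable transition; label[i] == 1 is the gold transition.
//   double[] hidden = { 0.5, -0.2 };                                  // W1 * x + b1
//   double[] hidden3 = { Math.Pow(0.5, 3), Math.Pow(-0.2, 3) };       // cube activation
//   double[][] W2 = { new double[] { 1.0, 2.0 }, new double[] { -1.0, 0.5 } };
//   int[] label = { 1, 0 };
//   double[] scores = new double[2];
//   for (int i = 0; i < 2; ++i)
//   {
//       for (int k = 0; k < 2; ++k)
//       {
//           scores[i] += W2[i][k] * hidden3[k];
//       }
//   }
//   double max = Math.Max(scores[0], scores[1]);                      // subtract max for numerical stability
//   double sum1 = 0.0;
//   double sum2 = 0.0;
//   for (int i = 0; i < 2; ++i)
//   {
//       scores[i] = Math.Exp(scores[i] - max);
//       if (label[i] == 1) { sum1 += scores[i]; }
//       sum2 += scores[i];
//   }
//   double exampleCost = Math.Log(sum2) - Math.Log(sum1);             // negative log-likelihood of the gold transition
//   // Backprop through the softmax and cube activation:
//   //   delta_i  = -(label_i - scores_i / sum2)
//   //   gradH3_j = sum_i delta_i * W2[i][j]
//   //   gradH_j  = 3 * hidden_j^2 * gradH3_j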