        /// <summary>
        /// Determine the total cost on the dataset associated with this
        /// classifier using the current learned parameters.
        /// </summary>
        /// <remarks>
        /// The cost is computed over a randomly sampled mini-batch of the
        /// dataset associated with this classifier, using the current learned
        /// parameters; the returned gradients are intended for a subsequent
        /// adaptive gradient (AdaGrad) update. This method launches multiple
        /// threads, each of which evaluates training cost on a partition of
        /// the mini-batch.
        /// </remarks>
        /// <param name="batchSize"/>
        /// <param name="regParameter">Regularization parameter (lambda)</param>
        /// <param name="dropOutProb">
        /// Drop-out probability. Hidden-layer units in the
        /// neural network will be randomly turned off
        /// while training a particular example with this
        /// probability.
        /// </param>
        /// <returns>
        /// A <see cref="Cost"/> object which describes the total cost of the
        /// given weights, and includes gradients to be used for further training
        /// </returns>
        public virtual Classifier.Cost ComputeCostFunction(int batchSize, double regParameter, double dropOutProb)
        {
            ValidateTraining();
            IList <Example> examples = Edu.Stanford.Nlp.Parser.Nndep.Util.GetRandomSubList(dataset.examples, batchSize);
            // Redo precomputations for only those features which are triggered
            // by examples in this mini-batch.
            ICollection <int> toPreCompute = GetToPreCompute(examples);

            PreCompute(toPreCompute);
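            // PreCompute caches each frequent (token, position) feature's
            // contribution to the hidden layer, so the forward pass can use a
            // table lookup (the `saved` array used below) instead of
            // multiplying embeddings by W1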
            // Set up parameters for feedforward
            Classifier.FeedforwardParams @params = new Classifier.FeedforwardParams(batchSize, dropOutProb);
            // Zero out saved-embedding gradients: one row of hidden-unit
            // gradients per precomputed feature
            gradSaved = new double[preMap.Count][];
            for (int i = 0; i < gradSaved.Length; ++i)
            {
                gradSaved[i] = new double[config.hiddenSize];
            }
            int numChunks = config.trainingThreads;
            IList <IList <Example> > chunks = CollectionUtils.PartitionIntoFolds(examples, numChunks);

            // Submit chunks for processing on separate threads
            foreach (ICollection <Example> chunk in chunks)
            {
                jobHandler.Put(new Pair <ICollection <Example>, Classifier.FeedforwardParams>(chunk, @params));
            }
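            // Block until every submitted chunk has been processed; passing
            // false keeps the job handler's worker threadpool alive for reuse
            // on subsequent mini-batches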
            jobHandler.Join(false);
            // Join costs from each chunk
            Classifier.Cost cost = null;
            while (jobHandler.Peek())
            {
                Classifier.Cost otherCost = jobHandler.Poll();
                if (cost == null)
                {
                    cost = otherCost;
                }
                else
                {
                    cost.Merge(otherCost);
                }
            }
            if (cost == null)
            {
                return(null);
            }
            // Backpropagate gradients on saved pre-computed values to actual
            // embeddings
            cost.BackpropSaved(toPreCompute);
            cost.AddL2Regularization(regParameter);
            return(cost);
        }
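
        // A minimal sketch of how a caller might apply the returned gradients
        // with an AdaGrad update. This is illustrative only: the accessor
        // GetGradW1() and the names eg2W1 (squared-gradient history), adaAlpha
        // and adaEps are assumptions for the sketch, not necessarily this
        // class's API.
        //
        //   Classifier.Cost cost = ComputeCostFunction(batchSize, regParameter, dropOutProb);
        //   double[][] g = cost.GetGradW1();  // hypothetical accessor
        //   for (int i = 0; i < W1.Length; ++i)
        //   {
        //       for (int j = 0; j < W1[i].Length; ++j)
        //       {
        //           eg2W1[i][j] += g[i][j] * g[i][j];  // accumulate squared gradients
        //           W1[i][j] -= adaAlpha * g[i][j] / Math.Sqrt(eg2W1[i][j] + adaEps);
        //       }
        //   }

            /// <summary>
            /// Compute the cost and gradients for a single chunk of the
            /// mini-batch. Runs on a worker thread: each thread accumulates
            /// gradients into its own arrays, and the per-chunk results are
            /// merged by the caller via <see cref="Cost.Merge"/>.
            /// </summary>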
            public virtual Classifier.Cost Process(Pair <ICollection <Example>, Classifier.FeedforwardParams> input)
            {
                ICollection <Example> examples = input.First();

                Classifier.FeedforwardParams @params = input.Second();
                // We can't fix the seed used with ThreadLocalRandom
                // TODO: Is this a serious problem?
                ThreadLocalRandom random = ThreadLocalRandom.Current();

                // Allocate per-thread gradient accumulators; each jagged
                // array needs its inner rows allocated before the updates
                // below (uses System.Linq)
                this.gradW1 = this._enclosing.W1.Select(row => new double[row.Length]).ToArray();
                this.gradb1 = new double[this._enclosing.b1.Length];
                this.gradW2 = this._enclosing.W2.Select(row => new double[row.Length]).ToArray();
                this.gradE  = this._enclosing.E.Select(row => new double[row.Length]).ToArray();
                double cost    = 0.0;
                double correct = 0.0;

                foreach (Example ex in examples)
                {
                    IList <int> feature = ex.GetFeature();
                    IList <int> label   = ex.GetLabel();
                    double[]    scores  = new double[this._enclosing.numLabels];
                    double[]    hidden  = new double[this._enclosing.config.hiddenSize];
                    double[]    hidden3 = new double[this._enclosing.config.hiddenSize];
                    // Run dropout: randomly drop some hidden-layer units. `ls`
                    // contains the indices of those units which are still active
                    // (each unit survives with probability 1 - dropOutProb)
                    int[] ls = Enumerable.Range(0, this._enclosing.config.hiddenSize).Where(n => random.NextDouble() > @params.GetDropOutProb()).ToArray();
                    int   offset = 0;
                    for (int j = 0; j < this._enclosing.config.numTokens; ++j)
                    {
                        int tok   = feature[j];
                        int index = tok * this._enclosing.config.numTokens + j;
                        if (this._enclosing.preMap.Contains(index))
                        {
                            // Unit activations for this input feature value have been
                            // precomputed
                            int id = this._enclosing.preMap[index];
                            // Only extract activations for those nodes which are still
                            // activated (`ls`)
                            foreach (int nodeIndex in ls)
                            {
                                hidden[nodeIndex] += this._enclosing.saved[id][nodeIndex];
                            }
                        }
                        else
                        {
                            foreach (int nodeIndex in ls)
                            {
                                for (int k = 0; k < this._enclosing.config.embeddingSize; ++k)
                                {
                                    hidden[nodeIndex] += this._enclosing.W1[nodeIndex][offset + k] * this._enclosing.E[tok][k];
                                }
                            }
                        }
                        offset += this._enclosing.config.embeddingSize;
                    }
                    // Add bias term and apply the cube activation, h^3
                    foreach (int nodeIndex_1 in ls)
                    {
                        hidden[nodeIndex_1] += this._enclosing.b1[nodeIndex_1];
                        hidden3[nodeIndex_1] = Math.Pow(hidden[nodeIndex_1], 3);
                    }
                    // Feed forward to softmax layer (no activation yet)
                    int optLabel = -1;
                    for (int i = 0; i < this._enclosing.numLabels; ++i)
                    {
                        if (label[i] >= 0)
                        {
                            foreach (int nodeIndex in ls)
                            {
                                scores[i] += this._enclosing.W2[i][nodeIndex] * hidden3[nodeIndex];
                            }
                            if (optLabel < 0 || scores[i] > scores[optLabel])
                            {
                                optLabel = i;
                            }
                        }
                    }
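                    // Normalize with a softmax, subtracting the max score for
                    // numerical stability (the largest exponent becomes exp(0)).
                    // sum1 accumulates mass on the gold label(s) (label == 1),
                    // sum2 the full partition function over feasible labels.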
                    double sum1     = 0.0;
                    double sum2     = 0.0;
                    double maxScore = scores[optLabel];
                    for (int i_1 = 0; i_1 < this._enclosing.numLabels; ++i_1)
                    {
                        if (label[i_1] >= 0)
                        {
                            scores[i_1] = Math.Exp(scores[i_1] - maxScore);
                            if (label[i_1] == 1)
                            {
                                sum1 += scores[i_1];
                            }
                            sum2 += scores[i_1];
                        }
                    }
                    cost += (Math.Log(sum2) - Math.Log(sum1)) / @params.GetBatchSize();
                    if (label[optLabel] == 1)
                    {
                        correct += 1.0 / @params.GetBatchSize();
                    }
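                    // Gradient of the loss -log(sum1 / sum2) w.r.t. each
                    // softmax input: delta_i = (p_i - y_i) / batchSize, where
                    // p_i = scores[i] / sum2 and y_i = label[i]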
                    double[] gradHidden3 = new double[this._enclosing.config.hiddenSize];
                    for (int i_2 = 0; i_2 < this._enclosing.numLabels; ++i_2)
                    {
                        if (label[i_2] >= 0)
                        {
                            double delta = -(label[i_2] - scores[i_2] / sum2) / @params.GetBatchSize();
                            foreach (int nodeIndex in ls)
                            {
                                this.gradW2[i_2][nodeIndex] += delta * hidden3[nodeIndex];
                                gradHidden3[nodeIndex]      += delta * this._enclosing.W2[i_2][nodeIndex];
                            }
                        }
                    }
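                    // Backpropagate through the cube activation:
                    // d(h^3)/dh = 3h^2, so gradHidden = gradHidden3 * 3 * h * h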
                    double[] gradHidden = new double[this._enclosing.config.hiddenSize];
                    foreach (int nodeIndex_2 in ls)
                    {
                        gradHidden[nodeIndex_2]   = gradHidden3[nodeIndex_2] * 3 * hidden[nodeIndex_2] * hidden[nodeIndex_2];
                        this.gradb1[nodeIndex_2] += gradHidden[nodeIndex_2];
                    }
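                    // Backpropagate into W1 and the embeddings E; for
                    // precomputed features, accumulate into gradSaved instead,
                    // to be pushed back through BackpropSaved afterwards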
                    offset = 0;
                    for (int j_1 = 0; j_1 < this._enclosing.config.numTokens; ++j_1)
                    {
                        int tok   = feature[j_1];
                        int index = tok * this._enclosing.config.numTokens + j_1;
                        if (this._enclosing.preMap.Contains(index))
                        {
                            int id = this._enclosing.preMap[index];
                            foreach (int nodeIndex in ls)
                            {
                                this._enclosing.gradSaved[id][nodeIndex] += gradHidden[nodeIndex];
                            }
                        }
                        else
                        {
                            foreach (int nodeIndex in ls)
                            {
                                for (int k = 0; k < this._enclosing.config.embeddingSize; ++k)
                                {
                                    this.gradW1[nodeIndex][offset + k] += gradHidden[nodeIndex] * this._enclosing.E[tok][k];
                                    this.gradE[tok][k] += gradHidden[nodeIndex] * this._enclosing.W1[nodeIndex][offset + k];
                                }
                            }
                        }
                        offset += this._enclosing.config.embeddingSize;
                    }
                }
                return(new Classifier.Cost(this, cost, correct, this.gradW1, this.gradb1, this.gradW2, this.gradE));
            }