예제 #1
0
 /// <summary>
 /// Computes the expected counts for this document, which are needed to compute the derivative.
 /// </summary>
 /// <remarks>
 /// For every position <c>i</c>, clique <c>j</c>, labeling <c>k</c> of that clique and feature slot <c>n</c>,
 /// adds <c>cliqueTree.Prob(i, label) * fVal</c> to <c>E[docData[i][j][n]][k]</c>.
 /// </remarks>
 /// <param name="E">Accumulator updated in place; first index is the feature index, second the labeling index.</param>
 /// <param name="docData">Feature indices of the document, indexed as [position][clique][feature slot].</param>
 /// <param name="featureVal3DArr">
 /// Optional feature values indexed like <paramref name="docData"/>. May be null, in which case every feature
 /// contributes with value 1.0. Only consulted for clique 0.
 /// </param>
 /// <param name="cliqueTree">Clique tree queried for the probability of each labeling at each position.</param>
 protected internal virtual void DocumentExpectedCounts(double[][] E, int[][][] docData, double[][][] featureVal3DArr, CRFCliqueTree <string> cliqueTree)
 {
     // iterate over the positions in this document
     for (int i = 0; i < docData.Length; i++)
     {
         // for each possible clique at this position
         for (int j = 0; j < docData[i].Length; j++)
         {
             IIndex <CRFLabel> labelIndex = labelIndices[j];
             // for each possible labeling for that clique; the bound is taken from the clique's own
             // label index and evaluated once per clique
             for (int k = 0, liSize = labelIndex.Size(); k < liSize; k++)
             {
                 int[]  label = labelIndex.Get(k).GetLabel();
                 double p     = cliqueTree.Prob(i, label);
                 // probability of these labels occurring in this clique with these features
                 for (int n = 0; n < docData[i][j].Length; n++)
                 {
                     double fVal = 1.0;
                     if (j == 0 && featureVal3DArr != null)
                     {
                         // j == 0 because only node features gets feature values
                         fVal = featureVal3DArr[i][j][n];
                     }
                     E[docData[i][j][n]][k] += p * fVal;
                 }
             }
         }
     }
 }
        // todo [cdm]: Below data[m] --> docData
        /// <summary>
        /// Calculates both the value and the partial derivatives at the point <paramref name="x"/> and saves them
        /// in <c>value</c> and <c>derivative</c>.
        /// </summary>
        /// <remarks>
        /// <para>
        /// <paramref name="x"/> is split by <c>SeparateWeights</c> into the edge input-layer weights, edge
        /// output-layer weights, node input-layer weights and node output-layer weights. For every document a
        /// calibrated clique tree is built; the conditional log probabilities of the gold labels are summed into the
        /// log probability, and both empirical counts (<c>What*</c>/<c>Uhat*</c>) and expected counts
        /// (<c>eW*</c>/<c>eU*</c>) are accumulated.
        /// </para>
        /// <para>
        /// <c>value</c> is the negated log probability plus the prior penalty. <c>derivative</c> is laid out as
        /// edge input weights, node input weights and then, only when <c>useOutputLayer</c> is set, edge output
        /// weights and node output weights; each entry is expected minus empirical count plus the prior gradient.
        /// The prior is applied to the first <c>beforeOutputWeights</c> entries only when
        /// <c>flags.skipOutputRegularization</c> or <c>flags.softmaxOutputLayer</c> is set, otherwise to all of
        /// <paramref name="x"/>.
        /// </para>
        /// </remarks>
        /// <param name="x">The flattened weight vector at which the objective is evaluated.</param>
        /// <exception cref="Exception">
        /// Thrown when the accumulated log probability is NaN, or when the number of derivative entries written does
        /// not match <c>beforeOutputWeights</c> (after the input-layer weights) or <c>x.Length</c> (at the end).
        /// </exception>
        protected internal override void Calculate(double[] x)
        {
            // the log prob of the sequence given the model, which is the negation of value at this point
            double prob = 0.0;
            Quadruple <double[][], double[][], double[][], double[][]> allParams = SeparateWeights(x);

            double[][] W4Edge = allParams.First();   // inputLayerWeights4Edge
            double[][] U4Edge = allParams.Second();  // outputLayerWeights4Edge
            double[][] W = allParams.Third();        // inputLayerWeights
            double[][] U = allParams.Fourth();       // outputLayerWeights

            // output-layer configuration, read once
            bool softmaxOutput = flags.softmaxOutputLayer;
            bool tiedOutput = flags.tieOutputLayer;
            bool sparseOrTied = flags.sparseOutputLayer || flags.tieOutputLayer;

            // softmax-normalised output weights; only populated when the softmax output layer is enabled
            double[][] Y4Edge = null;
            double[][] Y = null;
            if (softmaxOutput)
            {
                Y4Edge = new double[U4Edge.Length][];
                for (int r = 0; r < U4Edge.Length; r++)
                {
                    Y4Edge[r] = ArrayMath.Softmax(U4Edge[r]);
                }
                Y = new double[U.Length][];
                for (int r = 0; r < U.Length; r++)
                {
                    Y[r] = ArrayMath.Softmax(U[r]);
                }
            }

            // the empirical counts
            double[][] What4Edge = EmptyW4Edge();
            double[][] Uhat4Edge = EmptyU4Edge();
            double[][] What = EmptyW();
            double[][] Uhat = EmptyU();
            // the expectations over counts
            // first index is feature index, second index is of possible labeling
            double[][] eW4Edge = EmptyW4Edge();
            double[][] eU4Edge = EmptyU4Edge();
            double[][] eW = EmptyW();
            double[][] eU = EmptyU();

            // iterate over all the documents
            for (int m = 0; m < data.Length; m++)
            {
                int[][][] docData = data[m];
                int[] docLabels = labels[m];
                NonLinearSecondOrderCliquePotentialFunction cliquePotentialFunction = new NonLinearSecondOrderCliquePotentialFunction(W4Edge, U4Edge, W, U, flags);
                // make a clique tree for this document
                CRFCliqueTree <string> cliqueTree = CRFCliqueTree.GetCalibratedCliqueTree(docData, labelIndices, numClasses, classIndex, backgroundSymbol, cliquePotentialFunction, null);
                // compute the log probability of the document given the model with the parameters x
                int[] given = new int[window - 1];
                Arrays.Fill(given, classIndex.IndexOf(backgroundSymbol));
                int[] windowLabels = new int[window];
                Arrays.Fill(windowLabels, classIndex.IndexOf(backgroundSymbol));
                if (docLabels.Length > docData.Length)
                {
                    // only true for self-training
                    // fill the given array with the extra docLabels
                    System.Array.Copy(docLabels, 0, given, 0, given.Length);
                    System.Array.Copy(docLabels, 0, windowLabels, 0, windowLabels.Length);
                    // shift the docLabels array left
                    int[] newDocLabels = new int[docData.Length];
                    System.Array.Copy(docLabels, docLabels.Length - newDocLabels.Length, newDocLabels, 0, newDocLabels.Length);
                    docLabels = newDocLabels;
                }
                // iterate over the positions in this document
                for (int i = 0; i < docData.Length; i++)
                {
                    int label = docLabels[i];
                    double p = cliqueTree.CondLogProbGivenPrevious(i, label, given);
                    if (Verbose)
                    {
                        log.Info("P(" + label + "|" + ArrayMath.ToString(given) + ")=" + p);
                    }
                    prob += p;
                    // slide the conditioning history one step and append the current gold label
                    System.Array.Copy(given, 1, given, 0, given.Length - 1);
                    given[given.Length - 1] = label;
                }
                // compute the expected counts for this document, which we will need to compute the derivative
                // iterate over the positions in this document
                for (int pos = 0; pos < docData.Length; pos++)
                {
                    // slide the label window so that its last entry is the gold label of this position
                    System.Array.Copy(windowLabels, 1, windowLabels, 0, window - 1);
                    windowLabels[window - 1] = docLabels[pos];
                    // for each possible clique at this position
                    for (int j = 0; j < docData[pos].Length; j++)
                    {
                        IIndex <CRFLabel> labelIndex = labelIndices[j];
                        int[] cliqueFeatures = docData[pos][j];

                        // clique 0 uses the node parameters, every other clique the edge parameters
                        bool isNode = j == 0;
                        int inputSize = isNode ? inputLayerSize : inputLayerSize4Edge;
                        int outputSize = isNode ? outputLayerSize : outputLayerSize4Edge;
                        double[][] cliqueU = isNode ? U : U4Edge;
                        double[][] cliqueY = isNode ? Y : Y4Edge;
                        double[][] cliqueWhat = isNode ? What : What4Edge;
                        double[][] cliqueUhat = isNode ? Uhat : Uhat4Edge;
                        double[][] cliqueEW = isNode ? eW : eW4Edge;
                        double[][] cliqueEU = isNode ? eU : eU4Edge;
                        double[] As = cliquePotentialFunction.HiddenLayerOutput(isNode ? W : W4Edge, cliqueFeatures, flags, null, j + 1);

                        // derivative of the hidden activation, expressed through its output value
                        double[] fDeriv = new double[inputSize];
                        for (int q = 0; q < inputSize; q++)
                        {
                            fDeriv[q] = useSigmoid ? As[q] * (1 - As[q]) : 1 - As[q] * As[q];
                        }

                        // calculating yTimesA for softmax
                        double[][] yTimesA = null;
                        double[] sumOfYTimesA = null;
                        if (softmaxOutput)
                        {
                            yTimesA = new double[outputSize][];
                            for (int ii = 0; ii < outputSize; ii++)
                            {
                                yTimesA[ii] = new double[numHiddenUnits];
                            }
                            sumOfYTimesA = new double[outputSize];
                            for (int k = 0; k < outputSize; k++)
                            {
                                double[] Yk = cliqueY[tiedOutput ? 0 : k];
                                double sum = 0;
                                for (int q1 = 0; q1 < inputSize; q1++)
                                {
                                    if (q1 % outputSize == k)
                                    {
                                        int hiddenUnitNo = q1 / outputSize;
                                        double val = As[q1] * Yk[hiddenUnitNo];
                                        yTimesA[k][hiddenUnitNo] = val;
                                        sum += val;
                                    }
                                }
                                sumOfYTimesA[k] = sum;
                            }
                        }

                        // calculating Uhat What (empirical counts for the gold labeling of this clique)
                        int[] cliqueLabel = new int[j + 1];
                        System.Array.Copy(windowLabels, window - 1 - j, cliqueLabel, 0, j + 1);
                        CRFLabel crfLabel = new CRFLabel(cliqueLabel);
                        int givenLabelIndex = labelIndex.IndexOf(crfLabel);
                        // a tied output layer shares row 0 across all labelings
                        int givenOutputRow = tiedOutput ? 0 : givenLabelIndex;
                        double[] Uk = cliqueU[givenOutputRow];
                        double[] UhatK = cliqueUhat[givenOutputRow];
                        double[] YkGiven = softmaxOutput ? cliqueY[givenOutputRow] : null;
                        double[] yTimesAK = softmaxOutput ? yTimesA[givenLabelIndex] : null;
                        double sumOfYTimesAK = softmaxOutput ? sumOfYTimesA[givenLabelIndex] : 0;
                        for (int q2 = 0; q2 < inputSize; q2++)
                        {
                            double deltaK = 1;
                            if (sparseOrTied)
                            {
                                if (q2 % outputSize == givenLabelIndex)
                                {
                                    int hiddenUnitNo = q2 / outputSize;
                                    if (softmaxOutput)
                                    {
                                        UhatK[hiddenUnitNo] += (yTimesAK[hiddenUnitNo] - YkGiven[hiddenUnitNo] * sumOfYTimesAK);
                                        deltaK *= YkGiven[hiddenUnitNo];
                                    }
                                    else
                                    {
                                        UhatK[hiddenUnitNo] += As[q2];
                                        deltaK *= Uk[hiddenUnitNo];
                                    }
                                }
                            }
                            else
                            {
                                UhatK[q2] += As[q2];
                                if (useOutputLayer)
                                {
                                    deltaK *= Uk[q2];
                                }
                            }
                            if (useHiddenLayer)
                            {
                                deltaK *= fDeriv[q2];
                            }
                            // With an output layer, a dense layer touches every input unit while a sparse/tied
                            // layer touches only the units wired to the gold labeling; without an output layer
                            // only the unit whose index equals the gold labeling is touched.
                            bool touchesWhat = useOutputLayer
                                ? (!sparseOrTied || q2 % outputSize == givenLabelIndex)
                                : q2 == givenLabelIndex;
                            if (touchesWhat)
                            {
                                double[] WhatK = cliqueWhat[q2];
                                foreach (int cliqueFeature in cliqueFeatures)
                                {
                                    WhatK[cliqueFeature] += deltaK;
                                }
                            }
                        }

                        // expected counts over every possible labeling of this clique
                        for (int k2 = 0; k2 < labelIndex.Size(); k2++)
                        {
                            // labelIndex.size() == numClasses
                            int[] label = labelIndex.Get(k2).GetLabel();
                            // probability of these labels occurring in this clique with these features
                            double p = cliqueTree.Prob(pos, label);
                            int outputRow = tiedOutput ? 0 : k2;
                            double[] Uk2 = cliqueU[outputRow];
                            double[] eUK = cliqueEU[outputRow];
                            double[] Yk2 = softmaxOutput ? cliqueY[outputRow] : null;
                            if (useOutputLayer)
                            {
                                for (int q3 = 0; q3 < inputSize; q3++)
                                {
                                    double deltaQ = 1;
                                    // a dense layer uses every input unit; a sparse/tied layer only those wired to k2
                                    bool active = !sparseOrTied || q3 % outputSize == k2;
                                    if (sparseOrTied)
                                    {
                                        if (active)
                                        {
                                            int hiddenUnitNo = q3 / outputSize;
                                            if (softmaxOutput)
                                            {
                                                eUK[hiddenUnitNo] += (yTimesA[k2][hiddenUnitNo] - Yk2[hiddenUnitNo] * sumOfYTimesA[k2]) * p;
                                                deltaQ = Yk2[hiddenUnitNo];
                                            }
                                            else
                                            {
                                                eUK[hiddenUnitNo] += As[q3] * p;
                                                deltaQ = Uk2[hiddenUnitNo];
                                            }
                                        }
                                    }
                                    else
                                    {
                                        eUK[q3] += As[q3] * p;
                                        deltaQ = Uk2[q3];
                                    }
                                    if (useHiddenLayer)
                                    {
                                        deltaQ *= fDeriv[q3];
                                    }
                                    if (active)
                                    {
                                        double[] eWq = cliqueEW[q3];
                                        foreach (int cliqueFeature in cliqueFeatures)
                                        {
                                            eWq[cliqueFeature] += deltaQ * p;
                                        }
                                    }
                                }
                            }
                            else
                            {
                                double deltaLabel = 1;
                                if (useHiddenLayer)
                                {
                                    deltaLabel *= fDeriv[k2];
                                }
                                double[] eWK = cliqueEW[k2];
                                foreach (int cliqueFeature in cliqueFeatures)
                                {
                                    eWK[cliqueFeature] += deltaLabel * p;
                                }
                            }
                        }
                    }
                }
            }
            if (double.IsNaN(prob))
            {
                // shouldn't be the case
                throw new Exception("Got NaN for prob in CRFNonLinearSecondOrderLogConditionalObjectiveFunction.calculate()");
            }
            value = -prob;
            if (Verbose)
            {
                log.Info("value is " + value);
            }
            // compute the partial derivative for each feature by comparing expected counts to empirical counts
            int index = 0;

            for (int r = 0; r < eW4Edge.Length; r++)
            {
                for (int c = 0; c < eW4Edge[r].Length; c++)
                {
                    derivative[index++] = (eW4Edge[r][c] - What4Edge[r][c]);
                    if (Verbose)
                    {
                        log.Info("inputLayerWeights4Edge deriv(" + r + "," + c + ") = " + eW4Edge[r][c] + " - " + What4Edge[r][c] + " = " + derivative[index - 1]);
                    }
                }
            }
            for (int r = 0; r < eW.Length; r++)
            {
                for (int c = 0; c < eW[r].Length; c++)
                {
                    derivative[index++] = (eW[r][c] - What[r][c]);
                    if (Verbose)
                    {
                        log.Info("inputLayerWeights deriv(" + r + "," + c + ") = " + eW[r][c] + " - " + What[r][c] + " = " + derivative[index - 1]);
                    }
                }
            }
            if (index != beforeOutputWeights)
            {
                throw new Exception("after W derivative, index(" + index + ") != beforeOutputWeights(" + beforeOutputWeights + ")");
            }
            if (useOutputLayer)
            {
                // each loop declares, tests and increments the same index variable
                for (int r = 0; r < eU4Edge.Length; r++)
                {
                    for (int c = 0; c < eU4Edge[r].Length; c++)
                    {
                        derivative[index++] = (eU4Edge[r][c] - Uhat4Edge[r][c]);
                        if (Verbose)
                        {
                            log.Info("outputLayerWeights4Edge deriv(" + r + "," + c + ") = " + eU4Edge[r][c] + " - " + Uhat4Edge[r][c] + " = " + derivative[index - 1]);
                        }
                    }
                }
                for (int r = 0; r < eU.Length; r++)
                {
                    for (int c = 0; c < eU[r].Length; c++)
                    {
                        derivative[index++] = (eU[r][c] - Uhat[r][c]);
                        if (Verbose)
                        {
                            log.Info("outputLayerWeights deriv(" + r + "," + c + ") = " + eU[r][c] + " - " + Uhat[r][c] + " = " + derivative[index - 1]);
                        }
                    }
                }
            }
            if (index != x.Length)
            {
                throw new Exception("after W derivative, index(" + index + ") != x.length(" + x.Length + ")");
            }

            // the output-layer weights are left unregularised when requested or when they pass through a softmax
            int regSize = x.Length;
            if (flags.skipOutputRegularization || softmaxOutput)
            {
                regSize = beforeOutputWeights;
            }
            // incorporate priors
            if (prior == QuadraticPrior)
            {
                double sigmaSq = sigma * sigma;
                for (int i = 0; i < regSize; i++)
                {
                    double w = x[i];
                    value += w * w / 2.0 / sigmaSq;
                    derivative[i] += w / sigmaSq;
                }
            }
            else if (prior == HuberPrior)
            {
                double sigmaSq = sigma * sigma;
                for (int i = 0; i < regSize; i++)
                {
                    double w = x[i];
                    double wabs = System.Math.Abs(w);
                    if (wabs < epsilon)
                    {
                        // quadratic region close to zero
                        value += w * w / 2.0 / epsilon / sigmaSq;
                        derivative[i] += w / epsilon / sigmaSq;
                    }
                    else
                    {
                        // linear region away from zero
                        value += (wabs - epsilon / 2) / sigmaSq;
                        derivative[i] += ((w < 0.0) ? -1.0 : 1.0) / sigmaSq;
                    }
                }
            }
            else if (prior == QuarticPrior)
            {
                double sigmaQu = sigma * sigma * sigma * sigma;
                for (int i = 0; i < regSize; i++)
                {
                    double w = x[i];
                    value += w * w * w * w / 2.0 / sigmaQu;
                    // d/dw [w^4 / (2 sigma^4)] = 2 w^3 / sigma^4, so the gradient matches the penalty added above
                    derivative[i] += 2.0 * w * w * w / sigmaQu;
                }
            }
        }
        private double GetDropoutPrior(CRFCliqueTree <string> cliqueTree, int[][][] docData, IDictionary <int, double[]> EForADoc, IList <ICollection <int> > docDataHash, int[] activeFeatures, IDictionary <int, double[]> dropoutPriorGrad, IDictionary <int,
                                                                                                                                                                                                                                                            IList <int> > condensedFeaturesMap, IList <IDictionary <int, double[]> > EForADocPos)
        {
            IDictionary <int, double[]> dropoutPriorGradFirstHalf = SparseE(activeFeatures);
            Timing timer      = new Timing();
            double priorValue = 0;
            long   elapsedMs  = 0;
            Pair <double[][][], double[][][]> condProbs = GetCondProbs(cliqueTree, docData);

            // first index position is curr index, second index curr-class, third index prev-class
            // e.g. [1][2][3] means curr is at position 1 with class 2, prev is at position 0 with class 3
            double[][][] prevGivenCurr = condProbs.First();
            // first index position is curr index, second index curr-class, third index next-class
            // e.g. [0][2][3] means curr is at position 0 with class 2, next is at position 1 with class 3
            double[][][] nextGivenCurr = condProbs.Second();
            // first dim is doc length (i)
            // second dim is numOfFeatures (fIndex)
            // third dim is numClasses (y)
            // fourth dim is labelIndexSize (matching the clique type of fIndex, for \theta)
            double[][][][] FAlpha = null;
            double[][][][] FBeta  = null;
            if (!dropoutApprox)
            {
                FAlpha = new double[docData.Length][][][];
                FBeta  = new double[docData.Length][][][];
            }
            for (int i = 0; i < docData.Length; i++)
            {
                if (!dropoutApprox)
                {
                    FAlpha[i] = new double[activeFeatures.Length][][];
                    FBeta[i]  = new double[activeFeatures.Length][][];
                }
            }
            if (!dropoutApprox)
            {
                // computing FAlpha
                int    fIndex = 0;
                double aa;
                double bb;
                double cc = 0;
                bool   prevFeaturePresent = false;
                for (int i_1 = 1; i_1 < docData.Length; i_1++)
                {
                    // for each possible clique at this position
                    ICollection <int> docDataHashIMinusOne = docDataHash[i_1 - 1];
                    for (int fIndexPos = 0; fIndexPos < activeFeatures.Length; fIndexPos++)
                    {
                        fIndex             = activeFeatures[fIndexPos];
                        prevFeaturePresent = docDataHashIMinusOne.Contains(fIndex);
                        int j = map[fIndex];
                        IIndex <CRFLabel> labelIndex = labelIndices[j];
                        int labelIndexSize           = labelIndex.Size();
                        if (FAlpha[i_1 - 1][fIndexPos] == null)
                        {
                            FAlpha[i_1 - 1][fIndexPos] = new double[numClasses][];
                            for (int q = 0; q < numClasses; q++)
                            {
                                FAlpha[i_1 - 1][fIndexPos][q] = new double[labelIndexSize];
                            }
                        }
                        foreach (KeyValuePair <int, IList <int> > entry in currPrevLabelsMap)
                        {
                            int y = entry.Key;
                            // value at i-1
                            double[] sum = new double[labelIndexSize];
                            foreach (int yPrime in entry.Value)
                            {
                                // value at i-2
                                for (int kk = 0; kk < labelIndexSize; kk++)
                                {
                                    int[] prevLabel = labelIndex.Get(kk).GetLabel();
                                    aa = (prevGivenCurr[i_1 - 1][y][yPrime]);
                                    bb = (prevFeaturePresent && ((j == 0 && prevLabel[0] == y) || (j == 1 && prevLabel[1] == y && prevLabel[0] == yPrime)) ? 1 : 0);
                                    cc = 0;
                                    if (FAlpha[i_1 - 1][fIndexPos][yPrime] != null)
                                    {
                                        cc = FAlpha[i_1 - 1][fIndexPos][yPrime][kk];
                                    }
                                    sum[kk] += aa * (bb + cc);
                                }
                            }
                            // sum[kk] += (prevGivenCurr[i-1][y][yPrime]) * ((prevFeaturePresent && ((j == 0 && prevLabel[0] == y) || (j == 1 && prevLabel[1] == y && prevLabel[0] == yPrime)) ? 1 : 0) + FAlpha[i-1][fIndexPos][yPrime][kk]);
                            if (FAlpha[i_1][fIndexPos] == null)
                            {
                                FAlpha[i_1][fIndexPos] = new double[numClasses][];
                            }
                            FAlpha[i_1][fIndexPos][y] = sum;
                        }
                    }
                }
                // computing FBeta
                int docDataLen = docData.Length;
                for (int i_2 = docDataLen - 2; i_2 >= 0; i_2--)
                {
                    ICollection <int> docDataHashIPlusOne = docDataHash[i_2 + 1];
                    // for each possible clique at this position
                    for (int fIndexPos = 0; fIndexPos < activeFeatures.Length; fIndexPos++)
                    {
                        fIndex = activeFeatures[fIndexPos];
                        bool nextFeaturePresent = docDataHashIPlusOne.Contains(fIndex);
                        int  j = map[fIndex];
                        IIndex <CRFLabel> labelIndex = labelIndices[j];
                        int labelIndexSize           = labelIndex.Size();
                        if (FBeta[i_2 + 1][fIndexPos] == null)
                        {
                            FBeta[i_2 + 1][fIndexPos] = new double[numClasses][];
                            for (int q = 0; q < numClasses; q++)
                            {
                                FBeta[i_2 + 1][fIndexPos][q] = new double[labelIndexSize];
                            }
                        }
                        foreach (KeyValuePair <int, IList <int> > entry in currNextLabelsMap)
                        {
                            int y = entry.Key;
                            // value at i
                            double[] sum = new double[labelIndexSize];
                            foreach (int yPrime in entry.Value)
                            {
                                // value at i+1
                                for (int kk = 0; kk < labelIndexSize; kk++)
                                {
                                    int[] nextLabel = labelIndex.Get(kk).GetLabel();
                                    // log.info("labelIndexSize:"+labelIndexSize+", nextGivenCurr:"+nextGivenCurr+", nextLabel:"+nextLabel+", FBeta["+(i+1)+"]["+ fIndexPos +"]["+yPrime+"] :"+FBeta[i+1][fIndexPos][yPrime]);
                                    aa = (nextGivenCurr[i_2][y][yPrime]);
                                    bb = (nextFeaturePresent && ((j == 0 && nextLabel[0] == yPrime) || (j == 1 && nextLabel[0] == y && nextLabel[1] == yPrime)) ? 1 : 0);
                                    cc = 0;
                                    if (FBeta[i_2 + 1][fIndexPos][yPrime] != null)
                                    {
                                        cc = FBeta[i_2 + 1][fIndexPos][yPrime][kk];
                                    }
                                    sum[kk] += aa * (bb + cc);
                                }
                            }
                            // sum[kk] += (nextGivenCurr[i][y][yPrime]) * ( (nextFeaturePresent && ((j == 0 && nextLabel[0] == yPrime) || (j == 1 && nextLabel[0] == y && nextLabel[1] == yPrime)) ? 1 : 0) + FBeta[i+1][fIndexPos][yPrime][kk]);
                            if (FBeta[i_2][fIndexPos] == null)
                            {
                                FBeta[i_2][fIndexPos] = new double[numClasses][];
                            }
                            FBeta[i_2][fIndexPos][y] = sum;
                        }
                    }
                }
            }
            // derivative equals: VarU' * PtYYp * (1-PtYYp) + VarU * PtYYp' * (1-PtYYp) + VarU * PtYYp * (1-PtYYp)'
            // derivative equals: VarU' * PtYYp * (1-PtYYp) + VarU * PtYYp' * (1-PtYYp) + VarU * PtYYp * -PtYYp'
            // derivative equals: VarU' * PtYYp * (1-PtYYp) + VarU * PtYYp' * (1 - 2 * PtYYp)
            double deltaDivByOneMinusDelta = delta / (1.0 - delta);
            Timing innerTimer      = new Timing();
            long   eTiming         = 0;
            long   dropoutTiming   = 0;
            bool   containsFeature = false;

            // iterate over the positions in this document
            for (int i_3 = 1; i_3 < docData.Length; i_3++)
            {
                ICollection <int>           docDataHashI   = docDataHash[i_3];
                IDictionary <int, double[]> EForADocPosAtI = null;
                if (dropoutApprox)
                {
                    EForADocPosAtI = EForADocPos[i_3];
                }
                // for each possible clique at this position
                for (int k = 0; k < edgeLabelIndexSize; k++)
                {
                    // sum over (y, y')
                    int[] label = edgeLabels[k];
                    int   y     = label[0];
                    int   yP    = label[1];
                    // important to use label as an int[] for calculating cliqueTree.prob()
                    // if it's a node clique, and label index is 2, if we don't use int[]{2} but just pass 2,
                    // cliqueTree is going to treat it as index of the edge clique labels, and convert 2
                    // into int[]{0,2}, and return the edge prob marginal instead of node marginal
                    double PtYYp = cliqueTree.Prob(i_3, label);
                    double PtYYpTimesOneMinusPtYYp = PtYYp * (1.0 - PtYYp);
                    double oneMinus2PtYYp          = (1.0 - 2 * PtYYp);
                    double USum = 0;
                    int    fIndex;
                    for (int jjj = 0; jjj < labelIndices.Count; jjj++)
                    {
                        for (int n = 0; n < docData[i_3][jjj].Length; n++)
                        {
                            fIndex = docData[i_3][jjj][n];
                            int valIndex;
                            if (jjj == 1)
                            {
                                valIndex = k;
                            }
                            else
                            {
                                valIndex = yP;
                            }
                            double theta;
                            try
                            {
                                theta = weights[fIndex][valIndex];
                            }
                            catch (Exception ex)
                            {
                                System.Console.Error.Printf("weights[%d][%d], map[%d]=%d, labelIndices.get(map[%d]).size() = %d, weights.length=%d\n", fIndex, valIndex, fIndex, map[fIndex], fIndex, labelIndices[map[fIndex]].Size(), weights.Length);
                                throw new Exception(ex);
                            }
                            USum += weightSquare[fIndex][valIndex];
                            // first half of derivative: VarU' * PtYYp * (1-PtYYp)
                            double VarUp = deltaDivByOneMinusDelta * theta;
                            IncreScoreAllowNull(dropoutPriorGradFirstHalf, fIndex, valIndex, VarUp * PtYYpTimesOneMinusPtYYp);
                        }
                    }
                    double VarU = 0.5 * deltaDivByOneMinusDelta * USum;
                    // update function objective
                    priorValue += VarU * PtYYpTimesOneMinusPtYYp;
                    double VarUTimesOneMinus2PtYYp = VarU * oneMinus2PtYYp;
                    // second half of derivative: VarU * PtYYp' * (1 - 2 * PtYYp)
                    // boolean prevFeaturePresent = false;
                    // boolean nextFeaturePresent = false;
                    for (int fIndexPos = 0; fIndexPos < activeFeatures.Length; fIndexPos++)
                    {
                        fIndex          = activeFeatures[fIndexPos];
                        containsFeature = docDataHashI.Contains(fIndex);
                        // if (!containsFeature) continue;
                        int jj = map[fIndex];
                        IIndex <CRFLabel> fLabelIndex = labelIndices[jj];
                        for (int kk = 0; kk < fLabelIndex.Size(); kk++)
                        {
                            // for all parameter \theta
                            int[] fLabel = fLabelIndex.Get(kk).GetLabel();
                            // if (FAlpha[i] != null)
                            //   log.info("fIndex: " + fIndex+", FAlpha[i].size:"+FAlpha[i].length);
                            double fCount = containsFeature && ((jj == 0 && fLabel[0] == yP) || (jj == 1 && k == kk)) ? 1 : 0;
                            double alpha;
                            double beta;
                            double condE;
                            double PtYYpPrime;
                            if (!dropoutApprox)
                            {
                                alpha      = ((FAlpha[i_3][fIndexPos] == null || FAlpha[i_3][fIndexPos][y] == null) ? 0 : FAlpha[i_3][fIndexPos][y][kk]);
                                beta       = ((FBeta[i_3][fIndexPos] == null || FBeta[i_3][fIndexPos][yP] == null) ? 0 : FBeta[i_3][fIndexPos][yP][kk]);
                                condE      = fCount + alpha + beta;
                                PtYYpPrime = PtYYp * (condE - EForADoc[fIndex][kk]);
                            }
                            else
                            {
                                double E = 0;
                                if (EForADocPosAtI.Contains(fIndex))
                                {
                                    E = EForADocPosAtI[fIndex][kk];
                                }
                                condE      = fCount;
                                PtYYpPrime = PtYYp * (condE - E);
                            }
                            IncreScore(dropoutPriorGrad, fIndex, kk, VarUTimesOneMinus2PtYYp * PtYYpPrime);
                        }
                    }
                }
            }
            // copy for condensedFeaturesMap
            foreach (KeyValuePair <int, IList <int> > entry in condensedFeaturesMap)
            {
                int         key   = entry.Key;
                IList <int> aList = entry.Value;
                foreach (int toCopyInto in aList)
                {
                    double[] arr       = dropoutPriorGrad[key];
                    double[] targetArr = new double[arr.Length];
                    for (int i_1 = 0; i_1 < arr.Length; i_1++)
                    {
                        targetArr[i_1] = arr[i_1];
                    }
                    dropoutPriorGrad[toCopyInto] = targetArr;
                }
            }
            foreach (KeyValuePair <int, double[]> entry_1 in dropoutPriorGrad)
            {
                int      key    = entry_1.Key;
                double[] target = entry_1.Value;
                if (dropoutPriorGradFirstHalf.Contains(key))
                {
                    double[] source = dropoutPriorGradFirstHalf[key];
                    for (int i_1 = 0; i_1 < target.Length; i_1++)
                    {
                        target[i_1] += source[i_1];
                    }
                }
            }
            // for (int i=0;i<dropoutPriorGrad.length;i++)
            //   for (int j=0; j<dropoutPriorGrad[i].length;j++) {
            //     if (DEBUG3)
            //       System.err.printf("f=%d, k=%d, dropoutPriorGradFirstHalf[%d][%d]=% 5.3f, dropoutPriorGrad[%d][%d]=% 5.3f\n", i, j, i, j, dropoutPriorGradFirstHalf[i][j], i, j, dropoutPriorGrad[i][j]);
            //     dropoutPriorGrad[i][j] += dropoutPriorGradFirstHalf[i][j];
            //   }
            return(dropoutScale * priorValue);
        }
        /// <summary>
        /// Builds a calibrated clique tree for one document and uses it to compute the document's
        /// log-probability contribution and its sparse expected feature counts.
        /// </summary>
        /// <param name="docIndex">Index of the document in <c>totalData</c> (and in <c>labels</c> when it is labeled).</param>
        /// <param name="skipExpectedCountCalc">When true, the expected-count accumulation is skipped and the map returned is exactly what <c>SparseE</c> produced.</param>
        /// <param name="skipValCalc">When true, the log probability of the document's labels is not accumulated into the returned value.</param>
        /// <returns>
        /// A quadruple of (document index, accumulated log-probability value, expected counts keyed by feature index,
        /// dropout prior gradient keyed by feature index). The last component is null unless <c>prior == DropoutPrior</c>.
        /// </returns>
        private Quadruple <int, double, IDictionary <int, double[]>, IDictionary <int, double[]> > ExpectedCountsAndValueForADoc(int docIndex, bool skipExpectedCountCalc, bool skipValCalc)
        {
            int[] activeFeatures = dataFeatureHashByDoc[docIndex];
            IList <ICollection <int> >      docDataHash          = dataFeatureHash[docIndex];
            IDictionary <int, IList <int> > condensedFeaturesMap = condensedMap[docIndex];
            double prob = 0;

            int[][][] docData   = totalData[docIndex];
            // Documents whose index is beyond labels.Length have no labels; docLabels stays null for them.
            // NOTE(review): the value calculation below dereferences docLabels, so callers presumably pass
            // skipValCalc == true for such documents -- confirm against the call sites.
            int[]     docLabels = null;
            if (docIndex < labels.Length)
            {
                docLabels = labels[docIndex];
            }

            double[][][] featureVal3DArr = null;
            if (featureVal != null)
            {
                featureVal3DArr = featureVal[docIndex];
            }
            // make a clique tree for this document
            CRFCliqueTree <string> cliqueTree = CRFCliqueTree.GetCalibratedCliqueTree(docData, labelIndices, numClasses, classIndex, backgroundSymbol, cliquePotentialFunc, featureVal3DArr);

            if (!skipValCalc)
            {
                // compute the log probability of the document given the model with the parameters x
                int[] given = new int[window - 1];
                Arrays.Fill(given, classIndex.IndexOf(backgroundSymbol));
                if (docLabels.Length > docData.Length)
                {
                    // only true for self-training
                    // fill the given array with the extra docLabels
                    System.Array.Copy(docLabels, 0, given, 0, given.Length);
                    // shift the docLabels array left
                    int[] newDocLabels = new int[docData.Length];
                    System.Array.Copy(docLabels, docLabels.Length - newDocLabels.Length, newDocLabels, 0, newDocLabels.Length);
                    docLabels = newDocLabels;
                }
                double startPosLogProb = cliqueTree.LogProbStartPos();
                if (Verbose)
                {
                    System.Console.Error.Printf("P_-1(Background) = % 5.3f\n", startPosLogProb);
                }
                prob += startPosLogProb;
                // iterate over the positions in this document
                for (int i = 0; i < docData.Length; i++)
                {
                    int    label = docLabels[i];
                    double p     = cliqueTree.CondLogProbGivenPrevious(i, label, given);
                    if (Verbose)
                    {
                        log.Info("P(" + label + "|" + ArrayMath.ToString(given) + ")=" + System.Math.Exp(p));
                    }
                    prob += p;
                    // slide the history window: drop the oldest label and append the current one
                    System.Array.Copy(given, 1, given, 0, given.Length - 1);
                    given[given.Length - 1] = label;
                }
            }
            IDictionary <int, double[]>          EForADoc    = SparseE(activeFeatures);
            IList <IDictionary <int, double[]> > EForADocPos = null;

            if (dropoutApprox)
            {
                // per-position expected counts are only kept for the approximate dropout prior
                EForADocPos = new List <IDictionary <int, double[]> >(docData.Length);
            }
            if (!skipExpectedCountCalc)
            {
                // compute the expected counts for this document, which we will need to compute the derivative
                // iterate over the positions in this document
                double fVal = 1.0;
                for (int i = 0; i < docData.Length; i++)
                {
                    ICollection <int>           docDataHashI   = docDataHash[i];
                    IDictionary <int, double[]> EForADocPosAtI = null;
                    if (dropoutApprox)
                    {
                        EForADocPosAtI = SparseE(docDataHashI);
                    }
                    foreach (int fIndex in docDataHashI)
                    {
                        int j = map[fIndex];
                        IIndex <CRFLabel> labelIndex = labelIndices[j];
                        // for each possible labeling for that clique
                        for (int k = 0; k < labelIndex.Size(); k++)
                        {
                            int[]  label = labelIndex.Get(k).GetLabel();
                            double p     = cliqueTree.Prob(i, label);
                            // probability of these labels occurring in this clique with these features
                            if (dropoutApprox)
                            {
                                IncreScore(EForADocPosAtI, fIndex, k, fVal * p);
                            }
                            IncreScore(EForADoc, fIndex, k, fVal * p);
                        }
                    }
                    if (dropoutApprox)
                    {
                        // give every feature listed under a condensed feature its own copy of that feature's counts
                        foreach (int fIndex_1 in docDataHashI)
                        {
                            IList <int> aList;
                            // single key lookup instead of a containment check followed by the indexer
                            if (!condensedFeaturesMap.TryGetValue(fIndex_1, out aList))
                            {
                                continue;
                            }
                            foreach (int toCopyInto in aList)
                            {
                                EForADocPosAtI[toCopyInto] = (double[])EForADocPosAtI[fIndex_1].Clone();
                            }
                        }
                        EForADocPos.Add(EForADocPosAtI);
                    }
                }
                // copy for condensedFeaturesMap
                foreach (KeyValuePair <int, IList <int> > entry in condensedFeaturesMap)
                {
                    int key = entry.Key;
                    foreach (int toCopyInto in entry.Value)
                    {
                        // each target gets an independent array so later updates do not alias the source
                        EForADoc[toCopyInto] = (double[])EForADoc[key].Clone();
                    }
                }
            }
            IDictionary <int, double[]> dropoutPriorGrad = null;

            if (prior == DropoutPrior)
            {
                // we can optimize this, this is too large, don't need this big
                dropoutPriorGrad = SparseE(activeFeatures);
                // the dropout prior is subtracted from the value; its gradient is written into dropoutPriorGrad
                prob -= GetDropoutPrior(cliqueTree, docData, EForADoc, docDataHash, activeFeatures, dropoutPriorGrad, condensedFeaturesMap, EForADocPos);
            }
            return(new Quadruple <int, double, IDictionary <int, double[]>, IDictionary <int, double[]> >(docIndex, prob, EForADoc, dropoutPriorGrad));
        }
        // todo [cdm]: Below data[m] --> docData
        /// <summary>
        /// Calculates both the value and the partial derivatives at the point <paramref name="x"/> and stores
        /// them in <c>value</c> and <c>derivative</c>.
        /// </summary>
        /// <remarks>
        /// The first <c>numLopExpert</c> derivative entries are the gradients for the expert scales. When
        /// <c>backpropTraining</c> is set, the entries after them hold the gradients for each expert's feature
        /// weights, in the order given by <c>featureIndicesListArray</c>.
        /// </remarks>
        /// <param name="x">The parameter vector; the raw expert scales (and, with back-propagation, the expert weights) are extracted from it.</param>
        /// <exception cref="Exception">Thrown when the accumulated log probability is NaN.</exception>
        protected internal override void Calculate(double[] x)
        {
            double prob = 0.0;

            // the log prob of the sequence given the model, which is the negation of value at this point
            double[][][] E         = Empty2D();
            double[]     eScales   = new double[numLopExpert];
            double[]     rawScales = SeparateLopScales(x);
            double[]     scales    = ArrayMath.Softmax(rawScales);
            double[][][] learnedLopExpertWeights2D = lopExpertWeights2D;
            if (backpropTraining)
            {
                // the expert weights are being learned too, so read them from x instead of the fixed ones
                learnedLopExpertWeights2D = SeparateLopExpertWeights2D(x);
                LogPotential(learnedLopExpertWeights2D);
            }
            double[][] combinedWeights2D = CombineAndScaleLopWeights2D(numLopExpert, learnedLopExpertWeights2D, scales);
            // iterate over all the documents
            for (int m = 0; m < data.Length; m++)
            {
                int[][][]      docData   = data[m];
                int[]          docLabels = labels[m];
                double[][][][] sumOfELPm = sumOfExpectedLogPotential[m];
                // sumOfExpectedLogPotential[m][i][j][lopIter][k] m-docNo;i-position;j-cliqueNo;k-label
                // make a clique tree for this document
                ICliquePotentialFunction cliquePotentialFunc = new LinearCliquePotentialFunction(combinedWeights2D);
                CRFCliqueTree <string>   cliqueTree          = CRFCliqueTree.GetCalibratedCliqueTree(docData, labelIndices, numClasses, classIndex, backgroundSymbol, cliquePotentialFunc, null);
                // compute the log probability of the document given the model with the parameters x
                int[] given = new int[window - 1];
                Arrays.Fill(given, classIndex.IndexOf(backgroundSymbol));
                if (docLabels.Length > docData.Length)
                {
                    // only true for self-training
                    // fill the given array with the extra docLabels
                    System.Array.Copy(docLabels, 0, given, 0, given.Length);
                    // shift the docLabels array left
                    int[] newDocLabels = new int[docData.Length];
                    System.Array.Copy(docLabels, docLabels.Length - newDocLabels.Length, newDocLabels, 0, newDocLabels.Length);
                    docLabels = newDocLabels;
                }
                // iterate over the positions in this document
                for (int i = 0; i < docData.Length; i++)
                {
                    int    label = docLabels[i];
                    double p     = cliqueTree.CondLogProbGivenPrevious(i, label, given);
                    if (Verbose)
                    {
                        log.Info("P(" + label + "|" + ArrayMath.ToString(given) + ")=" + p);
                    }
                    prob += p;
                    // slide the history window: drop the oldest label and append the current one
                    System.Array.Copy(given, 1, given, 0, given.Length - 1);
                    given[given.Length - 1] = label;
                }
                // compute the expected counts for this document, which we will need to compute the derivative
                // iterate over the positions in this document
                for (int i_1 = 0; i_1 < docData.Length; i_1++)
                {
                    // for each possible clique at this position
                    double[][][] sumOfELPmi = sumOfELPm[i_1];
                    for (int j = 0; j < docData[i_1].Length; j++)
                    {
                        double[][]        sumOfELPmij = sumOfELPmi[j];
                        IIndex <CRFLabel> labelIndex  = labelIndices[j];
                        // for each possible labeling for that clique
                        for (int l = 0; l < labelIndex.Size(); l++)
                        {
                            int[]  label = labelIndex.Get(l).GetLabel();
                            double p     = cliqueTree.Prob(i_1, label);
                            // probability of these labels occurring in this clique with these features
                            for (int lopIter = 0; lopIter < numLopExpert; lopIter++)
                            {
                                ICollection <int> indicesSet = featureIndicesSetArray[lopIter];
                                double            scale      = scales[lopIter];
                                // this expert's summed log potential minus the scale-weighted sum over all experts
                                double            expected   = sumOfELPmij[lopIter][l];
                                for (int innerLopIter = 0; innerLopIter < numLopExpert; innerLopIter++)
                                {
                                    expected -= scales[innerLopIter] * sumOfELPmij[innerLopIter][l];
                                }
                                expected         *= scale;
                                eScales[lopIter] += (p * expected);
                                double[][] eOfIter = E[lopIter];
                                if (backpropTraining)
                                {
                                    for (int k = 0; k < docData[i_1][j].Length; k++)
                                    {
                                        // k iterates over features
                                        int featureIdx = docData[i_1][j][k];
                                        // only features that belong to this expert receive expected counts
                                        if (indicesSet.Contains(featureIdx))
                                        {
                                            eOfIter[featureIdx][l] += p;
                                        }
                                    }
                                }
                            }
                        }
                    }
                }
            }
            if (double.IsNaN(prob))
            {
                // shouldn't be the case
                throw new Exception("Got NaN for prob in CRFLogConditionalObjectiveFunctionForLOP.calculate()");
            }
            value = -prob;
            if (Verbose)
            {
                log.Info("value is " + value);
            }
            // compute the partial derivative for each feature by comparing expected counts to empirical counts
            for (int lopIter_1 = 0; lopIter_1 < numLopExpert; lopIter_1++)
            {
                double scale    = scales[lopIter_1];
                // same centering as for the expected term: subtract the scale-weighted sum over all experts
                double observed = sumOfObservedLogPotential[lopIter_1];
                for (int j = 0; j < numLopExpert; j++)
                {
                    observed -= scales[j] * sumOfObservedLogPotential[j];
                }
                observed *= scale;
                double expected = eScales[lopIter_1];
                derivative[lopIter_1] = (expected - observed);
                if (Verbose)
                {
                    log.Info("deriv(" + lopIter_1 + ") = " + expected + " - " + observed + " = " + derivative[lopIter_1]);
                }
            }
            if (backpropTraining)
            {
                // the expert-weight gradients are laid out right after the numLopExpert scale gradients
                int dIndex = numLopExpert;
                // This loop must drive its own counter: the counter of the scale loop above is out of scope
                // here, and even if it were visible it would already equal numLopExpert.
                for (int lopIter = 0; lopIter < numLopExpert; lopIter++)
                {
                    double      scale              = scales[lopIter];
                    double[][]  eOfExpert          = E[lopIter];
                    double[][]  ehatOfExpert       = Ehat[lopIter];
                    IList <int> featureIndicesList = featureIndicesListArray[lopIter];
                    foreach (int fIndex in featureIndicesList)
                    {
                        for (int j = 0; j < eOfExpert[fIndex].Length; j++)
                        {
                            derivative[dIndex++] = scale * (eOfExpert[fIndex][j] - ehatOfExpert[fIndex][j]);
                            if (Verbose)
                            {
                                log.Info("deriv[" + lopIter + "](" + fIndex + "," + j + ") = " + scale + " * (" + eOfExpert[fIndex][j] + " - " + ehatOfExpert[fIndex][j] + ") = " + derivative[dIndex - 1]);
                            }
                        }
                    }
                }
                // every parameter must have received exactly one derivative entry
                System.Diagnostics.Debug.Assert((dIndex == DomainDimension()));
            }
        }