/// <summary>Compute the expected counts for this document, which we will need to compute the derivative.</summary>
protected internal virtual void DocumentExpectedCounts(double[][] E, int[][][] docData, double[][][] featureVal3DArr, CRFCliqueTree<string> cliqueTree) {
  // iterate over the positions in this document
  for (int i = 0; i < docData.Length; i++) {
    // for each possible clique at this position
    for (int j = 0; j < docData[i].Length; j++) {
      IIndex<CRFLabel> labelIndex = labelIndices[j];
      // for each possible labeling for that clique
      for (int k = 0, liSize = labelIndex.Size(); k < liSize; k++) {
        int[] label = labelIndex.Get(k).GetLabel();
        double p = cliqueTree.Prob(i, label); // probability of these labels occurring in this clique with these features
        for (int n = 0; n < docData[i][j].Length; n++) {
          double fVal = 1.0;
          if (j == 0 && featureVal3DArr != null) { // j == 0 because only node features get feature values
            fVal = featureVal3DArr[i][j][n];
          }
          E[docData[i][j][n]][k] += p * fVal;
        }
      }
    }
  }
}
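// Note (editorial sketch, not part of the original source): E filled above holds the model expectation of each
// feature count, E[feature][labeling k] = sum over positions i of P(labeling k at the clique containing i | doc) * fVal.
// The gradient of the negative log-likelihood is formed elsewhere as (expected - empirical), schematically
//
//     derivative[index++] = E[f][k] - Ehat[f][k];
//
// so a bias in these expected counts shows up directly as a bias in the gradient.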
// todo [cdm]: Below data[m] --> docData /// <summary>Calculates both value and partial derivatives at the point x, and save them internally.</summary> protected internal override void Calculate(double[] x) { double prob = 0.0; // the log prob of the sequence given the model, which is the negation of value at this point Quadruple <double[][], double[][], double[][], double[][]> allParams = SeparateWeights(x); double[][] W4Edge = allParams.First(); // inputLayerWeights4Edge double[][] U4Edge = allParams.Second(); // outputLayerWeights4Edge double[][] W = allParams.Third(); // inputLayerWeights double[][] U = allParams.Fourth(); // outputLayerWeights double[][] Y4Edge = null; double[][] Y = null; if (flags.softmaxOutputLayer) { Y4Edge = new double[U4Edge.Length][]; for (int i = 0; i < U4Edge.Length; i++) { Y4Edge[i] = ArrayMath.Softmax(U4Edge[i]); } Y = new double[U.Length][]; for (int i_1 = 0; i_1 < U.Length; i_1++) { Y[i_1] = ArrayMath.Softmax(U[i_1]); } } double[][] What4Edge = EmptyW4Edge(); double[][] Uhat4Edge = EmptyU4Edge(); double[][] What = EmptyW(); double[][] Uhat = EmptyU(); // the expectations over counts // first index is feature index, second index is of possible labeling double[][] eW4Edge = EmptyW4Edge(); double[][] eU4Edge = EmptyU4Edge(); double[][] eW = EmptyW(); double[][] eU = EmptyU(); // iterate over all the documents for (int m = 0; m < data.Length; m++) { int[][][] docData = data[m]; int[] docLabels = labels[m]; NonLinearSecondOrderCliquePotentialFunction cliquePotentialFunction = new NonLinearSecondOrderCliquePotentialFunction(W4Edge, U4Edge, W, U, flags); // make a clique tree for this document CRFCliqueTree <string> cliqueTree = CRFCliqueTree.GetCalibratedCliqueTree(docData, labelIndices, numClasses, classIndex, backgroundSymbol, cliquePotentialFunction, null); // compute the log probability of the document given the model with the parameters x int[] given = new int[window - 1]; Arrays.Fill(given, classIndex.IndexOf(backgroundSymbol)); int[] windowLabels = new int[window]; Arrays.Fill(windowLabels, classIndex.IndexOf(backgroundSymbol)); if (docLabels.Length > docData.Length) { // only true for self-training // fill the given array with the extra docLabels System.Array.Copy(docLabels, 0, given, 0, given.Length); System.Array.Copy(docLabels, 0, windowLabels, 0, windowLabels.Length); // shift the docLabels array left int[] newDocLabels = new int[docData.Length]; System.Array.Copy(docLabels, docLabels.Length - newDocLabels.Length, newDocLabels, 0, newDocLabels.Length); docLabels = newDocLabels; } // iterate over the positions in this document for (int i = 0; i < docData.Length; i++) { int label = docLabels[i]; double p = cliqueTree.CondLogProbGivenPrevious(i, label, given); if (Verbose) { log.Info("P(" + label + "|" + ArrayMath.ToString(given) + ")=" + p); } prob += p; System.Array.Copy(given, 1, given, 0, given.Length - 1); given[given.Length - 1] = label; } // compute the expected counts for this document, which we will need to compute the derivative // iterate over the positions in this document for (int i_1 = 0; i_1 < docData.Length; i_1++) { // for each possible clique at this position System.Array.Copy(windowLabels, 1, windowLabels, 0, window - 1); windowLabels[window - 1] = docLabels[i_1]; for (int j = 0; j < docData[i_1].Length; j++) { IIndex <CRFLabel> labelIndex = labelIndices[j]; // for each possible labeling for that clique int[] cliqueFeatures = docData[i_1][j]; double[] As = null; double[] fDeriv = null; double[][] yTimesA = null; double[] sumOfYTimesA 
= null; int inputSize; int outputSize = -1; if (j == 0) { inputSize = inputLayerSize; outputSize = outputLayerSize; As = cliquePotentialFunction.HiddenLayerOutput(W, cliqueFeatures, flags, null, j + 1); } else { inputSize = inputLayerSize4Edge; outputSize = outputLayerSize4Edge; As = cliquePotentialFunction.HiddenLayerOutput(W4Edge, cliqueFeatures, flags, null, j + 1); } fDeriv = new double[inputSize]; double fD = 0; for (int q = 0; q < inputSize; q++) { if (useSigmoid) { fD = As[q] * (1 - As[q]); } else { fD = 1 - As[q] * As[q]; } fDeriv[q] = fD; } // calculating yTimesA for softmax if (flags.softmaxOutputLayer) { double val = 0; yTimesA = new double[outputSize][]; for (int ii = 0; ii < outputSize; ii++) { yTimesA[ii] = new double[numHiddenUnits]; } sumOfYTimesA = new double[outputSize]; for (int k = 0; k < outputSize; k++) { double[] Yk = null; if (flags.tieOutputLayer) { if (j == 0) { Yk = Y[0]; } else { Yk = Y4Edge[0]; } } else { if (j == 0) { Yk = Y[k]; } else { Yk = Y4Edge[k]; } } double sum = 0; for (int q_1 = 0; q_1 < inputSize; q_1++) { if (q_1 % outputSize == k) { int hiddenUnitNo = q_1 / outputSize; val = As[q_1] * Yk[hiddenUnitNo]; yTimesA[k][hiddenUnitNo] = val; sum += val; } } sumOfYTimesA[k] = sum; } } // calculating Uhat What int[] cliqueLabel = new int[j + 1]; System.Array.Copy(windowLabels, window - 1 - j, cliqueLabel, 0, j + 1); CRFLabel crfLabel = new CRFLabel(cliqueLabel); int givenLabelIndex = labelIndex.IndexOf(crfLabel); double[] Uk = null; double[] UhatK = null; double[] Yk_1 = null; double[] yTimesAK = null; double sumOfYTimesAK = 0; if (flags.tieOutputLayer) { if (j == 0) { Uk = U[0]; UhatK = Uhat[0]; } else { Uk = U4Edge[0]; UhatK = Uhat4Edge[0]; } if (flags.softmaxOutputLayer) { if (j == 0) { Yk_1 = Y[0]; } else { Yk_1 = Y4Edge[0]; } } } else { if (j == 0) { Uk = U[givenLabelIndex]; UhatK = Uhat[givenLabelIndex]; } else { Uk = U4Edge[givenLabelIndex]; UhatK = Uhat4Edge[givenLabelIndex]; } if (flags.softmaxOutputLayer) { if (j == 0) { Yk_1 = Y[givenLabelIndex]; } else { Yk_1 = Y4Edge[givenLabelIndex]; } } } if (flags.softmaxOutputLayer) { yTimesAK = yTimesA[givenLabelIndex]; sumOfYTimesAK = sumOfYTimesA[givenLabelIndex]; } for (int k_1 = 0; k_1 < inputSize; k_1++) { double deltaK = 1; if (flags.sparseOutputLayer || flags.tieOutputLayer) { if (k_1 % outputSize == givenLabelIndex) { int hiddenUnitNo = k_1 / outputSize; if (flags.softmaxOutputLayer) { UhatK[hiddenUnitNo] += (yTimesAK[hiddenUnitNo] - Yk_1[hiddenUnitNo] * sumOfYTimesAK); deltaK *= Yk_1[hiddenUnitNo]; } else { UhatK[hiddenUnitNo] += As[k_1]; deltaK *= Uk[hiddenUnitNo]; } } } else { UhatK[k_1] += As[k_1]; if (useOutputLayer) { deltaK *= Uk[k_1]; } } if (useHiddenLayer) { deltaK *= fDeriv[k_1]; } if (useOutputLayer) { if (flags.sparseOutputLayer || flags.tieOutputLayer) { if (k_1 % outputSize == givenLabelIndex) { double[] WhatK = null; if (j == 0) { WhatK = What[k_1]; } else { WhatK = What4Edge[k_1]; } foreach (int cliqueFeature in cliqueFeatures) { WhatK[cliqueFeature] += deltaK; } } } else { double[] WhatK = null; if (j == 0) { WhatK = What[k_1]; } else { WhatK = What4Edge[k_1]; } foreach (int cliqueFeature in cliqueFeatures) { WhatK[cliqueFeature] += deltaK; } } } else { if (k_1 == givenLabelIndex) { double[] WhatK = null; if (j == 0) { WhatK = What[k_1]; } else { WhatK = What4Edge[k_1]; } foreach (int cliqueFeature in cliqueFeatures) { WhatK[cliqueFeature] += deltaK; } } } } for (int k_2 = 0; k_2 < labelIndex.Size(); k_2++) { // labelIndex.size() == numClasses int[] label = 
labelIndex.Get(k_2).GetLabel();
          double p = cliqueTree.Prob(i_1, label); // probability of these labels occurring in this clique with these features
          double[] Uk2 = null;
          double[] eUK = null;
          double[] Yk2 = null;
          if (flags.tieOutputLayer) {
            if (j == 0) { // for node features
              Uk2 = U[0];
              eUK = eU[0];
            } else {
              Uk2 = U4Edge[0];
              eUK = eU4Edge[0];
            }
            if (flags.softmaxOutputLayer) {
              if (j == 0) {
                Yk2 = Y[0];
              } else {
                Yk2 = Y4Edge[0];
              }
            }
          } else {
            if (j == 0) {
              Uk2 = U[k_2];
              eUK = eU[k_2];
            } else {
              Uk2 = U4Edge[k_2];
              eUK = eU4Edge[k_2];
            }
            if (flags.softmaxOutputLayer) {
              if (j == 0) {
                Yk2 = Y[k_2];
              } else {
                Yk2 = Y4Edge[k_2];
              }
            }
          }
          if (useOutputLayer) {
            for (int q_1 = 0; q_1 < inputSize; q_1++) {
              double deltaQ = 1;
              if (flags.sparseOutputLayer || flags.tieOutputLayer) {
                if (q_1 % outputSize == k_2) {
                  int hiddenUnitNo = q_1 / outputSize;
                  if (flags.softmaxOutputLayer) {
                    eUK[hiddenUnitNo] += (yTimesA[k_2][hiddenUnitNo] - Yk2[hiddenUnitNo] * sumOfYTimesA[k_2]) * p;
                    deltaQ = Yk2[hiddenUnitNo];
                  } else {
                    eUK[hiddenUnitNo] += As[q_1] * p;
                    deltaQ = Uk2[hiddenUnitNo];
                  }
                }
              } else {
                eUK[q_1] += As[q_1] * p;
                deltaQ = Uk2[q_1];
              }
              if (useHiddenLayer) {
                deltaQ *= fDeriv[q_1];
              }
              if (flags.sparseOutputLayer || flags.tieOutputLayer) {
                if (q_1 % outputSize == k_2) {
                  double[] eWq = null;
                  if (j == 0) {
                    eWq = eW[q_1];
                  } else {
                    eWq = eW4Edge[q_1];
                  }
                  foreach (int cliqueFeature in cliqueFeatures) {
                    eWq[cliqueFeature] += deltaQ * p;
                  }
                }
              } else {
                double[] eWq = null;
                if (j == 0) {
                  eWq = eW[q_1];
                } else {
                  eWq = eW4Edge[q_1];
                }
                foreach (int cliqueFeature in cliqueFeatures) {
                  eWq[cliqueFeature] += deltaQ * p;
                }
              }
            }
          } else {
            double deltaK = 1;
            if (useHiddenLayer) {
              deltaK *= fDeriv[k_2];
            }
            double[] eWK = null;
            if (j == 0) {
              eWK = eW[k_2];
            } else {
              eWK = eW4Edge[k_2];
            }
            foreach (int cliqueFeature in cliqueFeatures) {
              eWK[cliqueFeature] += deltaK * p;
            }
          }
        }
      }
    }
  }
  if (double.IsNaN(prob)) { // shouldn't be the case
    throw new Exception("Got NaN for prob in CRFNonLinearSecondOrderLogConditionalObjectiveFunction.calculate()");
  }
  value = -prob;
  if (Verbose) {
    log.Info("value is " + value);
  }
  // compute the partial derivative for each feature by comparing expected counts to empirical counts
  int index = 0;
  for (int i_2 = 0; i_2 < eW4Edge.Length; i_2++) {
    for (int j = 0; j < eW4Edge[i_2].Length; j++) {
      derivative[index++] = (eW4Edge[i_2][j] - What4Edge[i_2][j]);
      if (Verbose) {
        log.Info("inputLayerWeights4Edge deriv(" + i_2 + "," + j + ") = " + eW4Edge[i_2][j] + " - " + What4Edge[i_2][j] + " = " + derivative[index - 1]);
      }
    }
  }
  for (int i_3 = 0; i_3 < eW.Length; i_3++) {
    for (int j = 0; j < eW[i_3].Length; j++) {
      derivative[index++] = (eW[i_3][j] - What[i_3][j]);
      if (Verbose) {
        log.Info("inputLayerWeights deriv(" + i_3 + "," + j + ") = " + eW[i_3][j] + " - " + What[i_3][j] + " = " + derivative[index - 1]);
      }
    }
  }
  if (index != beforeOutputWeights) {
    throw new Exception("after W derivative, index(" + index + ") != beforeOutputWeights(" + beforeOutputWeights + ")");
  }
  if (useOutputLayer) {
    for (int i_3 = 0; i_3 < eU4Edge.Length; i_3++) {
      for (int j = 0; j < eU4Edge[i_3].Length; j++) {
        derivative[index++] = (eU4Edge[i_3][j] - Uhat4Edge[i_3][j]);
        if (Verbose) {
          log.Info("outputLayerWeights4Edge deriv(" + i_3 + "," + j + ") = " + eU4Edge[i_3][j] + " - " + Uhat4Edge[i_3][j] + " = " + derivative[index - 1]);
        }
      }
    }
    for (int i_1 = 0; i_1 < eU.Length; i_1++) {
      for (int j = 0; j < eU[i_1].Length; j++) {
        derivative[index++] = (eU[i_1][j] - Uhat[i_1][j]);
        if (Verbose) {
          log.Info("outputLayerWeights deriv(" + i_1 + "," + j + ") = " + eU[i_1][j] + " - " + Uhat[i_1][j] + " = " + derivative[index - 1]);
        }
      }
    }
  }
  if (index != x.Length) {
    throw new Exception("after U derivative, index(" + index + ") != x.length(" + x.Length + ")");
  }
  int regSize = x.Length;
  if (flags.skipOutputRegularization || flags.softmaxOutputLayer) {
    regSize = beforeOutputWeights;
  }
  // incorporate priors
  if (prior == QuadraticPrior) {
    double sigmaSq = sigma * sigma;
    for (int i_3 = 0; i_3 < regSize; i_3++) {
      double k = 1.0;
      double w = x[i_3];
      value += k * w * w / 2.0 / sigmaSq;
      derivative[i_3] += k * w / sigmaSq;
    }
  } else {
    if (prior == HuberPrior) {
      double sigmaSq = sigma * sigma;
      for (int i_3 = 0; i_3 < regSize; i_3++) {
        double w = x[i_3];
        double wabs = System.Math.Abs(w);
        if (wabs < epsilon) {
          value += w * w / 2.0 / epsilon / sigmaSq;
          derivative[i_3] += w / epsilon / sigmaSq;
        } else {
          value += (wabs - epsilon / 2) / sigmaSq;
          derivative[i_3] += ((w < 0.0) ? -1.0 : 1.0) / sigmaSq;
        }
      }
    } else {
      if (prior == QuarticPrior) {
        double sigmaQu = sigma * sigma * sigma * sigma;
        for (int i_3 = 0; i_3 < regSize; i_3++) {
          double k = 1.0;
          double w = x[i_3];
          value += k * w * w * w * w / 2.0 / sigmaQu;
          derivative[i_3] += k * w / sigmaQu;
        }
      }
    }
  }
}
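// Recap of the gradient computation above (sketch in the code's own symbols): the hidden activations are
// As[q] = f(W[q] . cliqueFeatures), and fDeriv[q] is the activation derivative, As[q] * (1 - As[q]) for the
// logistic case (useSigmoid) and 1 - As[q] * As[q] for the tanh case. Empirical counts accumulate into
// What/Uhat for the observed labeling, expected counts accumulate into eW/eU weighted by the clique marginal
// p = cliqueTree.Prob(i, label), and each derivative entry is their difference, e.g. eW[i][j] - What[i][j],
// followed by the chosen prior's regularization term.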
private double GetDropoutPrior(CRFCliqueTree <string> cliqueTree, int[][][] docData, IDictionary <int, double[]> EForADoc, IList <ICollection <int> > docDataHash, int[] activeFeatures, IDictionary <int, double[]> dropoutPriorGrad, IDictionary <int, IList <int> > condensedFeaturesMap, IList <IDictionary <int, double[]> > EForADocPos) { IDictionary <int, double[]> dropoutPriorGradFirstHalf = SparseE(activeFeatures); Timing timer = new Timing(); double priorValue = 0; long elapsedMs = 0; Pair <double[][][], double[][][]> condProbs = GetCondProbs(cliqueTree, docData); // first index position is curr index, second index curr-class, third index prev-class // e.g. [1][2][3] means curr is at position 1 with class 2, prev is at position 0 with class 3 double[][][] prevGivenCurr = condProbs.First(); // first index position is curr index, second index curr-class, third index next-class // e.g. [0][2][3] means curr is at position 0 with class 2, next is at position 1 with class 3 double[][][] nextGivenCurr = condProbs.Second(); // first dim is doc length (i) // second dim is numOfFeatures (fIndex) // third dim is numClasses (y) // fourth dim is labelIndexSize (matching the clique type of fIndex, for \theta) double[][][][] FAlpha = null; double[][][][] FBeta = null; if (!dropoutApprox) { FAlpha = new double[docData.Length][][][]; FBeta = new double[docData.Length][][][]; } for (int i = 0; i < docData.Length; i++) { if (!dropoutApprox) { FAlpha[i] = new double[activeFeatures.Length][][]; FBeta[i] = new double[activeFeatures.Length][][]; } } if (!dropoutApprox) { // computing FAlpha int fIndex = 0; double aa; double bb; double cc = 0; bool prevFeaturePresent = false; for (int i_1 = 1; i_1 < docData.Length; i_1++) { // for each possible clique at this position ICollection <int> docDataHashIMinusOne = docDataHash[i_1 - 1]; for (int fIndexPos = 0; fIndexPos < activeFeatures.Length; fIndexPos++) { fIndex = activeFeatures[fIndexPos]; prevFeaturePresent = docDataHashIMinusOne.Contains(fIndex); int j = map[fIndex]; IIndex <CRFLabel> labelIndex = labelIndices[j]; int labelIndexSize = labelIndex.Size(); if (FAlpha[i_1 - 1][fIndexPos] == null) { FAlpha[i_1 - 1][fIndexPos] = new double[numClasses][]; for (int q = 0; q < numClasses; q++) { FAlpha[i_1 - 1][fIndexPos][q] = new double[labelIndexSize]; } } foreach (KeyValuePair <int, IList <int> > entry in currPrevLabelsMap) { int y = entry.Key; // value at i-1 double[] sum = new double[labelIndexSize]; foreach (int yPrime in entry.Value) { // value at i-2 for (int kk = 0; kk < labelIndexSize; kk++) { int[] prevLabel = labelIndex.Get(kk).GetLabel(); aa = (prevGivenCurr[i_1 - 1][y][yPrime]); bb = (prevFeaturePresent && ((j == 0 && prevLabel[0] == y) || (j == 1 && prevLabel[1] == y && prevLabel[0] == yPrime)) ? 1 : 0); cc = 0; if (FAlpha[i_1 - 1][fIndexPos][yPrime] != null) { cc = FAlpha[i_1 - 1][fIndexPos][yPrime][kk]; } sum[kk] += aa * (bb + cc); } } // sum[kk] += (prevGivenCurr[i-1][y][yPrime]) * ((prevFeaturePresent && ((j == 0 && prevLabel[0] == y) || (j == 1 && prevLabel[1] == y && prevLabel[0] == yPrime)) ? 
1 : 0) + FAlpha[i-1][fIndexPos][yPrime][kk]); if (FAlpha[i_1][fIndexPos] == null) { FAlpha[i_1][fIndexPos] = new double[numClasses][]; } FAlpha[i_1][fIndexPos][y] = sum; } } } // computing FBeta int docDataLen = docData.Length; for (int i_2 = docDataLen - 2; i_2 >= 0; i_2--) { ICollection <int> docDataHashIPlusOne = docDataHash[i_2 + 1]; // for each possible clique at this position for (int fIndexPos = 0; fIndexPos < activeFeatures.Length; fIndexPos++) { fIndex = activeFeatures[fIndexPos]; bool nextFeaturePresent = docDataHashIPlusOne.Contains(fIndex); int j = map[fIndex]; IIndex <CRFLabel> labelIndex = labelIndices[j]; int labelIndexSize = labelIndex.Size(); if (FBeta[i_2 + 1][fIndexPos] == null) { FBeta[i_2 + 1][fIndexPos] = new double[numClasses][]; for (int q = 0; q < numClasses; q++) { FBeta[i_2 + 1][fIndexPos][q] = new double[labelIndexSize]; } } foreach (KeyValuePair <int, IList <int> > entry in currNextLabelsMap) { int y = entry.Key; // value at i double[] sum = new double[labelIndexSize]; foreach (int yPrime in entry.Value) { // value at i+1 for (int kk = 0; kk < labelIndexSize; kk++) { int[] nextLabel = labelIndex.Get(kk).GetLabel(); // log.info("labelIndexSize:"+labelIndexSize+", nextGivenCurr:"+nextGivenCurr+", nextLabel:"+nextLabel+", FBeta["+(i+1)+"]["+ fIndexPos +"]["+yPrime+"] :"+FBeta[i+1][fIndexPos][yPrime]); aa = (nextGivenCurr[i_2][y][yPrime]); bb = (nextFeaturePresent && ((j == 0 && nextLabel[0] == yPrime) || (j == 1 && nextLabel[0] == y && nextLabel[1] == yPrime)) ? 1 : 0); cc = 0; if (FBeta[i_2 + 1][fIndexPos][yPrime] != null) { cc = FBeta[i_2 + 1][fIndexPos][yPrime][kk]; } sum[kk] += aa * (bb + cc); } } // sum[kk] += (nextGivenCurr[i][y][yPrime]) * ( (nextFeaturePresent && ((j == 0 && nextLabel[0] == yPrime) || (j == 1 && nextLabel[0] == y && nextLabel[1] == yPrime)) ? 
1 : 0) + FBeta[i+1][fIndexPos][yPrime][kk]); if (FBeta[i_2][fIndexPos] == null) { FBeta[i_2][fIndexPos] = new double[numClasses][]; } FBeta[i_2][fIndexPos][y] = sum; } } } } // derivative equals: VarU' * PtYYp * (1-PtYYp) + VarU * PtYYp' * (1-PtYYp) + VarU * PtYYp * (1-PtYYp)' // derivative equals: VarU' * PtYYp * (1-PtYYp) + VarU * PtYYp' * (1-PtYYp) + VarU * PtYYp * -PtYYp' // derivative equals: VarU' * PtYYp * (1-PtYYp) + VarU * PtYYp' * (1 - 2 * PtYYp) double deltaDivByOneMinusDelta = delta / (1.0 - delta); Timing innerTimer = new Timing(); long eTiming = 0; long dropoutTiming = 0; bool containsFeature = false; // iterate over the positions in this document for (int i_3 = 1; i_3 < docData.Length; i_3++) { ICollection <int> docDataHashI = docDataHash[i_3]; IDictionary <int, double[]> EForADocPosAtI = null; if (dropoutApprox) { EForADocPosAtI = EForADocPos[i_3]; } // for each possible clique at this position for (int k = 0; k < edgeLabelIndexSize; k++) { // sum over (y, y') int[] label = edgeLabels[k]; int y = label[0]; int yP = label[1]; // important to use label as an int[] for calculating cliqueTree.prob() // if it's a node clique, and label index is 2, if we don't use int[]{2} but just pass 2, // cliqueTree is going to treat it as index of the edge clique labels, and convert 2 // into int[]{0,2}, and return the edge prob marginal instead of node marginal double PtYYp = cliqueTree.Prob(i_3, label); double PtYYpTimesOneMinusPtYYp = PtYYp * (1.0 - PtYYp); double oneMinus2PtYYp = (1.0 - 2 * PtYYp); double USum = 0; int fIndex; for (int jjj = 0; jjj < labelIndices.Count; jjj++) { for (int n = 0; n < docData[i_3][jjj].Length; n++) { fIndex = docData[i_3][jjj][n]; int valIndex; if (jjj == 1) { valIndex = k; } else { valIndex = yP; } double theta; try { theta = weights[fIndex][valIndex]; } catch (Exception ex) { System.Console.Error.Printf("weights[%d][%d], map[%d]=%d, labelIndices.get(map[%d]).size() = %d, weights.length=%d\n", fIndex, valIndex, fIndex, map[fIndex], fIndex, labelIndices[map[fIndex]].Size(), weights.Length); throw new Exception(ex); } USum += weightSquare[fIndex][valIndex]; // first half of derivative: VarU' * PtYYp * (1-PtYYp) double VarUp = deltaDivByOneMinusDelta * theta; IncreScoreAllowNull(dropoutPriorGradFirstHalf, fIndex, valIndex, VarUp * PtYYpTimesOneMinusPtYYp); } } double VarU = 0.5 * deltaDivByOneMinusDelta * USum; // update function objective priorValue += VarU * PtYYpTimesOneMinusPtYYp; double VarUTimesOneMinus2PtYYp = VarU * oneMinus2PtYYp; // second half of derivative: VarU * PtYYp' * (1 - 2 * PtYYp) // boolean prevFeaturePresent = false; // boolean nextFeaturePresent = false; for (int fIndexPos = 0; fIndexPos < activeFeatures.Length; fIndexPos++) { fIndex = activeFeatures[fIndexPos]; containsFeature = docDataHashI.Contains(fIndex); // if (!containsFeature) continue; int jj = map[fIndex]; IIndex <CRFLabel> fLabelIndex = labelIndices[jj]; for (int kk = 0; kk < fLabelIndex.Size(); kk++) { // for all parameter \theta int[] fLabel = fLabelIndex.Get(kk).GetLabel(); // if (FAlpha[i] != null) // log.info("fIndex: " + fIndex+", FAlpha[i].size:"+FAlpha[i].length); double fCount = containsFeature && ((jj == 0 && fLabel[0] == yP) || (jj == 1 && k == kk)) ? 1 : 0; double alpha; double beta; double condE; double PtYYpPrime; if (!dropoutApprox) { alpha = ((FAlpha[i_3][fIndexPos] == null || FAlpha[i_3][fIndexPos][y] == null) ? 0 : FAlpha[i_3][fIndexPos][y][kk]); beta = ((FBeta[i_3][fIndexPos] == null || FBeta[i_3][fIndexPos][yP] == null) ? 
0 : FBeta[i_3][fIndexPos][yP][kk]); condE = fCount + alpha + beta; PtYYpPrime = PtYYp * (condE - EForADoc[fIndex][kk]); } else { double E = 0; if (EForADocPosAtI.Contains(fIndex)) { E = EForADocPosAtI[fIndex][kk]; } condE = fCount; PtYYpPrime = PtYYp * (condE - E); } IncreScore(dropoutPriorGrad, fIndex, kk, VarUTimesOneMinus2PtYYp * PtYYpPrime); } } } } // copy for condensedFeaturesMap foreach (KeyValuePair <int, IList <int> > entry in condensedFeaturesMap) { int key = entry.Key; IList <int> aList = entry.Value; foreach (int toCopyInto in aList) { double[] arr = dropoutPriorGrad[key]; double[] targetArr = new double[arr.Length]; for (int i_1 = 0; i_1 < arr.Length; i_1++) { targetArr[i_1] = arr[i_1]; } dropoutPriorGrad[toCopyInto] = targetArr; } } foreach (KeyValuePair <int, double[]> entry_1 in dropoutPriorGrad) { int key = entry_1.Key; double[] target = entry_1.Value; if (dropoutPriorGradFirstHalf.Contains(key)) { double[] source = dropoutPriorGradFirstHalf[key]; for (int i_1 = 0; i_1 < target.Length; i_1++) { target[i_1] += source[i_1]; } } } // for (int i=0;i<dropoutPriorGrad.length;i++) // for (int j=0; j<dropoutPriorGrad[i].length;j++) { // if (DEBUG3) // System.err.printf("f=%d, k=%d, dropoutPriorGradFirstHalf[%d][%d]=% 5.3f, dropoutPriorGrad[%d][%d]=% 5.3f\n", i, j, i, j, dropoutPriorGradFirstHalf[i][j], i, j, dropoutPriorGrad[i][j]); // dropoutPriorGrad[i][j] += dropoutPriorGradFirstHalf[i][j]; // } return(dropoutScale * priorValue); }
private Quadruple <int, double, IDictionary <int, double[]>, IDictionary <int, double[]> > ExpectedCountsAndValueForADoc(int docIndex, bool skipExpectedCountCalc, bool skipValCalc) { int[] activeFeatures = dataFeatureHashByDoc[docIndex]; IList <ICollection <int> > docDataHash = dataFeatureHash[docIndex]; IDictionary <int, IList <int> > condensedFeaturesMap = condensedMap[docIndex]; double prob = 0; int[][][] docData = totalData[docIndex]; int[] docLabels = null; if (docIndex < labels.Length) { docLabels = labels[docIndex]; } Timing timer = new Timing(); double[][][] featureVal3DArr = null; if (featureVal != null) { featureVal3DArr = featureVal[docIndex]; } // make a clique tree for this document CRFCliqueTree <string> cliqueTree = CRFCliqueTree.GetCalibratedCliqueTree(docData, labelIndices, numClasses, classIndex, backgroundSymbol, cliquePotentialFunc, featureVal3DArr); if (!skipValCalc) { // compute the log probability of the document given the model with the parameters x int[] given = new int[window - 1]; Arrays.Fill(given, classIndex.IndexOf(backgroundSymbol)); if (docLabels.Length > docData.Length) { // only true for self-training // fill the given array with the extra docLabels System.Array.Copy(docLabels, 0, given, 0, given.Length); // shift the docLabels array left int[] newDocLabels = new int[docData.Length]; System.Array.Copy(docLabels, docLabels.Length - newDocLabels.Length, newDocLabels, 0, newDocLabels.Length); docLabels = newDocLabels; } double startPosLogProb = cliqueTree.LogProbStartPos(); if (Verbose) { System.Console.Error.Printf("P_-1(Background) = % 5.3f\n", startPosLogProb); } prob += startPosLogProb; // iterate over the positions in this document for (int i = 0; i < docData.Length; i++) { int label = docLabels[i]; double p = cliqueTree.CondLogProbGivenPrevious(i, label, given); if (Verbose) { log.Info("P(" + label + "|" + ArrayMath.ToString(given) + ")=" + System.Math.Exp(p)); } prob += p; System.Array.Copy(given, 1, given, 0, given.Length - 1); given[given.Length - 1] = label; } } IDictionary <int, double[]> EForADoc = SparseE(activeFeatures); IList <IDictionary <int, double[]> > EForADocPos = null; if (dropoutApprox) { EForADocPos = new List <IDictionary <int, double[]> >(docData.Length); } if (!skipExpectedCountCalc) { // compute the expected counts for this document, which we will need to compute the derivative // iterate over the positions in this document double fVal = 1.0; for (int i = 0; i < docData.Length; i++) { ICollection <int> docDataHashI = docDataHash[i]; IDictionary <int, double[]> EForADocPosAtI = null; if (dropoutApprox) { EForADocPosAtI = SparseE(docDataHashI); } foreach (int fIndex in docDataHashI) { int j = map[fIndex]; IIndex <CRFLabel> labelIndex = labelIndices[j]; // for each possible labeling for that clique for (int k = 0; k < labelIndex.Size(); k++) { int[] label = labelIndex.Get(k).GetLabel(); double p = cliqueTree.Prob(i, label); // probability of these labels occurring in this clique with these features if (dropoutApprox) { IncreScore(EForADocPosAtI, fIndex, k, fVal * p); } IncreScore(EForADoc, fIndex, k, fVal * p); } } if (dropoutApprox) { foreach (int fIndex_1 in docDataHashI) { if (condensedFeaturesMap.Contains(fIndex_1)) { IList <int> aList = condensedFeaturesMap[fIndex_1]; foreach (int toCopyInto in aList) { double[] arr = EForADocPosAtI[fIndex_1]; double[] targetArr = new double[arr.Length]; for (int q = 0; q < arr.Length; q++) { targetArr[q] = arr[q]; } EForADocPosAtI[toCopyInto] = targetArr; } } } EForADocPos.Add(EForADocPosAtI); } 
} // copy for condensedFeaturesMap foreach (KeyValuePair <int, IList <int> > entry in condensedFeaturesMap) { int key = entry.Key; IList <int> aList = entry.Value; foreach (int toCopyInto in aList) { double[] arr = EForADoc[key]; double[] targetArr = new double[arr.Length]; for (int i_1 = 0; i_1 < arr.Length; i_1++) { targetArr[i_1] = arr[i_1]; } EForADoc[toCopyInto] = targetArr; } } } IDictionary <int, double[]> dropoutPriorGrad = null; if (prior == DropoutPrior) { // we can optimize this, this is too large, don't need this big dropoutPriorGrad = SparseE(activeFeatures); // log.info("computing dropout prior for doc " + docIndex + " ... "); prob -= GetDropoutPrior(cliqueTree, docData, EForADoc, docDataHash, activeFeatures, dropoutPriorGrad, condensedFeaturesMap, EForADocPos); } // log.info(" done!"); return(new Quadruple <int, double, IDictionary <int, double[]>, IDictionary <int, double[]> >(docIndex, prob, EForADoc, dropoutPriorGrad)); }
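// Note (sketch): EForADoc is a sparse, per-document analogue of the dense E[feature][labeling] table used by
// the non-dropout objective; SparseE(activeFeatures) allocates one double[] per active feature and
// IncreScore(EForADoc, fIndex, k, fVal * p) accumulates the same clique-marginal mass. The returned Quadruple
// carries (docIndex, this document's log-likelihood contribution, its expected counts, and its dropout prior
// gradient), which the caller is presumably responsible for aggregating across documents.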
// todo [cdm]: Below data[m] --> docData /// <summary>Calculates both value and partial derivatives at the point x, and save them internally.</summary> protected internal override void Calculate(double[] x) { double prob = 0.0; // the log prob of the sequence given the model, which is the negation of value at this point double[][][] E = Empty2D(); double[] eScales = new double[numLopExpert]; double[] rawScales = SeparateLopScales(x); double[] scales = ArrayMath.Softmax(rawScales); double[][][] learnedLopExpertWeights2D = lopExpertWeights2D; if (backpropTraining) { learnedLopExpertWeights2D = SeparateLopExpertWeights2D(x); LogPotential(learnedLopExpertWeights2D); } double[][] combinedWeights2D = CombineAndScaleLopWeights2D(numLopExpert, learnedLopExpertWeights2D, scales); // iterate over all the documents for (int m = 0; m < data.Length; m++) { int[][][] docData = data[m]; int[] docLabels = labels[m]; double[][][][] sumOfELPm = sumOfExpectedLogPotential[m]; // sumOfExpectedLogPotential[m][i][j][lopIter][k] m-docNo;i-position;j-cliqueNo;k-label // make a clique tree for this document ICliquePotentialFunction cliquePotentialFunc = new LinearCliquePotentialFunction(combinedWeights2D); CRFCliqueTree <string> cliqueTree = CRFCliqueTree.GetCalibratedCliqueTree(docData, labelIndices, numClasses, classIndex, backgroundSymbol, cliquePotentialFunc, null); // compute the log probability of the document given the model with the parameters x int[] given = new int[window - 1]; Arrays.Fill(given, classIndex.IndexOf(backgroundSymbol)); if (docLabels.Length > docData.Length) { // only true for self-training // fill the given array with the extra docLabels System.Array.Copy(docLabels, 0, given, 0, given.Length); // shift the docLabels array left int[] newDocLabels = new int[docData.Length]; System.Array.Copy(docLabels, docLabels.Length - newDocLabels.Length, newDocLabels, 0, newDocLabels.Length); docLabels = newDocLabels; } // iterate over the positions in this document for (int i = 0; i < docData.Length; i++) { int label = docLabels[i]; double p = cliqueTree.CondLogProbGivenPrevious(i, label, given); if (Verbose) { log.Info("P(" + label + "|" + ArrayMath.ToString(given) + ")=" + p); } prob += p; System.Array.Copy(given, 1, given, 0, given.Length - 1); given[given.Length - 1] = label; } // compute the expected counts for this document, which we will need to compute the derivative // iterate over the positions in this document for (int i_1 = 0; i_1 < docData.Length; i_1++) { // for each possible clique at this position double[][][] sumOfELPmi = sumOfELPm[i_1]; for (int j = 0; j < docData[i_1].Length; j++) { double[][] sumOfELPmij = sumOfELPmi[j]; IIndex <CRFLabel> labelIndex = labelIndices[j]; // for each possible labeling for that clique for (int l = 0; l < labelIndex.Size(); l++) { int[] label = labelIndex.Get(l).GetLabel(); double p = cliqueTree.Prob(i_1, label); // probability of these labels occurring in this clique with these features for (int lopIter = 0; lopIter < numLopExpert; lopIter++) { ICollection <int> indicesSet = featureIndicesSetArray[lopIter]; double scale = scales[lopIter]; double expected = sumOfELPmij[lopIter][l]; for (int innerLopIter = 0; innerLopIter < numLopExpert; innerLopIter++) { expected -= scales[innerLopIter] * sumOfELPmij[innerLopIter][l]; } expected *= scale; eScales[lopIter] += (p * expected); double[][] eOfIter = E[lopIter]; if (backpropTraining) { for (int k = 0; k < docData[i_1][j].Length; k++) { // k iterates over features int featureIdx = docData[i_1][j][k]; if 
(indicesSet.Contains(featureIdx)) {
                  eOfIter[featureIdx][l] += p;
                }
              }
            }
          }
        }
      }
    }
  }
  if (double.IsNaN(prob)) { // shouldn't be the case
    throw new Exception("Got NaN for prob in CRFLogConditionalObjectiveFunctionForLOP.calculate()");
  }
  value = -prob;
  if (Verbose) {
    log.Info("value is " + value);
  }
  // compute the partial derivative for each feature by comparing expected counts to empirical counts
  for (int lopIter_1 = 0; lopIter_1 < numLopExpert; lopIter_1++) {
    double scale = scales[lopIter_1];
    double observed = sumOfObservedLogPotential[lopIter_1];
    for (int j = 0; j < numLopExpert; j++) {
      observed -= scales[j] * sumOfObservedLogPotential[j];
    }
    observed *= scale;
    double expected = eScales[lopIter_1];
    derivative[lopIter_1] = (expected - observed);
    if (Verbose) {
      log.Info("deriv(" + lopIter_1 + ") = " + expected + " - " + observed + " = " + derivative[lopIter_1]);
    }
  }
  if (backpropTraining) {
    int dIndex = numLopExpert;
    for (int lopIter_1 = 0; lopIter_1 < numLopExpert; lopIter_1++) {
      double scale = scales[lopIter_1];
      double[][] eOfExpert = E[lopIter_1];
      double[][] ehatOfExpert = Ehat[lopIter_1];
      IList<int> featureIndicesList = featureIndicesListArray[lopIter_1];
      foreach (int fIndex in featureIndicesList) {
        for (int j = 0; j < eOfExpert[fIndex].Length; j++) {
          derivative[dIndex++] = scale * (eOfExpert[fIndex][j] - ehatOfExpert[fIndex][j]);
          if (Verbose) {
            log.Info("deriv[" + lopIter_1 + "](" + fIndex + "," + j + ") = " + scale + " * (" + eOfExpert[fIndex][j] + " - " + ehatOfExpert[fIndex][j] + ") = " + derivative[dIndex - 1]);
          }
        }
      }
    }
    System.Diagnostics.Debug.Assert((dIndex == DomainDimension()));
  }
}
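// Sketch of the scale gradient above: with scales = ArrayMath.Softmax(rawScales), the softmax Jacobian is
// d scales[j] / d rawScales[i] = scales[i] * (delta_ij - scales[j]). That is why both the expected and the
// observed log-potential sums are first centered by the scale-weighted total (the inner
// "expected -= scales[innerLopIter] * ..." and "observed -= scales[j] * ..." loops) and then multiplied by the
// expert's own scale before the difference (expected - observed) is written into derivative[lopIter_1].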