/*
 * Back propagation of the unfolded LDA model (Mirror descent approach)
 */
// Implemented without atomic operation
public static void BackPropagation_LDA(SparseMatrix Xt, SparseMatrix Dt, DNNRun_t DNNRun, paramModel_t paramModel, Grad_t Grad)
{
    // -------- Extract parameters --------
    int nHid = paramModel.nHid;
    int nHidLayer = paramModel.nHidLayer;
    int nOutput = paramModel.nOutput;
    float To = paramModel.To;
    string OutputType = paramModel.OutputType;
    int BatchSize = Xt.nCols;
    int nInput = paramModel.nInput;
    // -------- Back propagation --------
    DenseMatrix grad_Q_po = new DenseMatrix(DNNRun.y);
    SparseMatrix TmpSparseMat = new SparseMatrix(Xt);
    SparseMatrix grad_Q_po_Sparse = new SparseMatrix(Xt);
    DenseMatrix xi = new DenseMatrix(nHid, BatchSize);
    DenseMatrix TmpDenseMat = new DenseMatrix(nHid, BatchSize);
    DenseMatrix ThetaRatio = new DenseMatrix(nHid, BatchSize);
    DenseRowVector TmpDenseRowVec = new DenseRowVector(BatchSize);
    DenseMatrix tmp_theta_xi_b_T_OVER_theta_lm1_2 = new DenseMatrix(nHid, BatchSize);
    SparseMatrix tmp_Xt_OVER_Phitheta = new SparseMatrix(Xt);
    SparseMatrix tmp_Phi_theta_xi = new SparseMatrix(Xt);
    Grad.grad_Q_Phi.ClearValue();
    // ---- Offset of effective number of layers ----
    int[] OffsetEffNumLayer = new int[BatchSize];
    OffsetEffNumLayer[0] = 0;
    int NumTotalLayer = DNNRun.nHidLayerEffective[0];
    for (int IdxSample = 1; IdxSample < BatchSize; ++IdxSample)
    {
        OffsetEffNumLayer[IdxSample] = OffsetEffNumLayer[IdxSample - 1] + DNNRun.nHidLayerEffective[IdxSample - 1];
        NumTotalLayer += DNNRun.nHidLayerEffective[IdxSample];
    }
    // ---- Temporary variables that store the intermediate results for computing the gradients ----
    DenseMatrix tmp_theta_xi_pool = new DenseMatrix(nHid, NumTotalLayer, 0.0f);
    DenseMatrix tmp_theta_xi = new DenseMatrix(nHid, BatchSize, 0.0f);
    DenseMatrix theta_l_minus_one = new DenseMatrix(nHid, NumTotalLayer, 0.0f);
    SparseMatrix tmp_Xt_OVER_Phitheta_pool = new SparseMatrix(nInput, NumTotalLayer);
    SparseMatrix TmpSparseMat_pool = new SparseMatrix(nInput, NumTotalLayer);
    int NumTotalNz = 0;
    for (int IdxSample = 0; IdxSample < BatchSize; ++IdxSample)
    {
        int Layer_begin = OffsetEffNumLayer[IdxSample];
        int Layer_end = Layer_begin + DNNRun.nHidLayerEffective[IdxSample];
        SparseColumnVector[] tmp1 = tmp_Xt_OVER_Phitheta_pool.SparseColumnVectors;
        SparseColumnVector[] tmp2 = TmpSparseMat_pool.SparseColumnVectors;
        SparseColumnVector xt = Xt.SparseColumnVectors[IdxSample];
        NumTotalNz += xt.nNonzero;
        for (int IdxLayer = Layer_begin; IdxLayer < Layer_end; ++IdxLayer)
        {
            tmp1[IdxLayer] = new SparseColumnVector(xt);
            tmp2[IdxLayer] = new SparseColumnVector(xt);
        }
    }
    int[] SparsePatternGradPhi = Xt.GetHorizontalUnionSparsePattern();
    SparseMatrix TmpGrad = new SparseMatrix(nInput, nHid, true);
    TmpGrad.SetSparsePatternForAllColumn(SparsePatternGradPhi);
    // ---- Compute grad Q wrt po if possible ----
    switch (OutputType)
    {
        case "softmaxCE":
            MatrixOperation.MatrixSubtractMatrix(grad_Q_po, Dt);
            MatrixOperation.ScalarMultiplyMatrix(grad_Q_po, To);
            Grad.grad_Q_U.ClearValue();
            break;
        case "linearQuad":
            MatrixOperation.MatrixSubtractMatrix(grad_Q_po, Dt);
            MatrixOperation.ScalarMultiplyMatrix(grad_Q_po, 2.0f);
            Grad.grad_Q_U.ClearValue();
            break;
        case "unsupLDA":
            Grad.grad_Q_TopPhi.SetAllValuesToZero();
            break;
        case "linearCE":
            throw new Exception("linearCE is not implemented.");
        default:
            throw new Exception("Unknown OutputType");
    }
    Parallel.For(0, BatchSize, new ParallelOptions { MaxDegreeOfParallelism = MatrixOperation.MaxMultiThreadDegree }, IdxSample =>
    {
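        // Each parallel iteration back-propagates one document (one column of Xt).
        // Per-(sample, layer) results are written to column OffsetEffNumLayer[IdxSample] + IdxLayer
        // of the *_pool buffers, so the batched matrix products after this loop need no atomic
        // accumulation (hence "implemented without atomic operation" above).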
        // ***************************************************************************
        // -------- Back propagation: top layer --------
        switch (OutputType)
        {
            case "softmaxCE":
                // ---- grad Q wrt pL (x_L) ----
                MatrixOperation.MatrixTransposeMultiplyVector(xi.DenseMatrixValue[IdxSample], paramModel.U, grad_Q_po.DenseMatrixValue[IdxSample]);
                MatrixOperation.ElementwiseVectorMultiplyVector(TmpDenseMat.DenseMatrixValue[IdxSample], DNNRun.theta_pool[DNNRun.nHidLayerEffective[IdxSample] - 1].DenseMatrixValue[IdxSample], xi.DenseMatrixValue[IdxSample]);
                TmpDenseRowVec.VectorValue[IdxSample] = TmpDenseMat.DenseMatrixValue[IdxSample].Sum();
                MatrixOperation.ScalarAddVector(xi.DenseMatrixValue[IdxSample], xi.DenseMatrixValue[IdxSample], TmpDenseRowVec.VectorValue[IdxSample] * (-1.0f));
                break;
            case "linearQuad":
                // ---- grad Q wrt pL (x_L) ----
                MatrixOperation.MatrixTransposeMultiplyVector(xi.DenseMatrixValue[IdxSample], paramModel.U, grad_Q_po.DenseMatrixValue[IdxSample]);
                MatrixOperation.ElementwiseVectorMultiplyVector(TmpDenseMat.DenseMatrixValue[IdxSample], DNNRun.theta_pool[DNNRun.nHidLayerEffective[IdxSample] - 1].DenseMatrixValue[IdxSample], xi.DenseMatrixValue[IdxSample]);
                TmpDenseRowVec.VectorValue[IdxSample] = TmpDenseMat.DenseMatrixValue[IdxSample].Sum();
                MatrixOperation.ScalarAddVector(xi.DenseMatrixValue[IdxSample], xi.DenseMatrixValue[IdxSample], (-1.0f) * TmpDenseRowVec.VectorValue[IdxSample]);
                break;
            case "unsupLDA":
                // ---- grad Q wrt po ----
                MatrixOperation.MatrixMultiplyVector(grad_Q_po_Sparse.SparseColumnVectors[IdxSample], paramModel.Phi, DNNRun.theta_pool[DNNRun.nHidLayerEffective[IdxSample] - 1].DenseMatrixValue[IdxSample]);
                MatrixOperation.ElementwiseVectorDivideVector(grad_Q_po_Sparse.SparseColumnVectors[IdxSample], Xt.SparseColumnVectors[IdxSample], grad_Q_po_Sparse.SparseColumnVectors[IdxSample]);
                // ---- grad Q wrt pL (x_L) ----
                MatrixOperation.MatrixTransposeMultiplyVector(xi.DenseMatrixValue[IdxSample], paramModel.Phi, grad_Q_po_Sparse.SparseColumnVectors[IdxSample]);
                MatrixOperation.ScalarMultiplyVector(xi.DenseMatrixValue[IdxSample], -1.0f);
                MatrixOperation.ElementwiseVectorMultiplyVector(TmpDenseMat.DenseMatrixValue[IdxSample], xi.DenseMatrixValue[IdxSample], DNNRun.theta_pool[DNNRun.nHidLayerEffective[IdxSample] - 1].DenseMatrixValue[IdxSample]);
                TmpDenseRowVec.VectorValue[IdxSample] = TmpDenseMat.DenseMatrixValue[IdxSample].Sum();
                MatrixOperation.ScalarAddVector(xi.DenseMatrixValue[IdxSample], xi.DenseMatrixValue[IdxSample], (-1.0f) * TmpDenseRowVec.VectorValue[IdxSample]);
                break;
            case "linearCE":
                throw new Exception("linearCE is not implemented.");
                //break;
            default:
                throw new Exception("Unknown OutputType");
        }
        // ***************************************************************************
        // -------- Back propagation: hidden layers --------
        for (int IdxLayer = DNNRun.nHidLayerEffective[IdxSample] - 1; IdxLayer >= 0; IdxLayer--)
        {
            // ---- Compute the position in the temporary variable for the current layer at the current sample ----
            int IdxTmpVar = OffsetEffNumLayer[IdxSample] + IdxLayer;
            // ---- grad wrt b ----
            // Not implemented at the moment. (Can be used to update the Dirichlet parameter automatically.)
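            // Each pass of this loop caches the per-layer factors needed for the Phi gradient
            // (tmp_theta_xi_pool, tmp_Xt_OVER_Phitheta_pool, TmpSparseMat_pool, theta_l_minus_one)
            // at column IdxTmpVar, and then back-propagates xi from layer l to layer l-1
            // (the xi update itself is at the end of the loop body).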
            // ---- Compute the intermediate variables ----
            MatrixOperation.ElementwiseVectorMultiplyVector(tmp_theta_xi_pool.DenseMatrixValue[IdxTmpVar], DNNRun.theta_pool[IdxLayer].DenseMatrixValue[IdxSample], xi.DenseMatrixValue[IdxSample]);
            if (IdxLayer == 0)
            {
                MatrixOperation.ElementwiseVectorDivideVector(tmp_theta_xi_b_T_OVER_theta_lm1_2.DenseMatrixValue[IdxSample], tmp_theta_xi_pool.DenseMatrixValue[IdxTmpVar], DNNRun.theta0.DenseMatrixValue[IdxSample]);
            }
            else
            {
                MatrixOperation.ElementwiseVectorDivideVector(tmp_theta_xi_b_T_OVER_theta_lm1_2.DenseMatrixValue[IdxSample], tmp_theta_xi_pool.DenseMatrixValue[IdxTmpVar], DNNRun.theta_pool[IdxLayer - 1].DenseMatrixValue[IdxSample]);
            }
            if (IdxLayer == 0)
            {
                MatrixOperation.ElementwiseVectorDivideVector(tmp_theta_xi_b_T_OVER_theta_lm1_2.DenseMatrixValue[IdxSample], tmp_theta_xi_b_T_OVER_theta_lm1_2.DenseMatrixValue[IdxSample], DNNRun.theta0.DenseMatrixValue[IdxSample]);
            }
            else
            {
                MatrixOperation.ElementwiseVectorDivideVector(tmp_theta_xi_b_T_OVER_theta_lm1_2.DenseMatrixValue[IdxSample], tmp_theta_xi_b_T_OVER_theta_lm1_2.DenseMatrixValue[IdxSample], DNNRun.theta_pool[IdxLayer - 1].DenseMatrixValue[IdxSample]);
            }
            MatrixOperation.ElementwiseVectorMultiplyVector(tmp_theta_xi_b_T_OVER_theta_lm1_2.DenseMatrixValue[IdxSample], paramModel.b);
            MatrixOperation.ScalarMultiplyVector(tmp_theta_xi_b_T_OVER_theta_lm1_2.DenseMatrixValue[IdxSample], DNNRun.T_pool.DenseMatrixValuePerRow[IdxLayer].VectorValue[IdxSample]);
            // Reset the elements to zero if theta_{l-1} is zero at these positions (mainly for alpha<1 case)
            if (IdxLayer > 0)
            {
                MatrixOperation.ResetVectorSparsePattern(tmp_theta_xi_b_T_OVER_theta_lm1_2.DenseMatrixValue[IdxSample], DNNRun.theta_pool[IdxLayer - 1].DenseMatrixValue[IdxSample]);
            }
            // Continue to intermediate variable computation
            if (IdxLayer == 0) // TmpSparseMat is Phitheta_lm1
            {
                MatrixOperation.MatrixMultiplyVector(TmpSparseMat.SparseColumnVectors[IdxSample], paramModel.Phi, DNNRun.theta0.DenseMatrixValue[IdxSample]);
            }
            else
            {
                MatrixOperation.MatrixMultiplyVector(TmpSparseMat.SparseColumnVectors[IdxSample], paramModel.Phi, DNNRun.theta_pool[IdxLayer - 1].DenseMatrixValue[IdxSample]);
            }
            MatrixOperation.ElementwiseVectorDivideVector(tmp_Xt_OVER_Phitheta_pool.SparseColumnVectors[IdxTmpVar], Xt.SparseColumnVectors[IdxSample], TmpSparseMat.SparseColumnVectors[IdxSample]);
            MatrixOperation.ElementwiseVectorDivideVector(TmpSparseMat.SparseColumnVectors[IdxSample], tmp_Xt_OVER_Phitheta_pool.SparseColumnVectors[IdxTmpVar], TmpSparseMat.SparseColumnVectors[IdxSample]); // TmpSparseMat is tmp_Xt_OVER_Phitheta2
            MatrixOperation.MatrixMultiplyVector(tmp_Phi_theta_xi.SparseColumnVectors[IdxSample], paramModel.Phi, tmp_theta_xi_pool.DenseMatrixValue[IdxTmpVar]);
            MatrixOperation.ElementwiseVectorMultiplyVector(TmpSparseMat.SparseColumnVectors[IdxSample], tmp_Phi_theta_xi.SparseColumnVectors[IdxSample]); // TmpSparseMat is ( tmp_Phi_theta_xi.*tmp_Xt_OVER_Phitheta2 )
            MatrixOperation.MatrixTransposeMultiplyVector(TmpDenseMat.DenseMatrixValue[IdxSample], paramModel.Phi, TmpSparseMat.SparseColumnVectors[IdxSample]);
            MatrixOperation.ScalarMultiplyVector(TmpDenseMat.DenseMatrixValue[IdxSample], DNNRun.T_pool.DenseMatrixValuePerRow[IdxLayer].VectorValue[IdxSample]); // TmpDenseMat is tmp_Tl_Phit_xtPhiTheta2_Phi_theta_xi
            // ---- Compute the gradient wrt Phi ----
            MatrixOperation.ScalarMultiplyVector(tmp_Xt_OVER_Phitheta_pool.SparseColumnVectors[IdxTmpVar], DNNRun.T_pool.DenseMatrixValuePerRow[IdxLayer].VectorValue[IdxSample]);
            MatrixOperation.ScalarMultiplyVector(TmpSparseMat_pool.SparseColumnVectors[IdxTmpVar], TmpSparseMat.SparseColumnVectors[IdxSample], DNNRun.T_pool.DenseMatrixValuePerRow[IdxLayer].VectorValue[IdxSample] * (-1.0f));
            if (IdxLayer == 0)
            {
                theta_l_minus_one.DenseMatrixValue[IdxTmpVar] = DNNRun.theta0.DenseMatrixValue[IdxSample];
            }
            else
            {
                theta_l_minus_one.DenseMatrixValue[IdxTmpVar] = DNNRun.theta_pool[IdxLayer - 1].DenseMatrixValue[IdxSample];
            }
            // ---- Compute xi_{l-1} via back propagation ----
            if (IdxLayer > 0)
            {
                // Reset the elements to zero if theta_{l-1} is zero at these positions (mainly for alpha<1 case)
                MatrixOperation.ElementwiseVectorDivideVector(ThetaRatio.DenseMatrixValue[IdxSample], DNNRun.theta_pool[IdxLayer].DenseMatrixValue[IdxSample], DNNRun.theta_pool[IdxLayer - 1].DenseMatrixValue[IdxSample]);
                MatrixOperation.ResetVectorSparsePattern(ThetaRatio.DenseMatrixValue[IdxSample], DNNRun.theta_pool[IdxLayer - 1].DenseMatrixValue[IdxSample]);
                MatrixOperation.ElementwiseVectorMultiplyVector(xi.DenseMatrixValue[IdxSample], xi.DenseMatrixValue[IdxSample], ThetaRatio.DenseMatrixValue[IdxSample]);
                // Compute xi_{l-1} now
                MatrixOperation.VectorSubtractVector(TmpDenseMat.DenseMatrixValue[IdxSample], xi.DenseMatrixValue[IdxSample], TmpDenseMat.DenseMatrixValue[IdxSample]);
                MatrixOperation.VectorSubtractVector(TmpDenseMat.DenseMatrixValue[IdxSample], TmpDenseMat.DenseMatrixValue[IdxSample], tmp_theta_xi_b_T_OVER_theta_lm1_2.DenseMatrixValue[IdxSample]);
                MatrixOperation.ElementwiseVectorMultiplyVector(tmp_theta_xi.DenseMatrixValue[IdxSample], DNNRun.theta_pool[IdxLayer - 1].DenseMatrixValue[IdxSample], TmpDenseMat.DenseMatrixValue[IdxSample]); // tmp_theta_xi is tmp1 in matlab code
                TmpDenseRowVec.VectorValue[IdxSample] = tmp_theta_xi.DenseMatrixValue[IdxSample].Sum();
                MatrixOperation.ScalarAddVector(xi.DenseMatrixValue[IdxSample], TmpDenseMat.DenseMatrixValue[IdxSample], TmpDenseRowVec.VectorValue[IdxSample] * (-1.0f));
            }
        }
    });
    // -------- Compute the gradients --------
    // ---- Gradient with respect to U ----
    DenseMatrix Theta_Top = new DenseMatrix(nHid, BatchSize);
    for (int IdxSample = 0; IdxSample < BatchSize; ++IdxSample)
    {
        Theta_Top.DenseMatrixValue[IdxSample] = DNNRun.theta_pool[DNNRun.nHidLayerEffective[IdxSample] - 1].DenseMatrixValue[IdxSample];
    }
    switch (OutputType)
    {
        case "softmaxCE":
            // ---- grad Q wrt U ----
            MatrixOperation.MatrixMultiplyMatrixTranspose(Grad.grad_Q_U, grad_Q_po, Theta_Top);
            MatrixOperation.ScalarMultiplyMatrix(Grad.grad_Q_U, (1.0f / (float)BatchSize));
            break;
        case "linearQuad":
            // ---- grad Q wrt U ----
            MatrixOperation.MatrixMultiplyMatrixTranspose(Grad.grad_Q_U, grad_Q_po, Theta_Top);
            MatrixOperation.ScalarMultiplyMatrix(Grad.grad_Q_U, (1.0f / (float)BatchSize));
            break;
        case "unsupLDA":
            // ---- grad Q wrt Phi on top ----
            MatrixOperation.MatrixMultiplyMatrixTranspose(Grad.grad_Q_TopPhi, grad_Q_po_Sparse, Theta_Top, false);
            MatrixOperation.ScalarMultiplyMatrix(Grad.grad_Q_TopPhi, Grad.grad_Q_TopPhi, (-1.0f / (float)BatchSize));
            break;
        case "linearCE":
            throw new Exception("linearCE is not implemented.");
            //break;
        default:
            throw new Exception("Unknown OutputType");
    }
    // ---- Gradient with respect to Phi ----
    TmpGrad.SetAllValuesToZero();
    MatrixOperation.MatrixMultiplyMatrixTranspose(TmpGrad, tmp_Xt_OVER_Phitheta_pool, tmp_theta_xi_pool, true);
    MatrixOperation.MatrixMultiplyMatrixTranspose(TmpGrad, TmpSparseMat_pool, theta_l_minus_one, true);
    MatrixOperation.ScalarMultiplyMatrix(TmpGrad, TmpGrad, (1.0f / (float)BatchSize));
    MatrixOperation.MatrixAddMatrix(Grad.grad_Q_Phi, TmpGrad);
}
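// The recurring pattern above -- subtracting the theta-weighted sum from xi, followed later by an
// elementwise multiplication with theta -- appears to be the chain rule through the sum-to-one
// normalization theta = z / sum(z) used by each unfolded mirror-descent layer. A minimal standalone
// sketch of that identity (hypothetical helper, not part of the original class; assumes
// `using System;`): for Q(theta) = w'theta, dQ/dz_j = (w_j - w'theta) / sum(z), which the
// finite-difference check below confirms numerically.
private static void CheckSimplexNormalizationGradient()
{
    var rnd = new Random(0);
    int n = 5;
    double[] z = new double[n];   // pre-normalization vector (positive, like theta_{l-1} .* exp(...))
    double[] w = new double[n];   // stand-in for dQ/dtheta
    for (int i = 0; i < n; ++i)
    {
        z[i] = rnd.NextDouble() + 0.1;
        w[i] = rnd.NextDouble();
    }
    Func<double[], double> Q = v =>
    {
        double s = 0.0, q = 0.0;
        for (int i = 0; i < n; ++i) s += v[i];
        for (int i = 0; i < n; ++i) q += w[i] * v[i] / s;   // Q(theta(z)) with theta = z / sum(z)
        return q;
    };
    double sumZ = 0.0, wDotTheta = 0.0;
    for (int i = 0; i < n; ++i) sumZ += z[i];
    for (int i = 0; i < n; ++i) wDotTheta += w[i] * z[i] / sumZ;
    for (int j = 0; j < n; ++j)
    {
        double analytic = (w[j] - wDotTheta) / sumZ;   // (g_j - theta'g) / sum(z), with g = w
        double[] zp = (double[])z.Clone();
        zp[j] += 1e-6;
        double numeric = (Q(zp) - Q(z)) / 1e-6;        // forward finite difference
        Console.WriteLine("analytic {0:F6} vs numeric {1:F6}", analytic, numeric);
    }
}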