/* * Back propagation of the unfolded LDA model (Mirror descent approach) */ // Implemented without atomic operation public static void BackPropagation_LDA(SparseMatrix Xt, SparseMatrix Dt, DNNRun_t DNNRun, paramModel_t paramModel, Grad_t Grad) { // -------- Extract parameters -------- int nHid = paramModel.nHid; int nHidLayer = paramModel.nHidLayer; int nOutput = paramModel.nOutput; float To = paramModel.To; string OutputType = paramModel.OutputType; int BatchSize = Xt.nCols; int nInput = paramModel.nInput; // -------- Back propagation -------- DenseMatrix grad_Q_po = new DenseMatrix(DNNRun.y); SparseMatrix TmpSparseMat = new SparseMatrix(Xt); SparseMatrix grad_Q_po_Sparse = new SparseMatrix(Xt); DenseMatrix xi = new DenseMatrix(nHid, BatchSize); DenseMatrix TmpDenseMat = new DenseMatrix(nHid, BatchSize); DenseMatrix ThetaRatio = new DenseMatrix(nHid, BatchSize); DenseRowVector TmpDenseRowVec = new DenseRowVector(BatchSize); DenseMatrix tmp_theta_xi_b_T_OVER_theta_lm1_2 = new DenseMatrix(nHid, BatchSize); SparseMatrix tmp_Xt_OVER_Phitheta = new SparseMatrix(Xt); SparseMatrix tmp_Phi_theta_xi = new SparseMatrix(Xt); Grad.grad_Q_Phi.ClearValue(); // ---- Offset of effective number of layers ---- int[] OffsetEffNumLayer = new int[BatchSize]; OffsetEffNumLayer[0] = 0; int NumTotalLayer = DNNRun.nHidLayerEffective[0]; for (int IdxSample = 1; IdxSample < BatchSize; ++IdxSample) { OffsetEffNumLayer[IdxSample] = OffsetEffNumLayer[IdxSample - 1] + DNNRun.nHidLayerEffective[IdxSample-1]; NumTotalLayer += DNNRun.nHidLayerEffective[IdxSample]; } // ---- Temporary variables that stores the intermediate results for computing the gradients ---- DenseMatrix tmp_theta_xi_pool = new DenseMatrix(nHid, NumTotalLayer, 0.0f); DenseMatrix tmp_theta_xi = new DenseMatrix(nHid, BatchSize, 0.0f); DenseMatrix theta_l_minus_one = new DenseMatrix(nHid, NumTotalLayer, 0.0f); SparseMatrix tmp_Xt_OVER_Phitheta_pool = new SparseMatrix(nInput, NumTotalLayer); SparseMatrix TmpSparseMat_pool = new SparseMatrix(nInput, NumTotalLayer); int NumTotalNz = 0; for (int IdxSample = 0; IdxSample < BatchSize; ++IdxSample) { int Layer_begin = OffsetEffNumLayer[IdxSample]; int Layer_end = Layer_begin + DNNRun.nHidLayerEffective[IdxSample]; SparseColumnVector[] tmp1 = tmp_Xt_OVER_Phitheta_pool.SparseColumnVectors; SparseColumnVector[] tmp2 = TmpSparseMat_pool.SparseColumnVectors; SparseColumnVector xt = Xt.SparseColumnVectors[IdxSample]; NumTotalNz += xt.nNonzero; for (int IdxLayer = Layer_begin; IdxLayer < Layer_end; ++IdxLayer) { tmp1[IdxLayer] = new SparseColumnVector(xt); tmp2[IdxLayer] = new SparseColumnVector(xt); } } int[] SparsePatternGradPhi = Xt.GetHorizontalUnionSparsePattern(); SparseMatrix TmpGrad = new SparseMatrix(nInput, nHid, true); TmpGrad.SetSparsePatternForAllColumn(SparsePatternGradPhi); // ---- Compute grad Q wrt po if possible ---- switch (OutputType) { case "softmaxCE": MatrixOperation.MatrixSubtractMatrix(grad_Q_po, Dt); MatrixOperation.ScalarMultiplyMatrix(grad_Q_po, To); Grad.grad_Q_U.ClearValue(); break; case "linearQuad": MatrixOperation.MatrixSubtractMatrix(grad_Q_po, Dt); MatrixOperation.ScalarMultiplyMatrix(grad_Q_po, 2.0f); Grad.grad_Q_U.ClearValue(); break; case "unsupLDA": Grad.grad_Q_TopPhi.SetAllValuesToZero(); break; case "linearCE": throw new Exception("linearCE is not implemented."); default: throw new Exception("Unknown OutputType"); } Parallel.For(0, BatchSize, new ParallelOptions { MaxDegreeOfParallelism = MatrixOperation.MaxMultiThreadDegree }, IdxSample => { // 
*************************************************************************** // -------- Back propagation: top layer -------- switch (OutputType) { case "softmaxCE": // ---- grad Q wrt pL (x_L) ---- MatrixOperation.MatrixTransposeMultiplyVector( xi.DenseMatrixValue[IdxSample], paramModel.U, grad_Q_po.DenseMatrixValue[IdxSample] ); MatrixOperation.ElementwiseVectorMultiplyVector( TmpDenseMat.DenseMatrixValue[IdxSample], DNNRun.theta_pool[DNNRun.nHidLayerEffective[IdxSample] - 1].DenseMatrixValue[IdxSample], xi.DenseMatrixValue[IdxSample] ); TmpDenseRowVec.VectorValue[IdxSample] = TmpDenseMat.DenseMatrixValue[IdxSample].Sum(); MatrixOperation.ScalarAddVector( xi.DenseMatrixValue[IdxSample], xi.DenseMatrixValue[IdxSample], TmpDenseRowVec.VectorValue[IdxSample] * (-1.0f) ); break; case "linearQuad": // ---- grad Q wrt pL (x_L) ---- MatrixOperation.MatrixTransposeMultiplyVector( xi.DenseMatrixValue[IdxSample], paramModel.U, grad_Q_po.DenseMatrixValue[IdxSample] ); MatrixOperation.ElementwiseVectorMultiplyVector( TmpDenseMat.DenseMatrixValue[IdxSample], DNNRun.theta_pool[DNNRun.nHidLayerEffective[IdxSample] - 1].DenseMatrixValue[IdxSample], xi.DenseMatrixValue[IdxSample] ); TmpDenseRowVec.VectorValue[IdxSample] = TmpDenseMat.DenseMatrixValue[IdxSample].Sum(); MatrixOperation.ScalarAddVector( xi.DenseMatrixValue[IdxSample], xi.DenseMatrixValue[IdxSample], (-1.0f) * TmpDenseRowVec.VectorValue[IdxSample] ); break; case "unsupLDA": // ---- grad Q wrt po ---- MatrixOperation.MatrixMultiplyVector( grad_Q_po_Sparse.SparseColumnVectors[IdxSample], paramModel.Phi, DNNRun.theta_pool[DNNRun.nHidLayerEffective[IdxSample] - 1].DenseMatrixValue[IdxSample] ); MatrixOperation.ElementwiseVectorDivideVector( grad_Q_po_Sparse.SparseColumnVectors[IdxSample], Xt.SparseColumnVectors[IdxSample], grad_Q_po_Sparse.SparseColumnVectors[IdxSample] ); // ---- grad Q wrt pL (x_L) ---- MatrixOperation.MatrixTransposeMultiplyVector( xi.DenseMatrixValue[IdxSample], paramModel.Phi, grad_Q_po_Sparse.SparseColumnVectors[IdxSample] ); MatrixOperation.ScalarMultiplyVector( xi.DenseMatrixValue[IdxSample], -1.0f ); MatrixOperation.ElementwiseVectorMultiplyVector( TmpDenseMat.DenseMatrixValue[IdxSample], xi.DenseMatrixValue[IdxSample], DNNRun.theta_pool[DNNRun.nHidLayerEffective[IdxSample] - 1].DenseMatrixValue[IdxSample] ); TmpDenseRowVec.VectorValue[IdxSample] = TmpDenseMat.DenseMatrixValue[IdxSample].Sum(); MatrixOperation.ScalarAddVector( xi.DenseMatrixValue[IdxSample], xi.DenseMatrixValue[IdxSample], (-1.0f) * TmpDenseRowVec.VectorValue[IdxSample] ); break; case "linearCE": throw new Exception("linearCE is not implemented."); //break; default: throw new Exception("Unknown OutputType"); } // *************************************************************************** // -------- Back propagation: hidden layers -------- for (int IdxLayer = DNNRun.nHidLayerEffective[IdxSample] - 1; IdxLayer >= 0; IdxLayer--) { // ---- Compute the position in the temporary variable for the current layer at the current sample ---- int IdxTmpVar = OffsetEffNumLayer[IdxSample] + IdxLayer; // ---- grad wrt b --- // Not implemented at the moment. (Can be used to update the Dirichlet parameter automatically.) 
// ---- Compute the intermediate variables ---- MatrixOperation.ElementwiseVectorMultiplyVector( tmp_theta_xi_pool.DenseMatrixValue[IdxTmpVar], DNNRun.theta_pool[IdxLayer].DenseMatrixValue[IdxSample], xi.DenseMatrixValue[IdxSample] ); if (IdxLayer == 0) { MatrixOperation.ElementwiseVectorDivideVector( tmp_theta_xi_b_T_OVER_theta_lm1_2.DenseMatrixValue[IdxSample], tmp_theta_xi_pool.DenseMatrixValue[IdxTmpVar], DNNRun.theta0.DenseMatrixValue[IdxSample] ); } else { MatrixOperation.ElementwiseVectorDivideVector( tmp_theta_xi_b_T_OVER_theta_lm1_2.DenseMatrixValue[IdxSample], tmp_theta_xi_pool.DenseMatrixValue[IdxTmpVar], DNNRun.theta_pool[IdxLayer - 1].DenseMatrixValue[IdxSample] ); } if (IdxLayer == 0) { MatrixOperation.ElementwiseVectorDivideVector( tmp_theta_xi_b_T_OVER_theta_lm1_2.DenseMatrixValue[IdxSample], tmp_theta_xi_b_T_OVER_theta_lm1_2.DenseMatrixValue[IdxSample], DNNRun.theta0.DenseMatrixValue[IdxSample] ); } else { MatrixOperation.ElementwiseVectorDivideVector( tmp_theta_xi_b_T_OVER_theta_lm1_2.DenseMatrixValue[IdxSample], tmp_theta_xi_b_T_OVER_theta_lm1_2.DenseMatrixValue[IdxSample], DNNRun.theta_pool[IdxLayer - 1].DenseMatrixValue[IdxSample] ); } MatrixOperation.ElementwiseVectorMultiplyVector( tmp_theta_xi_b_T_OVER_theta_lm1_2.DenseMatrixValue[IdxSample], paramModel.b ); MatrixOperation.ScalarMultiplyVector( tmp_theta_xi_b_T_OVER_theta_lm1_2.DenseMatrixValue[IdxSample], DNNRun.T_pool.DenseMatrixValuePerRow[IdxLayer].VectorValue[IdxSample] ); // Reset the elements to zero if theta_{l-1} is zero at these positions (mainly for alpha<1 case) if (IdxLayer > 0) { MatrixOperation.ResetVectorSparsePattern( tmp_theta_xi_b_T_OVER_theta_lm1_2.DenseMatrixValue[IdxSample], DNNRun.theta_pool[IdxLayer - 1].DenseMatrixValue[IdxSample] ); } // Continue to intermediate variable computation if (IdxLayer == 0) // TmpSparseMat is Phitheta_lm1 { MatrixOperation.MatrixMultiplyVector( TmpSparseMat.SparseColumnVectors[IdxSample], paramModel.Phi, DNNRun.theta0.DenseMatrixValue[IdxSample] ); } else { MatrixOperation.MatrixMultiplyVector( TmpSparseMat.SparseColumnVectors[IdxSample], paramModel.Phi, DNNRun.theta_pool[IdxLayer - 1].DenseMatrixValue[IdxSample] ); } MatrixOperation.ElementwiseVectorDivideVector( tmp_Xt_OVER_Phitheta_pool.SparseColumnVectors[IdxTmpVar], Xt.SparseColumnVectors[IdxSample], TmpSparseMat.SparseColumnVectors[IdxSample] ); MatrixOperation.ElementwiseVectorDivideVector( TmpSparseMat.SparseColumnVectors[IdxSample], tmp_Xt_OVER_Phitheta_pool.SparseColumnVectors[IdxTmpVar], TmpSparseMat.SparseColumnVectors[IdxSample] ); // TmpSparseMat is tmp_Xt_OVER_Phitheta2 MatrixOperation.MatrixMultiplyVector( tmp_Phi_theta_xi.SparseColumnVectors[IdxSample], paramModel.Phi, tmp_theta_xi_pool.DenseMatrixValue[IdxTmpVar] ); MatrixOperation.ElementwiseVectorMultiplyVector( TmpSparseMat.SparseColumnVectors[IdxSample], tmp_Phi_theta_xi.SparseColumnVectors[IdxSample] ); // TmpSparseMat is ( tmp_Phi_theta_xi.*tmp_Xt_OVER_Phitheta2 ) MatrixOperation.MatrixTransposeMultiplyVector( TmpDenseMat.DenseMatrixValue[IdxSample], paramModel.Phi, TmpSparseMat.SparseColumnVectors[IdxSample] ); MatrixOperation.ScalarMultiplyVector( TmpDenseMat.DenseMatrixValue[IdxSample], DNNRun.T_pool.DenseMatrixValuePerRow[IdxLayer].VectorValue[IdxSample] ); // TmpDenseMat is tmp_Tl_Phit_xtPhiTheta2_Phi_theta_xi // ---- Compute the gradient wrt Phi ---- MatrixOperation.ScalarMultiplyVector( tmp_Xt_OVER_Phitheta_pool.SparseColumnVectors[IdxTmpVar], DNNRun.T_pool.DenseMatrixValuePerRow[IdxLayer].VectorValue[IdxSample] ); 
MatrixOperation.ScalarMultiplyVector( TmpSparseMat_pool.SparseColumnVectors[IdxTmpVar], TmpSparseMat.SparseColumnVectors[IdxSample], DNNRun.T_pool.DenseMatrixValuePerRow[IdxLayer].VectorValue[IdxSample]*(-1.0f) ); if (IdxLayer == 0) { theta_l_minus_one.DenseMatrixValue[IdxTmpVar] = DNNRun.theta0.DenseMatrixValue[IdxSample]; } else { theta_l_minus_one.DenseMatrixValue[IdxTmpVar] = DNNRun.theta_pool[IdxLayer - 1].DenseMatrixValue[IdxSample]; } // ---- Compute xi_{l-1} via back propagation ---- if (IdxLayer > 0) { // Reset the elements to zero if theta_{l-1} is zero at these positions (mainly for alpha<1 case) MatrixOperation.ElementwiseVectorDivideVector( ThetaRatio.DenseMatrixValue[IdxSample], DNNRun.theta_pool[IdxLayer].DenseMatrixValue[IdxSample], DNNRun.theta_pool[IdxLayer - 1].DenseMatrixValue[IdxSample] ); MatrixOperation.ResetVectorSparsePattern( ThetaRatio.DenseMatrixValue[IdxSample], DNNRun.theta_pool[IdxLayer - 1].DenseMatrixValue[IdxSample] ); MatrixOperation.ElementwiseVectorMultiplyVector( xi.DenseMatrixValue[IdxSample], xi.DenseMatrixValue[IdxSample], ThetaRatio.DenseMatrixValue[IdxSample] ); // Compute xi_{l-1} now MatrixOperation.VectorSubtractVector( TmpDenseMat.DenseMatrixValue[IdxSample], xi.DenseMatrixValue[IdxSample], TmpDenseMat.DenseMatrixValue[IdxSample] ); MatrixOperation.VectorSubtractVector( TmpDenseMat.DenseMatrixValue[IdxSample], TmpDenseMat.DenseMatrixValue[IdxSample], tmp_theta_xi_b_T_OVER_theta_lm1_2.DenseMatrixValue[IdxSample] ); MatrixOperation.ElementwiseVectorMultiplyVector( tmp_theta_xi.DenseMatrixValue[IdxSample], DNNRun.theta_pool[IdxLayer - 1].DenseMatrixValue[IdxSample], TmpDenseMat.DenseMatrixValue[IdxSample] ); // tmp_theta_xi is tmp1 in matlab code TmpDenseRowVec.VectorValue[IdxSample] = tmp_theta_xi.DenseMatrixValue[IdxSample].Sum(); MatrixOperation.ScalarAddVector( xi.DenseMatrixValue[IdxSample], TmpDenseMat.DenseMatrixValue[IdxSample], TmpDenseRowVec.VectorValue[IdxSample] * (-1.0f) ); } } }); // -------- Compute the gradients -------- // ---- Gradient with respect to U ---- DenseMatrix Theta_Top = new DenseMatrix(nHid, BatchSize); for (int IdxSample = 0; IdxSample < BatchSize; ++IdxSample ) { Theta_Top.DenseMatrixValue[IdxSample] = DNNRun.theta_pool[DNNRun.nHidLayerEffective[IdxSample] - 1].DenseMatrixValue[IdxSample]; } switch (OutputType) { case "softmaxCE": // ---- grad Q wrt U ---- MatrixOperation.MatrixMultiplyMatrixTranspose(Grad.grad_Q_U, grad_Q_po, Theta_Top); MatrixOperation.ScalarMultiplyMatrix(Grad.grad_Q_U, (1.0f / (float)BatchSize)); break; case "linearQuad": // ---- grad Q wrt U ---- MatrixOperation.MatrixMultiplyMatrixTranspose(Grad.grad_Q_U, grad_Q_po, Theta_Top); MatrixOperation.ScalarMultiplyMatrix(Grad.grad_Q_U, (1.0f / (float)BatchSize)); break; case "unsupLDA": // ---- grad Q wrt Phi on top ---- MatrixOperation.MatrixMultiplyMatrixTranspose(Grad.grad_Q_TopPhi, grad_Q_po_Sparse, Theta_Top, false); MatrixOperation.ScalarMultiplyMatrix(Grad.grad_Q_TopPhi, Grad.grad_Q_TopPhi, (-1.0f / (float)BatchSize)); break; case "linearCE": throw new Exception("linearCE is not implemented."); //break; default: throw new Exception("Unknown OutputType"); } // ---- Gradient with respect to Phi ---- TmpGrad.SetAllValuesToZero(); MatrixOperation.MatrixMultiplyMatrixTranspose(TmpGrad, tmp_Xt_OVER_Phitheta_pool, tmp_theta_xi_pool, true); MatrixOperation.MatrixMultiplyMatrixTranspose(TmpGrad, TmpSparseMat_pool, theta_l_minus_one, true); MatrixOperation.ScalarMultiplyMatrix(TmpGrad, TmpGrad, (1.0f / (float)BatchSize)); 
MatrixOperation.MatrixAddMatrix(Grad.grad_Q_Phi, TmpGrad); }
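/*
 * Minimal usage sketch (hedged; variable names mirror TrainingBP_LDA below, and the
 * construction of paramModel, DNNRun and Grad is omitted). BackPropagation_LDA assumes
 * that ForwardActivation_LDA has already been run on the same mini-batch, so that DNNRun
 * holds theta_pool, T_pool and nHidLayerEffective for every sample:
 *
 *     SparseMatrix Xt = new SparseMatrix(nInput, BatchSize);
 *     TrainData.GetColumns(Xt, IdxSample);                              // extract the batch
 *     Grad.SetSparsePatternForAllGradPhi(Xt.GetHorizontalUnionSparsePattern());
 *     LDA_Learn.ForwardActivation_LDA(Xt, DNNRun, paramModel, true);    // MDA unfolding (inference)
 *     LDA_Learn.BackPropagation_LDA(Xt, Dt, DNNRun, paramModel, Grad);  // Dt may be null for "unsupLDA"
 *     // Grad.grad_Q_Phi (plus grad_Q_U or grad_Q_TopPhi) now holds the batch-averaged gradients.
 */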
/*
 * z = x ./ (y + 1e-12): elementwise division of two dense row vectors, with a small
 * epsilon added to the denominator to avoid division by zero.
 */
public static void ElementwiseVectorDivideVector(DenseRowVector z, DenseRowVector x, DenseRowVector y)
{
    if (z.Dim != x.Dim || z.Dim != y.Dim)
    {
        throw new Exception("Dimension mismatch.");
    }
    var zVal = z.VectorValue;
    var xVal = x.VectorValue;
    var yVal = y.VectorValue;
    int zDim = z.Dim;
    for (int IdxRow = 0; IdxRow < zDim; ++IdxRow)
    {
        zVal[IdxRow] = xVal[IdxRow] / (yVal[IdxRow] + 1e-12f);
    }
}
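/*
 * Note (hedged): TrainingBP_LDA relies on this epsilon-guarded overload when converting the
 * accumulated AdaGrad-style statistics into per-column step sizes, e.g.
 *     MatrixOperation.ElementwiseVectorDivideVector(mu_phi_search, mu_phi_search, TmpDenseRowVec);
 * where TmpDenseRowVec holds sqrt(running mean of squared gradients) + mu_Phi for each
 * column of Phi, so a column that has seen only tiny gradients cannot blow up the rate.
 */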
/* * z = z - x: vector subtracts vector */ public static void VectorSubtractVector(DenseRowVector z, DenseRowVector x) { // Dimension check if (z.Dim != x.Dim) { throw new Exception("Dimension mismatch."); } // Computation var zVal = z.VectorValue; var xVal = x.VectorValue; int Dim = z.Dim; for (int IdxCol=0; IdxCol<Dim; ++IdxCol) { zVal[IdxCol] -= xVal[IdxCol]; } }
public DenseRowVector(DenseRowVector SourceVector) { Dim = SourceVector.Dim; VectorValue = new float[Dim]; DeepCopyFrom(SourceVector); }
/* * z = x.^{1/2} */ public static void ElementwiseSquareRoot(DenseRowVector z, DenseRowVector x) { if (z.Dim != x.Dim) { throw new Exception("Dimension mismatch."); } int zDim = z.Dim; var zVal = z.VectorValue; var xVal = x.VectorValue; for (int Idx = 0; Idx < zDim; ++Idx ) { zVal[Idx] = (float)Math.Sqrt(xVal[Idx]); } }
/*
 * Count the number of entries smaller than Threshold in each column of the matrix X.
 */
public static void CountValuesLessThanThreshold(DenseRowVector NumSpecialElementPerCol, DenseMatrix X, float Threshold)
{
    if (NumSpecialElementPerCol.Dim != X.nCols)
    {
        throw new Exception("Dimension mismatch.");
    }
    for (int IdxCol = 0; IdxCol < X.nCols; IdxCol++)
    {
        NumSpecialElementPerCol.VectorValue[IdxCol] = 0.0f;
        for (int IdxRow = 0; IdxRow < X.nRows; IdxRow++)
        {
            if (X.DenseMatrixValue[IdxCol].VectorValue[IdxRow] < Threshold)
            {
                NumSpecialElementPerCol.VectorValue[IdxCol]++;
            }
        }
    }
}
public static void ProjCols2SimplexPlane(DenseMatrix Z, DenseMatrix X) { if (Z.nCols != X.nCols || Z.nRows != X.nRows) { throw new Exception("Dimension mismatch."); } DenseRowVector TmpDenseRowVec = new DenseRowVector(X.nCols); MatrixOperation.VerticalSumMatrix(TmpDenseRowVec, X); MatrixOperation.ScalarMultiplyVector(TmpDenseRowVec, 1.0f / ((float)X.nRows)); MatrixOperation.bsxfunMatrixSubtractVector(Z, X, TmpDenseRowVec); MatrixOperation.ScalarAddMatrix(Z, 1.0f / ((float)X.nRows)); }
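/*
 * Note: for each column x of X the code above computes z = x - (1'x)/n + 1/n with
 * n = X.nRows, i.e. the Euclidean projection of x onto the affine hyperplane
 * { z : 1'z = 1 } (not onto the simplex itself; negative entries are not clipped).
 * Small example with n = 2: x = (0.8, 0.6) gives 1'x = 1.4 and
 * z = (0.8 - 0.7 + 0.5, 0.6 - 0.7 + 0.5) = (0.6, 0.4), which again sums to one.
 */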
/* * Training: unsupervised learning of feedforward (unfolding) LDA by back propagation */ public static void TrainingBP_LDA( SparseMatrix TrainData, SparseMatrix TestData, paramModel_t paramModel, paramTrain_t paramTrain, string ModelFile, string ResultFile ) { // ---- Extract the parameters ---- // Model parameters int nInput = paramModel.nInput; int nHid = paramModel.nHid; int nHidLayer = paramModel.nHidLayer; int nOutput = paramModel.nOutput; float eta = paramModel.eta; float T_value = paramModel.T_value; string OutputType = paramModel.OutputType; float beta = paramModel.beta; // Training parameters int nEpoch = paramTrain.nEpoch; float mu_Phi = paramTrain.mu_Phi; float mu_U = paramTrain.mu_U; int nTrain = paramTrain.nTrain; float mu_Phi_ReduceFactor = paramTrain.mu_Phi_ReduceFactor; string LearnRateSchedule = paramTrain.LearnRateSchedule; int nSamplesPerDisplay = paramTrain.nSamplesPerDisplay; int nEpochPerSave = paramTrain.nEpochPerSave; int nEpochPerTest = paramTrain.nEpochPerTest; int nEpochPerDump = paramTrain.nEpochPerDump; // ---- Initialize the model ---- ModelInit_LDA_Feedforward(paramModel); // ---- Initialize the training algorithm ---- Console.WriteLine("#################################################################"); Console.WriteLine("jvking version of BP-LDA: Mirror-Descent Back Propagation"); Console.WriteLine("#################################################################"); float TotLoss = 0.0f; float TotCE = 0.0f; double TotTime = 0.0f; double TotTimeThisEpoch = 0.0f; int TotSamples = 0; int TotSamplesThisEpoch = 0; double AvgnHidLayerEffective = 0.0; int CntRunningAvg = 0; int CntModelUpdate = 0; DenseRowVector mu_phi_search = new DenseRowVector(nHid, mu_Phi); DenseRowVector TestLoss_pool = new DenseRowVector(nEpoch / nEpochPerTest, 0.0f); DenseRowVector TestLoss_epoch = new DenseRowVector(nEpoch / nEpochPerTest, 0.0f); DenseRowVector TestLoss_time = new DenseRowVector(nEpoch / nEpochPerTest, 0.0f); int CountTest = 0; DenseRowVector G_Phi_pool = new DenseRowVector(paramModel.nHidLayer); DenseRowVector G_Phi_trunc_pool = new DenseRowVector(paramModel.nHidLayer, 0.0f); DenseRowVector AdaGradSum = new DenseRowVector(nHid, 0.0f); DenseRowVector TmpDenseRowVec = new DenseRowVector(nHid, 0.0f); int[] SparsePatternGradPhi = null; float nLearnLineSearch = 0.0f; int[] IdxPerm = null; int BatchSize_NormalBatch = paramTrain.BatchSize; int BatchSize_tmp = paramTrain.BatchSize; int nBatch = (int)Math.Ceiling(((float)nTrain) / ((float)BatchSize_NormalBatch)); DNNRun_t DNNRun_NormalBatch = new DNNRun_t(nHid, BatchSize_NormalBatch, paramModel.nHidLayer, nOutput); DNNRun_t DNNRun_EndBatch = new DNNRun_t(nHid, nTrain - (nBatch - 1) * BatchSize_NormalBatch, paramModel.nHidLayer, nOutput); DNNRun_t DNNRun = null; Grad_t Grad = new Grad_t(nHid, nOutput, nInput, paramModel.nHidLayer, OutputType); DenseMatrix TmpGradDense = new DenseMatrix(nInput, nHid); DenseMatrix TmpMatDensePhi = new DenseMatrix(nInput, nHid); paramModel_t paramModel_avg = new paramModel_t(paramModel); Stopwatch stopWatch = new Stopwatch(); // ---- Compute the schedule of the learning rate double[] stepsize_pool = null; switch (LearnRateSchedule) { case "PreCompute": stepsize_pool = PrecomputeLearningRateSchedule(nBatch, nEpoch, mu_Phi, mu_Phi / mu_Phi_ReduceFactor, 1e-8f); break; case "Constant": stepsize_pool = new double[nEpoch]; for (int Idx = 0; Idx < nEpoch; Idx++) { stepsize_pool[Idx] = mu_Phi; } break; default: throw new Exception("Unknown type of LearnRateSchedule"); } // Now start 
training......................... for (int epoch = 0; epoch < nEpoch; epoch++) { TotSamplesThisEpoch = 0; TotTimeThisEpoch = 0.0; AvgnHidLayerEffective = 0.0; // -- Set the batch size if there is schedule -- if (paramTrain.flag_BachSizeSchedule) { if (paramTrain.BachSizeSchedule.TryGetValue(epoch + 1, out BatchSize_tmp)) { BatchSize_NormalBatch = BatchSize_tmp; nBatch = (int)Math.Ceiling(((float)nTrain) / ((float)BatchSize_NormalBatch)); DNNRun_NormalBatch = new DNNRun_t(nHid, BatchSize_NormalBatch, paramModel.nHidLayer, nOutput); DNNRun_EndBatch = new DNNRun_t(nHid, nTrain - (nBatch - 1) * BatchSize_NormalBatch, paramModel.nHidLayer, nOutput); } } // -- Shuffle the data (generating shuffled index) -- IdxPerm = Statistics.RandPerm(nTrain); // -- Reset the (MDA) inference step-sizes -- if (epoch > 0) { for (int Idx = 0; Idx < paramModel.nHidLayer; Idx++) { paramModel.T[Idx] = T_value; } } // -- Take the learning rate for the current epoch -- mu_Phi = (float)stepsize_pool[epoch]; // -- Start this epoch -- Console.WriteLine("############## Epoch #{0}. BatchSize: {1} Learning Rate: {2} ##################", epoch + 1, BatchSize_NormalBatch, mu_Phi); for (int IdxBatch = 0; IdxBatch < nBatch; IdxBatch++) { stopWatch.Start(); // Extract the batch int BatchSize = 0; if (IdxBatch < nBatch - 1) { BatchSize = BatchSize_NormalBatch; DNNRun = DNNRun_NormalBatch; } else { BatchSize = nTrain - IdxBatch * BatchSize_NormalBatch; DNNRun = DNNRun_EndBatch; } SparseMatrix Xt = new SparseMatrix(nInput, BatchSize); SparseMatrix Dt = null; int[] IdxSample = new int[BatchSize]; Array.Copy(IdxPerm, IdxBatch * BatchSize_NormalBatch, IdxSample, 0, BatchSize); TrainData.GetColumns(Xt, IdxSample); // Set the sparse pattern for the gradient SparsePatternGradPhi = Xt.GetHorizontalUnionSparsePattern(); Grad.SetSparsePatternForAllGradPhi(SparsePatternGradPhi); // Forward activation LDA_Learn.ForwardActivation_LDA(Xt, DNNRun, paramModel, true); // Back propagation LDA_Learn.BackPropagation_LDA(Xt, Dt, DNNRun, paramModel, Grad); // Compute the gradient and update the model (All gradients of Phi are accumulated into Grad.grad_Q_Phi) MatrixOperation.ScalarDivideMatrix(Grad.grad_Q_Phi, (-1.0f) * ((beta - 1) / ((float)nTrain)), paramModel.Phi, true); MatrixOperation.MatrixAddMatrix(Grad.grad_Q_Phi, Grad.grad_Q_TopPhi); mu_phi_search.FillValue(mu_Phi); // Different learning rate for different columns of Phi: Similar to AdaGrad but does not decay with time ++CntModelUpdate; MatrixOperation.ElementwiseMatrixMultiplyMatrix(TmpMatDensePhi, Grad.grad_Q_Phi, Grad.grad_Q_Phi); MatrixOperation.VerticalSumMatrix(TmpDenseRowVec, TmpMatDensePhi); MatrixOperation.ScalarMultiplyVector(TmpDenseRowVec, 1.0f / ((float)nInput)); MatrixOperation.VectorSubtractVector(TmpDenseRowVec, AdaGradSum); MatrixOperation.ScalarMultiplyVector(TmpDenseRowVec, 1.0f / CntModelUpdate); MatrixOperation.VectorAddVector(AdaGradSum, TmpDenseRowVec); MatrixOperation.ElementwiseSquareRoot(TmpDenseRowVec, AdaGradSum); MatrixOperation.ScalarAddVector(TmpDenseRowVec, mu_Phi); MatrixOperation.ElementwiseVectorDivideVector(mu_phi_search, mu_phi_search, TmpDenseRowVec); nLearnLineSearch = SMD_Update(paramModel.Phi, Grad.grad_Q_Phi, mu_phi_search, eta); // Running average of the model if (paramTrain.flag_RunningAvg && epoch >= (int)Math.Ceiling(((float)nEpoch) / 2.0f)) { ++CntRunningAvg; MatrixOperation.MatrixSubtractMatrix(TmpMatDensePhi, paramModel.Phi, paramModel_avg.Phi); MatrixOperation.ScalarMultiplyMatrix(TmpMatDensePhi, 1.0f / CntRunningAvg); 
MatrixOperation.MatrixAddMatrix(paramModel_avg.Phi, TmpMatDensePhi); } // Display the result TotCE += ComputeCrossEntropy(Xt, paramModel.Phi,DNNRun.theta_pool, DNNRun.nHidLayerEffective); TotLoss = TotCE; TotSamples += BatchSize; TotSamplesThisEpoch += BatchSize; AvgnHidLayerEffective = (((float)(TotSamplesThisEpoch-BatchSize))/((float)TotSamplesThisEpoch))*AvgnHidLayerEffective + (1.0/((float)TotSamplesThisEpoch))*( DNNRun.nHidLayerEffective.Sum()); stopWatch.Stop(); TimeSpan ts = stopWatch.Elapsed; TotTime += ts.TotalSeconds; TotTimeThisEpoch += ts.TotalSeconds; stopWatch.Reset(); if (TotSamplesThisEpoch % nSamplesPerDisplay == 0) { // Display results Console.WriteLine( "* Ep#{0}/{1} Bat#{2}/{3}. Loss={4:F3}. CE={5:F3}. Speed={6} Samples/Sec.", epoch + 1, nEpoch, IdxBatch + 1, nBatch, TotLoss / TotSamples, TotCE / TotSamples, (int)((double)TotSamplesThisEpoch / TotTimeThisEpoch) ); if (paramTrain.DebugLevel == DebugLevel_t.medium) { Console.WriteLine( " muPhiMax={0} \n muPhiMin={1}", mu_phi_search.VectorValue.Max(), mu_phi_search.VectorValue.Min() ); Console.WriteLine(); } if (paramTrain.DebugLevel == DebugLevel_t.high) { Console.WriteLine( " muPhiMax={0} \n muPhiMin={1}", mu_phi_search.VectorValue.Max(), mu_phi_search.VectorValue.Min() ); Console.WriteLine( " AvgnHidLayerEff={0:F1}. G_Phi={1:F3}.", AvgnHidLayerEffective, Grad.grad_Q_Phi.MaxAbsValue() ); Console.WriteLine(); } } } // -- Test -- if ((epoch + 1) % nEpochPerTest == 0) { TestLoss_epoch.VectorValue[(epoch + 1) / nEpochPerTest - 1] = epoch + 1; TestLoss_time.VectorValue[(epoch + 1) / nEpochPerTest - 1] = (float)TotTime; if (paramTrain.flag_RunningAvg && epoch >= (int)Math.Ceiling(((float)nEpoch) / 2.0f)) { TestLoss_pool.VectorValue[(epoch + 1) / nEpochPerTest - 1] = Testing_BP_LDA(TestData, paramModel_avg, paramTrain.BatchSize_Test); } else { TestLoss_pool.VectorValue[(epoch + 1) / nEpochPerTest - 1] = Testing_BP_LDA(TestData, paramModel, paramTrain.BatchSize_Test); } CountTest++; } // -- Save -- if ((epoch + 1) % nEpochPerSave == 0) { // Save model if (paramTrain.flag_RunningAvg && epoch >= (int)Math.Ceiling(((float)nEpoch) / 2.0f)) { string PhiCol = null; (new FileInfo(ResultFile + ".model.Phi")).Directory.Create(); StreamWriter FileSaveModel = new StreamWriter(ResultFile + ".model.Phi", false); for (int IdxCol = 0; IdxCol < paramModel_avg.Phi.nCols; IdxCol++) { PhiCol = String.Join("\t", paramModel_avg.Phi.DenseMatrixValue[IdxCol].VectorValue); FileSaveModel.WriteLine(PhiCol); } FileSaveModel.Close(); // Save the final learning curves StreamWriter FileSavePerf = new StreamWriter(ResultFile + ".perf", false); FileSavePerf.WriteLine(String.Join("\t", TestLoss_epoch.VectorValue)); FileSavePerf.WriteLine(String.Join("\t", TestLoss_time.VectorValue)); FileSavePerf.WriteLine(String.Join("\t", TestLoss_pool.VectorValue)); FileSavePerf.Close(); } { string PhiCol = null; (new FileInfo(ResultFile + ".model.Phi")).Directory.Create(); StreamWriter FileSaveModel = new StreamWriter(ResultFile + ".model.Phi", false); for (int IdxCol = 0; IdxCol < paramModel.Phi.nCols; IdxCol++) { PhiCol = String.Join("\t", paramModel.Phi.DenseMatrixValue[IdxCol].VectorValue); FileSaveModel.WriteLine(PhiCol); } FileSaveModel.Close(); // Save the final learning curves StreamWriter FileSavePerf = new StreamWriter(ResultFile + ".perf", false); FileSavePerf.WriteLine(String.Join("\t", TestLoss_epoch.VectorValue)); FileSavePerf.WriteLine(String.Join("\t", TestLoss_time.VectorValue)); FileSavePerf.WriteLine(String.Join("\t", TestLoss_pool.VectorValue)); 
FileSavePerf.Close(); } } // -- Dump feature -- if (paramTrain.flag_DumpFeature && (epoch + 1) % nEpochPerDump == 0) { if (paramTrain.flag_RunningAvg && epoch >= (int)Math.Ceiling(((float)nEpoch) / 2.0f)) { DumpingFeature_BP_LDA(TrainData, paramModel_avg, paramTrain.BatchSize_Test, ResultFile + ".train.fea", "Train"); DumpingFeature_BP_LDA(TestData, paramModel_avg, paramTrain.BatchSize_Test, ResultFile + ".test.fea", "Test"); } { DumpingFeature_BP_LDA(TrainData, paramModel, paramTrain.BatchSize_Test, ResultFile + ".train.fea", "Train"); DumpingFeature_BP_LDA(TestData, paramModel, paramTrain.BatchSize_Test, ResultFile + ".test.fea", "Test"); } } } }
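/*
 * Summary of one TrainingBP_LDA mini-batch update (hedged restatement of the code above):
 *   1. ForwardActivation_LDA unfolds the MDA inference layers and stores theta_pool.
 *   2. BackPropagation_LDA fills Grad.grad_Q_Phi / Grad.grad_Q_TopPhi for the batch.
 *   3. ScalarDivideMatrix adds the Dirichlet-prior term -(beta - 1)/nTrain ./ Phi to the gradient.
 *   4. Per-column step sizes: mu_phi_search(j) = mu_Phi / (sqrt(AdaGradSum(j)) + mu_Phi), where
 *      AdaGradSum is a running average (not a cumulative sum) of the mean squared gradient of
 *      column j, so the rate adapts per column but does not decay like standard AdaGrad.
 *   5. SMD_Update performs the mirror-descent (exponentiated-gradient) update of Phi, keeping
 *      every column of Phi on the probability simplex.
 *   6. Optionally (flag_RunningAvg), paramModel_avg.Phi keeps a running average of Phi over the
 *      second half of training, and that averaged model is what gets tested and saved.
 */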
public static float ComputeSupervisedLoss(SparseMatrix Dt, SparseMatrix y, string OutputType) { if (Dt.nCols != y.nCols || Dt.nRows != y.nRows) { throw new Exception("The numbers of samples from label and prediction do not match."); } SparseMatrix SparseMat = new SparseMatrix(y); SparseMatrix TmpSparseMat = new SparseMatrix(Dt); DenseRowVector TmpDenseRowVec = new DenseRowVector(Dt.nCols); float TrainingLoss = 0.0f; switch (OutputType) { case "softmaxCE": MatrixOperation.ScalarAddMatrix(SparseMat, y, 1e-20f); MatrixOperation.Log(SparseMat); MatrixOperation.ElementwiseMatrixMultiplyMatrix(TmpSparseMat, Dt, SparseMat); MatrixOperation.VerticalSumMatrix(TmpDenseRowVec, TmpSparseMat); TrainingLoss = TmpDenseRowVec.Sum() * (-1.0f); break; case "linearQuad": MatrixOperation.MatrixSubtractMatrix(SparseMat, Dt); MatrixOperation.ElementwiseSquare(SparseMat); MatrixOperation.VerticalSumMatrix(TmpDenseRowVec, SparseMat); TrainingLoss = TmpDenseRowVec.Sum(); break; case "linearCE": MatrixOperation.ScalarAddMatrix(SparseMat, y, 1e-20f); MatrixOperation.Log(SparseMat); MatrixOperation.ElementwiseMatrixMultiplyMatrix(TmpSparseMat, Dt, SparseMat); MatrixOperation.VerticalSumMatrix(TmpDenseRowVec, TmpSparseMat); TrainingLoss = TmpDenseRowVec.Sum() * (-1.0f); break; default: throw new Exception("Unknown OutputType."); } return TrainingLoss; }
/*
 * Compute the regularized cross entropy between the reconstructed input and the actual
 * input. (Loss function for the unsupervised learning case.)
 */
public static float ComputeRegularizedCrossEntropy(SparseMatrix Xt, DenseMatrix Phi, DenseMatrix theta_top, DenseColumnVector b)
{
    SparseMatrix TmpSparseMat = new SparseMatrix(Xt);
    DenseRowVector TmpDenseRowVec = new DenseRowVector(Xt.nCols);
    MatrixOperation.MatrixMultiplyMatrix(TmpSparseMat, Phi, theta_top);
    MatrixOperation.Log(TmpSparseMat);
    MatrixOperation.ElementwiseMatrixMultiplyMatrix(TmpSparseMat, Xt);
    MatrixOperation.VerticalSumMatrix(TmpDenseRowVec, TmpSparseMat);
    float CE = (-1.0f) * TmpDenseRowVec.VectorValue.Sum();
    DenseMatrix TmpDenseMat = new DenseMatrix(theta_top.nRows, theta_top.nCols);
    MatrixOperation.Log(TmpDenseMat, theta_top);
    MatrixOperation.bsxfunVectorMultiplyMatrix(TmpDenseMat, b);
    MatrixOperation.VerticalSumMatrix(TmpDenseRowVec, TmpDenseMat);
    CE = CE - TmpDenseRowVec.VectorValue.Sum();
    return CE;
}
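/*
 * Note: per the code above, the value returned is
 *   CE = - sum_d x_d * log([Phi * theta_top]_d) - sum_k b_k * log(theta_top_k),
 * summed over all columns (documents) of Xt, i.e. the reconstruction cross entropy plus a
 * log-Dirichlet regularization term weighted by b.
 */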
/* * Training: supervised learning of feedforward (unfolding) LDA by back propagation */ public static void TrainingBP_sLDA( SparseMatrix TrainData, SparseMatrix TrainLabel, SparseMatrix TestData, SparseMatrix TestLabel, SparseMatrix ValidData, SparseMatrix ValidLabel, paramModel_t paramModel, paramTrain_t paramTrain, string ModelFile, string ResultFile ) { Console.WriteLine("*****************************************************************"); Console.WriteLine("jvking version of BP-sLDA: Mirror-Descent Back Propagation"); Console.WriteLine("*****************************************************************"); // ---- Extract the parameters ---- // Model parameters int nInput = paramModel.nInput; int nHid = paramModel.nHid; int nHidLayer = paramModel.nHidLayer; int nOutput = paramModel.nOutput; float eta = paramModel.eta; float T_value = paramModel.T_value; string OutputType = paramModel.OutputType; float beta = paramModel.beta; // Training parameters int nEpoch = paramTrain.nEpoch; float mu_Phi = paramTrain.mu_Phi; float mu_U = paramTrain.mu_U; int nTrain = paramTrain.nTrain; float mu_ReduceFactor = paramTrain.mu_Phi_ReduceFactor; string LearnRateSchedule = paramTrain.LearnRateSchedule; int nSamplesPerDisplay = paramTrain.nSamplesPerDisplay; int nEpochPerSave = paramTrain.nEpochPerSave; int nEpochPerTest = paramTrain.nEpochPerTest; int nEpochPerDump = paramTrain.nEpochPerDump; // ---- Initialize the model ---- ModelInit_LDA_Feedforward(paramModel); // ---- Initialize the training algorithm ---- float TotLoss = 0.0f; float TotTrErr = 0.0f; double TotTime = 0.0f; double TotTimeThisEpoch = 0.0f; int TotSamples = 0; int TotSamplesThisEpoch = 0; float CntRunningAvg = 0.0f; float CntModelUpdate = 0.0f; double AvgnHidLayerEffective = 0.0f; DenseRowVector mu_phi_search = new DenseRowVector(nHid, mu_Phi); DenseRowVector mu_U_search = new DenseRowVector(nHid, mu_U); DenseRowVector AdaGradSum = new DenseRowVector(nHid, 0.0f); DenseRowVector TmpDenseRowVec = new DenseRowVector(nHid, 0.0f); DenseRowVector TestError_pool = new DenseRowVector(nEpoch / nEpochPerTest, 0.0f); DenseRowVector ValidError_pool = new DenseRowVector(nEpoch / nEpochPerTest, 0.0f); DenseRowVector TrainError_pool = new DenseRowVector(nEpoch / nEpochPerTest, 0.0f); DenseRowVector TrainLoss_pool = new DenseRowVector(nEpoch / nEpochPerTest, 0.0f); DenseRowVector TestError_epoch = new DenseRowVector(nEpoch / nEpochPerTest, 0.0f); DenseRowVector TestError_time = new DenseRowVector(nEpoch / nEpochPerTest, 0.0f); int CountTest = 0; float nLearnLineSearch = 0.0f; int[] IdxPerm = null; int BatchSize_NormalBatch = paramTrain.BatchSize; int BatchSize_tmp = paramTrain.BatchSize; int nBatch = (int)Math.Ceiling(((float)nTrain) / ((float)BatchSize_NormalBatch)); DNNRun_t DNNRun_NormalBatch = new DNNRun_t(nHid, BatchSize_NormalBatch, paramModel.nHidLayer, nOutput); DNNRun_t DNNRun_EndBatch = new DNNRun_t(nHid, nTrain - (nBatch - 1) * BatchSize_NormalBatch, paramModel.nHidLayer, nOutput); DNNRun_t DNNRun = null; Grad_t Grad = new Grad_t(nHid, nOutput, nInput, paramModel.nHidLayer, OutputType); SparseMatrix TmpGrad = new SparseMatrix(nInput, nHid, true); DenseMatrix TmpMatDensePhi = new DenseMatrix(nInput, nHid); DenseMatrix TmpMatDenseU = new DenseMatrix(nOutput, nHid); paramModel_t paramModel_avg = new paramModel_t(paramModel); Stopwatch stopWatch = new Stopwatch(); // ---- Compute the schedule of the learning rate double[] stepsize_pool_Phi = null; double[] stepsize_pool_U = null; switch (LearnRateSchedule) { case "PreCompute": stepsize_pool_Phi = 
PrecomputeLearningRateSchedule(nBatch, nEpoch, mu_Phi, mu_Phi / mu_ReduceFactor, 1e-8f); stepsize_pool_U = PrecomputeLearningRateSchedule(nBatch, nEpoch, mu_U, mu_U / mu_ReduceFactor, 1e-8f); break; case "Constant": stepsize_pool_Phi = new double[nEpoch]; stepsize_pool_U = new double[nEpoch]; for (int Idx = 0; Idx < nEpoch; Idx++) { stepsize_pool_Phi[Idx] = mu_Phi; stepsize_pool_U[Idx] = mu_U; } break; default: throw new Exception("Unknown type of LearnRateSchedule"); } // Now start training......................... for (int epoch = 0; epoch < nEpoch; epoch++) { TotSamplesThisEpoch = 0; TotTimeThisEpoch = 0.0; AvgnHidLayerEffective = 0.0f; // -- Set the batch size if there is schedule -- if (paramTrain.flag_BachSizeSchedule) { if (paramTrain.BachSizeSchedule.TryGetValue(epoch + 1, out BatchSize_tmp)) { BatchSize_NormalBatch = BatchSize_tmp; nBatch = (int)Math.Ceiling(((float)nTrain) / ((float)BatchSize_NormalBatch)); DNNRun_NormalBatch = new DNNRun_t(nHid, BatchSize_NormalBatch, paramModel.nHidLayer, nOutput); DNNRun_EndBatch = new DNNRun_t(nHid, nTrain - (nBatch - 1) * BatchSize_NormalBatch, paramModel.nHidLayer, nOutput); } } // -- Shuffle the data (generating shuffled index) -- IdxPerm = Statistics.RandPerm(nTrain); // -- Reset the (MDA) inference step-sizes -- if (epoch > 0) { for (int Idx = 0; Idx < paramModel.nHidLayer; Idx++) { paramModel.T[Idx] = T_value; } } // -- Take the learning rate for the current epoch -- mu_Phi = (float)stepsize_pool_Phi[epoch]; mu_U = (float)stepsize_pool_U[epoch]; // -- Start this epoch -- Console.WriteLine("############## Epoch #{0}. BatchSize: {1} Learning Rate: Phi:{2}, U:{3} ##################", epoch + 1, BatchSize_NormalBatch, mu_Phi, mu_U); for (int IdxBatch = 0; IdxBatch < nBatch; IdxBatch++) { stopWatch.Start(); // Extract the batch int BatchSize = 0; if (IdxBatch < nBatch - 1) { BatchSize = BatchSize_NormalBatch; DNNRun = DNNRun_NormalBatch; } else { BatchSize = nTrain - IdxBatch * BatchSize_NormalBatch; DNNRun = DNNRun_EndBatch; } SparseMatrix Xt = new SparseMatrix(nInput, BatchSize); SparseMatrix Dt = new SparseMatrix(nOutput, BatchSize); int[] IdxSample = new int[BatchSize]; Array.Copy(IdxPerm, IdxBatch * BatchSize_NormalBatch, IdxSample, 0, BatchSize); TrainData.GetColumns(Xt, IdxSample); TrainLabel.GetColumns(Dt, IdxSample); // Forward activation LDA_Learn.ForwardActivation_LDA(Xt, DNNRun, paramModel, true); // Back propagation LDA_Learn.BackPropagation_LDA(Xt, Dt, DNNRun, paramModel, Grad); // Compute the gradient and update the model (All gradients of Phi are accumulated into Grad.grad_Q_Phi) // (i) Update Phi MatrixOperation.ScalarDivideMatrix(Grad.grad_Q_Phi, (-1.0f) * ((beta - 1) / ((float)nTrain)), paramModel.Phi, true); mu_phi_search.FillValue(mu_Phi); // Different learning rate for different columns of Phi: Similar to AdaGrad but does not decay with time ++CntModelUpdate; MatrixOperation.ElementwiseMatrixMultiplyMatrix(TmpMatDensePhi, Grad.grad_Q_Phi, Grad.grad_Q_Phi); MatrixOperation.VerticalSumMatrix(TmpDenseRowVec, TmpMatDensePhi); MatrixOperation.ScalarMultiplyVector(TmpDenseRowVec, 1.0f / ((float)nInput)); MatrixOperation.VectorSubtractVector(TmpDenseRowVec, AdaGradSum); MatrixOperation.ScalarMultiplyVector(TmpDenseRowVec, 1.0f / CntModelUpdate); MatrixOperation.VectorAddVector(AdaGradSum, TmpDenseRowVec); MatrixOperation.ElementwiseSquareRoot(TmpDenseRowVec, AdaGradSum); MatrixOperation.ScalarAddVector(TmpDenseRowVec, mu_Phi); MatrixOperation.ElementwiseVectorDivideVector(mu_phi_search, mu_phi_search, TmpDenseRowVec); 
nLearnLineSearch = SMD_Update(paramModel.Phi, Grad.grad_Q_Phi, mu_phi_search, eta); // (ii) Update U MatrixOperation.ScalarMultiplyMatrix(Grad.grad_Q_U, (-1.0f) * mu_U); MatrixOperation.MatrixAddMatrix(paramModel.U, Grad.grad_Q_U); // (iii) Running average of the model if (paramTrain.flag_RunningAvg && epoch >= (int)Math.Ceiling(((float)nEpoch)/2.0f)) { ++CntRunningAvg; MatrixOperation.MatrixSubtractMatrix(TmpMatDensePhi, paramModel.Phi, paramModel_avg.Phi); MatrixOperation.MatrixSubtractMatrix(TmpMatDenseU, paramModel.U, paramModel_avg.U); MatrixOperation.ScalarMultiplyMatrix(TmpMatDensePhi, 1.0f / CntRunningAvg); MatrixOperation.ScalarMultiplyMatrix(TmpMatDenseU, 1.0f / CntRunningAvg); MatrixOperation.MatrixAddMatrix(paramModel_avg.Phi, TmpMatDensePhi); MatrixOperation.MatrixAddMatrix(paramModel_avg.U, TmpMatDenseU); } // Display the result TotTrErr += 100 * ComputeNumberOfErrors(Dt, DNNRun.y); TotLoss += ComputeSupervisedLoss(Dt, DNNRun.y, paramModel.OutputType); TotSamples += BatchSize; TotSamplesThisEpoch += BatchSize; AvgnHidLayerEffective = (((double)(TotSamplesThisEpoch - BatchSize)) / ((double)TotSamplesThisEpoch)) * AvgnHidLayerEffective + 1.0 / ((double)TotSamplesThisEpoch) * DNNRun.nHidLayerEffective.Sum(); stopWatch.Stop(); TimeSpan ts = stopWatch.Elapsed; TotTime += ts.TotalSeconds; TotTimeThisEpoch += ts.TotalSeconds; stopWatch.Reset(); if (TotSamplesThisEpoch % nSamplesPerDisplay == 0) { // Display results Console.WriteLine( "* Ep#{0}/{1} Bat#{2}/{3}. Loss={4:F3}. TrErr={5:F3}%. Speed={6} Samples/Sec.", epoch + 1, nEpoch, IdxBatch + 1, nBatch, TotLoss / TotSamples, TotTrErr / TotSamples, (int)((double)TotSamplesThisEpoch / TotTimeThisEpoch) ); if (paramTrain.DebugLevel == DebugLevel_t.medium) { Console.WriteLine( " muPhiMax={0} \n muPhiMin={1}", mu_phi_search.VectorValue.Max(), mu_phi_search.VectorValue.Min() ); Console.WriteLine(); } if (paramTrain.DebugLevel == DebugLevel_t.high) { Console.WriteLine( " muPhiMax={0} \n muPhiMin={1}", mu_phi_search.VectorValue.Max(), mu_phi_search.VectorValue.Min() ); float MaxAbsVal_Grad_Q_Phi = Grad.grad_Q_Phi.MaxAbsValue(); float MaxAbsVal_Grad_Q_U = Grad.grad_Q_U.MaxAbsValue(); Console.WriteLine( " AvgnHidLayerEff={0:F1}. G_Phi={1:F3}. G_U={2:F3}", AvgnHidLayerEffective, MaxAbsVal_Grad_Q_Phi, MaxAbsVal_Grad_Q_U ); // Save the screen into a log file (new FileInfo(ResultFile + ".log")).Directory.Create(); using (StreamWriter LogFile = File.AppendText(ResultFile + ".log")) { LogFile.WriteLine( "- Ep#{0}/{1} Bat#{2}/{3}. Loss={4:F3}. TrErr={5:F3}%. Speed={6} Samples/Sec.", epoch + 1, nEpoch, IdxBatch + 1, nBatch, TotLoss / TotSamples, TotTrErr / TotSamples, (int)((double)TotSamplesThisEpoch / TotTimeThisEpoch) ); LogFile.WriteLine( " muPhiMax={0} \n muPhiMin={1}", mu_phi_search.VectorValue.Max(), mu_phi_search.VectorValue.Min() ); LogFile.WriteLine( " AvgnHidLayerEff={0:F1}. G_Phi={1:F3}. 
G_U={2:F3}", AvgnHidLayerEffective, MaxAbsVal_Grad_Q_Phi, MaxAbsVal_Grad_Q_U ); Console.WriteLine(); } Console.WriteLine(); } } } // -- Test -- if ((epoch + 1) % nEpochPerTest == 0) { // Standard performance metric TestError_epoch.VectorValue[(epoch + 1) / nEpochPerTest - 1] = epoch + 1; TestError_time.VectorValue[(epoch + 1) / nEpochPerTest - 1] = (float)TotTime; if (paramTrain.flag_RunningAvg && epoch >= (int)Math.Ceiling(((float)nEpoch) / 2.0f)) { if (paramTrain.flag_HasValidSet) { ValidError_pool.VectorValue[(epoch + 1) / nEpochPerTest - 1] = Testing_BP_sLDA( ValidData, ValidLabel, paramModel_avg, paramTrain.BatchSize_Test, ResultFile + ".validscore", "Validation Set" ); } TestError_pool.VectorValue[(epoch + 1) / nEpochPerTest - 1] = Testing_BP_sLDA( TestData, TestLabel, paramModel_avg, paramTrain.BatchSize_Test, ResultFile + ".testscore", "Test Set" ); } else { if (paramTrain.flag_HasValidSet) { ValidError_pool.VectorValue[(epoch + 1) / nEpochPerTest - 1] = Testing_BP_sLDA( ValidData, ValidLabel, paramModel, paramTrain.BatchSize_Test, ResultFile + ".validscore", "Validation Set" ); } TestError_pool.VectorValue[(epoch + 1) / nEpochPerTest - 1] = Testing_BP_sLDA( TestData, TestLabel, paramModel, paramTrain.BatchSize_Test, ResultFile + ".testscore", "Test Set" ); } TrainError_pool.VectorValue[(epoch + 1) / nEpochPerTest - 1] = TotTrErr / TotSamples; TrainLoss_pool.VectorValue[(epoch + 1) / nEpochPerTest - 1] = TotLoss / TotSamples; // Performance metric evaluated using external evaluation tools, e.g., AUC, Top@K accuracy, etc. if (paramTrain.flag_ExternalEval) { ExternalEvaluation( paramTrain.ExternalEval, ResultFile, paramTrain.TestLabelFile, epoch, "Test Set" ); if (paramTrain.flag_HasValidSet) { ExternalEvaluation( paramTrain.ExternalEval, ResultFile, paramTrain.ValidLabelFile, epoch, "Validation Set" ); } } CountTest++; } // -- Save -- if ((epoch + 1) % nEpochPerSave == 0) { // Save model string PhiCol = null; string UCol = null; (new FileInfo(ResultFile + ".model.Phi")).Directory.Create(); string ModelName_Phi; string ModelName_U; if (paramTrain.flag_SaveAllModels) { ModelName_Phi = ResultFile + ".model.Phi" + ".iter" + (epoch + 1).ToString(); ModelName_U = ResultFile + ".model.U" + ".iter" + (epoch + 1).ToString(); } else { ModelName_Phi = ResultFile + ".model.Phi"; ModelName_U = ResultFile + ".model.U"; } if (paramTrain.flag_RunningAvg && epoch >= (int)Math.Ceiling(((float)nEpoch) / 2.0f)) { using (StreamWriter FileSaveModel_Phi = new StreamWriter(ModelName_Phi, false)) { for (int IdxCol = 0; IdxCol < paramModel_avg.Phi.nCols; IdxCol++) { PhiCol = String.Join("\t", paramModel_avg.Phi.DenseMatrixValue[IdxCol].VectorValue); FileSaveModel_Phi.WriteLine(PhiCol); } } using (StreamWriter FileSaveModel_U = new StreamWriter(ModelName_U, false)) { for (int IdxCol = 0; IdxCol < paramModel_avg.U.nCols; IdxCol++) { UCol = String.Join("\t", paramModel_avg.U.DenseMatrixValue[IdxCol].VectorValue); FileSaveModel_U.WriteLine(UCol); } } } else { using (StreamWriter FileSaveModel_Phi = new StreamWriter(ModelName_Phi, false)) { for (int IdxCol = 0; IdxCol < paramModel.Phi.nCols; IdxCol++) { PhiCol = String.Join("\t", paramModel.Phi.DenseMatrixValue[IdxCol].VectorValue); FileSaveModel_Phi.WriteLine(PhiCol); } } using (StreamWriter FileSaveModel_U = new StreamWriter(ModelName_U, false)) { for (int IdxCol = 0; IdxCol < paramModel.U.nCols; IdxCol++) { UCol = String.Join("\t", paramModel.U.DenseMatrixValue[IdxCol].VectorValue); FileSaveModel_U.WriteLine(UCol); } } } // Save the final learning curves using 
(StreamWriter FileSavePerf = new StreamWriter(ResultFile + ".perf", false)) { FileSavePerf.Write("Epoch:\t"); FileSavePerf.WriteLine(String.Join("\t", TestError_epoch.VectorValue)); FileSavePerf.Write("TrainTime:\t"); FileSavePerf.WriteLine(String.Join("\t", TestError_time.VectorValue)); if (paramTrain.flag_HasValidSet) { FileSavePerf.Write("Validation:\t"); FileSavePerf.WriteLine(String.Join("\t", ValidError_pool.VectorValue)); } FileSavePerf.Write("Test:\t"); FileSavePerf.WriteLine(String.Join("\t", TestError_pool.VectorValue)); FileSavePerf.Write("TrainError:\t"); FileSavePerf.WriteLine(String.Join("\t", TrainError_pool.VectorValue)); FileSavePerf.Write("TrainLoss:\t"); FileSavePerf.WriteLine(String.Join("\t", TrainLoss_pool.VectorValue)); } } // -- Dump feature -- if (paramTrain.flag_DumpFeature && (epoch + 1) % nEpochPerDump == 0) { if (paramTrain.flag_RunningAvg && epoch >= (int)Math.Ceiling(((float)nEpoch) / 2.0f)) { DumpingFeature_BP_LDA(TrainData, paramModel_avg, paramTrain.BatchSize_Test, ResultFile + ".train.fea", "Train"); DumpingFeature_BP_LDA(TestData, paramModel_avg, paramTrain.BatchSize_Test, ResultFile + ".test.fea", "Test"); if (paramTrain.flag_HasValidSet) { DumpingFeature_BP_LDA(ValidData, paramModel_avg, paramTrain.BatchSize_Test, ResultFile + ".valid.fea", "Validation"); } } { DumpingFeature_BP_LDA(TrainData, paramModel, paramTrain.BatchSize_Test, ResultFile + ".train.fea", "Train"); DumpingFeature_BP_LDA(TestData, paramModel, paramTrain.BatchSize_Test, ResultFile + ".test.fea", "Test"); if (paramTrain.flag_HasValidSet) { DumpingFeature_BP_LDA(ValidData, paramModel, paramTrain.BatchSize_Test, ResultFile + ".valid.fea", "Validation"); } } } } }
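/*
 * Note (hedged): in TrainingBP_sLDA the two parameter blocks are updated differently.
 * Phi lives on the simplex (each column is a topic distribution), so it is updated with the
 * mirror-descent step in SMD_Update using the AdaGrad-like per-column rates, exactly as in
 * the unsupervised TrainingBP_LDA. U is unconstrained, so it gets a plain gradient step,
 *   U <- U - mu_U * grad_Q_U,
 * implemented above as ScalarMultiplyMatrix followed by MatrixAddMatrix.
 */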
/*
 * Compute the cross entropy between the reconstructed input and the actual input when the
 * effective number of unfolded layers differs per sample (the theta of the top effective
 * layer is used for each column of Xt).
 */
public static float ComputeCrossEntropy(SparseMatrix Xt, DenseMatrix Phi, DenseMatrix[] theta_pool, int[] nHidLayerEffective)
{
    SparseMatrix TmpSparseMat = new SparseMatrix(Xt);
    DenseRowVector TmpDenseRowVec = new DenseRowVector(Xt.nCols);
    Parallel.For(0, Xt.nCols, IdxSample =>
    {
        MatrixOperation.MatrixMultiplyVector(
            TmpSparseMat.SparseColumnVectors[IdxSample],
            Phi,
            theta_pool[nHidLayerEffective[IdxSample] - 1].DenseMatrixValue[IdxSample]
            );
    });
    MatrixOperation.Log(TmpSparseMat);
    MatrixOperation.ElementwiseMatrixMultiplyMatrix(TmpSparseMat, Xt);
    MatrixOperation.VerticalSumMatrix(TmpDenseRowVec, TmpSparseMat);
    return (-1.0f) * TmpDenseRowVec.VectorValue.Sum();
}
/* * Compute Cross Entropy between the reconstructed input and the actual input. (Unsupervised learning case) */ public static float ComputeCrossEntropy(SparseMatrix Xt, DenseMatrix Phi, DenseMatrix theta_top) { SparseMatrix TmpSparseMat = new SparseMatrix(Xt); DenseRowVector TmpDenseRowVec = new DenseRowVector(Xt.nCols); MatrixOperation.MatrixMultiplyMatrix(TmpSparseMat, Phi, theta_top); MatrixOperation.Log(TmpSparseMat); MatrixOperation.ElementwiseMatrixMultiplyMatrix(TmpSparseMat, Xt); MatrixOperation.VerticalSumMatrix(TmpDenseRowVec, TmpSparseMat); return (-1.0f) * TmpDenseRowVec.VectorValue.Sum(); }
public static float SMD_Update(DenseMatrix X, DenseMatrix Grad, DenseRowVector LearningRatePerCol, float eta) { if (X.nCols != Grad.nCols || X.nRows != Grad.nRows) { throw new Exception("Dimension mismatch."); } DenseRowVector nLearnLineSearchPerCol = new DenseRowVector(X.nCols, 0.0f); DenseMatrix Update = new DenseMatrix(Grad.nRows, Grad.nCols); DenseRowVector TmpRowVec = new DenseRowVector(LearningRatePerCol); MatrixOperation.ScalarMultiplyVector(TmpRowVec, -1.0f); MatrixOperation.bsxfunVectorMultiplyMatrix(Update, Grad, TmpRowVec); MatrixOperation.VerticalMaxMatrix(TmpRowVec, Update); MatrixOperation.bsxfunMatrixSubtractVector(Update, Update, TmpRowVec); MatrixOperation.Exp(Update); MatrixOperation.ElementwiseMatrixMultiplyMatrix(X, X, Update); MatrixOperation.VerticalSumMatrix(TmpRowVec, X); MatrixOperation.bsxfunMatrixRightDivideVector(X, TmpRowVec); return 0.0f; }
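/*
 * Note (hedged): SMD_Update applies an exponentiated-gradient / mirror-descent step on the
 * probability simplex, column by column:
 *   X(:,j) <- X(:,j) .* exp(-mu_j * Grad(:,j)),   then renormalize so the column sums to one.
 * Subtracting the per-column maximum before Exp is only for numerical stability and does not
 * change the normalized result. The helper below is a hypothetical scalar-level sketch of the
 * same update on a plain float[] column; it is illustrative only and not part of the library.
 */
public static void ExponentiatedGradientStepSketch(float[] x, float[] grad, float mu)
{
    // x is assumed to lie on the simplex (x[i] >= 0, sum_i x[i] = 1)
    float maxVal = float.NegativeInfinity;
    float[] logx = new float[x.Length];
    for (int i = 0; i < x.Length; ++i)
    {
        // work in the log domain: log(x_i) - mu * grad_i
        logx[i] = (float)Math.Log(x[i] + 1e-12f) - mu * grad[i];
        if (logx[i] > maxVal)
        {
            maxVal = logx[i];
        }
    }
    float sum = 0.0f;
    for (int i = 0; i < x.Length; ++i)
    {
        // max-shift before exponentiation for numerical stability
        x[i] = (float)Math.Exp(logx[i] - maxVal);
        sum += x[i];
    }
    for (int i = 0; i < x.Length; ++i)
    {
        // renormalize back onto the simplex
        x[i] /= sum;
    }
}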
public static void bsxfunMatrixSubtractVector(SparseMatrix Z, SparseMatrix X, DenseRowVector y) { if (Z.nCols != X.nCols || Z.nRows != X.nRows || Z.nCols != y.Dim) { throw new Exception("Dimension mismatch."); } int total = Z.nCols; int process_len = (total + THREADNUM - 1) / THREADNUM; Parallel.For(0, THREADNUM, new ParallelOptions{ MaxDegreeOfParallelism = MaxMultiThreadDegree}, thread_idx => { for (int t = 0; t < process_len; t++) { int IdxCol = thread_idx * process_len + t; if (IdxCol < total) { var zVal = Z.SparseColumnVectors[IdxCol].Val; var xVal = X.SparseColumnVectors[IdxCol].Val; var yVal = y.VectorValue[IdxCol]; int nNonzero = Z.SparseColumnVectors[IdxCol].nNonzero; for (int IdxRow = 0; IdxRow < nNonzero; ++IdxRow) { zVal[IdxRow] = xVal[IdxRow] - yVal; } } } }); }
public static void VerticalSumMatrix(DenseRowVector z, DenseMatrix X) { z.FillValue(0.0f); int nRows = X.nRows; Parallel.For(0, X.nCols, new ParallelOptions { MaxDegreeOfParallelism = MaxMultiThreadDegree }, IdxCol => { var zVal = z.VectorValue; var xVal = X.DenseMatrixValue[IdxCol].VectorValue; for (int IdxRow = 0; IdxRow < nRows; ++IdxRow) { zVal[IdxCol] += xVal[IdxRow]; } }); }
public static void VerticalMaxMatrix(DenseRowVector z, SparseMatrix X) { int zDim = z.Dim; var zVal = z.VectorValue; var XMat = X.SparseColumnVectors; Parallel.For(0, zDim, new ParallelOptions { MaxDegreeOfParallelism = MaxMultiThreadDegree }, IdxCol => { zVal[IdxCol] = XMat[IdxCol].Val.Max(); }); }
public static void VerticalSumMatrix(DenseRowVector z, SparseMatrix X) { Array.Clear(z.VectorValue, 0, z.VectorValue.Length); Parallel.For(0, X.nCols, new ParallelOptions { MaxDegreeOfParallelism = MaxMultiThreadDegree }, IdxCol => { var zVal = z.VectorValue; var xVal = X.SparseColumnVectors[IdxCol].Val; int nNonzero = X.SparseColumnVectors[IdxCol].nNonzero; for (int IdxRow = 0; IdxRow < nNonzero; ++IdxRow) { zVal[IdxCol] += xVal[IdxRow]; } }); }
/* * Project each column of the input matrix X onto the affine space defined by 1^T x = 1 */ public static void ProjCols2SimplexPlane(DenseMatrix X) { DenseRowVector TmpDenseRowVec = new DenseRowVector(X.nCols); MatrixOperation.VerticalSumMatrix(TmpDenseRowVec, X); MatrixOperation.ScalarMultiplyVector(TmpDenseRowVec, 1.0f / ((float)X.nRows)); MatrixOperation.bsxfunMatrixSubtractVector(X, X, TmpDenseRowVec); MatrixOperation.ScalarAddMatrix(X, 1.0f / ((float)X.nRows)); }
/* * X = bsxfun(@times, X, y) or X = X * y, where y is a dense row or column vector */ public static void bsxfunVectorMultiplyMatrix(DenseMatrix X, DenseRowVector y) { if (X.nCols != y.Dim) { throw new Exception("The Number of columns in the two inputs does not match!"); } Parallel.For(0, X.nCols, new ParallelOptions { MaxDegreeOfParallelism = MaxMultiThreadDegree }, IdxCol => { var xVal = X.DenseMatrixValue[IdxCol].VectorValue; var yVal = y.VectorValue[IdxCol]; int nRows = X.nRows; for (int IdxRow = 0; IdxRow < nRows; ++IdxRow) { xVal[IdxRow] *= yVal; } }); }
public DenseMatrix(int NumRows, int NumCols, bool IsPerColumn) { if (IsPerColumn) { nRows = NumRows; nCols = NumCols; isPerColumn = true; DenseMatrixValue = new DenseColumnVector[nCols]; for (int IdxCol = 0; IdxCol < nCols; IdxCol++) { DenseMatrixValue[IdxCol] = new DenseColumnVector(nRows); } } else { nRows = NumRows; nCols = NumCols; isPerColumn = false; DenseMatrixValuePerRow = new DenseRowVector[nRows]; for (int IdxRow = 0; IdxRow < nRows; IdxRow++) { DenseMatrixValuePerRow[IdxRow] = new DenseRowVector(nCols); } } }
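/*
 * Note: the IsPerColumn flag selects the storage layout. With IsPerColumn = true the matrix is
 * stored as an array of DenseColumnVector (DenseMatrixValue, one vector per column); otherwise
 * it is stored as an array of DenseRowVector (DenseMatrixValuePerRow, one vector per row).
 * DNNRun.T_pool, for example, is accessed through DenseMatrixValuePerRow[IdxLayer] elsewhere in
 * this code, so the step sizes of one layer across the whole batch sit in a single row vector.
 */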
public static void bsxfunVectorMultiplyMatrix(SparseMatrix X, DenseRowVector y) { if (X.nCols != y.Dim) { throw new Exception("Dimension mismatch."); } Parallel.For(0, X.nCols, new ParallelOptions { MaxDegreeOfParallelism = MaxMultiThreadDegree }, IdxCol => { var xVal = X.SparseColumnVectors[IdxCol].Val; var yVal = y.VectorValue[IdxCol]; var nNonzero = X.SparseColumnVectors[IdxCol].nNonzero; for (int IdxRow = 0; IdxRow < nNonzero; ++IdxRow) { xVal[IdxRow] *= yVal; } }); }
public void DeepCopyFrom(DenseRowVector SourceVector) { // Check dimension if (Dim != SourceVector.Dim) { throw new Exception("Dimension mismatch during deep copy of DenseRowVector."); } // Deep copy of the float array Array.Copy(SourceVector.VectorValue,VectorValue,Dim); }
/* * Z = bsxfun(@times, X, y) or Z = X * y, where y is a dense row or column vector */ public static void bsxfunVectorMultiplyMatrix(SparseMatrix Z, SparseMatrix X, DenseRowVector y) { if (Z.nCols != X.nCols || Z.nRows != X.nRows || Z.nCols != y.Dim) { throw new Exception("Dimension mismatch!"); } int ZnCols = Z.nCols; Parallel.For(0, ZnCols, new ParallelOptions { MaxDegreeOfParallelism = MaxMultiThreadDegree }, IdxCol => { int nNz = Z.SparseColumnVectors[IdxCol].nNonzero; var ZVal = Z.SparseColumnVectors[IdxCol].Val; var XVal = X.SparseColumnVectors[IdxCol].Val; var yVal = y.VectorValue; for (int IdxRow = 0; IdxRow < nNz; ++IdxRow) { ZVal[IdxRow] = XVal[IdxRow] * yVal[IdxCol]; } }); }
public static DenseRowVector ElementwiseVectorMultiplyVector(DenseRowVector x, DenseRowVector y) { if (x.Dim != y.Dim) { throw new Exception("Dimension mismatch."); } DenseRowVector z = new DenseRowVector(x.Dim); for (int IdxCol = 0; IdxCol < z.Dim; IdxCol++ ) { z.VectorValue[IdxCol] = x.VectorValue[IdxCol] * y.VectorValue[IdxCol]; } return z; }
public static void bsxfunVectorMultiplyMatrix(DenseMatrix Z, DenseMatrix X, DenseRowVector y) { if (X.nCols != y.Dim || Z.nCols != X.nCols || Z.nRows != X.nRows) { throw new Exception("The Number of columns in the two inputs does not match!"); } int total = Z.nCols * Z.nRows; int process_len = (total + THREADNUM - 1) / THREADNUM; Parallel.For(0, THREADNUM, new ParallelOptions{ MaxDegreeOfParallelism = MaxMultiThreadDegree}, thread_idx => { for (int t = 0; t < process_len; t++) { int id = thread_idx * process_len + t; if (id < total) { int IdxCol = id / Z.nRows; int IdxRow = id % Z.nRows; Z.DenseMatrixValue[IdxCol].VectorValue[IdxRow] = X.DenseMatrixValue[IdxCol].VectorValue[IdxRow] * y.VectorValue[IdxCol]; } else { break; } } }); }
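/*
 * Note: the dense bsxfun helper above parallelizes over a flattened element index rather than
 * over columns: id = thread_idx * process_len + t, IdxCol = id / Z.nRows, IdxRow = id % Z.nRows,
 * so each of the THREADNUM workers processes a contiguous block of elements regardless of how
 * the work is distributed across columns.
 */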
/* * z = x * y, where y is a scalar */ public static void ScalarMultiplyVector(DenseRowVector z, DenseRowVector x, float y) { var zVal = z.VectorValue; var xVal = x.VectorValue; int Dim = z.Dim; for (int IdxCol = 0; IdxCol < Dim; ++IdxCol) { zVal[IdxCol] = xVal[IdxCol] * y; } }
/* * Z = bsxfun(@minus, X, y) */ public static void bsxfunMatrixSubtractVector(DenseMatrix Z, DenseMatrix X, DenseRowVector y) { if (Z.nCols != X.nCols || Z.nRows != X.nRows || Z.nCols != y.Dim) { throw new Exception("Dimension mismatch."); } int total = Z.nCols * Z.nRows; int process_len = (total + THREADNUM - 1) / THREADNUM; Parallel.For(0, THREADNUM, new ParallelOptions{ MaxDegreeOfParallelism = MaxMultiThreadDegree}, thread_idx => { for (int t = 0; t < process_len; t++) { int id = thread_idx * process_len + t; if (id < total) { int IdxCol = id / Z.nRows; int IdxRow = id % Z.nRows; Z.DenseMatrixValue[IdxCol].VectorValue[IdxRow] = X.DenseMatrixValue[IdxCol].VectorValue[IdxRow] - y.VectorValue[IdxCol]; } else break; } }); }
public static void ScalarAddVector(DenseRowVector z, float y) { var zVal = z.VectorValue; for (int IdxCol = 0; IdxCol < z.Dim; IdxCol++) { zVal[IdxCol] += y; } }
/* * Forward activation of Latent Dirichlet Allocation model (Mirror descent approach) */ public static void ForwardActivation_LDA(SparseMatrix Xt, DNNRun_t DNNRun, paramModel_t paramModel, bool flag_IsTraining) { // -------- Extract parameters -------- int nHid = paramModel.nHid; int nHidLayer = paramModel.nHidLayer; float eta = paramModel.eta; float T_value = paramModel.T_value; string OutputType = paramModel.OutputType; float To = paramModel.To; int BatchSize = Xt.nCols; // -------- Hidden activations -------- // ---- Reset the effective number of hidden layers (mainly for alpha<1 case) ---- Array.Clear(DNNRun.nHidLayerEffective,0,DNNRun.nHidLayerEffective.Length); // ---- T is different over layers (adaptive step-size MDA) ---- DenseRowVector T = new DenseRowVector(BatchSize, T_value); SparseMatrix Phitheta = new SparseMatrix(Xt); DenseRowVector loss_pre = new DenseRowVector(BatchSize); DenseRowVector loss_post = new DenseRowVector(BatchSize); DenseRowVector loss_gap = new DenseRowVector(BatchSize); DenseRowVector loss_gap_thresh = new DenseRowVector(BatchSize); DenseRowVector gradproj = new DenseRowVector(BatchSize); SparseMatrix TmpSparseMat = new SparseMatrix(Xt); DenseMatrix TmpDenseMat = new DenseMatrix(nHid, BatchSize); DenseMatrix LogTheta = new DenseMatrix(nHid, BatchSize); DenseRowVector TmpDenseRowVec = new DenseRowVector(BatchSize); DenseMatrix NegGrad = new DenseMatrix(nHid, BatchSize); DenseMatrix LLR = new DenseMatrix(nHid, BatchSize); //for (int IdxSample = 0; IdxSample < BatchSize; IdxSample++) Parallel.For(0, BatchSize, new ParallelOptions { MaxDegreeOfParallelism = MatrixOperation.MaxMultiThreadDegree }, IdxSample => { float KLDivergence = 0.0f; // The forward activation for each data sample for (int IdxLayer = 0; IdxLayer < nHidLayer; IdxLayer++) { // Compute the loss before unfolding the current layer if (IdxLayer == 0) { MatrixOperation.MatrixMultiplyVector( Phitheta.SparseColumnVectors[IdxSample], paramModel.Phi, DNNRun.theta0.DenseMatrixValue[IdxSample] ); } else { MatrixOperation.MatrixMultiplyVector( Phitheta.SparseColumnVectors[IdxSample], paramModel.Phi, DNNRun.theta_pool[IdxLayer - 1].DenseMatrixValue[IdxSample] ); } if (IdxLayer > 1) { loss_pre.VectorValue[IdxSample] = loss_post.VectorValue[IdxSample]; } else { MatrixOperation.ScalarAddVector(TmpSparseMat.SparseColumnVectors[IdxSample], Phitheta.SparseColumnVectors[IdxSample], 1e-12f); MatrixOperation.Log(TmpSparseMat.SparseColumnVectors[IdxSample]); MatrixOperation.ElementwiseVectorMultiplyVector(TmpSparseMat.SparseColumnVectors[IdxSample], Xt.SparseColumnVectors[IdxSample]); loss_pre.VectorValue[IdxSample] = (-1.0f)*TmpSparseMat.SparseColumnVectors[IdxSample].Sum(); if (IdxLayer == 0) { MatrixOperation.ScalarAddVector(TmpDenseMat.DenseMatrixValue[IdxSample], DNNRun.theta0.DenseMatrixValue[IdxSample], 1e-12f); } else { MatrixOperation.ScalarAddVector(TmpDenseMat.DenseMatrixValue[IdxSample], DNNRun.theta_pool[IdxLayer - 1].DenseMatrixValue[IdxSample], 1e-12f); } MatrixOperation.Log(TmpDenseMat.DenseMatrixValue[IdxSample]); MatrixOperation.ElementwiseVectorMultiplyVector(TmpDenseMat.DenseMatrixValue[IdxSample], paramModel.b); TmpDenseRowVec.VectorValue[IdxSample] = TmpDenseMat.DenseMatrixValue[IdxSample].Sum(); loss_pre.VectorValue[IdxSample] -= TmpDenseRowVec.VectorValue[IdxSample]; } // Compute the hidden activation of the current layer MatrixOperation.ScalarAddVector(TmpSparseMat.SparseColumnVectors[IdxSample], Phitheta.SparseColumnVectors[IdxSample], 1e-12f); 
            MatrixOperation.ElementwiseVectorDivideVector(
                TmpSparseMat.SparseColumnVectors[IdxSample],
                Xt.SparseColumnVectors[IdxSample],
                TmpSparseMat.SparseColumnVectors[IdxSample]
                );
            MatrixOperation.MatrixTransposeMultiplyVector(
                TmpDenseMat.DenseMatrixValue[IdxSample],
                paramModel.Phi,
                TmpSparseMat.SparseColumnVectors[IdxSample]
                );
            if (IdxLayer == 0)
            {
                MatrixOperation.ScalarAddVector(
                    NegGrad.DenseMatrixValue[IdxSample],
                    DNNRun.theta0.DenseMatrixValue[IdxSample],
                    1e-12f
                    );
            }
            else
            {
                MatrixOperation.ScalarAddVector(
                    NegGrad.DenseMatrixValue[IdxSample],
                    DNNRun.theta_pool[IdxLayer - 1].DenseMatrixValue[IdxSample],
                    1e-12f
                    );
            }
            MatrixOperation.ElementwiseVectorDivideVector(NegGrad.DenseMatrixValue[IdxSample], paramModel.b, NegGrad.DenseMatrixValue[IdxSample]);
            MatrixOperation.VectorAddVector(NegGrad.DenseMatrixValue[IdxSample], TmpDenseMat.DenseMatrixValue[IdxSample]);

            // Line search for the parameter T
            if (paramModel.alpha >= 1)
            {
                T.VectorValue[IdxSample] *= (1.0f / eta);  // only perform line search for alpha>=1 case (convex)
            }
            loss_post.VectorValue[IdxSample] = loss_pre.VectorValue[IdxSample];
            if (IdxLayer == 0)
            {
                MatrixOperation.Log(LogTheta.DenseMatrixValue[IdxSample], DNNRun.theta0.DenseMatrixValue[IdxSample]);
            }
            else
            {
                MatrixOperation.Log(LogTheta.DenseMatrixValue[IdxSample], DNNRun.theta_pool[IdxLayer - 1].DenseMatrixValue[IdxSample]);
            }
            while (true)
            {
                MatrixOperation.ScalarMultiplyVector(DNNRun.theta_pool[IdxLayer].DenseMatrixValue[IdxSample], NegGrad.DenseMatrixValue[IdxSample], T.VectorValue[IdxSample]);
                MatrixOperation.VectorAddVector(DNNRun.theta_pool[IdxLayer].DenseMatrixValue[IdxSample], LogTheta.DenseMatrixValue[IdxSample]);
                MatrixOperation.ScalarAddVector(DNNRun.theta_pool[IdxLayer].DenseMatrixValue[IdxSample], (-1.0f) * DNNRun.theta_pool[IdxLayer].DenseMatrixValue[IdxSample].MaxValue());
                MatrixOperation.Exp(DNNRun.theta_pool[IdxLayer].DenseMatrixValue[IdxSample]);
                MatrixOperation.ScalarMultiplyVector(DNNRun.theta_pool[IdxLayer].DenseMatrixValue[IdxSample], (1.0f / DNNRun.theta_pool[IdxLayer].DenseMatrixValue[IdxSample].Sum()));

                // Compute the loss after unfolding the current layer
                MatrixOperation.MatrixMultiplyVector(Phitheta.SparseColumnVectors[IdxSample], paramModel.Phi, DNNRun.theta_pool[IdxLayer].DenseMatrixValue[IdxSample]);
                MatrixOperation.Log(Phitheta.SparseColumnVectors[IdxSample]);
                loss_post.VectorValue[IdxSample] = (-1.0f) * MatrixOperation.InnerProduct(Xt.SparseColumnVectors[IdxSample], Phitheta.SparseColumnVectors[IdxSample]);
                MatrixOperation.ScalarAddVector(TmpDenseMat.DenseMatrixValue[IdxSample], DNNRun.theta_pool[IdxLayer].DenseMatrixValue[IdxSample], 1e-12f);
                MatrixOperation.Log(TmpDenseMat.DenseMatrixValue[IdxSample]);
                loss_post.VectorValue[IdxSample] -= MatrixOperation.InnerProduct(TmpDenseMat.DenseMatrixValue[IdxSample], paramModel.b);
                if (IdxLayer == 0)
                {
                    MatrixOperation.VectorSubtractVector(TmpDenseMat.DenseMatrixValue[IdxSample], DNNRun.theta_pool[IdxLayer].DenseMatrixValue[IdxSample], DNNRun.theta0.DenseMatrixValue[IdxSample]);
                }
                else
                {
                    MatrixOperation.VectorSubtractVector(TmpDenseMat.DenseMatrixValue[IdxSample], DNNRun.theta_pool[IdxLayer].DenseMatrixValue[IdxSample], DNNRun.theta_pool[IdxLayer - 1].DenseMatrixValue[IdxSample]);
                }
                loss_gap.VectorValue[IdxSample] = loss_post.VectorValue[IdxSample] - loss_pre.VectorValue[IdxSample];
                gradproj.VectorValue[IdxSample] = (-1.0f) * MatrixOperation.InnerProduct(NegGrad.DenseMatrixValue[IdxSample], TmpDenseMat.DenseMatrixValue[IdxSample]);
                loss_gap_thresh.VectorValue[IdxSample] = gradproj.VectorValue[IdxSample]
                    + (0.5f / T.VectorValue[IdxSample]) * (float)Math.Pow((double)TmpDenseMat.DenseMatrixValue[IdxSample].L1Norm(), 2.0);
                if (loss_gap.VectorValue[IdxSample] > loss_gap_thresh.VectorValue[IdxSample] + 1e-12 && paramModel.alpha >= 1)
                {
                    T.VectorValue[IdxSample] *= eta;  // Only perform line search for alpha>=1 case (convex)
                }
                else
                {
                    DNNRun.T_pool.DenseMatrixValuePerRow[IdxLayer].VectorValue[IdxSample] = T.VectorValue[IdxSample];
                    break;
                }
            }

            // Count the effective number of hidden layers
            ++DNNRun.nHidLayerEffective[IdxSample];

            // Stop MDA if the termination condition holds
            if (paramModel.flag_AdaptivenHidLayer)
            {
                if (IdxLayer == 0)
                {
                    MatrixOperation.ElementwiseVectorDivideVector(
                        LLR.DenseMatrixValue[IdxSample],
                        DNNRun.theta_pool[IdxLayer].DenseMatrixValue[IdxSample],
                        DNNRun.theta0.DenseMatrixValue[IdxSample]
                        );
                    MatrixOperation.Log(LLR.DenseMatrixValue[IdxSample]);
                }
                else
                {
                    MatrixOperation.ElementwiseVectorDivideVector(
                        LLR.DenseMatrixValue[IdxSample],
                        DNNRun.theta_pool[IdxLayer].DenseMatrixValue[IdxSample],
                        DNNRun.theta_pool[IdxLayer - 1].DenseMatrixValue[IdxSample]
                        );
                    MatrixOperation.Log(LLR.DenseMatrixValue[IdxSample]);
                    MatrixOperation.ResetVectorSparsePattern(
                        LLR.DenseMatrixValue[IdxSample],
                        DNNRun.theta_pool[IdxLayer].DenseMatrixValue[IdxSample]
                        );
                }
                KLDivergence = MatrixOperation.InnerProduct(
                    LLR.DenseMatrixValue[IdxSample],
                    DNNRun.theta_pool[IdxLayer].DenseMatrixValue[IdxSample]
                    );
                if (KLDivergence < 1e-12f)
                {
                    break;
                }
            }
        }

        // ---- Generate output ----
        switch (OutputType)
        {
            case "softmaxCE":
                MatrixOperation.MatrixMultiplyVector(
                    DNNRun.y.DenseMatrixValue[IdxSample],
                    paramModel.U,
                    DNNRun.theta_pool[DNNRun.nHidLayerEffective[IdxSample] - 1].DenseMatrixValue[IdxSample]
                    );
                MatrixOperation.ScalarAddVector(DNNRun.y.DenseMatrixValue[IdxSample], To);
                TmpDenseRowVec.VectorValue[IdxSample] = DNNRun.y.DenseMatrixValue[IdxSample].MaxValue();
                MatrixOperation.ScalarAddVector(DNNRun.y.DenseMatrixValue[IdxSample], (-1.0f) * TmpDenseRowVec.VectorValue[IdxSample]);
                MatrixOperation.Exp(DNNRun.y.DenseMatrixValue[IdxSample]);
                TmpDenseRowVec.VectorValue[IdxSample] = DNNRun.y.DenseMatrixValue[IdxSample].Sum();
                MatrixOperation.ScalarMultiplyVector(DNNRun.y.DenseMatrixValue[IdxSample], (1.0f) / TmpDenseRowVec.VectorValue[IdxSample]);
                break;
            case "unsupLDA":
                // Do not compute the reconstructed input during forward activation, to save time during training.
                break;
            case "linearQuad":
                MatrixOperation.MatrixMultiplyVector(
                    DNNRun.y.DenseMatrixValue[IdxSample],
                    paramModel.U,
                    DNNRun.theta_pool[DNNRun.nHidLayerEffective[IdxSample] - 1].DenseMatrixValue[IdxSample]
                    );
                break;
            case "linearCE":
                throw new Exception("linearCE is not implemented.");
            default:
                throw new Exception("Unknown OutputType.");
        }
    });
}
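// The two sketches below are illustrative only and are NOT called by the code above. Under the
// assumption that plain float[] stands in for the project's vector types, they restate what one
// pass of the while-loop in ForwardActivation_LDA performs: (i) the entropic mirror descent
// update theta_l = normalize(theta_{l-1} .* exp(T * negGrad)), computed in the log domain with a
// max-shift for numerical stability, and (ii) the backtracking acceptance test that shrinks T
// (by the factor eta < 1) until the sufficient-decrease condition holds. The method names
// MirrorDescentStepSketch and LineSearchAcceptsSketch are hypothetical.
private static float[] MirrorDescentStepSketch(float[] thetaPrev, float[] negGrad, float stepSize)
{
    // log(theta_{l-1}) + T * negGrad, followed by a softmax-style normalization
    int n = thetaPrev.Length;
    float[] logit = new float[n];
    float maxVal = float.NegativeInfinity;
    for (int i = 0; i < n; ++i)
    {
        logit[i] = (float)Math.Log(thetaPrev[i] + 1e-12f) + stepSize * negGrad[i];
        if (logit[i] > maxVal) maxVal = logit[i];
    }
    float sum = 0.0f;
    for (int i = 0; i < n; ++i)
    {
        logit[i] = (float)Math.Exp(logit[i] - maxVal);  // max-shift keeps Exp from overflowing
        sum += logit[i];
    }
    for (int i = 0; i < n; ++i)
    {
        logit[i] /= sum;  // theta_l stays on the probability simplex
    }
    return logit;
}

private static bool LineSearchAcceptsSketch(float lossPre, float lossPost, float[] negGrad, float[] thetaNew, float[] thetaPrev, float stepSize)
{
    // Accept T when  loss_post - loss_pre <= <-negGrad, dTheta> + ||dTheta||_1^2 / (2 T),
    // which mirrors the loss_gap <= loss_gap_thresh test in the loop above (up to the 1e-12 slack).
    float gradProj = 0.0f;
    float l1Norm = 0.0f;
    for (int i = 0; i < thetaNew.Length; ++i)
    {
        float dTheta = thetaNew[i] - thetaPrev[i];
        gradProj -= negGrad[i] * dTheta;
        l1Norm += Math.Abs(dTheta);
    }
    float threshold = gradProj + (0.5f / stepSize) * l1Norm * l1Norm;
    return (lossPost - lossPre) <= threshold + 1e-12f;
}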