public override void learnNet(State state, int timeat)
{
    //create the delta list
    double beta2 = beta * alpha;

    if (m_bCRFTraining)
    {
        //For RNN-CRF, use the joint probability of output layer nodes and transitions between contiguous nodes
        for (int c = 0; c < L2; c++)
        {
            neuOutput[c].er = -m_Diff[timeat][c];
        }
        neuOutput[state.GetLabel()].er = 1 - m_Diff[timeat][state.GetLabel()];
    }
    else
    {
        //For a standard RNN
        for (int c = 0; c < L2; c++)
        {
            neuOutput[c].er = -neuOutput[c].ac;
        }
        neuOutput[state.GetLabel()].er = 1 - neuOutput[state.GetLabel()].ac;
    }

    //Get the sparse features and apply them to the hidden layer
    var sparse = state.GetSparseData();
    int sparseFeatureSize = sparse.GetNumberOfEntries();

    //store variables for derivatives in the weight class and cell class
    Parallel.For(0, L1, parallelOption, i =>
    {
        LSTMWeight[] w_i = mat_input2hidden[i];
        LSTMCell c = neuHidden[i];
        for (int k = 0; k < sparseFeatureSize; k++)
        {
            var entry = sparse.GetEntry(k);
            LSTMWeight w = w_i[entry.Key];
            w_i[entry.Key].dSInputCell = w.dSInputCell * c.yForget + gPrime(c.netCellState) * c.yIn * entry.Value;
            w_i[entry.Key].dSInputInputGate = w.dSInputInputGate * c.yForget + activationFunctionG(c.netCellState) * fPrime(c.netIn) * entry.Value;
            w_i[entry.Key].dSInputForgetGate = w.dSInputForgetGate * c.yForget + c.previousCellState * fPrime(c.netForget) * entry.Value;
        }

        if (fea_size > 0)
        {
            w_i = mat_feature2hidden[i];
            for (int j = 0; j < fea_size; j++)
            {
                LSTMWeight w = w_i[j];
                w_i[j].dSInputCell = w.dSInputCell * c.yForget + gPrime(c.netCellState) * c.yIn * neuFeatures[j].ac;
                w_i[j].dSInputInputGate = w.dSInputInputGate * c.yForget + activationFunctionG(c.netCellState) * fPrime(c.netIn) * neuFeatures[j].ac;
                w_i[j].dSInputForgetGate = w.dSInputForgetGate * c.yForget + c.previousCellState * fPrime(c.netForget) * neuFeatures[j].ac;
            }
        }

        //partial derivatives for the internal connections
        c.dSWCellIn = c.dSWCellIn * c.yForget + activationFunctionG(c.netCellState) * fPrime(c.netIn) * c.cellState;

        //partial derivatives for the internal connections; initially zero, as dS is zero and the previous cell state is zero
        c.dSWCellForget = c.dSWCellForget * c.yForget + c.previousCellState * fPrime(c.netForget) * c.previousCellState;

        neuHidden[i] = c;
    });

    //for all output neurons
    for (int k = 0; k < L2; k++)
    {
        //for each connection to the hidden layer
        double er = neuOutput[k].er;
        for (int j = 0; j <= L1; j++)
        {
            deltaHiddenOutput[j][k] = alpha * neuHidden[j].cellOutput * er;
        }
    }

    //for each hidden neuron
    Parallel.For(0, L1, parallelOption, i =>
    {
        LSTMCell c = neuHidden[i];

        //find the error by taking the product of the output errors and their weight connections
        double weightedSum = 0;
        for (int k = 0; k < L2; k++)
        {
            weightedSum += neuOutput[k].er * mat_hidden2output[i][k];
        }

        //using the error, find the gradient of the output gate
        c.gradientOutputGate = fPrime(c.netOut) * activationFunctionH(c.cellState) * weightedSum;

        //internal cell state error
        c.cellStateError = c.yOut * weightedSum * hPrime(c.cellState);

        //weight updates
        //the deltas for the hidden-to-output connections have already been computed

        //output gates: for each connection to the input layer
        LSTMWeight[] w_i = mat_input2hidden[i];
        for (int k = 0; k < sparseFeatureSize; k++)
        {
            var entry = sparse.GetEntry(k);
            //update weights for the input to hidden layer
            if ((counter % 10) == 0) //regularization is done every 10 steps
            {
                w_i[entry.Key].wInputCell += alpha * c.cellStateError * w_i[entry.Key].dSInputCell - w_i[entry.Key].wInputCell * beta2;
                w_i[entry.Key].wInputInputGate += alpha * c.cellStateError * w_i[entry.Key].dSInputInputGate - w_i[entry.Key].wInputInputGate * beta2;
                w_i[entry.Key].wInputForgetGate += alpha * c.cellStateError * w_i[entry.Key].dSInputForgetGate - w_i[entry.Key].wInputForgetGate * beta2;
                w_i[entry.Key].wInputOutputGate += alpha * c.gradientOutputGate * entry.Value - w_i[entry.Key].wInputOutputGate * beta2;
            }
            else
            {
                w_i[entry.Key].wInputCell += alpha * c.cellStateError * w_i[entry.Key].dSInputCell;
                w_i[entry.Key].wInputInputGate += alpha * c.cellStateError * w_i[entry.Key].dSInputInputGate;
                w_i[entry.Key].wInputForgetGate += alpha * c.cellStateError * w_i[entry.Key].dSInputForgetGate;
                w_i[entry.Key].wInputOutputGate += alpha * c.gradientOutputGate * entry.Value;
            }
        }

        if (fea_size > 0)
        {
            w_i = mat_feature2hidden[i];
            for (int j = 0; j < fea_size; j++)
            {
                //the delta is the learning rate multiplied by the gradient, multiplied by the input for the connection
                //update connection weights
                if ((counter % 10) == 0) //regularization is done every 10 steps
                {
                    w_i[j].wInputCell += alpha * c.cellStateError * w_i[j].dSInputCell - w_i[j].wInputCell * beta2;
                    w_i[j].wInputInputGate += alpha * c.cellStateError * w_i[j].dSInputInputGate - w_i[j].wInputInputGate * beta2;
                    w_i[j].wInputForgetGate += alpha * c.cellStateError * w_i[j].dSInputForgetGate - w_i[j].wInputForgetGate * beta2;
                    w_i[j].wInputOutputGate += alpha * c.gradientOutputGate * neuFeatures[j].ac - w_i[j].wInputOutputGate * beta2;
                }
                else
                {
                    w_i[j].wInputCell += alpha * c.cellStateError * w_i[j].dSInputCell;
                    w_i[j].wInputInputGate += alpha * c.cellStateError * w_i[j].dSInputInputGate;
                    w_i[j].wInputForgetGate += alpha * c.cellStateError * w_i[j].dSInputForgetGate;
                    w_i[j].wInputOutputGate += alpha * c.gradientOutputGate * neuFeatures[j].ac;
                }
            }
        }

        //for the internal connections
        double deltaOutputGateCell = alpha * c.gradientOutputGate * c.cellState;

        //using the internal partial derivatives
        double deltaInputGateCell = alpha * c.cellStateError * c.dSWCellIn;
        double deltaForgetGateCell = alpha * c.cellStateError * c.dSWCellForget;

        //update internal weights
        if ((counter % 10) == 0) //regularization is done every 10 steps
        {
            c.wCellIn += deltaInputGateCell - c.wCellIn * beta2;
            c.wCellForget += deltaForgetGateCell - c.wCellForget * beta2;
            c.wCellOut += deltaOutputGateCell - c.wCellOut * beta2;
        }
        else
        {
            c.wCellIn += deltaInputGateCell;
            c.wCellForget += deltaForgetGateCell;
            c.wCellOut += deltaOutputGateCell;
        }

        neuHidden[i] = c;

        //update weights for the hidden to output layer
        for (int k = 0; k < L2; k++)
        {
            if ((counter % 10) == 0) //regularization is done every 10 steps
            {
                mat_hidden2output[i][k] += deltaHiddenOutput[i][k] - mat_hidden2output[i][k] * beta2;
            }
            else
            {
                mat_hidden2output[i][k] += deltaHiddenOutput[i][k];
            }
        }
    });
}
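//NOTE (sketch): the gate/cell nonlinearities used above (activationFunctionG/H) and their
//derivatives (fPrime, gPrime, hPrime) are defined elsewhere in the class and are not part of
//this excerpt. A minimal sketch consistent with the standard LSTM formulation (logistic-sigmoid
//gates, tanh-shaped cell input/output squashing) is given below; these bodies are assumptions
//for illustration, not the repository's actual implementations.
private static double Sigmoid(double x)
{
    return 1.0 / (1.0 + Math.Exp(-x));
}

private double fPrime(double x)              //derivative of the sigmoid gate activation w.r.t. its net input
{
    double f = Sigmoid(x);
    return f * (1.0 - f);
}

private double activationFunctionG(double x) //cell-input squashing function
{
    return Math.Tanh(x);
}

private double gPrime(double x)              //derivative of the cell-input squashing function
{
    double g = Math.Tanh(x);
    return 1.0 - g * g;
}

private double activationFunctionH(double x) //cell-output squashing function
{
    return Math.Tanh(x);
}

private double hPrime(double x)              //derivative of the cell-output squashing function
{
    double h = Math.Tanh(x);
    return 1.0 - h * h;
}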
public override void learnNet(State state, int timeat)
{
    if (m_bCRFTraining)
    {
        //For RNN-CRF, use the joint probability of output layer nodes and transitions between contiguous nodes
        for (int c = 0; c < L2; c++)
        {
            neuOutput[c].er = -m_Diff[timeat][c];
        }
        neuOutput[state.GetLabel()].er = 1 - m_Diff[timeat][state.GetLabel()];
    }
    else
    {
        //For a standard RNN
        for (int c = 0; c < L2; c++)
        {
            neuOutput[c].er = -neuOutput[c].ac;
        }
        neuOutput[state.GetLabel()].er = 1 - neuOutput[state.GetLabel()].ac;
    }

    //reset hidden layer errors, then propagate the output errors back to the hidden layer
    for (int a = 0; a < L1; a++)
    {
        neuHidden[a].er = 0;
    }
    matrixXvectorADD(neuHidden, neuOutput, mat_hidden2output, 0, L2, 0, L1, 1); //error output->hidden

    //update hidden-to-output weights
    Parallel.For(0, L2, parallelOption, c =>
    {
        for (int a = 0; a < L1; a++)
        {
            double dg = neuOutput[c].er * neuHidden[a].ac;
            if ((counter % 10) == 0) //regularization is done every 10 steps
            {
                mat_hidden2output[c][a] += alpha * (dg - mat_hidden2output[c][a] * beta);
            }
            else
            {
                mat_hidden2output[c][a] += alpha * dg;
            }
        }
    });
}
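//NOTE (sketch): matrixXvectorADD is a shared helper defined elsewhere in the repository; with
//the final argument set to 1 it is used here to propagate errors backward. The hypothetical
//helper below illustrates the semantics that call is assumed to have (each hidden unit's error
//accumulated from the output-layer errors through the hidden-to-output weights, matching the
//mat_hidden2output[c][a] indexing used in the update loop above); it is an illustration under
//those assumptions, not the helper's actual implementation. The neuron type with .er/.ac
//fields follows the usage in the surrounding code.
void PropagateErrorSketch(neuron[] dest, neuron[] src, double[][] srcMatrix, int srcFrom, int srcTo, int destFrom, int destTo)
{
    //equivalent to: dest[a].er += sum over c of src[c].er * srcMatrix[c][a]
    for (int a = destFrom; a < destTo; a++)
    {
        double er = 0;
        for (int c = srcFrom; c < srcTo; c++)
        {
            er += src[c].er * srcMatrix[c][a]; //weight from hidden unit a to output unit c
        }
        dest[a].er += er;                      //accumulate into the hidden unit's error
    }
}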