public IWeightTensor Step(IWeightTensor input, IComputeGraph g)
{
    var innerGraph = g.CreateSubGraph(m_name);

    var hidden_prev = m_hidden;
    var cell_prev = m_cell;

    // Concatenate the input with the previous hidden state, then run a single
    // fused affine transform that produces all three gates plus the candidate cell.
    var inputs = innerGraph.ConcatColumns(input, hidden_prev);
    var hhSum = innerGraph.Affine(inputs, m_Wxh, m_b);
    var hhSum2 = m_layerNorm1.Process(hhSum, innerGraph);

    (var gates_raw, var cell_write_raw) = innerGraph.SplitColumns(hhSum2, m_hdim * 3, m_hdim);
    var gates = innerGraph.Sigmoid(gates_raw);
    var cell_write = innerGraph.Tanh(cell_write_raw);

    (var input_gate, var forget_gate, var output_gate) = innerGraph.SplitColumns(gates, m_hdim, m_hdim, m_hdim);

    // Compute new cell activation: ct = forget_gate * cell_prev + input_gate * cell_write
    m_cell = innerGraph.EltMulMulAdd(forget_gate, cell_prev, input_gate, cell_write);
    var ct2 = m_layerNorm2.Process(m_cell, innerGraph);

    // Compute hidden state as gated, saturated cell activations
    m_hidden = innerGraph.EltMul(output_gate, innerGraph.Tanh(ct2));

    return m_hidden;
}
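The fused affine packs the three gates and the candidate cell into one matrix product, so a single SplitColumns call recovers the gate layout [input | forget | output] plus the cell write. Below is a minimal sketch of the same layer-normalized update on plain float arrays, independent of the IComputeGraph API and omitting the learned gain and bias of the full layer norm; all names here are illustrative, not part of the library:

using System;

static class LstmStepSketch
{
    // Layer-normalize a vector: (x - mean) / sqrt(var + eps).
    // Gain and bias are omitted for brevity.
    static float[] LayerNorm(float[] x, float eps = 1e-5f)
    {
        float mean = 0f, variance = 0f;
        foreach (var v in x) mean += v;
        mean /= x.Length;
        foreach (var v in x) variance += (v - mean) * (v - mean);
        variance /= x.Length;
        var y = new float[x.Length];
        for (int i = 0; i < x.Length; i++)
            y[i] = (x[i] - mean) / (float)Math.Sqrt(variance + eps);
        return y;
    }

    static float Sigmoid(float v) => 1f / (1f + (float)Math.Exp(-v));

    // One cell update, given the pre-activation sums that Affine + layer norm
    // produce above, using the same [input | forget | output] gate order.
    public static (float[] cell, float[] hidden) Step(
        float[] gatesRaw,      // length 3*hdim: gate pre-activations
        float[] cellWriteRaw,  // length hdim: candidate cell pre-activation
        float[] cellPrev)      // length hdim: previous cell state
    {
        int hdim = cellPrev.Length;
        var cell = new float[hdim];
        for (int i = 0; i < hdim; i++)
        {
            float ig = Sigmoid(gatesRaw[i]);            // input gate
            float fg = Sigmoid(gatesRaw[hdim + i]);     // forget gate
            float cw = (float)Math.Tanh(cellWriteRaw[i]);
            // ct = forget_gate * cell_prev + input_gate * cell_write
            cell[i] = fg * cellPrev[i] + ig * cw;
        }

        var ct2 = LayerNorm(cell);
        var hidden = new float[hdim];
        for (int i = 0; i < hdim; i++)
        {
            float og = Sigmoid(gatesRaw[2 * hdim + i]); // output gate
            // ht = output_gate * tanh(LN(ct))
            hidden[i] = og * (float)Math.Tanh(ct2[i]);
        }
        return (cell, hidden);
    }
}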
/// <summary>
/// Update LSTM-Attention cell according to the given weights
/// </summary>
/// <param name="context">The context weights for attention</param>
/// <param name="input">The input weights</param>
/// <param name="g">The compute graph used to build the workflow</param>
/// <returns>The updated hidden weights</returns>
public IWeightTensor Step(IWeightTensor context, IWeightTensor input, IComputeGraph g)
{
    var computeGraph = g.CreateSubGraph(m_name);

    var cell_prev = Cell;
    var hidden_prev = Hidden;

    // Concatenate the input, the previous hidden state and the attention context,
    // then run a single fused affine transform for all gates and the candidate cell.
    var hxhc = computeGraph.ConcatColumns(input, hidden_prev, context);
    var hhSum = computeGraph.Affine(hxhc, m_Wxhc, m_b);
    var hhSum2 = layerNorm1.Process(hhSum, computeGraph);

    (var gates_raw, var cell_write_raw) = computeGraph.SplitColumns(hhSum2, m_hdim * 3, m_hdim);
    var gates = computeGraph.Sigmoid(gates_raw);
    var cell_write = computeGraph.Tanh(cell_write_raw);

    (var input_gate, var forget_gate, var output_gate) = computeGraph.SplitColumns(gates, m_hdim, m_hdim, m_hdim);

    // Compute new cell activation: ct = forget_gate * cell_prev + input_gate * cell_write
    Cell = computeGraph.EltMulMulAdd(forget_gate, cell_prev, input_gate, cell_write);
    var ct2 = layerNorm2.Process(Cell, computeGraph);

    // Compute hidden state as gated, saturated cell activations
    Hidden = computeGraph.EltMul(output_gate, computeGraph.Tanh(ct2));

    return Hidden;
}
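The only structural difference from the plain cell is the extra context block in the concatenation, which widens the fused weight m_Wxhc. A quick dimension check follows (a sketch with illustrative sizes; the real dimensions come from the model configuration):

using System;

// Illustrative dimensions only; the real values come from the model options.
int inputDim = 512, hdim = 512, contextDim = 512;

// The fused weight m_Wxhc maps the concatenated [input | hidden | context]
// columns onto the three gates plus the candidate cell write.
int wxhcRows = inputDim + hdim + contextDim;   // columns of hxhc
int wxhcCols = 4 * hdim;                       // 3 gates + cell_write

// SplitColumns(hhSum2, m_hdim * 3, m_hdim) then separates the gate block
// from the cell-write block.
Console.WriteLine($"m_Wxhc: {wxhcRows} x {wxhcCols}");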
public IWeightMatrix Step(IWeightMatrix input, IComputeGraph innerGraph)
{
    var hidden_prev = ht;
    var cell_prev = ct;

    // Concatenate the input with the previous hidden state; the bias row is
    // repeated across the batch before the fused matrix-multiply-add.
    var inputs = innerGraph.ConcatColumns(input, hidden_prev);
    var bs = innerGraph.RepeatRows(b, input.Rows);
    var hhSum = innerGraph.MulAdd(inputs, Wxh, bs);
    var hhSum2 = layerNorm1.Process(hhSum, innerGraph);

    (var gates_raw, var cell_write_raw) = innerGraph.SplitColumns(hhSum2, hdim * 3, hdim);
    var gates = innerGraph.Sigmoid(gates_raw);
    var cell_write = innerGraph.Tanh(cell_write_raw);

    (var input_gate, var forget_gate, var output_gate) = innerGraph.SplitColumns(gates, hdim, hdim, hdim);

    // Compute new cell activation: ct = forget_gate * cell_prev + input_gate * cell_write
    ct = innerGraph.EltMulMulAdd(forget_gate, cell_prev, input_gate, cell_write);
    var ct2 = layerNorm2.Process(ct, innerGraph);

    // Compute hidden state as gated, saturated cell activations
    ht = innerGraph.EltMul(output_gate, innerGraph.Tanh(ct2));

    return ht;
}
/// <summary>
/// Scaled multi-head attention component with skip-connected feed-forward layers
/// </summary>
/// <param name="input">The input tensor</param>
/// <param name="graph">The instance of the computing graph</param>
/// <returns>The layer-normalized output of the attention and feed-forward sub-layers</returns>
public IWeightTensor Perform(IWeightTensor input, IComputeGraph graph)
{
    IComputeGraph g = graph.CreateSubGraph(m_name);
    var seqLen = input.Rows / m_batchSize;

    // Input projections
    var allQ = g.View(Q.Process(input, g), m_batchSize, seqLen, m_multiHeadNum, m_d);
    var allK = g.View(K.Process(input, g), m_batchSize, seqLen, m_multiHeadNum, m_d);
    var allV = g.View(V.Process(input, g), m_batchSize, seqLen, m_multiHeadNum, m_d);

    // Multi-head attention: fold the head dimension into the batch dimension,
    // with K transposed so Q * K^T runs as one batched matrix multiply.
    var Qs = g.View(g.Permute(allQ, 2, 0, 1, 3), m_multiHeadNum * m_batchSize, seqLen, m_d);
    var Ks = g.View(g.Permute(allK, 2, 0, 3, 1), m_multiHeadNum * m_batchSize, m_d, seqLen);
    var Vs = g.View(g.Permute(allV, 2, 0, 1, 3), m_multiHeadNum * m_batchSize, seqLen, m_d);

    // Scaled softmax
    float scale = 1.0f / (float)Math.Sqrt(m_d);
    var attn = g.MulBatch(Qs, Ks, m_multiHeadNum * m_batchSize, scale);
    var attn2 = g.View(attn, m_multiHeadNum * m_batchSize * seqLen, seqLen);

    var softmax = g.Softmax(attn2);
    var softmax2 = g.View(softmax, m_multiHeadNum * m_batchSize, seqLen, seqLen);
    var o = g.View(g.MulBatch(softmax2, Vs, m_multiHeadNum * m_batchSize), m_multiHeadNum, m_batchSize, seqLen, m_d);
    var W = g.View(g.Permute(o, 1, 2, 0, 3), m_batchSize * seqLen, m_multiHeadNum * m_d);

    // Output projection
    var finalAttResults = g.Affine(W, W0, b0);

    // Skip connection and layer normalization
    var addedAttResult = g.Add(finalAttResults, input);
    var normAddedAttResult = layerNorm1.Process(addedAttResult, g);

    // Feed forward
    var ffnResult = feedForwardLayer1.Process(normAddedAttResult, g);
    var reluFFNResult = g.Relu(ffnResult);
    var ffn2Result = feedForwardLayer2.Process(reluFFNResult, g);

    // Skip connection and layer normalization
    var addFFNResult = g.Add(ffn2Result, normAddedAttResult);
    var normAddFFNResult = layerNorm2.Process(addFFNResult, g);

    return normAddFFNResult;
}
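Perform folds the head dimension into the batch so the whole attention reduces to two batched matrix multiplies around a softmax, with scale = 1/sqrt(d) applied to the logits. A minimal single-head sketch of the same scaled dot-product attention on plain 2-D arrays, independent of the IComputeGraph API (illustrative only, not the library's implementation):

using System;

static class ScaledDotProductSketch
{
    // Single-head attention: softmax(Q * K^T / sqrt(d)) * V.
    // q, k, v are [seqLen, d] matrices; returns a [seqLen, d] matrix.
    public static float[,] Attention(float[,] q, float[,] k, float[,] v)
    {
        int seqLen = q.GetLength(0), d = q.GetLength(1);
        float scale = 1.0f / (float)Math.Sqrt(d);
        var output = new float[seqLen, d];
        var scores = new float[seqLen];

        for (int i = 0; i < seqLen; i++)
        {
            // Scaled dot products of query i against every key.
            float max = float.NegativeInfinity;
            for (int j = 0; j < seqLen; j++)
            {
                float s = 0f;
                for (int t = 0; t < d; t++) s += q[i, t] * k[j, t];
                scores[j] = s * scale;
                if (scores[j] > max) max = scores[j];
            }

            // Numerically stable softmax over the scores.
            float sum = 0f;
            for (int j = 0; j < seqLen; j++)
            {
                scores[j] = (float)Math.Exp(scores[j] - max);
                sum += scores[j];
            }

            // Weighted sum of the value rows.
            for (int t = 0; t < d; t++)
            {
                float acc = 0f;
                for (int j = 0; j < seqLen; j++) acc += (scores[j] / sum) * v[j, t];
                output[i, t] = acc;
            }
        }
        return output;
    }
}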
/// <summary>
/// Update LSTM-Attention cell according to the given weights
/// </summary>
/// <param name="context">The context weights for attention</param>
/// <param name="input">The input weights</param>
/// <param name="computeGraph">The compute graph used to build the workflow</param>
/// <returns>The updated hidden weights</returns>
public IWeightMatrix Step(IWeightMatrix context, IWeightMatrix input, IComputeGraph computeGraph)
{
    var cell_prev = ct;
    var hidden_prev = ht;

    // Concatenate the input, the previous hidden state and the attention context;
    // the bias row is repeated across the batch before the fused matrix-multiply-add.
    var hxhc = computeGraph.ConcatColumns(input, hidden_prev, context);
    var bs = computeGraph.RepeatRows(b, input.Rows);
    var hhSum = computeGraph.MulAdd(hxhc, Wxhc, bs);
    var hhSum2 = layerNorm1.Process(hhSum, computeGraph);

    (var gates_raw, var cell_write_raw) = computeGraph.SplitColumns(hhSum2, hdim * 3, hdim);
    var gates = computeGraph.Sigmoid(gates_raw);
    var cell_write = computeGraph.Tanh(cell_write_raw);

    (var input_gate, var forget_gate, var output_gate) = computeGraph.SplitColumns(gates, hdim, hdim, hdim);

    // Compute new cell activation: ct = forget_gate * cell_prev + input_gate * cell_write
    ct = computeGraph.EltMulMulAdd(forget_gate, cell_prev, input_gate, cell_write);
    var ct2 = layerNorm2.Process(ct, computeGraph);

    // Compute hidden state as gated, saturated cell activations
    ht = computeGraph.EltMul(output_gate, computeGraph.Tanh(ct2));

    return ht;
}