/// <summary>
/// Advance the LSTM cell by one step, updating the stored cell (ct) and hidden (ht) states
/// </summary>
/// <param name="input">The input weights for the current step</param>
/// <param name="innerGraph">The compute graph on which every operation of this step is issued</param>
/// <returns>The updated hidden weights, which are also stored in ht</returns>
public IWeightMatrix Step(IWeightMatrix input, IComputeGraph innerGraph)
{
    var prevHidden = ht;
    var prevCell = ct;

    // A single multiply-add over [input, previous hidden] produces every gate pre-activation at once
    var stacked = innerGraph.ConcatColumns(input, prevHidden);
    var biasRows = innerGraph.RepeatRows(b, input.Rows);
    var preActivations = innerGraph.MulAdd(stacked, Wxh, biasRows);

    // The first 3 * hdim columns feed the sigmoid gates, the remaining hdim columns the cell candidate
    var (rawGates, rawCandidate) = innerGraph.SplitColumns(preActivations, hdim * 3, hdim);
    var squashedGates = innerGraph.Sigmoid(rawGates);
    var candidate = innerGraph.Tanh(rawCandidate);
    var (inGate, keepGate, outGate) = innerGraph.SplitColumns(squashedGates, hdim, hdim, hdim);

    // New cell contents = what is kept from the old cell + what is written from the candidate
    var kept = innerGraph.EltMul(keepGate, prevCell);
    var written = innerGraph.EltMul(inGate, candidate);
    ct = innerGraph.Add(kept, written);

    // Hidden state is the output gate applied to the tanh-saturated cell
    var saturatedCell = innerGraph.Tanh(ct);
    ht = innerGraph.EltMul(outGate, saturatedCell);

    return ht;
}
/// <summary>
/// Advance the layer-normalized LSTM cell by one step, updating m_cell and m_hidden
/// </summary>
/// <param name="input">The input weights for the current step</param>
/// <param name="g">The outer compute graph; a sub-graph named after this cell is created from it for intermediate operations</param>
/// <returns>The updated hidden weights, which are also stored in m_hidden</returns>
public IWeightTensor Step(IWeightTensor input, IComputeGraph g)
{
    using (var innerGraph = g.CreateSubGraph(m_name))
    {
        var prevHidden = m_hidden;
        var prevCell = m_cell;

        // A single affine transform over [input, previous hidden] produces every gate pre-activation at once
        var stacked = innerGraph.Concate(1, input, prevHidden);
        var preActivations = innerGraph.Affine(stacked, m_Wxh, m_b);
        var normalized = m_layerNorm1.Norm(preActivations, innerGraph);

        // The first 3 * m_hdim columns feed the sigmoid gates, the remaining m_hdim columns the cell candidate
        var (rawGates, rawCandidate) = innerGraph.SplitColumns(normalized, m_hdim * 3, m_hdim);
        var squashedGates = innerGraph.Sigmoid(rawGates);
        var candidate = innerGraph.Tanh(rawCandidate);
        var (inGate, keepGate, outGate) = innerGraph.SplitColumns(squashedGates, m_hdim, m_hdim, m_hdim);

        // New cell = keepGate * prevCell + inGate * candidate.
        // NOTE(review): the two state-producing ops are issued on the outer graph g rather than the sub-graph,
        // presumably so the stored state outlives the sub-graph's disposal - confirm.
        m_cell = g.EltMulMulAdd(keepGate, prevCell, inGate, candidate);
        var normalizedCell = m_layerNorm2.Norm(m_cell, innerGraph);

        // Hidden state is the output gate applied to the tanh-saturated, normalized cell
        var saturatedCell = innerGraph.Tanh(normalizedCell);
        m_hidden = g.EltMul(outGate, saturatedCell);

        return m_hidden;
    }
}
/// <summary>
/// Advance the LSTM-Attention cell by one step, updating the stored cell (ct) and hidden (ht) states
/// </summary>
/// <param name="context">The context weights for attention</param>
/// <param name="input">The input weights</param>
/// <param name="computeGraph">The compute graph to build workflow</param>
/// <returns>The updated hidden weights, which are also stored in ht</returns>
public IWeightMatrix Step(IWeightMatrix context, IWeightMatrix input, IComputeGraph computeGraph)
{
    var prevCell = ct;
    var prevHidden = ht;

    // A single multiply-add over [input, previous hidden, context] produces every gate pre-activation at once
    var stacked = computeGraph.ConcatColumns(input, prevHidden, context);
    var biasRows = computeGraph.RepeatRows(b, input.Rows);
    var preActivations = computeGraph.MulAdd(stacked, Wxhc, biasRows);

    // The first 3 * hdim columns feed the sigmoid gates, the remaining hdim columns the cell candidate
    var (rawGates, rawCandidate) = computeGraph.SplitColumns(preActivations, hdim * 3, hdim);
    var squashedGates = computeGraph.Sigmoid(rawGates);
    var candidate = computeGraph.Tanh(rawCandidate);
    var (inGate, keepGate, outGate) = computeGraph.SplitColumns(squashedGates, hdim, hdim, hdim);

    // New cell = keepGate * prevCell + inGate * candidate, computed by one fused operation
    ct = computeGraph.EltMulMulAdd(keepGate, prevCell, inGate, candidate);

    // Hidden state is the output gate applied to the tanh-saturated cell
    var saturatedCell = computeGraph.Tanh(ct);
    ht = computeGraph.EltMul(outGate, saturatedCell);

    return ht;
}
/// <summary>
/// Advance the layer-normalized LSTM-Attention cell by one step, updating Cell and Hidden
/// </summary>
/// <param name="context">The context weights for attention</param>
/// <param name="input">The input weights</param>
/// <param name="g">The outer compute graph; a sub-graph named after this cell is created from it for intermediate operations</param>
/// <returns>The updated hidden weights, which are also stored in Hidden</returns>
public IWeightTensor Step(IWeightTensor context, IWeightTensor input, IComputeGraph g)
{
    using (var computeGraph = g.CreateSubGraph(m_name))
    {
        var prevCell = Cell;
        var prevHidden = Hidden;

        // A single affine transform over [input, previous hidden, context] produces every gate pre-activation at once
        var stacked = computeGraph.Concate(1, input, prevHidden, context);
        var preActivations = computeGraph.Affine(stacked, m_Wxhc, m_b);
        var normalized = m_layerNorm1.Norm(preActivations, computeGraph);

        // The first 3 * m_hiddenDim columns feed the sigmoid gates, the remaining m_hiddenDim columns the cell candidate
        var (rawGates, rawCandidate) = computeGraph.SplitColumns(normalized, m_hiddenDim * 3, m_hiddenDim);
        var squashedGates = computeGraph.Sigmoid(rawGates);
        var candidate = computeGraph.Tanh(rawCandidate);
        var (inGate, keepGate, outGate) = computeGraph.SplitColumns(squashedGates, m_hiddenDim, m_hiddenDim, m_hiddenDim);

        // New cell = keepGate * prevCell + inGate * candidate.
        // NOTE(review): the two state-producing ops are issued on the outer graph g rather than the sub-graph,
        // presumably so the stored state outlives the sub-graph's disposal - confirm.
        Cell = g.EltMulMulAdd(keepGate, prevCell, inGate, candidate);
        var normalizedCell = m_layerNorm2.Norm(Cell, computeGraph);

        // Hidden state is the output gate applied to the tanh-saturated, normalized cell
        var saturatedCell = computeGraph.Tanh(normalizedCell);
        Hidden = g.EltMul(outGate, saturatedCell);

        return Hidden;
    }
}
/// <summary>
/// Advance the layer-normalized LSTM cell by one step, updating m_cell and m_hidden
/// </summary>
/// <param name="input">The input weights for the current step</param>
/// <param name="g">The outer compute graph; a sub-graph named after this cell is created from it for intermediate operations</param>
/// <returns>The updated hidden weights, which are also stored in m_hidden</returns>
public IWeightTensor Step(IWeightTensor input, IComputeGraph g)
{
    using (var innerGraph = g.CreateSubGraph(m_name))
    {
        var prevHidden = m_hidden;
        var prevCell = m_cell;

        // A single affine transform over [input, previous hidden] produces every gate pre-activation at once
        var stacked = innerGraph.ConcatColumns(input, prevHidden);
        var preActivations = innerGraph.Affine(stacked, m_Wxh, m_b);
        var normalized = m_layerNorm1.Norm(preActivations, innerGraph);

        // The first 3 * m_hdim columns feed the sigmoid gates, the remaining m_hdim columns the cell candidate
        (var rawGates, var rawCandidate) = innerGraph.SplitColumns(normalized, m_hdim * 3, m_hdim);
        var squashedGates = innerGraph.Sigmoid(rawGates);
        var candidate = innerGraph.Tanh(rawCandidate);
        (var inGate, var keepGate, var outGate) = innerGraph.SplitColumns(squashedGates, m_hdim, m_hdim, m_hdim);

        // New cell = keepGate * prevCell + inGate * candidate.
        // NOTE(review): the two state-producing ops are issued on the outer graph g rather than the sub-graph,
        // presumably so the stored state outlives the sub-graph's disposal - confirm.
        m_cell = g.EltMulMulAdd(keepGate, prevCell, inGate, candidate);
        var normalizedCell = m_layerNorm2.Norm(m_cell, innerGraph);

        // Hidden state is the output gate applied to the tanh-saturated, normalized cell
        var saturatedCell = innerGraph.Tanh(normalizedCell);
        m_hidden = g.EltMul(outGate, saturatedCell);

        return m_hidden;
    }
}
/// <summary>
/// Advance the layer-normalized LSTM cell by one step, updating the stored cell (ct) and hidden (ht) states
/// </summary>
/// <param name="input">The input weights for the current step</param>
/// <param name="innerGraph">The compute graph on which every operation of this step is issued</param>
/// <returns>The updated hidden weights, which are also stored in ht</returns>
public IWeightMatrix Step(IWeightMatrix input, IComputeGraph innerGraph)
{
    var prevHidden = ht;
    var prevCell = ct;

    // A single multiply-add over [input, previous hidden] produces every gate pre-activation at once
    var stacked = innerGraph.ConcatColumns(input, prevHidden);
    var biasRows = innerGraph.RepeatRows(b, input.Rows);
    var preActivations = innerGraph.MulAdd(stacked, Wxh, biasRows);
    var normalized = layerNorm1.Process(preActivations, innerGraph);

    // The first 3 * hdim columns feed the sigmoid gates, the remaining hdim columns the cell candidate
    var (rawGates, rawCandidate) = innerGraph.SplitColumns(normalized, hdim * 3, hdim);
    var squashedGates = innerGraph.Sigmoid(rawGates);
    var candidate = innerGraph.Tanh(rawCandidate);
    var (inGate, keepGate, outGate) = innerGraph.SplitColumns(squashedGates, hdim, hdim, hdim);

    // New cell = keepGate * prevCell + inGate * candidate, computed by one fused operation
    ct = innerGraph.EltMulMulAdd(keepGate, prevCell, inGate, candidate);
    var normalizedCell = layerNorm2.Process(ct, innerGraph);

    // Hidden state is the output gate applied to the tanh-saturated, normalized cell
    var saturatedCell = innerGraph.Tanh(normalizedCell);
    ht = innerGraph.EltMul(outGate, saturatedCell);

    return ht;
}
/// <summary>
/// Advance the layer-normalized LSTM-Attention cell by one step, updating Cell and Hidden
/// </summary>
/// <param name="context">The context weights for attention</param>
/// <param name="input">The input weights</param>
/// <param name="g">The outer compute graph; a sub-graph named after this cell is created from it for intermediate operations</param>
/// <returns>The updated hidden weights, which are also stored in Hidden</returns>
public IWeightTensor Step(IWeightTensor context, IWeightTensor input, IComputeGraph g)
{
    using (var computeGraph = g.CreateSubGraph(m_name))
    {
        var prevCell = Cell;
        var prevHidden = Hidden;

        // A single affine transform over [input, previous hidden, context] produces every gate pre-activation at once
        var stacked = computeGraph.ConcatColumns(input, prevHidden, context);
        var preActivations = computeGraph.Affine(stacked, m_Wxhc, m_b);
        var normalized = m_layerNorm1.Norm(preActivations, computeGraph);

        // The first 3 * m_hiddenDim columns feed the sigmoid gates, the remaining m_hiddenDim columns the cell candidate
        (var rawGates, var rawCandidate) = computeGraph.SplitColumns(normalized, m_hiddenDim * 3, m_hiddenDim);
        var squashedGates = computeGraph.Sigmoid(rawGates);
        var candidate = computeGraph.Tanh(rawCandidate);
        (var inGate, var keepGate, var outGate) = computeGraph.SplitColumns(squashedGates, m_hiddenDim, m_hiddenDim, m_hiddenDim);

        // New cell = keepGate * prevCell + inGate * candidate.
        // NOTE(review): the two state-producing ops are issued on the outer graph g rather than the sub-graph,
        // presumably so the stored state outlives the sub-graph's disposal - confirm.
        Cell = g.EltMulMulAdd(keepGate, prevCell, inGate, candidate);
        var normalizedCell = m_layerNorm2.Norm(Cell, computeGraph);

        // Hidden state is the output gate applied to the tanh-saturated, normalized cell
        var saturatedCell = computeGraph.Tanh(normalizedCell);
        Hidden = g.EltMul(outGate, saturatedCell);

        return Hidden;
    }
}
/// <summary>
/// Advance the layer-normalized LSTM-Attention cell by one step, updating the stored cell (ct) and hidden (ht) states
/// </summary>
/// <param name="context">The context weights for attention</param>
/// <param name="input">The input weights</param>
/// <param name="computeGraph">The compute graph to build workflow</param>
/// <returns>The updated hidden weights, which are also stored in ht</returns>
public IWeightMatrix Step(IWeightMatrix context, IWeightMatrix input, IComputeGraph computeGraph)
{
    var prevCell = ct;
    var prevHidden = ht;

    // A single multiply-add over [input, previous hidden, context] produces every gate pre-activation at once
    var stacked = computeGraph.ConcatColumns(input, prevHidden, context);
    var biasRows = computeGraph.RepeatRows(b, input.Rows);
    var preActivations = computeGraph.MulAdd(stacked, Wxhc, biasRows);
    var normalized = layerNorm1.Process(preActivations, computeGraph);

    // The first 3 * hdim columns feed the sigmoid gates, the remaining hdim columns the cell candidate
    var (rawGates, rawCandidate) = computeGraph.SplitColumns(normalized, hdim * 3, hdim);
    var squashedGates = computeGraph.Sigmoid(rawGates);
    var candidate = computeGraph.Tanh(rawCandidate);
    var (inGate, keepGate, outGate) = computeGraph.SplitColumns(squashedGates, hdim, hdim, hdim);

    // New cell = keepGate * prevCell + inGate * candidate, computed by one fused operation
    ct = computeGraph.EltMulMulAdd(keepGate, prevCell, inGate, candidate);
    var normalizedCell = layerNorm2.Process(ct, computeGraph);

    // Hidden state is the output gate applied to the tanh-saturated, normalized cell
    var saturatedCell = computeGraph.Tanh(normalizedCell);
    ht = computeGraph.EltMul(outGate, saturatedCell);

    return ht;
}
/// <summary>
/// Run forward part on given single device
/// </summary>
/// <param name="computeGraph">The computing graph for current device. It gets created and passed by the framework</param>
/// <param name="sntPairBatch">A batch of sentence pairs. Source token groups 0 and 1 are encoded and compared; target token group 0 supplies the golden score or class label during training</param>
/// <param name="deviceIdIdx">The index of current device</param>
/// <param name="isTraining">True to compute the cost and seed gradients; false to fill the output only</param>
/// <param name="decodingOptions">Decoding options; not used by this method</param>
/// <returns>A list holding a single network result: its Cost is set when training, otherwise its Output is filled</returns>
public override List<NetworkResult> RunForwardOnSingleDevice(IComputeGraph computeGraph, ISntPairBatch sntPairBatch, int deviceIdIdx, bool isTraining, DecodingOptions decodingOptions)
{
    int batchSize = sntPairBatch.BatchSize;
    float cost = 0.0f;
    var nrs = new List<NetworkResult>();
    var nr = new NetworkResult { Output = new List<List<List<string>>>() };

    (IEncoder encoder, IWeightTensor srcEmbedding, IFeedForwardLayer encoderFFLayer, IWeightTensor posEmbedding, IWeightTensor segmentEmbedding) = GetNetworksOnDeviceAt(deviceIdIdx);

    // We only check the cache at inference time on CPU; the encoder network always runs during training or on other processor types
    bool useCache = !isTraining && (m_options.ProcessorType == ProcessorTypeEnums.CPU);
    IWeightTensor encOutput1 = BuildEncoderOutputForGroup(computeGraph, sntPairBatch, encoder, srcEmbedding, posEmbedding, segmentEmbedding, 0, useCache);
    IWeightTensor encOutput2 = BuildEncoderOutputForGroup(computeGraph, sntPairBatch, encoder, srcEmbedding, posEmbedding, segmentEmbedding, 1, useCache);

    if (m_modelMetaData.SimilarityType.Equals("Continuous", StringComparison.InvariantCultureIgnoreCase))
    {
        // Cosine similarity: sum(e1 * e2) * rsqrt(sum(e1 * e1) * sum(e2 * e2))
        var w12 = computeGraph.EltMul(encOutput1, encOutput2);
        w12 = computeGraph.Sum(w12, 1);
        var w1 = computeGraph.EltMul(encOutput1, encOutput1);
        w1 = computeGraph.Sum(w1, 1);
        var w2 = computeGraph.EltMul(encOutput2, encOutput2);
        w2 = computeGraph.Sum(w2, 1);
        var n12 = computeGraph.EltMul(w1, w2);
        n12 = computeGraph.Rsqrt(n12);
        var probs = computeGraph.EltMul(w12, n12);

        if (isTraining)
        {
            var tgtSnts = sntPairBatch.GetTgtTokens(0);
            for (int k = 0; k < batchSize; k++)
            {
                // Get golden similarity score from target side. Scores are machine-readable data, so they are
                // parsed with the invariant culture instead of whatever culture the current thread runs under.
                float golden_score_k = float.Parse(tgtSnts[k][0], System.Globalization.CultureInfo.InvariantCulture);
                float score_k = probs.GetWeightAt(new long[] { k, 0 });

                // Overwrite the score with its difference from the golden score; the cost accumulates the absolute difference
                probs.SetWeightAt(score_k - golden_score_k, new long[] { k, 0 });
                cost += Math.Abs(score_k - golden_score_k);
            }

            probs.CopyWeightsToGradients(probs);
            nr.Cost = cost / batchSize;
        }
        else
        {
            nr.Output.Add(new List<List<string>>());
            for (int k = 0; k < batchSize; k++)
            {
                float score_k = probs.GetWeightAt(new long[] { k, 0 });
                nr.Output[0].Add(new List<string>());

                // Written with the invariant culture so the emitted score uses the same format the training path parses
                nr.Output[0][k].Add(score_k.ToString(System.Globalization.CultureInfo.InvariantCulture));
            }
        }
    }
    else
    {
        // Classification: the element-wise product of both encodings goes through the feed-forward layer and a softmax
        IWeightTensor encOutput = computeGraph.EltMul(encOutput1, encOutput2);
        IWeightTensor ffLayer = encoderFFLayer.Process(encOutput, batchSize, computeGraph);
        using (IWeightTensor probs = computeGraph.Softmax(ffLayer, runGradients: false, inPlace: true))
        {
            if (isTraining)
            {
                var tgtSnts = sntPairBatch.GetTgtTokens(0);
                for (int k = 0; k < batchSize; k++)
                {
                    int ix_targets_k_j = m_modelMetaData.ClsVocab.GetWordIndex(tgtSnts[k][0]);
                    float score_k = probs.GetWeightAt(new long[] { k, ix_targets_k_j });

                    // The cost accumulates -log(p) of the golden class; that class's entry is then replaced by p - 1
                    cost += (float)-Math.Log(score_k);
                    probs.SetWeightAt(score_k - 1, new long[] { k, ix_targets_k_j });
                }

                ffLayer.CopyWeightsToGradients(probs);
                nr.Cost = cost / batchSize;
            }
            else
            {
                // Output the arg-max class label of every item in the batch
                using var targetIdxTensor = computeGraph.Argmax(probs, 1);
                float[] targetIdx = targetIdxTensor.ToWeightArray();
                List<string> targetWords = m_modelMetaData.ClsVocab.ConvertIdsToString(targetIdx.ToList());

                nr.Output.Add(new List<List<string>>());
                for (int k = 0; k < batchSize; k++)
                {
                    nr.Output[0].Add(new List<string>());
                    nr.Output[0][k].Add(targetWords[k]);
                }
            }
        }
    }

    nrs.Add(nr);

    return nrs;
}

/// <summary>
/// Build the encoder output for one source token group, optionally going through the memory cache
/// </summary>
/// <param name="computeGraph">The computing graph the encoder runs on</param>
/// <param name="sntPairBatch">The batch whose source tokens are encoded</param>
/// <param name="encoder">The encoder network for the current device</param>
/// <param name="srcEmbedding">The source embedding for the current device</param>
/// <param name="posEmbedding">The position embedding for the current device</param>
/// <param name="segmentEmbedding">The segment embedding for the current device</param>
/// <param name="groupId">The index of the source token group to encode</param>
/// <param name="useCache">True to look the output up in the memory cache first and to store a copy on a miss</param>
/// <returns>The encoder output for the group (output shape: [batch_size, dim])</returns>
private IWeightTensor BuildEncoderOutputForGroup(IComputeGraph computeGraph, ISntPairBatch sntPairBatch, IEncoder encoder, IWeightTensor srcEmbedding, IWeightTensor posEmbedding, IWeightTensor segmentEmbedding, int groupId, bool useCache)
{
    if (!useCache)
    {
        return Encoder.BuildTensorForSourceTokenGroupAt(computeGraph, sntPairBatch, m_shuffleType, encoder, m_modelMetaData, srcEmbedding, posEmbedding, segmentEmbedding, groupId);
    }

    string cacheKey = GenerateCacheKey(sntPairBatch.GetSrcTokens(groupId));
    if (!m_memoryCache.TryGetValue(cacheKey, out IWeightTensor encOutput))
    {
        encOutput = Encoder.BuildTensorForSourceTokenGroupAt(computeGraph, sntPairBatch, m_shuffleType, encoder, m_modelMetaData, srcEmbedding, posEmbedding, segmentEmbedding, groupId);

        // The cache stores a separately named copy produced by CopyWeightsRef, not the freshly built tensor itself
        var cacheEntryOptions = new MemoryCacheEntryOptions().SetSize(1);
        m_memoryCache.Set(cacheKey, encOutput.CopyWeightsRef($"cache_{encOutput.Name}", false), cacheEntryOptions);
    }

    return encOutput;
}