/// <summary>
/// Advances the layer-normalised LSTM cell by one step and stores the new
/// cell and hidden state in <c>ct</c> and <c>ht</c>.
/// </summary>
/// <param name="input">Input for this step; its row count sizes the repeated bias.</param>
/// <param name="innerGraph">Compute graph on which every operation is recorded.</param>
/// <returns>The new hidden state (the same instance stored in <c>ht</c>).</returns>
public IWeightMatrix Step(IWeightMatrix input, IComputeGraph innerGraph)
{
    var previousHidden = ht;
    var previousCell = ct;

    // Project [input, previous hidden] and add a row-repeated bias.
    var joined = innerGraph.ConcatColumns(input, previousHidden);
    var repeatedBias = innerGraph.RepeatRows(b, input.Rows);
    var projected = innerGraph.MulAdd(joined, Wxh, repeatedBias);
    var normalised = layerNorm1.Process(projected, innerGraph);

    // The first 3 * hdim columns feed the sigmoid gates, the last hdim columns the candidate.
    (var rawGates, var rawCandidate) = innerGraph.SplitColumns(normalised, hdim * 3, hdim);
    var squashedGates = innerGraph.Sigmoid(rawGates);
    var candidate = innerGraph.Tanh(rawCandidate);
    (var inGate, var keepGate, var outGate) = innerGraph.SplitColumns(squashedGates, hdim, hdim, hdim);

    // ct = keepGate * previousCell + inGate * candidate
    ct = innerGraph.EltMulMulAdd(keepGate, previousCell, inGate, candidate);
    var normalisedCell = layerNorm2.Process(ct, innerGraph);

    // Hidden state is the output gate applied to the squashed, normalised cell.
    ht = innerGraph.EltMul(outGate, innerGraph.Tanh(normalisedCell));
    return ht;
}
/// <summary>
/// Runs the encoder layers over <paramref name="inputs"/> inside a dedicated sub-graph.
/// </summary>
/// <param name="inputs">Input tensor; <c>inputs.Rows / batchSize</c> is used as the sequence length.</param>
/// <param name="batchSize">Number of sequences in the batch.</param>
/// <param name="g">Parent compute graph from which the sub-graph is created.</param>
/// <param name="srcSelfMask">Optional self-attention mask; when null no mask is passed to the layers.</param>
/// <returns>The layer-normalised output, unbound from the sub-graph before the sub-graph is disposed.</returns>
public IWeightTensor Encode(IWeightTensor inputs, int batchSize, IComputeGraph g, IWeightTensor srcSelfMask)
{
    using (IComputeGraph subg = g.CreateSubGraph($"{m_name}_Encoder"))
    {
        // Reshape the mask to [batch, 1, seq, seq], then expand it across all heads.
        IWeightTensor expandedMask = null;
        if (srcSelfMask != null)
        {
            int seqLen = inputs.Rows / batchSize;
            using var keyMaskView = subg.View(srcSelfMask, dims: new long[] { batchSize, 1, seqLen, seqLen });
            expandedMask = subg.Expand(keyMaskView, dims: new long[] { batchSize, m_multiHeadNum, seqLen, seqLen });
        }

        IWeightTensor lastAttnProbs = null;
        for (int layer = 0; layer < m_encoders.Count; layer++)
        {
            (inputs, lastAttnProbs) = m_encoders[layer].Perform(inputs, expandedMask, batchSize, subg, outputAttenWeights: false);
            inputs = m_posFFNs[layer].Perform(inputs, batchSize, subg);
        }

        inputs = layerNorm.Norm(inputs, subg);

        // Detach the tensors from the sub-graph before it goes out of scope.
        inputs.UnbindFromComputeGraph();
        lastAttnProbs?.UnbindFromComputeGraph();

        expandedMask?.Dispose();
    }

    return inputs;
}
/// <summary>
/// Computes an attention context over the rows of <paramref name="input"/>
/// conditioned on <paramref name="state"/>.
/// </summary>
/// <param name="input">Matrix whose rows are scored and then weighted.</param>
/// <param name="state">State matrix, repeated to <c>input.Rows</c> rows before scoring.</param>
/// <param name="g">Compute graph on which every operation is recorded.</param>
/// <returns>The result of <c>sumColumns</c> over the softmax-weighted rows of <paramref name="input"/>.</returns>
public WeightMatrix Perform(WeightMatrix input, WeightMatrix state, IComputeGraph g)
{
    var stateRepeat = g.RepeatRows(state, input.Rows);

    // Append an extra single-column matrix to the input before projecting with Ua.
    // NOTE(review): the third constructor argument (1) is presumably the fill value,
    // making this a bias column - confirm against the WeightMatrix constructor.
    var inputBias = new WeightMatrix(input.Rows, 1, 1);
    var inputWithBias = g.concatColumns(input, inputBias);
    var uh = g.mul(inputWithBias, Ua);

    // Same treatment for the repeated state before projecting with Wa.
    var stateBias = new WeightMatrix(stateRepeat.Rows, 1, 1);
    stateRepeat = g.concatColumns(stateRepeat, stateBias);
    var wc = g.mul(stateRepeat, Wa);

    // Score each row: softmax(tanh(uh + wc) * V).
    var gg = g.addtanh(uh, wc);
    var aa = g.mul(gg, V);
    var res = g.Softmax(aa);

    var weighted = g.weightRows(input, res);
    return g.sumColumns(weighted);
}
/// <summary>
/// Builds a [batch, paddedLength, paddedLength] mask that is 0 inside each
/// sentence's original-length square and -1e30 everywhere else.
/// </summary>
/// <param name="g">Compute graph; not used by this method.</param>
/// <param name="paddedLength">Padded sequence length (both mask dimensions).</param>
/// <param name="originalLengths">Original length of each sentence; one entry per batch item.</param>
/// <param name="deviceId">Device id passed to the tensor constructor and used in the tensor name.</param>
/// <returns>A new tensor holding the mask values.</returns>
public static IWeightTensor BuildPadSelfMask(IComputeGraph g, int paddedLength, List<int> originalLengths, int deviceId)
{
    int batchCount = originalLengths.Count;
    int sliceSize = paddedLength * paddedLength;
    var buf = new float[batchCount * sliceSize];

    // Start with every position masked out.
    for (int idx = 0; idx < buf.Length; idx++)
    {
        buf[idx] = -1e30f;
    }

    // Open up the top-left length x length square for every batch item.
    for (int k = 0; k < batchCount; k++)
    {
        int sliceStart = k * sliceSize;
        for (int row = 0; row < originalLengths[k]; row++)
        {
            int rowStart = sliceStart + row * paddedLength;
            for (int col = 0; col < originalLengths[k]; col++)
            {
                buf[rowStart + col] = 0.0f;
            }
        }
    }

    // NOTE(review): the tensor is named "TriMask_..." although this mask is not triangular.
    var tensor = new WeightTensor(new long[] { batchCount, paddedLength, paddedLength }, 0.0f, deviceId, $"TriMask_{deviceId}", false);
    tensor.SetWeightArray(buf);
    return tensor;
}
/// <summary>
/// Builds a [batch, tgtPaddedLength, srcPaddedLength] mask that is 0 where both the
/// target row and the source column fall inside the original lengths, and
/// -99999999 everywhere else.
/// </summary>
/// <param name="g">Compute graph; not used by this method.</param>
/// <param name="srcPaddedLength">Padded source length (mask columns).</param>
/// <param name="tgtPaddedLength">Padded target length (mask rows).</param>
/// <param name="tgtOriginalLengths">Original target length per batch item; its count defines the batch size.</param>
/// <param name="srcOriginalLengths">Original source length per batch item.</param>
/// <param name="deviceId">Device id passed to the tensor constructor and used in the tensor name.</param>
/// <returns>A new, non-trainable tensor holding the mask values.</returns>
public static IWeightTensor BuildSrcTgtMask(IComputeGraph g, int srcPaddedLength, int tgtPaddedLength, List<int> tgtOriginalLengths, List<int> srcOriginalLengths, int deviceId)
{
    int batchCount = tgtOriginalLengths.Count;
    var buf = new float[batchCount * tgtPaddedLength * srcPaddedLength];
    Array.Fill(buf, -99999999.0f);

    for (int batch = 0; batch < batchCount; batch++)
    {
        int sliceStart = batch * (tgtPaddedLength * srcPaddedLength);
        for (int tgtPos = 0; tgtPos < tgtOriginalLengths[batch]; tgtPos++)
        {
            int rowStart = sliceStart + tgtPos * srcPaddedLength;
            for (int srcPos = 0; srcPos < srcOriginalLengths[batch]; srcPos++)
            {
                buf[rowStart + srcPos] = 0.0f;
            }
        }
    }

    var tensor = new WeightTensor(new long[] { batchCount, tgtPaddedLength, srcPaddedLength }, deviceId, $"SrcTgtMask_{deviceId}", isTrainable: false);
    tensor.SetWeightArray(buf);
    return tensor;
}
/// <summary>
/// Applies layer normalisation to <paramref name="input"/>, using <c>alpha</c> and
/// <c>beta</c> repeated to one row per input row.
/// </summary>
/// <param name="input">Matrix to normalise.</param>
/// <param name="innerGraph">Compute graph on which the operations are recorded.</param>
/// <returns>The result of <c>LayerNorm</c> on the graph.</returns>
public IWeightMatrix Process(IWeightMatrix input, IComputeGraph innerGraph)
{
    var rowCount = input.Rows;
    var repeatedAlpha = innerGraph.RepeatRows(alpha, rowCount);
    var repeatedBeta = innerGraph.RepeatRows(beta, rowCount);

    return innerGraph.LayerNorm(input, repeatedAlpha, repeatedBeta);
}
/// <summary>
/// Looks up the embedding of every source token (batch-first order), concatenates
/// the rows and feeds them to the encoder.
/// </summary>
/// <param name="g">Compute graph on which the operations are recorded.</param>
/// <param name="srcSnts">Tokenised source sentences; the first sentence's length is used for all of them.</param>
/// <param name="encoder">Encoder that consumes the concatenated embeddings.</param>
/// <param name="Embedding">Source embedding table that rows are peeked from.</param>
/// <param name="srcSelfMask">Self-attention mask forwarded to the encoder.</param>
/// <param name="posEmbedding">Position embedding; only used when the encoder type is Transformer.</param>
/// <param name="originalSrcLengths">Original length per sentence; positions at or beyond it are peeked with gradients disabled.</param>
/// <returns>The encoder output.</returns>
private IWeightTensor Encode(IComputeGraph g, List<List<string>> srcSnts, IEncoder encoder, IWeightTensor Embedding, IWeightTensor srcSelfMask, IWeightTensor posEmbedding, List<int> originalSrcLengths)
{
    var batchSize = srcSnts.Count;
    var seqLen = srcSnts[0].Count;
    var embeddingRows = new List<IWeightTensor>();

    // Batch-first: all tokens of sentence 0, then all tokens of sentence 1, ...
    for (var sentenceIdx = 0; sentenceIdx < batchSize; sentenceIdx++)
    {
        var realLength = originalSrcLengths[sentenceIdx];
        for (var tokenIdx = 0; tokenIdx < seqLen; tokenIdx++)
        {
            var wordId = this.m_modelMetaData.Vocab.GetSourceWordIndex(srcSnts[sentenceIdx][tokenIdx], true);

            // Only tokens inside the original sentence take part in gradient updates.
            var row = g.PeekRow(Embedding, wordId, runGradients: tokenIdx < realLength);
            embeddingRows.Add(row);
        }
    }

    var inputEmbs = g.ConcatRows(embeddingRows);
    if (this.m_modelMetaData.EncoderType == EncoderTypeEnums.Transformer)
    {
        inputEmbs = this.AddPositionEmbedding(g, posEmbedding, batchSize, seqLen, inputEmbs);
    }

    return encoder.Encode(inputEmbs, batchSize, g, srcSelfMask);
}
/// <summary>
/// Pads the input sentences, looks up the embedding of every token in time-first
/// order (position 0 of every sentence, then position 1, ...) and feeds the
/// concatenated rows to the encoder.
/// </summary>
/// <param name="g">Compute graph on which the operations are recorded.</param>
/// <param name="inputSentences">Tokenised input sentences; padded in place by <c>PadSentences</c>.</param>
/// <param name="encoder">Encoder that consumes the concatenated embeddings.</param>
/// <param name="Embedding">Source embedding table that rows are peeked from.</param>
/// <returns>The encoder output.</returns>
private IWeightTensor Encode(IComputeGraph g, List<List<string>> inputSentences, IEncoder encoder, IWeightTensor Embedding)
{
    PadSentences(inputSentences);

    int seqLen = inputSentences[0].Count;
    List<IWeightTensor> forwardInput = new List<IWeightTensor>();
    for (int i = 0; i < seqLen; i++)
    {
        for (int j = 0; j < inputSentences.Count; j++)
        {
            string word = inputSentences[j][i];

            // Single dictionary lookup; unknown words fall back to UNK and are logged.
            int ix_source;
            if (m_srcWordToIndex.TryGetValue(word, out var knownIndex))
            {
                ix_source = knownIndex;
            }
            else
            {
                ix_source = (int)SENTTAGS.UNK;
                Logger.WriteLine($"'{word}' is an unknown word.");
            }

            forwardInput.Add(g.PeekRow(Embedding, ix_source));
        }
    }

    var forwardInputsM = g.ConcatRows(forwardInput);
    return encoder.Encode(forwardInputsM, g);
}
/// <summary>
/// Builds a [batch, paddedLength, paddedLength] mask that is 0 on and below the
/// diagonal for rows inside each sentence's original length, and -1e9 everywhere else.
/// </summary>
/// <param name="g">Compute graph; not used by this method.</param>
/// <param name="paddedLength">Padded sequence length (both mask dimensions).</param>
/// <param name="originalLengths">Original length of each sentence; one entry per batch item.</param>
/// <param name="deviceId">Device id passed to the tensor constructor and used in the tensor name.</param>
/// <returns>A new, non-trainable tensor holding the mask values.</returns>
public static IWeightTensor BuildPadSelfTriMask(IComputeGraph g, int paddedLength, List<int> originalLengths, int deviceId)
{
    int batchCount = originalLengths.Count;
    int sliceSize = paddedLength * paddedLength;
    var buf = new float[batchCount * sliceSize];

    // Start with every position masked out.
    for (int idx = 0; idx < buf.Length; idx++)
    {
        buf[idx] = -1e9f;
    }

    for (int k = 0; k < batchCount; k++)
    {
        int sliceStart = k * sliceSize;
        for (int row = 0; row < originalLengths[k]; row++)
        {
            int rowStart = sliceStart + row * paddedLength;

            // Row `row` may look at columns 0..row; row < original length keeps col in range.
            for (int col = 0; col <= row; col++)
            {
                buf[rowStart + col] = 0.0f;
            }
        }
    }

    var tensor = new WeightTensor(new long[] { batchCount, paddedLength, paddedLength }, 0.0f, deviceId, $"TriMask_{deviceId}", isTrainable: false);
    tensor.SetWeightArray(buf);
    return tensor;
}
/// <summary>
/// Advances the layer-normalised LSTM cell by one step, updating
/// <c>m_cell</c> and <c>m_hidden</c>.
/// </summary>
/// <param name="input">Input tensor for this step.</param>
/// <param name="g">Parent compute graph; intermediate operations run on a sub-graph created from it.</param>
/// <returns>The new hidden state (the same instance stored in <c>m_hidden</c>).</returns>
public IWeightTensor Step(IWeightTensor input, IComputeGraph g)
{
    using (var innerGraph = g.CreateSubGraph(this.m_name))
    {
        var previousHidden = this.m_hidden;
        var previousCell = this.m_cell;

        var joined = innerGraph.ConcatColumns(input, previousHidden);
        var projected = innerGraph.Affine(joined, this.m_Wxh, this.m_b);
        var normalised = this.m_layerNorm1.Norm(projected, innerGraph);

        // The first 3 * hdim columns feed the sigmoid gates, the last hdim columns the candidate.
        var (rawGates, rawCandidate) = innerGraph.SplitColumns(normalised, this.m_hdim * 3, this.m_hdim);
        var squashedGates = innerGraph.Sigmoid(rawGates);
        var candidate = innerGraph.Tanh(rawCandidate);
        var (inGate, keepGate, outGate) = innerGraph.SplitColumns(squashedGates, this.m_hdim, this.m_hdim, this.m_hdim);

        // cell = keepGate * previousCell + inGate * candidate.
        // Recorded on the parent graph g rather than the sub-graph
        // (presumably so the state survives disposal of the sub-graph - confirm).
        this.m_cell = g.EltMulMulAdd(keepGate, previousCell, inGate, candidate);
        var normalisedCell = this.m_layerNorm2.Norm(this.m_cell, innerGraph);

        // The hidden state is likewise produced on the parent graph.
        this.m_hidden = g.EltMul(outGate, innerGraph.Tanh(normalisedCell));
        return this.m_hidden;
    }
}
/// <summary>
/// Advances the LSTM cell by one step and stores the new cell and hidden
/// state in <c>ct</c> and <c>ht</c>.
/// </summary>
/// <param name="input">Input for this step; its row count sizes the repeated bias.</param>
/// <param name="innerGraph">Compute graph on which every operation is recorded.</param>
/// <returns>The new hidden state (the same instance stored in <c>ht</c>).</returns>
public IWeightMatrix Step(IWeightMatrix input, IComputeGraph innerGraph)
{
    var previousHidden = ht;
    var previousCell = ct;

    // Project [input, previous hidden] and add a row-repeated bias.
    var joined = innerGraph.ConcatColumns(input, previousHidden);
    var repeatedBias = innerGraph.RepeatRows(b, input.Rows);
    var projected = innerGraph.MulAdd(joined, Wxh, repeatedBias);

    // The first 3 * hdim columns feed the sigmoid gates, the last hdim columns the candidate.
    (var rawGates, var rawCandidate) = innerGraph.SplitColumns(projected, hdim * 3, hdim);
    var squashedGates = innerGraph.Sigmoid(rawGates);
    var candidate = innerGraph.Tanh(rawCandidate);
    (var inGate, var keepGate, var outGate) = innerGraph.SplitColumns(squashedGates, hdim, hdim, hdim);

    // New cell = kept part of the old cell + gated candidate.
    var keptPart = innerGraph.EltMul(keepGate, previousCell);
    var writtenPart = innerGraph.EltMul(inGate, candidate);
    ct = innerGraph.Add(keptPart, writtenPart);

    // Hidden state is the output gate applied to the squashed cell.
    ht = innerGraph.EltMul(outGate, innerGraph.Tanh(ct));
    return ht;
}
/// <summary>
/// Applies the affine projection (<c>m_Whd</c>, <c>m_Bd</c>) followed by in-place dropout.
/// </summary>
/// <param name="inputT">Tensor to project.</param>
/// <param name="batchSize">Batch size forwarded to <c>Dropout</c>.</param>
/// <param name="graph">Parent compute graph from which a sub-graph is created.</param>
/// <returns>The dropout result produced on the sub-graph.</returns>
public IWeightTensor Process(IWeightTensor inputT, int batchSize, IComputeGraph graph)
{
    // NOTE(review): the sub-graph is not disposed here; the returned tensor is produced on it.
    var subGraph = graph.CreateSubGraph(m_name);

    var projected = subGraph.Affine(inputT, m_Whd, m_Bd);
    var dropped = subGraph.Dropout(projected, batchSize, m_dropoutRatio, inPlace: true);
    return dropped;
}
/// <summary>
/// Advances the attention LSTM cell by one step, updating <c>Cell</c> and <c>Hidden</c>.
/// </summary>
/// <param name="context">Attention context concatenated with the input and previous hidden state.</param>
/// <param name="input">Input tensor for this step.</param>
/// <param name="g">Parent compute graph; all operations run on a sub-graph created from it.</param>
/// <returns>The new hidden state (the same instance stored in <c>Hidden</c>).</returns>
public IWeightTensor Step(IWeightTensor context, IWeightTensor input, IComputeGraph g)
{
    // NOTE(review): the sub-graph is not disposed here; Cell and Hidden are produced on it.
    var subGraph = g.CreateSubGraph(m_name);

    var previousCell = Cell;
    var previousHidden = Hidden;

    var joined = subGraph.ConcatColumns(input, previousHidden, context);
    var projected = subGraph.Affine(joined, m_Wxhc, m_b);
    var normalised = layerNorm1.Process(projected, subGraph);

    // The first 3 * hdim columns feed the sigmoid gates, the last hdim columns the candidate.
    (var rawGates, var rawCandidate) = subGraph.SplitColumns(normalised, m_hdim * 3, m_hdim);
    var squashedGates = subGraph.Sigmoid(rawGates);
    var candidate = subGraph.Tanh(rawCandidate);
    (var inGate, var keepGate, var outGate) = subGraph.SplitColumns(squashedGates, m_hdim, m_hdim, m_hdim);

    // Cell = keepGate * previousCell + inGate * candidate
    Cell = subGraph.EltMulMulAdd(keepGate, previousCell, inGate, candidate);
    var normalisedCell = layerNorm2.Process(Cell, subGraph);

    Hidden = subGraph.EltMul(outGate, subGraph.Tanh(normalisedCell));
    return Hidden;
}
/// <summary>
/// Runs the network forward on one batch per device in parallel, scores every
/// hypothesis against its reference with each metric, and optionally collects
/// the source, reference and hypothesis sentences as strings.
/// </summary>
/// <param name="RunNetwork">Forward function; receives the graph, source tokens, hypothesis tokens, device index and <c>false</c>.</param>
/// <param name="metrics">Metrics updated with every (reference, hypothesis) pair.</param>
/// <param name="outputToFile">When true, the joined sentences are appended to the three output lists.</param>
/// <param name="srcSents">Receives the space-joined source sentences.</param>
/// <param name="refSents">Receives the space-joined reference sentences.</param>
/// <param name="hypSents">Receives the space-joined hypothesis sentences.</param>
/// <param name="sntPairBatchs">Batches indexed by device; entry <c>i</c> runs on device index <c>i</c>.</param>
/// <exception cref="InvalidDataException">There are more hypotheses than references after the forward pass.</exception>
private void RunValidParallel(Func<IComputeGraph, List<List<string>>, List<List<string>>, int, bool, float> RunNetwork, List<IMetric> metrics, bool outputToFile, List<string> srcSents, List<string> refSents, List<string> hypSents, List<SntPairBatch> sntPairBatchs)
{
    // Run forward on all available processors
    Parallel.For(0, m_deviceIds.Length, i =>
    {
        SntPairBatch sntPairBatch = sntPairBatchs[i];

        // Construct sentences for encoding and decoding
        List<List<string>> srcTkns = new List<List<string>>();
        List<List<string>> refTkns = new List<List<string>>();
        List<List<string>> hypTkns = new List<List<string>>();
        for (int j = 0; j < sntPairBatch.BatchSize; j++)
        {
            srcTkns.Add(sntPairBatch.SntPairs[j].SrcSnt.ToList());
            refTkns.Add(sntPairBatch.SntPairs[j].TgtSnt.ToList());
            hypTkns.Add(new List<string>() { ParallelCorpus.BOS });
        }

        // Create a new computing graph instance
        using (IComputeGraph computeGraph = CreateComputGraph(i, needBack: false))
        {
            // Run forward part
            RunNetwork(computeGraph, srcTkns, hypTkns, i, false);
        }

        // Metrics and output lists are shared between the parallel workers.
        lock (locker)
        {
            for (int j = 0; j < hypTkns.Count; j++)
            {
                // Checked once per hypothesis rather than once per metric. The loop bound
                // already keeps j inside hypTkns, so only the reference side needs checking.
                if (j >= refTkns.Count)
                {
                    throw new InvalidDataException($"Ref token only has '{refTkns.Count}' batch, however, it try to access batch '{j}'. Hyp token has '{hypTkns.Count}' tokens, Batch Size = '{sntPairBatch.BatchSize}'");
                }

                foreach (IMetric metric in metrics)
                {
                    metric.Evaluate(new List<List<string>>() { refTkns[j] }, hypTkns[j]);
                }
            }

            if (outputToFile)
            {
                for (int j = 0; j < srcTkns.Count; j++)
                {
                    srcSents.Add(string.Join(" ", srcTkns[j]));
                    refSents.Add(string.Join(" ", refTkns[j]));
                    hypSents.Add(string.Join(" ", hypTkns[j]));
                }
            }
        }
    });
}
/// <summary>
/// Runs the stacked bidirectional encoder: at every layer the forward encoder reads
/// the sequence left to right, the backward encoder right to left, and the two
/// outputs for each position are concatenated column-wise to form the next layer's input.
/// </summary>
/// <param name="inputs">Input sequence, one matrix per position. The list itself is not modified.</param>
/// <param name="g">Compute graph on which the operations are recorded.</param>
/// <returns>The outputs of the last layer, one matrix per position (a copy of the inputs when <c>depth</c> is 0).</returns>
public List<IWeightMatrix> Encode(List<IWeightMatrix> inputs, IComputeGraph g)
{
    List<IWeightMatrix> layerOutputs = inputs.ToList();
    int seqLen = inputs.Count;

    for (int i = 0; i < depth; i++)
    {
        // Fresh lists per layer. Sharing them across layers made every layer after the
        // first pair position j with the forward output of layer 0 instead of its own.
        var forwardOutputs = new List<IWeightMatrix>(seqLen);
        var backwardOutputs = new List<IWeightMatrix>(seqLen);

        for (int j = 0; j < seqLen; j++)
        {
            forwardOutputs.Add(forwardEncoders[i].Step(layerOutputs[j], g));
            backwardOutputs.Add(backwardEncoders[i].Step(layerOutputs[seqLen - j - 1], g));
        }

        // The backward encoder produced its outputs last-position-first; realign them.
        backwardOutputs.Reverse();

        layerOutputs.Clear();
        for (int j = 0; j < seqLen; j++)
        {
            layerOutputs.Add(g.ConcatColumns(forwardOutputs[j], backwardOutputs[j]));
        }
    }

    return layerOutputs;
}
/// <summary>
/// Advances the attention LSTM cell by one step and stores the new cell and
/// hidden state in <c>ct</c> and <c>ht</c>.
/// </summary>
/// <param name="context">Attention context concatenated with the input and previous hidden state.</param>
/// <param name="input">Input for this step; its row count sizes the repeated bias.</param>
/// <param name="computeGraph">Compute graph on which every operation is recorded.</param>
/// <returns>The new hidden state (the same instance stored in <c>ht</c>).</returns>
public IWeightMatrix Step(IWeightMatrix context, IWeightMatrix input, IComputeGraph computeGraph)
{
    var previousCell = ct;
    var previousHidden = ht;

    // Project [input, previous hidden, context] and add a row-repeated bias.
    var joined = computeGraph.ConcatColumns(input, previousHidden, context);
    var repeatedBias = computeGraph.RepeatRows(b, input.Rows);
    var projected = computeGraph.MulAdd(joined, Wxhc, repeatedBias);

    // The first 3 * hdim columns feed the sigmoid gates, the last hdim columns the candidate.
    (var rawGates, var rawCandidate) = computeGraph.SplitColumns(projected, hdim * 3, hdim);
    var squashedGates = computeGraph.Sigmoid(rawGates);
    var candidate = computeGraph.Tanh(rawCandidate);
    (var inGate, var keepGate, var outGate) = computeGraph.SplitColumns(squashedGates, hdim, hdim, hdim);

    // ct = keepGate * previousCell + inGate * candidate, as a single fused graph operation.
    ct = computeGraph.EltMulMulAdd(keepGate, previousCell, inGate, candidate);

    ht = computeGraph.EltMul(outGate, computeGraph.Tanh(ct));
    return ht;
}
/// <summary>
/// Advances the layer-normalised LSTM cell by one step, updating
/// <c>m_cell</c> and <c>m_hidden</c>.
/// </summary>
/// <param name="input">Input tensor for this step.</param>
/// <param name="g">Parent compute graph; intermediate operations run on a sub-graph created from it.</param>
/// <returns>The new hidden state (the same instance stored in <c>m_hidden</c>).</returns>
public IWeightTensor Step(IWeightTensor input, IComputeGraph g)
{
    using (IComputeGraph innerGraph = g.CreateSubGraph(m_name))
    {
        var previousHidden = m_hidden;
        var previousCell = m_cell;

        // Concatenate along dimension 1, project, then normalise.
        var joined = innerGraph.Concate(1, input, previousHidden);
        var projected = innerGraph.Affine(joined, m_Wxh, m_b);
        var normalised = m_layerNorm1.Norm(projected, innerGraph);

        // The first 3 * hdim columns feed the sigmoid gates, the last hdim columns the candidate.
        var (rawGates, rawCandidate) = innerGraph.SplitColumns(normalised, m_hdim * 3, m_hdim);
        var squashedGates = innerGraph.Sigmoid(rawGates);
        var candidate = innerGraph.Tanh(rawCandidate);
        var (inGate, keepGate, outGate) = innerGraph.SplitColumns(squashedGates, m_hdim, m_hdim, m_hdim);

        // cell = keepGate * previousCell + inGate * candidate.
        // Recorded on the parent graph g rather than the sub-graph
        // (presumably so the state survives disposal of the sub-graph - confirm).
        m_cell = g.EltMulMulAdd(keepGate, previousCell, inGate, candidate);
        var normalisedCell = m_layerNorm2.Norm(m_cell, innerGraph);

        // The hidden state is likewise produced on the parent graph.
        m_hidden = g.EltMul(outGate, innerGraph.Tanh(normalisedCell));
        return m_hidden;
    }
}
/// <summary>
/// Adds position information to the input, applies dropout and runs the encoder layers.
/// </summary>
/// <param name="rawInput">Input tensor; <c>rawInput.Rows / batchSize</c> is used as the sequence length.</param>
/// <param name="batchSize">Number of sequences in the batch.</param>
/// <param name="g">Compute graph on which every operation is recorded.</param>
/// <returns>The encoder output after the final <c>TransposeBatch</c>.</returns>
public IWeightTensor Encode(IWeightTensor rawInput, int batchSize, IComputeGraph g)
{
    int seqLen = rawInput.Rows / batchSize;
    float inputScale = (float)Math.Sqrt(m_inputDim);

    IWeightTensor positions = g.BuildPositionMatrix(seqLen, m_inputDim);
    IWeightTensor repeatedPositions = g.RepeatRows(positions, batchSize, runGradient: false);

    // Switch to batch-first layout before combining with the position matrix.
    IWeightTensor hidden = g.TransposeBatch(rawInput, batchSize);
    hidden = g.AddMul(repeatedPositions, hidden, inputScale, runGradientW1: false, runGradientW2: true);

    // Position tensors receive no gradient, so they are released right away to save memory.
    repeatedPositions.Dispose();
    positions.Dispose();

    hidden = g.Dropout(hidden, batchSize, m_dropoutRatio, inPlace: true);

    for (int layer = 0; layer < m_encoders.Count; layer++)
    {
        hidden = m_encoders[layer].Perform(hidden, batchSize, g);
    }

    // Switch back to time-first layout (note the second argument is seqLen here).
    return g.TransposeBatch(hidden, seqLen);
}
/// <summary>
/// Advances the layer-normalised attention LSTM cell by one step, updating
/// <c>Cell</c> and <c>Hidden</c>.
/// </summary>
/// <param name="context">Attention context concatenated with the input and previous hidden state.</param>
/// <param name="input">Input tensor for this step.</param>
/// <param name="g">Parent compute graph; intermediate operations run on a sub-graph created from it.</param>
/// <returns>The new hidden state (the same instance stored in <c>Hidden</c>).</returns>
public IWeightTensor Step(IWeightTensor context, IWeightTensor input, IComputeGraph g)
{
    using (var subGraph = g.CreateSubGraph(this.m_name))
    {
        var previousCell = this.Cell;
        var previousHidden = this.Hidden;

        var joined = subGraph.ConcatColumns(input, previousHidden, context);
        var projected = subGraph.Affine(joined, this.m_Wxhc, this.m_b);
        var normalised = this.m_layerNorm1.Norm(projected, subGraph);

        // The first 3 * hiddenDim columns feed the sigmoid gates, the last hiddenDim columns the candidate.
        var (rawGates, rawCandidate) = subGraph.SplitColumns(normalised, this.m_hiddenDim * 3, this.m_hiddenDim);
        var squashedGates = subGraph.Sigmoid(rawGates);
        var candidate = subGraph.Tanh(rawCandidate);
        var (inGate, keepGate, outGate) = subGraph.SplitColumns(squashedGates, this.m_hiddenDim, this.m_hiddenDim, this.m_hiddenDim);

        // Cell = keepGate * previousCell + inGate * candidate.
        // Recorded on the parent graph g rather than the sub-graph
        // (presumably so the state survives disposal of the sub-graph - confirm).
        this.Cell = g.EltMulMulAdd(keepGate, previousCell, inGate, candidate);
        var normalisedCell = this.m_layerNorm2.Norm(this.Cell, subGraph);

        // The hidden state is likewise produced on the parent graph.
        this.Hidden = g.EltMul(outGate, subGraph.Tanh(normalisedCell));
        return this.Hidden;
    }
}
/// <summary>
/// Builds one encoder/decoder forward pass on a visualisation-enabled compute graph
/// and writes the resulting network picture to a file.
/// </summary>
/// <param name="visNNFilePath">Path handed to <c>VisualizeNeuralNetToFile</c>.</param>
/// <exception cref="System.InvalidOperationException">The configured decoder is not an <c>AttentionDecoder</c>.</exception>
public void VisualizeNeuralNetwork(string visNNFilePath)
{
    (IEncoder encoder, IDecoder decoder, IWeightTensor srcEmbedding, IWeightTensor tgtEmbedding) = GetNetworksOnDeviceAt(-1);

    // Fail with a clear message instead of a NullReferenceException further down
    // when the decoder is of a different kind.
    if (!(decoder is AttentionDecoder rnnDecoder))
    {
        throw new System.InvalidOperationException($"Network visualization requires an AttentionDecoder, but the decoder is '{decoder?.GetType().Name ?? "null"}'.");
    }

    // Build input sentence
    List<List<string>> inputSeqs = ParallelCorpus.ConstructInputTokens(null);
    int batchSize = inputSeqs.Count;

    // Dispose the graph once the picture has been written, as the other graph users in this file do.
    using (IComputeGraph g = CreateComputGraph(m_defaultDeviceId, needBack: false, visNetwork: true))
    {
        encoder.Reset(g.GetWeightFactory(), batchSize);
        rnnDecoder.Reset(g.GetWeightFactory(), batchSize);

        // Run encoder
        IWeightTensor encodedWeightMatrix = Encode(g, inputSeqs, encoder, srcEmbedding, null, null);

        // Prepare for attention over encoder-decoder
        AttentionPreProcessResult attPreProcessResult = rnnDecoder.PreProcess(encodedWeightMatrix, batchSize, g);

        // Run decoder
        IWeightTensor x = g.PeekRow(tgtEmbedding, (int)SENTTAGS.START);
        IWeightTensor eOutput = rnnDecoder.Decode(x, attPreProcessResult, batchSize, g);

        // The result is not needed here; the call is kept so the softmax is recorded on the graph.
        g.Softmax(eOutput);

        g.VisualizeNeuralNetToFile(visNNFilePath);
    }
}
/// <summary>
/// Advances the layer-normalised attention LSTM cell by one step, updating
/// <c>Cell</c> and <c>Hidden</c>.
/// </summary>
/// <param name="context">Attention context concatenated with the input and previous hidden state.</param>
/// <param name="input">Input tensor for this step.</param>
/// <param name="g">Parent compute graph; intermediate operations run on a sub-graph created from it.</param>
/// <returns>The new hidden state (the same instance stored in <c>Hidden</c>).</returns>
public IWeightTensor Step(IWeightTensor context, IWeightTensor input, IComputeGraph g)
{
    using (IComputeGraph subGraph = g.CreateSubGraph(m_name))
    {
        var previousCell = Cell;
        var previousHidden = Hidden;

        // Concatenate along dimension 1, project, then normalise.
        var joined = subGraph.Concate(1, input, previousHidden, context);
        var projected = subGraph.Affine(joined, m_Wxhc, m_b);
        var normalised = m_layerNorm1.Norm(projected, subGraph);

        // The first 3 * hiddenDim columns feed the sigmoid gates, the last hiddenDim columns the candidate.
        var (rawGates, rawCandidate) = subGraph.SplitColumns(normalised, m_hiddenDim * 3, m_hiddenDim);
        var squashedGates = subGraph.Sigmoid(rawGates);
        var candidate = subGraph.Tanh(rawCandidate);
        var (inGate, keepGate, outGate) = subGraph.SplitColumns(squashedGates, m_hiddenDim, m_hiddenDim, m_hiddenDim);

        // Cell = keepGate * previousCell + inGate * candidate.
        // Recorded on the parent graph g rather than the sub-graph
        // (presumably so the state survives disposal of the sub-graph - confirm).
        Cell = g.EltMulMulAdd(keepGate, previousCell, inGate, candidate);
        var normalisedCell = m_layerNorm2.Norm(Cell, subGraph);

        // The hidden state is likewise produced on the parent graph.
        Hidden = g.EltMul(outGate, subGraph.Tanh(normalisedCell));
        return Hidden;
    }
}
/// <summary>
/// Computes the attention context for the current decoder state over the
/// pre-processed encoder outputs, optionally updating the coverage model.
/// </summary>
/// <param name="state">Decoder state; viewed as [batchSize, 1, state.Columns] inside.</param>
/// <param name="attenPreProcessResult">Pre-processed encoder outputs (<c>uhs</c> and <c>inputsBatchFirst</c>).</param>
/// <param name="batchSize">Number of sequences in the batch.</param>
/// <param name="graph">Parent compute graph; most operations run on a sub-graph created from it.</param>
/// <returns>The batched product of the attention weights and the encoder outputs, produced on the parent graph.</returns>
public IWeightTensor Perform(IWeightTensor state, AttentionPreProcessResult attenPreProcessResult, int batchSize, IComputeGraph graph)
{
    var encoderOutputs = attenPreProcessResult.inputsBatchFirst;
    int srcSeqLen = encoderOutputs.Rows / batchSize;

    using (IComputeGraph g = graph.CreateSubGraph(m_name))
    {
        // Project the decoder state and expand it to one copy per source position.
        var projectedState = g.Affine(state, m_Wa, m_bWa);
        var projectedState3d = g.View(projectedState, batchSize, 1, projectedState.Columns);
        var expandedState = g.Expand(projectedState3d, batchSize, srcSeqLen, projectedState.Columns);

        IWeightTensor energies;
        if (m_enableCoverageModel)
        {
            // Include the coverage model state from the previous step.
            var projectedCoverage = g.Affine(m_coverage.Hidden, m_Wc, m_bWc);
            var projectedCoverage3d = g.View(projectedCoverage, batchSize, srcSeqLen, -1);
            energies = g.AddTanh(attenPreProcessResult.uhs, expandedState, projectedCoverage3d);
        }
        else
        {
            energies = g.AddTanh(attenPreProcessResult.uhs, expandedState);
        }

        // Score every source position, then softmax across each sentence.
        var flatEnergies = g.View(energies, batchSize * srcSeqLen, -1);
        var scores = g.Mul(flatEnergies, m_V);
        var scoresT = g.Transpose(scores);
        var scoresPerSentence = g.View(scoresT, batchSize, srcSeqLen);
        var weights2d = g.Softmax(scoresPerSentence, inPlace: true);
        var weights3d = g.View(weights2d, batchSize, 1, srcSeqLen);

        var encoderOutputs3d = g.View(encoderOutputs, batchSize, srcSeqLen, encoderOutputs.Columns);

        // Produced on the parent graph, not the sub-graph being disposed on exit.
        IWeightTensor contexts = graph.MulBatch(weights3d, encoderOutputs3d, batchSize);

        if (m_enableCoverageModel)
        {
            // Feed [attention weight, encoder output, decoder state] per source position to the coverage model.
            var weightColumn = g.View(weights2d, encoderOutputs.Rows, 1);
            var state3d = g.View(state, batchSize, 1, state.Columns);
            var stateExpanded = g.Expand(state3d, batchSize, srcSeqLen, state.Columns);
            var stateFlat = g.View(stateExpanded, batchSize * srcSeqLen, -1);
            var coverageInput = g.ConcatColumns(weightColumn, encoderOutputs, stateFlat);
            m_coverage.Step(coverageInput, graph);
        }

        return contexts;
    }
}
/// <summary>
/// Runs the forward pass on a single device: encodes the source sentences, applies
/// the feed-forward layer and softmax, then either computes the cross-entropy cost
/// (training) or overwrites the target sentences with the arg-max labels (inference).
/// </summary>
/// <param name="g">The computing graph for the current device.</param>
/// <param name="sntPairBatch">Batch providing source and target tokens (group 0 is used).</param>
/// <param name="deviceIdIdx">The index of the current device.</param>
/// <param name="isTraining">True to compute the cost; false to generate output labels.</param>
/// <param name="decodingOptions">Not used by this method.</param>
/// <returns>A list holding one <c>NetworkResult</c> with the cost and the target sentences.</returns>
public override List<NetworkResult> RunForwardOnSingleDevice(IComputeGraph g, ISntPairBatch sntPairBatch, int deviceIdIdx, bool isTraining, DecodingOptions decodingOptions)
{
    var srcSnts = sntPairBatch.GetSrcTokens(0);
    var tgtSnts = sntPairBatch.GetTgtTokens(0);

    (IEncoder encoder, IWeightTensor srcEmbedding, IWeightTensor posEmbedding, FeedForwardLayer decoderFFLayer) = GetNetworksOnDeviceAt(deviceIdIdx);

    // Reset networks
    encoder.Reset(g.GetWeightFactory(), srcSnts.Count);

    // Pad both sides in place and map the tokens to vocabulary ids.
    var originalSrcLengths = BuildInTokens.PadSentences(srcSnts);
    var srcTokensList = m_modelMetaData.SrcVocab.GetWordIndex(srcSnts);
    BuildInTokens.PadSentences(tgtSnts);
    var tgtTokensLists = m_modelMetaData.ClsVocab.GetWordIndex(tgtSnts);

    int seqLen = srcSnts[0].Count;
    int batchSize = srcSnts.Count;

    // Encoding input source sentences
    IWeightTensor encOutput = Encoder.Run(g, sntPairBatch, encoder, m_modelMetaData, m_shuffleType, srcEmbedding, posEmbedding, null, srcTokensList, originalSrcLengths);
    IWeightTensor ffOutput = decoderFFLayer.Process(encOutput, batchSize, g);
    IWeightTensor probs = g.Softmax(ffOutput, inPlace: true);

    float cost = 0.0f;
    if (isTraining)
    {
        var tgtTokensTensor = g.CreateTokensTensor(tgtTokensLists);
        cost = g.CrossEntropyLoss(probs, tgtTokensTensor);
    }
    else
    {
        // Pick the most probable label per position and write it back, one seqLen-sized slice per sentence.
        using var targetIdxTensor = g.Argmax(probs, 1);
        float[] targetIdx = targetIdxTensor.ToWeightArray();
        List<string> targetWords = m_modelMetaData.ClsVocab.ConvertIdsToString(targetIdx.ToList());

        for (int sentence = 0; sentence < batchSize; sentence++)
        {
            tgtSnts[sentence] = targetWords.GetRange(sentence * seqLen, seqLen);
        }
    }

    var result = new NetworkResult
    {
        Cost = cost,
        Output = new List<List<List<string>>> { tgtSnts }
    };

    return new List<NetworkResult> { result };
}
/// <summary>
/// Applies layer normalisation to <paramref name="input"/> with <c>m_alpha</c> and
/// <c>m_beta</c>, passed to <c>LayerNorm</c> as they are (no row repetition).
/// </summary>
/// <param name="input">Tensor to normalise.</param>
/// <param name="g">Parent compute graph from which a sub-graph is created.</param>
/// <returns>The result of <c>LayerNorm</c> on the sub-graph.</returns>
public IWeightTensor Process(IWeightTensor input, IComputeGraph g)
{
    // NOTE(review): the sub-graph is not disposed here; the returned tensor is produced on it.
    var subGraph = g.CreateSubGraph(m_name);

    var normalised = subGraph.LayerNorm(input, m_alpha, m_beta);
    return normalised;
}
/// <summary>
/// Passes <paramref name="V"/> through every encoder in <c>encoders</c>, feeding
/// each encoder's step output into the next one.
/// </summary>
/// <param name="V">Input to the first encoder.</param>
/// <param name="g">Compute graph handed to every encoder step.</param>
/// <returns>The output of the last encoder, or <paramref name="V"/> unchanged when there are no encoders.</returns>
public WeightMatrix Encode(WeightMatrix V, IComputeGraph g)
{
    WeightMatrix current = V;
    foreach (var layer in encoders)
    {
        current = layer.Step(current, g);
    }

    return current;
}
/// <summary>
/// Resets the encoder for the given batch and delegates the actual encoding to <c>InnerRunner</c>.
/// </summary>
/// <param name="computeGraph">Compute graph used for the run; also supplies the weight factory for the reset.</param>
/// <param name="sntPairBatch">Not used by this method.</param>
/// <param name="encoder">Encoder to reset and run.</param>
/// <param name="modelMetaData">Model metadata forwarded to <c>InnerRunner</c>.</param>
/// <param name="shuffleType">Shuffle type forwarded to <c>InnerRunner</c>.</param>
/// <param name="srcEmbedding">Source embedding forwarded to <c>InnerRunner</c>.</param>
/// <param name="posEmbedding">Position embedding forwarded to <c>InnerRunner</c>.</param>
/// <param name="segmentEmbedding">Segment embedding forwarded to <c>InnerRunner</c>.</param>
/// <param name="srcSntsIds">Token ids per source sentence; its count is the batch size used for the reset.</param>
/// <param name="originalSrcLengths">Original source lengths forwarded to <c>InnerRunner</c>.</param>
/// <returns>The tensor returned by <c>InnerRunner</c>.</returns>
public static IWeightTensor Run(IComputeGraph computeGraph, ISntPairBatch sntPairBatch, IEncoder encoder, IModel modelMetaData, ShuffleEnums shuffleType, IWeightTensor srcEmbedding, IWeightTensor posEmbedding, IWeightTensor segmentEmbedding, List<List<int>> srcSntsIds, float[] originalSrcLengths)
{
    int batchSize = srcSntsIds.Count;

    // Reset networks
    encoder.Reset(computeGraph.GetWeightFactory(), batchSize);

    return InnerRunner(computeGraph, srcSntsIds, originalSrcLengths, shuffleType, encoder, modelMetaData, srcEmbedding, posEmbedding, segmentEmbedding);
}
/// <summary>
/// Passes <paramref name="V"/> through every encoder in <c>encoders</c>, feeding
/// each encoder's step output into the next one.
/// </summary>
/// <param name="V">Input to the first encoder.</param>
/// <param name="g">Compute graph handed to every encoder step.</param>
/// <returns>The output of the last encoder, or <paramref name="V"/> unchanged when there are no encoders.</returns>
public IWeightTensor Encode(IWeightTensor V, IComputeGraph g)
{
    IWeightTensor current = V;
    foreach (var layer in encoders)
    {
        current = layer.Step(current, g);
    }

    return current;
}
/// <summary>
/// Pre-computes the parts of the attention that do not depend on the decoder state:
/// the affine projection of the inputs (<c>uhs</c>) and the batch-transposed inputs.
/// </summary>
/// <param name="inputs">Tensor to project and transpose.</param>
/// <param name="batchSize">Batch size passed to <c>TransposeBatch</c>.</param>
/// <param name="graph">Parent compute graph from which a sub-graph is created.</param>
/// <returns>A result object holding both tensors.</returns>
public AttentionPreProcessResult PreProcess(IWeightTensor inputs, int batchSize, IComputeGraph graph)
{
    // NOTE(review): the sub-graph is not disposed here; both result tensors are produced on it.
    IComputeGraph subGraph = graph.CreateSubGraph(m_name + "_PreProcess");

    return new AttentionPreProcessResult
    {
        uhs = subGraph.Affine(inputs, m_Ua, m_bUa),
        inputs = subGraph.TransposeBatch(inputs, batchSize)
    };
}
/// <summary>
/// Runs the decoder layers (self-attention, encoder attention, position-wise
/// feed-forward) over the target inputs inside a dedicated sub-graph.
/// </summary>
/// <param name="tgtInputs">Target-side input tensor; <c>Rows / batchSize</c> is the query length.</param>
/// <param name="encOutputBatchFirst">Encoder output used as key and value; <c>Rows / batchSize</c> is the key length.</param>
/// <param name="tgtSelfMask">Optional self-attention mask, expanded across all heads when present.</param>
/// <param name="srcTgtMask">Optional encoder-attention mask, expanded across all heads when present.</param>
/// <param name="batchSize">Number of sequences in the batch.</param>
/// <param name="g">Parent compute graph from which the sub-graph is created.</param>
/// <param name="outputAttnWeights">When true, attention weights are requested from the last encoder-attention layer only.</param>
/// <param name="cachedTensors">Optional cache forwarded to the encoder-attention layers.</param>
/// <returns>The layer-normalised decoder output and the attention weights returned by the last encoder-attention layer.</returns>
public (IWeightTensor, IWeightTensor) Decode(IWeightTensor tgtInputs, IWeightTensor encOutputBatchFirst, IWeightTensor tgtSelfMask, IWeightTensor srcTgtMask, int batchSize, IComputeGraph g, bool outputAttnWeights = false, Dictionary<string, IWeightTensor> cachedTensors = null)
{
    IWeightTensor attnProbs = null;

    using (IComputeGraph subg = g.CreateSubGraph($"{m_name}_Decoder"))
    {
        int seqLenQ = tgtInputs.Rows / batchSize;

        // The key length must equal the value length.
        int seqLenK = encOutputBatchFirst.Rows / batchSize;

        IWeightTensor selfMask = null;
        if (tgtSelfMask != null)
        {
            selfMask = subg.Expand(tgtSelfMask, dims: new long[] { batchSize, m_multiHeadNum, seqLenQ, seqLenQ });
        }

        IWeightTensor crossMask = null;
        if (srcTgtMask != null)
        {
            crossMask = subg.Expand(srcTgtMask, dims: new long[] { batchSize, m_multiHeadNum, seqLenQ, seqLenK });
        }

        int layerCount = m_selfAttns.Count;
        for (int layer = 0; layer < layerCount; layer++)
        {
            // The self-attention weights are never used; the next call overwrites attnProbs anyway.
            (tgtInputs, _) = m_selfAttns[layer].Perform(tgtInputs, selfMask, batchSize, subg, outputAttenWeights: false);

            bool wantWeights = outputAttnWeights && layer == layerCount - 1;
            (tgtInputs, attnProbs) = m_encAttns[layer].Perform(tgtInputs, encOutputBatchFirst, encOutputBatchFirst, crossMask, batchSize, subg, outputAttenWeights: wantWeights, cachedTensors: cachedTensors);

            tgtInputs = m_posFFNs[layer].Perform(tgtInputs, batchSize, subg);
        }

        tgtInputs = layerNorm.Norm(tgtInputs, subg);

        // Detach the results from the sub-graph before it goes out of scope.
        tgtInputs.UnbindFromComputeGraph();
        attnProbs?.UnbindFromComputeGraph();

        selfMask?.Dispose();
        crossMask?.Dispose();
    }

    return (tgtInputs, attnProbs);
}
/// <summary>
/// Scaled multi-head self-attention with a residual connection to the input.
/// </summary>
/// <param name="inputQ">Input tensor; <c>Rows / batchSize</c> is the sequence length.</param>
/// <param name="keyMask">Optional mask added to the attention scores before the softmax.</param>
/// <param name="batchSize">Number of sequences in the batch.</param>
/// <param name="graph">Parent compute graph; the residual add and the averaged weights are produced on it.</param>
/// <param name="outputAttenWeights">When true, the attention probabilities averaged over all heads are returned as well.</param>
/// <returns>The attention output added to <paramref name="inputQ"/>, and the averaged attention weights (null unless requested).</returns>
public (IWeightTensor, IWeightTensor) Perform(IWeightTensor inputQ, IWeightTensor keyMask, int batchSize, IComputeGraph graph, bool outputAttenWeights = false)
{
    using IComputeGraph g = graph.CreateSubGraph($"{m_name}_MultiHeadAttention");

    int seqLenQ = inputQ.Rows / batchSize;
    var headBatch = batchSize * m_multiHeadNum;

    IWeightTensor normalisedQ = layerNormQ.Norm(inputQ, g);

    // One projection yields Q, K and V; they are separated along dimension 2.
    var projectedQKV = g.Affine(normalisedQ, QKV, QKVb);
    var weightedQKV = g.View(projectedQKV, dims: new long[] { batchSize, seqLenQ, 3, m_multiHeadNum, m_d });
    var allQ = g.Select(weightedQKV, 2, 0);
    var allK = g.Select(weightedQKV, 2, 1);
    var allV = g.Select(weightedQKV, 2, 2);

    // Move the head dimension next to the batch dimension and fold the two together.
    var headsFirstQ = g.AsContiguous(g.Transpose(allQ, 1, 2));
    IWeightTensor Qs = g.View(headsFirstQ, dims: new long[] { headBatch, seqLenQ, m_d });

    // K is additionally transposed so that Q x K is a plain batched product.
    var headsFirstKT = g.AsContiguous(g.Transpose(g.Transpose(allK, 1, 2), 2, 3));
    IWeightTensor Ks = g.View(headsFirstKT, dims: new long[] { headBatch, m_d, seqLenQ });

    var headsFirstV = g.AsContiguous(g.Transpose(allV, 1, 2));
    IWeightTensor Vs = g.View(headsFirstV, dims: new long[] { headBatch, seqLenQ, m_d });

    // Scaled softmax
    float scale = 1.0f / (float)(Math.Sqrt(m_d));
    var scores = g.MulBatch(Qs, Ks, scale);
    scores = g.View(scores, dims: new long[] { batchSize, m_multiHeadNum, seqLenQ, seqLenQ });
    if (keyMask != null)
    {
        scores = g.Add(scores, keyMask, inPlace: true);
    }

    var attnProbs = g.Softmax(scores, inPlace: true);

    IWeightTensor sumAttnWeights = null;
    if (outputAttenWeights)
    {
        // Average the attention probabilities over all heads, on the parent graph.
        sumAttnWeights = graph.Sum(attnProbs, 1);
        sumAttnWeights = graph.Div(sumAttnWeights, (float)m_multiHeadNum);
        sumAttnWeights = graph.View(sumAttnWeights, new long[] { batchSize * seqLenQ, seqLenQ });
    }

    attnProbs = g.View(attnProbs, dims: new long[] { headBatch, seqLenQ, seqLenQ });

    // Weight the values, then move the heads back next to the feature dimension.
    var weightedValues = g.MulBatch(attnProbs, Vs);
    IWeightTensor perHead = g.View(weightedValues, dims: new long[] { batchSize, m_multiHeadNum, seqLenQ, m_d });
    var mergedHeads = g.AsContiguous(g.Transpose(perHead, 1, 2));
    IWeightTensor merged = g.View(mergedHeads, dims: new long[] { batchSize * seqLenQ, m_multiHeadNum * m_d });

    // Output projection
    var projectedOut = g.Affine(merged, W0, b0);
    IWeightTensor finalAttResults = g.Dropout(projectedOut, batchSize, m_dropoutRatio, inPlace: true);

    // Residual connection, produced on the parent graph.
    IWeightTensor result = graph.Add(finalAttResults, inputQ, inPlace: true);

    return (result, sumAttnWeights);
}