public IWeightMatrix Perform(IWeightMatrix state, AttentionPreProcessResult attenPreProcessResult, IComputeGraph g) { var bWas = g.RepeatRows(bWa, state.Rows); var wc = g.MulAdd(state, Wa, bWas); var wcs = g.RepeatRows(wc, attenPreProcessResult.inputsUnfolder[0].Rows); var ggs = g.AddTanh(attenPreProcessResult.uhs, wcs); var atten = g.Mul(ggs, V); List <IWeightMatrix> attens = g.UnFolderRow(atten, m_batchSize); List <IWeightMatrix> contexts = new List <IWeightMatrix>(); List <IWeightMatrix> attensT = new List <IWeightMatrix>(); for (int i = 0; i < m_batchSize; i++) { attensT.Add(g.Transpose2(attens[i])); } var attenT = g.ConcatRows(attensT); var attenSoftmax = g.SoftmaxM(attenT); for (int i = 0; i < m_batchSize; i++) { IWeightMatrix context = g.Mul(g.PeekRow(attenSoftmax, i), attenPreProcessResult.inputsUnfolder[i]); contexts.Add(context); } return(g.ConcatRows(contexts)); }
public IWeightMatrix Process(IWeightMatrix input, IComputeGraph innerGraph) { var alphas = innerGraph.RepeatRows(alpha, input.Rows); var betas = innerGraph.RepeatRows(beta, input.Rows); return(innerGraph.LayerNorm(input, alphas, betas)); }
public WeightMatrix Perform(WeightMatrix input, WeightMatrix state, IComputeGraph g) { WeightMatrix context; List <WeightMatrix> atten = new List <WeightMatrix>(); var stateRepeat = g.RepeatRows(state, input.Rows); var baiseInput = new WeightMatrix(input.Rows, 1, 1); var inputb = g.concatColumns(input, baiseInput); var uh = g.mul(inputb, Ua); baiseInput = new WeightMatrix(stateRepeat.Rows, 1, 1); stateRepeat = g.concatColumns(stateRepeat, baiseInput); var wc = g.mul(stateRepeat, Wa); var gg = g.addtanh(uh, wc); var aa = g.mul(gg, V); var res = g.Softmax(aa); var weighted = g.weightRows(input, res);; context = g.sumColumns(weighted); return(context); }
public IWeightMatrix Step(IWeightMatrix input, IComputeGraph innerGraph) { var hidden_prev = ht; var cell_prev = ct; var inputs = innerGraph.ConcatColumns(input, hidden_prev); var bs = innerGraph.RepeatRows(b, input.Rows); var hhSum = innerGraph.MulAdd(inputs, Wxh, bs); (var gates_raw, var cell_write_raw) = innerGraph.SplitColumns(hhSum, hdim * 3, hdim); var gates = innerGraph.Sigmoid(gates_raw); var cell_write = innerGraph.Tanh(cell_write_raw); (var input_gate, var forget_gate, var output_gate) = innerGraph.SplitColumns(gates, hdim, hdim, hdim); // compute new cell activation var retain_cell = innerGraph.EltMul(forget_gate, cell_prev); // what do we keep from cell var write_cell = innerGraph.EltMul(input_gate, cell_write); // what do we write to cell ct = innerGraph.Add(retain_cell, write_cell); // new cell contents // compute hidden state as gated, saturated cell activations ht = innerGraph.EltMul(output_gate, innerGraph.Tanh(ct)); return(ht); }
/// <summary> /// Transformer encoder /// </summary> /// <param name="rawInputs"></param> /// <param name="g"></param> /// <returns></returns> public IWeightTensor Encode(IWeightTensor rawInput, int batchSize, IComputeGraph g) { int seqLen = rawInput.Rows / batchSize; IWeightTensor posEmbedding = g.BuildPositionMatrix(seqLen, m_inputDim); IWeightTensor posEmbeddingRepeat = g.RepeatRows(posEmbedding, batchSize, runGradient: false); // Transpose to batch-first based sequence IWeightTensor inputs = g.TransposeBatch(rawInput, batchSize); inputs = g.AddMul(posEmbeddingRepeat, inputs, (float)Math.Sqrt(m_inputDim), runGradientW1: false, runGradientW2: true); // We don't update position embedding, so dispose it now to save memory. posEmbeddingRepeat.Dispose(); posEmbedding.Dispose(); inputs = g.Dropout(inputs, batchSize, m_dropoutRatio, inPlace: true); for (int k = 0; k < m_encoders.Count; k++) { inputs = m_encoders[k].Perform(inputs, batchSize, g); } // Transpose back to time-first based sequence rawInput = g.TransposeBatch(inputs, seqLen); return(rawInput); }
/// <summary> /// Update LSTM-Attention cells according to given weights /// </summary> /// <param name="context">The context weights for attention</param> /// <param name="input">The input weights</param> /// <param name="computeGraph">The compute graph to build workflow</param> /// <returns>Update hidden weights</returns> public IWeightMatrix Step(IWeightMatrix context, IWeightMatrix input, IComputeGraph computeGraph) { var cell_prev = ct; var hidden_prev = ht; var hxhc = computeGraph.ConcatColumns(input, hidden_prev, context); var bs = computeGraph.RepeatRows(b, input.Rows); var hhSum = computeGraph.MulAdd(hxhc, Wxhc, bs); (var gates_raw, var cell_write_raw) = computeGraph.SplitColumns(hhSum, hdim * 3, hdim); var gates = computeGraph.Sigmoid(gates_raw); var cell_write = computeGraph.Tanh(cell_write_raw); (var input_gate, var forget_gate, var output_gate) = computeGraph.SplitColumns(gates, hdim, hdim, hdim); // compute new cell activation //var retain_cell = computeGraph.EltMul(forget_gate, cell_prev); //var write_cell = computeGraph.EltMul(input_gate, cell_write); //ct = computeGraph.Add(retain_cell, write_cell); ct = computeGraph.EltMulMulAdd(forget_gate, cell_prev, input_gate, cell_write); ht = computeGraph.EltMul(output_gate, computeGraph.Tanh(ct)); return(ht); }
public IWeightMatrix Step(IWeightMatrix input, IComputeGraph innerGraph) { var hidden_prev = ht; var cell_prev = ct; var inputs = innerGraph.ConcatColumns(input, hidden_prev); var bs = innerGraph.RepeatRows(b, input.Rows); var hhSum = innerGraph.MulAdd(inputs, Wxh, bs); var hhSum2 = layerNorm1.Process(hhSum, innerGraph); (var gates_raw, var cell_write_raw) = innerGraph.SplitColumns(hhSum2, hdim * 3, hdim); var gates = innerGraph.Sigmoid(gates_raw); var cell_write = innerGraph.Tanh(cell_write_raw); (var input_gate, var forget_gate, var output_gate) = innerGraph.SplitColumns(gates, hdim, hdim, hdim); // compute new cell activation: ct = forget_gate * cell_prev + input_gate * cell_write ct = innerGraph.EltMulMulAdd(forget_gate, cell_prev, input_gate, cell_write); var ct2 = layerNorm2.Process(ct, innerGraph); // compute hidden state as gated, saturated cell activations ht = innerGraph.EltMul(output_gate, innerGraph.Tanh(ct2)); return(ht); }
public IWeightMatrix Perform(IWeightMatrix state, AttentionPreProcessResult attenPreProcessResult, IComputeGraph g) { var bWas = g.RepeatRows(bWa, state.Rows); var wc = g.MulAdd(state, Wa, bWas); var wcs = g.RepeatRows(wc, attenPreProcessResult.inputs.Rows / m_batchSize); var ggs = g.AddTanh(attenPreProcessResult.uhs, wcs); var atten = g.Mul(ggs, V); var atten2 = g.PermuteBatch(atten, m_batchSize); var attenT = g.Transpose2(atten2); var attenT2 = g.View(attenT, m_batchSize, attenPreProcessResult.inputs.Rows / m_batchSize); var attenSoftmax = g.Softmax(attenT2); IWeightMatrix contexts = g.MulBatch(attenSoftmax, attenPreProcessResult.inputs, m_batchSize); return(contexts); }
public AttentionPreProcessResult PreProcess(IWeightMatrix inputs, IComputeGraph g) { AttentionPreProcessResult r = new AttentionPreProcessResult(); IWeightMatrix bUas = g.RepeatRows(bUa, inputs.Rows); r.uhs = g.MulAdd(inputs, Ua, bUas); r.inputs = g.ConcatRows(g.UnFolderRow(inputs, m_batchSize)); return(r); }
/// <summary> /// Transformer encoder /// </summary> /// <param name="rawInputs"></param> /// <param name="g"></param> /// <returns></returns> /// public IWeightTensor Decode(IWeightTensor tgtInputs, IWeightTensor encOutputBatchFirst, IWeightTensor tgtSelfMask, IWeightTensor decEncAttnMask, IWeightTensor tgtDimMask, int batchSize, IComputeGraph g) { int tgtSeqLen = tgtInputs.Rows / batchSize; int srcSeqLen = encOutputBatchFirst.Rows / batchSize; using (IWeightTensor posEmbedding = g.BuildPositionMatrix(tgtSeqLen, m_inputDim)) { using (IWeightTensor posEmbeddingRepeat = g.RepeatRows(posEmbedding, batchSize, runGradient: false)) { tgtInputs = g.AddMul(posEmbeddingRepeat, tgtInputs, (float)Math.Sqrt(m_inputDim), runGradientW1: false, runGradientW2: true); } } tgtInputs = g.Dropout(tgtInputs, batchSize, m_dropoutRatio, inPlace: true); var tgtSelfMaskRep = g.View(tgtSelfMask, dims: new long[] { 1, batchSize, tgtSeqLen, tgtSeqLen }); var tgtSelfMaskRepExp = g.Expand(tgtSelfMaskRep, dims: new long[] { m_multiHeadNum, batchSize, tgtSeqLen, tgtSeqLen }); var decEncAttnMaskRep = g.View(decEncAttnMask, dims: new long[] { 1, batchSize, tgtSeqLen, srcSeqLen }); var decEncAttnMaskRepExp = g.Expand(decEncAttnMaskRep, dims: new long[] { m_multiHeadNum, batchSize, tgtSeqLen, srcSeqLen }); var tgtSelfMaskRepExpView = g.View(tgtSelfMaskRepExp, dims: new long[] { m_multiHeadNum *batchSize *tgtSeqLen, tgtSeqLen }); var decEncAttnMaskRepExpView = g.View(decEncAttnMaskRepExp, dims: new long[] { m_multiHeadNum *batchSize *tgtSeqLen, srcSeqLen }); tgtSelfMaskRep.Dispose(); tgtSelfMaskRepExp.Dispose(); decEncAttnMaskRep.Dispose(); decEncAttnMaskRepExp.Dispose(); using (IComputeGraph subg = g.CreateSubGraph($"{m_name}_Decoder")) { for (int k = 0; k < m_selfAttns.Count; k++) { tgtInputs = g.MaskFill(tgtInputs, tgtDimMask, 0.0f); tgtInputs = m_selfAttns[k].Perform(tgtInputs, tgtInputs, tgtInputs, tgtSelfMaskRepExpView, batchSize, subg); tgtInputs = m_encAttns[k].Perform(tgtInputs, encOutputBatchFirst, encOutputBatchFirst, decEncAttnMaskRepExpView, batchSize, subg); tgtInputs = m_posFFNs[k].Perform(tgtInputs, batchSize, subg); } tgtInputs.UnbindFromComputeGraph(); } tgtInputs = layerNorm.Norm(tgtInputs, g); // tgtInputs = m_decoderFFLayer.Process(tgtInputs, batchSize, g); return(tgtInputs); }
/// <summary> /// Transformer encoder /// </summary> /// <param name="rawInputs"></param> /// <param name="g"></param> /// <returns></returns> public IWeightTensor Encode(IWeightTensor rawInput, IComputeGraph g) { int seqLen = rawInput.Rows / m_batchSize; var posEmbedding = g.BuildPositionMatrix(seqLen, m_inputDim); var posEmbeddingRepeat = g.RepeatRows(posEmbedding, m_batchSize); // Transpose to batch-first based sequence var inputs = g.TransposeBatch(rawInput, m_batchSize); inputs = g.Mul(inputs, (float)Math.Sqrt(m_inputDim)); inputs = g.Add(inputs, posEmbeddingRepeat); for (int k = 0; k < m_encoders.Count; k++) { inputs = m_encoders[k].Perform(inputs, g); } // Transpose back to time-first based sequence rawInput = g.TransposeBatch(inputs, seqLen); return(rawInput); }
public IWeightTensor Perform(IWeightTensor state, AttentionPreProcessResult attenPreProcessResult, int batchSize, IComputeGraph graph) { IComputeGraph g = graph.CreateSubGraph(m_name); var wc = g.Affine(state, m_Wa, m_bWa); var wcs = g.RepeatRows(wc, attenPreProcessResult.inputs.Rows / batchSize); var ggs = g.AddTanh(attenPreProcessResult.uhs, wcs); var atten = g.Mul(ggs, m_V); var atten2 = g.TransposeBatch(atten, batchSize); var attenT = g.Transpose(atten2); var attenT2 = g.View(attenT, batchSize, attenPreProcessResult.inputs.Rows / batchSize); var attenSoftmax1 = g.Softmax(attenT2, inPlace: true); var attenSoftmax = g.View(attenSoftmax1, batchSize, attenSoftmax1.Rows / batchSize, attenSoftmax1.Columns); var inputs2 = g.View(attenPreProcessResult.inputs, batchSize, attenPreProcessResult.inputs.Rows / batchSize, attenPreProcessResult.inputs.Columns); IWeightTensor contexts = g.MulBatch(attenSoftmax, inputs2, batchSize); return(contexts); }
/// <summary> /// Transformer encoder /// </summary> /// <param name="rawInputs"></param> /// <param name="g"></param> /// <returns></returns> public IWeightTensor Encode(IWeightTensor inputs, IWeightTensor selfMask, IWeightTensor dimMask, int batchSize, IComputeGraph g) { int seqLen = inputs.Rows / batchSize; using (IWeightTensor posEmbedding = g.BuildPositionMatrix(seqLen, m_inputDim)) { using (IWeightTensor posEmbeddingRepeat = g.RepeatRows(posEmbedding, batchSize, runGradient: false)) { inputs = g.AddMul(posEmbeddingRepeat, inputs, (float)Math.Sqrt(m_inputDim), runGradientW1: false, runGradientW2: true); } } inputs = g.Dropout(inputs, batchSize, m_dropoutRatio, inPlace: true); var selfMaskRep = g.View(selfMask, dims: new long[] { 1, batchSize, seqLen, seqLen }); var multiHeadhSelfMaskRep = g.Expand(selfMaskRep, dims: new long[] { m_multiHeadNum, batchSize, seqLen, seqLen }); var multiHeadhSelfMaskRepView = g.View(multiHeadhSelfMaskRep, dims: new long[] { m_multiHeadNum *batchSize *seqLen, seqLen }); selfMaskRep.Dispose(); multiHeadhSelfMaskRep.Dispose(); using (IComputeGraph subg = g.CreateSubGraph($"{m_name}_Encoder")) { for (int k = 0; k < m_encoders.Count; k++) { inputs = g.MaskFill(inputs, dimMask, 0.0f); inputs = m_encoders[k].Perform(inputs, inputs, inputs, multiHeadhSelfMaskRepView, batchSize, subg); inputs = m_posFFNs[k].Perform(inputs, batchSize, subg); } inputs.UnbindFromComputeGraph(); } inputs = layerNorm.Norm(inputs, g); return(inputs); }
/// <summary> /// Update LSTM-Attention cells according to given weights /// </summary> /// <param name="context">The context weights for attention</param> /// <param name="input">The input weights</param> /// <param name="computeGraph">The compute graph to build workflow</param> /// <returns>Update hidden weights</returns> public IWeightMatrix Step(IWeightMatrix context, IWeightMatrix input, IComputeGraph computeGraph) { var cell_prev = ct; var hidden_prev = ht; var hxhc = computeGraph.ConcatColumns(input, hidden_prev, context); var bs = computeGraph.RepeatRows(b, input.Rows); var hhSum = computeGraph.MulAdd(hxhc, Wxhc, bs); var hhSum2 = layerNorm1.Process(hhSum, computeGraph); (var gates_raw, var cell_write_raw) = computeGraph.SplitColumns(hhSum2, hdim * 3, hdim); var gates = computeGraph.Sigmoid(gates_raw); var cell_write = computeGraph.Tanh(cell_write_raw); (var input_gate, var forget_gate, var output_gate) = computeGraph.SplitColumns(gates, hdim, hdim, hdim); // compute new cell activation: ct = forget_gate * cell_prev + input_gate * cell_write ct = computeGraph.EltMulMulAdd(forget_gate, cell_prev, input_gate, cell_write); var ct2 = layerNorm2.Process(ct, computeGraph); ht = computeGraph.EltMul(output_gate, computeGraph.Tanh(ct2)); return(ht); }
public IWeightMatrix Process(IWeightMatrix inputT, IComputeGraph g) { var bds = g.RepeatRows(m_Bd, inputT.Rows); return(g.MulAdd(inputT, m_Whd, bds)); }
/// <summary> /// Decode output sentences in training /// </summary> /// <param name="outputSentences"></param> /// <param name="g"></param> /// <param name="encodedOutputs"></param> /// <param name="decoder"></param> /// <param name="Whd"></param> /// <param name="bd"></param> /// <param name="Embedding"></param> /// <param name="predictSentence"></param> /// <returns></returns> private float Decode(List <List <string> > outputSentences, IComputeGraph g, IWeightMatrix encodedOutputs, AttentionDecoder decoder, IWeightMatrix Whd, IWeightMatrix bd, IWeightMatrix Embedding, out List <List <string> > predictSentence) { predictSentence = null; float cost = 0.0f; var attPreProcessResult = decoder.PreProcess(encodedOutputs, g); var originalOutputLengths = PadSentences(outputSentences); int seqLen = outputSentences[0].Count; int[] ix_inputs = new int[m_batchSize]; int[] ix_targets = new int[m_batchSize]; for (int i = 0; i < ix_inputs.Length; i++) { ix_inputs[i] = (int)SENTTAGS.START; } var bds = g.RepeatRows(bd, m_batchSize); for (int i = 0; i < seqLen + 1; i++) { //Get embedding for all sentence in the batch at position i List <IWeightMatrix> inputs = new List <IWeightMatrix>(); for (int j = 0; j < m_batchSize; j++) { List <string> OutputSentence = outputSentences[j]; ix_targets[j] = (int)SENTTAGS.UNK; if (i >= seqLen) { ix_targets[j] = (int)SENTTAGS.END; } else { if (m_tgtWordToIndex.ContainsKey(OutputSentence[i])) { ix_targets[j] = m_tgtWordToIndex[OutputSentence[i]]; } } var x = g.PeekRow(Embedding, ix_inputs[j]); inputs.Add(x); } var inputsM = g.ConcatRows(inputs); //Decode output sentence at position i var eOutput = decoder.Decode(inputsM, attPreProcessResult, g); if (m_dropoutRatio > 0.0f) { eOutput = g.Dropout(eOutput, m_dropoutRatio); } //Softmax for output var o = g.MulAdd(eOutput, Whd, bds); var probs = g.Softmax(o, false); o.ReleaseWeight(); //Calculate loss for each word in the batch List <IWeightMatrix> probs_g = g.UnFolderRow(probs, m_batchSize, false); for (int k = 0; k < m_batchSize; k++) { var probs_k = probs_g[k]; var score_k = probs_k.GetWeightAt(ix_targets[k]); if (i < originalOutputLengths[k] + 1) { cost += (float)-Math.Log(score_k); } probs_k.SetWeightAt(score_k - 1, ix_targets[k]); ix_inputs[k] = ix_targets[k]; probs_k.Dispose(); } o.SetGradientByWeight(probs); } return(cost); }