/// <summary>
/// Transformer encoder
/// </summary>
/// <param name="rawInput">The input tensor in time-first layout</param>
/// <param name="batchSize">Batch size of the input data set</param>
/// <param name="g">The instance of the computing graph</param>
/// <returns>The encoded output tensor</returns>
public IWeightTensor Encode(IWeightTensor rawInput, int batchSize, IComputeGraph g)
{
    int seqLen = rawInput.Rows / batchSize;
    IWeightTensor posEmbedding = g.BuildPositionMatrix(seqLen, m_inputDim);
    IWeightTensor posEmbeddingRepeat = g.RepeatRows(posEmbedding, batchSize, runGradient: false);

    // Transpose to batch-first sequence layout
    IWeightTensor inputs = g.TransposeBatch(rawInput, batchSize);

    inputs = g.AddMul(posEmbeddingRepeat, inputs, (float)Math.Sqrt(m_inputDim), runGradientW1: false, runGradientW2: true);

    // The position embedding is never updated, so dispose it now to save memory.
    posEmbeddingRepeat.Dispose();
    posEmbedding.Dispose();

    inputs = g.Dropout(inputs, batchSize, m_dropoutRatio, inPlace: true);

    for (int k = 0; k < m_encoders.Count; k++)
    {
        inputs = m_encoders[k].Perform(inputs, batchSize, g);
    }

    // Transpose back to time-first sequence layout
    rawInput = g.TransposeBatch(inputs, seqLen);

    return rawInput;
}
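// A minimal sketch of what a sinusoidal position matrix such as the one returned by
// g.BuildPositionMatrix(seqLen, dim) typically contains, following the standard
// formulation PE(pos, 2i) = sin(pos / 10000^(2i/dim)) and PE(pos, 2i+1) = cos(pos / 10000^(2i/dim)).
// This helper is an illustration only and is not part of the library; the actual
// implementation of BuildPositionMatrix may differ in details.
static float[,] BuildSinusoidalPositionMatrix(int seqLen, int dim)
{
    var pe = new float[seqLen, dim];
    for (int pos = 0; pos < seqLen; pos++)
    {
        for (int i = 0; i < dim; i += 2)
        {
            double angle = pos / Math.Pow(10000.0, (double)i / dim);
            pe[pos, i] = (float)Math.Sin(angle);
            if (i + 1 < dim)
            {
                pe[pos, i + 1] = (float)Math.Cos(angle);
            }
        }
    }
    return pe;
}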
/// <summary>
/// Scaled multi-head attention component with skip-connected feed forward layers
/// </summary>
/// <param name="inputQ">The input Q tensor</param>
/// <param name="keyMask">The mask for softmax</param>
/// <param name="batchSize">Batch size of input data set</param>
/// <param name="graph">The instance of computing graph</param>
/// <param name="outputAttenWeights">Whether to also return the attention weights averaged over all heads</param>
/// <returns>Transformed output tensor and, optionally, the merged attention weights</returns>
public (IWeightTensor, IWeightTensor) Perform(IWeightTensor inputQ, IWeightTensor keyMask, int batchSize, IComputeGraph graph, bool outputAttenWeights = false)
{
    using IComputeGraph g = graph.CreateSubGraph($"{m_name}_MultiHeadAttention");
    int seqLenQ = inputQ.Rows / batchSize;
    IWeightTensor inputQNorm = layerNormQ.Norm(inputQ, g);

    // Input projections: a single affine produces Q, K and V in one pass
    var weightedQKV = g.View(g.Affine(inputQNorm, QKV, QKVb), dims: new long[] { batchSize, seqLenQ, 3, m_multiHeadNum, m_d });
    var allQ = g.Select(weightedQKV, 2, 0);
    var allK = g.Select(weightedQKV, 2, 1);
    var allV = g.Select(weightedQKV, 2, 2);

    // Multi-head attention
    IWeightTensor Qs = g.View(g.AsContiguous(g.Transpose(allQ, 1, 2)), dims: new long[] { batchSize * m_multiHeadNum, seqLenQ, m_d });
    IWeightTensor Ks = g.View(g.AsContiguous(g.Transpose(g.Transpose(allK, 1, 2), 2, 3)), dims: new long[] { batchSize * m_multiHeadNum, m_d, seqLenQ });
    IWeightTensor Vs = g.View(g.AsContiguous(g.Transpose(allV, 1, 2)), dims: new long[] { batchSize * m_multiHeadNum, seqLenQ, m_d });

    // Scaled softmax
    float scale = 1.0f / (float)Math.Sqrt(m_d);
    var attn = g.MulBatch(Qs, Ks, scale);
    attn = g.View(attn, dims: new long[] { batchSize, m_multiHeadNum, seqLenQ, seqLenQ });

    if (keyMask != null)
    {
        attn = g.Add(attn, keyMask, inPlace: true);
    }

    var attnProbs = g.Softmax(attn, inPlace: true);

    IWeightTensor sumAttnWeights = null;
    if (outputAttenWeights)
    {
        // Merge the attention probabilities over all heads
        sumAttnWeights = graph.Sum(attnProbs, 1);
        sumAttnWeights = graph.Div(sumAttnWeights, (float)m_multiHeadNum);
        sumAttnWeights = graph.View(sumAttnWeights, new long[] { batchSize * seqLenQ, seqLenQ });
    }

    attnProbs = g.View(attnProbs, dims: new long[] { batchSize * m_multiHeadNum, seqLenQ, seqLenQ });

    IWeightTensor o = g.View(g.MulBatch(attnProbs, Vs), dims: new long[] { batchSize, m_multiHeadNum, seqLenQ, m_d });
    IWeightTensor W = g.View(g.AsContiguous(g.Transpose(o, 1, 2)), dims: new long[] { batchSize * seqLenQ, m_multiHeadNum * m_d });

    // Output projection
    IWeightTensor finalAttResults = g.Dropout(g.Affine(W, W0, b0), batchSize, m_dropoutRatio, inPlace: true);
    IWeightTensor result = graph.Add(finalAttResults, inputQ, inPlace: true);

    return (result, sumAttnWeights);
}
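// A single-head sketch of the scaled dot-product attention computed above,
// softmax(Q·K^T / sqrt(d))·V, on plain jagged arrays to make the data flow explicit.
// Illustration only, assuming [seqLen][d] row-major inputs; the graph-based code
// above additionally batches this computation over batchSize * m_multiHeadNum matrices.
static float[][] ScaledDotProductAttention(float[][] Q, float[][] K, float[][] V, int d)
{
    int lq = Q.Length, lk = K.Length;
    float scale = 1.0f / (float)Math.Sqrt(d);
    var output = new float[lq][];

    for (int i = 0; i < lq; i++)
    {
        // Scaled attention logits of query i against every key
        var logits = new float[lk];
        for (int j = 0; j < lk; j++)
        {
            float dot = 0.0f;
            for (int t = 0; t < d; t++)
            {
                dot += Q[i][t] * K[j][t];
            }
            logits[j] = dot * scale;
        }

        // Numerically stable softmax over the keys
        float max = float.NegativeInfinity;
        for (int j = 0; j < lk; j++) { if (logits[j] > max) { max = logits[j]; } }
        float sum = 0.0f;
        var probs = new float[lk];
        for (int j = 0; j < lk; j++) { probs[j] = (float)Math.Exp(logits[j] - max); sum += probs[j]; }
        for (int j = 0; j < lk; j++) { probs[j] /= sum; }

        // Weighted sum of the value rows
        output[i] = new float[d];
        for (int j = 0; j < lk; j++)
        {
            for (int t = 0; t < d; t++)
            {
                output[i][t] += probs[j] * V[j][t];
            }
        }
    }

    return output;
}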
private float DecodeOutput(string[] OutputSentence, IComputeGraph g, float cost, SparseWeightMatrix sparseInput, List<WeightMatrix> encoded, AttentionDecoder decoder, WeightMatrix Whd, WeightMatrix bd, WeightMatrix Embedding)
{
    int ix_input = (int)SENTTAGS.START;
    for (int i = 0; i < OutputSentence.Length + 1; i++)
    {
        // The target is the next word in the sentence, or END once the sentence is exhausted
        int ix_target = (int)SENTTAGS.UNK;
        if (i == OutputSentence.Length)
        {
            ix_target = (int)SENTTAGS.END;
        }
        else
        {
            if (t_wordToIndex.ContainsKey(OutputSentence[i]))
            {
                ix_target = t_wordToIndex[OutputSentence[i]];
            }
        }

        var x = g.PeekRow(Embedding, ix_input);
        var eOutput = decoder.Decode(sparseInput, x, encoded, g);
        if (UseDropout)
        {
            eOutput = g.Dropout(eOutput, 0.2f);
        }

        var o = g.muladd(eOutput, Whd, bd);
        if (UseDropout)
        {
            o = g.Dropout(o, 0.2f);
        }

        var probs = g.SoftmaxWithCrossEntropy(o);
        cost += (float)-Math.Log(probs.Weight[ix_target]);

        // Gradient of softmax followed by cross-entropy: probs - onehot(target)
        o.Gradient = probs.Weight;
        o.Gradient[ix_target] -= 1;

        ix_input = ix_target;
    }

    return cost;
}
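// A tiny sketch of the gradient shortcut used above: for a softmax output followed by
// cross-entropy loss, dLoss/dLogits = probs - onehot(target). That is why the code sets
// the gradient to the probabilities and subtracts 1 at the target index, instead of
// backpropagating through the softmax explicitly. Illustration only.
static float[] SoftmaxCrossEntropyGradient(float[] probs, int targetIdx)
{
    var grad = (float[])probs.Clone();
    grad[targetIdx] -= 1.0f; // probs - onehot(target)
    return grad;
}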
public IWeightTensor Perform(IWeightTensor inputQ, IWeightTensor keyMask, int batchSize, IComputeGraph graph)
{
    if (m_sharedQKV == false)
    {
        throw new ArgumentException($"Layer '{m_name}' is not in shared QKV mode, please call another Perform function with three separate input tensors.");
    }

    using (IComputeGraph g = graph.CreateSubGraph($"{m_name}_MultiHeadAttention_SharedQKV"))
    {
        int seqLenQ = inputQ.Rows / batchSize;
        IWeightTensor inputQNorm = layerNormQ.Norm(inputQ, g);

        // Input projections: expand the normalized input three ways and project Q, K and V in one batched multiply
        float scale = 1.0f / (float)m_inputDim;
        IWeightTensor mulQ, mulK, mulV;

        using (IWeightTensor inputQNormView = g.View(inputQNorm, dims: new long[] { 1, inputQ.Rows, inputQ.Columns }))
        {
            using (IWeightTensor inputQNormViewExp = g.Expand(inputQNormView, dims: new long[] { 3, inputQ.Rows, inputQ.Columns }))
            {
                using (IWeightTensor mulQKV = g.MulBatch(inputQNormViewExp, QKV, 3, scale))
                {
                    mulQ = g.Select(mulQKV, 0, 0);
                    mulK = g.Select(mulQKV, 0, 1);
                    mulV = g.Select(mulQKV, 0, 2);
                }
            }
        }

        IWeightTensor allQ = g.View(mulQ, dims: new long[] { batchSize, seqLenQ, m_multiHeadNum, m_d });
        IWeightTensor allK = g.View(mulK, dims: new long[] { batchSize, seqLenQ, m_multiHeadNum, m_d });
        IWeightTensor allV = g.View(mulV, dims: new long[] { batchSize, seqLenQ, m_multiHeadNum, m_d });

        // Multi-head attention
        IWeightTensor Qs = g.View(g.Permute(allQ, 2, 0, 1, 3), dims: new long[] { m_multiHeadNum * batchSize, seqLenQ, m_d });
        IWeightTensor Ks = g.View(g.Permute(allK, 2, 0, 3, 1), dims: new long[] { m_multiHeadNum * batchSize, m_d, seqLenQ });
        IWeightTensor Vs = g.View(g.Permute(allV, 2, 0, 1, 3), dims: new long[] { m_multiHeadNum * batchSize, seqLenQ, m_d });

        // Scaled softmax
        scale = 1.0f / (float)m_d;
        IWeightTensor attn = g.MulBatch(Qs, Ks, m_multiHeadNum * batchSize, scale);
        IWeightTensor softmax = g.Softmax(attn, keyMask, inPlace: true);

        IWeightTensor o = g.View(g.MulBatch(softmax, Vs, m_multiHeadNum * batchSize), dims: new long[] { m_multiHeadNum, batchSize, seqLenQ, m_d });
        IWeightTensor W = g.View(g.Permute(o, 1, 2, 0, 3), dims: new long[] { batchSize * seqLenQ, m_multiHeadNum * m_d });

        // Output projection
        IWeightTensor finalAttResults = g.Dropout(g.Affine(W, W0, b0), batchSize, m_dropoutRatio, inPlace: true);

        return graph.Add(finalAttResults, inputQ);
    }
}
/// <summary>
/// Transformer decoder
/// </summary>
/// <param name="tgtInputs">The target-side input tensor</param>
/// <param name="encOutputBatchFirst">The encoder output tensor in batch-first layout</param>
/// <param name="tgtSelfMask">The mask for target self attention</param>
/// <param name="decEncAttnMask">The mask for decoder-encoder attention</param>
/// <param name="tgtDimMask">The mask for padded target positions</param>
/// <param name="batchSize">Batch size of input data set</param>
/// <param name="g">The instance of computing graph</param>
/// <returns>The decoded output tensor</returns>
public IWeightTensor Decode(IWeightTensor tgtInputs, IWeightTensor encOutputBatchFirst, IWeightTensor tgtSelfMask, IWeightTensor decEncAttnMask, IWeightTensor tgtDimMask, int batchSize, IComputeGraph g)
{
    int tgtSeqLen = tgtInputs.Rows / batchSize;
    int srcSeqLen = encOutputBatchFirst.Rows / batchSize;

    using (IWeightTensor posEmbedding = g.BuildPositionMatrix(tgtSeqLen, m_inputDim))
    {
        using (IWeightTensor posEmbeddingRepeat = g.RepeatRows(posEmbedding, batchSize, runGradient: false))
        {
            tgtInputs = g.AddMul(posEmbeddingRepeat, tgtInputs, (float)Math.Sqrt(m_inputDim), runGradientW1: false, runGradientW2: true);
        }
    }

    tgtInputs = g.Dropout(tgtInputs, batchSize, m_dropoutRatio, inPlace: true);

    // Broadcast both masks over all attention heads
    var tgtSelfMaskRep = g.View(tgtSelfMask, dims: new long[] { 1, batchSize, tgtSeqLen, tgtSeqLen });
    var tgtSelfMaskRepExp = g.Expand(tgtSelfMaskRep, dims: new long[] { m_multiHeadNum, batchSize, tgtSeqLen, tgtSeqLen });

    var decEncAttnMaskRep = g.View(decEncAttnMask, dims: new long[] { 1, batchSize, tgtSeqLen, srcSeqLen });
    var decEncAttnMaskRepExp = g.Expand(decEncAttnMaskRep, dims: new long[] { m_multiHeadNum, batchSize, tgtSeqLen, srcSeqLen });

    var tgtSelfMaskRepExpView = g.View(tgtSelfMaskRepExp, dims: new long[] { m_multiHeadNum * batchSize * tgtSeqLen, tgtSeqLen });
    var decEncAttnMaskRepExpView = g.View(decEncAttnMaskRepExp, dims: new long[] { m_multiHeadNum * batchSize * tgtSeqLen, srcSeqLen });

    tgtSelfMaskRep.Dispose();
    tgtSelfMaskRepExp.Dispose();
    decEncAttnMaskRep.Dispose();
    decEncAttnMaskRepExp.Dispose();

    using (IComputeGraph subg = g.CreateSubGraph($"{m_name}_Decoder"))
    {
        for (int k = 0; k < m_selfAttns.Count; k++)
        {
            tgtInputs = g.MaskFill(tgtInputs, tgtDimMask, 0.0f);

            tgtInputs = m_selfAttns[k].Perform(tgtInputs, tgtInputs, tgtInputs, tgtSelfMaskRepExpView, batchSize, subg);
            tgtInputs = m_encAttns[k].Perform(tgtInputs, encOutputBatchFirst, encOutputBatchFirst, decEncAttnMaskRepExpView, batchSize, subg);
            tgtInputs = m_posFFNs[k].Perform(tgtInputs, batchSize, subg);
        }

        tgtInputs.UnbindFromComputeGraph();
    }

    tgtInputs = layerNorm.Norm(tgtInputs, g);

    // tgtInputs = m_decoderFFLayer.Process(tgtInputs, batchSize, g);

    return tgtInputs;
}
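// A small sketch of the causal self-attention mask a decoder typically supplies as
// tgtSelfMask: position q may only attend to positions k <= q, so future positions
// receive a large negative additive value that softmax turns into near-zero probability.
// Whether the library builds its mask exactly this way is an assumption; illustration only.
static float[,] BuildCausalMask(int seqLen)
{
    var mask = new float[seqLen, seqLen];
    for (int q = 0; q < seqLen; q++)
    {
        for (int k = 0; k < seqLen; k++)
        {
            mask[q, k] = (k > q) ? -1e9f : 0.0f;
        }
    }
    return mask;
}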
/// <summary>
/// Scaled multi-head attention component with skip-connected feed forward layers
/// </summary>
/// <param name="inputQ">The input Q tensor</param>
/// <param name="inputK">The input K tensor</param>
/// <param name="inputV">The input V tensor</param>
/// <param name="keyMask">The mask for softmax</param>
/// <param name="batchSize">Batch size of input data set</param>
/// <param name="graph">The instance of computing graph</param>
/// <returns>Transformed output tensor</returns>
public IWeightTensor Perform(IWeightTensor inputQ, IWeightTensor inputK, IWeightTensor inputV, IWeightTensor keyMask, int batchSize, IComputeGraph graph)
{
    using (IComputeGraph g = graph.CreateSubGraph($"{m_name}_MultiHeadAttention"))
    {
        int seqLenQ = inputQ.Rows / batchSize;

        // SeqLenK must be equal to SeqLenV
        int seqLenK = inputK.Rows / batchSize;
        int seqLenV = inputV.Rows / batchSize;

        IWeightTensor inputQNorm = layerNorm1.Norm(inputQ, g);
        IWeightTensor inputKNorm = (inputK == inputQ) ? inputQNorm : inputK; // layerNorm1.Norm(inputK, g);
        IWeightTensor inputVNorm = (inputK == inputV) ? inputKNorm : inputV; // layerNorm1.Norm(inputV, g);

        // Input projections
        IWeightTensor allQ = g.View(g.Affine(inputQNorm, Q, Qb), dims: new long[] { batchSize, seqLenQ, m_multiHeadNum, m_d });
        IWeightTensor allK = g.View(g.Affine(inputKNorm, K, Kb), dims: new long[] { batchSize, seqLenK, m_multiHeadNum, m_d });
        IWeightTensor allV = g.View(g.Affine(inputVNorm, V, Vb), dims: new long[] { batchSize, seqLenV, m_multiHeadNum, m_d });

        // Multi-head attention
        IWeightTensor Qs = g.View(g.Permute(allQ, 2, 0, 1, 3), dims: new long[] { m_multiHeadNum * batchSize, seqLenQ, m_d });
        IWeightTensor Ks = g.View(g.Permute(allK, 2, 0, 3, 1), dims: new long[] { m_multiHeadNum * batchSize, m_d, seqLenK });
        IWeightTensor Vs = g.View(g.Permute(allV, 2, 0, 1, 3), dims: new long[] { m_multiHeadNum * batchSize, seqLenV, m_d });

        // Scaled softmax
        float scale = 1.0f / (float)Math.Sqrt(m_d);
        IWeightTensor attn = g.MulBatch(Qs, Ks, m_multiHeadNum * batchSize, scale);
        IWeightTensor attn2 = g.View(attn, dims: new long[] { m_multiHeadNum * batchSize * seqLenQ, seqLenK });

        if (keyMask != null)
        {
            // attn2 = g.Add(attn2, mask, runGradient2: false);
            attn2 = g.MaskFill(attn2, keyMask, -1e9f);
        }

        IWeightTensor softmax = g.Softmax(attn2, inPlace: true);
        IWeightTensor softmax2 = g.View(softmax, dims: new long[] { m_multiHeadNum * batchSize, seqLenQ, seqLenK });

        IWeightTensor o = g.View(g.MulBatch(softmax2, Vs, m_multiHeadNum * batchSize), dims: new long[] { m_multiHeadNum, batchSize, seqLenQ, m_d });
        IWeightTensor W = g.View(g.Permute(o, 1, 2, 0, 3), dims: new long[] { batchSize * seqLenQ, m_multiHeadNum * m_d });

        // Output projection
        IWeightTensor finalAttResults = g.Dropout(g.Affine(W, W0, b0), batchSize, m_dropoutRatio, inPlace: true);

        return graph.Add(finalAttResults, inputQ);
    }
}
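// A plain-array sketch of the head-splitting layout change performed by the View and
// Permute calls above: a [batchSize * seqLen, heads * d] projection is rearranged into
// heads * batchSize independent [seqLen, d] matrices, so that MulBatch can run one
// matrix multiply per head. Illustration only; the library performs this as a tensor
// view/permute rather than an explicit copy.
static float[][][] SplitHeads(float[,] proj, int batchSize, int seqLen, int heads, int d)
{
    var result = new float[heads * batchSize][][];
    for (int h = 0; h < heads; h++)
    {
        for (int b = 0; b < batchSize; b++)
        {
            var m = new float[seqLen][];
            for (int s = 0; s < seqLen; s++)
            {
                m[s] = new float[d];
                for (int t = 0; t < d; t++)
                {
                    // Row (b * seqLen + s) of the projection, columns h*d .. h*d + d - 1
                    m[s][t] = proj[b * seqLen + s, h * d + t];
                }
            }
            // Matches the flattening of Permute(2, 0, 1, 3): batch-matrix index is h * batchSize + b
            result[h * batchSize + b] = m;
        }
    }
    return result;
}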
public IWeightTensor Decode(IWeightTensor input, AttentionPreProcessResult attenPreProcessResult, int batchSize, IComputeGraph g)
{
    IWeightTensor V = input;
    IWeightTensor lastStatus = m_decoders.LastOrDefault().Cell;
    IWeightTensor context = m_attentionLayer.Perform(lastStatus, attenPreProcessResult, batchSize, g);

    foreach (LSTMAttentionDecoderCell decoder in m_decoders)
    {
        IWeightTensor e = decoder.Step(context, V, g);
        V = e;
    }

    IWeightTensor eOutput = g.Dropout(V, batchSize, m_dropoutRatio, false);
    // eOutput = m_decoderFFLayer.Process(eOutput, batchSize, g);

    return eOutput;
}
/// <summary>
/// Scaled multi-head attention component with skip-connected feed forward layers
/// </summary>
/// <param name="input">The input tensor</param>
/// <param name="batchSize">Batch size of input data set</param>
/// <param name="graph">The instance of computing graph</param>
/// <returns>Transformed output tensor</returns>
public IWeightTensor Perform(IWeightTensor input, int batchSize, IComputeGraph graph)
{
    using (IComputeGraph g = graph.CreateSubGraph(m_name))
    {
        int seqLen = input.Rows / batchSize;
        IWeightTensor nInput = layerNorm1.Norm(input, g);

        // Input projections
        IWeightTensor allQ = g.View(g.Affine(nInput, Q, Qb), batchSize, seqLen, m_multiHeadNum, m_d);
        IWeightTensor allK = g.View(g.Affine(nInput, K, Kb), batchSize, seqLen, m_multiHeadNum, m_d);
        IWeightTensor allV = g.View(g.Affine(nInput, V, Vb), batchSize, seqLen, m_multiHeadNum, m_d);

        // Multi-head attention
        IWeightTensor Qs = g.View(g.Permute(allQ, 2, 0, 1, 3), m_multiHeadNum * batchSize, seqLen, m_d);
        IWeightTensor Ks = g.View(g.Permute(allK, 2, 0, 3, 1), m_multiHeadNum * batchSize, m_d, seqLen);
        IWeightTensor Vs = g.View(g.Permute(allV, 2, 0, 1, 3), m_multiHeadNum * batchSize, seqLen, m_d);

        // Scaled softmax
        float scale = 1.0f / (float)Math.Sqrt(m_d);
        IWeightTensor attn = g.MulBatch(Qs, Ks, m_multiHeadNum * batchSize, scale);
        IWeightTensor attn2 = g.View(attn, m_multiHeadNum * batchSize * seqLen, seqLen);

        IWeightTensor softmax = g.Softmax(attn2, inPlace: true);
        IWeightTensor softmax2 = g.View(softmax, m_multiHeadNum * batchSize, seqLen, seqLen);

        IWeightTensor o = g.View(g.MulBatch(softmax2, Vs, m_multiHeadNum * batchSize), m_multiHeadNum, batchSize, seqLen, m_d);
        IWeightTensor W = g.View(g.Permute(o, 1, 2, 0, 3), batchSize * seqLen, m_multiHeadNum * m_d);

        // Output projection
        IWeightTensor finalAttResults = g.Dropout(g.Affine(W, W0, b0), batchSize, m_dropoutRatio, inPlace: true);

        // Skip connection and layer normalization
        IWeightTensor normAddedAttResult = layerNorm2.AddNorm(finalAttResults, input, g);

        // Feed forward
        IWeightTensor ffnResult = feedForwardLayer1.Process(normAddedAttResult, batchSize, g);
        IWeightTensor reluFFNResult = g.Relu(ffnResult);
        IWeightTensor ffn2Result = feedForwardLayer2.Process(reluFFNResult, batchSize, g);

        // Skip connection and layer normalization
        IWeightTensor addFFNResult = graph.Add(ffn2Result, normAddedAttResult);

        return addFFNResult;
    }
}
private IWeightTensor AddPositionEmbedding(IComputeGraph g, IWeightTensor posEmbedding, int batchSize, int seqLen, IWeightTensor inputEmbs)
{
    using (var posEmbeddingPeek = g.PeekRow(posEmbedding, 0, seqLen, false))
    {
        using (var posEmbeddingPeekView = g.View(posEmbeddingPeek, false, new long[] { 1, seqLen, this.m_modelMetaData.EmbeddingDim }))
        {
            using (var posEmbeddingPeekViewExp = g.Expand(posEmbeddingPeekView, false, new long[] { batchSize, seqLen, this.m_modelMetaData.EmbeddingDim }))
            {
                inputEmbs = g.View(inputEmbs, dims: new long[] { batchSize, seqLen, this.m_modelMetaData.EmbeddingDim });
                inputEmbs = g.Add(inputEmbs, posEmbeddingPeekViewExp, true, false);
                inputEmbs = g.View(inputEmbs, dims: new long[] { batchSize * seqLen, this.m_modelMetaData.EmbeddingDim });
            }
        }
    }

    inputEmbs = g.Dropout(inputEmbs, batchSize, this.m_dropoutRatio, true);

    return inputEmbs;
}
/// <summary>
/// Scaled multi-head attention component with skip-connected feed forward layers
/// </summary>
/// <param name="inputQ">The input Q tensor</param>
/// <param name="inputK">The input K tensor</param>
/// <param name="inputV">The input V tensor</param>
/// <param name="keyMask">The mask for softmax</param>
/// <param name="batchSize">Batch size of input data set</param>
/// <param name="graph">The instance of computing graph</param>
/// <returns>Transformed output tensor</returns>
public IWeightTensor Perform(IWeightTensor inputQ, IWeightTensor inputK, IWeightTensor inputV, IWeightTensor keyMask, int batchSize, IComputeGraph graph)
{
    if (m_sharedQKV)
    {
        throw new ArgumentException($"Layer '{m_name}' is in shared QKV mode, please call another Perform function with a single input tensor.");
    }

    using (IComputeGraph g = graph.CreateSubGraph($"{m_name}_MultiHeadAttention"))
    {
        int seqLenQ = inputQ.Rows / batchSize;

        // SeqLenK must be equal to SeqLenV
        int seqLenK = inputK.Rows / batchSize;
        int seqLenV = inputV.Rows / batchSize;

        IWeightTensor inputQNorm = layerNormQ.Norm(inputQ, g);

        // Input projections
        float scale = 1.0f / (float)m_inputDim;
        IWeightTensor allQ = g.View(g.Affine(inputQNorm, Q, Qb, scale), dims: new long[] { batchSize, seqLenQ, m_multiHeadNum, m_d });
        IWeightTensor allK = g.View(g.Affine(inputK, K, Kb, scale), dims: new long[] { batchSize, seqLenK, m_multiHeadNum, m_d });
        IWeightTensor allV = g.View(g.Affine(inputV, V, Vb, scale), dims: new long[] { batchSize, seqLenV, m_multiHeadNum, m_d });

        // Multi-head attention
        IWeightTensor Qs = g.View(g.Permute(allQ, 2, 0, 1, 3), dims: new long[] { m_multiHeadNum * batchSize, seqLenQ, m_d });
        IWeightTensor Ks = g.View(g.Permute(allK, 2, 0, 3, 1), dims: new long[] { m_multiHeadNum * batchSize, m_d, seqLenK });
        IWeightTensor Vs = g.View(g.Permute(allV, 2, 0, 1, 3), dims: new long[] { m_multiHeadNum * batchSize, seqLenV, m_d });

        // Scaled softmax
        scale = 1.0f / (float)m_d;
        IWeightTensor attn = g.MulBatch(Qs, Ks, m_multiHeadNum * batchSize, scale);
        IWeightTensor softmax = g.Softmax(attn, keyMask, inPlace: true);

        IWeightTensor o = g.View(g.MulBatch(softmax, Vs, m_multiHeadNum * batchSize), dims: new long[] { m_multiHeadNum, batchSize, seqLenQ, m_d });
        IWeightTensor W = g.View(g.Permute(o, 1, 2, 0, 3), dims: new long[] { batchSize * seqLenQ, m_multiHeadNum * m_d });

        // Output projection
        IWeightTensor finalAttResults = g.Dropout(g.Affine(W, W0, b0), batchSize, m_dropoutRatio, inPlace: true);

        return graph.Add(finalAttResults, inputQ);
    }
}
public static IWeightTensor AddPositionEmbedding(IComputeGraph g, IWeightTensor posEmbedding, int batchSize, IWeightTensor inputEmbs, float dropoutRatio)
{
    var Column = posEmbedding.Columns;
    int seqLen = inputEmbs.Rows / batchSize;

    using (var posEmbeddingPeek = g.Peek(posEmbedding, 0, 0, seqLen))
    {
        using (var posEmbeddingPeekView = g.View(posEmbeddingPeek, dims: new long[] { 1, seqLen, Column }))
        {
            using (var posEmbeddingPeekViewExp = g.Expand(posEmbeddingPeekView, dims: new long[] { batchSize, seqLen, Column }))
            {
                inputEmbs = g.View(inputEmbs, dims: new long[] { batchSize, seqLen, Column });
                inputEmbs = g.Add(inputEmbs, posEmbeddingPeekViewExp, inPlace: true);
                inputEmbs = g.View(inputEmbs, dims: new long[] { batchSize * seqLen, Column });
            }
        }
    }

    inputEmbs = g.Dropout(inputEmbs, batchSize, dropoutRatio, inPlace: true);

    return inputEmbs;
}
/// <summary>
/// Transformer encoder
/// </summary>
/// <param name="inputs">The input tensor</param>
/// <param name="selfMask">The mask for self attention</param>
/// <param name="dimMask">The mask for padded positions</param>
/// <param name="batchSize">Batch size of input data set</param>
/// <param name="g">The instance of computing graph</param>
/// <returns>The encoded output tensor</returns>
public IWeightTensor Encode(IWeightTensor inputs, IWeightTensor selfMask, IWeightTensor dimMask, int batchSize, IComputeGraph g)
{
    int seqLen = inputs.Rows / batchSize;

    using (IWeightTensor posEmbedding = g.BuildPositionMatrix(seqLen, m_inputDim))
    {
        using (IWeightTensor posEmbeddingRepeat = g.RepeatRows(posEmbedding, batchSize, runGradient: false))
        {
            inputs = g.AddMul(posEmbeddingRepeat, inputs, (float)Math.Sqrt(m_inputDim), runGradientW1: false, runGradientW2: true);
        }
    }

    inputs = g.Dropout(inputs, batchSize, m_dropoutRatio, inPlace: true);

    // Broadcast the self-attention mask over all heads
    var selfMaskRep = g.View(selfMask, dims: new long[] { 1, batchSize, seqLen, seqLen });
    var multiHeadhSelfMaskRep = g.Expand(selfMaskRep, dims: new long[] { m_multiHeadNum, batchSize, seqLen, seqLen });
    var multiHeadhSelfMaskRepView = g.View(multiHeadhSelfMaskRep, dims: new long[] { m_multiHeadNum * batchSize * seqLen, seqLen });

    selfMaskRep.Dispose();
    multiHeadhSelfMaskRep.Dispose();

    using (IComputeGraph subg = g.CreateSubGraph($"{m_name}_Encoder"))
    {
        for (int k = 0; k < m_encoders.Count; k++)
        {
            inputs = g.MaskFill(inputs, dimMask, 0.0f);

            inputs = m_encoders[k].Perform(inputs, inputs, inputs, multiHeadhSelfMaskRepView, batchSize, subg);
            inputs = m_posFFNs[k].Perform(inputs, batchSize, subg);
        }

        inputs.UnbindFromComputeGraph();
    }

    inputs = layerNorm.Norm(inputs, g);

    return inputs;
}
private IWeightTensor AddPositionEmbedding(IComputeGraph g, IWeightTensor posEmbedding, int batchSize, int seqLen, IWeightTensor inputEmbs)
{
    var Column = posEmbedding.Columns;

    inputEmbs = g.Mul(inputEmbs, (float)Math.Sqrt(m_modelMetaData.HiddenDim));

    using (var posEmbeddingPeek = g.PeekRow(posEmbedding, 0, seqLen, false))
    {
        using (var posEmbeddingPeekView = g.View(posEmbeddingPeek, runGradient: false, dims: new long[] { 1, seqLen, Column }))
        {
            using (var posEmbeddingPeekViewExp = g.Expand(posEmbeddingPeekView, runGradient: false, dims: new long[] { batchSize, seqLen, Column }))
            {
                inputEmbs = g.View(inputEmbs, dims: new long[] { batchSize, seqLen, Column });
                inputEmbs = g.Add(inputEmbs, posEmbeddingPeekViewExp, true, false);
                inputEmbs = g.View(inputEmbs, dims: new long[] { batchSize * seqLen, Column });
            }
        }
    }

    inputEmbs = g.Dropout(inputEmbs, batchSize, m_dropoutRatio, inPlace: true);

    return inputEmbs;
}
/// <summary>
/// Decode output sentences in training
/// </summary>
/// <param name="outputSentences">In training mode, they are golden target sentences; otherwise, they are target sentences generated by the decoder</param>
/// <param name="g">The instance of computing graph</param>
/// <param name="encodedOutputs">The encoder output tensor</param>
/// <param name="decoder">The attention decoder</param>
/// <param name="decoderFFLayer">The feed forward output layer of the decoder</param>
/// <param name="embedding">The target-side embedding tensor</param>
/// <param name="batchSize">Batch size of input data set</param>
/// <param name="isTraining">Whether the decoder runs in training mode</param>
/// <returns>The accumulated cost</returns>
private float Decode(List<List<string>> outputSentences, IComputeGraph g, IWeightTensor encodedOutputs, AttentionDecoder decoder, FeedForwardLayer decoderFFLayer, IWeightTensor embedding, int batchSize, bool isTraining = true)
{
    float cost = 0.0f;

    int[] ix_inputs = new int[batchSize];
    for (int i = 0; i < ix_inputs.Length; i++)
    {
        ix_inputs[i] = (int)SENTTAGS.START;
    }

    // Initialize variables according to the current mode
    var originalOutputLengths = isTraining ? ParallelCorpus.PadSentences(outputSentences) : null;
    int seqLen = isTraining ? outputSentences[0].Count : 64;
    var dropoutRatio = isTraining ? m_dropoutRatio : 0.0f;
    HashSet<int> setEndSentId = isTraining ? null : new HashSet<int>();

    if (!isTraining)
    {
        if (outputSentences.Count != 0)
        {
            throw new ArgumentException($"The list for output sentences must be empty when not in training mode.");
        }
        for (int i = 0; i < batchSize; i++)
        {
            outputSentences.Add(new List<string>());
        }
    }

    // Pre-process for the attention model
    var attPreProcessResult = decoder.PreProcess(encodedOutputs, batchSize, g);

    for (int i = 0; i < seqLen; i++)
    {
        // Get embeddings for all sentences in the batch at position i
        List<IWeightTensor> inputs = new List<IWeightTensor>();
        for (int j = 0; j < batchSize; j++)
        {
            inputs.Add(g.PeekRow(embedding, ix_inputs[j]));
        }
        var inputsM = g.ConcatRows(inputs);

        // Decode the output sentence at position i
        var eOutput = decoder.Decode(inputsM, attPreProcessResult, batchSize, g);
        eOutput = g.Dropout(eOutput, batchSize, dropoutRatio, true);
        eOutput = decoderFFLayer.Process(eOutput, batchSize, g);

        // Softmax for output
        using (var probs = g.Softmax(eOutput, runGradients: false, inPlace: true))
        {
            if (isTraining)
            {
                // Calculate the loss for each word in the batch
                for (int k = 0; k < batchSize; k++)
                {
                    using (var probs_k = g.PeekRow(probs, k, runGradients: false))
                    {
                        var ix_targets_k = m_modelMetaData.Vocab.GetTargetWordIndex(outputSentences[k][i]);
                        var score_k = probs_k.GetWeightAt(ix_targets_k);
                        if (i < originalOutputLengths[k])
                        {
                            cost += (float)-Math.Log(score_k);
                        }

                        // Gradient of softmax followed by cross-entropy: probs - onehot(target)
                        probs_k.SetWeightAt(score_k - 1, ix_targets_k);

                        ix_inputs[k] = ix_targets_k;
                    }
                }
                eOutput.CopyWeightsToGradients(probs);
            }
            else
            {
                // Output the i-th target word
                var targetIdx = g.Argmax(probs, 1);
                var targetWords = m_modelMetaData.Vocab.ConvertTargetIdsToString(targetIdx.ToList());
                for (int j = 0; j < targetWords.Count; j++)
                {
                    if (setEndSentId.Contains(j) == false)
                    {
                        outputSentences[j].Add(targetWords[j]);
                        if (targetWords[j] == ParallelCorpus.EOS)
                        {
                            setEndSentId.Add(j);
                        }
                    }
                }

                ix_inputs = targetIdx;
            }
        }

        if (isTraining)
        {
            // Hacky: run backward for the last feed forward layer and the dropout layer now
            // in order to save memory, since they have no time-sequence dependency
            g.RunTopBackward();
            if (m_dropoutRatio > 0.0f)
            {
                g.RunTopBackward();
            }
        }
        else
        {
            if (setEndSentId.Count == batchSize)
            {
                // All target sentences in the current batch are finished, so we can exit
                break;
            }
        }
    }

    return cost;
}
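// A compact sketch of the greedy decoding loop in the inference branch above: at each
// step the argmax token is appended per batch item, and decoding stops once every
// sentence has produced EOS or the step limit is hit. The delegates and names here are
// assumptions for illustration, not the library API.
static List<List<string>> GreedyDecode(Func<int[], int[]> stepArgmax, Func<int, string> idToWord, int[] startIds, int maxLen, string eos)
{
    int batchSize = startIds.Length;
    var outputs = new List<List<string>>();
    for (int i = 0; i < batchSize; i++)
    {
        outputs.Add(new List<string>());
    }

    var finished = new HashSet<int>();
    int[] inputs = startIds;

    for (int step = 0; step < maxLen && finished.Count < batchSize; step++)
    {
        int[] next = stepArgmax(inputs); // run the decoder and take argmax for the whole batch
        for (int j = 0; j < batchSize; j++)
        {
            if (finished.Contains(j))
            {
                continue;
            }
            string word = idToWord(next[j]);
            outputs[j].Add(word);
            if (word == eos)
            {
                finished.Add(j);
            }
        }
        inputs = next; // feed the predictions back in as the next inputs
    }

    return outputs;
}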
/// <summary>
/// Scaled multi-head attention component with skip-connected feed forward layers
/// </summary>
/// <param name="inputQ">The input Q tensor</param>
/// <param name="inputK">The input K tensor</param>
/// <param name="inputV">The input V tensor</param>
/// <param name="keyMask">The mask for softmax</param>
/// <param name="batchSize">Batch size of input data set</param>
/// <param name="graph">The instance of computing graph</param>
/// <param name="outputAttenWeights">Whether to also return the attention weights averaged over all heads</param>
/// <param name="cachedTensors">Optional cache for the K and V projections, reused across decoding steps</param>
/// <returns>Transformed output tensor and, optionally, the merged attention weights</returns>
public (IWeightTensor, IWeightTensor) Perform(IWeightTensor inputQ, IWeightTensor inputK, IWeightTensor inputV, IWeightTensor keyMask, int batchSize, IComputeGraph graph, bool outputAttenWeights = false, Dictionary<string, IWeightTensor> cachedTensors = null)
{
    string keyName = $"{m_name}_MultiHeadAttention";
    using IComputeGraph g = graph.CreateSubGraph(keyName);
    int seqLenQ = inputQ.Rows / batchSize;

    // SeqLenK must be equal to SeqLenV
    int seqLenK = inputK.Rows / batchSize;
    int seqLenV = inputV.Rows / batchSize;

    IWeightTensor inputQNorm = layerNormQ.Norm(inputQ, g);

    // Input projections
    IWeightTensor allQ = g.View(g.Affine(inputQNorm, Q, Qb), dims: new long[] { batchSize, seqLenQ, m_multiHeadNum, m_d });

    // Multi-head attention
    IWeightTensor Qs = g.View(g.AsContiguous(g.Transpose(allQ, 1, 2)), dims: new long[] { batchSize * m_multiHeadNum, seqLenQ, m_d });
    IWeightTensor Ks = null;
    IWeightTensor Vs = null;

    if (cachedTensors == null) // We don't use any cached tensors
    {
        IWeightTensor allK = g.View(g.Affine(inputK, K, Kb), dims: new long[] { batchSize, seqLenK, m_multiHeadNum, m_d });
        IWeightTensor allV = g.View(g.Affine(inputV, V, Vb), dims: new long[] { batchSize, seqLenV, m_multiHeadNum, m_d });

        Ks = g.View(g.AsContiguous(g.Transpose(g.Transpose(allK, 1, 2), 2, 3)), dims: new long[] { batchSize * m_multiHeadNum, m_d, seqLenK });
        Vs = g.View(g.AsContiguous(g.Transpose(allV, 1, 2)), dims: new long[] { batchSize * m_multiHeadNum, seqLenV, m_d });
    }
    else
    {
        string KsCacheName = keyName + "_" + nameof(Ks);
        string VsCacheName = keyName + "_" + nameof(Vs);

        if (cachedTensors.ContainsKey(KsCacheName) == false)
        {
            IWeightTensor allK = g.View(g.Affine(inputK, K, Kb), dims: new long[] { batchSize, seqLenK, m_multiHeadNum, m_d });
            Ks = g.View(g.AsContiguous(g.Transpose(g.Transpose(allK, 1, 2), 2, 3)), dims: new long[] { batchSize * m_multiHeadNum, m_d, seqLenK });
            cachedTensors.Add(KsCacheName, Ks.CopyWeightsRef(KsCacheName, Ks.NeedGradient));
        }
        else
        {
            Ks = cachedTensors[KsCacheName];
        }

        if (cachedTensors.ContainsKey(VsCacheName) == false)
        {
            IWeightTensor allV = g.View(g.Affine(inputV, V, Vb), dims: new long[] { batchSize, seqLenV, m_multiHeadNum, m_d });
            Vs = g.View(g.AsContiguous(g.Transpose(allV, 1, 2)), dims: new long[] { batchSize * m_multiHeadNum, seqLenV, m_d });
            cachedTensors.Add(VsCacheName, Vs.CopyWeightsRef(VsCacheName, Vs.NeedGradient));
        }
        else
        {
            Vs = cachedTensors[VsCacheName];
        }
    }

    // Scaled softmax
    float scale = 1.0f / (float)Math.Sqrt(m_d);
    var attn = g.MulBatch(Qs, Ks, scale);
    attn = g.View(attn, dims: new long[] { batchSize, m_multiHeadNum, seqLenQ, seqLenK });

    if (keyMask != null)
    {
        attn = g.Add(attn, keyMask, inPlace: true);
    }

    var attnProbs = g.Softmax(attn, inPlace: true);

    IWeightTensor sumAttnWeights = null;
    if (outputAttenWeights)
    {
        // Merge the attention probabilities over all heads
        sumAttnWeights = g.Select(attnProbs, 1, 0);
        for (int i = 1; i < m_multiHeadNum; i++)
        {
            var tmp = g.Select(attnProbs, 1, i);
            sumAttnWeights = g.Add(sumAttnWeights, tmp);
        }

        sumAttnWeights = graph.Div(sumAttnWeights, (float)m_multiHeadNum);
        sumAttnWeights = graph.View(sumAttnWeights, new long[] { batchSize * seqLenQ, seqLenK });
    }

    attnProbs = g.View(attnProbs, dims: new long[] { batchSize * m_multiHeadNum, seqLenQ, seqLenK });

    IWeightTensor o = g.View(g.MulBatch(attnProbs, Vs), dims: new long[] { batchSize, m_multiHeadNum, seqLenQ, m_d });
    IWeightTensor W = g.View(g.AsContiguous(g.Transpose(o, 1, 2)), dims: new long[] { batchSize * seqLenQ, m_multiHeadNum * m_d });

    // Output projection
    IWeightTensor finalAttResults = g.Dropout(g.Affine(W, W0, b0), batchSize, m_dropoutRatio, inPlace: true);
    IWeightTensor result = graph.Add(finalAttResults, inputQ, inPlace: true);

    return (result, sumAttnWeights);
}
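// A minimal sketch of the key/value caching idea used above during incremental decoding:
// the K and V projections of the (unchanging) source-side input do not vary between
// decoding steps, so they are computed once per layer and then looked up by name, which
// mirrors the keyName + "_Ks" / "_Vs" scheme. Generic helper for illustration only.
static T GetOrAddCached<T>(Dictionary<string, T> cache, string key, Func<T> compute)
{
    if (!cache.TryGetValue(key, out T value))
    {
        value = compute(); // first decoding step: run the projection
        cache[key] = value; // later steps: reuse the cached result
    }
    return value;
}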
/// <summary>
/// Decode output sentences in training
/// </summary>
/// <param name="outputSentences">The golden target sentences</param>
/// <param name="g">The instance of computing graph</param>
/// <param name="encodedOutputs">The encoder output matrix</param>
/// <param name="decoder">The attention decoder</param>
/// <param name="decoderFFLayer">The feed forward output layer of the decoder</param>
/// <param name="Embedding">The target-side embedding matrix</param>
/// <param name="predictSentence">The predicted sentences, if any</param>
/// <returns>The accumulated cost</returns>
private float Decode(List<List<string>> outputSentences, IComputeGraph g, IWeightMatrix encodedOutputs, AttentionDecoder decoder, FeedForwardLayer decoderFFLayer, IWeightMatrix Embedding, out List<List<string>> predictSentence)
{
    predictSentence = null;
    float cost = 0.0f;

    var attPreProcessResult = decoder.PreProcess(encodedOutputs, g);

    var originalOutputLengths = PadSentences(outputSentences);
    int seqLen = outputSentences[0].Count;

    int[] ix_inputs = new int[m_batchSize];
    int[] ix_targets = new int[m_batchSize];
    for (int i = 0; i < ix_inputs.Length; i++)
    {
        ix_inputs[i] = (int)SENTTAGS.START;
    }

    for (int i = 0; i < seqLen + 1; i++)
    {
        // Get embeddings for all sentences in the batch at position i
        List<IWeightMatrix> inputs = new List<IWeightMatrix>();
        for (int j = 0; j < m_batchSize; j++)
        {
            List<string> OutputSentence = outputSentences[j];

            ix_targets[j] = (int)SENTTAGS.UNK;
            if (i >= seqLen)
            {
                ix_targets[j] = (int)SENTTAGS.END;
            }
            else
            {
                if (m_tgtWordToIndex.ContainsKey(OutputSentence[i]))
                {
                    ix_targets[j] = m_tgtWordToIndex[OutputSentence[i]];
                }
            }

            var x = g.PeekRow(Embedding, ix_inputs[j]);

            inputs.Add(x);
        }

        var inputsM = g.ConcatRows(inputs);

        // Decode the output sentence at position i
        var eOutput = decoder.Decode(inputsM, attPreProcessResult, g);
        if (m_dropoutRatio > 0.0f)
        {
            eOutput = g.Dropout(eOutput, m_dropoutRatio);
        }

        var o = decoderFFLayer.Process(eOutput, g);

        // Softmax for output
        // var o = g.MulAdd(eOutput, Whd, bds);
        var probs = g.Softmax(o, false);

        o.ReleaseWeight();

        // Calculate the loss for each word in the batch
        List<IWeightMatrix> probs_g = g.UnFolderRow(probs, m_batchSize, false);
        for (int k = 0; k < m_batchSize; k++)
        {
            var probs_k = probs_g[k];
            var score_k = probs_k.GetWeightAt(ix_targets[k]);

            if (i < originalOutputLengths[k] + 1)
            {
                cost += (float)-Math.Log(score_k);
            }

            probs_k.SetWeightAt(score_k - 1, ix_targets[k]);

            ix_inputs[k] = ix_targets[k];
            probs_k.Dispose();
        }

        o.SetGradientByWeight(probs);

        // Hacky: run backward for the last feed forward layer and the dropout layer now
        // in order to save memory, since they have no time-sequence dependency
        g.RunTopBackward();
        g.RunTopBackward();
        if (m_dropoutRatio > 0.0f)
        {
            g.RunTopBackward();
        }
    }

    return cost;
}
public IWeightTensor Process(IWeightTensor inputT, int batchSize, IComputeGraph g)
{
    var res = g.Affine(inputT, this.m_Whd, this.m_Bd);
    return g.Dropout(res, batchSize, this.m_dropoutRatio, true);
}
public IWeightTensor Process(IWeightTensor inputT, int batchSize, IComputeGraph g)
{
    IWeightTensor res = g.Affine(inputT, m_Whd, m_Bd);
    return g.Dropout(res, batchSize, m_dropoutRatio, inPlace: true);
}
/// <summary>
/// Scaled multi-head attention component with skip-connected feed forward layers
/// </summary>
/// <param name="inputQ">The input Q tensor</param>
/// <param name="inputK">The input K tensor</param>
/// <param name="inputV">The input V tensor</param>
/// <param name="keyMask">The mask for softmax</param>
/// <param name="batchSize">Batch size of input data set</param>
/// <param name="graph">The instance of computing graph</param>
/// <returns>Transformed output tensor</returns>
public IWeightTensor Perform(IWeightTensor inputQ, IWeightTensor inputK, IWeightTensor inputV, IWeightTensor keyMask, int batchSize, IComputeGraph graph)
{
    using (IComputeGraph g = graph.CreateSubGraph($"{m_name}_MultiHeadAttention"))
    {
        int seqLenQ = inputQ.Rows / batchSize;

        // SeqLenK must be equal to SeqLenV
        int seqLenK = inputK.Rows / batchSize;
        int seqLenV = inputV.Rows / batchSize;

        IWeightTensor inputQNorm = layerNormQ.Norm(inputQ, g);
        if (inputK == inputQ)
        {
            inputK = inputQNorm;
        }
        if (inputV == inputQ)
        {
            inputV = inputQNorm;
        }

        // Input projections
        float scale = 1.0f;
        IWeightTensor allQ = g.View(g.Affine(inputQNorm, Q, Qb, scale), dims: new long[] { batchSize, seqLenQ, m_multiHeadNum, m_d });
        IWeightTensor allK = g.View(g.Affine(inputK, K, Kb, scale), dims: new long[] { batchSize, seqLenK, m_multiHeadNum, m_d });
        IWeightTensor allV = g.View(g.Affine(inputV, V, Vb, scale), dims: new long[] { batchSize, seqLenV, m_multiHeadNum, m_d });

        // Multi-head attention
        IWeightTensor Qs = g.View(g.AsContiguous(g.Transpose(allQ, 1, 2)), dims: new long[] { batchSize * m_multiHeadNum, seqLenQ, m_d });
        IWeightTensor Ks = g.View(g.AsContiguous(g.Transpose(g.Transpose(allK, 1, 2), 2, 3)), dims: new long[] { batchSize * m_multiHeadNum, m_d, seqLenK });
        IWeightTensor Vs = g.View(g.AsContiguous(g.Transpose(allV, 1, 2)), dims: new long[] { batchSize * m_multiHeadNum, seqLenV, m_d });

        // Scaled softmax
        scale = 1.0f / (float)Math.Sqrt(m_d);
        IWeightTensor attn = g.MulBatch(Qs, Ks, batchSize * m_multiHeadNum, scale);

        if (keyMask != null)
        {
            // Broadcast the additive key mask over all heads, then add it to the attention logits
            using (var keyMaskView = g.View(keyMask, runGradient: false, dims: new long[] { batchSize, 1, seqLenQ, seqLenK }))
            {
                using (var keyMaskViewExp = g.Expand(keyMaskView, runGradient: false, dims: new long[] { batchSize, m_multiHeadNum, seqLenQ, seqLenK }))
                {
                    using (var keyMaskViewExpConti = g.AsContiguous(keyMaskViewExp, runGradient: false))
                    {
                        using (var keyMaskViewExpContiView = g.View(keyMaskViewExpConti, runGradient: false, dims: new long[] { batchSize * m_multiHeadNum, seqLenQ, seqLenK }))
                        {
                            attn = g.Add(attn, keyMaskViewExpContiView, runGradient1: true, runGradient2: false);
                        }
                    }
                }
            }
        }

        IWeightTensor softmax = g.Softmax(attn, inPlace: true);

        IWeightTensor o = g.View(g.MulBatch(softmax, Vs, batchSize * m_multiHeadNum), dims: new long[] { batchSize, m_multiHeadNum, seqLenQ, m_d });
        IWeightTensor W = g.View(g.AsContiguous(g.Transpose(o, 1, 2)), dims: new long[] { batchSize * seqLenQ, m_multiHeadNum * m_d });

        // Output projection
        IWeightTensor finalAttResults = g.Dropout(g.Affine(W, W0, b0), batchSize, m_dropoutRatio, inPlace: true);

        return graph.Add(finalAttResults, inputQ);
    }
}
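// A small sketch of the additive padding mask consumed above: padded key positions get a
// large negative value (e.g. -1e9) so that softmax assigns them near-zero probability,
// and the same [seqLenQ, seqLenK] mask is broadcast across all heads of a batch item,
// just as the View/Expand calls above do. How the library actually materializes keyMask
// is an assumption; illustration only.
static float[,] BuildAdditiveKeyMask(bool[] isPaddingKey, int seqLenQ)
{
    int seqLenK = isPaddingKey.Length;
    var mask = new float[seqLenQ, seqLenK];
    for (int q = 0; q < seqLenQ; q++)
    {
        for (int k = 0; k < seqLenK; k++)
        {
            mask[q, k] = isPaddingKey[k] ? -1e9f : 0.0f;
        }
    }
    return mask;
}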