/// <summary>
/// Scaled multi-head attention component with skip-connected feed-forward layers
/// </summary>
/// <param name="inputQ">The input Q tensor</param>
/// <param name="keyMask">The mask for softmax</param>
/// <param name="batchSize">Batch size of input data set</param>
/// <param name="graph">The instance of computing graph</param>
/// <returns>Transformed output tensor</returns>
public (IWeightTensor, IWeightTensor) Perform(IWeightTensor inputQ, IWeightTensor keyMask, int batchSize, IComputeGraph graph, bool outputAttenWeights = false)
{
    using IComputeGraph g = graph.CreateSubGraph($"{m_name}_MultiHeadAttention");
    int seqLenQ = inputQ.Rows / batchSize;
    IWeightTensor inputQNorm = layerNormQ.Norm(inputQ, g);

    // Input projections
    var weightedQKV = g.View(g.Affine(inputQNorm, QKV, QKVb), dims: new long[] { batchSize, seqLenQ, 3, m_multiHeadNum, m_d });
    var allQ = g.Select(weightedQKV, 2, 0);
    var allK = g.Select(weightedQKV, 2, 1);
    var allV = g.Select(weightedQKV, 2, 2);

    // Multi-head attention
    IWeightTensor Qs = g.View(g.AsContiguous(g.Transpose(allQ, 1, 2)), dims: new long[] { batchSize * m_multiHeadNum, seqLenQ, m_d });
    IWeightTensor Ks = g.View(g.AsContiguous(g.Transpose(g.Transpose(allK, 1, 2), 2, 3)), dims: new long[] { batchSize * m_multiHeadNum, m_d, seqLenQ });
    IWeightTensor Vs = g.View(g.AsContiguous(g.Transpose(allV, 1, 2)), dims: new long[] { batchSize * m_multiHeadNum, seqLenQ, m_d });

    // Scaled softmax
    float scale = 1.0f / (float)(Math.Sqrt(m_d));
    var attn = g.MulBatch(Qs, Ks, scale);
    attn = g.View(attn, dims: new long[] { batchSize, m_multiHeadNum, seqLenQ, seqLenQ });

    if (keyMask != null)
    {
        attn = g.Add(attn, keyMask, inPlace: true);
    }

    var attnProbs = g.Softmax(attn, inPlace: true);

    IWeightTensor sumAttnWeights = null;
    if (outputAttenWeights)
    {
        // Merge all attention probs over multi-heads
        sumAttnWeights = graph.Sum(attnProbs, 1);
        sumAttnWeights = graph.Div(sumAttnWeights, (float)m_multiHeadNum);
        sumAttnWeights = graph.View(sumAttnWeights, new long[] { batchSize * seqLenQ, seqLenQ });
    }

    attnProbs = g.View(attnProbs, dims: new long[] { batchSize * m_multiHeadNum, seqLenQ, seqLenQ });

    IWeightTensor o = g.View(g.MulBatch(attnProbs, Vs), dims: new long[] { batchSize, m_multiHeadNum, seqLenQ, m_d });
    IWeightTensor W = g.View(g.AsContiguous(g.Transpose(o, 1, 2)), dims: new long[] { batchSize * seqLenQ, m_multiHeadNum * m_d });

    // Output projection
    IWeightTensor finalAttResults = g.Dropout(g.Affine(W, W0, b0), batchSize, m_dropoutRatio, inPlace: true);
    IWeightTensor result = graph.Add(finalAttResults, inputQ, inPlace: true);

    return (result, sumAttnWeights);
}
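// For reference (summary added for clarity, not part of the original source): per head, the
// method above computes the standard scaled dot-product attention
//
//     Attention(Q, K, V) = softmax(Q K^T / sqrt(d) + M) V
//
// where d = m_d is the per-head dimension and M is the optional additive key mask. The output
// of the projection (W0, b0) is then added back to inputQ as a residual (skip) connection.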
public IWeightTensor Perform(IWeightTensor inputQ, IWeightTensor keyMask, int batchSize, IComputeGraph graph)
{
    if (m_sharedQKV == false)
    {
        throw new ArgumentException($"Layer '{m_name}' is not in shared QKV mode, please call another Perform function with three separate input tensors.");
    }

    using (IComputeGraph g = graph.CreateSubGraph($"{m_name}_MultiHeadAttention_SharedQKV"))
    {
        int seqLenQ = inputQ.Rows / batchSize;
        IWeightTensor inputQNorm = layerNormQ.Norm(inputQ, g);

        // Input projections
        float scale = 1.0f / (float)(m_inputDim);
        IWeightTensor mulQ, mulK, mulV;

        using (IWeightTensor inputQNormView = g.View(inputQNorm, dims: new long[] { 1, inputQ.Rows, inputQ.Columns }))
        {
            using (IWeightTensor inputQNormViewExp = g.Expand(inputQNormView, dims: new long[] { 3, inputQ.Rows, inputQ.Columns }))
            {
                using (IWeightTensor mulQKV = g.MulBatch(inputQNormViewExp, QKV, 3, scale))
                {
                    mulQ = g.Select(mulQKV, 0, 0);
                    mulK = g.Select(mulQKV, 0, 1);
                    mulV = g.Select(mulQKV, 0, 2);
                }
            }
        }

        IWeightTensor allQ = g.View(mulQ, dims: new long[] { batchSize, seqLenQ, m_multiHeadNum, m_d });
        IWeightTensor allK = g.View(mulK, dims: new long[] { batchSize, seqLenQ, m_multiHeadNum, m_d });
        IWeightTensor allV = g.View(mulV, dims: new long[] { batchSize, seqLenQ, m_multiHeadNum, m_d });

        // Multi-head attention
        IWeightTensor Qs = g.View(g.Permute(allQ, 2, 0, 1, 3), dims: new long[] { m_multiHeadNum * batchSize, seqLenQ, m_d });
        IWeightTensor Ks = g.View(g.Permute(allK, 2, 0, 3, 1), dims: new long[] { m_multiHeadNum * batchSize, m_d, seqLenQ });
        IWeightTensor Vs = g.View(g.Permute(allV, 2, 0, 1, 3), dims: new long[] { m_multiHeadNum * batchSize, seqLenQ, m_d });

        // Scaled softmax
        scale = 1.0f / (float)(m_d);
        IWeightTensor attn = g.MulBatch(Qs, Ks, m_multiHeadNum * batchSize, scale);
        IWeightTensor softmax = g.Softmax(attn, keyMask, inPlace: true);

        IWeightTensor o = g.View(g.MulBatch(softmax, Vs, m_multiHeadNum * batchSize), dims: new long[] { m_multiHeadNum, batchSize, seqLenQ, m_d });
        IWeightTensor W = g.View(g.Permute(o, 1, 2, 0, 3), dims: new long[] { batchSize * seqLenQ, m_multiHeadNum * m_d });

        // Output projection
        IWeightTensor finalAttResults = g.Dropout(g.Affine(W, W0, b0), batchSize, m_dropoutRatio, inPlace: true);

        return graph.Add(finalAttResults, inputQ);
    }
}
/// <summary>
/// Scaled multi-head attention component with skip-connected feed-forward layers
/// </summary>
/// <param name="inputQ">The input Q tensor</param>
/// <param name="inputK">The input K tensor</param>
/// <param name="inputV">The input V tensor</param>
/// <param name="keyMask">The mask for softmax</param>
/// <param name="batchSize">Batch size of input data set</param>
/// <param name="graph">The instance of computing graph</param>
/// <returns>Transformed output tensor</returns>
public IWeightTensor Perform(IWeightTensor inputQ, IWeightTensor inputK, IWeightTensor inputV, IWeightTensor keyMask, int batchSize, IComputeGraph graph)
{
    using (IComputeGraph g = graph.CreateSubGraph($"{m_name}_MultiHeadAttention"))
    {
        int seqLenQ = inputQ.Rows / batchSize;

        // SeqLenK must be equal to SeqLenV
        int seqLenK = inputK.Rows / batchSize;
        int seqLenV = inputV.Rows / batchSize;

        IWeightTensor inputQNorm = layerNorm1.Norm(inputQ, g);
        IWeightTensor inputKNorm = (inputK == inputQ) ? inputQNorm : inputK; // layerNorm1.Norm(inputK, g);
        IWeightTensor inputVNorm = (inputK == inputV) ? inputKNorm : inputV; // layerNorm1.Norm(inputV, g);

        // Input projections
        IWeightTensor allQ = g.View(g.Affine(inputQNorm, Q, Qb), dims: new long[] { batchSize, seqLenQ, m_multiHeadNum, m_d });
        IWeightTensor allK = g.View(g.Affine(inputKNorm, K, Kb), dims: new long[] { batchSize, seqLenK, m_multiHeadNum, m_d });
        IWeightTensor allV = g.View(g.Affine(inputVNorm, V, Vb), dims: new long[] { batchSize, seqLenV, m_multiHeadNum, m_d });

        // Multi-head attention
        IWeightTensor Qs = g.View(g.Permute(allQ, 2, 0, 1, 3), dims: new long[] { m_multiHeadNum * batchSize, seqLenQ, m_d });
        IWeightTensor Ks = g.View(g.Permute(allK, 2, 0, 3, 1), dims: new long[] { m_multiHeadNum * batchSize, m_d, seqLenK });
        IWeightTensor Vs = g.View(g.Permute(allV, 2, 0, 1, 3), dims: new long[] { m_multiHeadNum * batchSize, seqLenV, m_d });

        // Scaled softmax
        float scale = 1.0f / (float)Math.Sqrt(m_d);
        IWeightTensor attn = g.MulBatch(Qs, Ks, m_multiHeadNum * batchSize, scale);
        IWeightTensor attn2 = g.View(attn, dims: new long[] { m_multiHeadNum * batchSize * seqLenQ, seqLenK });

        if (keyMask != null)
        {
            // attn2 = g.Add(attn2, mask, runGradient2: false);
            attn2 = g.MaskFill(attn2, keyMask, -1e9f);
        }

        IWeightTensor softmax = g.Softmax(attn2, inPlace: true);
        IWeightTensor softmax2 = g.View(softmax, dims: new long[] { m_multiHeadNum * batchSize, seqLenQ, seqLenK });
        IWeightTensor o = g.View(g.MulBatch(softmax2, Vs, m_multiHeadNum * batchSize), dims: new long[] { m_multiHeadNum, batchSize, seqLenQ, m_d });
        IWeightTensor W = g.View(g.Permute(o, 1, 2, 0, 3), dims: new long[] { batchSize * seqLenQ, m_multiHeadNum * m_d });

        // Output projection
        IWeightTensor finalAttResults = g.Dropout(g.Affine(W, W0, b0), batchSize, m_dropoutRatio, inPlace: true);

        return graph.Add(finalAttResults, inputQ);
    }
}
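// For reference (note added for clarity, not part of the original source): when keyMask is
// provided, MaskFill writes a large negative value (-1e9f) into the masked positions of the
// attention scores before the softmax,
//
//     attn_masked[i, j] = attn[i, j]   if position j is visible
//                       = -1e9         if position j is masked
//
// so the softmax drives the masked entries to (effectively) zero probability while
// renormalizing over the visible positions.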
public IWeightTensor Perform(IWeightTensor state, AttentionPreProcessResult attenPreProcessResult, int batchSize, IComputeGraph graph)
{
    int srcSeqLen = attenPreProcessResult.inputsBatchFirst.Rows / batchSize;

    using (IComputeGraph g = graph.CreateSubGraph(m_name))
    {
        // Affine decoder state
        IWeightTensor wc = g.Affine(state, m_Wa, m_bWa);

        // Expand dims from [batchSize x decoder_dim] to [batchSize x srcSeqLen x decoder_dim]
        IWeightTensor wc1 = g.View(wc, batchSize, 1, wc.Columns);
        IWeightTensor wcExp = g.Expand(wc1, batchSize, srcSeqLen, wc.Columns);

        IWeightTensor ggs = null;
        if (m_enableCoverageModel)
        {
            // Get coverage model status at {t-1}
            IWeightTensor wCoverage = g.Affine(m_coverage.Hidden, m_Wc, m_bWc);
            IWeightTensor wCoverage1 = g.View(wCoverage, batchSize, srcSeqLen, -1);

            ggs = g.AddTanh(attenPreProcessResult.uhs, wcExp, wCoverage1);
        }
        else
        {
            ggs = g.AddTanh(attenPreProcessResult.uhs, wcExp);
        }

        IWeightTensor ggss = g.View(ggs, batchSize * srcSeqLen, -1);
        IWeightTensor atten = g.Mul(ggss, m_V);

        IWeightTensor attenT = g.Transpose(atten);
        IWeightTensor attenT2 = g.View(attenT, batchSize, srcSeqLen);

        IWeightTensor attenSoftmax1 = g.Softmax(attenT2, inPlace: true);

        IWeightTensor attenSoftmax = g.View(attenSoftmax1, batchSize, 1, srcSeqLen);
        IWeightTensor inputs2 = g.View(attenPreProcessResult.inputsBatchFirst, batchSize, srcSeqLen, attenPreProcessResult.inputsBatchFirst.Columns);

        IWeightTensor contexts = graph.MulBatch(attenSoftmax, inputs2, batchSize);

        if (m_enableCoverageModel)
        {
            // Concatenate tensor as input for coverage model
            IWeightTensor aCoverage = g.View(attenSoftmax1, attenPreProcessResult.inputsBatchFirst.Rows, 1);
            IWeightTensor state2 = g.View(state, batchSize, 1, state.Columns);
            IWeightTensor state3 = g.Expand(state2, batchSize, srcSeqLen, state.Columns);
            IWeightTensor state4 = g.View(state3, batchSize * srcSeqLen, -1);

            IWeightTensor concate = g.ConcatColumns(aCoverage, attenPreProcessResult.inputsBatchFirst, state4);
            m_coverage.Step(concate, graph);
        }

        return contexts;
    }
}
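// For reference (summary added for clarity, not part of the original source): the method above
// is additive (Bahdanau-style) attention with an optional coverage term. With encoder outputs
// h_s (uhs = U h_s, precomputed), decoder state s_t and coverage state c_{t-1}, it evaluates
//
//     e_{t,s} = v^T tanh(U h_s + W_a s_t + W_c c_{t-1})
//     a_t     = softmax(e_t)
//     ctx_t   = sum_s a_{t,s} h_s
//
// which corresponds to AddTanh(uhs, wcExp, wCoverage1), the Softmax over source positions, and
// the batched multiplication of the attention weights with inputsBatchFirst.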
/// <summary>
/// Scaled multi-head attention component with skip-connected feed-forward layers
/// </summary>
/// <param name="input">The input tensor</param>
/// <param name="graph">The instance of computing graph</param>
/// <returns></returns>
public IWeightTensor Perform(IWeightTensor input, IComputeGraph graph)
{
    IComputeGraph g = graph.CreateSubGraph(m_name);
    var seqLen = input.Rows / m_batchSize;

    // Input projections
    var allQ = g.View(Q.Process(input, g), m_batchSize, seqLen, m_multiHeadNum, m_d);
    var allK = g.View(K.Process(input, g), m_batchSize, seqLen, m_multiHeadNum, m_d);
    var allV = g.View(V.Process(input, g), m_batchSize, seqLen, m_multiHeadNum, m_d);

    // Multi-head attention
    var Qs = g.View(g.Permute(allQ, 2, 0, 1, 3), m_multiHeadNum * m_batchSize, seqLen, m_d);
    var Ks = g.View(g.Permute(allK, 2, 0, 3, 1), m_multiHeadNum * m_batchSize, m_d, seqLen);
    var Vs = g.View(g.Permute(allV, 2, 0, 1, 3), m_multiHeadNum * m_batchSize, seqLen, m_d);

    // Scaled softmax
    float scale = 1.0f / (float)Math.Sqrt(m_d);
    var attn = g.MulBatch(Qs, Ks, m_multiHeadNum * m_batchSize, scale);
    var attn2 = g.View(attn, m_multiHeadNum * m_batchSize * seqLen, seqLen);

    var softmax = g.Softmax(attn2);
    var softmax2 = g.View(softmax, m_multiHeadNum * m_batchSize, seqLen, seqLen);
    var o = g.View(g.MulBatch(softmax2, Vs, m_multiHeadNum * m_batchSize), m_multiHeadNum, m_batchSize, seqLen, m_d);
    var W = g.View(g.Permute(o, 1, 2, 0, 3), m_batchSize * seqLen, m_multiHeadNum * m_d);

    // Output projection
    var finalAttResults = g.Affine(W, W0, b0);

    // Skip connection and layer normalization
    var addedAttResult = g.Add(finalAttResults, input);
    var normAddedAttResult = layerNorm1.Process(addedAttResult, g);

    // Feed forward
    var ffnResult = feedForwardLayer1.Process(normAddedAttResult, g);
    var reluFFNResult = g.Relu(ffnResult);
    var ffn2Result = feedForwardLayer2.Process(reluFFNResult, g);

    // Skip connection and layer normalization
    var addFFNResult = g.Add(ffn2Result, normAddedAttResult);
    var normAddFFNResult = layerNorm2.Process(addFFNResult, g);

    return normAddFFNResult;
}
/// <summary>
/// Scaled multi-head attention component with skip-connected feed-forward layers
/// </summary>
/// <param name="input">The input tensor</param>
/// <param name="batchSize">Batch size of input data set</param>
/// <param name="graph">The instance of computing graph</param>
/// <returns></returns>
public IWeightTensor Perform(IWeightTensor input, int batchSize, IComputeGraph graph)
{
    using (IComputeGraph g = graph.CreateSubGraph(m_name))
    {
        int seqLen = input.Rows / batchSize;
        IWeightTensor nInput = layerNorm1.Norm(input, g);

        // Input projections
        IWeightTensor allQ = g.View(g.Affine(nInput, Q, Qb), batchSize, seqLen, m_multiHeadNum, m_d);
        IWeightTensor allK = g.View(g.Affine(nInput, K, Kb), batchSize, seqLen, m_multiHeadNum, m_d);
        IWeightTensor allV = g.View(g.Affine(nInput, V, Vb), batchSize, seqLen, m_multiHeadNum, m_d);

        // Multi-head attention
        IWeightTensor Qs = g.View(g.Permute(allQ, 2, 0, 1, 3), m_multiHeadNum * batchSize, seqLen, m_d);
        IWeightTensor Ks = g.View(g.Permute(allK, 2, 0, 3, 1), m_multiHeadNum * batchSize, m_d, seqLen);
        IWeightTensor Vs = g.View(g.Permute(allV, 2, 0, 1, 3), m_multiHeadNum * batchSize, seqLen, m_d);

        // Scaled softmax
        float scale = 1.0f / (float)Math.Sqrt(m_d);
        IWeightTensor attn = g.MulBatch(Qs, Ks, m_multiHeadNum * batchSize, scale);
        IWeightTensor attn2 = g.View(attn, m_multiHeadNum * batchSize * seqLen, seqLen);

        IWeightTensor softmax = g.Softmax(attn2, inPlace: true);
        IWeightTensor softmax2 = g.View(softmax, m_multiHeadNum * batchSize, seqLen, seqLen);
        IWeightTensor o = g.View(g.MulBatch(softmax2, Vs, m_multiHeadNum * batchSize), m_multiHeadNum, batchSize, seqLen, m_d);
        IWeightTensor W = g.View(g.Permute(o, 1, 2, 0, 3), batchSize * seqLen, m_multiHeadNum * m_d);

        // Output projection
        IWeightTensor finalAttResults = g.Dropout(g.Affine(W, W0, b0), batchSize, m_dropoutRatio, inPlace: true);

        // Skip connection and layer normalization
        IWeightTensor normAddedAttResult = layerNorm2.AddNorm(finalAttResults, input, g);

        // Feed forward
        IWeightTensor ffnResult = feedForwardLayer1.Process(normAddedAttResult, batchSize, g);
        IWeightTensor reluFFNResult = g.Relu(ffnResult);
        IWeightTensor ffn2Result = feedForwardLayer2.Process(reluFFNResult, batchSize, g);

        // Skip connection and layer normalization
        IWeightTensor addFFNResult = graph.Add(ffn2Result, normAddedAttResult);

        return addFFNResult;
    }
}
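// For reference (summary added for clarity, not part of the original source, and assuming
// layerNorm2.AddNorm(a, b, g) computes LayerNorm(a + b)): the overload above evaluates a
// Transformer encoder block as roughly
//
//     h   = LayerNorm2(input + Dropout(MultiHeadAttention(LayerNorm1(input))))
//     out = h + FFN2(ReLU(FFN1(h)))
//
// i.e. one residual (skip) connection around the attention sub-layer and another around the
// position-wise feed-forward sub-layer, whereas the preceding overload normalizes after each
// residual addition instead of before the attention.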
/// <summary>
/// Scaled multi-head attention component with skip-connected feed-forward layers
/// </summary>
/// <param name="inputQ">The input Q tensor</param>
/// <param name="inputK">The input K tensor</param>
/// <param name="inputV">The input V tensor</param>
/// <param name="keyMask">The mask for softmax</param>
/// <param name="batchSize">Batch size of input data set</param>
/// <param name="graph">The instance of computing graph</param>
/// <returns>Transformed output tensor</returns>
public IWeightTensor Perform(IWeightTensor inputQ, IWeightTensor inputK, IWeightTensor inputV, IWeightTensor keyMask, int batchSize, IComputeGraph graph)
{
    if (m_sharedQKV)
    {
        throw new ArgumentException($"Layer '{m_name}' is in shared QKV mode, please call another Perform function with a single input tensor.");
    }

    using (IComputeGraph g = graph.CreateSubGraph($"{m_name}_MultiHeadAttention"))
    {
        int seqLenQ = inputQ.Rows / batchSize;

        // SeqLenK must be equal to SeqLenV
        int seqLenK = inputK.Rows / batchSize;
        int seqLenV = inputV.Rows / batchSize;

        IWeightTensor inputQNorm = layerNormQ.Norm(inputQ, g);

        // Input projections
        float scale = 1.0f / (float)(m_inputDim);
        IWeightTensor allQ = g.View(g.Affine(inputQNorm, Q, Qb, scale), dims: new long[] { batchSize, seqLenQ, m_multiHeadNum, m_d });
        IWeightTensor allK = g.View(g.Affine(inputK, K, Kb, scale), dims: new long[] { batchSize, seqLenK, m_multiHeadNum, m_d });
        IWeightTensor allV = g.View(g.Affine(inputV, V, Vb, scale), dims: new long[] { batchSize, seqLenV, m_multiHeadNum, m_d });

        // Multi-head attention
        IWeightTensor Qs = g.View(g.Permute(allQ, 2, 0, 1, 3), dims: new long[] { m_multiHeadNum * batchSize, seqLenQ, m_d });
        IWeightTensor Ks = g.View(g.Permute(allK, 2, 0, 3, 1), dims: new long[] { m_multiHeadNum * batchSize, m_d, seqLenK });
        IWeightTensor Vs = g.View(g.Permute(allV, 2, 0, 1, 3), dims: new long[] { m_multiHeadNum * batchSize, seqLenV, m_d });

        // Scaled softmax
        scale = 1.0f / (float)(m_d);
        IWeightTensor attn = g.MulBatch(Qs, Ks, m_multiHeadNum * batchSize, scale);
        IWeightTensor softmax = g.Softmax(attn, keyMask, inPlace: true);

        IWeightTensor o = g.View(g.MulBatch(softmax, Vs, m_multiHeadNum * batchSize), dims: new long[] { m_multiHeadNum, batchSize, seqLenQ, m_d });
        IWeightTensor W = g.View(g.Permute(o, 1, 2, 0, 3), dims: new long[] { batchSize * seqLenQ, m_multiHeadNum * m_d });

        // Output projection
        IWeightTensor finalAttResults = g.Dropout(g.Affine(W, W0, b0), batchSize, m_dropoutRatio, inPlace: true);

        return graph.Add(finalAttResults, inputQ);
    }
}
public IWeightMatrix Perform(IWeightMatrix state, AttentionPreProcessResult attenPreProcessResult, IComputeGraph g)
{
    var bWas = g.RepeatRows(bWa, state.Rows);
    var wc = g.MulAdd(state, Wa, bWas);
    var wcs = g.RepeatRows(wc, attenPreProcessResult.inputs.Rows / m_batchSize);
    var ggs = g.AddTanh(attenPreProcessResult.uhs, wcs);
    var atten = g.Mul(ggs, V);

    var atten2 = g.PermuteBatch(atten, m_batchSize);
    var attenT = g.Transpose2(atten2);
    var attenT2 = g.View(attenT, m_batchSize, attenPreProcessResult.inputs.Rows / m_batchSize);

    var attenSoftmax = g.Softmax(attenT2);

    IWeightMatrix contexts = g.MulBatch(attenSoftmax, attenPreProcessResult.inputs, m_batchSize);

    return contexts;
}
public IWeightTensor Perform(IWeightTensor state, AttentionPreProcessResult attenPreProcessResult, int batchSize, IComputeGraph graph)
{
    IComputeGraph g = graph.CreateSubGraph(m_name);

    var wc = g.Affine(state, m_Wa, m_bWa);
    var wcs = g.RepeatRows(wc, attenPreProcessResult.inputs.Rows / batchSize);
    var ggs = g.AddTanh(attenPreProcessResult.uhs, wcs);
    var atten = g.Mul(ggs, m_V);

    var atten2 = g.TransposeBatch(atten, batchSize);
    var attenT = g.Transpose(atten2);
    var attenT2 = g.View(attenT, batchSize, attenPreProcessResult.inputs.Rows / batchSize);

    var attenSoftmax1 = g.Softmax(attenT2, inPlace: true);

    var attenSoftmax = g.View(attenSoftmax1, batchSize, attenSoftmax1.Rows / batchSize, attenSoftmax1.Columns);
    var inputs2 = g.View(attenPreProcessResult.inputs, batchSize, attenPreProcessResult.inputs.Rows / batchSize, attenPreProcessResult.inputs.Columns);

    IWeightTensor contexts = g.MulBatch(attenSoftmax, inputs2, batchSize);

    return contexts;
}
public IWeightMatrix Perform(IWeightMatrix state, AttentionPreProcessResult attenPreProcessResult, IComputeGraph g)
{
    var bWas = g.RepeatRows(bWa, state.Rows);
    var wc = g.MulAdd(state, Wa, bWas);
    var wcs = g.RepeatRows(wc, attenPreProcessResult.inputs.Rows / m_batchSize);
    var ggs = g.AddTanh(attenPreProcessResult.uhs, wcs);
    var atten = g.Mul(ggs, V);

    List<IWeightMatrix> attens = g.UnFolderRow(atten, m_batchSize);
    List<IWeightMatrix> attensT = new List<IWeightMatrix>();
    for (int i = 0; i < m_batchSize; i++)
    {
        attensT.Add(g.Transpose2(attens[i]));
    }

    var attenT = g.ConcatRows(attensT);
    var attenSoftmax = g.Softmax(attenT);

    IWeightMatrix contexts = g.MulBatch(attenSoftmax, attenPreProcessResult.inputs, m_batchSize);

    return contexts;
}
/// <summary>
/// Scaled multi-head attention component with skip-connected feed-forward layers
/// </summary>
/// <param name="inputQ">The input Q tensor</param>
/// <param name="inputK">The input K tensor</param>
/// <param name="inputV">The input V tensor</param>
/// <param name="keyMask">The mask for softmax</param>
/// <param name="batchSize">Batch size of input data set</param>
/// <param name="graph">The instance of computing graph</param>
/// <returns>Transformed output tensor</returns>
public (IWeightTensor, IWeightTensor) Perform(IWeightTensor inputQ, IWeightTensor inputK, IWeightTensor inputV, IWeightTensor keyMask, int batchSize, IComputeGraph graph, bool outputAttenWeights = false, Dictionary<string, IWeightTensor> cachedTensors = null)
{
    string keyName = $"{m_name}_MultiHeadAttention";
    using IComputeGraph g = graph.CreateSubGraph(keyName);

    int seqLenQ = inputQ.Rows / batchSize;

    // SeqLenK must be equal to SeqLenV
    int seqLenK = inputK.Rows / batchSize;
    int seqLenV = inputV.Rows / batchSize;

    IWeightTensor inputQNorm = layerNormQ.Norm(inputQ, g);

    // Input projections
    IWeightTensor allQ = g.View(g.Affine(inputQNorm, Q, Qb), dims: new long[] { batchSize, seqLenQ, m_multiHeadNum, m_d });

    // Multi-head attention
    IWeightTensor Qs = g.View(g.AsContiguous(g.Transpose(allQ, 1, 2)), dims: new long[] { batchSize * m_multiHeadNum, seqLenQ, m_d });
    IWeightTensor Ks = null;
    IWeightTensor Vs = null;

    if (cachedTensors == null) // We don't use any cached tensors
    {
        IWeightTensor allK = g.View(g.Affine(inputK, K, Kb), dims: new long[] { batchSize, seqLenK, m_multiHeadNum, m_d });
        IWeightTensor allV = g.View(g.Affine(inputV, V, Vb), dims: new long[] { batchSize, seqLenV, m_multiHeadNum, m_d });

        Ks = g.View(g.AsContiguous(g.Transpose(g.Transpose(allK, 1, 2), 2, 3)), dims: new long[] { batchSize * m_multiHeadNum, m_d, seqLenK });
        Vs = g.View(g.AsContiguous(g.Transpose(allV, 1, 2)), dims: new long[] { batchSize * m_multiHeadNum, seqLenV, m_d });
    }
    else
    {
        string KsCacheName = keyName + "_" + nameof(Ks);
        string VsCacheName = keyName + "_" + nameof(Vs);

        if (cachedTensors.ContainsKey(KsCacheName) == false)
        {
            IWeightTensor allK = g.View(g.Affine(inputK, K, Kb), dims: new long[] { batchSize, seqLenK, m_multiHeadNum, m_d });
            Ks = g.View(g.AsContiguous(g.Transpose(g.Transpose(allK, 1, 2), 2, 3)), dims: new long[] { batchSize * m_multiHeadNum, m_d, seqLenK });

            cachedTensors.Add(KsCacheName, Ks.CopyWeightsRef(KsCacheName, Ks.NeedGradient));
        }
        else
        {
            Ks = cachedTensors[KsCacheName];
        }

        if (cachedTensors.ContainsKey(VsCacheName) == false)
        {
            IWeightTensor allV = g.View(g.Affine(inputV, V, Vb), dims: new long[] { batchSize, seqLenV, m_multiHeadNum, m_d });
            Vs = g.View(g.AsContiguous(g.Transpose(allV, 1, 2)), dims: new long[] { batchSize * m_multiHeadNum, seqLenV, m_d });

            cachedTensors.Add(VsCacheName, Vs.CopyWeightsRef(VsCacheName, Vs.NeedGradient));
        }
        else
        {
            Vs = cachedTensors[VsCacheName];
        }
    }

    // Scaled softmax
    float scale = 1.0f / (float)(Math.Sqrt(m_d));
    var attn = g.MulBatch(Qs, Ks, scale);
    attn = g.View(attn, dims: new long[] { batchSize, m_multiHeadNum, seqLenQ, seqLenK });

    if (keyMask != null)
    {
        attn = g.Add(attn, keyMask, inPlace: true);
    }

    var attnProbs = g.Softmax(attn, inPlace: true);

    IWeightTensor sumAttnWeights = null;
    if (outputAttenWeights)
    {
        sumAttnWeights = g.Select(attnProbs, 1, 0);
        for (int i = 1; i < m_multiHeadNum; i++)
        {
            var tmp = g.Select(attnProbs, 1, i);
            sumAttnWeights = g.Add(sumAttnWeights, tmp);
        }

        sumAttnWeights = graph.Div(sumAttnWeights, (float)m_multiHeadNum);
        sumAttnWeights = graph.View(sumAttnWeights, new long[] { batchSize * seqLenQ, seqLenK });
    }

    attnProbs = g.View(attnProbs, dims: new long[] { batchSize * m_multiHeadNum, seqLenQ, seqLenK });

    IWeightTensor o = g.View(g.MulBatch(attnProbs, Vs), dims: new long[] { batchSize, m_multiHeadNum, seqLenQ, m_d });
    IWeightTensor W = g.View(g.AsContiguous(g.Transpose(o, 1, 2)), dims: new long[] { batchSize * seqLenQ, m_multiHeadNum * m_d });

    // Output projection
    IWeightTensor finalAttResults = g.Dropout(g.Affine(W, W0, b0), batchSize, m_dropoutRatio, inPlace: true);
    IWeightTensor result = graph.Add(finalAttResults, inputQ, inPlace: true);

    return (result, sumAttnWeights);
}
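// Hypothetical usage sketch (not from the original source): it shows how the cachedTensors
// dictionary of the overload above could be shared across decoding steps so that the
// encoder-side K/V projections are computed only once and reused afterwards. The layer type
// name "MultiHeadAttention" and all local identifiers below are assumptions for illustration.
private static IWeightTensor CrossAttentionStep(MultiHeadAttention attnLayer, IWeightTensor decState,
    IWeightTensor encOutput, IWeightTensor encKeyMask, int batchSize, IComputeGraph graph,
    Dictionary<string, IWeightTensor> kvCache)
{
    // kvCache is created once per source sequence (e.g. new Dictionary<string, IWeightTensor>())
    // and passed into every decoding step; the first call fills it, later calls reuse the entries.
    var (output, _) = attnLayer.Perform(
        decState, encOutput, encOutput, encKeyMask, batchSize, graph,
        outputAttenWeights: false, cachedTensors: kvCache);

    return output;
}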
/// <summary>
/// Scaled multi-head attention component with skip-connected feed-forward layers
/// </summary>
/// <param name="inputQ">The input Q tensor</param>
/// <param name="inputK">The input K tensor</param>
/// <param name="inputV">The input V tensor</param>
/// <param name="keyMask">The mask for softmax</param>
/// <param name="batchSize">Batch size of input data set</param>
/// <param name="graph">The instance of computing graph</param>
/// <returns>Transformed output tensor</returns>
public IWeightTensor Perform(IWeightTensor inputQ, IWeightTensor inputK, IWeightTensor inputV, IWeightTensor keyMask, int batchSize, IComputeGraph graph)
{
    using (IComputeGraph g = graph.CreateSubGraph($"{m_name}_MultiHeadAttention"))
    {
        int seqLenQ = inputQ.Rows / batchSize;

        // SeqLenK must be equal to SeqLenV
        int seqLenK = inputK.Rows / batchSize;
        int seqLenV = inputV.Rows / batchSize;

        IWeightTensor inputQNorm = layerNormQ.Norm(inputQ, g);
        if (inputK == inputQ)
        {
            inputK = inputQNorm;
        }
        if (inputV == inputQ)
        {
            inputV = inputQNorm;
        }

        // Input projections
        float scale = 1.0f;
        IWeightTensor allQ = g.View(g.Affine(inputQNorm, Q, Qb, scale), dims: new long[] { batchSize, seqLenQ, m_multiHeadNum, m_d });
        IWeightTensor allK = g.View(g.Affine(inputK, K, Kb, scale), dims: new long[] { batchSize, seqLenK, m_multiHeadNum, m_d });
        IWeightTensor allV = g.View(g.Affine(inputV, V, Vb, scale), dims: new long[] { batchSize, seqLenV, m_multiHeadNum, m_d });

        // Multi-head attention
        IWeightTensor Qs = g.View(g.AsContiguous(g.Transpose(allQ, 1, 2)), dims: new long[] { batchSize * m_multiHeadNum, seqLenQ, m_d });
        IWeightTensor Ks = g.View(g.AsContiguous(g.Transpose(g.Transpose(allK, 1, 2), 2, 3)), dims: new long[] { batchSize * m_multiHeadNum, m_d, seqLenK });
        IWeightTensor Vs = g.View(g.AsContiguous(g.Transpose(allV, 1, 2)), dims: new long[] { batchSize * m_multiHeadNum, seqLenV, m_d });

        // Scaled softmax
        scale = 1.0f / (float)(Math.Sqrt(m_d));
        IWeightTensor attn = g.MulBatch(Qs, Ks, batchSize * m_multiHeadNum, scale);

        if (keyMask != null)
        {
            using (var keyMaskView = g.View(keyMask, runGradient: false, dims: new long[] { batchSize, 1, seqLenQ, seqLenK }))
            {
                using (var keyMaskViewExp = g.Expand(keyMaskView, runGradient: false, dims: new long[] { batchSize, m_multiHeadNum, seqLenQ, seqLenK }))
                {
                    using (var keyMaskViewExpConti = g.AsContiguous(keyMaskViewExp, runGradient: false))
                    {
                        using (var keyMaskViewExpContiView = g.View(keyMaskViewExpConti, runGradient: false, dims: new long[] { batchSize * m_multiHeadNum, seqLenQ, seqLenK }))
                        {
                            attn = g.Add(attn, keyMaskViewExpContiView, runGradient1: true, runGradient2: false);
                        }
                    }
                }
            }
        }

        IWeightTensor softmax = g.Softmax(attn, inPlace: true);

        IWeightTensor o = g.View(g.MulBatch(softmax, Vs, batchSize * m_multiHeadNum), dims: new long[] { batchSize, m_multiHeadNum, seqLenQ, m_d });
        IWeightTensor W = g.View(g.AsContiguous(g.Transpose(o, 1, 2)), dims: new long[] { batchSize * seqLenQ, m_multiHeadNum * m_d });

        // Output projection
        IWeightTensor finalAttResults = g.Dropout(g.Affine(W, W0, b0), batchSize, m_dropoutRatio, inPlace: true);

        return graph.Add(finalAttResults, inputQ);
    }
}
public IWeightTensor Perform(IWeightTensor state, AttentionPreProcessResult attnPre, int batchSize, IComputeGraph graph)
{
    var srcSeqLen = attnPre.encOutput.Rows / batchSize;

    using (var g = graph.CreateSubGraph(this.m_name))
    {
        // Affine decoder state
        var wc = g.Affine(state, this.m_Wa, this.m_bWa);

        // Expand dims from [batchSize x decoder_dim] to [batchSize x srcSeqLen x decoder_dim]
        var wc1 = g.View(wc, dims: new long[] { batchSize, 1, wc.Columns });
        var wcExp = g.Expand(wc1, dims: new long[] { batchSize, srcSeqLen, wc.Columns });

        IWeightTensor ggs = null;
        if (this.m_enableCoverageModel)
        {
            // Get coverage model status at {t-1}
            var wCoverage = g.Affine(this.m_coverage.Hidden, this.m_Wc, this.m_bWc);
            var wCoverage1 = g.View(wCoverage, dims: new long[] { batchSize, srcSeqLen, -1 });

            ggs = g.AddTanh(attnPre.Uhs, wcExp, wCoverage1);
        }
        else
        {
            ggs = g.AddTanh(attnPre.Uhs, wcExp);
        }

        var ggss = g.View(ggs, dims: new long[] { batchSize * srcSeqLen, -1 });
        var atten = g.Mul(ggss, this.m_V);

        var attenT = g.Transpose(atten);
        var attenT2 = g.View(attenT, dims: new long[] { batchSize, srcSeqLen });

        var attenSoftmax1 = g.Softmax(attenT2, inPlace: true);

        var attenSoftmax = g.View(attenSoftmax1, dims: new long[] { batchSize, 1, srcSeqLen });
        var inputs2 = g.View(attnPre.encOutput, dims: new long[] { batchSize, srcSeqLen, attnPre.encOutput.Columns });

        var contexts = graph.MulBatch(attenSoftmax, inputs2, batchSize);
        contexts = graph.View(contexts, dims: new long[] { batchSize, attnPre.encOutput.Columns });

        if (this.m_enableCoverageModel)
        {
            // Concatenate tensor as input for coverage model
            var aCoverage = g.View(attenSoftmax1, dims: new long[] { attnPre.encOutput.Rows, 1 });
            var state2 = g.View(state, dims: new long[] { batchSize, 1, state.Columns });
            var state3 = g.Expand(state2, dims: new long[] { batchSize, srcSeqLen, state.Columns });
            var state4 = g.View(state3, dims: new long[] { batchSize * srcSeqLen, -1 });

            var concate = g.ConcatColumns(aCoverage, attnPre.encOutput, state4);
            this.m_coverage.Step(concate, graph);
        }

        return contexts;
    }
}