public IWeightTensor Perform(IWeightTensor input, int batchSize, IComputeGraph graph)
{
    using IComputeGraph g = graph.CreateSubGraph($"{m_name}_PositionwiseFeedForward");

    // Pre-norm: normalize the input before the feed forward layers
    var inputNorm = layerNorm2.Norm(input, g);

    //Feed forward
    IWeightTensor ffnResult = feedForwardLayer1.Process(inputNorm, batchSize, g);
    IWeightTensor reluFFNResult = g.Relu(ffnResult, inPlace: true);
    IWeightTensor ffn2Result = feedForwardLayer2.Process(reluFFNResult, batchSize, g);

    //Skip connection (the unnormalized input is added back as the residual)
    IWeightTensor addFFNResult = graph.Add(ffn2Result, input, inPlace: true);

    return addFFNResult;
}
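// A compact reading of Perform above (our summary, not from the source comments):
// with x = input, the layer computes
//
//     FFN(x) = x + W2 * ReLU(W1 * LayerNorm(x) + b1) + b2
//
// i.e. pre-norm style: normalization is applied before the two feed forward
// layers, and the unnormalized input is added back as the residual.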
public IWeightTensor Decode(IWeightTensor input, AttentionPreProcessResult attenPreProcessResult, int batchSize, IComputeGraph g)
{
    IWeightTensor V = input;

    // Compute the attention context once per step, from the top cell's previous state
    IWeightTensor lastStatus = m_decoders.LastOrDefault().Cell;
    IWeightTensor context = m_attentionLayer.Perform(lastStatus, attenPreProcessResult, batchSize, g);

    // Run the stacked LSTM cells; each cell consumes the shared context and the output of the cell below
    foreach (LSTMAttentionDecoderCell decoder in m_decoders)
    {
        IWeightTensor e = decoder.Step(context, V, g);
        V = e;
    }

    IWeightTensor eOutput = g.Dropout(V, batchSize, m_dropoutRatio, inPlace: false);
    eOutput = m_decoderFFLayer.Process(eOutput, batchSize, g);

    return eOutput;
}
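// A compact reading of Decode above (our notation, not from the source): at
// decoding step t with L stacked cells,
//
//     c_t       = Attention(state^{(L)}_{t-1}, encoderOutputs)   // one context per step
//     h^{(l)}_t = LSTMCell_l(c_t, h^{(l-1)}_t),   with h^{(0)}_t = input
//
// The attention context c_t is computed once from the top cell's previous state
// and shared by every cell, while each cell consumes the output of the cell below.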
/// <summary>
/// Scaled multi-head attention component with skip-connected feed forward layers
/// </summary>
/// <param name="input">The input tensor</param>
/// <param name="graph">The instance of computing graph</param>
/// <returns></returns>
public IWeightTensor Perform(IWeightTensor input, IComputeGraph graph)
{
    IComputeGraph g = graph.CreateSubGraph(m_name);
    var seqLen = input.Rows / m_batchSize;

    //Input projections
    var allQ = g.View(Q.Process(input, g), m_batchSize, seqLen, m_multiHeadNum, m_d);
    var allK = g.View(K.Process(input, g), m_batchSize, seqLen, m_multiHeadNum, m_d);
    var allV = g.View(V.Process(input, g), m_batchSize, seqLen, m_multiHeadNum, m_d);

    //Multi-head attention: fold the head dimension into the batch dimension
    var Qs = g.View(g.Permute(allQ, 2, 0, 1, 3), m_multiHeadNum * m_batchSize, seqLen, m_d);
    var Ks = g.View(g.Permute(allK, 2, 0, 3, 1), m_multiHeadNum * m_batchSize, m_d, seqLen);
    var Vs = g.View(g.Permute(allV, 2, 0, 1, 3), m_multiHeadNum * m_batchSize, seqLen, m_d);

    //Scaled softmax
    float scale = 1.0f / (float)Math.Sqrt(m_d);
    var attn = g.MulBatch(Qs, Ks, m_multiHeadNum * m_batchSize, scale);
    var attn2 = g.View(attn, m_multiHeadNum * m_batchSize * seqLen, seqLen);

    var softmax = g.Softmax(attn2);
    var softmax2 = g.View(softmax, m_multiHeadNum * m_batchSize, seqLen, seqLen);
    var o = g.View(g.MulBatch(softmax2, Vs, m_multiHeadNum * m_batchSize), m_multiHeadNum, m_batchSize, seqLen, m_d);
    var W = g.View(g.Permute(o, 1, 2, 0, 3), m_batchSize * seqLen, m_multiHeadNum * m_d);

    //Output projection
    var finalAttResults = g.Affine(W, W0, b0);

    //Skip connection and layer normalization
    var addedAttResult = g.Add(finalAttResults, input);
    var normAddedAttResult = layerNorm1.Process(addedAttResult, g);

    //Feed forward
    var ffnResult = feedForwardLayer1.Process(normAddedAttResult, g);
    var reluFFNResult = g.Relu(ffnResult);
    var ffn2Result = feedForwardLayer2.Process(reluFFNResult, g);

    //Skip connection and layer normalization
    var addFFNResult = g.Add(ffn2Result, normAddedAttResult);
    var normAddFFNResult = layerNorm2.Process(addFFNResult, g);

    return normAddFFNResult;
}
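// The block above is standard scaled dot-product attention per head (our summary):
//
//     Attention(Q, K, V) = softmax(Q K^T / sqrt(d)) V
//
// All m_multiHeadNum heads are evaluated in a single batched matrix multiply by
// folding the head dimension into the batch dimension passed to MulBatch. Note
// this variant applies layer normalization after each residual sum (post-norm),
// unlike the pre-norm variant below.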
/// <summary>
/// Scaled multi-head attention component with skip-connected feed forward layers
/// </summary>
/// <param name="input">The input tensor</param>
/// <param name="batchSize">The size of the current batch</param>
/// <param name="graph">The instance of computing graph</param>
/// <returns></returns>
public IWeightTensor Perform(IWeightTensor input, int batchSize, IComputeGraph graph)
{
    using (IComputeGraph g = graph.CreateSubGraph(m_name))
    {
        int seqLen = input.Rows / batchSize;

        // Pre-norm: normalize the input before the projections
        IWeightTensor nInput = layerNorm1.Norm(input, g);

        //Input projections
        IWeightTensor allQ = g.View(g.Affine(nInput, Q, Qb), batchSize, seqLen, m_multiHeadNum, m_d);
        IWeightTensor allK = g.View(g.Affine(nInput, K, Kb), batchSize, seqLen, m_multiHeadNum, m_d);
        IWeightTensor allV = g.View(g.Affine(nInput, V, Vb), batchSize, seqLen, m_multiHeadNum, m_d);

        //Multi-head attention: fold the head dimension into the batch dimension
        IWeightTensor Qs = g.View(g.Permute(allQ, 2, 0, 1, 3), m_multiHeadNum * batchSize, seqLen, m_d);
        IWeightTensor Ks = g.View(g.Permute(allK, 2, 0, 3, 1), m_multiHeadNum * batchSize, m_d, seqLen);
        IWeightTensor Vs = g.View(g.Permute(allV, 2, 0, 1, 3), m_multiHeadNum * batchSize, seqLen, m_d);

        //Scaled softmax
        float scale = 1.0f / (float)Math.Sqrt(m_d);
        IWeightTensor attn = g.MulBatch(Qs, Ks, m_multiHeadNum * batchSize, scale);
        IWeightTensor attn2 = g.View(attn, m_multiHeadNum * batchSize * seqLen, seqLen);

        IWeightTensor softmax = g.Softmax(attn2, inPlace: true);
        IWeightTensor softmax2 = g.View(softmax, m_multiHeadNum * batchSize, seqLen, seqLen);
        IWeightTensor o = g.View(g.MulBatch(softmax2, Vs, m_multiHeadNum * batchSize), m_multiHeadNum, batchSize, seqLen, m_d);
        IWeightTensor W = g.View(g.Permute(o, 1, 2, 0, 3), batchSize * seqLen, m_multiHeadNum * m_d);

        //Output projection
        IWeightTensor finalAttResults = g.Dropout(g.Affine(W, W0, b0), batchSize, m_dropoutRatio, inPlace: true);

        //Skip connection and layer normalization
        IWeightTensor normAddedAttResult = layerNorm2.AddNorm(finalAttResults, input, g);

        //Feed forward
        IWeightTensor ffnResult = feedForwardLayer1.Process(normAddedAttResult, batchSize, g);
        IWeightTensor reluFFNResult = g.Relu(ffnResult);
        IWeightTensor ffn2Result = feedForwardLayer2.Process(reluFFNResult, batchSize, g);

        //Skip connection
        IWeightTensor addFFNResult = graph.Add(ffn2Result, normAddedAttResult);

        return addFFNResult;
    }
}
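// A minimal, self-contained sketch (plain C#; the example sizes are our
// assumptions, not values from the source) of the shape bookkeeping performed by
// the View/Permute calls in the attention variants above.
using System;

static class AttentionShapeDemo
{
    static void Main()
    {
        int batchSize = 2, seqLen = 5, heads = 8, d = 64;

        // Input to Perform: (batchSize * seqLen) rows x (heads * d) columns
        Console.WriteLine($"input:   {batchSize * seqLen} x {heads * d}");

        // After View + Permute(2,0,1,3): (heads * batchSize, seqLen, d),
        // so all heads share one batched matrix multiply
        Console.WriteLine($"Qs/Vs:   ({heads * batchSize}, {seqLen}, {d})");
        Console.WriteLine($"Ks:      ({heads * batchSize}, {d}, {seqLen})"); // K is transposed for Q*K^T

        // Attention scores, flattened to rows for the softmax call
        Console.WriteLine($"attn:    ({heads * batchSize}, {seqLen}, {seqLen})");
        Console.WriteLine($"softmax: {heads * batchSize * seqLen} x {seqLen}");

        // Permute(1,2,0,3) merges the heads back: (batchSize * seqLen) x (heads * d)
        Console.WriteLine($"output:  {batchSize * seqLen} x {heads * d}");
    }
}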
/// <summary>
/// Transformer decoder
/// </summary>
/// <param name="tgtInputs">The target side input tensor</param>
/// <param name="encOutputBatchFirst">The encoder output (batch first)</param>
/// <param name="tgtSelfMask">The target self-attention mask</param>
/// <param name="srcTgtMask">The source-target attention mask</param>
/// <param name="batchSize">The size of the current batch</param>
/// <param name="g">The instance of computing graph</param>
/// <returns></returns>
public IWeightTensor Decode(IWeightTensor tgtInputs, IWeightTensor encOutputBatchFirst, IWeightTensor tgtSelfMask, IWeightTensor srcTgtMask, int batchSize, IComputeGraph g)
{
    using (IComputeGraph subg = g.CreateSubGraph($"{m_name}_Decoder"))
    {
        // Each layer: masked self-attention, encoder-decoder attention, position-wise feed forward
        for (int k = 0; k < m_selfAttns.Count; k++)
        {
            tgtInputs = m_selfAttns[k].Perform(tgtInputs, tgtInputs, tgtInputs, tgtSelfMask, batchSize, subg);
            tgtInputs = m_encAttns[k].Perform(tgtInputs, encOutputBatchFirst, encOutputBatchFirst, srcTgtMask, batchSize, subg);
            tgtInputs = m_posFFNs[k].Perform(tgtInputs, batchSize, subg);
        }

        tgtInputs = layerNorm.Norm(tgtInputs, subg);
        tgtInputs.UnbindFromComputeGraph();
    }

    tgtInputs = m_decoderFFLayer.Process(tgtInputs, batchSize, g);

    return tgtInputs;
}
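// Note (our reading, not from the source comments): each sub-layer above is
// pre-norm, i.e. it normalizes its own input and returns a raw residual sum, so a
// final layerNorm.Norm is applied once after the last layer before the output
// projection. UnbindFromComputeGraph presumably detaches the result from the
// sub-graph so it can still be used after subg is disposed.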
/// <summary>
/// Decode output sentences in training or test mode
/// </summary>
/// <param name="outputSentences">In training mode, they are golden target sentences, otherwise, they are target sentences generated by the decoder</param>
/// <param name="g">The instance of computing graph</param>
/// <param name="encodedOutputs">The output of the encoder</param>
/// <param name="decoder">The attention decoder</param>
/// <param name="decoderFFLayer">The output projection layer</param>
/// <param name="embedding">The target side embedding matrix</param>
/// <param name="batchSize">The size of the current batch</param>
/// <param name="isTraining">Whether we are in training mode</param>
/// <returns></returns>
private float Decode(List<List<string>> outputSentences, IComputeGraph g, IWeightTensor encodedOutputs, AttentionDecoder decoder, FeedForwardLayer decoderFFLayer, IWeightTensor embedding, int batchSize, bool isTraining = true)
{
    float cost = 0.0f;
    int[] ix_inputs = new int[batchSize];
    for (int i = 0; i < ix_inputs.Length; i++)
    {
        ix_inputs[i] = (int)SENTTAGS.START;
    }

    // Initialize variables according to the current mode
    var originalOutputLengths = isTraining ? ParallelCorpus.PadSentences(outputSentences) : null;
    int seqLen = isTraining ? outputSentences[0].Count : 64; // In test mode, 64 is the maximum decoding length
    var dropoutRatio = isTraining ? m_dropoutRatio : 0.0f;
    HashSet<int> setEndSentId = isTraining ? null : new HashSet<int>();

    if (!isTraining)
    {
        if (outputSentences.Count != 0)
        {
            throw new ArgumentException($"The list for output sentences must be empty if current is not in training mode.");
        }
        for (int i = 0; i < batchSize; i++)
        {
            outputSentences.Add(new List<string>());
        }
    }

    // Pre-process for attention model
    var attPreProcessResult = decoder.PreProcess(encodedOutputs, batchSize, g);

    for (int i = 0; i < seqLen; i++)
    {
        //Get embeddings for all sentences in the batch at position i
        List<IWeightTensor> inputs = new List<IWeightTensor>();
        for (int j = 0; j < batchSize; j++)
        {
            inputs.Add(g.PeekRow(embedding, ix_inputs[j]));
        }
        var inputsM = g.ConcatRows(inputs);

        //Decode output sentence at position i
        var eOutput = decoder.Decode(inputsM, attPreProcessResult, batchSize, g);
        eOutput = g.Dropout(eOutput, batchSize, dropoutRatio, true);
        eOutput = decoderFFLayer.Process(eOutput, batchSize, g);

        //Softmax for output
        using (var probs = g.Softmax(eOutput, runGradients: false, inPlace: true))
        {
            if (isTraining)
            {
                //Calculate loss for each word in the batch
                for (int k = 0; k < batchSize; k++)
                {
                    using (var probs_k = g.PeekRow(probs, k, runGradients: false))
                    {
                        var ix_targets_k = m_modelMetaData.Vocab.GetTargetWordIndex(outputSentences[k][i]);
                        var score_k = probs_k.GetWeightAt(ix_targets_k);
                        if (i < originalOutputLengths[k])
                        {
                            cost += (float)-Math.Log(score_k);
                        }

                        // Cross-entropy gradient w.r.t. the logits is (softmax - one-hot),
                        // so subtract 1.0 at the golden target index only
                        probs_k.SetWeightAt(score_k - 1, ix_targets_k);
                        ix_inputs[k] = ix_targets_k;
                    }
                }
                eOutput.CopyWeightsToGradients(probs);
            }
            else
            {
                // Output the "i"th target word
                var targetIdx = g.Argmax(probs, 1);
                var targetWords = m_modelMetaData.Vocab.ConvertTargetIdsToString(targetIdx.ToList());
                for (int j = 0; j < targetWords.Count; j++)
                {
                    if (setEndSentId.Contains(j) == false)
                    {
                        outputSentences[j].Add(targetWords[j]);
                        if (targetWords[j] == ParallelCorpus.EOS)
                        {
                            setEndSentId.Add(j);
                        }
                    }
                }
                ix_inputs = targetIdx;
            }
        }

        if (isTraining)
        {
            //Hacky: Run backward for the last feed forward layer and the dropout layer in order to save memory usage, since they have no time sequence dependency
            g.RunTopBackward();
            if (m_dropoutRatio > 0.0f)
            {
                g.RunTopBackward();
            }
        }
        else
        {
            if (setEndSentId.Count == batchSize)
            {
                // All target sentences in the current batch are finished, so we exit
                break;
            }
        }
    }

    return cost;
}
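// A minimal, self-contained sketch (plain C#, no Seq2SeqSharp types) of the
// cross-entropy shortcut used in the training branch above: for a one-hot target
// t, the gradient of -log(softmax(z)[t]) with respect to the logits z is
// softmax(z) - onehot(t), which is why SetWeightAt subtracts 1.0 at the golden
// target index only before the probabilities are copied into the gradients.
using System;
using System.Linq;

static class SoftmaxLossDemo
{
    static void Main()
    {
        float[] logits = { 1.0f, 2.0f, 0.5f };
        int target = 1;

        // Numerically stable softmax
        float max = logits.Max();
        float[] exp = logits.Select(v => (float)Math.Exp(v - max)).ToArray();
        float sum = exp.Sum();
        float[] probs = exp.Select(v => v / sum).ToArray();

        float loss = (float)-Math.Log(probs[target]);

        // Gradient w.r.t. logits: probs - onehot(target)
        float[] grad = (float[])probs.Clone();
        grad[target] -= 1.0f; // same as SetWeightAt(score_k - 1, ix_targets_k) above

        Console.WriteLine($"loss = {loss}, grad = [{string.Join(", ", grad)}]");
    }
}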
/// <summary>
/// Decode output sentences in training
/// </summary>
/// <param name="outputSentences">The golden target sentences</param>
/// <param name="g">The instance of computing graph</param>
/// <param name="encodedOutputs">The output of the encoder</param>
/// <param name="decoder">The attention decoder</param>
/// <param name="decoderFFLayer">The output projection layer</param>
/// <param name="Embedding">The target side embedding matrix</param>
/// <param name="predictSentence"></param>
/// <returns></returns>
private float Decode(List<List<string>> outputSentences, IComputeGraph g, IWeightMatrix encodedOutputs, AttentionDecoder decoder, FeedForwardLayer decoderFFLayer, IWeightMatrix Embedding, out List<List<string>> predictSentence)
{
    predictSentence = null;

    float cost = 0.0f;
    var attPreProcessResult = decoder.PreProcess(encodedOutputs, g);

    var originalOutputLengths = PadSentences(outputSentences);
    int seqLen = outputSentences[0].Count;

    int[] ix_inputs = new int[m_batchSize];
    int[] ix_targets = new int[m_batchSize];
    for (int i = 0; i < ix_inputs.Length; i++)
    {
        ix_inputs[i] = (int)SENTTAGS.START;
    }

    // seqLen + 1 steps: the extra step predicts the END tag after the last word
    for (int i = 0; i < seqLen + 1; i++)
    {
        //Get embeddings for all sentences in the batch at position i
        List<IWeightMatrix> inputs = new List<IWeightMatrix>();
        for (int j = 0; j < m_batchSize; j++)
        {
            List<string> OutputSentence = outputSentences[j];
            ix_targets[j] = (int)SENTTAGS.UNK;
            if (i >= seqLen)
            {
                ix_targets[j] = (int)SENTTAGS.END;
            }
            else
            {
                if (m_tgtWordToIndex.ContainsKey(OutputSentence[i]))
                {
                    ix_targets[j] = m_tgtWordToIndex[OutputSentence[i]];
                }
            }

            var x = g.PeekRow(Embedding, ix_inputs[j]);
            inputs.Add(x);
        }
        var inputsM = g.ConcatRows(inputs);

        //Decode output sentence at position i
        var eOutput = decoder.Decode(inputsM, attPreProcessResult, g);
        if (m_dropoutRatio > 0.0f)
        {
            eOutput = g.Dropout(eOutput, m_dropoutRatio);
        }

        var o = decoderFFLayer.Process(eOutput, g);

        //Softmax for output
        // var o = g.MulAdd(eOutput, Whd, bds);
        var probs = g.Softmax(o, false);

        o.ReleaseWeight();

        //Calculate loss for each word in the batch
        List<IWeightMatrix> probs_g = g.UnFolderRow(probs, m_batchSize, false);
        for (int k = 0; k < m_batchSize; k++)
        {
            var probs_k = probs_g[k];
            var score_k = probs_k.GetWeightAt(ix_targets[k]);
            if (i < originalOutputLengths[k] + 1)
            {
                cost += (float)-Math.Log(score_k);
            }

            // Cross-entropy gradient: subtract 1.0 at the golden target index only
            probs_k.SetWeightAt(score_k - 1, ix_targets[k]);
            ix_inputs[k] = ix_targets[k];
            probs_k.Dispose();
        }

        o.SetGradientByWeight(probs);

        //Hacky: Run backward for the last feed forward layer and the dropout layer in order to save memory usage, since they have no time sequence dependency
        g.RunTopBackward();
        g.RunTopBackward();
        if (m_dropoutRatio > 0.0f)
        {
            g.RunTopBackward();
        }
    }

    return cost;
}