public void VisualizeNeuralNetwork(string visNNFilePath)
{
    (IEncoder encoder, IDecoder decoder, IWeightTensor srcEmbedding, IWeightTensor tgtEmbedding) = GetNetworksOnDeviceAt(-1);

    // Build input sentence
    List<List<string>> inputSeqs = ParallelCorpus.ConstructInputTokens(null);
    int batchSize = inputSeqs.Count;
    IComputeGraph g = CreateComputGraph(m_defaultDeviceId, needBack: false, visNetwork: true);
    AttentionDecoder rnnDecoder = decoder as AttentionDecoder;

    encoder.Reset(g.GetWeightFactory(), batchSize);
    rnnDecoder.Reset(g.GetWeightFactory(), batchSize);

    // Run encoder
    IWeightTensor encodedWeightMatrix = Encode(g, inputSeqs, encoder, srcEmbedding, null, null);

    // Prepare for attention over encoder-decoder
    AttentionPreProcessResult attPreProcessResult = rnnDecoder.PreProcess(encodedWeightMatrix, batchSize, g);

    // Run decoder
    IWeightTensor x = g.PeekRow(tgtEmbedding, (int)SENTTAGS.START);
    IWeightTensor eOutput = rnnDecoder.Decode(x, attPreProcessResult, batchSize, g);
    IWeightTensor probs = g.Softmax(eOutput);

    g.VisualizeNeuralNetToFile(visNNFilePath);
}
public WeightMatrix Perform(WeightMatrix input, WeightMatrix state, IComputeGraph g)
{
    WeightMatrix context;
    List<WeightMatrix> atten = new List<WeightMatrix>();

    var stateRepeat = g.RepeatRows(state, input.Rows);
    var biasInput = new WeightMatrix(input.Rows, 1, 1);
    var inputb = g.concatColumns(input, biasInput);

    var uh = g.mul(inputb, Ua);

    biasInput = new WeightMatrix(stateRepeat.Rows, 1, 1);
    stateRepeat = g.concatColumns(stateRepeat, biasInput);

    var wc = g.mul(stateRepeat, Wa);
    var gg = g.addtanh(uh, wc);
    var aa = g.mul(gg, V);

    var res = g.Softmax(aa);
    var weighted = g.weightRows(input, res);
    context = g.sumColumns(weighted);

    return context;
}
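// For reference, a minimal self-contained sketch (not part of the library) of the additive
// attention score the Perform methods around here compute: e_j = V^T * tanh(Ua*h_j + Wa*s),
// normalized by a softmax over source positions. All names, shapes, and the plain-array
// representation are illustrative assumptions.
using System;

public static class AdditiveAttentionSketch
{
    // inputs: [srcLen][dim] encoder states; state: [dim] decoder state;
    // Ua, Wa: [dim][dim] projections; v: [dim] scoring vector.
    public static double[] AttentionWeights(double[][] inputs, double[] state, double[][] Ua, double[][] Wa, double[] v)
    {
        double[] ws = MatVec(Wa, state); // Wa*s is shared across all source positions
        double[] scores = new double[inputs.Length];
        for (int j = 0; j < inputs.Length; j++)
        {
            double[] uh = MatVec(Ua, inputs[j]); // Ua*h_j
            double e = 0.0;
            for (int k = 0; k < v.Length; k++)
            {
                e += v[k] * Math.Tanh(uh[k] + ws[k]);
            }
            scores[j] = e;
        }
        return Softmax(scores);
    }

    private static double[] MatVec(double[][] m, double[] x)
    {
        double[] y = new double[m.Length];
        for (int i = 0; i < m.Length; i++)
        {
            for (int k = 0; k < x.Length; k++) { y[i] += m[i][k] * x[k]; }
        }
        return y;
    }

    private static double[] Softmax(double[] x)
    {
        double max = double.NegativeInfinity;
        foreach (double t in x) { if (t > max) max = t; }

        double sum = 0.0;
        double[] y = new double[x.Length];
        for (int i = 0; i < x.Length; i++) { y[i] = Math.Exp(x[i] - max); sum += y[i]; }
        for (int i = 0; i < y.Length; i++) { y[i] /= sum; }
        return y;
    }
}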
public IWeightTensor Perform(IWeightTensor state, AttentionPreProcessResult attenPreProcessResult, int batchSize, IComputeGraph graph)
{
    int srcSeqLen = attenPreProcessResult.inputsBatchFirst.Rows / batchSize;

    using (IComputeGraph g = graph.CreateSubGraph(m_name))
    {
        // Affine decoder state
        IWeightTensor wc = g.Affine(state, m_Wa, m_bWa);

        // Expand dims from [batchSize x decoder_dim] to [batchSize x srcSeqLen x decoder_dim]
        IWeightTensor wc1 = g.View(wc, batchSize, 1, wc.Columns);
        IWeightTensor wcExp = g.Expand(wc1, batchSize, srcSeqLen, wc.Columns);

        IWeightTensor ggs = null;
        if (m_enableCoverageModel)
        {
            // Get coverage model status at {t-1}
            IWeightTensor wCoverage = g.Affine(m_coverage.Hidden, m_Wc, m_bWc);
            IWeightTensor wCoverage1 = g.View(wCoverage, batchSize, srcSeqLen, -1);

            ggs = g.AddTanh(attenPreProcessResult.uhs, wcExp, wCoverage1);
        }
        else
        {
            ggs = g.AddTanh(attenPreProcessResult.uhs, wcExp);
        }

        IWeightTensor ggss = g.View(ggs, batchSize * srcSeqLen, -1);
        IWeightTensor atten = g.Mul(ggss, m_V);

        IWeightTensor attenT = g.Transpose(atten);
        IWeightTensor attenT2 = g.View(attenT, batchSize, srcSeqLen);

        IWeightTensor attenSoftmax1 = g.Softmax(attenT2, inPlace: true);

        IWeightTensor attenSoftmax = g.View(attenSoftmax1, batchSize, 1, srcSeqLen);
        IWeightTensor inputs2 = g.View(attenPreProcessResult.inputsBatchFirst, batchSize, srcSeqLen, attenPreProcessResult.inputsBatchFirst.Columns);

        IWeightTensor contexts = graph.MulBatch(attenSoftmax, inputs2, batchSize);

        if (m_enableCoverageModel)
        {
            // Concatenate tensors as input for coverage model
            IWeightTensor aCoverage = g.View(attenSoftmax1, attenPreProcessResult.inputsBatchFirst.Rows, 1);
            IWeightTensor state2 = g.View(state, batchSize, 1, state.Columns);
            IWeightTensor state3 = g.Expand(state2, batchSize, srcSeqLen, state.Columns);
            IWeightTensor state4 = g.View(state3, batchSize * srcSeqLen, -1);
            IWeightTensor concate = g.ConcatColumns(aCoverage, attenPreProcessResult.inputsBatchFirst, state4);

            m_coverage.Step(concate, graph);
        }

        return contexts;
    }
}
/// <summary>
/// Run the forward part on a given single device
/// </summary>
/// <param name="g">The computing graph for the current device. It gets created and passed by the framework</param>
/// <param name="sntPairBatch">A batch of tokenized sentence pairs. In training mode, the target side holds golden tokens; otherwise, it receives the target tokens generated by the decoder</param>
/// <param name="deviceIdIdx">The index of the current device</param>
/// <returns>The network results, including the cost of the forward part</returns>
public override List<NetworkResult> RunForwardOnSingleDevice(IComputeGraph g, ISntPairBatch sntPairBatch, int deviceIdIdx, bool isTraining, DecodingOptions decodingOptions)
{
    List<NetworkResult> nrs = new List<NetworkResult>();

    var srcSnts = sntPairBatch.GetSrcTokens(0);
    var tgtSnts = sntPairBatch.GetTgtTokens(0);

    (IEncoder encoder, IWeightTensor srcEmbedding, IWeightTensor posEmbedding, FeedForwardLayer decoderFFLayer) = GetNetworksOnDeviceAt(deviceIdIdx);

    // Reset networks
    encoder.Reset(g.GetWeightFactory(), srcSnts.Count);

    var originalSrcLengths = BuildInTokens.PadSentences(srcSnts);
    var srcTokensList = m_modelMetaData.SrcVocab.GetWordIndex(srcSnts);

    BuildInTokens.PadSentences(tgtSnts);
    var tgtTokensLists = m_modelMetaData.ClsVocab.GetWordIndex(tgtSnts);

    int seqLen = srcSnts[0].Count;
    int batchSize = srcSnts.Count;

    // Encode the input source sentences
    IWeightTensor encOutput = Encoder.Run(g, sntPairBatch, encoder, m_modelMetaData, m_shuffleType, srcEmbedding, posEmbedding, null, srcTokensList, originalSrcLengths);
    IWeightTensor ffLayer = decoderFFLayer.Process(encOutput, batchSize, g);

    float cost = 0.0f;
    IWeightTensor probs = g.Softmax(ffLayer, inPlace: true);

    if (isTraining)
    {
        var tgtTokensTensor = g.CreateTokensTensor(tgtTokensLists);
        cost = g.CrossEntropyLoss(probs, tgtTokensTensor);
    }
    else
    {
        // Output the target words for each sentence in the batch
        using var targetIdxTensor = g.Argmax(probs, 1);
        float[] targetIdx = targetIdxTensor.ToWeightArray();
        List<string> targetWords = m_modelMetaData.ClsVocab.ConvertIdsToString(targetIdx.ToList());

        for (int k = 0; k < batchSize; k++)
        {
            tgtSnts[k] = targetWords.GetRange(k * seqLen, seqLen);
        }
    }

    NetworkResult nr = new NetworkResult
    {
        Cost = cost,
        Output = new List<List<List<string>>>()
    };
    nr.Output.Add(tgtSnts);

    nrs.Add(nr);
    return nrs;
}
/// <summary>
/// Scaled multi-head attention component with skip-connected feed forward layers
/// </summary>
/// <param name="inputQ">The input Q tensor</param>
/// <param name="keyMask">The mask for softmax</param>
/// <param name="batchSize">Batch size of the input data set</param>
/// <param name="graph">The instance of the computing graph</param>
/// <returns>Transformed output tensor</returns>
public (IWeightTensor, IWeightTensor) Perform(IWeightTensor inputQ, IWeightTensor keyMask, int batchSize, IComputeGraph graph, bool outputAttenWeights = false)
{
    using IComputeGraph g = graph.CreateSubGraph($"{m_name}_MultiHeadAttention");
    int seqLenQ = inputQ.Rows / batchSize;
    IWeightTensor inputQNorm = layerNormQ.Norm(inputQ, g);

    // Input projections
    var weightedQKV = g.View(g.Affine(inputQNorm, QKV, QKVb), dims: new long[] { batchSize, seqLenQ, 3, m_multiHeadNum, m_d });
    var allQ = g.Select(weightedQKV, 2, 0);
    var allK = g.Select(weightedQKV, 2, 1);
    var allV = g.Select(weightedQKV, 2, 2);

    // Multi-head attention
    IWeightTensor Qs = g.View(g.AsContiguous(g.Transpose(allQ, 1, 2)), dims: new long[] { batchSize * m_multiHeadNum, seqLenQ, m_d });
    IWeightTensor Ks = g.View(g.AsContiguous(g.Transpose(g.Transpose(allK, 1, 2), 2, 3)), dims: new long[] { batchSize * m_multiHeadNum, m_d, seqLenQ });
    IWeightTensor Vs = g.View(g.AsContiguous(g.Transpose(allV, 1, 2)), dims: new long[] { batchSize * m_multiHeadNum, seqLenQ, m_d });

    // Scaled softmax
    float scale = 1.0f / (float)Math.Sqrt(m_d);
    var attn = g.MulBatch(Qs, Ks, scale);
    attn = g.View(attn, dims: new long[] { batchSize, m_multiHeadNum, seqLenQ, seqLenQ });

    if (keyMask != null)
    {
        attn = g.Add(attn, keyMask, inPlace: true);
    }

    var attnProbs = g.Softmax(attn, inPlace: true);

    IWeightTensor sumAttnWeights = null;
    if (outputAttenWeights)
    {
        // Merge attention probabilities over all heads
        sumAttnWeights = graph.Sum(attnProbs, 1);
        sumAttnWeights = graph.Div(sumAttnWeights, (float)m_multiHeadNum);
        sumAttnWeights = graph.View(sumAttnWeights, new long[] { batchSize * seqLenQ, seqLenQ });
    }

    attnProbs = g.View(attnProbs, dims: new long[] { batchSize * m_multiHeadNum, seqLenQ, seqLenQ });

    IWeightTensor o = g.View(g.MulBatch(attnProbs, Vs), dims: new long[] { batchSize, m_multiHeadNum, seqLenQ, m_d });
    IWeightTensor W = g.View(g.AsContiguous(g.Transpose(o, 1, 2)), dims: new long[] { batchSize * seqLenQ, m_multiHeadNum * m_d });

    // Output projection
    IWeightTensor finalAttResults = g.Dropout(g.Affine(W, W0, b0), batchSize, m_dropoutRatio, inPlace: true);
    IWeightTensor result = graph.Add(finalAttResults, inputQ, inPlace: true);

    return (result, sumAttnWeights);
}
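// For reference, a minimal self-contained sketch (not part of the library) of the single-head
// scaled dot-product attention each head above computes: softmax(Q*K^T / sqrt(d)) * V.
// The plain-array representation and all names are illustrative assumptions.
using System;

public static class ScaledDotProductAttentionSketch
{
    // Q: [lenQ][d], K: [lenK][d], V: [lenK][d]; returns [lenQ][d].
    public static double[][] Attend(double[][] Q, double[][] K, double[][] V)
    {
        int d = Q[0].Length;
        double scale = 1.0 / Math.Sqrt(d);
        var output = new double[Q.Length][];

        for (int i = 0; i < Q.Length; i++)
        {
            // Scaled dot-product score of query i against every key
            var w = new double[K.Length];
            double max = double.NegativeInfinity;
            for (int j = 0; j < K.Length; j++)
            {
                double dot = 0.0;
                for (int k = 0; k < d; k++) { dot += Q[i][k] * K[j][k]; }
                w[j] = dot * scale;
                if (w[j] > max) { max = w[j]; }
            }

            // Softmax over keys (max-subtracted for numerical stability)
            double sum = 0.0;
            for (int j = 0; j < w.Length; j++) { w[j] = Math.Exp(w[j] - max); sum += w[j]; }

            // Weighted sum of value vectors
            output[i] = new double[d];
            for (int j = 0; j < K.Length; j++)
            {
                for (int k = 0; k < d; k++) { output[i][k] += (w[j] / sum) * V[j][k]; }
            }
        }
        return output;
    }
}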
public IWeightTensor Perform(IWeightTensor inputQ, IWeightTensor keyMask, int batchSize, IComputeGraph graph)
{
    if (m_sharedQKV == false)
    {
        throw new ArgumentException($"Layer '{m_name}' is not in shared QKV mode, please call another Perform function with three separate input tensors.");
    }

    using (IComputeGraph g = graph.CreateSubGraph($"{m_name}_MultiHeadAttention_SharedQKV"))
    {
        int seqLenQ = inputQ.Rows / batchSize;
        IWeightTensor inputQNorm = layerNormQ.Norm(inputQ, g);

        // Input projections
        float scale = 1.0f / (float)(m_inputDim);

        IWeightTensor mulQ, mulK, mulV;
        using (IWeightTensor inputQNormView = g.View(inputQNorm, dims: new long[] { 1, inputQ.Rows, inputQ.Columns }))
        {
            using (IWeightTensor inputQNormViewExp = g.Expand(inputQNormView, dims: new long[] { 3, inputQ.Rows, inputQ.Columns }))
            {
                using (IWeightTensor mulQKV = g.MulBatch(inputQNormViewExp, QKV, 3, scale))
                {
                    mulQ = g.Select(mulQKV, 0, 0);
                    mulK = g.Select(mulQKV, 0, 1);
                    mulV = g.Select(mulQKV, 0, 2);
                }
            }
        }

        IWeightTensor allQ = g.View(mulQ, dims: new long[] { batchSize, seqLenQ, m_multiHeadNum, m_d });
        IWeightTensor allK = g.View(mulK, dims: new long[] { batchSize, seqLenQ, m_multiHeadNum, m_d });
        IWeightTensor allV = g.View(mulV, dims: new long[] { batchSize, seqLenQ, m_multiHeadNum, m_d });

        // Multi-head attention
        IWeightTensor Qs = g.View(g.Permute(allQ, 2, 0, 1, 3), dims: new long[] { m_multiHeadNum * batchSize, seqLenQ, m_d });
        IWeightTensor Ks = g.View(g.Permute(allK, 2, 0, 3, 1), dims: new long[] { m_multiHeadNum * batchSize, m_d, seqLenQ });
        IWeightTensor Vs = g.View(g.Permute(allV, 2, 0, 1, 3), dims: new long[] { m_multiHeadNum * batchSize, seqLenQ, m_d });

        // Scaled softmax
        scale = 1.0f / (float)(m_d);
        IWeightTensor attn = g.MulBatch(Qs, Ks, m_multiHeadNum * batchSize, scale);
        IWeightTensor softmax = g.Softmax(attn, keyMask, inPlace: true);

        IWeightTensor o = g.View(g.MulBatch(softmax, Vs, m_multiHeadNum * batchSize), dims: new long[] { m_multiHeadNum, batchSize, seqLenQ, m_d });
        IWeightTensor W = g.View(g.Permute(o, 1, 2, 0, 3), dims: new long[] { batchSize * seqLenQ, m_multiHeadNum * m_d });

        // Output projection
        IWeightTensor finalAttResults = g.Dropout(g.Affine(W, W0, b0), batchSize, m_dropoutRatio, inPlace: true);

        return graph.Add(finalAttResults, inputQ);
    }
}
/// <summary>
/// Scaled multi-head attention component with skip-connected feed forward layers
/// </summary>
/// <param name="inputQ">The input Q tensor</param>
/// <param name="inputK">The input K tensor</param>
/// <param name="inputV">The input V tensor</param>
/// <param name="batchSize">Batch size of the input data set</param>
/// <param name="graph">The instance of the computing graph</param>
/// <returns>Transformed output tensor</returns>
public IWeightTensor Perform(IWeightTensor inputQ, IWeightTensor inputK, IWeightTensor inputV, IWeightTensor keyMask, int batchSize, IComputeGraph graph)
{
    using (IComputeGraph g = graph.CreateSubGraph($"{m_name}_MultiHeadAttention"))
    {
        int seqLenQ = inputQ.Rows / batchSize;

        // SeqLenK must be equal to SeqLenV
        int seqLenK = inputK.Rows / batchSize;
        int seqLenV = inputV.Rows / batchSize;

        IWeightTensor inputQNorm = layerNorm1.Norm(inputQ, g);
        IWeightTensor inputKNorm = (inputK == inputQ) ? inputQNorm : inputK; // layerNorm1.Norm(inputK, g);
        IWeightTensor inputVNorm = (inputK == inputV) ? inputKNorm : inputV; // layerNorm1.Norm(inputV, g);

        // Input projections
        IWeightTensor allQ = g.View(g.Affine(inputQNorm, Q, Qb), dims: new long[] { batchSize, seqLenQ, m_multiHeadNum, m_d });
        IWeightTensor allK = g.View(g.Affine(inputKNorm, K, Kb), dims: new long[] { batchSize, seqLenK, m_multiHeadNum, m_d });
        IWeightTensor allV = g.View(g.Affine(inputVNorm, V, Vb), dims: new long[] { batchSize, seqLenV, m_multiHeadNum, m_d });

        // Multi-head attention
        IWeightTensor Qs = g.View(g.Permute(allQ, 2, 0, 1, 3), dims: new long[] { m_multiHeadNum * batchSize, seqLenQ, m_d });
        IWeightTensor Ks = g.View(g.Permute(allK, 2, 0, 3, 1), dims: new long[] { m_multiHeadNum * batchSize, m_d, seqLenK });
        IWeightTensor Vs = g.View(g.Permute(allV, 2, 0, 1, 3), dims: new long[] { m_multiHeadNum * batchSize, seqLenV, m_d });

        // Scaled softmax
        float scale = 1.0f / (float)Math.Sqrt(m_d);
        IWeightTensor attn = g.MulBatch(Qs, Ks, m_multiHeadNum * batchSize, scale);
        IWeightTensor attn2 = g.View(attn, dims: new long[] { m_multiHeadNum * batchSize * seqLenQ, seqLenK });

        if (keyMask != null)
        {
            // attn2 = g.Add(attn2, mask, runGradient2: false);
            attn2 = g.MaskFill(attn2, keyMask, -1e9f);
        }

        IWeightTensor softmax = g.Softmax(attn2, inPlace: true);
        IWeightTensor softmax2 = g.View(softmax, dims: new long[] { m_multiHeadNum * batchSize, seqLenQ, seqLenK });

        IWeightTensor o = g.View(g.MulBatch(softmax2, Vs, m_multiHeadNum * batchSize), dims: new long[] { m_multiHeadNum, batchSize, seqLenQ, m_d });
        IWeightTensor W = g.View(g.Permute(o, 1, 2, 0, 3), dims: new long[] { batchSize * seqLenQ, m_multiHeadNum * m_d });

        // Output projection
        IWeightTensor finalAttResults = g.Dropout(g.Affine(W, W0, b0), batchSize, m_dropoutRatio, inPlace: true);

        return graph.Add(finalAttResults, inputQ);
    }
}
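// A small, self-contained illustration (not library code) of why masked positions receive a
// large negative value such as -1e9 before softmax, as g.MaskFill does above: after
// exponentiation those positions contribute (almost) zero probability mass. All names here
// are illustrative assumptions.
using System;

public static class SoftmaxMaskSketch
{
    // scores: raw attention logits; masked: true where the key position must be ignored.
    public static double[] MaskedSoftmax(double[] scores, bool[] masked)
    {
        var logits = new double[scores.Length];
        for (int j = 0; j < scores.Length; j++)
        {
            logits[j] = masked[j] ? -1e9 : scores[j]; // same effect as MaskFill(..., -1e9f)
        }

        double max = double.NegativeInfinity;
        for (int j = 0; j < logits.Length; j++) { if (logits[j] > max) { max = logits[j]; } }

        double sum = 0.0;
        var probs = new double[logits.Length];
        for (int j = 0; j < logits.Length; j++) { probs[j] = Math.Exp(logits[j] - max); sum += probs[j]; }
        for (int j = 0; j < probs.Length; j++) { probs[j] /= sum; }
        return probs;
    }
}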
/// <summary>
/// Scaled multi-head attention component with skip-connected feed forward layers
/// </summary>
/// <param name="input">The input tensor</param>
/// <param name="graph">The instance of the computing graph</param>
/// <returns></returns>
public IWeightTensor Perform(IWeightTensor input, IComputeGraph graph)
{
    IComputeGraph g = graph.CreateSubGraph(m_name);
    var seqLen = input.Rows / m_batchSize;

    // Input projections
    var allQ = g.View(Q.Process(input, g), m_batchSize, seqLen, m_multiHeadNum, m_d);
    var allK = g.View(K.Process(input, g), m_batchSize, seqLen, m_multiHeadNum, m_d);
    var allV = g.View(V.Process(input, g), m_batchSize, seqLen, m_multiHeadNum, m_d);

    // Multi-head attention
    var Qs = g.View(g.Permute(allQ, 2, 0, 1, 3), m_multiHeadNum * m_batchSize, seqLen, m_d);
    var Ks = g.View(g.Permute(allK, 2, 0, 3, 1), m_multiHeadNum * m_batchSize, m_d, seqLen);
    var Vs = g.View(g.Permute(allV, 2, 0, 1, 3), m_multiHeadNum * m_batchSize, seqLen, m_d);

    // Scaled softmax
    float scale = 1.0f / (float)Math.Sqrt(m_d);
    var attn = g.MulBatch(Qs, Ks, m_multiHeadNum * m_batchSize, scale);
    var attn2 = g.View(attn, m_multiHeadNum * m_batchSize * seqLen, seqLen);

    var softmax = g.Softmax(attn2);
    var softmax2 = g.View(softmax, m_multiHeadNum * m_batchSize, seqLen, seqLen);
    var o = g.View(g.MulBatch(softmax2, Vs, m_multiHeadNum * m_batchSize), m_multiHeadNum, m_batchSize, seqLen, m_d);
    var W = g.View(g.Permute(o, 1, 2, 0, 3), m_batchSize * seqLen, m_multiHeadNum * m_d);

    // Output projection
    var finalAttResults = g.Affine(W, W0, b0);

    // Skip connection and layer normalization
    var addedAttResult = g.Add(finalAttResults, input);
    var normAddedAttResult = layerNorm1.Process(addedAttResult, g);

    // Feed forward
    var ffnResult = feedForwardLayer1.Process(normAddedAttResult, g);
    var reluFFNResult = g.Relu(ffnResult);
    var ffn2Result = feedForwardLayer2.Process(reluFFNResult, g);

    // Skip connection and layer normalization
    var addFFNResult = g.Add(ffn2Result, normAddedAttResult);
    var normAddFFNResult = layerNorm2.Process(addFFNResult, g);

    return normAddFFNResult;
}
/// <summary>
/// Scaled multi-head attention component with skip-connected feed forward layers
/// </summary>
/// <param name="input">The input tensor</param>
/// <param name="graph">The instance of the computing graph</param>
/// <returns></returns>
public IWeightTensor Perform(IWeightTensor input, int batchSize, IComputeGraph graph)
{
    using (IComputeGraph g = graph.CreateSubGraph(m_name))
    {
        int seqLen = input.Rows / batchSize;
        IWeightTensor nInput = layerNorm1.Norm(input, g);

        // Input projections
        IWeightTensor allQ = g.View(g.Affine(nInput, Q, Qb), batchSize, seqLen, m_multiHeadNum, m_d);
        IWeightTensor allK = g.View(g.Affine(nInput, K, Kb), batchSize, seqLen, m_multiHeadNum, m_d);
        IWeightTensor allV = g.View(g.Affine(nInput, V, Vb), batchSize, seqLen, m_multiHeadNum, m_d);

        // Multi-head attention
        IWeightTensor Qs = g.View(g.Permute(allQ, 2, 0, 1, 3), m_multiHeadNum * batchSize, seqLen, m_d);
        IWeightTensor Ks = g.View(g.Permute(allK, 2, 0, 3, 1), m_multiHeadNum * batchSize, m_d, seqLen);
        IWeightTensor Vs = g.View(g.Permute(allV, 2, 0, 1, 3), m_multiHeadNum * batchSize, seqLen, m_d);

        // Scaled softmax
        float scale = 1.0f / (float)Math.Sqrt(m_d);
        IWeightTensor attn = g.MulBatch(Qs, Ks, m_multiHeadNum * batchSize, scale);
        IWeightTensor attn2 = g.View(attn, m_multiHeadNum * batchSize * seqLen, seqLen);

        IWeightTensor softmax = g.Softmax(attn2, inPlace: true);
        IWeightTensor softmax2 = g.View(softmax, m_multiHeadNum * batchSize, seqLen, seqLen);
        IWeightTensor o = g.View(g.MulBatch(softmax2, Vs, m_multiHeadNum * batchSize), m_multiHeadNum, batchSize, seqLen, m_d);
        IWeightTensor W = g.View(g.Permute(o, 1, 2, 0, 3), batchSize * seqLen, m_multiHeadNum * m_d);

        // Output projection
        IWeightTensor finalAttResults = g.Dropout(g.Affine(W, W0, b0), batchSize, m_dropoutRatio, inPlace: true);

        // Skip connection and layer normalization
        IWeightTensor normAddedAttResult = layerNorm2.AddNorm(finalAttResults, input, g);

        // Feed forward
        IWeightTensor ffnResult = feedForwardLayer1.Process(normAddedAttResult, batchSize, g);
        IWeightTensor reluFFNResult = g.Relu(ffnResult);
        IWeightTensor ffn2Result = feedForwardLayer2.Process(reluFFNResult, batchSize, g);

        // Skip connection and layer normalization
        IWeightTensor addFFNResult = graph.Add(ffn2Result, normAddedAttResult);

        return addFFNResult;
    }
}
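// Side note (not library code): the two encoder layers above differ in where layer
// normalization sits. A compact sketch of the two residual patterns, with Sublayer and Norm
// as assumed placeholder delegates rather than real library types:
using System;

public static class ResidualPatternSketch
{
    // Post-norm, as in the Perform(input, graph) variant: Norm(x + Sublayer(x))
    public static double[] PostNorm(double[] x, Func<double[], double[]> sublayer, Func<double[], double[]> norm)
        => norm(AddVec(x, sublayer(x)));

    // Pre-norm, as in the Perform(input, batchSize, graph) variant: x + Sublayer(Norm(x))
    public static double[] PreNorm(double[] x, Func<double[], double[]> sublayer, Func<double[], double[]> norm)
        => AddVec(x, sublayer(norm(x)));

    private static double[] AddVec(double[] a, double[] b)
    {
        var y = new double[a.Length];
        for (int i = 0; i < a.Length; i++) { y[i] = a[i] + b[i]; }
        return y;
    }
}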
public IWeightMatrix Perform(IWeightMatrix state, AttentionPreProcessResult attenPreProcessResult, IComputeGraph g)
{
    var bWas = g.RepeatRows(bWa, state.Rows);
    var wc = g.MulAdd(state, Wa, bWas);
    var wcs = g.RepeatRows(wc, attenPreProcessResult.inputs.Rows / m_batchSize);
    var ggs = g.AddTanh(attenPreProcessResult.uhs, wcs);
    var atten = g.Mul(ggs, V);

    var atten2 = g.PermuteBatch(atten, m_batchSize);
    var attenT = g.Transpose2(atten2);
    var attenT2 = g.View(attenT, m_batchSize, attenPreProcessResult.inputs.Rows / m_batchSize);

    var attenSoftmax = g.Softmax(attenT2);

    IWeightMatrix contexts = g.MulBatch(attenSoftmax, attenPreProcessResult.inputs, m_batchSize);

    return contexts;
}
/// <summary>
/// Scaled multi-head attention component with skip-connected feed forward layers
/// </summary>
/// <param name="inputQ">The input Q tensor</param>
/// <param name="inputK">The input K tensor</param>
/// <param name="inputV">The input V tensor</param>
/// <param name="batchSize">Batch size of the input data set</param>
/// <param name="graph">The instance of the computing graph</param>
/// <returns>Transformed output tensor</returns>
public IWeightTensor Perform(IWeightTensor inputQ, IWeightTensor inputK, IWeightTensor inputV, IWeightTensor keyMask, int batchSize, IComputeGraph graph)
{
    if (m_sharedQKV)
    {
        throw new ArgumentException($"Layer '{m_name}' is in shared QKV mode, please call another Perform function with a single input tensor.");
    }

    using (IComputeGraph g = graph.CreateSubGraph($"{m_name}_MultiHeadAttention"))
    {
        int seqLenQ = inputQ.Rows / batchSize;

        // SeqLenK must be equal to SeqLenV
        int seqLenK = inputK.Rows / batchSize;
        int seqLenV = inputV.Rows / batchSize;

        IWeightTensor inputQNorm = layerNormQ.Norm(inputQ, g);

        // Input projections
        float scale = 1.0f / (float)(m_inputDim);
        IWeightTensor allQ = g.View(g.Affine(inputQNorm, Q, Qb, scale), dims: new long[] { batchSize, seqLenQ, m_multiHeadNum, m_d });
        IWeightTensor allK = g.View(g.Affine(inputK, K, Kb, scale), dims: new long[] { batchSize, seqLenK, m_multiHeadNum, m_d });
        IWeightTensor allV = g.View(g.Affine(inputV, V, Vb, scale), dims: new long[] { batchSize, seqLenV, m_multiHeadNum, m_d });

        // Multi-head attention
        IWeightTensor Qs = g.View(g.Permute(allQ, 2, 0, 1, 3), dims: new long[] { m_multiHeadNum * batchSize, seqLenQ, m_d });
        IWeightTensor Ks = g.View(g.Permute(allK, 2, 0, 3, 1), dims: new long[] { m_multiHeadNum * batchSize, m_d, seqLenK });
        IWeightTensor Vs = g.View(g.Permute(allV, 2, 0, 1, 3), dims: new long[] { m_multiHeadNum * batchSize, seqLenV, m_d });

        // Scaled softmax
        scale = 1.0f / (float)(m_d);
        IWeightTensor attn = g.MulBatch(Qs, Ks, m_multiHeadNum * batchSize, scale);
        IWeightTensor softmax = g.Softmax(attn, keyMask, inPlace: true);

        IWeightTensor o = g.View(g.MulBatch(softmax, Vs, m_multiHeadNum * batchSize), dims: new long[] { m_multiHeadNum, batchSize, seqLenQ, m_d });
        IWeightTensor W = g.View(g.Permute(o, 1, 2, 0, 3), dims: new long[] { batchSize * seqLenQ, m_multiHeadNum * m_d });

        // Output projection
        IWeightTensor finalAttResults = g.Dropout(g.Affine(W, W0, b0), batchSize, m_dropoutRatio, inPlace: true);

        return graph.Add(finalAttResults, inputQ);
    }
}
public WeightMatrix Perform(List<WeightMatrix> input, WeightMatrix state, IComputeGraph g)
{
    WeightMatrix context;
    WeightMatrix[] atten = new WeightMatrix[input.Count];

    var wc = g.muladd(state, Wa, bWa);

    Parallel.For(0, input.Count, i =>
    {
        var h_j = input[i];
        var uh = g.muladd(h_j, Ua, bUa);
        var gg = g.addtanh(uh, wc);
        var aa = g.mul(gg, V);

        atten[i] = aa;
    });

    var res = g.Softmax(atten);

    //var cmax = res[0].Weight[0];
    //int maxAtt = 0;
    //for (int i = 1; i < res.Count; i++)
    //{
    //    if (res[i].Weight[0] > cmax)
    //    {
    //        cmax = res[i].Weight[0];
    //        maxAtt = i;
    //    }
    //}
    //this.MaxIndex = maxAtt;

    context = g.scalemul(input[0], res[0]);
    for (int hj = 1; hj < input.Count; hj++)
    {
        context = g.scalemuladd(input[hj], res[hj], context);
    }

    return context;
}
public IWeightTensor Perform(IWeightTensor state, AttentionPreProcessResult attenPreProcessResult, int batchSize, IComputeGraph graph)
{
    IComputeGraph g = graph.CreateSubGraph(m_name);

    var wc = g.Affine(state, m_Wa, m_bWa);
    var wcs = g.RepeatRows(wc, attenPreProcessResult.inputs.Rows / batchSize);
    var ggs = g.AddTanh(attenPreProcessResult.uhs, wcs);
    var atten = g.Mul(ggs, m_V);

    var atten2 = g.TransposeBatch(atten, batchSize);
    var attenT = g.Transpose(atten2);
    var attenT2 = g.View(attenT, batchSize, attenPreProcessResult.inputs.Rows / batchSize);

    var attenSoftmax1 = g.Softmax(attenT2, inPlace: true);

    var attenSoftmax = g.View(attenSoftmax1, batchSize, attenSoftmax1.Rows / batchSize, attenSoftmax1.Columns);
    var inputs2 = g.View(attenPreProcessResult.inputs, batchSize, attenPreProcessResult.inputs.Rows / batchSize, attenPreProcessResult.inputs.Columns);

    IWeightTensor contexts = g.MulBatch(attenSoftmax, inputs2, batchSize);

    return contexts;
}
public IWeightMatrix Perform(IWeightMatrix state, AttentionPreProcessResult attenPreProcessResult, IComputeGraph g)
{
    var bWas = g.RepeatRows(bWa, state.Rows);
    var wc = g.MulAdd(state, Wa, bWas);
    var wcs = g.RepeatRows(wc, attenPreProcessResult.inputs.Rows / m_batchSize);
    var ggs = g.AddTanh(attenPreProcessResult.uhs, wcs);
    var atten = g.Mul(ggs, V);

    List<IWeightMatrix> attens = g.UnFolderRow(atten, m_batchSize);
    List<IWeightMatrix> attensT = new List<IWeightMatrix>();
    for (int i = 0; i < m_batchSize; i++)
    {
        attensT.Add(g.Transpose2(attens[i]));
    }

    var attenT = g.ConcatRows(attensT);
    var attenSoftmax = g.Softmax(attenT);

    IWeightMatrix contexts = g.MulBatch(attenSoftmax, attenPreProcessResult.inputs, m_batchSize);

    return contexts;
}
private float DecodeTransformer(List<List<string>> tgtSeqs, IComputeGraph g, IWeightTensor encOutputs, TransformerDecoder decoder, IWeightTensor tgtEmbedding, IWeightTensor posEmbedding, int batchSize, int deviceId, List<int> srcOriginalLengths, bool isTraining = true)
{
    float cost = 0.0f;

    var tgtOriginalLengths = ParallelCorpus.PadSentences(tgtSeqs);
    int tgtSeqLen = tgtSeqs[0].Count;
    int srcSeqLen = encOutputs.Rows / batchSize;

    using (IWeightTensor srcTgtMask = MaskUtils.BuildSrcTgtMask(g, srcSeqLen, tgtSeqLen, tgtOriginalLengths, srcOriginalLengths, deviceId))
    {
        using (IWeightTensor tgtSelfTriMask = MaskUtils.BuildPadSelfTriMask(g, tgtSeqLen, tgtOriginalLengths, deviceId))
        {
            List<IWeightTensor> inputs = new List<IWeightTensor>();
            for (int i = 0; i < batchSize; i++)
            {
                for (int j = 0; j < tgtSeqLen; j++)
                {
                    int ix_targets_k = m_modelMetaData.Vocab.GetTargetWordIndex(tgtSeqs[i][j], logUnk: true);
                    var emb = g.PeekRow(tgtEmbedding, ix_targets_k, runGradients: j < tgtOriginalLengths[i] ? true : false);
                    inputs.Add(emb);
                }
            }

            IWeightTensor inputEmbs = inputs.Count > 1 ? g.ConcatRows(inputs) : inputs[0];

            inputEmbs = AddPositionEmbedding(g, posEmbedding, batchSize, tgtSeqLen, inputEmbs);

            IWeightTensor decOutput = decoder.Decode(inputEmbs, encOutputs, tgtSelfTriMask, srcTgtMask, batchSize, g);

            using (IWeightTensor probs = g.Softmax(decOutput, runGradients: false, inPlace: true))
            {
                if (isTraining)
                {
                    var leftShiftInputSeqs = ParallelCorpus.LeftShiftSnts(tgtSeqs, ParallelCorpus.EOS);
                    for (int i = 0; i < batchSize; i++)
                    {
                        for (int j = 0; j < tgtSeqLen; j++)
                        {
                            using (IWeightTensor probs_i_j = g.PeekRow(probs, i * tgtSeqLen + j, runGradients: false))
                            {
                                if (j < tgtOriginalLengths[i])
                                {
                                    int ix_targets_i_j = m_modelMetaData.Vocab.GetTargetWordIndex(leftShiftInputSeqs[i][j], logUnk: true);
                                    float score_i_j = probs_i_j.GetWeightAt(ix_targets_i_j);

                                    cost += (float)-Math.Log(score_i_j);

                                    probs_i_j.SetWeightAt(score_i_j - 1, ix_targets_i_j);
                                }
                                else
                                {
                                    probs_i_j.CleanWeight();
                                }
                            }
                        }
                    }

                    decOutput.CopyWeightsToGradients(probs);
                }
                //if (isTraining)
                //{
                //    var leftShiftInputSeqs = ParallelCorpus.LeftShiftSnts(tgtSeqs, ParallelCorpus.EOS);
                //    int[] targetIds = new int[batchSize * tgtSeqLen];
                //    int ids = 0;
                //    for (int i = 0; i < batchSize; i++)
                //    {
                //        for (int j = 0; j < tgtSeqLen; j++)
                //        {
                //            targetIds[ids] = j < tgtOriginalLengths[i] ? m_modelMetaData.Vocab.GetTargetWordIndex(leftShiftInputSeqs[i][j], logUnk: true) : -1;
                //            ids++;
                //        }
                //    }
                //    cost += g.UpdateCost(probs, targetIds);
                //    decOutput.CopyWeightsToGradients(probs);
                //}
                else
                {
                    // Output the last generated target word for each sentence in the batch
                    int[] targetIdx = g.Argmax(probs, 1);
                    List<string> targetWords = m_modelMetaData.Vocab.ConvertTargetIdsToString(targetIdx.ToList());
                    for (int i = 0; i < batchSize; i++)
                    {
                        tgtSeqs[i].Add(targetWords[i * tgtSeqLen + tgtSeqLen - 1]);
                    }
                }
            }
        }
    }

    return cost;
}
/// <summary>
/// Decode output sentences in training
/// </summary>
/// <param name="outputSentences">In training mode, they are golden target sentences; otherwise, they are the target sentences generated by the decoder</param>
/// <param name="g"></param>
/// <param name="encodedOutputs"></param>
/// <param name="decoder"></param>
/// <param name="embedding"></param>
/// <returns></returns>
private float Decode(List<List<string>> outputSentences, IComputeGraph g, IWeightTensor encodedOutputs, AttentionDecoder decoder, IWeightTensor embedding, int batchSize, bool isTraining = true)
{
    float cost = 0.0f;
    int[] ix_inputs = new int[batchSize];
    for (int i = 0; i < ix_inputs.Length; i++)
    {
        ix_inputs[i] = (int)SENTTAGS.START;
    }

    // Initialize variables according to the current mode
    List<int> originalOutputLengths = isTraining ? ParallelCorpus.PadSentences(outputSentences) : null;
    int seqLen = isTraining ? outputSentences[0].Count : 64;
    float dropoutRatio = isTraining ? m_dropoutRatio : 0.0f;
    HashSet<int> setEndSentId = isTraining ? null : new HashSet<int>();

    if (!isTraining)
    {
        if (outputSentences.Count != 0)
        {
            throw new ArgumentException($"The list for output sentences must be empty if current is not in training mode.");
        }
        for (int i = 0; i < batchSize; i++)
        {
            outputSentences.Add(new List<string>());
        }
    }

    // Pre-process for the attention model
    AttentionPreProcessResult attPreProcessResult = decoder.PreProcess(encodedOutputs, batchSize, g);

    for (int i = 0; i < seqLen; i++)
    {
        // Get embeddings for all sentences in the batch at position i
        List<IWeightTensor> inputs = new List<IWeightTensor>();
        for (int j = 0; j < batchSize; j++)
        {
            inputs.Add(g.PeekRow(embedding, ix_inputs[j]));
        }
        IWeightTensor inputsM = g.ConcatRows(inputs);

        // Decode output sentences at position i
        IWeightTensor eOutput = decoder.Decode(inputsM, attPreProcessResult, batchSize, g);

        // Softmax for output
        using (IWeightTensor probs = g.Softmax(eOutput, runGradients: false, inPlace: true))
        {
            if (isTraining)
            {
                // Calculate the loss for each word in the batch
                for (int k = 0; k < batchSize; k++)
                {
                    using (IWeightTensor probs_k = g.PeekRow(probs, k, runGradients: false))
                    {
                        int ix_targets_k = m_modelMetaData.Vocab.GetTargetWordIndex(outputSentences[k][i]);
                        float score_k = probs_k.GetWeightAt(ix_targets_k);
                        if (i < originalOutputLengths[k])
                        {
                            cost += (float)-Math.Log(score_k);
                        }

                        probs_k.SetWeightAt(score_k - 1, ix_targets_k);

                        ix_inputs[k] = ix_targets_k;
                    }
                }
                eOutput.CopyWeightsToGradients(probs);
            }
            else
            {
                // Output the "i"th target word
                int[] targetIdx = g.Argmax(probs, 1);
                List<string> targetWords = m_modelMetaData.Vocab.ConvertTargetIdsToString(targetIdx.ToList());
                for (int j = 0; j < targetWords.Count; j++)
                {
                    if (setEndSentId.Contains(j) == false)
                    {
                        outputSentences[j].Add(targetWords[j]);
                        if (targetWords[j] == ParallelCorpus.EOS)
                        {
                            setEndSentId.Add(j);
                        }
                    }
                }

                ix_inputs = targetIdx;
            }
        }

        if (isTraining)
        {
            // Hacky: run backward for the last feed forward layer and dropout layer here in order to save memory, since they have no time sequence dependency
            g.RunTopBackward();
            if (m_dropoutRatio > 0.0f)
            {
                g.RunTopBackward();
            }
        }
        else
        {
            if (setEndSentId.Count == batchSize)
            {
                // All target sentences in the current batch are finished, so we exit
                break;
            }
        }
    }

    return cost;
}
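// For reference, a minimal self-contained sketch (not library code) of the gradient shortcut
// used by the decode loops above: for a softmax followed by negative log-likelihood,
// dLoss/dLogits equals the probabilities, except at the golden index where it is
// probs[target] - 1. This is why the code overwrites the target weight with (score - 1) and
// then copies the buffer into the gradients. All names here are illustrative assumptions.
using System;

public static class SoftmaxCrossEntropySketch
{
    // Returns (loss, gradient w.r.t. logits) for a single example.
    public static (double Loss, double[] Grad) LossAndGrad(double[] logits, int target)
    {
        double max = double.NegativeInfinity;
        foreach (double v in logits) { if (v > max) { max = v; } }

        double sum = 0.0;
        var probs = new double[logits.Length];
        for (int i = 0; i < logits.Length; i++) { probs[i] = Math.Exp(logits[i] - max); sum += probs[i]; }
        for (int i = 0; i < probs.Length; i++) { probs[i] /= sum; }

        double loss = -Math.Log(probs[target]); // cost += -log(score) in the code above

        var grad = probs;    // dLoss/dLogits == probs ...
        grad[target] -= 1.0; // ... minus one at the golden index (SetWeightAt(score - 1, ...))
        return (loss, grad);
    }
}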
/// <summary>
/// Decode output sentences in training
/// </summary>
/// <param name="outputSnts">In training mode, they are golden target sentences; otherwise, they are the target sentences generated by the decoder</param>
/// <param name="g"></param>
/// <param name="encOutputs"></param>
/// <param name="decoder"></param>
/// <param name="tgtEmbedding"></param>
/// <returns></returns>
private float DecodeAttentionLSTM(List<List<string>> outputSnts, IComputeGraph g, IWeightTensor encOutputs, AttentionDecoder decoder, IWeightTensor tgtEmbedding, int batchSize, bool isTraining = true)
{
    float cost = 0.0f;
    int[] ix_inputs = new int[batchSize];
    for (int i = 0; i < ix_inputs.Length; i++)
    {
        ix_inputs[i] = m_modelMetaData.Vocab.GetTargetWordIndex(outputSnts[i][0]);
    }

    // Initialize variables according to the current mode
    List<int> originalOutputLengths = isTraining ? ParallelCorpus.PadSentences(outputSnts) : null;
    int seqLen = isTraining ? outputSnts[0].Count : 64;
    float dropoutRatio = isTraining ? m_dropoutRatio : 0.0f;
    HashSet<int> setEndSentId = isTraining ? null : new HashSet<int>();

    // Pre-process for the attention model
    AttentionPreProcessResult attPreProcessResult = decoder.PreProcess(encOutputs, batchSize, g);

    for (int i = 1; i < seqLen; i++)
    {
        // Get embeddings for all sentences in the batch at position i
        List<IWeightTensor> inputs = new List<IWeightTensor>();
        for (int j = 0; j < batchSize; j++)
        {
            inputs.Add(g.PeekRow(tgtEmbedding, ix_inputs[j]));
        }
        IWeightTensor inputsM = g.ConcatRows(inputs);

        // Decode output sentences at position i
        IWeightTensor eOutput = decoder.Decode(inputsM, attPreProcessResult, batchSize, g);

        // Softmax for output
        using (IWeightTensor probs = g.Softmax(eOutput, runGradients: false, inPlace: true))
        {
            if (isTraining)
            {
                // Calculate the loss for each word in the batch
                for (int k = 0; k < batchSize; k++)
                {
                    using (IWeightTensor probs_k = g.PeekRow(probs, k, runGradients: false))
                    {
                        int ix_targets_k = m_modelMetaData.Vocab.GetTargetWordIndex(outputSnts[k][i]);
                        float score_k = probs_k.GetWeightAt(ix_targets_k);
                        if (i < originalOutputLengths[k])
                        {
                            cost += (float)-Math.Log(score_k);
                        }

                        probs_k.SetWeightAt(score_k - 1, ix_targets_k);

                        ix_inputs[k] = ix_targets_k;
                    }
                }
                eOutput.CopyWeightsToGradients(probs);
            }
            else
            {
                // Output the "i"th target word
                int[] targetIdx = g.Argmax(probs, 1);
                List<string> targetWords = m_modelMetaData.Vocab.ConvertTargetIdsToString(targetIdx.ToList());
                for (int j = 0; j < targetWords.Count; j++)
                {
                    if (setEndSentId.Contains(j) == false)
                    {
                        outputSnts[j].Add(targetWords[j]);
                        if (targetWords[j] == ParallelCorpus.EOS)
                        {
                            setEndSentId.Add(j);
                        }
                    }
                }

                if (setEndSentId.Count == batchSize)
                {
                    // All target sentences in the current batch are finished, so we exit
                    break;
                }

                ix_inputs = targetIdx;
            }
        }
    }

    return cost;
}
/// <summary>
/// Run the forward part on a given single device
/// </summary>
/// <param name="computeGraph">The computing graph for the current device. It gets created and passed by the framework</param>
/// <param name="sntPairBatch">A batch of tokenized sentence pairs; the target side holds golden similarity scores or class labels</param>
/// <param name="deviceIdIdx">The index of the current device</param>
/// <returns>The network results, including the cost of the forward part</returns>
public override List<NetworkResult> RunForwardOnSingleDevice(IComputeGraph computeGraph, ISntPairBatch sntPairBatch, int deviceIdIdx, bool isTraining, DecodingOptions decodingOptions)
{
    int batchSize = sntPairBatch.BatchSize;
    float cost = 0.0f;
    var nrs = new List<NetworkResult>();
    var nr = new NetworkResult
    {
        Output = new List<List<List<string>>>()
    };

    (IEncoder encoder, IWeightTensor srcEmbedding, IFeedForwardLayer encoderFFLayer, IWeightTensor posEmbedding, IWeightTensor segmentEmbedding) = GetNetworksOnDeviceAt(deviceIdIdx);

    IWeightTensor encOutput1;
    IWeightTensor encOutput2;

    if (!isTraining && (m_options.ProcessorType == ProcessorTypeEnums.CPU))
    {
        // We only check the cache at inference time
        string cacheKey1 = GenerateCacheKey(sntPairBatch.GetSrcTokens(0));
        if (!m_memoryCache.TryGetValue(cacheKey1, out encOutput1))
        {
            encOutput1 = Encoder.BuildTensorForSourceTokenGroupAt(computeGraph, sntPairBatch, m_shuffleType, encoder, m_modelMetaData, srcEmbedding, posEmbedding, segmentEmbedding, 0); // output shape: [batch_size, dim]

            var cacheEntryOptions = new MemoryCacheEntryOptions().SetSize(1);
            m_memoryCache.Set(cacheKey1, encOutput1.CopyWeightsRef($"cache_{encOutput1.Name}", false), cacheEntryOptions);
        }

        string cacheKey2 = GenerateCacheKey(sntPairBatch.GetSrcTokens(1));
        if (!m_memoryCache.TryGetValue(cacheKey2, out encOutput2))
        {
            encOutput2 = Encoder.BuildTensorForSourceTokenGroupAt(computeGraph, sntPairBatch, m_shuffleType, encoder, m_modelMetaData, srcEmbedding, posEmbedding, segmentEmbedding, 1); // output shape: [batch_size, dim]

            var cacheEntryOptions = new MemoryCacheEntryOptions().SetSize(1);
            m_memoryCache.Set(cacheKey2, encOutput2.CopyWeightsRef($"cache_{encOutput2.Name}", false), cacheEntryOptions);
        }
    }
    else
    {
        // We always run the encoder network during training or when using GPUs
        encOutput1 = Encoder.BuildTensorForSourceTokenGroupAt(computeGraph, sntPairBatch, m_shuffleType, encoder, m_modelMetaData, srcEmbedding, posEmbedding, segmentEmbedding, 0); // output shape: [batch_size, dim]
        encOutput2 = Encoder.BuildTensorForSourceTokenGroupAt(computeGraph, sntPairBatch, m_shuffleType, encoder, m_modelMetaData, srcEmbedding, posEmbedding, segmentEmbedding, 1); // output shape: [batch_size, dim]
    }

    if (m_modelMetaData.SimilarityType.Equals("Continuous", StringComparison.InvariantCultureIgnoreCase))
    {
        // Cosine similarity
        var w12 = computeGraph.EltMul(encOutput1, encOutput2);
        w12 = computeGraph.Sum(w12, 1);
        var w1 = computeGraph.EltMul(encOutput1, encOutput1);
        w1 = computeGraph.Sum(w1, 1);
        var w2 = computeGraph.EltMul(encOutput2, encOutput2);
        w2 = computeGraph.Sum(w2, 1);
        var n12 = computeGraph.EltMul(w1, w2);
        n12 = computeGraph.Rsqrt(n12);
        var probs = computeGraph.EltMul(w12, n12);

        if (isTraining)
        {
            var tgtSnts = sntPairBatch.GetTgtTokens(0);
            for (int k = 0; k < batchSize; k++)
            {
                float golden_score_k = float.Parse(tgtSnts[k][0]); // Get the golden similarity score from the target side
                float score_k = probs.GetWeightAt(new long[] { k, 0 });

                probs.SetWeightAt(score_k - golden_score_k, new long[] { k, 0 });
                cost += (float)Math.Abs(score_k - golden_score_k);
            }

            probs.CopyWeightsToGradients(probs);
            nr.Cost = cost / batchSize;
        }
        else
        {
            nr.Output.Add(new List<List<string>>());
            for (int k = 0; k < batchSize; k++)
            {
                float score_k = probs.GetWeightAt(new long[] { k, 0 });
                nr.Output[0].Add(new List<string>());
                nr.Output[0][k].Add(score_k.ToString());
            }
        }
    }
    else
    {
        IWeightTensor encOutput = computeGraph.EltMul(encOutput1, encOutput2);
        IWeightTensor ffLayer = encoderFFLayer.Process(encOutput, batchSize, computeGraph);

        using (IWeightTensor probs = computeGraph.Softmax(ffLayer, runGradients: false, inPlace: true))
        {
            if (isTraining)
            {
                var tgtSnts = sntPairBatch.GetTgtTokens(0);
                for (int k = 0; k < batchSize; k++)
                {
                    int ix_targets_k_j = m_modelMetaData.ClsVocab.GetWordIndex(tgtSnts[k][0]);
                    float score_k = probs.GetWeightAt(new long[] { k, ix_targets_k_j });
                    cost += (float)-Math.Log(score_k);
                    probs.SetWeightAt(score_k - 1, new long[] { k, ix_targets_k_j });
                }

                ffLayer.CopyWeightsToGradients(probs);
                nr.Cost = cost / batchSize;
            }
            else
            {
                // Output the predicted class for each sentence pair
                using var targetIdxTensor = computeGraph.Argmax(probs, 1);
                float[] targetIdx = targetIdxTensor.ToWeightArray();
                List<string> targetWords = m_modelMetaData.ClsVocab.ConvertIdsToString(targetIdx.ToList());
                nr.Output.Add(new List<List<string>>());

                for (int k = 0; k < batchSize; k++)
                {
                    nr.Output[0].Add(new List<string>());
                    nr.Output[0][k].Add(targetWords[k]);
                }
            }
        }
    }

    nrs.Add(nr);
    return nrs;
}
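// For reference, a small self-contained sketch (not library code) of the batched cosine
// similarity the method above builds from EltMul/Sum/Rsqrt:
// cos(a, b) = (a . b) / sqrt((a . a) * (b . b)). Names here are illustrative assumptions.
using System;

public static class CosineSimilaritySketch
{
    public static double Cosine(double[] a, double[] b)
    {
        double dot = 0.0, na = 0.0, nb = 0.0;
        for (int i = 0; i < a.Length; i++)
        {
            dot += a[i] * b[i]; // Sum(EltMul(a, b), 1)
            na += a[i] * a[i];  // Sum(EltMul(a, a), 1)
            nb += b[i] * b[i];  // Sum(EltMul(b, b), 1)
        }
        // EltMul with Rsqrt of the product of the squared norms
        return dot * (1.0 / Math.Sqrt(na * nb));
    }
}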
/// <summary>
/// Run forward part on given single device
/// </summary>
/// <param name="g">The computing graph for current device. It gets created and passed by the framework</param>
/// <param name="srcSnts">A batch of input tokenized sentences in source side</param>
/// <param name="tgtSnts">A batch of output tokenized sentences in target side. In training mode, it inputs target tokens, otherwise, it outputs target tokens generated by decoder</param>
/// <param name="deviceIdIdx">The index of current device</param>
/// <returns>The cost of forward part</returns>
private float RunForwardOnSingleDevice(IComputeGraph g, List<List<string>> srcSnts, List<List<string>> tgtSnts, int deviceIdIdx, bool isTraining)
{
    var (encoder, srcEmbedding, posEmbedding, decoderFFLayer) = this.GetNetworksOnDeviceAt(deviceIdIdx);

    // Reset networks
    encoder.Reset(g.GetWeightFactory(), srcSnts.Count);

    var originalSrcLengths = ParallelCorpus.PadSentences(srcSnts);
    var seqLen = srcSnts[0].Count;
    var batchSize = srcSnts.Count;

    // Encoding input source sentences
    var encOutput = this.Encode(g, srcSnts, encoder, srcEmbedding, null, posEmbedding, originalSrcLengths);
    var ffLayer = decoderFFLayer.Process(encOutput, batchSize, g);
    var ffLayerBatch = g.TransposeBatch(ffLayer, batchSize);

    var cost = 0.0f;
    using (var probs = g.Softmax(ffLayerBatch, runGradients: false, inPlace: true))
    {
        if (isTraining)
        {
            //Calculate loss for each word in the batch
            for (var k = 0; k < batchSize; k++)
            {
                for (var j = 0; j < seqLen; j++)
                {
                    using (var probs_k_j = g.PeekRow(probs, k * seqLen + j, runGradients: false))
                    {
                        var ix_targets_k_j = this.m_modelMetaData.Vocab.GetTargetWordIndex(tgtSnts[k][j]);
                        var score_k = probs_k_j.GetWeightAt(ix_targets_k_j);
                        cost += (float)-Math.Log(score_k);

                        probs_k_j.SetWeightAt(score_k - 1, ix_targets_k_j);
                    }
                }

                ////CRF part
                //using (var probs_k = g.PeekRow(probs, k * seqLen, seqLen, runGradients: false))
                //{
                //    var weights_k = probs_k.ToWeightArray();
                //    var crfOutput_k = m_crfDecoder.ForwardBackward(seqLen, weights_k);
                //    int[] trueTags = new int[seqLen];
                //    for (int j = 0; j < seqLen; j++)
                //    {
                //        trueTags[j] = m_modelMetaData.Vocab.GetTargetWordIndex(tgtSnts[k][j]);
                //    }
                //    m_crfDecoder.UpdateBigramTransition(seqLen, crfOutput_k, trueTags);
                //}
            }

            ffLayerBatch.CopyWeightsToGradients(probs);
        }
        else
        {
            // CRF decoder
            //for (int k = 0; k < batchSize; k++)
            //{
            //    //CRF part
            //    using (var probs_k = g.PeekRow(probs, k * seqLen, seqLen, runGradients: false))
            //    {
            //        var weights_k = probs_k.ToWeightArray();
            //        var crfOutput_k = m_crfDecoder.DecodeNBestCRF(weights_k, seqLen, 1);
            //        var targetWords = m_modelMetaData.Vocab.ConvertTargetIdsToString(crfOutput_k[0].ToList());
            //        tgtSnts.Add(targetWords);
            //    }
            //}

            // Output "i"th target word
            var targetIdx = g.Argmax(probs, 1);
            var targetWords = this.m_modelMetaData.Vocab.ConvertTargetIdsToString(targetIdx.ToList());
            for (var k = 0; k < batchSize; k++)
            {
                tgtSnts[k] = targetWords.GetRange(k * seqLen, seqLen);
            }
        }
    }

    return cost;
}
private float DecodeTransformer(List<List<string>> outInputSeqs, IComputeGraph g, IWeightTensor encOutputs, IWeightTensor encMask, TransformerDecoder decoder, IWeightTensor tgtEmbedding, int batchSize, int deviceId, bool isTraining = true)
{
    float cost = 0.0f;

    var originalInputLengths = ParallelCorpus.PadSentences(outInputSeqs);
    int tgtSeqLen = outInputSeqs[0].Count;

    IWeightTensor tgtDimMask = MaskUtils.BuildPadDimMask(g, tgtSeqLen, originalInputLengths, m_modelMetaData.HiddenDim, deviceId);
    using (IWeightTensor tgtSelfTriMask = MaskUtils.BuildPadSelfTriMask(g, tgtSeqLen, originalInputLengths, deviceId))
    {
        List<IWeightTensor> inputs = new List<IWeightTensor>();
        for (int i = 0; i < batchSize; i++)
        {
            for (int j = 0; j < tgtSeqLen; j++)
            {
                int ix_targets_k = m_modelMetaData.Vocab.GetTargetWordIndex(outInputSeqs[i][j], logUnk: true);
                inputs.Add(g.PeekRow(tgtEmbedding, ix_targets_k));
            }
        }

        IWeightTensor tgtInputEmbeddings = inputs.Count > 1 ? g.ConcatRows(inputs) : inputs[0];
        IWeightTensor decOutput = decoder.Decode(tgtInputEmbeddings, encOutputs, tgtSelfTriMask, encMask, tgtDimMask, batchSize, g);

        decOutput = g.Mul(decOutput, g.Transpose(tgtEmbedding));

        using (IWeightTensor probs = g.Softmax(decOutput, runGradients: false, inPlace: true))
        {
            if (isTraining)
            {
                var leftShiftInputSeqs = ParallelCorpus.LeftShiftSnts(outInputSeqs, ParallelCorpus.EOS);
                var originalOutputLengths = ParallelCorpus.PadSentences(leftShiftInputSeqs, tgtSeqLen);

                for (int i = 0; i < batchSize; i++)
                {
                    for (int j = 0; j < tgtSeqLen; j++)
                    {
                        using (IWeightTensor probs_i_j = g.PeekRow(probs, i * tgtSeqLen + j, runGradients: false))
                        {
                            if (j < originalOutputLengths[i])
                            {
                                int ix_targets_i_j = m_modelMetaData.Vocab.GetTargetWordIndex(leftShiftInputSeqs[i][j], logUnk: true);
                                float score_i_j = probs_i_j.GetWeightAt(ix_targets_i_j);
                                if (j < originalOutputLengths[i])
                                {
                                    cost += (float)-Math.Log(score_i_j);
                                }

                                probs_i_j.SetWeightAt(score_i_j - 1, ix_targets_i_j);
                            }
                            else
                            {
                                probs_i_j.CleanWeight();
                            }
                        }
                    }
                }

                decOutput.CopyWeightsToGradients(probs);
            }
            else
            {
                // Output the last generated target word for each sentence in the batch
                int[] targetIdx = g.Argmax(probs, 1);
                List<string> targetWords = m_modelMetaData.Vocab.ConvertTargetIdsToString(targetIdx.ToList());
                for (int i = 0; i < batchSize; i++)
                {
                    outInputSeqs[i].Add(targetWords[i * tgtSeqLen + tgtSeqLen - 1]);
                }
            }
        }
    }

    return cost;
}
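// For reference, a brief self-contained sketch (not library code) of the tied output
// projection used above, where g.Mul(decOutput, g.Transpose(tgtEmbedding)) produces logits by
// multiplying decoder states with the transposed target embedding matrix: logits = H * E^T.
// Shapes and names here are illustrative assumptions.
public static class TiedEmbeddingProjectionSketch
{
    // hidden: [seqLen][dim], embedding: [vocabSize][dim]; returns logits [seqLen][vocabSize].
    public static double[][] Logits(double[][] hidden, double[][] embedding)
    {
        var logits = new double[hidden.Length][];
        for (int t = 0; t < hidden.Length; t++)
        {
            logits[t] = new double[embedding.Length];
            for (int w = 0; w < embedding.Length; w++)
            {
                double dot = 0.0;
                for (int k = 0; k < hidden[t].Length; k++) { dot += hidden[t][k] * embedding[w][k]; }
                logits[t][w] = dot; // score of vocabulary item w at time step t
            }
        }
        return logits;
    }
}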
/// <summary>
/// Scaled multi-head attention component with skip-connected feed forward layers
/// </summary>
/// <param name="inputQ">The input Q tensor</param>
/// <param name="inputK">The input K tensor</param>
/// <param name="inputV">The input V tensor</param>
/// <param name="keyMask">The mask for softmax</param>
/// <param name="batchSize">Batch size of the input data set</param>
/// <param name="graph">The instance of the computing graph</param>
/// <returns>Transformed output tensor</returns>
public IWeightTensor Perform(IWeightTensor inputQ, IWeightTensor inputK, IWeightTensor inputV, IWeightTensor keyMask, int batchSize, IComputeGraph graph)
{
    using (IComputeGraph g = graph.CreateSubGraph($"{m_name}_MultiHeadAttention"))
    {
        int seqLenQ = inputQ.Rows / batchSize;

        // SeqLenK must be equal to SeqLenV
        int seqLenK = inputK.Rows / batchSize;
        int seqLenV = inputV.Rows / batchSize;

        IWeightTensor inputQNorm = layerNormQ.Norm(inputQ, g);
        if (inputK == inputQ)
        {
            inputK = inputQNorm;
        }
        if (inputV == inputQ)
        {
            inputV = inputQNorm;
        }

        // Input projections
        float scale = 1.0f;
        IWeightTensor allQ = g.View(g.Affine(inputQNorm, Q, Qb, scale), dims: new long[] { batchSize, seqLenQ, m_multiHeadNum, m_d });
        IWeightTensor allK = g.View(g.Affine(inputK, K, Kb, scale), dims: new long[] { batchSize, seqLenK, m_multiHeadNum, m_d });
        IWeightTensor allV = g.View(g.Affine(inputV, V, Vb, scale), dims: new long[] { batchSize, seqLenV, m_multiHeadNum, m_d });

        // Multi-head attention
        IWeightTensor Qs = g.View(g.AsContiguous(g.Transpose(allQ, 1, 2)), dims: new long[] { batchSize * m_multiHeadNum, seqLenQ, m_d });
        IWeightTensor Ks = g.View(g.AsContiguous(g.Transpose(g.Transpose(allK, 1, 2), 2, 3)), dims: new long[] { batchSize * m_multiHeadNum, m_d, seqLenK });
        IWeightTensor Vs = g.View(g.AsContiguous(g.Transpose(allV, 1, 2)), dims: new long[] { batchSize * m_multiHeadNum, seqLenV, m_d });

        // Scaled softmax
        scale = 1.0f / (float)(Math.Sqrt(m_d));
        IWeightTensor attn = g.MulBatch(Qs, Ks, batchSize * m_multiHeadNum, scale);

        if (keyMask != null)
        {
            using (var keyMaskView = g.View(keyMask, runGradient: false, dims: new long[] { batchSize, 1, seqLenQ, seqLenK }))
            {
                using (var keyMaskViewExp = g.Expand(keyMaskView, runGradient: false, dims: new long[] { batchSize, m_multiHeadNum, seqLenQ, seqLenK }))
                {
                    using (var keyMaskViewExpConti = g.AsContiguous(keyMaskViewExp, runGradient: false))
                    {
                        using (var keyMaskViewExpContiView = g.View(keyMaskViewExpConti, runGradient: false, dims: new long[] { batchSize * m_multiHeadNum, seqLenQ, seqLenK }))
                        {
                            attn = g.Add(attn, keyMaskViewExpContiView, runGradient1: true, runGradient2: false);
                        }
                    }
                }
            }
        }

        IWeightTensor softmax = g.Softmax(attn, inPlace: true);

        IWeightTensor o = g.View(g.MulBatch(softmax, Vs, batchSize * m_multiHeadNum), dims: new long[] { batchSize, m_multiHeadNum, seqLenQ, m_d });
        IWeightTensor W = g.View(g.AsContiguous(g.Transpose(o, 1, 2)), dims: new long[] { batchSize * seqLenQ, m_multiHeadNum * m_d });

        // Output projection
        IWeightTensor finalAttResults = g.Dropout(g.Affine(W, W0, b0), batchSize, m_dropoutRatio, inPlace: true);

        return graph.Add(finalAttResults, inputQ);
    }
}
/// <summary>
/// Given an input sentence, generate output sentences by the seq2seq model with beam search
/// </summary>
/// <param name="input"></param>
/// <param name="beamSearchSize"></param>
/// <param name="maxOutputLength"></param>
/// <returns></returns>
public List<List<string>> Predict(List<string> input, int beamSearchSize = 1, int maxOutputLength = 100)
{
    (IEncoder encoder, IDecoder decoder, IWeightTensor srcEmbedding, IWeightTensor tgtEmbedding) = GetNetworksOnDeviceAt(-1);

    List<List<string>> inputSeqs = ParallelCorpus.ConstructInputTokens(input);
    int batchSize = 1; // For prediction with beam search, we currently only support one sentence per call

    IComputeGraph g = CreateComputGraph(m_defaultDeviceId, needBack: false);
    AttentionDecoder rnnDecoder = decoder as AttentionDecoder;

    encoder.Reset(g.GetWeightFactory(), batchSize);
    rnnDecoder.Reset(g.GetWeightFactory(), batchSize);

    // Construct the beam search status list
    List<BeamSearchStatus> bssList = new List<BeamSearchStatus>();

    BeamSearchStatus bss = new BeamSearchStatus();
    bss.OutputIds.Add((int)SENTTAGS.START);
    bss.CTs = rnnDecoder.GetCTs();
    bss.HTs = rnnDecoder.GetHTs();
    bssList.Add(bss);

    IWeightTensor encodedWeightMatrix = Encode(g, inputSeqs, encoder, srcEmbedding, null, null);
    AttentionPreProcessResult attPreProcessResult = rnnDecoder.PreProcess(encodedWeightMatrix, batchSize, g);

    List<BeamSearchStatus> newBSSList = new List<BeamSearchStatus>();
    bool finished = false;
    int outputLength = 0;
    while (finished == false && outputLength < maxOutputLength)
    {
        finished = true;
        for (int i = 0; i < bssList.Count; i++)
        {
            bss = bssList[i];
            if (bss.OutputIds[bss.OutputIds.Count - 1] == (int)SENTTAGS.END)
            {
                newBSSList.Add(bss);
            }
            else if (bss.OutputIds.Count > maxOutputLength)
            {
                newBSSList.Add(bss);
            }
            else
            {
                finished = false;
                int ix_input = bss.OutputIds[bss.OutputIds.Count - 1];

                rnnDecoder.SetCTs(bss.CTs);
                rnnDecoder.SetHTs(bss.HTs);

                IWeightTensor x = g.PeekRow(tgtEmbedding, ix_input);
                IWeightTensor eOutput = rnnDecoder.Decode(x, attPreProcessResult, batchSize, g);
                using (IWeightTensor probs = g.Softmax(eOutput))
                {
                    List<int> preds = probs.GetTopNMaxWeightIdx(beamSearchSize);
                    for (int j = 0; j < preds.Count; j++)
                    {
                        BeamSearchStatus newBSS = new BeamSearchStatus();
                        newBSS.OutputIds.AddRange(bss.OutputIds);
                        newBSS.OutputIds.Add(preds[j]);
                        newBSS.CTs = rnnDecoder.GetCTs();
                        newBSS.HTs = rnnDecoder.GetHTs();

                        float score = probs.GetWeightAt(preds[j]);
                        newBSS.Score = bss.Score;
                        newBSS.Score += (float)(-Math.Log(score));

                        //var lengthPenalty = Math.Pow((5.0f + newBSS.OutputIds.Count) / 6, 0.6);
                        //newBSS.Score /= (float)lengthPenalty;

                        newBSSList.Add(newBSS);
                    }
                }
            }
        }

        bssList = BeamSearch.GetTopNBSS(newBSSList, beamSearchSize);
        newBSSList.Clear();

        outputLength++;
    }

    // Convert the output target word ids into real strings
    List<List<string>> results = new List<List<string>>();
    for (int i = 0; i < bssList.Count; i++)
    {
        results.Add(m_modelMetaData.Vocab.ConvertTargetIdsToString(bssList[i].OutputIds));
    }

    return results;
}
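// For reference, a compact self-contained sketch (not library code) of the top-N hypothesis
// selection that BeamSearch.GetTopNBSS performs above: keep the N candidates with the lowest
// accumulated negative log-probability. The HypothesisSketch type is an illustrative
// assumption, not the library's BeamSearchStatus.
using System.Collections.Generic;
using System.Linq;

public sealed class HypothesisSketch
{
    public List<int> OutputIds = new List<int>();
    public float Score; // accumulated -log(prob); lower is better
}

public static class BeamSearchSketch
{
    public static List<HypothesisSketch> TopN(List<HypothesisSketch> candidates, int n)
    {
        // Sort ascending by score and keep the best n hypotheses
        return candidates.OrderBy(h => h.Score).Take(n).ToList();
    }
}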
/// <summary>
/// Run the forward part on a given single device
/// </summary>
/// <param name="computeGraph">The computing graph for the current device. It gets created and passed by the framework</param>
/// <param name="sntPairBatch">A batch of tokenized sentence pairs; the target side holds one classification label per task</param>
/// <param name="deviceIdIdx">The index of the current device</param>
/// <returns>The network results, including the cost of the forward part</returns>
public override List<NetworkResult> RunForwardOnSingleDevice(IComputeGraph computeGraph, ISntPairBatch sntPairBatch, int deviceIdIdx, bool isTraining, DecodingOptions decodingOptions)
{
    List<NetworkResult> nrs = new List<NetworkResult>();

    (IEncoder encoder, IWeightTensor srcEmbedding, List<IFeedForwardLayer> encoderFFLayer, IWeightTensor posEmbedding, IWeightTensor segmentEmbedding) = GetNetworksOnDeviceAt(deviceIdIdx);

    var srcSnts = sntPairBatch.GetSrcTokens(0);
    var originalSrcLengths = BuildInTokens.PadSentences(srcSnts);
    var srcTokensList = m_modelMetaData.SrcVocab.GetWordIndex(srcSnts);

    IWeightTensor encOutput = Encoder.Run(computeGraph, sntPairBatch, encoder, m_modelMetaData, m_shuffleType, srcEmbedding, posEmbedding, segmentEmbedding, srcTokensList, originalSrcLengths);

    int srcSeqPaddedLen = srcSnts[0].Count;
    int batchSize = srcSnts.Count;
    float[] clsIdxs = new float[batchSize];

    for (int i = 0; i < batchSize; i++)
    {
        for (int j = 0; j < srcSnts[i].Count; j++)
        {
            if (srcSnts[i][j] == BuildInTokens.CLS)
            {
                clsIdxs[i] = i * srcSeqPaddedLen + j;
                break;
            }
        }
    }

    IWeightTensor clsWeightTensor = computeGraph.IndexSelect(encOutput, clsIdxs);
    for (int i = 0; i < m_encoderFFLayer.Length; i++)
    {
        float cost = 0.0f;
        NetworkResult nr = new NetworkResult
        {
            Output = new List<List<List<string>>>()
        };

        IWeightTensor ffLayer = encoderFFLayer[i].Process(clsWeightTensor, batchSize, computeGraph);
        using (IWeightTensor probs = computeGraph.Softmax(ffLayer, runGradients: false, inPlace: true))
        {
            if (isTraining)
            {
                var tgtSnts = sntPairBatch.GetTgtTokens(i);
                for (int k = 0; k < batchSize; k++)
                {
                    int ix_targets_k_j = m_modelMetaData.ClsVocabs[i].GetWordIndex(tgtSnts[k][0]);
                    float score_k = probs.GetWeightAt(new long[] { k, ix_targets_k_j });
                    cost += (float)-Math.Log(score_k);
                    probs.SetWeightAt(score_k - 1, new long[] { k, ix_targets_k_j });
                }

                ffLayer.CopyWeightsToGradients(probs);
                nr.Cost = cost / batchSize;
            }
            else
            {
                // Output the predicted class from the "i"th classifier
                using var targetIdxTensor = computeGraph.Argmax(probs, 1);
                float[] targetIdx = targetIdxTensor.ToWeightArray();
                List<string> targetWords = m_modelMetaData.ClsVocabs[i].ConvertIdsToString(targetIdx.ToList());
                nr.Output.Add(new List<List<string>>());

                for (int k = 0; k < batchSize; k++)
                {
                    nr.Output[0].Add(new List<string>());
                    nr.Output[0][k].Add(targetWords[k]);
                }
            }
        }

        nrs.Add(nr);
    }

    return nrs;
}
/// <summary>
/// Run forward part on given single device
/// </summary>
/// <param name="g">The computing graph for current device. It gets created and passed by the framework</param>
/// <param name="srcSnts">A batch of input tokenized sentences in source side</param>
/// <param name="tgtSnts">A batch of output tokenized sentences in target side. In training mode, it inputs target tokens, otherwise, it outputs target tokens generated by decoder</param>
/// <param name="deviceIdIdx">The index of current device</param>
/// <returns>The cost of forward part</returns>
private float RunForwardOnSingleDevice(IComputeGraph g, List<List<string>> srcSnts, List<List<string>> tgtSnts, int deviceIdIdx, bool isTraining)
{
    (IEncoder encoder, IWeightTensor srcEmbedding, FeedForwardLayer decoderFFLayer) = GetNetworksOnDeviceAt(deviceIdIdx);

    int batchSize = srcSnts.Count;

    // Reset networks
    encoder.Reset(g.GetWeightFactory(), batchSize);

    // Encoding input source sentences
    ParallelCorpus.PadSentences(srcSnts);

    if (isTraining)
    {
        ParallelCorpus.PadSentences(tgtSnts);

        if (srcSnts[0].Count != tgtSnts[0].Count)
        {
            throw new ArgumentException($"The length of source side and target side must be equal. source length = '{srcSnts[0].Count}', target length = '{tgtSnts[0].Count}'");
        }
    }

    int seqLen = srcSnts[0].Count;

    IWeightTensor encodedWeightMatrix = Encode(g.CreateSubGraph("Encoder"), srcSnts, encoder, srcEmbedding);
    IWeightTensor ffLayer = decoderFFLayer.Process(encodedWeightMatrix, batchSize, g);
    IWeightTensor ffLayerBatch = g.TransposeBatch(ffLayer, batchSize);

    // Logger.WriteLine("1");

    float cost = 0.0f;
    using (var probs = g.Softmax(ffLayerBatch, runGradients: false, inPlace: true))
    {
        if (isTraining)
        {
            //Calculate loss for each word in the batch
            for (int k = 0; k < batchSize; k++)
            {
                for (int j = 0; j < seqLen; j++)
                {
                    using (var probs_k_j = g.PeekRow(probs, k * seqLen + j, runGradients: false))
                    {
                        var ix_targets_k_j = m_modelMetaData.Vocab.GetTargetWordIndex(tgtSnts[k][j]);
                        var score_k = probs_k_j.GetWeightAt(ix_targets_k_j);
                        cost += (float)-Math.Log(score_k);

                        probs_k_j.SetWeightAt(score_k - 1, ix_targets_k_j);
                    }
                }

                ////CRF part
                //using (var probs_k = g.PeekRow(probs, k * seqLen, seqLen, runGradients: false))
                //{
                //    var weights_k = probs_k.ToWeightArray();
                //    var crfOutput_k = m_crfDecoder.ForwardBackward(seqLen, weights_k);
                //    int[] trueTags = new int[seqLen];
                //    for (int j = 0; j < seqLen; j++)
                //    {
                //        trueTags[j] = m_modelMetaData.Vocab.GetTargetWordIndex(tgtSnts[k][j]);
                //    }
                //    m_crfDecoder.UpdateBigramTransition(seqLen, crfOutput_k, trueTags);
                //}
            }

            ffLayerBatch.CopyWeightsToGradients(probs);
        }
        else
        {
            // CRF decoder
            //for (int k = 0; k < batchSize; k++)
            //{
            //    //CRF part
            //    using (var probs_k = g.PeekRow(probs, k * seqLen, seqLen, runGradients: false))
            //    {
            //        var weights_k = probs_k.ToWeightArray();
            //        var crfOutput_k = m_crfDecoder.DecodeNBestCRF(weights_k, seqLen, 1);
            //        var targetWords = m_modelMetaData.Vocab.ConvertTargetIdsToString(crfOutput_k[0].ToList());
            //        tgtSnts.Add(targetWords);
            //    }
            //}

            // Output "i"th target word
            var targetIdx = g.Argmax(probs, 1);
            var targetWords = m_modelMetaData.Vocab.ConvertTargetIdsToString(targetIdx.ToList());
            for (int k = 0; k < batchSize; k++)
            {
                tgtSnts.Add(targetWords.GetRange(k * seqLen, seqLen));
            }
        }
    }

    return cost;
}
/// <summary>
/// Decode output sentences in training
/// </summary>
/// <param name="outputSentences"></param>
/// <param name="g"></param>
/// <param name="encodedOutputs"></param>
/// <param name="decoder"></param>
/// <param name="decoderFFLayer"></param>
/// <param name="Embedding"></param>
/// <param name="predictSentence"></param>
/// <returns></returns>
private float Decode(List<List<string>> outputSentences, IComputeGraph g, IWeightMatrix encodedOutputs, AttentionDecoder decoder, FeedForwardLayer decoderFFLayer, IWeightMatrix Embedding, out List<List<string>> predictSentence)
{
    predictSentence = null;

    float cost = 0.0f;

    var attPreProcessResult = decoder.PreProcess(encodedOutputs, g);

    var originalOutputLengths = PadSentences(outputSentences);
    int seqLen = outputSentences[0].Count;

    int[] ix_inputs = new int[m_batchSize];
    int[] ix_targets = new int[m_batchSize];
    for (int i = 0; i < ix_inputs.Length; i++)
    {
        ix_inputs[i] = (int)SENTTAGS.START;
    }

    for (int i = 0; i < seqLen + 1; i++)
    {
        // Get embeddings for all sentences in the batch at position i
        List<IWeightMatrix> inputs = new List<IWeightMatrix>();
        for (int j = 0; j < m_batchSize; j++)
        {
            List<string> OutputSentence = outputSentences[j];

            ix_targets[j] = (int)SENTTAGS.UNK;
            if (i >= seqLen)
            {
                ix_targets[j] = (int)SENTTAGS.END;
            }
            else
            {
                if (m_tgtWordToIndex.ContainsKey(OutputSentence[i]))
                {
                    ix_targets[j] = m_tgtWordToIndex[OutputSentence[i]];
                }
            }

            var x = g.PeekRow(Embedding, ix_inputs[j]);

            inputs.Add(x);
        }

        var inputsM = g.ConcatRows(inputs);

        // Decode output sentences at position i
        var eOutput = decoder.Decode(inputsM, attPreProcessResult, g);
        if (m_dropoutRatio > 0.0f)
        {
            eOutput = g.Dropout(eOutput, m_dropoutRatio);
        }

        var o = decoderFFLayer.Process(eOutput, g);

        // Softmax for output
        // var o = g.MulAdd(eOutput, Whd, bds);
        var probs = g.Softmax(o, false);

        o.ReleaseWeight();

        // Calculate the loss for each word in the batch
        List<IWeightMatrix> probs_g = g.UnFolderRow(probs, m_batchSize, false);
        for (int k = 0; k < m_batchSize; k++)
        {
            var probs_k = probs_g[k];
            var score_k = probs_k.GetWeightAt(ix_targets[k]);

            if (i < originalOutputLengths[k] + 1)
            {
                cost += (float)-Math.Log(score_k);
            }

            probs_k.SetWeightAt(score_k - 1, ix_targets[k]);

            ix_inputs[k] = ix_targets[k];
            probs_k.Dispose();
        }

        o.SetGradientByWeight(probs);

        // Hacky: run backward for the last feed forward layer and dropout layer here in order to save memory, since they have no time sequence dependency
        g.RunTopBackward();
        g.RunTopBackward();
        if (m_dropoutRatio > 0.0f)
        {
            g.RunTopBackward();
        }
    }

    return cost;
}
/// <summary>
/// Run forward part on given single device
/// </summary>
/// <param name="computeGraph">The computing graph for the current device. It gets created and passed in by the framework</param>
/// <param name="sntPairBatch">A batch of source/target sentence pairs</param>
/// <param name="deviceIdIdx">The index of the current device</param>
/// <param name="isTraining">True when running the training forward pass; false at inference time</param>
/// <param name="decodingOptions">The decoding options used at inference time</param>
/// <returns>The network results, including the cost of the forward part</returns>
public override List<NetworkResult> RunForwardOnSingleDevice(IComputeGraph computeGraph, ISntPairBatch sntPairBatch, int deviceIdIdx, bool isTraining, DecodingOptions decodingOptions)
{
    (IEncoder encoder, IDecoder decoder, IFeedForwardLayer encoderFFLayer, IFeedForwardLayer decoderFFLayer, IWeightTensor srcEmbedding, IWeightTensor tgtEmbedding, IWeightTensor posEmbedding, IWeightTensor segmentEmbedding) = GetNetworksOnDeviceAt(deviceIdIdx);

    var srcSnts = sntPairBatch.GetSrcTokens(0);
    var originalSrcLengths = BuildInTokens.PadSentences(srcSnts);
    var srcTokensList = m_modelMetaData.SrcVocab.GetWordIndex(srcSnts);

    IWeightTensor encOutput = Encoder.Run(computeGraph, sntPairBatch, encoder, m_modelMetaData, m_shuffleType, srcEmbedding, posEmbedding, segmentEmbedding, srcTokensList, originalSrcLengths);

    List<NetworkResult> nrs = new List<NetworkResult>();

    // Locate the [CLS] token in each source sentence
    int srcSeqPaddedLen = srcSnts[0].Count;
    int batchSize = srcSnts.Count;
    float[] clsIdxs = new float[batchSize];
    for (int i = 0; i < batchSize; i++)
    {
        for (int j = 0; j < srcSnts[i].Count; j++)
        {
            if (srcSnts[i][j] == BuildInTokens.CLS)
            {
                clsIdxs[i] = i * srcSeqPaddedLen + j;
                break;
            }
        }
    }

    IWeightTensor clsWeightTensor = computeGraph.IndexSelect(encOutput, clsIdxs);

    float cost = 0.0f;
    NetworkResult nrCLS = new NetworkResult
    {
        Output = new List<List<List<string>>>()
    };

    IWeightTensor ffLayer = encoderFFLayer.Process(clsWeightTensor, batchSize, computeGraph);
    using (IWeightTensor probs = computeGraph.Softmax(ffLayer, runGradients: false, inPlace: true))
    {
        if (isTraining)
        {
            var clsSnts = sntPairBatch.GetTgtTokens(0);
            for (int k = 0; k < batchSize; k++)
            {
                int ix_targets_k_j = m_modelMetaData.ClsVocab.GetWordIndex(clsSnts[k][0]);
                float score_k = probs.GetWeightAt(new long[] { k, ix_targets_k_j });
                cost += (float)-Math.Log(score_k);
                probs.SetWeightAt(score_k - 1, new long[] { k, ix_targets_k_j });
            }

            ffLayer.CopyWeightsToGradients(probs);
            nrCLS.Cost = cost / batchSize;
        }
        else
        {
            // Output the predicted CLS tag for each sentence in the batch
            using var targetIdxTensor = computeGraph.Argmax(probs, 1);
            float[] targetIdx = targetIdxTensor.ToWeightArray();
            List<string> targetWords = m_modelMetaData.ClsVocab.ConvertIdsToString(targetIdx.ToList());
            nrCLS.Output.Add(new List<List<string>>());

            for (int k = 0; k < batchSize; k++)
            {
                nrCLS.Output[0].Add(new List<string>());
                nrCLS.Output[0][k].Add(targetWords[k]);
            }
        }
    }

    // Reset networks
    decoder.Reset(computeGraph.GetWeightFactory(), srcSnts.Count);

    // Generate output decoder sentences
    var tgtSnts = sntPairBatch.GetTgtTokens(1);
    var tgtTokensList = m_modelMetaData.TgtVocab.GetWordIndex(tgtSnts);

    NetworkResult nr = new NetworkResult();
    if (decoder is AttentionDecoder)
    {
        nr.Cost = Decoder.DecodeAttentionLSTM(tgtTokensList, computeGraph, encOutput, decoder as AttentionDecoder, decoderFFLayer, tgtEmbedding, m_modelMetaData.TgtVocab, srcSnts.Count, isTraining);
        nr.Output = new List<List<List<string>>>
        {
            m_modelMetaData.TgtVocab.ConvertIdsToString(tgtTokensList)
        };
    }
    else
    {
        if (isTraining)
        {
            (var c, _) = Decoder.DecodeTransformer(tgtTokensList, computeGraph, encOutput, decoder as TransformerDecoder, decoderFFLayer, tgtEmbedding, posEmbedding, originalSrcLengths, m_modelMetaData.TgtVocab, m_shuffleType, m_options.DropoutRatio, null, isTraining);
            nr.Cost = c;
            nr.Output = null;
        }
        else
        {
            List<List<BeamSearchStatus>> beam2batchStatus = Decoder.InitBeamSearchStatusListList(batchSize, tgtTokensList);
            for (int i = 0; i < decodingOptions.MaxTgtSentLength; i++)
            {
                List<List<BeamSearchStatus>> batch2beam2seq = null; // (batch_size, beam_search_size)
                try
                {
                    foreach (var batchStatus in beam2batchStatus)
                    {
                        var batch2tgtTokens = Decoder.ExtractBatchTokens(batchStatus);
                        using var g = computeGraph.CreateSubGraph($"TransformerDecoder_Step_{i}");
                        (var cost2, var bssSeqList) = Decoder.DecodeTransformer(batch2tgtTokens, g, encOutput, decoder as TransformerDecoder, decoderFFLayer, tgtEmbedding, posEmbedding, originalSrcLengths, m_modelMetaData.TgtVocab, m_shuffleType, 0.0f, decodingOptions, isTraining,
                            outputSentScore: decodingOptions.BeamSearchSize > 1, previousBeamSearchResults: batchStatus);

                        bssSeqList = Decoder.SwapBeamAndBatch(bssSeqList);
                        batch2beam2seq = Decoder.CombineBeamSearchResults(batch2beam2seq, bssSeqList);
                    }
                }
                catch (OutOfMemoryException)
                {
                    GC.Collect();
                    Logger.WriteLine(Logger.Level.warn, $"Ran out of memory while generating the '{i}'th tokens, so decoding is terminated for the current sequences.");
                    break;
                }

                if (decodingOptions.BeamSearchSize > 1)
                {
                    // Keep the top N results and drop all others
                    for (int k = 0; k < batchSize; k++)
                    {
                        batch2beam2seq[k] = BeamSearch.GetTopNBSS(batch2beam2seq[k], decodingOptions.BeamSearchSize);
                    }
                }

                beam2batchStatus = Decoder.SwapBeamAndBatch(batch2beam2seq);
                if (Decoder.AreAllSentsCompleted(beam2batchStatus))
                {
                    break;
                }
            }

            nr.Cost = 0.0f;
            nr.Output = m_modelMetaData.TgtVocab.ExtractTokens(beam2batchStatus);
        }
    }

    nr.RemoveDuplicatedEOS();

    nrs.Add(nrCLS);
    nrs.Add(nr);

    return nrs;
}
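// The decoding loop above expands every live hypothesis by one token per step and then
// prunes each batch element back to the top-N hypotheses by score (GetTopNBSS). A minimal
// sketch of that pruning step (Hypo, Score and PruneToTopN are illustrative stand-ins,
// not the BeamSearchStatus API; assumes System.Collections.Generic and System.Linq):
sealed class Hypo
{
    public List<int> Tokens = new List<int>();
    public float Score; // accumulated log-probability of the hypothesis
}

static List<Hypo> PruneToTopN(IEnumerable<Hypo> expanded, int beamSize)
{
    // Keep the beamSize highest-scoring hypotheses; everything else is dropped
    return expanded.OrderByDescending(h => h.Score).Take(beamSize).ToList();
}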
/// <summary>
/// Scaled multi-head attention component with skip-connected feed-forward layers
/// </summary>
/// <param name="inputQ">The input Q tensor</param>
/// <param name="inputK">The input K tensor</param>
/// <param name="inputV">The input V tensor</param>
/// <param name="keyMask">The mask for softmax</param>
/// <param name="batchSize">Batch size of the input data set</param>
/// <param name="graph">The instance of the computing graph</param>
/// <param name="outputAttenWeights">Whether to also return the attention weights averaged over all heads</param>
/// <param name="cachedTensors">Optional cache for the projected K and V tensors, reused across decoding steps</param>
/// <returns>The transformed output tensor and, if requested, the averaged attention weights</returns>
public (IWeightTensor, IWeightTensor) Perform(IWeightTensor inputQ, IWeightTensor inputK, IWeightTensor inputV, IWeightTensor keyMask, int batchSize, IComputeGraph graph, bool outputAttenWeights = false, Dictionary<string, IWeightTensor> cachedTensors = null)
{
    string keyName = $"{m_name}_MultiHeadAttention";
    using IComputeGraph g = graph.CreateSubGraph(keyName);
    int seqLenQ = inputQ.Rows / batchSize;

    // seqLenK must be equal to seqLenV
    int seqLenK = inputK.Rows / batchSize;
    int seqLenV = inputV.Rows / batchSize;

    IWeightTensor inputQNorm = layerNormQ.Norm(inputQ, g);

    // Input projections
    IWeightTensor allQ = g.View(g.Affine(inputQNorm, Q, Qb), dims: new long[] { batchSize, seqLenQ, m_multiHeadNum, m_d });

    // Multi-head attentions
    IWeightTensor Qs = g.View(g.AsContiguous(g.Transpose(allQ, 1, 2)), dims: new long[] { batchSize * m_multiHeadNum, seqLenQ, m_d });
    IWeightTensor Ks = null;
    IWeightTensor Vs = null;

    if (cachedTensors == null) // We don't use any cached tensors
    {
        IWeightTensor allK = g.View(g.Affine(inputK, K, Kb), dims: new long[] { batchSize, seqLenK, m_multiHeadNum, m_d });
        IWeightTensor allV = g.View(g.Affine(inputV, V, Vb), dims: new long[] { batchSize, seqLenV, m_multiHeadNum, m_d });

        Ks = g.View(g.AsContiguous(g.Transpose(g.Transpose(allK, 1, 2), 2, 3)), dims: new long[] { batchSize * m_multiHeadNum, m_d, seqLenK });
        Vs = g.View(g.AsContiguous(g.Transpose(allV, 1, 2)), dims: new long[] { batchSize * m_multiHeadNum, seqLenV, m_d });
    }
    else
    {
        string KsCacheName = keyName + "_" + nameof(Ks);
        string VsCacheName = keyName + "_" + nameof(Vs);

        if (cachedTensors.ContainsKey(KsCacheName) == false)
        {
            IWeightTensor allK = g.View(g.Affine(inputK, K, Kb), dims: new long[] { batchSize, seqLenK, m_multiHeadNum, m_d });
            Ks = g.View(g.AsContiguous(g.Transpose(g.Transpose(allK, 1, 2), 2, 3)), dims: new long[] { batchSize * m_multiHeadNum, m_d, seqLenK });
            cachedTensors.Add(KsCacheName, Ks.CopyWeightsRef(KsCacheName, Ks.NeedGradient));
        }
        else
        {
            Ks = cachedTensors[KsCacheName];
        }

        if (cachedTensors.ContainsKey(VsCacheName) == false)
        {
            IWeightTensor allV = g.View(g.Affine(inputV, V, Vb), dims: new long[] { batchSize, seqLenV, m_multiHeadNum, m_d });
            Vs = g.View(g.AsContiguous(g.Transpose(allV, 1, 2)), dims: new long[] { batchSize * m_multiHeadNum, seqLenV, m_d });
            cachedTensors.Add(VsCacheName, Vs.CopyWeightsRef(VsCacheName, Vs.NeedGradient));
        }
        else
        {
            Vs = cachedTensors[VsCacheName];
        }
    }

    // Scaled softmax
    float scale = 1.0f / (float)Math.Sqrt(m_d);
    var attn = g.MulBatch(Qs, Ks, scale);
    attn = g.View(attn, dims: new long[] { batchSize, m_multiHeadNum, seqLenQ, seqLenK });

    if (keyMask != null)
    {
        attn = g.Add(attn, keyMask, inPlace: true);
    }

    var attnProbs = g.Softmax(attn, inPlace: true);

    IWeightTensor sumAttnWeights = null;
    if (outputAttenWeights)
    {
        // Average the attention weights over all heads
        sumAttnWeights = g.Select(attnProbs, 1, 0);
        for (int i = 1; i < m_multiHeadNum; i++)
        {
            var tmp = g.Select(attnProbs, 1, i);
            sumAttnWeights = g.Add(sumAttnWeights, tmp);
        }

        sumAttnWeights = graph.Div(sumAttnWeights, (float)m_multiHeadNum);
        sumAttnWeights = graph.View(sumAttnWeights, new long[] { batchSize * seqLenQ, seqLenK });
    }

    attnProbs = g.View(attnProbs, dims: new long[] { batchSize * m_multiHeadNum, seqLenQ, seqLenK });

    IWeightTensor o = g.View(g.MulBatch(attnProbs, Vs), dims: new long[] { batchSize, m_multiHeadNum, seqLenQ, m_d });
    IWeightTensor W = g.View(g.AsContiguous(g.Transpose(o, 1, 2)), dims: new long[] { batchSize * seqLenQ, m_multiHeadNum * m_d });

    // Output projection with residual (skip) connection back to inputQ
    IWeightTensor finalAttResults = g.Dropout(g.Affine(W, W0, b0), batchSize, m_dropoutRatio, inPlace: true);
    IWeightTensor result = graph.Add(finalAttResults, inputQ, inPlace: true);

    return (result, sumAttnWeights);
}
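// The core of the method above is scaled dot-product attention,
// Attention(Q, K, V) = softmax(Q * K^T / sqrt(d)) * V, applied to all heads at once
// through a single batched matrix multiply (MulBatch). A minimal single-head sketch on
// plain 2-D arrays (illustrative only; assumes System and System.Linq):
static float[][] ScaledDotProductAttention(float[][] Q, float[][] K, float[][] V)
{
    int seqLenQ = Q.Length, seqLenK = K.Length, d = Q[0].Length, dv = V[0].Length;
    float scale = 1.0f / (float)Math.Sqrt(d);
    var output = new float[seqLenQ][];

    for (int i = 0; i < seqLenQ; i++)
    {
        // Scaled dot-product scores of query i against every key
        var scores = new float[seqLenK];
        for (int j = 0; j < seqLenK; j++)
        {
            float dot = 0.0f;
            for (int t = 0; t < d; t++)
            {
                dot += Q[i][t] * K[j][t];
            }
            scores[j] = dot * scale;
        }

        // Numerically stable softmax over the key dimension
        float max = scores.Max();
        float sum = 0.0f;
        for (int j = 0; j < seqLenK; j++) { scores[j] = (float)Math.Exp(scores[j] - max); sum += scores[j]; }
        for (int j = 0; j < seqLenK; j++) { scores[j] /= sum; }

        // Weighted sum of the value rows
        output[i] = new float[dv];
        for (int j = 0; j < seqLenK; j++)
        {
            for (int t = 0; t < dv; t++)
            {
                output[i][t] += scores[j] * V[j][t];
            }
        }
    }

    return output;
}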