public IWeightMatrix Perform(IWeightMatrix state, AttentionPreProcessResult attenPreProcessResult, IComputeGraph g)
{
    // Additive attention scores: v^T * tanh(U * h_src + W * h_state + b)
    var bWas = g.RepeatRows(bWa, state.Rows);
    var wc = g.MulAdd(state, Wa, bWas);
    var wcs = g.RepeatRows(wc, attenPreProcessResult.inputsUnfolder[0].Rows);
    var ggs = g.AddTanh(attenPreProcessResult.uhs, wcs);
    var atten = g.Mul(ggs, V);

    List<IWeightMatrix> attens = g.UnFolderRow(atten, m_batchSize);
    List<IWeightMatrix> contexts = new List<IWeightMatrix>();
    List<IWeightMatrix> attensT = new List<IWeightMatrix>();
    for (int i = 0; i < m_batchSize; i++)
    {
        attensT.Add(g.Transpose2(attens[i]));
    }

    var attenT = g.ConcatRows(attensT);
    var attenSoftmax = g.SoftmaxM(attenT);

    // Per-sample weighted sum over the source positions
    for (int i = 0; i < m_batchSize; i++)
    {
        IWeightMatrix context = g.Mul(g.PeekRow(attenSoftmax, i), attenPreProcessResult.inputsUnfolder[i]);
        contexts.Add(context);
    }

    return g.ConcatRows(contexts);
}
public IWeightTensor Perform(IWeightTensor state, AttentionPreProcessResult attenPreProcessResult, int batchSize, IComputeGraph graph)
{
    int srcSeqLen = attenPreProcessResult.inputsBatchFirst.Rows / batchSize;

    using (IComputeGraph g = graph.CreateSubGraph(m_name))
    {
        // Affine transformation of the decoder state
        IWeightTensor wc = g.Affine(state, m_Wa, m_bWa);

        // Expand dims from [batchSize x decoder_dim] to [batchSize x srcSeqLen x decoder_dim]
        IWeightTensor wc1 = g.View(wc, batchSize, 1, wc.Columns);
        IWeightTensor wcExp = g.Expand(wc1, batchSize, srcSeqLen, wc.Columns);

        IWeightTensor ggs = null;
        if (m_enableCoverageModel)
        {
            // Incorporate the coverage model status at time step {t-1}
            IWeightTensor wCoverage = g.Affine(m_coverage.Hidden, m_Wc, m_bWc);
            IWeightTensor wCoverage1 = g.View(wCoverage, batchSize, srcSeqLen, -1);
            ggs = g.AddTanh(attenPreProcessResult.uhs, wcExp, wCoverage1);
        }
        else
        {
            ggs = g.AddTanh(attenPreProcessResult.uhs, wcExp);
        }

        // Attention scores, normalized over the source sequence
        IWeightTensor ggss = g.View(ggs, batchSize * srcSeqLen, -1);
        IWeightTensor atten = g.Mul(ggss, m_V);
        IWeightTensor attenT = g.Transpose(atten);
        IWeightTensor attenT2 = g.View(attenT, batchSize, srcSeqLen);
        IWeightTensor attenSoftmax1 = g.Softmax(attenT2, inPlace: true);
        IWeightTensor attenSoftmax = g.View(attenSoftmax1, batchSize, 1, srcSeqLen);

        // Context vectors via a batched matmul; built on the parent graph so they outlive the subgraph
        IWeightTensor inputs2 = g.View(attenPreProcessResult.inputsBatchFirst, batchSize, srcSeqLen, attenPreProcessResult.inputsBatchFirst.Columns);
        IWeightTensor contexts = graph.MulBatch(attenSoftmax, inputs2, batchSize);

        if (m_enableCoverageModel)
        {
            // Concatenate tensors as input for the coverage model update
            IWeightTensor aCoverage = g.View(attenSoftmax1, attenPreProcessResult.inputsBatchFirst.Rows, 1);
            IWeightTensor state2 = g.View(state, batchSize, 1, state.Columns);
            IWeightTensor state3 = g.Expand(state2, batchSize, srcSeqLen, state.Columns);
            IWeightTensor state4 = g.View(state3, batchSize * srcSeqLen, -1);
            IWeightTensor concate = g.ConcatColumns(aCoverage, attenPreProcessResult.inputsBatchFirst, state4);
            m_coverage.Step(concate, graph);
        }

        return contexts;
    }
}
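// Usage sketch (illustration only, not from the original source): one way this attention
// unit could be driven from a step-by-step RNN decoder. "m_attention.PreProcess" and
// "m_rnnDecoder.Step" are hypothetical names used purely for illustration; only Perform's
// signature above is given by the source.
public IWeightTensor DecodeWithAttentionSketch(IWeightTensor srcEncOutputs, IWeightTensor initState, int batchSize, int maxTgtLen, IComputeGraph g)
{
    // Pre-compute the projected encoder outputs (uhs) once for all decoding steps (hypothetical call)
    AttentionPreProcessResult attnPre = m_attention.PreProcess(srcEncOutputs, batchSize, g);

    IWeightTensor state = initState;
    List<IWeightTensor> outputs = new List<IWeightTensor>();
    for (int t = 0; t < maxTgtLen; t++)
    {
        // Attend over the encoder outputs with the current decoder state
        IWeightTensor context = m_attention.Perform(state, attnPre, batchSize, g);

        // Feed the context into the next decoder step (hypothetical Step signature)
        state = m_rnnDecoder.Step(context, g);
        outputs.Add(state);
    }

    return g.ConcatRows(outputs);
}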
public IWeightMatrix Perform(IWeightMatrix state, AttentionPreProcessResult attenPreProcessResult, IComputeGraph g)
{
    // Additive attention, batched variant: scores for all samples are computed in one pass
    var bWas = g.RepeatRows(bWa, state.Rows);
    var wc = g.MulAdd(state, Wa, bWas);
    var wcs = g.RepeatRows(wc, attenPreProcessResult.inputs.Rows / m_batchSize);
    var ggs = g.AddTanh(attenPreProcessResult.uhs, wcs);
    var atten = g.Mul(ggs, V);

    var atten2 = g.PermuteBatch(atten, m_batchSize);
    var attenT = g.Transpose2(atten2);
    var attenT2 = g.View(attenT, m_batchSize, attenPreProcessResult.inputs.Rows / m_batchSize);

    // Normalize over the source sequence, then build contexts with a batched matmul
    var attenSoftmax = g.Softmax(attenT2);
    IWeightMatrix contexts = g.MulBatch(attenSoftmax, attenPreProcessResult.inputs, m_batchSize);

    return contexts;
}
/// <summary>
/// Transformer encoder
/// </summary>
/// <param name="rawInput"></param>
/// <param name="g"></param>
/// <returns></returns>
public IWeightTensor Encode(IWeightTensor rawInput, IComputeGraph g)
{
    int seqLen = rawInput.Rows / m_batchSize;
    var posEmbedding = g.BuildPositionMatrix(seqLen, m_inputDim);
    var posEmbeddingRepeat = g.RepeatRows(posEmbedding, m_batchSize);

    // Transpose to batch-first based sequence
    var inputs = g.TransposeBatch(rawInput, m_batchSize);

    // Scale embeddings by sqrt(d_model) and add position encodings
    inputs = g.Mul(inputs, (float)Math.Sqrt(m_inputDim));
    inputs = g.Add(inputs, posEmbeddingRepeat);

    // Run the stacked self-attention encoder layers
    for (int k = 0; k < m_encoders.Count; k++)
    {
        inputs = m_encoders[k].Perform(inputs, g);
    }

    // Transpose back to time-first based sequence
    rawInput = g.TransposeBatch(inputs, seqLen);

    return rawInput;
}
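// Usage sketch (illustration only, not from the original source): encoding one padded
// source batch. "m_srcEmbedding" is a hypothetical embedding tensor and the id buffer is
// assumed to already be padded; Encode's signature matches the method above, and the
// time-first row layout it expects is preserved by the lookup.
public IWeightTensor EncodeBatchSketch(float[] srcTokenIds, IComputeGraph g)
{
    // Look up token embeddings as a [batchSize * seqLen, m_inputDim] tensor (time-first)
    IWeightTensor srcEmbs = g.IndexSelect(m_srcEmbedding, srcTokenIds);

    // Position encodings, sqrt(d_model) scaling and the encoder stack are applied inside Encode
    return Encode(srcEmbs, g);
}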
public IWeightTensor Perform(IWeightTensor state, AttentionPreProcessResult attenPreProcessResult, int batchSize, IComputeGraph graph)
{
    IComputeGraph g = graph.CreateSubGraph(m_name);

    // Score each source position against the current decoder state
    var wc = g.Affine(state, m_Wa, m_bWa);
    var wcs = g.RepeatRows(wc, attenPreProcessResult.inputs.Rows / batchSize);
    var ggs = g.AddTanh(attenPreProcessResult.uhs, wcs);
    var atten = g.Mul(ggs, m_V);

    var atten2 = g.TransposeBatch(atten, batchSize);
    var attenT = g.Transpose(atten2);
    var attenT2 = g.View(attenT, batchSize, attenPreProcessResult.inputs.Rows / batchSize);

    // Normalize over the source sequence and compute contexts with a batched matmul
    var attenSoftmax1 = g.Softmax(attenT2, inPlace: true);
    var attenSoftmax = g.View(attenSoftmax1, batchSize, attenSoftmax1.Rows / batchSize, attenSoftmax1.Columns);
    var inputs2 = g.View(attenPreProcessResult.inputs, batchSize, attenPreProcessResult.inputs.Rows / batchSize, attenPreProcessResult.inputs.Columns);
    IWeightTensor contexts = g.MulBatch(attenSoftmax, inputs2, batchSize);

    return contexts;
}
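// Design note with a sketch (an assumption, not a statement about the library's contract):
// unlike the coverage-model variant earlier in this section, this overload never disposes
// the subgraph it creates. If CreateSubGraph hands out resources that need deterministic
// release, the body could be wrapped the same way that variant does:
//
//   using (IComputeGraph g = graph.CreateSubGraph(m_name))
//   {
//       // ...same attention computation as above...
//       return contexts;
//   }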
private IWeightTensor AddPositionEmbedding(IComputeGraph g, IWeightTensor posEmbedding, int batchSize, int seqLen, IWeightTensor inputEmbs)
{
    var Column = posEmbedding.Columns;

    // Scale token embeddings by sqrt(d_model) before adding position encodings
    inputEmbs = g.Mul(inputEmbs, (float)Math.Sqrt(m_modelMetaData.HiddenDim));

    using (var posEmbeddingPeek = g.PeekRow(posEmbedding, 0, seqLen, false))
    {
        using (var posEmbeddingPeekView = g.View(posEmbeddingPeek, runGradient: false, dims: new long[] { 1, seqLen, Column }))
        {
            using (var posEmbeddingPeekViewExp = g.Expand(posEmbeddingPeekView, runGradient: false, dims: new long[] { batchSize, seqLen, Column }))
            {
                // Broadcast the position matrix over the batch; gradients flow only into inputEmbs
                inputEmbs = g.View(inputEmbs, dims: new long[] { batchSize, seqLen, Column });
                inputEmbs = g.Add(inputEmbs, posEmbeddingPeekViewExp, true, false);
                inputEmbs = g.View(inputEmbs, dims: new long[] { batchSize * seqLen, Column });
            }
        }
    }

    inputEmbs = g.Dropout(inputEmbs, batchSize, m_dropoutRatio, inPlace: true);

    return inputEmbs;
}
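// Minimal sketch (an assumption): the posEmbedding tensor consumed above is typically a
// fixed sinusoidal matrix in the style of Vaswani et al. (2017). This helper shows only
// the math, PE(pos, 2i) = sin(pos / 10000^(2i/d)) and PE(pos, 2i+1) = cos(...); how the
// buffer becomes an IWeightTensor depends on the tensor factory, which is not shown here.
private static float[] BuildSinusoidalPositionBuffer(int maxSeqLen, int dim)
{
    float[] buffer = new float[maxSeqLen * dim];
    for (int pos = 0; pos < maxSeqLen; pos++)
    {
        for (int i = 0; i < dim; i += 2)
        {
            // Even columns get sine, odd columns get cosine at the same frequency
            double angle = pos / Math.Pow(10000.0, (double)i / dim);
            buffer[pos * dim + i] = (float)Math.Sin(angle);
            if (i + 1 < dim)
            {
                buffer[pos * dim + i + 1] = (float)Math.Cos(angle);
            }
        }
    }
    return buffer;
}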
private float DecodeTransformer(List<List<string>> outInputSeqs, IComputeGraph g, IWeightTensor encOutputs, IWeightTensor encMask, TransformerDecoder decoder,
    IWeightTensor tgtEmbedding, int batchSize, int deviceId, bool isTraining = true)
{
    float cost = 0.0f;
    var originalInputLengths = ParallelCorpus.PadSentences(outInputSeqs);
    int tgtSeqLen = outInputSeqs[0].Count;

    IWeightTensor tgtDimMask = MaskUtils.BuildPadDimMask(g, tgtSeqLen, originalInputLengths, m_modelMetaData.HiddenDim, deviceId);
    using (IWeightTensor tgtSelfTriMask = MaskUtils.BuildPadSelfTriMask(g, tgtSeqLen, originalInputLengths, deviceId))
    {
        // Look up target token embeddings for the whole batch
        List<IWeightTensor> inputs = new List<IWeightTensor>();
        for (int i = 0; i < batchSize; i++)
        {
            for (int j = 0; j < tgtSeqLen; j++)
            {
                int ix_targets_k = m_modelMetaData.Vocab.GetTargetWordIndex(outInputSeqs[i][j], logUnk: true);
                inputs.Add(g.PeekRow(tgtEmbedding, ix_targets_k));
            }
        }

        IWeightTensor tgtInputEmbeddings = inputs.Count > 1 ? g.ConcatRows(inputs) : inputs[0];
        IWeightTensor decOutput = decoder.Decode(tgtInputEmbeddings, encOutputs, tgtSelfTriMask, encMask, tgtDimMask, batchSize, g);

        // Project decoder output to vocabulary logits with the (tied) embedding matrix
        decOutput = g.Mul(decOutput, g.Transpose(tgtEmbedding));

        using (IWeightTensor probs = g.Softmax(decOutput, runGradients: false, inPlace: true))
        {
            if (isTraining)
            {
                // The gold output is the input sequence shifted left by one token
                var leftShiftInputSeqs = ParallelCorpus.LeftShiftSnts(outInputSeqs, ParallelCorpus.EOS);
                var originalOutputLengths = ParallelCorpus.PadSentences(leftShiftInputSeqs, tgtSeqLen);

                for (int i = 0; i < batchSize; i++)
                {
                    for (int j = 0; j < tgtSeqLen; j++)
                    {
                        using (IWeightTensor probs_i_j = g.PeekRow(probs, i * tgtSeqLen + j, runGradients: false))
                        {
                            if (j < originalOutputLengths[i])
                            {
                                int ix_targets_i_j = m_modelMetaData.Vocab.GetTargetWordIndex(leftShiftInputSeqs[i][j], logUnk: true);
                                float score_i_j = probs_i_j.GetWeightAt(ix_targets_i_j);
                                cost += (float)-Math.Log(score_i_j);

                                // Cross-entropy gradient w.r.t. logits: softmax(x) - one_hot(target)
                                probs_i_j.SetWeightAt(score_i_j - 1, ix_targets_i_j);
                            }
                            else
                            {
                                // Padded position: zero out so it contributes no gradient
                                probs_i_j.CleanWeight();
                            }
                        }
                    }
                }

                decOutput.CopyWeightsToGradients(probs);
            }
            else
            {
                // Inference: append the most likely next word of each sentence in the batch
                int[] targetIdx = g.Argmax(probs, 1);
                List<string> targetWords = m_modelMetaData.Vocab.ConvertTargetIdsToString(targetIdx.ToList());
                for (int i = 0; i < batchSize; i++)
                {
                    outInputSeqs[i].Add(targetWords[i * tgtSeqLen + tgtSeqLen - 1]);
                }
            }
        }
    }

    return cost;
}
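// Usage sketch (illustration only, not from the original source): greedy decoding drives
// DecodeTransformer with isTraining = false until every sentence emits EOS or a length cap
// is hit. "maxDecodeLen" and the EOS stopping check are assumptions; DecodeTransformer
// itself appends one word per sentence on each call, as shown above.
private void GreedyDecodeSketch(List<List<string>> outInputSeqs, IComputeGraph g, IWeightTensor encOutputs, IWeightTensor encMask,
    TransformerDecoder decoder, IWeightTensor tgtEmbedding, int batchSize, int deviceId, int maxDecodeLen)
{
    for (int step = 0; step < maxDecodeLen; step++)
    {
        DecodeTransformer(outInputSeqs, g, encOutputs, encMask, decoder, tgtEmbedding, batchSize, deviceId, isTraining: false);

        // Stop once every sentence in the batch has produced EOS (hypothetical check)
        if (outInputSeqs.TrueForAll(s => s.Contains(ParallelCorpus.EOS)))
        {
            break;
        }
    }
}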
/// <summary>
/// Create input embeddings from token embeddings and segment embeddings
/// </summary>
/// <param name="seqs"></param>
/// <param name="g"></param>
/// <param name="embeddingsTensor"></param>
/// <param name="segmentEmbedding"></param>
/// <param name="vocab"></param>
/// <returns>The embedding tensor. shape: (batchsize * seqLen, embedding_dim)</returns>
public static IWeightTensor CreateTokensEmbeddings(List<List<int>> seqs, IComputeGraph g, IWeightTensor embeddingsTensor,
    IWeightTensor segmentEmbedding, Vocab vocab, float scaleFactor = 1.0f, bool enableTagEmbedding = false)
{
    int batchSize = seqs.Count;
    int seqLen = seqs[0].Count;

    float[] idxs = new float[batchSize * seqLen];
    float[] segIdxs = new float[batchSize * seqLen];
    List<float[]> tagIdxsList = new List<float[]>();

    for (int i = 0; i < batchSize; i++)
    {
        int segIdx = 0;
        List<int> currTagIdxs = new List<int>();
        int currTagLevel = 0;

        for (int j = 0; j < seqLen; j++)
        {
            idxs[i * seqLen + j] = seqs[i][j];
            segIdxs[i * seqLen + j] = segIdx;

            string token = vocab.GetString(seqs[i][j]);
            if (token == BuildInTokens.SEP)
            {
                // A new segment starts after each separator token
                segIdx++;
            }

            if (enableTagEmbedding)
            {
                if (token.StartsWith("<") && token.EndsWith(">") && BuildInTokens.IsPreDefinedToken(token) == false)
                {
                    if (token[1] == '/')
                    {
                        // A closing tag: pop the current tag level
                        currTagLevel--;
                        currTagIdxs[currTagLevel] = -1;
                    }
                    else
                    {
                        // A new opening tag: make sure index buffers exist for this nesting level
                        while (tagIdxsList.Count <= currTagLevel)
                        {
                            float[] tagIdxs = new float[batchSize * seqLen];
                            Array.Fill(tagIdxs, -1.0f);
                            tagIdxsList.Add(tagIdxs);
                        }

                        while (currTagIdxs.Count <= currTagLevel)
                        {
                            currTagIdxs.Add(-1);
                        }

                        currTagIdxs[currTagLevel] = seqs[i][j];
                        currTagLevel++;
                    }
                }
                else
                {
                    // A normal token inherits the embeddings of all tags that enclose it
                    for (int k = 0; k < currTagLevel; k++)
                    {
                        tagIdxsList[k][i * seqLen + j] = currTagIdxs[k];
                    }
                }
            }
        }
    }

    // Sum the tag embeddings across nesting levels
    IWeightTensor tagEmbeddings = null;
    if (enableTagEmbedding)
    {
        for (int k = 0; k < tagIdxsList.Count; k++)
        {
            var tagEmbeddings_k = g.IndexSelect(embeddingsTensor, tagIdxsList[k], clearWeights: true);
            tagEmbeddings = (tagEmbeddings == null) ? tagEmbeddings_k : g.Add(tagEmbeddings, tagEmbeddings_k);
        }
    }

    IWeightTensor embeddingRst = g.IndexSelect(embeddingsTensor, idxs);
    if (scaleFactor != 1.0f)
    {
        embeddingRst = g.Mul(embeddingRst, scaleFactor, inPlace: true);
    }

    // Apply segment embeddings to the input sequence embeddings
    if (segmentEmbedding != null)
    {
        embeddingRst = g.Add(embeddingRst, g.IndexSelect(segmentEmbedding, segIdxs));
    }

    if (tagEmbeddings != null)
    {
        embeddingRst = g.Add(embeddingRst, tagEmbeddings);
    }

    return embeddingRst;
}
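// Usage sketch (illustration only, not from the original source): embedding one batch of
// already-indexed token sequences. "m_srcEmbedding" and "m_segmentEmbedding" are hypothetical
// field names; the call itself matches CreateTokensEmbeddings above, and the sqrt(d_model)
// scale factor mirrors the convention used by the encoder code earlier in this section.
//
//   List<List<int>> seqs = ...;  // token ids, already padded to a common length
//   IWeightTensor embs = CreateTokensEmbeddings(seqs, g, m_srcEmbedding, m_segmentEmbedding,
//       vocab, scaleFactor: (float)Math.Sqrt(m_modelMetaData.HiddenDim));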