コード例 #1
0
        public IWeightTensor Perform(IWeightTensor state, AttentionPreProcessResult attenPreProcessResult, int batchSize, IComputeGraph graph)
        {
            int srcSeqLen = attenPreProcessResult.inputsBatchFirst.Rows / batchSize;

            using (IComputeGraph g = graph.CreateSubGraph(m_name))
            {
                // Affine decoder state
                IWeightTensor wc = g.Affine(state, m_Wa, m_bWa);

                // Expand dims from [batchSize x decoder_dim] to [batchSize x srcSeqLen x decoder_dim]
                IWeightTensor wc1   = g.View(wc, batchSize, 1, wc.Columns);
                IWeightTensor wcExp = g.Expand(wc1, batchSize, srcSeqLen, wc.Columns);

                IWeightTensor ggs = null;
                if (m_enableCoverageModel)
                {
                    // Get coverage model status at {t-1}
                    IWeightTensor wCoverage  = g.Affine(m_coverage.Hidden, m_Wc, m_bWc);
                    IWeightTensor wCoverage1 = g.View(wCoverage, batchSize, srcSeqLen, -1);

                    ggs = g.AddTanh(attenPreProcessResult.uhs, wcExp, wCoverage1);
                }
                else
                {
                    ggs = g.AddTanh(attenPreProcessResult.uhs, wcExp);
                }

                IWeightTensor ggss  = g.View(ggs, batchSize * srcSeqLen, -1);
                IWeightTensor atten = g.Mul(ggss, m_V);

                IWeightTensor attenT  = g.Transpose(atten);
                IWeightTensor attenT2 = g.View(attenT, batchSize, srcSeqLen);

                IWeightTensor attenSoftmax1 = g.Softmax(attenT2, inPlace: true);

                IWeightTensor attenSoftmax = g.View(attenSoftmax1, batchSize, 1, srcSeqLen);
                IWeightTensor inputs2      = g.View(attenPreProcessResult.inputsBatchFirst, batchSize, srcSeqLen, attenPreProcessResult.inputsBatchFirst.Columns);

                IWeightTensor contexts = graph.MulBatch(attenSoftmax, inputs2, batchSize);

                if (m_enableCoverageModel)
                {
                    // Concatenate tensor as input for coverage model
                    IWeightTensor aCoverage = g.View(attenSoftmax1, attenPreProcessResult.inputsBatchFirst.Rows, 1);


                    IWeightTensor state2 = g.View(state, batchSize, 1, state.Columns);
                    IWeightTensor state3 = g.Expand(state2, batchSize, srcSeqLen, state.Columns);
                    IWeightTensor state4 = g.View(state3, batchSize * srcSeqLen, -1);


                    IWeightTensor concate = g.ConcatColumns(aCoverage, attenPreProcessResult.inputsBatchFirst, state4);
                    m_coverage.Step(concate, graph);
                }


                return(contexts);
            }
        }
コード例 #2
0
        /// <summary>
        /// Transformer encoder
        /// </summary>
        /// <param name="rawInputs"></param>
        /// <param name="g"></param>
        /// <returns></returns>
        ///

        public (IWeightTensor, IWeightTensor) Decode(IWeightTensor tgtInputs, IWeightTensor encOutputBatchFirst, IWeightTensor tgtSelfMask, IWeightTensor srcTgtMask, int batchSize, IComputeGraph g, bool outputAttnWeights = false, Dictionary <string, IWeightTensor> cachedTensors = null)
        {
            IWeightTensor attnProbs = null;

            using (IComputeGraph subg = g.CreateSubGraph($"{m_name}_Decoder"))
            {
                int seqLenQ = tgtInputs.Rows / batchSize;

                // SeqLenK must be euqal to SeqLenV
                int seqLenK = encOutputBatchFirst.Rows / batchSize;

                IWeightTensor selfMaskTensor = null;
                if (tgtSelfMask != null)
                {
                    selfMaskTensor = subg.Expand(tgtSelfMask, dims: new long[] { batchSize, m_multiHeadNum, seqLenQ, seqLenQ });
                }

                IWeightTensor crossMaskTensor = null;
                if (srcTgtMask != null)
                {
                    crossMaskTensor = subg.Expand(srcTgtMask, dims: new long[] { batchSize, m_multiHeadNum, seqLenQ, seqLenK });
                }

                for (int k = 0; k < m_selfAttns.Count; k++)
                {
                    (tgtInputs, attnProbs) = m_selfAttns[k].Perform(tgtInputs, selfMaskTensor, batchSize, subg, outputAttenWeights: false);
                    (tgtInputs, attnProbs) = m_encAttns[k].Perform(tgtInputs, encOutputBatchFirst, encOutputBatchFirst, crossMaskTensor, batchSize, subg, outputAttenWeights: (outputAttnWeights && k == m_selfAttns.Count - 1), cachedTensors: cachedTensors);
                    tgtInputs = m_posFFNs[k].Perform(tgtInputs, batchSize, subg);
                }

                tgtInputs = layerNorm.Norm(tgtInputs, subg);

                tgtInputs.UnbindFromComputeGraph();
                if (attnProbs != null)
                {
                    attnProbs.UnbindFromComputeGraph();
                }

                if (selfMaskTensor != null)
                {
                    selfMaskTensor.Dispose();
                }

                if (crossMaskTensor != null)
                {
                    crossMaskTensor.Dispose();
                }
            }


            //     tgtInputs = m_decoderFFLayer.Process(tgtInputs, batchSize, g);

            return(tgtInputs, attnProbs);
        }
コード例 #3
0
        /// <summary>
        /// Transformer encoder
        /// </summary>
        /// <param name="rawInputs"></param>
        /// <param name="g"></param>
        /// <returns></returns>
        ///

        public IWeightTensor Decode(IWeightTensor tgtInputs, IWeightTensor encOutputBatchFirst, IWeightTensor tgtSelfMask, IWeightTensor decEncAttnMask, IWeightTensor tgtDimMask, int batchSize, IComputeGraph g)
        {
            int tgtSeqLen = tgtInputs.Rows / batchSize;
            int srcSeqLen = encOutputBatchFirst.Rows / batchSize;

            using (IWeightTensor posEmbedding = g.BuildPositionMatrix(tgtSeqLen, m_inputDim))
            {
                using (IWeightTensor posEmbeddingRepeat = g.RepeatRows(posEmbedding, batchSize, runGradient: false))
                {
                    tgtInputs = g.AddMul(posEmbeddingRepeat, tgtInputs, (float)Math.Sqrt(m_inputDim), runGradientW1: false, runGradientW2: true);
                }
            }

            tgtInputs = g.Dropout(tgtInputs, batchSize, m_dropoutRatio, inPlace: true);

            var tgtSelfMaskRep    = g.View(tgtSelfMask, dims: new long[] { 1, batchSize, tgtSeqLen, tgtSeqLen });
            var tgtSelfMaskRepExp = g.Expand(tgtSelfMaskRep, dims: new long[] { m_multiHeadNum, batchSize, tgtSeqLen, tgtSeqLen });

            var decEncAttnMaskRep    = g.View(decEncAttnMask, dims: new long[] { 1, batchSize, tgtSeqLen, srcSeqLen });
            var decEncAttnMaskRepExp = g.Expand(decEncAttnMaskRep, dims: new long[] { m_multiHeadNum, batchSize, tgtSeqLen, srcSeqLen });

            var tgtSelfMaskRepExpView    = g.View(tgtSelfMaskRepExp, dims: new long[] { m_multiHeadNum *batchSize *tgtSeqLen, tgtSeqLen });
            var decEncAttnMaskRepExpView = g.View(decEncAttnMaskRepExp, dims: new long[] { m_multiHeadNum *batchSize *tgtSeqLen, srcSeqLen });

            tgtSelfMaskRep.Dispose();
            tgtSelfMaskRepExp.Dispose();

            decEncAttnMaskRep.Dispose();
            decEncAttnMaskRepExp.Dispose();

            using (IComputeGraph subg = g.CreateSubGraph($"{m_name}_Decoder"))
            {
                for (int k = 0; k < m_selfAttns.Count; k++)
                {
                    tgtInputs = g.MaskFill(tgtInputs, tgtDimMask, 0.0f);

                    tgtInputs = m_selfAttns[k].Perform(tgtInputs, tgtInputs, tgtInputs, tgtSelfMaskRepExpView, batchSize, subg);
                    tgtInputs = m_encAttns[k].Perform(tgtInputs, encOutputBatchFirst, encOutputBatchFirst, decEncAttnMaskRepExpView, batchSize, subg);
                    tgtInputs = m_posFFNs[k].Perform(tgtInputs, batchSize, subg);
                }

                tgtInputs.UnbindFromComputeGraph();
            }

            tgtInputs = layerNorm.Norm(tgtInputs, g);

            //    tgtInputs = m_decoderFFLayer.Process(tgtInputs, batchSize, g);
            return(tgtInputs);
        }
コード例 #4
0
        /// <summary>
        /// Transformer encoder
        /// </summary>
        /// <param name="rawInputs"></param>
        /// <param name="g"></param>
        /// <returns></returns>
        public IWeightTensor Encode(IWeightTensor inputs, int batchSize, IComputeGraph g, IWeightTensor srcSelfMask)
        {
            using (IComputeGraph subg = g.CreateSubGraph($"{m_name}_Encoder"))
            {
                IWeightTensor maskTensor = null;
                if (srcSelfMask != null)
                {
                    int seqLen = inputs.Rows / batchSize;
                    using var keyMaskView = subg.View(srcSelfMask, dims: new long[] { batchSize, 1, seqLen, seqLen });
                    maskTensor            = subg.Expand(keyMaskView, dims: new long[] { batchSize, m_multiHeadNum, seqLen, seqLen });
                }

                IWeightTensor attnProbs = null;
                for (int k = 0; k < m_encoders.Count; k++)
                {
                    (inputs, attnProbs) = m_encoders[k].Perform(inputs, maskTensor, batchSize, subg, outputAttenWeights: false);
                    inputs = m_posFFNs[k].Perform(inputs, batchSize, subg);
                }

                inputs = layerNorm.Norm(inputs, subg);

                inputs.UnbindFromComputeGraph();
                if (attnProbs != null)
                {
                    attnProbs.UnbindFromComputeGraph();
                }

                if (maskTensor != null)
                {
                    maskTensor.Dispose();
                }
            }

            return(inputs);
        }
コード例 #5
0
        public IWeightTensor Perform(IWeightTensor inputQ, IWeightTensor keyMask, int batchSize, IComputeGraph graph)
        {
            if (m_sharedQKV == false)
            {
                throw new ArgumentException($"Layer '{m_name}' is not in shared QKV mode, please call another Perform function with three separated input tensors.");
            }

            using (IComputeGraph g = graph.CreateSubGraph($"{m_name}_MultiHeadAttention_SharedQKV"))
            {
                int           seqLenQ    = inputQ.Rows / batchSize;
                IWeightTensor inputQNorm = layerNormQ.Norm(inputQ, g);

                //Input projections
                float         scale = 1.0f / (float)(m_inputDim);
                IWeightTensor mulQ, mulK, mulV;

                using (IWeightTensor inputQNormView = g.View(inputQNorm, dims: new long[] { 1, inputQ.Rows, inputQ.Columns }))
                {
                    using (IWeightTensor inputQNormViewExp = g.Expand(inputQNormView, dims: new long[] { 3, inputQ.Rows, inputQ.Columns }))
                    {
                        using (IWeightTensor mulQKV = g.MulBatch(inputQNormViewExp, QKV, 3, scale))
                        {
                            mulQ = g.Select(mulQKV, 0, 0);
                            mulK = g.Select(mulQKV, 0, 1);
                            mulV = g.Select(mulQKV, 0, 2);
                        }
                    }
                }

                IWeightTensor allQ = g.View(mulQ, dims: new long[] { batchSize, seqLenQ, m_multiHeadNum, m_d });
                IWeightTensor allK = g.View(mulK, dims: new long[] { batchSize, seqLenQ, m_multiHeadNum, m_d });
                IWeightTensor allV = g.View(mulV, dims: new long[] { batchSize, seqLenQ, m_multiHeadNum, m_d });

                //Multi-head attentions
                IWeightTensor Qs = g.View(g.Permute(allQ, 2, 0, 1, 3), dims: new long[] { m_multiHeadNum *batchSize, seqLenQ, m_d });
                IWeightTensor Ks = g.View(g.Permute(allK, 2, 0, 3, 1), dims: new long[] { m_multiHeadNum *batchSize, m_d, seqLenQ });
                IWeightTensor Vs = g.View(g.Permute(allV, 2, 0, 1, 3), dims: new long[] { m_multiHeadNum *batchSize, seqLenQ, m_d });

                // Scaled softmax
                scale = 1.0f / (float)(m_d);
                IWeightTensor attn    = g.MulBatch(Qs, Ks, m_multiHeadNum * batchSize, scale);
                IWeightTensor softmax = g.Softmax(attn, keyMask, inPlace: true);
                IWeightTensor o       = g.View(g.MulBatch(softmax, Vs, m_multiHeadNum * batchSize), dims: new long[] { m_multiHeadNum, batchSize, seqLenQ, m_d });

                IWeightTensor W = g.View(g.Permute(o, 1, 2, 0, 3), dims: new long[] { batchSize *seqLenQ, m_multiHeadNum *m_d });

                // Output projection
                IWeightTensor finalAttResults = g.Dropout(g.Affine(W, W0, b0), batchSize, m_dropoutRatio, inPlace: true);

                return(graph.Add(finalAttResults, inputQ));
            }
        }
コード例 #6
0
ファイル: SequenceLabel.cs プロジェクト: artepic/Seq2SeqSharp
        private IWeightTensor AddPositionEmbedding(IComputeGraph g, IWeightTensor posEmbedding, int batchSize, int seqLen, IWeightTensor inputEmbs)
        {
            using (var posEmbeddingPeek = g.PeekRow(posEmbedding, 0, seqLen, false))
            {
                using (var posEmbeddingPeekView = g.View(posEmbeddingPeek, false, new long[] { 1, seqLen, this.m_modelMetaData.EmbeddingDim }))
                {
                    using (var posEmbeddingPeekViewExp = g.Expand(posEmbeddingPeekView, false, new long[] { batchSize, seqLen, this.m_modelMetaData.EmbeddingDim }))
                    {
                        inputEmbs = g.View(inputEmbs, dims: new long[] { batchSize, seqLen, this.m_modelMetaData.EmbeddingDim });
                        inputEmbs = g.Add(inputEmbs, posEmbeddingPeekViewExp, true, false);
                        inputEmbs = g.View(inputEmbs, dims: new long[] { batchSize *seqLen, this.m_modelMetaData.EmbeddingDim });
                    }
                }
            }

            inputEmbs = g.Dropout(inputEmbs, batchSize, this.m_dropoutRatio, true);

            return(inputEmbs);
        }
コード例 #7
0
        public static IWeightTensor AddPositionEmbedding(IComputeGraph g, IWeightTensor posEmbedding, int batchSize, IWeightTensor inputEmbs, float dropoutRatio)
        {
            var Column = posEmbedding.Columns;
            int seqLen = inputEmbs.Rows / batchSize;

            using (var posEmbeddingPeek = g.Peek(posEmbedding, 0, 0, seqLen))
            {
                using (var posEmbeddingPeekView = g.View(posEmbeddingPeek, dims: new long[] { 1, seqLen, Column }))
                {
                    using (var posEmbeddingPeekViewExp = g.Expand(posEmbeddingPeekView, dims: new long[] { batchSize, seqLen, Column }))
                    {
                        inputEmbs = g.View(inputEmbs, dims: new long[] { batchSize, seqLen, Column });
                        inputEmbs = g.Add(inputEmbs, posEmbeddingPeekViewExp, inPlace: true);
                        inputEmbs = g.View(inputEmbs, dims: new long[] { batchSize *seqLen, Column });
                    }
                }
            }

            inputEmbs = g.Dropout(inputEmbs, batchSize, dropoutRatio, inPlace: true);

            return(inputEmbs);
        }
コード例 #8
0
        /// <summary>
        /// Transformer encoder
        /// </summary>
        /// <param name="rawInputs"></param>
        /// <param name="g"></param>
        /// <returns></returns>
        public IWeightTensor Encode(IWeightTensor inputs, IWeightTensor selfMask, IWeightTensor dimMask, int batchSize, IComputeGraph g)
        {
            int seqLen = inputs.Rows / batchSize;

            using (IWeightTensor posEmbedding = g.BuildPositionMatrix(seqLen, m_inputDim))
            {
                using (IWeightTensor posEmbeddingRepeat = g.RepeatRows(posEmbedding, batchSize, runGradient: false))
                {
                    inputs = g.AddMul(posEmbeddingRepeat, inputs, (float)Math.Sqrt(m_inputDim), runGradientW1: false, runGradientW2: true);
                }
            }

            inputs = g.Dropout(inputs, batchSize, m_dropoutRatio, inPlace: true);

            var selfMaskRep               = g.View(selfMask, dims: new long[] { 1, batchSize, seqLen, seqLen });
            var multiHeadhSelfMaskRep     = g.Expand(selfMaskRep, dims: new long[] { m_multiHeadNum, batchSize, seqLen, seqLen });
            var multiHeadhSelfMaskRepView = g.View(multiHeadhSelfMaskRep, dims: new long[] { m_multiHeadNum *batchSize *seqLen, seqLen });

            selfMaskRep.Dispose();
            multiHeadhSelfMaskRep.Dispose();

            using (IComputeGraph subg = g.CreateSubGraph($"{m_name}_Encoder"))
            {
                for (int k = 0; k < m_encoders.Count; k++)
                {
                    inputs = g.MaskFill(inputs, dimMask, 0.0f);

                    inputs = m_encoders[k].Perform(inputs, inputs, inputs, multiHeadhSelfMaskRepView, batchSize, subg);
                    inputs = m_posFFNs[k].Perform(inputs, batchSize, subg);
                }
                inputs.UnbindFromComputeGraph();
            }


            inputs = layerNorm.Norm(inputs, g);

            return(inputs);
        }
コード例 #9
0
        private IWeightTensor AddPositionEmbedding(IComputeGraph g, IWeightTensor posEmbedding, int batchSize, int seqLen, IWeightTensor inputEmbs)
        {
            var Column = posEmbedding.Columns;

            inputEmbs = g.Mul(inputEmbs, (float)Math.Sqrt(m_modelMetaData.HiddenDim));

            using (var posEmbeddingPeek = g.PeekRow(posEmbedding, 0, seqLen, false))
            {
                using (var posEmbeddingPeekView = g.View(posEmbeddingPeek, runGradient: false, dims: new long[] { 1, seqLen, Column }))
                {
                    using (var posEmbeddingPeekViewExp = g.Expand(posEmbeddingPeekView, runGradient: false, dims: new long[] { batchSize, seqLen, Column }))
                    {
                        inputEmbs = g.View(inputEmbs, dims: new long[] { batchSize, seqLen, Column });
                        inputEmbs = g.Add(inputEmbs, posEmbeddingPeekViewExp, true, false);
                        inputEmbs = g.View(inputEmbs, dims: new long[] { batchSize *seqLen, Column });
                    }
                }
            }

            inputEmbs = g.Dropout(inputEmbs, batchSize, m_dropoutRatio, inPlace: true);

            return(inputEmbs);
        }
コード例 #10
0
        /// <summary>
        /// Scaled multi-heads attention component with skip connectioned feed forward layers
        /// </summary>
        /// <param name="inputQ">The input Q tensor</param>
        /// <param name="inputK">The input K tensor</param>
        /// <param name="inputV">The input V tensor</param>
        /// <param name="keyMask">The mask for softmax</param>
        /// <param name="batchSize">Batch size of input data set</param>
        /// <param name="graph">The instance of computing graph</param>
        /// <returns>Transformered output tensor</returns>
        public IWeightTensor Perform(IWeightTensor inputQ, IWeightTensor inputK, IWeightTensor inputV, IWeightTensor keyMask, int batchSize, IComputeGraph graph)
        {
            using (IComputeGraph g = graph.CreateSubGraph($"{m_name}_MultiHeadAttention"))
            {
                int seqLenQ = inputQ.Rows / batchSize;

                // SeqLenK must be euqal to SeqLenV
                int seqLenK = inputK.Rows / batchSize;
                int seqLenV = inputV.Rows / batchSize;

                IWeightTensor inputQNorm = layerNormQ.Norm(inputQ, g);
                if (inputK == inputQ)
                {
                    inputK = inputQNorm;
                }
                if (inputV == inputQ)
                {
                    inputV = inputQNorm;
                }

                //Input projections
                float         scale = 1.0f;
                IWeightTensor allQ  = g.View(g.Affine(inputQNorm, Q, Qb, scale), dims: new long[] { batchSize, seqLenQ, m_multiHeadNum, m_d });
                IWeightTensor allK  = g.View(g.Affine(inputK, K, Kb, scale), dims: new long[] { batchSize, seqLenK, m_multiHeadNum, m_d });
                IWeightTensor allV  = g.View(g.Affine(inputV, V, Vb, scale), dims: new long[] { batchSize, seqLenV, m_multiHeadNum, m_d });

                //Multi-head attentions
                IWeightTensor Qs = g.View(g.AsContiguous(g.Transpose(allQ, 1, 2)), dims: new long[] { batchSize *m_multiHeadNum, seqLenQ, m_d });
                IWeightTensor Ks = g.View(g.AsContiguous(g.Transpose(g.Transpose(allK, 1, 2), 2, 3)), dims: new long[] { batchSize *m_multiHeadNum, m_d, seqLenK });
                IWeightTensor Vs = g.View(g.AsContiguous(g.Transpose(allV, 1, 2)), dims: new long[] { batchSize *m_multiHeadNum, seqLenV, m_d });

                // Scaled softmax
                scale = 1.0f / (float)(Math.Sqrt(m_d));
                IWeightTensor attn = g.MulBatch(Qs, Ks, batchSize * m_multiHeadNum, scale);

                if (keyMask != null)
                {
                    using (var keyMaskView = g.View(keyMask, runGradient: false, dims: new long[] { batchSize, 1, seqLenQ, seqLenK }))
                    {
                        using (var keyMaskViewExp = g.Expand(keyMaskView, runGradient: false, dims: new long[] { batchSize, m_multiHeadNum, seqLenQ, seqLenK }))
                        {
                            using (var keyMaskViewExpConti = g.AsContiguous(keyMaskViewExp, runGradient: false))
                            {
                                using (var keyMaskViewExpContiView = g.View(keyMaskViewExpConti, runGradient: false, dims: new long[] { batchSize *m_multiHeadNum, seqLenQ, seqLenK }))
                                {
                                    attn = g.Add(attn, keyMaskViewExpContiView, runGradient1: true, runGradient2: false);
                                }
                            }
                        }
                    }
                }

                IWeightTensor softmax = g.Softmax(attn, inPlace: true);

                IWeightTensor o = g.View(g.MulBatch(softmax, Vs, batchSize * m_multiHeadNum), dims: new long[] { batchSize, m_multiHeadNum, seqLenQ, m_d });
                IWeightTensor W = g.View(g.AsContiguous(g.Transpose(o, 1, 2)), dims: new long[] { batchSize *seqLenQ, m_multiHeadNum *m_d });

                // Output projection
                IWeightTensor finalAttResults = g.Dropout(g.Affine(W, W0, b0), batchSize, m_dropoutRatio, inPlace: true);

                return(graph.Add(finalAttResults, inputQ));
            }
        }