Ejemplo n.º 1
0
        public IWeightTensor Perform(IWeightTensor inputQ, IWeightTensor keyMask, int batchSize, IComputeGraph graph)
        {
            if (m_sharedQKV == false)
            {
                throw new ArgumentException($"Layer '{m_name}' is not in shared QKV mode, please call another Perform function with three separated input tensors.");
            }

            using (IComputeGraph g = graph.CreateSubGraph($"{m_name}_MultiHeadAttention_SharedQKV"))
            {
                int           seqLenQ    = inputQ.Rows / batchSize;
                IWeightTensor inputQNorm = layerNormQ.Norm(inputQ, g);

                //Input projections
                float         scale = 1.0f / (float)(m_inputDim);
                IWeightTensor mulQ, mulK, mulV;

                using (IWeightTensor inputQNormView = g.View(inputQNorm, dims: new long[] { 1, inputQ.Rows, inputQ.Columns }))
                {
                    using (IWeightTensor inputQNormViewExp = g.Expand(inputQNormView, dims: new long[] { 3, inputQ.Rows, inputQ.Columns }))
                    {
                        using (IWeightTensor mulQKV = g.MulBatch(inputQNormViewExp, QKV, 3, scale))
                        {
                            mulQ = g.Select(mulQKV, 0, 0);
                            mulK = g.Select(mulQKV, 0, 1);
                            mulV = g.Select(mulQKV, 0, 2);
                        }
                    }
                }

                IWeightTensor allQ = g.View(mulQ, dims: new long[] { batchSize, seqLenQ, m_multiHeadNum, m_d });
                IWeightTensor allK = g.View(mulK, dims: new long[] { batchSize, seqLenQ, m_multiHeadNum, m_d });
                IWeightTensor allV = g.View(mulV, dims: new long[] { batchSize, seqLenQ, m_multiHeadNum, m_d });

                //Multi-head attentions
                IWeightTensor Qs = g.View(g.Permute(allQ, 2, 0, 1, 3), dims: new long[] { m_multiHeadNum *batchSize, seqLenQ, m_d });
                IWeightTensor Ks = g.View(g.Permute(allK, 2, 0, 3, 1), dims: new long[] { m_multiHeadNum *batchSize, m_d, seqLenQ });
                IWeightTensor Vs = g.View(g.Permute(allV, 2, 0, 1, 3), dims: new long[] { m_multiHeadNum *batchSize, seqLenQ, m_d });

                // Scaled softmax
                scale = 1.0f / (float)(m_d);
                IWeightTensor attn    = g.MulBatch(Qs, Ks, m_multiHeadNum * batchSize, scale);
                IWeightTensor softmax = g.Softmax(attn, keyMask, inPlace: true);
                IWeightTensor o       = g.View(g.MulBatch(softmax, Vs, m_multiHeadNum * batchSize), dims: new long[] { m_multiHeadNum, batchSize, seqLenQ, m_d });

                IWeightTensor W = g.View(g.Permute(o, 1, 2, 0, 3), dims: new long[] { batchSize *seqLenQ, m_multiHeadNum *m_d });

                // Output projection
                IWeightTensor finalAttResults = g.Dropout(g.Affine(W, W0, b0), batchSize, m_dropoutRatio, inPlace: true);

                return(graph.Add(finalAttResults, inputQ));
            }
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Scaled multi-heads attention component with skip connectioned feed forward layers
        /// </summary>
        /// <param name="inputQ">The input Q tensor</param>
        /// <param name="inputK">The input K tensor</param>
        /// <param name="inputV">The input V tensor</param>
        /// <param name="batchSize">Batch size of input data set</param>
        /// <param name="graph">The instance of computing graph</param>
        /// <returns>Transformered output tensor</returns>
        public IWeightTensor Perform(IWeightTensor inputQ, IWeightTensor inputK, IWeightTensor inputV, IWeightTensor keyMask, int batchSize, IComputeGraph graph)
        {
            using (IComputeGraph g = graph.CreateSubGraph($"{m_name}_MultiHeadAttention"))
            {
                int seqLenQ = inputQ.Rows / batchSize;

                // SeqLenK must be euqal to SeqLenV
                int seqLenK = inputK.Rows / batchSize;
                int seqLenV = inputV.Rows / batchSize;

                IWeightTensor inputQNorm = layerNorm1.Norm(inputQ, g);
                IWeightTensor inputKNorm = (inputK == inputQ) ? inputQNorm : inputK; // layerNorm1.Norm(inputK, g);
                IWeightTensor inputVNorm = (inputK == inputV) ? inputKNorm : inputV; // layerNorm1.Norm(inputV, g);

                //Input projections
                IWeightTensor allQ = g.View(g.Affine(inputQNorm, Q, Qb), dims: new long[] { batchSize, seqLenQ, m_multiHeadNum, m_d });
                IWeightTensor allK = g.View(g.Affine(inputKNorm, K, Kb), dims: new long[] { batchSize, seqLenK, m_multiHeadNum, m_d });
                IWeightTensor allV = g.View(g.Affine(inputVNorm, V, Vb), dims: new long[] { batchSize, seqLenV, m_multiHeadNum, m_d });

                //Multi-head attentions
                IWeightTensor Qs = g.View(g.Permute(allQ, 2, 0, 1, 3), dims: new long[] { m_multiHeadNum *batchSize, seqLenQ, m_d });
                IWeightTensor Ks = g.View(g.Permute(allK, 2, 0, 3, 1), dims: new long[] { m_multiHeadNum *batchSize, m_d, seqLenK });
                IWeightTensor Vs = g.View(g.Permute(allV, 2, 0, 1, 3), dims: new long[] { m_multiHeadNum *batchSize, seqLenV, m_d });

                // Scaled softmax
                float         scale = 1.0f / (float)Math.Sqrt(m_d);
                IWeightTensor attn  = g.MulBatch(Qs, Ks, m_multiHeadNum * batchSize, scale);
                IWeightTensor attn2 = g.View(attn, dims: new long[] { m_multiHeadNum *batchSize *seqLenQ, seqLenK });


                if (keyMask != null)
                {
                    // attn2 = g.Add(attn2, mask, runGradient2: false);
                    attn2 = g.MaskFill(attn2, keyMask, -1e9f);
                }

                IWeightTensor softmax  = g.Softmax(attn2, inPlace: true);
                IWeightTensor softmax2 = g.View(softmax, dims: new long[] { m_multiHeadNum *batchSize, seqLenQ, seqLenK });
                IWeightTensor o        = g.View(g.MulBatch(softmax2, Vs, m_multiHeadNum * batchSize), dims: new long[] { m_multiHeadNum, batchSize, seqLenQ, m_d });
                IWeightTensor W        = g.View(g.Permute(o, 1, 2, 0, 3), dims: new long[] { batchSize *seqLenQ, m_multiHeadNum *m_d });

                // Output projection
                IWeightTensor finalAttResults = g.Dropout(g.Affine(W, W0, b0), batchSize, m_dropoutRatio, inPlace: true);

                return(graph.Add(finalAttResults, inputQ));
            }
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Scaled multi-heads attention component with skip connectioned feed forward layers
        /// </summary>
        /// <param name="input">The input tensor</param>
        /// <param name="g">The instance of computing graph</param>
        /// <returns></returns>
        public IWeightTensor Perform(IWeightTensor input, IComputeGraph graph)
        {
            IComputeGraph g = graph.CreateSubGraph(m_name);

            var seqLen = input.Rows / m_batchSize;

            //Input projections
            var allQ = g.View(Q.Process(input, g), m_batchSize, seqLen, m_multiHeadNum, m_d);
            var allK = g.View(K.Process(input, g), m_batchSize, seqLen, m_multiHeadNum, m_d);
            var allV = g.View(V.Process(input, g), m_batchSize, seqLen, m_multiHeadNum, m_d);

            //Multi-head attentions
            var Qs = g.View(g.Permute(allQ, 2, 0, 1, 3), m_multiHeadNum * m_batchSize, seqLen, m_d);
            var Ks = g.View(g.Permute(allK, 2, 0, 3, 1), m_multiHeadNum * m_batchSize, m_d, seqLen);
            var Vs = g.View(g.Permute(allV, 2, 0, 1, 3), m_multiHeadNum * m_batchSize, seqLen, m_d);

            // Scaled softmax
            float scale = 1.0f / (float)Math.Sqrt(m_d);
            var   attn  = g.MulBatch(Qs, Ks, m_multiHeadNum * m_batchSize, scale);
            var   attn2 = g.View(attn, m_multiHeadNum * m_batchSize * seqLen, seqLen);

            var softmax  = g.Softmax(attn2);
            var softmax2 = g.View(softmax, m_multiHeadNum * m_batchSize, seqLen, seqLen);
            var o        = g.View(g.MulBatch(softmax2, Vs, m_multiHeadNum * m_batchSize), m_multiHeadNum, m_batchSize, seqLen, m_d);
            var W        = g.View(g.Permute(o, 1, 2, 0, 3), m_batchSize * seqLen, m_multiHeadNum * m_d);

            // Output projection
            var finalAttResults = g.Affine(W, W0, b0);

            //Skip connection and layer normaliztion
            var addedAttResult     = g.Add(finalAttResults, input);
            var normAddedAttResult = layerNorm1.Process(addedAttResult, g);

            //Feed forward
            var ffnResult     = feedForwardLayer1.Process(normAddedAttResult, g);
            var reluFFNResult = g.Relu(ffnResult);
            var ffn2Result    = feedForwardLayer2.Process(reluFFNResult, g);

            //Skip connection and layer normaliztion
            var addFFNResult     = g.Add(ffn2Result, normAddedAttResult);
            var normAddFFNResult = layerNorm2.Process(addFFNResult, g);

            return(normAddFFNResult);
        }
        /// <summary>
        /// Scaled multi-heads attention component with skip connectioned feed forward layers
        /// </summary>
        /// <param name="input">The input tensor</param>
        /// <param name="g">The instance of computing graph</param>
        /// <returns></returns>
        public IWeightTensor Perform(IWeightTensor input, int batchSize, IComputeGraph graph)
        {
            using (IComputeGraph g = graph.CreateSubGraph(m_name))
            {
                int           seqLen = input.Rows / batchSize;
                IWeightTensor nInput = layerNorm1.Norm(input, g);

                //Input projections
                IWeightTensor allQ = g.View(g.Affine(nInput, Q, Qb), batchSize, seqLen, m_multiHeadNum, m_d);
                IWeightTensor allK = g.View(g.Affine(nInput, K, Kb), batchSize, seqLen, m_multiHeadNum, m_d);
                IWeightTensor allV = g.View(g.Affine(nInput, V, Vb), batchSize, seqLen, m_multiHeadNum, m_d);

                //Multi-head attentions
                IWeightTensor Qs = g.View(g.Permute(allQ, 2, 0, 1, 3), m_multiHeadNum * batchSize, seqLen, m_d);
                IWeightTensor Ks = g.View(g.Permute(allK, 2, 0, 3, 1), m_multiHeadNum * batchSize, m_d, seqLen);
                IWeightTensor Vs = g.View(g.Permute(allV, 2, 0, 1, 3), m_multiHeadNum * batchSize, seqLen, m_d);

                // Scaled softmax
                float         scale = 1.0f / (float)Math.Sqrt(m_d);
                IWeightTensor attn  = g.MulBatch(Qs, Ks, m_multiHeadNum * batchSize, scale);
                IWeightTensor attn2 = g.View(attn, m_multiHeadNum * batchSize * seqLen, seqLen);

                IWeightTensor softmax  = g.Softmax(attn2, inPlace: true);
                IWeightTensor softmax2 = g.View(softmax, m_multiHeadNum * batchSize, seqLen, seqLen);
                IWeightTensor o        = g.View(g.MulBatch(softmax2, Vs, m_multiHeadNum * batchSize), m_multiHeadNum, batchSize, seqLen, m_d);
                IWeightTensor W        = g.View(g.Permute(o, 1, 2, 0, 3), batchSize * seqLen, m_multiHeadNum * m_d);

                // Output projection
                IWeightTensor finalAttResults = g.Dropout(g.Affine(W, W0, b0), batchSize, m_dropoutRatio, inPlace: true);

                //Skip connection and layer normaliztion
                IWeightTensor normAddedAttResult = layerNorm2.AddNorm(finalAttResults, input, g);

                //Feed forward
                IWeightTensor ffnResult     = feedForwardLayer1.Process(normAddedAttResult, batchSize, g);
                IWeightTensor reluFFNResult = g.Relu(ffnResult);
                IWeightTensor ffn2Result    = feedForwardLayer2.Process(reluFFNResult, batchSize, g);

                //Skip connection and layer normaliztion
                IWeightTensor addFFNResult = graph.Add(ffn2Result, normAddedAttResult);

                return(addFFNResult);
            }
        }
Ejemplo n.º 5
0
        /// <summary>
        /// Scaled multi-heads attention component with skip connectioned feed forward layers
        /// </summary>
        /// <param name="inputQ">The input Q tensor</param>
        /// <param name="inputK">The input K tensor</param>
        /// <param name="inputV">The input V tensor</param>
        /// <param name="batchSize">Batch size of input data set</param>
        /// <param name="graph">The instance of computing graph</param>
        /// <returns>Transformered output tensor</returns>
        public IWeightTensor Perform(IWeightTensor inputQ, IWeightTensor inputK, IWeightTensor inputV, IWeightTensor keyMask, int batchSize, IComputeGraph graph)
        {
            if (m_sharedQKV)
            {
                throw new ArgumentException($"Layer '{m_name}' is in shared QKV mode, please call antoher Perform function with single input tensor.");
            }

            using (IComputeGraph g = graph.CreateSubGraph($"{m_name}_MultiHeadAttention"))
            {
                int seqLenQ = inputQ.Rows / batchSize;

                // SeqLenK must be euqal to SeqLenV
                int seqLenK = inputK.Rows / batchSize;
                int seqLenV = inputV.Rows / batchSize;

                IWeightTensor inputQNorm = layerNormQ.Norm(inputQ, g);
                //Input projections
                float         scale = 1.0f / (float)(m_inputDim);
                IWeightTensor allQ  = g.View(g.Affine(inputQNorm, Q, Qb, scale), dims: new long[] { batchSize, seqLenQ, m_multiHeadNum, m_d });
                IWeightTensor allK  = g.View(g.Affine(inputK, K, Kb, scale), dims: new long[] { batchSize, seqLenK, m_multiHeadNum, m_d });
                IWeightTensor allV  = g.View(g.Affine(inputV, V, Vb, scale), dims: new long[] { batchSize, seqLenV, m_multiHeadNum, m_d });

                //Multi-head attentions
                IWeightTensor Qs = g.View(g.Permute(allQ, 2, 0, 1, 3), dims: new long[] { m_multiHeadNum *batchSize, seqLenQ, m_d });
                IWeightTensor Ks = g.View(g.Permute(allK, 2, 0, 3, 1), dims: new long[] { m_multiHeadNum *batchSize, m_d, seqLenK });
                IWeightTensor Vs = g.View(g.Permute(allV, 2, 0, 1, 3), dims: new long[] { m_multiHeadNum *batchSize, seqLenV, m_d });

                // Scaled softmax
                scale = 1.0f / (float)(m_d);
                IWeightTensor attn    = g.MulBatch(Qs, Ks, m_multiHeadNum * batchSize, scale);
                IWeightTensor softmax = g.Softmax(attn, keyMask, inPlace: true);
                IWeightTensor o       = g.View(g.MulBatch(softmax, Vs, m_multiHeadNum * batchSize), dims: new long[] { m_multiHeadNum, batchSize, seqLenQ, m_d });

                IWeightTensor W = g.View(g.Permute(o, 1, 2, 0, 3), dims: new long[] { batchSize *seqLenQ, m_multiHeadNum *m_d });

                // Output projection
                IWeightTensor finalAttResults = g.Dropout(g.Affine(W, W0, b0), batchSize, m_dropoutRatio, inPlace: true);

                return(graph.Add(finalAttResults, inputQ));
            }
        }