/// <summary>
        /// Scaled multi-heads attention component with skip connectioned feed forward layers
        /// </summary>
        /// <param name="inputQ">The input Q tensor</param>
        /// <param name="keyMask">The mask for softmax</param>
        /// <param name="batchSize">Batch size of input data set</param>
        /// <param name="graph">The instance of computing graph</param>
        /// <returns>Transformered output tensor</returns>
        public (IWeightTensor, IWeightTensor) Perform(IWeightTensor inputQ, IWeightTensor keyMask, int batchSize, IComputeGraph graph, bool outputAttenWeights = false)
        {
            using IComputeGraph g = graph.CreateSubGraph($"{m_name}_MultiHeadAttention");
            int seqLenQ = inputQ.Rows / batchSize;

            IWeightTensor inputQNorm = layerNormQ.Norm(inputQ, g);

            //Input projections
            var weightedQKV = g.View(g.Affine(inputQNorm, QKV, QKVb), dims: new long[] { batchSize, seqLenQ, 3, m_multiHeadNum, m_d });
            var allQ        = g.Select(weightedQKV, 2, 0);
            var allK        = g.Select(weightedQKV, 2, 1);
            var allV        = g.Select(weightedQKV, 2, 2);


            //Multi-head attentions
            IWeightTensor Qs = g.View(g.AsContiguous(g.Transpose(allQ, 1, 2)), dims: new long[] { batchSize *m_multiHeadNum, seqLenQ, m_d });
            IWeightTensor Ks = g.View(g.AsContiguous(g.Transpose(g.Transpose(allK, 1, 2), 2, 3)), dims: new long[] { batchSize *m_multiHeadNum, m_d, seqLenQ });
            IWeightTensor Vs = g.View(g.AsContiguous(g.Transpose(allV, 1, 2)), dims: new long[] { batchSize *m_multiHeadNum, seqLenQ, m_d });

            // Scaled softmax
            float scale = 1.0f / (float)(Math.Sqrt(m_d));
            var   attn  = g.MulBatch(Qs, Ks, scale);

            attn = g.View(attn, dims: new long[] { batchSize, m_multiHeadNum, seqLenQ, seqLenQ });

            if (keyMask != null)
            {
                attn = g.Add(attn, keyMask, inPlace: true);
            }

            var attnProbs = g.Softmax(attn, inPlace: true);

            IWeightTensor sumAttnWeights = null;

            if (outputAttenWeights)
            {
                //Merge all attention probs over multi-heads
                sumAttnWeights = graph.Sum(attnProbs, 1);
                sumAttnWeights = graph.Div(sumAttnWeights, (float)m_multiHeadNum);
                sumAttnWeights = graph.View(sumAttnWeights, new long[] { batchSize *seqLenQ, seqLenQ });
            }

            attnProbs = g.View(attnProbs, dims: new long[] { batchSize *m_multiHeadNum, seqLenQ, seqLenQ });

            IWeightTensor o = g.View(g.MulBatch(attnProbs, Vs), dims: new long[] { batchSize, m_multiHeadNum, seqLenQ, m_d });
            IWeightTensor W = g.View(g.AsContiguous(g.Transpose(o, 1, 2)), dims: new long[] { batchSize *seqLenQ, m_multiHeadNum *m_d });

            // Output projection
            IWeightTensor finalAttResults = g.Dropout(g.Affine(W, W0, b0), batchSize, m_dropoutRatio, inPlace: true);
            IWeightTensor result          = graph.Add(finalAttResults, inputQ, inPlace: true);


            return(result, sumAttnWeights);
        }
Esempio n. 2
0
        public IWeightTensor Perform(IWeightTensor inputQ, IWeightTensor keyMask, int batchSize, IComputeGraph graph)
        {
            if (m_sharedQKV == false)
            {
                throw new ArgumentException($"Layer '{m_name}' is not in shared QKV mode, please call another Perform function with three separated input tensors.");
            }

            using (IComputeGraph g = graph.CreateSubGraph($"{m_name}_MultiHeadAttention_SharedQKV"))
            {
                int           seqLenQ    = inputQ.Rows / batchSize;
                IWeightTensor inputQNorm = layerNormQ.Norm(inputQ, g);

                //Input projections
                float         scale = 1.0f / (float)(m_inputDim);
                IWeightTensor mulQ, mulK, mulV;

                using (IWeightTensor inputQNormView = g.View(inputQNorm, dims: new long[] { 1, inputQ.Rows, inputQ.Columns }))
                {
                    using (IWeightTensor inputQNormViewExp = g.Expand(inputQNormView, dims: new long[] { 3, inputQ.Rows, inputQ.Columns }))
                    {
                        using (IWeightTensor mulQKV = g.MulBatch(inputQNormViewExp, QKV, 3, scale))
                        {
                            mulQ = g.Select(mulQKV, 0, 0);
                            mulK = g.Select(mulQKV, 0, 1);
                            mulV = g.Select(mulQKV, 0, 2);
                        }
                    }
                }

                IWeightTensor allQ = g.View(mulQ, dims: new long[] { batchSize, seqLenQ, m_multiHeadNum, m_d });
                IWeightTensor allK = g.View(mulK, dims: new long[] { batchSize, seqLenQ, m_multiHeadNum, m_d });
                IWeightTensor allV = g.View(mulV, dims: new long[] { batchSize, seqLenQ, m_multiHeadNum, m_d });

                //Multi-head attentions
                IWeightTensor Qs = g.View(g.Permute(allQ, 2, 0, 1, 3), dims: new long[] { m_multiHeadNum *batchSize, seqLenQ, m_d });
                IWeightTensor Ks = g.View(g.Permute(allK, 2, 0, 3, 1), dims: new long[] { m_multiHeadNum *batchSize, m_d, seqLenQ });
                IWeightTensor Vs = g.View(g.Permute(allV, 2, 0, 1, 3), dims: new long[] { m_multiHeadNum *batchSize, seqLenQ, m_d });

                // Scaled softmax
                scale = 1.0f / (float)(m_d);
                IWeightTensor attn    = g.MulBatch(Qs, Ks, m_multiHeadNum * batchSize, scale);
                IWeightTensor softmax = g.Softmax(attn, keyMask, inPlace: true);
                IWeightTensor o       = g.View(g.MulBatch(softmax, Vs, m_multiHeadNum * batchSize), dims: new long[] { m_multiHeadNum, batchSize, seqLenQ, m_d });

                IWeightTensor W = g.View(g.Permute(o, 1, 2, 0, 3), dims: new long[] { batchSize *seqLenQ, m_multiHeadNum *m_d });

                // Output projection
                IWeightTensor finalAttResults = g.Dropout(g.Affine(W, W0, b0), batchSize, m_dropoutRatio, inPlace: true);

                return(graph.Add(finalAttResults, inputQ));
            }
        }
        /// <summary>
        /// Scaled multi-heads attention component with skip connectioned feed forward layers
        /// </summary>
        /// <param name="inputQ">The input Q tensor</param>
        /// <param name="inputK">The input K tensor</param>
        /// <param name="inputV">The input V tensor</param>
        /// <param name="keyMask">The mask for softmax</param>
        /// <param name="batchSize">Batch size of input data set</param>
        /// <param name="graph">The instance of computing graph</param>
        /// <returns>Transformered output tensor</returns>
        public (IWeightTensor, IWeightTensor) Perform(IWeightTensor inputQ, IWeightTensor inputK, IWeightTensor inputV, IWeightTensor keyMask, int batchSize, IComputeGraph graph, bool outputAttenWeights = false, Dictionary <string, IWeightTensor> cachedTensors = null)
        {
            string keyName = $"{m_name}_MultiHeadAttention";

            using IComputeGraph g = graph.CreateSubGraph(keyName);
            int seqLenQ = inputQ.Rows / batchSize;

            // SeqLenK must be euqal to SeqLenV
            int seqLenK = inputK.Rows / batchSize;
            int seqLenV = inputV.Rows / batchSize;

            IWeightTensor inputQNorm = layerNormQ.Norm(inputQ, g);

            //Input projections
            IWeightTensor allQ = g.View(g.Affine(inputQNorm, Q, Qb), dims: new long[] { batchSize, seqLenQ, m_multiHeadNum, m_d });

            //Multi-head attentions
            IWeightTensor Qs = g.View(g.AsContiguous(g.Transpose(allQ, 1, 2)), dims: new long[] { batchSize *m_multiHeadNum, seqLenQ, m_d });


            IWeightTensor Ks = null;
            IWeightTensor Vs = null;

            if (cachedTensors == null) // We don't use any cached tensors
            {
                IWeightTensor allK = g.View(g.Affine(inputK, K, Kb), dims: new long[] { batchSize, seqLenK, m_multiHeadNum, m_d });
                IWeightTensor allV = g.View(g.Affine(inputV, V, Vb), dims: new long[] { batchSize, seqLenV, m_multiHeadNum, m_d });
                Ks = g.View(g.AsContiguous(g.Transpose(g.Transpose(allK, 1, 2), 2, 3)), dims: new long[] { batchSize *m_multiHeadNum, m_d, seqLenK });
                Vs = g.View(g.AsContiguous(g.Transpose(allV, 1, 2)), dims: new long[] { batchSize *m_multiHeadNum, seqLenV, m_d });
            }
            else
            {
                string KsCacheName = keyName + "_" + nameof(Ks);
                string VsCacheName = keyName + "_" + nameof(Vs);

                if (cachedTensors.ContainsKey(KsCacheName) == false)
                {
                    IWeightTensor allK = g.View(g.Affine(inputK, K, Kb), dims: new long[] { batchSize, seqLenK, m_multiHeadNum, m_d });
                    Ks = g.View(g.AsContiguous(g.Transpose(g.Transpose(allK, 1, 2), 2, 3)), dims: new long[] { batchSize *m_multiHeadNum, m_d, seqLenK });
                    cachedTensors.Add(KsCacheName, Ks.CopyWeightsRef(KsCacheName, Ks.NeedGradient));
                }
                else
                {
                    Ks = cachedTensors[KsCacheName];
                }

                if (cachedTensors.ContainsKey(VsCacheName) == false)
                {
                    IWeightTensor allV = g.View(g.Affine(inputV, V, Vb), dims: new long[] { batchSize, seqLenV, m_multiHeadNum, m_d });
                    Vs = g.View(g.AsContiguous(g.Transpose(allV, 1, 2)), dims: new long[] { batchSize *m_multiHeadNum, seqLenV, m_d });
                    cachedTensors.Add(VsCacheName, Vs.CopyWeightsRef(VsCacheName, Vs.NeedGradient));
                }
                else
                {
                    Vs = cachedTensors[VsCacheName];
                }
            }


            // Scaled softmax
            float scale = 1.0f / (float)(Math.Sqrt(m_d));
            var   attn  = g.MulBatch(Qs, Ks, scale);

            attn = g.View(attn, dims: new long[] { batchSize, m_multiHeadNum, seqLenQ, seqLenK });

            if (keyMask != null)
            {
                attn = g.Add(attn, keyMask, inPlace: true);
            }

            var attnProbs = g.Softmax(attn, inPlace: true);

            IWeightTensor sumAttnWeights = null;

            if (outputAttenWeights)
            {
                sumAttnWeights = g.Select(attnProbs, 1, 0);
                for (int i = 1; i < m_multiHeadNum; i++)
                {
                    var tmp = g.Select(attnProbs, 1, i);
                    sumAttnWeights = g.Add(sumAttnWeights, tmp);
                }

                sumAttnWeights = graph.Div(sumAttnWeights, (float)m_multiHeadNum);
                sumAttnWeights = graph.View(sumAttnWeights, new long[] { batchSize *seqLenQ, seqLenK });
            }

            attnProbs = g.View(attnProbs, dims: new long[] { batchSize *m_multiHeadNum, seqLenQ, seqLenK });

            IWeightTensor o = g.View(g.MulBatch(attnProbs, Vs), dims: new long[] { batchSize, m_multiHeadNum, seqLenQ, m_d });
            IWeightTensor W = g.View(g.AsContiguous(g.Transpose(o, 1, 2)), dims: new long[] { batchSize *seqLenQ, m_multiHeadNum *m_d });

            // Output projection
            IWeightTensor finalAttResults = g.Dropout(g.Affine(W, W0, b0), batchSize, m_dropoutRatio, inPlace: true);
            IWeightTensor result          = graph.Add(finalAttResults, inputQ, inPlace: true);


            return(result, sumAttnWeights);
        }