Exemple #1
0
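        /// <summary>
        /// Transformer decoder: adds positional embeddings to the target inputs, applies dropout, runs every
        /// decoder layer (self-attention, encoder attention, feed-forward) and layer-normalizes the result.
        /// </summary>
        /// <param name="tgtInputs">Target-side input tensor holding batchSize * target sequence length rows</param>
        /// <param name="encOutputBatchFirst">Encoder output holding batchSize * source sequence length rows; passed as keys and values to the encoder attention layers</param>
        /// <param name="tgtSelfMask">Mask passed to the target self-attention layers</param>
        /// <param name="decEncAttnMask">Mask passed to the encoder attention layers</param>
        /// <param name="tgtDimMask">Mask used to fill the hidden states with 0 before every layer</param>
        /// <param name="batchSize">Batch size of the input data set</param>
        /// <param name="g">The compute graph instance</param>
        /// <returns>The layer-normalized decoder output tensor</returns>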

        public IWeightTensor Decode(IWeightTensor tgtInputs, IWeightTensor encOutputBatchFirst, IWeightTensor tgtSelfMask, IWeightTensor decEncAttnMask, IWeightTensor tgtDimMask, int batchSize, IComputeGraph g)
        {
            // Both tensors carry batchSize * sequenceLength rows, so dividing recovers the lengths.
            int targetLength = tgtInputs.Rows / batchSize;
            int sourceLength = encOutputBatchFirst.Rows / batchSize;

            // The position matrix is built once per call and repeated for every sequence in the batch.
            // Gradients are requested for the target inputs only (runGradientW2); both temporaries
            // are disposed as soon as they have been combined with the inputs.
            using (IWeightTensor positions = g.BuildPositionMatrix(targetLength, m_inputDim))
            using (IWeightTensor batchedPositions = g.RepeatRows(positions, batchSize, runGradient: false))
            {
                float embeddingScale = (float)Math.Sqrt(m_inputDim);
                tgtInputs = g.AddMul(batchedPositions, tgtInputs, embeddingScale, runGradientW1: false, runGradientW2: true);
            }

            tgtInputs = g.Dropout(tgtInputs, batchSize, m_dropoutRatio, inPlace: true);

            // Each mask is viewed as { 1, batch, targetLength, keyLength }, expanded across the
            // attention heads and finally flattened to two dimensions.
            var flattenedRows = m_multiHeadNum * batchSize * targetLength;

            var selfMask4D      = g.View(tgtSelfMask, dims: new long[] { 1, batchSize, targetLength, targetLength });
            var selfMaskPerHead = g.Expand(selfMask4D, dims: new long[] { m_multiHeadNum, batchSize, targetLength, targetLength });

            var crossMask4D      = g.View(decEncAttnMask, dims: new long[] { 1, batchSize, targetLength, sourceLength });
            var crossMaskPerHead = g.Expand(crossMask4D, dims: new long[] { m_multiHeadNum, batchSize, targetLength, sourceLength });

            var selfMaskFlat  = g.View(selfMaskPerHead, dims: new long[] { flattenedRows, targetLength });
            var crossMaskFlat = g.View(crossMaskPerHead, dims: new long[] { flattenedRows, sourceLength });

            // Only the flattened masks are needed from here on; release the intermediates right away.
            selfMask4D.Dispose();
            selfMaskPerHead.Dispose();

            crossMask4D.Dispose();
            crossMaskPerHead.Dispose();

            IWeightTensor hidden = tgtInputs;
            using (IComputeGraph layerGraph = g.CreateSubGraph($"{m_name}_Decoder"))
            {
                for (int layer = 0; layer < m_selfAttns.Count; layer++)
                {
                    // Fill the entries selected by tgtDimMask with 0 before every layer
                    // (presumably padding positions - confirm against the caller).
                    hidden = g.MaskFill(hidden, tgtDimMask, 0.0f);

                    // Self-attention, then attention over the encoder output, then the m_posFFNs layer.
                    hidden = m_selfAttns[layer].Perform(hidden, hidden, hidden, selfMaskFlat, batchSize, layerGraph);
                    hidden = m_encAttns[layer].Perform(hidden, encOutputBatchFirst, encOutputBatchFirst, crossMaskFlat, batchSize, layerGraph);
                    hidden = m_posFFNs[layer].Perform(hidden, batchSize, layerGraph);
                }

                // Unbind the result before the sub-graph is disposed; it is still needed below.
                hidden.UnbindFromComputeGraph();
            }

            // NOTE: a trailing m_decoderFFLayer.Process step was commented out in the original code
            // and is intentionally not applied here.
            return layerNorm.Norm(hidden, g);
        }
        /// <summary>
        /// Scaled multi-head attention component with an output projection and a skip connection to the query input
        /// </summary>
        /// <param name="inputQ">The input Q tensor</param>
        /// <param name="inputK">The input K tensor</param>
        /// <param name="inputV">The input V tensor</param>
        /// <param name="keyMask">Optional mask; when it is not null, the attention scores are mask-filled with -1e9 before the softmax</param>
        /// <param name="batchSize">Batch size of the input data set</param>
        /// <param name="graph">The compute graph instance</param>
        /// <returns>The transformed output tensor</returns>
        public IWeightTensor Perform(IWeightTensor inputQ, IWeightTensor inputK, IWeightTensor inputV, IWeightTensor keyMask, int batchSize, IComputeGraph graph)
        {
            // Every intermediate tensor is created on a sub-graph that is disposed when this method
            // returns; only the final Add runs on the caller's graph.
            using (IComputeGraph g = graph.CreateSubGraph($"{m_name}_MultiHeadAttention"))
            {
                int queryLength = inputQ.Rows / batchSize;

                // The key and value lengths are expected to match; this is not verified here.
                int keyLength   = inputK.Rows / batchSize;
                int valueLength = inputV.Rows / batchSize;

                var   headBatch = m_multiHeadNum * batchSize;
                float scale     = 1.0f / (float)Math.Sqrt(m_d);

                // Only the query is layer-normalized. Keys and values reuse the normalized query
                // when the caller passed the very same tensor instance; otherwise they are used as given.
                IWeightTensor normedQuery = layerNorm1.Norm(inputQ, g);

                IWeightTensor keySource = inputK;
                if (inputK == inputQ)
                {
                    keySource = normedQuery;
                }

                IWeightTensor valueSource = inputV;
                if (inputK == inputV)
                {
                    valueSource = keySource;
                }

                // Input projections, viewed as { batch, length, heads, m_d }.
                var           projectedQ = g.Affine(normedQuery, Q, Qb);
                IWeightTensor queries4D  = g.View(projectedQ, dims: new long[] { batchSize, queryLength, m_multiHeadNum, m_d });
                var           projectedK = g.Affine(keySource, K, Kb);
                IWeightTensor keys4D     = g.View(projectedK, dims: new long[] { batchSize, keyLength, m_multiHeadNum, m_d });
                var           projectedV = g.Affine(valueSource, V, Vb);
                IWeightTensor values4D   = g.View(projectedV, dims: new long[] { batchSize, valueLength, m_multiHeadNum, m_d });

                // Move the head axis to the front; the keys are additionally permuted to
                // { heads, batch, m_d, length } so the batched product yields query-by-key scores.
                var           permutedQ     = g.Permute(queries4D, 2, 0, 1, 3);
                IWeightTensor queryHeads    = g.View(permutedQ, dims: new long[] { headBatch, queryLength, m_d });
                var           permutedK     = g.Permute(keys4D, 2, 0, 3, 1);
                IWeightTensor keyHeadsT     = g.View(permutedK, dims: new long[] { headBatch, m_d, keyLength });
                var           permutedV     = g.Permute(values4D, 2, 0, 1, 3);
                IWeightTensor valueHeads    = g.View(permutedV, dims: new long[] { headBatch, valueLength, m_d });

                // Scaled attention scores, flattened to two dimensions for masking and softmax.
                IWeightTensor scores     = g.MulBatch(queryHeads, keyHeadsT, headBatch, scale);
                IWeightTensor flatScores = g.View(scores, dims: new long[] { headBatch * queryLength, keyLength });

                if (keyMask != null)
                {
                    // Masked scores are filled with a large negative value before the softmax.
                    flatScores = g.MaskFill(flatScores, keyMask, -1e9f);
                }

                IWeightTensor weights   = g.Softmax(flatScores, inPlace: true);
                IWeightTensor weights3D = g.View(weights, dims: new long[] { headBatch, queryLength, keyLength });

                // Weighted sum of the values, then move the head axis back next to m_d and merge them.
                var           weighted    = g.MulBatch(weights3D, valueHeads, headBatch);
                IWeightTensor perHead     = g.View(weighted, dims: new long[] { m_multiHeadNum, batchSize, queryLength, m_d });
                var           headsLast   = g.Permute(perHead, 1, 2, 0, 3);
                IWeightTensor mergedHeads = g.View(headsLast, dims: new long[] { batchSize * queryLength, m_multiHeadNum * m_d });

                // Output projection followed by dropout.
                var           projectedOut  = g.Affine(mergedHeads, W0, b0);
                IWeightTensor attentionOut  = g.Dropout(projectedOut, batchSize, m_dropoutRatio, inPlace: true);

                // Skip connection with the un-normalized query, built on the caller's graph.
                return graph.Add(attentionOut, inputQ);
            }
        }
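        /// <summary>
        /// Transformer encoder: adds positional embeddings to the inputs, applies dropout, runs every
        /// encoder layer (self-attention, feed-forward) and layer-normalizes the result.
        /// </summary>
        /// <param name="inputs">Input tensor holding batchSize * sequence length rows</param>
        /// <param name="selfMask">Mask passed to the self-attention layers</param>
        /// <param name="dimMask">Mask used to fill the hidden states with 0 before every layer</param>
        /// <param name="batchSize">Batch size of the input data set</param>
        /// <param name="g">The compute graph instance</param>
        /// <returns>The layer-normalized encoder output tensor</returns>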
        public IWeightTensor Encode(IWeightTensor inputs, IWeightTensor selfMask, IWeightTensor dimMask, int batchSize, IComputeGraph g)
        {
            // The input carries batchSize * sequenceLength rows, so dividing recovers the length.
            int sequenceLength = inputs.Rows / batchSize;

            // The position matrix is built once per call and repeated for every sequence in the batch.
            // Gradients are requested for the inputs only (runGradientW2); both temporaries are
            // disposed as soon as they have been combined with the inputs.
            using (IWeightTensor positions = g.BuildPositionMatrix(sequenceLength, m_inputDim))
            using (IWeightTensor batchedPositions = g.RepeatRows(positions, batchSize, runGradient: false))
            {
                float embeddingScale = (float)Math.Sqrt(m_inputDim);
                inputs = g.AddMul(batchedPositions, inputs, embeddingScale, runGradientW1: false, runGradientW2: true);
            }

            inputs = g.Dropout(inputs, batchSize, m_dropoutRatio, inPlace: true);

            // The mask is viewed as { 1, batch, length, length }, expanded across the attention
            // heads and finally flattened to two dimensions.
            var mask4D      = g.View(selfMask, dims: new long[] { 1, batchSize, sequenceLength, sequenceLength });
            var maskPerHead = g.Expand(mask4D, dims: new long[] { m_multiHeadNum, batchSize, sequenceLength, sequenceLength });
            var maskFlat    = g.View(maskPerHead, dims: new long[] { m_multiHeadNum * batchSize * sequenceLength, sequenceLength });

            // Only the flattened mask is needed from here on; release the intermediates right away.
            mask4D.Dispose();
            maskPerHead.Dispose();

            IWeightTensor hidden = inputs;
            using (IComputeGraph layerGraph = g.CreateSubGraph($"{m_name}_Encoder"))
            {
                for (int layer = 0; layer < m_encoders.Count; layer++)
                {
                    // Fill the entries selected by dimMask with 0 before every layer
                    // (presumably padding positions - confirm against the caller).
                    hidden = g.MaskFill(hidden, dimMask, 0.0f);

                    // Self-attention followed by the m_posFFNs layer.
                    hidden = m_encoders[layer].Perform(hidden, hidden, hidden, maskFlat, batchSize, layerGraph);
                    hidden = m_posFFNs[layer].Perform(hidden, batchSize, layerGraph);
                }

                // Unbind the result before the sub-graph is disposed; it is still needed below.
                hidden.UnbindFromComputeGraph();
            }

            return layerNorm.Norm(hidden, g);
        }