Code Example #1
        public LSTMAttentionDecoderCell(string name, int batchSize, int hdim, int dim, int contextSize, int deviceId)
        {
            m_name      = name;
            m_hdim      = hdim;
            m_dim       = dim;
            m_deviceId  = deviceId;
            m_batchSize = batchSize;

            m_Wxhc = new WeightTensor(new long[2] {
                dim + hdim + contextSize, hdim * 4
            }, deviceId, normal: true, name: $"{name}.{nameof(m_Wxhc)}", isTrainable: true);
            m_b = new WeightTensor(new long[2] {
                1, hdim * 4
            }, 0, deviceId, name: $"{name}.{nameof(m_b)}", isTrainable: true);

            Hidden = new WeightTensor(new long[2] {
                batchSize, hdim
            }, 0, deviceId, name: $"{name}.{nameof(Hidden)}", isTrainable: true);
            Cell = new WeightTensor(new long[2] {
                batchSize, hdim
            }, 0, deviceId, name: $"{name}.{nameof(Cell)}", isTrainable: true);

            layerNorm1 = new LayerNormalization($"{name}.{nameof(layerNorm1)}", hdim * 4, deviceId);
            layerNorm2 = new LayerNormalization($"{name}.{nameof(layerNorm2)}", hdim, deviceId);
        }
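A minimal instantiation sketch; all dimension values below are hypothetical and chosen only for illustration. The cell concatenates the input embedding, the previous hidden state, and the attention context, which is why m_Wxhc has dim + hdim + contextSize rows and hdim * 4 columns (one block per LSTM gate).

        // Hypothetical dimensions for illustration only.
        var decoderCell = new LSTMAttentionDecoderCell(
            name: "Decoder.LSTMAttn_0",
            batchSize: 32,
            hdim: 512,          // hidden size; gate weights are hdim * 4 wide
            dim: 512,           // input embedding size
            contextSize: 1024,  // attention context vector size
            deviceId: 0);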
Code Example #2
        public TransformerEncoder(string name, int multiHeadNum, int hiddenDim, int inputDim, int depth, float dropoutRatio, int deviceId, bool isTrainable)
        {
            Logger.WriteLine($"Creating transformer encoder at device '{deviceId}'. HiddenDim = '{hiddenDim}', InputDim = '{inputDim}', Depth = '{depth}', MultiHeadNum = '{multiHeadNum}'");

            m_name         = name;
            m_multiHeadNum = multiHeadNum;
            m_hiddenDim    = hiddenDim;
            m_inputDim     = inputDim;
            m_depth        = depth;
            m_dropoutRatio = dropoutRatio;
            m_deviceId     = deviceId;
            m_isTrainable  = isTrainable;

            if (hiddenDim != inputDim)
            {
                throw new ArgumentException("hiddenDim is not equal to inputDim in TransformerEncoder.");
            }

            m_encoders.Add(new MultiHeadAttention($"{name}.SelfAttn_0", multiHeadNum, hiddenDim, inputDim, m_dropoutRatio, deviceId, isTrainable: isTrainable));
            for (int i = 1; i < depth; i++)
            {
                m_encoders.Add(new MultiHeadAttention($"{name}.SelfAttn_{i}", multiHeadNum, hiddenDim, hiddenDim, m_dropoutRatio, deviceId, isTrainable: isTrainable));
            }

            for (int i = 0; i < depth; i++)
            {
                m_posFFNs.Add(new PositionwiseFeedForward($"{name}.PosFFN_{i}", hiddenDim, m_dropoutRatio, deviceId, isTrainable));
            }

            layerNorm = new LayerNormalization($"{name}.{nameof(layerNorm)}", hiddenDim, deviceId, isTrainable);
        }
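A hedged usage sketch with hypothetical hyperparameters; because of the guard above, hiddenDim must equal inputDim, so the example passes the same value for both.

        // Hypothetical hyperparameters; hiddenDim must match inputDim or the
        // constructor throws ArgumentException.
        var encoder = new TransformerEncoder(name: "Encoder", multiHeadNum: 8,
            hiddenDim: 512, inputDim: 512, depth: 6, dropoutRatio: 0.1f,
            deviceId: 0, isTrainable: true);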
Code Example #3
        public LSTMAttentionDecoderCell(int batchSize, int hdim, int dim, ArchTypeEnums archType, int deviceId)
        {
            int contextSize = hdim * 2;

            this.hdim  = hdim;
            this.dim   = dim;
            m_deviceId = deviceId;

            m_batchSize = batchSize;

            if (archType == ArchTypeEnums.GPU_CUDA)
            {
                Wxhc = new WeightTensor(dim + hdim + contextSize, hdim * 4, deviceId, true);
                b    = new WeightTensor(1, hdim * 4, 0, deviceId);

                this.ht = new WeightTensor(batchSize, hdim, 0, deviceId);
                this.ct = new WeightTensor(batchSize, hdim, 0, deviceId);
            }
            else
            {
                Wxhc = new WeightMatrix(dim + hdim + contextSize, hdim * 4, true);
                b    = new WeightMatrix(1, hdim * 4, 0);

                this.ht = new WeightMatrix(batchSize, hdim, 0);
                this.ct = new WeightMatrix(batchSize, hdim, 0);
            }

            layerNorm1 = new LayerNormalization(hdim * 4, archType, deviceId);
            layerNorm2 = new LayerNormalization(hdim, archType, deviceId);
        }
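In this older API the compute backend is chosen by the archType argument, and the attention context size is derived internally as hdim * 2. A sketch with hypothetical dimensions:

        // ArchTypeEnums.GPU_CUDA selects the GPU-backed WeightTensor weights;
        // any other value falls back to the CPU-side WeightMatrix.
        var gpuCell = new LSTMAttentionDecoderCell(batchSize: 32, hdim: 256,
            dim: 256, archType: ArchTypeEnums.GPU_CUDA, deviceId: 0);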
Code Example #4
        public PositionwiseFeedForward(string name, int hiddenDim, float dropoutRatio, int deviceId, bool isTrainable, float learningRateFactor = 1.0f)
        {
            m_name         = name;
            m_dropoutRatio = dropoutRatio;

            layerNorm2        = new LayerNormalization($"{name}.{nameof(layerNorm2)}", hiddenDim, deviceId, isTrainable, learningRateFactor: learningRateFactor);
            feedForwardLayer1 = new FeedForwardLayer($"{name}.{nameof(feedForwardLayer1)}", hiddenDim, hiddenDim * 4, m_dropoutRatio, deviceId, isTrainable, learningRateFactor: learningRateFactor);
            feedForwardLayer2 = new FeedForwardLayer($"{name}.{nameof(feedForwardLayer2)}", hiddenDim * 4, hiddenDim, m_dropoutRatio, deviceId, isTrainable, learningRateFactor: learningRateFactor);
        }
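The two inner layers form the standard Transformer feed-forward block, projecting hiddenDim up to hiddenDim * 4 and back down. A hypothetical instantiation:

        // learningRateFactor scales this layer's effective learning rate;
        // all values here are illustrative.
        var ffn = new PositionwiseFeedForward("Encoder.PosFFN_0", hiddenDim: 512,
            dropoutRatio: 0.1f, deviceId: 0, isTrainable: true,
            learningRateFactor: 1.0f);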
Code Example #5
        public PositionwiseFeedForward(string name, int hiddenDim, float dropoutRatio, int deviceId, bool isTrainable)
        {
            this.m_name         = name;
            this.m_hiddenDim    = hiddenDim;
            this.m_dropoutRatio = dropoutRatio;

            this.layerNorm2        = new LayerNormalization($"{name}.{nameof(this.layerNorm2)}", hiddenDim, deviceId, isTrainable);
            this.feedForwardLayer1 = new FeedForwardLayer($"{name}.{nameof(this.feedForwardLayer1)}", hiddenDim, hiddenDim * 4, this.m_dropoutRatio, deviceId, isTrainable);
            this.feedForwardLayer2 = new FeedForwardLayer($"{name}.{nameof(this.feedForwardLayer2)}", hiddenDim * 4, hiddenDim, this.m_dropoutRatio, deviceId, isTrainable);
        }
Code Example #6
        public MultiHeadAttention(string name, int multiHeadNum, int hiddenDim, int inputDim, float dropoutRatio, int deviceId, bool isTrainable, bool sharedQKV = false, float learningRateFactor = 1.0f)
        {
            m_name         = name;
            m_hiddenDim    = hiddenDim;
            m_multiHeadNum = multiHeadNum;
            m_d            = m_hiddenDim / m_multiHeadNum;
            m_dropoutRatio = dropoutRatio;
            m_sharedQKV    = sharedQKV;

            W0 = new WeightTensor(new long[2] {
                hiddenDim, hiddenDim
            }, deviceId, name: $"{name}.{nameof(W0)}", isTrainable: isTrainable, normType: NormType.Uniform, learningRateFactor: learningRateFactor);
            b0 = new WeightTensor(new long[2] {
                1, hiddenDim
            }, 0, deviceId, name: $"{name}.{nameof(b0)}", isTrainable: isTrainable);

            if (m_sharedQKV == false)
            {
                Q = new WeightTensor(new long[2] {
                    inputDim, hiddenDim
                }, deviceId, name: $"{name}.{nameof(Q)}", isTrainable: isTrainable, normType: NormType.Uniform, learningRateFactor: learningRateFactor);
                Qb = new WeightTensor(new long[2] {
                    1, hiddenDim
                }, 0, deviceId, name: $"{name}.{nameof(Qb)}", isTrainable: isTrainable, learningRateFactor: learningRateFactor);

                K = new WeightTensor(new long[2] {
                    inputDim, hiddenDim
                }, deviceId, name: $"{name}.{nameof(K)}", isTrainable: isTrainable, normType: NormType.Uniform, learningRateFactor: learningRateFactor);
                Kb = new WeightTensor(new long[2] {
                    1, hiddenDim
                }, 0, deviceId, name: $"{name}.{nameof(Kb)}", isTrainable: isTrainable, learningRateFactor: learningRateFactor);

                V = new WeightTensor(new long[2] {
                    inputDim, hiddenDim
                }, deviceId, name: $"{name}.{nameof(V)}", isTrainable: isTrainable, normType: NormType.Uniform, learningRateFactor: learningRateFactor);
                Vb = new WeightTensor(new long[2] {
                    1, hiddenDim
                }, 0, deviceId, name: $"{name}.{nameof(Vb)}", isTrainable: isTrainable, learningRateFactor: learningRateFactor);
            }
            else
            {
                QKV = new WeightTensor(new long[2] {
                    inputDim, hiddenDim * 3
                }, deviceId, name: $"{name}.{nameof(QKV)}", isTrainable: isTrainable, normType: NormType.Uniform, learningRateFactor: learningRateFactor);
                QKVb = new WeightTensor(new long[2] {
                    1, hiddenDim * 3
                }, 0, deviceId, name: $"{name}.{nameof(QKVb)}", isTrainable: isTrainable, learningRateFactor: learningRateFactor);
            }

            layerNormQ = new LayerNormalization($"{name}.{nameof(layerNormQ)}", m_hiddenDim, deviceId, isTrainable, learningRateFactor: learningRateFactor);
        }
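When sharedQKV is true, the three projections collapse into a single inputDim x (hiddenDim * 3) matrix so queries, keys, and values come out of one matmul; otherwise separate Q, K, and V weights are created. A hedged sketch of both modes, with hypothetical sizes:

        // Separate Q/K/V projections, e.g. for encoder-decoder attention:
        var encAttn = new MultiHeadAttention("Decoder.EncAttn_0", multiHeadNum: 8,
            hiddenDim: 512, inputDim: 512, dropoutRatio: 0.1f, deviceId: 0,
            isTrainable: true);

        // Fused QKV projection for self-attention: one matmul instead of three.
        var selfAttn = new MultiHeadAttention("Decoder.SelfAttn_0", multiHeadNum: 8,
            hiddenDim: 512, inputDim: 512, dropoutRatio: 0.1f, deviceId: 0,
            isTrainable: true, sharedQKV: true);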
Code Example #7
        //  private readonly bool m_sharedQKV = false;

        public MultiHeadAttention(string name, int multiHeadNum, int hiddenDim, int inputDim, float dropoutRatio, int deviceId, bool isTrainable)
        {
            m_name         = name;
            m_hiddenDim    = hiddenDim;
            m_inputDim     = inputDim;
            m_multiHeadNum = multiHeadNum;
            m_d            = m_hiddenDim / m_multiHeadNum;
            m_dropoutRatio = dropoutRatio;
            // m_sharedQKV = sharedQKV;

            W0 = new WeightTensor(new long[2] {
                hiddenDim, hiddenDim
            }, deviceId, name: $"{name}.{nameof(W0)}", isTrainable: isTrainable, normal: NormType.Uniform);
            b0 = new WeightTensor(new long[2] {
                1, hiddenDim
            }, 0, deviceId, name: $"{name}.{nameof(b0)}", isTrainable: isTrainable);

            //if (m_sharedQKV == false)
            //{
            Q = new WeightTensor(new long[2] {
                inputDim, hiddenDim
            }, deviceId, name: $"{name}.{nameof(Q)}", isTrainable: isTrainable, normal: NormType.Uniform);
            Qb = new WeightTensor(new long[2] {
                1, hiddenDim
            }, 0, deviceId, name: $"{name}.{nameof(Qb)}", isTrainable: isTrainable);

            K = new WeightTensor(new long[2] {
                inputDim, hiddenDim
            }, deviceId, name: $"{name}.{nameof(K)}", isTrainable: isTrainable, normal: NormType.Uniform);
            Kb = new WeightTensor(new long[2] {
                1, hiddenDim
            }, 0, deviceId, name: $"{name}.{nameof(Kb)}", isTrainable: isTrainable);

            V = new WeightTensor(new long[2] {
                inputDim, hiddenDim
            }, deviceId, name: $"{name}.{nameof(V)}", isTrainable: isTrainable, normal: NormType.Uniform);
            Vb = new WeightTensor(new long[2] {
                1, hiddenDim
            }, 0, deviceId, name: $"{name}.{nameof(Vb)}", isTrainable: isTrainable);
            //}
            //else
            //{
            //    QKV = new WeightTensor(new long[] { 3, inputDim, hiddenDim }, deviceId, name: $"{name}.{nameof(QKV)}", isTrainable: isTrainable, normal: NormType.Uniform);
            //}

            layerNormQ = new LayerNormalization($"{name}.{nameof(layerNormQ)}", m_hiddenDim, deviceId, isTrainable);
        }
Code Example #8
        public MultiHeadAttention(string name, int multiHeadNum, int hiddenDim, int inputDim, float dropoutRatio, int deviceId, bool isTrainable, bool sharedQKV = false)
        {
            this.m_name         = name;
            this.m_hiddenDim    = hiddenDim;
            this.m_inputDim     = inputDim;
            this.m_multiHeadNum = multiHeadNum;
            this.m_d            = this.m_hiddenDim / this.m_multiHeadNum;
            this.m_dropoutRatio = dropoutRatio;
            this.m_sharedQKV    = sharedQKV;

            this.W0 = new WeightTensor(new long[2] {
                hiddenDim, hiddenDim
            }, deviceId, $"{name}.{nameof(this.W0)}", isTrainable, NormType.Uniform);
            this.b0 = new WeightTensor(new long[2] {
                1, hiddenDim
            }, 0, deviceId, $"{name}.{nameof(this.b0)}", isTrainable);

            if (this.m_sharedQKV == false)
            {
                this.Q = new WeightTensor(new long[2] {
                    inputDim, hiddenDim
                }, deviceId, $"{name}.{nameof(this.Q)}", isTrainable, NormType.Uniform);
                this.Qb = new WeightTensor(new long[2] {
                    1, hiddenDim
                }, 0, deviceId, $"{name}.{nameof(this.Qb)}", isTrainable);

                this.K = new WeightTensor(new long[2] {
                    inputDim, hiddenDim
                }, deviceId, $"{name}.{nameof(this.K)}", isTrainable, NormType.Uniform);
                this.Kb = new WeightTensor(new long[2] {
                    1, hiddenDim
                }, 0, deviceId, $"{name}.{nameof(this.Kb)}", isTrainable);

                this.V = new WeightTensor(new long[2] {
                    inputDim, hiddenDim
                }, deviceId, $"{name}.{nameof(this.V)}", isTrainable, NormType.Uniform);
                this.Vb = new WeightTensor(new long[2] {
                    1, hiddenDim
                }, 0, deviceId, $"{name}.{nameof(this.Vb)}", isTrainable);
            }
            else
            {
                this.QKV = new WeightTensor(new long[] { 3, inputDim, hiddenDim }, deviceId, $"{name}.{nameof(this.QKV)}", isTrainable, NormType.Uniform);
            }

            this.layerNormQ = new LayerNormalization($"{name}.{nameof(this.layerNormQ)}", this.m_hiddenDim, deviceId, isTrainable);
        }
Code Example #9
File: LSTMCell.cs  Project: zhongkaifu/Seq2SeqSharp
        public LSTMCell(string name, int hdim, int dim, int deviceId, bool isTrainable)
        {
            m_name = name;

            m_Wxh = new WeightTensor(new long[2] {
                dim + hdim, hdim * 4
            }, deviceId, normType: NormType.Uniform, name: $"{name}.{nameof(m_Wxh)}", isTrainable: isTrainable);
            m_b = new WeightTensor(new long[2] {
                1, hdim * 4
            }, 0, deviceId, name: $"{name}.{nameof(m_b)}", isTrainable: isTrainable);

            m_hdim     = hdim;
            m_dim      = dim;
            m_deviceId = deviceId;

            m_layerNorm1 = new LayerNormalization($"{name}.{nameof(m_layerNorm1)}", hdim * 4, deviceId, isTrainable: isTrainable);
            m_layerNorm2 = new LayerNormalization($"{name}.{nameof(m_layerNorm2)}", hdim, deviceId, isTrainable: isTrainable);
        }
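The hdim * 4 width packs the pre-activations of the four LSTM gates (input, forget, output, and cell candidate) into one matrix multiply over the concatenated [input; hidden] vector. A hypothetical instantiation:

        // Illustrative sizes; dim is the input width, hdim the hidden width.
        var lstm = new LSTMCell("Encoder.LSTM_0", hdim: 512, dim: 512,
            deviceId: 0, isTrainable: true);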
Code Example #10
        public LSTMAttentionDecoderCell(string name, int hiddenDim, int inputDim, int contextDim, int deviceId, bool isTrainable)
        {
            m_name      = name;
            m_hiddenDim = hiddenDim;
            m_inputDim  = inputDim;
            m_deviceId  = deviceId;

            Logger.WriteLine($"Create LSTM attention decoder cell '{name}' HiddemDim = '{hiddenDim}', InputDim = '{inputDim}', ContextDim = '{contextDim}', DeviceId = '{deviceId}'");

            m_Wxhc = new WeightTensor(new long[2] {
                inputDim + hiddenDim + contextDim, hiddenDim * 4
            }, deviceId, normType: NormType.Uniform, name: $"{name}.{nameof(m_Wxhc)}", isTrainable: isTrainable);
            m_b = new WeightTensor(new long[2] {
                1, hiddenDim * 4
            }, 0, deviceId, name: $"{name}.{nameof(m_b)}", isTrainable: isTrainable);

            m_layerNorm1 = new LayerNormalization($"{name}.{nameof(m_layerNorm1)}", hiddenDim * 4, deviceId, isTrainable);
            m_layerNorm2 = new LayerNormalization($"{name}.{nameof(m_layerNorm2)}", hiddenDim, deviceId, isTrainable);
        }
Code Example #11
File: LSTMCell.cs  Project: jiaguoxinzhi/Seq2SeqSharp
        public LSTMCell(string name, int batchSize, int hdim, int dim, int deviceId)
        {
            m_name = name;

            m_Wxh = new WeightTensor(new long[2] {
                dim + hdim, hdim * 4
            }, deviceId, normal: true, name: $"{name}.{nameof(m_Wxh)}", isTrainable: true);
            m_b = new WeightTensor(new long[2] {
                1, hdim * 4
            }, 0, deviceId, name: $"{name}.{nameof(m_b)}", isTrainable: true);

            m_hdim      = hdim;
            m_dim       = dim;
            m_batchSize = batchSize;
            m_deviceId  = deviceId;

            m_layerNorm1 = new LayerNormalization($"{name}.{nameof(m_layerNorm1)}", hdim * 4, deviceId);
            m_layerNorm2 = new LayerNormalization($"{name}.{nameof(m_layerNorm2)}", hdim, deviceId);
        }
Code Example #12
        public SelfAttention(string name, int multiHeadNum, int hiddenDim, int inputDim, float dropoutRatio, int deviceId)
        {
            m_name         = name;
            m_hiddenDim    = hiddenDim;
            m_multiHeadNum = multiHeadNum;
            m_d            = m_hiddenDim / m_multiHeadNum;
            m_dropoutRatio = dropoutRatio;

            W0 = new WeightTensor(new long[2] {
                hiddenDim, hiddenDim
            }, deviceId, name: $"{name}.{nameof(W0)}", isTrainable: true);
            b0 = new WeightTensor(new long[2] {
                1, hiddenDim
            }, 0, deviceId, name: $"{name}.{nameof(b0)}", isTrainable: true);

            Q = new WeightTensor(new long[2] {
                inputDim, hiddenDim
            }, deviceId, name: $"{name}.{nameof(Q)}", isTrainable: true);
            Qb = new WeightTensor(new long[2] {
                1, hiddenDim
            }, 0, deviceId, name: $"{name}.{nameof(Qb)}", isTrainable: true);

            K = new WeightTensor(new long[2] {
                inputDim, hiddenDim
            }, deviceId, name: $"{name}.{nameof(K)}", isTrainable: true);
            Kb = new WeightTensor(new long[2] {
                1, hiddenDim
            }, 0, deviceId, name: $"{name}.{nameof(Kb)}", isTrainable: true);

            V = new WeightTensor(new long[2] {
                inputDim, hiddenDim
            }, deviceId, name: $"{name}.{nameof(V)}", isTrainable: true);
            Vb = new WeightTensor(new long[2] {
                1, hiddenDim
            }, 0, deviceId, name: $"{name}.{nameof(Vb)}", isTrainable: true);


            layerNorm1        = new LayerNormalization($"{name}.{nameof(layerNorm1)}", hiddenDim, deviceId);
            layerNorm2        = new LayerNormalization($"{name}.{nameof(layerNorm2)}", hiddenDim, deviceId);
            feedForwardLayer1 = new FeedForwardLayer($"{name}.{nameof(feedForwardLayer1)}", hiddenDim, hiddenDim * 4, m_dropoutRatio, deviceId);
            feedForwardLayer2 = new FeedForwardLayer($"{name}.{nameof(feedForwardLayer2)}", hiddenDim * 4, hiddenDim, m_dropoutRatio, deviceId);
        }
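This older SelfAttention layer bundles what the newer code splits apart: multi-head attention (W0 and the Q/K/V projections) plus the position-wise feed-forward pair, all in one class. A hypothetical instantiation:

        var attn = new SelfAttention("Encoder.SelfAttn_0", multiHeadNum: 8,
            hiddenDim: 512, inputDim: 512, dropoutRatio: 0.1f, deviceId: 0);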
Code Example #13
        public TransformerDecoder(string name, int multiHeadNum, int hiddenDim, int inputDim, int depth, float dropoutRatio, int deviceId, bool isTrainable, float learningRateFactor = 1.0f)
        {
            Logger.WriteLine($"Creating transformer decoder at device '{deviceId}'. HiddenDim = '{hiddenDim}', InputDim = '{inputDim}', Depth = '{depth}', MultiHeadNum = '{multiHeadNum}'");

            m_name         = name;
            m_multiHeadNum = multiHeadNum;
            m_hiddenDim    = hiddenDim;
            m_inputDim     = inputDim;
            //    m_outputDim = outputDim;
            m_depth              = depth;
            m_dropoutRatio       = dropoutRatio;
            m_deviceId           = deviceId;
            m_isTrainable        = isTrainable;
            m_learningRateFactor = learningRateFactor;

            if (hiddenDim != inputDim)
            {
                throw new ArgumentException("hiddenDim is not equal to inputDim in TransformerDecoder.");
            }

            m_selfAttns.Add(new MultiHeadAttention($"{name}.SelfAttn_0", multiHeadNum, hiddenDim, inputDim, m_dropoutRatio, deviceId, isTrainable: isTrainable, sharedQKV: true, learningRateFactor: learningRateFactor));
            for (int i = 1; i < depth; i++)
            {
                m_selfAttns.Add(new MultiHeadAttention($"{name}.SelfAttn_{i}", multiHeadNum, hiddenDim, hiddenDim, m_dropoutRatio, deviceId, isTrainable: isTrainable, sharedQKV: true, learningRateFactor: learningRateFactor));
            }

            m_encAttns.Add(new MultiHeadAttention($"{name}.EncAttn_0", multiHeadNum, hiddenDim, inputDim, m_dropoutRatio, deviceId, isTrainable: isTrainable, learningRateFactor: learningRateFactor));
            for (int i = 1; i < depth; i++)
            {
                m_encAttns.Add(new MultiHeadAttention($"{name}.EncAttn_{i}", multiHeadNum, hiddenDim, hiddenDim, m_dropoutRatio, deviceId, isTrainable: isTrainable, learningRateFactor: learningRateFactor));
            }

            for (int i = 0; i < depth; i++)
            {
                m_posFFNs.Add(new PositionwiseFeedForward($"{name}.PosFFN_{i}", hiddenDim, m_dropoutRatio, deviceId, isTrainable, learningRateFactor: learningRateFactor));
            }


            layerNorm = new LayerNormalization($"{name}.{nameof(layerNorm)}", hiddenDim, deviceId, isTrainable, learningRateFactor: learningRateFactor);

            //     m_decoderFFLayer = new FeedForwardLayer($"{name}.FeedForward", hiddenDim, outputDim, 0.0f, deviceId: deviceId, isTrainable: isTrainable);
        }
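A hedged usage sketch mirroring the encoder example: hiddenDim must again equal inputDim, and each of the depth layers gets a fused-QKV self-attention, an encoder attention, and a position-wise feed-forward block.

        // Hypothetical hyperparameters, matching the encoder sketch above.
        var decoder = new TransformerDecoder("Decoder", multiHeadNum: 8,
            hiddenDim: 512, inputDim: 512, depth: 6, dropoutRatio: 0.1f,
            deviceId: 0, isTrainable: true, learningRateFactor: 1.0f);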
Code Example #14
        public LSTMCell(int batchSize, int hdim, int dim, ArchTypeEnums archType, int deviceId)
        {
            if (archType == ArchTypeEnums.GPU_CUDA)
            {
                Wxh = new WeightTensor(dim + hdim, hdim * 4, deviceId, true);
                b   = new WeightTensor(1, hdim * 4, 0, deviceId);
            }
            else
            {
                Wxh = new WeightMatrix(dim + hdim, hdim * 4, true);
                b   = new WeightMatrix(1, hdim * 4, 0);
            }

            this.hdim      = hdim;
            this.dim       = dim;
            this.batchSize = batchSize;
            this.deviceId  = deviceId;

            layerNorm1 = new LayerNormalization(hdim * 4, archType, deviceId);
            layerNorm2 = new LayerNormalization(hdim, archType, deviceId);
        }
Code Example #15
        public MultiHeadAttention(string name, int multiHeadNum, int hiddenDim, int inputDim, float dropoutRatio, int deviceId, bool isTrainable)
        {
            m_name         = name;
            m_hiddenDim    = hiddenDim;
            m_multiHeadNum = multiHeadNum;
            m_d            = m_hiddenDim / m_multiHeadNum;
            m_dropoutRatio = dropoutRatio;

            W0 = new WeightTensor(new long[2] {
                hiddenDim, hiddenDim
            }, deviceId, name: $"{name}.{nameof(W0)}", isTrainable: isTrainable, normal: NormType.Uniform);
            b0 = new WeightTensor(new long[2] {
                1, hiddenDim
            }, 0, deviceId, name: $"{name}.{nameof(b0)}", isTrainable: isTrainable);

            Q = new WeightTensor(new long[2] {
                inputDim, hiddenDim
            }, deviceId, name: $"{name}.{nameof(Q)}", isTrainable: isTrainable);
            Qb = new WeightTensor(new long[2] {
                1, hiddenDim
            }, 0, deviceId, name: $"{name}.{nameof(Qb)}", isTrainable: isTrainable);

            K = new WeightTensor(new long[2] {
                inputDim, hiddenDim
            }, deviceId, name: $"{name}.{nameof(K)}", isTrainable: isTrainable);
            Kb = new WeightTensor(new long[2] {
                1, hiddenDim
            }, 0, deviceId, name: $"{name}.{nameof(Kb)}", isTrainable: isTrainable);

            V = new WeightTensor(new long[2] {
                inputDim, hiddenDim
            }, deviceId, name: $"{name}.{nameof(V)}", isTrainable: isTrainable);
            Vb = new WeightTensor(new long[2] {
                1, hiddenDim
            }, 0, deviceId, name: $"{name}.{nameof(Vb)}", isTrainable: isTrainable);


            layerNorm1 = new LayerNormalization($"{name}.{nameof(layerNorm1)}", hiddenDim, deviceId, isTrainable);
        }