/// <summary>
/// Builds a position-wise feed-forward block: one layer normalization followed by two
/// feed-forward layers whose dimension arguments are (hiddenDim, hiddenDim * 4) and
/// (hiddenDim * 4, hiddenDim).
/// </summary>
/// <param name="name">Prefix used to name every sub-layer ("{name}.{fieldName}").</param>
/// <param name="hiddenDim">Dimension passed to the normalization layer and used as outer width of the block.</param>
/// <param name="dropoutRatio">Dropout ratio stored on the block and forwarded to both feed-forward layers.</param>
/// <param name="deviceId">Device identifier forwarded to every sub-layer.</param>
/// <param name="isTrainable">Trainable flag forwarded to every sub-layer.</param>
/// <param name="learningRateFactor">Learning-rate factor forwarded to every sub-layer (defaults to 1.0).</param>
public PositionwiseFeedForward(string name, int hiddenDim, float dropoutRatio, int deviceId, bool isTrainable, float learningRateFactor = 1.0f)
        {
            // Width of the intermediate projection shared by both feed-forward layers.
            int innerDim = hiddenDim * 4;

            this.m_name         = name;
            this.m_dropoutRatio = dropoutRatio;

            // Sub-layers are created in the same fixed order: normalization, expansion, contraction.
            this.layerNorm2 = new LayerNormalization(
                $"{name}.{nameof(this.layerNorm2)}",
                hiddenDim,
                deviceId,
                isTrainable,
                learningRateFactor: learningRateFactor);

            this.feedForwardLayer1 = new FeedForwardLayer(
                $"{name}.{nameof(this.feedForwardLayer1)}",
                hiddenDim,
                innerDim,
                this.m_dropoutRatio,
                deviceId,
                isTrainable,
                learningRateFactor: learningRateFactor);

            this.feedForwardLayer2 = new FeedForwardLayer(
                $"{name}.{nameof(this.feedForwardLayer2)}",
                innerDim,
                hiddenDim,
                this.m_dropoutRatio,
                deviceId,
                isTrainable,
                learningRateFactor: learningRateFactor);
        }
        /// <summary>
        /// Builds a position-wise feed-forward block: one layer normalization followed by two
        /// feed-forward layers whose dimension arguments are (hiddenDim, hiddenDim * 4) and
        /// (hiddenDim * 4, hiddenDim).
        /// </summary>
        /// <param name="name">Prefix used to name every sub-layer ("{name}.{fieldName}").</param>
        /// <param name="hiddenDim">Dimension stored on the block and passed to the sub-layers.</param>
        /// <param name="dropoutRatio">Dropout ratio stored on the block and forwarded to both feed-forward layers.</param>
        /// <param name="deviceId">Device identifier forwarded to every sub-layer.</param>
        /// <param name="isTrainable">Trainable flag forwarded to every sub-layer.</param>
        public PositionwiseFeedForward(string name, int hiddenDim, float dropoutRatio, int deviceId, bool isTrainable)
        {
            // Width of the intermediate projection shared by both feed-forward layers.
            int innerDim = hiddenDim * 4;

            m_name         = name;
            m_hiddenDim    = hiddenDim;
            m_dropoutRatio = dropoutRatio;

            // Sub-layers are created in the same fixed order: normalization, expansion, contraction.
            layerNorm2 = new LayerNormalization(
                $"{name}.{nameof(layerNorm2)}",
                hiddenDim,
                deviceId,
                isTrainable);

            feedForwardLayer1 = new FeedForwardLayer(
                $"{name}.{nameof(feedForwardLayer1)}",
                hiddenDim,
                innerDim,
                m_dropoutRatio,
                deviceId,
                isTrainable);

            feedForwardLayer2 = new FeedForwardLayer(
                $"{name}.{nameof(feedForwardLayer2)}",
                innerDim,
                hiddenDim,
                m_dropoutRatio,
                deviceId,
                isTrainable);
        }
예제 #3
0
        /// <summary>
        /// Allocates, for every configured device, the source/target embeddings, the bi-directional
        /// encoder, the attention decoder and the decoder's output feed-forward layer, then calls
        /// <c>InitWeightsFactory</c>.
        /// </summary>
        private void InitWeights()
        {
            Logger.WriteLine("Initializing weights...");

            int deviceCount = m_deviceIds.Length;

            // One slot per device for every component.
            m_srcEmbedding   = new IWeightMatrix[deviceCount];
            m_tgtEmbedding   = new IWeightMatrix[deviceCount];
            m_biEncoder      = new BiEncoder[deviceCount];
            m_decoder        = new AttentionDecoder[deviceCount];
            m_decoderFFLayer = new FeedForwardLayer[deviceCount];

            for (int slot = 0; slot < deviceCount; ++slot)
            {
                int deviceId = m_deviceIds[slot];
                Logger.WriteLine($"Initializing weights for device '{deviceId}'");

                // CUDA builds back the embeddings with device tensors; every other architecture uses plain matrices.
                bool onCuda = m_archType == ArchTypeEnums.GPU_CUDA;

                m_srcEmbedding[slot] = onCuda
                    ? (IWeightMatrix)new WeightTensor(m_srcIndexToWord.Count, WordVectorSize, deviceId, true)
                    : new WeightMatrix(m_srcIndexToWord.Count, WordVectorSize, true);

                // The target side reserves three extra rows on top of the vocabulary size.
                int tgtVocabSize = m_tgtIndexToWord.Count + 3;

                m_tgtEmbedding[slot] = onCuda
                    ? (IWeightMatrix)new WeightTensor(tgtVocabSize, WordVectorSize, deviceId, true)
                    : new WeightMatrix(tgtVocabSize, WordVectorSize, true);

                Logger.WriteLine($"Initializing encoders and decoders for device '{deviceId}'...");

                m_biEncoder[slot]      = new BiEncoder(m_batchSize, HiddenSize, WordVectorSize, Depth, m_archType, deviceId);
                m_decoder[slot]        = new AttentionDecoder(m_batchSize, HiddenSize, WordVectorSize, HiddenSize * 2, Depth, m_archType, deviceId);
                m_decoderFFLayer[slot] = new FeedForwardLayer(HiddenSize, tgtVocabSize, m_archType, deviceId);
            }

            InitWeightsFactory();
        }
예제 #4
0
        /// <summary>
        /// Creates the trainable tensors and sub-layers of a self-attention block: the output
        /// projection (W0/b0), the query, key and value projections (Q/Qb, K/Kb, V/Vb), two layer
        /// normalizations and two feed-forward layers.
        /// </summary>
        /// <param name="name">Prefix used to name every tensor and sub-layer ("{name}.{fieldName}").</param>
        /// <param name="multiHeadNum">Number of attention heads; hiddenDim is integer-divided by it to obtain the per-head size.</param>
        /// <param name="hiddenDim">Second dimension of every projection and first dimension of W0.</param>
        /// <param name="inputDim">First dimension of the Q, K and V projections.</param>
        /// <param name="dropoutRatio">Dropout ratio stored on the block and forwarded to the feed-forward layers.</param>
        /// <param name="deviceId">Device identifier forwarded to every tensor and sub-layer.</param>
        public SelfAttention(string name, int multiHeadNum, int hiddenDim, int inputDim, float dropoutRatio, int deviceId)
        {
            this.m_name         = name;
            this.m_hiddenDim    = hiddenDim;
            this.m_multiHeadNum = multiHeadNum;
            this.m_d            = hiddenDim / multiHeadNum; // per-head size (integer division)
            this.m_dropoutRatio = dropoutRatio;

            // Output projection and its bias row.
            this.W0 = new WeightTensor(new long[] { hiddenDim, hiddenDim }, deviceId, name: $"{name}.{nameof(this.W0)}", isTrainable: true);
            this.b0 = new WeightTensor(new long[] { 1, hiddenDim }, 0, deviceId, name: $"{name}.{nameof(this.b0)}", isTrainable: true);

            // Query projection and its bias row.
            this.Q  = new WeightTensor(new long[] { inputDim, hiddenDim }, deviceId, name: $"{name}.{nameof(this.Q)}", isTrainable: true);
            this.Qb = new WeightTensor(new long[] { 1, hiddenDim }, 0, deviceId, name: $"{name}.{nameof(this.Qb)}", isTrainable: true);

            // Key projection and its bias row.
            this.K  = new WeightTensor(new long[] { inputDim, hiddenDim }, deviceId, name: $"{name}.{nameof(this.K)}", isTrainable: true);
            this.Kb = new WeightTensor(new long[] { 1, hiddenDim }, 0, deviceId, name: $"{name}.{nameof(this.Kb)}", isTrainable: true);

            // Value projection and its bias row.
            this.V  = new WeightTensor(new long[] { inputDim, hiddenDim }, deviceId, name: $"{name}.{nameof(this.V)}", isTrainable: true);
            this.Vb = new WeightTensor(new long[] { 1, hiddenDim }, 0, deviceId, name: $"{name}.{nameof(this.Vb)}", isTrainable: true);

            // Normalization layers, then the feed-forward pair (hiddenDim -> 4 * hiddenDim -> hiddenDim).
            int innerDim = hiddenDim * 4;

            this.layerNorm1        = new LayerNormalization($"{name}.{nameof(this.layerNorm1)}", hiddenDim, deviceId);
            this.layerNorm2        = new LayerNormalization($"{name}.{nameof(this.layerNorm2)}", hiddenDim, deviceId);
            this.feedForwardLayer1 = new FeedForwardLayer($"{name}.{nameof(this.feedForwardLayer1)}", hiddenDim, innerDim, this.m_dropoutRatio, deviceId);
            this.feedForwardLayer2 = new FeedForwardLayer($"{name}.{nameof(this.feedForwardLayer2)}", innerDim, hiddenDim, this.m_dropoutRatio, deviceId);
        }
예제 #5
0
        /// <summary>
        /// Creates a transformer decoder: per-layer self-attention, per-layer encoder attention,
        /// per-layer position-wise feed-forward blocks, a final layer normalization and an output
        /// feed-forward layer mapping hiddenDim to outputDim.
        /// </summary>
        /// <param name="name">Prefix used to name every sub-layer.</param>
        /// <param name="multiHeadNum">Number of attention heads forwarded to every attention layer.</param>
        /// <param name="hiddenDim">Hidden dimension of the decoder; must equal <paramref name="inputDim"/>.</param>
        /// <param name="inputDim">Input dimension of the first attention layers; must equal <paramref name="hiddenDim"/>.</param>
        /// <param name="outputDim">Output dimension of the final feed-forward layer.</param>
        /// <param name="depth">Number of layers. The index-0 attention layers are always created, even when depth is 0 or less.</param>
        /// <param name="dropoutRatio">Dropout ratio forwarded to the attention and position-wise layers.</param>
        /// <param name="deviceId">Device identifier forwarded to every sub-layer.</param>
        /// <param name="isTrainable">Trainable flag forwarded to every sub-layer.</param>
        /// <exception cref="ArgumentException">Thrown when hiddenDim differs from inputDim.</exception>
        public TransformerDecoder(string name, int multiHeadNum, int hiddenDim, int inputDim, int outputDim, int depth, float dropoutRatio, int deviceId, bool isTrainable)
        {
            Logger.WriteLine($"Creating transformer decoder at device '{deviceId}'. HiddenDim = '{hiddenDim}', InputDim = '{inputDim}', Depth = '{depth}', MultiHeadNum = '{multiHeadNum}'");

            m_name         = name;
            m_multiHeadNum = multiHeadNum;
            m_hiddenDim    = hiddenDim;
            m_inputDim     = inputDim;
            m_outputDim    = outputDim;
            m_depth        = depth;
            m_dropoutRatio = dropoutRatio;
            m_deviceId     = deviceId;
            m_isTrainable  = isTrainable;

            if (hiddenDim != inputDim)
            {
                // Fixed: the message previously named TransformerEncoder, pointing readers at the wrong class.
                throw new ArgumentException($"hiddenDim is not equal to inputDim in {nameof(TransformerDecoder)}.");
            }

            // Self-attention stack: the first layer consumes inputDim, the remaining ones hiddenDim.
            m_selfAttns.Add(new MultiHeadAttention($"{name}.SelfAttn_0", multiHeadNum, hiddenDim, inputDim, m_dropoutRatio, deviceId, isTrainable: isTrainable));
            for (int i = 1; i < depth; i++)
            {
                m_selfAttns.Add(new MultiHeadAttention($"{name}.SelfAttn_{i}", multiHeadNum, hiddenDim, hiddenDim, m_dropoutRatio, deviceId, isTrainable: isTrainable));
            }

            // Encoder-attention stack, built with the same layout as the self-attention stack.
            m_encAttns.Add(new MultiHeadAttention($"{name}.EncAttn_0", multiHeadNum, hiddenDim, inputDim, m_dropoutRatio, deviceId, isTrainable: isTrainable));
            for (int i = 1; i < depth; i++)
            {
                m_encAttns.Add(new MultiHeadAttention($"{name}.EncAttn_{i}", multiHeadNum, hiddenDim, hiddenDim, m_dropoutRatio, deviceId, isTrainable: isTrainable));
            }

            // One position-wise feed-forward block per layer.
            for (int i = 0; i < depth; i++)
            {
                m_posFFNs.Add(new PositionwiseFeedForward($"{name}.PosFFN_{i}", hiddenDim, m_dropoutRatio, deviceId, isTrainable));
            }


            layerNorm = new LayerNormalization($"{name}.{nameof(layerNorm)}", hiddenDim, deviceId, isTrainable);

            // Output layer is created with a dropout ratio of 0.
            m_decoderFFLayer = new FeedForwardLayer($"{name}.FeedForward", hiddenDim, outputDim, 0.0f, deviceId: deviceId, isTrainable: isTrainable);
        }
예제 #6
0
        /// <summary>
        /// Creates the encoder/decoder pair, then allocates per-device source and target embedding
        /// tensors and the per-device output feed-forward layer, and finally calls
        /// <c>InitWeightsFactory</c>.
        /// </summary>
        private void CreateEncoderDecoderEmbeddings()
        {
            (m_encoder, m_decoder) = CreateEncoderDecoder();

            int deviceCount = m_deviceIds.Length;

            // One slot per device for every component.
            m_srcEmbedding   = new IWeightTensor[deviceCount];
            m_tgtEmbedding   = new IWeightTensor[deviceCount];
            m_decoderFFLayer = new FeedForwardLayer[deviceCount];

            for (int slot = 0; slot < deviceCount; ++slot)
            {
                int deviceId = m_deviceIds[slot];
                Logger.WriteLine($"Initializing weights for device '{deviceId}'");

                long[] srcShape = { m_srcIndexToWord.Count, m_embeddingDim };
                m_srcEmbedding[slot] = new WeightTensor(srcShape, deviceId, normal: true, name: "SrcEmbeddings", isTrainable: true);

                // The target side reserves three extra rows on top of the vocabulary size.
                int tgtVocabSize = m_tgtIndexToWord.Count + 3;

                long[] tgtShape = { tgtVocabSize, m_embeddingDim };
                m_tgtEmbedding[slot] = new WeightTensor(tgtShape, deviceId, normal: true, name: "TgtEmbeddings", isTrainable: true);

                // NOTE(review): other FeedForwardLayer call sites pass a dropout ratio as the fourth
                // argument; confirm that the overload used here really takes the device id in that position.
                m_decoderFFLayer[slot] = new FeedForwardLayer("FeedForward", m_hiddenDim, tgtVocabSize, deviceId);
            }

            InitWeightsFactory();
        }
예제 #7
0
        /// <summary>
        /// Creates an attention-based LSTM decoder: one attention unit, a stack of LSTM attention
        /// decoder cells and an output feed-forward layer mapping hiddenDim to outputDim.
        /// </summary>
        /// <param name="name">Prefix used to name every sub-layer.</param>
        /// <param name="hiddenDim">Hidden dimension of the cells and input dimension of the output layer.</param>
        /// <param name="embeddingDim">Input dimension of the first decoder cell.</param>
        /// <param name="contextDim">Context dimension forwarded to the attention unit and every cell.</param>
        /// <param name="outputDim">Output dimension of the final feed-forward layer.</param>
        /// <param name="dropoutRatio">Dropout ratio stored on the decoder.</param>
        /// <param name="depth">Number of stacked cells. Cell 0 is always created, even when depth is 0 or less.</param>
        /// <param name="deviceId">Device identifier forwarded to every sub-layer.</param>
        /// <param name="enableCoverageModel">Coverage-model flag forwarded to the attention unit.</param>
        /// <param name="isTrainable">Trainable flag forwarded to every sub-layer.</param>
        public AttentionDecoder(string name, int hiddenDim, int embeddingDim, int contextDim, int outputDim, float dropoutRatio, int depth, int deviceId, bool enableCoverageModel, bool isTrainable)
        {
            this.m_name                = name;
            this.m_hdim                = hiddenDim;
            this.m_embDim              = embeddingDim;
            this.m_context             = contextDim;
            this.m_depth               = depth;
            this.m_deviceId            = deviceId;
            this.m_outputDim           = outputDim;
            this.m_dropoutRatio        = dropoutRatio;
            this.m_enableCoverageModel = enableCoverageModel;
            this.m_isTrainable         = isTrainable;

            this.m_attentionLayer = new AttentionUnit($"{name}.AttnUnit", hiddenDim, contextDim, deviceId, enableCoverageModel, isTrainable: isTrainable);

            // do/while guarantees that cell 0 exists regardless of depth. Only that first cell
            // consumes the embedding; deeper cells consume the hidden state of the cell below.
            int layer = 0;
            do
            {
                int cellInputDim = (layer == 0) ? embeddingDim : hiddenDim;
                this.m_decoders.Add(new LSTMAttentionDecoderCell($"{name}.LSTMAttn_{layer}", hiddenDim, cellInputDim, contextDim, deviceId, isTrainable));
                layer++;
            }
            while (layer < depth);

            // Output layer is created with a dropout ratio of 0.
            this.m_decoderFFLayer = new FeedForwardLayer($"{name}.FeedForward", hiddenDim, outputDim, 0.0f, deviceId: deviceId, isTrainable: isTrainable);
        }
예제 #8
0
        /// <summary>
        /// Runs the attention decoder one position at a time over a whole batch, in training or inference mode.
        /// In training mode the gold target words are fed back as the next inputs and the negative
        /// log-likelihood of each gold word is accumulated into the returned cost.
        /// In inference mode the arg-max word of every step is fed back and appended to the output
        /// sentences, for at most 64 steps or until every sentence has produced the end-of-sentence token.
        /// </summary>
        /// <param name="outputSentences">In training mode, they are golden target sentences (the first one is read to get the sequence length, so the list must not be empty); otherwise the list must be empty on entry and is filled with the sentences generated by the decoder</param>
        /// <param name="g">Compute graph used for every tensor operation and for the partial backward passes</param>
        /// <param name="encodedOutputs">Encoder outputs handed to the decoder's attention pre-processing</param>
        /// <param name="decoder">Attention decoder that produces the hidden output of each step</param>
        /// <param name="decoderFFLayer">Feed-forward layer applied to the decoder output before the softmax</param>
        /// <param name="embedding">Target embedding table; one row is looked up per sentence and step</param>
        /// <param name="batchSize">Number of sentences decoded in parallel</param>
        /// <param name="isTraining">True to compute the cost and gradients against the golden sentences, false to generate sentences</param>
        /// <returns>The accumulated cost in training mode; 0 in inference mode, where nothing is added to it</returns>
        /// <exception cref="ArgumentException">Thrown in inference mode when <paramref name="outputSentences"/> is not empty</exception>
        private float Decode(List <List <string> > outputSentences, IComputeGraph g, IWeightTensor encodedOutputs, AttentionDecoder decoder, FeedForwardLayer decoderFFLayer, IWeightTensor embedding,
                             int batchSize, bool isTraining = true)
        {
            float cost = 0.0f;

            // Every sentence starts decoding from the START tag.
            int[] ix_inputs = new int[batchSize];
            for (int i = 0; i < ix_inputs.Length; i++)
            {
                ix_inputs[i] = (int)SENTTAGS.START;
            }

            // Initialize variables according to current mode
            // NOTE(review): PadSentences presumably pads the sentences in place to a common length and
            // returns their lengths before padding - confirm against ParallelCorpus.
            var           originalOutputLengths = isTraining ? ParallelCorpus.PadSentences(outputSentences) : null;
            // Training decodes exactly as many steps as the first sentence holds; inference is capped at 64 steps.
            int           seqLen       = isTraining ? outputSentences[0].Count : 64;
            var           dropoutRatio = isTraining ? m_dropoutRatio : 0.0f;
            // Indices (within the batch) of the sentences that have already emitted EOS; inference only.
            HashSet <int> setEndSentId = isTraining ? null : new HashSet <int>();

            if (!isTraining)
            {
                if (outputSentences.Count != 0)
                {
                    throw new ArgumentException($"The list for output sentences must be empty if current is not in training mode.");
                }
                // Create one empty output sentence per batch entry.
                for (int i = 0; i < batchSize; i++)
                {
                    outputSentences.Add(new List <string>());
                }
            }

            // Pre-process for attention model
            var attPreProcessResult = decoder.PreProcess(encodedOutputs, batchSize, g);

            for (int i = 0; i < seqLen; i++)
            {
                //Get embedding for all sentence in the batch at position i
                List <IWeightTensor> inputs = new List <IWeightTensor>();
                for (int j = 0; j < batchSize; j++)
                {
                    inputs.Add(g.PeekRow(embedding, ix_inputs[j]));
                }
                var inputsM = g.ConcatRows(inputs);

                //Decode output sentence at position i
                var eOutput = decoder.Decode(inputsM, attPreProcessResult, batchSize, g);
                eOutput = g.Dropout(eOutput, batchSize, dropoutRatio, true);
                eOutput = decoderFFLayer.Process(eOutput, batchSize, g);

                //Softmax for output
                using (var probs = g.Softmax(eOutput, runGradients: false, inPlace: true))
                {
                    if (isTraining)
                    {
                        //Calculate loss for each word in the batch
                        for (int k = 0; k < batchSize; k++)
                        {
                            using (var probs_k = g.PeekRow(probs, k, runGradients: false))
                            {
                                var ix_targets_k = m_modelMetaData.Vocab.GetTargetWordIndex(outputSentences[k][i]);
                                var score_k      = probs_k.GetWeightAt(ix_targets_k);
                                // Only positions inside the original (pre-padding) sentence contribute to the cost.
                                if (i < originalOutputLengths[k])
                                {
                                    cost += (float)-Math.Log(score_k);
                                }

                                // Subtract 1 at the gold index, turning the row into (probability - one-hot target).
                                // NOTE(review): presumably probs_k is a view into probs so this write is visible
                                // to CopyWeightsToGradients below - confirm.
                                probs_k.SetWeightAt(score_k - 1, ix_targets_k);
                                // The gold word becomes the decoder input of the next step.
                                ix_inputs[k] = ix_targets_k;
                            }
                        }
                        eOutput.CopyWeightsToGradients(probs);
                    }
                    else
                    {
                        // Output "i"th target word
                        var targetIdx   = g.Argmax(probs, 1);
                        var targetWords = m_modelMetaData.Vocab.ConvertTargetIdsToString(targetIdx.ToList());
                        for (int j = 0; j < targetWords.Count; j++)
                        {
                            // Sentences that already ended receive no further words.
                            if (setEndSentId.Contains(j) == false)
                            {
                                outputSentences[j].Add(targetWords[j]);

                                // EOS itself is appended, then the sentence is marked as finished.
                                if (targetWords[j] == ParallelCorpus.EOS)
                                {
                                    setEndSentId.Add(j);
                                }
                            }
                        }

                        // The predicted words become the decoder inputs of the next step.
                        ix_inputs = targetIdx;
                    }
                }

                if (isTraining)
                {
                    // Hacky: Run backward for last feed forward layer and dropout layer in order to save memory usage, since it's not time sequence dependency
                    g.RunTopBackward();
                    // The second backward step is only issued when the configured dropout ratio is positive.
                    if (m_dropoutRatio > 0.0f)
                    {
                        g.RunTopBackward();
                    }
                }
                else
                {
                    if (setEndSentId.Count == batchSize)
                    {
                        // All target sentences in current batch are finished, so we exit.
                        break;
                    }
                }
            }

            return(cost);
        }
예제 #9
0
        /// <summary>
        /// Decode output sentences in training: feeds the golden target words back into the decoder
        /// step by step, accumulates the negative log-likelihood of every golden word (plus the END
        /// tag at the extra final step) and sets the gradient of the output layer at each step.
        /// </summary>
        /// <param name="outputSentences">Golden target sentences; at least m_batchSize of them are read</param>
        /// <param name="g">Compute graph used for every matrix operation and for the partial backward passes</param>
        /// <param name="encodedOutputs">Encoder outputs handed to the decoder's attention pre-processing</param>
        /// <param name="decoder">Attention decoder that produces the hidden output of each step</param>
        /// <param name="decoderFFLayer">Feed-forward layer applied to the decoder output before the softmax</param>
        /// <param name="Embedding">Target embedding table; one row is looked up per sentence and step</param>
        /// <param name="predictSentence">Always set to null by this method</param>
        /// <returns>The accumulated cost over the batch</returns>
        private float Decode(List <List <string> > outputSentences, IComputeGraph g, IWeightMatrix encodedOutputs, AttentionDecoder decoder, FeedForwardLayer decoderFFLayer, IWeightMatrix Embedding, out List <List <string> > predictSentence)
        {
            predictSentence = null;
            float cost = 0.0f;
            var   attPreProcessResult = decoder.PreProcess(encodedOutputs, g);

            // NOTE(review): PadSentences presumably pads the sentences in place to a common length and
            // returns their lengths before padding - confirm.
            var originalOutputLengths = PadSentences(outputSentences);
            int seqLen = outputSentences[0].Count;

            // Every sentence starts decoding from the START tag.
            int[] ix_inputs  = new int[m_batchSize];
            int[] ix_targets = new int[m_batchSize];
            for (int i = 0; i < ix_inputs.Length; i++)
            {
                ix_inputs[i] = (int)SENTTAGS.START;
            }

            // One extra step beyond the padded length, whose target is the END tag.
            for (int i = 0; i < seqLen + 1; i++)
            {
                //Get embedding for all sentence in the batch at position i
                List <IWeightMatrix> inputs = new List <IWeightMatrix>();
                for (int j = 0; j < m_batchSize; j++)
                {
                    List <string> OutputSentence = outputSentences[j];

                    // Default to UNK; replaced by END at the extra step or by the word's index when it is in the vocabulary.
                    ix_targets[j] = (int)SENTTAGS.UNK;
                    if (i >= seqLen)
                    {
                        ix_targets[j] = (int)SENTTAGS.END;
                    }
                    else
                    {
                        if (m_tgtWordToIndex.ContainsKey(OutputSentence[i]))
                        {
                            ix_targets[j] = m_tgtWordToIndex[OutputSentence[i]];
                        }
                    }

                    var x = g.PeekRow(Embedding, ix_inputs[j]);

                    inputs.Add(x);
                }

                var inputsM = g.ConcatRows(inputs);

                //Decode output sentence at position i
                var eOutput = decoder.Decode(inputsM, attPreProcessResult, g);
                if (m_dropoutRatio > 0.0f)
                {
                    eOutput = g.Dropout(eOutput, m_dropoutRatio);
                }

                var o = decoderFFLayer.Process(eOutput, g);

                //Softmax for output
//                var o = g.MulAdd(eOutput, Whd, bds);
                var probs = g.Softmax(o, false);

                // NOTE(review): presumably frees o's weight storage now that probs has been computed;
                // o itself is still used below to receive the gradient - confirm.
                o.ReleaseWeight();

                //Calculate loss for each word in the batch
                List <IWeightMatrix> probs_g = g.UnFolderRow(probs, m_batchSize, false);
                for (int k = 0; k < m_batchSize; k++)
                {
                    var probs_k = probs_g[k];
                    var score_k = probs_k.GetWeightAt(ix_targets[k]);

                    // Positions up to and including the one right after the original sentence contribute to the cost.
                    if (i < originalOutputLengths[k] + 1)
                    {
                        cost += (float)-Math.Log(score_k);
                    }

                    // Subtract 1 at the target index, turning the row into (probability - one-hot target).
                    // NOTE(review): presumably probs_k shares storage with probs so this write is visible
                    // to SetGradientByWeight below - confirm.
                    probs_k.SetWeightAt(score_k - 1, ix_targets[k]);

                    // The target word becomes the decoder input of the next step.
                    ix_inputs[k] = ix_targets[k];
                    probs_k.Dispose();
                }

                o.SetGradientByWeight(probs);

                //Hacky: Run backward for last feed forward layer and dropout layer in order to save memory usage, since it's not time sequence dependency
                // Two backward steps are always issued; a third one only when dropout was applied above.
                g.RunTopBackward();
                g.RunTopBackward();
                if (m_dropoutRatio > 0.0f)
                {
                    g.RunTopBackward();
                }
            }

            return(cost);
        }