public PositionwiseFeedForward(string name, int hiddenDim, float dropoutRatio, int deviceId, bool isTrainable, float learningRateFactor = 1.0f)
{
    m_name = name;
    m_dropoutRatio = dropoutRatio;

    layerNorm2 = new LayerNormalization($"{name}.{nameof(layerNorm2)}", hiddenDim, deviceId, isTrainable, learningRateFactor: learningRateFactor);
    feedForwardLayer1 = new FeedForwardLayer($"{name}.{nameof(feedForwardLayer1)}", hiddenDim, hiddenDim * 4, m_dropoutRatio, deviceId, isTrainable, learningRateFactor: learningRateFactor);
    feedForwardLayer2 = new FeedForwardLayer($"{name}.{nameof(feedForwardLayer2)}", hiddenDim * 4, hiddenDim, m_dropoutRatio, deviceId, isTrainable, learningRateFactor: learningRateFactor);
}
public PositionwiseFeedForward(string name, int hiddenDim, float dropoutRatio, int deviceId, bool isTrainable)
{
    this.m_name = name;
    this.m_hiddenDim = hiddenDim;
    this.m_dropoutRatio = dropoutRatio;

    this.layerNorm2 = new LayerNormalization($"{name}.{nameof(this.layerNorm2)}", hiddenDim, deviceId, isTrainable);
    this.feedForwardLayer1 = new FeedForwardLayer($"{name}.{nameof(this.feedForwardLayer1)}", hiddenDim, hiddenDim * 4, this.m_dropoutRatio, deviceId, isTrainable);
    this.feedForwardLayer2 = new FeedForwardLayer($"{name}.{nameof(this.feedForwardLayer2)}", hiddenDim * 4, hiddenDim, this.m_dropoutRatio, deviceId, isTrainable);
}
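// Illustration only: a minimal, self-contained sketch of the data flow that the two
// PositionwiseFeedForward constructors above wire up -- project hiddenDim -> 4 * hiddenDim,
// apply an activation, then project back 4 * hiddenDim -> hiddenDim. This is not
// Seq2SeqSharp's API: layer normalization, dropout and the residual connection are omitted,
// ReLU is assumed as the activation, and MatMul/Relu are hypothetical helpers written only
// for this sketch.
internal static class PositionwiseFeedForwardSketch
{
    public static float[] Forward(float[] x, float[,] w1, float[,] w2)
    {
        float[] hidden = Relu(MatMul(x, w1)); // [hiddenDim] x [hiddenDim, 4*hiddenDim] -> [4*hiddenDim]
        return MatMul(hidden, w2);            // [4*hiddenDim] x [4*hiddenDim, hiddenDim] -> [hiddenDim]
    }

    private static float[] MatMul(float[] v, float[,] m)
    {
        int rows = m.GetLength(0), cols = m.GetLength(1);
        float[] result = new float[cols];
        for (int c = 0; c < cols; c++)
        {
            for (int r = 0; r < rows; r++)
            {
                result[c] += v[r] * m[r, c];
            }
        }
        return result;
    }

    private static float[] Relu(float[] v)
    {
        float[] result = new float[v.Length];
        for (int i = 0; i < v.Length; i++)
        {
            result[i] = System.Math.Max(0.0f, v[i]);
        }
        return result;
    }
}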
private void InitWeights()
{
    Logger.WriteLine($"Initializing weights...");

    m_srcEmbedding = new IWeightMatrix[m_deviceIds.Length];
    m_tgtEmbedding = new IWeightMatrix[m_deviceIds.Length];
    m_biEncoder = new BiEncoder[m_deviceIds.Length];
    m_decoder = new AttentionDecoder[m_deviceIds.Length];
    m_decoderFFLayer = new FeedForwardLayer[m_deviceIds.Length];

    for (int i = 0; i < m_deviceIds.Length; i++)
    {
        Logger.WriteLine($"Initializing weights for device '{m_deviceIds[i]}'");
        if (m_archType == ArchTypeEnums.GPU_CUDA)
        {
            //m_Whd[i] = new WeightTensor(HiddenSize, m_tgtIndexToWord.Count + 3, m_deviceIds[i], true);
            //m_bd[i] = new WeightTensor(1, m_tgtIndexToWord.Count + 3, 0, m_deviceIds[i]);

            m_srcEmbedding[i] = new WeightTensor(m_srcIndexToWord.Count, WordVectorSize, m_deviceIds[i], true);
            m_tgtEmbedding[i] = new WeightTensor(m_tgtIndexToWord.Count + 3, WordVectorSize, m_deviceIds[i], true);
        }
        else
        {
            //m_Whd[i] = new WeightMatrix(HiddenSize, m_tgtIndexToWord.Count + 3, true);
            //m_bd[i] = new WeightMatrix(1, m_tgtIndexToWord.Count + 3, 0);

            m_srcEmbedding[i] = new WeightMatrix(m_srcIndexToWord.Count, WordVectorSize, true);
            m_tgtEmbedding[i] = new WeightMatrix(m_tgtIndexToWord.Count + 3, WordVectorSize, true);
        }

        Logger.WriteLine($"Initializing encoders and decoders for device '{m_deviceIds[i]}'...");

        m_biEncoder[i] = new BiEncoder(m_batchSize, HiddenSize, WordVectorSize, Depth, m_archType, m_deviceIds[i]);
        m_decoder[i] = new AttentionDecoder(m_batchSize, HiddenSize, WordVectorSize, HiddenSize * 2, Depth, m_archType, m_deviceIds[i]);
        m_decoderFFLayer[i] = new FeedForwardLayer(HiddenSize, m_tgtIndexToWord.Count + 3, m_archType, m_deviceIds[i]);
    }

    InitWeightsFactory();
}
public SelfAttention(string name, int multiHeadNum, int hiddenDim, int inputDim, float dropoutRatio, int deviceId)
{
    m_name = name;
    m_hiddenDim = hiddenDim;
    m_multiHeadNum = multiHeadNum;
    m_d = m_hiddenDim / m_multiHeadNum;
    m_dropoutRatio = dropoutRatio;

    // Output projection applied after the attention heads are merged
    W0 = new WeightTensor(new long[2] { hiddenDim, hiddenDim }, deviceId, name: $"{name}.{nameof(W0)}", isTrainable: true);
    b0 = new WeightTensor(new long[2] { 1, hiddenDim }, 0, deviceId, name: $"{name}.{nameof(b0)}", isTrainable: true);

    // Query, key and value projections
    Q = new WeightTensor(new long[2] { inputDim, hiddenDim }, deviceId, name: $"{name}.{nameof(Q)}", isTrainable: true);
    Qb = new WeightTensor(new long[2] { 1, hiddenDim }, 0, deviceId, name: $"{name}.{nameof(Qb)}", isTrainable: true);

    K = new WeightTensor(new long[2] { inputDim, hiddenDim }, deviceId, name: $"{name}.{nameof(K)}", isTrainable: true);
    Kb = new WeightTensor(new long[2] { 1, hiddenDim }, 0, deviceId, name: $"{name}.{nameof(Kb)}", isTrainable: true);

    V = new WeightTensor(new long[2] { inputDim, hiddenDim }, deviceId, name: $"{name}.{nameof(V)}", isTrainable: true);
    Vb = new WeightTensor(new long[2] { 1, hiddenDim }, 0, deviceId, name: $"{name}.{nameof(Vb)}", isTrainable: true);

    layerNorm1 = new LayerNormalization($"{name}.{nameof(layerNorm1)}", hiddenDim, deviceId);
    layerNorm2 = new LayerNormalization($"{name}.{nameof(layerNorm2)}", hiddenDim, deviceId);

    feedForwardLayer1 = new FeedForwardLayer($"{name}.{nameof(feedForwardLayer1)}", hiddenDim, hiddenDim * 4, m_dropoutRatio, deviceId);
    feedForwardLayer2 = new FeedForwardLayer($"{name}.{nameof(feedForwardLayer2)}", hiddenDim * 4, hiddenDim, m_dropoutRatio, deviceId);
}
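// Illustration only: a minimal, self-contained sketch of the scaled dot-product attention
// that the Q/K/V/W0 weights above parameterize. For a single query vector and a single head
// of size d (m_d = hiddenDim / multiHeadNum), the scores are q.k_t / sqrt(d), softmax-normalized,
// and used to mix the value vectors. The input projections, the multi-head split/merge, the W0
// output projection and the residual/normalization steps are omitted; this is not the library's API.
internal static class ScaledDotProductAttentionSketch
{
    public static float[] Attend(float[] query, float[][] keys, float[][] values, int d)
    {
        // score_t = (query . key_t) / sqrt(d)
        float[] scores = new float[keys.Length];
        for (int t = 0; t < keys.Length; t++)
        {
            float dot = 0.0f;
            for (int i = 0; i < d; i++)
            {
                dot += query[i] * keys[t][i];
            }
            scores[t] = dot / (float)System.Math.Sqrt(d);
        }

        // Softmax over time steps (max-subtracted for numerical stability)
        float max = float.NegativeInfinity;
        for (int t = 0; t < scores.Length; t++)
        {
            max = System.Math.Max(max, scores[t]);
        }
        float sum = 0.0f;
        for (int t = 0; t < scores.Length; t++)
        {
            scores[t] = (float)System.Math.Exp(scores[t] - max);
            sum += scores[t];
        }
        for (int t = 0; t < scores.Length; t++)
        {
            scores[t] /= sum;
        }

        // Weighted sum of the value vectors
        float[] output = new float[d];
        for (int t = 0; t < values.Length; t++)
        {
            for (int i = 0; i < d; i++)
            {
                output[i] += scores[t] * values[t][i];
            }
        }
        return output;
    }
}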
public TransformerDecoder(string name, int multiHeadNum, int hiddenDim, int inputDim, int outputDim, int depth, float dropoutRatio, int deviceId, bool isTrainable)
{
    Logger.WriteLine($"Creating transformer decoder at device '{deviceId}'. HiddenDim = '{hiddenDim}', InputDim = '{inputDim}', Depth = '{depth}', MultiHeadNum = '{multiHeadNum}'");

    m_name = name;
    m_multiHeadNum = multiHeadNum;
    m_hiddenDim = hiddenDim;
    m_inputDim = inputDim;
    m_outputDim = outputDim;
    m_depth = depth;
    m_dropoutRatio = dropoutRatio;
    m_deviceId = deviceId;
    m_isTrainable = isTrainable;

    if (hiddenDim != inputDim)
    {
        throw new ArgumentException("hiddenDim must be equal to inputDim in TransformerDecoder.");
    }

    // Self-attention layers
    m_selfAttns.Add(new MultiHeadAttention($"{name}.SelfAttn_0", multiHeadNum, hiddenDim, inputDim, m_dropoutRatio, deviceId, isTrainable: isTrainable));
    for (int i = 1; i < depth; i++)
    {
        m_selfAttns.Add(new MultiHeadAttention($"{name}.SelfAttn_{i}", multiHeadNum, hiddenDim, hiddenDim, m_dropoutRatio, deviceId, isTrainable: isTrainable));
    }

    // Encoder-decoder attention layers
    m_encAttns.Add(new MultiHeadAttention($"{name}.EncAttn_0", multiHeadNum, hiddenDim, inputDim, m_dropoutRatio, deviceId, isTrainable: isTrainable));
    for (int i = 1; i < depth; i++)
    {
        m_encAttns.Add(new MultiHeadAttention($"{name}.EncAttn_{i}", multiHeadNum, hiddenDim, hiddenDim, m_dropoutRatio, deviceId, isTrainable: isTrainable));
    }

    // Position-wise feed-forward layers
    for (int i = 0; i < depth; i++)
    {
        m_posFFNs.Add(new PositionwiseFeedForward($"{name}.PosFFN_{i}", hiddenDim, m_dropoutRatio, deviceId, isTrainable));
    }

    layerNorm = new LayerNormalization($"{name}.{nameof(layerNorm)}", hiddenDim, deviceId, isTrainable);
    m_decoderFFLayer = new FeedForwardLayer($"{name}.FeedForward", hiddenDim, outputDim, 0.0f, deviceId: deviceId, isTrainable: isTrainable);
}
private void CreateEncoderDecoderEmbeddings()
{
    (m_encoder, m_decoder) = CreateEncoderDecoder();

    m_srcEmbedding = new IWeightTensor[m_deviceIds.Length];
    m_tgtEmbedding = new IWeightTensor[m_deviceIds.Length];
    m_decoderFFLayer = new FeedForwardLayer[m_deviceIds.Length];

    for (int i = 0; i < m_deviceIds.Length; i++)
    {
        Logger.WriteLine($"Initializing weights for device '{m_deviceIds[i]}'");

        m_srcEmbedding[i] = new WeightTensor(new long[2] { m_srcIndexToWord.Count, m_embeddingDim }, m_deviceIds[i], normal: true, name: "SrcEmbeddings", isTrainable: true);
        m_tgtEmbedding[i] = new WeightTensor(new long[2] { m_tgtIndexToWord.Count + 3, m_embeddingDim }, m_deviceIds[i], normal: true, name: "TgtEmbeddings", isTrainable: true);
        m_decoderFFLayer[i] = new FeedForwardLayer("FeedForward", m_hiddenDim, m_tgtIndexToWord.Count + 3, m_deviceIds[i]);
    }

    InitWeightsFactory();
}
public AttentionDecoder(string name, int hiddenDim, int embeddingDim, int contextDim, int outputDim, float dropoutRatio, int depth, int deviceId, bool enableCoverageModel, bool isTrainable)
{
    m_name = name;
    m_hdim = hiddenDim;
    m_embDim = embeddingDim;
    m_context = contextDim;
    m_depth = depth;
    m_deviceId = deviceId;
    m_outputDim = outputDim;
    m_dropoutRatio = dropoutRatio;
    m_enableCoverageModel = enableCoverageModel;
    m_isTrainable = isTrainable;

    m_attentionLayer = new AttentionUnit($"{name}.AttnUnit", hiddenDim, contextDim, deviceId, enableCoverageModel, isTrainable: isTrainable);

    m_decoders.Add(new LSTMAttentionDecoderCell($"{name}.LSTMAttn_0", hiddenDim, embeddingDim, contextDim, deviceId, isTrainable));
    for (int i = 1; i < depth; i++)
    {
        m_decoders.Add(new LSTMAttentionDecoderCell($"{name}.LSTMAttn_{i}", hiddenDim, hiddenDim, contextDim, deviceId, isTrainable));
    }

    m_decoderFFLayer = new FeedForwardLayer($"{name}.FeedForward", hiddenDim, outputDim, 0.0f, deviceId: deviceId, isTrainable: isTrainable);
}
/// <summary>
/// Decode output sentences in training or inference mode
/// </summary>
/// <param name="outputSentences">In training mode, the golden target sentences; otherwise, the target sentences generated by the decoder</param>
/// <param name="g"></param>
/// <param name="encodedOutputs"></param>
/// <param name="decoder"></param>
/// <param name="decoderFFLayer"></param>
/// <param name="embedding"></param>
/// <returns></returns>
private float Decode(List<List<string>> outputSentences, IComputeGraph g, IWeightTensor encodedOutputs, AttentionDecoder decoder, FeedForwardLayer decoderFFLayer, IWeightTensor embedding, int batchSize, bool isTraining = true)
{
    float cost = 0.0f;
    int[] ix_inputs = new int[batchSize];
    for (int i = 0; i < ix_inputs.Length; i++)
    {
        ix_inputs[i] = (int)SENTTAGS.START;
    }

    // Initialize variables according to the current mode
    var originalOutputLengths = isTraining ? ParallelCorpus.PadSentences(outputSentences) : null;
    int seqLen = isTraining ? outputSentences[0].Count : 64;
    var dropoutRatio = isTraining ? m_dropoutRatio : 0.0f;
    HashSet<int> setEndSentId = isTraining ? null : new HashSet<int>();

    if (!isTraining)
    {
        if (outputSentences.Count != 0)
        {
            throw new ArgumentException("The list of output sentences must be empty when not in training mode.");
        }
        for (int i = 0; i < batchSize; i++)
        {
            outputSentences.Add(new List<string>());
        }
    }

    // Pre-process for the attention model
    var attPreProcessResult = decoder.PreProcess(encodedOutputs, batchSize, g);

    for (int i = 0; i < seqLen; i++)
    {
        // Get embeddings for all sentences in the batch at position i
        List<IWeightTensor> inputs = new List<IWeightTensor>();
        for (int j = 0; j < batchSize; j++)
        {
            inputs.Add(g.PeekRow(embedding, ix_inputs[j]));
        }
        var inputsM = g.ConcatRows(inputs);

        // Decode the output sentences at position i
        var eOutput = decoder.Decode(inputsM, attPreProcessResult, batchSize, g);
        eOutput = g.Dropout(eOutput, batchSize, dropoutRatio, true);
        eOutput = decoderFFLayer.Process(eOutput, batchSize, g);

        // Softmax for output
        using (var probs = g.Softmax(eOutput, runGradients: false, inPlace: true))
        {
            if (isTraining)
            {
                // Calculate the loss for each word in the batch
                for (int k = 0; k < batchSize; k++)
                {
                    using (var probs_k = g.PeekRow(probs, k, runGradients: false))
                    {
                        var ix_targets_k = m_modelMetaData.Vocab.GetTargetWordIndex(outputSentences[k][i]);
                        var score_k = probs_k.GetWeightAt(ix_targets_k);
                        if (i < originalOutputLengths[k])
                        {
                            cost += (float)-Math.Log(score_k);
                        }

                        probs_k.SetWeightAt(score_k - 1, ix_targets_k);
                        ix_inputs[k] = ix_targets_k;
                    }
                }
                eOutput.CopyWeightsToGradients(probs);
            }
            else
            {
                // Output the i-th target word
                var targetIdx = g.Argmax(probs, 1);
                var targetWords = m_modelMetaData.Vocab.ConvertTargetIdsToString(targetIdx.ToList());
                for (int j = 0; j < targetWords.Count; j++)
                {
                    if (setEndSentId.Contains(j) == false)
                    {
                        outputSentences[j].Add(targetWords[j]);
                        if (targetWords[j] == ParallelCorpus.EOS)
                        {
                            setEndSentId.Add(j);
                        }
                    }
                }
                ix_inputs = targetIdx;
            }
        }

        if (isTraining)
        {
            // Hacky: run backward for the last feed-forward layer and the dropout layer here to save memory, since they have no time-sequence dependency
            g.RunTopBackward();
            if (m_dropoutRatio > 0.0f)
            {
                g.RunTopBackward();
            }
        }
        else
        {
            if (setEndSentId.Count == batchSize)
            {
                // All target sentences in the current batch are finished, so exit
                break;
            }
        }
    }

    return cost;
}
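// Illustration only: the training branch above folds the loss gradient directly into the
// softmax output -- for the target index the logit gradient is (p - 1), for every other
// index it is p, which is exactly the gradient of softmax followed by cross-entropy.
// A minimal, self-contained sketch of that identity (hypothetical helper, not the
// library's API); probs is assumed to already be a softmax distribution over the vocabulary.
internal static class SoftmaxCrossEntropySketch
{
    public static (float Loss, float[] GradLogits) LossAndGradient(float[] probs, int targetIndex)
    {
        float loss = (float)-System.Math.Log(probs[targetIndex]);
        float[] grad = (float[])probs.Clone(); // dL/dlogit_i = p_i for i != targetIndex
        grad[targetIndex] -= 1.0f;             // dL/dlogit_target = p_target - 1
        return (loss, grad);
    }
}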
/// <summary>
/// Decode output sentences in training
/// </summary>
/// <param name="outputSentences"></param>
/// <param name="g"></param>
/// <param name="encodedOutputs"></param>
/// <param name="decoder"></param>
/// <param name="decoderFFLayer"></param>
/// <param name="Embedding"></param>
/// <param name="predictSentence"></param>
/// <returns></returns>
private float Decode(List<List<string>> outputSentences, IComputeGraph g, IWeightMatrix encodedOutputs, AttentionDecoder decoder, FeedForwardLayer decoderFFLayer, IWeightMatrix Embedding, out List<List<string>> predictSentence)
{
    predictSentence = null;

    float cost = 0.0f;
    var attPreProcessResult = decoder.PreProcess(encodedOutputs, g);

    var originalOutputLengths = PadSentences(outputSentences);
    int seqLen = outputSentences[0].Count;

    int[] ix_inputs = new int[m_batchSize];
    int[] ix_targets = new int[m_batchSize];
    for (int i = 0; i < ix_inputs.Length; i++)
    {
        ix_inputs[i] = (int)SENTTAGS.START;
    }

    for (int i = 0; i < seqLen + 1; i++)
    {
        // Get embeddings for all sentences in the batch at position i
        List<IWeightMatrix> inputs = new List<IWeightMatrix>();
        for (int j = 0; j < m_batchSize; j++)
        {
            List<string> OutputSentence = outputSentences[j];

            ix_targets[j] = (int)SENTTAGS.UNK;
            if (i >= seqLen)
            {
                ix_targets[j] = (int)SENTTAGS.END;
            }
            else
            {
                if (m_tgtWordToIndex.ContainsKey(OutputSentence[i]))
                {
                    ix_targets[j] = m_tgtWordToIndex[OutputSentence[i]];
                }
            }

            var x = g.PeekRow(Embedding, ix_inputs[j]);
            inputs.Add(x);
        }
        var inputsM = g.ConcatRows(inputs);

        // Decode the output sentences at position i
        var eOutput = decoder.Decode(inputsM, attPreProcessResult, g);
        if (m_dropoutRatio > 0.0f)
        {
            eOutput = g.Dropout(eOutput, m_dropoutRatio);
        }

        var o = decoderFFLayer.Process(eOutput, g);

        // Softmax for output
        // var o = g.MulAdd(eOutput, Whd, bds);
        var probs = g.Softmax(o, false);

        o.ReleaseWeight();

        // Calculate the loss for each word in the batch
        List<IWeightMatrix> probs_g = g.UnFolderRow(probs, m_batchSize, false);
        for (int k = 0; k < m_batchSize; k++)
        {
            var probs_k = probs_g[k];
            var score_k = probs_k.GetWeightAt(ix_targets[k]);

            if (i < originalOutputLengths[k] + 1)
            {
                cost += (float)-Math.Log(score_k);
            }

            probs_k.SetWeightAt(score_k - 1, ix_targets[k]);
            ix_inputs[k] = ix_targets[k];
            probs_k.Dispose();
        }

        o.SetGradientByWeight(probs);

        // Hacky: run backward for the last feed-forward layer and the dropout layer here to save memory, since they have no time-sequence dependency
        g.RunTopBackward();
        g.RunTopBackward();
        if (m_dropoutRatio > 0.0f)
        {
            g.RunTopBackward();
        }
    }

    return cost;
}