/// <summary>
/// Encode source sentences and output encoded weights
/// </summary>
/// <param name="g"></param>
/// <param name="inputSentences"></param>
/// <param name="encoder"></param>
/// <param name="reversEncoder"></param>
/// <param name="Embedding"></param>
/// <returns></returns>
private IWeightMatrix Encode(IComputeGraph g, List<List<string>> inputSentences, Encoder encoder, Encoder reversEncoder, IWeightMatrix Embedding)
{
    PadSentences(inputSentences);

    List<IWeightMatrix> forwardOutputs = new List<IWeightMatrix>();
    List<IWeightMatrix> backwardOutputs = new List<IWeightMatrix>();

    int seqLen = inputSentences[0].Count;
    List<IWeightMatrix> forwardInput = new List<IWeightMatrix>();

    // Look up the embedding row for each word, time-major: position i of every batch item, then position i + 1
    for (int i = 0; i < seqLen; i++)
    {
        for (int j = 0; j < inputSentences.Count; j++)
        {
            var inputSentence = inputSentences[j];
            int ix_source = (int)SENTTAGS.UNK;
            if (m_srcWordToIndex.ContainsKey(inputSentence[i]))
            {
                ix_source = m_srcWordToIndex[inputSentence[i]];
            }

            var x = g.PeekRow(Embedding, ix_source);
            forwardInput.Add(x);
        }
    }

    var forwardInputsM = g.ConcatRows(forwardInput);

    // Slice the concatenated embeddings back into one matrix per position
    List<IWeightMatrix> attResults = new List<IWeightMatrix>();
    for (int i = 0; i < seqLen; i++)
    {
        var emb_i = g.PeekRow(forwardInputsM, i * inputSentences.Count, inputSentences.Count);
        attResults.Add(emb_i);
    }

    // Run the forward and reverse encoders over the sequence
    for (int i = 0; i < seqLen; i++)
    {
        var eOutput = encoder.Encode(attResults[i], g);
        forwardOutputs.Add(eOutput);

        var eOutput2 = reversEncoder.Encode(attResults[seqLen - i - 1], g);
        backwardOutputs.Add(eOutput2);
    }

    backwardOutputs.Reverse();

    var encodedOutput = g.ConcatRowColumn(forwardOutputs, backwardOutputs);

    return encodedOutput;
}

/// <summary>
/// Encode source sentences and output encoded weights
/// </summary>
/// <param name="g"></param>
/// <param name="inputSentences"></param>
/// <param name="encoder"></param>
/// <param name="Embedding"></param>
/// <returns></returns>
private IWeightTensor Encode(IComputeGraph g, List<List<string>> inputSentences, IEncoder encoder, IWeightTensor Embedding)
{
    PadSentences(inputSentences);

    List<IWeightTensor> forwardOutputs = new List<IWeightTensor>();
    List<IWeightTensor> backwardOutputs = new List<IWeightTensor>();

    int seqLen = inputSentences[0].Count;
    List<IWeightTensor> forwardInput = new List<IWeightTensor>();

    for (int i = 0; i < seqLen; i++)
    {
        for (int j = 0; j < inputSentences.Count; j++)
        {
            var inputSentence = inputSentences[j];
            int ix_source = (int)SENTTAGS.UNK;
            if (m_srcWordToIndex.ContainsKey(inputSentence[i]))
            {
                ix_source = m_srcWordToIndex[inputSentence[i]];
            }
            else
            {
                Logger.WriteLine($"'{inputSentence[i]}' is an unknown word.");
            }

            var x = g.PeekRow(Embedding, ix_source);
            forwardInput.Add(x);
        }
    }

    var forwardInputsM = g.ConcatRows(forwardInput);

    return encoder.Encode(forwardInputsM, g);
}

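The two methods above build the encoder input time-major: for each position i they append one embedding row per batch item, so ConcatRows produces a (seqLen * batchSize) x embeddingDim matrix in which row i * batchSize + j holds position i of batch item j. A minimal standalone sketch of that layout, written against plain arrays rather than the project's IComputeGraph/IWeightTensor API (names and the usual System usings are assumed for the example):

// Illustrative only: time-major embedding lookup for a padded batch.
// wordToIndex, embeddingTable and unkId stand in for m_srcWordToIndex,
// Embedding and SENTTAGS.UNK in the methods above.
static float[][] LookupTimeMajor(List<List<string>> sentences,
                                 Dictionary<string, int> wordToIndex,
                                 float[][] embeddingTable,
                                 int unkId)
{
    int seqLen = sentences[0].Count;
    int batchSize = sentences.Count;
    var rows = new float[seqLen * batchSize][];

    for (int i = 0; i < seqLen; i++)
    {
        for (int j = 0; j < batchSize; j++)
        {
            int id = wordToIndex.TryGetValue(sentences[j][i], out int ix) ? ix : unkId;
            // Row (i * batchSize + j) holds position i of batch item j, matching the
            // layout that g.ConcatRows(forwardInput) produces above, which is why
            // PeekRow(forwardInputsM, i * batchSize, batchSize) can later slice out
            // all batch items at position i.
            rows[i * batchSize + j] = embeddingTable[id];
        }
    }
    return rows;
}
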
/// <summary>
/// Encode source sentences and output encoded weights
/// </summary>
/// <param name="g"></param>
/// <param name="srcSnts"></param>
/// <param name="encoder"></param>
/// <param name="Embedding"></param>
/// <param name="srcSelfMask"></param>
/// <param name="posEmbedding"></param>
/// <param name="originalSrcLengths"></param>
/// <returns></returns>
private IWeightTensor Encode(IComputeGraph g, List<List<string>> srcSnts, IEncoder encoder, IWeightTensor Embedding, IWeightTensor srcSelfMask, IWeightTensor posEmbedding, List<int> originalSrcLengths)
{
    var seqLen = srcSnts[0].Count;
    var batchSize = srcSnts.Count;

    var inputs = new List<IWeightTensor>();

    // Generate batch-first based input embeddings
    for (var j = 0; j < batchSize; j++)
    {
        var originalLength = originalSrcLengths[j];
        for (var i = 0; i < seqLen; i++)
        {
            var ix_source = this.m_modelMetaData.Vocab.GetSourceWordIndex(srcSnts[j][i], true);
            var emb = g.PeekRow(Embedding, ix_source, runGradients: i < originalLength ? true : false);
            inputs.Add(emb);
        }
    }

    var inputEmbs = g.ConcatRows(inputs);

    if (this.m_modelMetaData.EncoderType == EncoderTypeEnums.Transformer)
    {
        inputEmbs = this.AddPositionEmbedding(g, posEmbedding, batchSize, seqLen, inputEmbs);
    }

    return encoder.Encode(inputEmbs, batchSize, g, srcSelfMask);
}

public IWeightMatrix Perform(IWeightMatrix state, AttentionPreProcessResult attenPreProcessResult, IComputeGraph g)
{
    // Additive attention scoring: V · tanh(uhs + Wa·state + bWa), where attenPreProcessResult.uhs
    // holds the encoder outputs already projected during pre-processing
    var bWas = g.RepeatRows(bWa, state.Rows);
    var wc = g.MulAdd(state, Wa, bWas);
    var wcs = g.RepeatRows(wc, attenPreProcessResult.inputsUnfolder[0].Rows);
    var ggs = g.AddTanh(attenPreProcessResult.uhs, wcs);
    var atten = g.Mul(ggs, V);

    List<IWeightMatrix> attens = g.UnFolderRow(atten, m_batchSize);

    List<IWeightMatrix> contexts = new List<IWeightMatrix>();
    List<IWeightMatrix> attensT = new List<IWeightMatrix>();
    for (int i = 0; i < m_batchSize; i++)
    {
        attensT.Add(g.Transpose2(attens[i]));
    }

    var attenT = g.ConcatRows(attensT);
    var attenSoftmax = g.SoftmaxM(attenT);

    // Normalize scores per batch item, then mix that item's encoder states into a context vector
    for (int i = 0; i < m_batchSize; i++)
    {
        IWeightMatrix context = g.Mul(g.PeekRow(attenSoftmax, i), attenPreProcessResult.inputsUnfolder[i]);
        contexts.Add(context);
    }

    return g.ConcatRows(contexts);
}

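For reference, the computation above is Bahdanau-style additive attention: each source position t gets a score V · tanh(uhs_t + Wa·state + bWa), the scores are softmax-normalized per batch item, and the context is the attention-weighted sum of that item's encoder states. A hedged, array-based restatement of the same math for a single batch item (parameter names and shapes here are illustrative, not the project's API):

// Illustrative sketch of the additive-attention scoring performed by Perform above.
static float[] AdditiveAttentionContext(
    float[][] uhs,           // pre-projected encoder states, [srcLen][attnDim]
    float[] wcState,         // Wa·state + bWa for the current decoder state, [attnDim]
    float[] v,               // scoring vector V, [attnDim]
    float[][] encoderStates) // raw encoder outputs to be mixed, [srcLen][hiddenDim]
{
    int srcLen = uhs.Length;
    var scores = new float[srcLen];

    // score_t = V · tanh(uhs_t + Wa·state + bWa)
    for (int t = 0; t < srcLen; t++)
    {
        double s = 0.0;
        for (int d = 0; d < v.Length; d++)
        {
            s += v[d] * Math.Tanh(uhs[t][d] + wcState[d]);
        }
        scores[t] = (float)s;
    }

    // Softmax over source positions (max-shifted for numerical stability)
    float max = scores[0];
    for (int t = 1; t < srcLen; t++) { if (scores[t] > max) { max = scores[t]; } }
    float sum = 0f;
    for (int t = 0; t < srcLen; t++) { scores[t] = (float)Math.Exp(scores[t] - max); sum += scores[t]; }

    // Context vector = attention-weighted sum of encoder states
    var context = new float[encoderStates[0].Length];
    for (int t = 0; t < srcLen; t++)
    {
        float weight = scores[t] / sum;
        for (int d = 0; d < context.Length; d++)
        {
            context[d] += weight * encoderStates[t][d];
        }
    }
    return context;
}
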
public void VisualizeNeuralNetwork(string visNNFilePath)
{
    (IEncoder encoder, IDecoder decoder, IWeightTensor srcEmbedding, IWeightTensor tgtEmbedding) = GetNetworksOnDeviceAt(-1);

    // Build input sentence
    List<List<string>> inputSeqs = ParallelCorpus.ConstructInputTokens(null);
    int batchSize = inputSeqs.Count;
    IComputeGraph g = CreateComputGraph(m_defaultDeviceId, needBack: false, visNetwork: true);
    AttentionDecoder rnnDecoder = decoder as AttentionDecoder;

    encoder.Reset(g.GetWeightFactory(), batchSize);
    rnnDecoder.Reset(g.GetWeightFactory(), batchSize);

    // Run encoder
    IWeightTensor encodedWeightMatrix = Encode(g, inputSeqs, encoder, srcEmbedding, null, null);

    // Prepare for attention over encoder-decoder
    AttentionPreProcessResult attPreProcessResult = rnnDecoder.PreProcess(encodedWeightMatrix, batchSize, g);

    // Run decoder
    IWeightTensor x = g.PeekRow(tgtEmbedding, (int)SENTTAGS.START);
    IWeightTensor eOutput = rnnDecoder.Decode(x, attPreProcessResult, batchSize, g);
    IWeightTensor probs = g.Softmax(eOutput);

    g.VisualizeNeuralNetToFile(visNNFilePath);
}

/// <summary>
/// Encode source sentences and output encoded weights
/// </summary>
/// <param name="g"></param>
/// <param name="inputSentences"></param>
/// <param name="encoder"></param>
/// <param name="Embedding"></param>
/// <returns></returns>
private IWeightTensor Encode(IComputeGraph g, List<List<string>> inputSentences, IEncoder encoder, IWeightTensor Embedding)
{
    int seqLen = inputSentences[0].Count;
    int batchSize = inputSentences.Count;

    List<IWeightTensor> forwardInput = new List<IWeightTensor>();
    for (int i = 0; i < seqLen; i++)
    {
        for (int j = 0; j < inputSentences.Count; j++)
        {
            int ix_source = m_modelMetaData.Vocab.GetSourceWordIndex(inputSentences[j][i], logUnk: true);
            forwardInput.Add(g.PeekRow(Embedding, ix_source));
        }
    }

    return encoder.Encode(g.ConcatRows(forwardInput), batchSize, g);
}

private IWeightTensor AddPositionEmbedding(IComputeGraph g, IWeightTensor posEmbedding, int batchSize, int seqLen, IWeightTensor inputEmbs)
{
    using (var posEmbeddingPeek = g.PeekRow(posEmbedding, 0, seqLen, false))
    {
        using (var posEmbeddingPeekView = g.View(posEmbeddingPeek, false, new long[] { 1, seqLen, this.m_modelMetaData.EmbeddingDim }))
        {
            using (var posEmbeddingPeekViewExp = g.Expand(posEmbeddingPeekView, false, new long[] { batchSize, seqLen, this.m_modelMetaData.EmbeddingDim }))
            {
                inputEmbs = g.View(inputEmbs, dims: new long[] { batchSize, seqLen, this.m_modelMetaData.EmbeddingDim });
                inputEmbs = g.Add(inputEmbs, posEmbeddingPeekViewExp, true, false);
                inputEmbs = g.View(inputEmbs, dims: new long[] { batchSize * seqLen, this.m_modelMetaData.EmbeddingDim });
            }
        }
    }

    inputEmbs = g.Dropout(inputEmbs, batchSize, this.m_dropoutRatio, true);

    return inputEmbs;
}

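Both AddPositionEmbedding variants in this listing only consume an already-filled posEmbedding matrix; the code that builds it is not shown here. Assuming the standard sinusoidal scheme from the Transformer paper (an assumption on my part; the project could equally use learned position embeddings), the table could be filled like this:

// Hypothetical helper, not taken from the listing above: fills a
// [maxSeqLen, embeddingDim] table with sinusoidal position encodings
// (sin on even dimensions, cos on odd ones).
static float[,] BuildSinusoidalPositionEmbedding(int maxSeqLen, int embeddingDim)
{
    var table = new float[maxSeqLen, embeddingDim];
    for (int pos = 0; pos < maxSeqLen; pos++)
    {
        for (int d = 0; d < embeddingDim; d++)
        {
            // Each dimension pair (2k, 2k+1) shares the frequency 1 / 10000^(2k / embeddingDim)
            double angle = pos / Math.Pow(10000.0, 2.0 * (d / 2) / embeddingDim);
            table[pos, d] = (d % 2 == 0) ? (float)Math.Sin(angle) : (float)Math.Cos(angle);
        }
    }
    return table;
}
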
/// <summary>
/// Encode source sentences and output encoded weights
/// </summary>
/// <param name="g"></param>
/// <param name="srcSnts"></param>
/// <param name="encoder"></param>
/// <param name="Embedding"></param>
/// <param name="selfMask"></param>
/// <param name="dimMask"></param>
/// <returns></returns>
private IWeightTensor Encode(IComputeGraph g, List<List<string>> srcSnts, IEncoder encoder, IWeightTensor Embedding, IWeightTensor selfMask, IWeightTensor dimMask)
{
    int seqLen = srcSnts[0].Count;
    int batchSize = srcSnts.Count;

    List<IWeightTensor> forwardInput = new List<IWeightTensor>();

    // Generate batch-first based input embeddings
    for (int j = 0; j < batchSize; j++)
    {
        for (int i = 0; i < seqLen; i++)
        {
            int ix_source = m_modelMetaData.Vocab.GetSourceWordIndex(srcSnts[j][i], logUnk: true);
            forwardInput.Add(g.PeekRow(Embedding, ix_source));
        }
    }

    return encoder.Encode(g.ConcatRows(forwardInput), selfMask, dimMask, batchSize, g);
}

public IWeightTensor Encode(IWeightTensor rawInputs, int batchSize, IComputeGraph g, IWeightTensor srcSelfMask)
{
    int seqLen = rawInputs.Rows / batchSize;
    rawInputs = g.TransposeBatch(rawInputs, seqLen);

    List<IWeightTensor> inputs = new List<IWeightTensor>();
    for (int i = 0; i < seqLen; i++)
    {
        IWeightTensor emb_i = g.PeekRow(rawInputs, i * batchSize, batchSize);
        inputs.Add(emb_i);
    }

    List<IWeightTensor> forwardOutputs = new List<IWeightTensor>();
    List<IWeightTensor> backwardOutputs = new List<IWeightTensor>();

    List<IWeightTensor> layerOutputs = inputs.ToList();
    for (int i = 0; i < m_depth; i++)
    {
        for (int j = 0; j < seqLen; j++)
        {
            IWeightTensor forwardOutput = m_forwardEncoders[i].Step(layerOutputs[j], g);
            forwardOutputs.Add(forwardOutput);

            IWeightTensor backwardOutput = m_backwardEncoders[i].Step(layerOutputs[inputs.Count - j - 1], g);
            backwardOutputs.Add(backwardOutput);
        }

        backwardOutputs.Reverse();
        layerOutputs.Clear();
        for (int j = 0; j < seqLen; j++)
        {
            IWeightTensor concatW = g.ConcatColumns(forwardOutputs[j], backwardOutputs[j]);
            layerOutputs.Add(concatW);
        }
    }

    var result = g.ConcatRows(layerOutputs);

    return g.TransposeBatch(result, batchSize);
}

private float DecodeOutput(string[] OutputSentence, IComputeGraph g, float cost, SparseWeightMatrix sparseInput, List<WeightMatrix> encoded, AttentionDecoder decoder, WeightMatrix Whd, WeightMatrix bd, WeightMatrix Embedding)
{
    int ix_input = (int)SENTTAGS.START;
    for (int i = 0; i < OutputSentence.Length + 1; i++)
    {
        int ix_target = (int)SENTTAGS.UNK;
        if (i == OutputSentence.Length)
        {
            ix_target = (int)SENTTAGS.END;
        }
        else
        {
            if (t_wordToIndex.ContainsKey(OutputSentence[i]))
            {
                ix_target = t_wordToIndex[OutputSentence[i]];
            }
        }

        var x = g.PeekRow(Embedding, ix_input);
        var eOutput = decoder.Decode(sparseInput, x, encoded, g);
        if (UseDropout)
        {
            eOutput = g.Dropout(eOutput, 0.2f);
        }

        var o = g.muladd(eOutput, Whd, bd);
        if (UseDropout)
        {
            o = g.Dropout(o, 0.2f);
        }

        var probs = g.SoftmaxWithCrossEntropy(o);
        cost += (float)-Math.Log(probs.Weight[ix_target]);

        o.Gradient = probs.Weight;
        o.Gradient[ix_target] -= 1;

        ix_input = ix_target;
    }

    return cost;
}

public IWeightTensor Encode(IWeightTensor rawInputs, IComputeGraph g)
{
    int seqLen = rawInputs.Rows / m_batchSize;

    List<IWeightTensor> inputs = new List<IWeightTensor>();
    for (int i = 0; i < seqLen; i++)
    {
        var emb_i = g.PeekRow(rawInputs, i * m_batchSize, m_batchSize);
        inputs.Add(emb_i);
    }

    List<IWeightTensor> forwardOutputs = new List<IWeightTensor>();
    List<IWeightTensor> backwardOutputs = new List<IWeightTensor>();

    List<IWeightTensor> layerOutputs = inputs.ToList();
    for (int i = 0; i < m_depth; i++)
    {
        for (int j = 0; j < seqLen; j++)
        {
            var forwardOutput = m_forwardEncoders[i].Step(layerOutputs[j], g);
            forwardOutputs.Add(forwardOutput);

            var backwardOutput = m_backwardEncoders[i].Step(layerOutputs[inputs.Count - j - 1], g);
            backwardOutputs.Add(backwardOutput);
        }

        backwardOutputs.Reverse();
        layerOutputs.Clear();
        for (int j = 0; j < seqLen; j++)
        {
            var concatW = g.ConcatColumns(forwardOutputs[j], backwardOutputs[j]);
            layerOutputs.Add(concatW);
        }
    }

    return g.ConcatRows(layerOutputs);
}

private IWeightTensor AddPositionEmbedding(IComputeGraph g, IWeightTensor posEmbedding, int batchSize, int seqLen, IWeightTensor inputEmbs)
{
    var Column = posEmbedding.Columns;
    inputEmbs = g.Mul(inputEmbs, (float)Math.Sqrt(m_modelMetaData.HiddenDim));

    using (var posEmbeddingPeek = g.PeekRow(posEmbedding, 0, seqLen, false))
    {
        using (var posEmbeddingPeekView = g.View(posEmbeddingPeek, runGradient: false, dims: new long[] { 1, seqLen, Column }))
        {
            using (var posEmbeddingPeekViewExp = g.Expand(posEmbeddingPeekView, runGradient: false, dims: new long[] { batchSize, seqLen, Column }))
            {
                inputEmbs = g.View(inputEmbs, dims: new long[] { batchSize, seqLen, Column });
                inputEmbs = g.Add(inputEmbs, posEmbeddingPeekViewExp, true, false);
                inputEmbs = g.View(inputEmbs, dims: new long[] { batchSize * seqLen, Column });
            }
        }
    }

    inputEmbs = g.Dropout(inputEmbs, batchSize, m_dropoutRatio, inPlace: true);

    return inputEmbs;
}

/// <summary>
/// Run forward part on given single device
/// </summary>
/// <param name="g">The computing graph for current device. It gets created and passed by the framework</param>
/// <param name="srcSnts">A batch of input tokenized sentences in source side</param>
/// <param name="tgtSnts">A batch of output tokenized sentences in target side. In training mode, it inputs target tokens, otherwise, it outputs target tokens generated by decoder</param>
/// <param name="deviceIdIdx">The index of current device</param>
/// <returns>The cost of forward part</returns>
private float RunForwardOnSingleDevice(IComputeGraph g, List<List<string>> srcSnts, List<List<string>> tgtSnts, int deviceIdIdx, bool isTraining)
{
    (IEncoder encoder, IWeightTensor srcEmbedding, FeedForwardLayer decoderFFLayer) = GetNetworksOnDeviceAt(deviceIdIdx);
    int batchSize = srcSnts.Count;

    // Reset networks
    encoder.Reset(g.GetWeightFactory(), batchSize);

    // Encoding input source sentences
    ParallelCorpus.PadSentences(srcSnts);

    if (isTraining)
    {
        ParallelCorpus.PadSentences(tgtSnts);

        if (srcSnts[0].Count != tgtSnts[0].Count)
        {
            throw new ArgumentException($"The length of source side and target side must be equal. source length = '{srcSnts[0].Count}', target length = '{tgtSnts[0].Count}'");
        }
    }

    int seqLen = srcSnts[0].Count;

    IWeightTensor encodedWeightMatrix = Encode(g.CreateSubGraph("Encoder"), srcSnts, encoder, srcEmbedding);
    IWeightTensor ffLayer = decoderFFLayer.Process(encodedWeightMatrix, batchSize, g);
    IWeightTensor ffLayerBatch = g.TransposeBatch(ffLayer, batchSize);

    // Logger.WriteLine("1");

    float cost = 0.0f;
    using (var probs = g.Softmax(ffLayerBatch, runGradients: false, inPlace: true))
    {
        if (isTraining)
        {
            //Calculate loss for each word in the batch
            for (int k = 0; k < batchSize; k++)
            {
                for (int j = 0; j < seqLen; j++)
                {
                    using (var probs_k_j = g.PeekRow(probs, k * seqLen + j, runGradients: false))
                    {
                        var ix_targets_k_j = m_modelMetaData.Vocab.GetTargetWordIndex(tgtSnts[k][j]);
                        var score_k = probs_k_j.GetWeightAt(ix_targets_k_j);
                        cost += (float)-Math.Log(score_k);

                        probs_k_j.SetWeightAt(score_k - 1, ix_targets_k_j);
                    }
                }

                ////CRF part
                //using (var probs_k = g.PeekRow(probs, k * seqLen, seqLen, runGradients: false))
                //{
                //    var weights_k = probs_k.ToWeightArray();
                //    var crfOutput_k = m_crfDecoder.ForwardBackward(seqLen, weights_k);

                //    int[] trueTags = new int[seqLen];
                //    for (int j = 0; j < seqLen; j++)
                //    {
                //        trueTags[j] = m_modelMetaData.Vocab.GetTargetWordIndex(tgtSnts[k][j]);
                //    }
                //    m_crfDecoder.UpdateBigramTransition(seqLen, crfOutput_k, trueTags);
                //}
            }

            ffLayerBatch.CopyWeightsToGradients(probs);
        }
        else
        {
            // CRF decoder
            //for (int k = 0; k < batchSize; k++)
            //{
            //    //CRF part
            //    using (var probs_k = g.PeekRow(probs, k * seqLen, seqLen, runGradients: false))
            //    {
            //        var weights_k = probs_k.ToWeightArray();
            //        var crfOutput_k = m_crfDecoder.DecodeNBestCRF(weights_k, seqLen, 1);

            //        var targetWords = m_modelMetaData.Vocab.ConvertTargetIdsToString(crfOutput_k[0].ToList());
            //        tgtSnts.Add(targetWords);
            //    }
            //}

            // Output "i"th target word
            var targetIdx = g.Argmax(probs, 1);
            var targetWords = m_modelMetaData.Vocab.ConvertTargetIdsToString(targetIdx.ToList());

            for (int k = 0; k < batchSize; k++)
            {
                tgtSnts.Add(targetWords.GetRange(k * seqLen, seqLen));
            }
        }
    }

    return cost;
}

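The loss bookkeeping used throughout these methods relies on the identity that, for a softmax followed by cross-entropy, the gradient with respect to each logit is p - y (the predicted probability minus the one-hot target). Subtracting 1 at the gold index and then copying the adjusted probabilities back as gradients therefore implements the backward pass without an explicit loss node. A minimal array-based sketch of the same bookkeeping (the helper name is illustrative):

// Illustrative only: per-row negative log-likelihood plus the softmax/cross-entropy
// gradient trick used above (gradient = probs, then subtract 1 at the gold index).
static float NllAndGradientInPlace(float[] probsRow, int goldIndex)
{
    float loss = (float)-Math.Log(probsRow[goldIndex]);
    probsRow[goldIndex] -= 1.0f;   // probsRow now holds dLoss/dLogits for this row
    return loss;
}
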
/// <summary>
/// Given input sentence and generate output sentence by seq2seq model with beam search
/// </summary>
/// <param name="input"></param>
/// <param name="beamSearchSize"></param>
/// <param name="maxOutputLength"></param>
/// <returns></returns>
public List<List<string>> Predict(List<string> input, int beamSearchSize = 1, int maxOutputLength = 100)
{
    (IEncoder encoder, IDecoder decoder, IWeightTensor srcEmbedding, IWeightTensor tgtEmbedding) = GetNetworksOnDeviceAt(-1);

    List<List<string>> inputSeqs = ParallelCorpus.ConstructInputTokens(input);
    int batchSize = 1; // For prediction with beam search, we currently support only one sentence per call

    IComputeGraph g = CreateComputGraph(m_defaultDeviceId, needBack: false);
    AttentionDecoder rnnDecoder = decoder as AttentionDecoder;

    encoder.Reset(g.GetWeightFactory(), batchSize);
    rnnDecoder.Reset(g.GetWeightFactory(), batchSize);

    // Construct beam search status list
    List<BeamSearchStatus> bssList = new List<BeamSearchStatus>();

    BeamSearchStatus bss = new BeamSearchStatus();
    bss.OutputIds.Add((int)SENTTAGS.START);
    bss.CTs = rnnDecoder.GetCTs();
    bss.HTs = rnnDecoder.GetHTs();
    bssList.Add(bss);

    IWeightTensor encodedWeightMatrix = Encode(g, inputSeqs, encoder, srcEmbedding, null, null);
    AttentionPreProcessResult attPreProcessResult = rnnDecoder.PreProcess(encodedWeightMatrix, batchSize, g);

    List<BeamSearchStatus> newBSSList = new List<BeamSearchStatus>();
    bool finished = false;
    int outputLength = 0;
    while (finished == false && outputLength < maxOutputLength)
    {
        finished = true;
        for (int i = 0; i < bssList.Count; i++)
        {
            bss = bssList[i];
            if (bss.OutputIds[bss.OutputIds.Count - 1] == (int)SENTTAGS.END)
            {
                newBSSList.Add(bss);
            }
            else if (bss.OutputIds.Count > maxOutputLength)
            {
                newBSSList.Add(bss);
            }
            else
            {
                finished = false;
                int ix_input = bss.OutputIds[bss.OutputIds.Count - 1];
                rnnDecoder.SetCTs(bss.CTs);
                rnnDecoder.SetHTs(bss.HTs);

                IWeightTensor x = g.PeekRow(tgtEmbedding, ix_input);
                IWeightTensor eOutput = rnnDecoder.Decode(x, attPreProcessResult, batchSize, g);
                using (IWeightTensor probs = g.Softmax(eOutput))
                {
                    List<int> preds = probs.GetTopNMaxWeightIdx(beamSearchSize);
                    for (int j = 0; j < preds.Count; j++)
                    {
                        BeamSearchStatus newBSS = new BeamSearchStatus();
                        newBSS.OutputIds.AddRange(bss.OutputIds);
                        newBSS.OutputIds.Add(preds[j]);
                        newBSS.CTs = rnnDecoder.GetCTs();
                        newBSS.HTs = rnnDecoder.GetHTs();

                        float score = probs.GetWeightAt(preds[j]);
                        newBSS.Score = bss.Score;
                        newBSS.Score += (float)(-Math.Log(score));

                        //var lengthPenalty = Math.Pow((5.0f + newBSS.OutputIds.Count) / 6, 0.6);
                        //newBSS.Score /= (float)lengthPenalty;

                        newBSSList.Add(newBSS);
                    }
                }
            }
        }

        bssList = BeamSearch.GetTopNBSS(newBSSList, beamSearchSize);
        newBSSList.Clear();

        outputLength++;
    }

    // Convert output target word ids to real string
    List<List<string>> results = new List<List<string>>();
    for (int i = 0; i < bssList.Count; i++)
    {
        results.Add(m_modelMetaData.Vocab.ConvertTargetIdsToString(bssList[i].OutputIds));
    }

    return results;
}

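The commented-out lines in the beam loop correspond to GNMT-style length normalization, lp(Y) = ((5 + |Y|) / 6)^0.6, which divides a hypothesis's accumulated negative log-likelihood so longer outputs are not unfairly penalized when hypotheses of different lengths compete. A small sketch of applying it when ranking hypotheses (the helper name and default alpha are illustrative; the code above keeps it disabled):

// Illustrative helper: GNMT length penalty, matching the commented-out formula above.
// Lower normalized scores are better, since Score accumulates -log(p).
static float NormalizedBeamScore(float sumNegLogProb, int outputLength, double alpha = 0.6)
{
    double lengthPenalty = Math.Pow((5.0 + outputLength) / 6.0, alpha);
    return (float)(sumNegLogProb / lengthPenalty);
}
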
/// <summary>
/// Decode output sentences in training
/// </summary>
/// <param name="outputSnts">In training mode, they are golden target sentences, otherwise, they are target sentences generated by the decoder</param>
/// <param name="g"></param>
/// <param name="encOutputs"></param>
/// <param name="decoder"></param>
/// <param name="tgtEmbedding"></param>
/// <returns></returns>
private float DecodeAttentionLSTM(List<List<string>> outputSnts, IComputeGraph g, IWeightTensor encOutputs, AttentionDecoder decoder, IWeightTensor tgtEmbedding, int batchSize, bool isTraining = true)
{
    float cost = 0.0f;
    int[] ix_inputs = new int[batchSize];
    for (int i = 0; i < ix_inputs.Length; i++)
    {
        ix_inputs[i] = m_modelMetaData.Vocab.GetTargetWordIndex(outputSnts[i][0]);
    }

    // Initialize variables according to current mode
    List<int> originalOutputLengths = isTraining ? ParallelCorpus.PadSentences(outputSnts) : null;
    int seqLen = isTraining ? outputSnts[0].Count : 64;
    float dropoutRatio = isTraining ? m_dropoutRatio : 0.0f;
    HashSet<int> setEndSentId = isTraining ? null : new HashSet<int>();

    // Pre-process for attention model
    AttentionPreProcessResult attPreProcessResult = decoder.PreProcess(encOutputs, batchSize, g);

    for (int i = 1; i < seqLen; i++)
    {
        //Get embedding for all sentence in the batch at position i
        List<IWeightTensor> inputs = new List<IWeightTensor>();
        for (int j = 0; j < batchSize; j++)
        {
            inputs.Add(g.PeekRow(tgtEmbedding, ix_inputs[j]));
        }
        IWeightTensor inputsM = g.ConcatRows(inputs);

        //Decode output sentence at position i
        IWeightTensor eOutput = decoder.Decode(inputsM, attPreProcessResult, batchSize, g);

        //Softmax for output
        using (IWeightTensor probs = g.Softmax(eOutput, runGradients: false, inPlace: true))
        {
            if (isTraining)
            {
                //Calculate loss for each word in the batch
                for (int k = 0; k < batchSize; k++)
                {
                    using (IWeightTensor probs_k = g.PeekRow(probs, k, runGradients: false))
                    {
                        int ix_targets_k = m_modelMetaData.Vocab.GetTargetWordIndex(outputSnts[k][i]);
                        float score_k = probs_k.GetWeightAt(ix_targets_k);
                        if (i < originalOutputLengths[k])
                        {
                            cost += (float)-Math.Log(score_k);
                        }

                        probs_k.SetWeightAt(score_k - 1, ix_targets_k);
                        ix_inputs[k] = ix_targets_k;
                    }
                }

                eOutput.CopyWeightsToGradients(probs);
            }
            else
            {
                // Output "i"th target word
                int[] targetIdx = g.Argmax(probs, 1);
                List<string> targetWords = m_modelMetaData.Vocab.ConvertTargetIdsToString(targetIdx.ToList());
                for (int j = 0; j < targetWords.Count; j++)
                {
                    if (setEndSentId.Contains(j) == false)
                    {
                        outputSnts[j].Add(targetWords[j]);
                        if (targetWords[j] == ParallelCorpus.EOS)
                        {
                            setEndSentId.Add(j);
                        }
                    }
                }

                if (setEndSentId.Count == batchSize)
                {
                    // All target sentences in current batch are finished, so we exit.
                    break;
                }

                ix_inputs = targetIdx;
            }
        }
    }

    return cost;
}

private float DecodeTransformer(List<List<string>> outInputSeqs, IComputeGraph g, IWeightTensor encOutputs, IWeightTensor encMask, TransformerDecoder decoder, IWeightTensor tgtEmbedding, int batchSize, int deviceId, bool isTraining = true)
{
    float cost = 0.0f;

    var originalInputLengths = ParallelCorpus.PadSentences(outInputSeqs);
    int tgtSeqLen = outInputSeqs[0].Count;

    IWeightTensor tgtDimMask = MaskUtils.BuildPadDimMask(g, tgtSeqLen, originalInputLengths, m_modelMetaData.HiddenDim, deviceId);
    using (IWeightTensor tgtSelfTriMask = MaskUtils.BuildPadSelfTriMask(g, tgtSeqLen, originalInputLengths, deviceId))
    {
        List<IWeightTensor> inputs = new List<IWeightTensor>();
        for (int i = 0; i < batchSize; i++)
        {
            for (int j = 0; j < tgtSeqLen; j++)
            {
                int ix_targets_k = m_modelMetaData.Vocab.GetTargetWordIndex(outInputSeqs[i][j], logUnk: true);
                inputs.Add(g.PeekRow(tgtEmbedding, ix_targets_k));
            }
        }

        IWeightTensor tgtInputEmbeddings = inputs.Count > 1 ? g.ConcatRows(inputs) : inputs[0];
        IWeightTensor decOutput = decoder.Decode(tgtInputEmbeddings, encOutputs, tgtSelfTriMask, encMask, tgtDimMask, batchSize, g);

        decOutput = g.Mul(decOutput, g.Transpose(tgtEmbedding));

        using (IWeightTensor probs = g.Softmax(decOutput, runGradients: false, inPlace: true))
        {
            if (isTraining)
            {
                var leftShiftInputSeqs = ParallelCorpus.LeftShiftSnts(outInputSeqs, ParallelCorpus.EOS);
                var originalOutputLengths = ParallelCorpus.PadSentences(leftShiftInputSeqs, tgtSeqLen);

                for (int i = 0; i < batchSize; i++)
                {
                    for (int j = 0; j < tgtSeqLen; j++)
                    {
                        using (IWeightTensor probs_i_j = g.PeekRow(probs, i * tgtSeqLen + j, runGradients: false))
                        {
                            if (j < originalOutputLengths[i])
                            {
                                int ix_targets_i_j = m_modelMetaData.Vocab.GetTargetWordIndex(leftShiftInputSeqs[i][j], logUnk: true);
                                float score_i_j = probs_i_j.GetWeightAt(ix_targets_i_j);
                                if (j < originalOutputLengths[i])
                                {
                                    cost += (float)-Math.Log(score_i_j);
                                }

                                probs_i_j.SetWeightAt(score_i_j - 1, ix_targets_i_j);
                            }
                            else
                            {
                                probs_i_j.CleanWeight();
                            }
                        }
                    }
                }

                decOutput.CopyWeightsToGradients(probs);
            }
            else
            {
                // Output "i"th target word
                int[] targetIdx = g.Argmax(probs, 1);
                List<string> targetWords = m_modelMetaData.Vocab.ConvertTargetIdsToString(targetIdx.ToList());
                for (int i = 0; i < batchSize; i++)
                {
                    outInputSeqs[i].Add(targetWords[i * tgtSeqLen + tgtSeqLen - 1]);
                }
            }
        }
    }

    return cost;
}

/// <summary>
/// Run forward part on given single device
/// </summary>
/// <param name="g">The computing graph for current device. It gets created and passed by the framework</param>
/// <param name="srcSnts">A batch of input tokenized sentences in source side</param>
/// <param name="tgtSnts">A batch of output tokenized sentences in target side. In training mode, it inputs target tokens, otherwise, it outputs target tokens generated by decoder</param>
/// <param name="deviceIdIdx">The index of current device</param>
/// <returns>The cost of forward part</returns>
private float RunForwardOnSingleDevice(IComputeGraph g, List<List<string>> srcSnts, List<List<string>> tgtSnts, int deviceIdIdx, bool isTraining)
{
    var (encoder, srcEmbedding, posEmbedding, decoderFFLayer) = this.GetNetworksOnDeviceAt(deviceIdIdx);

    // Reset networks
    encoder.Reset(g.GetWeightFactory(), srcSnts.Count);

    var originalSrcLengths = ParallelCorpus.PadSentences(srcSnts);
    var seqLen = srcSnts[0].Count;
    var batchSize = srcSnts.Count;

    // Encoding input source sentences
    var encOutput = this.Encode(g, srcSnts, encoder, srcEmbedding, null, posEmbedding, originalSrcLengths);
    var ffLayer = decoderFFLayer.Process(encOutput, batchSize, g);

    var ffLayerBatch = g.TransposeBatch(ffLayer, batchSize);

    var cost = 0.0f;
    using (var probs = g.Softmax(ffLayerBatch, runGradients: false, inPlace: true))
    {
        if (isTraining)
        {
            //Calculate loss for each word in the batch
            for (var k = 0; k < batchSize; k++)
            {
                for (var j = 0; j < seqLen; j++)
                {
                    using (var probs_k_j = g.PeekRow(probs, k * seqLen + j, runGradients: false))
                    {
                        var ix_targets_k_j = this.m_modelMetaData.Vocab.GetTargetWordIndex(tgtSnts[k][j]);
                        var score_k = probs_k_j.GetWeightAt(ix_targets_k_j);
                        cost += (float)-Math.Log(score_k);

                        probs_k_j.SetWeightAt(score_k - 1, ix_targets_k_j);
                    }
                }

                ////CRF part
                //using (var probs_k = g.PeekRow(probs, k * seqLen, seqLen, runGradients: false))
                //{
                //    var weights_k = probs_k.ToWeightArray();
                //    var crfOutput_k = m_crfDecoder.ForwardBackward(seqLen, weights_k);

                //    int[] trueTags = new int[seqLen];
                //    for (int j = 0; j < seqLen; j++)
                //    {
                //        trueTags[j] = m_modelMetaData.Vocab.GetTargetWordIndex(tgtSnts[k][j]);
                //    }
                //    m_crfDecoder.UpdateBigramTransition(seqLen, crfOutput_k, trueTags);
                //}
            }

            ffLayerBatch.CopyWeightsToGradients(probs);
        }
        else
        {
            // CRF decoder
            //for (int k = 0; k < batchSize; k++)
            //{
            //    //CRF part
            //    using (var probs_k = g.PeekRow(probs, k * seqLen, seqLen, runGradients: false))
            //    {
            //        var weights_k = probs_k.ToWeightArray();
            //        var crfOutput_k = m_crfDecoder.DecodeNBestCRF(weights_k, seqLen, 1);

            //        var targetWords = m_modelMetaData.Vocab.ConvertTargetIdsToString(crfOutput_k[0].ToList());
            //        tgtSnts.Add(targetWords);
            //    }
            //}

            // Output "i"th target word
            var targetIdx = g.Argmax(probs, 1);
            var targetWords = this.m_modelMetaData.Vocab.ConvertTargetIdsToString(targetIdx.ToList());

            for (var k = 0; k < batchSize; k++)
            {
                tgtSnts[k] = targetWords.GetRange(k * seqLen, seqLen);
            }
        }
    }

    return cost;
}

private float DecodeTransformer(List<List<string>> tgtSeqs, IComputeGraph g, IWeightTensor encOutputs, TransformerDecoder decoder, IWeightTensor tgtEmbedding, IWeightTensor posEmbedding, int batchSize, int deviceId, List<int> srcOriginalLenghts, bool isTraining = true)
{
    float cost = 0.0f;

    var tgtOriginalLengths = ParallelCorpus.PadSentences(tgtSeqs);
    int tgtSeqLen = tgtSeqs[0].Count;
    int srcSeqLen = encOutputs.Rows / batchSize;

    using (IWeightTensor srcTgtMask = MaskUtils.BuildSrcTgtMask(g, srcSeqLen, tgtSeqLen, tgtOriginalLengths, srcOriginalLenghts, deviceId))
    {
        using (IWeightTensor tgtSelfTriMask = MaskUtils.BuildPadSelfTriMask(g, tgtSeqLen, tgtOriginalLengths, deviceId))
        {
            List<IWeightTensor> inputs = new List<IWeightTensor>();
            for (int i = 0; i < batchSize; i++)
            {
                for (int j = 0; j < tgtSeqLen; j++)
                {
                    int ix_targets_k = m_modelMetaData.Vocab.GetTargetWordIndex(tgtSeqs[i][j], logUnk: true);
                    var emb = g.PeekRow(tgtEmbedding, ix_targets_k, runGradients: j < tgtOriginalLengths[i] ? true : false);
                    inputs.Add(emb);
                }
            }

            IWeightTensor inputEmbs = inputs.Count > 1 ? g.ConcatRows(inputs) : inputs[0];

            inputEmbs = AddPositionEmbedding(g, posEmbedding, batchSize, tgtSeqLen, inputEmbs);

            IWeightTensor decOutput = decoder.Decode(inputEmbs, encOutputs, tgtSelfTriMask, srcTgtMask, batchSize, g);

            using (IWeightTensor probs = g.Softmax(decOutput, runGradients: false, inPlace: true))
            {
                if (isTraining)
                {
                    var leftShiftInputSeqs = ParallelCorpus.LeftShiftSnts(tgtSeqs, ParallelCorpus.EOS);
                    for (int i = 0; i < batchSize; i++)
                    {
                        for (int j = 0; j < tgtSeqLen; j++)
                        {
                            using (IWeightTensor probs_i_j = g.PeekRow(probs, i * tgtSeqLen + j, runGradients: false))
                            {
                                if (j < tgtOriginalLengths[i])
                                {
                                    int ix_targets_i_j = m_modelMetaData.Vocab.GetTargetWordIndex(leftShiftInputSeqs[i][j], logUnk: true);
                                    float score_i_j = probs_i_j.GetWeightAt(ix_targets_i_j);
                                    cost += (float)-Math.Log(score_i_j);

                                    probs_i_j.SetWeightAt(score_i_j - 1, ix_targets_i_j);
                                }
                                else
                                {
                                    probs_i_j.CleanWeight();
                                }
                            }
                        }
                    }

                    decOutput.CopyWeightsToGradients(probs);
                }
                //if (isTraining)
                //{
                //    var leftShiftInputSeqs = ParallelCorpus.LeftShiftSnts(tgtSeqs, ParallelCorpus.EOS);
                //    int[] targetIds = new int[batchSize * tgtSeqLen];
                //    int ids = 0;
                //    for (int i = 0; i < batchSize; i++)
                //    {
                //        for (int j = 0; j < tgtSeqLen; j++)
                //        {
                //            targetIds[ids] = j < tgtOriginalLengths[i] ? m_modelMetaData.Vocab.GetTargetWordIndex(leftShiftInputSeqs[i][j], logUnk: true) : -1;
                //            ids++;
                //        }
                //    }

                //    cost += g.UpdateCost(probs, targetIds);
                //    decOutput.CopyWeightsToGradients(probs);
                //}
                else
                {
                    // Output "i"th target word
                    int[] targetIdx = g.Argmax(probs, 1);
                    List<string> targetWords = m_modelMetaData.Vocab.ConvertTargetIdsToString(targetIdx.ToList());
                    for (int i = 0; i < batchSize; i++)
                    {
                        tgtSeqs[i].Add(targetWords[i * tgtSeqLen + tgtSeqLen - 1]);
                    }
                }
            }
        }
    }

    return cost;
}

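MaskUtils.BuildPadSelfTriMask is used here to keep decoder self-attention causal (each position may attend only to itself and earlier positions) while also excluding padded positions; its implementation is not part of this listing. A hedged sketch of what such a combined triangular-plus-padding mask could look like, with 1 marking allowed attention and 0 marking blocked attention (the project's actual value convention may differ, e.g. additive large-negative masks):

// Hypothetical sketch, not the project's MaskUtils: builds a [batchSize, seqLen, seqLen]
// mask where entry (b, i, j) is 1 if position i may attend to position j, i.e.
// j <= i (causality) and both i and j lie inside the unpadded length of batch item b.
static float[,,] BuildPadSelfTriMaskSketch(int seqLen, List<int> originalLengths)
{
    int batchSize = originalLengths.Count;
    var mask = new float[batchSize, seqLen, seqLen];
    for (int b = 0; b < batchSize; b++)
        for (int i = 0; i < seqLen; i++)
            for (int j = 0; j < seqLen; j++)
                mask[b, i, j] = (j <= i && i < originalLengths[b] && j < originalLengths[b]) ? 1f : 0f;
    return mask;
}
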
/// <summary>
/// Decode output sentences in training
/// </summary>
/// <param name="outputSentences">In training mode, they are golden target sentences, otherwise, they are target sentences generated by the decoder</param>
/// <param name="g"></param>
/// <param name="encodedOutputs"></param>
/// <param name="decoder"></param>
/// <param name="embedding"></param>
/// <returns></returns>
private float Decode(List<List<string>> outputSentences, IComputeGraph g, IWeightTensor encodedOutputs, AttentionDecoder decoder, IWeightTensor embedding, int batchSize, bool isTraining = true)
{
    float cost = 0.0f;
    int[] ix_inputs = new int[batchSize];
    for (int i = 0; i < ix_inputs.Length; i++)
    {
        ix_inputs[i] = (int)SENTTAGS.START;
    }

    // Initialize variables according to current mode
    List<int> originalOutputLengths = isTraining ? ParallelCorpus.PadSentences(outputSentences) : null;
    int seqLen = isTraining ? outputSentences[0].Count : 64;
    float dropoutRatio = isTraining ? m_dropoutRatio : 0.0f;
    HashSet<int> setEndSentId = isTraining ? null : new HashSet<int>();

    if (!isTraining)
    {
        if (outputSentences.Count != 0)
        {
            throw new ArgumentException($"The list for output sentences must be empty if current is not in training mode.");
        }

        for (int i = 0; i < batchSize; i++)
        {
            outputSentences.Add(new List<string>());
        }
    }

    // Pre-process for attention model
    AttentionPreProcessResult attPreProcessResult = decoder.PreProcess(encodedOutputs, batchSize, g);

    for (int i = 0; i < seqLen; i++)
    {
        //Get embedding for all sentence in the batch at position i
        List<IWeightTensor> inputs = new List<IWeightTensor>();
        for (int j = 0; j < batchSize; j++)
        {
            inputs.Add(g.PeekRow(embedding, ix_inputs[j]));
        }
        IWeightTensor inputsM = g.ConcatRows(inputs);

        //Decode output sentence at position i
        IWeightTensor eOutput = decoder.Decode(inputsM, attPreProcessResult, batchSize, g);

        //Softmax for output
        using (IWeightTensor probs = g.Softmax(eOutput, runGradients: false, inPlace: true))
        {
            if (isTraining)
            {
                //Calculate loss for each word in the batch
                for (int k = 0; k < batchSize; k++)
                {
                    using (IWeightTensor probs_k = g.PeekRow(probs, k, runGradients: false))
                    {
                        int ix_targets_k = m_modelMetaData.Vocab.GetTargetWordIndex(outputSentences[k][i]);
                        float score_k = probs_k.GetWeightAt(ix_targets_k);
                        if (i < originalOutputLengths[k])
                        {
                            cost += (float)-Math.Log(score_k);
                        }

                        probs_k.SetWeightAt(score_k - 1, ix_targets_k);
                        ix_inputs[k] = ix_targets_k;
                    }
                }

                eOutput.CopyWeightsToGradients(probs);
            }
            else
            {
                // Output "i"th target word
                int[] targetIdx = g.Argmax(probs, 1);
                List<string> targetWords = m_modelMetaData.Vocab.ConvertTargetIdsToString(targetIdx.ToList());
                for (int j = 0; j < targetWords.Count; j++)
                {
                    if (setEndSentId.Contains(j) == false)
                    {
                        outputSentences[j].Add(targetWords[j]);
                        if (targetWords[j] == ParallelCorpus.EOS)
                        {
                            setEndSentId.Add(j);
                        }
                    }
                }

                ix_inputs = targetIdx;
            }
        }

        if (isTraining)
        {
            ////Hacky: Run backward for last feed forward layer and dropout layer in order to save memory usage, since it's not time sequence dependency
            g.RunTopBackward();
            if (m_dropoutRatio > 0.0f)
            {
                g.RunTopBackward();
            }
        }
        else
        {
            if (setEndSentId.Count == batchSize)
            {
                // All target sentences in current batch are finished, so we exit.
                break;
            }
        }
    }

    return cost;
}

/// <summary>
/// Decode output sentences in training
/// </summary>
/// <param name="outputSentences"></param>
/// <param name="g"></param>
/// <param name="encodedOutputs"></param>
/// <param name="decoder"></param>
/// <param name="decoderFFLayer"></param>
/// <param name="Embedding"></param>
/// <param name="predictSentence"></param>
/// <returns></returns>
private float Decode(List<List<string>> outputSentences, IComputeGraph g, IWeightMatrix encodedOutputs, AttentionDecoder decoder, FeedForwardLayer decoderFFLayer, IWeightMatrix Embedding, out List<List<string>> predictSentence)
{
    predictSentence = null;

    float cost = 0.0f;

    var attPreProcessResult = decoder.PreProcess(encodedOutputs, g);

    var originalOutputLengths = PadSentences(outputSentences);
    int seqLen = outputSentences[0].Count;

    int[] ix_inputs = new int[m_batchSize];
    int[] ix_targets = new int[m_batchSize];
    for (int i = 0; i < ix_inputs.Length; i++)
    {
        ix_inputs[i] = (int)SENTTAGS.START;
    }

    for (int i = 0; i < seqLen + 1; i++)
    {
        //Get embedding for all sentence in the batch at position i
        List<IWeightMatrix> inputs = new List<IWeightMatrix>();
        for (int j = 0; j < m_batchSize; j++)
        {
            List<string> OutputSentence = outputSentences[j];

            ix_targets[j] = (int)SENTTAGS.UNK;
            if (i >= seqLen)
            {
                ix_targets[j] = (int)SENTTAGS.END;
            }
            else
            {
                if (m_tgtWordToIndex.ContainsKey(OutputSentence[i]))
                {
                    ix_targets[j] = m_tgtWordToIndex[OutputSentence[i]];
                }
            }

            var x = g.PeekRow(Embedding, ix_inputs[j]);

            inputs.Add(x);
        }

        var inputsM = g.ConcatRows(inputs);

        //Decode output sentence at position i
        var eOutput = decoder.Decode(inputsM, attPreProcessResult, g);
        if (m_dropoutRatio > 0.0f)
        {
            eOutput = g.Dropout(eOutput, m_dropoutRatio);
        }

        var o = decoderFFLayer.Process(eOutput, g);

        //Softmax for output
        // var o = g.MulAdd(eOutput, Whd, bds);
        var probs = g.Softmax(o, false);

        o.ReleaseWeight();

        //Calculate loss for each word in the batch
        List<IWeightMatrix> probs_g = g.UnFolderRow(probs, m_batchSize, false);
        for (int k = 0; k < m_batchSize; k++)
        {
            var probs_k = probs_g[k];
            var score_k = probs_k.GetWeightAt(ix_targets[k]);

            if (i < originalOutputLengths[k] + 1)
            {
                cost += (float)-Math.Log(score_k);
            }

            probs_k.SetWeightAt(score_k - 1, ix_targets[k]);

            ix_inputs[k] = ix_targets[k];
            probs_k.Dispose();
        }

        o.SetGradientByWeight(probs);

        //Hacky: Run backward for last feed forward layer and dropout layer in order to save memory usage, since it's not time sequence dependency
        g.RunTopBackward();
        g.RunTopBackward();
        if (m_dropoutRatio > 0.0f)
        {
            g.RunTopBackward();
        }
    }

    return cost;
}