/// <summary>
/// Runs a single forward pass of the seq2seq network over a placeholder input
/// sentence and dumps the resulting computation graph to a file for visualization.
/// </summary>
/// <param name="visNNFilePath">Path of the file the network visualization is written to.</param>
public void VisualizeNeuralNetwork(string visNNFilePath)
{
    // Grab the networks assigned to the default device (-1).
    (IEncoder encoder, IDecoder decoder, IWeightTensor srcEmbedding, IWeightTensor tgtEmbedding) = GetNetworksOnDeviceAt(-1);

    // Build a placeholder input sentence (null input => default tokens).
    List<List<string>> inputSeqs = ParallelCorpus.ConstructInputTokens(null);
    int batchSize = inputSeqs.Count;

    // Forward-only graph with visualization recording enabled.
    IComputeGraph g = CreateComputGraph(m_defaultDeviceId, needBack: false, visNetwork: true);
    AttentionDecoder rnnDecoder = decoder as AttentionDecoder;

    encoder.Reset(g.GetWeightFactory(), batchSize);
    rnnDecoder.Reset(g.GetWeightFactory(), batchSize);

    // Encoder forward pass.
    IWeightTensor encodedWeightMatrix = Encode(g, inputSeqs, encoder, srcEmbedding, null, null);

    // Attention pre-processing over the encoder output.
    AttentionPreProcessResult attPreProcessResult = rnnDecoder.PreProcess(encodedWeightMatrix, batchSize, g);

    // One decoder step seeded with the sentence-start tag, followed by softmax,
    // so the dumped graph covers the full encode->decode->output pipeline.
    IWeightTensor startEmbedding = g.PeekRow(tgtEmbedding, (int)SENTTAGS.START);
    IWeightTensor decoderOutput = rnnDecoder.Decode(startEmbedding, attPreProcessResult, batchSize, g);
    IWeightTensor probs = g.Softmax(decoderOutput);

    g.VisualizeNeuralNetToFile(visNNFilePath);
}
/// <summary>
/// Additive attention (matrix API): scores every source position against the
/// decoder state and returns one context row per batch element.
/// </summary>
public IWeightMatrix Perform(IWeightMatrix state, AttentionPreProcessResult attenPreProcessResult, IComputeGraph g)
{
    // Project the decoder state and broadcast it over every source position.
    IWeightMatrix repeatedBias = g.RepeatRows(bWa, state.Rows);
    IWeightMatrix projectedState = g.MulAdd(state, Wa, repeatedBias);
    IWeightMatrix broadcastState = g.RepeatRows(projectedState, attenPreProcessResult.inputsUnfolder[0].Rows);

    // Attention energies: tanh(Uh + Ws) * V.
    IWeightMatrix activated = g.AddTanh(attenPreProcessResult.uhs, broadcastState);
    IWeightMatrix energies = g.Mul(activated, V);

    // Split energies per batch element and transpose each slice.
    List<IWeightMatrix> energySlices = g.UnFolderRow(energies, m_batchSize);
    List<IWeightMatrix> transposedSlices = new List<IWeightMatrix>();
    for (int b = 0; b < m_batchSize; b++)
    {
        transposedSlices.Add(g.Transpose2(energySlices[b]));
    }

    // Normalize the stacked energies row-wise.
    IWeightMatrix stacked = g.ConcatRows(transposedSlices);
    IWeightMatrix weights = g.SoftmaxM(stacked);

    // Weighted sum of the encoder outputs for each batch element.
    List<IWeightMatrix> contextRows = new List<IWeightMatrix>();
    for (int b = 0; b < m_batchSize; b++)
    {
        contextRows.Add(g.Mul(g.PeekRow(weights, b), attenPreProcessResult.inputsUnfolder[b]));
    }

    return g.ConcatRows(contextRows);
}
/// <summary>
/// Additive attention over the batch-first encoder output for one decoder step.
/// Optionally feeds the attention weights into a coverage model.
/// </summary>
/// <param name="state">Decoder state, one row per batch element.</param>
/// <param name="attenPreProcessResult">Cached projections from PreProcess.</param>
/// <param name="batchSize">Number of sentences in the batch.</param>
/// <param name="graph">Parent compute graph; a named sub-graph is created for the intermediate ops.</param>
/// <returns>Attention context tensor built on the parent graph.</returns>
public IWeightTensor Perform(IWeightTensor state, AttentionPreProcessResult attenPreProcessResult, int batchSize, IComputeGraph graph)
{
    // inputsBatchFirst stacks batchSize * srcSeqLen rows, so this recovers the source length.
    int srcSeqLen = attenPreProcessResult.inputsBatchFirst.Rows / batchSize;
    using (IComputeGraph g = graph.CreateSubGraph(m_name))
    {
        // Affine decoder state
        IWeightTensor wc = g.Affine(state, m_Wa, m_bWa);
        // Expand dims from [batchSize x decoder_dim] to [batchSize x srcSeqLen x decoder_dim]
        IWeightTensor wc1 = g.View(wc, batchSize, 1, wc.Columns);
        IWeightTensor wcExp = g.Expand(wc1, batchSize, srcSeqLen, wc.Columns);
        IWeightTensor ggs = null;
        if (m_enableCoverageModel)
        {
            // Get coverage model status at {t-1} and fold it into the attention energies
            IWeightTensor wCoverage = g.Affine(m_coverage.Hidden, m_Wc, m_bWc);
            IWeightTensor wCoverage1 = g.View(wCoverage, batchSize, srcSeqLen, -1);
            ggs = g.AddTanh(attenPreProcessResult.uhs, wcExp, wCoverage1);
        }
        else
        {
            ggs = g.AddTanh(attenPreProcessResult.uhs, wcExp);
        }
        // Flatten energies, score against V, then reshape to [batchSize x srcSeqLen] for softmax.
        IWeightTensor ggss = g.View(ggs, batchSize * srcSeqLen, -1);
        IWeightTensor atten = g.Mul(ggss, m_V);
        IWeightTensor attenT = g.Transpose(atten);
        IWeightTensor attenT2 = g.View(attenT, batchSize, srcSeqLen);
        IWeightTensor attenSoftmax1 = g.Softmax(attenT2, inPlace: true);
        IWeightTensor attenSoftmax = g.View(attenSoftmax1, batchSize, 1, srcSeqLen);
        IWeightTensor inputs2 = g.View(attenPreProcessResult.inputsBatchFirst, batchSize, srcSeqLen, attenPreProcessResult.inputsBatchFirst.Columns);
        // NOTE(review): contexts is created on the PARENT graph, not the sub-graph —
        // presumably so the result stays valid after the using-block disposes g; confirm.
        IWeightTensor contexts = graph.MulBatch(attenSoftmax, inputs2, batchSize);
        if (m_enableCoverageModel)
        {
            // Concatenate tensor as input for coverage model:
            // [attention weight | encoder output | expanded decoder state] per source row.
            IWeightTensor aCoverage = g.View(attenSoftmax1, attenPreProcessResult.inputsBatchFirst.Rows, 1);
            IWeightTensor state2 = g.View(state, batchSize, 1, state.Columns);
            IWeightTensor state3 = g.Expand(state2, batchSize, srcSeqLen, state.Columns);
            IWeightTensor state4 = g.View(state3, batchSize * srcSeqLen, -1);
            IWeightTensor concate = g.ConcatColumns(aCoverage, attenPreProcessResult.inputsBatchFirst, state4);
            // Advance the coverage RNN on the parent graph as well.
            m_coverage.Step(concate, graph);
        }
        return (contexts);
    }
}
/// <summary>
/// Caches the per-sentence attention inputs: the Ua-projected encoder output and
/// the batch-first transposed inputs, both created in a named sub-graph.
/// </summary>
public AttentionPreProcessResult PreProcess(IWeightTensor inputs, int batchSize, IComputeGraph graph)
{
    IComputeGraph childGraph = graph.CreateSubGraph(m_name + "_PreProcess");

    AttentionPreProcessResult result = new AttentionPreProcessResult
    {
        uhs = childGraph.Affine(inputs, m_Ua, m_bUa),
        inputs = childGraph.TransposeBatch(inputs, batchSize)
    };

    return result;
}
/// <summary>
/// Caches the attention inputs (matrix API): the Ua projection of the encoder
/// output plus the inputs re-folded into batch order.
/// </summary>
public AttentionPreProcessResult PreProcess(IWeightMatrix inputs, IComputeGraph g)
{
    IWeightMatrix repeatedBias = g.RepeatRows(bUa, inputs.Rows);

    AttentionPreProcessResult result = new AttentionPreProcessResult
    {
        uhs = g.MulAdd(inputs, Ua, repeatedBias),
        inputs = g.ConcatRows(g.UnFolderRow(inputs, m_batchSize))
    };

    return result;
}
/// <summary>
/// Runs one decoding step (matrix API): computes the attention context from the
/// last decoder layer's cell state, then feeds input + context through every layer.
/// </summary>
/// <param name="input">Embedding of the current target token(s).</param>
/// <param name="attenPreProcessResult">Cached attention projections from PreProcess.</param>
/// <param name="g">Compute graph to build the ops on.</param>
/// <returns>Output of the last decoder layer.</returns>
public IWeightMatrix Decode(IWeightMatrix input, AttentionPreProcessResult attenPreProcessResult, IComputeGraph g)
{
    var V = input;
    // FIX (consistency): attend over the LAST decoder layer's cell state, matching the
    // IWeightTensor-based Decode overloads (m_decoders.LastOrDefault().Cell). This
    // previously used FirstOrDefault(), i.e. the first layer's state, which only
    // coincides with the intended behavior for a single-layer decoder.
    var lastStatus = this.decoders.LastOrDefault().ct;
    var context = attentionLayer.Perform(lastStatus, attenPreProcessResult, g);
    foreach (var decoder in decoders)
    {
        // Each layer consumes the shared context and the previous layer's output.
        V = decoder.Step(context, V, g);
    }
    return (V);
}
/// <summary>
/// Runs one decoding step: the attention context is computed from the last LSTM
/// layer's cell state and shared by every layer in the stack.
/// </summary>
public IWeightTensor Decode(IWeightTensor input, AttentionPreProcessResult attenPreProcessResult, int batchSize, IComputeGraph g)
{
    IWeightTensor lastCell = m_decoders.LastOrDefault().Cell;
    IWeightTensor context = m_attentionLayer.Perform(lastCell, attenPreProcessResult, batchSize, g);

    // Thread the output of each layer into the next one.
    IWeightTensor output = input;
    foreach (var decoderCell in m_decoders)
    {
        output = decoderCell.Step(context, output, g);
    }

    return output;
}
/// <summary>
/// Runs one decoding step through the LSTM stack (context computed from the last
/// layer's cell state) and applies dropout to the final output.
/// </summary>
public IWeightTensor Decode(IWeightTensor input, AttentionPreProcessResult attenPreProcessResult, int batchSize, IComputeGraph g)
{
    IWeightTensor lastCell = m_decoders.LastOrDefault().Cell;
    IWeightTensor context = m_attentionLayer.Perform(lastCell, attenPreProcessResult, batchSize, g);

    // Thread each layer's output into the next layer.
    IWeightTensor output = input;
    foreach (LSTMAttentionDecoderCell decoderCell in m_decoders)
    {
        output = decoderCell.Step(context, output, g);
    }

    // Dropout on the final layer's output before it reaches the output projection.
    return g.Dropout(output, batchSize, m_dropoutRatio, false);
}
/// <summary>
/// Additive attention (matrix API, batched): computes per-position softmax weights
/// and returns the batched weighted sum of the encoder outputs.
/// </summary>
public IWeightMatrix Perform(IWeightMatrix state, AttentionPreProcessResult attenPreProcessResult, IComputeGraph g)
{
    int srcSeqLen = attenPreProcessResult.inputs.Rows / m_batchSize;

    // Project the decoder state and broadcast it over every source position.
    IWeightMatrix repeatedBias = g.RepeatRows(bWa, state.Rows);
    IWeightMatrix projectedState = g.MulAdd(state, Wa, repeatedBias);
    IWeightMatrix broadcastState = g.RepeatRows(projectedState, srcSeqLen);

    // Attention energies: tanh(Uh + Ws) * V.
    IWeightMatrix energies = g.Mul(g.AddTanh(attenPreProcessResult.uhs, broadcastState), V);

    // Rearrange energies into [batch x srcSeqLen] and normalize per sentence.
    IWeightMatrix batchOrdered = g.PermuteBatch(energies, m_batchSize);
    IWeightMatrix reshaped = g.View(g.Transpose2(batchOrdered), m_batchSize, srcSeqLen);
    IWeightMatrix weights = g.Softmax(reshaped);

    // Batched weighted sum over the encoder outputs.
    return g.MulBatch(weights, attenPreProcessResult.inputs, m_batchSize);
}
/// <summary>
/// Caches attention inputs: projects the encoder output through Ua and reshapes
/// it to [batch, srcSeqLen, -1]; resets the coverage model when enabled.
/// </summary>
public AttentionPreProcessResult PreProcess(IWeightTensor encOutput, int batchSize, IComputeGraph g)
{
    int srcSeqLen = encOutput.Rows / batchSize;

    AttentionPreProcessResult result = new AttentionPreProcessResult { encOutput = encOutput };

    // Ua projection, then reshape so each sentence's source positions form one slice.
    IWeightTensor projected = g.Affine(result.encOutput, m_Ua, m_bUa);
    result.Uhs = g.View(projected, dims: new long[] { batchSize, srcSeqLen, -1 });

    if (m_enableCoverageModel)
    {
        // Coverage state is tracked per encoder output row.
        m_coverage.Reset(g.GetWeightFactory(), result.encOutput.Rows);
    }

    return result;
}
/// <summary>
/// Caches attention inputs: keeps the raw inputs, builds a batch-first transposed
/// copy, projects it through Ua into [batch, srcSeqLen, -1], and resets the
/// coverage model when enabled.
/// </summary>
public AttentionPreProcessResult PreProcess(IWeightTensor inputs, int batchSize, IComputeGraph g)
{
    int srcSeqLen = inputs.Rows / batchSize;

    AttentionPreProcessResult result = new AttentionPreProcessResult
    {
        rawInputs = inputs,
        inputsBatchFirst = g.TransposeBatch(inputs, batchSize)
    };

    // Ua projection of the batch-first inputs, reshaped per sentence.
    IWeightTensor projected = g.Affine(result.inputsBatchFirst, m_Ua, m_bUa);
    result.uhs = g.View(projected, batchSize, srcSeqLen, -1);

    if (m_enableCoverageModel)
    {
        // Coverage state is tracked per batch-first input row.
        m_coverage.Reset(g.GetWeightFactory(), result.inputsBatchFirst.Rows);
    }

    return result;
}
/// <summary>
/// Additive attention for one decoder step, built inside a named sub-graph:
/// scores each source position, softmax-normalizes per sentence, and returns the
/// batched weighted sum of the encoder outputs.
/// </summary>
public IWeightTensor Perform(IWeightTensor state, AttentionPreProcessResult attenPreProcessResult, int batchSize, IComputeGraph graph)
{
    IComputeGraph g = graph.CreateSubGraph(m_name);
    int srcSeqLen = attenPreProcessResult.inputs.Rows / batchSize;

    // Project the decoder state and broadcast it over all source positions.
    IWeightTensor projectedState = g.Affine(state, m_Wa, m_bWa);
    IWeightTensor broadcastState = g.RepeatRows(projectedState, srcSeqLen);

    // Attention energies: tanh(Uh + Ws) * V.
    IWeightTensor energies = g.Mul(g.AddTanh(attenPreProcessResult.uhs, broadcastState), m_V);

    // Rearrange energies to [batch x srcSeqLen] and normalize per sentence.
    IWeightTensor batchFirst = g.TransposeBatch(energies, batchSize);
    IWeightTensor flattened = g.View(g.Transpose(batchFirst), batchSize, srcSeqLen);
    IWeightTensor weights = g.Softmax(flattened, inPlace: true);

    // Batched weighted sum over the encoder outputs.
    IWeightTensor weights3D = g.View(weights, batchSize, weights.Rows / batchSize, weights.Columns);
    IWeightTensor inputs3D = g.View(attenPreProcessResult.inputs, batchSize, srcSeqLen, attenPreProcessResult.inputs.Columns);

    return g.MulBatch(weights3D, inputs3D, batchSize);
}
/// <summary>
/// Given an input sentence, generates output sentences with the seq2seq model
/// using beam search. Only a single input sentence per call is supported.
/// </summary>
/// <param name="input">Tokens of the source sentence.</param>
/// <param name="beamSearchSize">Beam width; 1 degenerates to greedy decoding.</param>
/// <param name="maxOutputLength">Hard cap on the number of decoding steps.</param>
/// <returns>Up to beamSearchSize candidate output sentences (as token lists).</returns>
public List<List<string>> Predict(List<string> input, int beamSearchSize = 1, int maxOutputLength = 100)
{
    (IEncoder encoder, IDecoder decoder, IWeightTensor srcEmbedding, IWeightTensor tgtEmbedding) = GetNetworksOnDeviceAt(-1);
    List<List<string>> inputSeqs = ParallelCorpus.ConstructInputTokens(input);
    int batchSize = 1; // For predict with beam search, we currently only supports one sentence per call
    IComputeGraph g = CreateComputGraph(m_defaultDeviceId, needBack: false);
    AttentionDecoder rnnDecoder = decoder as AttentionDecoder;
    encoder.Reset(g.GetWeightFactory(), batchSize);
    rnnDecoder.Reset(g.GetWeightFactory(), batchSize);

    // Construct beam search status list seeded with the sentence-start tag and
    // the decoder's initial cell/hidden states.
    List<BeamSearchStatus> bssList = new List<BeamSearchStatus>();
    BeamSearchStatus bss = new BeamSearchStatus();
    bss.OutputIds.Add((int)SENTTAGS.START);
    bss.CTs = rnnDecoder.GetCTs();
    bss.HTs = rnnDecoder.GetHTs();
    bssList.Add(bss);

    IWeightTensor encodedWeightMatrix = Encode(g, inputSeqs, encoder, srcEmbedding, null, null);
    AttentionPreProcessResult attPreProcessResult = rnnDecoder.PreProcess(encodedWeightMatrix, batchSize, g);

    List<BeamSearchStatus> newBSSList = new List<BeamSearchStatus>();
    bool finished = false;
    int outputLength = 0;
    while (finished == false && outputLength < maxOutputLength)
    {
        finished = true;
        for (int i = 0; i < bssList.Count; i++)
        {
            bss = bssList[i];
            if (bss.OutputIds[bss.OutputIds.Count - 1] == (int)SENTTAGS.END)
            {
                // Hypothesis already ended — carry it forward unchanged.
                newBSSList.Add(bss);
            }
            else if (bss.OutputIds.Count > maxOutputLength)
            {
                // Hypothesis hit the length cap — carry it forward unchanged.
                newBSSList.Add(bss);
            }
            else
            {
                finished = false;
                int ix_input = bss.OutputIds[bss.OutputIds.Count - 1];
                // Restore this hypothesis's decoder state before stepping;
                // the decoder state is mutated by Decode, so the order
                // SetCTs/SetHTs -> Decode -> GetCTs/GetHTs matters.
                rnnDecoder.SetCTs(bss.CTs);
                rnnDecoder.SetHTs(bss.HTs);
                IWeightTensor x = g.PeekRow(tgtEmbedding, ix_input);
                IWeightTensor eOutput = rnnDecoder.Decode(x, attPreProcessResult, batchSize, g);
                using (IWeightTensor probs = g.Softmax(eOutput))
                {
                    // Expand this hypothesis with the top-N next tokens.
                    List<int> preds = probs.GetTopNMaxWeightIdx(beamSearchSize);
                    for (int j = 0; j < preds.Count; j++)
                    {
                        BeamSearchStatus newBSS = new BeamSearchStatus();
                        newBSS.OutputIds.AddRange(bss.OutputIds);
                        newBSS.OutputIds.Add(preds[j]);
                        newBSS.CTs = rnnDecoder.GetCTs();
                        newBSS.HTs = rnnDecoder.GetHTs();
                        float score = probs.GetWeightAt(preds[j]);
                        // Accumulated negative log-likelihood; lower is better.
                        newBSS.Score = bss.Score;
                        newBSS.Score += (float)(-Math.Log(score));
                        //var lengthPenalty = Math.Pow((5.0f + newBSS.OutputIds.Count) / 6, 0.6);
                        //newBSS.Score /= (float)lengthPenalty;
                        newBSSList.Add(newBSS);
                    }
                }
            }
        }
        // Keep only the best beamSearchSize hypotheses for the next step.
        bssList = BeamSearch.GetTopNBSS(newBSSList, beamSearchSize);
        newBSSList.Clear();
        outputLength++;
    }

    // Convert output target word ids to real string
    List<List<string>> results = new List<List<string>>();
    for (int i = 0; i < bssList.Count; i++)
    {
        results.Add(m_modelMetaData.Vocab.ConvertTargetIdsToString(bssList[i].OutputIds));
    }
    return (results);
}
/// <summary>
/// Decode output sentences with the attention LSTM decoder. In training mode the
/// golden sentences drive teacher forcing and the cross-entropy cost is
/// accumulated with gradients injected directly into the softmax output; in
/// inference mode the decoder's own argmax predictions are appended to outputSnts.
/// </summary>
/// <param name="outputSnts">In training mode, they are golden target sentences, otherwise, they are target sentences generated by the decoder</param>
/// <param name="g">Compute graph to build ops on.</param>
/// <param name="encOutputs">Encoder output tensor.</param>
/// <param name="decoder">Attention LSTM decoder.</param>
/// <param name="tgtEmbedding">Target-side embedding matrix.</param>
/// <param name="batchSize">Number of sentences in the batch.</param>
/// <param name="isTraining">True for training (teacher forcing + cost), false for generation.</param>
/// <returns>Accumulated cross-entropy cost (0 in inference mode).</returns>
private float DecodeAttentionLSTM(List<List<string>> outputSnts, IComputeGraph g, IWeightTensor encOutputs, AttentionDecoder decoder, IWeightTensor tgtEmbedding, int batchSize, bool isTraining = true)
{
    float cost = 0.0f;
    // Seed each batch lane with the first golden token's id.
    int[] ix_inputs = new int[batchSize];
    for (int i = 0; i < ix_inputs.Length; i++)
    {
        ix_inputs[i] = m_modelMetaData.Vocab.GetTargetWordIndex(outputSnts[i][0]);
    }

    // Initialize variables according to current mode. In inference, 64 is the
    // fixed maximum number of decoding steps.
    List<int> originalOutputLengths = isTraining ? ParallelCorpus.PadSentences(outputSnts) : null;
    int seqLen = isTraining ? outputSnts[0].Count : 64;
    // NOTE(review): dropoutRatio is computed but never used in this method —
    // confirm whether it was meant to be passed to decoder.Decode.
    float dropoutRatio = isTraining ? m_dropoutRatio : 0.0f;
    HashSet<int> setEndSentId = isTraining ? null : new HashSet<int>();

    // Pre-process for attention model
    AttentionPreProcessResult attPreProcessResult = decoder.PreProcess(encOutputs, batchSize, g);

    // Starts at 1 because position 0 was consumed as the initial input above.
    for (int i = 1; i < seqLen; i++)
    {
        //Get embedding for all sentence in the batch at position i
        List<IWeightTensor> inputs = new List<IWeightTensor>();
        for (int j = 0; j < batchSize; j++)
        {
            inputs.Add(g.PeekRow(tgtEmbedding, ix_inputs[j]));
        }
        IWeightTensor inputsM = g.ConcatRows(inputs);

        //Decode output sentence at position i
        IWeightTensor eOutput = decoder.Decode(inputsM, attPreProcessResult, batchSize, g);

        //Softmax for output
        using (IWeightTensor probs = g.Softmax(eOutput, runGradients: false, inPlace: true))
        {
            if (isTraining)
            {
                //Calculate loss for each word in the batch
                for (int k = 0; k < batchSize; k++)
                {
                    using (IWeightTensor probs_k = g.PeekRow(probs, k, runGradients: false))
                    {
                        int ix_targets_k = m_modelMetaData.Vocab.GetTargetWordIndex(outputSnts[k][i]);
                        float score_k = probs_k.GetWeightAt(ix_targets_k);
                        // Only count the loss for real (non-padding) positions.
                        if (i < originalOutputLengths[k])
                        {
                            cost += (float)-Math.Log(score_k);
                        }
                        // Softmax cross-entropy gradient: p - 1 at the gold index.
                        probs_k.SetWeightAt(score_k - 1, ix_targets_k);
                        // Teacher forcing: next input is the gold token.
                        ix_inputs[k] = ix_targets_k;
                    }
                }
                // Inject the modified probabilities as gradients for backprop.
                eOutput.CopyWeightsToGradients(probs);
            }
            else
            {
                // Output "i"th target word via argmax over the vocabulary.
                int[] targetIdx = g.Argmax(probs, 1);
                List<string> targetWords = m_modelMetaData.Vocab.ConvertTargetIdsToString(targetIdx.ToList());
                for (int j = 0; j < targetWords.Count; j++)
                {
                    // Stop appending to sentences that already emitted EOS.
                    if (setEndSentId.Contains(j) == false)
                    {
                        outputSnts[j].Add(targetWords[j]);
                        if (targetWords[j] == ParallelCorpus.EOS)
                        {
                            setEndSentId.Add(j);
                        }
                    }
                }
                if (setEndSentId.Count == batchSize)
                {
                    // All target sentences in current batch are finished, so we exit.
                    break;
                }
                ix_inputs = targetIdx;
            }
        }
    }
    return (cost);
}
/// <summary>
/// Decode output sentences. In training mode the golden sentences drive teacher
/// forcing and the cross-entropy cost is accumulated (with gradients injected
/// into the softmax output and a per-step partial backward pass); in inference
/// mode outputSentences must be empty and is filled with argmax predictions.
/// </summary>
/// <param name="outputSentences">In training mode, they are golden target sentences, otherwise, they are target sentences generated by the decoder</param>
/// <param name="g">Compute graph to build ops on.</param>
/// <param name="encodedOutputs">Encoder output tensor.</param>
/// <param name="decoder">Attention LSTM decoder.</param>
/// <param name="embedding">Target-side embedding matrix.</param>
/// <param name="batchSize">Number of sentences in the batch.</param>
/// <param name="isTraining">True for training (teacher forcing + cost), false for generation.</param>
/// <returns>Accumulated cross-entropy cost (0 in inference mode).</returns>
private float Decode(List<List<string>> outputSentences, IComputeGraph g, IWeightTensor encodedOutputs, AttentionDecoder decoder, IWeightTensor embedding, int batchSize, bool isTraining = true)
{
    float cost = 0.0f;
    // Every batch lane starts from the sentence-start tag.
    int[] ix_inputs = new int[batchSize];
    for (int i = 0; i < ix_inputs.Length; i++)
    {
        ix_inputs[i] = (int)SENTTAGS.START;
    }

    // Initialize variables according to current mode. In inference, 64 is the
    // fixed maximum number of decoding steps.
    List<int> originalOutputLengths = isTraining ? ParallelCorpus.PadSentences(outputSentences) : null;
    int seqLen = isTraining ? outputSentences[0].Count : 64;
    // NOTE(review): dropoutRatio is computed but never used in this method —
    // confirm whether it was meant to be passed to decoder.Decode.
    float dropoutRatio = isTraining ? m_dropoutRatio : 0.0f;
    HashSet<int> setEndSentId = isTraining ? null : new HashSet<int>();

    if (!isTraining)
    {
        // Inference fills outputSentences itself, so the caller must pass it empty.
        if (outputSentences.Count != 0)
        {
            throw new ArgumentException($"The list for output sentences must be empty if current is not in training mode.");
        }
        for (int i = 0; i < batchSize; i++)
        {
            outputSentences.Add(new List<string>());
        }
    }

    // Pre-process for attention model
    AttentionPreProcessResult attPreProcessResult = decoder.PreProcess(encodedOutputs, batchSize, g);

    for (int i = 0; i < seqLen; i++)
    {
        //Get embedding for all sentence in the batch at position i
        List<IWeightTensor> inputs = new List<IWeightTensor>();
        for (int j = 0; j < batchSize; j++)
        {
            inputs.Add(g.PeekRow(embedding, ix_inputs[j]));
        }
        IWeightTensor inputsM = g.ConcatRows(inputs);

        //Decode output sentence at position i
        IWeightTensor eOutput = decoder.Decode(inputsM, attPreProcessResult, batchSize, g);

        //Softmax for output
        using (IWeightTensor probs = g.Softmax(eOutput, runGradients: false, inPlace: true))
        {
            if (isTraining)
            {
                //Calculate loss for each word in the batch
                for (int k = 0; k < batchSize; k++)
                {
                    using (IWeightTensor probs_k = g.PeekRow(probs, k, runGradients: false))
                    {
                        int ix_targets_k = m_modelMetaData.Vocab.GetTargetWordIndex(outputSentences[k][i]);
                        float score_k = probs_k.GetWeightAt(ix_targets_k);
                        // Only count the loss for real (non-padding) positions.
                        if (i < originalOutputLengths[k])
                        {
                            cost += (float)-Math.Log(score_k);
                        }
                        // Softmax cross-entropy gradient: p - 1 at the gold index.
                        probs_k.SetWeightAt(score_k - 1, ix_targets_k);
                        // Teacher forcing: next input is the gold token.
                        ix_inputs[k] = ix_targets_k;
                    }
                }
                // Inject the modified probabilities as gradients for backprop.
                eOutput.CopyWeightsToGradients(probs);
            }
            else
            {
                // Output "i"th target word via argmax over the vocabulary.
                int[] targetIdx = g.Argmax(probs, 1);
                List<string> targetWords = m_modelMetaData.Vocab.ConvertTargetIdsToString(targetIdx.ToList());
                for (int j = 0; j < targetWords.Count; j++)
                {
                    // Stop appending to sentences that already emitted EOS.
                    if (setEndSentId.Contains(j) == false)
                    {
                        outputSentences[j].Add(targetWords[j]);
                        if (targetWords[j] == ParallelCorpus.EOS)
                        {
                            setEndSentId.Add(j);
                        }
                    }
                }
                ix_inputs = targetIdx;
            }
        }

        if (isTraining)
        {
            ////Hacky: Run backward for last feed forward layer and dropout layer in order to save memory usage, since it's not time sequence dependency
            g.RunTopBackward();
            if (m_dropoutRatio > 0.0f)
            {
                // Second top-level backward covers the dropout node when present.
                g.RunTopBackward();
            }
        }
        else
        {
            if (setEndSentId.Count == batchSize)
            {
                // All target sentences in current batch are finished, so we exit.
                break;
            }
        }
    }
    return (cost);
}
/// <summary>
/// Additive attention over the encoder output for one decoder step; when the
/// coverage model is enabled, its state is updated with the new attention weights.
/// </summary>
public IWeightTensor Perform(IWeightTensor state, AttentionPreProcessResult attnPre, int batchSize, IComputeGraph graph)
{
    int srcSeqLen = attnPre.encOutput.Rows / batchSize;
    using (IComputeGraph g = graph.CreateSubGraph(m_name))
    {
        // Project the decoder state, then expand [batch x dim] -> [batch x srcSeqLen x dim].
        IWeightTensor projectedState = g.Affine(state, m_Wa, m_bWa);
        IWeightTensor projectedState3D = g.View(projectedState, dims: new long[] { batchSize, 1, projectedState.Columns });
        IWeightTensor expandedState = g.Expand(projectedState3D, dims: new long[] { batchSize, srcSeqLen, projectedState.Columns });

        IWeightTensor activations;
        if (m_enableCoverageModel)
        {
            // Fold the coverage status from step {t-1} into the attention energies.
            IWeightTensor coverageProj = g.Affine(m_coverage.Hidden, m_Wc, m_bWc);
            IWeightTensor coverageProj3D = g.View(coverageProj, dims: new long[] { batchSize, srcSeqLen, -1 });
            activations = g.AddTanh(attnPre.Uhs, expandedState, coverageProj3D);
        }
        else
        {
            activations = g.AddTanh(attnPre.Uhs, expandedState);
        }

        // Score against V, reshape to [batch x srcSeqLen], and softmax-normalize.
        IWeightTensor activationsFlat = g.View(activations, dims: new long[] { batchSize * srcSeqLen, -1 });
        IWeightTensor energies = g.Mul(activationsFlat, m_V);
        IWeightTensor energiesT = g.Transpose(energies);
        IWeightTensor energies2D = g.View(energiesT, dims: new long[] { batchSize, srcSeqLen });
        IWeightTensor weights = g.Softmax(energies2D, inPlace: true);
        IWeightTensor weights3D = g.View(weights, dims: new long[] { batchSize, 1, srcSeqLen });

        // Weighted sum of the encoder outputs, created on the PARENT graph so the
        // result remains valid after this sub-graph is disposed.
        IWeightTensor encOutput3D = g.View(attnPre.encOutput, dims: new long[] { batchSize, srcSeqLen, attnPre.encOutput.Columns });
        IWeightTensor contexts = graph.MulBatch(weights3D, encOutput3D, batchSize);
        contexts = graph.View(contexts, dims: new long[] { batchSize, attnPre.encOutput.Columns });

        if (m_enableCoverageModel)
        {
            // Coverage input per source row: [attention weight | encoder output | expanded state].
            IWeightTensor weightsCol = g.View(weights, dims: new long[] { attnPre.encOutput.Rows, 1 });
            IWeightTensor state3D = g.View(state, dims: new long[] { batchSize, 1, state.Columns });
            IWeightTensor stateExpanded = g.Expand(state3D, dims: new long[] { batchSize, srcSeqLen, state.Columns });
            IWeightTensor stateFlat = g.View(stateExpanded, dims: new long[] { batchSize * srcSeqLen, -1 });
            IWeightTensor coverageInput = g.ConcatColumns(weightsCol, attnPre.encOutput, stateFlat);
            m_coverage.Step(coverageInput, graph);
        }

        return contexts;
    }
}