public void Load(string modelFilePath)
{
    EncodedModelFilePath = modelFilePath;

    ModelAttentionData tosave = new ModelAttentionData();
    BinaryFormatter bf = new BinaryFormatter();
    FileStream fs = new FileStream(EncodedModelFilePath, FileMode.Open, FileAccess.Read);
    tosave = bf.Deserialize(fs) as ModelAttentionData;
    fs.Close();
    fs.Dispose();

    this.bd = tosave.bd;
    this.clipval = tosave.clipval;
    this.decoder = tosave.decoder;
    this.Depth = tosave.Depth;
    this.encoder = tosave.encoder;
    this.HiddenSize = tosave.hidden_sizes;
    this.learning_rate = tosave.learning_rate;
    this.WordVectorSize = tosave.letter_size;
    this.max_word = 100;
    this.regc = tosave.regc;
    this.reversEncoder = tosave.ReversEncoder;
    this.UseDropout = tosave.UseDropout;
    this.Whd = tosave.Whd;

    this.s_Embedding = tosave.s_Wil;
    this.s_wordToIndex = tosave.s_wordToIndex;
    this.s_indexToWord = tosave.s_indexToWord;

    this.t_Embedding = tosave.t_Wil;
    this.t_wordToIndex = tosave.t_wordToIndex;
    this.t_indexToWord = tosave.t_indexToWord;
}
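// A minimal usage sketch (assumption, not part of the original source): restore a model that was
// previously serialized by the matching save routine. The file path "seq2seq.model" and the way
// the AttentionSeq2Seq instance is obtained are hypothetical, for illustration only.
private static AttentionSeq2Seq LoadPretrainedModel(AttentionSeq2Seq seq2seq)
{
    // Load() deserializes the ModelAttentionData payload and overwrites the current
    // encoder/decoder weights, embeddings, vocabularies and hyper-parameters in place.
    seq2seq.Load("seq2seq.model");
    return seq2seq;
}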
public void VisualizeNeuralNetwork(string visNNFilePath)
{
    (IEncoder encoder, IDecoder decoder, IWeightTensor srcEmbedding, IWeightTensor tgtEmbedding) = GetNetworksOnDeviceAt(-1);

    // Build input sentence
    List<List<string>> inputSeqs = ParallelCorpus.ConstructInputTokens(null);
    int batchSize = inputSeqs.Count;

    IComputeGraph g = CreateComputGraph(m_defaultDeviceId, needBack: false, visNetwork: true);
    AttentionDecoder rnnDecoder = decoder as AttentionDecoder;

    encoder.Reset(g.GetWeightFactory(), batchSize);
    rnnDecoder.Reset(g.GetWeightFactory(), batchSize);

    // Run encoder
    IWeightTensor encodedWeightMatrix = Encode(g, inputSeqs, encoder, srcEmbedding, null, null);

    // Prepare for attention over encoder-decoder
    AttentionPreProcessResult attPreProcessResult = rnnDecoder.PreProcess(encodedWeightMatrix, batchSize, g);

    // Run decoder
    IWeightTensor x = g.PeekRow(tgtEmbedding, (int)SENTTAGS.START);
    IWeightTensor eOutput = rnnDecoder.Decode(x, attPreProcessResult, batchSize, g);
    IWeightTensor probs = g.Softmax(eOutput);

    g.VisualizeNeuralNetToFile(visNNFilePath);
}
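// A short usage sketch (assumption, not part of the original source): dump the network structure
// for inspection. The output file name is hypothetical; VisualizeNeuralNetToFile determines the
// actual output format.
private static void DumpNetwork(AttentionSeq2Seq seq2seq)
{
    // Runs one forward pass over a dummy input (ConstructInputTokens(null)) on the default
    // device and writes the resulting compute graph to the given path.
    seq2seq.VisualizeNeuralNetwork("seq2seq_network.vis");
}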
private void CleanWeightsCash(Encoder encoder, Encoder ReversEncoder, AttentionDecoder decoder, WeightMatrix Whd, WeightMatrix bd, WeightMatrix s_Embedding, WeightMatrix t_Embedding)
{
    var model = encoder.getParams();
    model.AddRange(decoder.getParams());
    model.AddRange(ReversEncoder.getParams());
    model.Add(s_Embedding);
    model.Add(t_Embedding);
    model.Add(Whd);
    model.Add(bd);

    solver.CleanCash(model);
}
private void UpdateParameters(Encoder encoder, Encoder ReversEncoder, AttentionDecoder decoder, WeightMatrix Whd, WeightMatrix bd, WeightMatrix s_Embedding, WeightMatrix t_Embedding)
{
    var model = encoder.getParams();
    model.AddRange(decoder.getParams());
    model.AddRange(ReversEncoder.getParams());
    model.Add(s_Embedding);
    model.Add(t_Embedding);
    model.Add(Whd);
    model.Add(bd);

    solver.UpdateWeights(model, learning_rate, regc, clipval);
}
public AttentionSeq2Seq(int inputSize, int hiddenSize, int depth, Corpus trainCorpus, string srcVocabFilePath, string tgtVocabFilePath,
    string srcEmbeddingFilePath, string tgtEmbeddingFilePath, bool useSparseFeature, bool useDropout, string modelFilePath)
{
    this.TrainCorpus = trainCorpus;
    this.Depth = depth;                   // number of stacked hidden layers
    WordVectorSize = inputSize;           // size of word embeddings
    EncodedModelFilePath = modelFilePath;
    this.HiddenSize = hiddenSize;

    if (String.IsNullOrEmpty(srcVocabFilePath) == false && String.IsNullOrEmpty(tgtVocabFilePath) == false)
    {
        Logger.WriteLine($"Loading vocabulary files from '{srcVocabFilePath}' and '{tgtVocabFilePath}'...");
        LoadVocab(srcVocabFilePath, tgtVocabFilePath);
    }
    else
    {
        Logger.WriteLine("Building vocabulary from training corpus...");
        BuildVocab(trainCorpus);
    }

    this.Whd = new WeightMatrix(HiddenSize, t_vocab.Count + 3, true);
    this.bd = new WeightMatrix(1, t_vocab.Count + 3, 0);

    s_Embedding = new WeightMatrix(s_vocab.Count, WordVectorSize, true);
    t_Embedding = new WeightMatrix(t_vocab.Count + 3, WordVectorSize, true);

    if (String.IsNullOrEmpty(srcEmbeddingFilePath) == false)
    {
        Logger.WriteLine($"Loading ExtEmbedding model from '{srcEmbeddingFilePath}' for source side.");
        LoadWordEmbedding(srcEmbeddingFilePath, s_Embedding, s_wordToIndex);
    }

    if (String.IsNullOrEmpty(tgtEmbeddingFilePath) == false)
    {
        Logger.WriteLine($"Loading ExtEmbedding model from '{tgtEmbeddingFilePath}' for target side.");
        LoadWordEmbedding(tgtEmbeddingFilePath, t_Embedding, t_wordToIndex);
    }

    encoder = new Encoder(HiddenSize, WordVectorSize, depth);
    reversEncoder = new Encoder(HiddenSize, WordVectorSize, depth);

    int sparseFeatureSize = useSparseFeature ? s_vocab.Count : 0;
    decoder = new AttentionDecoder(sparseFeatureSize, HiddenSize, WordVectorSize, depth);
}
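// A construction sketch (assumption, not part of the original source): all sizes and file paths
// below are hypothetical. Passing empty vocabulary paths makes the constructor build the
// vocabularies from the training corpus, and empty embedding paths skip external embeddings.
private static AttentionSeq2Seq CreateModel(Corpus trainCorpus)
{
    return new AttentionSeq2Seq(
        inputSize: 128,                  // word embedding size
        hiddenSize: 256,                 // hidden layer size
        depth: 2,                        // number of stacked layers
        trainCorpus: trainCorpus,
        srcVocabFilePath: "",            // empty -> build source vocab from corpus
        tgtVocabFilePath: "",            // empty -> build target vocab from corpus
        srcEmbeddingFilePath: "",        // empty -> no external source embeddings
        tgtEmbeddingFilePath: "",        // empty -> no external target embeddings
        useSparseFeature: false,
        useDropout: false,
        modelFilePath: "seq2seq.model");
}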
private void InitWeights()
{
    Logger.WriteLine($"Initializing weights...");

    m_srcEmbedding = new IWeightMatrix[m_deviceIds.Length];
    m_tgtEmbedding = new IWeightMatrix[m_deviceIds.Length];
    m_biEncoder = new BiEncoder[m_deviceIds.Length];
    m_decoder = new AttentionDecoder[m_deviceIds.Length];
    m_decoderFFLayer = new FeedForwardLayer[m_deviceIds.Length];

    for (int i = 0; i < m_deviceIds.Length; i++)
    {
        Logger.WriteLine($"Initializing weights for device '{m_deviceIds[i]}'");
        if (m_archType == ArchTypeEnums.GPU_CUDA)
        {
            //m_Whd[i] = new WeightTensor(HiddenSize, m_tgtIndexToWord.Count + 3, m_deviceIds[i], true);
            //m_bd[i] = new WeightTensor(1, m_tgtIndexToWord.Count + 3, 0, m_deviceIds[i]);

            m_srcEmbedding[i] = new WeightTensor(m_srcIndexToWord.Count, WordVectorSize, m_deviceIds[i], true);
            m_tgtEmbedding[i] = new WeightTensor(m_tgtIndexToWord.Count + 3, WordVectorSize, m_deviceIds[i], true);
        }
        else
        {
            //m_Whd[i] = new WeightMatrix(HiddenSize, m_tgtIndexToWord.Count + 3, true);
            //m_bd[i] = new WeightMatrix(1, m_tgtIndexToWord.Count + 3, 0);

            m_srcEmbedding[i] = new WeightMatrix(m_srcIndexToWord.Count, WordVectorSize, true);
            m_tgtEmbedding[i] = new WeightMatrix(m_tgtIndexToWord.Count + 3, WordVectorSize, true);
        }

        Logger.WriteLine($"Initializing encoders and decoders for device '{m_deviceIds[i]}'...");

        m_biEncoder[i] = new BiEncoder(m_batchSize, HiddenSize, WordVectorSize, Depth, m_archType, m_deviceIds[i]);
        m_decoder[i] = new AttentionDecoder(m_batchSize, HiddenSize, WordVectorSize, HiddenSize * 2, Depth, m_archType, m_deviceIds[i]);
        m_decoderFFLayer[i] = new FeedForwardLayer(HiddenSize, m_tgtIndexToWord.Count + 3, m_archType, m_deviceIds[i]);
    }

    InitWeightsFactory();
}
private (IEncoder[], AttentionDecoder[]) CreateEncoderDecoder()
{
    Logger.WriteLine($"Creating encoders and decoders...");

    IEncoder[] encoder = new IEncoder[m_deviceIds.Length];
    AttentionDecoder[] decoder = new AttentionDecoder[m_deviceIds.Length];

    for (int i = 0; i < m_deviceIds.Length; i++)
    {
        if (m_encoderType == EncoderTypeEnums.BiLSTM)
        {
            encoder[i] = new BiEncoder("BiLSTMEncoder", m_batchSize, m_hiddenDim, m_embeddingDim, m_encoderLayerDepth, m_deviceIds[i]);
            // The bi-directional encoder concatenates forward and backward states, so the decoder gets a context size of m_hiddenDim * 2
            decoder[i] = new AttentionDecoder("AttnLSTMDecoder", m_batchSize, m_hiddenDim, m_embeddingDim, m_hiddenDim * 2, m_decoderLayerDepth, m_deviceIds[i]);
        }
        else
        {
            encoder[i] = new TransformerEncoder("TransformerEncoder", m_batchSize, m_multiHeadNum, m_hiddenDim, m_embeddingDim, m_encoderLayerDepth, m_deviceIds[i]);
            decoder[i] = new AttentionDecoder("AttnLSTMDecoder", m_batchSize, m_hiddenDim, m_embeddingDim, m_hiddenDim, m_decoderLayerDepth, m_deviceIds[i]);
        }
    }

    return (encoder, decoder);
}
/// <summary>
/// Given an input sentence, generate the output sentence with the seq2seq model using beam search
/// </summary>
/// <param name="input"></param>
/// <param name="beamSearchSize"></param>
/// <param name="maxOutputLength"></param>
/// <returns></returns>
public List<List<string>> Predict(List<string> input, int beamSearchSize = 1, int maxOutputLength = 100)
{
    (IEncoder encoder, IDecoder decoder, IWeightTensor srcEmbedding, IWeightTensor tgtEmbedding) = GetNetworksOnDeviceAt(-1);

    List<List<string>> inputSeqs = ParallelCorpus.ConstructInputTokens(input);
    int batchSize = 1; // For prediction with beam search, we currently only support one sentence per call

    IComputeGraph g = CreateComputGraph(m_defaultDeviceId, needBack: false);
    AttentionDecoder rnnDecoder = decoder as AttentionDecoder;

    encoder.Reset(g.GetWeightFactory(), batchSize);
    rnnDecoder.Reset(g.GetWeightFactory(), batchSize);

    // Construct beam search status list
    List<BeamSearchStatus> bssList = new List<BeamSearchStatus>();

    BeamSearchStatus bss = new BeamSearchStatus();
    bss.OutputIds.Add((int)SENTTAGS.START);
    bss.CTs = rnnDecoder.GetCTs();
    bss.HTs = rnnDecoder.GetHTs();
    bssList.Add(bss);

    IWeightTensor encodedWeightMatrix = Encode(g, inputSeqs, encoder, srcEmbedding, null, null);
    AttentionPreProcessResult attPreProcessResult = rnnDecoder.PreProcess(encodedWeightMatrix, batchSize, g);

    List<BeamSearchStatus> newBSSList = new List<BeamSearchStatus>();
    bool finished = false;
    int outputLength = 0;
    while (finished == false && outputLength < maxOutputLength)
    {
        finished = true;
        for (int i = 0; i < bssList.Count; i++)
        {
            bss = bssList[i];
            if (bss.OutputIds[bss.OutputIds.Count - 1] == (int)SENTTAGS.END)
            {
                newBSSList.Add(bss);
            }
            else if (bss.OutputIds.Count > maxOutputLength)
            {
                newBSSList.Add(bss);
            }
            else
            {
                finished = false;
                int ix_input = bss.OutputIds[bss.OutputIds.Count - 1];
                rnnDecoder.SetCTs(bss.CTs);
                rnnDecoder.SetHTs(bss.HTs);

                IWeightTensor x = g.PeekRow(tgtEmbedding, ix_input);
                IWeightTensor eOutput = rnnDecoder.Decode(x, attPreProcessResult, batchSize, g);
                using (IWeightTensor probs = g.Softmax(eOutput))
                {
                    List<int> preds = probs.GetTopNMaxWeightIdx(beamSearchSize);
                    for (int j = 0; j < preds.Count; j++)
                    {
                        BeamSearchStatus newBSS = new BeamSearchStatus();
                        newBSS.OutputIds.AddRange(bss.OutputIds);
                        newBSS.OutputIds.Add(preds[j]);
                        newBSS.CTs = rnnDecoder.GetCTs();
                        newBSS.HTs = rnnDecoder.GetHTs();

                        float score = probs.GetWeightAt(preds[j]);
                        newBSS.Score = bss.Score;
                        newBSS.Score += (float)(-Math.Log(score));

                        //var lengthPenalty = Math.Pow((5.0f + newBSS.OutputIds.Count) / 6, 0.6);
                        //newBSS.Score /= (float)lengthPenalty;

                        newBSSList.Add(newBSS);
                    }
                }
            }
        }

        bssList = BeamSearch.GetTopNBSS(newBSSList, beamSearchSize);
        newBSSList.Clear();

        outputLength++;
    }

    // Convert output target word ids back to strings
    List<List<string>> results = new List<List<string>>();
    for (int i = 0; i < bssList.Count; i++)
    {
        results.Add(m_modelMetaData.Vocab.ConvertTargetIdsToString(bssList[i].OutputIds));
    }

    return results;
}
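// A prediction sketch (assumption, not part of the original source): the tokenized input and the
// beam settings are hypothetical. Predict() currently handles a single sentence per call and
// returns the best hypotheses found by beam search, already converted back to words.
private static void TranslateExample(AttentionSeq2Seq seq2seq)
{
    List<string> input = new List<string>() { "this", "is", "a", "test", "." };
    List<List<string>> hypotheses = seq2seq.Predict(input, beamSearchSize: 3, maxOutputLength: 50);

    foreach (List<string> hypothesis in hypotheses)
    {
        Console.WriteLine(string.Join(" ", hypothesis));
    }
}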
/// <summary>
/// Decode output sentences in training
/// </summary>
/// <param name="outputSnts">In training mode, they are the golden target sentences; otherwise, they are the target sentences generated by the decoder</param>
/// <param name="g"></param>
/// <param name="encOutputs"></param>
/// <param name="decoder"></param>
/// <param name="tgtEmbedding"></param>
/// <returns></returns>
private float DecodeAttentionLSTM(List<List<string>> outputSnts, IComputeGraph g, IWeightTensor encOutputs, AttentionDecoder decoder, IWeightTensor tgtEmbedding, int batchSize, bool isTraining = true)
{
    float cost = 0.0f;
    int[] ix_inputs = new int[batchSize];
    for (int i = 0; i < ix_inputs.Length; i++)
    {
        ix_inputs[i] = m_modelMetaData.Vocab.GetTargetWordIndex(outputSnts[i][0]);
    }

    // Initialize variables according to the current mode
    List<int> originalOutputLengths = isTraining ? ParallelCorpus.PadSentences(outputSnts) : null;
    int seqLen = isTraining ? outputSnts[0].Count : 64;
    float dropoutRatio = isTraining ? m_dropoutRatio : 0.0f;
    HashSet<int> setEndSentId = isTraining ? null : new HashSet<int>();

    // Pre-process for the attention model
    AttentionPreProcessResult attPreProcessResult = decoder.PreProcess(encOutputs, batchSize, g);
    for (int i = 1; i < seqLen; i++)
    {
        // Get embeddings for all sentences in the batch at position i
        List<IWeightTensor> inputs = new List<IWeightTensor>();
        for (int j = 0; j < batchSize; j++)
        {
            inputs.Add(g.PeekRow(tgtEmbedding, ix_inputs[j]));
        }
        IWeightTensor inputsM = g.ConcatRows(inputs);

        // Decode output sentence at position i
        IWeightTensor eOutput = decoder.Decode(inputsM, attPreProcessResult, batchSize, g);

        // Softmax for output
        using (IWeightTensor probs = g.Softmax(eOutput, runGradients: false, inPlace: true))
        {
            if (isTraining)
            {
                // Calculate loss for each word in the batch
                for (int k = 0; k < batchSize; k++)
                {
                    using (IWeightTensor probs_k = g.PeekRow(probs, k, runGradients: false))
                    {
                        int ix_targets_k = m_modelMetaData.Vocab.GetTargetWordIndex(outputSnts[k][i]);
                        float score_k = probs_k.GetWeightAt(ix_targets_k);
                        if (i < originalOutputLengths[k])
                        {
                            cost += (float)-Math.Log(score_k);
                        }

                        probs_k.SetWeightAt(score_k - 1, ix_targets_k);
                        ix_inputs[k] = ix_targets_k;
                    }
                }
                eOutput.CopyWeightsToGradients(probs);
            }
            else
            {
                // Output the i-th target word
                int[] targetIdx = g.Argmax(probs, 1);
                List<string> targetWords = m_modelMetaData.Vocab.ConvertTargetIdsToString(targetIdx.ToList());
                for (int j = 0; j < targetWords.Count; j++)
                {
                    if (setEndSentId.Contains(j) == false)
                    {
                        outputSnts[j].Add(targetWords[j]);
                        if (targetWords[j] == ParallelCorpus.EOS)
                        {
                            setEndSentId.Add(j);
                        }
                    }
                }

                if (setEndSentId.Count == batchSize)
                {
                    // All target sentences in the current batch are finished, so we exit
                    break;
                }

                ix_inputs = targetIdx;
            }
        }
    }

    return cost;
}
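// Explanatory sketch (not part of the original source): the training branch above relies on the
// standard softmax + cross-entropy identity. For p = softmax(o) and a one-hot target y,
// the loss is L = -log(p[target]) and its gradient w.r.t. the pre-softmax output o is dL/do = p - y.
// That is why the code subtracts 1 from the probability at the target index and copies the result
// into the gradients of the decoder output, with no explicit softmax backward pass.
private static float[] SoftmaxCrossEntropyGradient(float[] probs, int targetIdx)
{
    // grad[i] = p[i] for i != targetIdx, and p[targetIdx] - 1 at the target index
    float[] grad = (float[])probs.Clone();
    grad[targetIdx] -= 1.0f;
    return grad;
}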
/// <summary>
/// Decode output sentences in training
/// </summary>
/// <param name="outputSentences">In training mode, they are the golden target sentences; otherwise, they are the target sentences generated by the decoder</param>
/// <param name="g"></param>
/// <param name="encodedOutputs"></param>
/// <param name="decoder"></param>
/// <param name="embedding"></param>
/// <returns></returns>
private float Decode(List<List<string>> outputSentences, IComputeGraph g, IWeightTensor encodedOutputs, AttentionDecoder decoder, IWeightTensor embedding, int batchSize, bool isTraining = true)
{
    float cost = 0.0f;
    int[] ix_inputs = new int[batchSize];
    for (int i = 0; i < ix_inputs.Length; i++)
    {
        ix_inputs[i] = (int)SENTTAGS.START;
    }

    // Initialize variables according to the current mode
    List<int> originalOutputLengths = isTraining ? ParallelCorpus.PadSentences(outputSentences) : null;
    int seqLen = isTraining ? outputSentences[0].Count : 64;
    float dropoutRatio = isTraining ? m_dropoutRatio : 0.0f;
    HashSet<int> setEndSentId = isTraining ? null : new HashSet<int>();

    if (!isTraining)
    {
        if (outputSentences.Count != 0)
        {
            throw new ArgumentException($"The list for output sentences must be empty if the model is not in training mode.");
        }
        for (int i = 0; i < batchSize; i++)
        {
            outputSentences.Add(new List<string>());
        }
    }

    // Pre-process for the attention model
    AttentionPreProcessResult attPreProcessResult = decoder.PreProcess(encodedOutputs, batchSize, g);
    for (int i = 0; i < seqLen; i++)
    {
        // Get embeddings for all sentences in the batch at position i
        List<IWeightTensor> inputs = new List<IWeightTensor>();
        for (int j = 0; j < batchSize; j++)
        {
            inputs.Add(g.PeekRow(embedding, ix_inputs[j]));
        }
        IWeightTensor inputsM = g.ConcatRows(inputs);

        // Decode output sentence at position i
        IWeightTensor eOutput = decoder.Decode(inputsM, attPreProcessResult, batchSize, g);

        // Softmax for output
        using (IWeightTensor probs = g.Softmax(eOutput, runGradients: false, inPlace: true))
        {
            if (isTraining)
            {
                // Calculate loss for each word in the batch
                for (int k = 0; k < batchSize; k++)
                {
                    using (IWeightTensor probs_k = g.PeekRow(probs, k, runGradients: false))
                    {
                        int ix_targets_k = m_modelMetaData.Vocab.GetTargetWordIndex(outputSentences[k][i]);
                        float score_k = probs_k.GetWeightAt(ix_targets_k);
                        if (i < originalOutputLengths[k])
                        {
                            cost += (float)-Math.Log(score_k);
                        }

                        probs_k.SetWeightAt(score_k - 1, ix_targets_k);
                        ix_inputs[k] = ix_targets_k;
                    }
                }
                eOutput.CopyWeightsToGradients(probs);
            }
            else
            {
                // Output the i-th target word
                int[] targetIdx = g.Argmax(probs, 1);
                List<string> targetWords = m_modelMetaData.Vocab.ConvertTargetIdsToString(targetIdx.ToList());
                for (int j = 0; j < targetWords.Count; j++)
                {
                    if (setEndSentId.Contains(j) == false)
                    {
                        outputSentences[j].Add(targetWords[j]);
                        if (targetWords[j] == ParallelCorpus.EOS)
                        {
                            setEndSentId.Add(j);
                        }
                    }
                }

                ix_inputs = targetIdx;
            }
        }

        if (isTraining)
        {
            // Hacky: run backward for the last feed-forward layer and dropout layer here in order
            // to save memory, since they have no time-sequence dependency
            g.RunTopBackward();
            if (m_dropoutRatio > 0.0f)
            {
                g.RunTopBackward();
            }
        }
        else
        {
            if (setEndSentId.Count == batchSize)
            {
                // All target sentences in the current batch are finished, so we exit
                break;
            }
        }
    }

    return cost;
}
/// <summary>
/// Decode output sentences in training
/// </summary>
/// <param name="outputSentences"></param>
/// <param name="g"></param>
/// <param name="encodedOutputs"></param>
/// <param name="decoder"></param>
/// <param name="decoderFFLayer"></param>
/// <param name="Embedding"></param>
/// <param name="predictSentence"></param>
/// <returns></returns>
private float Decode(List<List<string>> outputSentences, IComputeGraph g, IWeightMatrix encodedOutputs, AttentionDecoder decoder, FeedForwardLayer decoderFFLayer, IWeightMatrix Embedding, out List<List<string>> predictSentence)
{
    predictSentence = null;

    float cost = 0.0f;
    var attPreProcessResult = decoder.PreProcess(encodedOutputs, g);

    var originalOutputLengths = PadSentences(outputSentences);
    int seqLen = outputSentences[0].Count;

    int[] ix_inputs = new int[m_batchSize];
    int[] ix_targets = new int[m_batchSize];
    for (int i = 0; i < ix_inputs.Length; i++)
    {
        ix_inputs[i] = (int)SENTTAGS.START;
    }

    for (int i = 0; i < seqLen + 1; i++)
    {
        // Get embeddings for all sentences in the batch at position i
        List<IWeightMatrix> inputs = new List<IWeightMatrix>();
        for (int j = 0; j < m_batchSize; j++)
        {
            List<string> OutputSentence = outputSentences[j];

            ix_targets[j] = (int)SENTTAGS.UNK;
            if (i >= seqLen)
            {
                ix_targets[j] = (int)SENTTAGS.END;
            }
            else
            {
                if (m_tgtWordToIndex.ContainsKey(OutputSentence[i]))
                {
                    ix_targets[j] = m_tgtWordToIndex[OutputSentence[i]];
                }
            }

            var x = g.PeekRow(Embedding, ix_inputs[j]);
            inputs.Add(x);
        }

        var inputsM = g.ConcatRows(inputs);

        // Decode output sentence at position i
        var eOutput = decoder.Decode(inputsM, attPreProcessResult, g);
        if (m_dropoutRatio > 0.0f)
        {
            eOutput = g.Dropout(eOutput, m_dropoutRatio);
        }

        var o = decoderFFLayer.Process(eOutput, g);

        // Softmax for output
        // var o = g.MulAdd(eOutput, Whd, bds);
        var probs = g.Softmax(o, false);
        o.ReleaseWeight();

        // Calculate loss for each word in the batch
        List<IWeightMatrix> probs_g = g.UnFolderRow(probs, m_batchSize, false);
        for (int k = 0; k < m_batchSize; k++)
        {
            var probs_k = probs_g[k];
            var score_k = probs_k.GetWeightAt(ix_targets[k]);

            if (i < originalOutputLengths[k] + 1)
            {
                cost += (float)-Math.Log(score_k);
            }

            probs_k.SetWeightAt(score_k - 1, ix_targets[k]);

            ix_inputs[k] = ix_targets[k];
            probs_k.Dispose();
        }

        o.SetGradientByWeight(probs);

        // Hacky: run backward for the last feed-forward layer and dropout layer here in order
        // to save memory, since they have no time-sequence dependency
        g.RunTopBackward();
        g.RunTopBackward();
        if (m_dropoutRatio > 0.0f)
        {
            g.RunTopBackward();
        }
    }

    return cost;
}
private void Reset(Encoder encoder, Encoder reversEncoder, AttentionDecoder decoder)
{
    encoder.Reset();
    reversEncoder.Reset();
    decoder.Reset();
}
private float DecodeOutput(string[] OutputSentence, IComputeGraph g, float cost, SparseWeightMatrix sparseInput, List<WeightMatrix> encoded, AttentionDecoder decoder, WeightMatrix Whd, WeightMatrix bd, WeightMatrix Embedding)
{
    int ix_input = (int)SENTTAGS.START;
    for (int i = 0; i < OutputSentence.Length + 1; i++)
    {
        int ix_target = (int)SENTTAGS.UNK;
        if (i == OutputSentence.Length)
        {
            ix_target = (int)SENTTAGS.END;
        }
        else
        {
            if (t_wordToIndex.ContainsKey(OutputSentence[i]))
            {
                ix_target = t_wordToIndex[OutputSentence[i]];
            }
        }

        var x = g.PeekRow(Embedding, ix_input);
        var eOutput = decoder.Decode(sparseInput, x, encoded, g);
        if (UseDropout)
        {
            eOutput = g.Dropout(eOutput, 0.2f);
        }

        var o = g.muladd(eOutput, Whd, bd);
        if (UseDropout)
        {
            o = g.Dropout(o, 0.2f);
        }

        var probs = g.SoftmaxWithCrossEntropy(o);
        cost += (float)-Math.Log(probs.Weight[ix_target]);

        o.Gradient = probs.Weight;
        o.Gradient[ix_target] -= 1;

        ix_input = ix_target;
    }

    return cost;
}
private void Reset(IWeightFactory weightFactory, Encoder encoder, Encoder reversEncoder, AttentionDecoder decoder)
{
    encoder.Reset(weightFactory);
    reversEncoder.Reset(weightFactory);
    decoder.Reset(weightFactory);
}
private float UpdateParameters(float learningRate, Encoder encoder, Encoder ReversEncoder, AttentionDecoder decoder, IWeightMatrix Whd, IWeightMatrix bd, IWeightMatrix s_Embedding, IWeightMatrix t_Embedding, int batchSize)
{
    var model = encoder.getParams();
    model.AddRange(decoder.getParams());
    model.AddRange(ReversEncoder.getParams());
    model.Add(s_Embedding);
    model.Add(t_Embedding);
    model.Add(Whd);
    model.Add(bd);

    return m_solver.UpdateWeights(model, batchSize, learningRate, m_regc, m_clipvalue, m_archType);
}