/// <summary>
/// Decodes the N most likely label sequences for the given sentence.
/// Supported only when the underlying model was trained with CRF.
/// </summary>
/// <param name="sent">Input sentence to tag.</param>
/// <param name="nbest">Number of candidate label sequences to return.</param>
/// <returns>An array of <paramref name="nbest"/> label-id sequences.</returns>
/// <exception cref="ArgumentException">Thrown when the model is not an RNN-CRF model.</exception>
public int[][] ProcessNBest(Sentence sent, int nbest)
{
    if (!m_Rnn.IsCRFTraining)
    {
        throw new ArgumentException("N-best result is only for RNN-CRF model.");
    }

    Sequence seq = m_Featurizer.ExtractFeatures(sent);
    return m_Rnn.DecodeNBestCRF(seq, nbest);
}
/// <summary>
/// Decodes the best label sequence for the given sentence, using CRF decoding
/// when the model was trained with CRF and plain NN decoding otherwise.
/// </summary>
/// <param name="sent">Input sentence to tag.</param>
/// <returns>The predicted label id for each token.</returns>
public int[] Process(Sentence sent)
{
    Sequence seq = m_Featurizer.ExtractFeatures(sent);
    return m_Rnn.IsCRFTraining ? m_Rnn.DecodeCRF(seq) : m_Rnn.DecodeNN(seq);
}
/// <summary>
/// Assigns gold labels to each state of this sequence. The label of a token is
/// taken from its last column and resolved through the tag set.
/// </summary>
/// <param name="sent">Sentence whose token list supplies the labels.</param>
/// <param name="tagSet">Tag set used to map tag names to ids.</param>
/// <exception cref="DataMisalignedException">
/// Thrown when the token count differs from the state count, or a tag name is unknown.
/// </exception>
public void SetLabel(Sentence sent, TagSet tagSet)
{
    List<string[]> tokensList = sent.TokensList;
    if (tokensList.Count != States.Length)
    {
        throw new DataMisalignedException(String.Format(
            "Error: Inconsistent token({0}) and state({1}) size. Tokens list: {2}",
            tokensList.Count, States.Length, sent.ToString()));
    }

    for (int idx = 0; idx < tokensList.Count; idx++)
    {
        //The gold tag is the last column of the token
        string[] token = tokensList[idx];
        string strTagName = token[token.Length - 1];

        int tagId = tagSet.GetIndex(strTagName);
        if (tagId < 0)
        {
            throw new DataMisalignedException(String.Format(
                "Error: tag {0} is unknown. Tokens list: {1}", strTagName, sent.ToString()));
        }

        States[idx].Label = tagId;
    }
}
/// <summary>
/// Decodes the label sequence for the sentence (CRF or plain NN, depending on the
/// model) and strips the predictions for the sentence-begin and sentence-end markers.
/// </summary>
/// <param name="sent">Input sentence to tag.</param>
/// <returns>Predicted label ids for the real tokens only.</returns>
public int[] Process(Sentence sent)
{
    Sequence seq = m_Featurizer.ExtractFeatures(sent);
    int[] predicted = m_Rnn.IsCRFModel() ? m_Rnn.DecodeCRF(seq) : m_Rnn.DecodeNN(seq);

    //Remove the beginning and end character from result
    int[] results = new int[predicted.Length - 2];
    Array.Copy(predicted, 1, results, 0, results.Length);
    return results;
}
/// <summary>
/// Assigns gold labels (the last feature column of each token) to the states of
/// this sequence.
/// </summary>
/// <param name="sent">Sentence whose feature set supplies the labels.</param>
/// <param name="tagSet">Tag set used to map tag names to ids.</param>
/// <returns>
/// false when the feature count and state count differ or a tag name is unknown;
/// true otherwise.
/// </returns>
public bool SetLabel(Sentence sent, TagSet tagSet)
{
    List<string[]> features = sent.GetFeatureSet();
    if (features.Count != m_States.Length)
    {
        return false;
    }

    for (int idx = 0; idx < features.Count; idx++)
    {
        //The gold tag is the last column of the token's feature row
        string[] cols = features[idx];
        string strTagName = cols[cols.Length - 1];

        int tagId = tagSet.GetIndex(strTagName);
        if (tagId < 0)
        {
            Console.WriteLine("Error: tag {0} is unknown.", strTagName);
            return false;
        }

        m_States[idx].SetLabel(tagId);
    }

    return true;
}
/// <summary>
/// Decodes the N most likely label sequences for the sentence, stripping the
/// predictions for the sentence-begin and sentence-end markers from each candidate.
/// </summary>
/// <param name="sent">Input sentence to tag.</param>
/// <param name="nbest">Number of candidate sequences to return.</param>
/// <returns>null when the model is not a CRF model; otherwise the candidate sequences.</returns>
public int[][] ProcessNBest(Sentence sent, int nbest)
{
    if (!m_Rnn.IsCRFModel())
    {
        return null;
    }

    Sequence seq = m_Featurizer.ExtractFeatures(sent);
    int[][] predicted = m_Rnn.DecodeNBestCRF(seq, nbest);

    //Remove the beginning and end character from result
    int[][] results = new int[nbest][];
    for (int k = 0; k < nbest; k++)
    {
        int len = predicted[k].Length - 2;
        results[k] = new int[len];
        Array.Copy(predicted[k], 1, results[k], 0, len);
    }

    return results;
}
/// <summary>
/// Decodes the test corpus with the trained model and writes the tagged result to
/// the output file. Validates all required options first; for n-best decoding each
/// candidate sequence is written as its own block.
/// </summary>
private static void Test()
{
    if (String.IsNullOrEmpty(strTagFile))
    {
        Logger.WriteLine(Logger.Level.err, "FAILED: The tag mapping file {0} isn't specified.", strTagFile);
        UsageTest();
        return;
    }

    //Load tag name
    TagSet tagSet = new TagSet(strTagFile);

    if (String.IsNullOrEmpty(strModelFile))
    {
        Logger.WriteLine(Logger.Level.err, "FAILED: The model file {0} isn't specified.", strModelFile);
        UsageTest();
        return;
    }

    if (String.IsNullOrEmpty(strFeatureConfigFile))
    {
        Logger.WriteLine(Logger.Level.err, "FAILED: The feature configuration file {0} isn't specified.", strFeatureConfigFile);
        UsageTest();
        return;
    }

    if (strOutputFile.Length == 0)
    {
        Logger.WriteLine(Logger.Level.err, "FAILED: The output file name should not be empty.");
        UsageTest();
        return;
    }

    //Create feature extractors and load word embedding data from file
    Featurizer featurizer = new Featurizer(strFeatureConfigFile, tagSet);
    featurizer.ShowFeatureSize();

    //Create instance for decoder
    RNNSharp.RNNDecoder decoder = new RNNSharp.RNNDecoder(strModelFile, featurizer);

    if (File.Exists(strTestFile) == false)
    {
        Logger.WriteLine(Logger.Level.err, "FAILED: The test corpus {0} isn't existed.", strTestFile);
        UsageTest();
        return;
    }

    //Fix: dispose the streams deterministically even if decoding throws
    using (StreamReader sr = new StreamReader(strTestFile))
    using (StreamWriter sw = new StreamWriter(strOutputFile))
    {
        while (true)
        {
            Sentence sent = new Sentence(ReadRecord(sr));
            if (sent.TokensList.Count <= 2)
            {
                //No more record, it only contains <s> and </s>
                break;
            }

            if (nBest == 1)
            {
                int[] output = decoder.Process(sent);

                //Output decoded result
                //Append the decoded result into the end of feature set of each token
                StringBuilder sb = new StringBuilder();
                for (int i = 0; i < sent.TokensList.Count; i++)
                {
                    string tokens = String.Join("\t", sent.TokensList[i]);
                    sb.Append(tokens);
                    sb.Append("\t");
                    sb.Append(tagSet.GetTagName(output[i]));
                    sb.AppendLine();
                }
                sw.WriteLine(sb.ToString());
            }
            else
            {
                int[][] output = decoder.ProcessNBest(sent, nBest);

                StringBuilder sb = new StringBuilder();
                for (int i = 0; i < nBest; i++)
                {
                    for (int j = 0; j < sent.TokensList.Count; j++)
                    {
                        //Bug fix: index the token list by j (the token index).
                        //Previously this used i, the n-best rank, so every line of a
                        //candidate repeated the same token text (and could go out of
                        //range when nBest exceeds the token count).
                        string tokens = String.Join("\t", sent.TokensList[j]);
                        sb.Append(tokens);
                        sb.Append("\t");
                        sb.Append(tagSet.GetTagName(output[i][j]));
                        sb.AppendLine();
                    }
                    sb.AppendLine();
                }
                sw.WriteLine(sb.ToString());
            }
        }
    }
}
/// <summary>
/// Reads the corpus file record by record, converts each sentence into a labeled
/// feature sequence and appends it to the data set.
/// </summary>
/// <param name="strFileName">Path of the corpus file.</param>
/// <param name="featurizer">Feature extractor used to build sequences.</param>
/// <param name="dataSet">Destination data set for the labeled sequences.</param>
static void LoadDataset(string strFileName, Featurizer featurizer, DataSet dataSet)
{
    CheckCorpus(strFileName);

    StreamReader sr = new StreamReader(strFileName);
    int recordCount = 0;

    while (true)
    {
        Sentence sent = new Sentence(ReadRecord(sr));
        if (sent.TokensList.Count <= 2)
        {
            //Only <s> and </s> remain: no more records
            break;
        }

        //Extract features, attach gold labels and store the sequence
        Sequence seq = featurizer.ExtractFeatures(sent);
        seq.SetLabel(sent, featurizer.TagSet);
        dataSet.SequenceList.Add(seq);

        //Report progress every 10,000 records
        recordCount++;
        if (recordCount % 10000 == 0)
        {
            Logger.WriteLine(Logger.Level.info, "{0}...", recordCount);
        }
    }

    sr.Close();
}
/// <summary>
/// Decodes the test corpus with the trained model and writes tagged output.
/// Validates that the tag file, model file, feature configuration, output file
/// and test corpus exist before decoding.
/// </summary>
private static void Test()
{
    if (File.Exists(strTagFile) == false)
    {
        Console.WriteLine("FAILED: The tag mapping file {0} isn't existed.", strTagFile);
        UsageTest();
        return;
    }

    //Load tag id and its name from file
    TagSet tagSet = new TagSet(strTagFile);

    if (File.Exists(strModelFile) == false)
    {
        Console.WriteLine("FAILED: The model file {0} isn't existed.", strModelFile);
        UsageTest();
        return;
    }

    if (File.Exists(strFeatureConfigFile) == false)
    {
        Console.WriteLine("FAILED: The feature configuration file {0} isn't existed.", strFeatureConfigFile);
        UsageTest();
        return;
    }

    if (strOutputFile.Length == 0)
    {
        Console.WriteLine("FAILED: The output file name should not be empty.");
        UsageTest();
        return;
    }

    //Create feature extractors and load word embedding data from file
    Featurizer featurizer = new Featurizer(strFeatureConfigFile, tagSet);
    featurizer.ShowFeatureSize();

    //Create instance for decoder
    RNNSharp.RNNDecoder decoder = new RNNSharp.RNNDecoder(strModelFile, featurizer);

    if (File.Exists(strTestFile) == false)
    {
        Console.WriteLine("FAILED: The test corpus {0} isn't existed.", strTestFile);
        UsageTest();
        return;
    }

    //Fix: the original closed sr/sw manually and leaked both streams on the
    //early return in the decode-failure path below; using-blocks dispose them
    //on every exit path.
    using (StreamReader sr = new StreamReader(strTestFile))
    using (StreamWriter sw = new StreamWriter(strOutputFile))
    {
        while (true)
        {
            List<string> tokenList = ReadRecord(sr);
            if (tokenList.Count == 0)
            {
                //No more record
                break;
            }

            Sentence sent = new Sentence();
            sent.SetFeatures(tokenList);

            if (nBest == 1)
            {
                int[] output = decoder.Process(sent);

                //Output decoded result
                //Append the decoded result into the end of feature set of each token
                StringBuilder sb = new StringBuilder();
                for (int i = 0; i < tokenList.Count; i++)
                {
                    sb.Append(tokenList[i]);
                    sb.Append("\t");
                    sb.Append(tagSet.GetTagName(output[i]));
                    sb.AppendLine();
                }
                sw.WriteLine(sb.ToString());
            }
            else
            {
                int[][] output = decoder.ProcessNBest(sent, nBest);
                if (output == null)
                {
                    Console.WriteLine("FAILED: decode failed. Dump current sentence...");
                    sent.DumpFeatures();
                    return;
                }

                StringBuilder sb = new StringBuilder();
                for (int i = 0; i < nBest; i++)
                {
                    for (int j = 0; j < tokenList.Count; j++)
                    {
                        sb.Append(tokenList[j]);
                        sb.Append("\t");
                        sb.Append(tagSet.GetTagName(output[i][j]));
                        sb.AppendLine();
                    }
                    sb.AppendLine();
                }
                sw.WriteLine(sb.ToString());
            }
        }
    }
}
/// <summary>
/// Loads the training corpus: reads records, converts each one into a labeled
/// feature sequence and adds it to the data set. Records whose labels cannot be
/// resolved are reported and skipped.
/// </summary>
/// <param name="strFileName">Path of the corpus file.</param>
/// <param name="featurizer">Feature extractor used to build sequences.</param>
/// <param name="dataSet">Destination data set for the labeled sequences.</param>
static void LoadDataset(string strFileName, Featurizer featurizer, DataSet dataSet)
{
    CheckCorpus(strFileName);

    StreamReader sr = new StreamReader(strFileName);
    int recordCount = 0;

    while (true)
    {
        List<string> tokenList = ReadRecord(sr);
        if (tokenList.Count == 0)
        {
            //No more record
            break;
        }

        //Extract features from it and convert it into sequence
        Sentence sent = new Sentence();
        sent.SetFeatures(tokenList);
        Sequence seq = featurizer.ExtractFeatures(sent);

        //Attach gold labels; skip records with size mismatch or unknown tags
        if (seq.SetLabel(sent, featurizer.GetTagSet()) == false)
        {
            Console.WriteLine("Error: Invalidated record.");
            sent.DumpFeatures();
            continue;
        }

        dataSet.Add(seq);

        //Report progress every 10,000 records
        recordCount++;
        if (recordCount % 10000 == 0)
        {
            Console.Write("{0}...", recordCount);
        }
    }

    Console.WriteLine();
    sr.Close();
}
/// <summary>
/// Builds a feature sequence for the sentence: one state per token, each carrying
/// the sparse and dense features extracted per the configuration and training corpus.
/// </summary>
/// <param name="sentence">Sentence to featurize.</param>
/// <returns>A sequence with one populated state per token.</returns>
public Sequence ExtractFeatures(Sentence sentence)
{
    int tokenCount = sentence.TokensList.Count;
    Sequence sequence = new Sequence(tokenCount);

    //Populate every state's sparse and dense feature sets
    for (int i = 0; i < tokenCount; i++)
    {
        State state = sequence.States[i];
        ExtractSparseFeature(i, tokenCount, sentence.TokensList, state);
        state.DenseData = ExtractDenseFeature(i, tokenCount, sentence.TokensList);
    }

    return sequence;
}
/// <summary>
/// Generates the feature set from the input corpus using the feature templates,
/// counts each feature's frequency, and returns the features whose frequency is
/// at least minfreq, sorted by ordinal feature name.
/// </summary>
/// <returns>Sorted map from feature string to its corpus frequency.</returns>
static IDictionary<string, int> ExtractFeatureSetFromFile()
{
    //Load templates from given file
    Logger.WriteLine("Loading feature template from {0}...", strTemplateFile);
    templateFeaturizer = new TemplateFeaturizer();
    templateFeaturizer.LoadTemplateFromFile(strTemplateFile);

    Logger.WriteLine("Generate feature set...");
    BigDictionary<string, int> feature2freq = new BigDictionary<string, int>();

    //Counts every feature generated from one raw record. The original duplicated
    //this loop body verbatim for the trailing record; sharing it removes the
    //copy-paste without changing behavior (the tail call stays unconditional).
    Action<List<string[]>> countRecordFeatures = tokens =>
    {
        Sentence sentence = new Sentence(tokens);
        for (int i = 0; i < sentence.TokensList.Count; i++)
        {
            //Get feature of i-th token
            List<string> featureList = templateFeaturizer.GenerateFeature(sentence.TokensList, i);
            foreach (string strFeature in featureList)
            {
                if (feature2freq.ContainsKey(strFeature) == false)
                {
                    feature2freq.Add(strFeature, 0);
                }
                feature2freq[strFeature]++;
            }
        }
    };

    List<string[]> tokenList = new List<string[]>();
    string strLine = null;

    using (StreamReader srCorpus = new StreamReader(strInputFile, Encoding.UTF8))
    {
        while ((strLine = srCorpus.ReadLine()) != null)
        {
            strLine = strLine.Trim();
            if (strLine.Length == 0)
            {
                //A blank line marks the end of current record
                countRecordFeatures(tokenList);
                tokenList.Clear();
            }
            else
            {
                tokenList.Add(strLine.Split('\t'));
            }
        }

        //Flush the last record (the file may not end with a blank line)
        countRecordFeatures(tokenList);
    }

    //Only save the feature whose frequency is not less than minfreq
    Logger.WriteLine("Filter out features whose frequency is less than {0}", minfreq);
    SortedDictionary<string, int> features = new SortedDictionary<string, int>(StringComparer.Ordinal);
    foreach (KeyValuePair<string, int> pair in feature2freq)
    {
        if (pair.Value >= minfreq)
        {
            features.Add(pair.Key, pair.Value);
        }
    }

    return features;
}
/// <summary>
/// Sequence-to-sequence decoding is not supported by this network type.
/// </summary>
/// <exception cref="NotImplementedException">Always thrown.</exception>
public override int[] TestSeq2Seq(Sentence srcSentence, Featurizer featurizer) { throw new NotImplementedException(); }
/// <summary>
/// Greedy sequence-to-sequence decoding: encodes the source sentence with the
/// auto-encoder, then generates target tags one at a time, feeding each predicted
/// tag back as the next input, until the end marker is produced or 100 tags have
/// been emitted.
/// </summary>
/// <param name="srcSentence">Source sentence to encode.</param>
/// <param name="featurizer">Supplies target-side features, the tag set and the auto-encoder.</param>
/// <returns>Predicted tag ids, beginning with the sentence-begin tag.</returns>
public override int[] TestSeq2Seq(Sentence srcSentence, Featurizer featurizer)
{
    //Seed decoding with the sentence-begin token
    State curState = featurizer.ExtractFeatures(new string[] { "<s>" });
    curState.Label = featurizer.TagSet.GetIndex("<s>");

    //Reset all layers
    foreach (SimpleLayer layer in HiddenLayerList)
    {
        layer.netReset(false);
    }

    //Extract features from source sentence
    Sequence srcSequence = featurizer.AutoEncoder.Featurizer.ExtractFeatures(srcSentence);
    double[] srcHiddenAvgOutput;
    Dictionary<int, float> srcSparseFeatures;
    ExtractSourceSentenceFeature(featurizer.AutoEncoder, srcSequence, curState.SparseFeature.Length, out srcHiddenAvgOutput, out srcSparseFeatures);

    int numLayers = HiddenLayerList.Count;
    List<int> predicted = new List<int>();
    predicted.Add(curState.Label);

    while (true)
    {
        //Build sparse features: current target state features plus the source features
        SparseVector sparseVector = new SparseVector();
        sparseVector.SetLength(curState.SparseFeature.Length + srcSequence.SparseFeatureSize);
        sparseVector.AddKeyValuePairData(curState.SparseFeature);
        sparseVector.AddKeyValuePairData(srcSparseFeatures);

        //Compute first layer; the source hidden average is appended to the dense input
        double[] denseFeatures = RNNHelper.ConcatenateVector(curState.DenseFeature, srcHiddenAvgOutput);
        HiddenLayerList[0].computeLayer(sparseVector, denseFeatures, false);

        //Compute middle layers
        for (int i = 1; i < numLayers; i++)
        {
            //We use previous layer's output as dense feature for current layer
            denseFeatures = RNNHelper.ConcatenateVector(HiddenLayerList[i - 1].cellOutput, srcHiddenAvgOutput);
            HiddenLayerList[i].computeLayer(sparseVector, denseFeatures, false);
        }

        //Compute output layer and normalize to a distribution over tags
        denseFeatures = RNNHelper.ConcatenateVector(HiddenLayerList[numLayers - 1].cellOutput, srcHiddenAvgOutput);
        OutputLayer.computeLayer(sparseVector, denseFeatures, false);
        OutputLayer.Softmax(false);

        //Greedy choice: take the single best tag and feed it back as next input
        int nextTagId = OutputLayer.GetBestOutputIndex(false);
        string nextWord = featurizer.TagSet.GetTagName(nextTagId);
        curState = featurizer.ExtractFeatures(new string[] { nextWord });
        curState.Label = nextTagId;
        predicted.Add(nextTagId);

        //Stop on the end-of-sentence marker, or cap generation at 100 tags
        if (nextWord == "</s>" || predicted.Count >= 100)
        {
            break;
        }
    }

    return (predicted.ToArray());
}
/// <summary>
/// Converts a sentence into a feature sequence: one state per token with its
/// sparse and dense features filled in per the configuration and training corpus.
/// </summary>
/// <param name="sentence">Sentence to featurize.</param>
/// <returns>A sequence with one populated state per token.</returns>
public Sequence ExtractFeatures(Sentence sentence)
{
    Sequence sequence = new Sequence();
    int tokenCount = sentence.GetTokenSize();
    List<string[]> features = sentence.GetFeatureSet();

    //Size the sequence, then populate each state's feature sets
    sequence.SetSize(tokenCount);
    for (int i = 0; i < tokenCount; i++)
    {
        State state = sequence.Get(i);
        ExtractSparseFeature(i, tokenCount, features, state);
        state.SetDenseData(ExtractDenseFeature(i, tokenCount, features));
    }

    return sequence;
}
/// <summary>
/// Greedy sequence-to-sequence decoding: encodes the source sentence with the
/// seq2seq auto-encoder, then generates target tags one at a time, feeding each
/// predicted tag back as the next input, until the end marker is produced or
/// 100 tags have been emitted.
/// </summary>
/// <param name="srcSentence">Source sentence to encode.</param>
/// <param name="featurizer">Configuration supplying target-side states, the tag set and the auto-encoder.</param>
/// <returns>Predicted tag ids, beginning with the sentence-begin tag.</returns>
public override int[] TestSeq2Seq(Sentence srcSentence, Config featurizer)
{
    //Seed decoding with the sentence-begin token
    var curState = featurizer.BuildState(new[] { "<s>" });
    curState.Label = featurizer.TagSet.GetIndex("<s>");

    //Reset all layers
    foreach (var layer in HiddenLayerList)
    {
        layer.Reset(false);
    }

    //Extract features from source sentence
    var srcSequence = featurizer.Seq2SeqAutoEncoder.Config.BuildSequence(srcSentence);
    float[] srcHiddenAvgOutput;
    Dictionary<int, float> srcSparseFeatures;
    ExtractSourceSentenceFeature(featurizer.Seq2SeqAutoEncoder, srcSequence, curState.SparseFeature.Length, out srcHiddenAvgOutput, out srcSparseFeatures);

    var numLayers = HiddenLayerList.Count;
    var predicted = new List<int> { curState.Label };

    while (true)
    {
        //Build sparse features: current target state features plus the source features
        var sparseVector = new SparseVector();
        sparseVector.SetLength(curState.SparseFeature.Length + srcSequence.SparseFeatureSize);
        sparseVector.AddKeyValuePairData(curState.SparseFeature);
        sparseVector.AddKeyValuePairData(srcSparseFeatures);

        //Compute first layer; the source hidden average is appended to the dense input
        var denseFeatures = RNNHelper.ConcatenateVector(curState.DenseFeature, srcHiddenAvgOutput);
        HiddenLayerList[0].ForwardPass(sparseVector, denseFeatures, false);

        //Compute middle layers
        for (var i = 1; i < numLayers; i++)
        {
            //We use previous layer's output as dense feature for current layer
            denseFeatures = RNNHelper.ConcatenateVector(HiddenLayerList[i - 1].Cell, srcHiddenAvgOutput);
            HiddenLayerList[i].ForwardPass(sparseVector, denseFeatures, false);
        }

        //Compute output layer and normalize to a distribution over tags
        denseFeatures = RNNHelper.ConcatenateVector(HiddenLayerList[numLayers - 1].Cell, srcHiddenAvgOutput);
        OutputLayer.ForwardPass(sparseVector, denseFeatures, false);
        OutputLayer.Softmax(false);

        //Greedy choice: take the single best tag and feed it back as next input
        var nextTagId = OutputLayer.GetBestOutputIndex(false);
        var nextWord = featurizer.TagSet.GetTagName(nextTagId);
        curState = featurizer.BuildState(new[] { nextWord });
        curState.Label = nextTagId;
        predicted.Add(nextTagId);

        //Stop on the end-of-sentence marker, or cap generation at 100 tags
        if (nextWord == "</s>" || predicted.Count >= 100)
        {
            break;
        }
    }

    return (predicted.ToArray());
}