Example #1
        public int[][] ProcessNBest(Sentence sent, int nbest)
        {
            if (m_Rnn.IsCRFTraining == false)
            {
                throw new ArgumentException("N-best result is only for RNN-CRF model.");
            }

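            //Extract features from the sentence and decode the N best label sequences with the CRF layer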
            Sequence seq = m_Featurizer.ExtractFeatures(sent);
            int[][] predicted = m_Rnn.DecodeNBestCRF(seq, nbest);

            return predicted;
        }
Example #2
        public int[] Process(Sentence sent)
        {
            Sequence seq = m_Featurizer.ExtractFeatures(sent);
            int[] predicted;
            if (m_Rnn.IsCRFTraining == true)
            {
                predicted = m_Rnn.DecodeCRF(seq);
            }
            else
            {
                predicted = m_Rnn.DecodeNN(seq);
            }

            return predicted;
        }
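
The snippets in this listing target slightly different revisions of the RNNSharp API, but the calling pattern around Process is the same. The sketch below is not part of the original listing; it is a minimal wiring example that assumes the constructors and members shown in Examples #7 and #9 further down (TagSet(string), Featurizer(string, TagSet), RNNSharp.RNNDecoder(string, Featurizer), Sentence(List<string[]>), TokensList, TagSet.GetTagName). The file paths and sample token rows are placeholders.

        static void DecodeOneSentence()
        {
            //Load the tag mapping and feature configuration (placeholder paths)
            TagSet tagSet = new TagSet("tags.txt");
            Featurizer featurizer = new Featurizer("features.config", tagSet);

            //Create the decoder from a trained model file
            RNNSharp.RNNDecoder decoder = new RNNSharp.RNNDecoder("model.bin", featurizer);

            //One string[] of feature columns per token, as obtained by splitting a corpus line on '\t'
            List<string[]> tokenRows = new List<string[]>
            {
                new[] { "I", "PRP" },
                new[] { "like", "VBP" },
                new[] { "tea", "NN" }
            };

            Sentence sent = new Sentence(tokenRows);
            int[] labelIds = decoder.Process(sent);

            //Map each predicted label id back to its tag name
            for (int i = 0; i < sent.TokensList.Count; i++)
            {
                Console.WriteLine("{0}\t{1}", String.Join("\t", sent.TokensList[i]), tagSet.GetTagName(labelIds[i]));
            }
        }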
Example #3
        public void SetLabel(Sentence sent, TagSet tagSet)
        {
            List<string[]> tokensList = sent.TokensList;
            if (tokensList.Count != States.Length)
            {
                throw new DataMisalignedException(String.Format("Error: Inconsistent token({0}) and state({1}) size. Tokens list: {2}",
                    tokensList.Count, States.Length, sent.ToString()));
            }

            for (int i = 0; i < tokensList.Count; i++)
            {
                string strTagName = tokensList[i][tokensList[i].Length - 1];
                int tagId = tagSet.GetIndex(strTagName);
                if (tagId < 0)
                {
                    throw new DataMisalignedException(String.Format("Error: tag {0} is unknown. Tokens list: {1}", 
                        strTagName, sent.ToString()));
                }

                States[i].Label = tagId;
            }
        }
Example #4
        public int[] Process(Sentence sent)
        {
            Sequence seq = m_Featurizer.ExtractFeatures(sent);
            int[] predicted;
            if (m_Rnn.IsCRFModel() == true)
            {
                predicted = m_Rnn.DecodeCRF(seq);
            }
            else
            {
                predicted = m_Rnn.DecodeNN(seq);
            }

            //Remove the beginning and end tokens (<s>, </s>) from the result
            int[] results = new int[predicted.Length - 2];
            for (int i = 1; i < predicted.Length - 1; i++)
            {
                results[i - 1] = predicted[i];
            }

            return results;
        }
Example #5
        public bool SetLabel(Sentence sent, TagSet tagSet)
        {
            List<string[]> features = sent.GetFeatureSet();
            if (features.Count != m_States.Length)
            {
                return false;
            }

            for (int i = 0; i < features.Count; i++)
            {
                string strTagName = features[i][features[i].Length - 1];
                int tagId = tagSet.GetIndex(strTagName);
                if (tagId < 0)
                {
                    Console.WriteLine("Error: tag {0} is unknown.", strTagName);
                    return false;
                }

                m_States[i].SetLabel(tagId);
            }

            return true;
        }
Example #6
        public int[][] ProcessNBest(Sentence sent, int nbest)
        {
            if (m_Rnn.IsCRFModel() == false)
            {
                return null;
            }

            Sequence seq = m_Featurizer.ExtractFeatures(sent);
            int[][] predicted = m_Rnn.DecodeNBestCRF(seq, nbest);

            //Remove the beginning and end tokens (<s>, </s>) from the result
            int[][] results = new int[nbest][];

            for (int k = 0; k < nbest; k++)
            {
                results[k] = new int[predicted[k].Length - 2];
                for (int i = 1; i < predicted[k].Length - 1; i++)
                {
                    results[k][i - 1] = predicted[k][i];
                }
            }
            return results;
        }
Example #7
        private static void Test()
        {
            if (String.IsNullOrEmpty(strTagFile) == true)
            {
                Logger.WriteLine(Logger.Level.err, "FAILED: The tag mapping file {0} isn't specified.", strTagFile);
                UsageTest();
                return;
            }

            //Load tag name
            TagSet tagSet = new TagSet(strTagFile);

            if (String.IsNullOrEmpty(strModelFile) == true)
            {
                Logger.WriteLine(Logger.Level.err, "FAILED: The model file {0} isn't specified.", strModelFile);
                UsageTest();
                return;
            }

            if (String.IsNullOrEmpty(strFeatureConfigFile) == true)
            {
                Logger.WriteLine(Logger.Level.err, "FAILED: The feature configuration file {0} isn't specified.", strFeatureConfigFile);
                UsageTest();
                return;
            }

            if (strOutputFile.Length == 0)
            {
                Logger.WriteLine(Logger.Level.err, "FAILED: The output file name should not be empty.");
                UsageTest();
                return;
            }

            //Create feature extractors and load word embedding data from file
            Featurizer featurizer = new Featurizer(strFeatureConfigFile, tagSet);
            featurizer.ShowFeatureSize();

            //Create instance for decoder
            RNNSharp.RNNDecoder decoder = new RNNSharp.RNNDecoder(strModelFile, featurizer);


            if (File.Exists(strTestFile) == false)
            {
                Logger.WriteLine(Logger.Level.err, "FAILED: The test corpus {0} isn't existed.", strTestFile);
                UsageTest();
                return;
            }

            StreamReader sr = new StreamReader(strTestFile);
            StreamWriter sw = new StreamWriter(strOutputFile);

            while (true)
            {
                Sentence sent = new Sentence(ReadRecord(sr));
                if (sent.TokensList.Count <= 2)
                {
                    //No more records; the sentence only contains <s> and </s>
                    break;
                }

                if (nBest == 1)
                {
                    int[] output = decoder.Process(sent);
                    //Output the decoded result
                    //Append the decoded result to the end of each token's feature set
                    StringBuilder sb = new StringBuilder();
                    for (int i = 0; i < sent.TokensList.Count; i++)
                    {
                        string tokens = String.Join("\t", sent.TokensList[i]);
                        sb.Append(tokens);
                        sb.Append("\t");
                        sb.Append(tagSet.GetTagName(output[i]));
                        sb.AppendLine();
                    }

                    sw.WriteLine(sb.ToString());
                }
                else
                {
                    int[][] output = decoder.ProcessNBest(sent, nBest);
                    StringBuilder sb = new StringBuilder();
                    for (int i = 0; i < nBest; i++)
                    {
                        for (int j = 0; j < sent.TokensList.Count; j++)
                        {
                            string tokens = String.Join("\t", sent.TokensList[j]);
                            sb.Append(tokens);
                            sb.Append("\t");
                            sb.Append(tagSet.GetTagName(output[i][j]));
                            sb.AppendLine();
                        }
                        sb.AppendLine();
                    }

                    sw.WriteLine(sb.ToString());
                }
            }

            sr.Close();
            sw.Close();
        }
Example #8
        static void LoadDataset(string strFileName, Featurizer featurizer, DataSet dataSet)
        {
            CheckCorpus(strFileName);

            StreamReader sr = new StreamReader(strFileName);
            int RecordCount = 0;

            while (true)
            {
                //Extract features from it and convert it into sequence
                Sentence sent = new Sentence(ReadRecord(sr));
                if (sent.TokensList.Count <= 2)
                {
                    //No more records; the sentence only contains <s> and </s>
                    break;
                }

                Sequence seq = featurizer.ExtractFeatures(sent);

                //Set label for the sequence
                seq.SetLabel(sent, featurizer.TagSet);

                //Add the sequence into data set
                dataSet.SequenceList.Add(seq);

                //Show progress every 10,000 records
                RecordCount++;
                if (RecordCount % 10000 == 0)
                {
                    Logger.WriteLine(Logger.Level.info, "{0}...", RecordCount);
                }
            }

            sr.Close();

        }
Example #9
        private static void Test()
        {
            if (File.Exists(strTagFile) == false)
            {
                Console.WriteLine("FAILED: The tag mapping file {0} isn't existed.", strTagFile);
                UsageTest();
                return;
            }

            //Load tag id and its name from file
            TagSet tagSet = new TagSet(strTagFile);

            if (File.Exists(strModelFile) == false)
            {
                Console.WriteLine("FAILED: The model file {0} isn't existed.", strModelFile);
                UsageTest();
                return;
            }

            if (File.Exists(strFeatureConfigFile) == false)
            {
                Console.WriteLine("FAILED: The feature configuration file {0} isn't existed.", strFeatureConfigFile);
                UsageTest();
                return;
            }

            if (strOutputFile.Length == 0)
            {
                Console.WriteLine("FAILED: The output file name should not be empty.");
                UsageTest();
                return;
            }

            //Create feature extractors and load word embedding data from file
            Featurizer featurizer = new Featurizer(strFeatureConfigFile, tagSet);
            featurizer.ShowFeatureSize();

            //Create an instance for the model
            // Model model = new Model(strModelFile);

            //Create instance for decoder
            RNNSharp.RNNDecoder decoder = new RNNSharp.RNNDecoder(strModelFile, featurizer);

            if (File.Exists(strTestFile) == false)
            {
                Console.WriteLine("FAILED: The test corpus {0} isn't existed.", strTestFile);
                UsageTest();
                return;
            }

            StreamReader sr = new StreamReader(strTestFile);
            StreamWriter sw = new StreamWriter(strOutputFile);

            while (true)
            {
                List<string> tokenList = ReadRecord(sr);
                if (tokenList.Count == 0)
                {
                    //No more records
                    break;
                }

                Sentence sent = new Sentence();
                sent.SetFeatures(tokenList);

                if (nBest == 1)
                {
                    int[] output = decoder.Process(sent);
                    //Output the decoded result
                    //Append the decoded result to the end of each token's feature set
                    StringBuilder sb = new StringBuilder();
                    for (int i = 0; i < tokenList.Count; i++)
                    {
                        sb.Append(tokenList[i]);
                        sb.Append("\t");
                        sb.Append(tagSet.GetTagName(output[i]));
                        sb.AppendLine();
                    }

                    sw.WriteLine(sb.ToString());
                }
                else
                {
                    int[][] output = decoder.ProcessNBest(sent, nBest);
                    if (output == null)
                    {
                        Console.WriteLine("FAILED: decode failed. Dump current sentence...");
                        sent.DumpFeatures();
                        return;
                    }

                    StringBuilder sb = new StringBuilder();
                    for (int i = 0; i < nBest; i++)
                    {
                        for (int j = 0; j < tokenList.Count; j++)
                        {
                            sb.Append(tokenList[j]);
                            sb.Append("\t");
                            sb.Append(tagSet.GetTagName(output[i][j]));
                            sb.AppendLine();
                        }
                        sb.AppendLine();
                    }

                    sw.WriteLine(sb.ToString());
                }
            }

            sr.Close();
            sw.Close();
        }
Example #10
        static void LoadDataset(string strFileName, Featurizer featurizer, DataSet dataSet)
        {
            CheckCorpus(strFileName);

            StreamReader sr = new StreamReader(strFileName);
            int RecordCount = 0;

            while (true)
            {
                List<string> tokenList = ReadRecord(sr);
                if (tokenList.Count == 0)
                {
                    //No more records
                    break;
                }

                //Extract features from it and convert it into sequence
                Sentence sent = new Sentence();
                sent.SetFeatures(tokenList);
                Sequence seq = featurizer.ExtractFeatures(sent);

                //Set label for the sequence
                if (seq.SetLabel(sent, featurizer.GetTagSet()) == false)
                {
                    Console.WriteLine("Error: Invalidated record.");
                    sent.DumpFeatures();
                    continue;
                }

                //Add the sequence into data set
                dataSet.Add(seq);

                //Show progress every 10,000 records
                RecordCount++;
                if (RecordCount % 10000 == 0)
                {
                    Console.Write("{0}...", RecordCount);
                }
            }

            Console.WriteLine();

            sr.Close();
        }
Example #11
        public Sequence ExtractFeatures(Sentence sentence)
        {
            int n = sentence.TokensList.Count;
            Sequence sequence = new Sequence(n);

            //For each token, get its sparse and dense feature sets according to the configuration and training corpus
            for (int i = 0; i < n; i++)
            {
                State state = sequence.States[i];
                ExtractSparseFeature(i, n, sentence.TokensList, state);

                state.DenseData = ExtractDenseFeature(i, n, sentence.TokensList);
            }

            return sequence;
        }
Example #12
        static IDictionary<string, int> ExtractFeatureSetFromFile()
        {
            //Load templates from given file
            Logger.WriteLine("Loading feature template from {0}...", strTemplateFile);
            templateFeaturizer = new TemplateFeaturizer();
            templateFeaturizer.LoadTemplateFromFile(strTemplateFile);

            Logger.WriteLine("Generate feature set...");
            BigDictionary<string, int> feature2freq = new BigDictionary<string, int>();

            List<string[]> tokenList = new List<string[]>();
            string strLine = null;
            Sentence sentence = null;

            using (StreamReader srCorpus = new StreamReader(strInputFile, Encoding.UTF8))
            {
                while ((strLine = srCorpus.ReadLine()) != null)
                {
                    strLine = strLine.Trim();
                    if (strLine.Length == 0)
                    {
                        //The end of current record
                        sentence = new Sentence(tokenList);
                        for (int i = 0; i < sentence.TokensList.Count; i++)
                        {
                            //Get feature of i-th token
                            List<string> featureList = templateFeaturizer.GenerateFeature(sentence.TokensList, i);
                            foreach (string strFeature in featureList)
                            {
                                if (feature2freq.ContainsKey(strFeature) == false)
                                {
                                    feature2freq.Add(strFeature, 0);
                                }
                                feature2freq[strFeature]++;
                            }
                        }

                        tokenList.Clear();
                    }
                    else
                    {
                        tokenList.Add(strLine.Split('\t'));
                    }
                }

                //Handle the final record, in case the corpus doesn't end with a blank line
                sentence = new Sentence(tokenList);
                for (int i = 0; i < sentence.TokensList.Count; i++)
                {
                    //Get feature of i-th token
                    List<string> featureList = templateFeaturizer.GenerateFeature(sentence.TokensList, i);
                    foreach (string strFeature in featureList)
                    {
                        if (feature2freq.ContainsKey(strFeature) == false)
                        {
                            feature2freq.Add(strFeature, 0);
                        }
                        feature2freq[strFeature]++;
                    }
                }
            }

            //Only keep features whose frequency is at least minfreq
            Logger.WriteLine("Filter out features whose frequency is less than {0}", minfreq);
            SortedDictionary<string, int> features = new SortedDictionary<string, int>(StringComparer.Ordinal);
            foreach (KeyValuePair<string, int> pair in feature2freq)
            {
                if (pair.Value >= minfreq)
                {
                    features.Add(pair.Key, pair.Value);
                }
            }

            return features;
        }
Example #13
 public override int[] TestSeq2Seq(Sentence srcSentence, Featurizer featurizer)
 {
     throw new NotImplementedException();
 }
Example #14
        public override int[] TestSeq2Seq(Sentence srcSentence, Featurizer featurizer)
        {
            State curState = featurizer.ExtractFeatures(new string[] { "<s>" });

            curState.Label = featurizer.TagSet.GetIndex("<s>");

            //Reset all layers
            foreach (SimpleLayer layer in HiddenLayerList)
            {
                layer.netReset(false);
            }

            //Extract features from source sentence
            Sequence srcSequence = featurizer.AutoEncoder.Featurizer.ExtractFeatures(srcSentence);

            double[] srcHiddenAvgOutput;
            Dictionary <int, float> srcSparseFeatures;

            ExtractSourceSentenceFeature(featurizer.AutoEncoder, srcSequence, curState.SparseFeature.Length, out srcHiddenAvgOutput, out srcSparseFeatures);

            int        numLayers = HiddenLayerList.Count;
            List <int> predicted = new List <int>();

            predicted.Add(curState.Label);
            while (true)
            {
                //Build sparse features
                SparseVector sparseVector = new SparseVector();
                sparseVector.SetLength(curState.SparseFeature.Length + srcSequence.SparseFeatureSize);
                sparseVector.AddKeyValuePairData(curState.SparseFeature);
                sparseVector.AddKeyValuePairData(srcSparseFeatures);

                //Compute first layer
                double[] denseFeatures = RNNHelper.ConcatenateVector(curState.DenseFeature, srcHiddenAvgOutput);
                HiddenLayerList[0].computeLayer(sparseVector, denseFeatures, false);

                //Compute middle layers
                for (int i = 1; i < numLayers; i++)
                {
                    //Use the previous layer's output as the dense feature for the current layer
                    denseFeatures = RNNHelper.ConcatenateVector(HiddenLayerList[i - 1].cellOutput, srcHiddenAvgOutput);
                    HiddenLayerList[i].computeLayer(sparseVector, denseFeatures, false);
                }

                //Compute output layer
                denseFeatures = RNNHelper.ConcatenateVector(HiddenLayerList[numLayers - 1].cellOutput, srcHiddenAvgOutput);
                OutputLayer.computeLayer(sparseVector, denseFeatures, false);

                OutputLayer.Softmax(false);

                int    nextTagId = OutputLayer.GetBestOutputIndex(false);
                string nextWord  = featurizer.TagSet.GetTagName(nextTagId);

                curState       = featurizer.ExtractFeatures(new string[] { nextWord });
                curState.Label = nextTagId;

                predicted.Add(nextTagId);

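                //Stop once the end-of-sentence tag is generated, or after at most 100 predicted tokens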
                if (nextWord == "</s>" || predicted.Count >= 100)
                {
                    break;
                }
            }

            return(predicted.ToArray());
        }
Example #15
        public Sequence ExtractFeatures(Sentence sentence)
        {
            Sequence sequence = new Sequence();
            int n = sentence.GetTokenSize();
            List<string[]> features = sentence.GetFeatureSet();

            //For each token, get its sparse and dense feature sets according to the configuration and training corpus
            sequence.SetSize(n);
            for (int i = 0; i < n; i++)
            {
                State state = sequence.Get(i);
                ExtractSparseFeature(i, n, features, state);

                var spDenseFeature = ExtractDenseFeature(i, n, features);
                state.SetDenseData(spDenseFeature);
            }

            return sequence;
        }
Example #16
        public override int[] TestSeq2Seq(Sentence srcSentence, Config featurizer)
        {
            var curState = featurizer.BuildState(new[] { "<s>" });

            curState.Label = featurizer.TagSet.GetIndex("<s>");

            //Reset all layers
            foreach (var layer in HiddenLayerList)
            {
                layer.Reset(false);
            }

            //Extract features from source sentence
            var srcSequence = featurizer.Seq2SeqAutoEncoder.Config.BuildSequence(srcSentence);

            float[] srcHiddenAvgOutput;
            Dictionary <int, float> srcSparseFeatures;

            ExtractSourceSentenceFeature(featurizer.Seq2SeqAutoEncoder, srcSequence, curState.SparseFeature.Length,
                                         out srcHiddenAvgOutput, out srcSparseFeatures);

            var numLayers = HiddenLayerList.Count;
            var predicted = new List <int> {
                curState.Label
            };

            while (true)
            {
                //Build sparse features
                var sparseVector = new SparseVector();
                sparseVector.SetLength(curState.SparseFeature.Length + srcSequence.SparseFeatureSize);
                sparseVector.AddKeyValuePairData(curState.SparseFeature);
                sparseVector.AddKeyValuePairData(srcSparseFeatures);

                //Compute first layer
                var denseFeatures = RNNHelper.ConcatenateVector(curState.DenseFeature, srcHiddenAvgOutput);
                HiddenLayerList[0].ForwardPass(sparseVector, denseFeatures, false);

                //Compute middle layers
                for (var i = 1; i < numLayers; i++)
                {
                    //Use the previous layer's output as the dense feature for the current layer
                    denseFeatures = RNNHelper.ConcatenateVector(HiddenLayerList[i - 1].Cell, srcHiddenAvgOutput);
                    HiddenLayerList[i].ForwardPass(sparseVector, denseFeatures, false);
                }

                //Compute output layer
                denseFeatures = RNNHelper.ConcatenateVector(HiddenLayerList[numLayers - 1].Cell,
                                                            srcHiddenAvgOutput);
                OutputLayer.ForwardPass(sparseVector, denseFeatures, false);

                OutputLayer.Softmax(false);

                var nextTagId = OutputLayer.GetBestOutputIndex(false);
                var nextWord  = featurizer.TagSet.GetTagName(nextTagId);

                curState       = featurizer.BuildState(new[] { nextWord });
                curState.Label = nextTagId;

                predicted.Add(nextTagId);

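                //Stop once the end-of-sentence tag is generated, or after at most 100 predicted tokens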
                if (nextWord == "</s>" || predicted.Count >= 100)
                {
                    break;
                }
            }

            return(predicted.ToArray());
        }