Example #1
0
        /// <summary>
        /// Recomputes <c>SparseFeatureSize</c> from the configured feature contexts.
        /// </summary>
        /// <remarks>
        /// The size is reset to zero, then grows by the TFeature featurizer's feature size
        /// multiplied by the number of entries under <c>TFEATURE_CONTEXT</c> (only when a
        /// TFeature featurizer is set), and by the tag set size multiplied by the number of
        /// entries under <c>RT_FEATURE_CONTEXT</c>.
        /// </remarks>
        private void ComputingFeatureSize()
        {
            var contexts = featureContext;

            SparseFeatureSize = 0;

            // TFeature contexts only count when a TFeature featurizer is available.
            if (tFeaturizer != null && contexts.ContainsKey(TFEATURE_CONTEXT))
            {
                SparseFeatureSize += tFeaturizer.GetFeatureSize() * contexts[TFEATURE_CONTEXT].Count;
            }

            // Each RT feature context entry reserves one slot per tag.
            if (contexts.ContainsKey(RT_FEATURE_CONTEXT))
            {
                SparseFeatureSize += TagSet.GetSize() * contexts[RT_FEATURE_CONTEXT].Count;
            }
        }
Example #2
0
        /// <summary>
        /// Assigns a label id to every state, taking the tag name from the last column of the
        /// token at the same position.
        /// </summary>
        /// <param name="sent">Sentence whose <c>TokensList</c> provides one string array per state.</param>
        /// <param name="tagSet">Tag set used to translate a tag name into its index.</param>
        /// <exception cref="DataMisalignedException">
        /// Thrown when the number of tokens differs from the number of states, or when a tag name
        /// resolves to a negative index.
        /// </exception>
        public void SetLabel(Sentence sent, TagSet tagSet)
        {
            List<string[]> rows = sent.TokensList;

            if (rows.Count != States.Length)
            {
                string sizeError = String.Format("Error: Inconsistent token({0}) and state({1}) size. Tokens list: {2}",
                    rows.Count, States.Length, sent.ToString());
                throw new DataMisalignedException(sizeError);
            }

            for (int pos = 0; pos < rows.Count; pos++)
            {
                // The tag name is stored in the last column of each token row.
                string[] columns = rows[pos];
                string tagName = columns[columns.Length - 1];

                int labelId = tagSet.GetIndex(tagName);
                if (labelId < 0)
                {
                    string tagError = String.Format("Error: tag {0} is unknown. Tokens list: {1}",
                        tagName, sent.ToString());
                    throw new DataMisalignedException(tagError);
                }

                States[pos].Label = labelId;
            }
        }
Example #3
0
        /// <summary>
        /// Copies the tag of every token (its last column) into the label of the state at the
        /// same index.
        /// </summary>
        /// <param name="sent">Sentence whose <c>TokensList</c> provides one string array per state.</param>
        /// <param name="tagSet">Tag set used to translate a tag name into its index.</param>
        /// <exception cref="DataMisalignedException">
        /// Thrown when the token count and state count differ, or when a tag name resolves to a
        /// negative index.
        /// </exception>
        public void SetLabel(Sentence sent, TagSet tagSet)
        {
            List<string[]> tokenRows = sent.TokensList;

            if (tokenRows.Count != States.Length)
            {
                throw new DataMisalignedException(
                    String.Format("Error: Inconsistent token({0}) and state({1}) size. Tokens list: {2}",
                                  tokenRows.Count, States.Length, sent.ToString()));
            }

            for (int idx = 0; idx < tokenRows.Count; idx++)
            {
                string[] row = tokenRows[idx];
                string name = row[row.Length - 1];
                int id = tagSet.GetIndex(name);

                if (id >= 0)
                {
                    States[idx].Label = id;
                    continue;
                }

                // A negative index is treated as an unknown tag.
                throw new DataMisalignedException(
                    String.Format("Error: tag {0} is unknown. Tokens list: {1}",
                                  name, sent.ToString()));
            }
        }
Example #4
0
        /// <summary>
        /// Sets the label of every state from the tag held in the last column of the matching
        /// feature row.
        /// </summary>
        /// <param name="sent">Sentence whose feature set provides one string array per state.</param>
        /// <param name="tagSet">Tag set used to translate a tag name into its index.</param>
        /// <returns>
        /// <c>true</c> when every state received a label; <c>false</c> when the feature count does
        /// not match the state count or a tag name resolves to a negative index (the latter is also
        /// reported on the console).
        /// </returns>
        public bool SetLabel(Sentence sent, TagSet tagSet)
        {
            List<string[]> rows = sent.GetFeatureSet();

            // One feature row is required per state.
            if (rows.Count != m_States.Length)
            {
                return false;
            }

            for (int pos = 0; pos < rows.Count; pos++)
            {
                string[] columns = rows[pos];
                string tagName = columns[columns.Length - 1];
                int labelId = tagSet.GetIndex(tagName);

                if (labelId >= 0)
                {
                    m_States[pos].SetLabel(labelId);
                    continue;
                }

                Console.WriteLine("Error: tag {0} is unknown.", tagName);
                return false;
            }

            return true;
        }
Example #5
0
        /// <summary>
        /// Collects the sparse features of one state and stores them in the state's
        /// <c>SparseFeature</c> vector.
        /// </summary>
        /// <param name="currentState">Index of the state the features are extracted for.</param>
        /// <param name="numStates">Passed to <c>TruncPosition</c>, together with 0, as the bounds for each context position.</param>
        /// <param name="features">Token feature rows handed to the TFeature featurizer.</param>
        /// <param name="pState">State that receives the run time feature placeholders and the sparse vector.</param>
        /// <exception cref="Exception">Thrown when a run time feature offset is not negative.</exception>
        private void ExtractSparseFeature(int currentState, int numStates, List <string[]> features, State pState)
        {
            var contexts = featureContext;
            var values = new Dictionary<int, float>();
            var baseIndex = 0;

            // TFeatures: one block of feature ids per configured context offset.
            if (tFeaturizer != null && contexts.ContainsKey(TFEATURE_CONTEXT))
            {
                var offsets = contexts[TFEATURE_CONTEXT];
                for (var k = 0; k < offsets.Count; k++)
                {
                    var position = TruncPosition(currentState + offsets[k], 0, numStates);

                    foreach (var id in tFeaturizer.GetFeatureIds(features, position))
                    {
                        int key = baseIndex + id;

                        if (tFeatureWeightType == TFEATURE_WEIGHT_TYPE_ENUM.BINARY)
                        {
                            // Binary weighting: presence only.
                            values[key] = 1;
                            continue;
                        }

                        // Otherwise count how often the feature fires.
                        float seen;
                        values[key] = values.TryGetValue(key, out seen) ? seen + 1 : 1;
                    }

                    // Move on to the id range of the next context offset.
                    baseIndex += tFeaturizer.GetFeatureSize();
                }
            }

            // Run time features: only a placeholder is stored here.
            // The actual value is filled in at run time.
            if (contexts.ContainsKey(RT_FEATURE_CONTEXT))
            {
                var offsets = contexts[RT_FEATURE_CONTEXT];
                pState.RuntimeFeatures = new PriviousLabelFeature[offsets.Count];

                for (var k = 0; k < offsets.Count; k++)
                {
                    if (offsets[k] >= 0)
                    {
                        throw new Exception("The offset of run time feature should be negative.");
                    }

                    // Register the placeholder before inserting it, so values.Count is still
                    // the count prior to the insertion.
                    pState.AddRuntimeFeaturePlacehold(k, offsets[k], values.Count, baseIndex);
                    values[baseIndex] = 0;
                    baseIndex += TagSet.GetSize();
                }
            }

            var target = pState.SparseFeature;

            target.SetLength(SparseFeatureSize);
            target.AddKeyValuePairData(values);
        }
Example #6
0
 /// <summary>
 /// Creates a configuration from a feature configuration file and a tag set.
 /// </summary>
 /// <param name="strFeatureConfigFileName">Name of the feature configuration file handed to <c>LoadFeatureConfigFromFile</c>.</param>
 /// <param name="tagSet">Tag set stored in the <c>TagSet</c> member.</param>
 public Config(string strFeatureConfigFileName, TagSet tagSet)
 {
     // Load the feature configuration before anything else.
     LoadFeatureConfigFromFile(strFeatureConfigFileName);
     TagSet = tagSet;
     // NOTE(review): called only after the configuration is loaded and TagSet is assigned;
     // presumably the size computation reads both - confirm before reordering.
     ComputingFeatureSize();
 }
Example #7
0
        /// <summary>
        /// Runs the training workflow: validates the input files, builds the model settings and
        /// the featurizer, loads the training and validation corpora, and starts the encoder.
        /// </summary>
        /// <remarks>
        /// Every failed check logs an error, prints the training usage, and returns without training.
        /// </remarks>
        private static void Train()
        {
            Logger.LogFile = "RNNSharpConsole.log";

            if (!File.Exists(strTagFile))
            {
                Logger.WriteLine(Logger.Level.err, "FAILED: The tag mapping file {0} isn't existed.", strTagFile);
                UsageTrain();
                return;
            }

            // Tag ids and names come from the tag mapping file.
            var tagSet = new TagSet(strTagFile);

            // Gather the command line parameters into the model settings.
            var settings = new ModelSetting
            {
                ModelFile = strModelFile,
                NumHidden = layersize,
                IsCRFTraining = iCRF == 1,
                ModelDirection = iDir,
                ModelType = modelType,
                MaxIteration = maxIter,
                SaveStep = savestep,
                LearningRate = alpha,
                Dropout = dropout,
                Bptt = bptt
            };

            // Show the settings that will be used.
            settings.DumpSetting();

            if (!File.Exists(strFeatureConfigFile))
            {
                Logger.WriteLine(Logger.Level.err, "FAILED: The feature configuration file {0} doesn't exist.", strFeatureConfigFile);
                UsageTrain();
                return;
            }

            // Build the feature extractors described by the feature configuration file.
            var featurizer = new Featurizer(strFeatureConfigFile, tagSet);
            featurizer.ShowFeatureSize();

            // Run time features are rejected when the model direction is 1 (bi-directional).
            if (featurizer.IsRunTimeFeatureUsed() && iDir == 1)
            {
                Logger.WriteLine(Logger.Level.err, "FAILED: Run time feature is not available for bi-directional RNN model.");
                UsageTrain();
                return;
            }

            if (!File.Exists(strTrainFile))
            {
                Logger.WriteLine(Logger.Level.err, "FAILED: The training corpus doesn't exist.");
                UsageTrain();
                return;
            }

            if (!File.Exists(strValidFile))
            {
                Logger.WriteLine(Logger.Level.err, "FAILED: The validation corpus doesn't exist.");
                UsageTrain();
                return;
            }

            var encoder = new RNNEncoder(settings);

            // Training corpus.
            encoder.TrainingSet = new DataSet(tagSet.GetSize());
            LoadDataset(strTrainFile, featurizer, encoder.TrainingSet);

            // Validation corpus.
            encoder.ValidationSet = new DataSet(tagSet.GetSize());
            LoadDataset(strValidFile, featurizer, encoder.ValidationSet);

            if (iCRF == 1)
            {
                // CRF training needs the tag bigram transition matrix of the training set.
                Logger.WriteLine(Logger.Level.info, "Initialize output tag bigram transition probability...");
                encoder.TrainingSet.BuildLabelBigramTransition();
            }

            encoder.Train();
        }
Example #8
0
        /// <summary>
        /// Decodes the test corpus with a trained model and writes the tagged tokens to the
        /// output file.
        /// </summary>
        /// <remarks>
        /// Each output line is the token's original columns joined by tabs, followed by a tab and
        /// the predicted tag name. With <c>nBest</c> greater than 1, one such block is written per
        /// hypothesis, each followed by an empty line. Every failed argument check logs an error,
        /// prints the test usage, and returns.
        /// </remarks>
        private static void Test()
        {
            if (String.IsNullOrEmpty(strTagFile) == true)
            {
                Logger.WriteLine(Logger.Level.err, "FAILED: The tag mapping file {0} isn't specified.", strTagFile);
                UsageTest();
                return;
            }

            //Load tag name
            TagSet tagSet = new TagSet(strTagFile);

            if (String.IsNullOrEmpty(strModelFile) == true)
            {
                Logger.WriteLine(Logger.Level.err, "FAILED: The model file {0} isn't specified.", strModelFile);
                UsageTest();
                return;
            }

            if (String.IsNullOrEmpty(strFeatureConfigFile) == true)
            {
                Logger.WriteLine(Logger.Level.err, "FAILED: The feature configuration file {0} isn't specified.", strFeatureConfigFile);
                UsageTest();
                return;
            }

            // IsNullOrEmpty (instead of .Length == 0) so a missing argument is reported
            // rather than raising a NullReferenceException.
            if (String.IsNullOrEmpty(strOutputFile) == true)
            {
                Logger.WriteLine(Logger.Level.err, "FAILED: The output file name should not be empty.");
                UsageTest();
                return;
            }

            //Create feature extractors and load word embedding data from file
            Featurizer featurizer = new Featurizer(strFeatureConfigFile, tagSet);
            featurizer.ShowFeatureSize();

            //Create instance for decoder
            RNNSharp.RNNDecoder decoder = new RNNSharp.RNNDecoder(strModelFile, featurizer);

            if (File.Exists(strTestFile) == false)
            {
                Logger.WriteLine(Logger.Level.err, "FAILED: The test corpus {0} isn't existed.", strTestFile);
                UsageTest();
                return;
            }

            // using guarantees both streams are flushed and closed even when decoding throws.
            using (StreamReader sr = new StreamReader(strTestFile))
            using (StreamWriter sw = new StreamWriter(strOutputFile))
            {
                while (true)
                {
                    Sentence sent = new Sentence(ReadRecord(sr));
                    if (sent.TokensList.Count <= 2)
                    {
                        //No more record, it only contains <s> and </s>
                        break;
                    }

                    StringBuilder sb = new StringBuilder();

                    if (nBest == 1)
                    {
                        int[] output = decoder.Process(sent);

                        //Append the decoded tag to the end of the columns of each token
                        for (int i = 0; i < sent.TokensList.Count; i++)
                        {
                            sb.Append(String.Join("\t", sent.TokensList[i]));
                            sb.Append("\t");
                            sb.Append(tagSet.GetTagName(output[i]));
                            sb.AppendLine();
                        }
                    }
                    else
                    {
                        int[][] output = decoder.ProcessNBest(sent, nBest);

                        for (int i = 0; i < nBest; i++)
                        {
                            for (int j = 0; j < sent.TokensList.Count; j++)
                            {
                                // Tokens are indexed by the token position j; i only selects
                                // the hypothesis.
                                sb.Append(String.Join("\t", sent.TokensList[j]));
                                sb.Append("\t");
                                sb.Append(tagSet.GetTagName(output[i][j]));
                                sb.AppendLine();
                            }
                            sb.AppendLine();
                        }
                    }

                    sw.WriteLine(sb.ToString());
                }
            }
        }
Example #9
0
 /// <summary>
 /// Creates a featurizer from a feature configuration file and a tag set.
 /// </summary>
 /// <param name="strFeatureConfigFileName">Name of the feature configuration file handed to <c>LoadFeatureConfigFromFile</c>.</param>
 /// <param name="tagSet">Tag set stored in the <c>TagSet</c> member.</param>
 public Featurizer(string strFeatureConfigFileName, TagSet tagSet)
 {
     // Load the feature configuration before anything else.
     LoadFeatureConfigFromFile(strFeatureConfigFileName);
     TagSet = tagSet;
     // NOTE(review): called only after the configuration is loaded and TagSet is assigned;
     // presumably the component initialization depends on both - confirm before reordering.
     InitComponentFeaturizer();
 }
Example #10
0
        /// <summary>
        /// Runs the training workflow: validates the input files, builds the model settings and
        /// the featurizer, loads the training and validation corpora, and starts the encoder.
        /// </summary>
        /// <remarks>
        /// Every failed file check prints an error and the training usage to the console, then
        /// returns without training.
        /// </remarks>
        private static void Train()
        {
            if (!File.Exists(strTagFile))
            {
                Console.WriteLine("FAILED: The tag mapping file {0} isn't existed.", strTagFile);
                UsageTrain();
                return;
            }

            // Tag ids and names come from the tag mapping file.
            var tagSet = new TagSet(strTagFile);

            // Gather the command line parameters into the model settings.
            var settings = new ModelSetting();
            settings.SetModelFile(strModelFile);
            settings.SetNumHidden(layersize);
            settings.SetCRFTraining(iCRF == 1);
            settings.SetDir(iDir);
            settings.SetModelType(modelType);
            settings.SetMaxIteration(maxIter);
            settings.SetSaveStep(savestep);
            settings.SetLearningRate(alpha);
            settings.SetRegularization(beta);
            settings.SetBptt(bptt);

            // Show the settings that will be used.
            settings.DumpSetting();

            if (!File.Exists(strFeatureConfigFile))
            {
                Console.WriteLine("FAILED: The feature configuration file {0} isn't existed.", strFeatureConfigFile);
                UsageTrain();
                return;
            }

            // Build the feature extractors described by the feature configuration file.
            var featurizer = new Featurizer(strFeatureConfigFile, tagSet);
            featurizer.ShowFeatureSize();

            if (!File.Exists(strTrainFile))
            {
                Console.WriteLine("FAILED: The training corpus {0} isn't existed.", strTrainFile);
                UsageTrain();
                return;
            }

            // Training corpus.
            var trainingSet = new DataSet(tagSet.GetSize());
            LoadDataset(strTrainFile, featurizer, trainingSet);

            if (!File.Exists(strValidFile))
            {
                Console.WriteLine("FAILED: The validated corpus {0} isn't existed.", strValidFile);
                UsageTrain();
                return;
            }

            // Validation corpus.
            var validationSet = new DataSet(tagSet.GetSize());
            LoadDataset(strValidFile, featurizer, validationSet);

            // Hand both data sets to the encoder.
            var encoder = new RNNEncoder(settings);
            encoder.SetTrainingSet(trainingSet);
            encoder.SetValidationSet(validationSet);

            if (iCRF == 1)
            {
                // CRF training needs the tag bigram transition matrix of the training set.
                Console.WriteLine("Initialize output tag bigram transition probability...");
                trainingSet.BuildLabelBigramTransition();
                encoder.SetLabelBigramTransition(trainingSet.GetLabelBigramTransition());
            }

            encoder.Train();
        }
Example #11
0
        /// <summary>
        /// Decodes the test corpus with a trained model and writes the tagged tokens to the
        /// output file.
        /// </summary>
        /// <remarks>
        /// Each output line is the original token line followed by a tab and the predicted tag
        /// name. With <c>nBest</c> greater than 1, one such block is written per hypothesis, each
        /// followed by an empty line. Every failed check prints an error and the test usage to the
        /// console, then returns. If N-best decoding yields no result, the current sentence is
        /// dumped and processing stops; output written so far is still flushed.
        /// </remarks>
        private static void Test()
        {
            if (File.Exists(strTagFile) == false)
            {
                Console.WriteLine("FAILED: The tag mapping file {0} isn't existed.", strTagFile);
                UsageTest();
                return;
            }

            //Load tag id and its name from file
            TagSet tagSet = new TagSet(strTagFile);

            if (File.Exists(strModelFile) == false)
            {
                Console.WriteLine("FAILED: The model file {0} isn't existed.", strModelFile);
                UsageTest();
                return;
            }

            if (File.Exists(strFeatureConfigFile) == false)
            {
                Console.WriteLine("FAILED: The feature configuration file {0} isn't existed.", strFeatureConfigFile);
                UsageTest();
                return;
            }

            // IsNullOrEmpty (instead of .Length == 0) so a missing argument is reported
            // rather than raising a NullReferenceException.
            if (String.IsNullOrEmpty(strOutputFile))
            {
                Console.WriteLine("FAILED: The output file name should not be empty.");
                UsageTest();
                return;
            }

            //Create feature extractors and load word embedding data from file
            Featurizer featurizer = new Featurizer(strFeatureConfigFile, tagSet);
            featurizer.ShowFeatureSize();

            //Create instance for decoder
            RNNSharp.RNNDecoder decoder = new RNNSharp.RNNDecoder(strModelFile, featurizer);

            if (File.Exists(strTestFile) == false)
            {
                Console.WriteLine("FAILED: The test corpus {0} isn't existed.", strTestFile);
                UsageTest();
                return;
            }

            // using guarantees both streams are flushed and closed on every exit path,
            // including the early return on decode failure and any exception.
            using (StreamReader sr = new StreamReader(strTestFile))
            using (StreamWriter sw = new StreamWriter(strOutputFile))
            {
                while (true)
                {
                    List<string> tokenList = ReadRecord(sr);
                    if (tokenList.Count == 0)
                    {
                        //No more record
                        break;
                    }

                    Sentence sent = new Sentence();
                    sent.SetFeatures(tokenList);

                    StringBuilder sb = new StringBuilder();

                    if (nBest == 1)
                    {
                        int[] output = decoder.Process(sent);

                        //Append the decoded tag to the end of each token line
                        for (int i = 0; i < tokenList.Count; i++)
                        {
                            sb.Append(tokenList[i]);
                            sb.Append("\t");
                            sb.Append(tagSet.GetTagName(output[i]));
                            sb.AppendLine();
                        }
                    }
                    else
                    {
                        int[][] output = decoder.ProcessNBest(sent, nBest);
                        if (output == null)
                        {
                            Console.WriteLine("FAILED: decode failed. Dump current sentence...");
                            sent.DumpFeatures();
                            return;
                        }

                        for (int i = 0; i < nBest; i++)
                        {
                            for (int j = 0; j < tokenList.Count; j++)
                            {
                                sb.Append(tokenList[j]);
                                sb.Append("\t");
                                sb.Append(tagSet.GetTagName(output[i][j]));
                                sb.AppendLine();
                            }
                            sb.AppendLine();
                        }
                    }

                    sw.WriteLine(sb.ToString());
                }
            }
        }
Example #12
0
 /// <summary>
 /// Creates a featurizer from a feature configuration file and a tag set.
 /// </summary>
 /// <param name="strFeatureConfigFileName">Name of the feature configuration file handed to <c>LoadFeatureConfigFromFile</c>.</param>
 /// <param name="tagSet">Tag set stored in the <c>TagSet</c> member.</param>
 public Featurizer(string strFeatureConfigFileName, TagSet tagSet)
 {
     // Load the feature configuration before anything else.
     LoadFeatureConfigFromFile(strFeatureConfigFileName);
     TagSet = tagSet;
     // NOTE(review): called only after the configuration is loaded and TagSet is assigned;
     // presumably the component initialization depends on both - confirm before reordering.
     InitComponentFeaturizer();
 }