Ejemplo n.º 1
0
        /// <summary>
        /// Reads every record of a corpus file, converts each record into a labeled
        /// feature sequence and appends it to <paramref name="dataSet"/>.
        /// </summary>
        /// <param name="strFileName">Path of the corpus file to read.</param>
        /// <param name="featurizer">Extracts the features and supplies the tag set used for labeling.</param>
        /// <param name="dataSet">Data set whose <c>SequenceList</c> receives the extracted sequences.</param>
        static void LoadDataset(string strFileName, Featurizer featurizer, DataSet dataSet)
        {
            CheckCorpus(strFileName);

            // The using block releases the file handle even when reading,
            // feature extraction or labeling throws part-way through the corpus.
            using (StreamReader sr = new StreamReader(strFileName))
            {
                int RecordCount = 0;

                while (true)
                {
                    //Extract features from it and convert it into sequence
                    Sentence sent = new Sentence(ReadRecord(sr));
                    if (sent.TokensList.Count <= 2)
                    {
                        //No more record, it only contain <s> and </s>
                        break;
                    }

                    Sequence seq = featurizer.ExtractFeatures(sent);

                    //Set label for the sequence
                    seq.SetLabel(sent, featurizer.TagSet);

                    //Add the sequence into data set
                    dataSet.SequenceList.Add(seq);

                    //Show state at every 10000 records
                    RecordCount++;
                    if (RecordCount % 10000 == 0)
                    {
                        Logger.WriteLine("{0}...", RecordCount);
                    }
                }
            }
        }
Ejemplo n.º 2
0
 /// <summary>
 /// Creates a model from an already-built regression together with the
 /// featurizer, feature space and target set that belong to it.
 /// </summary>
 /// <param name="regression">Regression stored in the <c>regression</c> field.</param>
 /// <param name="featurizer">Value assigned to <c>Featurizer</c>.</param>
 /// <param name="featureSpace">Value assigned to <c>FeatureSpace</c>.</param>
 /// <param name="targets">Value assigned to <c>Targets</c>.</param>
 public Model(MultinomialLogisticRegression regression, Featurizer featurizer, FeatureSpace featureSpace, HashSet<Target> targets)
 {
     // Only the field needs the explicit qualifier: its name is shadowed by the parameter.
     this.regression = regression;

     FeatureSpace = featureSpace;
     Featurizer = featurizer;
     Targets = targets;
 }
Ejemplo n.º 3
0
        /// <summary>
        /// Builds the feature space from <c>Entities()</c>, learns a model on the same
        /// entities, saves it to <c>Data\model</c> and returns it. Progress is logged
        /// at each step.
        /// </summary>
        /// <returns>The model returned by the learner.</returns>
        static Model preprocess()
        {
            Logger.Log("Begin training");

            // Not referenced below; kept because its construction may have effects
            // that are not visible from this method.
            var reader = new Reader();

            var featurizer = new Featurizer
            {
                Blacklist = new Blacklist(@"Data\Features\Blacklist.txt")
            };

            var targetSet = new HashSet<Target>();

            var space = featurizer.CreateFeatureSpace(Entities());
            Logger.Log("Feature space created, {0} features", space.Size);

            // Per-type breakdown of the feature counts.
            Logger.Log("# of features by type:");
            foreach (var entry in space.featureTypeCount)
            {
                Logger.Log("{0}: {1}", entry.Key, entry.Value);
            }

            Logger.Log("Operating with {0} entities", space.NumEntities);

            var trained = new Learner(featurizer).Learn(Entities(), space.NumEntities, space, targetSet);
            Logger.Log("Model learned");

            trained.Save(@"Data\model");
            Logger.Log("Model serialized to file");

            return trained;
        }
Ejemplo n.º 4
0
        /// <summary>
        /// Reads source/target sentence pairs from a corpus file, converts each pair
        /// into a feature sequence pair and appends the pairs whose target sequence
        /// could be labeled to <paramref name="dataSet"/>.
        /// </summary>
        /// <param name="strFileName">Path of the corpus file; records are read two at a time (source, then target).</param>
        /// <param name="featurizer">Extracts the features and supplies the tag set used for labeling.</param>
        /// <param name="dataSet">Data set whose <c>SequenceList</c> receives the sequence pairs.</param>
        static void LoadSeq2SeqDataSet(string strFileName, Featurizer featurizer, DataSet <SequencePair> dataSet)
        {
            Logger.WriteLine("Loading data set for seq2seq2 training...");

            // The using block releases the file handle even when reading,
            // feature extraction or labeling throws part-way through the corpus.
            using (StreamReader sr = new StreamReader(strFileName))
            {
                int RecordCount = 0;

                while (true)
                {
                    SentencePair sentPair = new SentencePair();

                    //Extract features from it and convert it into sequence
                    sentPair.srcSentence = new Sentence(ReadRecord(sr));
                    sentPair.tgtSentence = new Sentence(ReadRecord(sr), false);

                    if (sentPair.srcSentence.TokensList.Count <= 2 || sentPair.tgtSentence.TokensList.Count <= 0)
                    {
                        //No more record: the source only contains <s> and </s>, or the target is empty
                        break;
                    }

                    SequencePair seq = featurizer.ExtractFeatures(sentPair);

                    //Only pairs whose target sequence was labeled successfully are kept and counted
                    if (seq.tgtSequence.SetLabel(sentPair.tgtSentence, featurizer.TagSet))
                    {
                        dataSet.SequenceList.Add(seq);

                        //Show state at every 10000 records
                        RecordCount++;
                        if (RecordCount % 10000 == 0)
                        {
                            Logger.WriteLine("{0}...", RecordCount);
                        }
                    }
                }
            }
        }
Ejemplo n.º 5
0
        /// <summary>
        /// Runs the training command: validates the input files, builds the model
        /// settings and featurizer, loads the training (and optional validation)
        /// corpus, and starts the encoder. On any validation failure an error is
        /// logged, the usage text is printed and the method returns.
        /// </summary>
        private static void Train()
        {
            Logger.LogFile = "RNNSharpConsole.log";

            if (!File.Exists(strTagFile))
            {
                Logger.WriteLine(Logger.Level.err, "FAILED: The tag mapping file {0} isn't existed.", strTagFile);
                UsageTrain();
                return;
            }

            // Tag id <-> name mapping loaded from the tag file.
            TagSet tags = new TagSet(strTagFile);

            // Collect all training parameters in one settings object.
            ModelSetting settings = new ModelSetting
            {
                TagFile             = strTagFile,
                Tags                = tags,
                ModelFile           = strModelFile,
                HiddenLayerSizeList = hiddenLayerSizeList,
                IsCRFTraining       = iCRF == 1,
                ModelDirection      = iDir,
                VQ                  = iVQ,
                ModelType           = ParseLayerType(hiddenLayerType),
                OutputLayerType     = ParseLayerType(outputLayerType),
                MaxIteration        = maxIter,
                SaveStep            = savestep,
                LearningRate        = alpha,
                Dropout             = dropout,
                Bptt                = bptt,
                GradientCutoff      = gradientCutoff,
                NCESampleSize       = nceSampleSize
            };

            // Print the effective settings before anything else is loaded.
            settings.DumpSetting();

            if (!File.Exists(strFeatureConfigFile))
            {
                Logger.WriteLine(Logger.Level.err, "FAILED: The feature configuration file {0} doesn't exist.", strFeatureConfigFile);
                UsageTrain();
                return;
            }

            // Feature extractors are built from the feature configuration file.
            Featurizer featurizer = new Featurizer(strFeatureConfigFile, tags);
            featurizer.ShowFeatureSize();

            // Run time features are rejected when the model direction is 1 (bi-directional).
            if (featurizer.IsRunTimeFeatureUsed() && iDir == 1)
            {
                Logger.WriteLine(Logger.Level.err, "FAILED: Run time feature is not available for bi-directional RNN model.");
                UsageTrain();
                return;
            }

            if (!File.Exists(strTrainFile))
            {
                Logger.WriteLine(Logger.Level.err, "FAILED: The training corpus doesn't exist.");
                UsageTrain();
                return;
            }

            RNNEncoder encoder = new RNNEncoder(settings);

            // Training corpus: extract features into a fresh data set and share it with the settings.
            encoder.TrainingSet = new DataSet(tags.GetSize());
            LoadDataset(strTrainFile, featurizer, encoder.TrainingSet);
            settings.TrainDataSet = encoder.TrainingSet;

            // Validation corpus is optional.
            if (String.IsNullOrEmpty(strValidFile))
            {
                Logger.WriteLine("Validated corpus isn't specified.");
                encoder.ValidationSet = null;
            }
            else
            {
                Logger.WriteLine("Loading validated corpus from {0}", strValidFile);
                encoder.ValidationSet = new DataSet(tags.GetSize());
                LoadDataset(strValidFile, featurizer, encoder.ValidationSet);
            }

            if (iCRF == 1)
            {
                // CRF training needs the tag bigram transition matrix built from the training set.
                Logger.WriteLine("Initialize output tag bigram transition probability...");
                encoder.TrainingSet.BuildLabelBigramTransition();
            }

            encoder.Train();
        }
Ejemplo n.º 6
0
        /// <summary>
        /// Runs the test command: validates the command-line settings, loads the tag
        /// set, featurizer and decoder, then decodes every record of the test corpus
        /// and writes each token line followed by its predicted tag name to the
        /// output file. When <c>nBest</c> is not 1, each of the n-best results is
        /// written as its own block separated by a blank line.
        /// On any validation failure an error is logged, the usage text is printed
        /// and the method returns.
        /// </summary>
        private static void Test()
        {
            if (String.IsNullOrEmpty(strTagFile) == true)
            {
                Logger.WriteLine(Logger.Level.err, "FAILED: The tag mapping file {0} isn't specified.", strTagFile);
                UsageTest();
                return;
            }

            //Load tag name
            TagSet tagSet = new TagSet(strTagFile);

            if (String.IsNullOrEmpty(strModelFile) == true)
            {
                Logger.WriteLine(Logger.Level.err, "FAILED: The model file {0} isn't specified.", strModelFile);
                UsageTest();
                return;
            }

            if (String.IsNullOrEmpty(strFeatureConfigFile) == true)
            {
                Logger.WriteLine(Logger.Level.err, "FAILED: The feature configuration file {0} isn't specified.", strFeatureConfigFile);
                UsageTest();
                return;
            }

            //A null output file name is rejected the same way as an empty one
            if (String.IsNullOrEmpty(strOutputFile) == true)
            {
                Logger.WriteLine(Logger.Level.err, "FAILED: The output file name should not be empty.");
                UsageTest();
                return;
            }

            //Create feature extractors and load word embedding data from file
            Featurizer featurizer = new Featurizer(strFeatureConfigFile, tagSet);

            featurizer.ShowFeatureSize();

            //Create instance for decoder
            RNNSharp.RNNDecoder decoder = new RNNSharp.RNNDecoder(strModelFile, featurizer);


            if (File.Exists(strTestFile) == false)
            {
                Logger.WriteLine(Logger.Level.err, "FAILED: The test corpus {0} isn't existed.", strTestFile);
                UsageTest();
                return;
            }

            //Both streams are disposed (and the writer flushed) even if decoding throws
            using (StreamReader sr = new StreamReader(strTestFile))
            using (StreamWriter sw = new StreamWriter(strOutputFile))
            {
                while (true)
                {
                    Sentence sent = new Sentence(ReadRecord(sr));
                    if (sent.TokensList.Count <= 2)
                    {
                        //No more record, it only contains <s> and </s>
                        break;
                    }

                    if (nBest == 1)
                    {
                        int[] output = decoder.Process(sent);
                        //Output decoded result
                        //Append the decoded result into the end of feature set of each token
                        StringBuilder sb = new StringBuilder();
                        for (int i = 0; i < sent.TokensList.Count; i++)
                        {
                            string tokens = String.Join("\t", sent.TokensList[i]);
                            sb.Append(tokens);
                            sb.Append("\t");
                            sb.Append(tagSet.GetTagName(output[i]));
                            sb.AppendLine();
                        }

                        sw.WriteLine(sb.ToString());
                    }
                    else
                    {
                        int[][]       output = decoder.ProcessNBest(sent, nBest);
                        StringBuilder sb     = new StringBuilder();
                        for (int i = 0; i < nBest; i++)
                        {
                            for (int j = 0; j < sent.TokensList.Count; j++)
                            {
                                //i selects the n-best result, j the token position:
                                //the token line must be looked up by j, like its tag
                                string tokens = String.Join("\t", sent.TokensList[j]);
                                sb.Append(tokens);
                                sb.Append("\t");
                                sb.Append(tagSet.GetTagName(output[i][j]));
                                sb.AppendLine();
                            }
                            sb.AppendLine();
                        }

                        sw.WriteLine(sb.ToString());
                    }
                }
            }
        }
Ejemplo n.º 7
0
 /// <summary>
 /// Creates a learner and stores the given featurizer in its <c>featurizer</c> field.
 /// </summary>
 /// <param name="featurizer">Featurizer kept by this learner.</param>
 public Learner(Featurizer featurizer)
 {
     this.featurizer = featurizer;
 }