/// <summary>
/// Loads a labeled corpus from <paramref name="strFileName"/>, extracts features for
/// each sentence and appends the resulting labeled sequences to <paramref name="dataSet"/>.
/// Reading stops at the first record that contains only the &lt;s&gt; and &lt;/s&gt; markers.
/// </summary>
/// <param name="strFileName">Path of the training/validation corpus file.</param>
/// <param name="featurizer">Feature extractor used to convert sentences into sequences.</param>
/// <param name="dataSet">Destination data set that receives the extracted sequences.</param>
static void LoadDataset(string strFileName, Featurizer featurizer, DataSet dataSet)
{
    CheckCorpus(strFileName);
    int recordCount = 0;
    // using ensures the reader is closed even if feature extraction throws
    // (the original only called Close() on the happy path).
    using (StreamReader sr = new StreamReader(strFileName))
    {
        while (true)
        {
            //Extract features from it and convert it into sequence
            Sentence sent = new Sentence(ReadRecord(sr));
            if (sent.TokensList.Count <= 2)
            {
                //No more record, it only contains <s> and </s>
                break;
            }

            Sequence seq = featurizer.ExtractFeatures(sent);

            //Set label for the sequence
            seq.SetLabel(sent, featurizer.TagSet);

            //Add the sequence into data set
            dataSet.SequenceList.Add(seq);

            //Show progress at every 10000 records (comment previously said 1000,
            //but the code has always checked % 10000)
            recordCount++;
            if (recordCount % 10000 == 0)
            {
                Logger.WriteLine("{0}...", recordCount);
            }
        }
    }
}
/// <summary>
/// Creates a model from its already-trained components; this constructor only
/// stores the supplied references, it performs no training or validation.
/// </summary>
/// <param name="regression">Trained multinomial logistic regression classifier.</param>
/// <param name="featurizer">Feature extractor used to build the feature space.</param>
/// <param name="featureSpace">Feature space the regression was trained over.</param>
/// <param name="targets">Set of prediction targets known to the model.</param>
public Model(MultinomialLogisticRegression regression, Featurizer featurizer, FeatureSpace featureSpace, HashSet<Target> targets)
{
    this.regression = regression;
    Featurizer = featurizer;
    FeatureSpace = featureSpace;
    Targets = targets;
}
/// <summary>
/// Runs the full training pipeline: builds the feature space over all entities,
/// logs feature statistics, learns a model, serializes it to disk and returns it.
/// </summary>
/// <returns>The trained, already-saved model.</returns>
static Model preprocess()
{
    Logger.Log("Begin training");

    // NOTE(review): 'reader' is never used below — confirm Reader() has no
    // required side effects before removing this line.
    var reader = new Reader();

    var featurizer = new Featurizer();
    featurizer.Blacklist = new Blacklist(@"Data\Features\Blacklist.txt");

    var targets = new HashSet<Target>();
    var featureSpace = featurizer.CreateFeatureSpace(Entities());
    Logger.Log("Feature space created, {0} features", featureSpace.Size);

    // Per-type feature breakdown for diagnostics.
    Logger.Log("# of features by type:");
    foreach (var pair in featureSpace.featureTypeCount)
    {
        Logger.Log("{0}: {1}", pair.Key, pair.Value);
    }
    Logger.Log("Operating with {0} entities", featureSpace.NumEntities);

    // Entities() is enumerated a second time here; presumably it is a fresh
    // enumeration each call — TODO confirm it is repeatable.
    var learner = new Learner(featurizer);
    var model = learner.Learn(Entities(), featureSpace.NumEntities, featureSpace, targets);
    Logger.Log("Model learned");

    model.Save(@"Data\model");
    Logger.Log("Model serialized to file");
    return model;
}
/// <summary>
/// Loads a parallel corpus (alternating source/target records) for sequence-to-sequence
/// training and appends the extracted sequence pairs to <paramref name="dataSet"/>.
/// Pairs whose target labels cannot be mapped to the tag set are skipped.
/// Reading stops when the source record contains only &lt;s&gt;/&lt;/s&gt; or the target is empty.
/// </summary>
/// <param name="strFileName">Path of the parallel corpus file.</param>
/// <param name="featurizer">Feature extractor used to convert sentence pairs into sequences.</param>
/// <param name="dataSet">Destination data set that receives the extracted sequence pairs.</param>
static void LoadSeq2SeqDataSet(string strFileName, Featurizer featurizer, DataSet<SequencePair> dataSet)
{
    // NOTE(review): "seq2seq2" in this message looks like a typo for "seq2seq" —
    // kept byte-identical in case anything parses the log output.
    Logger.WriteLine("Loading data set for seq2seq2 training...");
    int recordCount = 0;
    // using ensures the reader is closed even if feature extraction throws
    // (the original only called Close() on the happy path).
    using (StreamReader sr = new StreamReader(strFileName))
    {
        while (true)
        {
            SentencePair sentPair = new SentencePair();

            //Extract features from it and convert it into sequence
            sentPair.srcSentence = new Sentence(ReadRecord(sr));
            sentPair.tgtSentence = new Sentence(ReadRecord(sr), false);
            if (sentPair.srcSentence.TokensList.Count <= 2 || sentPair.tgtSentence.TokensList.Count <= 0)
            {
                //No more record, it only contains <s> and </s>
                break;
            }

            SequencePair seq = featurizer.ExtractFeatures(sentPair);
            //Only keep pairs whose target labels map cleanly onto the tag set
            if (seq.tgtSequence.SetLabel(sentPair.tgtSentence, featurizer.TagSet))
            {
                dataSet.SequenceList.Add(seq);

                //Show progress at every 10000 records (comment previously said 1000,
                //but the code has always checked % 10000)
                recordCount++;
                if (recordCount % 10000 == 0)
                {
                    Logger.WriteLine("{0}...", recordCount);
                }
            }
        }
    }
}
/// <summary>
/// Entry point for model training: validates the configured file paths, builds the
/// RNN configuration from the console options, loads the training (and optional
/// validation) corpus, and runs the encoder.
/// Relies on the class-level option fields (strTagFile, strModelFile, iCRF, ...)
/// having been populated from the command line before this is called.
/// </summary>
private static void Train()
{
    Logger.LogFile = "RNNSharpConsole.log";

    if (!File.Exists(strTagFile))
    {
        // NOTE(review): "isn't existed" is ungrammatical but kept byte-identical
        // in case anything matches on the log text.
        Logger.WriteLine(Logger.Level.err, "FAILED: The tag mapping file {0} isn't existed.", strTagFile);
        UsageTrain();
        return;
    }

    //Load tag id and its name from file
    TagSet tagSet = new TagSet(strTagFile);

    //Create configuration instance and set parameters
    ModelSetting RNNConfig = new ModelSetting();
    RNNConfig.TagFile = strTagFile;
    RNNConfig.Tags = tagSet;
    RNNConfig.ModelFile = strModelFile;
    RNNConfig.HiddenLayerSizeList = hiddenLayerSizeList;
    RNNConfig.IsCRFTraining = iCRF == 1;  // was a redundant `? true : false` ternary
    RNNConfig.ModelDirection = iDir;
    RNNConfig.VQ = iVQ;
    RNNConfig.ModelType = ParseLayerType(hiddenLayerType);
    RNNConfig.OutputLayerType = ParseLayerType(outputLayerType);
    RNNConfig.MaxIteration = maxIter;
    RNNConfig.SaveStep = savestep;
    RNNConfig.LearningRate = alpha;
    RNNConfig.Dropout = dropout;
    RNNConfig.Bptt = bptt;
    RNNConfig.GradientCutoff = gradientCutoff;
    RNNConfig.NCESampleSize = nceSampleSize;

    //Dump RNN setting on console
    RNNConfig.DumpSetting();

    if (!File.Exists(strFeatureConfigFile))
    {
        Logger.WriteLine(Logger.Level.err, "FAILED: The feature configuration file {0} doesn't exist.", strFeatureConfigFile);
        UsageTrain();
        return;
    }

    //Create feature extractors and load word embedding data from file
    Featurizer featurizer = new Featurizer(strFeatureConfigFile, tagSet);
    featurizer.ShowFeatureSize();

    //Run-time features are incompatible with bi-directional decoding
    if (featurizer.IsRunTimeFeatureUsed() && iDir == 1)
    {
        Logger.WriteLine(Logger.Level.err, "FAILED: Run time feature is not available for bi-directional RNN model.");
        UsageTrain();
        return;
    }

    if (!File.Exists(strTrainFile))
    {
        Logger.WriteLine(Logger.Level.err, "FAILED: The training corpus doesn't exist.");
        UsageTrain();
        return;
    }

    //Create RNN encoder and save necessary parameters
    RNNEncoder encoder = new RNNEncoder(RNNConfig);

    //LoadFeatureConfig training corpus and extract feature set
    encoder.TrainingSet = new DataSet(tagSet.GetSize());
    LoadDataset(strTrainFile, featurizer, encoder.TrainingSet);
    RNNConfig.TrainDataSet = encoder.TrainingSet;

    if (!String.IsNullOrEmpty(strValidFile))
    {
        //LoadFeatureConfig validated corpus and extract feature set
        Logger.WriteLine("Loading validated corpus from {0}", strValidFile);
        encoder.ValidationSet = new DataSet(tagSet.GetSize());
        LoadDataset(strValidFile, featurizer, encoder.ValidationSet);
    }
    else
    {
        Logger.WriteLine("Validated corpus isn't specified.");
        encoder.ValidationSet = null;
    }

    if (iCRF == 1)
    {
        Logger.WriteLine("Initialize output tag bigram transition probability...");
        //Build tag bigram transition matrix
        encoder.TrainingSet.BuildLabelBigramTransition();
    }

    //Start to train the model
    encoder.Train();
}
/// <summary>
/// Entry point for decoding: validates the configured file paths, loads the model
/// and feature extractors, decodes every record of the test corpus and writes the
/// tagged output (1-best or n-best, depending on the nBest option) to the output file.
/// Relies on the class-level option fields (strTagFile, strModelFile, nBest, ...)
/// having been populated from the command line before this is called.
/// </summary>
private static void Test()
{
    if (String.IsNullOrEmpty(strTagFile))
    {
        Logger.WriteLine(Logger.Level.err, "FAILED: The tag mapping file {0} isn't specified.", strTagFile);
        UsageTest();
        return;
    }

    //Load tag name
    TagSet tagSet = new TagSet(strTagFile);

    if (String.IsNullOrEmpty(strModelFile))
    {
        Logger.WriteLine(Logger.Level.err, "FAILED: The model file {0} isn't specified.", strModelFile);
        UsageTest();
        return;
    }

    if (String.IsNullOrEmpty(strFeatureConfigFile))
    {
        Logger.WriteLine(Logger.Level.err, "FAILED: The feature configuration file {0} isn't specified.", strFeatureConfigFile);
        UsageTest();
        return;
    }

    if (strOutputFile.Length == 0)
    {
        Logger.WriteLine(Logger.Level.err, "FAILED: The output file name should not be empty.");
        UsageTest();
        return;
    }

    //Create feature extractors and load word embedding data from file
    Featurizer featurizer = new Featurizer(strFeatureConfigFile, tagSet);
    featurizer.ShowFeatureSize();

    //Create instance for decoder
    RNNSharp.RNNDecoder decoder = new RNNSharp.RNNDecoder(strModelFile, featurizer);

    if (!File.Exists(strTestFile))
    {
        // NOTE(review): "isn't existed" is ungrammatical but kept byte-identical
        // in case anything matches on the log text.
        Logger.WriteLine(Logger.Level.err, "FAILED: The test corpus {0} isn't existed.", strTestFile);
        UsageTest();
        return;
    }

    // using ensures both streams are closed/flushed even if decoding throws
    // (the original only called Close() on the happy path).
    using (StreamReader sr = new StreamReader(strTestFile))
    using (StreamWriter sw = new StreamWriter(strOutputFile))
    {
        while (true)
        {
            Sentence sent = new Sentence(ReadRecord(sr));
            if (sent.TokensList.Count <= 2)
            {
                //No more record, it only contains <s> and </s>
                break;
            }

            if (nBest == 1)
            {
                int[] output = decoder.Process(sent);

                //Output decoded result
                //Append the decoded result into the end of feature set of each token
                StringBuilder sb = new StringBuilder();
                for (int i = 0; i < sent.TokensList.Count; i++)
                {
                    string tokens = String.Join("\t", sent.TokensList[i]);
                    sb.Append(tokens);
                    sb.Append("\t");
                    sb.Append(tagSet.GetTagName(output[i]));
                    sb.AppendLine();
                }
                sw.WriteLine(sb.ToString());
            }
            else
            {
                int[][] output = decoder.ProcessNBest(sent, nBest);
                StringBuilder sb = new StringBuilder();
                for (int i = 0; i < nBest; i++)
                {
                    for (int j = 0; j < sent.TokensList.Count; j++)
                    {
                        //BUG FIX: the token index is j, not the n-best index i.
                        //The original wrote TokensList[i], emitting the wrong token's
                        //features (or throwing when nBest > token count).
                        string tokens = String.Join("\t", sent.TokensList[j]);
                        sb.Append(tokens);
                        sb.Append("\t");
                        sb.Append(tagSet.GetTagName(output[i][j]));
                        sb.AppendLine();
                    }
                    sb.AppendLine();
                }
                sw.WriteLine(sb.ToString());
            }
        }
    }
}
/// <summary>
/// Creates a learner that uses the supplied featurizer to extract features
/// during training; the reference is stored as-is, no work happens here.
/// </summary>
/// <param name="featurizer">Feature extractor used by the learning process.</param>
public Learner(Featurizer featurizer)
{
    this.featurizer = featurizer;
}