/// <summary>
/// Trains a parser model (build and check models plus a POS tagger and a chunker)
/// with the given parameters.
/// </summary>
/// <param name="monitor">
/// An evaluation monitor that can be used to listen to the messages emitted during the training,
/// or to cancel the training operation. It is handed to the build and check trainers.
/// This argument can be a <c>null</c> value.
/// </param>
/// <param name="languageCode">The language code.</param>
/// <param name="samples">
/// The data samples. The stream is reset after every stage so the next stage can read it again.
/// </param>
/// <param name="rules">The head rules.</param>
/// <param name="parameters">
/// The machine learnable parameters. The <c>build</c>, <c>tagger</c>, <c>chunker</c> and
/// <c>check</c> namespaces are used for the respective stages.
/// </param>
/// <returns>The trained <see cref="ParserModel" /> object.</returns>
public static ParserModel Train(
    Monitor monitor,
    string languageCode,
    IObjectStream<Parse> samples,
    AbstractHeadRules rules,
    TrainingParameters parameters)
{
    var dict = BuildDictionary(samples, rules, parameters);
    samples.Reset();

    var manifestInfoEntries = new Dictionary<string, string>();

    // build
    var bes = new ParserEventStream(samples, rules, ParserEventTypeEnum.Build, dict);
    var buildReportMap = new Dictionary<string, string>();
    var buildTrainer = TrainerFactory.GetEventTrainer(parameters.GetNamespace("build"), buildReportMap, monitor);
    var buildModel = buildTrainer.Train(bes);
    MergeReportIntoManifest(manifestInfoEntries, buildReportMap, "build");
    samples.Reset();

    // tag
    var posTaggerParams = parameters.GetNamespace("tagger");
    if (!posTaggerParams.Contains(Parameters.BeamSize))
    {
        posTaggerParams.Set(Parameters.BeamSize, "10");
    }

    // Train with the very instance that received the default beam size. Asking for the
    // "tagger" namespace a second time may hand back an instance without that default.
    var posModel = POSTaggerME.Train(
        languageCode,
        new PosSampleStream(samples),
        posTaggerParams,
        new POSTaggerFactory());
    samples.Reset();

    // chunk
    var chunkModel = ChunkerME.Train(
        languageCode,
        new ChunkSampleStream(samples),
        parameters.GetNamespace("chunker"),
        new ParserChunkerFactory());
    samples.Reset();

    // check
    var kes = new ParserEventStream(samples, rules, ParserEventTypeEnum.Check);
    var checkReportMap = new Dictionary<string, string>();
    var checkTrainer = TrainerFactory.GetEventTrainer(parameters.GetNamespace("check"), checkReportMap, monitor);
    var checkModel = checkTrainer.Train(kes);
    MergeReportIntoManifest(manifestInfoEntries, checkReportMap, "check");

    return new ParserModel(
        languageCode,
        buildModel,
        checkModel,
        posModel,
        chunkModel,
        rules,
        manifestInfoEntries);
}
/// <summary>
/// Trains a perceptron model (cutoff 1, 500 iterations, step size decrease 0.06) on the
/// PrepAttach training stream and verifies it through <c>PrepAttachDataUtility.TestModel</c>.
/// </summary>
public void TestPerceptronOnPrepAttachDataWithStepSizeDecrease()
{
    var perceptronParams = new TrainingParameters();
    perceptronParams.Set(Parameters.Algorithm, Parameters.Algorithms.Perceptron);
    perceptronParams.Set(Parameters.Cutoff, "1");
    perceptronParams.Set(Parameters.Iterations, "500");
    perceptronParams.Set(Parameters.StepSizeDecrease, "0.06");

    var trainedModel = TrainerFactory
        .GetEventTrainer(perceptronParams, null, null)
        .Train(PrepAttachDataUtility.CreateTrainingStream());

    // The original Java test fails as well, so the value produced by this port is assumed
    // to be correct for now:
    //
    //   java.lang.AssertionError: expected:<0.7756870512503095> but was:<0.7766773953948998>
    //     at org.junit.Assert.fail(Assert.java:91)
    //     at org.junit.Assert.failNotEquals(Assert.java:645)
    //     at org.junit.Assert.assertEquals(Assert.java:441)
    //     at org.junit.Assert.assertEquals(Assert.java:510)
    //
    // Value expected by OpenNLP: 0.7756870512503095
    PrepAttachDataUtility.TestModel(trainedModel, 0.77742015350334237);
}
/// <summary>
/// Trains a name finder model with the given parameters.
/// </summary>
/// <param name="languageCode">The language of the training data.</param>
/// <param name="type">Overrides the type parameter in the provided samples. This value can be null.</param>
/// <param name="samples">The training samples.</param>
/// <param name="parameters">
/// The machine learning train parameters. They select the trainer type and supply the beam size.
/// </param>
/// <param name="factory">
/// The name finder factory; it supplies the context generator, the sequence codec,
/// the feature generator and the resources.
/// </param>
/// <param name="monitor">
/// An evaluation monitor that can be used to listen to the messages emitted during the training,
/// or to cancel the training operation. This argument can be a <c>null</c> value.
/// </param>
/// <returns>The newly trained <see cref="TokenNameFinderModel"/>.</returns>
/// <exception cref="System.InvalidOperationException">The trainer type is not one of the handled types.</exception>
public static TokenNameFinderModel Train(
    string languageCode,
    string type,
    IObjectStream<NameSample> samples,
    TrainingParameters parameters,
    TokenNameFinderFactory factory,
    Monitor monitor)
{
    var beamSize = parameters.Get(Parameters.BeamSize, DefaultBeamSize);
    var manifestInfoEntries = new Dictionary<string, string>();
    var trainerType = TrainerFactory.GetTrainerType(parameters);

    // Exactly one of these is assigned, depending on the trainer type.
    IMaxentModel maxentModel = null;
    ML.Model.ISequenceClassificationModel<string> sequenceModel = null;

    if (trainerType == TrainerType.EventModelTrainer)
    {
        var events = new NameFinderEventStream(
            samples,
            type,
            factory.CreateContextGenerator(),
            factory.CreateSequenceCodec());

        maxentModel = TrainerFactory
            .GetEventTrainer(parameters, manifestInfoEntries, monitor)
            .Train(events);
    }
    else if (trainerType == TrainerType.EventModelSequenceTrainer)
    {
        var sequences = new NameSampleSequenceStream(samples, factory.CreateContextGenerator());

        maxentModel = TrainerFactory
            .GetEventModelSequenceTrainer(parameters, manifestInfoEntries, monitor)
            .Train(sequences);
    }
    else if (trainerType == TrainerType.SequenceTrainer)
    {
        var sequences = new NameSampleSequenceStream(samples, factory.CreateContextGenerator());

        sequenceModel = TrainerFactory
            .GetSequenceModelTrainer(parameters, manifestInfoEntries, monitor)
            .Train(sequences);
    }
    else
    {
        throw new InvalidOperationException("Unexpected trainer type!");
    }

    // A sequence classification model takes precedence; the beam size applies to the maxent model only.
    return sequenceModel == null
        ? new TokenNameFinderModel(
            languageCode,
            maxentModel,
            beamSize,
            factory.FeatureGenerator,
            factory.Resources,
            manifestInfoEntries,
            factory.SequenceCodec)
        : new TokenNameFinderModel(
            languageCode,
            sequenceModel,
            factory.FeatureGenerator,
            factory.Resources,
            manifestInfoEntries,
            factory.SequenceCodec);
}
/// <summary>
/// Trains a document categorizer model with the given parameters.
/// </summary>
/// <param name="languageCode">The language code.</param>
/// <param name="samples">The data samples.</param>
/// <param name="parameters">The machine learnable parameters.</param>
/// <param name="factory">The document categorizer factory; its feature generators feed the event stream.</param>
/// <param name="monitor">
/// An evaluation monitor that can be used to listen to the messages emitted during the training,
/// or to cancel the training operation. This argument can be a <c>null</c> value.
/// </param>
/// <returns>The trained <see cref="DocumentCategorizerModel"/> model.</returns>
public static DocumentCategorizerModel Train(
    string languageCode,
    IObjectStream<DocumentSample> samples,
    TrainingParameters parameters,
    DocumentCategorizerFactory factory,
    Monitor monitor)
{
    var manifest = new Dictionary<string, string>();
    var events = new DocumentCategorizerEventStream(samples, factory.FeatureGenerators);

    var trainedModel = TrainerFactory
        .GetEventTrainer(parameters, manifest, monitor)
        .Train(events);

    return new DocumentCategorizerModel(languageCode, trainedModel, manifest, factory);
}
/// <summary>
/// Trains a MaxEntQn model with otherwise default parameters on the PrepAttach training stream
/// and verifies it through <c>TestModel</c>.
/// </summary>
public void TestQnOnPrepAttachDataWithParamsDefault()
{
    var qnParams = new TrainingParameters();
    qnParams.Set(Parameters.Algorithm, Parameters.Algorithms.MaxEntQn);

    var trainedModel = TrainerFactory
        .GetEventTrainer(qnParams, null, null)
        .Train(PrepAttachDataUtility.CreateTrainingStream());

    TestModel(trainedModel, 0.8115870264917059);
}
/// <summary>
/// Trains a MaxEnt model with otherwise default parameters on the PrepAttach training stream,
/// collecting the training report in a dictionary, and verifies the model through
/// <c>PrepAttachDataUtility.TestModel</c>.
/// </summary>
public void TestMaxentOnPrepAttachDataWithParamsDefault()
{
    var maxentParams = new TrainingParameters();
    maxentParams.Set(Parameters.Algorithm, Parameters.Algorithms.MaxEnt);

    var report = new Dictionary<string, string>();

    var trainedModel = TrainerFactory
        .GetEventTrainer(maxentParams, report, null)
        .Train(PrepAttachDataUtility.CreateTrainingStream());

    PrepAttachDataUtility.TestModel(trainedModel, 0.8086159940579352d);
}
/// <summary>
/// Trains a model for the <see cref="TokenizerME"/>.
/// </summary>
/// <param name="samples">The samples used for the training.</param>
/// <param name="factory">
/// A <see cref="TokenizerFactory"/> to get resources from (alphanumeric optimization flag,
/// alphanumeric pattern and context generator).
/// </param>
/// <param name="parameters">The machine learning train parameters.</param>
/// <param name="monitor">
/// An evaluation monitor that can be used to listen to the messages emitted during the training,
/// or to cancel the training operation. This argument can be a <c>null</c> value.
/// </param>
/// <returns>The trained <see cref="TokenizerModel"/>.</returns>
public static TokenizerModel Train(
    IObjectStream<TokenSample> samples,
    TokenizerFactory factory,
    TrainingParameters parameters,
    Monitor monitor)
{
    var manifest = new Dictionary<string, string>();

    var events = new TokSpanEventStream(
        samples,
        factory.UseAlphaNumericOptimization,
        factory.AlphaNumericPattern,
        factory.ContextGenerator);

    var trainedModel = TrainerFactory
        .GetEventTrainer(parameters, manifest, monitor)
        .Train(events);

    return new TokenizerModel(trainedModel, manifest, factory);
}
/// <summary>
/// Trains a perceptron model (cutoff 1, skipped averaging enabled) on the PrepAttach training
/// stream and verifies it through <c>PrepAttachDataUtility.TestModel</c>.
/// </summary>
public void TestPerceptronOnPrepAttachDataWithSkippedAveraging()
{
    var perceptronParams = new TrainingParameters();
    perceptronParams.Set(Parameters.Algorithm, Parameters.Algorithms.Perceptron);
    perceptronParams.Set(Parameters.Cutoff, "1");
    perceptronParams.Set(Parameters.UseSkippedAveraging, "true");

    var trainedModel = TrainerFactory
        .GetEventTrainer(perceptronParams, null, null)
        .Train(PrepAttachDataUtility.CreateTrainingStream());

    PrepAttachDataUtility.TestModel(trainedModel, 0.773706362961129);
}
/// <summary>
/// Trains a lemmatizer model with the given parameters.
/// </summary>
/// <param name="languageCode">The language code.</param>
/// <param name="samples">The data samples.</param>
/// <param name="parameters">
/// The machine learnable parameters. They select the trainer type and supply the beam size.
/// </param>
/// <param name="factory">The lemmatizer factory; it supplies the context generator.</param>
/// <param name="monitor">
/// An evaluation monitor that can be used to listen to the messages emitted during the training,
/// or to cancel the training operation. This argument can be a <c>null</c> value.
/// </param>
/// <returns>The trained <see cref="LemmatizerModel" /> object.</returns>
/// <exception cref="System.InvalidOperationException">The trainer was not specified.</exception>
/// <exception cref="System.NotSupportedException">Trainer type is not supported.</exception>
public static LemmatizerModel Train(
    string languageCode,
    IObjectStream<LemmaSample> samples,
    TrainingParameters parameters,
    LemmatizerFactory factory,
    Monitor monitor)
{
    var manifest = new Dictionary<string, string>();
    var beamSize = parameters.Get(Parameters.BeamSize, DefaultBeamSize);
    var contextGenerator = factory.GetContextGenerator();

    var trainerType = TrainerFactory.GetTrainerType(parameters);
    if (trainerType == null)
    {
        throw new InvalidOperationException("The trainer was not specified.");
    }

    // Exactly one of these is assigned, depending on the trainer type.
    IMaxentModel maxentModel = null;
    ML.Model.ISequenceClassificationModel<string> sequenceModel = null;

    if (trainerType == TrainerType.EventModelTrainer)
    {
        var events = new LemmaSampleEventStream(samples, contextGenerator);

        maxentModel = TrainerFactory
            .GetEventTrainer(parameters, manifest, monitor)
            .Train(events);
    }
    else if (trainerType == TrainerType.EventModelSequenceTrainer)
    {
        var sequences = new LemmaSampleSequenceStream(samples, contextGenerator);

        maxentModel = TrainerFactory
            .GetEventModelSequenceTrainer(parameters, manifest, monitor)
            .Train(sequences);
    }
    else if (trainerType == TrainerType.SequenceTrainer)
    {
        var sequences = new LemmaSampleSequenceStream(samples, contextGenerator);

        sequenceModel = TrainerFactory
            .GetSequenceModelTrainer(parameters, manifest, monitor)
            .Train(sequences);
    }
    else
    {
        throw new NotSupportedException("Trainer type is not supported.");
    }

    // The beam size is only relevant for the maxent based model.
    if (maxentModel != null)
    {
        return new LemmatizerModel(languageCode, maxentModel, beamSize, manifest, factory);
    }

    return new LemmatizerModel(languageCode, sequenceModel, manifest, factory);
}
/// <summary>
/// Trains a perceptron model (cutoff 1, 500 iterations, tolerance 0.0001) on the PrepAttach
/// training stream and verifies it through <c>PrepAttachDataUtility.TestModel</c>.
/// </summary>
public void TestPerceptronOnPrepAttachDataWithTolerance()
{
    var perceptronParams = new TrainingParameters();
    perceptronParams.Set(Parameters.Algorithm, Parameters.Algorithms.Perceptron);
    perceptronParams.Set(Parameters.Cutoff, "1");
    perceptronParams.Set(Parameters.Iterations, "500");
    perceptronParams.Set(Parameters.Tolerance, "0.0001");

    var trainedModel = TrainerFactory
        .GetEventTrainer(perceptronParams, null, null)
        .Train(PrepAttachDataUtility.CreateTrainingStream());

    PrepAttachDataUtility.TestModel(trainedModel, 0.7677642980935875);
}
/// <summary>
/// Trains a MaxEnt model (two pass data indexer, cutoff 1) on the PrepAttach training stream,
/// collecting the training report in a dictionary, and verifies the model through
/// <c>PrepAttachDataUtility.TestModel</c>.
/// </summary>
public void TestMaxentOnPrepAttachDataWithParams()
{
    var maxentParams = new TrainingParameters();
    maxentParams.Set(Parameters.Algorithm, Parameters.Algorithms.MaxEnt);
    maxentParams.Set(Parameters.DataIndexer, Parameters.DataIndexers.TwoPass);
    maxentParams.Set(Parameters.Cutoff, "1");

    var report = new Dictionary<string, string>();

    var trainedModel = TrainerFactory
        .GetEventTrainer(maxentParams, report, null)
        .Train(PrepAttachDataUtility.CreateTrainingStream());

    PrepAttachDataUtility.TestModel(trainedModel, 0.7997028967566229d);
}
/// <summary>
/// Trains a MaxEntQn model with the thread count set to 2 on the PrepAttach training stream
/// and verifies it through <c>TestModel</c>.
/// </summary>
public void TestQnOnPrepAttachDataInParallel()
{
    var qnParams = new TrainingParameters();
    qnParams.Set(Parameters.Algorithm, Parameters.Algorithms.MaxEntQn);
    // Iterations are deliberately left at the default (an explicit "100" was disabled here).
    qnParams.Set(Parameters.Threads, "2");

    var trainedModel = TrainerFactory
        .GetEventTrainer(qnParams, null, null)
        .Train(PrepAttachDataUtility.CreateTrainingStream());

    TestModel(trainedModel, 0.8115870264917059);
}
/// <summary>
/// Trains a sentence detection model with the given parameters.
/// </summary>
/// <param name="languageCode">The language code.</param>
/// <param name="samples">The data samples.</param>
/// <param name="factory">
/// The sentence detector factory; it supplies the context generator and the end of sentence scanner.
/// </param>
/// <param name="parameters">The machine learnable parameters.</param>
/// <param name="monitor">
/// An evaluation monitor that can be used to listen to the messages emitted during the training,
/// or to cancel the training operation. This argument can be a <c>null</c> value.
/// </param>
/// <returns>The trained <see cref="SentenceModel"/> object.</returns>
public static SentenceModel Train(
    string languageCode,
    IObjectStream<SentenceSample> samples,
    SentenceDetectorFactory factory,
    TrainingParameters parameters,
    Monitor monitor)
{
    var manifest = new Dictionary<string, string>();

    // TODO: Fix the EventStream to throw exceptions when training goes wrong
    var events = new SentenceEventStream(
        samples,
        factory.GetContextGenerator(),
        factory.GetEndOfSentenceScanner());

    var trainedModel = TrainerFactory
        .GetEventTrainer(parameters, manifest, monitor)
        .Train(events);

    return new SentenceModel(languageCode, trainedModel, manifest, factory);
}
/// <summary>
/// Trains a part of speech model with the given parameters.
/// </summary>
/// <param name="languageCode">The language code.</param>
/// <param name="samples">The data samples.</param>
/// <param name="parameters">The machine learnable parameters. They select the trainer type.</param>
/// <param name="factory">The POS tagger factory; it supplies the POS context generator.</param>
/// <param name="monitor">
/// An evaluation monitor that can be used to listen to the messages emitted during the training,
/// or to cancel the training operation. This argument can be a <c>null</c> value.
/// </param>
/// <returns>The trained <see cref="POSModel"/> object.</returns>
/// <exception cref="System.NotSupportedException">Trainer type is not supported.</exception>
public static POSModel Train(
    string languageCode,
    IObjectStream<POSSample> samples,
    TrainingParameters parameters,
    POSTaggerFactory factory,
    Monitor monitor)
{
    var posContextGenerator = factory.GetPOSContextGenerator();
    var manifest = new Dictionary<string, string>();
    var trainerType = TrainerFactory.GetTrainerType(parameters);

    // Exactly one of these is assigned, depending on the trainer type.
    IMaxentModel maxentModel = null;
    ML.Model.ISequenceClassificationModel<string> sequenceModel = null;

    if (trainerType == TrainerType.EventModelTrainer)
    {
        var events = new POSSampleEventStream(samples, posContextGenerator);

        maxentModel = TrainerFactory
            .GetEventTrainer(parameters, manifest, monitor)
            .Train(events);
    }
    else if (trainerType == TrainerType.EventModelSequenceTrainer)
    {
        var sequences = new POSSampleSequenceStream(samples, posContextGenerator);

        maxentModel = TrainerFactory
            .GetEventModelSequenceTrainer(parameters, manifest, monitor)
            .Train(sequences);
    }
    else if (trainerType == TrainerType.SequenceTrainer)
    {
        var sequenceTrainer = TrainerFactory.GetSequenceModelTrainer(parameters, manifest, monitor);

        // TODO: This will probably cause issue, since the feature generator uses the outcomes array
        var sequences = new POSSampleSequenceStream(samples, posContextGenerator);

        sequenceModel = sequenceTrainer.Train(sequences);
    }
    else
    {
        throw new NotSupportedException("Trainer type is not supported.");
    }

    return maxentModel != null
        ? new POSModel(languageCode, maxentModel, manifest, factory)
        : new POSModel(languageCode, sequenceModel, manifest, factory);
}
/// <summary>
/// Trains a MaxEntQn model (two pass data indexer, cutoff 1, L1 cost 0, L2 cost 1.0) on the
/// PrepAttach training stream and verifies it through <c>TestModel</c>.
/// </summary>
public void TestQnOnPrepAttachDataWithL2Params()
{
    var qnParams = new TrainingParameters();
    qnParams.Set(Parameters.Algorithm, Parameters.Algorithms.MaxEntQn);
    qnParams.Set(Parameters.DataIndexer, Parameters.DataIndexers.TwoPass);
    qnParams.Set(Parameters.Cutoff, "1");
    qnParams.Set(Parameters.L1Cost, "0");
    qnParams.Set(Parameters.L2Cost, "1.0");

    var trainedModel = TrainerFactory
        .GetEventTrainer(qnParams, null, null)
        .Train(PrepAttachDataUtility.CreateTrainingStream());

    TestModel(trainedModel, 0.8227283981183461);
}
/// <summary>
/// Trains a chunker model with the given parameters.
/// </summary>
/// <param name="languageCode">The language code.</param>
/// <param name="samples">The data samples.</param>
/// <param name="parameters">
/// The machine learnable parameters. They select the trainer type and supply the beam size.
/// </param>
/// <param name="factory">The chunker factory; it supplies the context generator.</param>
/// <param name="monitor">
/// An evaluation monitor that can be used to listen to the messages emitted during the training,
/// or to cancel the training operation. This argument can be a <c>null</c> value.
/// </param>
/// <returns>The trained <see cref="ChunkerModel"/> object.</returns>
/// <exception cref="System.InvalidOperationException">The trainer was not specified.</exception>
/// <exception cref="System.NotSupportedException">Trainer type is not supported.</exception>
public static ChunkerModel Train(
    string languageCode,
    IObjectStream<ChunkSample> samples,
    TrainingParameters parameters,
    ChunkerFactory factory,
    Monitor monitor)
{
    var trainerType = TrainerFactory.GetTrainerType(parameters);
    if (trainerType == null)
    {
        throw new InvalidOperationException("The trainer was not specified.");
    }

    var manifest = new Dictionary<string, string>();

    // Exactly one of these is assigned, depending on the trainer type.
    IMaxentModel maxentModel = null;
    ML.Model.ISequenceClassificationModel<string> sequenceModel = null;

    if (trainerType == TrainerType.SequenceTrainer)
    {
        var sequenceTrainer = TrainerFactory.GetSequenceModelTrainer(parameters, manifest, monitor);

        // TODO: This will probably cause issue, since the feature generator uses the outcomes array
        var sequences = new ChunkSampleSequenceStream(samples, factory.GetContextGenerator());

        sequenceModel = sequenceTrainer.Train(sequences);
    }
    else if (trainerType == TrainerType.EventModelTrainer)
    {
        var events = new ChunkerEventStream(samples, factory.GetContextGenerator());

        maxentModel = TrainerFactory
            .GetEventTrainer(parameters, manifest, monitor)
            .Train(events);
    }
    else
    {
        throw new NotSupportedException("Trainer type is not supported.");
    }

    // The beam size is read after training and only used for the maxent based model.
    var beamSize = parameters.Get(Parameters.BeamSize, DefaultBeamSize);

    if (maxentModel != null)
    {
        return new ChunkerModel(languageCode, maxentModel, beamSize, manifest, factory);
    }

    return new ChunkerModel(languageCode, sequenceModel, manifest, factory);
}
/// <summary>
/// Starts from the default training parameters, selects the naive Bayes algorithm with cutoff 5,
/// checks that the factory returns a <c>NaiveBayesTrainer</c>, trains on the PrepAttach training
/// stream and verifies the model through <c>PrepAttachDataUtility.TestModel</c>.
/// </summary>
public void TestNaiveBayesOnPrepAttachDataUsingTrainUtilWithCutoff5()
{
    var bayesParams = TrainingParameters.DefaultParameters();
    bayesParams.Set(Parameters.Algorithm, Parameters.Algorithms.NaiveBayes);
    bayesParams.Set(Parameters.Cutoff, "5");

    var bayesTrainer = TrainerFactory.GetEventTrainer(bayesParams, null, null);

    Assert.NotNull(bayesTrainer);
    Assert.IsInstanceOf<NaiveBayesTrainer>(bayesTrainer);

    var trainedModel = bayesTrainer.Train(PrepAttachDataUtility.CreateTrainingStream());

    Assert.NotNull(trainedModel);

    PrepAttachDataUtility.TestModel(trainedModel, 0.7945035899975241);
}
/// <summary>
/// Trains a parser model of type <c>ParserType.TreeInsert</c> (build, check and attach models
/// plus a POS tagger and a chunker) with the given parameters.
/// </summary>
/// <param name="languageCode">The language code.</param>
/// <param name="samples">
/// The data samples. The stream is reset between the stages so that every stage can read it again.
/// </param>
/// <param name="rules">The head rules.</param>
/// <param name="parameters">
/// The machine learnable parameters. The <c>tagger</c>, <c>chunker</c>, <c>build</c>,
/// <c>check</c> and <c>attach</c> namespaces are used for the respective stages.
/// </param>
/// <param name="monitor">
/// An evaluation monitor that can be used to listen to the messages emitted during the training,
/// or to cancel the training operation. It is passed on to every training stage.
/// This argument can be a <c>null</c> value.
/// </param>
/// <returns>The trained <see cref="ParserModel"/> object.</returns>
/// <exception cref="System.NotSupportedException">Trainer type is not supported.</exception>
public static ParserModel Train(
    string languageCode,
    IObjectStream<Parse> samples,
    AbstractHeadRules rules,
    TrainingParameters parameters,
    Monitor monitor)
{
    var manifest = new Dictionary<string, string>();

    System.Diagnostics.Debug.Print("Building dictionary");
    var dictionary = BuildDictionary(samples, rules, parameters);
    samples.Reset();

    // tag
    var taggerSamples = new PosSampleStream(samples);
    var taggerParams = parameters.GetNamespace("tagger");
    var posModel = POSTaggerME.Train(languageCode, taggerSamples, taggerParams, new POSTaggerFactory(), monitor);
    samples.Reset();

    // chunk
    var chunkerSamples = new ChunkSampleStream(samples);
    var chunkerParams = parameters.GetNamespace("chunker");
    var chunkModel = ChunkerME.Train(languageCode, chunkerSamples, chunkerParams, new ChunkerFactory(), monitor);
    samples.Reset();

    // build
    System.Diagnostics.Debug.Print("Training builder");
    var buildEvents = new ParserEventStream(samples, rules, ParserEventTypeEnum.Build, dictionary);
    var buildReport = new Dictionary<string, string>();
    var buildModel = TrainerFactory
        .GetEventTrainer(parameters.GetNamespace("build"), buildReport, monitor)
        .Train(buildEvents);
    Chunking.Parser.MergeReportIntoManifest(manifest, buildReport, "build");
    samples.Reset();

    // check
    System.Diagnostics.Debug.Print("Training checker");
    var checkEvents = new ParserEventStream(samples, rules, ParserEventTypeEnum.Check);
    var checkReport = new Dictionary<string, string>();
    var checkModel = TrainerFactory
        .GetEventTrainer(parameters.GetNamespace("check"), checkReport, monitor)
        .Train(checkEvents);
    Chunking.Parser.MergeReportIntoManifest(manifest, checkReport, "check");
    samples.Reset();

    // attach
    System.Diagnostics.Debug.Print("Training attacher");
    var attachEvents = new ParserEventStream(samples, rules, ParserEventTypeEnum.Attach);
    var attachReport = new Dictionary<string, string>();
    var attachModel = TrainerFactory
        .GetEventTrainer(parameters.GetNamespace("attach"), attachReport, monitor)
        .Train(attachEvents);
    Chunking.Parser.MergeReportIntoManifest(manifest, attachReport, "attach");

    return new ParserModel(
        languageCode,
        buildModel,
        checkModel,
        attachModel,
        posModel,
        chunkModel,
        rules,
        ParserType.TreeInsert,
        manifest);
}