public void testSimpleTraining() {
    // Training corpus: category "1" samples share tokens a/b/c, category "0"
    // samples share x/y/z, so the two categories are cleanly separable.
    IObjectStream<DocumentSample> samples = new GenericObjectStream<DocumentSample>(new[] {
        new DocumentSample("1", new[] {"a", "b", "c"}),
        new DocumentSample("1", new[] {"a", "b", "c", "1", "2"}),
        new DocumentSample("1", new[] {"a", "b", "c", "3", "4"}),
        new DocumentSample("0", new[] {"x", "y", "z"}),
        new DocumentSample("0", new[] {"x", "y", "z", "5", "6"}),
        new DocumentSample("0", new[] {"x", "y", "z", "7", "8"})
    });

    var param = new TrainingParameters();
    param.Set(Parameters.Iterations, "100");
    param.Set(Parameters.Cutoff, "0");

    var model = DocumentCategorizerME.Train("x-unspecified", samples, param, new DocumentCategorizerFactory());

    var doccat = new DocumentCategorizerME(model);

    var aProbs = doccat.Categorize("a");
    Assert.AreEqual("1", doccat.GetBestCategory(aProbs));

    var bProbs = doccat.Categorize("x");
    Assert.AreEqual("0", doccat.GetBestCategory(bProbs));

    // Test to make sure the sorted map's LAST key is cat "1" because it has
    // the highest score. The map is sorted ascending by score, so the first
    // entry enumerated here must be cat "0" (lowest score for input "a").
    // The original asserted "1" on the first entry, which contradicts the
    // ascending order and the equivalent TestSimpleTraining test elsewhere
    // in this suite.
    var sortedScoreMap = doccat.SortedScoreMap("a");
    foreach (var pair in sortedScoreMap) {
        Assert.AreEqual("0", pair.Value[0]);
        break;
    }
}
public void Setup() {
    // Baseline training parameters shared by the tests in this fixture:
    // GIS maxent algorithm, 10 iterations, feature cutoff of 5.
    mlParams = new TrainingParameters();
    mlParams.Set(Parameters.Algorithm, GIS.MaxEntropy);
    mlParams.Set(Parameters.Iterations, "10");
    mlParams.Set(Parameters.Cutoff, "5");

    // Register the dummy trainer under the name "Dummy" so tests can request
    // it from the TrainerFactory by algorithm name.
    TrainerFactory.RegisterTrainer("Dummy", typeof(DummyTrainer));
}
public static TokenizerModel CreateMaxentTokenModel() {
    // Trains a maxent tokenizer model from the bundled training file.
    using (var trainingData = Tests.OpenFile("/opennlp/tools/tokenize/token.train")) {
        var sampleStream = new TokenSampleStream(new PlainTextByLineStream(trainingData));

        var parameters = new TrainingParameters();
        parameters.Set(Parameters.Iterations, "100");
        parameters.Set(Parameters.Cutoff, "0");

        return TokenizerME.Train(sampleStream, new TokenizerFactory("en", null, true), parameters);
    }
}
public void Setup() {
    // Trains a small chunker model used by the subsequent tests.
    var parameters = new TrainingParameters();
    parameters.Set(Parameters.Iterations, "70");
    parameters.Set(Parameters.Cutoff, "1");

    var model = ChunkerME.Train("en", CreateSampleStream(), parameters, new ChunkerFactory());
    chunker = new ChunkerME(model);
}
public void TestNameFinder() {
    using (var file = Tests.OpenFile("opennlp/tools/namefind/AnnotatedSentences.txt")) {
        // Latin-1 encoded training data with annotated names.
        var sampleStream = new NameSampleStream(new PlainTextByLineStream(file, "ISO-8859-1"));

        var param = new TrainingParameters();
        param.Set(Parameters.Iterations, "70");
        param.Set(Parameters.Cutoff, "1");

        var model = NameFinderME.Train(
            "en",
            sampleStream,
            param,
            new TokenNameFinderFactory(null, new Dictionary<string, object>()));

        var nameFinder = new NameFinderME(model);

        // now test if it can detect the sample sentences

        // One name expected: "Alisa" at token index 0.
        var sentence = new[] {
            "Alisa", "appreciated", "the", "hint", "and", "enjoyed", "a", "delicious", "traditional", "meal."
        };

        var names = nameFinder.Find(sentence);

        Assert.AreEqual(1, names.Length);
        Assert.AreEqual(new Span(0, 1, Type), names[0]);

        // Two names expected: "Mike" and "Stefanie Schmidt".
        sentence = new[] { "Hi", "Mike", ",", "it's", "Stefanie", "Schmidt", "." };

        names = nameFinder.Find(sentence);

        Assert.AreEqual(2, names.Length);
        Assert.AreEqual(new Span(1, 2, Type), names[0]);
        Assert.AreEqual(new Span(4, 6, Type), names[1]);
    }
}
public static TokenizerModel TrainModel(string path) {
    // Trains a maxent tokenizer model from the training file at <paramref name="path"/>.
    // The FileStream is wrapped in a using block so the file handle is always
    // released; the original leaked it (compare the using-based helpers in
    // this suite).
    using (FileStream fs = new FileStream(path, FileMode.Open, FileAccess.Read)) {
        TokenSampleStream stream = new TokenSampleStream(new PlainTextByLineStream(fs));

        TrainingParameters trainParams = new TrainingParameters();
        trainParams.Set(Parameters.Iterations, "100");
        trainParams.Set(Parameters.Cutoff, "0");

        return TokenizerME.Train(stream, new TokenizerFactory(TRAINING_LANGUAGE, null, true), trainParams);
    }
}
public void TestPerceptronOnPrepAttachDataWithSkippedAveraging() {
    // Perceptron with skipped averaging enabled; the expected accuracy is
    // pinned to the value produced by the reference implementation.
    var parameters = new TrainingParameters();
    parameters.Set(Parameters.Algorithm, Parameters.Algorithms.Perceptron);
    parameters.Set(Parameters.Cutoff, "1");
    parameters.Set(Parameters.UseSkippedAveraging, "true");

    var eventTrainer = TrainerFactory.GetEventTrainer(parameters, null, null);
    var trainedModel = eventTrainer.Train(PrepAttachDataUtility.CreateTrainingStream());

    PrepAttachDataUtility.TestModel(trainedModel, 0.773706362961129);
}
public void TestMaxentOnPrepAttachDataWithParams() {
    // Maxent with an explicit two-pass data indexer and cutoff of 1.
    var report = new Dictionary<string, string>();

    var parameters = new TrainingParameters();
    parameters.Set(Parameters.Algorithm, Parameters.Algorithms.MaxEnt);
    parameters.Set(Parameters.DataIndexer, Parameters.DataIndexers.TwoPass);
    parameters.Set(Parameters.Cutoff, "1");

    var eventTrainer = TrainerFactory.GetEventTrainer(parameters, report, null);
    var trainedModel = eventTrainer.Train(PrepAttachDataUtility.CreateTrainingStream());

    PrepAttachDataUtility.TestModel(trainedModel, 0.7997028967566229d);
}
public void TestPerceptronOnPrepAttachDataWithTolerance() {
    // Perceptron with an explicit convergence tolerance and 500 iterations.
    var parameters = new TrainingParameters();
    parameters.Set(Parameters.Algorithm, Parameters.Algorithms.Perceptron);
    parameters.Set(Parameters.Cutoff, "1");
    parameters.Set(Parameters.Iterations, "500");
    parameters.Set(Parameters.Tolerance, "0.0001");

    var eventTrainer = TrainerFactory.GetEventTrainer(parameters, null, null);
    var trainedModel = eventTrainer.Train(PrepAttachDataUtility.CreateTrainingStream());

    PrepAttachDataUtility.TestModel(trainedModel, 0.7677642980935875);
}
public void TestQnOnPrepAttachDataInParallel() {
    // QN maxent trained with two worker threads; iterations left at default.
    var parameters = new TrainingParameters();
    parameters.Set(Parameters.Algorithm, Parameters.Algorithms.MaxEntQn);
    //parameters.Set(Parameters.Iterations, "100");
    parameters.Set(Parameters.Threads, "2");

    var eventTrainer = TrainerFactory.GetEventTrainer(parameters, null, null);
    var trainedModel = eventTrainer.Train(PrepAttachDataUtility.CreateTrainingStream());

    TestModel(trainedModel, 0.8115870264917059);
}
public static LemmatizerModel TrainModel(string path) {
    // Trains a lemmatizer model from the training file at <paramref name="path"/>.
    // The FileStream is wrapped in a using block so the file handle is always
    // released; the original leaked it.
    using (FileStream fs = new FileStream(path, FileMode.Open, FileAccess.Read)) {
        TrainingParameters trainParams = new TrainingParameters();
        trainParams.Set(Parameters.Iterations, "1");
        trainParams.Set(Parameters.Cutoff, "0");

        LemmatizerFactory lemmatizerFactory = new LemmatizerFactory();
        LemmaSampleStream sampleStream = new LemmaSampleStream(new PlainTextByLineStream(fs));

        return LemmatizerME.Train(TRAINING_LANGUAGE, sampleStream, trainParams, lemmatizerFactory);
    }
}
public void TestMaxentOnPrepAttachDataWithParams() {
    // Maxent with a two-pass data indexer; the report map collects trainer
    // diagnostics but is not inspected here.
    var diagnostics = new Dictionary<string, string>();

    var parameters = new TrainingParameters();
    parameters.Set(Parameters.Algorithm, Parameters.Algorithms.MaxEnt);
    parameters.Set(Parameters.DataIndexer, Parameters.DataIndexers.TwoPass);
    parameters.Set(Parameters.Cutoff, "1");

    var maxentTrainer = TrainerFactory.GetEventTrainer(parameters, diagnostics, null);

    PrepAttachDataUtility.TestModel(
        maxentTrainer.Train(PrepAttachDataUtility.CreateTrainingStream()),
        0.7997028967566229d);
}
public static SentenceModel TrainModel(string path) {
    // Trains a sentence-detector model from the training file at <paramref name="path"/>.
    // The FileStream is wrapped in a using block so the file handle is always
    // released; the original leaked it.
    using (FileStream fs = new FileStream(path, FileMode.Open, FileAccess.Read)) {
        TrainingParameters trainParams = new TrainingParameters();
        trainParams.Set(Parameters.Iterations, "100");
        trainParams.Set(Parameters.Cutoff, "0");

        SentenceDetectorFactory detectorFactory = new SentenceDetectorFactory(TRAINING_LANGUAGE, true, null, null);
        SentenceSampleStream sampleStream = new SentenceSampleStream(new PlainTextByLineStream(fs));

        return SentenceDetectorME.Train(TRAINING_LANGUAGE, sampleStream, detectorFactory, trainParams);
    }
}
public void TestQnOnPrepAttachDataWithL2Params() {
    // QN maxent with pure L2 regularisation (L1 cost zeroed out).
    var parameters = new TrainingParameters();
    parameters.Set(Parameters.Algorithm, Parameters.Algorithms.MaxEntQn);
    parameters.Set(Parameters.DataIndexer, Parameters.DataIndexers.TwoPass);
    parameters.Set(Parameters.Cutoff, "1");
    parameters.Set(Parameters.L1Cost, "0");
    parameters.Set(Parameters.L2Cost, "1.0");

    var eventTrainer = TrainerFactory.GetEventTrainer(parameters, null, null);
    var trainedModel = eventTrainer.Train(PrepAttachDataUtility.CreateTrainingStream());

    TestModel(trainedModel, 0.8227283981183461);
}
public void TestWithNameEvaluationErrorListener() {
    using (var trainingFile = Tests.OpenFile("opennlp/tools/namefind/AnnotatedSentences.txt")) {
        // Latin-1 encoded annotated training data.
        var samples = new NameSampleStream(new PlainTextByLineStream(trainingFile, "ISO-8859-1"));

        var trainParams = new TrainingParameters();
        trainParams.Set(Parameters.Iterations, "70");
        trainParams.Set(Parameters.Cutoff, "1");
        trainParams.Set(Parameters.Algorithm, Parameters.Algorithms.MaxEnt);

        // Cross-validate (2 folds) with an error listener attached; the run
        // must complete and produce an F-measure.
        var validator = new TokenNameFinderCrossValidator("en", Type, trainParams, new NameEvaluationErrorListener());
        validator.Evaluate(samples, 2);

        Assert.NotNull(validator.FMeasure);
    }
}
public void AbbreviationDefaultBehaviorTest() {
    // Dutch sample text: one logical sentence per line. "tel." and
    // "12. Toedracht" contain end-of-sentence characters that must NOT
    // trigger a split once they are listed in the abbreviation dictionary.
    var samples =
        "Test E-mail met zowel letsel als 12. Toedracht in het onderwerp." + Environment.NewLine +
        "Dit is een 2e regel met een tel. 011-4441444 erin." + Environment.NewLine +
        "Dit is een 2e regel." + Environment.NewLine +
        "Dit is een 2e regel." + Environment.NewLine +
        Environment.NewLine +
        "Dit is een 2e regel met een tel. 033-1333123 erin!" + Environment.NewLine +
        "Test E-mail met zowel winst als 12. toedracht in het onderwerp." + Environment.NewLine +
        "Dit is een 2e regel!" + Environment.NewLine +
        "Dit is een 2e regel." + Environment.NewLine;

    // Case-insensitive abbreviation dictionary (constructed with false).
    var stringsToIgnoreDictionary = new SharpNL.Dictionary.Dictionary(false) {
        { "12. Toedracht" },
        { "Tel." },
    };

    var trainingParameters = new TrainingParameters();
    trainingParameters.Set(Parameters.Algorithm, "MAXENT");
    trainingParameters.Set(Parameters.TrainerType, "Event");
    trainingParameters.Set(Parameters.Iterations, "100");
    trainingParameters.Set(Parameters.Cutoff, "5");

    // End-of-sentence characters recognised by the detector.
    char[] eos = { '.', '?', '!' };
    var sdFactory = new SentenceDetectorFactory("nl", true, stringsToIgnoreDictionary, eos);
    var stringReader = new StringReader(samples);
    var stream = new SentenceSampleStream(new PlainTextByLineStream(stringReader));

    var sentenceModel = SentenceDetectorME.Train("nl", stream, sdFactory, trainingParameters);
    var sentenceDetectorMe = new SentenceDetectorME(sentenceModel);

    var sentences = sentenceDetectorMe.SentDetect(samples);

    // Each non-empty input line should come back as exactly one sentence.
    var expected = samples.Split(new[] { Environment.NewLine }, StringSplitOptions.RemoveEmptyEntries);

    Assert.AreEqual(8, sentences.Length);
    for (var i = 0; i < sentences.Length; i++) {
        Assert.AreEqual(expected[i], sentences[i]);
    }
}
public void TestCrossCompatibility() {
    // Trains a tokenizer with SharpNL, serializes it, reloads it with the
    // Java OpenNLP implementation, and compares the two tokenizers' output
    // probabilities.
    using (var data = Tests.OpenFile("/opennlp/tools/tokenize/token.train")) {
        var samples = new TokenSampleStream(new PlainTextByLineStream(data));

        var mlParams = new TrainingParameters();
        mlParams.Set(Parameters.Iterations, "100");
        mlParams.Set(Parameters.Cutoff, "0");

        var model = TokenizerME.Train(samples, new TokenizerFactory("en", null, true), mlParams);

        var sMe = new TokenizerME(model);
        TokenizerMETest.TestTokenizer(sMe);
        var sProbs = sMe.TokenProbabilities;

        // --- java \/

        // Dispose the serialization stream so the model file is flushed and
        // the handle released before the Java side opens it; the original
        // leaked this FileStream.
        var sFile = Path.GetTempFileName();
        using (var serializeStream = new FileStream(sFile, FileMode.Create)) {
            model.Serialize(serializeStream);
        }

        var jModel = new opennlp.tools.tokenize.TokenizerModel(
            OpenNLP.CreateInputStream(sFile)
        );

        var jMe = new opennlp.tools.tokenize.TokenizerME(jModel);
        TestJavaTokenizer(jMe);
        var jProbs = jMe.getTokenProbabilities();

        Assert.AreEqual(jProbs.Length, sProbs.Length);
        for (int i = 0; i < jProbs.Length; i++) {
            // one difference :(
            // -0.00000000000000011102230246251565
            //
            // but still "insignificant" :)
            Assert.AreEqual(jProbs[i], sProbs[i], 0.0000000001d);
        }
    }
}
public static TokenizerModel CreateSimpleMaxentTokenModel() {
    // Six tiny hand-built samples: each span marks a token boundary, teaching
    // the model to split off a trailing comma.
    var tokenSamples = new List<TokenSample> {
        new TokenSample("year", new[] {new Span(0, 4)}),
        new TokenSample("year,", new[] {new Span(0, 4), new Span(4, 5)}),
        new TokenSample("it,", new[] {new Span(0, 2), new Span(2, 3)}),
        new TokenSample("it", new[] {new Span(0, 2)}),
        new TokenSample("yes", new[] {new Span(0, 3)}),
        new TokenSample("yes,", new[] {new Span(0, 3), new Span(3, 4)})
    };

    var parameters = new TrainingParameters();
    parameters.Set(Parameters.Iterations, "100");
    parameters.Set(Parameters.Cutoff, "0");

    return TokenizerME.Train(
        new CollectionObjectStream<TokenSample>(tokenSamples),
        new TokenizerFactory("en", null, true),
        parameters);
}
public void TestSentenceDetector() {
    using (var trainingFile = Tests.OpenFile("/opennlp/tools/sentdetect/Sentences.txt")) {
        var parameters = new TrainingParameters();
        parameters.Set(Parameters.Iterations, "100");
        parameters.Set(Parameters.Cutoff, "0");

        var factory = new SentenceDetectorFactory("en", true, null, null);
        var sampleStream = new SentenceSampleStream(new PlainTextByLineStream(trainingFile));

        var model = SentenceDetectorME.Train("en", sampleStream, factory, parameters);

        Assert.AreEqual("en", model.Language);
        Assert.AreEqual(model.UseTokenEnd, true);

        // Run the shared evaluation sentences against the freshly trained model.
        EvalSentences(new SentenceDetectorME(model));
    }
}
internal static POSModel TrainPOSModel(ModelType type = ModelType.Maxent) {
    // Map the requested model type onto the trainer's algorithm name.
    string algorithm;
    switch (type) {
        case ModelType.Maxent:
            algorithm = "MAXENT";
            break;
        case ModelType.Perceptron:
            algorithm = "PERCEPTRON";
            break;
        default:
            throw new NotSupportedException();
    }

    var parameters = new TrainingParameters();
    parameters.Set(Parameters.Algorithm, algorithm);
    parameters.Set(Parameters.Iterations, "100");
    parameters.Set(Parameters.Cutoff, "5");

    return POSTaggerME.Train("en", CreateSampleStream(), parameters, new POSTaggerFactory());
}
public void AbbreviationDefaultBehaviorTest() {
    // Dutch sample text, one logical sentence per line. "tel." and
    // "12. Toedracht" contain end-of-sentence characters that must NOT
    // split a sentence once listed in the abbreviation dictionary.
    var samples =
        "Test E-mail met zowel letsel als 12. Toedracht in het onderwerp." + Environment.NewLine +
        "Dit is een 2e regel met een tel. 011-4441444 erin." + Environment.NewLine +
        "Dit is een 2e regel." + Environment.NewLine +
        "Dit is een 2e regel." + Environment.NewLine +
        Environment.NewLine +
        "Dit is een 2e regel met een tel. 033-1333123 erin!" + Environment.NewLine +
        "Test E-mail met zowel winst als 12. toedracht in het onderwerp." + Environment.NewLine +
        "Dit is een 2e regel!" + Environment.NewLine +
        "Dit is een 2e regel." + Environment.NewLine;

    // Case-insensitive abbreviation dictionary (constructed with false).
    var stringsToIgnoreDictionary = new SharpNL.Dictionary.Dictionary(false) {
        {"12. Toedracht"},
        {"Tel."},
    };

    var trainingParameters = new TrainingParameters();
    trainingParameters.Set(Parameters.Algorithm, "MAXENT");
    trainingParameters.Set(Parameters.TrainerType, "Event");
    trainingParameters.Set(Parameters.Iterations, "100");
    trainingParameters.Set(Parameters.Cutoff, "5");

    // End-of-sentence characters recognised by the detector.
    char[] eos = { '.', '?', '!' };
    var sdFactory = new SentenceDetectorFactory("nl", true, stringsToIgnoreDictionary, eos);
    var stringReader = new StringReader(samples);
    var stream = new SentenceSampleStream(new PlainTextByLineStream(stringReader));

    var sentenceModel = SentenceDetectorME.Train("nl", stream, sdFactory, trainingParameters);
    var sentenceDetectorMe = new SentenceDetectorME(sentenceModel);

    var sentences = sentenceDetectorMe.SentDetect(samples);

    // Each non-empty input line should come back as exactly one sentence.
    var expected = samples.Split(new[] { Environment.NewLine }, StringSplitOptions.RemoveEmptyEntries);

    Assert.AreEqual(8, sentences.Length);
    for (var i = 0; i < sentences.Length; i++)
        Assert.AreEqual(expected[i], sentences[i]);
}
public void TestMaxentOnPrepAttachDataWithParamsDefault() {
    // Only the algorithm is specified; everything else uses trainer defaults.
    var report = new Dictionary<string, string>();

    var parameters = new TrainingParameters();
    parameters.Set(Parameters.Algorithm, Parameters.Algorithms.MaxEnt);

    var eventTrainer = TrainerFactory.GetEventTrainer(parameters, report, null);
    var trainedModel = eventTrainer.Train(PrepAttachDataUtility.CreateTrainingStream());

    PrepAttachDataUtility.TestModel(trainedModel, 0.8086159940579352d);
}
public void Setup() {
    // SharpNL training parameters.
    var sharpParams = new TrainingParameters();
    sharpParams.Set(Parameters.Iterations, "70");
    sharpParams.Set(Parameters.Cutoff, "1");

    // Equivalent Java OpenNLP training parameters.
    var javaParams = new opennlp.tools.util.TrainingParameters();
    javaParams.put("Iterations", "70");
    javaParams.put("Cutoff", "1");

    // Train one chunker model per implementation so the tests can compare them.
    var sharpModel = ChunkerME.Train("en", ChunkerMETest.CreateSampleStream(), sharpParams, new ChunkerFactory());
    var javaModel = opennlp.tools.chunker.ChunkerME.train("en", JavaSampleStream(), javaParams, new opennlp.tools.chunker.ChunkerFactory());

    Assert.NotNull(sharpModel);
    Assert.NotNull(javaModel);

    sChunker = new ChunkerME(sharpModel);
    jChunker = new opennlp.tools.chunker.ChunkerME(javaModel);
}
public void TestMaxentOnPrepAttachDataWithParamsDefault() {
    // Maxent with default settings; the report map is required by the factory
    // signature but not inspected.
    var diagnostics = new Dictionary<string, string>();

    var parameters = new TrainingParameters();
    parameters.Set(Parameters.Algorithm, Parameters.Algorithms.MaxEnt);

    var maxentTrainer = TrainerFactory.GetEventTrainer(parameters, diagnostics, null);

    PrepAttachDataUtility.TestModel(
        maxentTrainer.Train(PrepAttachDataUtility.CreateTrainingStream()),
        0.8086159940579352d);
}
public void TestQnOnPrepAttachDataWithParamsDefault() {
    // QN maxent with default settings.
    var parameters = new TrainingParameters();
    parameters.Set(Parameters.Algorithm, Parameters.Algorithms.MaxEntQn);

    var eventTrainer = TrainerFactory.GetEventTrainer(parameters, null, null);
    var trainedModel = eventTrainer.Train(PrepAttachDataUtility.CreateTrainingStream());

    TestModel(trainedModel, 0.8115870264917059);
}
public static TokenizerModel CreateSimpleMaxentTokenModel() {
    // Minimal training set whose spans mark token boundaries; teaches the
    // tokenizer to detach a trailing comma.
    var trainingSamples = new List<TokenSample> {
        new TokenSample("year", new[] { new Span(0, 4) }),
        new TokenSample("year,", new[] { new Span(0, 4), new Span(4, 5) }),
        new TokenSample("it,", new[] { new Span(0, 2), new Span(2, 3) }),
        new TokenSample("it", new[] { new Span(0, 2) }),
        new TokenSample("yes", new[] { new Span(0, 3) }),
        new TokenSample("yes,", new[] { new Span(0, 3), new Span(3, 4) })
    };

    var trainingParams = new TrainingParameters();
    trainingParams.Set(Parameters.Iterations, "100");
    trainingParams.Set(Parameters.Cutoff, "0");

    return TokenizerME.Train(
        new CollectionObjectStream<TokenSample>(trainingSamples),
        new TokenizerFactory("en", null, true),
        trainingParams);
}
public void TestSimpleTraining() {
    // Two clearly separable categories: "1" built on a/b/c, "0" on x/y/z.
    IObjectStream<DocumentSample> samples = new GenericObjectStream<DocumentSample>(new[] {
        new DocumentSample("1", new[] { "a", "b", "c" }),
        new DocumentSample("1", new[] { "a", "b", "c", "1", "2" }),
        new DocumentSample("1", new[] { "a", "b", "c", "3", "4" }),
        new DocumentSample("0", new[] { "x", "y", "z" }),
        new DocumentSample("0", new[] { "x", "y", "z", "5", "6" }),
        new DocumentSample("0", new[] { "x", "y", "z", "7", "8" })
    });

    var parameters = new TrainingParameters();
    parameters.Set(Parameters.Iterations, "100");
    parameters.Set(Parameters.Cutoff, "0");

    var model = DocumentCategorizerME.Train("x-unspecified", samples, parameters, new DocumentCategorizerFactory());
    var categorizer = new DocumentCategorizerME(model);

    var aScores = categorizer.Categorize("a");
    Assert.AreEqual("1", categorizer.GetBestCategory(aScores));

    var xScores = categorizer.Categorize("x");
    Assert.AreEqual("0", categorizer.GetBestCategory(xScores));

    // The score map is sorted ascending, so for input "a" the first entry is
    // category "0" and the last (highest score) is category "1".
    var sortedScoreMap = categorizer.SortedScoreMap("a");
    foreach (var entry in sortedScoreMap) {
        Assert.AreEqual("0", entry.Value[0]);
        break;
    }
}
internal static POSModel TrainPosModel(ModelType type = ModelType.Maxent) {
    // Resolve the trainer's algorithm name from the requested model type.
    string algorithmName;
    switch (type) {
        case ModelType.Maxent:
            algorithmName = "MAXENT";
            break;
        case ModelType.Perceptron:
            algorithmName = "PERCEPTRON";
            break;
        default:
            throw new NotSupportedException();
    }

    var trainingParams = new TrainingParameters();
    trainingParams.Set(Parameters.Algorithm, algorithmName);
    trainingParams.Set(Parameters.Iterations, "100");
    trainingParams.Set(Parameters.Cutoff, "5");

    return POSTaggerME.Train("en", CreateSampleStream(), trainingParams, new POSTaggerFactory());
}
/// <summary>
/// Trains a parser model with the given parameters.
/// </summary>
/// <param name="languageCode">The language code.</param>
/// <param name="samples">The data samples.</param>
/// <param name="rules">The head rules.</param>
/// <param name="iterations">The number of training iterations.</param>
/// <param name="cutoff">The min number of times a feature must be seen.</param>
/// <param name="monitor">
/// A evaluation monitor that can be used to listen the messages during the training or it can cancel the training operation.
/// This argument can be a <c>null</c> value.
/// </param>
/// <returns>The trained <see cref="ParserModel"/> object.</returns>
/// <exception cref="System.NotSupportedException">Trainer type is not supported.</exception>
public static ParserModel Train(string languageCode, IObjectStream<Parse> samples, AbstractHeadRules rules, int iterations, int cutoff, Monitor monitor) {
    // Convert both numeric settings once, culture-invariantly.
    var cutoffValue = cutoff.ToString(CultureInfo.InvariantCulture);
    var iterationsValue = iterations.ToString(CultureInfo.InvariantCulture);

    var param = new TrainingParameters();

    // The dictionary component only takes a cutoff.
    param.Set("dict", Parameters.Cutoff, cutoffValue);

    // Every trainable component gets the same cutoff and iteration count.
    foreach (var component in new[] { "tagger", "chunker", "check", "build" }) {
        param.Set(component, Parameters.Cutoff, cutoffValue);
        param.Set(component, Parameters.Iterations, iterationsValue);
    }

    return Train(languageCode, samples, rules, param, monitor);
}
public void TestNameFinderWithTypes() {
    using (var file = Tests.OpenFile("opennlp/tools/namefind/AnnotatedSentencesWithTypes.txt")) {
        // Latin-1 encoded training data annotated with typed names.
        var sampleStream = new NameSampleStream(new PlainTextByLineStream(file, "ISO-8859-1"));

        var param = new TrainingParameters();
        param.Set(Parameters.Iterations, "70");
        param.Set(Parameters.Cutoff, "1");

        var model = NameFinderME.Train(
            "en",
            sampleStream,
            param,
            new TokenNameFinderFactory(null, new Dictionary<string, object>()));

        var nameFinder = new NameFinderME(model);

        // now test if it can detect the sample sentences

        // One typed name expected: "Alisa" as a person.
        var sentence = new[] {
            "Alisa", "appreciated", "the", "hint", "and", "enjoyed", "a", "delicious", "traditional", "meal."
        };
        var names = nameFinder.Find(sentence);

        Assert.AreEqual(1, names.Length);
        Assert.AreEqual(new Span(0, 1, "person"), names[0]);
        Assert.True(HasOtherAsOutcome(model));

        // Two typed names expected: "Mike" and "Stefanie Schmidt".
        sentence = new[] { "Hi", "Mike", ",", "it's", "Stefanie", "Schmidt", "." };
        names = nameFinder.Find(sentence);

        Assert.AreEqual(2, names.Length);
        Assert.AreEqual(new Span(1, 2, "person"), names[0]);
        Assert.AreEqual(new Span(4, 6, "person"), names[1]);

        Assert.AreEqual("person", names[0].Type);
        Assert.AreEqual("person", names[1].Type);
    }
}
public void TestPerceptronOnPrepAttachDataWithStepSizeDecrease() {
    // Perceptron with a decreasing step size over 500 iterations.
    var parameters = new TrainingParameters();
    parameters.Set(Parameters.Algorithm, Parameters.Algorithms.Perceptron);
    parameters.Set(Parameters.Cutoff, "1");
    parameters.Set(Parameters.Iterations, "500");
    parameters.Set(Parameters.StepSizeDecrease, "0.06");

    var eventTrainer = TrainerFactory.GetEventTrainer(parameters, null, null);
    var trainedModel = eventTrainer.Train(PrepAttachDataUtility.CreateTrainingStream());

    /*
     * The java test gives an error too, soo.... for now i'll assume that is correct :P
     *
     * java.lang.AssertionError:
     *   expected:<0.7756870512503095> but was:<0.7766773953948998>
     *     at org.junit.Assert.fail(Assert.java:91)
     *     at org.junit.Assert.failNotEquals(Assert.java:645)
     *     at org.junit.Assert.assertEquals(Assert.java:441)
     *     at org.junit.Assert.assertEquals(Assert.java:510)
     */

    //PrepAttachDataUtility.TestModel(model, 0.7756870512503095); < OpenNLP value
    PrepAttachDataUtility.TestModel(trainedModel, 0.77742015350334237);
}
public void TestEverything() {
    // Trains a sentence detector with SharpNL, validates it, then serializes
    // the model and re-validates it with the Java OpenNLP implementation.
    using (var file = Tests.OpenFile("/opennlp/tools/sentdetect/Sentences.txt")) {
        var mlParams = new TrainingParameters();
        mlParams.Set(Parameters.Iterations, "100");
        mlParams.Set(Parameters.Cutoff, "0");

        var sdFactory = new SentenceDetectorFactory("en", true, null, null);
        var stream = new SentenceSampleStream(new PlainTextByLineStream(file));

        var model = SentenceDetectorME.Train("en", stream, sdFactory, mlParams);

        Assert.AreEqual("en", model.Language);
        Assert.AreEqual(model.UseTokenEnd, true);

        var sMe = new SentenceDetectorME(model);

        // test the SharpNL sentences
        SentenceDetectorMETest.EvalSentences(sMe);

        // Dispose the serialization stream so the model file is flushed and
        // the handle released before the Java side opens it; the original
        // leaked this FileStream.
        var sFile = Path.GetTempFileName();
        using (var serializeStream = new FileStream(sFile, FileMode.Create)) {
            model.Serialize(serializeStream);
        }

        var jModel2 = new JavaModel(OpenNLP.CreateInputStream(sFile));
        var jMe = new JavaSDME(jModel2);

        // test the Java OpenNLP sentences.
        JavaEvalSentences(jMe);

        // first try?! Yes! ;-)
    }
}
public void TestOnlyWithNamesWithTypes() {
    using (var file = Tests.OpenFile("opennlp/tools/namefind/OnlyWithNamesWithTypes.train")) {
        // Training data consisting entirely of typed name annotations.
        var sampleStream = new NameSampleStream(new PlainTextByLineStream(file));

        var param = new TrainingParameters();
        param.Set(Parameters.Iterations, "70");
        param.Set(Parameters.Cutoff, "1");

        var model = NameFinderME.Train(
            "en",
            sampleStream,
            param,
            new TokenNameFinderFactory(null, new Dictionary<string, object>()));

        var nameFinder = new NameFinderME(model);

        // now test if it can detect the sample sentences

        // A run of consecutive two-token person names; the finder should
        // split them into separate spans.
        var sentence = WhitespaceTokenizer.Instance.Tokenize(
            "Neil Abercrombie Anibal Acevedo-Vila Gary Ackerman Robert Aderholt " +
            "Daniel Akaka Todd Akin Lamar Alexander Rodney Alexander");

        var names = nameFinder.Find(sentence);

        Assert.AreEqual(new Span(0, 2, "person"), names[0]);
        Assert.AreEqual(new Span(2, 4, "person"), names[1]);
        Assert.AreEqual(new Span(4, 6, "person"), names[2]);

        // A corpus with no unannotated tokens must not produce an "other" outcome.
        Assert.True(!HasOtherAsOutcome(model));
    }
}
public void TestOnlyWithEntitiesWithTypes() {
    using (var trainingFile = Tests.OpenFile("opennlp/tools/namefind/OnlyWithEntitiesWithTypes.train")) {
        var samples = new NameSampleStream(new PlainTextByLineStream(trainingFile));

        var trainParams = new TrainingParameters();
        trainParams.Set(Parameters.Iterations, "70");
        trainParams.Set(Parameters.Cutoff, "1");

        var model = NameFinderME.Train(
            "en",
            samples,
            trainParams,
            new TokenNameFinderFactory(null, new Dictionary<string, object>()));

        var finder = new NameFinderME(model);

        // The finder must label each entity with its trained type.
        var tokens = WhitespaceTokenizer.Instance.Tokenize("NATO United States Barack Obama");
        var spans = finder.Find(tokens);

        Assert.AreEqual(new Span(0, 1, "organization"), spans[0]);
        Assert.AreEqual(new Span(1, 3, "location"), spans[1]);
        Assert.AreEqual(new Span(3, 5, "person"), spans[2]);

        Assert.False(HasOtherAsOutcome(model));
    }
}
public void TestNameFinderWithMultipleTypes() {
    using (var file = Tests.OpenFile("opennlp/tools/namefind/voa1.train")) {
        // VOA news training data with location/person/organization annotations.
        var sampleStream = new NameSampleStream(new PlainTextByLineStream(file));

        var param = new TrainingParameters();
        param.Set(Parameters.Iterations, "70");
        param.Set(Parameters.Cutoff, "1");

        var model = NameFinderME.Train(
            "en",
            sampleStream,
            param,
            new TokenNameFinderFactory(null, new Dictionary<string, object>()));

        var nameFinder = new NameFinderME(model);

        // now test if it can detect the sample sentences

        // Expect "U . S ." (location), "Barack Obama" (person),
        // "South Korea" (location) and "North Korea" (location).
        var sentence = new [] {
            "U", ".", "S", ".", "President", "Barack", "Obama", "has",
            "arrived", "in", "South", "Korea", ",", "where", "he", "is",
            "expected", "to", "show", "solidarity", "with", "the", "country",
            "'", "s", "president", "in", "demanding", "North", "Korea",
            "move", "toward", "ending", "its", "nuclear", "weapons",
            "programs", "."
        };

        var names = nameFinder.Find(sentence);

        Assert.AreEqual(4, names.Length);
        Assert.AreEqual(new Span(0, 4, "location"), names[0]);
        Assert.AreEqual(new Span(5, 7, "person"), names[1]);
        Assert.AreEqual(new Span(10, 12, "location"), names[2]);
        Assert.AreEqual(new Span(28, 30, "location"), names[3]);

        /* These asserts are not needed because the equality comparer handles the Type

        assertEquals("location", names1[0].getType());
        assertEquals("person", names1[1].getType());
        assertEquals("location", names1[2].getType());
        assertEquals("location", names1[3].getType());
        */

        // Expect "Scott Snyder" (person) and the center name (organization).
        sentence = new[] {
            "Scott", "Snyder", "is", "the", "director", "of", "the",
            "Center", "for", "U", ".", "S", ".", "Korea", "Policy", "."
        };

        names = nameFinder.Find(sentence);

        Assert.AreEqual(2, names.Length);
        Assert.AreEqual(new Span(0, 2, "person"), names[0]);
        Assert.AreEqual(new Span(7, 15, "organization"), names[1]);

        /*
        assertEquals("person", names2[0].getType());
        assertEquals("organization", names2[1].getType());
        */
    }
}
/// <summary>
/// Trains a parser model with the given parameters.
/// </summary>
/// <param name="languageCode">The language code.</param>
/// <param name="samples">The data samples.</param>
/// <param name="rules">The head rules.</param>
/// <param name="iterations">The number of training iterations.</param>
/// <param name="cutoff">The min number of times a feature must be seen.</param>
/// <param name="monitor">
/// A evaluation monitor that can be used to listen the messages during the training or it can cancel the training operation.
/// This argument can be a <c>null</c> value.
/// </param>
/// <returns>The trained <see cref="ParserModel"/> object.</returns>
/// <exception cref="System.NotSupportedException">Trainer type is not supported.</exception>
public static ParserModel Train(string languageCode, IObjectStream<Parse> samples, AbstractHeadRules rules, int iterations, int cutoff, Monitor monitor) {
    // Format the two numeric settings once, culture-invariantly.
    var cut = cutoff.ToString(CultureInfo.InvariantCulture);
    var iter = iterations.ToString(CultureInfo.InvariantCulture);

    var param = new TrainingParameters();

    // Only the cutoff applies to the dictionary component.
    param.Set("dict", Parameters.Cutoff, cut);

    // The remaining components share identical cutoff/iteration settings.
    foreach (var name in new[] { "tagger", "chunker", "check", "build" }) {
        param.Set(name, Parameters.Cutoff, cut);
        param.Set(name, Parameters.Iterations, iter);
    }

    return Train(languageCode, samples, rules, param, monitor);
}
public void TestCustomTrainer() {
    // "Dummy" is registered with the TrainerFactory in Setup; parameters that
    // name a registered custom algorithm must validate successfully.
    mlParams.Set(Parameters.Algorithm, "Dummy");
    Assert.True(TrainerFactory.IsValid(mlParams));
}