/// <summary>
/// Trains a model using a factory created without an abbreviation dictionary and
/// verifies the factory exposed by the model, both directly after training and
/// after a serialize/deserialize round trip.
/// </summary>
public void TestNullDict() {
    var eos = new[] { '.', '?' };

    var trainedModel = Train(new SentenceDetectorFactory("en", true, null, eos));
    Assert.NotNull(trainedModel);

    // Factory as exposed by the freshly trained model.
    SentenceDetectorFactory trainedFactory = trainedModel.Factory;
    Assert.Null(trainedFactory.AbbreviationDictionary);
    Assert.True(trainedFactory.GetContextGenerator() is DefaultSentenceContextGenerator);
    Assert.True(trainedFactory.GetEndOfSentenceScanner() is DefaultEndOfSentenceScanner);
    Assert.True(eos.SequenceEqual(trainedFactory.EOSCharacters));

    // Round trip through an in-memory buffer.
    // NOTE(review): UnclosableStream presumably keeps Serialize from closing the buffer - confirm.
    var buffer = new MemoryStream();
    trainedModel.Serialize(new UnclosableStream(buffer));
    buffer.Seek(0, SeekOrigin.Begin);

    var restoredModel = new SentenceModel(buffer);

    // The factory of the deserialized model must expose the same configuration.
    SentenceDetectorFactory restoredFactory = restoredModel.Factory;
    Assert.Null(restoredFactory.AbbreviationDictionary);
    Assert.True(restoredFactory.GetContextGenerator() is DefaultSentenceContextGenerator);
    Assert.True(restoredFactory.GetEndOfSentenceScanner() is DefaultEndOfSentenceScanner);
    Assert.True(eos.SequenceEqual(restoredFactory.EOSCharacters));
}
/// <summary>
/// Trains an English sentence model from the Sentences.txt test resource, evaluates it
/// with the SharpNL detector, then serializes it to a temporary file and evaluates the
/// same model through the Java OpenNLP detector.
/// </summary>
public void TestEverything() {
    using (var file = Tests.OpenFile("/opennlp/tools/sentdetect/Sentences.txt")) {
        var mlParams = new TrainingParameters();
        mlParams.Set(Parameters.Iterations, "100");
        mlParams.Set(Parameters.Cutoff, "0");

        var sdFactory = new SentenceDetectorFactory("en", true, null, null);
        var stream = new SentenceSampleStream(new PlainTextByLineStream(file));

        var model = SentenceDetectorME.Train("en", stream, sdFactory, mlParams);

        Assert.AreEqual("en", model.Language);
        // Expected value goes first so a failure message reports the values correctly.
        Assert.AreEqual(true, model.UseTokenEnd);

        var sMe = new SentenceDetectorME(model);

        // test the SharpNL sentences
        SentenceDetectorMETest.EvalSentences(sMe);

        var sFile = Path.GetTempFileName();

        // Dispose the output stream deterministically so the file is flushed and
        // released before the Java side opens it, even when Serialize throws.
        using (var output = new FileStream(sFile, FileMode.Create)) {
            model.Serialize(output);
        }

        // NOTE(review): the temporary file is never deleted and the Java input stream is
        // never closed here - confirm whether cleanup is wanted.
        var jModel2 = new JavaModel(OpenNLP.CreateInputStream(sFile));
        var jMe = new JavaSDME(jModel2);

        // test the Java OpenNLP sentences.
        JavaEvalSentences(jMe);
    }
}
/// <summary>
/// Runs the sentence detector trainer: resolves the training parameters, trains a
/// model from the sample stream and writes it to the configured model file.
/// </summary>
/// <param name="format">Forwarded to the base implementation.</param>
/// <param name="args">Forwarded to the base implementation.</param>
/// <exception cref="TerminateToolException">
/// Thrown when the loaded parameters request sequence training, or when an
/// <see cref="IOException"/> occurs while training.
/// </exception>
public override void run(string format, string[] args) {
    base.run(format, args);

    mlParams = CmdLineUtil.loadTrainingParameters(@params.Params, false);
    if (mlParams == null) {
        // No parameters were loaded: build them from the iterations/cutoff arguments.
        mlParams = ModelUtil.createTrainingParameters(@params.Iterations.Value, @params.Cutoff.Value);
    } else if (TrainUtil.isSequenceTraining(mlParams.Settings)) {
        throw new TerminateToolException(1, "Sequence training is not supported!");
    }

    Jfile outputFile = @params.Model;
    CmdLineUtil.checkOutputFile("sentence detector model", outputFile);

    // Custom end-of-sentence characters are optional.
    char[] eosChars = @params.EosChars != null ? @params.EosChars.ToCharArray() : null;

    SentenceModel trainedModel;
    try {
        Dictionary abbreviations = loadDict(@params.AbbDict);
        SentenceDetectorFactory detectorFactory =
            SentenceDetectorFactory.create(@params.Factory, @params.Lang, true, abbreviations, eosChars);

        trainedModel = SentenceDetectorME.train(@params.Lang, sampleStream, detectorFactory, mlParams);
    } catch (IOException e) {
        throw new TerminateToolException(-1,
            "IO error while reading training data or indexing data: " + e.Message, e);
    } finally {
        try {
            sampleStream.close();
        } catch (IOException) {
            // Closing is best effort; a failure here is deliberately ignored.
        }
    }

    CmdLineUtil.writeModel("sentence detector", outputFile, trainedModel);
}
/// <summary>
/// Runs the sentence detector cross validation: resolves the training parameters,
/// evaluates the sample stream over the configured number of folds and prints the
/// resulting F-measure to the console.
/// </summary>
/// <param name="format">Forwarded to the base implementation.</param>
/// <param name="args">Forwarded to the base implementation.</param>
/// <exception cref="TerminateToolException">
/// Thrown when an <see cref="IOException"/> occurs during the evaluation.
/// </exception>
public override void run(string format, string[] args) {
    base.run(format, args);

    mlParams = CmdLineUtil.loadTrainingParameters(@params.Params, false);
    if (mlParams == null) {
        // No parameters were loaded: build them from the iterations/cutoff arguments.
        mlParams = ModelUtil.createTrainingParameters(@params.Iterations.Value, @params.Cutoff.Value);
    }

    SDCrossValidator crossValidator;

    // An error listener is only attached when the misclassified option is set.
    SentenceDetectorEvaluationMonitor listener =
        @params.Misclassified.Value ? new SentenceEvaluationErrorListener() : null;

    // Custom end-of-sentence characters are optional.
    char[] eosChars = @params.EosChars != null ? @params.EosChars.ToCharArray() : null;

    try {
        Dictionary abbreviationDict = SentenceDetectorTrainerTool.loadDict(@params.AbbDict);
        SentenceDetectorFactory detectorFactory =
            SentenceDetectorFactory.create(@params.Factory, @params.Lang, true, abbreviationDict, eosChars);

        crossValidator = new SDCrossValidator(@params.Lang, mlParams, detectorFactory, listener);
        crossValidator.evaluate(sampleStream, @params.Folds.Value);
    } catch (IOException e) {
        throw new TerminateToolException(-1,
            "IO error while reading training data or indexing data: " + e.Message, e);
    } finally {
        try {
            sampleStream.close();
        } catch (IOException) {
            // Closing is best effort; a failure here is deliberately ignored.
        }
    }

    FMeasure fMeasure = crossValidator.FMeasure;
    Console.WriteLine(fMeasure.ToString());
}
/// <summary>
/// Trains a sentence detection model from the training file at the given path,
/// using 100 iterations, a cutoff of 0 and a factory without abbreviation
/// dictionary or custom end-of-sentence characters.
/// </summary>
/// <param name="path">Path of the training file; it is opened for reading only.</param>
/// <returns>The model returned by <c>SentenceDetectorME.Train</c>.</returns>
public static SentenceModel TrainModel(string path) {
    // The file handle is released as soon as training finishes, including when it throws.
    using (FileStream fs = new FileStream(path, FileMode.Open, FileAccess.Read)) {
        TrainingParameters trainParams = new TrainingParameters();
        trainParams.Set(Parameters.Iterations, "100");
        trainParams.Set(Parameters.Cutoff, "0");

        SentenceDetectorFactory detectorFactory = new SentenceDetectorFactory(TRAINING_LANGUAGE, true, null, null);
        SentenceSampleStream sampleStream = new SentenceSampleStream(new PlainTextByLineStream(fs));

        return SentenceDetectorME.Train(TRAINING_LANGUAGE, sampleStream, detectorFactory, trainParams);
    }
}
/// <summary>
/// Trains a "nl" sentence detector on a small inline sample with an abbreviation
/// dictionary, then checks that detecting sentences in that same sample yields
/// the eight non-empty lines of the sample.
/// </summary>
public void AbbreviationDefaultBehaviorTest() {
    // One entry per line of the sample text; the empty entry yields a blank line.
    var sampleLines = new[] {
        "Test E-mail met zowel letsel als 12. Toedracht in het onderwerp.",
        "Dit is een 2e regel met een tel. 011-4441444 erin.",
        "Dit is een 2e regel.",
        "Dit is een 2e regel.",
        "",
        "Dit is een 2e regel met een tel. 033-1333123 erin!",
        "Test E-mail met zowel winst als 12. toedracht in het onderwerp.",
        "Dit is een 2e regel!",
        "Dit is een 2e regel."
    };
    var samples = string.Join(Environment.NewLine, sampleLines) + Environment.NewLine;

    var abbreviations = new SharpNL.Dictionary.Dictionary(false);
    abbreviations.Add("12. Toedracht");
    abbreviations.Add("Tel.");

    var parameters = new TrainingParameters();
    parameters.Set(Parameters.Algorithm, "MAXENT");
    parameters.Set(Parameters.TrainerType, "Event");
    parameters.Set(Parameters.Iterations, "100");
    parameters.Set(Parameters.Cutoff, "5");

    var eos = new[] { '.', '?', '!' };
    var factory = new SentenceDetectorFactory("nl", true, abbreviations, eos);

    var reader = new StringReader(samples);
    var sampleStream = new SentenceSampleStream(new PlainTextByLineStream(reader));

    var model = SentenceDetectorME.Train("nl", sampleStream, factory, parameters);
    var detector = new SentenceDetectorME(model);

    var detected = detector.SentDetect(samples);

    // Every non-empty line of the sample is expected to come back as one sentence.
    var expected = samples.Split(new[] { Environment.NewLine }, StringSplitOptions.RemoveEmptyEntries);

    Assert.AreEqual(8, detected.Length);
    for (var index = 0; index < detected.Length; index++) {
        Assert.AreEqual(expected[index], detected[index]);
    }
}
/// <summary>
/// Trains an English sentence model from the Sentences.txt test resource, checks its
/// language and token-end setting, and evaluates a detector built from it.
/// </summary>
public void TestSentenceDetector() {
    using (var file = Tests.OpenFile("/opennlp/tools/sentdetect/Sentences.txt")) {
        var mlParams = new TrainingParameters();
        mlParams.Set(Parameters.Iterations, "100");
        mlParams.Set(Parameters.Cutoff, "0");

        var sdFactory = new SentenceDetectorFactory("en", true, null, null);
        var stream = new SentenceSampleStream(new PlainTextByLineStream(file));

        var model = SentenceDetectorME.Train("en", stream, sdFactory, mlParams);

        Assert.AreEqual("en", model.Language);
        // Expected value goes first so a failure message reports the values correctly.
        Assert.AreEqual(true, model.UseTokenEnd);

        EvalSentences(new SentenceDetectorME(model));
    }
}
/// <summary>
/// Trains an "en" sentence model from the shared sample stream using the given
/// factory and the default training parameters.
/// </summary>
/// <param name="factory">The sentence detector factory passed to the trainer.</param>
/// <returns>The model returned by <c>SentenceDetectorME.Train</c>.</returns>
private static SentenceModel Train(SentenceDetectorFactory factory) {
    var samples = CreateSampleStream();
    var parameters = TrainingParameters.DefaultParameters();

    return SentenceDetectorME.Train("en", samples, factory, parameters);
}