public void TestNullDict()
        {
            char[] eos     = { '.', '?' };
            var    sdModel = Train(new SentenceDetectorFactory("en", true, null, eos));

            Assert.NotNull(sdModel);

            SentenceDetectorFactory factory = sdModel.Factory;

            Assert.Null(factory.AbbreviationDictionary);
            Assert.True(factory.GetContextGenerator() is DefaultSentenceContextGenerator);
            Assert.True(factory.GetEndOfSentenceScanner() is DefaultEndOfSentenceScanner);
            Assert.True(eos.SequenceEqual(factory.EOSCharacters));

            var o = new MemoryStream();

            sdModel.Serialize(new UnclosableStream(o));

            o.Seek(0, SeekOrigin.Begin);

            var fromSerialized = new SentenceModel(o);

            factory = fromSerialized.Factory;
            Assert.Null(factory.AbbreviationDictionary);
            Assert.True(factory.GetContextGenerator() is DefaultSentenceContextGenerator);
            Assert.True(factory.GetEndOfSentenceScanner() is DefaultEndOfSentenceScanner);
            Assert.True(eos.SequenceEqual(factory.EOSCharacters));
        }
        public void TestEverything()
        {
            using (var file = Tests.OpenFile("/opennlp/tools/sentdetect/Sentences.txt")) {
                var mlParams = new TrainingParameters();

                mlParams.Set(Parameters.Iterations, "100");
                mlParams.Set(Parameters.Cutoff, "0");

                var sdFactory = new SentenceDetectorFactory("en", true, null, null);
                var stream    = new SentenceSampleStream(new PlainTextByLineStream(file));

                var model = SentenceDetectorME.Train("en", stream, sdFactory, mlParams);

                Assert.AreEqual("en", model.Language);
                Assert.AreEqual(model.UseTokenEnd, true);

                var sMe = new SentenceDetectorME(model);

                // test the SharpNL sentences
                SentenceDetectorMETest.EvalSentences(sMe);

                var sFile = Path.GetTempFileName();

                model.Serialize(new FileStream(sFile, FileMode.Create));

                var jModel2 = new JavaModel(OpenNLP.CreateInputStream(sFile));

                var jMe = new JavaSDME(jModel2);

                // test the Java OpenNLP sentences.
                JavaEvalSentences(jMe);

                // first try?! Yes! ;-)
            }
        }
Example #3
0
        public override void run(string format, string[] args)
        {
            base.run(format, args);

            mlParams = CmdLineUtil.loadTrainingParameters(@params.Params, false);

            if (mlParams != null)
            {
                if (TrainUtil.isSequenceTraining(mlParams.Settings))
                {
                    throw new TerminateToolException(1, "Sequence training is not supported!");
                }
            }

            if (mlParams == null)
            {
                mlParams = ModelUtil.createTrainingParameters(@params.Iterations.Value, @params.Cutoff.Value);
            }

            Jfile modelOutFile = @params.Model;

            CmdLineUtil.checkOutputFile("sentence detector model", modelOutFile);

            char[] eos = null;
            if (@params.EosChars != null)
            {
                eos = @params.EosChars.ToCharArray();
            }

            SentenceModel model;

            try
            {
                Dictionary dict = loadDict(@params.AbbDict);
                SentenceDetectorFactory sdFactory = SentenceDetectorFactory.create(@params.Factory, @params.Lang, true, dict, eos);
                model = SentenceDetectorME.train(@params.Lang, sampleStream, sdFactory, mlParams);
            }
            catch (IOException e)
            {
                throw new TerminateToolException(-1, "IO error while reading training data or indexing data: " + e.Message, e);
            }
            finally
            {
                try
                {
                    sampleStream.close();
                }
                catch (IOException)
                {
                    // sorry that this can fail
                }
            }

            CmdLineUtil.writeModel("sentence detector", modelOutFile, model);
        }
        public override void run(string format, string[] args)
        {
            base.run(format, args);

            mlParams = CmdLineUtil.loadTrainingParameters(@params.Params, false);
            if (mlParams == null)
            {
                mlParams = ModelUtil.createTrainingParameters(@params.Iterations.Value, @params.Cutoff.Value);
            }

            SDCrossValidator validator;

            SentenceDetectorEvaluationMonitor errorListener = null;

            if (@params.Misclassified.Value)
            {
                errorListener = new SentenceEvaluationErrorListener();
            }

            char[] eos = null;
            if (@params.EosChars != null)
            {
                eos = @params.EosChars.ToCharArray();
            }

            try
            {
                Dictionary abbreviations          = SentenceDetectorTrainerTool.loadDict(@params.AbbDict);
                SentenceDetectorFactory sdFactory = SentenceDetectorFactory.create(@params.Factory, @params.Lang, true, abbreviations, eos);
                validator = new SDCrossValidator(@params.Lang, mlParams, sdFactory, errorListener);

                validator.evaluate(sampleStream, @params.Folds.Value);
            }
            catch (IOException e)
            {
                throw new TerminateToolException(-1, "IO error while reading training data or indexing data: " + e.Message, e);
            }
            finally
            {
                try
                {
                    sampleStream.close();
                }
                catch (IOException)
                {
                    // sorry that this can fail
                }
            }

            FMeasure result = validator.FMeasure;

            Console.WriteLine(result.ToString());
        }
        public static SentenceModel TrainModel(string path)
        {
            FileStream fs = new FileStream(path, FileMode.Open, FileAccess.Read);

            TrainingParameters trainParams = new TrainingParameters();

            trainParams.Set(Parameters.Iterations, "100");
            trainParams.Set(Parameters.Cutoff, "0");

            SentenceDetectorFactory detectorFactory = new SentenceDetectorFactory(TRAINING_LANGUAGE, true, null, null);
            SentenceSampleStream    sampleStream    = new SentenceSampleStream(new PlainTextByLineStream(fs));

            return(SentenceDetectorME.Train(TRAINING_LANGUAGE, sampleStream, detectorFactory, trainParams));
        }
Example #6
0
        public void AbbreviationDefaultBehaviorTest()
        {
            var samples =
                "Test E-mail met zowel letsel als 12. Toedracht in het onderwerp." + Environment.NewLine +
                "Dit is een 2e regel met een tel. 011-4441444 erin." + Environment.NewLine +
                "Dit is een 2e regel." + Environment.NewLine +
                "Dit is een 2e regel." + Environment.NewLine + Environment.NewLine +

                "Dit is een 2e regel met een tel. 033-1333123 erin!" + Environment.NewLine +
                "Test E-mail met zowel winst als 12. toedracht in het onderwerp." + Environment.NewLine +
                "Dit is een 2e regel!" + Environment.NewLine +
                "Dit is een 2e regel." + Environment.NewLine;

            var stringsToIgnoreDictionary = new SharpNL.Dictionary.Dictionary(false)
            {
                { "12. Toedracht" },
                { "Tel." },
            };

            var trainingParameters = new TrainingParameters();

            trainingParameters.Set(Parameters.Algorithm, "MAXENT");
            trainingParameters.Set(Parameters.TrainerType, "Event");
            trainingParameters.Set(Parameters.Iterations, "100");
            trainingParameters.Set(Parameters.Cutoff, "5");

            char[] eos          = { '.', '?', '!' };
            var    sdFactory    = new SentenceDetectorFactory("nl", true, stringsToIgnoreDictionary, eos);
            var    stringReader = new StringReader(samples);
            var    stream       = new SentenceSampleStream(new PlainTextByLineStream(stringReader));

            var sentenceModel      = SentenceDetectorME.Train("nl", stream, sdFactory, trainingParameters);
            var sentenceDetectorMe = new SentenceDetectorME(sentenceModel);

            var sentences = sentenceDetectorMe.SentDetect(samples);
            var expected  = samples.Split(new [] { Environment.NewLine }, StringSplitOptions.RemoveEmptyEntries);


            Assert.AreEqual(8, sentences.Length);
            for (var i = 0; i < sentences.Length; i++)
            {
                Assert.AreEqual(expected[i], sentences[i]);
            }
        }
Example #7
0
        public void TestSentenceDetector()
        {
            using (var file = Tests.OpenFile("/opennlp/tools/sentdetect/Sentences.txt")) {
                var mlParams = new TrainingParameters();

                mlParams.Set(Parameters.Iterations, "100");
                mlParams.Set(Parameters.Cutoff, "0");

                var sdFactory = new SentenceDetectorFactory("en", true, null, null);
                var stream    = new SentenceSampleStream(new PlainTextByLineStream(file));

                var model = SentenceDetectorME.Train("en", stream, sdFactory, mlParams);

                Assert.AreEqual("en", model.Language);
                Assert.AreEqual(model.UseTokenEnd, true);

                EvalSentences(new SentenceDetectorME(model));
            }
        }
 private static SentenceModel Train(SentenceDetectorFactory factory)
 {
     return(SentenceDetectorME.Train("en", CreateSampleStream(), factory, TrainingParameters.DefaultParameters()));
 }