public void testSimpleTraining() {
    // Two toy categories: "1" documents share the tokens {a,b,c},
    // "0" documents share the tokens {x,y,z}.
    IObjectStream<DocumentSample> stream = new GenericObjectStream<DocumentSample>(new[] {
        new DocumentSample("1", new[] {"a", "b", "c"}),
        new DocumentSample("1", new[] {"a", "b", "c", "1", "2"}),
        new DocumentSample("1", new[] {"a", "b", "c", "3", "4"}),
        new DocumentSample("0", new[] {"x", "y", "z"}),
        new DocumentSample("0", new[] {"x", "y", "z", "5", "6"}),
        new DocumentSample("0", new[] {"x", "y", "z", "7", "8"})
    });

    var trainingParams = new TrainingParameters();
    trainingParams.Set(Parameters.Cutoff, "0");
    trainingParams.Set(Parameters.Iterations, "100");

    var trainedModel = DocumentCategorizerME.Train(
        "x-unspecified", stream, trainingParams, new DocumentCategorizerFactory());
    var categorizer = new DocumentCategorizerME(trainedModel);

    // A token seen only in category "1" must classify as "1".
    var probsForA = categorizer.Categorize("a");
    Assert.AreEqual("1", categorizer.GetBestCategory(probsForA));

    // A token seen only in category "0" must classify as "0".
    var probsForX = categorizer.Categorize("x");
    Assert.AreEqual("0", categorizer.GetBestCategory(probsForX));

    //test to make sure sorted map's last key is cat 1 because it has the highest score.
    var scoreMap = categorizer.SortedScoreMap("a");
    foreach (var entry in scoreMap) {
        Assert.AreEqual("1", entry.Value[0]);
        break;
    }
}
        public void Setup() {
            // Baseline maxent parameters shared by the trainer-factory tests.
            mlParams = new TrainingParameters();
            mlParams.Set(Parameters.Cutoff, "5");
            mlParams.Set(Parameters.Iterations, "10");
            mlParams.Set(Parameters.Algorithm, GIS.MaxEntropy);

            // Make the stub trainer resolvable by its registered name.
            TrainerFactory.RegisterTrainer("Dummy", typeof(DummyTrainer));
        }
示例#3
0
 public static TokenizerModel CreateMaxentTokenModel() {
     // Trains a maxent tokenizer model from the bundled training corpus.
     using (var trainingData = Tests.OpenFile("/opennlp/tools/tokenize/token.train")) {
         var parameters = new TrainingParameters();
         parameters.Set(Parameters.Cutoff, "0");
         parameters.Set(Parameters.Iterations, "100");

         var sampleStream = new TokenSampleStream(new PlainTextByLineStream(trainingData));
         return TokenizerME.Train(sampleStream, new TokenizerFactory("en", null, true), parameters);
     }
 }
示例#4
0
        public void Setup() {
            // Train the chunker once; the individual tests reuse this instance.
            var trainingParams = new TrainingParameters();
            trainingParams.Set(Parameters.Cutoff, "1");
            trainingParams.Set(Parameters.Iterations, "70");

            var model = ChunkerME.Train("en", CreateSampleStream(), trainingParams, new ChunkerFactory());
            chunker = new ChunkerME(model);
        }
示例#5
0
        /// <summary>
        /// Trains a name-finder model on the annotated sample corpus and checks
        /// that it detects the person names in two known sentences.
        /// </summary>
        public void TestNameFinder() {

            using (var file = Tests.OpenFile("opennlp/tools/namefind/AnnotatedSentences.txt")) {
                // Corpus is Latin-1 encoded.
                var sampleStream = new NameSampleStream(new PlainTextByLineStream(file, "ISO-8859-1"));

                var param = new TrainingParameters();
                param.Set(Parameters.Iterations, "70");
                param.Set(Parameters.Cutoff, "1");

                var model = NameFinderME.Train(
                    "en",
                    sampleStream,
                    param,
                    new TokenNameFinderFactory(null, new Dictionary<string, object>()));

                var nameFinder = new NameFinderME(model);

                // now test if it can detect the sample sentences
                var sentence = new[] {
                    "Alisa",
                    "appreciated",
                    "the",
                    "hint",
                    "and",
                    "enjoyed",
                    "a",
                    "delicious",
                    "traditional",
                    "meal."
                };

                var names = nameFinder.Find(sentence);

                // Expect exactly one name span covering token 0 ("Alisa").
                // NOTE(review): Type is a field/constant declared outside this view — presumably "person".
                Assert.AreEqual(1, names.Length);
                Assert.AreEqual(new Span(0, 1, Type), names[0]);

                sentence = new[] {
                    "Hi",
                    "Mike",
                    ",",
                    "it's",
                    "Stefanie",
                    "Schmidt",
                    "."
                };

                names = nameFinder.Find(sentence);

                // "Mike" (token 1) and "Stefanie Schmidt" (tokens 4-5).
                Assert.AreEqual(2, names.Length);
                Assert.AreEqual(new Span(1, 2, Type), names[0]);
                Assert.AreEqual(new Span(4, 6, Type), names[1]);

            }
        }
示例#6
0
        /// <summary>
        /// Trains a tokenizer model from the token samples in the given file.
        /// </summary>
        /// <param name="path">Path to the token training file.</param>
        /// <returns>The trained <c>TokenizerModel</c>.</returns>
        public static TokenizerModel TrainModel(string path)
        {
            // Fix: the FileStream was never disposed; wrap it in 'using' so the
            // OS handle is released even if training throws.
            using (FileStream fs = new FileStream(path, FileMode.Open, FileAccess.Read))
            {
                TokenSampleStream stream = new TokenSampleStream(new PlainTextByLineStream(fs));

                TrainingParameters trainParams = new TrainingParameters();
                trainParams.Set(Parameters.Iterations, "100");
                trainParams.Set(Parameters.Cutoff, "0");

                return TokenizerME.Train(stream, new TokenizerFactory(TRAINING_LANGUAGE, null, true), trainParams);
            }
        }
示例#7
0
        public void TestPerceptronOnPrepAttachDataWithSkippedAveraging()
        {
            // Perceptron with skipped averaging must reproduce the reference accuracy.
            var parameters = new TrainingParameters();
            parameters.Set(Parameters.Cutoff, "1");
            parameters.Set(Parameters.Algorithm, Parameters.Algorithms.Perceptron);
            parameters.Set(Parameters.UseSkippedAveraging, "true");

            var eventTrainer = TrainerFactory.GetEventTrainer(parameters, null, null);

            PrepAttachDataUtility.TestModel(
                eventTrainer.Train(PrepAttachDataUtility.CreateTrainingStream()),
                0.773706362961129);
        }
        public void TestMaxentOnPrepAttachDataWithParams() {
            // Maxent with an explicit two-pass data indexer; the report map
            // captures the trainer's configuration for inspection.
            var report = new Dictionary<string, string>();

            var parameters = new TrainingParameters();
            parameters.Set(Parameters.Cutoff, "1");
            parameters.Set(Parameters.Algorithm, Parameters.Algorithms.MaxEnt);
            parameters.Set(Parameters.DataIndexer, Parameters.DataIndexers.TwoPass);

            var eventTrainer = TrainerFactory.GetEventTrainer(parameters, report, null);
            var trained = eventTrainer.Train(PrepAttachDataUtility.CreateTrainingStream());

            PrepAttachDataUtility.TestModel(trained, 0.7997028967566229d);
        }
        public void TestPerceptronOnPrepAttachDataWithTolerance() {
            // A tight tolerance stops perceptron training early; accuracy must
            // match the reference value.
            var parameters = new TrainingParameters();
            parameters.Set(Parameters.Algorithm, Parameters.Algorithms.Perceptron);
            parameters.Set(Parameters.Tolerance, "0.0001");
            parameters.Set(Parameters.Iterations, "500");
            parameters.Set(Parameters.Cutoff, "1");

            var eventTrainer = TrainerFactory.GetEventTrainer(parameters, null, null);
            var trained = eventTrainer.Train(PrepAttachDataUtility.CreateTrainingStream());

            PrepAttachDataUtility.TestModel(trained, 0.7677642980935875);
        }
        public void TestPerceptronOnPrepAttachDataWithSkippedAveraging() {
            // Skipped averaging variant of the perceptron; expects the
            // reference accuracy on the prep-attach data set.
            var parameters = new TrainingParameters();
            parameters.Set(Parameters.UseSkippedAveraging, "true");
            parameters.Set(Parameters.Algorithm, Parameters.Algorithms.Perceptron);
            parameters.Set(Parameters.Cutoff, "1");

            var eventTrainer = TrainerFactory.GetEventTrainer(parameters, null, null);
            var trained = eventTrainer.Train(PrepAttachDataUtility.CreateTrainingStream());

            PrepAttachDataUtility.TestModel(trained, 0.773706362961129);
        }
示例#11
0
        public void TestPerceptronOnPrepAttachDataWithTolerance()
        {
            // Perceptron with a convergence tolerance of 1e-4.
            var parameters = new TrainingParameters();
            parameters.Set(Parameters.Cutoff, "1");
            parameters.Set(Parameters.Tolerance, "0.0001");
            parameters.Set(Parameters.Algorithm, Parameters.Algorithms.Perceptron);
            parameters.Set(Parameters.Iterations, "500");

            var eventTrainer = TrainerFactory.GetEventTrainer(parameters, null, null);

            PrepAttachDataUtility.TestModel(
                eventTrainer.Train(PrepAttachDataUtility.CreateTrainingStream()),
                0.7677642980935875);
        }
示例#12
0
        public void TestQnOnPrepAttachDataInParallel()
        {
            // QN maxent trained on two threads must give the same accuracy as
            // the single-threaded default.
            var parameters = new TrainingParameters();
            parameters.Set(Parameters.Threads, "2");
            parameters.Set(Parameters.Algorithm, Parameters.Algorithms.MaxEntQn);
            //trainParams.Set(Parameters.Iterations, "100");

            var eventTrainer = TrainerFactory.GetEventTrainer(parameters, null, null);
            var trained = eventTrainer.Train(PrepAttachDataUtility.CreateTrainingStream());

            TestModel(trained, 0.8115870264917059);
        }
示例#13
0
        /// <summary>
        /// Trains a lemmatizer model from the lemma samples in the given file.
        /// </summary>
        /// <param name="path">Path to the lemma training file.</param>
        /// <returns>The trained <c>LemmatizerModel</c>.</returns>
        public static LemmatizerModel TrainModel(string path)
        {
            // Fix: the FileStream was never disposed; wrap it in 'using' so the
            // OS handle is released even if training throws.
            using (FileStream fs = new FileStream(path, FileMode.Open, FileAccess.Read))
            {
                TrainingParameters trainParams = new TrainingParameters();
                trainParams.Set(Parameters.Iterations, "1");
                trainParams.Set(Parameters.Cutoff, "0");

                LemmatizerFactory lemmatizerFactory = new LemmatizerFactory();
                LemmaSampleStream sampleStream      = new LemmaSampleStream(new PlainTextByLineStream(fs));

                return LemmatizerME.Train(TRAINING_LANGUAGE, sampleStream, trainParams, lemmatizerFactory);
            }
        }
示例#14
0
        public void TestMaxentOnPrepAttachDataWithParams()
        {
            // Explicit maxent configuration with a two-pass data indexer.
            var report = new Dictionary<string, string>();

            var parameters = new TrainingParameters();
            parameters.Set(Parameters.DataIndexer, Parameters.DataIndexers.TwoPass);
            parameters.Set(Parameters.Cutoff, "1");
            parameters.Set(Parameters.Algorithm, Parameters.Algorithms.MaxEnt);

            var eventTrainer = TrainerFactory.GetEventTrainer(parameters, report, null);

            PrepAttachDataUtility.TestModel(
                eventTrainer.Train(PrepAttachDataUtility.CreateTrainingStream()),
                0.7997028967566229d);
        }
        /// <summary>
        /// Trains a sentence-detector model from the sentence samples in the given file.
        /// </summary>
        /// <param name="path">Path to the sentence training file.</param>
        /// <returns>The trained <c>SentenceModel</c>.</returns>
        public static SentenceModel TrainModel(string path)
        {
            // Fix: the FileStream was never disposed; wrap it in 'using' so the
            // OS handle is released even if training throws.
            using (FileStream fs = new FileStream(path, FileMode.Open, FileAccess.Read))
            {
                TrainingParameters trainParams = new TrainingParameters();
                trainParams.Set(Parameters.Iterations, "100");
                trainParams.Set(Parameters.Cutoff, "0");

                SentenceDetectorFactory detectorFactory = new SentenceDetectorFactory(TRAINING_LANGUAGE, true, null, null);
                SentenceSampleStream    sampleStream    = new SentenceSampleStream(new PlainTextByLineStream(fs));

                return SentenceDetectorME.Train(TRAINING_LANGUAGE, sampleStream, detectorFactory, trainParams);
            }
        }
示例#16
0
        public void TestQnOnPrepAttachDataWithL2Params()
        {
            // QN maxent with pure L2 regularization (L1 cost zero).
            var parameters = new TrainingParameters();
            parameters.Set(Parameters.L2Cost, "1.0");
            parameters.Set(Parameters.L1Cost, "0");
            parameters.Set(Parameters.Cutoff, "1");
            parameters.Set(Parameters.DataIndexer, Parameters.DataIndexers.TwoPass);
            parameters.Set(Parameters.Algorithm, Parameters.Algorithms.MaxEntQn);

            var eventTrainer = TrainerFactory.GetEventTrainer(parameters, null, null);
            var trained = eventTrainer.Train(PrepAttachDataUtility.CreateTrainingStream());

            TestModel(trained, 0.8227283981183461);
        }
示例#17
0
        public void TestWithNameEvaluationErrorListener()
        {
            // Cross-validates the name finder with an error listener attached
            // and checks that an F-measure is produced.
            using (var file = Tests.OpenFile("opennlp/tools/namefind/AnnotatedSentences.txt")) {
                var samples = new NameSampleStream(new PlainTextByLineStream(file, "ISO-8859-1"));

                var trainingParams = new TrainingParameters();
                trainingParams.Set(Parameters.Algorithm, Parameters.Algorithms.MaxEnt);
                trainingParams.Set(Parameters.Cutoff, "1");
                trainingParams.Set(Parameters.Iterations, "70");

                var validator = new TokenNameFinderCrossValidator(
                    "en", Type, trainingParams, new NameEvaluationErrorListener());
                validator.Evaluate(samples, 2);

                Assert.NotNull(validator.FMeasure);
            }
        }
        public void TestWithNameEvaluationErrorListener() {
            // Two-fold cross-validation with an evaluation error listener;
            // only asserts that the F-measure is computed.
            using (var file = Tests.OpenFile("opennlp/tools/namefind/AnnotatedSentences.txt")) {
                var samples = new NameSampleStream(new PlainTextByLineStream(file, "ISO-8859-1"));

                var trainingParams = new TrainingParameters();
                trainingParams.Set(Parameters.Cutoff, "1");
                trainingParams.Set(Parameters.Iterations, "70");
                trainingParams.Set(Parameters.Algorithm, Parameters.Algorithms.MaxEnt);

                var validator = new TokenNameFinderCrossValidator(
                    "en", Type, trainingParams, new NameEvaluationErrorListener());
                validator.Evaluate(samples, 2);

                Assert.NotNull(validator.FMeasure);
            }
        }
示例#19
0
        /// <summary>
        /// Trains a Dutch sentence detector with an abbreviation dictionary and
        /// verifies that dictionary entries ("tel.", "12. Toedracht") do not
        /// trigger false sentence breaks: each input line must come back as
        /// exactly one detected sentence.
        /// </summary>
        public void AbbreviationDefaultBehaviorTest()
        {
            // Eight logical sentences; "tel." and "12. Toedracht" contain
            // end-of-sentence characters that must NOT split the sentence.
            var samples =
                "Test E-mail met zowel letsel als 12. Toedracht in het onderwerp." + Environment.NewLine +
                "Dit is een 2e regel met een tel. 011-4441444 erin." + Environment.NewLine +
                "Dit is een 2e regel." + Environment.NewLine +
                "Dit is een 2e regel." + Environment.NewLine + Environment.NewLine +

                "Dit is een 2e regel met een tel. 033-1333123 erin!" + Environment.NewLine +
                "Test E-mail met zowel winst als 12. toedracht in het onderwerp." + Environment.NewLine +
                "Dit is een 2e regel!" + Environment.NewLine +
                "Dit is een 2e regel." + Environment.NewLine;

            // Case-insensitive abbreviation dictionary (the 'false' flag).
            var stringsToIgnoreDictionary = new SharpNL.Dictionary.Dictionary(false)
            {
                { "12. Toedracht" },
                { "Tel." },
            };

            var trainingParameters = new TrainingParameters();

            trainingParameters.Set(Parameters.Algorithm, "MAXENT");
            trainingParameters.Set(Parameters.TrainerType, "Event");
            trainingParameters.Set(Parameters.Iterations, "100");
            trainingParameters.Set(Parameters.Cutoff, "5");

            // Explicit end-of-sentence characters for the detector.
            char[] eos          = { '.', '?', '!' };
            var    sdFactory    = new SentenceDetectorFactory("nl", true, stringsToIgnoreDictionary, eos);
            var    stringReader = new StringReader(samples);
            var    stream       = new SentenceSampleStream(new PlainTextByLineStream(stringReader));

            var sentenceModel      = SentenceDetectorME.Train("nl", stream, sdFactory, trainingParameters);
            var sentenceDetectorMe = new SentenceDetectorME(sentenceModel);

            var sentences = sentenceDetectorMe.SentDetect(samples);
            // Expected output is exactly the non-empty input lines, in order.
            var expected  = samples.Split(new [] { Environment.NewLine }, StringSplitOptions.RemoveEmptyEntries);


            Assert.AreEqual(8, sentences.Length);
            for (var i = 0; i < sentences.Length; i++)
            {
                Assert.AreEqual(expected[i], sentences[i]);
            }
        }
示例#20
0
        /// <summary>
        /// Trains a tokenizer model with SharpNL, serializes it, reloads it with
        /// the Java OpenNLP implementation, and verifies both implementations
        /// produce (near-)identical token probabilities.
        /// </summary>
        public void TestCrossCompatibility()
        {
            using (var data = Tests.OpenFile("/opennlp/tools/tokenize/token.train")) {
                var samples  = new TokenSampleStream(new PlainTextByLineStream(data));
                var mlParams = new TrainingParameters();
                mlParams.Set(Parameters.Iterations, "100");
                mlParams.Set(Parameters.Cutoff, "0");
                var model = TokenizerME.Train(samples, new TokenizerFactory("en", null, true), mlParams);

                var sMe = new TokenizerME(model);

                TokenizerMETest.TestTokenizer(sMe);

                var sProbs = sMe.TokenProbabilities;

                // --- java \/

                var sFile = Path.GetTempFileName();

                // Fix: the FileStream was never disposed, leaking the handle and
                // potentially keeping the file locked before the Java side reopens it.
                using (var outStream = new FileStream(sFile, FileMode.Create)) {
                    model.Serialize(outStream);
                }

                var jModel = new opennlp.tools.tokenize.TokenizerModel(
                    OpenNLP.CreateInputStream(sFile)
                    );

                var jMe = new opennlp.tools.tokenize.TokenizerME(jModel);

                TestJavaTokenizer(jMe);

                var jProbs = jMe.getTokenProbabilities();

                Assert.AreEqual(jProbs.Length, sProbs.Length);

                for (int i = 0; i < jProbs.Length; i++)
                {
                    // one difference :(
                    // -0.00000000000000011102230246251565
                    //
                    // but still "insignificant" :)
                    Assert.AreEqual(jProbs[i], sProbs[i], 0.0000000001d);
                }
            }
        }
示例#21
0
        public static TokenizerModel CreateSimpleMaxentTokenModel() {
            // A handful of hand-crafted token samples; spans mark the token
            // boundaries inside each string.
            var tokenSamples = new List<TokenSample> {
                new TokenSample("year", new[] {new Span(0, 4)}),
                new TokenSample("year,", new[] {new Span(0, 4), new Span(4, 5)}),
                new TokenSample("it,", new[] {new Span(0, 2), new Span(2, 3)}),
                new TokenSample("it", new[] {new Span(0, 2)}),
                new TokenSample("yes", new[] {new Span(0, 3)}),
                new TokenSample("yes,", new[] {new Span(0, 3), new Span(3, 4)})
            };

            var parameters = new TrainingParameters();
            parameters.Set(Parameters.Cutoff, "0");
            parameters.Set(Parameters.Iterations, "100");

            var sampleStream = new CollectionObjectStream<TokenSample>(tokenSamples);
            return TokenizerME.Train(sampleStream, new TokenizerFactory("en", null, true), parameters);
        }
示例#22
0
        public void TestSentenceDetector()
        {
            // Trains an English sentence detector and evaluates it on the
            // shared sentence fixtures.
            using (var file = Tests.OpenFile("/opennlp/tools/sentdetect/Sentences.txt")) {
                var parameters = new TrainingParameters();
                parameters.Set(Parameters.Cutoff, "0");
                parameters.Set(Parameters.Iterations, "100");

                var factory = new SentenceDetectorFactory("en", true, null, null);
                var samples = new SentenceSampleStream(new PlainTextByLineStream(file));

                var model = SentenceDetectorME.Train("en", samples, factory, parameters);

                Assert.AreEqual("en", model.Language);
                Assert.AreEqual(model.UseTokenEnd, true);

                EvalSentences(new SentenceDetectorME(model));
            }
        }
示例#23
0
        internal static POSModel TrainPOSModel(ModelType type = ModelType.Maxent) {
            // Maps the requested model type onto the trainer's algorithm name;
            // anything other than maxent/perceptron is unsupported.
            string algorithm;
            switch (type) {
                case ModelType.Maxent:
                    algorithm = "MAXENT";
                    break;
                case ModelType.Perceptron:
                    algorithm = "PERCEPTRON";
                    break;
                default:
                    throw new NotSupportedException();
            }

            var parameters = new TrainingParameters();
            parameters.Set(Parameters.Algorithm, algorithm);
            parameters.Set(Parameters.Iterations, "100");
            parameters.Set(Parameters.Cutoff, "5");

            return POSTaggerME.Train("en", CreateSampleStream(), parameters, new POSTaggerFactory());
        }
        public void TestSentenceDetector() {
            // Same training run as the block-style variant above: train on the
            // fixture corpus, check model metadata, then evaluate.
            using (var file = Tests.OpenFile("/opennlp/tools/sentdetect/Sentences.txt")) {

                var parameters = new TrainingParameters();
                parameters.Set(Parameters.Iterations, "100");
                parameters.Set(Parameters.Cutoff, "0");

                var factory = new SentenceDetectorFactory("en", true, null, null);
                var samples = new SentenceSampleStream(new PlainTextByLineStream(file));

                var trainedModel = SentenceDetectorME.Train("en", samples, factory, parameters);

                Assert.AreEqual("en", trainedModel.Language);
                Assert.AreEqual(trainedModel.UseTokenEnd, true);

                EvalSentences(new SentenceDetectorME(trainedModel));
            }
        }
        /// <summary>
        /// Trains a Dutch sentence detector with an abbreviation dictionary and
        /// verifies that dictionary entries ("tel.", "12. Toedracht") do not
        /// cause false sentence splits: each non-empty input line must be
        /// detected as exactly one sentence.
        /// </summary>
        public void AbbreviationDefaultBehaviorTest() {

            // Eight logical sentences; "tel." and "12. Toedracht" contain
            // end-of-sentence characters that must NOT split the sentence.
            var samples =
                "Test E-mail met zowel letsel als 12. Toedracht in het onderwerp." + Environment.NewLine +
                "Dit is een 2e regel met een tel. 011-4441444 erin." + Environment.NewLine +
                "Dit is een 2e regel." + Environment.NewLine +
                "Dit is een 2e regel." + Environment.NewLine + Environment.NewLine +

                "Dit is een 2e regel met een tel. 033-1333123 erin!" + Environment.NewLine +
                "Test E-mail met zowel winst als 12. toedracht in het onderwerp." + Environment.NewLine +
                "Dit is een 2e regel!" + Environment.NewLine +
                "Dit is een 2e regel." + Environment.NewLine;

            // Case-insensitive abbreviation dictionary (the 'false' flag).
            var stringsToIgnoreDictionary = new SharpNL.Dictionary.Dictionary(false) {
                {"12. Toedracht"},
                {"Tel."},
            };

            var trainingParameters = new TrainingParameters();

            trainingParameters.Set(Parameters.Algorithm, "MAXENT");
            trainingParameters.Set(Parameters.TrainerType, "Event");
            trainingParameters.Set(Parameters.Iterations, "100");
            trainingParameters.Set(Parameters.Cutoff, "5");

            // Explicit end-of-sentence characters for the detector.
            char[] eos = { '.', '?', '!' };
            var sdFactory = new SentenceDetectorFactory("nl", true, stringsToIgnoreDictionary, eos);
            var stringReader = new StringReader(samples);
            var stream = new SentenceSampleStream(new PlainTextByLineStream(stringReader));

            var sentenceModel = SentenceDetectorME.Train("nl", stream, sdFactory, trainingParameters);
            var sentenceDetectorMe = new SentenceDetectorME(sentenceModel);

            var sentences = sentenceDetectorMe.SentDetect(samples);
            // Expected output is exactly the non-empty input lines, in order.
            var expected = samples.Split(new []{ Environment.NewLine }, StringSplitOptions.RemoveEmptyEntries);


            Assert.AreEqual(8, sentences.Length);
            for (var i = 0; i < sentences.Length; i++)
                Assert.AreEqual(expected[i], sentences[i]);
            
        }
        public void TestMaxentOnPrepAttachDataWithParamsDefault() {
            // Maxent with all-default parameters except the algorithm name.
            var report = new Dictionary<string, string>();
            var parameters = new TrainingParameters();
            parameters.Set(Parameters.Algorithm, Parameters.Algorithms.MaxEnt);

            var eventTrainer = TrainerFactory.GetEventTrainer(parameters, report, null);

            PrepAttachDataUtility.TestModel(
                eventTrainer.Train(PrepAttachDataUtility.CreateTrainingStream()),
                0.8086159940579352d);
        }
示例#27
0
        public void Setup() {
            // Train the same chunker with both the SharpNL and the Java OpenNLP
            // implementations so the tests can compare their outputs.
            var sParams = new TrainingParameters();
            sParams.Set(Parameters.Cutoff, "1");
            sParams.Set(Parameters.Iterations, "70");

            var jParams = new opennlp.tools.util.TrainingParameters();
            jParams.put("Cutoff", "1");
            jParams.put("Iterations", "70");

            var sModel = ChunkerME.Train("en", ChunkerMETest.CreateSampleStream(), sParams, new ChunkerFactory());
            var jModel = opennlp.tools.chunker.ChunkerME.train("en", JavaSampleStream(), jParams,
                new opennlp.tools.chunker.ChunkerFactory());

            Assert.NotNull(sModel);
            Assert.NotNull(jModel);

            sChunker = new ChunkerME(sModel);
            jChunker = new opennlp.tools.chunker.ChunkerME(jModel);
        }
示例#28
0
        public void TestMaxentOnPrepAttachDataWithParamsDefault()
        {
            // Only the algorithm is specified; everything else uses defaults.
            var report     = new Dictionary<string, string>();
            var parameters = new TrainingParameters();
            parameters.Set(Parameters.Algorithm, Parameters.Algorithms.MaxEnt);

            var eventTrainer = TrainerFactory.GetEventTrainer(parameters, report, null);
            var trained      = eventTrainer.Train(PrepAttachDataUtility.CreateTrainingStream());

            PrepAttachDataUtility.TestModel(trained, 0.8086159940579352d);
        }
示例#29
0
        public void TestQnOnPrepAttachDataWithParamsDefault()
        {
            // QN maxent with default parameters.
            var parameters = new TrainingParameters();
            parameters.Set(Parameters.Algorithm, Parameters.Algorithms.MaxEntQn);

            var eventTrainer = TrainerFactory.GetEventTrainer(parameters, null, null);

            TestModel(
                eventTrainer.Train(PrepAttachDataUtility.CreateTrainingStream()),
                0.8115870264917059);
        }
示例#30
0
        public static TokenizerModel CreateSimpleMaxentTokenModel()
        {
            // Minimal token samples; each span marks one token's boundaries.
            var tokenSamples = new List<TokenSample> {
                new TokenSample("year", new[] { new Span(0, 4) }),
                new TokenSample("year,", new[] { new Span(0, 4), new Span(4, 5) }),
                new TokenSample("it,", new[] { new Span(0, 2), new Span(2, 3) }),
                new TokenSample("it", new[] { new Span(0, 2) }),
                new TokenSample("yes", new[] { new Span(0, 3) }),
                new TokenSample("yes,", new[] { new Span(0, 3), new Span(3, 4) })
            };

            var parameters = new TrainingParameters();
            parameters.Set(Parameters.Cutoff, "0");
            parameters.Set(Parameters.Iterations, "100");

            var sampleStream = new CollectionObjectStream<TokenSample>(tokenSamples);
            return TokenizerME.Train(sampleStream, new TokenizerFactory("en", null, true), parameters);
        }
示例#31
0
        /// <summary>
        /// Trains a two-category document categorizer on toy samples and checks
        /// the best-category predictions plus the ordering of the sorted score map.
        /// </summary>
        public void TestSimpleTraining()
        {
            // Category "1" documents contain {a,b,c}; category "0" documents contain {x,y,z}.
            IObjectStream <DocumentSample> samples = new GenericObjectStream <DocumentSample>(new[] {
                new DocumentSample("1", new[] { "a", "b", "c" }),
                new DocumentSample("1", new[] { "a", "b", "c", "1", "2" }),
                new DocumentSample("1", new[] { "a", "b", "c", "3", "4" }),
                new DocumentSample("0", new[] { "x", "y", "z" }),
                new DocumentSample("0", new[] { "x", "y", "z", "5", "6" }),
                new DocumentSample("0", new[] { "x", "y", "z", "7", "8" })
            });

            var param = new TrainingParameters();

            param.Set(Parameters.Iterations, "100");
            param.Set(Parameters.Cutoff, "0");

            var model = DocumentCategorizerME.Train("x-unspecified", samples, param, new DocumentCategorizerFactory());

            var doccat = new DocumentCategorizerME(model);

            var aProbs = doccat.Categorize("a");

            // Token "a" only occurs in category "1" samples.
            Assert.AreEqual("1", doccat.GetBestCategory(aProbs));

            var bProbs = doccat.Categorize("x");

            // Token "x" only occurs in category "0" samples.
            Assert.AreEqual("0", doccat.GetBestCategory(bProbs));

            //test to make sure sorted map's last key is cat 1 because it has the highest score.
            var sortedScoreMap = doccat.SortedScoreMap("a");

            // first 0
            // second 1 (last)
            // The map iterates ascending by score, so the FIRST entry is the
            // lowest-scoring category ("0"); 'break' checks only that entry.
            foreach (var pair in sortedScoreMap)
            {
                Assert.AreEqual("0", pair.Value[0]);
                break;
            }
        }
示例#32
0
        internal static POSModel TrainPosModel(ModelType type = ModelType.Maxent)
        {
            // Resolve the algorithm name first; unsupported types fail fast.
            string algorithm;
            switch (type)
            {
            case ModelType.Maxent:
                algorithm = "MAXENT";
                break;

            case ModelType.Perceptron:
                algorithm = "PERCEPTRON";
                break;

            default:
                throw new NotSupportedException();
            }

            var parameters = new TrainingParameters();
            parameters.Set(Parameters.Algorithm, algorithm);
            parameters.Set(Parameters.Cutoff, "5");
            parameters.Set(Parameters.Iterations, "100");

            return POSTaggerME.Train("en", CreateSampleStream(), parameters, new POSTaggerFactory());
        }
示例#33
0
        public void Setup()
        {
            // Train equivalent chunker models with the SharpNL and the Java
            // OpenNLP implementations for side-by-side comparison.
            var sParams = new TrainingParameters();
            sParams.Set(Parameters.Cutoff, "1");
            sParams.Set(Parameters.Iterations, "70");

            var jParams = new opennlp.tools.util.TrainingParameters();
            jParams.put("Cutoff", "1");
            jParams.put("Iterations", "70");

            var sModel = ChunkerME.Train("en", ChunkerMETest.CreateSampleStream(), sParams, new ChunkerFactory());
            var jModel = opennlp.tools.chunker.ChunkerME.train(
                "en", JavaSampleStream(), jParams, new opennlp.tools.chunker.ChunkerFactory());

            Assert.NotNull(sModel);
            Assert.NotNull(jModel);

            sChunker = new ChunkerME(sModel);
            jChunker = new opennlp.tools.chunker.ChunkerME(jModel);
        }
示例#34
0
        /// <summary>
        /// Trains a parser model with the given parameters.
        /// </summary>
        /// <param name="languageCode">The language code.</param>
        /// <param name="samples">The data samples.</param>
        /// <param name="rules">The head rules.</param>
        /// <param name="iterations">The number of training iterations.</param>
        /// <param name="cutoff">The min number of times a feature must be seen.</param>
        /// <param name="monitor">
        /// A evaluation monitor that can be used to listen the messages during the training or it can cancel the training operation.
        /// This argument can be a <c>null</c> value.
        /// </param>
        /// <returns>The trained <see cref="ParserModel"/> object.</returns>
        /// <exception cref="System.NotSupportedException">Trainer type is not supported.</exception>
        public static ParserModel Train(string languageCode, IObjectStream <Parse> samples, AbstractHeadRules rules, int iterations, int cutoff, Monitor monitor)
        {
            var cutoffValue     = cutoff.ToString(CultureInfo.InvariantCulture);
            var iterationsValue = iterations.ToString(CultureInfo.InvariantCulture);

            var param = new TrainingParameters();

            // The dictionary component only takes a cutoff.
            param.Set("dict", Parameters.Cutoff, cutoffValue);

            // Every trainable component shares the same cutoff and iteration count.
            foreach (var component in new[] { "tagger", "chunker", "check", "build" })
            {
                param.Set(component, Parameters.Cutoff, cutoffValue);
                param.Set(component, Parameters.Iterations, iterationsValue);
            }

            return Train(languageCode, samples, rules, param, monitor);
        }
示例#35
0
        /// <summary>
        /// Trains a name finder on the typed annotated corpus and verifies that
        /// the detected spans carry the "person" type.
        /// </summary>
        public void TestNameFinderWithTypes()
        {
            using (var file = Tests.OpenFile("opennlp/tools/namefind/AnnotatedSentencesWithTypes.txt")) {
                // Corpus is Latin-1 encoded.
                var sampleStream = new NameSampleStream(new PlainTextByLineStream(file, "ISO-8859-1"));

                var param = new TrainingParameters();
                param.Set(Parameters.Iterations, "70");
                param.Set(Parameters.Cutoff, "1");

                var model = NameFinderME.Train(
                    "en",
                    sampleStream,
                    param,
                    new TokenNameFinderFactory(null, new Dictionary <string, object>()));

                var nameFinder = new NameFinderME(model);

                // now test if it can detect the sample sentences

                var sentence = new[] { "Alisa", "appreciated", "the", "hint", "and", "enjoyed", "a", "delicious", "traditional", "meal." };
                var names    = nameFinder.Find(sentence);

                // One "person" span over token 0 ("Alisa"); the model must also
                // contain the "other" outcome.
                Assert.AreEqual(1, names.Length);
                Assert.AreEqual(new Span(0, 1, "person"), names[0]);
                Assert.True(HasOtherAsOutcome(model));

                sentence = new[] { "Hi", "Mike", ",", "it's", "Stefanie", "Schmidt", "." };
                names    = nameFinder.Find(sentence);

                // "Mike" (token 1) and "Stefanie Schmidt" (tokens 4-5), both typed "person".
                Assert.AreEqual(2, names.Length);
                Assert.AreEqual(new Span(1, 2, "person"), names[0]);
                Assert.AreEqual(new Span(4, 6, "person"), names[1]);
                Assert.AreEqual("person", names[0].Type);
                Assert.AreEqual("person", names[1].Type);
            }
        }
        /// <summary>
        /// Trains a perceptron with a step-size decrease on the prep-attach data set and
        /// checks the resulting model accuracy against the value this port produces.
        /// </summary>
        public void TestPerceptronOnPrepAttachDataWithStepSizeDecrease() {
            var parameters = new TrainingParameters();

            parameters.Set(Parameters.Algorithm, Parameters.Algorithms.Perceptron);
            parameters.Set(Parameters.Cutoff, "1");
            parameters.Set(Parameters.Iterations, "500");
            parameters.Set(Parameters.StepSizeDecrease, "0.06");

            var eventTrainer = TrainerFactory.GetEventTrainer(parameters, null, null);
            var model = eventTrainer.Train(PrepAttachDataUtility.CreateTrainingStream());

            // NOTE: the reference Java test fails against its own expected value too:
            //
            //   java.lang.AssertionError: expected:<0.7756870512503095> but was:<0.7766773953948998>
            //     at org.junit.Assert.fail(Assert.java:91)
            //     at org.junit.Assert.failNotEquals(Assert.java:645)
            //     at org.junit.Assert.assertEquals(Assert.java:441)
            //     at org.junit.Assert.assertEquals(Assert.java:510)
            //
            // so instead of the OpenNLP expectation (0.7756870512503095) we assert the
            // accuracy this implementation actually produces.
            PrepAttachDataUtility.TestModel(model, 0.77742015350334237);
        }
        /// <summary>
        /// Trains a sentence-detector model, evaluates it with the SharpNL implementation,
        /// then serializes it to a temp file and re-evaluates the same model through the
        /// Java OpenNLP implementation to verify cross-compatibility.
        /// </summary>
        public void TestEverything() {
            using (var file = Tests.OpenFile("/opennlp/tools/sentdetect/Sentences.txt")) {

                var mlParams = new TrainingParameters();

                mlParams.Set(Parameters.Iterations, "100");
                mlParams.Set(Parameters.Cutoff, "0");

                var sdFactory = new SentenceDetectorFactory("en", true, null, null);
                var stream = new SentenceSampleStream(new PlainTextByLineStream(file));

                var model = SentenceDetectorME.Train("en", stream, sdFactory, mlParams);

                Assert.AreEqual("en", model.Language);
                Assert.AreEqual(model.UseTokenEnd, true);

                var sMe = new SentenceDetectorME(model);

                // test the SharpNL sentences
                SentenceDetectorMETest.EvalSentences(sMe);

                var sFile = Path.GetTempFileName();

                // FIX: the FileStream was previously created inline and never disposed,
                // leaking the handle and risking an unflushed/locked model file before
                // the Java side tried to read it back.
                using (var fs = new FileStream(sFile, FileMode.Create)) {
                    model.Serialize(fs);
                }

                var jModel2 = new JavaModel(OpenNLP.CreateInputStream(sFile));

                var jMe = new JavaSDME(jModel2);

                // test the Java OpenNLP sentences.
                JavaEvalSentences(jMe);

                // first try?! Yes! ;-)

            }
        }
示例#38
0
        /// <summary>
        /// Trains a typed name finder from a corpus consisting only of names and verifies
        /// that consecutive person names are segmented into the expected spans.
        /// </summary>
        public void TestOnlyWithNamesWithTypes() {
            using (var file = Tests.OpenFile("opennlp/tools/namefind/OnlyWithNamesWithTypes.train")) {
                var sampleStream = new NameSampleStream(new PlainTextByLineStream(file));

                var param = new TrainingParameters();
                param.Set(Parameters.Iterations, "70");
                param.Set(Parameters.Cutoff, "1");

                var model = NameFinderME.Train(
                    "en",
                    sampleStream,
                    param,
                    new TokenNameFinderFactory(null, new Dictionary<string, object>()));

                var nameFinder = new NameFinderME(model);

                // now test if it can detect the sample sentences
                var sentence = WhitespaceTokenizer.Instance.Tokenize(
                    "Neil Abercrombie Anibal Acevedo-Vila Gary Ackerman Robert Aderholt " +
                    "Daniel Akaka Todd Akin Lamar Alexander Rodney Alexander");

                var names = nameFinder.Find(sentence);

                // FIX: assert the span count before indexing, matching the sibling tests;
                // previously a short result array would surface as IndexOutOfRangeException
                // instead of a clear assertion failure.
                Assert.GreaterOrEqual(names.Length, 3);
                Assert.AreEqual(new Span(0, 2, "person"), names[0]);
                Assert.AreEqual(new Span(2, 4, "person"), names[1]);
                Assert.AreEqual(new Span(4, 6, "person"), names[2]);
                // FIX: Assert.False instead of Assert.True(!...), consistent with
                // TestOnlyWithEntitiesWithTypes.
                Assert.False(HasOtherAsOutcome(model));
            }
        }
示例#39
0
        /// <summary>
        /// Trains a name finder from a corpus containing only typed entities and verifies
        /// that organization, location and person spans are all recognized.
        /// </summary>
        public void TestOnlyWithEntitiesWithTypes() {

            using (var file = Tests.OpenFile("opennlp/tools/namefind/OnlyWithEntitiesWithTypes.train")) {
                var samples = new NameSampleStream(new PlainTextByLineStream(file));

                var trainParams = new TrainingParameters();
                trainParams.Set(Parameters.Iterations, "70");
                trainParams.Set(Parameters.Cutoff, "1");

                var model = NameFinderME.Train(
                    "en",
                    samples,
                    trainParams,
                    new TokenNameFinderFactory(null, new Dictionary<string, object>()));

                var finder = new NameFinderME(model);

                // Tokenize the probe sentence and run detection over it.
                var tokens = WhitespaceTokenizer.Instance.Tokenize("NATO United States Barack Obama");
                var found = finder.Find(tokens);

                // One entity of each type, in order of appearance.
                Assert.AreEqual(new Span(0, 1, "organization"), found[0]);
                Assert.AreEqual(new Span(1, 3, "location"), found[1]);
                Assert.AreEqual(new Span(3, 5, "person"), found[2]);
                Assert.False(HasOtherAsOutcome(model));
            }
        }
示例#40
0
        /// <summary>
        /// Trains a name finder on the VOA corpus and verifies that multiple entity
        /// types (person, location, organization) are detected in two sample sentences.
        /// </summary>
        public void TestNameFinderWithMultipleTypes() {
            using (var file = Tests.OpenFile("opennlp/tools/namefind/voa1.train")) {
                var samples = new NameSampleStream(new PlainTextByLineStream(file));

                var trainParams = new TrainingParameters();
                trainParams.Set(Parameters.Iterations, "70");
                trainParams.Set(Parameters.Cutoff, "1");

                var model = NameFinderME.Train(
                    "en",
                    samples,
                    trainParams,
                    new TokenNameFinderFactory(null, new Dictionary<string, object>()));

                var finder = new NameFinderME(model);

                // First sample: locations and a person mixed in one sentence.
                var tokens = new [] { "U", ".", "S", ".", "President", "Barack", "Obama", "has", 
                    "arrived", "in", "South", "Korea", ",", "where", "he", "is", "expected", "to", 
                    "show", "solidarity", "with", "the", "country", "'", "s", "president", "in",
                    "demanding", "North", "Korea", "move", "toward", "ending", "its", "nuclear", 
                    "weapons", "programs", "." };

                var found = finder.Find(tokens);

                Assert.AreEqual(4, found.Length);
                Assert.AreEqual(new Span(0, 4, "location"), found[0]);
                Assert.AreEqual(new Span(5, 7, "person"), found[1]);
                Assert.AreEqual(new Span(10, 12, "location"), found[2]);
                Assert.AreEqual(new Span(28, 30, "location"), found[3]);

                // No separate Type assertions here: Span's equality comparer already
                // includes the type, so the AreEqual calls above cover it.

                // Second sample: a person followed by a long organization span.
                tokens = new[] {
                    "Scott", "Snyder", "is", "the", "director", "of", "the", 
                    "Center", "for", "U", ".", "S", ".", "Korea", "Policy", "."
                };

                found = finder.Find(tokens);

                Assert.AreEqual(2, found.Length);
                Assert.AreEqual(new Span(0, 2, "person"), found[0]);
                Assert.AreEqual(new Span(7, 15, "organization"), found[1]);
            }
        }           
示例#41
0
        /// <summary>
        /// Trains a tokenizer model, evaluates it with the SharpNL tokenizer, then
        /// serializes it and re-evaluates with the Java OpenNLP tokenizer, comparing
        /// per-token probabilities between the two implementations.
        /// </summary>
        public void TestCrossCompatibility() {
            using (var data = Tests.OpenFile("/opennlp/tools/tokenize/token.train")) {
                var samples = new TokenSampleStream(new PlainTextByLineStream(data));
                var mlParams = new TrainingParameters();
                mlParams.Set(Parameters.Iterations, "100");
                mlParams.Set(Parameters.Cutoff, "0");
                var model = TokenizerME.Train(samples, new TokenizerFactory("en", null, true), mlParams);

                var sMe = new TokenizerME(model);

                TokenizerMETest.TestTokenizer(sMe);

                var sProbs = sMe.TokenProbabilities;

                // --- java \/

                var sFile = Path.GetTempFileName();

                // FIX: the FileStream was previously created inline and never disposed,
                // leaking the handle and risking an unflushed/locked model file before
                // the Java side tried to read it back.
                using (var fs = new FileStream(sFile, FileMode.Create)) {
                    model.Serialize(fs);
                }

                var jModel = new opennlp.tools.tokenize.TokenizerModel(
                    OpenNLP.CreateInputStream(sFile) 
                );

                var jMe = new opennlp.tools.tokenize.TokenizerME(jModel);

                TestJavaTokenizer(jMe);

                var jProbs = jMe.getTokenProbabilities();

                Assert.AreEqual(jProbs.Length, sProbs.Length);

                for (int i = 0; i < jProbs.Length; i++) {

                    // one difference :(
                    // -0.00000000000000011102230246251565
                    //
                    // but still "insignificant" :)
                    Assert.AreEqual(jProbs[i], sProbs[i], 0.0000000001d);
                }
            }
        }
示例#42
0
        /// <summary>
        /// Trains a parser model with the given parameters.
        /// </summary>
        /// <param name="languageCode">The language code.</param>
        /// <param name="samples">The data samples.</param>
        /// <param name="rules">The head rules.</param>
        /// <param name="iterations">The number of training iterations.</param>
        /// <param name="cutoff">The min number of times a feature must be seen.</param>
        /// <param name="monitor">
        /// A evaluation monitor that can be used to listen the messages during the training or it can cancel the training operation.
        /// This argument can be a <c>null</c> value.
        /// </param>
        /// <returns>The trained <see cref="ParserModel"/> object.</returns>
        /// <exception cref="System.NotSupportedException">Trainer type is not supported.</exception>
        public static ParserModel Train(string languageCode, IObjectStream<Parse> samples, AbstractHeadRules rules, int iterations, int cutoff, Monitor monitor) {

            // Convert once; both values are reused for every component below.
            var cutoffValue = cutoff.ToString(CultureInfo.InvariantCulture);
            var iterationsValue = iterations.ToString(CultureInfo.InvariantCulture);

            var param = new TrainingParameters();

            // The dictionary component only honors the cutoff setting.
            param.Set("dict", Parameters.Cutoff, cutoffValue);

            // Each trainable component gets both the cutoff and the iteration count.
            foreach (var component in new[] { "tagger", "chunker", "check", "build" }) {
                param.Set(component, Parameters.Cutoff, cutoffValue);
                param.Set(component, Parameters.Iterations, iterationsValue);
            }

            return Train(languageCode, samples, rules, param, monitor);
        }
 /// <summary>
 /// Verifies that a trainer registered under a custom algorithm name ("Dummy",
 /// registered in Setup) is accepted as valid by the <see cref="TrainerFactory"/>.
 /// </summary>
 public void TestCustomTrainer()
 {
     // Override the algorithm configured in Setup with the custom one.
     mlParams.Set(Parameters.Algorithm, "Dummy");

     Assert.True(TrainerFactory.IsValid(mlParams));
 }