// Feeds one fixed English paragraph to both the Java (JavaSDME) and the SharpNL
// (SharpSDME) sentence detectors and lets TestSentences compare their results.
public void TestSentenceDetect()
        {
            // The sample paragraph, written out one sentence per literal.
            const string paragraph =
                "The Apache OpenNLP library is a machine learning based toolkit for the processing of natural language text. " +
                "It supports the most common NLP tasks, such as tokenization, sentence segmentation, part-of-speech tagging, named entity extraction, chunking, parsing, and coreference resolution. " +
                "These tasks are usually required to build more advanced text processing services.";

            // Arguments are evaluated left to right, so the Java detector is still
            // constructed before the SharpNL one.
            TestSentences(
                new JavaSDME(OpenJavaModel()),
                new SharpSDME(OpenSharpModel()),
                paragraph);

            // Original author's remark (Knuppe): he did not expect the probabilities
            // to come out identical down to double precision.
        }
Пример #2
0
        // Runs both detectors over the same text and asserts that they agree on the
        // number of sentences, the number of probabilities, and - for every sentence -
        // the span start, span end, covered text and probability.
        // The Java result is passed as the expected value and the SharpNL result as
        // the actual value.
        private static void TestSentences(JavaSDME javaMe, SharpSDME sharpME, string data) {
            var javaSpans = javaMe.sentPosDetect(data);
            var sharpSpans = sharpME.SentPosDetect(data);

            var javaProbs = javaMe.getSentenceProbabilities();
            var sharpProbs = sharpME.GetSentenceProbabilities();

            Assert.AreEqual(javaSpans.Length, sharpSpans.Length);
            Assert.AreEqual(javaProbs.Length, sharpProbs.Length);

            var index = 0;
            while (index < javaSpans.Length) {
                var javaSpan = javaSpans[index];
                var sharpSpan = sharpSpans[index];

                // The Java side returns a CharSequence; toString() turns it into a string.
                var javaText = javaSpan.getCoveredText(data).toString();
                var sharpText = sharpSpan.GetCoveredText(data);

                Assert.AreEqual(javaSpan.getStart(), sharpSpan.Start);
                Assert.AreEqual(javaSpan.getEnd(), sharpSpan.End);

                Assert.AreEqual(javaText, sharpText);
                Assert.AreEqual(javaProbs[index], sharpProbs[index]);

                index++;
            }
        }
        // Side-by-side check of the Java and SharpNL sentence detectors on one input:
        // both must produce the same number of spans and probabilities, and each pair
        // of spans must match in start, end, covered text and probability.
        private static void TestSentences(JavaSDME javaMe, SharpSDME sharpME, string data)
        {
            var expectedSpans = javaMe.sentPosDetect(data);
            var actualSpans = sharpME.SentPosDetect(data);

            var expectedProbs = javaMe.getSentenceProbabilities();
            var actualProbs = sharpME.GetSentenceProbabilities();

            Assert.AreEqual(expectedSpans.Length, actualSpans.Length);
            Assert.AreEqual(expectedProbs.Length, actualProbs.Length);

            // Walk the Java spans and keep a running position to address the
            // matching SharpNL span and the two probability arrays.
            var position = 0;
            foreach (var expectedSpan in expectedSpans)
            {
                var actualSpan = actualSpans[position];

                // CharSequence -> string on the Java side.
                var expectedText = expectedSpan.getCoveredText(data).toString();
                var actualText = actualSpan.GetCoveredText(data);

                Assert.AreEqual(expectedSpan.getStart(), actualSpan.Start);
                Assert.AreEqual(expectedSpan.getEnd(), actualSpan.End);

                Assert.AreEqual(expectedText, actualText);
                Assert.AreEqual(expectedProbs[position], actualProbs[position]);

                position++;
            }
        }
        // Trains an "nl" sentence detector on a small in-memory sample, passing a
        // dictionary of strings to the factory, then runs detection over the same
        // sample text and expects the eight non-empty sample lines back.
        public void AbbreviationDefaultBehaviorTest() {

            // Every sample line is followed by exactly one NewLine. The empty entries
            // yield the blank line in the middle and the trailing NewLine at the end.
            var samples = string.Join(Environment.NewLine, new[] {
                "Test E-mail met zowel letsel als 12. Toedracht in het onderwerp.",
                "Dit is een 2e regel met een tel. 011-4441444 erin.",
                "Dit is een 2e regel.",
                "Dit is een 2e regel.",
                "",
                "Dit is een 2e regel met een tel. 033-1333123 erin!",
                "Test E-mail met zowel winst als 12. toedracht in het onderwerp.",
                "Dit is een 2e regel!",
                "Dit is een 2e regel.",
                ""
            });

            var abbreviations = new SharpNL.Dictionary.Dictionary(false) { "12. Toedracht", "Tel." };

            var mlParams = new TrainingParameters();
            mlParams.Set(Parameters.Algorithm, "MAXENT");
            mlParams.Set(Parameters.TrainerType, "Event");
            mlParams.Set(Parameters.Iterations, "100");
            mlParams.Set(Parameters.Cutoff, "5");

            // End-of-sentence characters handed to the factory.
            var factory = new SentenceDetectorFactory("nl", true, abbreviations, new[] { '.', '?', '!' });
            var sampleStream = new SentenceSampleStream(new PlainTextByLineStream(new StringReader(samples)));

            var model = SentenceDetectorME.Train("nl", sampleStream, factory, mlParams);
            var detector = new SentenceDetectorME(model);

            var detected = detector.SentDetect(samples);
            var expected = samples.Split(new[] { Environment.NewLine }, StringSplitOptions.RemoveEmptyEntries);

            Assert.AreEqual(8, detected.Length);

            // Each detected sentence must equal the sample line at the same position.
            var index = 0;
            foreach (var sentence in detected) {
                Assert.AreEqual(expected[index], sentence);
                index++;
            }
        }
        // Exercises the given sentence detector with a fixed set of English inputs
        // (plain sentences, quoted sentences, leading/trailing whitespace, empty and
        // whitespace-only text) and asserts the detected sentences, the number of
        // sentence probabilities and, at the end, the spans from SentPosDetect.
        //
        // Every Assert.AreEqual call passes the expected value first and the actual
        // value second, so a failure message reports the two values the right way round.
        internal static void EvalSentences(SentenceDetectorME sentDetect) {
            // Two sentences, both terminated by a period.
            const string sampleSentences1 = "This is a test. There are many tests, this is the second.";
            var sents = sentDetect.SentDetect(sampleSentences1);
            Assert.AreEqual(2, sents.Length);
            Assert.AreEqual("This is a test.", sents[0]);
            Assert.AreEqual("There are many tests, this is the second.", sents[1]);
            var probs = sentDetect.GetSentenceProbabilities();
            Assert.AreEqual(2, probs.Length);

            // Same text, but the last sentence has no terminating period.
            const string sampleSentences2 = "This is a test. There are many tests, this is the second";
            sents = sentDetect.SentDetect(sampleSentences2);
            Assert.AreEqual(2, sents.Length);
            probs = sentDetect.GetSentenceProbabilities();
            Assert.AreEqual(2, probs.Length);
            Assert.AreEqual("This is a test.", sents[0]);
            Assert.AreEqual("There are many tests, this is the second", sents[1]);

            // Quoted text, with a period inside the closing quote.
            const string sampleSentences3 = "This is a \"test\". He said \"There are many tests, this is the second.\"";
            sents = sentDetect.SentDetect(sampleSentences3);
            Assert.AreEqual(2, sents.Length);
            probs = sentDetect.GetSentenceProbabilities();
            Assert.AreEqual(2, probs.Length);
            Assert.AreEqual("This is a \"test\".", sents[0]);
            Assert.AreEqual("He said \"There are many tests, this is the second.\"", sents[1]);

            // Three sentences; the last one ends with a question mark.
            const string sampleSentences4 = "This is a \"test\". I said \"This is a test.\"  Any questions?";
            sents = sentDetect.SentDetect(sampleSentences4);
            Assert.AreEqual(3, sents.Length);
            probs = sentDetect.GetSentenceProbabilities();
            Assert.AreEqual(3, probs.Length);
            Assert.AreEqual("This is a \"test\".", sents[0]);
            Assert.AreEqual("I said \"This is a test.\"", sents[1]);
            Assert.AreEqual("Any questions?", sents[2]);

            // Trailing spaces must not end up in the detected sentence.
            const string sampleSentences5 = "This is a one sentence test space at the end.    ";
            sents = sentDetect.SentDetect(sampleSentences5);
            Assert.AreEqual(1, sentDetect.GetSentenceProbabilities().Length);
            Assert.AreEqual("This is a one sentence test space at the end.", sents[0]);

            // NOTE(review): the text says "tab", but the literal as seen here contains
            // only spaces - confirm whether tabs were intended.
            const string sampleSentences6 = "This is a one sentences test with tab at the end.            ";
            sents = sentDetect.SentDetect(sampleSentences6);
            Assert.AreEqual("This is a one sentences test with tab at the end.", sents[0]);

            // Several spaces between two sentences.
            const string sampleSentences7 = "This is a test.    With spaces between the two sentences.";
            sents = sentDetect.SentDetect(sampleSentences7);
            Assert.AreEqual("This is a test.", sents[0]);
            Assert.AreEqual("With spaces between the two sentences.", sents[1]);

            // Empty input yields no sentences.
            const string sampleSentences9 = "";
            sents = sentDetect.SentDetect(sampleSentences9);
            Assert.AreEqual(0, sents.Length);

            const string sampleSentences10 = "               "; // whitespaces and tabs
            sents = sentDetect.SentDetect(sampleSentences10);
            Assert.AreEqual(0, sents.Length);

            // No end-of-sentence character, only trailing spaces.
            const string sampleSentences11 = "This is test sentence without a dot at the end and spaces          ";
            sents = sentDetect.SentDetect(sampleSentences11);
            Assert.AreEqual("This is test sentence without a dot at the end and spaces", sents[0]);
            probs = sentDetect.GetSentenceProbabilities();
            Assert.AreEqual(1, probs.Length);

            // Leading whitespace must not end up in the detected sentence.
            const string sampleSentence12 = "    This is a test.";
            sents = sentDetect.SentDetect(sampleSentence12);
            Assert.AreEqual("This is a test.", sents[0]);

            const string sampleSentence13 = " This is a test";
            sents = sentDetect.SentDetect(sampleSentence13);
            Assert.AreEqual("This is a test", sents[0]);

            // Test that sentPosDetect also works
            var pos = sentDetect.SentPosDetect(sampleSentences2);
            Assert.AreEqual(2, pos.Length);
            probs = sentDetect.GetSentenceProbabilities();
            Assert.AreEqual(2, probs.Length);
            Assert.AreEqual(new Span(0, 15), pos[0]);
            Assert.AreEqual(new Span(16, 56), pos[1]);
        }
Пример #6
0
        // Builds a Java detector and a SharpNL detector from their respective models
        // and hands both, together with a fixed sample paragraph, to TestSentences.
        public void TestSentenceDetect() {

            const string sampleText =
                "The Apache OpenNLP library is a machine learning based toolkit " +
                "for the processing of natural language text. " +
                "It supports the most common NLP tasks, such as tokenization, " +
                "sentence segmentation, part-of-speech tagging, named entity extraction, " +
                "chunking, parsing, and coreference resolution. " +
                "These tasks are usually required to build more advanced text processing services.";

            var javaDetector = new JavaSDME(OpenJavaModel());
            var sharpDetector = new SharpSDME(OpenSharpModel());

            TestSentences(javaDetector, sharpDetector, sampleText);

            // Remark kept from the original author (Knuppe): matching the probability
            // at full double precision was not something he expected.
        }
Пример #7
0
        // Round-trip test: trains an "en" sentence model from Sentences.txt with
        // SharpNL, evaluates it with the SharpNL detector, serializes it to a temp
        // file, loads that file as a Java model and evaluates it with the Java detector.
        public void TestEverything() {
            using (var file = Tests.OpenFile("/opennlp/tools/sentdetect/Sentences.txt")) {

                var mlParams = new TrainingParameters();

                mlParams.Set(Parameters.Iterations, "100");
                mlParams.Set(Parameters.Cutoff, "0");

                var sdFactory = new SentenceDetectorFactory("en", true, null, null);
                var stream = new SentenceSampleStream(new PlainTextByLineStream(file));

                var model = SentenceDetectorME.Train("en", stream, sdFactory, mlParams);

                Assert.AreEqual("en", model.Language);
                // Expected value first, so a failure message reads correctly.
                Assert.AreEqual(true, model.UseTokenEnd);

                var sMe = new SentenceDetectorME(model);
                
                // test the SharpNL sentences
                SentenceDetectorMETest.EvalSentences(sMe);

                var sFile = Path.GetTempFileName();

                // Dispose the FileStream before the Java side opens the file, so the
                // serialized model is fully flushed and the handle is released.
                using (var output = new FileStream(sFile, FileMode.Create)) {
                    model.Serialize(output);
                }

                // NOTE(review): the temp file is never deleted here; removing it would
                // need the Java input stream to be closed first - confirm before adding.
                var jModel2 = new JavaModel(OpenNLP.CreateInputStream(sFile));

                var jMe = new JavaSDME(jModel2);

                // test the Java OpenNLP sentences.
                JavaEvalSentences(jMe);

                // first try?! Yes! ;-)

            }
        }