        public List<string> Tokenize() {
            // Load the pre-trained English tokenizer model from the base folder.
            using (var modelStream = File.OpenRead(BaseFolder + "en-token.bin")) {
                var model = new TokenizerModel(modelStream);
                var tokenizer = new TokenizerME(model);

                // Split the current text into tokens with the maximum-entropy tokenizer.
                return tokenizer.Tokenize(this.Text).ToList();
            }
        }
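For reference, a minimal standalone sketch of the same pattern: open a tokenizer model file, wrap it in a TokenizerME and tokenize a sentence. The method name, model path and sample sentence are illustrative; the calls themselves (TokenizerModel, TokenizerME, Tokenize) are the ones used above, and the usual System.IO / SharpNL using directives are assumed.

        // Sketch only: load a tokenizer model from an arbitrary path and split one sentence.
        public static string[] TokenizeSentence(string modelPath, string sentence) {
            using (var modelStream = File.OpenRead(modelPath)) {
                var model = new TokenizerModel(modelStream);
                var tokenizer = new TokenizerME(model);

                return tokenizer.Tokenize(sentence);
            }
        }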
        public void TestTokenizerSimpleModel() {
            var model = TokenizerTestUtil.CreateMaxentTokenModel();
            var tokenizer = new TokenizerME(model);
            var tokens = tokenizer.Tokenize("test,");

            Assert.AreEqual(2, tokens.Length);
            Assert.AreEqual("test", tokens[0]);
            Assert.AreEqual(",", tokens[1]);
        }
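TokenizerME also reports a probability for each token it produced (the cross-compatibility test below compares these against the Java implementation). Below is a hedged sketch of reading them right after a call to Tokenize, reusing the maxent test model helper from the test above; the Console output and the method name are illustrative.

        // Sketch only: tokenize a string and print the confidence of each tokenization decision.
        public void PrintTokenProbabilities() {
            var model = TokenizerTestUtil.CreateMaxentTokenModel();
            var tokenizer = new TokenizerME(model);

            var tokens = tokenizer.Tokenize("test,");
            var probs = tokenizer.TokenProbabilities; // probabilities for the most recent Tokenize call

            for (int i = 0; i < tokens.Length; i++)
                Console.WriteLine("{0} -> {1}", tokens[i], probs[i]);
        }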
Example #3
        public void TestCrossCompatibility() {
            using (var data = Tests.OpenFile("/opennlp/tools/tokenize/token.train")) {
                var samples = new TokenSampleStream(new PlainTextByLineStream(data));
                var mlParams = new TrainingParameters();
                mlParams.Set(Parameters.Iterations, "100");
                mlParams.Set(Parameters.Cutoff, "0");
                var model = TokenizerME.Train(samples, new TokenizerFactory("en", null, true), mlParams);

                var sMe = new TokenizerME(model);

                TokenizerMETest.TestTokenizer(sMe);

                var sProbs = sMe.TokenProbabilities;

                // --- Java side: serialize the SharpNL model to a temporary file and
                // reload it with the original Java OpenNLP tokenizer for comparison.

                var sFile = Path.GetTempFileName();

                // Close the stream so the file is fully written before the Java side reads it.
                using (var sStream = new FileStream(sFile, FileMode.Create)) {
                    model.Serialize(sStream);
                }

                var jModel = new opennlp.tools.tokenize.TokenizerModel(
                    OpenNLP.CreateInputStream(sFile) 
                );

                var jMe = new opennlp.tools.tokenize.TokenizerME(jModel);

                TestJavaTokenizer(jMe);

                var jProbs = jMe.getTokenProbabilities();

                Assert.AreEqual(jProbs.Length, sProbs.Length);

                for (int i = 0; i < jProbs.Length; i++) {

                    // one difference :(
                    // -0.00000000000000011102230246251565
                    //
                    // but still "insignificant" :)
                    Assert.AreEqual(jProbs[i], sProbs[i], 0.0000000001d);
                }
            }
        }
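The training and serialization steps above can be condensed into a small helper. The following is a sketch under the assumption that the training file uses the same token.train format; the file paths and the method name are illustrative, while the individual calls are exactly the ones used in TestCrossCompatibility.

        // Sketch only: train a maxent tokenizer from a training file and persist the model.
        public static void TrainAndSave(string trainFile, string modelFile) {
            using (var data = File.OpenRead(trainFile)) {
                var samples = new TokenSampleStream(new PlainTextByLineStream(data));

                var mlParams = new TrainingParameters();
                mlParams.Set(Parameters.Iterations, "100");
                mlParams.Set(Parameters.Cutoff, "0");

                var model = TokenizerME.Train(samples, new TokenizerFactory("en", null, true), mlParams);

                // The saved file can be loaded again with new TokenizerModel(stream), as in Tokenize() above.
                using (var output = new FileStream(modelFile, FileMode.Create))
                    model.Serialize(output);
            }
        }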
        public void TestTokenizer() {
            var model = TokenizerTestUtil.CreateMaxentTokenModel();
            var tokenizer = new TokenizerME(model);

            TestTokenizer(tokenizer);
        }