Beispiel #1
0
        public void Tokenization()
        {
            using var modelIn = new java.io.FileInputStream(GetModel("en-token.bin"));

            var model     = new opennlp.tools.tokenize.TokenizerModel(modelIn);
            var tokenizer = new opennlp.tools.tokenize.TokenizerME(model);

            var tokens = tokenizer.tokenize("An input sample sentence.");

            System.Console.WriteLine(string.Join(";", tokens));

            Assert.AreEqual(5, tokens.Length);
        }
Beispiel #2
0
        public void TestCrossCompatibility()
        {
            using (var data = Tests.OpenFile("/opennlp/tools/tokenize/token.train")) {
                var samples  = new TokenSampleStream(new PlainTextByLineStream(data));
                var mlParams = new TrainingParameters();
                mlParams.Set(Parameters.Iterations, "100");
                mlParams.Set(Parameters.Cutoff, "0");
                var model = TokenizerME.Train(samples, new TokenizerFactory("en", null, true), mlParams);

                var sMe = new TokenizerME(model);

                TokenizerMETest.TestTokenizer(sMe);

                var sProbs = sMe.TokenProbabilities;

                // --- java \/

                var sFile = Path.GetTempFileName();

                model.Serialize(new FileStream(sFile, FileMode.Create));

                var jModel = new opennlp.tools.tokenize.TokenizerModel(
                    OpenNLP.CreateInputStream(sFile)
                    );

                var jMe = new opennlp.tools.tokenize.TokenizerME(jModel);

                TestJavaTokenizer(jMe);

                var jProbs = jMe.getTokenProbabilities();

                Assert.AreEqual(jProbs.Length, sProbs.Length);

                for (int i = 0; i < jProbs.Length; i++)
                {
                    // one difference :(
                    // -0.00000000000000011102230246251565
                    //
                    // but still "insignificant" :)
                    Assert.AreEqual(jProbs[i], sProbs[i], 0.0000000001d);
                }
            }
        }
Beispiel #3
0
        public void TestCrossCompatibility() {
            using (var data = Tests.OpenFile("/opennlp/tools/tokenize/token.train")) {
                var samples = new TokenSampleStream(new PlainTextByLineStream(data));
                var mlParams = new TrainingParameters();
                mlParams.Set(Parameters.Iterations, "100");
                mlParams.Set(Parameters.Cutoff, "0");
                var model = TokenizerME.Train(samples, new TokenizerFactory("en", null, true), mlParams);

                var sMe = new TokenizerME(model);

                TokenizerMETest.TestTokenizer(sMe);

                var sProbs = sMe.TokenProbabilities;

                // --- java \/

                var sFile = Path.GetTempFileName();

                model.Serialize(new FileStream(sFile, FileMode.Create));

                var jModel = new opennlp.tools.tokenize.TokenizerModel(
                    OpenNLP.CreateInputStream(sFile) 
                );

                var jMe = new opennlp.tools.tokenize.TokenizerME(jModel);

                TestJavaTokenizer(jMe);

                var jProbs = jMe.getTokenProbabilities();

                Assert.AreEqual(jProbs.Length, sProbs.Length);

                for (int i = 0; i < jProbs.Length; i++) {

                    // one difference :(
                    // -0.00000000000000011102230246251565
                    //
                    // but still "insignificant" :)
                    Assert.AreEqual(jProbs[i], sProbs[i], 0.0000000001d);
                }
            }
        }
Beispiel #4
0
        // Constructors and finalizers:
        private Repository()
        {
            _assemblyName = Regex.Match(_assemblyFullName, "^(.*?),.*$").Result("$1");

            _rootDrive = ("/usr/project/xtmp/dp195/Poetix18/").Replace(@"\", Dsc);
            _nlpFolder = ("rhetorica/nlp/").Replace(@"\", Dsc);

            _openNlpModelsFolder = ("OpenNLP/models/").Replace(@"\", Dsc);
            _openNlpModelsPath   = RootDrive + _nlpFolder + _openNlpModelsFolder;

            _wordNetFolder = ("WordNet_3/").Replace(@"\", Dsc);
            _wordNetPath   = RootDrive + _nlpFolder + _wordNetFolder;

            _grammarFolder = ("StanfordParser/grammar/").Replace(@"\", Dsc);
            _grammarPath   = RootDrive + _nlpFolder + _grammarFolder;

            _dataFolder   = ("data/").Replace(@"\", Dsc);
            _nlpTextsPath = RootDrive + _dataFolder;

            string[] localTextDirectoryParts =
            {
                CurrentAssemblyDirectoryPath,
                "..",                        "..","..", "data"
                //"..", "..", "text"
            };
            _localTextPath = Path.Combine(localTextDirectoryParts) + "/"; // For development use

            // WordNet engine:
            Console.Write("Loading WordNet engine.... ");
            _wordNetEngine = new WordNetEngine(WordNetPath, true);
            Console.WriteLine("Done.");

            // OpenNLP sentence detector:
            Console.Write("Loading OpenNLP sentence detector.... ");
            java.io.FileInputStream modelInputStream = new java.io.FileInputStream(OpenNlpModelsPath + "en-sent.bin");
            _sentenceModel = new SentenceModel(modelInputStream);
            modelInputStream.close();
            _sentenceDetector = new SentenceDetectorME(_sentenceModel);
            Console.WriteLine("Done.");

            // OpenNLP tokenizer:
            Console.Write("Loading OpenNLP tokenizer.... ");
            modelInputStream = new java.io.FileInputStream(OpenNlpModelsPath + "en-token.bin");
            _tokenizerModel  = new opennlp.tools.tokenize.TokenizerModel(modelInputStream);
            modelInputStream.close();
            _tokenizer = new opennlp.tools.tokenize.TokenizerME(_tokenizerModel);
            Console.WriteLine("Done.");

            // OpenNLP name finder:
            Console.Write("Loading OpenNLP name finder.... ");
            modelInputStream      = new java.io.FileInputStream(OpenNlpModelsPath + "en-ner-person.bin");
            _tokenNameFinderModel = new TokenNameFinderModel(modelInputStream);
            modelInputStream.close();
            _nameFinder = new NameFinderME(_tokenNameFinderModel);
            Console.WriteLine("Done.");

            // OpenNLP POS tagger:
            Console.Write("Loading OpenNLP POS tagger.... ");
            modelInputStream = new java.io.FileInputStream(OpenNlpModelsPath + "en-pos-maxent.bin");
            _posModel        = new POSModel(modelInputStream);
            modelInputStream.close();
            _tagger = new POSTaggerME(_posModel);
            Console.WriteLine("Done.");

            // OpenNLP chunker:
            Console.Write("Loading OpenNLP chunker.... ");
            modelInputStream = new java.io.FileInputStream(OpenNlpModelsPath + "en-chunker.bin");
            _chunkerModel    = new ChunkerModel(modelInputStream);
            modelInputStream.close();
            _chunker = new ChunkerME(_chunkerModel);
            Console.WriteLine("Done.");

            // OpenNLP parser:
            if (_loadParser)
            {
                Console.Write("Loading OpenNLP parser.... ");
                modelInputStream = new java.io.FileInputStream(OpenNlpModelsPath + "en-parser-chunking.bin");
                _parserModel     = new ParserModel(modelInputStream);
                modelInputStream.close();
                _parser = ParserFactory.create(_parserModel);
                Console.WriteLine("Done.");
            }

            // Stanford parser:
            //_stanfordParser = new LexicalizedParser(GrammarPath + "englishPCFG.ser.gz"); // Obsolete method
            _stanfordParser = LexicalizedParser.loadModel(GrammarPath + "englishPCFG.ser.gz");

            // Porter stemmer:
            _porterStemmer = new PorterStemmer();
        }