Exemplo n.º 1
0
        /// <summary>
        /// Analyzes <c>SENTENCE</c> with a chain of OpenNLP tokenizer, POS filter and
        /// lemmatizer filter, where the lemmatizer is configured with a dictionary only
        /// (no lemmatizer model), and checks the result against
        /// <c>SENTENCE_dict_punc</c> and <c>SENTENCE_posTags</c>.
        /// </summary>
        public void Test1SentenceDictionaryOnly()
        {
            Analyzer analyzer = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
            {
                var resources = new ClasspathResourceLoader(GetType());

                // Stage 1: tokenizer driven by the tokenizer and sentence models.
                var tokenizerArgs = new Dictionary<string, string>
                {
                    { "tokenizerModel", tokenizerModelFile },
                    { "sentenceModel", sentenceModelFile }
                };
                var tokenizerFactory = new OpenNLPTokenizerFactory(tokenizerArgs);
                tokenizerFactory.Inform(resources);
                var tokenizer = tokenizerFactory.Create(reader);

                // Stage 2: part-of-speech filter on top of the tokenizer.
                var posArgs = new Dictionary<string, string>
                {
                    { "posTaggerModel", posTaggerModelFile }
                };
                var posFactory = new OpenNLPPOSFilterFactory(posArgs);
                posFactory.Inform(resources);
                var posTagged = posFactory.Create(tokenizer);

                // Stage 3: lemmatizer filter configured with the dictionary only.
                var lemmatizerArgs = new Dictionary<string, string>
                {
                    { "dictionary", lemmatizerDictFile }
                };
                var lemmatizerFactory = new OpenNLPLemmatizerFilterFactory(lemmatizerArgs);
                lemmatizerFactory.Inform(resources);
                var lemmatized = lemmatizerFactory.Create(posTagged);

                return new TokenStreamComponents(tokenizer, lemmatized);
            });

            // Java-style CustomAnalyzer builder form of the same chain, kept for reference:
            //CustomAnalyzer analyzer = CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
            //.withTokenizer("opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
            //.addTokenFilter("opennlpPOS", "posTaggerModel", "en-test-pos-maxent.bin")
            //.addTokenFilter("opennlplemmatizer", "dictionary", "en-test-lemmas.dict")
            //.build();
            AssertAnalyzesTo(
                analyzer,
                SENTENCE,
                SENTENCE_dict_punc,
                null,
                null,
                SENTENCE_posTags,
                null,
                null,
                true);
        }
Exemplo n.º 2
0
        /// <summary>
        /// Analyzes <c>SENTENCES_both</c> with a chain of OpenNLP tokenizer, POS filter,
        /// keyword-repeat filter, lemmatizer filter (configured with both a dictionary and
        /// a lemmatizer model) and remove-duplicates filter, then checks the result against
        /// <c>SENTENCES_both_keep_orig_punc</c> and <c>SENTENCES_both_keep_orig_posTags</c>.
        /// </summary>
        public void TestKeywordAttributeAwarenessDictionaryAndMaxEnt()
        {
            Analyzer analyzer = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
            {
                var resources = new ClasspathResourceLoader(GetType());

                // Stage 1: tokenizer driven by the tokenizer and sentence models.
                var tokenizerArgs = new Dictionary<string, string>
                {
                    { "tokenizerModel", tokenizerModelFile },
                    { "sentenceModel", sentenceModelFile }
                };
                var tokenizerFactory = new OpenNLPTokenizerFactory(tokenizerArgs);
                tokenizerFactory.Inform(resources);
                var tokenizer = tokenizerFactory.Create(reader);

                // Stage 2: part-of-speech filter on top of the tokenizer.
                var posArgs = new Dictionary<string, string>
                {
                    { "posTaggerModel", posTaggerModelFile }
                };
                var posFactory = new OpenNLPPOSFilterFactory(posArgs);
                posFactory.Inform(resources);
                var posTagged = posFactory.Create(tokenizer);

                // Stage 3: keyword-repeat filter; its factory takes no arguments here.
                var repeatFactory = new KeywordRepeatFilterFactory(new Dictionary<string, string>());
                var repeated = repeatFactory.Create(posTagged);

                // Stage 4: lemmatizer filter configured with dictionary and model.
                var lemmatizerArgs = new Dictionary<string, string>
                {
                    { "dictionary", lemmatizerDictFile },
                    { "lemmatizerModel", lemmatizerModelFile }
                };
                var lemmatizerFactory = new OpenNLPLemmatizerFilterFactory(lemmatizerArgs);
                lemmatizerFactory.Inform(resources);
                var lemmatized = lemmatizerFactory.Create(repeated);

                // Stage 5: remove-duplicates filter closes the chain.
                var dedupeFactory = new RemoveDuplicatesTokenFilterFactory(new Dictionary<string, string>());
                var deduplicated = dedupeFactory.Create(lemmatized);

                return new TokenStreamComponents(tokenizer, deduplicated);
            });

            // Java-style CustomAnalyzer builder form of the same chain, kept for reference:
            //CustomAnalyzer analyzer = CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
            //    .withTokenizer("opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
            //    .addTokenFilter("opennlpPOS", "posTaggerModel", posTaggerModelFile)
            //    .addTokenFilter(KeywordRepeatFilterFactory.class)
            //    .addTokenFilter("opennlplemmatizer", "dictionary", lemmatizerDictFile, "lemmatizerModel", lemmatizerModelFile)
            //    .addTokenFilter(RemoveDuplicatesTokenFilterFactory.class)
            //    .build();
            AssertAnalyzesTo(
                analyzer,
                SENTENCES_both,
                SENTENCES_both_keep_orig_punc,
                null,
                null,
                SENTENCES_both_keep_orig_posTags,
                null,
                null,
                true);
        }