/// <summary>
/// Analyzes a single sentence through the OpenNLP tokenizer, POS filter, and
/// dictionary-only lemmatizer, and checks tokens and POS tags against the
/// expected fixtures.
/// </summary>
public void Test1SentenceDictionaryOnly()
{
    Analyzer analyzer = Analyzer.NewAnonymous(createComponents: (fieldname, reader) =>
    {
        var resourceLoader = new ClasspathResourceLoader(GetType());

        // Tokenizer: sentence detection + tokenization from the test models.
        var tokenizerFactory = new OpenNLPTokenizerFactory(new Dictionary<string, string>
        {
            { "tokenizerModel", tokenizerModelFile },
            { "sentenceModel", sentenceModelFile }
        });
        tokenizerFactory.Inform(resourceLoader);
        var tokenizer = tokenizerFactory.Create(reader);

        // POS tagging stage.
        var posFilterFactory = new OpenNLPPOSFilterFactory(new Dictionary<string, string>
        {
            { "posTaggerModel", posTaggerModelFile }
        });
        posFilterFactory.Inform(resourceLoader);
        var posFilter = posFilterFactory.Create(tokenizer);

        // Lemmatization stage, dictionary-based only (no maxent model).
        var lemmatizerFactory = new OpenNLPLemmatizerFilterFactory(new Dictionary<string, string>
        {
            { "dictionary", lemmatizerDictFile }
        });
        lemmatizerFactory.Inform(resourceLoader);
        var lemmatizerFilter = lemmatizerFactory.Create(posFilter);

        return new TokenStreamComponents(tokenizer, lemmatizerFilter);
    });

    // Upstream Lucene equivalent for reference:
    //CustomAnalyzer analyzer = CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
    //.withTokenizer("opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
    //.addTokenFilter("opennlpPOS", "posTaggerModel", "en-test-pos-maxent.bin")
    //.addTokenFilter("opennlplemmatizer", "dictionary", "en-test-lemmas.dict")
    //.build();

    AssertAnalyzesTo(analyzer, SENTENCE, SENTENCE_dict_punc, null, null,
        SENTENCE_posTags, null, null, true);
}
/// <summary>
/// Verifies that the lemmatizer honors the keyword attribute: KeywordRepeatFilter
/// emits each token twice (one keyword-marked original, one candidate for
/// lemmatization), and RemoveDuplicatesTokenFilter collapses identical results.
/// Uses both the dictionary and the maxent lemmatizer model.
/// </summary>
public void TestKeywordAttributeAwarenessDictionaryAndMaxEnt()
{
    Analyzer analyzer = Analyzer.NewAnonymous(createComponents: (fieldname, reader) =>
    {
        var resourceLoader = new ClasspathResourceLoader(GetType());

        // Tokenizer: sentence detection + tokenization from the test models.
        var tokenizerFactory = new OpenNLPTokenizerFactory(new Dictionary<string, string>
        {
            { "tokenizerModel", tokenizerModelFile },
            { "sentenceModel", sentenceModelFile }
        });
        tokenizerFactory.Inform(resourceLoader);
        var tokenizer = tokenizerFactory.Create(reader);

        // POS tagging stage.
        var posFilterFactory = new OpenNLPPOSFilterFactory(new Dictionary<string, string>
        {
            { "posTaggerModel", posTaggerModelFile }
        });
        posFilterFactory.Inform(resourceLoader);
        var posFilter = posFilterFactory.Create(tokenizer);

        // Duplicate every token, marking the original as a keyword so the
        // lemmatizer leaves it untouched.
        var keywordRepeatFactory = new KeywordRepeatFilterFactory(new Dictionary<string, string>());
        var keywordRepeatFilter = keywordRepeatFactory.Create(posFilter);

        // Lemmatization stage using both the dictionary and the maxent model.
        var lemmatizerFactory = new OpenNLPLemmatizerFilterFactory(new Dictionary<string, string>
        {
            { "dictionary", lemmatizerDictFile },
            { "lemmatizerModel", lemmatizerModelFile }
        });
        lemmatizerFactory.Inform(resourceLoader);
        var lemmatizerFilter = lemmatizerFactory.Create(keywordRepeatFilter);

        // Drop tokens whose lemma is identical to the preserved original.
        var removeDuplicatesFactory = new RemoveDuplicatesTokenFilterFactory(new Dictionary<string, string>());
        var removeDuplicatesFilter = removeDuplicatesFactory.Create(lemmatizerFilter);

        return new TokenStreamComponents(tokenizer, removeDuplicatesFilter);
    });

    // Upstream Lucene equivalent for reference:
    //CustomAnalyzer analyzer = CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
    //    .withTokenizer("opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
    //    .addTokenFilter("opennlpPOS", "posTaggerModel", posTaggerModelFile)
    //    .addTokenFilter(KeywordRepeatFilterFactory.class)
    //    .addTokenFilter("opennlplemmatizer", "dictionary", lemmatizerDictFile, "lemmatizerModel", lemmatizerModelFile)
    //    .addTokenFilter(RemoveDuplicatesTokenFilterFactory.class)
    //    .build();

    AssertAnalyzesTo(analyzer, SENTENCES_both, SENTENCES_both_keep_orig_punc, null, null,
        SENTENCES_both_keep_orig_posTags, null, null, true);
}