Example #1
0
        public virtual void TestInform()
        {
            //IResourceLoader loader = new ClasspathResourceLoader(typeof(TestStopFilter));
            IResourceLoader loader = new ClasspathResourceLoader(typeof(TestAnalyzers), "Lucene.Net"); // LUCENENET: Need to set to a type that is in the same path as the files

            assertTrue("loader is null and it shouldn't be", loader != null);
            CommonGramsQueryFilterFactory factory = (CommonGramsQueryFilterFactory)TokenFilterFactory("CommonGramsQuery", TEST_VERSION_CURRENT, loader, "words", "stop-1.txt", "ignoreCase", "true");
            CharArraySet words = factory.CommonWords;

            assertTrue("words is null and it shouldn't be", words != null);
            assertTrue("words Size: " + words.size() + " is not: " + 2, words.size() == 2);
            assertTrue(factory.IgnoreCase + " does not equal: " + true, factory.IgnoreCase == true);

            factory = (CommonGramsQueryFilterFactory)TokenFilterFactory("CommonGramsQuery", TEST_VERSION_CURRENT, loader, "words", "stop-1.txt, stop-2.txt", "ignoreCase", "true");
            words   = factory.CommonWords;
            assertTrue("words is null and it shouldn't be", words != null);
            assertTrue("words Size: " + words.size() + " is not: " + 4, words.size() == 4);
            assertTrue(factory.IgnoreCase + " does not equal: " + true, factory.IgnoreCase == true);

            factory = (CommonGramsQueryFilterFactory)TokenFilterFactory("CommonGramsQuery", TEST_VERSION_CURRENT, loader, "words", "stop-snowball.txt", "format", "snowball", "ignoreCase", "true");
            words   = factory.CommonWords;
            assertEquals(8, words.size());
            assertTrue(words.contains("he"));
            assertTrue(words.contains("him"));
            assertTrue(words.contains("his"));
            assertTrue(words.contains("himself"));
            assertTrue(words.contains("she"));
            assertTrue(words.contains("her"));
            assertTrue(words.contains("hers"));
            assertTrue(words.contains("herself"));
        }
        public virtual void TestInform()
        {
            IResourceLoader loader = new ClasspathResourceLoader(this.GetType());
            assertTrue("loader is null and it shouldn't be", loader != null);
            StopFilterFactory factory = (StopFilterFactory)TokenFilterFactory("Stop", "words", "stop-1.txt", "ignoreCase", "true");
            CharArraySet words = factory.StopWords;
            assertTrue("words is null and it shouldn't be", words != null);
            assertTrue("words Size: " + words.size() + " is not: " + 2, words.size() == 2);
            assertTrue(factory.IgnoreCase + " does not equal: " + true, factory.IgnoreCase == true);

            factory = (StopFilterFactory)TokenFilterFactory("Stop", "words", "stop-1.txt, stop-2.txt", "ignoreCase", "true");
            words = factory.StopWords;
            assertTrue("words is null and it shouldn't be", words != null);
            assertTrue("words Size: " + words.size() + " is not: " + 4, words.size() == 4);
            assertTrue(factory.IgnoreCase + " does not equal: " + true, factory.IgnoreCase == true);

            factory = (StopFilterFactory)TokenFilterFactory("Stop", "words", "stop-snowball.txt", "format", "snowball", "ignoreCase", "true");
            words = factory.StopWords;
            assertEquals(8, words.size());
            assertTrue(words.contains("he"));
            assertTrue(words.contains("him"));
            assertTrue(words.contains("his"));
            assertTrue(words.contains("himself"));
            assertTrue(words.contains("she"));
            assertTrue(words.contains("her"));
            assertTrue(words.contains("hers"));
            assertTrue(words.contains("herself"));

            // defaults
            factory = (StopFilterFactory)TokenFilterFactory("Stop");
            assertEquals(StopAnalyzer.ENGLISH_STOP_WORDS_SET, factory.StopWords);
            assertEquals(false, factory.IgnoreCase);
        }
        public virtual void TestInform()
        {
            //IResourceLoader loader = new ClasspathResourceLoader(typeof(TestStopFilter));
            IResourceLoader loader = new ClasspathResourceLoader(typeof(TestAnalyzers), "Lucene.Net"); // LUCENENET: Need to set to a type that is in the same path as the files
            assertTrue("loader is null and it shouldn't be", loader != null);
            CommonGramsFilterFactory factory = (CommonGramsFilterFactory)TokenFilterFactory("CommonGrams", TEST_VERSION_CURRENT, loader, "words", "stop-1.txt", "ignoreCase", "true");
            CharArraySet words = factory.CommonWords;
            assertTrue("words is null and it shouldn't be", words != null);
            assertTrue("words Size: " + words.size() + " is not: " + 2, words.size() == 2);
            assertTrue(factory.IgnoreCase + " does not equal: " + true, factory.IgnoreCase == true);

            factory = (CommonGramsFilterFactory)TokenFilterFactory("CommonGrams", TEST_VERSION_CURRENT, loader, "words", "stop-1.txt, stop-2.txt", "ignoreCase", "true");
            words = factory.CommonWords;
            assertTrue("words is null and it shouldn't be", words != null);
            assertTrue("words Size: " + words.size() + " is not: " + 4, words.size() == 4);
            assertTrue(factory.IgnoreCase + " does not equal: " + true, factory.IgnoreCase == true);

            factory = (CommonGramsFilterFactory)TokenFilterFactory("CommonGrams", TEST_VERSION_CURRENT, loader, "words", "stop-snowball.txt", "format", "snowball", "ignoreCase", "true");
            words = factory.CommonWords;
            assertEquals(8, words.size());
            assertTrue(words.contains("he"));
            assertTrue(words.contains("him"));
            assertTrue(words.contains("his"));
            assertTrue(words.contains("himself"));
            assertTrue(words.contains("she"));
            assertTrue(words.contains("her"));
            assertTrue(words.contains("hers"));
            assertTrue(words.contains("herself"));
        }
        public void TestBasic()
        {
            var      loader   = new ClasspathResourceLoader(GetType());
            Analyzer analyzer = Analyzer.NewAnonymous(createComponents: (fieldname, reader) => {
                var tokenizerFactory = new OpenNLPTokenizerFactory(new Dictionary <string, string> {
                    { "tokenizerModel", tokenizerModelFile }, { "sentenceModel", sentenceModelFile }
                });
                tokenizerFactory.Inform(loader);
                var tokenizer = tokenizerFactory.Create(reader);

                var filter1Factory = new OpenNLPPOSFilterFactory(new Dictionary <string, string> {
                    { "posTaggerModel", posTaggerModelFile }
                });
                filter1Factory.Inform(loader);
                var filter1 = filter1Factory.Create(tokenizer);

                return(new TokenStreamComponents(tokenizer, filter1));
            });

            //    CustomAnalyzer analyzer = CustomAnalyzer.builder(new ClasspathResourceLoader(GetType()))
            //.withTokenizer("opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
            //.addTokenFilter("opennlpPOS", "posTaggerModel", posTaggerModelFile)
            //.build();
            AssertAnalyzesTo(analyzer, SENTENCES, SENTENCES_punc, SENTENCES_startOffsets, SENTENCES_endOffsets);
        }
Example #5
0
        public void Test1SentenceDictionaryOnly()
        {
            Analyzer analyzer = Analyzer.NewAnonymous(createComponents: (fieldname, reader) =>
            {
                var loader = new ClasspathResourceLoader(GetType());

                var opennlpFactory = new OpenNLPTokenizerFactory(new Dictionary <string, string> {
                    { "tokenizerModel", tokenizerModelFile }, { "sentenceModel", sentenceModelFile }
                });
                opennlpFactory.Inform(loader);
                var opennlp = opennlpFactory.Create(reader);

                var opennlpPOSFilterFactory = new OpenNLPPOSFilterFactory(new Dictionary <string, string> {
                    { "posTaggerModel", posTaggerModelFile }
                });
                opennlpPOSFilterFactory.Inform(loader);
                var opennlpPOSFilter = opennlpPOSFilterFactory.Create(opennlp);

                var opennlpLemmatizerFilterFactory = new OpenNLPLemmatizerFilterFactory(new Dictionary <string, string> {
                    { "dictionary", lemmatizerDictFile }
                });
                opennlpLemmatizerFilterFactory.Inform(loader);
                var opennlpLemmatizerFilter = opennlpLemmatizerFilterFactory.Create(opennlpPOSFilter);

                return(new TokenStreamComponents(opennlp, opennlpLemmatizerFilter));
            });

            //CustomAnalyzer analyzer = CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
            //.withTokenizer("opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
            //.addTokenFilter("opennlpPOS", "posTaggerModel", "en-test-pos-maxent.bin")
            //.addTokenFilter("opennlplemmatizer", "dictionary", "en-test-lemmas.dict")
            //.build();
            AssertAnalyzesTo(analyzer, SENTENCE, SENTENCE_dict_punc, null, null,
                             SENTENCE_posTags, null, null, true);
        }
        public virtual void TestInform()
        {
            IResourceLoader loader = new ClasspathResourceLoader(this.GetType());

            assertTrue("loader is null and it shouldn't be", loader != null);
            StopFilterFactory factory = (StopFilterFactory)TokenFilterFactory("Stop", "words", "stop-1.txt", "ignoreCase", "true");
            CharArraySet      words   = factory.StopWords;

            assertTrue("words is null and it shouldn't be", words != null);
            assertTrue("words Size: " + words.size() + " is not: " + 2, words.size() == 2);
            assertTrue(factory.IgnoreCase + " does not equal: " + true, factory.IgnoreCase == true);

            factory = (StopFilterFactory)TokenFilterFactory("Stop", "words", "stop-1.txt, stop-2.txt", "ignoreCase", "true");
            words   = factory.StopWords;
            assertTrue("words is null and it shouldn't be", words != null);
            assertTrue("words Size: " + words.size() + " is not: " + 4, words.size() == 4);
            assertTrue(factory.IgnoreCase + " does not equal: " + true, factory.IgnoreCase == true);

            factory = (StopFilterFactory)TokenFilterFactory("Stop", "words", "stop-snowball.txt", "format", "snowball", "ignoreCase", "true");
            words   = factory.StopWords;
            assertEquals(8, words.size());
            assertTrue(words.contains("he"));
            assertTrue(words.contains("him"));
            assertTrue(words.contains("his"));
            assertTrue(words.contains("himself"));
            assertTrue(words.contains("she"));
            assertTrue(words.contains("her"));
            assertTrue(words.contains("hers"));
            assertTrue(words.contains("herself"));

            // defaults
            factory = (StopFilterFactory)TokenFilterFactory("Stop");
            assertEquals(StopAnalyzer.ENGLISH_STOP_WORDS_SET, factory.StopWords);
            assertEquals(false, factory.IgnoreCase);
        }
        public void TestMissingDictionary()
        {
            IResourceLoader loader = new ClasspathResourceLoader(typeof(TestMorfologikFilterFactory));

            IOException expected = NUnit.Framework.Assert.Throws <IOException>(() =>
            {
                IDictionary <String, String> @params = new JCG.Dictionary <String, String>();
                @params[MorfologikFilterFactory.DICTIONARY_ATTRIBUTE] = "missing-dictionary-resource.dict";
                MorfologikFilterFactory factory = new MorfologikFilterFactory(@params);
                factory.Inform(loader);
            });

            assertTrue(expected.Message.Contains("Resource not found"));
        }
        public virtual void TestInform()
        {
            IResourceLoader loader = new ClasspathResourceLoader(this.GetType());
            assertTrue("loader is null and it shouldn't be", loader != null);
            KeepWordFilterFactory factory = (KeepWordFilterFactory)TokenFilterFactory("KeepWord", "words", "keep-1.txt", "ignoreCase", "true");
            CharArraySet words = factory.Words;
            assertTrue("words is null and it shouldn't be", words != null);
            assertTrue("words Size: " + words.size() + " is not: " + 2, words.size() == 2);

            factory = (KeepWordFilterFactory)TokenFilterFactory("KeepWord", "words", "keep-1.txt, keep-2.txt", "ignoreCase", "true");
            words = factory.Words;
            assertTrue("words is null and it shouldn't be", words != null);
            assertTrue("words Size: " + words.size() + " is not: " + 4, words.size() == 4);
        }
        public void TestExplicitDictionary()
        {
            IResourceLoader loader = new ClasspathResourceLoader(typeof(TestMorfologikFilterFactory));

            StringReader reader = new StringReader("inflected1 inflected2");
            IDictionary <String, String> @params = new JCG.Dictionary <string, string>();

            @params[MorfologikFilterFactory.DICTIONARY_ATTRIBUTE] = "custom-dictionary.dict";
            MorfologikFilterFactory factory = new MorfologikFilterFactory(@params);

            factory.Inform(loader);
            TokenStream stream = new MockTokenizer(reader); // whitespaceMockTokenizer(reader);

            stream = factory.Create(stream);
            AssertTokenStreamContents(stream, new String[] { "lemma1", "lemma2" });
        }
Example #10
0
        public virtual void TestInform()
        {
            IResourceLoader loader = new ClasspathResourceLoader(this.GetType());

            assertTrue("loader is null and it shouldn't be", loader != null);
            KeepWordFilterFactory factory = (KeepWordFilterFactory)TokenFilterFactory("KeepWord", "words", "keep-1.txt", "ignoreCase", "true");
            CharArraySet          words   = factory.Words;

            assertTrue("words is null and it shouldn't be", words != null);
            assertTrue("words Size: " + words.size() + " is not: " + 2, words.size() == 2);

            factory = (KeepWordFilterFactory)TokenFilterFactory("KeepWord", "words", "keep-1.txt, keep-2.txt", "ignoreCase", "true");
            words   = factory.Words;
            assertTrue("words is null and it shouldn't be", words != null);
            assertTrue("words Size: " + words.size() + " is not: " + 4, words.size() == 4);
        }
Example #11
0
        public void TestKeywordAttributeAwarenessDictionaryAndMaxEnt()
        {
            Analyzer analyzer = Analyzer.NewAnonymous(createComponents: (fieldname, reader) =>
            {
                var loader = new ClasspathResourceLoader(GetType());

                var opennlpFactory = new OpenNLPTokenizerFactory(new Dictionary <string, string> {
                    { "tokenizerModel", tokenizerModelFile }, { "sentenceModel", sentenceModelFile }
                });
                opennlpFactory.Inform(loader);
                var opennlp = opennlpFactory.Create(reader);

                var opennlpPOSFilterFactory = new OpenNLPPOSFilterFactory(new Dictionary <string, string> {
                    { "posTaggerModel", posTaggerModelFile }
                });
                opennlpPOSFilterFactory.Inform(loader);
                var opennlpPOSFilter = opennlpPOSFilterFactory.Create(opennlp);

                var keywordRepeatFilterFactory = new KeywordRepeatFilterFactory(new Dictionary <string, string>());
                var keywordRepeatFilter        = keywordRepeatFilterFactory.Create(opennlpPOSFilter);

                var opennlpLemmatizerFilterFactory = new OpenNLPLemmatizerFilterFactory(new Dictionary <string, string> {
                    { "dictionary", lemmatizerDictFile }, { "lemmatizerModel", lemmatizerModelFile }
                });
                opennlpLemmatizerFilterFactory.Inform(loader);
                var opennlpLemmatizerFilter = opennlpLemmatizerFilterFactory.Create(keywordRepeatFilter);

                var removeDuplicatesTokenFilterFactory = new RemoveDuplicatesTokenFilterFactory(new Dictionary <string, string>());
                var removeDuplicatesTokenFilter        = removeDuplicatesTokenFilterFactory.Create(opennlpLemmatizerFilter);

                return(new TokenStreamComponents(opennlp, removeDuplicatesTokenFilter));
            });

            //CustomAnalyzer analyzer = CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
            //    .withTokenizer("opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
            //    .addTokenFilter("opennlpPOS", "posTaggerModel", posTaggerModelFile)
            //    .addTokenFilter(KeywordRepeatFilterFactory.class)
            //    .addTokenFilter("opennlplemmatizer", "dictionary", lemmatizerDictFile, "lemmatizerModel", lemmatizerModelFile)
            //    .addTokenFilter(RemoveDuplicatesTokenFilterFactory.class)
            //    .build();
            AssertAnalyzesTo(analyzer, SENTENCES_both, SENTENCES_both_keep_orig_punc, null, null,
                             SENTENCES_both_keep_orig_posTags, null, null, true);
        }
        public void TestPayloads()
        {
            //CustomAnalyzer analyzer = CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
            //.withTokenizer("opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
            //.addTokenFilter("opennlpPOS", "posTaggerModel", posTaggerModelFile)
            //.addTokenFilter("opennlpChunker", "chunkerModel", chunkerModelFile)
            //.addTokenFilter(TypeAsPayloadTokenFilterFactory.class)
            //.build();

            Analyzer analyzer = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
            {
                var loader = new ClasspathResourceLoader(GetType());

                var opennlpFactory = new OpenNLPTokenizerFactory(new Dictionary <string, string> {
                    { "tokenizerModel", tokenizerModelFile }, { "sentenceModel", sentenceModelFile }
                });
                opennlpFactory.Inform(loader);
                var opennlp = opennlpFactory.Create(NewAttributeFactory(), reader); //new OpenNLPTokenizer(reader, new Tools.NLPSentenceDetectorOp(sentenceModelFile), new Tools.NLPTokenizerOp(tokenizerModelFile));

                var opennlpPOSFilterFactory = new OpenNLPPOSFilterFactory(new Dictionary <string, string> {
                    { "posTaggerModel", posTaggerModelFile }
                });
                opennlpPOSFilterFactory.Inform(loader);
                var opennlpPOSFilter = opennlpPOSFilterFactory.Create(opennlp);  //new OpenNLPPOSFilter(opennlp, new Tools.NLPPOSTaggerOp(posTaggerModelFile));

                var opennlpChunkerFilterFactory = new OpenNLPChunkerFilterFactory(new Dictionary <string, string> {
                    { "chunkerModel", chunkerModelFile }
                });
                opennlpChunkerFilterFactory.Inform(loader);
                var opennlpChunkerFilter = opennlpChunkerFilterFactory.Create(opennlpPOSFilter);  //new OpenNLPChunkerFilter(filter1, new Tools.NLPChunkerOp(chunkerModelFile));

                var typeAsPayloadFilterFactory = new TypeAsPayloadTokenFilterFactory(new Dictionary <string, string>());
                var typeAsPayloadFilter        = typeAsPayloadFilterFactory.Create(opennlpChunkerFilter);

                return(new TokenStreamComponents(opennlp, typeAsPayloadFilter));
            });

            AssertAnalyzesTo(analyzer, SENTENCES, SENTENCES_punc, SENTENCES_startOffsets, SENTENCES_endOffsets,
                             null, null, null, true, ToPayloads(SENTENCES_chunks));
        }
Example #13
0
 /// <summary>
 /// Gets a resource from the classpath as <seealso cref="File"/>. this method should only
 /// be used, if a real file is needed. To get a stream, code should prefer
 /// <seealso cref="Class#getResourceAsStream"/> using {@code this.getClass()}.
 /// </summary>
 protected Stream GetDataFile(string name)
 {
     try
     {
         var resourceLoader = new ClasspathResourceLoader(this.GetType(), "Lucene.Net");
         return resourceLoader.OpenResource(name);
     }
     catch (Exception e)
     {
         throw new IOException("Cannot find resource: " + name);
     }
 }