Beispiel #1
0
        /*
         * Replaces the current stop word table with the words read from the given stream.
         * A null stream, or an IOException while reading, leaves an empty stop word table.
         * @param   wordfile    Stream containing the word list; may be null
         * @param   encoding    Encoding of the stream (win-1250, iso-8859-2, ...), or null to use the StreamReader default
         * @deprecated use {@link WordlistLoader#getWordSet(Reader, String) }
         *             and {@link #CzechAnalyzer(Version, Set)} instead
         */
        public void LoadStopWords(Stream wordfile, System.Text.Encoding encoding)
        {
            // Drop the cached token stream so that a new stop filter gets created.
            PreviousTokenStream = null;

            // Start from an empty table; it stays empty when there is nothing to read.
            stoptable = Support.Compatibility.SetFactory.CreateHashSet <string>();
            if (wordfile == null)
            {
                return;
            }

            try
            {
                StreamReader wordReader = encoding != null
                    ? new StreamReader(wordfile, encoding)
                    : new StreamReader(wordfile);

                stoptable = WordlistLoader.GetWordSet(wordReader);
            }
            catch (IOException)
            {
                // TODO: throw IOException
                // Reading failed: fall back to an empty table.
                stoptable = Support.Compatibility.SetFactory.CreateHashSet <string>();
            }
        }
Beispiel #2
0
            /// <summary>
            /// Loads the default stop word set and the default stemmer table.
            /// An <see cref="IOException"/> from either step is rethrown as an
            /// <see cref="InvalidOperationException"/>.
            /// </summary>
            static DefaultsHolder()
            {
                try
                {
                    var stopwordReader = IOUtils.GetDecodingReader(typeof(PolishAnalyzer), DEFAULT_STOPWORD_FILE, Encoding.UTF8);
#pragma warning disable 612, 618
                    DEFAULT_STOP_SET = WordlistLoader.GetWordSet(stopwordReader, "#", LuceneVersion.LUCENE_CURRENT);
#pragma warning restore 612, 618
                }
                catch (IOException ioe)
                {
                    // The stop word file ships with the distribution as an embedded
                    // resource, so it is expected to be present.
                    throw new InvalidOperationException("Unable to load default stopword set", ioe);
                }

                try
                {
                    var analyzerType  = typeof(PolishAnalyzer);
                    var stemmerStream = analyzerType.GetTypeInfo().Assembly.FindAndGetManifestResourceStream(analyzerType, DEFAULT_STEMMER_FILE);
                    DEFAULT_TABLE = StempelStemmer.Load(stemmerStream);
                }
                catch (IOException ioe)
                {
                    // The stemmer table ships with the distribution as an embedded
                    // resource, so it is expected to be present.
                    throw new InvalidOperationException("Unable to load default stemming tables", ioe);
                }
            }
Beispiel #3
0
 /// <summary>
 /// Reads the default stop word list from the manifest resource
 /// "Lucene.Net.Analysis.AR." + DEFAULT_STOPWORD_FILE and returns an unmodifiable copy of it.
 /// </summary>
 internal static ISet <string> LoadDefaultStopWordSet()
 {
     var owningAssembly = System.Reflection.Assembly.GetAssembly(typeof(ArabicAnalyzer));
     var resourceName   = "Lucene.Net.Analysis.AR." + DEFAULT_STOPWORD_FILE;

     using (StreamReader reader = new StreamReader(owningAssembly.GetManifestResourceStream(resourceName)))
     {
         var loadedWords = WordlistLoader.GetWordSet(reader, STOPWORDS_COMMENT);
         return CharArraySet.UnmodifiableSet(CharArraySet.Copy(loadedWords));
     }
 }
Beispiel #4
0
//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
//ORIGINAL LINE: public void testComments() throws Exception
        /// <summary>
        /// Loads a word list with "#" as the comment marker and checks that the
        /// "#comment" line yields neither "#comment" nor "comment".
        /// </summary>
        public virtual void testComments()
        {
            const string commentMarker = "#";
            string       wordList      = "ONE\n  two \nthree\n#comment";

            CharArraySet loaded = WordlistLoader.getWordSet(new StringReader(wordList), commentMarker, TEST_VERSION_CURRENT);

            checkSet(loaded);
            assertFalse(loaded.contains("#comment"));
            assertFalse(loaded.contains("comment"));
        }
Beispiel #5
0
//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
//ORIGINAL LINE: public void testWordlistLoading() throws java.io.IOException
        /// <summary>
        /// Loads the same word list through two separate readers and validates
        /// both resulting sets with <c>checkSet</c>.
        /// </summary>
        public virtual void testWordlistLoading()
        {
            string       s        = "ONE\n  two \nthree";
            CharArraySet wordSet1 = WordlistLoader.getWordSet(new StringReader(s), TEST_VERSION_CURRENT);

            checkSet(wordSet1);

            // System.IO.StreamReader can only wrap a Stream or a file path, not another
            // reader, so the second pass reads from a fresh StringReader instead.
            CharArraySet wordSet2 = WordlistLoader.getWordSet(new StringReader(s), TEST_VERSION_CURRENT);

            checkSet(wordSet2);
        }
            /// <summary>
            /// Reads the default stop word file as UTF-8 and returns the words as an unmodifiable set.
            /// </summary>
            internal static CharArraySet LoadDefaultStopWordSet()
            {
                var stopwordReader = IOUtils.GetDecodingReader(typeof(SmartChineseAnalyzer), DEFAULT_STOPWORD_FILE, Encoding.UTF8);
#pragma warning disable 612, 618
                var loadedWords = WordlistLoader.GetWordSet(stopwordReader, STOPWORD_FILE_COMMENT, LuceneVersion.LUCENE_CURRENT);
#pragma warning restore 612, 618

                // The set is exposed by the outer class, so hand out a read-only view.
                return CharArraySet.UnmodifiableSet(loadedWords);
            }
Beispiel #7
0
 /// <summary>
 /// Loads the stem exclusion table from the given file and clears the cached token stream.
 /// </summary>
 /// <param name="exclusionlist">File handed to <c>WordlistLoader.GetWordSet</c>.</param>
 /// <exception cref="System.Exception">Wraps any <see cref="IOException"/> raised while loading.</exception>
 public void SetStemExclusionTable(FileInfo exclusionlist)
 {
     try
     {
         ExclusionTable      = WordlistLoader.GetWordSet(exclusionlist);
         PreviousTokenStream = null;
     }
     catch (IOException ex)
     {
         // Same exception type as before, but with a message that says what failed.
         throw new Exception("Unable to load stem exclusion table", ex);
     }
 }
//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
//ORIGINAL LINE: private void assertClasspathDelegation(ResourceLoader rl) throws Exception
        /// <summary>
        /// Checks that the given loader can open a stop word resource, create an
        /// instance from a class name, and open a class file resource.
        /// </summary>
        private void assertClasspathDelegation(ResourceLoader rl)
        {
            // a stop word file opened through the loader must contain "you"
            var          stopwordReader = new System.IO.StreamReader(rl.openResource("org/apache/lucene/analysis/snowball/english_stop.txt"), Encoding.UTF8);
            CharArraySet stopWords      = WordlistLoader.getSnowballWordSet(stopwordReader, TEST_VERSION_CURRENT);

            assertTrue(stopWords.contains("you"));

            // classes are compared by name because the classloader may be different;
            // note that Type.FullName does not always match Java's Class.getName
            object created = rl.newInstance("org.apache.lucene.analysis.util.RollingCharBuffer", typeof(object));
            assertEquals("org.apache.lucene.analysis.util.RollingCharBuffer", created.GetType().FullName);

            // theoretically classes should also be loadable:
            var classResource = rl.openResource("java/lang/String.class");
            IOUtils.closeWhileHandlingException(classResource);
        }
Beispiel #9
0
 /// <summary>
 /// Loads the default stop word set. An <see cref="IOException"/> while loading
 /// is rethrown as an <see cref="System.Exception"/> that keeps it as inner exception.
 /// </summary>
 static DefaultSetHolder()
 {
     try
     {
         DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(typeof(SnowballFilter), DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8), Version.LUCENE_CURRENT);
     }
     catch (IOException ex)
     {
         // default set should always be present as it is part of the
         // distribution (JAR); keep the cause so the failure stays diagnosable
         throw new Exception("Unable to load default stopword set", ex);
     }
 }
Beispiel #10
0
        /// <summary>
        /// Loads a snowball-format stop word list and checks that exactly the
        /// seven expected words are present.
        /// </summary>
        public virtual void testSnowballListLoading()
        {
            string s =
                "|comment\n" +              // commented line
                " |comment\n" +             // commented line with leading whitespace
                "\n" +                      // blank line
                "  \t\n" +                  // line with only whitespace
                " |comment | comment\n" +   // commented line with comment
                "ONE\n" +                   // stopword, in uppercase
                "   two   \n" +             // stopword with leading/trailing space
                " three   four five \n" +   // multiple stopwords
                "six seven | comment\n";    // multiple stopwords + comment

            CharArraySet wordset = WordlistLoader.getSnowballWordSet(new StringReader(s), TEST_VERSION_CURRENT);

            assertEquals(7, wordset.size());

            string[] expectedWords = { "ONE", "two", "three", "four", "five", "six", "seven" };
            foreach (string expected in expectedWords)
            {
                assertTrue(wordset.contains(expected));
            }
        }
Beispiel #11
0
            /// <summary>
            /// Reads the default stop word list from the manifest resource
            /// "Lucene.Net.Analyzers.Fa." + DEFAULT_STOPWORD_FILE as UTF-8 and
            /// returns it as an unmodifiable set.
            /// </summary>
            /// <exception cref="InvalidOperationException">The manifest resource was not found.</exception>
            static ISet <String> LoadDefaultStopWordSet()
            {
                string resourceName = "Lucene.Net.Analyzers.Fa." + DEFAULT_STOPWORD_FILE;

                using (var stream = System.Reflection.Assembly.GetAssembly(typeof(PersianAnalyzer)).GetManifestResourceStream(resourceName))
                {
                    // GetManifestResourceStream returns null for an unknown resource. Fail with a
                    // clear message instead of a NullReferenceException from the cleanup code.
                    if (stream == null)
                    {
                        throw new InvalidOperationException("Unable to load default stopword set: embedded resource '" + resourceName + "' not found");
                    }

                    using (var reader = new StreamReader(stream, System.Text.Encoding.UTF8))
                    {
                        // make sure it is unmodifiable as we expose it in the outer class
                        return CharArraySet.UnmodifiableSet(new CharArraySet(WordlistLoader.GetWordSet(reader, STOPWORDS_COMMENT), true));
                    }
                }
            }
 /// <summary>
 /// Loads the default stop word set from the resource named
 /// <c>typeof(SnowballFilter).Namespace + "." + DEFAULT_STOPWORD_FILE</c>, read as UTF-8.
 /// An <see cref="IOException"/> while loading is rethrown as an
 /// <see cref="System.Exception"/> that keeps it as inner exception.
 /// </summary>
 static DefaultSetHolder()
 {
     try
     {
         DEFAULT_STOP_SET = WordlistLoader.GetSnowballWordSet(
             IOUtils.GetDecodingReader(typeof(SnowballFilter), typeof(SnowballFilter).Namespace + "." + DEFAULT_STOPWORD_FILE, Encoding.UTF8),
             LuceneVersion.LUCENE_CURRENT);
     }
     catch (IOException ex)
     {
         // default set should always be present as it is part of the
         // distribution (JAR); keep the cause so the failure stays diagnosable
         throw new Exception("Unable to load default stopword set", ex);
     }
 }
Beispiel #13
0
            /// <summary>
            /// Loads the default stop word set, read as UTF-8. An <see cref="IOException"/>
            /// while loading is rethrown as an <see cref="System.Exception"/> that keeps it
            /// as inner exception.
            /// </summary>
            static DefaultSetHolder()
            {
                try
                {
                    DEFAULT_STOP_SET = WordlistLoader.GetWordSet(
                        IOUtils.GetDecodingReader(typeof(SoraniAnalyzer), DEFAULT_STOPWORD_FILE, Encoding.UTF8),
#pragma warning disable 612, 618
                        LuceneVersion.LUCENE_CURRENT);
#pragma warning restore 612, 618
                }
                catch (IOException ex)
                {
                    // default set should always be present as it is part of the
                    // distribution (JAR); keep the cause so the failure stays diagnosable
                    throw new Exception("Unable to load default stopword set", ex);
                }
            }
Beispiel #14
0
            /// <summary>
            /// Reads the default stop word file as UTF-8, using "#" as the comment marker.
            /// Exceptions for which <c>IsIOException()</c> is true are rethrown via
            /// <c>RuntimeException.Create</c>.
            /// </summary>
            private static CharArraySet LoadDefaultStopSet() // LUCENENET: Avoid static constructors (see https://github.com/apache/lucenenet/pull/224#issuecomment-469284006)
            {
                try
                {
                    var stopwordReader = IOUtils.GetDecodingReader(typeof(PolishAnalyzer), DEFAULT_STOPWORD_FILE, Encoding.UTF8);
#pragma warning disable 612, 618
                    return WordlistLoader.GetWordSet(stopwordReader, "#", LuceneVersion.LUCENE_CURRENT);
#pragma warning restore 612, 618
                }
                catch (Exception ioe) when (ioe.IsIOException())
                {
                    // The stop word file ships with the distribution as an embedded
                    // resource, so it is expected to be present.
                    throw RuntimeException.Create("Unable to load default stopword set", ioe);
                }
            }
Beispiel #15
0
            /// <summary>
            /// Reads the default snowball-format stop word file as UTF-8.
            /// An <see cref="IOException"/> is wrapped in an <see cref="System.Exception"/>.
            /// </summary>
            private static CharArraySet LoadDefaultStopSet() // LUCENENET: Avoid static constructors (see https://github.com/apache/lucenenet/pull/224#issuecomment-469284006)
            {
                try
                {
                    var stopwordReader = IOUtils.GetDecodingReader(typeof(SnowballFilter), DEFAULT_STOPWORD_FILE, Encoding.UTF8);
#pragma warning disable 612, 618
                    return WordlistLoader.GetSnowballWordSet(stopwordReader, LuceneVersion.LUCENE_CURRENT);
#pragma warning restore 612, 618
                }
                catch (IOException ioe)
                {
                    // The stop word file ships with the distribution (JAR),
                    // so it is expected to be present.
                    throw new Exception("Unable to load default stopword set", ioe);
                }
            }
Beispiel #16
0
            internal static readonly CharArraySet DEFAULT_STOP_SET = LoadDefaultSet(); // LUCENENET: Avoid static constructors (see https://github.com/apache/lucenenet/pull/224#issuecomment-469284006)

            /// <summary>
            /// Reads the default snowball-format stop word file as UTF-8.
            /// An <see cref="IOException"/> is wrapped in an <see cref="System.Exception"/>.
            /// </summary>
            private static CharArraySet LoadDefaultSet()
            {
                try
                {
                    var stopwordReader = IOUtils.GetDecodingReader(typeof(UkrainianMorfologikAnalyzer), DEFAULT_STOPWORD_FILE, Encoding.UTF8);
#pragma warning disable 612, 618
                    return WordlistLoader.GetSnowballWordSet(stopwordReader, LuceneVersion.LUCENE_CURRENT);
#pragma warning restore 612, 618
                }
                catch (IOException ioe)
                {
                    // The stop word file ships with the distribution (JAR),
                    // so it is expected to be present.
                    throw new Exception("Unable to load default stopword set", ioe);
                }
            }
Beispiel #17
0
            /// <summary>
            /// Loads the default stop word set and fills the default stem dictionary
            /// with four fixed entries. An <see cref="IOException"/> while loading the
            /// stop words is rethrown as an <see cref="System.Exception"/> that keeps
            /// it as inner exception.
            /// </summary>
            static DefaultSetHolder()
            {
                try
                {
                    DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(typeof(SnowballFilter), DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8), Version.LUCENE_CURRENT);
                }
                catch (IOException ex)
                {
                    // default set should always be present as it is part of the
                    // distribution (JAR); keep the cause so the failure stays diagnosable
                    throw new Exception("Unable to load default stopword set", ex);
                }

                // NOTE(review): "CharArrayMap <>" carries no type argument - confirm the
                // intended one against the DEFAULT_STEM_DICT declaration.
                DEFAULT_STEM_DICT = new CharArrayMap <>(Version.LUCENE_CURRENT, 4, false);
                DEFAULT_STEM_DICT.put("fiets", "fiets");         //otherwise fiet
                DEFAULT_STEM_DICT.put("bromfiets", "bromfiets"); //otherwise bromfiet
                DEFAULT_STEM_DICT.put("ei", "eier");
                DEFAULT_STEM_DICT.put("kind", "kinder");
            }
//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
//ORIGINAL LINE: public void testBaseDir() throws Exception
        /// <summary>
        /// Writes "template.txt" into a temporary directory and reads it back through
        /// a FilesystemResourceLoader, first with that directory as base and then with
        /// no base directory. The temporary directory is removed afterwards.
        /// </summary>
        public virtual void testBaseDir()
        {
//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
//ORIGINAL LINE: final java.io.File super = createTempDir("fsResourceLoaderBase").getAbsoluteFile();
            File @base = createTempDir("fsResourceLoaderBase").AbsoluteFile;

            try
            {
                @base.mkdirs();
                // Create template.txt with the single line "foobar", encoded as UTF-8.
                Writer os = new System.IO.StreamWriter(new System.IO.FileStream(@base, "template.txt", System.IO.FileMode.Create, System.IO.FileAccess.Write), Encoding.UTF8);
                try
                {
                    os.write("foobar\n");
                }
                finally
                {
                    // Close the writer whether or not the write succeeded.
                    IOUtils.closeWhileHandlingException(os);
                }

                // Loader with a base directory: resolve the file by its relative name.
                ResourceLoader rl = new FilesystemResourceLoader(@base);
                assertEquals("foobar", WordlistLoader.getLines(rl.openResource("template.txt"), StandardCharsets.UTF_8).get(0));
                // Same with full path name:
                string fullPath = (new File(@base, "template.txt")).ToString();
                assertEquals("foobar", WordlistLoader.getLines(rl.openResource(fullPath), StandardCharsets.UTF_8).get(0));
                assertClasspathDelegation(rl);
                assertNotFound(rl);

                // now use RL without base dir:
                rl = new FilesystemResourceLoader();
                assertEquals("foobar", WordlistLoader.getLines(rl.openResource((new File(@base, "template.txt")).ToString()), StandardCharsets.UTF_8).get(0));
                assertClasspathDelegation(rl);
                assertNotFound(rl);
            }
            finally
            {
                // Remove the temporary directory even when an assertion failed.
                TestUtil.rm(@base);
            }
        }
Beispiel #19
0
 /**
  * Builds an analyzer whose stop words are loaded from the given file
  * through WordlistLoader.GetWordSet.
  */
 public LithuanianAnalyzer(FileInfo stopwords)
 {
     var loadedStopWords = WordlistLoader.GetWordSet(stopwords);
     stoptable = loadedStopWords;
 }
Beispiel #20
0
 /**
  * Replaces the exclusion list with the words loaded from the given file
  * through WordlistLoader.GetWordSet.
  */
 public void SetStemExclusionTable(FileInfo exclusionlist)
 {
     var loadedExclusions = WordlistLoader.GetWordSet(exclusionlist);
     excltable = loadedExclusions;
 }
 /// <summary>
 /// Builds an analyzer with the stop words from the given reader. The words are
 /// loaded with <c>WordlistLoader.GetWordSet</c> and passed to the chained constructor.
 /// </summary>
 /// <seealso cref="WordlistLoader.GetWordSet(System.IO.TextReader)"/>
 /// <param name="matchVersion">Lucene version to match; see <see cref="Version"/>.
 /// Forwarded unchanged to the chained constructor.</param>
 /// <param name="stopwords">Reader to read stop words from.</param>
 public StandardAnalyzer(Version matchVersion, System.IO.TextReader stopwords)
     : this(matchVersion, WordlistLoader.GetWordSet(stopwords))
 {
 }
Beispiel #22
0
 /// <summary>
 /// Builds an analyzer whose stop table is loaded from the given file.
 /// </summary>
 /// <param name="matchVersion">Stored in <c>MatchVersion</c>.</param>
 /// <param name="stopwordsFile">File handed to <c>WordlistLoader.GetWordSet</c>.</param>
 public DanishAnalyzer(Version matchVersion, FileInfo stopwordsFile)
 {
     var loadedStopWords = WordlistLoader.GetWordSet(stopwordsFile);

     StopTable    = loadedStopWords;
     MatchVersion = matchVersion;
 }
Beispiel #23
0
        /*
         * Builds an analyzer with the stop words contained in the given file.
         * The file is loaded with WordlistLoader.GetWordSet and the resulting set,
         * together with matchVersion, is passed to the chained constructor.
         * @param   matchVersion    forwarded unchanged to the chained constructor
         * @param   stopwords       file to load the stop words from
         * @deprecated use {@link #BrazilianAnalyzer(Version, Set)} instead
         */

        public BrazilianAnalyzerCustom(Lucene.Net.Util.Version matchVersion, FileInfo stopwords)
            : this(matchVersion, WordlistLoader.GetWordSet(stopwords))
        {
        }
Beispiel #24
0
 /// <summary>Builds an analyzer with the stop words from the given reader.</summary>
 /// <seealso cref="WordlistLoader.GetWordSet(System.IO.TextReader)"/>
 /// <param name="matchVersion">Lucene version to match; handed to <c>Init</c>.</param>
 /// <param name="stopwords">Reader to read stop words from.</param>
 public StandardAnalyzer(Version matchVersion, System.IO.TextReader stopwords)
 {
     var wordsFromReader = WordlistLoader.GetWordSet(stopwords);
     stopSet = wordsFromReader;

     Init(matchVersion);
 }
Beispiel #25
0
        /*
         * Replaces the exclusion list with the words contained in the given file
         * and clears the cached token stream.
         * @deprecated use {@link #BrazilianAnalyzer(Version, Set, Set)} instead
         */

        public void SetStemExclusionTable(FileInfo exclusionlist)
        {
            var loadedExclusions = WordlistLoader.GetWordSet(exclusionlist);
            excltable = loadedExclusions;

            // Clearing the cached stream forces a new stemmer to be created.
            PreviousTokenStream = null;
        }
Beispiel #26
0
        /*
         * Builds an analyzer with the stop words contained in the given file.
         * The file is loaded with WordlistLoader.GetWordSet and the resulting set,
         * together with matchVersion, is passed to the chained constructor.
         * @param   matchVersion    forwarded unchanged to the chained constructor
         * @param   stopwords       file to load the stop words from
         * @deprecated use {@link #BrazilianAnalyzer(Version, Set)} instead
         */

        public BrazilianAnalyzer(Version matchVersion, FileInfo stopwords)
            : this(matchVersion, WordlistLoader.GetWordSet(stopwords))
        {
        }
 /// <summary>Builds an analyzer with the stop words from the given reader.</summary>
 /// <seealso cref="WordlistLoader.GetWordSet(System.IO.TextReader)"/>
 /// <param name="stopwords">Reader to read stop words from.</param>
 public StandardAnalyzer(System.IO.TextReader stopwords)
 {
     var wordsFromReader = WordlistLoader.GetWordSet(stopwords);
     stopSet = wordsFromReader;
 }
 /// <summary>Builds an analyzer with the stop words from the given file.</summary>
 /// <seealso cref="WordlistLoader.GetWordSet(System.IO.FileInfo)"/>
 /// <param name="stopwords">File to read stop words from.</param>
 public StandardAnalyzer(System.IO.FileInfo stopwords)
 {
     var wordsFromFile = WordlistLoader.GetWordSet(stopwords);
     stopSet = wordsFromFile;
 }
 /**
  * Builds an analyzer whose stop words are loaded from the given file
  * through WordlistLoader.GetWordtable.
  */
 public BrazilianAnalyzer(FileInfo stopwords)
 {
     var loadedStopWords = WordlistLoader.GetWordtable(stopwords);
     stoptable = loadedStopWords;
 }
Beispiel #30
0
 /*
  * Builds an analyzer with the stop words contained in the given file.  Lines can be
  * commented out using <see cref="STOPWORDS_COMMENT"/>.
  * The file is loaded with WordlistLoader.GetWordSet and the resulting set, together
  * with matchVersion, is passed to the chained constructor.
  */
 public ArabicAnalyzer(Version matchVersion, FileInfo stopwords)
     : this(matchVersion, WordlistLoader.GetWordSet(stopwords, STOPWORDS_COMMENT))
 {
 }