Loader for text files that represent a list of stopwords.
Ejemplo n.º 1
0
        /// <summary>
        /// Builds one <c>CharArraySet</c> from the listed word files, reading each file
        /// in snowball format. Counterpart of <c>getWordSet(ResourceLoader, String, boolean)</c>.
        /// </summary>
        /// <param name="loader"> resource loader used to open every listed file </param>
        /// <param name="wordFiles"> file name list; it is split with <c>splitFileNames</c> </param>
        /// <param name="ignoreCase"> forwarded to the <c>CharArraySet</c> constructor </param>
        /// <returns> the filled set, or <c>null</c> when <paramref name="wordFiles"/> yields no file names </returns>
        protected internal CharArraySet getSnowballWordSet(ResourceLoader loader, string wordFiles, bool ignoreCase)
        {
            assureMatchVersion();

            IList<string> fileNames = splitFileNames(wordFiles);
            if (fileNames.Count <= 0)
            {
                // Nothing to load: callers receive null rather than an empty set.
                return null;
            }

            // Start with room for 10 entries per file; the default stopword list has
            // about 35 words, but the set is deliberately not sized that large up front.
            CharArraySet result = new CharArraySet(luceneMatchVersion, fileNames.Count * 10, ignoreCase);

            foreach (string fileName in fileNames)
            {
                InputStream input   = null;
                TextReader  decoded = null;
                try
                {
                    input = loader.openResource(fileName.Trim());

                    // UTF-8 decoder configured with CodingErrorAction.REPORT for both
                    // malformed input and unmappable characters.
                    CharsetDecoder utf8Decoder = StandardCharsets.UTF_8.newDecoder()
                                                 .onMalformedInput(CodingErrorAction.REPORT)
                                                 .onUnmappableCharacter(CodingErrorAction.REPORT);

                    decoded = new InputStreamReader(input, utf8Decoder);
                    WordlistLoader.getSnowballWordSet(decoded, result);
                }
                finally
                {
                    // Reader first, then the underlying stream.
                    IOUtils.closeWhileHandlingException(decoded, input);
                }
            }

            return result;
        }
        /// <summary>
        /// Checks that <paramref name="rl"/> can open a stopword file and can
        /// instantiate a type by its full name.
        /// </summary>
        /// <param name="rl"> the resource loader under test </param>
        private void assertClasspathDelegation(IResourceLoader rl)
        {
            // LUCENENET specific - instead of relying on where english_stop.txt sits in the
            // file system, the embedded resource is written to a temp file at a known
            // location, and that file is handed to the resource loader.
            string resourceName = "english_stop.txt";
            var    tempFile     = CreateTempFile(
                System.IO.Path.GetFileNameWithoutExtension(resourceName),
                System.IO.Path.GetExtension(resourceName));

            using (var embedded = typeof(Snowball.SnowballFilter).GetTypeInfo().Assembly.FindAndGetManifestResourceStream(typeof(Snowball.SnowballFilter), resourceName))
            using (var target = new FileStream(tempFile.FullName, FileMode.OpenOrCreate, FileAccess.Write))
            {
                embedded.CopyTo(target);
            }

            // try a stopwords file from classpath
            var          stopReader = new System.IO.StreamReader(rl.OpenResource(tempFile.FullName), Encoding.UTF8);
            CharArraySet stopWords  = WordlistLoader.GetSnowballWordSet(stopReader, TEST_VERSION_CURRENT);

            assertTrue(stopWords.contains("you"));

            // try to load a class; the comparison is on the string form because the
            // classloader may be different...
            var instance = rl.NewInstance <object>("Lucene.Net.Analysis.Util.RollingCharBuffer");
            assertEquals("Lucene.Net.Analysis.Util.RollingCharBuffer", instance.ToString());

            // theoretically classes should also be loadable:
            //IOUtils.CloseWhileHandlingException(rl.OpenResource("java/lang/String.class")); // LUCENENET TODO: Not sure what the equivalent to this is (or if there is one).
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Loads a word list with "#" as the comment marker and checks that the
        /// comment line does not end up in the resulting set.
        /// </summary>
        public virtual void TestComments()
        {
            // Three word lines followed by one line that starts with the comment marker.
            var          input  = new StringReader("ONE\n  two \nthree\n#comment");
            CharArraySet loaded = WordlistLoader.GetWordSet(input, "#", TEST_VERSION_CURRENT);

            CheckSet(loaded);

            // Neither the raw comment line nor its text without the marker may be present.
            assertFalse(loaded.contains("#comment"));
            assertFalse(loaded.contains("comment"));
        }
Ejemplo n.º 4
0
        /// <summary>
        /// Loads a three-line word list from an in-memory reader and validates it
        /// with <c>CheckSet</c>.
        /// </summary>
        public virtual void TestWordlistLoading()
        {
            var          input  = new StringReader("ONE\n  two \nthree");
            CharArraySet loaded = WordlistLoader.GetWordSet(input, TEST_VERSION_CURRENT);

            CheckSet(loaded);

            // TODO: Do we need to check for a "buffered reader" in .NET?
            // The second, reader-wrapping variant of this check is still disabled:
            //CharArraySet wordSet2 = WordlistLoader.GetWordSet(new System.IO.StreamReader(new StringReader(s)), TEST_VERSION_CURRENT);
            //CheckSet(wordSet2);
        }
Ejemplo n.º 5
0
 /// <summary>
 /// Reads stopwords from a reader into a <see cref="CharArraySet"/>. The reader is
 /// handed to <c>IOUtils.Dispose</c> afterwards, whether or not loading succeeded.
 /// </summary>
 /// <param name="stopwords">
 ///          the reader supplying the stopwords
 /// </param>
 /// <param name="matchVersion">
 ///          the Lucene version for cross version compatibility </param>
 /// <returns> a <see cref="CharArraySet"/> containing the distinct stopwords from the given
 ///         reader </returns>
 /// <exception cref="IOException">
 ///           if loading the stopwords throws an <see cref="IOException"/> </exception>
 protected static CharArraySet LoadStopwordSet(TextReader stopwords, LuceneVersion matchVersion)
 {
     CharArraySet loaded;

     try
     {
         loaded = WordlistLoader.GetWordSet(stopwords, matchVersion);
     }
     finally
     {
         // Runs on both the success and the failure path.
         IOUtils.Dispose(stopwords);
     }

     return loaded;
 }
Ejemplo n.º 6
0
        /// <summary>
        /// Checks that <paramref name="rl"/> can open the snowball English stopword
        /// file and can instantiate a type by its full name.
        /// </summary>
        /// <param name="rl"> the resource loader under test </param>
        private void assertClasspathDelegation(IResourceLoader rl)
        {
            // Build the relative path with Path.Combine instead of hard-coded backslashes so
            // the platform's directory separator is used; on Windows the result is unchanged.
            // NOTE(review): the path is still resolved against the current working directory.
            string relativeStopFile = System.IO.Path.Combine(
                "..", "..", "..", "Lucene.Net.Analysis.Common", "Analysis", "Snowball", "english_stop.txt");

            // try a stopwords file from classpath
            CharArraySet set = WordlistLoader.GetSnowballWordSet(new System.IO.StreamReader(rl.OpenResource(System.IO.Path.GetFullPath(relativeStopFile)), Encoding.UTF8), TEST_VERSION_CURRENT);

            assertTrue(set.contains("you"));
            // try to load a class; we use string comparison because classloader may be different...
            assertEquals("Lucene.Net.Analysis.Util.RollingCharBuffer", rl.NewInstance <object>("Lucene.Net.Analysis.Util.RollingCharBuffer").ToString());
            // theoretically classes should also be loadable:
            //IOUtils.CloseWhileHandlingException(rl.OpenResource("java/lang/String.class")); // LUCENENET TODO: Not sure what the equivalent to this is (or if there is one).
        }
 /// <summary>
 /// Reads stopwords from a reader into a CharArraySet. The reader is handed to
 /// <c>IOUtils.Close</c> afterwards, whether or not loading succeeded.
 /// </summary>
 /// <param name="stopwords">
 ///          the reader supplying the stopwords
 /// </param>
 /// <param name="matchVersion">
 ///          the Lucene version for cross version compatibility </param>
 /// <returns> a CharArraySet containing the distinct stopwords from the given
 ///         reader </returns>
 /// <exception cref="IOException">
 ///           if loading the stopwords throws an <see cref="IOException"/> </exception>
 protected internal static CharArraySet loadStopwordSet(Reader stopwords, Version matchVersion)
 {
     CharArraySet loaded;

     try
     {
         loaded = WordlistLoader.GetWordSet(stopwords, matchVersion);
     }
     finally
     {
         // Runs on both the success and the failure path.
         IOUtils.Close(stopwords);
     }

     return loaded;
 }
        /// <summary>
        /// Reads stopwords from a resource obtained through
        /// <c>aClass.GetResourceAsStream(resource)</c>, decoded as UTF-8, into a new
        /// CharArraySet.
        /// </summary>
        /// <param name="ignoreCase">
        ///          <c>true</c> if the set should ignore the case of the
        ///          stopwords, otherwise <c>false</c> </param>
        /// <param name="aClass">
        ///          the type the resource stream is requested from </param>
        /// <param name="resource">
        ///          name of the resource file associated with the given class </param>
        /// <param name="comment">
        ///          comment string to ignore in the stopword file </param>
        /// <returns> a CharArraySet containing the distinct stopwords from the given
        ///         file </returns>
        /// <exception cref="IOException">
        ///           if loading the stopwords throws an <see cref="IOException"/> </exception>
        protected internal static CharArraySet LoadStopwordSet(bool ignoreCase, Type aClass, string resource, string comment)
        {
            TextReader resourceReader = null;

            try
            {
                resourceReader = IOUtils.GetDecodingReader(aClass.GetResourceAsStream(resource), StandardCharsets.UTF_8);

                // Destination set: initial size 16, case handling as requested by the caller.
                CharArraySet target = new CharArraySet(Version.LUCENE_CURRENT, 16, ignoreCase);
                return WordlistLoader.GetWordSet(resourceReader, comment, target);
            }
            finally
            {
                // resourceReader is still null here if opening the resource failed.
                IOUtils.Close(resourceReader);
            }
        }
        /// <summary>
        /// Reads stopwords from a file, decoded as UTF-8, into a CharArraySet.
        /// </summary>
        /// <param name="stopwords">
        ///          the file to read the stopwords from
        /// </param>
        /// <param name="matchVersion">
        ///          the Lucene version for cross version compatibility </param>
        /// <returns> a CharArraySet containing the distinct stopwords from the given
        ///         file </returns>
        /// <exception cref="IOException">
        ///           if loading the stopwords throws an <see cref="IOException"/> </exception>
        protected internal static CharArraySet LoadStopwordSet(File stopwords, Version matchVersion)
        {
            Reader fileReader = null;

            try
            {
                fileReader = IOUtils.GetDecodingReader(stopwords, StandardCharsets.UTF_8);

                CharArraySet loaded = WordlistLoader.GetWordSet(fileReader, matchVersion);
                return loaded;
            }
            finally
            {
                // fileReader is still null here if the file could not be opened.
                IOUtils.Close(fileReader);
            }
        }
Ejemplo n.º 10
0
        /// <summary>
        /// Reads stopwords from a file, decoded as UTF-8, into a <see cref="CharArraySet"/>.
        /// </summary>
        /// <param name="stopwords">
        ///          the file to read the stopwords from
        /// </param>
        /// <param name="matchVersion">
        ///          the Lucene version for cross version compatibility </param>
        /// <returns> a <see cref="CharArraySet"/> containing the distinct stopwords from the given
        ///         file </returns>
        /// <exception cref="IOException">
        ///           if loading the stopwords throws an <see cref="IOException"/> </exception>
        protected static CharArraySet LoadStopwordSet(FileInfo stopwords, LuceneVersion matchVersion)
        {
            TextReader fileReader = null;

            try
            {
                fileReader = IOUtils.GetDecodingReader(stopwords, Encoding.UTF8);

                CharArraySet loaded = WordlistLoader.GetWordSet(fileReader, matchVersion);
                return loaded;
            }
            finally
            {
                // fileReader is still null here if the file could not be opened.
                IOUtils.Dispose(fileReader);
            }
        }
Ejemplo n.º 11
0
        /// <summary>
        /// Feeds a snowball-format list containing comments, blank lines and
        /// multi-word lines to the loader and checks that exactly the seven
        /// expected words are loaded.
        /// </summary>
        public virtual void TestSnowballListLoading()
        {
            string snowballText =
                "|comment\n" +              // commented line
                " |comment\n" +             // commented line with leading whitespace
                "\n" +                      // blank line
                "  \t\n" +                  // line with only whitespace
                " |comment | comment\n" +   // commented line with comment
                "ONE\n" +                   // stopword, in uppercase
                "   two   \n" +             // stopword with leading/trailing space
                " three   four five \n" +   // multiple stopwords
                "six seven | comment\n";    // multiple stopwords + comment

            CharArraySet loaded = WordlistLoader.GetSnowballWordSet(new StringReader(snowballText), TEST_VERSION_CURRENT);

            assertEquals(7, loaded.size());

            string[] expectedWords = { "ONE", "two", "three", "four", "five", "six", "seven" };
            foreach (string word in expectedWords)
            {
                assertTrue(loaded.contains(word));
            }
        }
Ejemplo n.º 12
0
        // LUCENENET TODO: If this works, need to update the documentation for the .NET version of the story.

        /// <summary>
        /// Reads stopwords from a manifest resource of the assembly that contains
        /// <paramref name="aClass"/> (see <c>Assembly.GetManifestResourceStream(string)</c>),
        /// decoded as UTF-8, into a new CharArraySet.
        /// </summary>
        /// <param name="ignoreCase">
        ///          <c>true</c> if the set should ignore the case of the
        ///          stopwords, otherwise <c>false</c> </param>
        /// <param name="aClass">
        ///          a type whose assembly is asked for the resource </param>
        /// <param name="resource">
        ///          name of the resource, passed unchanged to <c>GetManifestResourceStream</c> </param>
        /// <param name="comment">
        ///          comment string to ignore in the stopword file </param>
        /// <returns> a CharArraySet containing the distinct stopwords from the given
        ///         resource </returns>
        /// <exception cref="IOException">
        ///           if loading the stopwords throws an <see cref="IOException"/> </exception>
        protected internal static CharArraySet LoadStopwordSet(bool ignoreCase, Type aClass, string resource, string comment)
        {
            TextReader resourceReader = null;

            try
            {
                resourceReader = IOUtils.GetDecodingReader(aClass.Assembly.GetManifestResourceStream(resource), Encoding.UTF8);

                // Warnings 612/618 are the compiler's obsolete-member warnings; they are
                // switched off only around the use of LuceneVersion.LUCENE_CURRENT.
#pragma warning disable 612, 618
                CharArraySet target = new CharArraySet(LuceneVersion.LUCENE_CURRENT, 16, ignoreCase);
#pragma warning restore 612, 618

                return WordlistLoader.GetWordSet(resourceReader, comment, target);
            }
            finally
            {
                // resourceReader is still null here if opening the resource failed.
                IOUtils.Close(resourceReader);
            }
        }
Ejemplo n.º 13
0
        /// <summary>
        /// Writes a one-line file into a temp directory and checks that a
        /// <c>FilesystemResourceLoader</c> can read it back, both with that directory
        /// as its base and with no base directory at all.
        /// </summary>
        public virtual void TestBaseDir()
        {
            DirectoryInfo tempDir = CreateTempDir("fsResourceLoaderBase");

            try
            {
                string templatePath = System.IO.Path.Combine(tempDir.FullName, "template.txt");

                // Create the fixture file; errors raised while disposing the writer are
                // left to DisposeWhileHandlingException.
                TextWriter writer = new StreamWriter(new FileStream(templatePath, FileMode.Create, FileAccess.Write), Encoding.UTF8);
                try
                {
                    writer.Write("foobar\n");
                }
                finally
                {
                    IOUtils.DisposeWhileHandlingException(writer);
                }

                // Loader rooted at the temp directory: a bare file name is enough.
                IResourceLoader loader = new FilesystemResourceLoader(tempDir);
                var viaRelativeName = WordlistLoader.GetLines(loader.OpenResource("template.txt"), Encoding.UTF8).First();
                assertEquals("foobar", viaRelativeName);

                // Same with full path name:
                string fullPath    = (new FileInfo(templatePath)).ToString();
                var    viaFullPath = WordlistLoader.GetLines(loader.OpenResource(fullPath), Encoding.UTF8).First();
                assertEquals("foobar", viaFullPath);

                assertClasspathDelegation(loader);
                assertNotFound(loader);

                // now use RL without base dir:
                loader = new FilesystemResourceLoader();
                var viaAbsoluteName = WordlistLoader.GetLines(loader.OpenResource(new FileInfo(templatePath).FullName), Encoding.UTF8).First();
                assertEquals("foobar", viaAbsoluteName);

                assertClasspathDelegation(loader);
                assertNotFound(loader);
            }
            finally
            {
                // clean up: remove the files first, then the directory itself
                foreach (var leftover in tempDir.EnumerateFiles())
                {
                    leftover.Delete();
                }
                tempDir.Delete();
            }
        }
Ejemplo n.º 14
0
        /// <summary>
        /// Builds one <see cref="CharArraySet"/> from the listed word files, reading each
        /// file in snowball format. Otherwise the same as
        /// <see cref="GetWordSet(IResourceLoader, string, bool)"/>.
        /// </summary>
        /// <param name="loader"> resource loader used to open every listed file </param>
        /// <param name="wordFiles"> file name list; it is split with <c>SplitFileNames</c> </param>
        /// <param name="ignoreCase"> forwarded to the <see cref="CharArraySet"/> constructor </param>
        /// <returns> the filled set, or <c>null</c> when <paramref name="wordFiles"/> yields no file names </returns>
        protected CharArraySet GetSnowballWordSet(IResourceLoader loader, string wordFiles, bool ignoreCase)
        {
            AssureMatchVersion();

            IList<string> fileNames = SplitFileNames(wordFiles);
            if (fileNames.Count <= 0)
            {
                // Nothing to load: callers receive null rather than an empty set.
                return null;
            }

            // Start with room for 10 entries per file; the default stopword list has
            // about 35 words, but the set is deliberately not sized that large up front.
            CharArraySet result = new CharArraySet(m_luceneMatchVersion, fileNames.Count * 10, ignoreCase);

            foreach (string fileName in fileNames)
            {
                using (Stream input = loader.OpenResource(fileName.Trim()))
                {
                    using (TextReader utf8Reader = new StreamReader(input, Encoding.UTF8))
                    {
                        WordlistLoader.GetSnowballWordSet(utf8Reader, result);
                    }
                }
            }

            return result;
        }
Ejemplo n.º 15
0
 /// <summary>
 /// Opens <paramref name="resource"/> through <paramref name="loader"/> and returns
 /// its lines, with the content read as UTF-8.
 /// </summary>
 /// <param name="loader"> resource loader used to open the resource </param>
 /// <param name="resource"> name of the resource to read </param>
 /// <returns> the lines produced by <c>WordlistLoader.GetLines</c> </returns>
 protected internal IEnumerable <string> GetLines(IResourceLoader loader, string resource)
 {
     var resourceStream = loader.OpenResource(resource);

     return WordlistLoader.GetLines(resourceStream, Encoding.UTF8);
 }
Ejemplo n.º 16
0
 /// <summary>
 /// Opens <paramref name="resource"/> through <paramref name="loader"/> and returns
 /// its lines as a list, with the content read as UTF-8.
 /// </summary>
 /// <param name="loader"> resource loader used to open the resource </param>
 /// <param name="resource"> name of the resource to read </param>
 /// <returns> the lines produced by <c>WordlistLoader.GetLines</c> </returns>
 protected IList <string> GetLines(IResourceLoader loader, string resource)
 {
     var resourceStream = loader.OpenResource(resource);

     return WordlistLoader.GetLines(resourceStream, Encoding.UTF8);
 }
Ejemplo n.º 17
0
        /// <summary>
        /// Builds a <c>FilesystemResourceLoader</c> with a <c>null</c> base directory and a
        /// string-backed mock loader, then checks that the first line read for
        /// "template.txt" is the mock's content.
        /// </summary>
        public virtual void TestDelegation()
        {
            var             mockLoader = new StringMockResourceLoader("foobar\n");
            IResourceLoader rl         = new FilesystemResourceLoader(null, mockLoader);

            var firstLine = WordlistLoader.GetLines(rl.OpenResource("template.txt"), Encoding.UTF8).First();

            assertEquals("foobar", firstLine);
        }
Ejemplo n.º 18
0
 /// <summary>
 /// Opens <paramref name="resource"/> through <paramref name="loader"/> and returns
 /// its lines as a list, with the content read as UTF-8.
 /// </summary>
 /// <param name="loader"> resource loader used to open the resource </param>
 /// <param name="resource"> name of the resource to read </param>
 /// <returns> the lines produced by <c>WordlistLoader.getLines</c> </returns>
 protected internal IList <string> getLines(ResourceLoader loader, string resource)
 {
     var resourceStream = loader.openResource(resource);

     return WordlistLoader.getLines(resourceStream, StandardCharsets.UTF_8);
 }