/// <summary>
/// Variant of <seealso cref="#getWordSet(ResourceLoader, String, boolean)"/> for
/// word files written in snowball format.
/// </summary>
/// <param name="loader"> resource loader used to open every listed word file </param>
/// <param name="wordFiles"> file name list; it is split with <c>splitFileNames</c> </param>
/// <param name="ignoreCase"> forwarded to the <c>CharArraySet</c> constructor </param>
/// <returns>
/// the set filled from all listed files, or <c>null</c> when
/// <paramref name="wordFiles"/> yields no file names
/// </returns>
protected internal CharArraySet getSnowballWordSet(ResourceLoader loader, string wordFiles, bool ignoreCase)
{
    assureMatchVersion();

    IList<string> fileNames = splitFileNames(wordFiles);
    if (fileNames.Count <= 0)
    {
        // Nothing to load: callers receive null rather than an empty set.
        return null;
    }

    // default stopwords list has 35 or so words, but maybe don't make it that
    // big to start
    CharArraySet result = new CharArraySet(luceneMatchVersion, fileNames.Count * 10, ignoreCase);

    foreach (string fileName in fileNames)
    {
        InputStream input = null;
        TextReader wordReader = null;
        try
        {
            input = loader.openResource(fileName.Trim());

            // UTF-8 decoder configured with CodingErrorAction.REPORT for both
            // malformed input and unmappable characters.
            CharsetDecoder utf8Decoder = StandardCharsets.UTF_8.newDecoder()
                .onMalformedInput(CodingErrorAction.REPORT)
                .onUnmappableCharacter(CodingErrorAction.REPORT);

            wordReader = new InputStreamReader(input, utf8Decoder);
            WordlistLoader.getSnowballWordSet(wordReader, result);
        }
        finally
        {
            IOUtils.closeWhileHandlingException(wordReader, input);
        }
    }

    return result;
}
/// <summary>
/// Checks that <paramref name="rl"/> can open a snowball stopword file and can
/// create an instance of a class given its name.
/// </summary>
/// <param name="rl"> the resource loader under test </param>
private void assertClasspathDelegation(IResourceLoader rl)
{
    // LUCENENET specific - rather than being completely dependent on the location of the file
    // in the file system, we use the embedded resource to write the file to a known location
    // before passing it to our resource loader.
    string englishStopFile = "english_stop.txt";
    string tempPrefix = System.IO.Path.GetFileNameWithoutExtension(englishStopFile);
    string tempSuffix = System.IO.Path.GetExtension(englishStopFile);
    var file = CreateTempFile(tempPrefix, tempSuffix);

    // Copy the embedded stopword resource into the temp file.
    var resourceOwner = typeof(Snowball.SnowballFilter);
    using (var embedded = resourceOwner.GetTypeInfo().Assembly.FindAndGetManifestResourceStream(resourceOwner, englishStopFile))
    using (var target = new FileStream(file.FullName, FileMode.OpenOrCreate, FileAccess.Write))
    {
        embedded.CopyTo(target);
    }

    // try a stopwords file from classpath
    var stopwordReader = new System.IO.StreamReader(rl.OpenResource(file.FullName), Encoding.UTF8);
    CharArraySet set = WordlistLoader.GetSnowballWordSet(stopwordReader, TEST_VERSION_CURRENT);
    assertTrue(set.contains("you"));

    // try to load a class; we use string comparison because classloader may be different...
    object instance = rl.NewInstance<object>("Lucene.Net.Analysis.Util.RollingCharBuffer");
    assertEquals("Lucene.Net.Analysis.Util.RollingCharBuffer", instance.ToString());

    // theoretically classes should also be loadable:
    //IOUtils.CloseWhileHandlingException(rl.OpenResource("java/lang/String.class")); // LUCENENET TODO: Not sure what the equivalent to this is (or if there is one).
}
/// <summary>
/// Loads a word list using "#" as the comment marker and checks that neither
/// "#comment" nor "comment" is present in the resulting set.
/// </summary>
public virtual void TestComments()
{
    const string commentMarker = "#";
    string input = "ONE\n two \nthree\n#comment";

    CharArraySet loaded = WordlistLoader.GetWordSet(new StringReader(input), commentMarker, TEST_VERSION_CURRENT);

    CheckSet(loaded);
    assertFalse(loaded.contains("#comment"));
    assertFalse(loaded.contains("comment"));
}
/// <summary>
/// Loads a three-line word list from an in-memory reader and validates the
/// result with <c>CheckSet</c>.
/// </summary>
public virtual void TestWordlistLoading()
{
    string input = "ONE\n two \nthree";
    StringReader source = new StringReader(input);

    CharArraySet loaded = WordlistLoader.GetWordSet(source, TEST_VERSION_CURRENT);
    CheckSet(loaded);

    // TODO: Do we need to check for a "buffered reader" in .NET?
    //CharArraySet wordSet2 = WordlistLoader.GetWordSet(new System.IO.StreamReader(new StringReader(s)), TEST_VERSION_CURRENT);
    //CheckSet(wordSet2);
}
/// <summary>
/// Builds a <see cref="CharArraySet"/> from the words supplied by a reader.
/// The reader is passed to <c>IOUtils.Dispose</c> before this method returns,
/// whether or not loading succeeded.
/// </summary>
/// <param name="stopwords"> reader that supplies the stopwords </param>
/// <param name="matchVersion"> the Lucene version for cross version compatibility </param>
/// <returns> the <see cref="CharArraySet"/> produced by <c>WordlistLoader.GetWordSet</c> </returns>
/// <exception cref="IOException"> if loading the stopwords throws an <see cref="IOException"/> </exception>
protected static CharArraySet LoadStopwordSet(TextReader stopwords, LuceneVersion matchVersion)
{
    CharArraySet loaded;
    try
    {
        loaded = WordlistLoader.GetWordSet(stopwords, matchVersion);
    }
    finally
    {
        // Runs on both the success and the failure path.
        IOUtils.Dispose(stopwords);
    }

    return loaded;
}
/// <summary>
/// Checks that <paramref name="rl"/> can open the snowball English stopword file
/// and can create an instance of a class given its name.
/// </summary>
/// <param name="rl"> the resource loader under test </param>
private void assertClasspathDelegation(IResourceLoader rl)
{
    // Assemble the relative location from individual segments so that the platform's
    // own directory separator is used; a literal containing '\' only resolves on Windows.
    // NOTE(review): the location is still relative to the current working directory.
    string relativeStopFile = System.IO.Path.Combine(
        "..", "..", "..",
        "Lucene.Net.Analysis.Common", "Analysis", "Snowball", "english_stop.txt");
    string stopFilePath = System.IO.Path.GetFullPath(relativeStopFile);

    // try a stopwords file from classpath
    CharArraySet set = WordlistLoader.GetSnowballWordSet(new System.IO.StreamReader(rl.OpenResource(stopFilePath), Encoding.UTF8), TEST_VERSION_CURRENT);
    assertTrue(set.contains("you"));

    // try to load a class; we use string comparison because classloader may be different...
    assertEquals("Lucene.Net.Analysis.Util.RollingCharBuffer", rl.NewInstance<object>("Lucene.Net.Analysis.Util.RollingCharBuffer").ToString());

    // theoretically classes should also be loadable:
    //IOUtils.CloseWhileHandlingException(rl.OpenResource("java/lang/String.class")); // LUCENENET TODO: Not sure what the equivalent to this is (or if there is one).
}
/// <summary>
/// Builds a CharArraySet from the words supplied by a reader. The reader is
/// passed to <c>IOUtils.Close</c> before this method returns, whether or not
/// loading succeeded.
/// </summary>
/// <param name="stopwords"> reader that supplies the stopwords </param>
/// <param name="matchVersion"> the Lucene version for cross version compatibility </param>
/// <returns> the CharArraySet produced by <c>WordlistLoader.GetWordSet</c> </returns>
/// <exception cref="IOException"> if loading the stopwords throws an <seealso cref="IOException"/> </exception>
protected internal static CharArraySet loadStopwordSet(Reader stopwords, Version matchVersion)
{
    CharArraySet loaded;
    try
    {
        loaded = WordlistLoader.GetWordSet(stopwords, matchVersion);
    }
    finally
    {
        // Runs on both the success and the failure path.
        IOUtils.Close(stopwords);
    }

    return loaded;
}
/// <summary>
/// Builds a CharArraySet from a resource obtained through
/// <c>aClass.GetResourceAsStream(resource)</c>, decoded as UTF-8.
/// (See <seealso cref="Class#getResourceAsStream(String)"/>).
/// </summary>
/// <param name="ignoreCase">
/// <c>true</c> if the set should ignore the case of the stopwords, otherwise <c>false</c> </param>
/// <param name="aClass"> the class the resource is looked up through </param>
/// <param name="resource"> name of the resource file associated with the given class </param>
/// <param name="comment"> comment string to ignore in the stopword file </param>
/// <returns> the CharArraySet filled by <c>WordlistLoader.GetWordSet</c> </returns>
/// <exception cref="IOException"> if loading the stopwords throws an <seealso cref="IOException"/> </exception>
protected internal static CharArraySet LoadStopwordSet(bool ignoreCase, Type aClass, string resource, string comment)
{
    TextReader reader = null;
    try
    {
        reader = IOUtils.GetDecodingReader(aClass.GetResourceAsStream(resource), StandardCharsets.UTF_8);

        // Target set starts with an initial size argument of 16.
        CharArraySet target = new CharArraySet(Version.LUCENE_CURRENT, 16, ignoreCase);
        return WordlistLoader.GetWordSet(reader, comment, target);
    }
    finally
    {
        // reader is still null here if opening the resource failed.
        IOUtils.Close(reader);
    }
}
/// <summary>
/// Builds a CharArraySet from a stopword file that is decoded as UTF-8.
/// </summary>
/// <param name="stopwords"> the stopwords file to load </param>
/// <param name="matchVersion"> the Lucene version for cross version compatibility </param>
/// <returns> the CharArraySet produced by <c>WordlistLoader.GetWordSet</c> </returns>
/// <exception cref="IOException"> if loading the stopwords throws an <seealso cref="IOException"/> </exception>
protected internal static CharArraySet LoadStopwordSet(File stopwords, Version matchVersion)
{
    Reader decoded = null;
    try
    {
        decoded = IOUtils.GetDecodingReader(stopwords, StandardCharsets.UTF_8);
        CharArraySet loaded = WordlistLoader.GetWordSet(decoded, matchVersion);
        return loaded;
    }
    finally
    {
        // decoded is still null here if the file could not be opened.
        IOUtils.Close(decoded);
    }
}
/// <summary>
/// Builds a <see cref="CharArraySet"/> from a stopword file that is decoded as UTF-8.
/// </summary>
/// <param name="stopwords"> the stopwords file to load </param>
/// <param name="matchVersion"> the Lucene version for cross version compatibility </param>
/// <returns> the <see cref="CharArraySet"/> produced by <c>WordlistLoader.GetWordSet</c> </returns>
/// <exception cref="IOException"> if loading the stopwords throws an <see cref="IOException"/> </exception>
protected static CharArraySet LoadStopwordSet(FileInfo stopwords, LuceneVersion matchVersion)
{
    TextReader decoded = null;
    try
    {
        decoded = IOUtils.GetDecodingReader(stopwords, Encoding.UTF8);
        CharArraySet loaded = WordlistLoader.GetWordSet(decoded, matchVersion);
        return loaded;
    }
    finally
    {
        // decoded is still null here if the file could not be opened.
        IOUtils.Dispose(decoded);
    }
}
/// <summary>
/// Feeds a snowball-format list (comments introduced by '|', several words per
/// line) to <c>WordlistLoader.GetSnowballWordSet</c> and checks that exactly the
/// seven expected words are present.
/// </summary>
public virtual void TestSnowballListLoading()
{
    string input =
        "|comment\n" +               // commented line
        " |comment\n" +              // commented line with leading whitespace
        "\n" +                       // blank line
        " \t\n" +                    // line with only whitespace
        " |comment | comment\n" +    // commented line with comment
        "ONE\n" +                    // stopword, in uppercase
        " two \n" +                  // stopword with leading/trailing space
        " three four five \n" +      // multiple stopwords
        "six seven | comment\n";     // multiple stopwords + comment

    CharArraySet loaded = WordlistLoader.GetSnowballWordSet(new StringReader(input), TEST_VERSION_CURRENT);

    string[] expectedWords = { "ONE", "two", "three", "four", "five", "six", "seven" };
    assertEquals(7, loaded.size());
    foreach (string expected in expectedWords)
    {
        assertTrue(loaded.contains(expected));
    }
}
// LUCENENET TODO: If this works, need to update the documentation for the .NET version of the story.
/// <summary>
/// Builds a CharArraySet from a manifest resource that is looked up by name in
/// the assembly of <paramref name="aClass"/> and decoded as UTF-8.
/// </summary>
/// <param name="ignoreCase">
/// <c>true</c> if the set should ignore the case of the stopwords, otherwise <c>false</c> </param>
/// <param name="aClass"> a type whose assembly holds the resource </param>
/// <param name="resource"> name passed to <c>Assembly.GetManifestResourceStream</c> </param>
/// <param name="comment"> comment string to ignore in the stopword file </param>
/// <returns> the CharArraySet filled by <c>WordlistLoader.GetWordSet</c> </returns>
/// <exception cref="IOException"> if loading the stopwords throws an <seealso cref="IOException"/> </exception>
protected internal static CharArraySet LoadStopwordSet(bool ignoreCase, Type aClass, string resource, string comment)
{
    TextReader reader = null;
    try
    {
        var resourceStream = aClass.Assembly.GetManifestResourceStream(resource);
        reader = IOUtils.GetDecodingReader(resourceStream, Encoding.UTF8);

        // Warnings 612/618 are the compiler's "obsolete member" warnings; they are
        // suppressed only around the use of LUCENE_CURRENT.
#pragma warning disable 612, 618
        CharArraySet target = new CharArraySet(LuceneVersion.LUCENE_CURRENT, 16, ignoreCase);
#pragma warning restore 612, 618

        return WordlistLoader.GetWordSet(reader, comment, target);
    }
    finally
    {
        // reader is still null here if opening the resource failed.
        IOUtils.Close(reader);
    }
}
/// <summary>
/// Writes a one-line "template.txt" into a temp directory and checks that a
/// <c>FilesystemResourceLoader</c> reads it back, both when constructed with that
/// directory and when constructed without a base directory.
/// </summary>
public virtual void TestBaseDir()
{
    DirectoryInfo @base = CreateTempDir("fsResourceLoaderBase");
    try
    {
        // Location of the fixture file inside the temp directory.
        string templatePath = System.IO.Path.Combine(@base.FullName, "template.txt");

        TextWriter os = new StreamWriter(new FileStream(templatePath, FileMode.Create, FileAccess.Write), Encoding.UTF8);
        try
        {
            os.Write("foobar\n");
        }
        finally
        {
            IOUtils.DisposeWhileHandlingException(os);
        }

        // Loader rooted at the temp directory: the relative name resolves.
        IResourceLoader rl = new FilesystemResourceLoader(@base);
        string viaRelativeName = WordlistLoader.GetLines(rl.OpenResource("template.txt"), Encoding.UTF8).First();
        assertEquals("foobar", viaRelativeName);

        // Same with full path name:
        string fullPath = (new FileInfo(templatePath)).ToString();
        string viaFullPath = WordlistLoader.GetLines(rl.OpenResource(fullPath), Encoding.UTF8).First();
        assertEquals("foobar", viaFullPath);

        assertClasspathDelegation(rl);
        assertNotFound(rl);

        // now use RL without base dir:
        rl = new FilesystemResourceLoader();
        string viaNoBaseDir = WordlistLoader.GetLines(rl.OpenResource(new FileInfo(templatePath).FullName), Encoding.UTF8).First();
        assertEquals("foobar", viaNoBaseDir);

        assertClasspathDelegation(rl);
        assertNotFound(rl);
    }
    finally
    {
        // clean up
        foreach (var leftover in @base.EnumerateFiles())
        {
            leftover.Delete();
        }

        @base.Delete();
    }
}
/// <summary>
/// Variant of <see cref="GetWordSet(IResourceLoader, string, bool)"/> for word
/// files written in snowball format.
/// </summary>
/// <param name="loader"> resource loader used to open every listed word file </param>
/// <param name="wordFiles"> file name list; it is split with <c>SplitFileNames</c> </param>
/// <param name="ignoreCase"> forwarded to the <c>CharArraySet</c> constructor </param>
/// <returns>
/// the set filled from all listed files, or <c>null</c> when
/// <paramref name="wordFiles"/> yields no file names
/// </returns>
protected CharArraySet GetSnowballWordSet(IResourceLoader loader, string wordFiles, bool ignoreCase)
{
    AssureMatchVersion();

    IList<string> fileNames = SplitFileNames(wordFiles);
    if (fileNames.Count <= 0)
    {
        // Nothing to load: callers receive null rather than an empty set.
        return null;
    }

    // default stopwords list has 35 or so words, but maybe don't make it that
    // big to start
    CharArraySet result = new CharArraySet(m_luceneMatchVersion, fileNames.Count * 10, ignoreCase);

    foreach (string fileName in fileNames)
    {
        using (Stream input = loader.OpenResource(fileName.Trim()))
        {
            using (TextReader wordReader = new StreamReader(input, Encoding.UTF8))
            {
                WordlistLoader.GetSnowballWordSet(wordReader, result);
            }
        }
    }

    return result;
}
/// <summary>
/// Opens <paramref name="resource"/> through <paramref name="loader"/> and returns
/// its lines, reading the content as UTF-8.
/// </summary>
/// <param name="loader"> resource loader used to open the resource </param>
/// <param name="resource"> name handed to <c>loader.OpenResource</c> </param>
/// <returns> the lines produced by <c>WordlistLoader.GetLines</c> </returns>
protected internal IEnumerable<string> GetLines(IResourceLoader loader, string resource)
{
    var resourceStream = loader.OpenResource(resource);
    return WordlistLoader.GetLines(resourceStream, Encoding.UTF8);
}
/// <summary>
/// Opens <paramref name="resource"/> through <paramref name="loader"/> and returns
/// its lines, reading the content as UTF-8.
/// </summary>
/// <param name="loader"> resource loader used to open the resource </param>
/// <param name="resource"> name handed to <c>loader.OpenResource</c> </param>
/// <returns> the list of lines produced by <c>WordlistLoader.GetLines</c> </returns>
protected IList<string> GetLines(IResourceLoader loader, string resource)
{
    var resourceStream = loader.OpenResource(resource);
    return WordlistLoader.GetLines(resourceStream, Encoding.UTF8);
}
/// <summary>
/// Builds a <c>FilesystemResourceLoader</c> with a <c>null</c> first argument and a
/// <c>StringMockResourceLoader</c> as second argument, then checks that opening
/// "template.txt" yields the mock's content.
/// </summary>
public virtual void TestDelegation()
{
    var mockLoader = new StringMockResourceLoader("foobar\n");
    IResourceLoader rl = new FilesystemResourceLoader(null, mockLoader);

    string firstLine = WordlistLoader.GetLines(rl.OpenResource("template.txt"), Encoding.UTF8).First();
    assertEquals("foobar", firstLine);
}
/// <summary>
/// Opens <paramref name="resource"/> through <paramref name="loader"/> and returns
/// its lines, reading the content as UTF-8.
/// </summary>
/// <param name="loader"> resource loader used to open the resource </param>
/// <param name="resource"> name handed to <c>loader.openResource</c> </param>
/// <returns> the list of lines produced by <c>WordlistLoader.getLines</c> </returns>
protected internal IList<string> getLines(ResourceLoader loader, string resource)
{
    var resourceStream = loader.openResource(resource);
    return WordlistLoader.getLines(resourceStream, StandardCharsets.UTF_8);
}