/// <summary>
/// Collects the stems of <paramref name="word"/>, keeping only the first stem for each stem text.
/// </summary>
/// <param name="word">Word to find the stems for.</param>
/// <returns>
/// The distinct stems. When the dictionary contains the word itself, it is the first entry.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="word"/> is <c>null</c>.</exception>
public IEnumerable<HunspellStem> UniqueStems(String word)
{
    if (word == null)
    {
        throw new ArgumentNullException("word");
    }

    var result = new List<HunspellStem>();
    // Stem texts that have already been put into the result.
    var seen = new CharArraySet(8, false);

    bool knownWord = _dictionary.LookupWord(word) != null;
    if (knownWord)
    {
        result.Add(new HunspellStem(word));
        seen.Add(word);
    }

    foreach (var candidate in Stem(word, null, 0))
    {
        if (seen.Contains(candidate.Stem))
        {
            // Same stem text was produced earlier; keep only the first occurrence.
            continue;
        }

        result.Add(candidate);
        seen.Add(candidate.Stem);
    }

    return result;
}
/// <summary>
/// Loads stopwords written in the Snowball list format into <paramref name="result"/>.
/// <para>
/// The snowball format is the following:
/// <list type="bullet">
///     <item><description>Lines may contain multiple words separated by whitespace.</description></item>
///     <item><description>The comment character is the vertical line (|).</description></item>
///     <item><description>Lines may contain trailing comments.</description></item>
/// </list>
/// </para>
/// </summary>
/// <param name="reader"> <see cref="TextReader"/> containing a Snowball stopword list; it is passed to <c>IOUtils.Dispose</c> when reading ends, even on failure </param>
/// <param name="result"> the <see cref="CharArraySet"/> to fill with the reader's words </param>
/// <returns> the given <see cref="CharArraySet"/> with the reader's words </returns>
public static CharArraySet GetSnowballWordSet(TextReader reader, CharArraySet result)
{
    try
    {
        for (string rawLine = reader.ReadLine(); rawLine != null; rawLine = reader.ReadLine())
        {
            // Everything from the first '|' onwards is a comment.
            int bar = rawLine.IndexOf('|');
            string content = bar < 0 ? rawLine : rawLine.Substring(0, bar);

            foreach (string token in WHITESPACE.Split(content).TrimEnd())
            {
                if (token.Length == 0)
                {
                    // The split can yield empty pieces (e.g. for leading whitespace); skip them.
                    continue;
                }

                result.Add(token);
            }
        }
    }
    finally
    {
        IOUtils.Dispose(reader);
    }

    return result;
}
/// <summary>
/// Advances the input until it yields a token whose term has not already been
/// recorded since the last token with a positive position increment.
/// </summary>
/// <returns><c>true</c> if such a token was found; <c>false</c> once the input has no more tokens.</returns>
public override bool IncrementToken()
{
    while (input.IncrementToken())
    {
        char[] buffer = termAttribute.Buffer();
        int termLength = termAttribute.Length;
        int increment = posIncAttribute.PositionIncrement;

        // A positive increment means earlier terms can no longer collide with this one.
        if (increment > 0)
        {
            previous.Clear();
        }

        // Only a token with a zero increment can be a duplicate of a recorded term.
        bool alreadySeen = increment == 0 && previous.Contains(buffer, 0, termLength);

        // Record a private copy of the term (done for duplicates as well).
        char[] snapshot = new char[termLength];
        Array.Copy(buffer, snapshot, termLength);
        previous.Add(snapshot);

        if (alreadySeen)
        {
            continue;
        }

        return true;
    }

    return false;
}
/// <summary>
/// Returns the stems of the given word with repeated stem texts removed.
/// </summary>
/// <param name="word">Word to find the stems for.</param>
/// <returns>
/// List of stems for the word; the word itself is listed first when the dictionary lookup succeeds.
/// </returns>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="word"/> is <c>null</c>.</exception>
public IEnumerable<HunspellStem> UniqueStems(String word)
{
    if (word == null)
    {
        throw new ArgumentNullException("word");
    }

    List<HunspellStem> uniqueStems = new List<HunspellStem>();
    CharArraySet emitted = new CharArraySet(8, false);

    // The word is its own stem when the dictionary knows it.
    if (_dictionary.LookupWord(word) != null)
    {
        uniqueStems.Add(new HunspellStem(word));
        emitted.Add(word);
    }

    var derivedStems = Stem(word, null, 0);
    foreach (var derived in derivedStems)
    {
        bool isNew = !emitted.Contains(derived.Stem);
        if (isNew)
        {
            uniqueStems.Add(derived);
            emitted.Add(derived.Stem);
        }
    }

    return uniqueStems;
}
public virtual void TestRehash()
{
    // The set is created with 0 as its second argument (presumably the initial size - confirm),
    // then every stop word is added and must be found again afterwards.
    var words = new CharArraySet(TEST_VERSION_CURRENT, 0, true);
    foreach (var stopWord in TEST_STOP_WORDS)
    {
        words.Add(stopWord);
    }

    assertEquals(TEST_STOP_WORDS.Length, words.size());

    foreach (var stopWord in TEST_STOP_WORDS)
    {
        assertTrue(words.Contains(stopWord));
    }
}
public virtual void TestObjectContains()
{
    // A non-string value added to the set must be found again through
    // the same value, an equal value, its string form and its char[] form.
    int? one = Convert.ToInt32(1);
    CharArraySet mutable = new CharArraySet(TEST_VERSION_CURRENT, 10, true);
    mutable.Add(one);

    assertTrue(mutable.Contains(one));
    assertTrue(mutable.Contains(new int?(1))); // another integer
    assertTrue(mutable.Contains("1"));
    assertTrue(mutable.Contains(new[] { '1' }));

    // The same lookups must succeed on the unmodifiable wrapper.
    CharArraySet readOnly = CharArraySet.UnmodifiableSet(mutable);

    assertTrue(readOnly.Contains(one));
    assertTrue(readOnly.Contains(new int?(1))); // another integer
    assertTrue(readOnly.Contains("1"));
    assertTrue(readOnly.Contains(new[] { '1' }));
}
// LUCENENET TODO: Add .NET overloads that accept a file name? Or at least a FileInfo object as was done in 3.0.3?
/// <summary>
/// Adds every line of a <see cref="TextReader"/>, with leading and trailing whitespace removed,
/// to a <see cref="CharArraySet"/>. Each line is expected to hold exactly one word; blank lines
/// are added as empty entries. The words need to be in lowercase if you make use of an
/// <see cref="Analyzer"/> which uses <see cref="Core.LowerCaseFilter"/> (like <see cref="Standard.StandardAnalyzer"/>).
/// </summary>
/// <param name="reader"> <see cref="TextReader"/> containing the wordlist; it is passed to <c>IOUtils.Dispose</c> when reading ends, even on failure </param>
/// <param name="result"> the <see cref="CharArraySet"/> to fill with the reader's words </param>
/// <returns> the given <see cref="CharArraySet"/> with the reader's words </returns>
public static CharArraySet GetWordSet(TextReader reader, CharArraySet result)
{
    try
    {
        for (string line = reader.ReadLine(); line != null; line = reader.ReadLine())
        {
            result.Add(line.Trim());
        }
    }
    finally
    {
        IOUtils.Dispose(reader);
    }

    return result;
}
/// <summary>
/// Adds every non-comment line of a <see cref="TextReader"/>, with leading and trailing whitespace
/// removed, to a <see cref="CharArraySet"/>. A line is a comment when it starts (ordinal comparison,
/// before trimming) with <paramref name="comment"/>. Each line is expected to hold exactly one word.
/// The words need to be in lowercase if you make use of an
/// <see cref="Analyzer"/> which uses <see cref="Core.LowerCaseFilter"/> (like <see cref="Standard.StandardAnalyzer"/>).
/// </summary>
/// <param name="reader"> <see cref="TextReader"/> containing the wordlist; it is passed to <c>IOUtils.Dispose</c> when reading ends, even on failure </param>
/// <param name="comment"> The string representing a comment. </param>
/// <param name="result"> the <see cref="CharArraySet"/> to fill with the reader's words </param>
/// <returns> the given <see cref="CharArraySet"/> with the reader's words </returns>
public static CharArraySet GetWordSet(TextReader reader, string comment, CharArraySet result)
{
    try
    {
        for (string line = reader.ReadLine(); line != null; line = reader.ReadLine())
        {
            if (line.StartsWith(comment, StringComparison.Ordinal))
            {
                // Comment line: nothing to add.
                continue;
            }

            result.Add(line.Trim());
        }
    }
    finally
    {
        IOUtils.Dispose(reader);
    }

    return result;
}
/// <summary>
/// Stems the provided word and removes repeated stems from the result.
/// </summary>
/// <param name="word"> Word to find the stems for </param>
/// <param name="length"> Passed through to <c>Stem</c> together with <paramref name="word"/> (presumably the number of valid chars in the buffer - confirm) </param>
/// <returns> List of stems for the word, each stem appearing once, in the order they were produced </returns>
public IList<CharsRef> UniqueStems(char[] word, int length)
{
    IList<CharsRef> candidates = Stem(word, length);

    // Zero or one stem cannot contain duplicates; hand the list back as-is.
    if (candidates.Count <= 1)
    {
        return candidates;
    }

    var seen = new CharArraySet(LuceneVersion.LUCENE_CURRENT, 8, dictionary.ignoreCase);
    IList<CharsRef> unique = new List<CharsRef>();
    foreach (CharsRef stem in candidates)
    {
        if (seen.Contains(stem))
        {
            continue;
        }

        unique.Add(stem);
        seen.Add(stem);
    }

    return unique;
}
/// <summary>
/// Builds the unmodifiable stop-word list once, from the file "Stopword.txt" in
/// <c>LuceneNetConfig.LuceneDictDirectory</c>. When the file does not exist the list is empty.
/// </summary>
static StopWord()
{
    var words = new CharArraySet(0, true);
    string stopWordFile = Path.Combine(LuceneNetConfig.LuceneDictDirectory, "Stopword.txt");

    if (File.Exists(stopWordFile))
    {
        // The encoding is obtained from the project helper for this file.
        Encoding fileEncoding = EncodingType.GetType(stopWordFile);
        using (var fileReader = new StreamReader(stopWordFile, fileEncoding))
        {
            while (!fileReader.EndOfStream)
            {
                string entry = fileReader.ReadLine();
                if (entry == null)
                {
                    continue;
                }

                // Each line is stored verbatim (no trimming).
                words.Add(entry);
            }
        }
    }

    // English stop words: the StandardAnalyzer we use already applies the English stop words,
    // so they do not need to be added here as well.
    //charArraySet.AddAll(StopAnalyzer.ENGLISH_STOP_WORDS_SET);
    _StopWordList = CharArraySet.UnmodifiableSet(words);
}
// LUCENENET TODO: Add .NET overloads that accept a file name? Or at least a FileInfo object as was done in 3.0.3?
/// <summary>
/// Reads a Reader line by line and stores each line, stripped of leading and trailing
/// whitespace, as an entry of a CharArraySet. Every line of the Reader should contain only
/// one word. The words need to be in lowercase if you make use of an
/// Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
/// </summary>
/// <param name="reader"> Reader containing the wordlist; it is passed to <c>IOUtils.Close</c> when reading ends, even on failure </param>
/// <param name="result"> the <seealso cref="CharArraySet"/> to fill with the reader's words </param>
/// <returns> the given <seealso cref="CharArraySet"/> with the reader's words </returns>
public static CharArraySet GetWordSet(TextReader reader, CharArraySet result)
{
    try
    {
        string current = reader.ReadLine();
        while (current != null)
        {
            result.Add(current.Trim());
            current = reader.ReadLine();
        }
    }
    finally
    {
        IOUtils.Close(reader);
    }

    return result;
}
public virtual void TestCopyCharArraySet()
{
    // Checks that CharArraySet.Copy yields sets with the same content and case handling
    // as their sources, and that changes to a copy do not leak back into the source sets.
    CharArraySet setIngoreCase = new CharArraySet(TEST_VERSION_CURRENT, 10, true);
    CharArraySet setCaseSensitive = new CharArraySet(TEST_VERSION_CURRENT, 10, false);

    IList<string> stopwords = TEST_STOP_WORDS;
    IList<string> stopwordsUpper = new List<string>();
    foreach (string @string in stopwords)
    {
        // Upper-case with the invariant culture so the fixture does not depend on the
        // thread's current culture (culture-sensitive ToUpper() maps 'i' to a dotted
        // capital under e.g. tr-TR, which would change what this test checks).
        stopwordsUpper.Add(@string.ToUpperInvariant());
    }

    setIngoreCase.addAll(TEST_STOP_WORDS);
    setIngoreCase.Add(Convert.ToInt32(1));
    setCaseSensitive.addAll(TEST_STOP_WORDS);
    setCaseSensitive.Add(Convert.ToInt32(1));

    CharArraySet copy = CharArraySet.Copy(TEST_VERSION_CURRENT, setIngoreCase);
    CharArraySet copyCaseSens = CharArraySet.Copy(TEST_VERSION_CURRENT, setCaseSensitive);

    assertEquals(setIngoreCase.size(), copy.size());
    // NOTE(review): this compares against 'copy', not 'copyCaseSens'; both source sets
    // receive the same entries above, so the sizes are expected to match either way.
    assertEquals(setCaseSensitive.size(), copy.size());

    // The case-insensitive copy matches both spellings ...
    assertTrue(copy.containsAll(stopwords));
    assertTrue(copy.containsAll(stopwordsUpper));
    // ... while the case-sensitive copy only matches the original spelling.
    assertTrue(copyCaseSens.containsAll(stopwords));
    foreach (string @string in stopwordsUpper)
    {
        assertFalse(copyCaseSens.contains(@string));
    }

    // test adding terms to the copy
    IList<string> newWords = new List<string>();
    foreach (string @string in stopwords)
    {
        newWords.Add(@string + "_1");
    }
    copy.addAll(newWords);

    assertTrue(copy.containsAll(stopwords));
    assertTrue(copy.containsAll(stopwordsUpper));
    assertTrue(copy.containsAll(newWords));

    // new added terms are not in the source set
    foreach (string @string in newWords)
    {
        assertFalse(setIngoreCase.contains(@string));
        assertFalse(setCaseSensitive.contains(@string));
    }
}
public virtual void TestUnmodifiableSet()
{
    // Wrapping a set with UnmodifiableSet must keep its size and content,
    // and wrapping null must be rejected.
    CharArraySet source = new CharArraySet(TEST_VERSION_CURRENT, 10, true);
    source.AddAll(TEST_STOP_WORDS);
    source.Add(Convert.ToInt32(1));
    int expectedSize = source.size();

    CharArraySet readOnly = CharArraySet.UnmodifiableSet(source);
    assertEquals("Set size changed due to unmodifiableSet call", expectedSize, readOnly.size());

    foreach (var word in TEST_STOP_WORDS)
    {
        assertTrue(readOnly.Contains(word));
    }

    // The integer entry is reachable as a number, as a string and as a char[].
    assertTrue(readOnly.Contains(Convert.ToInt32(1)));
    assertTrue(readOnly.Contains("1"));
    assertTrue(readOnly.Contains(new char[] { '1' }));

    try
    {
        CharArraySet.UnmodifiableSet(null);
        fail("can not make null unmodifiable");
    }
    catch (System.ArgumentNullException)
    {
        // expected
        // NOTE: In .NET an ArgumentNullException is thrown here, not a NullReferenceException.
    }
}
/// <summary>
/// Computes the stems of the provided word and filters out repeated stems.
/// </summary>
/// <param name="word"> Word to find the stems for </param>
/// <param name="length"> Passed through to <c>Stem</c> together with <paramref name="word"/> (presumably the number of valid chars in the buffer - confirm) </param>
/// <returns> List of stems for the word without duplicates, in the order they were produced </returns>
public IList<CharsRef> UniqueStems(char[] word, int length)
{
    IList<CharsRef> allStems = Stem(word, length);
    bool mayContainDuplicates = allStems.Count >= 2;
    if (!mayContainDuplicates)
    {
        return allStems;
    }

    // Warnings 612/618 (obsolete member usage) are suppressed for the LUCENE_CURRENT reference.
#pragma warning disable 612, 618
    CharArraySet seen = new CharArraySet(LuceneVersion.LUCENE_CURRENT, 8, dictionary.ignoreCase);
#pragma warning restore 612, 618

    IList<CharsRef> distinct = new List<CharsRef>();
    foreach (CharsRef stem in allStems)
    {
        if (seen.Contains(stem))
        {
            continue;
        }

        distinct.Add(stem);
        seen.Add(stem);
    }

    return distinct;
}
/// <summary>
/// Reads stopwords from a stopword list in Snowball format.
/// <para>
/// The snowball format is the following:
/// <list type="bullet">
///     <item><description>Lines may contain multiple words separated by whitespace.</description></item>
///     <item><description>The comment character is the vertical line (|).</description></item>
///     <item><description>Lines may contain trailing comments.</description></item>
/// </list>
/// </para>
/// </summary>
/// <param name="reader"> Reader containing a Snowball stopword list; it is passed to <c>IOUtils.Close</c> when reading ends, even on failure </param>
/// <param name="result"> the <seealso cref="CharArraySet"/> to fill with the reader's words </param>
/// <returns> the given <seealso cref="CharArraySet"/> with the reader's words </returns>
public static CharArraySet GetSnowballWordSet(TextReader reader, CharArraySet result)
{
    try
    {
        string line = null;
        while ((line = reader.ReadLine()) != null)
        {
            // Drop the trailing comment, if any.
            int comment = line.IndexOf('|');
            if (comment >= 0)
            {
                line = line.Substring(0, comment);
            }

            // A null separator array makes Split break on every white-space character
            // (tabs included), as the format description requires; splitting on ' ' only
            // would store tab-separated words as a single entry. Empty pieces are removed
            // by the split itself.
            string[] words = line.Split((char[])null, StringSplitOptions.RemoveEmptyEntries);
            foreach (string word in words)
            {
                result.Add(word);
            }
        }
    }
    finally
    {
        IOUtils.Close(reader);
    }

    return result;
}
public virtual void TestModifyOnUnmodifiable()
{
    // Every mutating call on an unmodifiable set must throw NotSupportedException
    // and leave both the content and the size of the set unchanged.
    CharArraySet source = new CharArraySet(TEST_VERSION_CURRENT, 10, true);
    source.AddAll(TEST_STOP_WORDS);
    int expectedSize = source.size();

    CharArraySet readOnly = CharArraySet.UnmodifiableSet(source);
    assertEquals("Set size changed due to unmodifiableSet call", expectedSize, readOnly.size());

    string absentWord = "SirGallahad";
    assertFalse("Test String already exists in set", readOnly.Contains(absentWord));

    // Add as char[]
    try
    {
        readOnly.Add(absentWord.ToCharArray());
        fail("Modified unmodifiable set");
    }
    catch (System.NotSupportedException)
    {
        // expected
        assertFalse("Test String has been added to unmodifiable set", readOnly.contains(absentWord));
        assertEquals("Size of unmodifiable set has changed", expectedSize, readOnly.size());
    }

    // Add as string
    try
    {
        readOnly.add(absentWord);
        fail("Modified unmodifiable set");
    }
    catch (System.NotSupportedException)
    {
        // expected
        assertFalse("Test String has been added to unmodifiable set", readOnly.contains(absentWord));
        assertEquals("Size of unmodifiable set has changed", expectedSize, readOnly.size());
    }

    // Add as StringBuilder
    try
    {
        readOnly.Add(new StringBuilder(absentWord));
        fail("Modified unmodifiable set");
    }
    catch (System.NotSupportedException)
    {
        // expected
        assertFalse("Test String has been added to unmodifiable set", readOnly.contains(absentWord));
        assertEquals("Size of unmodifiable set has changed", expectedSize, readOnly.size());
    }

    // Clear
    try
    {
        readOnly.clear();
        fail("Modified unmodifiable set");
    }
    catch (System.NotSupportedException)
    {
        // expected
        assertFalse("Changed unmodifiable set", readOnly.contains(absentWord));
        assertEquals("Size of unmodifiable set has changed", expectedSize, readOnly.size());
    }

    // Add as string, repeated after the failed clear
    try
    {
        readOnly.add(absentWord);
        fail("Modified unmodifiable set");
    }
    catch (System.NotSupportedException)
    {
        // expected
        assertFalse("Test String has been added to unmodifiable set", readOnly.contains(absentWord));
        assertEquals("Size of unmodifiable set has changed", expectedSize, readOnly.size());
    }

    // NOTE: The removeAll check below stays disabled because it results in a StackOverflow
    // exception. removeAll is not a public member of CharArraySet but an extension method of
    // the test fixture (which apparently has a bug), so this check is non-critical.
    //// This test was changed in 3.1, as a contains() call on the given Collection using the "correct" iterator's
    //// current key (now a char[]) on a Set<String> would not hit any element of the CAS and therefor never call
    //// remove() on the iterator
    //try
    //{
    //    set.removeAll(new CharArraySet(TEST_VERSION_CURRENT, TEST_STOP_WORDS, true));
    //    fail("Modified unmodifiable set");
    //}
    //catch (System.NotSupportedException)
    //{
    //    // expected
    //    assertEquals("Size of unmodifiable set has changed", size, set.size());
    //}

    #region Added for better .NET support
    // Added for .NET: exercises the Remove method directly, since the extension
    // method used by the disabled check above fails to execute.
    try
    {
#pragma warning disable 612, 618
        readOnly.Remove(TEST_STOP_WORDS[0]);
#pragma warning restore 612, 618
        fail("Modified unmodifiable set");
    }
    catch (System.NotSupportedException)
    {
        // expected
        assertEquals("Size of unmodifiable set has changed", expectedSize, readOnly.size());
    }
    #endregion

    // retainAll
    try
    {
        readOnly.retainAll(new CharArraySet(TEST_VERSION_CURRENT, new[] { absentWord }, true));
        fail("Modified unmodifiable set");
    }
    catch (System.NotSupportedException)
    {
        // expected
        assertEquals("Size of unmodifiable set has changed", expectedSize, readOnly.size());
    }

    // addAll
    try
    {
        readOnly.addAll(new[] { absentWord });
        fail("Modified unmodifiable set");
    }
    catch (System.NotSupportedException)
    {
        // expected
        assertFalse("Test String has been added to unmodifiable set", readOnly.contains(absentWord));
    }

    // LUCENENET Specific - added to test .NETified UnionWith method
    try
    {
        readOnly.UnionWith(new[] { absentWord });
        fail("Modified unmodifiable set");
    }
    catch (System.NotSupportedException)
    {
        // expected
        assertFalse("Test String has been added to unmodifiable set", readOnly.contains(absentWord));
    }

    // After all rejected modifications the original words must still be present.
    foreach (var stopWord in TEST_STOP_WORDS)
    {
        assertTrue(readOnly.contains(stopWord));
    }
}
/// <summary>
/// Builds a stop-word set from the string representation of every element of <paramref name="stopWords"/>.
/// </summary>
/// <param name="stopWords">A List of Strings or char[] or any other toString()-able list representing the stopwords </param>
/// <param name="ignoreCase">if true, all words are lower cased first</param>
/// <returns>A Set (<see cref="CharArraySet"/>) containing the words</returns>
public static ISet<string> MakeStopSet(IList<object> stopWords, bool ignoreCase)
{
    // Size the set for the number of incoming words.
    CharArraySet words = new CharArraySet(stopWords.Count, ignoreCase);

    foreach (object entry in stopWords)
    {
        string text = entry.ToString();
        words.Add(text);
    }

    return words;
}
/// <summary>
/// Reads a Reader line by line and stores each non-comment line, stripped of leading and
/// trailing whitespace, as an entry of a CharArraySet. A line counts as a comment when it
/// starts (ordinal comparison, before trimming) with <paramref name="comment"/>.
/// Every line of the Reader should contain only one word. The words need to be in lowercase
/// if you make use of an Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
/// </summary>
/// <param name="reader"> Reader containing the wordlist; it is passed to <c>IOUtils.Close</c> when reading ends, even on failure </param>
/// <param name="comment"> The string representing a comment. </param>
/// <param name="result"> the <seealso cref="CharArraySet"/> to fill with the reader's words </param>
/// <returns> the given <seealso cref="CharArraySet"/> with the reader's words </returns>
public static CharArraySet GetWordSet(TextReader reader, string comment, CharArraySet result)
{
    try
    {
        string current = reader.ReadLine();
        while (current != null)
        {
            bool isComment = current.StartsWith(comment, StringComparison.Ordinal);
            if (!isComment)
            {
                result.Add(current.Trim());
            }

            current = reader.ReadLine();
        }
    }
    finally
    {
        IOUtils.Close(reader);
    }

    return result;
}