/// <summary>
        ///   Collects the stems of <paramref name="word"/>, keeping only the first
        ///   occurrence of each stem text and preserving the order of discovery.
        /// </summary>
        /// <param name="word">The word whose stems are requested.</param>
        /// <returns>
        ///   The de-duplicated stems. When the dictionary lookup for the word succeeds,
        ///   the word itself is the first entry.
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="word"/> is <c>null</c>.</exception>
        public IEnumerable <HunspellStem> UniqueStems(String word)
        {
            if (word == null)
            {
                throw new ArgumentNullException("word");
            }

            // Tracks the stem texts already emitted so later duplicates are dropped.
            var seen   = new CharArraySet(8, false);
            var unique = new List <HunspellStem>();

            // A word the dictionary knows directly is reported as its own stem.
            if (_dictionary.LookupWord(word) != null)
            {
                unique.Add(new HunspellStem(word));
                seen.Add(word);
            }

            foreach (var candidate in Stem(word, null, 0))
            {
                var text = candidate.Stem;
                if (seen.Contains(text))
                {
                    continue;
                }

                unique.Add(candidate);
                seen.Add(text);
            }

            return unique;
        }
Example #2
0
 /// <summary>
 /// Fills a <see cref="CharArraySet"/> with the stopwords of a Snowball-format list.
 /// <para>
 /// Rules of the Snowball format as handled here:
 /// <list type="bullet">
 ///     <item><description>A line may hold several words; they are split with the <c>WHITESPACE</c> pattern.</description></item>
 ///     <item><description>The vertical line (&#124;) starts a comment that runs to the end of the line.</description></item>
 ///     <item><description>A comment may follow words on the same line.</description></item>
 /// </list>
 /// </para>
 /// The reader is always handed to <c>IOUtils.Dispose</c>, even when reading fails.
 /// </summary>
 /// <param name="reader"> <see cref="TextReader"/> supplying the Snowball stopword list </param>
 /// <param name="result"> the <see cref="CharArraySet"/> that receives the words </param>
 /// <returns> <paramref name="result"/>, after the words have been added </returns>
 public static CharArraySet GetSnowballWordSet(TextReader reader, CharArraySet result)
 {
     try
     {
         for (string line = reader.ReadLine(); line != null; line = reader.ReadLine())
         {
             // Drop everything from the first comment marker onwards.
             int bar = line.IndexOf('|');
             string content = bar >= 0 ? line.Substring(0, bar) : line;

             foreach (var token in WHITESPACE.Split(content).TrimEnd())
             {
                 if (token.Length == 0)
                 {
                     continue;
                 }
                 result.Add(token);
             }
         }
     }
     finally
     {
         IOUtils.Dispose(reader);
     }

     return result;
 }
Example #3
0
        /// <summary>
        /// Advances the wrapped input until a token is found that is not a repeat of a
        /// term already seen at the current position. A term counts as a repeat when its
        /// position increment is 0 and <c>previous</c> already contains its text; the set
        /// of seen terms is cleared whenever a token with a positive increment arrives.
        /// </summary>
        /// <returns><c>true</c> when a token was kept; <c>false</c> once the input is exhausted.</returns>
        public override bool IncrementToken()
        {
            while (input.IncrementToken())
            {
                char[] buffer     = termAttribute.Buffer();
                int    termLength = termAttribute.Length;
                int    increment  = posIncAttribute.PositionIncrement;

                bool alreadySeen;
                if (increment > 0)
                {
                    // New position: forget the terms collected for the previous one.
                    previous.Clear();
                    alreadySeen = false;
                }
                else
                {
                    alreadySeen = increment == 0 && previous.Contains(buffer, 0, termLength);
                }

                // Store a private copy of the term text; the attribute buffer is read again on the next token.
                char[] snapshot = new char[termLength];
                Array.Copy(buffer, snapshot, termLength);
                previous.Add(snapshot);

                if (alreadySeen)
                {
                    continue;
                }

                return true;
            }

            return false;
        }
        /// <summary>
        ///   Returns the stems of <paramref name="word"/> with repeated stem texts removed.
        /// </summary>
        /// <param name="word">Word to stem; must not be <c>null</c>.</param>
        /// <returns>
        ///   Stems in the order they were found; the word itself comes first when the
        ///   dictionary lookup for it succeeds.
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="word"/> is <c>null</c>.</exception>
        public IEnumerable<HunspellStem> UniqueStems(String word) {
            if (word == null) {
                throw new ArgumentNullException("word");
            }

            var seenTerms = new CharArraySet(8, false);
            var result = new List<HunspellStem>();

            // The unmodified word is its own stem if the dictionary contains it.
            var isKnownWord = _dictionary.LookupWord(word) != null;
            if (isKnownWord) {
                result.Add(new HunspellStem(word));
                seenTerms.Add(word);
            }

            foreach (var stem in Stem(word, null, 0)) {
                if (seenTerms.Contains(stem.Stem)) {
                    continue; // stem text already reported
                }
                result.Add(stem);
                seenTerms.Add(stem.Stem);
            }

            return result;
        }
 // Adds every test stop word to a set constructed with 0 as its size argument,
 // then checks the resulting size and that each word can still be found.
 public virtual void TestRehash()
 {
     CharArraySet cas = new CharArraySet(TEST_VERSION_CURRENT, 0, true);

     foreach (string stopWord in TEST_STOP_WORDS)
     {
         cas.Add(stopWord);
     }

     assertEquals(TEST_STOP_WORDS.Length, cas.size());

     foreach (string stopWord in TEST_STOP_WORDS)
     {
         assertTrue(cas.Contains(stopWord));
     }
 }
 // Checks that a value added as a nullable int is found again through an equal
 // nullable int, the string "1" and the char array { '1' }, both on the original
 // set and on its unmodifiable wrapper.
 public virtual void TestObjectContains()
 {
     CharArraySet charSet = new CharArraySet(TEST_VERSION_CURRENT, 10, true);
     int? one = 1;
     charSet.Add(one);

     assertTrue(charSet.Contains(one));
     assertTrue(charSet.Contains((int?)1)); // a second, equal integer
     assertTrue(charSet.Contains("1"));
     assertTrue(charSet.Contains("1".ToCharArray()));

     // the same lookups must succeed on the unmodifiable view
     charSet = CharArraySet.UnmodifiableSet(charSet);

     assertTrue(charSet.Contains(one));
     assertTrue(charSet.Contains((int?)1)); // a second, equal integer
     assertTrue(charSet.Contains("1"));
     assertTrue(charSet.Contains("1".ToCharArray()));
 }
Example #7
0
        // LUCENENET TODO: Add .NET overloads that accept a file name? Or at least a FileInfo object as was done in 3.0.3?

        /// <summary>
        /// Adds each line read from a <see cref="TextReader"/> to a <see cref="CharArraySet"/>, with leading and
        /// trailing whitespace removed. Each line is expected to hold exactly one word. Supply the words in
        /// lowercase when they are used with an <see cref="Analyzer"/> that applies
        /// <see cref="Core.LowerCaseFilter"/> (for example <see cref="Standard.StandardAnalyzer"/>).
        /// The reader is always handed to <c>IOUtils.Dispose</c>, even when reading fails.
        /// </summary>
        /// <param name="reader"> <see cref="TextReader"/> supplying the word list </param>
        /// <param name="result"> the <see cref="CharArraySet"/> that receives the words </param>
        /// <returns> <paramref name="result"/>, after the words have been added </returns>
        public static CharArraySet GetWordSet(TextReader reader, CharArraySet result)
        {
            try
            {
                for (string line = reader.ReadLine(); line != null; line = reader.ReadLine())
                {
                    result.Add(line.Trim());
                }
            }
            finally
            {
                IOUtils.Dispose(reader);
            }

            return result;
        }
Example #8
0
 /// <summary>
 /// Adds each non-comment line read from a <see cref="TextReader"/> to a <see cref="CharArraySet"/>, with leading
 /// and trailing whitespace removed. A line is a comment when the raw (untrimmed) line starts with
 /// <paramref name="comment"/>, compared ordinally. Each line is expected to hold exactly one word. Supply the
 /// words in lowercase when they are used with an <see cref="Analyzer"/> that applies
 /// <see cref="Core.LowerCaseFilter"/> (for example <see cref="Standard.StandardAnalyzer"/>).
 /// The reader is always handed to <c>IOUtils.Dispose</c>, even when reading fails.
 /// </summary>
 /// <param name="reader"> <see cref="TextReader"/> supplying the word list </param>
 /// <param name="comment"> the prefix that marks a line as a comment </param>
 /// <param name="result"> the <see cref="CharArraySet"/> that receives the words </param>
 /// <returns> <paramref name="result"/>, after the words have been added </returns>
 public static CharArraySet GetWordSet(TextReader reader, string comment, CharArraySet result)
 {
     try
     {
         for (string line = reader.ReadLine(); line != null; line = reader.ReadLine())
         {
             if (line.StartsWith(comment, StringComparison.Ordinal))
             {
                 continue; // comment line
             }

             result.Add(line.Trim());
         }
     }
     finally
     {
         IOUtils.Dispose(reader);
     }

     return result;
 }
Example #9
0
        /// <summary>
        /// Stems the given word and removes repeated stems from the result.
        /// </summary>
        /// <param name="word"> Word to find the stems for </param>
        /// <param name="length"> Forwarded unchanged to <c>Stem</c> together with <paramref name="word"/> </param>
        /// <returns>
        /// The list produced by <c>Stem</c> itself when it holds fewer than two entries;
        /// otherwise a new list containing the first occurrence of each stem, in order.
        /// </returns>
        public IList <CharsRef> UniqueStems(char[] word, int length)
        {
            IList <CharsRef> candidates = Stem(word, length);

            // With zero or one stem there is nothing to de-duplicate.
            if (candidates.Count < 2)
            {
                return candidates;
            }

            IList <CharsRef> unique = new List <CharsRef>();
            CharArraySet     seen   = new CharArraySet(LuceneVersion.LUCENE_CURRENT, 8, dictionary.ignoreCase);

            for (int i = 0; i < candidates.Count; i++)
            {
                CharsRef stem = candidates[i];
                if (seen.Contains(stem))
                {
                    continue;
                }

                unique.Add(stem);
                seen.Add(stem);
            }

            return unique;
        }
Example #10
0
        /// <summary>
        /// Builds the shared stop word list from the file "Stopword.txt" located in
        /// <c>LuceneNetConfig.LuceneDictDirectory</c>. Each line contributes one entry;
        /// surrounding whitespace is removed and blank lines are ignored. When the file
        /// does not exist the list stays empty. The final set is wrapped as unmodifiable.
        /// </summary>
        static StopWord()
        {
            CharArraySet charArraySet = new CharArraySet(0, true);
            string       stopWordPath = Path.Combine(LuceneNetConfig.LuceneDictDirectory, "Stopword.txt");

            if (File.Exists(stopWordPath))
            {
                Encoding encoding = EncodingType.GetType(stopWordPath);
                using (StreamReader sr = new StreamReader(stopWordPath, encoding))
                {
                    string line;
                    while ((line = sr.ReadLine()) != null)
                    {
                        // Trim so stray spaces do not produce entries that can never match a token,
                        // and skip blank lines so the empty string is not registered as a stop word.
                        string word = line.Trim();
                        if (word.Length > 0)
                        {
                            charArraySet.Add(word);
                        }
                    }
                }
            }
            // English stop words (StopAnalyzer.ENGLISH_STOP_WORDS_SET) are intentionally not added here:
            // the StandardAnalyzer in use already applies them, so adding them again is unnecessary.
            //charArraySet.AddAll(StopAnalyzer.ENGLISH_STOP_WORDS_SET);
            _StopWordList = CharArraySet.UnmodifiableSet(charArraySet);
        }
        // LUCENENET TODO: Add .NET overloads that accept a file name? Or at least a FileInfo object as was done in 3.0.3?
        /// <summary>
        /// Reads a word list line by line and stores every line, stripped of leading and
        /// trailing whitespace, in a <see cref="CharArraySet"/>. Each line is expected to hold
        /// exactly one word. Supply the words in lowercase when they are used with an
        /// Analyzer that applies LowerCaseFilter (such as StandardAnalyzer).
        /// The reader is always handed to <c>IOUtils.Close</c>, even when reading fails.
        /// </summary>
        /// <param name="reader"> reader supplying the word list </param>
        /// <param name="result"> the <see cref="CharArraySet"/> that receives the words </param>
        /// <returns> <paramref name="result"/>, after the words have been added </returns>
        public static CharArraySet GetWordSet(TextReader reader, CharArraySet result)
        {
            try
            {
                while (true)
                {
                    string line = reader.ReadLine();
                    if (line == null)
                    {
                        break; // end of input
                    }

                    result.Add(line.Trim());
                }
            }
            finally
            {
                IOUtils.Close(reader);
            }

            return result;
        }
        /// <summary>
        /// Copies a case-insensitive and a case-sensitive set with <c>CharArraySet.Copy</c> and checks that
        /// each copy has the size of its source, keeps the source's case handling, and can be extended
        /// without the new terms appearing in the source sets.
        /// </summary>
        public virtual void TestCopyCharArraySet()
        {
            CharArraySet setIgnoreCase = new CharArraySet(TEST_VERSION_CURRENT, 10, true);
            CharArraySet setCaseSensitive = new CharArraySet(TEST_VERSION_CURRENT, 10, false);

            IList<string> stopwords = TEST_STOP_WORDS;
            IList<string> stopwordsUpper = new List<string>();
            foreach (string @string in stopwords)
            {
                // Invariant upper-casing keeps the test independent of the current culture
                // (e.g. the Turkish dotted/dotless i mapping applied by ToUpper()).
                stopwordsUpper.Add(@string.ToUpperInvariant());
            }
            setIgnoreCase.addAll(TEST_STOP_WORDS);
            setIgnoreCase.Add(Convert.ToInt32(1));
            setCaseSensitive.addAll(TEST_STOP_WORDS);
            setCaseSensitive.Add(Convert.ToInt32(1));

            CharArraySet copy = CharArraySet.Copy(TEST_VERSION_CURRENT, setIgnoreCase);
            CharArraySet copyCaseSens = CharArraySet.Copy(TEST_VERSION_CURRENT, setCaseSensitive);

            // Each copy is compared with its own source set.
            assertEquals(setIgnoreCase.size(), copy.size());
            assertEquals(setCaseSensitive.size(), copyCaseSens.size());

            assertTrue(copy.containsAll(stopwords));
            assertTrue(copy.containsAll(stopwordsUpper));
            assertTrue(copyCaseSens.containsAll(stopwords));
            foreach (string @string in stopwordsUpper)
            {
                assertFalse(copyCaseSens.contains(@string));
            }
            // test adding terms to the copy
            IList<string> newWords = new List<string>();
            foreach (string @string in stopwords)
            {
                newWords.Add(@string + "_1");
            }
            copy.addAll(newWords);

            assertTrue(copy.containsAll(stopwords));
            assertTrue(copy.containsAll(stopwordsUpper));
            assertTrue(copy.containsAll(newWords));
            // new added terms are not in the source set
            foreach (string @string in newWords)
            {
                assertFalse(setIgnoreCase.contains(@string));
                assertFalse(setCaseSensitive.contains(@string));
            }
        }
        // Wraps a populated set with CharArraySet.UnmodifiableSet and checks that size and
        // membership are unchanged, then checks that wrapping null is rejected.
        public virtual void TestUnmodifiableSet()
        {
            var words = new CharArraySet(TEST_VERSION_CURRENT, 10, true);
            words.AddAll(TEST_STOP_WORDS);
            words.Add(Convert.ToInt32(1));
            int sizeBefore = words.size();

            words = CharArraySet.UnmodifiableSet(words);

            assertEquals("Set size changed due to unmodifiableSet call", sizeBefore, words.size());
            for (int i = 0; i < TEST_STOP_WORDS.Length; i++)
            {
                assertTrue(words.Contains(TEST_STOP_WORDS[i]));
            }
            assertTrue(words.Contains(Convert.ToInt32(1)));
            assertTrue(words.Contains("1"));
            assertTrue(words.Contains(new char[] { '1' }));

            try
            {
                CharArraySet.UnmodifiableSet(null);
                fail("can not make null unmodifiable");
            }
            catch (System.ArgumentNullException)
            {
                // expected: the .NET port signals this with ArgumentNullException
                // rather than NullReferenceException
            }
        }
Example #14
0
        /// <summary>
        /// Produces the stems of a word with duplicates removed.
        /// </summary>
        /// <param name="word"> Word to find the stems for </param>
        /// <param name="length"> Forwarded unchanged to <c>Stem</c> together with <paramref name="word"/> </param>
        /// <returns>
        /// A new list with the first occurrence of each stem when <c>Stem</c> yields two or
        /// more entries; otherwise the list returned by <c>Stem</c> as is.
        /// </returns>
        public IList<CharsRef> UniqueStems(char[] word, int length)
        {
            IList<CharsRef> all = Stem(word, length);
            if (all.Count >= 2)
            {
                // The version-taking constructor triggers obsolete warnings (612/618), suppressed here.
#pragma warning disable 612, 618
                CharArraySet seen = new CharArraySet(LuceneVersion.LUCENE_CURRENT, 8, dictionary.ignoreCase);
#pragma warning restore 612, 618
                IList<CharsRef> distinct = new List<CharsRef>();
                foreach (CharsRef stem in all)
                {
                    if (seen.Contains(stem))
                    {
                        continue;
                    }
                    distinct.Add(stem);
                    seen.Add(stem);
                }
                return distinct;
            }

            // Zero or one stem: nothing to de-duplicate.
            return all;
        }
 /// <summary>
 /// Reads stopwords from a stopword list in Snowball format.
 /// <para>
 /// The snowball format is the following:
 /// <list type="bullet">
 ///     <item><description>Lines may contain multiple words separated by whitespace.</description></item>
 ///     <item><description>The comment character is the vertical line (&#124;).</description></item>
 ///     <item><description>Lines may contain trailing comments.</description></item>
 /// </list>
 /// </para>
 /// The reader is always handed to <c>IOUtils.Close</c>, even when reading fails.
 /// </summary>
 /// <param name="reader"> Reader containing a Snowball stopword list </param>
 /// <param name="result"> the <see cref="CharArraySet"/> to fill with the reader's words </param>
 /// <returns> the given <see cref="CharArraySet"/> with the reader's words </returns>
 public static CharArraySet GetSnowballWordSet(TextReader reader, CharArraySet result)
 {
     try
     {
         string line;
         while ((line = reader.ReadLine()) != null)
         {
             // Everything from the first '|' to the end of the line is a comment.
             int comment = line.IndexOf('|');
             if (comment >= 0)
             {
                 line = line.Substring(0, comment);
             }

             // A null separator array makes Split treat every white-space character
             // (tabs included, not only ' ') as a delimiter; empty entries are dropped,
             // so no further trimming or length check is needed.
             string[] words = line.Split((char[])null, StringSplitOptions.RemoveEmptyEntries);
             foreach (string word in words)
             {
                 result.Add(word);
             }
         }
     }
     finally
     {
         IOUtils.Close(reader);
     }
     return result;
 }
        /// <summary>
        /// Wraps a populated set with <c>CharArraySet.UnmodifiableSet</c> and verifies that every
        /// mutating call attempted on the wrapper throws <see cref="System.NotSupportedException"/>
        /// and leaves the content unchanged.
        /// </summary>
        public virtual void TestModifyOnUnmodifiable()
        {
            CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, 10, true);
            set.AddAll(TEST_STOP_WORDS);
            int size = set.size();
            set = CharArraySet.UnmodifiableSet(set);
            assertEquals("Set size changed due to unmodifiableSet call", size, set.size());
            string NOT_IN_SET = "SirGallahad";
            assertFalse("Test String already exists in set", set.Contains(NOT_IN_SET));

            // NOTE(review): the lower-case members used below (add, contains, size, clear, retainAll, addAll)
            // are presumably Java-style helper extensions supplied by the test fixture - confirm.

            // adding a char[] must be rejected
            try
            {
                set.Add(NOT_IN_SET.ToCharArray());
                fail("Modified unmodifiable set");
            }
            catch (System.NotSupportedException)
            {
                // expected
                assertFalse("Test String has been added to unmodifiable set", set.contains(NOT_IN_SET));
                assertEquals("Size of unmodifiable set has changed", size, set.size());
            }

            // adding a string must be rejected
            try
            {
                set.add(NOT_IN_SET);
                fail("Modified unmodifiable set");
            }
            catch (System.NotSupportedException)
            {
                // expected
                assertFalse("Test String has been added to unmodifiable set", set.contains(NOT_IN_SET));
                assertEquals("Size of unmodifiable set has changed", size, set.size());
            }

            // adding a StringBuilder must be rejected
            try
            {
                set.Add(new StringBuilder(NOT_IN_SET));
                fail("Modified unmodifiable set");
            }
            catch (System.NotSupportedException)
            {
                // expected
                assertFalse("Test String has been added to unmodifiable set", set.contains(NOT_IN_SET));
                assertEquals("Size of unmodifiable set has changed", size, set.size());
            }

            // clearing must be rejected
            try
            {
                set.clear();
                fail("Modified unmodifiable set");
            }
            catch (System.NotSupportedException)
            {
                // expected
                assertFalse("Changed unmodifiable set", set.contains(NOT_IN_SET));
                assertEquals("Size of unmodifiable set has changed", size, set.size());
            }
            // NOTE(review): this block repeats the set.add(NOT_IN_SET) check made earlier in this method.
            try
            {
                set.add(NOT_IN_SET);
                fail("Modified unmodifiable set");
            }
            catch (System.NotSupportedException)
            {
                // expected
                assertFalse("Test String has been added to unmodifiable set", set.contains(NOT_IN_SET));
                assertEquals("Size of unmodifiable set has changed", size, set.size());
            }

            // NOTE: This results in a StackOverflow exception. Since this is not a public member of CharArraySet,
            // but an extension method for the test fixture (which apparently has a bug), this test is non-critical
            //// This test was changed in 3.1, as a contains() call on the given Collection using the "correct" iterator's
            //// current key (now a char[]) on a Set<String> would not hit any element of the CAS and therefore never call
            //// remove() on the iterator
            //try
            //{
            //    set.removeAll(new CharArraySet(TEST_VERSION_CURRENT, TEST_STOP_WORDS, true));
            //    fail("Modified unmodifiable set");
            //}
            //catch (System.NotSupportedException)
            //{
            //    // expected
            //    assertEquals("Size of unmodifiable set has changed", size, set.size());
            //}

            #region Added for better .NET support
            // This test was added for .NET to check the Remove method, since the extension method
            // above fails to execute.
            try
            {
                // warnings 612/618 (obsolete member usage) are suppressed around the Remove call
#pragma warning disable 612, 618
                set.Remove(TEST_STOP_WORDS[0]);
#pragma warning restore 612, 618
                fail("Modified unmodifiable set");
            }
            catch (System.NotSupportedException)
            {
                // expected
                assertEquals("Size of unmodifiable set has changed", size, set.size());
            }
            #endregion

            // retaining only elements of another set must be rejected
            try
            {
                set.retainAll(new CharArraySet(TEST_VERSION_CURRENT, new [] { NOT_IN_SET }, true));
                fail("Modified unmodifiable set");
            }
            catch (System.NotSupportedException)
            {
                // expected
                assertEquals("Size of unmodifiable set has changed", size, set.size());
            }

            // bulk add must be rejected
            try
            {
                set.addAll(new[] { NOT_IN_SET});
                fail("Modified unmodifiable set");
            }
            catch (System.NotSupportedException)
            {
                // expected
                assertFalse("Test String has been added to unmodifiable set", set.contains(NOT_IN_SET));
            }

            // LUCENENET Specific - added to test .NETified UnionWith method
            try
            {
                set.UnionWith(new[] { NOT_IN_SET });
                fail("Modified unmodifiable set");
            }
            catch (System.NotSupportedException)
            {
                // expected
                assertFalse("Test String has been added to unmodifiable set", set.contains(NOT_IN_SET));
            }

            // after all rejected modifications every original stop word must still be present
            for (int i = 0; i < TEST_STOP_WORDS.Length; i++)
            {
                assertTrue(set.contains(TEST_STOP_WORDS[i]));
            }
        }
Example #17
0
		/// <summary>
		/// Builds a stop word set from a list of arbitrary objects. A <c>char[]</c> element contributes
		/// the string formed from its characters; any other element contributes its <c>ToString()</c> value.
		/// </summary>
		/// <param name="stopWords">A list of strings, char[] or any other ToString()-able objects representing the stopwords</param>
		/// <param name="ignoreCase">Passed to the <see cref="CharArraySet"/> constructor; if true, all words are lower cased first</param>
		/// <returns>A set (<see cref="CharArraySet"/>) containing the words</returns>
		public static ISet<string> MakeStopSet(IList<object> stopWords, bool ignoreCase)
		{
			var stopSet = new CharArraySet(stopWords.Count, ignoreCase);
			foreach (var word in stopWords)
			{
				// char[].ToString() returns the type name ("System.Char[]"), not the text,
				// so character arrays are converted to a string explicitly.
				var chars = word as char[];
				stopSet.Add(chars != null ? new string(chars) : word.ToString());
			}
			return stopSet;
		}
 /// <summary>
 /// Reads a word list line by line and stores every non-comment line, stripped of leading and
 /// trailing whitespace, in a <see cref="CharArraySet"/>. A line is skipped when the raw
 /// (untrimmed) line begins with <paramref name="comment"/>, compared ordinally. Each line is
 /// expected to hold exactly one word. Supply the words in lowercase when they are used with an
 /// Analyzer that applies LowerCaseFilter (such as StandardAnalyzer).
 /// The reader is always handed to <c>IOUtils.Close</c>, even when reading fails.
 /// </summary>
 /// <param name="reader"> reader supplying the word list </param>
 /// <param name="comment"> the prefix that marks a line as a comment </param>
 /// <param name="result"> the <see cref="CharArraySet"/> that receives the words </param>
 /// <returns> <paramref name="result"/>, after the words have been added </returns>
 public static CharArraySet GetWordSet(TextReader reader, string comment, CharArraySet result)
 {
     try
     {
         while (true)
         {
             string line = reader.ReadLine();
             if (line == null)
             {
                 break; // end of input
             }

             bool isComment = line.StartsWith(comment, StringComparison.Ordinal);
             if (!isComment)
             {
                 result.Add(line.Trim());
             }
         }
     }
     finally
     {
         IOUtils.Close(reader);
     }

     return result;
 }