Beispiel #1
0
        /// <summary>
        /// Builds a <see cref="CharArraySet"/> of stopwords from a string array.
        /// </summary>
        /// <param name="matchVersion"> Lucene version to enable correct Unicode 4.0 behavior in the returned set if Version > 3.0 </param>
        /// <param name="stopWords"> The stopwords to place in the set </param>
        /// <param name="ignoreCase"> If true, all words are lower cased first. </param>
        /// <returns> A new set to which every entry of <paramref name="stopWords"/> has been added </returns>
        public static CharArraySet MakeStopSet(LuceneVersion matchVersion, string[] stopWords, bool ignoreCase)
        {
            // The array length is handed to the constructor as the set's size argument.
            var words = new CharArraySet(matchVersion, stopWords.Length, ignoreCase);
            words.UnionWith(stopWords);
            return words;
        }
        internal readonly bool forceFirstLetter; // make sure the first letter is capital even if it is in the keep list

        /// <summary>
        /// Creates a new CapitalizationFilterFactory from the supplied configuration arguments.
        /// </summary>
        /// <param name="args"> Factory configuration. After all known options have been read, the map must be empty. </param>
        /// <exception cref="System.ArgumentException"> If entries remain in <paramref name="args"/> after all known options have been read. </exception>
        public CapitalizationFilterFactory(IDictionary<string, string> args)
            : base(args)
        {
            AssureMatchVersion();

            // Optional "keep" word set; it stays unassigned when the option is absent.
            bool ignoreCase = GetBoolean(args, KEEP_IGNORE_CASE, false);
            IEnumerable<string> k = GetSet(args, KEEP);
            if (k != null)
            {
                keep = new CharArraySet(luceneMatchVersion, 10, ignoreCase);
                keep.UnionWith(k);
            }

            // Optional list of prefixes, stored as char arrays.
            k = GetSet(args, OK_PREFIX);
            if (k != null)
            {
                okPrefix = new List<char[]>();
                foreach (string item in k)
                {
                    okPrefix.Add(item.ToCharArray());
                }
            }

            minWordLength = GetInt(args, MIN_WORD_LENGTH, 0);
            maxWordCount = GetInt(args, MAX_WORD_COUNT, CapitalizationFilter.DEFAULT_MAX_WORD_COUNT);
            maxTokenLength = GetInt(args, MAX_TOKEN_LENGTH, CapitalizationFilter.DEFAULT_MAX_TOKEN_LENGTH);
            onlyFirstWord = GetBoolean(args, ONLY_FIRST_WORD, true);
            forceFirstLetter = GetBoolean(args, FORCE_FIRST_LETTER, true);

            if (args.Count > 0)
            {
                // Concatenating the dictionary itself would only print its type name in .NET,
                // so spell out the leftover entries to make the message actionable.
                var leftovers = new List<string>(args.Count);
                foreach (KeyValuePair<string, string> entry in args)
                {
                    leftovers.Add(entry.Key + "=" + entry.Value);
                }
                throw new System.ArgumentException("Unknown parameters: {" + string.Join(", ", leftovers.ToArray()) + "}");
            }
        }
Beispiel #3
0
        /// <summary>
        /// Creates a stopword set from the given sequence of stopwords.
        /// </summary>
        /// <typeparam name="T1"> Element type of the stopword sequence </typeparam>
        /// <param name="matchVersion"> Lucene version to enable correct Unicode 4.0 behavior in the returned set if Version > 3.0 </param>
        /// <param name="stopWords"> A sequence of strings or char[] or any other ToString()-able items representing the stopwords </param>
        /// <param name="ignoreCase"> if true, all words are lower cased first </param>
        /// <returns> A Set (<see cref="CharArraySet"/>) containing the words </returns>
        public static CharArraySet MakeStopSet <T1>(LuceneVersion matchVersion, IEnumerable <T1> stopWords, bool ignoreCase)
        {
            // Enumerate the input at most once: counting a lazy sequence and then adding from it
            // would run the underlying query twice (and could even yield different items each time).
            ICollection<T1> materialized = stopWords as ICollection<T1> ?? stopWords.ToList();
            var stopSet = new CharArraySet(matchVersion, materialized.Count, ignoreCase);

            // Pass the words typed as IEnumerable<T1> so the same UnionWith overload is chosen as before.
            IEnumerable<T1> words = materialized;
            stopSet.UnionWith(words);
            return stopSet;
        }
        private readonly CultureInfo culture;    // LUCENENET specific

        /// <summary>
        /// Creates a new <see cref="CapitalizationFilterFactory"/> from the supplied configuration arguments.
        /// </summary>
        /// <param name="args"> Factory configuration. After all known options have been read, the map must be empty. </param>
        /// <exception cref="ArgumentException"> If entries remain in <paramref name="args"/> after all known options have been read. </exception>
        public CapitalizationFilterFactory(IDictionary <string, string> args)
            : base(args)
        {
            AssureMatchVersion();

            // Optional "keep" word set; it stays unassigned when the option is absent.
            bool ignoreCase = GetBoolean(args, KEEP_IGNORE_CASE, false);
            ICollection <string> k = GetSet(args, KEEP);
            if (k != null)
            {
                keep = new CharArraySet(m_luceneMatchVersion, 10, ignoreCase);
                keep.UnionWith(k);
            }

            // Optional list of prefixes, stored as char arrays.
            k = GetSet(args, OK_PREFIX);
            if (k != null)
            {
                okPrefix = new List <char[]>();
                foreach (string item in k)
                {
                    okPrefix.Add(item.ToCharArray());
                }
            }

            minWordLength    = GetInt32(args, MIN_WORD_LENGTH, 0);
            maxWordCount     = GetInt32(args, MAX_WORD_COUNT, CapitalizationFilter.DEFAULT_MAX_WORD_COUNT);
            maxTokenLength   = GetInt32(args, MAX_TOKEN_LENGTH, CapitalizationFilter.DEFAULT_MAX_TOKEN_LENGTH);
            onlyFirstWord    = GetBoolean(args, ONLY_FIRST_WORD, true);
            forceFirstLetter = GetBoolean(args, FORCE_FIRST_LETTER, true);
            culture          = GetCulture(args, CULTURE, null);

            if (args.Count > 0)
            {
                // Concatenating the dictionary itself would only print its type name in .NET,
                // so spell out the leftover entries to make the message actionable.
                var leftovers = new List<string>(args.Count);
                foreach (KeyValuePair<string, string> entry in args)
                {
                    leftovers.Add(entry.Key + "=" + entry.Value);
                }
                throw new ArgumentException("Unknown parameters: {" + string.Join(", ", leftovers.ToArray()) + "}");
            }
        }
Beispiel #5
0
        /// <summary>
        /// Loads the words from one or more files into a single <see cref="CharArraySet"/>.
        /// </summary>
        /// <param name="loader"> Resource loader used to read each file </param>
        /// <param name="wordFiles"> File name, or a comma-separated list of file names </param>
        /// <param name="ignoreCase"> Passed to the <see cref="CharArraySet"/> constructor and to <c>StopFilter.MakeStopSet</c> </param>
        /// <returns> The combined word set, or <c>null</c> when <paramref name="wordFiles"/> yields no file names </returns>
        protected CharArraySet GetWordSet(IResourceLoader loader, string wordFiles, bool ignoreCase)
        {
            AssureMatchVersion();

            IList <string> fileNames = SplitFileNames(wordFiles);
            if (fileNames.Count <= 0)
            {
                // Nothing to load: callers get null rather than an empty set.
                return null;
            }

            // default stopwords list has 35 or so words, but maybe don't make it that
            // big to start
            CharArraySet combined = new CharArraySet(m_luceneMatchVersion, fileNames.Count * 10, ignoreCase);
            foreach (string fileName in fileNames)
            {
                var lines = GetLines(loader, fileName.Trim());
                var fileWords = StopFilter.MakeStopSet(m_luceneMatchVersion, lines, ignoreCase);
                combined.UnionWith(fileWords);
            }

            return combined;
        }
 /// <summary>
 /// Returns a <see cref="CharArraySet"/> built from wordFiles, which
 /// can be a comma-separated list of filenames
 /// </summary>
 /// <param name="loader"> Resource loader used to read each file </param>
 /// <param name="wordFiles"> File name, or a comma-separated list of file names </param>
 /// <param name="ignoreCase"> Passed to the <see cref="CharArraySet"/> constructor and to <c>StopFilter.MakeStopSet</c> </param>
 /// <returns> The combined word set, or <c>null</c> when <paramref name="wordFiles"/> yields no file names </returns>
 protected internal CharArraySet GetWordSet(IResourceLoader loader, string wordFiles, bool ignoreCase)
 {
     AssureMatchVersion();

     // Materialize the file names once: the previous code called Count() twice and then
     // iterated, which enumerates a lazily produced sequence three times.
     IEnumerable<string> split = SplitFileNames(wordFiles);
     ICollection<string> files = split as ICollection<string> ?? split.ToList();
     if (files.Count == 0)
     {
         return null;
     }

     // default stopwords list has 35 or so words, but maybe don't make it that
     // big to start
     CharArraySet words = new CharArraySet(luceneMatchVersion, files.Count * 10, ignoreCase);
     foreach (string file in files)
     {
         var wlist = GetLines(loader, file.Trim());
         words.UnionWith(StopFilter.MakeStopSet(luceneMatchVersion, wlist, ignoreCase));
     }
     return words;
 }
        /// <summary>
        /// Checks <c>UnionWith</c> for a list of <c>ICharSequence</c> items: it must return false and leave
        /// the set untouched when every item is already present, and return true and grow the set
        /// when at least one item is new.
        /// </summary>
        public virtual void TestUnionWithCharSequence()
        {
            var seedWords = new string[] { "sally", "sells", "seashells", "by", "the", "sea", "shore" };
            var expectedAfterMixedUnion = new string[] { "sally", "sells", "seashells", "by", "the", "sea", "shore",
                "true", "set", "of", "unique", "values", "except"};
            CharArraySet words = new CharArraySet(TEST_VERSION_CURRENT, seedWords, false);

            // Three sequences that are all part of the seed data.
            var alreadyPresent = new List<ICharSequence>
            {
                new StringCharSequenceWrapper("seashells"),
                new StringCharSequenceWrapper("sea"),
                new StringCharSequenceWrapper("shore")
            };

            // Six new sequences plus one ("sells") that is part of the seed data.
            var partlyNew = new List<ICharSequence>
            {
                new StringCharSequenceWrapper("true"),
                new StringCharSequenceWrapper("set"),
                new StringCharSequenceWrapper("of"),
                new StringCharSequenceWrapper("unique"),
                new StringCharSequenceWrapper("values"),
                new StringCharSequenceWrapper("except"),
                new StringCharSequenceWrapper("sells")
            };

            // Union with values that already exist: nothing changes.
            assertFalse(words.UnionWith(alreadyPresent));
            assertEquals(7, words.Count);
            CollectionAssert.AreEquivalent(seedWords, words);

            // Union with a mix of existing and new values: only the six new ones are added.
            assertTrue(words.UnionWith(partlyNew));
            assertEquals(13, words.Count);
            CollectionAssert.AreEquivalent(expectedAfterMixedUnion, words);
        }
        /// <summary>
        /// Checks <c>UnionWith</c> for collections of <c>object</c>: existing strings are not re-added,
        /// new strings are added, and a mix of primitive, char[] and char-sequence values is added
        /// and can be found again through <c>Contains</c>.
        /// </summary>
        public virtual void TestUnionWithObject()
        {
            var seedWords = new string[] { "sally", "sells", "seashells", "by", "the", "sea", "shore" };
            var expectedAfterMixedUnion = new string[] { "sally", "sells", "seashells", "by", "the", "sea", "shore",
                "true", "set", "of", "unique", "values", "except"};
            CharArraySet words = new CharArraySet(TEST_VERSION_CURRENT, seedWords, false);

            var alreadyPresent = new List<object> { "seashells", "sea", "shore" };
            var partlyNew = new List<object> { "true", "set", "of", "unique", "values", "except", "sells" };

            // Thirteen values of assorted runtime types, none of which is in the seed data.
            var assortedNewValues = new object[] { true, (byte)55, (short)44, (int)33, (sbyte)22, (long)11, (char)'\n', "hurray", (uint)99, (ulong)89, (ushort)79, new char[] { 't', 'w', 'o' }, new StringCharSequenceWrapper("testing") };

            // Union with values that already exist: nothing changes.
            assertFalse(words.UnionWith(alreadyPresent));
            assertEquals(7, words.Count);
            CollectionAssert.AreEquivalent(seedWords, words);

            // Union with a mix of existing and new values: only the six new ones are added.
            assertTrue(words.UnionWith(partlyNew));
            assertEquals(13, words.Count);
            CollectionAssert.AreEquivalent(expectedAfterMixedUnion, words);

            // Start over from an empty set and re-add the seed data through the object overload.
            words.Clear();
            assertEquals(0, words.Count);
            assertTrue(words.UnionWith(seedWords.Cast<object>())); // Need to cast here because the .NET return type is void for UnionWith.
            CollectionAssert.AreEquivalent(seedWords, words);

            // Add mixed types as object: 7 seed words + 13 assorted values = 20 entries.
            assertTrue(words.UnionWith(assortedNewValues));
            assertEquals(20, words.Count);

            // Each value must be retrievable through the Contains overload matching its type.
            assertTrue(words.Contains(true));
            assertTrue(words.Contains((byte)55));
            assertTrue(words.Contains((short)44));
            assertTrue(words.Contains((int)33));
            assertTrue(words.Contains((sbyte)22));
            assertTrue(words.Contains((long)11));
            assertTrue(words.Contains((char)'\n'));
            assertTrue(words.Contains("hurray"));
            assertTrue(words.Contains((uint)99));
            assertTrue(words.Contains((ulong)89));
            assertTrue(words.Contains((ushort)79));
            assertTrue(words.Contains(new char[] { 't', 'w', 'o' }));
            assertTrue(words.Contains(new StringCharSequenceWrapper("testing")));
        }
        /// <summary>
        /// Verifies that the set returned by <c>CharArraySet.UnmodifiableSet</c> rejects every mutation
        /// attempted below with <see cref="System.NotSupportedException"/> and that neither its size
        /// nor its contents change as a result.
        /// </summary>
        public virtual void TestModifyOnUnmodifiable()
        {
            // Build a populated set, remember its size, then replace it with its unmodifiable view.
            CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, 10, true);
            set.AddAll(TEST_STOP_WORDS);
            int size = set.size();
            set = CharArraySet.UnmodifiableSet(set);
            assertEquals("Set size changed due to unmodifiableSet call", size, set.size());
            string NOT_IN_SET = "SirGallahad";
            assertFalse("Test String already exists in set", set.Contains(NOT_IN_SET));

            // Each probe below follows the same shape: attempt one mutation, fail the test if it
            // succeeds, and in the catch block confirm the set is unchanged.

            // Add as char[]
            try
            {
                set.Add(NOT_IN_SET.ToCharArray());
                fail("Modified unmodifiable set");
            }
            catch (System.NotSupportedException)
            {
                // expected
                assertFalse("Test String has been added to unmodifiable set", set.contains(NOT_IN_SET));
                assertEquals("Size of unmodifiable set has changed", size, set.size());
            }

            // Add as string
            try
            {
                set.add(NOT_IN_SET);
                fail("Modified unmodifiable set");
            }
            catch (System.NotSupportedException)
            {
                // expected
                assertFalse("Test String has been added to unmodifiable set", set.contains(NOT_IN_SET));
                assertEquals("Size of unmodifiable set has changed", size, set.size());
            }

            // Add as StringBuilder
            try
            {
                set.Add(new StringBuilder(NOT_IN_SET));
                fail("Modified unmodifiable set");
            }
            catch (System.NotSupportedException)
            {
                // expected
                assertFalse("Test String has been added to unmodifiable set", set.contains(NOT_IN_SET));
                assertEquals("Size of unmodifiable set has changed", size, set.size());
            }

            // Clear
            try
            {
                set.clear();
                fail("Modified unmodifiable set");
            }
            catch (System.NotSupportedException)
            {
                // expected
                assertFalse("Changed unmodifiable set", set.contains(NOT_IN_SET));
                assertEquals("Size of unmodifiable set has changed", size, set.size());
            }
            // NOTE(review): this probe is textually identical to the "add as string" probe above;
            // it may have been meant to exercise a different overload (e.g. an object-typed add) - confirm.
            try
            {
                set.add(NOT_IN_SET);
                fail("Modified unmodifiable set");
            }
            catch (System.NotSupportedException)
            {
                // expected
                assertFalse("Test String has been added to unmodifiable set", set.contains(NOT_IN_SET));
                assertEquals("Size of unmodifiable set has changed", size, set.size());
            }

            // NOTE: This results in a StackOverflow exception. Since this is not a public member of CharArraySet,
            // but an extension method for the test fixture (which apparently has a bug), this test is non-critical
            //// This test was changed in 3.1, as a contains() call on the given Collection using the "correct" iterator's
            //// current key (now a char[]) on a Set<String> would not hit any element of the CAS and therefor never call
            //// remove() on the iterator
            //try
            //{
            //    set.removeAll(new CharArraySet(TEST_VERSION_CURRENT, TEST_STOP_WORDS, true));
            //    fail("Modified unmodifiable set");
            //}
            //catch (System.NotSupportedException)
            //{
            //    // expected
            //    assertEquals("Size of unmodifiable set has changed", size, set.size());
            //}

            #region Added for better .NET support
            // This test was added for .NET to check the Remove method, since the extension method
            // above fails to execute.
            try
            {
                // Warnings 612/618 (use of an obsolete member) are suppressed around this call.
#pragma warning disable 612, 618
                set.Remove(TEST_STOP_WORDS[0]);
#pragma warning restore 612, 618
                fail("Modified unmodifiable set");
            }
            catch (System.NotSupportedException)
            {
                // expected
                assertEquals("Size of unmodifiable set has changed", size, set.size());
            }
            #endregion

            // Retain only a word that is not in the set
            try
            {
                set.retainAll(new CharArraySet(TEST_VERSION_CURRENT, new [] { NOT_IN_SET }, true));
                fail("Modified unmodifiable set");
            }
            catch (System.NotSupportedException)
            {
                // expected
                assertEquals("Size of unmodifiable set has changed", size, set.size());
            }

            // Bulk add through the fixture's addAll
            try
            {
                set.addAll(new[] { NOT_IN_SET});
                fail("Modified unmodifiable set");
            }
            catch (System.NotSupportedException)
            {
                // expected
                assertFalse("Test String has been added to unmodifiable set", set.contains(NOT_IN_SET));
            }

            // LUCENENET Specific - added to test .NETified UnionWith method
            try
            {
                set.UnionWith(new[] { NOT_IN_SET });
                fail("Modified unmodifiable set");
            }
            catch (System.NotSupportedException)
            {
                // expected
                assertFalse("Test String has been added to unmodifiable set", set.contains(NOT_IN_SET));
            }

            // After all rejected mutations, every original stop word must still be present.
            for (int i = 0; i < TEST_STOP_WORDS.Length; i++)
            {
                assertTrue(set.contains(TEST_STOP_WORDS[i]));
            }
        }