/// <summary>
/// Builds a stopword set out of an array of stopwords.
/// </summary>
/// <param name="matchVersion"> Lucene version to enable correct Unicode 4.0 behavior in the returned set if Version > 3.0 </param>
/// <param name="stopWords"> The stopwords to put into the set; its length is used as the set's initial size </param>
/// <param name="ignoreCase"> If true, all words are lower cased first. </param>
/// <returns> A new <see cref="CharArraySet"/> holding the given words </returns>
public static CharArraySet MakeStopSet(LuceneVersion matchVersion, string[] stopWords, bool ignoreCase)
{
    // Size the set from the array length up front, then add every word.
    var result = new CharArraySet(matchVersion, stopWords.Length, ignoreCase);
    result.UnionWith(stopWords);
    return result;
}
internal readonly bool forceFirstLetter; // make sure the first letter is capital even if it is in the keep list

/// <summary>
/// Creates a new CapitalizationFilterFactory, reading its configuration from <paramref name="args"/>.
/// </summary>
/// <param name="args"> Factory arguments. The keys read here are KEEP_IGNORE_CASE, KEEP, OK_PREFIX,
/// MIN_WORD_LENGTH, MAX_WORD_COUNT, MAX_TOKEN_LENGTH, ONLY_FIRST_WORD and FORCE_FIRST_LETTER. </param>
/// <exception cref="System.ArgumentException"> If <paramref name="args"/> still contains entries
/// after all of the above have been read. </exception>
public CapitalizationFilterFactory(IDictionary<string, string> args)
    : base(args)
{
    AssureMatchVersion();

    bool ignoreCase = GetBoolean(args, KEEP_IGNORE_CASE, false);

    // Optional keep list; the field is left untouched when the argument is absent.
    IEnumerable<string> k = GetSet(args, KEEP);
    if (k != null)
    {
        keep = new CharArraySet(luceneMatchVersion, 10, ignoreCase);
        keep.UnionWith(k);
    }

    // Optional list of prefixes, stored as char arrays.
    k = GetSet(args, OK_PREFIX);
    if (k != null)
    {
        okPrefix = new List<char[]>();
        foreach (string item in k)
        {
            okPrefix.Add(item.ToCharArray());
        }
    }

    minWordLength = GetInt(args, MIN_WORD_LENGTH, 0);
    maxWordCount = GetInt(args, MAX_WORD_COUNT, CapitalizationFilter.DEFAULT_MAX_WORD_COUNT);
    maxTokenLength = GetInt(args, MAX_TOKEN_LENGTH, CapitalizationFilter.DEFAULT_MAX_TOKEN_LENGTH);
    onlyFirstWord = GetBoolean(args, ONLY_FIRST_WORD, true);
    forceFirstLetter = GetBoolean(args, FORCE_FIRST_LETTER, true);

    // NOTE(review): presumably the Get* helpers remove the entries they read, so anything
    // left in args at this point is an unrecognized parameter - confirm against the base class.
    if (args.Count > 0)
    {
        // Concatenating the dictionary itself would only print its type name in .NET,
        // so spell out the leftover entries to make the error actionable.
        var unknown = new List<string>(args.Count);
        foreach (KeyValuePair<string, string> entry in args)
        {
            unknown.Add(entry.Key + "=" + entry.Value);
        }
        throw new System.ArgumentException("Unknown parameters: {" + string.Join(", ", unknown) + "}");
    }
}
/// <summary>
/// Creates a stopword set from the given stopword list.
/// </summary>
/// <typeparam name="T1"> Element type of the stopword sequence </typeparam>
/// <param name="matchVersion"> Lucene version to enable correct Unicode 4.0 behavior in the returned set if Version > 3.0 </param>
/// <param name="stopWords"> A List of Strings or char[] or any other ToString()-able list representing the stopwords </param>
/// <param name="ignoreCase"> if true, all words are lower cased first </param>
/// <returns> A Set (<see cref="CharArraySet"/>) containing the words </returns>
/// <exception cref="System.ArgumentNullException"> If <paramref name="stopWords"/> is null </exception>
public static CharArraySet MakeStopSet<T1>(LuceneVersion matchVersion, IEnumerable<T1> stopWords, bool ignoreCase)
{
    // Walk the sequence at most once: counting a lazy sequence and then adding its
    // elements would evaluate it twice (e.g. re-reading a file-backed source).
    ICollection<T1> materialized = stopWords as ICollection<T1> ?? new List<T1>(stopWords);

    var stopSet = new CharArraySet(matchVersion, materialized.Count, ignoreCase);

    // Keep the static type IEnumerable<T1> so the same UnionWith overload is chosen as before.
    IEnumerable<T1> words = materialized;
    stopSet.UnionWith(words);
    return stopSet;
}
private readonly CultureInfo culture; // LUCENENET specific

/// <summary>
/// Creates a new <see cref="CapitalizationFilterFactory"/>, reading its configuration from <paramref name="args"/>.
/// </summary>
/// <param name="args"> Factory arguments. The keys read here are KEEP_IGNORE_CASE, KEEP, OK_PREFIX,
/// MIN_WORD_LENGTH, MAX_WORD_COUNT, MAX_TOKEN_LENGTH, ONLY_FIRST_WORD, FORCE_FIRST_LETTER and CULTURE. </param>
/// <exception cref="ArgumentException"> If <paramref name="args"/> still contains entries
/// after all of the above have been read. </exception>
public CapitalizationFilterFactory(IDictionary<string, string> args)
    : base(args)
{
    AssureMatchVersion();

    bool ignoreCase = GetBoolean(args, KEEP_IGNORE_CASE, false);

    // Optional keep list; the field is left untouched when the argument is absent.
    ICollection<string> k = GetSet(args, KEEP);
    if (k != null)
    {
        keep = new CharArraySet(m_luceneMatchVersion, 10, ignoreCase);
        keep.UnionWith(k);
    }

    // Optional list of prefixes, stored as char arrays.
    k = GetSet(args, OK_PREFIX);
    if (k != null)
    {
        okPrefix = new List<char[]>();
        foreach (string item in k)
        {
            okPrefix.Add(item.ToCharArray());
        }
    }

    minWordLength = GetInt32(args, MIN_WORD_LENGTH, 0);
    maxWordCount = GetInt32(args, MAX_WORD_COUNT, CapitalizationFilter.DEFAULT_MAX_WORD_COUNT);
    maxTokenLength = GetInt32(args, MAX_TOKEN_LENGTH, CapitalizationFilter.DEFAULT_MAX_TOKEN_LENGTH);
    onlyFirstWord = GetBoolean(args, ONLY_FIRST_WORD, true);
    forceFirstLetter = GetBoolean(args, FORCE_FIRST_LETTER, true);
    culture = GetCulture(args, CULTURE, null);

    // NOTE(review): presumably the Get* helpers remove the entries they read, so anything
    // left in args at this point is an unrecognized parameter - confirm against the base class.
    if (args.Count > 0)
    {
        // Concatenating the dictionary itself would only print its type name in .NET,
        // so spell out the leftover entries to make the error actionable.
        var unknown = new List<string>(args.Count);
        foreach (KeyValuePair<string, string> entry in args)
        {
            unknown.Add(entry.Key + "=" + entry.Value);
        }
        throw new ArgumentException("Unknown parameters: {" + string.Join(", ", unknown) + "}");
    }
}
/// <summary>
/// Builds a single <see cref="CharArraySet"/> from the lines of every file named in
/// <paramref name="wordFiles"/>, which can be a comma-separated list of filenames.
/// </summary>
/// <param name="loader"> Resource loader handed to <c>GetLines</c> for each file </param>
/// <param name="wordFiles"> File name list, split with <c>SplitFileNames</c> </param>
/// <param name="ignoreCase"> Passed to the created sets </param>
/// <returns> The combined set, or <c>null</c> when the split yields no file names </returns>
protected CharArraySet GetWordSet(IResourceLoader loader, string wordFiles, bool ignoreCase)
{
    AssureMatchVersion();

    IList<string> fileNames = SplitFileNames(wordFiles);
    if (fileNames.Count <= 0)
    {
        return null;
    }

    // default stopwords list has 35 or so words, but maybe don't make it that
    // big to start
    CharArraySet combined = new CharArraySet(m_luceneMatchVersion, fileNames.Count * 10, ignoreCase);
    foreach (string fileName in fileNames)
    {
        var lines = GetLines(loader, fileName.Trim());
        CharArraySet perFile = StopFilter.MakeStopSet(m_luceneMatchVersion, lines, ignoreCase);
        combined.UnionWith(perFile);
    }

    return combined;
}
/// <summary>
/// Returns a <see cref="CharArraySet"/> built from wordFiles, which
/// can be a comma-separated list of filenames.
/// </summary>
/// <param name="loader"> Resource loader handed to <c>GetLines</c> for each file </param>
/// <param name="wordFiles"> File name list, split with <c>SplitFileNames</c> </param>
/// <param name="ignoreCase"> Passed to the created sets </param>
/// <returns> The combined set, or <c>null</c> when the split yields no file names </returns>
protected internal CharArraySet GetWordSet(IResourceLoader loader, string wordFiles, bool ignoreCase)
{
    AssureMatchVersion();

    // Materialize the file list once: the original counted the sequence twice and then
    // iterated it, which evaluates a lazy sequence three times.
    IEnumerable<string> split = SplitFileNames(wordFiles);
    ICollection<string> files = split as ICollection<string> ?? new List<string>(split);

    CharArraySet words = null;
    if (files.Count > 0)
    {
        // default stopwords list has 35 or so words, but maybe don't make it that
        // big to start
        words = new CharArraySet(luceneMatchVersion, files.Count * 10, ignoreCase);
        foreach (string file in files)
        {
            var wlist = GetLines(loader, file.Trim());
            words.UnionWith(StopFilter.MakeStopSet(luceneMatchVersion, wlist, ignoreCase));
        }
    }
    return words;
}
/// <summary>
/// Checks <c>UnionWith</c> on a case-sensitive <see cref="CharArraySet"/> when the incoming
/// values are <see cref="ICharSequence"/> instances: first with values that are all already
/// present, then with a mix of new values and one existing value.
/// </summary>
public virtual void TestUnionWithCharSequence()
{
    // Seven distinct words used to seed the set.
    var seedWords = new string[] { "sally", "sells", "seashells", "by", "the", "sea", "shore" };
    CharArraySet target = new CharArraySet(TEST_VERSION_CURRENT, seedWords, false);

    // Every entry here is already one of the seed words.
    var alreadyPresent = new List<ICharSequence>
    {
        new StringCharSequenceWrapper("seashells"),
        new StringCharSequenceWrapper("sea"),
        new StringCharSequenceWrapper("shore")
    };

    // Six new words followed by one seed word ("sells").
    var partlyNew = new List<ICharSequence>
    {
        new StringCharSequenceWrapper("true"),
        new StringCharSequenceWrapper("set"),
        new StringCharSequenceWrapper("of"),
        new StringCharSequenceWrapper("unique"),
        new StringCharSequenceWrapper("values"),
        new StringCharSequenceWrapper("except"),
        new StringCharSequenceWrapper("sells")
    };

    var expectedAfterUnion = new string[]
    {
        "sally", "sells", "seashells", "by", "the", "sea", "shore",
        "true", "set", "of", "unique", "values", "except"
    };

    // Add existing values: nothing changes, so UnionWith reports false.
    assertFalse(target.UnionWith(alreadyPresent));
    assertEquals(7, target.Count);
    CollectionAssert.AreEquivalent(seedWords, target);

    // Add mixed existing/non-existing values: six additions, so UnionWith reports true.
    assertTrue(target.UnionWith(partlyNew));
    assertEquals(13, target.Count);
    CollectionAssert.AreEquivalent(expectedAfterUnion, target);
}
/// <summary>
/// Checks <c>UnionWith</c> on a case-sensitive <see cref="CharArraySet"/> when the incoming
/// values are typed as <c>object</c>: strings that already exist, a mix of new and existing
/// strings, and finally an array of assorted primitive, <c>char[]</c> and
/// <see cref="ICharSequence"/> values.
/// </summary>
public virtual void TestUnionWithObject()
{
    // Seven distinct words used to seed the set.
    var seedWords = new string[] { "sally", "sells", "seashells", "by", "the", "sea", "shore" };
    CharArraySet target = new CharArraySet(TEST_VERSION_CURRENT, seedWords, false);

    // Every entry here is already one of the seed words.
    var alreadyPresent = new List<object> { "seashells", "sea", "shore" };

    // Six new words followed by one seed word ("sells").
    var partlyNew = new List<object> { "true", "set", "of", "unique", "values", "except", "sells" };

    // Thirteen values of assorted runtime types, none of which is in the seed words.
    var assortedTypes = new object[]
    {
        true,
        (byte)55,
        (short)44,
        (int)33,
        (sbyte)22,
        (long)11,
        (char)'\n',
        "hurray",
        (uint)99,
        (ulong)89,
        (ushort)79,
        new char[] { 't', 'w', 'o' },
        new StringCharSequenceWrapper("testing")
    };

    var expectedAfterUnion = new string[]
    {
        "sally", "sells", "seashells", "by", "the", "sea", "shore",
        "true", "set", "of", "unique", "values", "except"
    };

    // Add existing values: nothing changes, so UnionWith reports false.
    assertFalse(target.UnionWith(alreadyPresent));
    assertEquals(7, target.Count);
    CollectionAssert.AreEquivalent(seedWords, target);

    // Add mixed existing/non-existing values: six additions, so UnionWith reports true.
    assertTrue(target.UnionWith(partlyNew));
    assertEquals(13, target.Count);
    CollectionAssert.AreEquivalent(expectedAfterUnion, target);

    // Start over from an empty set and re-add the seed words as objects.
    target.Clear();
    assertEquals(0, target.Count);
    // Need to cast here because the .NET return type is void for UnionWith.
    assertTrue(target.UnionWith(seedWords.Cast<object>()));
    CollectionAssert.AreEquivalent(seedWords, target);

    // Add mixed types as object: 7 seed words + 13 new values = 20 entries.
    assertTrue(target.UnionWith(assortedTypes));
    assertEquals(20, target.Count);

    // Each value must be found again when looked up with its original static type.
    assertTrue(target.Contains(true));
    assertTrue(target.Contains((byte)55));
    assertTrue(target.Contains((short)44));
    assertTrue(target.Contains((int)33));
    assertTrue(target.Contains((sbyte)22));
    assertTrue(target.Contains((long)11));
    assertTrue(target.Contains((char)'\n'));
    assertTrue(target.Contains("hurray"));
    assertTrue(target.Contains((uint)99));
    assertTrue(target.Contains((ulong)89));
    assertTrue(target.Contains((ushort)79));
    assertTrue(target.Contains(new char[] { 't', 'w', 'o' }));
    assertTrue(target.Contains(new StringCharSequenceWrapper("testing")));
}
/// <summary>
/// Wraps a populated <see cref="CharArraySet"/> with <c>CharArraySet.UnmodifiableSet</c> and
/// checks that each mutating call attempted below throws
/// <see cref="System.NotSupportedException"/> and leaves the contents as they were.
/// </summary>
public virtual void TestModifyOnUnmodifiable()
{
    // Build the backing set and remember its size before wrapping it.
    CharArraySet backing = new CharArraySet(TEST_VERSION_CURRENT, 10, true);
    backing.AddAll(TEST_STOP_WORDS);
    int expectedSize = backing.size();

    CharArraySet set = CharArraySet.UnmodifiableSet(backing);
    assertEquals("Set size changed due to unmodifiableSet call", expectedSize, set.size());

    string absentWord = "SirGallahad";
    assertFalse("Test String already exists in set", set.Contains(absentWord));

    // Add as char[]
    try
    {
        set.Add(absentWord.ToCharArray());
        fail("Modified unmodifiable set");
    }
    catch (System.NotSupportedException)
    {
        // expected
        assertFalse("Test String has been added to unmodifiable set", set.contains(absentWord));
        assertEquals("Size of unmodifiable set has changed", expectedSize, set.size());
    }

    // Add as string
    try
    {
        set.add(absentWord);
        fail("Modified unmodifiable set");
    }
    catch (System.NotSupportedException)
    {
        // expected
        assertFalse("Test String has been added to unmodifiable set", set.contains(absentWord));
        assertEquals("Size of unmodifiable set has changed", expectedSize, set.size());
    }

    // Add as StringBuilder
    try
    {
        set.Add(new StringBuilder(absentWord));
        fail("Modified unmodifiable set");
    }
    catch (System.NotSupportedException)
    {
        // expected
        assertFalse("Test String has been added to unmodifiable set", set.contains(absentWord));
        assertEquals("Size of unmodifiable set has changed", expectedSize, set.size());
    }

    // Clear
    try
    {
        set.clear();
        fail("Modified unmodifiable set");
    }
    catch (System.NotSupportedException)
    {
        // expected
        assertFalse("Changed unmodifiable set", set.contains(absentWord));
        assertEquals("Size of unmodifiable set has changed", expectedSize, set.size());
    }

    // Add as string once more, after the failed clear
    try
    {
        set.add(absentWord);
        fail("Modified unmodifiable set");
    }
    catch (System.NotSupportedException)
    {
        // expected
        assertFalse("Test String has been added to unmodifiable set", set.contains(absentWord));
        assertEquals("Size of unmodifiable set has changed", expectedSize, set.size());
    }

    // NOTE: This results in a StackOverflow exception. Since this is not a public member of CharArraySet,
    // but an extension method for the test fixture (which apparently has a bug), this test is non-critical

    //// This test was changed in 3.1, as a contains() call on the given Collection using the "correct" iterator's
    //// current key (now a char[]) on a Set<String> would not hit any element of the CAS and therefor never call
    //// remove() on the iterator
    //try
    //{
    //    set.removeAll(new CharArraySet(TEST_VERSION_CURRENT, TEST_STOP_WORDS, true));
    //    fail("Modified unmodifiable set");
    //}
    //catch (System.NotSupportedException)
    //{
    //    // expected
    //    assertEquals("Size of unmodifiable set has changed", size, set.size());
    //}

    #region Added for better .NET support
    // This test was added for .NET to check the Remove method, since the extension method
    // above fails to execute.
    try
    {
#pragma warning disable 612, 618
        set.Remove(TEST_STOP_WORDS[0]);
#pragma warning restore 612, 618
        fail("Modified unmodifiable set");
    }
    catch (System.NotSupportedException)
    {
        // expected
        assertEquals("Size of unmodifiable set has changed", expectedSize, set.size());
    }
    #endregion

    // Retain only a word that is not in the set
    try
    {
        set.retainAll(new CharArraySet(TEST_VERSION_CURRENT, new[] { absentWord }, true));
        fail("Modified unmodifiable set");
    }
    catch (System.NotSupportedException)
    {
        // expected
        assertEquals("Size of unmodifiable set has changed", expectedSize, set.size());
    }

    // Bulk add through the addAll helper
    try
    {
        set.addAll(new[] { absentWord });
        fail("Modified unmodifiable set");
    }
    catch (System.NotSupportedException)
    {
        // expected
        assertFalse("Test String has been added to unmodifiable set", set.contains(absentWord));
    }

    // LUCENENET Specific - added to test .NETified UnionWith method
    try
    {
        set.UnionWith(new[] { absentWord });
        fail("Modified unmodifiable set");
    }
    catch (System.NotSupportedException)
    {
        // expected
        assertFalse("Test String has been added to unmodifiable set", set.contains(absentWord));
    }

    // After all the rejected mutations, every original stop word must still be present.
    foreach (var stopWord in TEST_STOP_WORDS)
    {
        assertTrue(set.contains(stopWord));
    }
}