/// <summary>
/// Builds a <see cref="CharArraySet"/> holding every entry of <paramref name="stopWords"/>.
/// </summary>
/// <param name="matchVersion"> Lucene compatibility version handed to the <see cref="CharArraySet"/> constructor </param>
/// <param name="stopWords"> The stopwords to add; its length is used as the initial capacity of the set </param>
/// <param name="ignoreCase"> Case-insensitivity flag handed to the <see cref="CharArraySet"/> constructor </param>
/// <returns> The newly created set containing the given words </returns>
public static CharArraySet MakeStopSet(LuceneVersion matchVersion, string[] stopWords, bool ignoreCase)
{
    // Size the set up front from the array length, then copy the words in as a list.
    var result = new CharArraySet(matchVersion, stopWords.Length, ignoreCase);
    var wordList = Arrays.AsList(stopWords);
    result.AddAll(wordList);
    return result;
}
internal readonly bool forceFirstLetter; // make sure the first letter is capital even if it is in the keep list

/// <summary>
/// Creates a new CapitalizationFilterFactory, reading its settings from <paramref name="args"/>.
/// </summary>
/// <param name="args">
/// Factory parameters. Entries still present after all known settings have been read are
/// reported as unknown (presumably the Get* helpers remove the keys they consume - confirm
/// against the base class).
/// </param>
/// <exception cref="System.ArgumentException"> if <paramref name="args"/> is not empty after all known settings were read </exception>
public CapitalizationFilterFactory(IDictionary<string, string> args)
    : base(args)
{
    AssureMatchVersion();

    bool ignoreCase = GetBoolean(args, KEEP_IGNORE_CASE, false);
    IEnumerable<string> k = GetSet(args, KEEP);
    if (k != null)
    {
        keep = new CharArraySet(luceneMatchVersion, 10, ignoreCase);
        keep.AddAll(k);
    }

    k = GetSet(args, OK_PREFIX);
    if (k != null)
    {
        okPrefix = new List<char[]>();
        foreach (string item in k)
        {
            okPrefix.Add(item.ToCharArray());
        }
    }

    minWordLength = GetInt(args, MIN_WORD_LENGTH, 0);
    maxWordCount = GetInt(args, MAX_WORD_COUNT, CapitalizationFilter.DEFAULT_MAX_WORD_COUNT);
    maxTokenLength = GetInt(args, MAX_TOKEN_LENGTH, CapitalizationFilter.DEFAULT_MAX_TOKEN_LENGTH);
    onlyFirstWord = GetBoolean(args, ONLY_FIRST_WORD, true);
    forceFirstLetter = GetBoolean(args, FORCE_FIRST_LETTER, true);

    if (args.Count > 0)
    {
        // Concatenating the dictionary itself would only print its type name in .NET,
        // so spell out the leftover entries to make the error actionable.
        System.Text.StringBuilder unknown = new System.Text.StringBuilder();
        foreach (KeyValuePair<string, string> entry in args)
        {
            if (unknown.Length > 0)
            {
                unknown.Append(", ");
            }
            unknown.Append(entry.Key).Append('=').Append(entry.Value);
        }
        throw new System.ArgumentException("Unknown parameters: {" + unknown + "}");
    }
}
/// <summary>
/// Builds a <see cref="CharArraySet"/> from a list of stopwords.
/// </summary>
/// <typeparam name="T1"> Element type of the stopword list </typeparam>
/// <param name="matchVersion"> Lucene compatibility version handed to the <see cref="CharArraySet"/> constructor </param>
/// <param name="stopWords"> The stopword list; its <c>Count</c> is used as the initial capacity of the set </param>
/// <param name="ignoreCase"> Case-insensitivity flag handed to the <see cref="CharArraySet"/> constructor </param>
/// <returns> The newly created <see cref="CharArraySet"/> containing the words </returns>
public static CharArraySet MakeStopSet<T1>(Version matchVersion, IList<T1> stopWords, bool ignoreCase)
{
    int initialCapacity = stopWords.Count;
    var result = new CharArraySet(matchVersion, initialCapacity, ignoreCase);
    result.AddAll(stopWords);
    return result;
}
/// <summary>
/// Builds a <see cref="CharArraySet"/> from a sequence of stopwords.
/// </summary>
/// <typeparam name="T1"> Element type of the stopword sequence </typeparam>
/// <param name="matchVersion"> Lucene compatibility version handed to the <see cref="CharArraySet"/> constructor </param>
/// <param name="stopWords"> The stopwords to add; the sequence is enumerated exactly once </param>
/// <param name="ignoreCase"> Case-insensitivity flag handed to the <see cref="CharArraySet"/> constructor </param>
/// <returns> The newly created <see cref="CharArraySet"/> containing the words </returns>
public static CharArraySet MakeStopSet<T1>(LuceneVersion matchVersion, IEnumerable<T1> stopWords, bool ignoreCase)
{
    // Materialise once: calling Count() and then enumerating again would evaluate a
    // deferred query twice, and the two passes are not guaranteed to agree.
    object[] words = stopWords.Cast<object>().ToArray();

    var stopSet = new CharArraySet(matchVersion, words.Length, ignoreCase);
    stopSet.AddAll(words);
    return stopSet;
}
internal readonly bool forceFirstLetter; // make sure the first letter is capital even if it is in the keep list

/// <summary>
/// Creates a new CapitalizationFilterFactory, reading its settings from <paramref name="args"/>.
/// </summary>
/// <param name="args">
/// Factory parameters. Entries still present after all known settings have been read are
/// reported as unknown (presumably the get* helpers remove the keys they consume - confirm
/// against the base class).
/// </param>
/// <exception cref="System.ArgumentException"> if <paramref name="args"/> is not empty after all known settings were read </exception>
public CapitalizationFilterFactory(IDictionary<string, string> args)
    : base(args)
{
    assureMatchVersion();

    bool ignoreCase = getBoolean(args, KEEP_IGNORE_CASE, false);
    HashSet<string> k = getSet(args, KEEP);
    if (k != null)
    {
        keep = new CharArraySet(luceneMatchVersion, 10, ignoreCase);
        keep.AddAll(k);
    }

    k = getSet(args, OK_PREFIX);
    if (k != null)
    {
        okPrefix = new List<char[]>();
        foreach (string item in k)
        {
            okPrefix.Add(item.ToCharArray());
        }
    }

    minWordLength = getInt(args, MIN_WORD_LENGTH, 0);
    maxWordCount = getInt(args, MAX_WORD_COUNT, CapitalizationFilter.DEFAULT_MAX_WORD_COUNT);
    maxTokenLength = getInt(args, MAX_TOKEN_LENGTH, CapitalizationFilter.DEFAULT_MAX_TOKEN_LENGTH);
    onlyFirstWord = getBoolean(args, ONLY_FIRST_WORD, true);
    forceFirstLetter = getBoolean(args, FORCE_FIRST_LETTER, true);

    if (args.Count > 0)
    {
        // Concatenating the dictionary itself would only print its type name in .NET,
        // so spell out the leftover entries to make the error actionable.
        System.Text.StringBuilder unknown = new System.Text.StringBuilder();
        foreach (KeyValuePair<string, string> entry in args)
        {
            if (unknown.Length > 0)
            {
                unknown.Append(", ");
            }
            unknown.Append(entry.Key).Append('=').Append(entry.Value);
        }
        throw new System.ArgumentException("Unknown parameters: {" + unknown + "}");
    }
}
/// <summary>
/// Type initializer: copies <c>EnglishStopWords</c> into a case-sensitive
/// <see cref="CharArraySet"/> and publishes an unmodifiable view of it as
/// <c>EnglishStopWordsSet</c>.
/// </summary>
static StopWordList()
{
    var words = new System.Collections.ArrayList(EnglishStopWords);
    var mutableSet = new CharArraySet(EnglishStopWords.Length, false);
    mutableSet.AddAll(words);

    EnglishStopWordsSet = CharArraySet.UnmodifiableSet(mutableSet);
}
/// <summary>
/// Checks that lookups by (buffer, offset, length) and by the equivalent string both
/// succeed when the term starts at a non-zero offset, for a regular set and for its
/// unmodifiable wrapper.
/// </summary>
public virtual void TestNonZeroOffset()
{
    string[] words = { "Hello", "World", "this", "is", "a", "test" };

    // "this" sits at offset 1, length 4, inside the surrounding buffer.
    char[] buffer = "xthisy".ToCharArray();

    CharArraySet modifiable = new CharArraySet(TEST_VERSION_CURRENT, 10, true);
    modifiable.AddAll(words);
    assertTrue(modifiable.Contains(buffer, 1, 4));
    assertTrue(modifiable.Contains(new string(buffer, 1, 4)));

    // Same expectations against the unmodifiable view.
    CharArraySet unmodifiable = CharArraySet.UnmodifiableSet(modifiable);
    assertTrue(unmodifiable.Contains(buffer, 1, 4));
    assertTrue(unmodifiable.Contains(new string(buffer, 1, 4)));
}
/// <summary>
/// Resolves the stop-word set for the search document stored in the thread's
/// <c>Constants.TlsSearchInfo</c> data slot.
/// </summary>
/// <returns>
/// An unmodifiable set built from the configured comma-separated stop words (lower-cased
/// with the resolved culture) when such a list exists and is non-empty; otherwise
/// <c>StopAnalyzer.ENGLISH_STOP_WORDS_SET</c>.
/// </returns>
private static ISet<string> GetStopWords()
{
    int portalId;
    string cultureCode;

    var searchDoc = Thread.GetData(Thread.GetNamedDataSlot(Constants.TlsSearchInfo)) as SearchDocument;
    if (searchDoc != null)
    {
        portalId = searchDoc.PortalId;
        cultureCode = searchDoc.CultureCode;

        // Fall back to the portal's default language when the document carries no culture.
        if (string.IsNullOrEmpty(cultureCode))
        {
            var portalInfo = PortalController.Instance.GetPortal(portalId);
            if (portalInfo != null)
            {
                cultureCode = portalInfo.DefaultLanguage;
            }
        }
    }
    else
    {
        // No search document on this thread: default portal and the thread's current culture.
        portalId = 0; // default
        cultureCode = Thread.CurrentThread.CurrentCulture.Name;
    }

    var stops = StopAnalyzer.ENGLISH_STOP_WORDS_SET;
    var searchStopWords = SearchHelper.Instance.GetSearchStopWords(portalId, cultureCode);
    if (searchStopWords == null || string.IsNullOrEmpty(searchStopWords.StopWords))
    {
        return stops;
    }

    //TODO Use cache from InternalSearchController
    var cultureInfo = new CultureInfo(cultureCode ?? "en-US");
    var loweredWords = searchStopWords.StopWords
        .Split(',')
        .Select(word => word.ToLower(cultureInfo))
        .ToArray();

    var customSet = new CharArraySet(loweredWords.Length, false);
    customSet.AddAll(loweredWords);
    stops = CharArraySet.UnmodifiableSet(customSet);
    return stops;
}
/// <summary>
/// Returns a <seealso cref="CharArraySet"/> built from <paramref name="wordFiles"/>,
/// which can be a comma-separated list of filenames.
/// </summary>
/// <param name="loader"> Resource loader used to read the lines of each file </param>
/// <param name="wordFiles"> The file name(s) to read words from </param>
/// <param name="ignoreCase"> Case-insensitivity flag used for every set that is created </param>
/// <returns> The combined word set, or <c>null</c> when no file names were found </returns>
protected internal CharArraySet GetWordSet(ResourceLoader loader, string wordFiles, bool ignoreCase)
{
    assureMatchVersion();

    IList<string> files = splitFileNames(wordFiles);
    if (files.Count <= 0)
    {
        return null;
    }

    // default stopwords list has 35 or so words, but maybe don't make it that
    // big to start
    CharArraySet combined = new CharArraySet(luceneMatchVersion, files.Count * 10, ignoreCase);
    foreach (string file in files)
    {
        var lines = getLines(loader, file.Trim());
        var fileWords = StopFilter.makeStopSet(luceneMatchVersion, lines, ignoreCase);
        combined.AddAll(fileWords);
    }

    return combined;
}
/// <summary>
/// Type initializer: fills <c>PORTUGUESE</c> with the stopword list and publishes an
/// unmodifiable, case-sensitive <see cref="CharArraySet"/> of it as <c>PORTUGUESE_SET</c>.
/// </summary>
static Stopwords()
{
    // NOTE(review): the list is reproduced verbatim; "nas" and "pelas" each appear twice.
    PORTUGUESE = new[]
    {
        "a", "ainda", "alem", "ambas", "ambos", "antes", "ao", "aonde", "aos", "apos",
        "aquele", "aqueles", "as", "assim", "com", "como", "contra", "contudo", "cuja", "cujas",
        "cujo", "cujos", "da", "das", "de", "dela", "dele", "deles", "demais", "depois",
        "desde", "desta", "deste", "dispoe", "dispoem", "diversa", "diversas", "diversos", "do", "dos",
        "durante", "e", "ela", "elas", "ele", "eles", "em", "entao", "entre", "essa",
        "essas", "esse", "esses", "esta", "estas", "este", "estes", "ha", "isso", "isto",
        "logo", "mais", "mas", "mediante", "menos", "mesma", "mesmas", "mesmo", "mesmos", "na",
        "nas", "nao", "nas", "nem", "nesse", "neste", "nos", "o", "os", "ou",
        "outra", "outras", "outro", "outros", "pelas", "pelas", "pelo", "pelos", "perante", "pois",
        "por", "porque", "portanto", "proprio", "propios", "quais", "qual", "qualquer", "quando", "quanto",
        "que", "quem", "quer", "se", "seja", "sem", "sendo", "seu", "seus", "sob",
        "sobre", "sua", "suas", "tal", "tambem", "teu", "teus", "toda", "todas", "todo",
        "todos", "tua", "tuas", "tudo", "um", "uma", "umas", "uns"
    };

    var words = new System.Collections.ArrayList(PORTUGUESE);
    var mutableSet = new CharArraySet(PORTUGUESE.Length, false);
    mutableSet.AddAll(words);

    PORTUGUESE_SET = CharArraySet.UnmodifiableSet(mutableSet);
}
/// <summary>
/// Builds a stopword set from the given array of words.
/// </summary>
/// <param name="stopWords">The stopwords to add; the array length is used as the initial capacity</param>
/// <param name="ignoreCase">Case-insensitivity flag handed to the <see cref="CharArraySet"/> constructor</param>
/// <returns>The newly created set containing the words</returns>
public static ISet<string> MakeStopSet(string[] stopWords, bool ignoreCase)
{
    int initialCapacity = stopWords.Length;
    var result = new CharArraySet(initialCapacity, ignoreCase);
    result.AddAll(stopWords);
    return result;
}
/// <summary>
/// Checks that <c>Clear</c> empties the set and that the same words can be added
/// again afterwards.
/// </summary>
public virtual void TestClear()
{
    var words = new CharArraySet(TEST_VERSION_CURRENT, 10, true);

    words.AddAll(TEST_STOP_WORDS);
    assertEquals("Not all words added", TEST_STOP_WORDS.Length, words.size());

    // After clearing, the set reports size zero and none of the words are found.
    words.Clear();
    assertEquals("not empty", 0, words.size());
    foreach (var stopWord in TEST_STOP_WORDS)
    {
        assertFalse(words.Contains(stopWord));
    }

    // Re-adding restores every word.
    words.AddAll(TEST_STOP_WORDS);
    assertEquals("Not all words added", TEST_STOP_WORDS.Length, words.size());
    foreach (var stopWord in TEST_STOP_WORDS)
    {
        assertTrue("Set doesn't contain " + stopWord, words.Contains(stopWord));
    }
}
/// <summary>
/// Checks that wrapping a set with <c>CharArraySet.UnmodifiableSet</c> keeps its size
/// and contents, and that wrapping <c>null</c> is rejected.
/// </summary>
public virtual void TestUnmodifiableSet()
{
    int one = Convert.ToInt32(1);

    var source = new CharArraySet(TEST_VERSION_CURRENT, 10, true);
    source.AddAll(TEST_STOP_WORDS);
    source.Add(one);
    int expectedSize = source.size();

    CharArraySet wrapped = CharArraySet.UnmodifiableSet(source);
    assertEquals("Set size changed due to unmodifiableSet call", expectedSize, wrapped.size());

    foreach (var stopword in TEST_STOP_WORDS)
    {
        assertTrue(wrapped.Contains(stopword));
    }

    // The integer entry must be found via its int, string and char[] forms.
    assertTrue(wrapped.Contains(Convert.ToInt32(1)));
    assertTrue(wrapped.Contains("1"));
    assertTrue(wrapped.Contains(new[] { '1' }));

    try
    {
        CharArraySet.UnmodifiableSet(null);
        fail("can not make null unmodifiable");
    }
    catch (System.ArgumentNullException) // NOTE: In .NET we throw an ArgumentNullException, not a NullReferenceException
    {
        // expected
    }
}
/// <summary>
/// Checks that every mutating operation on an unmodifiable <see cref="CharArraySet"/>
/// throws <see cref="System.NotSupportedException"/> and leaves the size and contents
/// of the set untouched.
/// </summary>
public virtual void TestModifyOnUnmodifiable()
{
    CharArraySet source = new CharArraySet(TEST_VERSION_CURRENT, 10, true);
    source.AddAll(TEST_STOP_WORDS);
    int size = source.size();

    CharArraySet set = CharArraySet.UnmodifiableSet(source);
    assertEquals("Set size changed due to unmodifiableSet call", size, set.size());

    string notInSet = "SirGallahad";
    assertFalse("Test String already exists in set", set.Contains(notInSet));

    // Add as char[]
    try
    {
        set.Add(notInSet.ToCharArray());
        fail("Modified unmodifiable set");
    }
    catch (System.NotSupportedException)
    {
        // expected
        assertFalse("Test String has been added to unmodifiable set", set.contains(notInSet));
        assertEquals("Size of unmodifiable set has changed", size, set.size());
    }

    // Add as string
    try
    {
        set.add(notInSet);
        fail("Modified unmodifiable set");
    }
    catch (System.NotSupportedException)
    {
        // expected
        assertFalse("Test String has been added to unmodifiable set", set.contains(notInSet));
        assertEquals("Size of unmodifiable set has changed", size, set.size());
    }

    // Add as StringBuilder
    try
    {
        set.Add(new StringBuilder(notInSet));
        fail("Modified unmodifiable set");
    }
    catch (System.NotSupportedException)
    {
        // expected
        assertFalse("Test String has been added to unmodifiable set", set.contains(notInSet));
        assertEquals("Size of unmodifiable set has changed", size, set.size());
    }

    // Clear
    try
    {
        set.clear();
        fail("Modified unmodifiable set");
    }
    catch (System.NotSupportedException)
    {
        // expected
        assertFalse("Changed unmodifiable set", set.contains(notInSet));
        assertEquals("Size of unmodifiable set has changed", size, set.size());
    }

    // Add as string once more, after the rejected clear
    try
    {
        set.add(notInSet);
        fail("Modified unmodifiable set");
    }
    catch (System.NotSupportedException)
    {
        // expected
        assertFalse("Test String has been added to unmodifiable set", set.contains(notInSet));
        assertEquals("Size of unmodifiable set has changed", size, set.size());
    }

    // NOTE: This results in a StackOverflow exception. Since this is not a public member of CharArraySet,
    // but an extension method for the test fixture (which apparently has a bug), this test is non-critical
    //// This test was changed in 3.1, as a contains() call on the given Collection using the "correct" iterator's
    //// current key (now a char[]) on a Set<String> would not hit any element of the CAS and therefore never call
    //// remove() on the iterator
    //try
    //{
    //    set.removeAll(new CharArraySet(TEST_VERSION_CURRENT, TEST_STOP_WORDS, true));
    //    fail("Modified unmodifiable set");
    //}
    //catch (System.NotSupportedException)
    //{
    //    // expected
    //    assertEquals("Size of unmodifiable set has changed", size, set.size());
    //}

    #region Added for better .NET support
    // This test was added for .NET to check the Remove method, since the extension method
    // above fails to execute.
    try
    {
#pragma warning disable 612, 618
        set.Remove(TEST_STOP_WORDS[0]);
#pragma warning restore 612, 618
        fail("Modified unmodifiable set");
    }
    catch (System.NotSupportedException)
    {
        // expected
        assertEquals("Size of unmodifiable set has changed", size, set.size());
    }
    #endregion

    // retainAll
    try
    {
        set.retainAll(new CharArraySet(TEST_VERSION_CURRENT, new[] { notInSet }, true));
        fail("Modified unmodifiable set");
    }
    catch (System.NotSupportedException)
    {
        // expected
        assertEquals("Size of unmodifiable set has changed", size, set.size());
    }

    // addAll
    try
    {
        set.addAll(new[] { notInSet });
        fail("Modified unmodifiable set");
    }
    catch (System.NotSupportedException)
    {
        // expected
        assertFalse("Test String has been added to unmodifiable set", set.contains(notInSet));
    }

    // LUCENENET Specific - added to test .NETified UnionWith method
    try
    {
        set.UnionWith(new[] { notInSet });
        fail("Modified unmodifiable set");
    }
    catch (System.NotSupportedException)
    {
        // expected
        assertFalse("Test String has been added to unmodifiable set", set.contains(notInSet));
    }

    // All original words must still be present after the rejected modifications.
    foreach (var stopWord in TEST_STOP_WORDS)
    {
        assertTrue(set.contains(stopWord));
    }
}
/// <summary>
/// Type initializer: builds the English stopword list and publishes an unmodifiable,
/// case-sensitive <see cref="CharArraySet"/> of it as <c>ENGLISH_STOP_WORDS_SET</c>.
/// </summary>
static StopAnalyzer()
{
    string[] englishWords =
    {
        "a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if",
        "in", "into", "is", "it", "no", "not", "of", "on", "or", "such", "that",
        "the", "their", "then", "there", "these", "they", "this", "to", "was", "will", "with"
    };

    var mutableSet = new CharArraySet(englishWords.Length, false);
    mutableSet.AddAll(englishWords);

    ENGLISH_STOP_WORDS_SET = CharArraySet.UnmodifiableSet(mutableSet);
}