Пример #1
0
        /// <summary>
        /// Builds a <see cref="CharArraySet"/> that holds every entry of the given stopword array.
        /// </summary>
        /// <param name="matchVersion"> Lucene version to enable correct Unicode 4.0 behavior in the returned set if Version > 3.0 </param>
        /// <param name="stopWords"> An array of stopwords </param>
        /// <param name="ignoreCase"> If true, all words are lower cased first. </param>
        /// <returns> a set containing the words </returns>
        public static CharArraySet MakeStopSet(LuceneVersion matchVersion, string[] stopWords, bool ignoreCase)
        {
            // The array length is handed to the set constructor as its size argument.
            var result = new CharArraySet(matchVersion, stopWords.Length, ignoreCase);
            var wordList = Arrays.AsList(stopWords);

            result.AddAll(wordList);
            return result;
        }
        internal readonly bool forceFirstLetter; // make sure the first letter is capital even if it is in the keep list

        /// <summary>
        /// Creates a new CapitalizationFilterFactory, reading its options from <paramref name="args"/>.
        /// </summary>
        /// <param name="args"> Factory arguments. The options read here are KEEP_IGNORE_CASE, KEEP, OK_PREFIX,
        /// MIN_WORD_LENGTH, MAX_WORD_COUNT, MAX_TOKEN_LENGTH, ONLY_FIRST_WORD and FORCE_FIRST_LETTER. </param>
        /// <exception cref="System.ArgumentException"> Thrown when <paramref name="args"/> still contains
        /// entries after all of the options above have been read. </exception>
        public CapitalizationFilterFactory(IDictionary <string, string> args)
            : base(args)
        {
            AssureMatchVersion();
            bool ignoreCase        = GetBoolean(args, KEEP_IGNORE_CASE, false);
            IEnumerable <string> k = GetSet(args, KEEP);

            if (k != null)
            {
                keep = new CharArraySet(luceneMatchVersion, 10, ignoreCase);
                keep.AddAll(k);
            }

            k = GetSet(args, OK_PREFIX);
            if (k != null)
            {
                // Prefixes are stored as char[] entries.
                okPrefix = new List <char[]>();
                foreach (string item in k)
                {
                    okPrefix.Add(item.ToCharArray());
                }
            }

            minWordLength    = GetInt(args, MIN_WORD_LENGTH, 0);
            maxWordCount     = GetInt(args, MAX_WORD_COUNT, CapitalizationFilter.DEFAULT_MAX_WORD_COUNT);
            maxTokenLength   = GetInt(args, MAX_TOKEN_LENGTH, CapitalizationFilter.DEFAULT_MAX_TOKEN_LENGTH);
            onlyFirstWord    = GetBoolean(args, ONLY_FIRST_WORD, true);
            forceFirstLetter = GetBoolean(args, FORCE_FIRST_LETTER, true);

            // NOTE(review): presumably the Get* helpers remove the entries they consume, so anything
            // left in args at this point is an unsupported option - confirm against the base class.
            if (args.Count > 0)
            {
                // Concatenating the dictionary itself would only print its type name, so spell out
                // the leftover key=value pairs to make the message actionable.
                var unknown = new List <string>();
                foreach (KeyValuePair <string, string> entry in args)
                {
                    unknown.Add(entry.Key + "=" + entry.Value);
                }

                throw new System.ArgumentException("Unknown parameters: {" + string.Join(", ", unknown) + "}");
            }
        }
Пример #3
0
        /// <summary>
        /// Builds a <see cref="CharArraySet"/> that holds every entry of the given stopword list.
        /// </summary>
        /// <param name="matchVersion"> Lucene version to enable correct Unicode 4.0 behavior in the returned set if Version > 3.0 </param>
        /// <param name="stopWords"> A list of strings or char[] or any other ToString()-able items representing the stopwords </param>
        /// <param name="ignoreCase"> if true, all words are lower cased first </param>
        /// <returns> A set (<see cref="CharArraySet"/>) containing the words </returns>
        public static CharArraySet MakeStopSet <T1>(Version matchVersion, IList <T1> stopWords, bool ignoreCase)
        {
            // The list's element count is handed to the set constructor as its size argument.
            CharArraySet result = new CharArraySet(matchVersion, stopWords.Count, ignoreCase);

            result.AddAll(stopWords);
            return result;
        }
Пример #4
0
        /// <summary>
        /// Creates a stopword set from the given stopword sequence.
        /// </summary>
        /// <param name="matchVersion"> Lucene version to enable correct Unicode 4.0 behavior in the returned set if Version > 3.0 </param>
        /// <param name="stopWords"> A sequence of strings or char[] or any other ToString()-able items representing the stopwords.
        /// The sequence is enumerated exactly once. </param>
        /// <param name="ignoreCase"> if true, all words are lower cased first </param>
        /// <returns> A set (<see cref="CharArraySet"/>) containing the words </returns>
        public static CharArraySet MakeStopSet <T1>(LuceneVersion matchVersion, IEnumerable <T1> stopWords, bool ignoreCase)
        {
            // Materialize once: calling Count() and then enumerating again would walk a lazy
            // sequence twice, and the two passes are not guaranteed to agree.
            object[] words = stopWords.Cast <object>().ToArray();
            var stopSet = new CharArraySet(matchVersion, words.Length, ignoreCase);

            stopSet.AddAll(words);
            return stopSet;
        }
        internal readonly bool forceFirstLetter; // make sure the first letter is capital even if it is in the keep list

        /// <summary>
        /// Creates a new CapitalizationFilterFactory, reading its options from <paramref name="args"/>.
        /// </summary>
        /// <param name="args"> Factory arguments. The options read here are KEEP_IGNORE_CASE, KEEP, OK_PREFIX,
        /// MIN_WORD_LENGTH, MAX_WORD_COUNT, MAX_TOKEN_LENGTH, ONLY_FIRST_WORD and FORCE_FIRST_LETTER. </param>
        /// <exception cref="System.ArgumentException"> Thrown when <paramref name="args"/> still contains
        /// entries after all of the options above have been read. </exception>
        public CapitalizationFilterFactory(IDictionary<string, string> args)
            : base(args)
        {
            assureMatchVersion();
            bool ignoreCase = getBoolean(args, KEEP_IGNORE_CASE, false);
            HashSet<string> k = getSet(args, KEEP);
            if (k != null)
            {
                keep = new CharArraySet(luceneMatchVersion, 10, ignoreCase);
                keep.AddAll(k);
            }

            k = getSet(args, OK_PREFIX);
            if (k != null)
            {
                // Prefixes are stored as char[] entries.
                okPrefix = new List<char[]>();
                foreach (string item in k)
                {
                    okPrefix.Add(item.ToCharArray());
                }
            }

            minWordLength = getInt(args, MIN_WORD_LENGTH, 0);
            maxWordCount = getInt(args, MAX_WORD_COUNT, CapitalizationFilter.DEFAULT_MAX_WORD_COUNT);
            maxTokenLength = getInt(args, MAX_TOKEN_LENGTH, CapitalizationFilter.DEFAULT_MAX_TOKEN_LENGTH);
            onlyFirstWord = getBoolean(args, ONLY_FIRST_WORD, true);
            forceFirstLetter = getBoolean(args, FORCE_FIRST_LETTER, true);

            // NOTE(review): presumably the get* helpers remove the entries they consume, so anything
            // left in args at this point is an unsupported option - confirm against the base class.
            if (args.Count > 0)
            {
                // Concatenating the dictionary itself would only print its type name, so spell out
                // the leftover key=value pairs to make the message actionable.
                var unknown = new List<string>();
                foreach (KeyValuePair<string, string> entry in args)
                {
                    unknown.Add(entry.Key + "=" + entry.Value);
                }

                throw new System.ArgumentException("Unknown parameters: {" + string.Join(", ", unknown) + "}");
            }
        }
Пример #6
0
 /// <summary>
 /// Type initializer: copies EnglishStopWords into a case-sensitive <see cref="CharArraySet"/>
 /// and publishes an unmodifiable wrapper of it as EnglishStopWordsSet.
 /// </summary>
 static StopWordList()
 {
     var words = new CharArraySet(EnglishStopWords.Length, false);
     var source = new System.Collections.ArrayList(EnglishStopWords);

     words.AddAll(source);
     EnglishStopWordsSet = CharArraySet.UnmodifiableSet(words);
 }
Пример #7
0
        /// <summary>
        /// Checks that lookups by (buffer, offset, length) and by the equivalent string both succeed,
        /// on the original set and on its unmodifiable wrapper.
        /// </summary>
        public virtual void TestNonZeroOffset()
        {
            // "this" sits at offset 1 with length 4 inside the probe buffer.
            char[] probe = "xthisy".ToCharArray();
            string[] vocabulary = { "Hello", "World", "this", "is", "a", "test" };

            var charSet = new CharArraySet(TEST_VERSION_CURRENT, 10, true);
            charSet.AddAll(vocabulary);
            assertTrue(charSet.Contains(probe, 1, 4));
            assertTrue(charSet.Contains(new string(probe, 1, 4)));

            // the same lookups must work through the unmodifiable wrapper
            charSet = CharArraySet.UnmodifiableSet(charSet);
            assertTrue(charSet.Contains(probe, 1, 4));
            assertTrue(charSet.Contains(new string(probe, 1, 4)));
        }
Пример #8
0
        /// <summary>
        /// Resolves the stop words for the search document stored in the current thread's
        /// named data slot (Constants.TlsSearchInfo).
        /// </summary>
        /// <returns>
        /// An unmodifiable set built from the configured comma-separated stop-word list when one
        /// is available; otherwise StopAnalyzer.ENGLISH_STOP_WORDS_SET.
        /// </returns>
        private static ISet <string> GetStopWords()
        {
            int    portalId;
            string cultureCode;

            var searchDoc = Thread.GetData(Thread.GetNamedDataSlot(Constants.TlsSearchInfo)) as SearchDocument;

            if (searchDoc == null)
            {
                // No document in the slot: fall back to portal 0 and the thread's culture.
                portalId    = 0; // default
                cultureCode = Thread.CurrentThread.CurrentCulture.Name;
            }
            else
            {
                portalId    = searchDoc.PortalId;
                cultureCode = searchDoc.CultureCode;
                if (string.IsNullOrEmpty(cultureCode))
                {
                    // The document carries no culture; use the portal's default language if the portal is known.
                    var portalInfo = PortalController.Instance.GetPortal(portalId);
                    if (portalInfo != null)
                    {
                        cultureCode = portalInfo.DefaultLanguage;
                    }
                }
            }

            var stops           = StopAnalyzer.ENGLISH_STOP_WORDS_SET;
            var searchStopWords = SearchHelper.Instance.GetSearchStopWords(portalId, cultureCode);

            if (searchStopWords != null && !string.IsNullOrEmpty(searchStopWords.StopWords))
            {
                //TODO Use cache from InternalSearchController
                var cultureInfo = new CultureInfo(cultureCode ?? "en-US");

                // Trim each entry and drop blanks so a list written as "a, the" yields "the"
                // rather than " the", and stray commas do not add empty stop words.
                var strArray = searchStopWords.StopWords
                               .Split(new[] { ',' }, System.StringSplitOptions.RemoveEmptyEntries)
                               .Select(s => s.Trim().ToLower(cultureInfo))
                               .Where(s => s.Length > 0)
                               .ToArray();
                var set = new CharArraySet(strArray.Length, false);
                set.AddAll(strArray);
                stops = CharArraySet.UnmodifiableSet(set);
            }

            return(stops);
        }
 /// <summary>
 /// Builds a <see cref="CharArraySet"/> from the lines of every file named in
 /// <paramref name="wordFiles"/>, which can be a comma-separated list of filenames.
 /// </summary>
 /// <param name="loader"> Loader passed to getLines to read each file </param>
 /// <param name="wordFiles"> File name list, split with splitFileNames </param>
 /// <param name="ignoreCase"> Case-insensitivity flag for the resulting set </param>
 /// <returns> The combined word set, or null when no file names were found </returns>
 protected internal CharArraySet GetWordSet(ResourceLoader loader, string wordFiles, bool ignoreCase)
 {
     assureMatchVersion();
     IList<string> fileNames = splitFileNames(wordFiles);
     if (fileNames.Count <= 0)
     {
         return null;
     }

     // default stopwords list has 35 or so words, but maybe don't make it that
     // big to start
     CharArraySet combined = new CharArraySet(luceneMatchVersion, fileNames.Count * 10, ignoreCase);
     foreach (string fileName in fileNames)
     {
         var lines = getLines(loader, fileName.Trim());
         combined.AddAll(StopFilter.makeStopSet(luceneMatchVersion, lines, ignoreCase));
     }

     return combined;
 }
Пример #10
0
        /// <summary>
        /// Type initializer: fills PORTUGUESE with the (accent-free) Portuguese stop words and
        /// publishes an unmodifiable, case-sensitive <see cref="CharArraySet"/> of them as PORTUGUESE_SET.
        /// </summary>
        static Stopwords()
        {
            // Entries are grouped by initial letter. Compared with the earlier list: "pela" replaces
            // a duplicated "pelas", "proprios" replaces the misspelt "propios", and a second,
            // redundant "nas" has been dropped.
            PORTUGUESE = new string[] {
                "a", "ainda", "alem", "ambas", "ambos", "antes", "ao", "aonde", "aos", "apos",
                "aquele", "aqueles", "as", "assim",
                "com", "como", "contra", "contudo", "cuja", "cujas", "cujo", "cujos",
                "da", "das", "de", "dela", "dele", "deles", "demais", "depois", "desde", "desta",
                "deste", "dispoe", "dispoem", "diversa", "diversas", "diversos", "do", "dos", "durante",
                "e", "ela", "elas", "ele", "eles", "em", "entao", "entre", "essa", "essas",
                "esse", "esses", "esta", "estas", "este", "estes",
                "ha",
                "isso", "isto",
                "logo",
                "mais", "mas", "mediante", "menos", "mesma", "mesmas", "mesmo", "mesmos",
                "na", "nas", "nao", "nem", "nesse", "neste", "nos",
                "o", "os", "ou", "outra", "outras", "outro", "outros",
                "pela", "pelas", "pelo", "pelos", "perante", "pois", "por", "porque", "portanto",
                "proprio", "proprios",
                "quais", "qual", "qualquer", "quando", "quanto", "que", "quem", "quer",
                "se", "seja", "sem", "sendo", "seu", "seus", "sob", "sobre", "sua", "suas",
                "tal", "tambem", "teu", "teus", "toda", "todas", "todo", "todos", "tua", "tuas", "tudo",
                "um", "uma", "umas", "uns"
            };
            var stopSet = new CharArraySet(PORTUGUESE.Length, false);

            stopSet.AddAll(new System.Collections.ArrayList(PORTUGUESE));
            PORTUGUESE_SET = CharArraySet.UnmodifiableSet(stopSet);
        }
Пример #11
0
		/// <summary>
		/// Builds a stopword set from the given stopword array.
		/// </summary>
		/// <param name="stopWords">An array of stopwords</param>
		/// <param name="ignoreCase">If true, all words are lower cased first.</param>
		/// <returns> a Set containing the words</returns>
		public static ISet<string> MakeStopSet(string[] stopWords, bool ignoreCase)
		{
			// The array length is handed to the set constructor as its size argument.
			CharArraySet result = new CharArraySet(stopWords.Length, ignoreCase);

			result.AddAll(stopWords);
			return result;
		}
Пример #12
0
 /// <summary>
 /// Fills a set with TEST_STOP_WORDS, clears it, verifies it is empty, then refills it
 /// and verifies every word is present again.
 /// </summary>
 public virtual void TestClear()
 {
     var charSet = new CharArraySet(TEST_VERSION_CURRENT, 10, true);

     charSet.AddAll(TEST_STOP_WORDS);
     assertEquals("Not all words added", TEST_STOP_WORDS.Length, charSet.size());

     // after Clear() nothing may remain
     charSet.Clear();
     assertEquals("not empty", 0, charSet.size());
     foreach (var word in TEST_STOP_WORDS)
     {
         assertFalse(charSet.Contains(word));
     }

     // the cleared set must accept the same words again
     charSet.AddAll(TEST_STOP_WORDS);
     assertEquals("Not all words added", TEST_STOP_WORDS.Length, charSet.size());
     foreach (var word in TEST_STOP_WORDS)
     {
         assertTrue("Set doesn't contain " + word, charSet.Contains(word));
     }
 }
Пример #13
0
        /// <summary>
        /// Checks that wrapping a set with UnmodifiableSet keeps its size and contents, and that
        /// wrapping null is rejected.
        /// </summary>
        public virtual void TestUnmodifiableSet()
        {
            var charSet = new CharArraySet(TEST_VERSION_CURRENT, 10, true);
            charSet.AddAll(TEST_STOP_WORDS);
            charSet.Add(Convert.ToInt32(1));

            int sizeBeforeWrap = charSet.size();
            charSet = CharArraySet.UnmodifiableSet(charSet);
            assertEquals("Set size changed due to unmodifiableSet call", sizeBeforeWrap, charSet.size());

            foreach (var word in TEST_STOP_WORDS)
            {
                assertTrue(charSet.Contains(word));
            }

            // the integer entry must be reachable as an int, as a string and as a char[]
            assertTrue(charSet.Contains(Convert.ToInt32(1)));
            assertTrue(charSet.Contains("1"));
            assertTrue(charSet.Contains(new[] { '1' }));

            try
            {
                CharArraySet.UnmodifiableSet(null);
                fail("can not make null unmodifiable");
            }
            catch (System.ArgumentNullException) // NOTE: In .NET we throw an ArgumentNullException, not a NullReferenceException
            {
                // expected
            }
        }
Пример #14
0
        /// <summary>
        /// Checks that every mutating operation on an unmodifiable set throws
        /// <see cref="System.NotSupportedException"/> and leaves the set's size and contents untouched.
        /// </summary>
        public virtual void TestModifyOnUnmodifiable()
        {
            var charSet = new CharArraySet(TEST_VERSION_CURRENT, 10, true);
            charSet.AddAll(TEST_STOP_WORDS);

            int expectedSize = charSet.size();
            charSet = CharArraySet.UnmodifiableSet(charSet);
            assertEquals("Set size changed due to unmodifiableSet call", expectedSize, charSet.size());

            string notInSet = "SirGallahad";
            assertFalse("Test String already exists in set", charSet.Contains(notInSet));

            // add as char[]
            try
            {
                charSet.Add(notInSet.ToCharArray());
                fail("Modified unmodifiable set");
            }
            catch (System.NotSupportedException)
            {
                // expected
                assertFalse("Test String has been added to unmodifiable set", charSet.contains(notInSet));
                assertEquals("Size of unmodifiable set has changed", expectedSize, charSet.size());
            }

            // add as string
            try
            {
                charSet.add(notInSet);
                fail("Modified unmodifiable set");
            }
            catch (System.NotSupportedException)
            {
                // expected
                assertFalse("Test String has been added to unmodifiable set", charSet.contains(notInSet));
                assertEquals("Size of unmodifiable set has changed", expectedSize, charSet.size());
            }

            // add as StringBuilder
            try
            {
                charSet.Add(new StringBuilder(notInSet));
                fail("Modified unmodifiable set");
            }
            catch (System.NotSupportedException)
            {
                // expected
                assertFalse("Test String has been added to unmodifiable set", charSet.contains(notInSet));
                assertEquals("Size of unmodifiable set has changed", expectedSize, charSet.size());
            }

            // clear
            try
            {
                charSet.clear();
                fail("Modified unmodifiable set");
            }
            catch (System.NotSupportedException)
            {
                // expected
                assertFalse("Changed unmodifiable set", charSet.contains(notInSet));
                assertEquals("Size of unmodifiable set has changed", expectedSize, charSet.size());
            }

            // add as string once more, after the rejected clear
            try
            {
                charSet.add(notInSet);
                fail("Modified unmodifiable set");
            }
            catch (System.NotSupportedException)
            {
                // expected
                assertFalse("Test String has been added to unmodifiable set", charSet.contains(notInSet));
                assertEquals("Size of unmodifiable set has changed", expectedSize, charSet.size());
            }

            // NOTE: This results in a StackOverflow exception. Since this is not a public member of CharArraySet,
            // but an extension method for the test fixture (which apparently has a bug), this test is non-critical
            //// This test was changed in 3.1, as a contains() call on the given Collection using the "correct" iterator's
            //// current key (now a char[]) on a Set<String> would not hit any element of the CAS and therefore never call
            //// remove() on the iterator
            //try
            //{
            //    charSet.removeAll(new CharArraySet(TEST_VERSION_CURRENT, TEST_STOP_WORDS, true));
            //    fail("Modified unmodifiable set");
            //}
            //catch (System.NotSupportedException)
            //{
            //    // expected
            //    assertEquals("Size of unmodifiable set has changed", expectedSize, charSet.size());
            //}

            #region Added for better .NET support
            // This test was added for .NET to check the Remove method, since the extension method
            // above fails to execute.
            try
            {
#pragma warning disable 612, 618
                charSet.Remove(TEST_STOP_WORDS[0]);
#pragma warning restore 612, 618
                fail("Modified unmodifiable set");
            }
            catch (System.NotSupportedException)
            {
                // expected
                assertEquals("Size of unmodifiable set has changed", expectedSize, charSet.size());
            }
            #endregion

            // retainAll
            try
            {
                charSet.retainAll(new CharArraySet(TEST_VERSION_CURRENT, new [] { notInSet }, true));
                fail("Modified unmodifiable set");
            }
            catch (System.NotSupportedException)
            {
                // expected
                assertEquals("Size of unmodifiable set has changed", expectedSize, charSet.size());
            }

            // addAll
            try
            {
                charSet.addAll(new[] { notInSet });
                fail("Modified unmodifiable set");
            }
            catch (System.NotSupportedException)
            {
                // expected
                assertFalse("Test String has been added to unmodifiable set", charSet.contains(notInSet));
            }

            // LUCENENET Specific - added to test .NETified UnionWith method
            try
            {
                charSet.UnionWith(new[] { notInSet });
                fail("Modified unmodifiable set");
            }
            catch (System.NotSupportedException)
            {
                // expected
                assertFalse("Test String has been added to unmodifiable set", charSet.contains(notInSet));
            }

            // every original word must still be present after all the rejected modifications
            foreach (var word in TEST_STOP_WORDS)
            {
                assertTrue(charSet.contains(word));
            }
        }
Пример #15
0
		/// <summary>
		/// Type initializer: builds the case-sensitive English stop-word set and publishes an
		/// unmodifiable wrapper of it as ENGLISH_STOP_WORDS_SET.
		/// </summary>
		static StopAnalyzer()
		{
			string[] englishWords =
			{
				"a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if",
				"in", "into", "is", "it", "no", "not", "of", "on", "or", "such", "that",
				"the", "their", "then", "there", "these", "they", "this", "to", "was", "will", "with"
			};

			CharArraySet words = new CharArraySet(englishWords.Length, false);
			words.AddAll(englishWords);
			ENGLISH_STOP_WORDS_SET = CharArraySet.UnmodifiableSet(words);
		}