public virtual void TestExclude()
 {
     CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, AsSet("havnedistriktene"), false);
     Analyzer a = new NorwegianAnalyzer(TEST_VERSION_CURRENT, NorwegianAnalyzer.DefaultStopSet, exclusionSet);
     CheckOneTerm(a, "havnedistriktene", "havnedistriktene");
     CheckOneTerm(a, "havnedistrikter", "havnedistrikt");
 }
 public virtual void TestExclude()
 {
     CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, AsSet("abbandonata"), false);
     Analyzer a = new ItalianAnalyzer(TEST_VERSION_CURRENT, ItalianAnalyzer.DefaultStopSet, exclusionSet);
     CheckOneTerm(a, "abbandonata", "abbandonata");
     CheckOneTerm(a, "abbandonati", "abbandonat");
 }
 public virtual void TestExactCase()
 {
     StringReader reader = new StringReader("Now is The Time");
     CharArraySet stopWords = new CharArraySet(TEST_VERSION_CURRENT, new string[] { "is", "the", "Time" }, false);
     TokenStream stream = new StopFilter(TEST_VERSION_CURRENT, new MockTokenizer(reader, MockTokenizer.WHITESPACE, false), stopWords);
     AssertTokenStreamContents(stream, new string[] { "Now", "The" });
 }
 public virtual void TestExclude()
 {
     CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, AsSet("correspondente"), false);
     Analyzer a = new GalicianAnalyzer(TEST_VERSION_CURRENT, GalicianAnalyzer.DefaultStopSet, exclusionSet);
     CheckOneTerm(a, "correspondente", "correspondente");
     CheckOneTerm(a, "corresponderá", "correspond");
 }
 public virtual void TestExclude()
 {
     CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, AsSet("edeltäjistään"), false);
     Analyzer a = new FinnishAnalyzer(TEST_VERSION_CURRENT, FinnishAnalyzer.DefaultStopSet, exclusionSet);
     CheckOneTerm(a, "edeltäjiinsä", "edeltäj");
     CheckOneTerm(a, "edeltäjistään", "edeltäjistään");
 }
 public virtual void TestExclude()
 {
     CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, AsSet("undersøgelse"), false);
     Analyzer a = new DanishAnalyzer(TEST_VERSION_CURRENT, DanishAnalyzer.DefaultStopSet, exclusionSet);
     CheckOneTerm(a, "undersøgelse", "undersøgelse");
     CheckOneTerm(a, "undersøg", "undersøg");
 }
        protected CompoundWordTokenFilterBase(LuceneVersion matchVersion, TokenStream input, CharArraySet dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, bool onlyLongestMatch)
            : base(input)
        {
            termAtt = AddAttribute<ICharTermAttribute>() as CharTermAttribute;
            offsetAtt = AddAttribute<IOffsetAttribute>();
            posIncAtt = AddAttribute<IPositionIncrementAttribute>();

            this.matchVersion = matchVersion;
            this.tokens = new LinkedList<CompoundToken>();
            if (minWordSize < 0)
            {
                throw new System.ArgumentException("minWordSize cannot be negative");
            }
            this.minWordSize = minWordSize;
            if (minSubwordSize < 0)
            {
                throw new System.ArgumentException("minSubwordSize cannot be negative");
            }
            this.minSubwordSize = minSubwordSize;
            if (maxSubwordSize < 0)
            {
                throw new System.ArgumentException("maxSubwordSize cannot be negative");
            }
            this.maxSubwordSize = maxSubwordSize;
            this.onlyLongestMatch = onlyLongestMatch;
            this.dictionary = dictionary;
        }
 public virtual void TestExclude()
 {
     CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, AsSet("jaktkarlarne"), false);
     Analyzer a = new SwedishAnalyzer(TEST_VERSION_CURRENT, SwedishAnalyzer.DefaultStopSet, exclusionSet);
     CheckOneTerm(a, "jaktkarlarne", "jaktkarlarne");
     CheckOneTerm(a, "jaktkarlens", "jaktkarl");
 }
 public virtual void TestWithStemExclusionSet()
 {
     CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, 1, true);
     set.Add("پیاوە");
     Analyzer a = new SoraniAnalyzer(TEST_VERSION_CURRENT, CharArraySet.EMPTY_SET, set);
     AssertAnalyzesTo(a, "پیاوە", new string[] { "پیاوە" });
 }
            static DefaultsHolder()
            {
                try
                {
                    DEFAULT_STOP_SET = WordlistLoader.GetWordSet(IOUtils.GetDecodingReader(typeof(PolishAnalyzer),
                        typeof(PolishAnalyzer).Namespace + "." + DEFAULT_STOPWORD_FILE, Encoding.UTF8), "#",
#pragma warning disable 612, 618
                        LuceneVersion.LUCENE_CURRENT);
#pragma warning restore 612, 618
                }
                catch (IOException ex)
                {
                    // default set should always be present as it is part of the
                    // distribution (embedded resource)
                    throw new SystemException("Unable to load default stopword set", ex);
                }

                try
                {
                    DEFAULT_TABLE = StempelStemmer.Load(typeof(PolishAnalyzer).Assembly.GetManifestResourceStream(
                        typeof(PolishAnalyzer).Namespace + "." + DEFAULT_STEMMER_FILE));
                }
                catch (IOException ex)
                {
                    // default set should always be present as it is part of the
                    // distribution (embedded resource)
                    throw new SystemException("Unable to load default stemming tables", ex);
                }
            }
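For reference, a minimal sketch of how these lazily loaded defaults get exercised, assuming the standard Lucene.NET 4.8 analysis API (GetTokenStream, ICharTermAttribute); the field name and sample text are illustrative:

 using System;
 using Lucene.Net.Analysis;
 using Lucene.Net.Analysis.Pl;
 using Lucene.Net.Analysis.TokenAttributes;
 using Lucene.Net.Util;

 // Analyze Polish text with the default stop set and stemming table loaded above.
 Analyzer analyzer = new PolishAnalyzer(LuceneVersion.LUCENE_48);
 using (TokenStream ts = analyzer.GetTokenStream("content", "Zamawiam dwie książki"))
 {
     ICharTermAttribute termAtt = ts.AddAttribute<ICharTermAttribute>();
     ts.Reset();
     while (ts.IncrementToken())
     {
         Console.WriteLine(termAtt.ToString()); // stop-filtered, Stempel-stemmed terms
     }
     ts.End();
 }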
Example #11
 /// <summary>
 /// Test that null arguments raise an exception (an NPE in the original Java test)
 /// </summary>
 public virtual void TestContainsWithNull()
 {
     CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, 1, true);
     try
     {
       set.Contains((char[]) null, 0, 10);
       fail("null value must raise NPE");
     }
     catch (System.NullReferenceException)
     {
     }
     try
     {
       set.Contains((string) null);
       fail("null value must raise NPE");
     }
     catch (System.NullReferenceException)
     {
     }
     try
     {
       set.Contains((object) null);
       fail("null value must raise NPE");
     }
     catch (System.NullReferenceException)
     {
     }
 }
 public virtual void TestWithKeywordAttribute()
 {
     CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, 1, true);
     set.Add("fischen");
     GermanStemFilter filter = new GermanStemFilter(new SetKeywordMarkerFilter(new LowerCaseTokenizer(TEST_VERSION_CURRENT, new StringReader("Fischen Trinken")), set));
     AssertTokenStreamContents(filter, new string[] { "fischen", "trink" });
 }
 public virtual void TestExclude()
 {
     CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, AsSet("արծիվներ"), false);
     Analyzer a = new ArmenianAnalyzer(TEST_VERSION_CURRENT, ArmenianAnalyzer.DefaultStopSet, exclusionSet);
     CheckOneTerm(a, "արծիվներ", "արծիվներ");
     CheckOneTerm(a, "արծիվ", "արծ");
 }
        internal readonly bool forceFirstLetter; // make sure the first letter is capital even if it is in the keep list

        /// <summary>
        /// Creates a new CapitalizationFilterFactory </summary>
        public CapitalizationFilterFactory(IDictionary<string, string> args)
            : base(args)
        {
            assureMatchVersion();
            bool ignoreCase = getBoolean(args, KEEP_IGNORE_CASE, false);
            HashSet<string> k = getSet(args, KEEP);
            if (k != null)
            {
                keep = new CharArraySet(luceneMatchVersion, 10, ignoreCase);
                keep.AddAll(k);
            }

            k = getSet(args, OK_PREFIX);
            if (k != null)
            {
                okPrefix = new List<char[]>();
                foreach (string item in k)
                {
                    okPrefix.Add(item.ToCharArray());
                }
            }

            minWordLength = getInt(args, MIN_WORD_LENGTH, 0);
            maxWordCount = getInt(args, MAX_WORD_COUNT, CapitalizationFilter.DEFAULT_MAX_WORD_COUNT);
            maxTokenLength = getInt(args, MAX_TOKEN_LENGTH, CapitalizationFilter.DEFAULT_MAX_TOKEN_LENGTH);
            onlyFirstWord = getBoolean(args, ONLY_FIRST_WORD, true);
            forceFirstLetter = getBoolean(args, FORCE_FIRST_LETTER, true);
            if (args.Count > 0)
            {
                throw new System.ArgumentException("Unknown parameters: " + args);
            }
        }
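A hedged sketch of driving this factory directly; the argument keys mirror the constants read above, and "luceneMatchVersion" satisfies assureMatchVersion():

 using System.Collections.Generic;
 using Lucene.Net.Analysis.Miscellaneous;

 var args = new Dictionary<string, string>
 {
     ["luceneMatchVersion"] = "4.8", // required by assureMatchVersion()
     ["keep"] = "and the BIG",       // whitespace-separated keep list
     ["onlyFirstWord"] = "false",
     ["minWordLength"] = "3",
 };
 // The constructor consumes the entries it recognizes; anything left over throws.
 var factory = new CapitalizationFilterFactory(args);
 // factory.Create(tokenStream) then yields the configured CapitalizationFilter.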
Example #15
 public void SetArticles(ISet<string> articles)
 {
     if (articles is CharArraySet)
         this.articles = (CharArraySet)articles;
     else
         this.articles = new CharArraySet(articles, true);
 }
 public virtual void TestExclude()
 {
     CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, AsSet("babakocsi"), false);
     Analyzer a = new HungarianAnalyzer(TEST_VERSION_CURRENT, HungarianAnalyzer.DefaultStopSet, exclusionSet);
     CheckOneTerm(a, "babakocsi", "babakocsi");
     CheckOneTerm(a, "babakocsijáért", "babakocs");
 }
 public virtual void TestExclude()
 {
     CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, AsSet("peledakan"), false);
     Analyzer a = new IndonesianAnalyzer(TEST_VERSION_CURRENT, IndonesianAnalyzer.DefaultStopSet, exclusionSet);
     CheckOneTerm(a, "peledakan", "peledakan");
     CheckOneTerm(a, "pembunuhan", "bunuh");
 }
 public virtual void Inform(ResourceLoader loader)
 {
     if (wordFiles != null)
     {
         words = GetWordSet(loader, wordFiles, ignoreCase);
     }
 }
 public virtual void TestExclude()
 {
     CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, AsSet("tirgiem"), false);
     Analyzer a = new LatvianAnalyzer(TEST_VERSION_CURRENT, LatvianAnalyzer.DefaultStopSet, exclusionSet);
     CheckOneTerm(a, "tirgiem", "tirgiem");
     CheckOneTerm(a, "tirgus", "tirg");
 }
 public virtual void TestExclude()
 {
     CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, AsSet("books"), false);
     Analyzer a = new EnglishAnalyzer(TEST_VERSION_CURRENT, EnglishAnalyzer.DefaultStopSet, exclusionSet);
     CheckOneTerm(a, "books", "books");
     CheckOneTerm(a, "book", "book");
 }
 public virtual void TestExclude()
 {
     CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, AsSet("absenţa"), false);
     Analyzer a = new RomanianAnalyzer(TEST_VERSION_CURRENT, RomanianAnalyzer.DefaultStopSet, exclusionSet);
     CheckOneTerm(a, "absenţa", "absenţa");
     CheckOneTerm(a, "absenţi", "absenţ");
 }
 public virtual void TestWithStemExclusionSet()
 {
     CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, 1, true);
     set.Add("hole");
     CzechAnalyzer cz = new CzechAnalyzer(TEST_VERSION_CURRENT, CharArraySet.EMPTY_SET, set);
     AssertAnalyzesTo(cz, "hole desek", new string[] { "hole", "desk" });
 }
 public virtual void TestWithStemExclusionSet()
 {
     CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, 1, true);
     set.Add("представление");
     Analyzer a = new RussianAnalyzer(TEST_VERSION_CURRENT, RussianAnalyzer.DefaultStopSet, set);
     AssertAnalyzesTo(a, "Вместе с тем о силе электромагнитной энергии имели представление еще", new string[] { "вмест", "сил", "электромагнитн", "энерг", "имел", "представление" });
 }
 public virtual void TestExclude()
 {
     CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, AsSet("ağacı"), false);
     Analyzer a = new TurkishAnalyzer(TEST_VERSION_CURRENT, TurkishAnalyzer.DefaultStopSet, exclusionSet);
     CheckOneTerm(a, "ağacı", "ağacı");
     CheckOneTerm(a, "ağaç", "ağaç");
 }
 public virtual void TestWithStemExclusionSet()
 {
     CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, 1, true);
     set.Add("строеве");
     Analyzer a = new BulgarianAnalyzer(TEST_VERSION_CURRENT, CharArraySet.EMPTY_SET, set);
     AssertAnalyzesTo(a, "строевете строеве", new string[] { "строй", "строеве" });
 }
        public ChineseFilter(TokenStream @in)
            : base(@in)
        {

            stopTable = new CharArraySet(LuceneVersion.LUCENE_CURRENT, Arrays.AsList(STOP_WORDS), false);
            termAtt = AddAttribute<ICharTermAttribute>();
        }
 public virtual void TestExclude()
 {
     CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, AsSet("llengües"), false);
     Analyzer a = new CatalanAnalyzer(TEST_VERSION_CURRENT, CatalanAnalyzer.DefaultStopSet, exclusionSet);
     CheckOneTerm(a, "llengües", "llengües");
     CheckOneTerm(a, "llengua", "llengu");
 }
        /// <summary>
        /// Creates a CapitalizationFilter with the specified parameters. </summary>
        /// <param name="in"> input tokenstream </param>
        /// <param name="onlyFirstWord"> should each word be capitalized or all of the words? </param>
        /// <param name="keep"> a keep word list.  Each word that should be kept separated by whitespace. </param>
        /// <param name="forceFirstLetter"> Force the first letter to be capitalized even if it is in the keep list. </param>
        /// <param name="okPrefix"> do not change word capitalization if a word begins with something in this list. </param>
        /// <param name="minWordLength"> how long the word needs to be to get capitalization applied.  If the
        ///                      minWordLength is 3, "and" > "And" but "or" stays "or". </param>
        /// <param name="maxWordCount"> if the token contains more then maxWordCount words, the capitalization is
        ///                     assumed to be correct. </param>
        /// <param name="maxTokenLength"> ??? </param>
        public CapitalizationFilter(TokenStream @in, bool onlyFirstWord, CharArraySet keep, bool forceFirstLetter, ICollection<char[]> okPrefix, int minWordLength, int maxWordCount, int maxTokenLength)
            : base(@in)
        {
            // LUCENENET: The guard clauses below were copied here from a later version of Lucene.
            // The tests for them were apparently not ported because the original 4.8.0 tests did
            // not expect these checks. Adding them anyway because there is no downside to this.
            if (minWordLength < 0)
            {
                throw new ArgumentOutOfRangeException(nameof(minWordLength), "minWordLength must be greater than or equal to zero");
            }
            if (maxWordCount < 1)
            {
                throw new ArgumentOutOfRangeException(nameof(maxWordCount), "maxWordCount must be greater than zero");
            }
            if (maxTokenLength < 1)
            {
                throw new ArgumentOutOfRangeException(nameof(maxTokenLength), "maxTokenLength must be greater than zero");
            }

            this.onlyFirstWord = onlyFirstWord;
            this.keep = keep;
            this.forceFirstLetter = forceFirstLetter;
            this.okPrefix = okPrefix;
            this.minWordLength = minWordLength;
            this.maxWordCount = maxWordCount;
            this.maxTokenLength = maxTokenLength;
            termAtt = AddAttribute<ICharTermAttribute>();
        }
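A minimal usage sketch for the constructor above, assuming a whitespace tokenizer and the default limits referenced by the factory; keep and okPrefix are left null:

 using System.IO;
 using Lucene.Net.Analysis.Core;
 using Lucene.Net.Analysis.Miscellaneous;
 using Lucene.Net.Util;

 var tokenizer = new WhitespaceTokenizer(LuceneVersion.LUCENE_48, new StringReader("kitten and dogs"));
 var filter = new CapitalizationFilter(tokenizer, onlyFirstWord: true, keep: null,
     forceFirstLetter: true, okPrefix: null, minWordLength: 0,
     maxWordCount: CapitalizationFilter.DEFAULT_MAX_WORD_COUNT,
     maxTokenLength: CapitalizationFilter.DEFAULT_MAX_TOKEN_LENGTH);
 // each whitespace-separated token is its own first word, so this
 // emits "Kitten", "And", "Dogs"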
 public virtual void TestExclude()
 {
     CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, AsSet("feirmeoireacht"), false);
     Analyzer a = new IrishAnalyzer(TEST_VERSION_CURRENT, IrishAnalyzer.DefaultStopSet, exclusionSet);
     CheckOneTerm(a, "feirmeoireacht", "feirmeoireacht");
     CheckOneTerm(a, "siopadóireacht", "siopadóir");
 }
 public virtual void TestExclude()
 {
     CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, AsSet("chicano"), false);
     Analyzer a = new SpanishAnalyzer(TEST_VERSION_CURRENT, SpanishAnalyzer.DefaultStopSet, exclusionSet);
     CheckOneTerm(a, "chicana", "chican");
     CheckOneTerm(a, "chicano", "chicano");
 }
Example #31
 /// <summary>
 /// Builds an analyzer with the given stop words.
 /// </summary>
 public BulgarianAnalyzer(LuceneVersion matchVersion, CharArraySet stopwords)
     : this(matchVersion, stopwords, CharArraySet.EMPTY_SET)
 {
 }
Example #32
 /// <summary>
 /// Builds an analyzer with the given stop words.
 /// </summary>
 /// <param name="matchVersion"> lucene compatibility version </param>
 /// <param name="stopwords"> a stopword set </param>
 public PortugueseAnalyzer(LuceneVersion matchVersion, CharArraySet stopwords)
     : this(matchVersion, stopwords, CharArraySet.EMPTY_SET)
 {
 }
Example #33
 public AnalyzerAnonymousInnerClassHelper2(TestGermanLightStemFilter outerInstance, CharArraySet exclusionSet)
 {
     this.outerInstance = outerInstance;
     this.exclusionSet  = exclusionSet;
 }
Example #34
 /// <summary>
 /// Creates a new <see cref="DictionaryCompoundWordTokenFilter"/>
 /// </summary>
 /// <param name="matchVersion">
 ///          Lucene version to enable correct Unicode 4.0 behavior in the
 ///          dictionaries if Version > 3.0. See <a
 ///          href="CompoundWordTokenFilterBase.html#version"
 ///          >CompoundWordTokenFilterBase</a> for details. </param>
 /// <param name="input">
 ///          the <see cref="TokenStream"/> to process </param>
 /// <param name="dictionary">
 ///          the word dictionary to match against. </param>
 /// <param name="minWordSize">
 ///          only words longer than this get processed </param>
 /// <param name="minSubwordSize">
 ///          only subwords longer than this get to the output stream </param>
 /// <param name="maxSubwordSize">
 ///          only subwords shorter than this get to the output stream </param>
 /// <param name="onlyLongestMatch">
 ///          Add only the longest matching subword to the stream </param>
 public DictionaryCompoundWordTokenFilter(LuceneVersion matchVersion, TokenStream input, CharArraySet dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, bool onlyLongestMatch)
     : base(matchVersion, input, dictionary, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch)
 {
     if (dictionary == null)
     {
         throw new ArgumentException("dictionary cannot be null");
     }
 }
Example #35
 /// <summary>
 /// Creates a new <see cref="DictionaryCompoundWordTokenFilter"/>
 /// </summary>
 /// <param name="matchVersion">
 ///          Lucene version to enable correct Unicode 4.0 behavior in the
 ///          dictionaries if Version > 3.0. See <a
 ///          href="CompoundWordTokenFilterBase.html#version"
 ///          >CompoundWordTokenFilterBase</a> for details. </param>
 /// <param name="input">
 ///          the <see cref="TokenStream"/> to process </param>
 /// <param name="dictionary">
 ///          the word dictionary to match against. </param>
 public DictionaryCompoundWordTokenFilter(LuceneVersion matchVersion, TokenStream input, CharArraySet dictionary)
     : base(matchVersion, input, dictionary)
 {
     if (dictionary == null)
     {
         throw new ArgumentException("dictionary cannot be null");
     }
 }
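A small decompounding sketch against the constructors above; the German-style compound and mini-dictionary are illustrative only:

 using System.IO;
 using Lucene.Net.Analysis.Compound;
 using Lucene.Net.Analysis.Core;
 using Lucene.Net.Analysis.Util;
 using Lucene.Net.Util;

 var dict = new CharArraySet(LuceneVersion.LUCENE_48,
     new[] { "fuss", "ball", "spiel" }, true); // ignore case
 var tokenizer = new WhitespaceTokenizer(LuceneVersion.LUCENE_48, new StringReader("fussballspiel"));
 var filter = new DictionaryCompoundWordTokenFilter(LuceneVersion.LUCENE_48, tokenizer, dict);
 // emits the original token "fussballspiel" plus the dictionary subwords
 // "fuss", "ball", "spiel" at the same position (position increment 0)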
 internal static void AssertCapitalizesToKeyword(string input, string expected, bool onlyFirstWord, CharArraySet keep, bool forceFirstLetter, ICollection <char[]> okPrefix, int minWordLength, int maxWordCount, int maxTokenLength)
 {
     AssertCapitalizesTo(new MockTokenizer(new StringReader(input), MockTokenizer.KEYWORD, false), new string[] { expected }, onlyFirstWord, keep, forceFirstLetter, okPrefix, minWordLength, maxWordCount, maxTokenLength);
 }
        internal static void AssertCapitalizesTo(Tokenizer tokenizer, string[] expected, bool onlyFirstWord, CharArraySet keep, bool forceFirstLetter, ICollection <char[]> okPrefix, int minWordLength, int maxWordCount, int maxTokenLength)
        {
            CapitalizationFilter filter = new CapitalizationFilter(tokenizer, onlyFirstWord, keep, forceFirstLetter, okPrefix, minWordLength, maxWordCount, maxTokenLength,
                                                                   // LUCENENET specific - pass in the invariant culture to get the same behavior as Lucene,
                                                                   // otherwise the filter is culture-sensitive.
                                                                   CultureInfo.InvariantCulture);

            AssertTokenStreamContents(filter, expected);
        }
 public AnalyzerAnonymousInnerClassHelper2(TestFrenchMinimalStemFilter outerInstance, CharArraySet exclusionSet)
 {
     this.outerInstance = outerInstance;
     this.exclusionSet  = exclusionSet;
 }
Example #39
 protected CompoundWordTokenFilterBase(LuceneVersion matchVersion, TokenStream input, CharArraySet dictionary)
     : this(matchVersion, input, dictionary, DEFAULT_MIN_WORD_SIZE, DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE, false)
 {
 }
Example #40
 /// <summary>
 /// Builds an analyzer with the given stop words.
 /// <para>
 /// <b>NOTE:</b> The stopwords set should be pre-processed with the logic of
 /// <seealso cref="GreekLowerCaseFilter"/> for best results.
 ///
 /// </para>
 /// </summary>
 /// <param name="matchVersion"> Lucene compatibility version,
 ///   See <a href="#version">above</a> </param>
 /// <param name="stopwords"> a stopword set </param>
 public GreekAnalyzer(LuceneVersion matchVersion, CharArraySet stopwords)
     : base(matchVersion, stopwords)
 {
 }
Example #41
 /// <summary>
 /// Builds an analyzer with the given stop words and a set of words to be
 /// excluded from the <see cref="CzechStemFilter"/>.
 /// </summary>
 /// <param name="matchVersion"> <see cref="LuceneVersion"/> to match </param>
 /// <param name="stopwords"> a stopword set </param>
 /// <param name="stemExclusionTable"> a stemming exclusion set </param>
 public CzechAnalyzer(LuceneVersion matchVersion, CharArraySet stopwords, CharArraySet stemExclusionTable)
     : base(matchVersion, stopwords)
 {
     this.stemExclusionTable = CharArraySet.UnmodifiableSet(CharArraySet.Copy(matchVersion, stemExclusionTable));
 }
Example #42
        protected CompoundWordTokenFilterBase(LuceneVersion matchVersion, TokenStream input, CharArraySet dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, bool onlyLongestMatch)
            : base(input)
        {
            m_termAtt   = AddAttribute <ICharTermAttribute>();
            m_offsetAtt = AddAttribute <IOffsetAttribute>();
            posIncAtt   = AddAttribute <IPositionIncrementAttribute>();

            this.m_matchVersion = matchVersion;
            this.m_tokens       = new Queue <CompoundToken>();
            if (minWordSize < 0)
            {
                throw new ArgumentOutOfRangeException(nameof(minWordSize), "minWordSize cannot be negative"); // LUCENENET specific - changed from IllegalArgumentException to ArgumentOutOfRangeException (.NET convention)
            }
            this.m_minWordSize = minWordSize;
            if (minSubwordSize < 0)
            {
                throw new ArgumentOutOfRangeException(nameof(minSubwordSize), "minSubwordSize cannot be negative"); // LUCENENET specific - changed from IllegalArgumentException to ArgumentOutOfRangeException (.NET convention)
            }
            this.m_minSubwordSize = minSubwordSize;
            if (maxSubwordSize < 0)
            {
                throw new ArgumentOutOfRangeException(nameof(maxSubwordSize), "maxSubwordSize cannot be negative"); // LUCENENET specific - changed from IllegalArgumentException to ArgumentOutOfRangeException (.NET convention)
            }
            this.m_maxSubwordSize   = maxSubwordSize;
            this.m_onlyLongestMatch = onlyLongestMatch;
            this.m_dictionary       = dictionary;
        }
Example #43
        public virtual void TestMethods()
        {
            CharArrayMap<int?> cm = new CharArrayMap<int?>(TEST_VERSION_CURRENT, 2, false);
            Dictionary<string, int?> hm = new Dictionary<string, int?>();

            hm["foo"] = 1;
            hm["bar"] = 2;
            cm.PutAll(hm);
            assertEquals(hm.Count, cm.Count);
            hm["baz"] = 3;
            cm.PutAll(hm);
            assertEquals(hm.Count, cm.Count);

            CharArraySet cs = cm.Keys;
            int n = 0;

            foreach (object o in cs)
            {
                assertTrue(cm.ContainsKey(o));
                char[] co = (char[])o;
                assertTrue(cm.ContainsKey(co, 0, co.Length));
                n++;
            }
            assertEquals(hm.Count, n);
            assertEquals(hm.Count, cs.Count);
            assertEquals(cm.Count, cs.Count);
            cs.Clear();
            assertEquals(0, cs.Count);
            assertEquals(0, cm.Count);
            try
            {
                cs.Add("test");
                fail("keySet() allows adding new keys");
            }
            catch (System.NotSupportedException)
            {
                // pass
            }
            cm.PutAll(hm);
            assertEquals(hm.Count, cs.Count);
            assertEquals(cm.Count, cs.Count);

            // NOTE: the entry-set traversal below mirrors the Java original
            // (entrySet()/EntryIterator); exact member names depend on the port.
            IEnumerator<KeyValuePair<object, int?>> iter1 = cm.EntrySet().GetEnumerator();
            n = 0;
            while (iter1.MoveNext())
            {
                KeyValuePair<object, int?> entry = iter1.Current;
                object key = entry.Key;
                int? val = entry.Value;
                assertEquals(cm.Get(key), val);
                // KeyValuePair is immutable in .NET, so write back through the map
                cm.Put(key, val * 100);
                assertEquals(val * 100, (int)cm.Get(key));
                n++;
            }
            assertEquals(hm.Count, n);
            cm.Clear();
            cm.PutAll(hm);
            assertEquals(cm.Count, n);

            CharArrayMap<int?>.EntryIterator iter2 = cm.EntrySet().GetEnumerator();
            n = 0;
            while (iter2.HasNext)
            {
                char[] keyc = iter2.NextKey();
                int? val = iter2.CurrentValue;
                assertEquals(hm[new string(keyc)], val);
                iter2.SetValue(val * 100);
                assertEquals(val * 100, (int)cm.Get(keyc));
                n++;
            }
            assertEquals(hm.Count, n);

            cm.EntrySet().Clear();
            assertEquals(0, cm.Count);
            assertEquals(0, cm.EntrySet().Count);
            assertTrue(cm.Count == 0);
        }
Example #44
 /// <summary>
 /// Builds an analyzer with the given stop words and a stem exclusion set.
 /// If a stem exclusion set is provided this analyzer will add a <see cref="SetKeywordMarkerFilter"/>
 /// before <see cref="BulgarianStemFilter"/>.
 /// </summary>
 public BulgarianAnalyzer(LuceneVersion matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet)
     : base(matchVersion, stopwords)
 {
     this.stemExclusionSet = CharArraySet.UnmodifiableSet(CharArraySet.Copy(matchVersion, stemExclusionSet));
 }
Example #45
 /// <summary>
 /// Builds an analyzer with the given stop words.
 /// </summary>
 /// <param name="matchVersion"> lucene compatibility version </param>
 /// <param name="stopwords"> a stopword set </param>
 public SwedishAnalyzer(Version matchVersion, CharArraySet stopwords) : this(matchVersion, stopwords, CharArraySet.EMPTY_SET)
 {
 }
Example #46
 /// <summary>
 /// Create a new <see cref="SetKeywordMarkerFilter"/>, that marks the current token as a
 /// keyword if the tokens term buffer is contained in the given set via the
 /// <see cref="KeywordAttribute"/>.
 /// </summary>
 /// <param name="in">
 ///          <see cref="TokenStream"/> to filter </param>
 /// <param name="keywordSet">
 ///          the keywords set to lookup the current termbuffer </param>
 public SetKeywordMarkerFilter(TokenStream @in, CharArraySet keywordSet)
     : base(@in)
 {
     this.keywordSet = keywordSet;
     termAtt         = AddAttribute <ICharTermAttribute>();
 }
        internal static void assertCapitalizesTo(string input, string[] expected, bool onlyFirstWord, CharArraySet keep, bool forceFirstLetter, ICollection <char[]> okPrefix, int minWordLength, int maxWordCount, int maxTokenLength)
        {
            assertCapitalizesTo(new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false), expected, onlyFirstWord, keep, forceFirstLetter, okPrefix, minWordLength, maxWordCount, maxTokenLength);
        }
Example #48
        protected CompoundWordTokenFilterBase(LuceneVersion matchVersion, TokenStream input, CharArraySet dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, bool onlyLongestMatch)
            : base(input)
        {
            m_termAtt   = AddAttribute <ICharTermAttribute>();
            m_offsetAtt = AddAttribute <IOffsetAttribute>();
            posIncAtt   = AddAttribute <IPositionIncrementAttribute>();

            this.m_matchVersion = matchVersion;
            this.m_tokens       = new Queue <CompoundToken>();
            if (minWordSize < 0)
            {
                throw new ArgumentException("minWordSize cannot be negative");
            }
            this.m_minWordSize = minWordSize;
            if (minSubwordSize < 0)
            {
                throw new ArgumentException("minSubwordSize cannot be negative");
            }
            this.m_minSubwordSize = minSubwordSize;
            if (maxSubwordSize < 0)
            {
                throw new ArgumentException("maxSubwordSize cannot be negative");
            }
            this.m_maxSubwordSize   = maxSubwordSize;
            this.m_onlyLongestMatch = onlyLongestMatch;
            this.m_dictionary       = dictionary;
        }
Example #49
 /// <summary>
 /// Creates a new WordDelimiterFilter using <see cref="WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE"/>
 /// as its charTypeTable
 /// </summary>
 /// <param name="matchVersion"> lucene compatibility version </param>
 /// <param name="in"> <see cref="TokenStream"/> to be filtered </param>
 /// <param name="configurationFlags"> Flags configuring the filter </param>
 /// <param name="protWords"> If not null is the set of tokens to protect from being delimited </param>
 public WordDelimiterFilter(LuceneVersion matchVersion, TokenStream @in, WordDelimiterFlags configurationFlags, CharArraySet protWords)
     : this(matchVersion, @in, WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE, configurationFlags, protWords)
 {
 }
Example #50
 /// <summary>
 /// Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
 /// provided this analyzer will add a <seealso cref="SetKeywordMarkerFilter"/> before
 /// stemming.
 /// </summary>
 /// <param name="matchVersion"> lucene compatibility version </param>
 /// <param name="stopwords"> a stopword set </param>
 /// <param name="stemExclusionSet"> a set of terms not to be stemmed </param>
 public SwedishAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) : base(matchVersion, stopwords)
 {
     this.stemExclusionSet = CharArraySet.UnmodifiableSet(CharArraySet.Copy(matchVersion, stemExclusionSet));
 }
Example #51
 public ChineseFilter(TokenStream _in)
     : base(_in)
 {
     stopTable = new CharArraySet((IEnumerable <string>)STOP_WORDS, false);
     termAtt   = AddAttribute <ITermAttribute>();
 }
        internal static void assertCapitalizesTo(Tokenizer tokenizer, string[] expected, bool onlyFirstWord, CharArraySet keep, bool forceFirstLetter, ICollection <char[]> okPrefix, int minWordLength, int maxWordCount, int maxTokenLength)
        {
            CapitalizationFilter filter = new CapitalizationFilter(tokenizer, onlyFirstWord, keep, forceFirstLetter, okPrefix, minWordLength, maxWordCount, maxTokenLength);

            assertTokenStreamContents(filter, expected);
        }
Example #53
 public KeepWordFilter(Version version, bool enablePositionIncrements, TokenStream @in, CharArraySet words)
     : base(version, enablePositionIncrements, @in)
 {
     this.words = words;
     termAtt    = AddAttribute <ICharTermAttribute>();
 }
Example #54
        /// <summary>
        /// Creates a new WordDelimiterFilter
        /// </summary>
        /// <param name="matchVersion"> lucene compatibility version </param>
        /// <param name="in"> TokenStream to be filtered </param>
        /// <param name="charTypeTable"> table containing character types </param>
        /// <param name="configurationFlags"> Flags configuring the filter </param>
        /// <param name="protWords"> If not null is the set of tokens to protect from being delimited </param>
        public WordDelimiterFilter(LuceneVersion matchVersion, TokenStream @in, byte[] charTypeTable, WordDelimiterFlags configurationFlags, CharArraySet protWords)
            : base(@in)
        {
            this.termAttribute   = AddAttribute <ICharTermAttribute>();
            this.offsetAttribute = AddAttribute <IOffsetAttribute>();
            this.posIncAttribute = AddAttribute <IPositionIncrementAttribute>();
            this.typeAttribute   = AddAttribute <ITypeAttribute>();
            concat    = new WordDelimiterConcatenation(this);
            concatAll = new WordDelimiterConcatenation(this);
            sorter    = new OffsetSorter(this);

            if (!matchVersion.OnOrAfter(LuceneVersion.LUCENE_48))
            {
                throw new System.ArgumentException("This class only works with Lucene 4.8+. To emulate the old (broken) behavior of WordDelimiterFilter, use Lucene47WordDelimiterFilter");
            }
            this.flags     = configurationFlags;
            this.protWords = protWords;
            this.iterator  = new WordDelimiterIterator(charTypeTable,
                                                       Has(WordDelimiterFlags.SPLIT_ON_CASE_CHANGE),
                                                       Has(WordDelimiterFlags.SPLIT_ON_NUMERICS),
                                                       Has(WordDelimiterFlags.STEM_ENGLISH_POSSESSIVE));
        }
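A brief sketch of wiring up these flags; the combination below is illustrative, not canonical:

 using System.IO;
 using Lucene.Net.Analysis.Core;
 using Lucene.Net.Analysis.Miscellaneous;
 using Lucene.Net.Util;

 var flags = WordDelimiterFlags.GENERATE_WORD_PARTS
           | WordDelimiterFlags.GENERATE_NUMBER_PARTS
           | WordDelimiterFlags.SPLIT_ON_CASE_CHANGE
           | WordDelimiterFlags.SPLIT_ON_NUMERICS;
 var tokenizer = new WhitespaceTokenizer(LuceneVersion.LUCENE_48, new StringReader("PowerShot500-42"));
 var filter = new WordDelimiterFilter(LuceneVersion.LUCENE_48, tokenizer, flags,
     null /* no protected words */);
 // splits on case changes, letter/digit boundaries, and the hyphen:
 // "Power", "Shot", "500", "42"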
 /// <summary>
 /// Creates a new <see cref="HyphenationCompoundWordTokenFilter"/> instance.
 /// </summary>
 /// <param name="matchVersion">
 ///          Lucene version to enable correct Unicode 4.0 behavior in the
 ///          dictionaries if Version > 3.0. See <a
 ///          href="CompoundWordTokenFilterBase.html#version"
 ///          >CompoundWordTokenFilterBase</a> for details. </param>
 /// <param name="input">
 ///          the <see cref="TokenStream"/> to process </param>
 /// <param name="hyphenator">
 ///          the hyphenation pattern tree to use for hyphenation </param>
 /// <param name="dictionary">
 ///          the word dictionary to match against. </param>
 public HyphenationCompoundWordTokenFilter(LuceneVersion matchVersion, TokenStream input,
                                           HyphenationTree hyphenator, CharArraySet dictionary)
     : this(matchVersion, input, hyphenator, dictionary, DEFAULT_MIN_WORD_SIZE,
            DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE, false)
 {
 }
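A sketch of this constructor in use. It assumes an OFFO-style hyphenation grammar file and the filter's static pattern loader, whose exact name (GetHyphenationTree here) may differ by port; the file path and dictionary are illustrative:

 using System.IO;
 using Lucene.Net.Analysis.Compound;
 using Lucene.Net.Analysis.Compound.Hyphenation;
 using Lucene.Net.Analysis.Core;
 using Lucene.Net.Analysis.Util;
 using Lucene.Net.Util;

 // Load hyphenation patterns (path is illustrative).
 HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter.GetHyphenationTree("de_DR.xml");
 var dict = new CharArraySet(LuceneVersion.LUCENE_48, new[] { "fuss", "ball", "spiel" }, true);
 var tokenizer = new WhitespaceTokenizer(LuceneVersion.LUCENE_48, new StringReader("fussballspiel"));
 var filter = new HyphenationCompoundWordTokenFilter(LuceneVersion.LUCENE_48, tokenizer, hyphenator, dict);
 // hyphenation points propose candidate subwords; the dictionary then filters them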
Example #56
 /// <summary>
 /// Create a new <seealso cref="KeepWordFilter"/>.
 /// <para><b>NOTE</b>: The words set passed to this constructor will be directly
 /// used by this filter and should not be modified.
 /// </para>
 /// </summary>
 /// <param name="version"> the Lucene match version </param>
 /// <param name="in">      the <seealso cref="TokenStream"/> to consume </param>
 /// <param name="words">   the words to keep </param>
 public KeepWordFilter(Version version, TokenStream @in, CharArraySet words)
     : base(version, @in)
 {
     this.words = words;
 }
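A quick sketch of the keep-word behavior with this constructor (the snippet's Version parameter corresponds to LuceneVersion in 4.8 ports); with the version-gated base class, dropped tokens still advance position increments:

 using System.IO;
 using Lucene.Net.Analysis.Core;
 using Lucene.Net.Analysis.Miscellaneous;
 using Lucene.Net.Analysis.Util;
 using Lucene.Net.Util;

 var keep = new CharArraySet(LuceneVersion.LUCENE_48, new[] { "lucene", "search" }, true);
 var tokenizer = new WhitespaceTokenizer(LuceneVersion.LUCENE_48, new StringReader("full text search with lucene"));
 var filter = new KeepWordFilter(LuceneVersion.LUCENE_48, tokenizer, keep);
 // emits only "search" and "lucene"; the gaps remain visible as position increments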
Example #57
 /// <summary>
 /// Builds an analyzer with the given stop words.
 /// </summary>
 /// <param name="matchVersion"> lucene compatibility version </param>
 /// <param name="stopwords"> a stopword set </param>
 public FinnishAnalyzer(LuceneVersion matchVersion, CharArraySet stopwords)
     : this(matchVersion, stopwords, CharArraySet.EMPTY_SET)
 {
 }
Example #58
        private static Analyzer GetAnalyzer()
        {
            var stopwords = new CharArraySet(LuceneVersion.LUCENE_48, System.IO.File.ReadAllLines(stopPath), false);

            return(new EnglishAnalyzer(LuceneVersion.LUCENE_48, stopwords));
        }
Example #59
        public virtual void TestStopPositons()
        {
            StringBuilder sb = new StringBuilder();
            List <string> a  = new List <string>();

            for (int i = 0; i < 20; i++)
            {
                string w = English.IntToEnglish(i).Trim();
                sb.Append(w).Append(" ");
                if (i % 3 != 0)
                {
                    a.Add(w);
                }
            }
            log(sb.ToString());
            string[] stopWords = a.ToArray();
            for (int i = 0; i < a.Count; i++)
            {
                log("Stop: " + stopWords[i]);
            }
            CharArraySet stopSet = StopFilter.MakeStopSet(TEST_VERSION_CURRENT, stopWords);
            // with increments
            StringReader reader = new StringReader(sb.ToString());

#pragma warning disable 612, 618
            StopFilter stpf = new StopFilter(Version.LUCENE_40, new MockTokenizer(reader, MockTokenizer.WHITESPACE, false), stopSet);
            DoTestStopPositons(stpf, true);
            // without increments
            reader = new StringReader(sb.ToString());
            stpf   = new StopFilter(Version.LUCENE_43, new MockTokenizer(reader, MockTokenizer.WHITESPACE, false), stopSet);
#pragma warning restore 612, 618
            DoTestStopPositons(stpf, false);
            // with increments, concatenating two stop filters
            List <string> a0 = new List <string>();
            List <string> a1 = new List <string>();
            for (int i = 0; i < a.Count; i++)
            {
                if (i % 2 == 0)
                {
                    a0.Add(a[i]);
                }
                else
                {
                    a1.Add(a[i]);
                }
            }
            string[] stopWords0 = a0.ToArray();
            for (int i = 0; i < a0.Count; i++)
            {
                log("Stop0: " + stopWords0[i]);
            }
            string[] stopWords1 = a1.ToArray();
            for (int i = 0; i < a1.Count; i++)
            {
                log("Stop1: " + stopWords1[i]);
            }
            CharArraySet stopSet0 = StopFilter.MakeStopSet(TEST_VERSION_CURRENT, stopWords0);
            CharArraySet stopSet1 = StopFilter.MakeStopSet(TEST_VERSION_CURRENT, stopWords1);
            reader = new StringReader(sb.ToString());
            StopFilter stpf0 = new StopFilter(TEST_VERSION_CURRENT, new MockTokenizer(reader, MockTokenizer.WHITESPACE, false), stopSet0); // first part of the set
#pragma warning disable 612, 618
            stpf0.SetEnablePositionIncrements(true);
#pragma warning restore 612, 618
            StopFilter stpf01 = new StopFilter(TEST_VERSION_CURRENT, stpf0, stopSet1); // two stop filters concatenated!
            DoTestStopPositons(stpf01, true);
        }
Example #60
 /// <summary>
 /// Builds an analyzer with the given stop words. </summary>
 /// <param name="matchVersion"> Lucene version to match - See <see cref="UAX29URLEmailAnalyzer"/> </param>
 /// <param name="stopWords"> stop words  </param>
 public UAX29URLEmailAnalyzer(LuceneVersion matchVersion, CharArraySet stopWords)
     : base(matchVersion, stopWords)
 {
 }
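Finally, a sketch of what sets this analyzer apart: it keeps URLs and e-mail addresses as single tokens where StandardAnalyzer would split them; the field name and sample text are illustrative:

 using System;
 using Lucene.Net.Analysis;
 using Lucene.Net.Analysis.Standard;
 using Lucene.Net.Analysis.TokenAttributes;
 using Lucene.Net.Util;

 Analyzer a = new UAX29URLEmailAnalyzer(LuceneVersion.LUCENE_48);
 using (TokenStream ts = a.GetTokenStream("body", "mail admin@example.com or visit https://example.com/docs"))
 {
     ICharTermAttribute term = ts.AddAttribute<ICharTermAttribute>();
     ts.Reset();
     while (ts.IncrementToken())
     {
         // "mail", "admin@example.com", "visit", "https://example.com/docs"
         // ("or" is removed by the default stop set)
         Console.WriteLine(term.ToString());
     }
     ts.End();
 }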