Exemple #1
0
        /// <summary>
        /// Creates a new <see cref="WordDelimiterFilterFactory"/> from the supplied arguments.
        /// </summary>
        /// <param name="args"> Factory arguments. The integer options are read through
        /// <c>GetInt32</c> (a non-zero value enables the corresponding flag) and the
        /// protected-token and type settings are read through <c>Get</c>. </param>
        /// <exception cref="ArgumentException"> Thrown when <c>args.Count</c> is still greater
        /// than zero once all of the options above have been read. </exception>
        public WordDelimiterFilterFactory(IDictionary <string, string> args)
            : base(args)
        {
            AssureMatchVersion();

            // Each option is read in the same fixed order; a non-zero value turns its flag on.
            WordDelimiterFlags configured = 0;
            configured |= (GetInt32(args, "generateWordParts", 1) != 0) ? WordDelimiterFlags.GENERATE_WORD_PARTS : 0;
            configured |= (GetInt32(args, "generateNumberParts", 1) != 0) ? WordDelimiterFlags.GENERATE_NUMBER_PARTS : 0;
            configured |= (GetInt32(args, "catenateWords", 0) != 0) ? WordDelimiterFlags.CATENATE_WORDS : 0;
            configured |= (GetInt32(args, "catenateNumbers", 0) != 0) ? WordDelimiterFlags.CATENATE_NUMBERS : 0;
            configured |= (GetInt32(args, "catenateAll", 0) != 0) ? WordDelimiterFlags.CATENATE_ALL : 0;
            configured |= (GetInt32(args, "splitOnCaseChange", 1) != 0) ? WordDelimiterFlags.SPLIT_ON_CASE_CHANGE : 0;
            configured |= (GetInt32(args, "splitOnNumerics", 1) != 0) ? WordDelimiterFlags.SPLIT_ON_NUMERICS : 0;
            configured |= (GetInt32(args, "preserveOriginal", 0) != 0) ? WordDelimiterFlags.PRESERVE_ORIGINAL : 0;
            configured |= (GetInt32(args, "stemEnglishPossessive", 1) != 0) ? WordDelimiterFlags.STEM_ENGLISH_POSSESSIVE : 0;

            wordFiles = Get(args, PROTECTED_TOKENS);
            types = Get(args, TYPES);
            this.flags = configured;

            // Anything left in the dictionary at this point is reported as unknown.
            if (args.Count > 0)
            {
                string message = string.Format(J2N.Text.StringFormatter.CurrentCulture, "Unknown parameters: {0}", args);
                throw new ArgumentException(message);
            }
        }
        /// <summary>
        /// Creates a new <see cref="Lucene47WordDelimiterFilter"/> with an explicit character type table.
        /// </summary>
        /// <param name="in"> <see cref="TokenStream"/> to be filtered </param>
        /// <param name="charTypeTable"> table containing character types; handed to the <see cref="WordDelimiterIterator"/> </param>
        /// <param name="configurationFlags"> Flags configuring the filter </param>
        /// <param name="protWords"> If not null is the set of tokens to protect from being delimited </param>
        public Lucene47WordDelimiterFilter(TokenStream @in, byte[] charTypeTable, WordDelimiterFlags configurationFlags, CharArraySet protWords)
            : base(@in)
        {
            // Attributes are registered in the same order as before.
            termAttribute = AddAttribute <ICharTermAttribute>();
            offsetAttribute = AddAttribute <IOffsetAttribute>();
            posIncAttribute = AddAttribute <IPositionIncrementAttribute>();
            typeAttribute = AddAttribute <ITypeAttribute>();

            concat = new WordDelimiterConcatenation(this);
            concatAll = new WordDelimiterConcatenation(this);

            this.flags = configurationFlags;
            this.protWords = protWords;

            // The Has(...) queries run only after the flags field has been assigned.
            bool splitOnCaseChange = Has(WordDelimiterFlags.SPLIT_ON_CASE_CHANGE);
            bool splitOnNumerics = Has(WordDelimiterFlags.SPLIT_ON_NUMERICS);
            bool stemEnglishPossessive = Has(WordDelimiterFlags.STEM_ENGLISH_POSSESSIVE);

            this.iterator = new WordDelimiterIterator(charTypeTable, splitOnCaseChange, splitOnNumerics, stemEnglishPossessive);
        }
        /// <summary>
        /// Analyzes "abc-def-123-456" with word/number part generation, all three catenate
        /// options, both split options and possessive stemming enabled, and checks the
        /// resulting terms, offsets and position increments.
        /// </summary>
        public virtual void TestLotsOfConcatenating()
        {
            WordDelimiterFlags options =
                WordDelimiterFlags.GENERATE_WORD_PARTS |
                WordDelimiterFlags.GENERATE_NUMBER_PARTS |
                WordDelimiterFlags.CATENATE_WORDS |
                WordDelimiterFlags.CATENATE_NUMBERS |
                WordDelimiterFlags.CATENATE_ALL |
                WordDelimiterFlags.SPLIT_ON_CASE_CHANGE |
                WordDelimiterFlags.SPLIT_ON_NUMERICS |
                WordDelimiterFlags.STEM_ENGLISH_POSSESSIVE;

            // Analyzer combining a whitespace tokenizer with the word delimiter filter.
            Analyzer analyzer = new AnalyzerAnonymousInnerClassHelper4(this, options);

            string[] expectedTerms = new string[] { "abc", "abcdef", "abcdef123456", "def", "123", "123456", "456" };
            int[] expectedStartOffsets = new int[] { 0, 0, 0, 4, 8, 8, 12 };
            int[] expectedEndOffsets = new int[] { 3, 7, 15, 7, 11, 15, 15 };
            int[] expectedPosIncrements = new int[] { 1, 0, 0, 1, 1, 0, 1 };

            AssertAnalyzesTo(analyzer, "abc-def-123-456", expectedTerms, expectedStartOffsets, expectedEndOffsets, expectedPosIncrements);
        }
        /// <summary>
        /// Checks that subwords and catenated subwords produced from a single "foo-bar"
        /// token carry the expected start and end offsets.
        /// </summary>
        public virtual void TestOffsets()
        {
            WordDelimiterFlags options =
                WordDelimiterFlags.GENERATE_WORD_PARTS |
                WordDelimiterFlags.GENERATE_NUMBER_PARTS |
                WordDelimiterFlags.CATENATE_ALL |
                WordDelimiterFlags.SPLIT_ON_CASE_CHANGE |
                WordDelimiterFlags.SPLIT_ON_NUMERICS |
                WordDelimiterFlags.STEM_ENGLISH_POSSESSIVE;

            // Token spanning offsets 5..12: each part is expected at its own offsets.
            Token wellFormedToken = new Token("foo-bar", 5, 12);
            WordDelimiterFilter wellFormedFilter = new WordDelimiterFilter(TEST_VERSION_CURRENT, new SingleTokenTokenStream(wellFormedToken), WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE, options, null);

            AssertTokenStreamContents(
                wellFormedFilter,
                new string[] { "foo", "foobar", "bar" },
                new int[] { 5, 5, 9 },
                new int[] { 8, 12, 12 });

            // Token spanning offsets 5..6: every emitted token is expected at 5..6.
            Token narrowToken = new Token("foo-bar", 5, 6);
            WordDelimiterFilter narrowFilter = new WordDelimiterFilter(TEST_VERSION_CURRENT, new SingleTokenTokenStream(narrowToken), WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE, options, null);

            AssertTokenStreamContents(
                narrowFilter,
                new string[] { "foo", "bar", "foobar" },
                new int[] { 5, 5, 5 },
                new int[] { 6, 6, 6 });
        }
        /// <summary>
        /// Checks the position increments reported for several inputs, using three
        /// different analyzers that share the same flags and the protected word "NUTCH".
        /// </summary>
        public virtual void TestPositionIncrements()
        {
            WordDelimiterFlags options =
                WordDelimiterFlags.GENERATE_WORD_PARTS |
                WordDelimiterFlags.GENERATE_NUMBER_PARTS |
                WordDelimiterFlags.CATENATE_ALL |
                WordDelimiterFlags.SPLIT_ON_CASE_CHANGE |
                WordDelimiterFlags.SPLIT_ON_NUMERICS |
                WordDelimiterFlags.STEM_ENGLISH_POSSESSIVE;

            CharArraySet protectedWords = new CharArraySet(TEST_VERSION_CURRENT, new string[] { "NUTCH" }, false);

            // Analyzer that uses whitespace + wdf.
            Analyzer plainAnalyzer = new AnalyzerAnonymousInnerClassHelper(this, options, protectedWords);

            // In this case, works as expected.
            AssertAnalyzesTo(
                plainAnalyzer,
                "LUCENE / SOLR",
                new string[] { "LUCENE", "SOLR" },
                new int[] { 0, 9 },
                new int[] { 6, 13 },
                new int[] { 1, 1 });

            // Only in this case, posInc of 2 ?!
            AssertAnalyzesTo(
                plainAnalyzer,
                "LUCENE / solR",
                new string[] { "LUCENE", "sol", "solR", "R" },
                new int[] { 0, 9, 9, 12 },
                new int[] { 6, 12, 13, 13 },
                new int[] { 1, 1, 0, 1 });

            AssertAnalyzesTo(
                plainAnalyzer,
                "LUCENE / NUTCH SOLR",
                new string[] { "LUCENE", "NUTCH", "SOLR" },
                new int[] { 0, 9, 15 },
                new int[] { 6, 14, 19 },
                new int[] { 1, 1, 1 });

            // Analyzer that will consume tokens with large position increments.
            Analyzer largeGapAnalyzer = new AnalyzerAnonymousInnerClassHelper2(this, options, protectedWords);

            // The increment of "largegap" is preserved.
            AssertAnalyzesTo(
                largeGapAnalyzer,
                "LUCENE largegap SOLR",
                new string[] { "LUCENE", "largegap", "SOLR" },
                new int[] { 0, 7, 16 },
                new int[] { 6, 15, 20 },
                new int[] { 1, 10, 1 });

            // The "/" had a position increment of 10, where did it go?!
            AssertAnalyzesTo(
                largeGapAnalyzer,
                "LUCENE / SOLR",
                new string[] { "LUCENE", "SOLR" },
                new int[] { 0, 9 },
                new int[] { 6, 13 },
                new int[] { 1, 11 });

            // In this case, the increment of 10 from the "/" is carried over.
            AssertAnalyzesTo(
                largeGapAnalyzer,
                "LUCENE / solR",
                new string[] { "LUCENE", "sol", "solR", "R" },
                new int[] { 0, 9, 9, 12 },
                new int[] { 6, 12, 13, 13 },
                new int[] { 1, 11, 0, 1 });

            AssertAnalyzesTo(
                largeGapAnalyzer,
                "LUCENE / NUTCH SOLR",
                new string[] { "LUCENE", "NUTCH", "SOLR" },
                new int[] { 0, 9, 15 },
                new int[] { 6, 14, 19 },
                new int[] { 1, 11, 1 });

            Analyzer stopwordAnalyzer = new AnalyzerAnonymousInnerClassHelper3(this, options, protectedWords);

            AssertAnalyzesTo(
                stopwordAnalyzer,
                "lucene.solr",
                new string[] { "lucene", "lucenesolr", "solr" },
                new int[] { 0, 0, 7 },
                new int[] { 6, 11, 11 },
                new int[] { 1, 0, 1 });

            // The stopword should add a gap here.
            AssertAnalyzesTo(
                stopwordAnalyzer,
                "the lucene.solr",
                new string[] { "lucene", "lucenesolr", "solr" },
                new int[] { 4, 4, 11 },
                new int[] { 10, 15, 15 },
                new int[] { 2, 0, 1 });
        }
Exemple #6
0
        /// <summary>
        /// Analyzes "abc-def-123-456" with word/number part generation, all three catenate
        /// options, both split options and possessive stemming enabled, and checks the
        /// resulting terms, offsets and position increments.
        /// </summary>
        public virtual void TestLotsOfConcatenating()
        {
            WordDelimiterFlags options =
                WordDelimiterFlags.GENERATE_WORD_PARTS |
                WordDelimiterFlags.GENERATE_NUMBER_PARTS |
                WordDelimiterFlags.CATENATE_WORDS |
                WordDelimiterFlags.CATENATE_NUMBERS |
                WordDelimiterFlags.CATENATE_ALL |
                WordDelimiterFlags.SPLIT_ON_CASE_CHANGE |
                WordDelimiterFlags.SPLIT_ON_NUMERICS |
                WordDelimiterFlags.STEM_ENGLISH_POSSESSIVE;

            // Analyzer built inline: a whitespace MockTokenizer feeding a WordDelimiterFilter.
            Analyzer analyzer = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
            {
                Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
                TokenStream filtered = new WordDelimiterFilter(TEST_VERSION_CURRENT, source, options, null);
                return new TokenStreamComponents(source, filtered);
            });

            string[] expectedTerms = new string[] { "abc", "abcdef", "abcdef123456", "def", "123", "123456", "456" };
            int[] expectedStartOffsets = new int[] { 0, 0, 0, 4, 8, 8, 12 };
            int[] expectedEndOffsets = new int[] { 3, 7, 15, 7, 11, 15, 15 };
            int[] expectedPosIncrements = new int[] { 1, 0, 0, 1, 1, 0, 1 };

            AssertAnalyzesTo(analyzer, "abc-def-123-456", expectedTerms, expectedStartOffsets, expectedEndOffsets, expectedPosIncrements);
        }
        /// <summary>
        /// Runs <c>CheckRandomData</c> for at least five rounds, each with a flag value
        /// cast from <c>Random.Next(512)</c> and a randomly present protected-word set.
        /// </summary>
        public virtual void TestRandomHugeStrings()
        {
            int rounds = AtLeast(5);

            for (int round = 0; round < rounds; round++)
            {
                // The flag value is drawn before the protected-words coin flip.
                WordDelimiterFlags randomFlags = (WordDelimiterFlags)Random.Next(512);

                CharArraySet protectedWords = Random.nextBoolean()
                    ? new CharArraySet(TEST_VERSION_CURRENT, new string[] { "a", "b", "cd" }, false)
                    : null;

                Analyzer analyzer = new AnalyzerAnonymousInnerClassHelper7(this, randomFlags, protectedWords);
                CheckRandomData(Random, analyzer, 100 * RANDOM_MULTIPLIER, 8192);
            }
        }
Exemple #8
0
        /// <summary>
        /// Creates a new WordDelimiterFilter with an explicit character type table.
        /// </summary>
        /// <param name="matchVersion"> lucene compatibility version; must be on or after <c>LUCENE_48</c> </param>
        /// <param name="in"> TokenStream to be filtered </param>
        /// <param name="charTypeTable"> table containing character types </param>
        /// <param name="configurationFlags"> Flags configuring the filter </param>
        /// <param name="protWords"> If not null is the set of tokens to protect from being delimited </param>
        /// <exception cref="ArgumentException"> Thrown when <paramref name="matchVersion"/> is earlier than <c>LUCENE_48</c>. </exception>
        public WordDelimiterFilter(LuceneVersion matchVersion, TokenStream @in, byte[] charTypeTable, WordDelimiterFlags configurationFlags, CharArraySet protWords)
            : base(@in)
        {
            // Fail fast: reject an unsupported version before any attribute is registered
            // or any helper object is allocated for an instance that cannot be used.
            if (!matchVersion.OnOrAfter(LuceneVersion.LUCENE_48))
            {
                throw new ArgumentException("This class only works with Lucene 4.8+. To emulate the old (broken) behavior of WordDelimiterFilter, use Lucene47WordDelimiterFilter");
            }

            this.termAttribute   = AddAttribute <ICharTermAttribute>();
            this.offsetAttribute = AddAttribute <IOffsetAttribute>();
            this.posIncAttribute = AddAttribute <IPositionIncrementAttribute>();
            this.typeAttribute   = AddAttribute <ITypeAttribute>();
            concat    = new WordDelimiterConcatenation(this);
            concatAll = new WordDelimiterConcatenation(this);
            sorter    = new OffsetSorter(this);

            this.flags     = configurationFlags;
            this.protWords = protWords;

            // Has(...) is evaluated only after the flags field has been assigned above.
            this.iterator  = new WordDelimiterIterator(charTypeTable,
                                                       Has(WordDelimiterFlags.SPLIT_ON_CASE_CHANGE),
                                                       Has(WordDelimiterFlags.SPLIT_ON_NUMERICS),
                                                       Has(WordDelimiterFlags.STEM_ENGLISH_POSSESSIVE));
        }
        /// <summary>
        /// Runs <c>CheckAnalysisConsistency</c> on the empty string once for every flag
        /// value from 0 to 511, with a randomly present protected-word set.
        /// </summary>
        public virtual void TestEmptyTerm()
        {
            Random random = Random;

            for (int flagBits = 0; flagBits < 512; flagBits++)
            {
                WordDelimiterFlags currentFlags = (WordDelimiterFlags)flagBits;

                CharArraySet protectedWords = random.nextBoolean()
                    ? new CharArraySet(TEST_VERSION_CURRENT, new string[] { "a", "b", "cd" }, false)
                    : null;

                Analyzer analyzer = new AnalyzerAnonymousInnerClassHelper8(this, currentFlags, protectedWords);

                // Depending upon options, this thing may or may not preserve the empty term.
                // The boolean is drawn after the analyzer is created, matching the original draw order.
                bool randomSwitch = random.nextBoolean();
                CheckAnalysisConsistency(random, analyzer, randomSwitch, "");
            }
        }
Exemple #10
0
 /// <summary>
 /// Tests whether the configured flags share at least one bit with <paramref name="flag"/>.
 /// </summary>
 /// <param name="flag"> Flag to look for </param>
 /// <returns> <c>true</c> if the bitwise AND of the configured flags and <paramref name="flag"/> is non-zero </returns>
 private bool Has(WordDelimiterFlags flag)
 {
     WordDelimiterFlags overlap = flags & flag;
     return overlap != 0;
 }
Exemple #11
0
 /// <summary>
 /// Creates a new WordDelimiterFilter using <see cref="WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE"/>
 /// as its charTypeTable. All other arguments are forwarded unchanged to the overload
 /// that takes an explicit character type table.
 /// </summary>
 /// <param name="matchVersion"> lucene compatibility version </param>
 /// <param name="in"> <see cref="TokenStream"/> to be filtered </param>
 /// <param name="configurationFlags"> Flags configuring the filter </param>
 /// <param name="protWords"> If not null is the set of tokens to protect from being delimited </param>
 public WordDelimiterFilter(LuceneVersion matchVersion, TokenStream @in, WordDelimiterFlags configurationFlags, CharArraySet protWords)
     : this(matchVersion, @in, WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE, configurationFlags, protWords)
 {
 }
 /// <summary>
 /// Stores the owning test instance together with the flags and protected words
 /// supplied by the caller.
 /// </summary>
 public AnalyzerAnonymousInnerClassHelper8(TestWordDelimiterFilter outerInstance, WordDelimiterFlags flags, CharArraySet protectedWords)
 {
     this.protectedWords = protectedWords;
     this.flags = flags;
     this.outerInstance = outerInstance;
 }
 /// <summary>
 /// Stores the owning test instance and the flags supplied by the caller.
 /// </summary>
 public AnalyzerAnonymousInnerClassHelper5(TestWordDelimiterFilter outerInstance, WordDelimiterFlags flags)
 {
     this.flags = flags;
     this.outerInstance = outerInstance;
 }
Exemple #14
0
 /// <summary>
 /// Creates a new <see cref="Lucene47WordDelimiterFilter"/> using <see cref="WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE"/>
 /// as its charTypeTable. All other arguments are forwarded unchanged to the overload
 /// that takes an explicit character type table.
 /// </summary>
 /// <param name="in"> <see cref="TokenStream"/> to be filtered </param>
 /// <param name="configurationFlags"> Flags configuring the filter </param>
 /// <param name="protWords"> If not null is the set of tokens to protect from being delimited </param>
 public Lucene47WordDelimiterFilter(TokenStream @in, WordDelimiterFlags configurationFlags, CharArraySet protWords)
     : this(@in, WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE, configurationFlags, protWords)
 {
 }