/// <summary>
/// Creates a new <see cref="WordDelimiterFilterFactory"/>.
/// </summary>
public WordDelimiterFilterFactory(IDictionary<string, string> args)
    : base(args)
{
    AssureMatchVersion();

    // Fold each "0/1"-valued factory argument into its corresponding
    // WordDelimiterFlags bit. NOTE(review): GetInt32 presumably removes the
    // key from args — the leftover-parameter check at the bottom relies on
    // that; confirm against the base factory implementation.
    WordDelimiterFlags configFlags = 0;
    if (GetInt32(args, "generateWordParts", 1) != 0)
    {
        configFlags |= WordDelimiterFlags.GENERATE_WORD_PARTS;
    }
    if (GetInt32(args, "generateNumberParts", 1) != 0)
    {
        configFlags |= WordDelimiterFlags.GENERATE_NUMBER_PARTS;
    }
    if (GetInt32(args, "catenateWords", 0) != 0)
    {
        configFlags |= WordDelimiterFlags.CATENATE_WORDS;
    }
    if (GetInt32(args, "catenateNumbers", 0) != 0)
    {
        configFlags |= WordDelimiterFlags.CATENATE_NUMBERS;
    }
    if (GetInt32(args, "catenateAll", 0) != 0)
    {
        configFlags |= WordDelimiterFlags.CATENATE_ALL;
    }
    if (GetInt32(args, "splitOnCaseChange", 1) != 0)
    {
        configFlags |= WordDelimiterFlags.SPLIT_ON_CASE_CHANGE;
    }
    if (GetInt32(args, "splitOnNumerics", 1) != 0)
    {
        configFlags |= WordDelimiterFlags.SPLIT_ON_NUMERICS;
    }
    if (GetInt32(args, "preserveOriginal", 0) != 0)
    {
        configFlags |= WordDelimiterFlags.PRESERVE_ORIGINAL;
    }
    if (GetInt32(args, "stemEnglishPossessive", 1) != 0)
    {
        configFlags |= WordDelimiterFlags.STEM_ENGLISH_POSSESSIVE;
    }

    wordFiles = Get(args, PROTECTED_TOKENS);
    types = Get(args, TYPES);
    this.flags = configFlags;

    // Anything still present in args was not a recognized parameter.
    if (args.Count > 0)
    {
        throw new ArgumentException(string.Format(J2N.Text.StringFormatter.CurrentCulture, "Unknown parameters: {0}", args));
    }
}
/// <summary>
/// Creates a new <see cref="Lucene47WordDelimiterFilter"/>.
/// </summary>
/// <param name="in"> <see cref="TokenStream"/> to be filtered </param>
/// <param name="charTypeTable"> table containing character types </param>
/// <param name="configurationFlags"> Flags configuring the filter </param>
/// <param name="protWords"> If not null is the set of tokens to protect from being delimited </param>
public Lucene47WordDelimiterFilter(TokenStream @in, byte[] charTypeTable, WordDelimiterFlags configurationFlags, CharArraySet protWords)
    : base(@in)
{
    // Store configuration first: the Has(...) calls below read this.flags.
    this.flags = configurationFlags;
    this.protWords = protWords;

    // Register the token-stream attributes this filter manipulates.
    termAttribute = AddAttribute<ICharTermAttribute>();
    offsetAttribute = AddAttribute<IOffsetAttribute>();
    posIncAttribute = AddAttribute<IPositionIncrementAttribute>();
    typeAttribute = AddAttribute<ITypeAttribute>();

    concat = new WordDelimiterConcatenation(this);
    concatAll = new WordDelimiterConcatenation(this);

    this.iterator = new WordDelimiterIterator(
        charTypeTable,
        Has(WordDelimiterFlags.SPLIT_ON_CASE_CHANGE),
        Has(WordDelimiterFlags.SPLIT_ON_NUMERICS),
        Has(WordDelimiterFlags.STEM_ENGLISH_POSSESSIVE));
}
public virtual void TestLotsOfConcatenating()
{
    // Enable every generation, catenation, and split option at once.
    WordDelimiterFlags allFlags =
        WordDelimiterFlags.GENERATE_WORD_PARTS
        | WordDelimiterFlags.GENERATE_NUMBER_PARTS
        | WordDelimiterFlags.CATENATE_WORDS
        | WordDelimiterFlags.CATENATE_NUMBERS
        | WordDelimiterFlags.CATENATE_ALL
        | WordDelimiterFlags.SPLIT_ON_CASE_CHANGE
        | WordDelimiterFlags.SPLIT_ON_NUMERICS
        | WordDelimiterFlags.STEM_ENGLISH_POSSESSIVE;

    /* analyzer that uses whitespace + wdf */
    Analyzer analyzer = new AnalyzerAnonymousInnerClassHelper4(this, allFlags);

    AssertAnalyzesTo(analyzer, "abc-def-123-456",
        new string[] { "abc", "abcdef", "abcdef123456", "def", "123", "123456", "456" },
        new int[] { 0, 0, 0, 4, 8, 8, 12 },
        new int[] { 3, 7, 15, 7, 11, 15, 15 },
        new int[] { 1, 0, 0, 1, 1, 0, 1 });
}
public virtual void TestOffsets()
{
    WordDelimiterFlags testFlags =
        WordDelimiterFlags.GENERATE_WORD_PARTS
        | WordDelimiterFlags.GENERATE_NUMBER_PARTS
        | WordDelimiterFlags.CATENATE_ALL
        | WordDelimiterFlags.SPLIT_ON_CASE_CHANGE
        | WordDelimiterFlags.SPLIT_ON_NUMERICS
        | WordDelimiterFlags.STEM_ENGLISH_POSSESSIVE;

    // Subwords and catenated subwords must report offsets relative to the
    // original token's start offset (5 here).
    WordDelimiterFilter filter = new WordDelimiterFilter(TEST_VERSION_CURRENT, new SingleTokenTokenStream(new Token("foo-bar", 5, 12)), WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE, testFlags, null);
    AssertTokenStreamContents(filter,
        new string[] { "foo", "foobar", "bar" },
        new int[] { 5, 5, 9 },
        new int[] { 8, 12, 12 });

    // When the token's offsets (5..6) don't span its text, every produced
    // part is expected to carry those same original offsets.
    filter = new WordDelimiterFilter(TEST_VERSION_CURRENT, new SingleTokenTokenStream(new Token("foo-bar", 5, 6)), WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE, testFlags, null);
    AssertTokenStreamContents(filter,
        new string[] { "foo", "bar", "foobar" },
        new int[] { 5, 5, 5 },
        new int[] { 6, 6, 6 });
}
public virtual void TestPositionIncrements()
{
    WordDelimiterFlags testFlags =
        WordDelimiterFlags.GENERATE_WORD_PARTS
        | WordDelimiterFlags.GENERATE_NUMBER_PARTS
        | WordDelimiterFlags.CATENATE_ALL
        | WordDelimiterFlags.SPLIT_ON_CASE_CHANGE
        | WordDelimiterFlags.SPLIT_ON_NUMERICS
        | WordDelimiterFlags.STEM_ENGLISH_POSSESSIVE;
    CharArraySet protWords = new CharArraySet(TEST_VERSION_CURRENT, new string[] { "NUTCH" }, false);

    /* analyzer that uses whitespace + wdf */
    Analyzer a = new AnalyzerAnonymousInnerClassHelper(this, testFlags, protWords);

    /* in this case, works as expected. */
    AssertAnalyzesTo(a, "LUCENE / SOLR",
        new string[] { "LUCENE", "SOLR" },
        new int[] { 0, 9 },
        new int[] { 6, 13 },
        new int[] { 1, 1 });

    /* only in this case, posInc of 2 ?! */
    AssertAnalyzesTo(a, "LUCENE / solR",
        new string[] { "LUCENE", "sol", "solR", "R" },
        new int[] { 0, 9, 9, 12 },
        new int[] { 6, 12, 13, 13 },
        new int[] { 1, 1, 0, 1 });

    AssertAnalyzesTo(a, "LUCENE / NUTCH SOLR",
        new string[] { "LUCENE", "NUTCH", "SOLR" },
        new int[] { 0, 9, 15 },
        new int[] { 6, 14, 19 },
        new int[] { 1, 1, 1 });

    /* analyzer that will consume tokens with large position increments */
    Analyzer a2 = new AnalyzerAnonymousInnerClassHelper2(this, testFlags, protWords);

    /* increment of "largegap" is preserved */
    AssertAnalyzesTo(a2, "LUCENE largegap SOLR",
        new string[] { "LUCENE", "largegap", "SOLR" },
        new int[] { 0, 7, 16 },
        new int[] { 6, 15, 20 },
        new int[] { 1, 10, 1 });

    /* the "/" had a position increment of 10, where did it go?!?!! */
    AssertAnalyzesTo(a2, "LUCENE / SOLR",
        new string[] { "LUCENE", "SOLR" },
        new int[] { 0, 9 },
        new int[] { 6, 13 },
        new int[] { 1, 11 });

    /* in this case, the increment of 10 from the "/" is carried over */
    AssertAnalyzesTo(a2, "LUCENE / solR",
        new string[] { "LUCENE", "sol", "solR", "R" },
        new int[] { 0, 9, 9, 12 },
        new int[] { 6, 12, 13, 13 },
        new int[] { 1, 11, 0, 1 });

    AssertAnalyzesTo(a2, "LUCENE / NUTCH SOLR",
        new string[] { "LUCENE", "NUTCH", "SOLR" },
        new int[] { 0, 9, 15 },
        new int[] { 6, 14, 19 },
        new int[] { 1, 11, 1 });

    Analyzer a3 = new AnalyzerAnonymousInnerClassHelper3(this, testFlags, protWords);

    AssertAnalyzesTo(a3, "lucene.solr",
        new string[] { "lucene", "lucenesolr", "solr" },
        new int[] { 0, 0, 7 },
        new int[] { 6, 11, 11 },
        new int[] { 1, 0, 1 });

    /* the stopword should add a gap here */
    AssertAnalyzesTo(a3, "the lucene.solr",
        new string[] { "lucene", "lucenesolr", "solr" },
        new int[] { 4, 4, 11 },
        new int[] { 10, 15, 15 },
        new int[] { 2, 0, 1 });
}
public virtual void TestLotsOfConcatenating()
{
    // Turn on every generation, catenation, and split option simultaneously.
    WordDelimiterFlags everyFlag =
        WordDelimiterFlags.GENERATE_WORD_PARTS
        | WordDelimiterFlags.GENERATE_NUMBER_PARTS
        | WordDelimiterFlags.CATENATE_WORDS
        | WordDelimiterFlags.CATENATE_NUMBERS
        | WordDelimiterFlags.CATENATE_ALL
        | WordDelimiterFlags.SPLIT_ON_CASE_CHANGE
        | WordDelimiterFlags.SPLIT_ON_NUMERICS
        | WordDelimiterFlags.STEM_ENGLISH_POSSESSIVE;

    /* analyzer that uses whitespace + wdf */
    Analyzer analyzer = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
    {
        Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
        return new TokenStreamComponents(source, new WordDelimiterFilter(TEST_VERSION_CURRENT, source, everyFlag, null));
    });

    AssertAnalyzesTo(analyzer, "abc-def-123-456",
        new string[] { "abc", "abcdef", "abcdef123456", "def", "123", "123456", "456" },
        new int[] { 0, 0, 0, 4, 8, 8, 12 },
        new int[] { 3, 7, 15, 7, 11, 15, 15 },
        new int[] { 1, 0, 0, 1, 1, 0, 1 });
}
public virtual void TestRandomHugeStrings()
{
    int iterations = AtLeast(5);
    for (int iter = 0; iter < iterations; iter++)
    {
        // A random combination of flag bits (512 covers all combinations of
        // the nine WordDelimiterFlags options).
        WordDelimiterFlags randomFlags = (WordDelimiterFlags)Random.Next(512);

        // Randomly exercise both the protected-words and no-protection paths.
        CharArraySet protectedWords = Random.nextBoolean()
            ? new CharArraySet(TEST_VERSION_CURRENT, new string[] { "a", "b", "cd" }, false)
            : null;

        Analyzer a = new AnalyzerAnonymousInnerClassHelper7(this, randomFlags, protectedWords);
        CheckRandomData(Random, a, 100 * RANDOM_MULTIPLIER, 8192);
    }
}
/// <summary>
/// Creates a new WordDelimiterFilter.
/// </summary>
/// <param name="matchVersion"> lucene compatibility version </param>
/// <param name="in"> TokenStream to be filtered </param>
/// <param name="charTypeTable"> table containing character types </param>
/// <param name="configurationFlags"> Flags configuring the filter </param>
/// <param name="protWords"> If not null is the set of tokens to protect from being delimited </param>
/// <exception cref="ArgumentException"> if <paramref name="matchVersion"/> is older than <see cref="LuceneVersion.LUCENE_48"/> </exception>
public WordDelimiterFilter(LuceneVersion matchVersion, TokenStream @in, byte[] charTypeTable, WordDelimiterFlags configurationFlags, CharArraySet protWords)
    : base(@in)
{
    // Fail fast: validate the compatibility version before registering
    // attributes or allocating helpers. (Previously this check ran last,
    // so an unsupported version still paid for all the setup above it.)
    if (!matchVersion.OnOrAfter(LuceneVersion.LUCENE_48))
    {
        throw new ArgumentException("This class only works with Lucene 4.8+. To emulate the old (broken) behavior of WordDelimiterFilter, use Lucene47WordDelimiterFilter");
    }

    // Register the token-stream attributes this filter manipulates.
    this.termAttribute = AddAttribute<ICharTermAttribute>();
    this.offsetAttribute = AddAttribute<IOffsetAttribute>();
    this.posIncAttribute = AddAttribute<IPositionIncrementAttribute>();
    this.typeAttribute = AddAttribute<ITypeAttribute>();

    concat = new WordDelimiterConcatenation(this);
    concatAll = new WordDelimiterConcatenation(this);
    sorter = new OffsetSorter(this);

    this.flags = configurationFlags;
    this.protWords = protWords;
    this.iterator = new WordDelimiterIterator(
        charTypeTable,
        Has(WordDelimiterFlags.SPLIT_ON_CASE_CHANGE),
        Has(WordDelimiterFlags.SPLIT_ON_NUMERICS),
        Has(WordDelimiterFlags.STEM_ENGLISH_POSSESSIVE));
}
public virtual void TestEmptyTerm()
{
    Random random = Random;
    // Exhaustively walk all 512 flag combinations.
    for (int combo = 0; combo < 512; combo++)
    {
        WordDelimiterFlags comboFlags = (WordDelimiterFlags)combo;
        CharArraySet protectedWords = random.nextBoolean()
            ? new CharArraySet(TEST_VERSION_CURRENT, new string[] { "a", "b", "cd" }, false)
            : null;

        Analyzer a = new AnalyzerAnonymousInnerClassHelper8(this, comboFlags, protectedWords);
        // depending upon options, this thing may or may not preserve the empty term
        CheckAnalysisConsistency(random, a, random.nextBoolean(), "");
    }
}
/// <summary>
/// Determines whether the given flag is set in this filter's configuration.
/// </summary>
/// <param name="flag"> Flag to see if set </param>
/// <returns> <c>true</c> if flag is set </returns>
private bool Has(WordDelimiterFlags flag)
{
    return (flags & flag) != 0;
}
/// <summary>
/// Creates a new WordDelimiterFilter using <see cref="WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE"/>
/// as its charTypeTable.
/// </summary>
/// <param name="matchVersion"> lucene compatibility version </param>
/// <param name="in"> <see cref="TokenStream"/> to be filtered </param>
/// <param name="configurationFlags"> Flags configuring the filter </param>
/// <param name="protWords"> If not null is the set of tokens to protect from being delimited </param>
public WordDelimiterFilter(LuceneVersion matchVersion, TokenStream @in, WordDelimiterFlags configurationFlags, CharArraySet protWords)
    : this(matchVersion, @in, WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE, configurationFlags, protWords)
{
    // Delegates to the full constructor, supplying the default character type table.
}
/// <summary>
/// Captures the test fixture, filter flags, and protected-word set for the
/// anonymous analyzer used by <c>TestEmptyTerm</c>.
/// </summary>
public AnalyzerAnonymousInnerClassHelper8(TestWordDelimiterFilter outerInstance, WordDelimiterFlags flags, CharArraySet protectedWords)
{
    this.protectedWords = protectedWords;
    this.flags = flags;
    this.outerInstance = outerInstance;
}
/// <summary>
/// Captures the test fixture and filter flags for this anonymous analyzer helper.
/// </summary>
public AnalyzerAnonymousInnerClassHelper5(TestWordDelimiterFilter outerInstance, WordDelimiterFlags flags)
{
    this.flags = flags;
    this.outerInstance = outerInstance;
}
/// <summary>
/// Creates a new <see cref="Lucene47WordDelimiterFilter"/> using <see cref="WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE"/>
/// as its charTypeTable.
/// </summary>
/// <param name="in"> <see cref="TokenStream"/> to be filtered </param>
/// <param name="configurationFlags"> Flags configuring the filter </param>
/// <param name="protWords"> If not null is the set of tokens to protect from being delimited </param>
public Lucene47WordDelimiterFilter(TokenStream @in, WordDelimiterFlags configurationFlags, CharArraySet protWords)
    : this(@in, WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE, configurationFlags, protWords)
{
    // Delegates to the primary constructor, supplying the default character type table.
}