/// <summary>
/// Creates a new <see cref="Lucene47WordDelimiterFilter"/>
/// </summary>
/// <param name="in"> <see cref="TokenStream"/> to be filtered </param>
/// <param name="charTypeTable"> table containing character types </param>
/// <param name="configurationFlags"> Flags configuring the filter </param>
/// <param name="protWords"> If not null is the set of tokens to protect from being delimited </param>
public Lucene47WordDelimiterFilter(TokenStream @in, byte[] charTypeTable, WordDelimiterFlags configurationFlags, CharArraySet protWords)
    : base(@in)
{
    // Qualify field assignments with "this." for consistency with the
    // WordDelimiterFilter(LuceneVersion, ...) constructor.
    this.termAttribute = AddAttribute<ICharTermAttribute>();
    this.offsetAttribute = AddAttribute<IOffsetAttribute>();
    this.posIncAttribute = AddAttribute<IPositionIncrementAttribute>();
    this.typeAttribute = AddAttribute<ITypeAttribute>();
    this.concat = new WordDelimiterConcatenation(this);
    this.concatAll = new WordDelimiterConcatenation(this);

    this.flags = configurationFlags;
    this.protWords = protWords;
    // The iterator owns the actual sub-word segmentation; the three Has(...)
    // checks snapshot the relevant flags once at construction time.
    this.iterator = new WordDelimiterIterator(charTypeTable,
        Has(WordDelimiterFlags.SPLIT_ON_CASE_CHANGE),
        Has(WordDelimiterFlags.SPLIT_ON_NUMERICS),
        Has(WordDelimiterFlags.STEM_ENGLISH_POSSESSIVE));
}
/// <summary>
/// Creates a new WordDelimiterFilter
/// </summary>
/// <param name="matchVersion"> lucene compatibility version </param>
/// <param name="in"> TokenStream to be filtered </param>
/// <param name="charTypeTable"> table containing character types </param>
/// <param name="configurationFlags"> Flags configuring the filter </param>
/// <param name="protWords"> If not null is the set of tokens to protect from being delimited </param>
/// <exception cref="ArgumentException"> if <paramref name="matchVersion"/> is before <see cref="LuceneVersion.LUCENE_48"/> </exception>
public WordDelimiterFilter(LuceneVersion matchVersion, TokenStream @in, byte[] charTypeTable, WordDelimiterFlags configurationFlags, CharArraySet protWords)
    : base(@in)
{
    // Fail fast: reject unsupported versions before allocating any
    // attributes or helper objects (the original checked this last).
    if (!matchVersion.OnOrAfter(LuceneVersion.LUCENE_48))
    {
        throw new ArgumentException("This class only works with Lucene 4.8+. To emulate the old (broken) behavior of WordDelimiterFilter, use Lucene47WordDelimiterFilter");
    }

    this.termAttribute = AddAttribute<ICharTermAttribute>();
    this.offsetAttribute = AddAttribute<IOffsetAttribute>();
    this.posIncAttribute = AddAttribute<IPositionIncrementAttribute>();
    this.typeAttribute = AddAttribute<ITypeAttribute>();
    this.concat = new WordDelimiterConcatenation(this);
    this.concatAll = new WordDelimiterConcatenation(this);
    // Unlike the 4.7 filter, this version re-sorts emitted tokens by offset.
    this.sorter = new OffsetSorter(this);

    this.flags = configurationFlags;
    this.protWords = protWords;
    // The iterator owns the actual sub-word segmentation; the three Has(...)
    // checks snapshot the relevant flags once at construction time.
    this.iterator = new WordDelimiterIterator(charTypeTable,
        Has(WordDelimiterFlags.SPLIT_ON_CASE_CHANGE),
        Has(WordDelimiterFlags.SPLIT_ON_NUMERICS),
        Has(WordDelimiterFlags.STEM_ENGLISH_POSSESSIVE));
}
/// <summary>
/// Parses a list of MappingCharFilter-style rules ("char => TYPE") into a
/// custom <c>byte[]</c> character-type table.
/// </summary>
/// <exception cref="ArgumentException"> if a rule is malformed, maps more than one character, or names an illegal type </exception>
private byte[] ParseTypes(IList<string> rules)
{
    // Sorted so that the largest mapped character is the last key.
    var typeMap = new JCG.SortedDictionary<char, byte>();
    foreach (string rule in rules)
    {
        Match match = typePattern.Match(rule);
        if (!match.Success)
        {
            throw new ArgumentException("Invalid Mapping Rule : [" + rule + "]");
        }

        string lhs = ParseString(match.Groups[1].Value.Trim());
        byte rhs = ParseType(match.Groups[2].Value.Trim());

        if (lhs.Length != 1)
        {
            throw new ArgumentException("Invalid Mapping Rule : [" + rule + "]. Only a single character is allowed.");
        }
        if (rhs == WordDelimiterFilter.NOT_SET)
        {
            throw new ArgumentException("Invalid Mapping Rule : [" + rule + "]. Illegal type.");
        }

        typeMap[lhs[0]] = rhs;
    }

    // Ensure the table is always at least as big as DEFAULT_WORD_DELIM_TABLE
    // for performance; LastOrDefault() yields '\0' when no rules were given.
    int tableSize = Math.Max(typeMap.Keys.LastOrDefault() + 1, WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE.Length);
    byte[] types = new byte[tableSize];

    // Seed every slot with the default classification...
    for (int i = 0; i < types.Length; i++)
    {
        types[i] = WordDelimiterIterator.GetType(i);
    }
    // ...then overlay the user-supplied mappings.
    foreach (KeyValuePair<char, byte> mapping in typeMap)
    {
        types[mapping.Key] = mapping.Value;
    }
    return types;
}