/// <summary>
/// Creates a new <see cref="Lucene47WordDelimiterFilter"/>.
/// </summary>
/// <param name="in"> <see cref="TokenStream"/> to be filtered </param>
/// <param name="charTypeTable"> table containing character types </param>
/// <param name="configurationFlags"> flags configuring the filter </param>
/// <param name="protWords"> if not null, the set of tokens to protect from being delimited </param>
public Lucene47WordDelimiterFilter(TokenStream @in, byte[] charTypeTable, WordDelimiterFlags configurationFlags, CharArraySet protWords)
    : base(@in)
{
    // Register the token attributes this filter reads and writes.
    this.termAttribute = AddAttribute<ICharTermAttribute>();
    this.offsetAttribute = AddAttribute<IOffsetAttribute>();
    this.posIncAttribute = AddAttribute<IPositionIncrementAttribute>();
    this.typeAttribute = AddAttribute<ITypeAttribute>();

    // Concatenation helpers are bound to this filter instance.
    this.concat = new WordDelimiterConcatenation(this);
    this.concatAll = new WordDelimiterConcatenation(this);

    this.flags = configurationFlags;
    this.protWords = protWords;
    this.iterator = new WordDelimiterIterator(
        charTypeTable,
        Has(WordDelimiterFlags.SPLIT_ON_CASE_CHANGE),
        Has(WordDelimiterFlags.SPLIT_ON_NUMERICS),
        Has(WordDelimiterFlags.STEM_ENGLISH_POSSESSIVE));
}
/// <summary>
/// Creates a new WordDelimiterFilter
/// </summary>
/// <param name="matchVersion"> lucene compatibility version; must be 4.8 or later </param>
/// <param name="in"> TokenStream to be filtered </param>
/// <param name="charTypeTable"> table containing character types </param>
/// <param name="configurationFlags"> Flags configuring the filter </param>
/// <param name="protWords"> If not null is the set of tokens to protect from being delimited </param>
/// <exception cref="System.ArgumentException"> if <paramref name="matchVersion"/> is older than 4.8 </exception>
public WordDelimiterFilter(Version matchVersion, TokenStream @in, sbyte[] charTypeTable, int configurationFlags, CharArraySet protWords)
    : base(@in)
{
    // Converter-generated guard: run deferred field initialization exactly once.
    if (!InstanceFieldsInitialized)
    {
        InitializeInstanceFields();
        InstanceFieldsInitialized = true;
    }
    // Fixed Java-cased call: C# port uses OnOrAfter, not onOrAfter.
    if (!matchVersion.OnOrAfter(Version.LUCENE_48))
    {
        throw new System.ArgumentException("This class only works with Lucene 4.8+. To emulate the old (broken) behavior of WordDelimiterFilter, use Lucene47WordDelimiterFilter");
    }
    this.flags = configurationFlags;
    this.protWords = protWords;
    // Fixed Java-cased call: Has(...) matches the C# member naming used elsewhere in this file.
    this.iterator = new WordDelimiterIterator(charTypeTable, Has(SPLIT_ON_CASE_CHANGE), Has(SPLIT_ON_NUMERICS), Has(STEM_ENGLISH_POSSESSIVE));
}
/// <summary>
/// Creates a new WordDelimiterFilter
/// </summary>
/// <param name="in"> TokenStream to be filtered </param>
/// <param name="charTypeTable"> table containing character types </param>
/// <param name="configurationFlags"> Flags configuring the filter </param>
/// <param name="protWords"> If not null is the set of tokens to protect from being delimited </param>
public Lucene47WordDelimiterFilter(TokenStream @in, sbyte[] charTypeTable, int configurationFlags, CharArraySet protWords)
    : base(@in)
{
    // Register the token attributes this filter reads and writes.
    termAttribute = AddAttribute<ICharTermAttribute>();
    offsetAttribute = AddAttribute<IOffsetAttribute>();
    posIncAttribute = AddAttribute<IPositionIncrementAttribute>();
    typeAttribute = AddAttribute<ITypeAttribute>();
    // Converter-generated guard: run deferred field initialization exactly once.
    if (!InstanceFieldsInitialized)
    {
        InitializeInstanceFields();
        InstanceFieldsInitialized = true;
    }
    this.flags = configurationFlags;
    this.protWords = protWords;
    // Fixed Java-cased calls: Has(...) matches the C# member naming used by the
    // otherwise-identical Lucene47WordDelimiterFilter constructor in this file.
    this.iterator = new WordDelimiterIterator(charTypeTable, Has(SPLIT_ON_CASE_CHANGE), Has(SPLIT_ON_NUMERICS), Has(STEM_ENGLISH_POSSESSIVE));
}
/// <summary>
/// Creates a new WordDelimiterFilter.
/// </summary>
/// <param name="matchVersion"> lucene compatibility version </param>
/// <param name="in"> TokenStream to be filtered </param>
/// <param name="charTypeTable"> table containing character types </param>
/// <param name="configurationFlags"> flags configuring the filter </param>
/// <param name="protWords"> if not null, the set of tokens to protect from being delimited </param>
/// <exception cref="ArgumentException"> if <paramref name="matchVersion"/> is older than 4.8 </exception>
public WordDelimiterFilter(LuceneVersion matchVersion, TokenStream @in, byte[] charTypeTable, WordDelimiterFlags configurationFlags, CharArraySet protWords)
    : base(@in)
{
    // Register the token attributes this filter reads and writes.
    this.termAttribute = AddAttribute<ICharTermAttribute>();
    this.offsetAttribute = AddAttribute<IOffsetAttribute>();
    this.posIncAttribute = AddAttribute<IPositionIncrementAttribute>();
    this.typeAttribute = AddAttribute<ITypeAttribute>();

    // Per-instance helpers, each bound to this filter.
    this.concat = new WordDelimiterConcatenation(this);
    this.concatAll = new WordDelimiterConcatenation(this);
    this.sorter = new OffsetSorter(this);

    if (!matchVersion.OnOrAfter(LuceneVersion.LUCENE_48))
    {
        throw new ArgumentException("This class only works with Lucene 4.8+. To emulate the old (broken) behavior of WordDelimiterFilter, use Lucene47WordDelimiterFilter");
    }

    this.flags = configurationFlags;
    this.protWords = protWords;
    this.iterator = new WordDelimiterIterator(
        charTypeTable,
        Has(WordDelimiterFlags.SPLIT_ON_CASE_CHANGE),
        Has(WordDelimiterFlags.SPLIT_ON_NUMERICS),
        Has(WordDelimiterFlags.STEM_ENGLISH_POSSESSIVE));
}
// Parses a list of MappingCharFilter-style rules ("lhs => rhs") into a custom
// sbyte[] character-type table. Throws ArgumentException on any malformed rule.
private sbyte[] ParseTypes(IEnumerable<string> rules)
{
    // Sorted so the largest mapped character is the last key.
    IDictionary<char, sbyte> typeMap = new SortedDictionary<char, sbyte>();
    foreach (string rule in rules)
    {
        Match match = typePattern.Match(rule);
        if (!match.Success)
        {
            throw new System.ArgumentException("Invalid Mapping Rule : [" + rule + "]");
        }
        string lhs = ParseString(match.Groups[1].Value.Trim());
        sbyte rhs = ParseType(match.Groups[2].Value.Trim());
        if (lhs.Length != 1)
        {
            throw new System.ArgumentException("Invalid Mapping Rule : [" + rule + "]. Only a single character is allowed.");
        }
        if (rhs == WordDelimiterFilter.NOT_SET)
        {
            throw new System.ArgumentException("Invalid Mapping Rule : [" + rule + "]. Illegal type.");
        }
        typeMap[lhs[0]] = rhs;
    }

    // Ensure the table is always at least as big as DEFAULT_WORD_DELIM_TABLE for performance.
    int size = Math.Max(typeMap.Keys.LastOrDefault() + 1, WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE.Length);
    sbyte[] table = new sbyte[size];

    // Start from the default type for every character, then overlay the custom mappings.
    for (int i = 0; i < table.Length; i++)
    {
        table[i] = WordDelimiterIterator.GetType(i);
    }
    foreach (KeyValuePair<char, sbyte> mapping in typeMap)
    {
        table[mapping.Key] = mapping.Value;
    }
    return table;
}
// Parses a list of MappingCharFilter-style rules ("lhs => rhs") into a custom
// sbyte[] character-type table. Throws ArgumentException on any malformed rule.
// (Converted from the raw Java: Matcher/SortedMap/put/LastKey/EntrySet are not C#.)
private sbyte[] parseTypes(IList<string> rules)
{
    // Sorted so the largest mapped character is simply the last key visited.
    IDictionary<char, sbyte> typeMap = new SortedDictionary<char, sbyte>();
    foreach (string rule in rules)
    {
        // .NET regex: Match/Success replace Java's Matcher.find().
        Match m = typePattern.Match(rule);
        if (!m.Success)
        {
            throw new System.ArgumentException("Invalid Mapping Rule : [" + rule + "]");
        }
        string lhs = parseString(m.Groups[1].Value.Trim());
        sbyte? rhs = parseType(m.Groups[2].Value.Trim());
        if (lhs.Length != 1)
        {
            throw new System.ArgumentException("Invalid Mapping Rule : [" + rule + "]. Only a single character is allowed.");
        }
        if (rhs == null)
        {
            throw new System.ArgumentException("Invalid Mapping Rule : [" + rule + "]. Illegal type.");
        }
        typeMap[lhs[0]] = rhs.Value;
    }

    // ensure the table is always at least as big as DEFAULT_WORD_DELIM_TABLE for performance
    // (maxMapped stays -1 for an empty rule set, yielding the default-sized table
    // instead of Java's NoSuchElementException from lastKey()).
    int maxMapped = -1;
    foreach (char c in typeMap.Keys)
    {
        maxMapped = c; // keys ascend, so the last one visited is the maximum
    }
    sbyte[] types = new sbyte[Math.Max(maxMapped + 1, WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE.Length)];

    // Fill with defaults, then overlay the custom mappings.
    for (int i = 0; i < types.Length; i++)
    {
        types[i] = WordDelimiterIterator.getType(i);
    }
    foreach (KeyValuePair<char, sbyte> mapping in typeMap)
    {
        types[mapping.Key] = mapping.Value;
    }
    return types;
}
/// <summary>
/// Creates a new WordDelimiterFilter
/// </summary>
/// <param name="matchVersion"> lucene compatibility version; must be 4.8 or later </param>
/// <param name="in"> TokenStream to be filtered </param>
/// <param name="charTypeTable"> table containing character types </param>
/// <param name="configurationFlags"> Flags configuring the filter </param>
/// <param name="protWords"> If not null is the set of tokens to protect from being delimited </param>
/// <exception cref="System.ArgumentException"> if <paramref name="matchVersion"/> is older than 4.8 </exception>
public WordDelimiterFilter(LuceneVersion matchVersion, TokenStream @in, sbyte[] charTypeTable, int configurationFlags, CharArraySet protWords)
    : base(@in)
{
    // Converter-generated guard: run deferred field initialization exactly once.
    if (!InstanceFieldsInitialized)
    {
        InitializeInstanceFields();
        InstanceFieldsInitialized = true;
    }
    // This (fixed) implementation only supports 4.8+ semantics; older callers are
    // directed to Lucene47WordDelimiterFilter, which preserves the legacy behavior.
    if (!matchVersion.OnOrAfter(LuceneVersion.LUCENE_48))
    {
        throw new System.ArgumentException("This class only works with Lucene 4.8+. To emulate the old (broken) behavior of WordDelimiterFilter, use Lucene47WordDelimiterFilter");
    }
    this.flags = configurationFlags;
    this.protWords = protWords;
    // The iterator does the actual subword splitting, driven by the char-type table
    // and the three split-related flags.
    this.iterator = new WordDelimiterIterator(charTypeTable, Has(SPLIT_ON_CASE_CHANGE), Has(SPLIT_ON_NUMERICS), Has(STEM_ENGLISH_POSSESSIVE));
    // Register the token attributes this filter reads and writes.
    this.termAttribute = AddAttribute<ICharTermAttribute>();
    this.offsetAttribute = AddAttribute<IOffsetAttribute>();
    this.posIncAttribute = AddAttribute<IPositionIncrementAttribute>();
    this.typeAttribute = AddAttribute<ITypeAttribute>();
}
/// <summary>
/// Creates a new WordDelimiterFilter
/// </summary>
/// <param name="in"> TokenStream to be filtered </param>
/// <param name="charTypeTable"> table containing character types </param>
/// <param name="configurationFlags"> Flags configuring the filter </param>
/// <param name="protWords"> If not null is the set of tokens to protect from being delimited </param>
public Lucene47WordDelimiterFilter(TokenStream @in, sbyte[] charTypeTable, int configurationFlags, CharArraySet protWords)
    : base(@in)
{
    // Register the token attributes this filter reads and writes.
    termAttribute = AddAttribute<ICharTermAttribute>();
    offsetAttribute = AddAttribute<IOffsetAttribute>();
    posIncAttribute = AddAttribute<IPositionIncrementAttribute>();
    typeAttribute = AddAttribute<ITypeAttribute>();
    // Converter-generated guard: run deferred field initialization exactly once.
    if (!InstanceFieldsInitialized)
    {
        InitializeInstanceFields();
        InstanceFieldsInitialized = true;
    }
    this.flags = configurationFlags;
    this.protWords = protWords;
    // The iterator does the actual subword splitting, driven by the char-type table
    // and the three split-related flags.
    this.iterator = new WordDelimiterIterator(charTypeTable, Has(SPLIT_ON_CASE_CHANGE), Has(SPLIT_ON_NUMERICS), Has(STEM_ENGLISH_POSSESSIVE));
}