/// <summary>
/// Picks the JFlex scanner implementation matching the requested compatibility version.
/// Note: the scanner throws if used before <c>Reset()</c> is called ("best effort NPE").
/// </summary>
private IStandardTokenizerInterface GetScannerFor(LuceneVersion matchVersion)
{
    // best effort NPE if you dont call reset
#pragma warning disable 612, 618
    if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_47))
    {
        return new UAX29URLEmailTokenizerImpl(m_input);
    }
    if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_40))
    {
        return new UAX29URLEmailTokenizerImpl40(m_input);
    }
    if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_36))
    {
        return new UAX29URLEmailTokenizerImpl36(m_input);
    }
    if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_34))
    {
        return new UAX29URLEmailTokenizerImpl34(m_input);
    }
    // Oldest supported grammar.
    return new UAX29URLEmailTokenizerImpl31(m_input);
#pragma warning restore 612, 618
}
/// <summary>
/// Constructs a <see cref="StandardTokenizer"/> filtered by a
/// <see cref="StandardFilter"/>, a <see cref="LowerCaseFilter"/>, a <see cref="StopFilter"/>,
/// and a <see cref="SnowballFilter"/>.
/// </summary>
protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    Tokenizer source = new StandardTokenizer(matchVersion, reader);
    TokenStream stream = new StandardFilter(matchVersion, source);

    // Strip the possessive 's so the English-family stemmers see clean stems.
    bool englishFamily = name.Equals("English", StringComparison.Ordinal)
        || name.Equals("Porter", StringComparison.Ordinal)
        || name.Equals("Lovins", StringComparison.Ordinal);
    if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_31) && englishFamily)
    {
        stream = new EnglishPossessiveFilter(stream);
    }

    // The Turkish stemmer expects Turkish-specific lowercasing (dotted/dotless i).
    if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_31) && name.Equals("Turkish", StringComparison.Ordinal))
    {
        stream = new TurkishLowerCaseFilter(stream);
    }
    else
    {
        stream = new LowerCaseFilter(matchVersion, stream);
    }

    if (stopSet != null)
    {
        stream = new StopFilter(matchVersion, stream, stopSet);
    }
    stream = new SnowballFilter(stream, name);
    return new TokenStreamComponents(source, stream);
}
/// <summary>
/// Creates <see cref="NGramTokenFilter"/> with given min and max n-grams.
/// </summary>
/// <param name="version"> Lucene version to enable correct position increments.
/// See <see cref="NGramTokenFilter"/> for details. </param>
/// <param name="input"> <see cref="TokenStream"/> holding the input to be tokenized </param>
/// <param name="minGram"> the smallest n-gram to generate </param>
/// <param name="maxGram"> the largest n-gram to generate </param>
/// <exception cref="ArgumentException">if minGram &lt; 1 or minGram &gt; maxGram</exception>
public NGramTokenFilter(LuceneVersion version, TokenStream input, int minGram, int maxGram)
    : base(new CodepointCountFilter(version, input, minGram, int.MaxValue))
{
    if (minGram < 1)
    {
        throw new ArgumentException("minGram must be greater than zero");
    }
    if (minGram > maxGram)
    {
        throw new ArgumentException("minGram must not be greater than maxGram");
    }

    this.version = version;
    this.minGram = minGram;
    this.maxGram = maxGram;

#pragma warning disable 612, 618
    bool newBehavior = version.OnOrAfter(LuceneVersion.LUCENE_44);
#pragma warning restore 612, 618
    this.charUtils = newBehavior
        ? CharacterUtils.GetInstance(version)
        : CharacterUtils.GetJava4Instance(version);
    if (newBehavior)
    {
        posIncAtt = AddAttribute<IPositionIncrementAttribute>();
        posLenAtt = AddAttribute<IPositionLengthAttribute>();
    }
    else
    {
        // Pre-4.4 behavior: no-op attribute stubs instead of real attributes.
        posIncAtt = new PositionIncrementAttributeAnonymousInnerClassHelper();
        posLenAtt = new PositionLengthAttributeAnonymousInnerClassHelper();
    }
    termAtt = AddAttribute<ICharTermAttribute>();
    offsetAtt = AddAttribute<IOffsetAttribute>();
}
/// <summary>
/// Selects the scanner implementation for <paramref name="matchVersion"/> and
/// registers the token attributes this tokenizer populates.
/// </summary>
private void Init(LuceneVersion matchVersion)
{
#pragma warning disable 612, 618
    if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_47))
    {
        this.scanner = new StandardTokenizerImpl(m_input);
    }
    else if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_40))
    {
        this.scanner = new StandardTokenizerImpl40(m_input);
    }
    else if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_34))
    {
        this.scanner = new StandardTokenizerImpl34(m_input);
    }
    else if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_31))
    {
        this.scanner = new StandardTokenizerImpl31(m_input);
    }
    else
    {
        // Pre-3.1 versions used the classic (non-UAX#29) grammar.
        this.scanner = new ClassicTokenizerImpl(m_input);
    }
#pragma warning restore 612, 618

    termAtt = AddAttribute<ICharTermAttribute>();
    posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
    offsetAtt = AddAttribute<IOffsetAttribute>();
    typeAtt = AddAttribute<ITypeAttribute>();
}
/// <summary>
/// Validates arguments, registers attributes and sizes the internal buffers.
/// Only match versions 4.4+ are supported by this class.
/// </summary>
private void Init(LuceneVersion version, int minGram, int maxGram, bool edgesOnly)
{
#pragma warning disable 612, 618
    bool modern = version.OnOrAfter(LuceneVersion.LUCENE_44);
#pragma warning restore 612, 618
    if (!modern)
    {
        throw new ArgumentException("This class only works with Lucene 4.4+. To emulate the old (broken) behavior of NGramTokenizer, use Lucene43NGramTokenizer/Lucene43EdgeNGramTokenizer");
    }
    // modern is always true here; the ternary is kept to mirror the upstream Java source.
    charUtils = modern
        ? CharacterUtils.GetInstance(version)
        : CharacterUtils.GetJava4Instance(version);
    if (minGram < 1)
    {
        throw new ArgumentException("minGram must be greater than zero");
    }
    if (minGram > maxGram)
    {
        throw new ArgumentException("minGram must not be greater than maxGram");
    }

    termAtt = AddAttribute<ICharTermAttribute>();
    posIncAtt = AddAttribute<IPositionIncrementAttribute>();
    posLenAtt = AddAttribute<IPositionLengthAttribute>();
    offsetAtt = AddAttribute<IOffsetAttribute>();

    this.minGram = minGram;
    this.maxGram = maxGram;
    this.edgesOnly = edgesOnly;
    // 2 * maxGram in case all code points require 2 chars and + 1024 for buffering
    // so we do not keep polling the Reader.
    charBuffer = CharacterUtils.NewCharacterBuffer(2 * maxGram + 1024);
    buffer = new int[charBuffer.Buffer.Length];
    // Make the term att large enough up front.
    termAtt.ResizeBuffer(2 * maxGram);
}
/// <summary>
/// Creates a new <see cref="ThaiWordFilter"/> with the specified match version.
/// </summary>
/// <param name="matchVersion">Lucene compatibility version; for versions before 3.1
/// the input is lowercased up-front via <see cref="LowerCaseFilter"/>.</param>
/// <param name="input">the <see cref="TokenStream"/> to segment</param>
public ThaiWordFilter(LuceneVersion matchVersion, TokenStream input)
    : base(matchVersion.OnOrAfter(LuceneVersion.LUCENE_31) ? input : new LowerCaseFilter(matchVersion, input))
{
    // LUCENENET specific - DBBI_AVAILABLE removed because ICU always has a dictionary-based BreakIterator
    // Position increment handling is only enabled for match versions 3.1+.
    handlePosIncr = matchVersion.OnOrAfter(LuceneVersion.LUCENE_31);
    termAtt = AddAttribute <ICharTermAttribute>();
    offsetAtt = AddAttribute <IOffsetAttribute>();
    posAtt = AddAttribute <IPositionIncrementAttribute>();
}
// only if the length changed before this filter (offsets can no longer be trusted)
private bool hasIllegalOffsets = false;

/// <summary>
/// Creates a new <see cref="ThaiWordFilter"/> with the specified match version.
/// </summary>
/// <param name="matchVersion">Lucene compatibility version; for versions before 3.1
/// the input is lowercased up-front via <see cref="LowerCaseFilter"/>.</param>
/// <param name="input">the <see cref="TokenStream"/> to segment</param>
/// <exception cref="System.NotSupportedException">if the runtime has no dictionary-based
/// Thai <c>BreakIterator</c> (DBBI_AVAILABLE is false)</exception>
public ThaiWordFilter(LuceneVersion matchVersion, TokenStream input)
    : base(matchVersion.OnOrAfter(LuceneVersion.LUCENE_31) ? input : new LowerCaseFilter(matchVersion, input))
{
    if (!DBBI_AVAILABLE)
    {
        throw new System.NotSupportedException("This JRE does not have support for Thai segmentation");
    }
    // Position increment handling is only enabled for match versions 3.1+.
    handlePosIncr = matchVersion.OnOrAfter(LuceneVersion.LUCENE_31);
    termAtt = AddAttribute <ICharTermAttribute>();
    offsetAtt = AddAttribute <IOffsetAttribute>();
    posAtt = AddAttribute <IPositionIncrementAttribute>();
}
// only if the length changed before this filter (offsets can no longer be trusted)
private bool hasIllegalOffsets = false;

/// <summary>
/// Creates a new ThaiWordFilter with the specified match version.
/// </summary>
/// <param name="matchVersion">Lucene compatibility version; for versions before 3.1
/// the input is lowercased up-front via <see cref="LowerCaseFilter"/>.</param>
/// <param name="input">the token stream to segment</param>
/// <exception cref="System.NotSupportedException">if the runtime has no dictionary-based
/// Thai <c>BreakIterator</c> (DBBI_AVAILABLE is false)</exception>
public ThaiWordFilter(LuceneVersion matchVersion, TokenStream input)
    : base(matchVersion.OnOrAfter(LuceneVersion.LUCENE_31) ? input : new LowerCaseFilter(matchVersion, input))
{
    if (!DBBI_AVAILABLE)
    {
        throw new System.NotSupportedException("This JRE does not have support for Thai segmentation");
    }
    // Position increment handling is only enabled for match versions 3.1+.
    handlePosIncr = matchVersion.OnOrAfter(LuceneVersion.LUCENE_31);
    termAtt = AddAttribute<ICharTermAttribute>();
    offsetAtt = AddAttribute<IOffsetAttribute>();
    posAtt = AddAttribute<IPositionIncrementAttribute>();
}
/// <summary>
/// Builds the analysis chain: HMM-based Chinese tokenization (4.8+) or the legacy
/// sentence/word pipeline, followed by Porter stemming and optional stop-word removal.
/// </summary>
protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    Tokenizer source;
    TokenStream stream;
    if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_48))
    {
        source = new HMMChineseTokenizer(reader);
        stream = source;
    }
    else
    {
#pragma warning disable 612, 618
        source = new SentenceTokenizer(reader);
        stream = new WordTokenFilter(source);
#pragma warning restore 612, 618
    }
    // result = new LowerCaseFilter(result);
    // LowerCaseFilter is not needed, as SegTokenFilter lowercases Basic Latin text.
    // The porter stemming is too strict, this is not a bug, this is a feature:)
    stream = new PorterStemFilter(stream);
    if (stopWords.Any())
    {
        stream = new StopFilter(matchVersion, stream, stopWords);
    }
    return new TokenStreamComponents(source, stream);
}
/// <summary>
/// Return a <see cref="CharacterUtils"/> instance compatible with Java 1.4.
/// </summary>
/// <param name="matchVersion">compatibility version; pre-3.1 gets the
/// backward-compatible Unicode variant</param>
public static CharacterUtils GetJava4Instance(LuceneVersion matchVersion) // LUCENENET specific - added matchVersion parameter so we can support backward compatible Unicode support
{
#pragma warning disable 612, 618
    if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_31))
    {
        return JAVA_4;
    }
    return JAVA_4_BW_COMPAT;
#pragma warning restore 612, 618
}
/// <summary>
/// Builds an analyzer with the default stop words (<see cref="DefaultStopSet"/>).
/// </summary>
/// <param name="matchVersion">Lucene compatibility version; versions before 3.1
/// use the legacy 3.0 stop-word set.</param>
public FrenchAnalyzer(LuceneVersion matchVersion)
#pragma warning disable 612, 618
    : this(matchVersion, matchVersion.OnOrAfter(LuceneVersion.LUCENE_31) ? DefaultSetHolder.DEFAULT_STOP_SET : DefaultSetHolder.DEFAULT_STOP_SET_30)
#pragma warning restore 612, 618
{
}
/// <summary>
/// Creates an <see cref="EdgeNGramTokenFilter"/> that produces edge n-grams between
/// <paramref name="minGram"/> and <paramref name="maxGram"/> characters long.
/// </summary>
/// <param name="version">Lucene compatibility version</param>
/// <param name="input">the stream to produce n-grams from</param>
/// <param name="side">which edge to take grams from; BACK is rejected for 4.4+</param>
/// <param name="minGram">the smallest n-gram to generate</param>
/// <param name="maxGram">the largest n-gram to generate</param>
public EdgeNGramTokenFilter(LuceneVersion version, TokenStream input, Side side, int minGram, int maxGram)
    : base(input)
{
    // LuceneVersion and Side are enums (value types) in .NET, so the Java-style
    // null checks (version == null, side == null) do not compile; Side validity
    // is checked with Enum.IsDefined instead.
    if (version.OnOrAfter(LuceneVersion.LUCENE_44) && side == Side.BACK)
    {
        throw new System.ArgumentException("Side.BACK is not supported anymore as of Lucene 4.4, use ReverseStringFilter up-front and afterward");
    }
    if (!Enum.IsDefined(typeof(Side), side))
    {
        throw new System.ArgumentException("sideLabel must be either front or back");
    }
    if (minGram < 1)
    {
        throw new System.ArgumentException("minGram must be greater than zero");
    }
    if (minGram > maxGram)
    {
        throw new System.ArgumentException("minGram must not be greater than maxGram");
    }
    this.version = version;
    // PascalCase members in the .NET port (were onOrAfter/getInstance in Java).
    this.charUtils = version.OnOrAfter(LuceneVersion.LUCENE_44) ? CharacterUtils.GetInstance(version) : CharacterUtils.Java4Instance;
    this.minGram = minGram;
    this.maxGram = maxGram;
    this.side = side;
}
/// <summary>
/// Returns a <seealso cref="CharacterUtils"/> implementation according to the given
/// <seealso cref="LuceneVersion"/> instance.
/// </summary>
/// <param name="matchVersion"> a version instance </param>
/// <returns> the Java-5 (full Unicode) implementation for 3.1+, otherwise the
/// Java-4 compatible one </returns>
public static CharacterUtils GetInstance(LuceneVersion matchVersion)
{
#pragma warning disable 612, 618
    if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_31))
    {
        return JAVA_5;
    }
    return JAVA_4;
#pragma warning restore 612, 618
}
/// <summary>
/// Returns a (possibly reused) <seealso cref="TokenStream"/> which tokenizes all the
/// text in the provided <seealso cref="Reader"/>.
/// </summary>
/// <returns> A <seealso cref="TokenStream"/> built from a <seealso cref="StandardTokenizer"/>
/// filtered with <seealso cref="StandardFilter"/>, <seealso cref="LowerCaseFilter"/>,
/// <seealso cref="StopFilter"/>, <seealso cref="SetKeywordMarkerFilter"/> if a stem exclusion set is provided,
/// <seealso cref="StemmerOverrideFilter"/>, and <seealso cref="SnowballFilter"/> </returns>
public override TokenStreamComponents CreateComponents(string fieldName, TextReader aReader)
{
    // Both eras share the tokenizer + StandardFilter prefix.
    Tokenizer source = new StandardTokenizer(matchVersion, aReader);
    TokenStream result = new StandardFilter(matchVersion, source);
    if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_31))
    {
        result = new LowerCaseFilter(matchVersion, result);
        result = new StopFilter(matchVersion, result, stoptable);
        if (excltable.Count > 0)
        {
            result = new SetKeywordMarkerFilter(result, excltable);
        }
        if (stemdict != null)
        {
            result = new StemmerOverrideFilter(result, stemdict);
        }
        result = new SnowballFilter(result, new Tartarus.Snowball.Ext.DutchStemmer());
    }
    else
    {
        // Legacy chain: no lowercasing, DutchStemFilter with the original stem dict.
        result = new StopFilter(matchVersion, result, stoptable);
        if (excltable.Count > 0)
        {
            result = new SetKeywordMarkerFilter(result, excltable);
        }
        result = new DutchStemFilter(result, origStemdict);
    }
    return new TokenStreamComponents(source, result);
}
/// <summary>
/// Creates NGramTokenFilter with given min and max n-grams.
/// </summary>
/// <param name="version"> Lucene version to enable correct position increments.
/// See <a href="#version">above</a> for details. </param>
/// <param name="input"> <seealso cref="TokenStream"/> holding the input to be tokenized </param>
/// <param name="minGram"> the smallest n-gram to generate </param>
/// <param name="maxGram"> the largest n-gram to generate </param>
/// <exception cref="System.ArgumentException">if minGram &lt; 1 or minGram &gt; maxGram</exception>
public NGramTokenFilter(LuceneVersion version, TokenStream input, int minGram, int maxGram)
    : base(new CodepointCountFilter(version, input, minGram, int.MaxValue))
{
    if (minGram < 1)
    {
        throw new System.ArgumentException("minGram must be greater than zero");
    }
    if (minGram > maxGram)
    {
        throw new System.ArgumentException("minGram must not be greater than maxGram");
    }

    this.version = version;
    this.minGram = minGram;
    this.maxGram = maxGram;

#pragma warning disable 612, 618
    bool newBehavior = version.OnOrAfter(LuceneVersion.LUCENE_44);
#pragma warning restore 612, 618
    this.charUtils = newBehavior ? CharacterUtils.GetInstance(version) : CharacterUtils.Java4Instance;
    if (newBehavior)
    {
        posIncAtt = AddAttribute<IPositionIncrementAttribute>();
        posLenAtt = AddAttribute<IPositionLengthAttribute>();
    }
    else
    {
        // Pre-4.4 behavior: no-op attribute stubs instead of real attributes.
        posIncAtt = new PositionIncrementAttributeAnonymousInnerClassHelper(this);
        posLenAtt = new PositionLengthAttributeAnonymousInnerClassHelper(this);
    }
    termAtt = AddAttribute<ICharTermAttribute>();
    offsetAtt = AddAttribute<IOffsetAttribute>();
}
/// <summary>
/// Builds an analyzer with the given stop words, stem exclusions and stem-override
/// dictionary. For 3.1+ the overrides are compiled into a <see cref="StemmerOverrideFilter"/>
/// FST; older versions keep the raw map for the legacy <c>DutchStemFilter</c>.
/// </summary>
public DutchAnalyzer(LuceneVersion matchVersion, CharArraySet stopwords, CharArraySet stemExclusionTable, CharArrayMap<string> stemOverrideDict)
{
    this.matchVersion = matchVersion;
    this.stoptable = CharArraySet.UnmodifiableSet(CharArraySet.Copy(matchVersion, stopwords));
    this.excltable = CharArraySet.UnmodifiableSet(CharArraySet.Copy(matchVersion, stemExclusionTable));
#pragma warning disable 612, 618
    bool buildOverrideFilter = stemOverrideDict.Count != 0 && matchVersion.OnOrAfter(LuceneVersion.LUCENE_31);
#pragma warning restore 612, 618
    if (!buildOverrideFilter)
    {
        this.stemdict = null;
        this.origStemdict = CharArrayMap.UnmodifiableMap(CharArrayMap.Copy(matchVersion, stemOverrideDict));
        return;
    }

    this.origStemdict = null;
    // we don't need to ignore case here since we lowercase in this analyzer anyway
    StemmerOverrideFilter.Builder overrideBuilder = new StemmerOverrideFilter.Builder(false);
    CharArrayMap<string>.EntryIterator entries = (CharArrayMap<string>.EntryIterator)stemOverrideDict.EntrySet().GetEnumerator();
    CharsRef scratch = new CharsRef();
    while (entries.HasNext)
    {
        char[] key = entries.NextKey();
        scratch.CopyChars(key, 0, key.Length);
        overrideBuilder.Add(new string(scratch.Chars), entries.CurrentValue);
    }
    try
    {
        this.stemdict = overrideBuilder.Build();
    }
    catch (IOException ex)
    {
        throw new Exception("can not build stem dict", ex);
    }
}
/// <summary>
/// Builds an analyzer with the given stop words and stem exclusion table.
/// </summary>
/// <param name="matchVersion">Lucene compatibility version</param>
/// <param name="stopwords">stop words to remove</param>
/// <param name="stemExclusionTable">terms to exclude from stemming</param>
public DutchAnalyzer(LuceneVersion matchVersion, CharArraySet stopwords, CharArraySet stemExclusionTable)
    : this(matchVersion, stopwords, stemExclusionTable, matchVersion.OnOrAfter(LuceneVersion.LUCENE_36) ? DefaultSetHolder.DEFAULT_STEM_DICT : CharArrayMap <string> .EmptyMap())
{
    // historically, this ctor never populated the stem dict!!!!!
    // so we populate it only for >= 3.6
}
/// <summary>
/// Rejects <c>enablePositionIncrements=false</c> for match versions 4.4+,
/// where disabling increments can create broken token streams.
/// </summary>
private static void CheckPositionIncrement(LuceneVersion version, bool enablePositionIncrements)
{
    if (version.OnOrAfter(LuceneVersion.LUCENE_44) && !enablePositionIncrements)
    {
        throw new System.ArgumentException("enablePositionIncrements=false is not supported anymore as of Lucene 4.4 as it can create broken token streams");
    }
}
/// <summary>
/// Builds an analyzer with the default stop words.
/// </summary>
/// <param name="matchVersion"> lucene compatibility version; versions before 3.6
/// fall back to the generic English stop-word set </param>
public ThaiAnalyzer(LuceneVersion matchVersion)
    : this(matchVersion,
#pragma warning disable 612, 618
          matchVersion.OnOrAfter(LuceneVersion.LUCENE_36) ?
#pragma warning restore 612, 618
          DefaultSetHolder.DEFAULT_STOP_SET : StopAnalyzer.ENGLISH_STOP_WORDS_SET)
{
}
/// <summary>
/// Builds an analyzer with the given stop words and no stem exclusions.
/// </summary>
/// <param name="matchVersion">Lucene compatibility version</param>
/// <param name="stopwords">stop words to remove</param>
public DutchAnalyzer(LuceneVersion matchVersion, CharArraySet stopwords)
    : this(matchVersion, stopwords, CharArraySet.EMPTY_SET,
#pragma warning disable 612, 618
          matchVersion.OnOrAfter(LuceneVersion.LUCENE_36) ?
#pragma warning restore 612, 618
          DefaultSetHolder.DEFAULT_STEM_DICT : CharArrayMap <string> .EmptyMap())
{
    // historically, this ctor never populated the stem dict!!!!!
    // so we populate it only for >= 3.6
}
/// <summary>
/// Rejects <c>enablePositionIncrements=false</c> for match versions 4.4+,
/// where disabling increments can create broken token streams.
/// </summary>
private static void CheckPositionIncrement(LuceneVersion version, bool enablePositionIncrements)
{
    if (enablePositionIncrements)
    {
        return; // always allowed
    }
#pragma warning disable 612, 618
    if (version.OnOrAfter(LuceneVersion.LUCENE_44))
#pragma warning restore 612, 618
    {
        throw new ArgumentException("enablePositionIncrements=false is not supported anymore as of Lucene 4.4 as it can create broken token streams");
    }
}
/// <summary>
/// For 3.1+ match versions the new grammar needs no post-processing, so the
/// input is passed through; older versions use the classic token logic.
/// </summary>
public override bool IncrementToken()
{
    // TODO: add some niceties for the new grammar
    return matchVersion.OnOrAfter(LuceneVersion.LUCENE_31)
        ? input.IncrementToken()
        : IncrementTokenClassic();
}
/// <summary>
/// Creates a new TrimFilter.
/// </summary>
/// <param name="version">Lucene compatibility version</param>
/// <param name="in">the stream to trim</param>
/// <param name="updateOffsets">whether offsets should also be adjusted;
/// rejected for match versions 4.4+</param>
/// <exception cref="System.ArgumentException">if updateOffsets is true on 4.4+</exception>
public TrimFilter(LuceneVersion version, TokenStream @in, bool updateOffsets)
    : base(@in)
{
    if (updateOffsets && version.OnOrAfter(LuceneVersion.LUCENE_44))
    {
        throw new System.ArgumentException("updateOffsets=true is not supported anymore as of Lucene 4.4");
    }
    termAtt = AddAttribute <ICharTermAttribute>();
    offsetAtt = AddAttribute <IOffsetAttribute>();
    this.updateOffsets = updateOffsets;
}
/// <summary>
/// Creates a new TrimFilter.
/// </summary>
/// <param name="version">Lucene compatibility version</param>
/// <param name="in">the stream to trim</param>
/// <param name="updateOffsets">whether offsets should also be adjusted;
/// rejected for match versions 4.4+</param>
/// <exception cref="System.ArgumentException">if updateOffsets is true on 4.4+</exception>
public TrimFilter(LuceneVersion version, TokenStream @in, bool updateOffsets)
    : base(@in)
{
    if (updateOffsets && version.OnOrAfter(LuceneVersion.LUCENE_44))
    {
        throw new System.ArgumentException("updateOffsets=true is not supported anymore as of Lucene 4.4");
    }
    termAtt = AddAttribute<ICharTermAttribute>();
    offsetAtt = AddAttribute<IOffsetAttribute>();
    this.updateOffsets = updateOffsets;
}
/// <summary>
/// For 3.1+ match versions the new grammar needs no post-processing, so the
/// input is passed through; older versions use the classic token logic.
/// </summary>
public override sealed bool IncrementToken()
{
#pragma warning disable 612, 618
    bool newGrammar = matchVersion.OnOrAfter(LuceneVersion.LUCENE_31);
#pragma warning restore 612, 618
    // TODO: add some niceties for the new grammar
    return newGrammar ? input.IncrementToken() : IncrementTokenClassic();
}
/// <summary>
/// Validates arguments, sizes the internal buffers and stores the gram bounds.
/// Only match versions 4.4+ are supported by this class.
/// </summary>
/// <exception cref="System.ArgumentException">if the version is pre-4.4, minGram &lt; 1,
/// or minGram &gt; maxGram</exception>
private void init(LuceneVersion version, int minGram, int maxGram, bool edgesOnly)
{
    if (!version.OnOrAfter(LuceneVersion.LUCENE_44))
    {
        throw new System.ArgumentException("This class only works with Lucene 4.4+. To emulate the old (broken) behavior of NGramTokenizer, use Lucene43NGramTokenizer/Lucene43EdgeNGramTokenizer");
    }
    // The guard above already rejected pre-4.4 versions, so the original
    // ternary's CharacterUtils.Java4Instance branch was dead code.
    charUtils = CharacterUtils.GetInstance(version);
    if (minGram < 1)
    {
        throw new System.ArgumentException("minGram must be greater than zero");
    }
    if (minGram > maxGram)
    {
        throw new System.ArgumentException("minGram must not be greater than maxGram");
    }
    this.minGram = minGram;
    this.maxGram = maxGram;
    this.edgesOnly = edgesOnly;
    // 2 * maxGram in case all code points require 2 chars and + 1024 for buffering
    // to not keep polling the Reader.
    charBuffer = CharacterUtils.NewCharacterBuffer(2 * maxGram + 1024);
    buffer = new int[charBuffer.Buffer.Length];
    // Make the term att large enough up front.
    termAtt.ResizeBuffer(2 * maxGram);
}
/// <summary>
/// Initializes a query parser. Called by the QueryParser constructor.
/// </summary>
/// <param name="matchVersion">Lucene version to match.</param>
/// <param name="f">the default field for query terms.</param>
/// <param name="a">used to find terms in the query text.</param>
public virtual void Init(LuceneVersion matchVersion, string f, Analyzer a)
{
    Analyzer = a;
    m_field = f;
    // Auto-generated phrase queries were the default before 3.1.
#pragma warning disable 612, 618
    AutoGeneratePhraseQueries = !matchVersion.OnOrAfter(LuceneVersion.LUCENE_31);
#pragma warning restore 612, 618
}
/// <summary>
/// Picks the JFlex scanner implementation matching the requested compatibility version.
/// Note: the scanner throws if used before <c>Reset()</c> is called ("best effort NPE").
/// </summary>
private IStandardTokenizerInterface GetScannerFor(LuceneVersion matchVersion)
{
    // best effort NPE if you dont call reset
    if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_47))
    {
        return new UAX29URLEmailTokenizerImpl(input);
    }
    if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_40))
    {
        return new UAX29URLEmailTokenizerImpl40(input);
    }
    if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_36))
    {
        return new UAX29URLEmailTokenizerImpl36(input);
    }
    if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_34))
    {
        return new UAX29URLEmailTokenizerImpl34(input);
    }
    // Oldest supported grammar.
    return new UAX29URLEmailTokenizerImpl31(input);
}
/// <summary>
/// Strips a trailing possessive ('s, and for 3.6+ also the right single quotation
/// mark U+2019 and fullwidth apostrophe U+FF07 variants) from the current term.
/// </summary>
public override bool IncrementToken()
{
    if (!input.IncrementToken())
    {
        return false;
    }
    char[] chars = termAtt.Buffer();
    int length = termAtt.Length;

    if (length >= 2)
    {
        char quote = chars[length - 2];
        char last = chars[length - 1];
        bool isQuote = quote == '\''
            || (matchVersion.OnOrAfter(LuceneVersion.LUCENE_36) && (quote == '\u2019' || quote == '\uFF07'));
        if (isQuote && (last == 's' || last == 'S'))
        {
            termAtt.Length = length - 2; // Strip last 2 characters off
        }
    }
    return true;
}
/// <summary>
/// For 4.0+ the collation key is produced by the tokenizer's attribute factory,
/// so the keyword tokenizer is returned bare; older versions wrap it in the
/// deprecated <see cref="ICUCollationKeyFilter"/>.
/// </summary>
protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
#pragma warning disable 612, 618
    bool useFactory = matchVersion.OnOrAfter(LuceneVersion.LUCENE_40);
#pragma warning restore 612, 618
    if (useFactory)
    {
        KeywordTokenizer tokenizer = new KeywordTokenizer(factory, reader, KeywordTokenizer.DEFAULT_BUFFER_SIZE);
        return new TokenStreamComponents(tokenizer, tokenizer);
    }

    KeywordTokenizer legacyTokenizer = new KeywordTokenizer(reader);
#pragma warning disable 612, 618
    return new TokenStreamComponents(legacyTokenizer, new ICUCollationKeyFilter(legacyTokenizer, collator));
#pragma warning restore 612, 618
}
/// <summary>
/// Validates arguments, stores the gram bounds/side and registers attributes.
/// Side.BACK is rejected for 4.4+; older versions clamp maxGram to 1024.
/// </summary>
private void Init(LuceneVersion version, Side side, int minGram, int maxGram)
{
    //if (version == null)
    //{
    //    throw new ArgumentException("version must not be null");
    //}
    if (!Enum.IsDefined(typeof(Side), side))
    {
        throw new ArgumentException("sideLabel must be either front or back");
    }
    if (minGram < 1)
    {
        throw new ArgumentException("minGram must be greater than zero");
    }
    if (minGram > maxGram)
    {
        throw new ArgumentException("minGram must not be greater than maxGram");
    }

    if (version.OnOrAfter(LuceneVersion.LUCENE_44))
    {
        if (side == Side.BACK)
        {
            throw new ArgumentException("Side.BACK is not supported anymore as of Lucene 4.4");
        }
    }
    else
    {
        // Legacy versions silently capped the gram size.
        maxGram = Math.Min(maxGram, 1024);
    }

    this.minGram = minGram;
    this.maxGram = maxGram;
    this.side = side;
    this.termAtt = AddAttribute<ICharTermAttribute>();
    this.offsetAtt = AddAttribute<IOffsetAttribute>();
    this.posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
}
/// <summary>
/// Creates a new WordDelimiterFilter
/// </summary>
/// <param name="matchVersion"> lucene compatibility version; must be 4.8+ </param>
/// <param name="in"> TokenStream to be filtered </param>
/// <param name="charTypeTable"> table containing character types </param>
/// <param name="configurationFlags"> Flags configuring the filter </param>
/// <param name="protWords"> If not null is the set of tokens to protect from being delimited </param>
/// <exception cref="System.ArgumentException">if matchVersion is before 4.8</exception>
public WordDelimiterFilter(LuceneVersion matchVersion, TokenStream @in, sbyte[] charTypeTable, int configurationFlags, CharArraySet protWords)
    : base(@in)
{
    // Converter artifact: instance-field initializers were moved into this guarded
    // helper; it must run before any field below is touched.
    if (!InstanceFieldsInitialized)
    {
        InitializeInstanceFields();
        InstanceFieldsInitialized = true;
    }
    if (!matchVersion.OnOrAfter(LuceneVersion.LUCENE_48))
    {
        throw new System.ArgumentException("This class only works with Lucene 4.8+. To emulate the old (broken) behavior of WordDelimiterFilter, use Lucene47WordDelimiterFilter");
    }
    this.flags = configurationFlags;
    this.protWords = protWords;
    this.iterator = new WordDelimiterIterator(charTypeTable, Has(SPLIT_ON_CASE_CHANGE), Has(SPLIT_ON_NUMERICS), Has(STEM_ENGLISH_POSSESSIVE));

    this.termAttribute = AddAttribute<ICharTermAttribute>();
    this.offsetAttribute = AddAttribute<IOffsetAttribute>();
    this.posIncAttribute = AddAttribute<IPositionIncrementAttribute>();
    this.typeAttribute = AddAttribute<ITypeAttribute>();
}
/// <summary>
/// Creates a new WordDelimiterFilter
/// </summary>
/// <param name="matchVersion"> lucene compatibility version; must be 4.8+ </param>
/// <param name="in"> TokenStream to be filtered </param>
/// <param name="charTypeTable"> table containing character types </param>
/// <param name="configurationFlags"> Flags configuring the filter </param>
/// <param name="protWords"> If not null is the set of tokens to protect from being delimited </param>
/// <exception cref="ArgumentException">if matchVersion is before 4.8</exception>
public WordDelimiterFilter(LuceneVersion matchVersion, TokenStream @in, byte[] charTypeTable, WordDelimiterFlags configurationFlags, CharArraySet protWords)
    : base(@in)
{
    // Attributes and helper objects are wired up before validation; keep this
    // order — the concatenation/sorter helpers capture `this`.
    this.termAttribute = AddAttribute <ICharTermAttribute>();
    this.offsetAttribute = AddAttribute <IOffsetAttribute>();
    this.posIncAttribute = AddAttribute <IPositionIncrementAttribute>();
    this.typeAttribute = AddAttribute <ITypeAttribute>();

    concat = new WordDelimiterConcatenation(this);
    concatAll = new WordDelimiterConcatenation(this);
    sorter = new OffsetSorter(this);

    if (!matchVersion.OnOrAfter(LuceneVersion.LUCENE_48))
    {
        throw new ArgumentException("This class only works with Lucene 4.8+. To emulate the old (broken) behavior of WordDelimiterFilter, use Lucene47WordDelimiterFilter");
    }
    this.flags = configurationFlags;
    this.protWords = protWords;
    this.iterator = new WordDelimiterIterator(charTypeTable, Has(WordDelimiterFlags.SPLIT_ON_CASE_CHANGE), Has(WordDelimiterFlags.SPLIT_ON_NUMERICS), Has(WordDelimiterFlags.STEM_ENGLISH_POSSESSIVE));
}
/// <summary>
/// Builds an analyzer with the given stop words, stem exclusions and stem-override
/// dictionary. For 3.1+ the overrides are compiled into a <see cref="StemmerOverrideFilter"/>
/// FST; older versions keep the raw map for the legacy <c>DutchStemFilter</c>.
/// </summary>
public DutchAnalyzer(LuceneVersion matchVersion, CharArraySet stopwords, CharArraySet stemExclusionTable, CharArrayMap<string> stemOverrideDict)
{
    this.matchVersion = matchVersion;
    this.stoptable = CharArraySet.UnmodifiableSet(CharArraySet.Copy(matchVersion, stopwords));
    this.excltable = CharArraySet.UnmodifiableSet(CharArraySet.Copy(matchVersion, stemExclusionTable));
#pragma warning disable 612, 618
    bool buildOverrideFilter = stemOverrideDict.Count != 0 && matchVersion.OnOrAfter(LuceneVersion.LUCENE_31);
#pragma warning restore 612, 618
    if (!buildOverrideFilter)
    {
        this.stemdict = null;
        this.origStemdict = CharArrayMap.UnmodifiableMap(CharArrayMap.Copy(matchVersion, stemOverrideDict));
        return;
    }

    this.origStemdict = null;
    // we don't need to ignore case here since we lowercase in this analyzer anyway
    StemmerOverrideFilter.Builder overrideBuilder = new StemmerOverrideFilter.Builder(false);
    CharArrayMap<string>.EntryIterator entries = (CharArrayMap<string>.EntryIterator)stemOverrideDict.EntrySet().GetEnumerator();
    CharsRef scratch = new CharsRef();
    while (entries.HasNext)
    {
        char[] key = entries.NextKey();
        scratch.CopyChars(key, 0, key.Length);
        overrideBuilder.Add(new string(scratch.Chars), entries.CurrentValue);
    }
    try
    {
        this.stemdict = overrideBuilder.Build();
    }
    catch (IOException ex)
    {
        throw new Exception("can not build stem dict", ex);
    }
}
/// <summary>
/// Builds an analyzer with the given stop words and no stem exclusions.
/// </summary>
/// <param name="matchVersion">Lucene compatibility version</param>
/// <param name="stopwords">stop words to remove</param>
public DutchAnalyzer(LuceneVersion matchVersion, CharArraySet stopwords)
    : this(matchVersion, stopwords, CharArraySet.EMPTY_SET,
#pragma warning disable 612, 618
          matchVersion.OnOrAfter(LuceneVersion.LUCENE_36) ?
#pragma warning restore 612, 618
          DefaultSetHolder.DEFAULT_STEM_DICT : CharArrayMap<string>.EmptyMap())
{
    // historically, this ctor never populated the stem dict!!!!!
    // so we populate it only for >= 3.6
}
/// <summary>
/// Rejects <c>enablePositionIncrements=false</c> for match versions 4.4+,
/// where disabling increments can create broken token streams.
/// </summary>
private static void CheckPositionIncrement(LuceneVersion version, bool enablePositionIncrements)
{
    if (enablePositionIncrements)
    {
        return; // always allowed
    }
#pragma warning disable 612, 618
    if (version.OnOrAfter(LuceneVersion.LUCENE_44))
#pragma warning restore 612, 618
    {
        throw new System.ArgumentException("enablePositionIncrements=false is not supported anymore as of Lucene 4.4 as it can create broken token streams");
    }
}
/// <summary>
/// Creates an EdgeNGramTokenFilter producing edge n-grams between minGram and
/// maxGram characters; Side.BACK is rejected for match versions 4.4+.
/// </summary>
public EdgeNGramTokenFilter(LuceneVersion version, TokenStream input, Side side, int minGram, int maxGram)
    : base(input)
{
    //if (version == null)
    //{
    //    throw new System.ArgumentException("version must not be null");
    //}
    // Checking definedness first is equivalent: Side.BACK is a defined value, so
    // the two throws can never compete for the same input.
    if (!Enum.IsDefined(typeof(Side), side))
    {
        throw new System.ArgumentException("sideLabel must be either front or back");
    }
    if (version.OnOrAfter(LuceneVersion.LUCENE_44) && side == Side.BACK)
    {
        throw new System.ArgumentException("Side.BACK is not supported anymore as of Lucene 4.4, use ReverseStringFilter up-front and afterward");
    }
    if (minGram < 1)
    {
        throw new System.ArgumentException("minGram must be greater than zero");
    }
    if (minGram > maxGram)
    {
        throw new System.ArgumentException("minGram must not be greater than maxGram");
    }

    this.version = version;
    this.charUtils = version.OnOrAfter(LuceneVersion.LUCENE_44)
        ? CharacterUtils.GetInstance(version)
        : CharacterUtils.Java4Instance;
    this.minGram = minGram;
    this.maxGram = maxGram;
    this.side = side;

    this.termAtt = AddAttribute<ICharTermAttribute>();
    this.offsetAtt = AddAttribute<IOffsetAttribute>();
    this.posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
    this.posLenAtt = AddAttribute<IPositionLengthAttribute>();
}
/// <summary>
/// Returns a <seealso cref="CharacterUtils"/> implementation according to the given
/// <seealso cref="LuceneVersion"/> instance.
/// </summary>
/// <param name="matchVersion"> a version instance </param>
/// <returns> the Java-5 (full Unicode) implementation for 3.1+, otherwise the
/// Java-4 compatible one </returns>
public static CharacterUtils GetInstance(LuceneVersion matchVersion)
{
#pragma warning disable 612, 618
    if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_31))
    {
        return JAVA_5;
    }
    return JAVA_4;
#pragma warning restore 612, 618
}
/// <summary>
/// Validates arguments, stores the gram bounds/side and registers attributes.
/// Side.BACK is rejected for 4.4+; older versions clamp maxGram to 1024.
/// </summary>
private void Init(LuceneVersion version, Side side, int minGram, int maxGram)
{
    //if (version == null)
    //{
    //    throw new System.ArgumentException("version must not be null");
    //}
    if (!Enum.IsDefined(typeof(Side), side))
    {
        throw new System.ArgumentException("sideLabel must be either front or back");
    }
    if (minGram < 1)
    {
        throw new System.ArgumentException("minGram must be greater than zero");
    }
    if (minGram > maxGram)
    {
        throw new System.ArgumentException("minGram must not be greater than maxGram");
    }

    if (version.OnOrAfter(LuceneVersion.LUCENE_44))
    {
        if (side == Side.BACK)
        {
            throw new System.ArgumentException("Side.BACK is not supported anymore as of Lucene 4.4");
        }
    }
    else
    {
        // Legacy versions silently capped the gram size.
        maxGram = Math.Min(maxGram, 1024);
    }

    this.minGram = minGram;
    this.maxGram = maxGram;
    this.side = side;
    this.termAtt = AddAttribute<ICharTermAttribute>();
    this.offsetAtt = AddAttribute<IOffsetAttribute>();
    this.posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
}
/// <summary>
/// Partially reverses the given input buffer in-place from the given offset
/// up to the given length, keeping UTF-16 surrogate pairs intact (for 3.1+).
/// </summary>
/// <param name="matchVersion"> See <a href="#version">above</a>; pre-3.1 versions
/// use the legacy (surrogate-unaware) reversal </param>
/// <param name="buffer"> the input char array to reverse </param>
/// <param name="start"> the offset from where to reverse the buffer </param>
/// <param name="len"> the length in the buffer up to where the
/// buffer should be reversed </param>
public static void Reverse(LuceneVersion matchVersion, char[] buffer, int start, int len)
{
#pragma warning disable 612, 618
    if (!matchVersion.OnOrAfter(LuceneVersion.LUCENE_31))
    {
        ReverseUnicode3(buffer, start, len);
#pragma warning restore 612, 618
        return;
    }
    /* modified version of Apache Harmony AbstractStringBuilder reverse0() */
    if (len < 2)
    {
        return; // nothing to swap
    }
    int end = (start + len) - 1;
    char frontHigh = buffer[start];
    char endLow = buffer[end];
    // allowFrontSur/allowEndSur are cleared when the previous iteration consumed
    // half of a surrogate pair, so the other half is not paired again.
    bool allowFrontSur = true, allowEndSur = true;
    int mid = start + (len >> 1);
    // Walk inward from both ends, swapping one unit (or one surrogate pair) per step.
    for (int i = start; i < mid; ++i, --end)
    {
        char frontLow = buffer[i + 1];
        char endHigh = buffer[end - 1];
        bool surAtFront = allowFrontSur && char.IsSurrogatePair(frontHigh, frontLow);
        if (surAtFront && (len < 3))
        {
            // nothing to do since surAtFront is allowed and 1 char left
            return;
        }
        bool surAtEnd = allowEndSur && char.IsSurrogatePair(endHigh, endLow);
        allowFrontSur = allowEndSur = true;
        if (surAtFront == surAtEnd)
        {
            if (surAtFront)
            {
                // both surrogates
                buffer[end] = frontLow;
                buffer[--end] = frontHigh;
                buffer[i] = endHigh;
                buffer[++i] = endLow;
                frontHigh = buffer[i + 1];
                endLow = buffer[end - 1];
            }
            else
            {
                // neither surrogates
                buffer[end] = frontHigh;
                buffer[i] = endLow;
                frontHigh = frontLow;
                endLow = endHigh;
            }
        }
        else
        {
            if (surAtFront)
            {
                // surrogate only at the front
                buffer[end] = frontLow;
                buffer[i] = endLow;
                endLow = endHigh;
                allowFrontSur = false;
            }
            else
            {
                // surrogate only at the end
                buffer[end] = frontHigh;
                buffer[i] = endHigh;
                frontHigh = frontLow;
                allowEndSur = false;
            }
        }
    }
    if ((len & 0x01) == 1 && !(allowFrontSur && allowEndSur))
    {
        // only if odd length: place the leftover middle unit
        buffer[end] = allowFrontSur ? endLow : frontHigh;
    }
}
/// <summary>
/// Builds an analyzer with the default stop words:
/// <seealso cref="#getDefaultStopSet()"/>.
/// </summary>
/// <param name="matchVersion">Lucene compatibility version; versions before 3.1
/// use the legacy 3.0 stop-word set.</param>
public GermanAnalyzer(LuceneVersion matchVersion)
#pragma warning disable 612, 618
    : this(matchVersion, matchVersion.OnOrAfter(LuceneVersion.LUCENE_31) ? DefaultSetHolder.DEFAULT_SET : DefaultSetHolder.DEFAULT_SET_30)
#pragma warning restore 612, 618
{
}
/// <summary>
/// Validates arguments, registers attributes and sizes the internal buffers.
/// Only match versions 4.4+ are supported by this class.
/// </summary>
/// <exception cref="System.ArgumentException">if the version is pre-4.4, minGram &lt; 1,
/// or minGram &gt; maxGram</exception>
private void Init(LuceneVersion version, int minGram, int maxGram, bool edgesOnly)
{
#pragma warning disable 612, 618
    if (!version.OnOrAfter(LuceneVersion.LUCENE_44))
#pragma warning restore 612, 618
    {
        throw new System.ArgumentException("This class only works with Lucene 4.4+. To emulate the old (broken) behavior of NGramTokenizer, use Lucene43NGramTokenizer/Lucene43EdgeNGramTokenizer");
    }
    // The guard above already rejected pre-4.4 versions, so the original
    // ternary's CharacterUtils.Java4Instance branch was dead code.
    charUtils = CharacterUtils.GetInstance(version);
    if (minGram < 1)
    {
        throw new System.ArgumentException("minGram must be greater than zero");
    }
    if (minGram > maxGram)
    {
        throw new System.ArgumentException("minGram must not be greater than maxGram");
    }

    termAtt = AddAttribute<ICharTermAttribute>();
    posIncAtt = AddAttribute<IPositionIncrementAttribute>();
    posLenAtt = AddAttribute<IPositionLengthAttribute>();
    offsetAtt = AddAttribute<IOffsetAttribute>();

    this.minGram = minGram;
    this.maxGram = maxGram;
    this.edgesOnly = edgesOnly;
    // 2 * maxGram in case all code points require 2 chars and + 1024 for buffering
    // to not keep polling the Reader.
    charBuffer = CharacterUtils.NewCharacterBuffer(2 * maxGram + 1024);
    buffer = new int[charBuffer.Buffer.Length];
    // Make the term att large enough up front.
    termAtt.ResizeBuffer(2 * maxGram);
}
/// <summary>
/// Initializes a query parser. Called by the QueryParser constructor.
/// </summary>
/// <param name="matchVersion">Lucene version to match.</param>
/// <param name="f">the default field for query terms.</param>
/// <param name="a">used to find terms in the query text.</param>
public virtual void Init(LuceneVersion matchVersion, string f, Analyzer a)
{
    Analyzer = a;
    field = f;
    // Auto-generated phrase queries were the default before 3.1.
#pragma warning disable 612, 618
    AutoGeneratePhraseQueries = !matchVersion.OnOrAfter(LuceneVersion.LUCENE_31);
#pragma warning restore 612, 618
}
/// <summary>
/// Returns a <seealso cref="CharacterUtils"/> implementation according to the given
/// <seealso cref="LuceneVersion"/> instance.
/// </summary>
/// <param name="matchVersion"> a version instance </param>
/// <returns> the Java-5 (full Unicode) implementation for 3.1+, otherwise the
/// Java-4 compatible one </returns>
public static CharacterUtils GetInstance(LuceneVersion matchVersion)
{
    if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_31))
    {
        return JAVA_5;
    }
    return JAVA_4;
}