/// <summary>
/// Creates a new ThaiWordFilter with the specified match version.
/// </summary>
/// <param name="matchVersion"> Lucene compatibility version </param>
/// <param name="input"> the <seealso cref="TokenStream"/> to filter </param>
public ThaiWordFilter(Version matchVersion, TokenStream input)
    : base(matchVersion.onOrAfter(Version.LUCENE_31) ? input : new LowerCaseFilter(matchVersion, input))
{
    // Thai segmentation needs dictionary-based BreakIterator support in the runtime.
    if (!DBBI_AVAILABLE)
    {
        throw new System.NotSupportedException("This JRE does not have support for Thai segmentation");
    }
    handlePosIncr = matchVersion.onOrAfter(Version.LUCENE_31);
}
/// <summary>
/// Returns the JFlex scanner implementation matching the requested
/// compatibility version (checked from newest to oldest).
/// </summary>
/// <param name="matchVersion"> Lucene compatibility version </param>
private StandardTokenizerInterface getScannerFor(Version matchVersion)
{
    // best effort NPE if you dont call reset
    if (matchVersion.onOrAfter(Version.LUCENE_47))
    {
        return new UAX29URLEmailTokenizerImpl(input);
    }
    else if (matchVersion.onOrAfter(Version.LUCENE_40))
    {
        return new UAX29URLEmailTokenizerImpl40(input);
    }
    else if (matchVersion.onOrAfter(Version.LUCENE_36))
    {
        return new UAX29URLEmailTokenizerImpl36(input);
    }
    else if (matchVersion.onOrAfter(Version.LUCENE_34))
    {
        return new UAX29URLEmailTokenizerImpl34(input);
    }
    else
    {
        return new UAX29URLEmailTokenizerImpl31(input);
    }
}
/// <summary>
/// Construct a new RussianLetterTokenizer.
/// </summary>
/// <param name="matchVersion"> Lucene version to match; see <a href="#version">above</a> </param>
/// <param name="in"> the input to split up into tokens </param>
public RussianLetterTokenizer(Version matchVersion, Reader @in)
    : base(matchVersion, @in)
{
}
/// <summary>
/// Creates a new UAX29URLEmailTokenizer with a given
/// <seealso cref="org.apache.lucene.util.AttributeSource.AttributeFactory"/>.
/// </summary>
/// <param name="matchVersion"> Lucene compatibility version </param>
/// <param name="factory"> attribute factory to use </param>
/// <param name="input"> the reader to tokenize </param>
public UAX29URLEmailTokenizer(Version matchVersion, AttributeFactory factory, Reader input)
    : base(factory, input)
{
    this.scanner = getScannerFor(matchVersion);
}
/// <summary>
/// Creates an EdgeNGramTokenizer that can generate n-grams in the sizes of the given range.
/// </summary>
/// <param name="version"> the <a href="#version">Lucene match version</a> </param>
/// <param name="factory"> <seealso cref="org.apache.lucene.util.AttributeSource.AttributeFactory"/> to use </param>
/// <param name="input"> <seealso cref="Reader"/> holding the input to be tokenized </param>
/// <param name="minGram"> the smallest n-gram to generate </param>
/// <param name="maxGram"> the largest n-gram to generate </param>
public EdgeNGramTokenizer(Version version, AttributeFactory factory, Reader input, int minGram, int maxGram)
    : base(version, factory, input, minGram, maxGram, true)
{
}
/// <summary>
/// Creates a StandardFilter wrapping the given stream.
/// </summary>
/// <param name="matchVersion"> Lucene compatibility version </param>
/// <param name="in"> the <seealso cref="TokenStream"/> to filter </param>
public StandardFilter(Version matchVersion, TokenStream @in)
    : base(@in)
{
    this.matchVersion = matchVersion;
}
/// <summary>
/// Builds an analyzer with the given stop words. If a non-empty stem exclusion
/// set is provided this analyzer will add a <seealso cref="SetKeywordMarkerFilter"/>
/// before stemming.
/// </summary>
/// <param name="matchVersion"> lucene compatibility version </param>
/// <param name="stopwords"> a stopword set </param>
/// <param name="stemExclusionSet"> a set of terms not to be stemmed </param>
public EnglishAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet)
    : base(matchVersion, stopwords)
{
    // Defensive copy so later mutation of the caller's set cannot affect us.
    this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stemExclusionSet));
}
/// <summary>
/// Attaches a fresh JFlex scanner to the current input reader.
/// </summary>
/// <param name="matchVersion"> Lucene compatibility version (currently unused here) </param>
private void init(Version matchVersion)
{
    this.scanner = new ClassicTokenizerImpl(input);
}
/// <summary>
/// Builds an analyzer with the given stop words.
/// </summary>
/// <param name="matchVersion"> lucene compatibility version </param>
/// <param name="stopwords"> a stopword set </param>
public PersianAnalyzer(Version matchVersion, CharArraySet stopwords)
    : base(matchVersion, stopwords)
{
}
/// <summary>
/// Construct a new ArabicLetterTokenizer.
/// </summary>
/// <param name="matchVersion"> Lucene version to match; see <a href="#version">above</a> </param>
/// <param name="in"> the input to split up into tokens </param>
public ArabicLetterTokenizer(Version matchVersion, Reader @in)
    : base(matchVersion, @in)
{
}
/// <summary>
/// Creates an EnglishPossessiveFilter over the given stream.
/// </summary>
/// <param name="version"> Lucene compatibility version </param>
/// <param name="input"> the <seealso cref="TokenStream"/> to filter </param>
public EnglishPossessiveFilter(Version version, TokenStream input)
    : base(input)
{
    this.matchVersion = version;
}
/// <summary>
/// Builds an analyzer with the given stop words, stem exclusion table and
/// stem override dictionary.
/// </summary>
/// <param name="matchVersion"> lucene compatibility version </param>
/// <param name="stopwords"> a stopword set </param>
/// <param name="stemExclusionTable"> a set of terms not to be stemmed </param>
/// <param name="stemOverrideDict"> dictionary of term-to-stem overrides </param>
public DutchAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionTable, CharArrayMap<string> stemOverrideDict)
{
    this.matchVersion = matchVersion;
    // Defensive, immutable copies of the caller-supplied sets.
    this.stoptable = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stopwords));
    this.excltable = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stemExclusionTable));
    if (stemOverrideDict.Empty || !matchVersion.onOrAfter(Version.LUCENE_31))
    {
        // Pre-3.1 (or empty dict): keep the raw map; no FST-backed stem dict is built.
        this.stemdict = null;
        this.origStemdict = CharArrayMap.unmodifiableMap(CharArrayMap.copy(matchVersion, stemOverrideDict));
    }
    else
    {
        // 3.1+: compile the override dictionary into a StemmerOverrideFilter map.
        this.origStemdict = null;
        // we don't need to ignore case here since we lowercase in this analyzer anyway
        StemmerOverrideFilter.Builder builder = new StemmerOverrideFilter.Builder(false);
        CharArrayMap<string>.EntryIterator iter = stemOverrideDict.entrySet().GetEnumerator();
        CharsRef spare = new CharsRef();
        while (iter.hasNext())
        {
            // `spare` is reused for each key to avoid per-entry allocations.
            char[] nextKey = iter.nextKey();
            spare.copyChars(nextKey, 0, nextKey.Length);
            builder.add(spare, iter.currentValue());
        }
        try
        {
            this.stemdict = builder.build();
        }
        catch (IOException ex)
        {
            // Building from in-memory data should not fail; surface it as a hard error.
            throw new Exception("can not build stem dict", ex);
        }
    }
}
/// <summary>
/// Builds an analyzer with the given stop words and stem exclusion table.
/// </summary>
/// <param name="matchVersion"> lucene compatibility version </param>
/// <param name="stopwords"> a stopword set </param>
/// <param name="stemExclusionTable"> a set of terms not to be stemmed </param>
public DutchAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionTable)
    : this(matchVersion, stopwords, stemExclusionTable, matchVersion.onOrAfter(Version.LUCENE_36) ? DefaultSetHolder.DEFAULT_STEM_DICT : CharArrayMap.emptyMap<string>())
{
    // Historically this ctor never populated the stem dict,
    // so we supply the default dict only for >= 3.6.
}
/// <summary>
/// Builds an analyzer with the default stop words (<seealso cref="#getDefaultStopSet()"/>)
/// and a few default entries for the stem exclusion table.
/// </summary>
/// <param name="matchVersion"> lucene compatibility version </param>
public DutchAnalyzer(Version matchVersion)
    : this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET, CharArraySet.EMPTY_SET, DefaultSetHolder.DEFAULT_STEM_DICT)
{
    // Historically, only this ctor populated the stem dict.
}
/// <summary>
/// Builds an analyzer with the given stop words.
/// </summary>
/// <param name="matchVersion"> lucene compatibility version </param>
/// <param name="stopwords"> a stopword set </param>
public PortugueseAnalyzer(Version matchVersion, CharArraySet stopwords)
    : this(matchVersion, stopwords, CharArraySet.EMPTY_SET)
{
}
/// <summary>
/// Builds an analyzer with the given stop words. If a non-empty stem exclusion
/// set is provided this analyzer will add a <seealso cref="SetKeywordMarkerFilter"/>
/// before stemming.
/// </summary>
/// <param name="matchVersion"> lucene compatibility version </param>
/// <param name="stopwords"> a stopword set </param>
/// <param name="stemExclusionSet"> a set of terms not to be stemmed </param>
public DanishAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet)
    : base(matchVersion, stopwords)
{
    // Defensive copy so later mutation of the caller's set cannot affect us.
    this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stemExclusionSet));
}
/// <summary>
/// Wraps a <seealso cref="StandardAnalyzer"/> with the given shingle size range.
/// </summary>
/// <param name="matchVersion"> Lucene compatibility version </param>
/// <param name="minShingleSize"> minimum shingle size </param>
/// <param name="maxShingleSize"> maximum shingle size </param>
public ShingleAnalyzerWrapper(Version matchVersion, int minShingleSize, int maxShingleSize)
    : this(new StandardAnalyzer(matchVersion), minShingleSize, maxShingleSize)
{
}
/// <summary>
/// Creates a new instance of the <seealso cref="ClassicTokenizer"/>. Attaches
/// the <code>input</code> to the newly created JFlex scanner.
///
/// See http://issues.apache.org/jira/browse/LUCENE-1068
/// </summary>
/// <param name="matchVersion"> Lucene compatibility version </param>
/// <param name="input"> The input reader </param>
public ClassicTokenizer(Version matchVersion, Reader input)
    : base(input)
{
    init(matchVersion);
}
/// <summary>
/// Wraps a <seealso cref="StandardAnalyzer"/> using the default shingle size range.
/// </summary>
/// <param name="matchVersion"> Lucene compatibility version </param>
public ShingleAnalyzerWrapper(Version matchVersion)
    : this(matchVersion, ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE, ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE)
{
}
/// <summary>
/// Builds an analyzer with the default stop words:
/// <seealso cref="#DEFAULT_STOPWORD_FILE"/>.
/// </summary>
/// <param name="version"> lucene compatibility version </param>
public HindiAnalyzer(Version version)
    : this(version, DefaultSetHolder.DEFAULT_STOP_SET)
{
}
/// <summary>
/// Builds an analyzer with the given stop words.
/// </summary>
/// <param name="matchVersion"> lucene compatibility version </param>
/// <param name="stopwords"> a stopword set </param>
/// <param name="stemExclusionSet"> a stemming exclusion set </param>
public GermanAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet)
    : base(matchVersion, stopwords)
{
    // Defensive copy so later mutation of the caller's set cannot affect us.
    exclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stemExclusionSet));
}
/// <summary>
/// Create a GreekLowerCaseFilter that normalizes Greek token text.
/// </summary>
/// <param name="matchVersion"> Lucene compatibility version; see <a href="#version">above</a> </param>
/// <param name="in"> TokenStream to filter </param>
public GreekLowerCaseFilter(Version matchVersion, TokenStream @in)
    : base(@in)
{
    // Version-aware character utilities (supplementary-codepoint handling differs by version).
    this.charUtils = CharacterUtils.getInstance(matchVersion);
}
/// <summary>
/// Creates an EdgeNGramTokenizer that can generate n-grams in the sizes of the given range.
/// </summary>
/// <param name="version"> the <a href="#version">Lucene match version</a> </param>
/// <param name="input"> <seealso cref="Reader"/> holding the input to be tokenized </param>
/// <param name="minGram"> the smallest n-gram to generate </param>
/// <param name="maxGram"> the largest n-gram to generate </param>
public EdgeNGramTokenizer(Version version, Reader input, int minGram, int maxGram)
    : base(version, input, minGram, maxGram, true)
{
}
/// <summary>
/// Creates a new instance of the UAX29URLEmailTokenizer. Attaches
/// the <code>input</code> to the newly created JFlex scanner.
/// </summary>
/// <param name="matchVersion"> Lucene compatibility version </param>
/// <param name="input"> The input reader </param>
public UAX29URLEmailTokenizer(Version matchVersion, Reader input)
    : base(input)
{
    this.scanner = getScannerFor(matchVersion);
}
/// <summary>
/// Builds an analyzer with the given stop words.
/// </summary>
/// <param name="matchVersion"> lucene compatibility version </param>
/// <param name="stopwords"> a stopword set </param>
public NorwegianAnalyzer(Version matchVersion, CharArraySet stopwords)
    : this(matchVersion, stopwords, CharArraySet.EMPTY_SET)
{
}
/// <summary>
/// Construct a new IndicTokenizer using a given attribute factory.
/// </summary>
/// <param name="matchVersion"> Lucene compatibility version </param>
/// <param name="factory"> attribute factory to use </param>
/// <param name="input"> the input to split up into tokens </param>
public IndicTokenizer(Version matchVersion, AttributeFactory factory, Reader input)
    : base(matchVersion, factory, input)
{
}
/// <summary>
/// Construct a new IndicTokenizer.
/// </summary>
/// <param name="matchVersion"> Lucene compatibility version </param>
/// <param name="input"> the input to split up into tokens </param>
public IndicTokenizer(Version matchVersion, Reader input)
    : base(matchVersion, input)
{
}
/// <summary>
/// Construct a new RussianLetterTokenizer using a given
/// <seealso cref="org.apache.lucene.util.AttributeSource.AttributeFactory"/>.
/// </summary>
/// <param name="matchVersion"> Lucene version to match; see <a href="#version">above</a> </param>
/// <param name="factory"> the attribute factory to use for this <seealso cref="Tokenizer"/> </param>
/// <param name="in"> the input to split up into tokens </param>
public RussianLetterTokenizer(Version matchVersion, AttributeFactory factory, Reader @in)
    : base(matchVersion, factory, @in)
{
}
/// <summary>
/// Builds an analyzer with the default stop words; the stop set used depends
/// on the compatibility version (a different set was shipped before 3.1).
/// </summary>
/// <param name="matchVersion"> lucene compatibility version </param>
public RussianAnalyzer(Version matchVersion)
    : this(matchVersion, matchVersion.onOrAfter(Version.LUCENE_31) ? DefaultSetHolder.DEFAULT_STOP_SET : DefaultSetHolder.DEFAULT_STOP_SET_30)
{
}
/// <summary>
/// Builds an analyzer with the given stop words.
/// </summary>
/// <param name="matchVersion"> lucene compatibility version </param>
/// <param name="stopwords"> a stopword set </param>
public DanishAnalyzer(Version matchVersion, CharArraySet stopwords)
    : this(matchVersion, stopwords, CharArraySet.EMPTY_SET)
{
}
/// <summary>
/// Builds the named analyzer with no stop words.
/// </summary>
/// <param name="matchVersion"> lucene compatibility version </param>
/// <param name="name"> the Snowball stemmer name </param>
public SnowballAnalyzer(Version matchVersion, string name)
{
    this.name = name;
    this.matchVersion = matchVersion;
}
/// <summary>
/// Builds an analyzer with the default stop words:
/// <seealso cref="#DEFAULT_STOPWORD_FILE"/>.
/// </summary>
/// <param name="matchVersion"> lucene compatibility version </param>
public PersianAnalyzer(Version matchVersion)
    : this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET)
{
}
/// <summary>
/// Builds the named analyzer with the given stop words.
/// </summary>
/// <param name="matchVersion"> lucene compatibility version </param>
/// <param name="name"> the Snowball stemmer name </param>
/// <param name="stopWords"> a stopword set </param>
public SnowballAnalyzer(Version matchVersion, string name, CharArraySet stopWords)
    : this(matchVersion, name)
{
    // Defensive copy so later mutation of the caller's set cannot affect us.
    stopSet = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stopWords));
}
/// <summary>
/// Builds an analyzer with the default stop words: <seealso cref="#getDefaultStopSet"/>.
/// </summary>
/// <param name="matchVersion"> lucene compatibility version </param>
public EnglishAnalyzer(Version matchVersion)
    : this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET)
{
}
/// <summary>
/// Creates a new ClassicTokenizer with a given
/// <seealso cref="org.apache.lucene.util.AttributeSource.AttributeFactory"/>.
/// </summary>
/// <param name="matchVersion"> Lucene compatibility version </param>
/// <param name="factory"> attribute factory to use </param>
/// <param name="input"> the input reader </param>
public ClassicTokenizer(Version matchVersion, AttributeFactory factory, Reader input)
    : base(factory, input)
{
    init(matchVersion);
}
/// <summary>
/// Builds an analyzer with the given stop words.
/// </summary>
/// <param name="matchVersion"> lucene compatibility version </param>
/// <param name="stopwords"> a stopword set </param>
public EnglishAnalyzer(Version matchVersion, CharArraySet stopwords)
    : this(matchVersion, stopwords, CharArraySet.EMPTY_SET)
{
}
/// <summary>
/// Verifies that the Synonym filter factory correctly delegates to the
/// requested <c>tokenizerFactory</c>, accepting both plain and
/// "tokenizerFactory."-prefixed arguments, and rejects a missing pattern
/// argument as well as an unexpected bogus argument.
/// </summary>
/// <param name="ver"> Lucene compatibility version used to build the factories </param>
/// <param name="delegatorClass"> expected concrete delegator type for the given version </param>
protected internal virtual void doTestTokenizerFactoryArguments(Version ver, Type delegatorClass)
{
    // NOTE: Type.FullName does not always match Java's Class.getName output exactly.
    string clazz = typeof(PatternTokenizerFactory).FullName;
    TokenFilterFactory factory = null;

    // simple arg form
    factory = tokenFilterFactory("Synonym", ver, "synonyms", "synonyms.txt", "tokenizerFactory", clazz, "pattern", "(.*)", "group", "0");
    assertDelegator(factory, delegatorClass);

    // prefix form
    factory = tokenFilterFactory("Synonym", ver, "synonyms", "synonyms.txt", "tokenizerFactory", clazz, "tokenizerFactory.pattern", "(.*)", "tokenizerFactory.group", "0");
    assertDelegator(factory, delegatorClass);

    // sanity check that sub-PatternTokenizerFactory fails w/o pattern
    try
    {
        factory = tokenFilterFactory("Synonym", ver, "synonyms", "synonyms.txt", "tokenizerFactory", clazz);
        fail("tokenizerFactory should have complained about missing pattern arg");
    }
    catch (Exception)
    {
        // expected: missing mandatory pattern argument
    }

    // sanity check that sub-PatternTokenizerFactory fails on an unexpected arg
    try
    {
        factory = tokenFilterFactory("Synonym", ver, "synonyms", "synonyms.txt", "tokenizerFactory", clazz, "tokenizerFactory.pattern", "(.*)", "tokenizerFactory.bogusbogusbogus", "bogus", "tokenizerFactory.group", "0");
        // BUGFIX: this message previously duplicated the "missing pattern arg" text
        // from the case above; this case actually tests an unexpected bogus argument.
        fail("tokenizerFactory should have complained about the unexpected 'tokenizerFactory.bogusbogusbogus' arg");
    }
    catch (Exception)
    {
        // expected: unknown argument rejected
    }
}
/// <summary>
/// Builds an analyzer with the given stop words.
/// </summary>
/// <param name="version"> lucene compatibility version </param>
/// <param name="stopwords"> a stopword set </param>
public HindiAnalyzer(Version version, CharArraySet stopwords)
    : this(version, stopwords, CharArraySet.EMPTY_SET)
{
}