/// <summary>
 /// Creates a new ThaiWordFilter with the specified match version. </summary>
 public ThaiWordFilter(Version matchVersion, TokenStream input)
     : base(matchVersion.onOrAfter(Version.LUCENE_31) ? input : new LowerCaseFilter(matchVersion, input))
 {
     if (!DBBI_AVAILABLE)
     {
       throw new System.NotSupportedException("This JRE does not have support for Thai segmentation");
     }
     handlePosIncr = matchVersion.onOrAfter(Version.LUCENE_31);
 }
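A minimal usage sketch for the constructor above; it assumes the Java-converted surface used throughout these snippets (a StandardTokenizer(Version, Reader) constructor and Java-style method casing), so treat names as assumptions rather than the released API:
 // Hypothetical sketch: wrap a StandardTokenizer with ThaiWordFilter.
 // The filter re-segments Thai runs with the dictionary-based BreakIterator (DBBI) checked in the ctor above.
 public static TokenStream BuildThaiChain(Reader reader)
 {
     TokenStream stream = new StandardTokenizer(Version.LUCENE_36, reader);
     // for pre-3.1 match versions the ctor above also inserts a LowerCaseFilter
     return new ThaiWordFilter(Version.LUCENE_36, stream);
 }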
Example #2
 private StandardTokenizerInterface getScannerFor(Version matchVersion)
 {
     // best-effort NPE if you don't call reset
     if (matchVersion.onOrAfter(Version.LUCENE_47))
     {
         return(new UAX29URLEmailTokenizerImpl(input));
     }
     else if (matchVersion.onOrAfter(Version.LUCENE_40))
     {
         return(new UAX29URLEmailTokenizerImpl40(input));
     }
     else if (matchVersion.onOrAfter(Version.LUCENE_36))
     {
         return(new UAX29URLEmailTokenizerImpl36(input));
     }
     else if (matchVersion.onOrAfter(Version.LUCENE_34))
     {
         return(new UAX29URLEmailTokenizerImpl34(input));
     }
     else
     {
         return(new UAX29URLEmailTokenizerImpl31(input));
     }
 }
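The cascade above is why matchVersion matters: each release froze its JFlex grammar into a separate *Impl class, so an older constant reproduces that release's tokenization. A hedged sketch from the caller's side, using only the public Reader constructor shown further below (Example #26); StringReader as the Reader source is an assumption:
 // Hypothetical sketch: the Version constant picks the scanner generation.
 Tokenizer current = new UAX29URLEmailTokenizer(Version.LUCENE_47, new StringReader("mail dev@example.com")); // UAX29URLEmailTokenizerImpl
 Tokenizer legacy = new UAX29URLEmailTokenizer(Version.LUCENE_36, new StringReader("mail dev@example.com"));  // UAX29URLEmailTokenizerImpl36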
 /// <summary>
 /// Construct a new RussianLetterTokenizer. </summary>
 /// <param name="matchVersion">
 ///          Lucene version to match; see <a href="#version">above</a> </param>
 /// <param name="in">
 ///          the input to split up into tokens </param>
 public RussianLetterTokenizer(Version matchVersion, Reader @in)
     : base(matchVersion, @in)
 {
 }
Example #4
 /// <summary>
 /// Creates a new UAX29URLEmailTokenizer with a given <seealso cref="org.apache.lucene.util.AttributeSource.AttributeFactory"/>
 /// </summary>
 public UAX29URLEmailTokenizer(Version matchVersion, AttributeFactory factory, Reader input) : base(factory, input)
 {
     this.scanner = getScannerFor(matchVersion);
 }
Example #5
 /// <summary>
 /// Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
 /// </summary>
 /// <param name="version"> the <a href="#version">Lucene match version</a> </param>
 /// <param name="factory"> <seealso cref="org.apache.lucene.util.AttributeSource.AttributeFactory"/> to use </param>
 /// <param name="input"> <seealso cref="Reader"/> holding the input to be tokenized </param>
 /// <param name="minGram"> the smallest n-gram to generate </param>
 /// <param name="maxGram"> the largest n-gram to generate </param>
 public EdgeNGramTokenizer(Version version, AttributeFactory factory, Reader input, int minGram, int maxGram) : base(version, factory, input, minGram, maxGram, true)
 {
 }
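A short sketch of what the gram range means for the constructor above (minus the AttributeFactory): with minGram=1 and maxGram=3 the tokenizer emits edge n-grams anchored at the start of the input. StringReader as the Reader source is an assumption of this sketch:
 // Hypothetical sketch: edge n-grams of length 1..3 from the front of the input.
 Tokenizer edges = new EdgeNGramTokenizer(Version.LUCENE_47, new StringReader("lucene"), 1, 3);
 // expected terms: "l", "lu", "luc"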
Example #6
 public StandardFilter(Version matchVersion, TokenStream @in) : base(@in)
 {
     this.matchVersion = matchVersion;
 }
Example #7
 /// <summary>
 /// Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
 /// provided this analyzer will add a <seealso cref="SetKeywordMarkerFilter"/> before
 /// stemming.
 /// </summary>
 /// <param name="matchVersion"> lucene compatibility version </param>
 /// <param name="stopwords"> a stopword set </param>
 /// <param name="stemExclusionSet"> a set of terms not to be stemmed </param>
 public EnglishAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) : base(matchVersion, stopwords)
 {
     this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stemExclusionSet));
 }
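A hedged sketch of the stem-exclusion behaviour described above: terms in the exclusion set are marked by SetKeywordMarkerFilter and left unstemmed. The CharArraySet constructor and getDefaultStopSet accessor follow the Java-converted naming used in these snippets and should be treated as assumptions:
 // Hypothetical sketch: keep "raining" from being stemmed by the English stemmer.
 CharArraySet exclusions = new CharArraySet(Version.LUCENE_47, 1, true); // (matchVersion, startSize, ignoreCase)
 exclusions.add("raining");
 Analyzer english = new EnglishAnalyzer(Version.LUCENE_47, EnglishAnalyzer.getDefaultStopSet(), exclusions);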
 private void init(Version matchVersion)
 {
     this.scanner = new ClassicTokenizerImpl(input);
 }
 /// <summary>
 /// Builds an analyzer with the given stop words 
 /// </summary>
 /// <param name="matchVersion">
 ///          lucene compatibility version </param>
 /// <param name="stopwords">
 ///          a stopword set </param>
 public PersianAnalyzer(Version matchVersion, CharArraySet stopwords)
     : base(matchVersion, stopwords)
 {
 }
 /// <summary>
 /// Construct a new ArabicLetterTokenizer. </summary>
 /// <param name="matchVersion"> Lucene version
 /// to match See <seealso cref="<a href="#version">above</a>"/>
 /// </param>
 /// <param name="in">
 ///          the input to split up into tokens </param>
 public ArabicLetterTokenizer(Version matchVersion, Reader @in)
     : base(matchVersion, @in)
 {
 }
 public EnglishPossessiveFilter(Version version, TokenStream input)
     : base(input)
 {
     this.matchVersion = version;
 }
Example #12
 public DutchAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionTable, CharArrayMap<string> stemOverrideDict)
 {
     this.matchVersion = matchVersion;
     this.stoptable = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stopwords));
     this.excltable = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stemExclusionTable));
     if (stemOverrideDict.Empty || !matchVersion.onOrAfter(Version.LUCENE_31))
     {
       this.stemdict = null;
       this.origStemdict = CharArrayMap.unmodifiableMap(CharArrayMap.copy(matchVersion, stemOverrideDict));
     }
     else
     {
       this.origStemdict = null;
       // we don't need to ignore case here since we lowercase in this analyzer anyway
       StemmerOverrideFilter.Builder builder = new StemmerOverrideFilter.Builder(false);
       CharArrayMap<string>.EntryIterator iter = stemOverrideDict.entrySet().GetEnumerator();
       CharsRef spare = new CharsRef();
        while (iter.hasNext())
        {
          char[] nextKey = iter.nextKey();
          spare.copyChars(nextKey, 0, nextKey.Length);
          builder.add(spare, iter.currentValue());
        }
        try
        {
          this.stemdict = builder.build();
        }
        catch (IOException ex)
        {
          throw new Exception("can not build stem dict", ex);
        }
     }
 }
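A hedged sketch of supplying the stem-override map consumed above: on 3.1+ the entries are compiled into a StemmerOverrideFilter, on earlier versions they are kept as a plain CharArrayMap. The CharArrayMap constructor, put, and getDefaultStopSet calls mirror the Java-converted API and are assumptions:
 // Hypothetical sketch: pin the stem of one word, let the Dutch stemmer handle the rest.
 CharArrayMap<string> overrides = new CharArrayMap<string>(Version.LUCENE_47, 1, false); // (matchVersion, startSize, ignoreCase)
 overrides.put("fiets", "fiets"); // always keep "fiets" as-is
 Analyzer dutch = new DutchAnalyzer(Version.LUCENE_47, DutchAnalyzer.getDefaultStopSet(), CharArraySet.EMPTY_SET, overrides);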
Example #13
 public DutchAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionTable)
     : this(matchVersion, stopwords, stemExclusionTable, matchVersion.onOrAfter(Version.LUCENE_36) ? DefaultSetHolder.DEFAULT_STEM_DICT : CharArrayMap.emptyMap<string>())
 {
      // historically, this ctor never populated the stem dict,
      // so we populate it only for >= 3.6
 }
Example #14
 /// <summary>
 /// Builds an analyzer with the default stop words (<seealso cref="#getDefaultStopSet()"/>) 
 /// and a few default entries for the stem exclusion table.
 /// 
 /// </summary>
 public DutchAnalyzer(Version matchVersion)
     : this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET, CharArraySet.EMPTY_SET, DefaultSetHolder.DEFAULT_STEM_DICT)
 {
     // historically, only this ctor populated the stem dict!!!!!
 }
 /// <summary>
 /// Builds an analyzer with the given stop words.
 /// </summary>
 /// <param name="matchVersion"> lucene compatibility version </param>
 /// <param name="stopwords"> a stopword set </param>
 public PortugueseAnalyzer(Version matchVersion, CharArraySet stopwords)
     : this(matchVersion, stopwords, CharArraySet.EMPTY_SET)
 {
 }
Example #16
 /// <summary>
 /// Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
 /// provided this analyzer will add a <seealso cref="SetKeywordMarkerFilter"/> before
 /// stemming.
 /// </summary>
 /// <param name="matchVersion"> lucene compatibility version </param>
 /// <param name="stopwords"> a stopword set </param>
 /// <param name="stemExclusionSet"> a set of terms not to be stemmed </param>
 public DanishAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet)
     : base(matchVersion, stopwords)
 {
     this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stemExclusionSet));
 }
 /// <summary>
 /// Wraps <seealso cref="StandardAnalyzer"/>.
 /// </summary>
 public ShingleAnalyzerWrapper(Version matchVersion, int minShingleSize, int maxShingleSize) : this(new StandardAnalyzer(matchVersion), minShingleSize, maxShingleSize)
 {
 }
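A hedged sketch of the wrapper above: StandardAnalyzer runs first and a ShingleFilter then joins adjacent tokens, so the output contains both single words and word n-grams ("shingles") between the given sizes:
 // Hypothetical sketch: 2- and 3-word shingles on top of StandardAnalyzer output.
 Analyzer shingles = new ShingleAnalyzerWrapper(Version.LUCENE_47, 2, 3);
 // "please divide this" would yield terms such as:
 //   "please", "please divide", "please divide this", "divide", "divide this", "this"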
 /// <summary>
 /// Creates a new instance of the <seealso cref="ClassicTokenizer"/>.  Attaches
 /// the <code>input</code> to the newly created JFlex scanner.
 /// </summary>
 /// <param name="input"> The input reader
 ///
 /// See http://issues.apache.org/jira/browse/LUCENE-1068 </param>
 public ClassicTokenizer(Version matchVersion, Reader input) : base(input)
 {
     init(matchVersion);
 }
 /// <summary>
 /// Wraps <seealso cref="StandardAnalyzer"/>.
 /// </summary>
 public ShingleAnalyzerWrapper(Version matchVersion) : this(matchVersion, ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE, ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE)
 {
 }
Example #20
 /// <summary>
 /// Builds an analyzer with the default stop words:
 /// <seealso cref="#DEFAULT_STOPWORD_FILE"/>.
 /// </summary>
 public HindiAnalyzer(Version version)
     : this(version, DefaultSetHolder.DEFAULT_STOP_SET)
 {
 }
Example #21
 /// <summary>
 /// Builds an analyzer with the given stop words
 /// </summary>
 /// <param name="matchVersion">
 ///          lucene compatibility version </param>
 /// <param name="stopwords">
 ///          a stopword set </param>
 /// <param name="stemExclusionSet">
 ///          a stemming exclusion set </param>
 public GermanAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet)
     : base(matchVersion, stopwords)
 {
     exclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stemExclusionSet));
 }
 /// <summary>
 /// Create a GreekLowerCaseFilter that normalizes Greek token text.
 /// </summary>
 /// <param name="matchVersion"> Lucene compatibility version, 
 ///   See <a href="#version">above</a> </param>
 /// <param name="in"> TokenStream to filter </param>
 public GreekLowerCaseFilter(Version matchVersion, TokenStream @in)
     : base(@in)
 {
     this.charUtils = CharacterUtils.getInstance(matchVersion);
 }
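A hedged sketch of the filter above inside an analysis chain; unlike a plain LowerCaseFilter it also folds Greek-specific forms such as final sigma. StandardTokenizer and StringReader usage are assumptions of this sketch:
 // Hypothetical sketch: tokenize, then apply Greek-aware lowercasing.
 TokenStream greek = new StandardTokenizer(Version.LUCENE_47, new StringReader("ΜΙΑ ΚΑΛΗ ΑΠΑΝΤΗΣΗ"));
 greek = new GreekLowerCaseFilter(Version.LUCENE_47, greek);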
Example #24
 /// <summary>
 /// Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
 /// </summary>
 /// <param name="version"> the <a href="#version">Lucene match version</a> </param>
 /// <param name="input"> <seealso cref="Reader"/> holding the input to be tokenized </param>
 /// <param name="minGram"> the smallest n-gram to generate </param>
 /// <param name="maxGram"> the largest n-gram to generate </param>
 public EdgeNGramTokenizer(Version version, Reader input, int minGram, int maxGram) : base(version, input, minGram, maxGram, true)
 {
 }
Example #26
 /// <summary>
 /// Creates a new instance of the UAX29URLEmailTokenizer.  Attaches
 /// the <code>input</code> to the newly created JFlex scanner.
 /// </summary>
 /// <param name="input"> The input reader </param>
 public UAX29URLEmailTokenizer(Version matchVersion, Reader input) : base(input)
 {
     this.scanner = getScannerFor(matchVersion);
 }
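A hedged sketch of consuming tokens from the tokenizer above with the usual reset/incrementToken loop; the generic addAttribute call and the attribute type name vary between ports, so treat them as assumptions:
 // Hypothetical sketch: URLs and e-mail addresses come out as single tokens.
 Tokenizer tokenizer = new UAX29URLEmailTokenizer(Version.LUCENE_47, new StringReader("mail me at dev@example.com"));
 CharTermAttribute term = tokenizer.addAttribute<CharTermAttribute>(); // assumed generic accessor
 tokenizer.reset();
 while (tokenizer.incrementToken())
 {
     Console.WriteLine(term.ToString()); // "mail", "me", "at", "dev@example.com"
 }
 tokenizer.end();
 tokenizer.close();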
 /// <summary>
 /// Builds an analyzer with the given stop words.
 /// </summary>
 /// <param name="matchVersion"> lucene compatibility version </param>
 /// <param name="stopwords"> a stopword set </param>
 public NorwegianAnalyzer(Version matchVersion, CharArraySet stopwords)
     : this(matchVersion, stopwords, CharArraySet.EMPTY_SET)
 {
 }
Example #28
 public IndicTokenizer(Version matchVersion, AttributeFactory factory, Reader input)
     : base(matchVersion, factory, input)
 {
 }
Example #29
 public IndicTokenizer(Version matchVersion, Reader input)
     : base(matchVersion, input)
 {
 }
 /// <summary>
 /// Construct a new RussianLetterTokenizer using a given
 /// <seealso cref="org.apache.lucene.util.AttributeSource.AttributeFactory"/>. * @param
 /// matchVersion Lucene version to match See
 /// <seealso cref="<a href="#version">above</a>"/>
 /// </summary>
 /// <param name="factory">
 ///          the attribute factory to use for this <seealso cref="Tokenizer"/> </param>
 /// <param name="in">
 ///          the input to split up into tokens </param>
 public RussianLetterTokenizer(Version matchVersion, AttributeFactory factory, Reader @in)
     : base(matchVersion, factory, @in)
 {
 }
 public RussianAnalyzer(Version matchVersion)
     : this(matchVersion, matchVersion.onOrAfter(Version.LUCENE_31) ? DefaultSetHolder.DEFAULT_STOP_SET : DefaultSetHolder.DEFAULT_STOP_SET_30)
 {
 }
Example #32
 /// <summary>
 /// Builds an analyzer with the given stop words.
 /// </summary>
 /// <param name="matchVersion"> lucene compatibility version </param>
 /// <param name="stopwords"> a stopword set </param>
 public DanishAnalyzer(Version matchVersion, CharArraySet stopwords)
     : this(matchVersion, stopwords, CharArraySet.EMPTY_SET)
 {
 }
 /// <summary>
 /// Builds the named analyzer with no stop words. </summary>
 public SnowballAnalyzer(Version matchVersion, string name)
 {
     this.name = name;
     this.matchVersion = matchVersion;
 }
 /// <summary>
 /// Builds an analyzer with the default stop words:
 /// <seealso cref="#DEFAULT_STOPWORD_FILE"/>.
 /// </summary>
 public PersianAnalyzer(Version matchVersion)
     : this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET)
 {
 }
 /// <summary>
 /// Builds the named analyzer with the given stop words. </summary>
 public SnowballAnalyzer(Version matchVersion, string name, CharArraySet stopWords)
     : this(matchVersion, name)
 {
     stopSet = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stopWords));
 }
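A hedged sketch of the two constructors above: the name selects the Snowball stemmer ("English", "German", ...), and the optional set adds a StopFilter ahead of it. Using StandardAnalyzer.STOP_WORDS_SET as the stop set is an assumption of this sketch:
 // Hypothetical sketch: English Snowball stemming, with and without stop-word removal.
 Analyzer plain = new SnowballAnalyzer(Version.LUCENE_30, "English");
 Analyzer withStops = new SnowballAnalyzer(Version.LUCENE_30, "English", StandardAnalyzer.STOP_WORDS_SET);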
Example #36
 /// <summary>
 /// Create a GreekLowerCaseFilter that normalizes Greek token text.
 /// </summary>
 /// <param name="matchVersion"> Lucene compatibility version,
 ///   See <a href="#version">above</a> </param>
 /// <param name="in"> TokenStream to filter </param>
 public GreekLowerCaseFilter(Version matchVersion, TokenStream @in) : base(@in)
 {
     this.charUtils = CharacterUtils.getInstance(matchVersion);
 }
Example #37
 /// <summary>
 /// Builds an analyzer with the default stop words: <seealso cref="#getDefaultStopSet"/>.
 /// </summary>
 public EnglishAnalyzer(Version matchVersion) : this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET)
 {
 }
 /// <summary>
 /// Creates a new ClassicTokenizer with a given <seealso cref="org.apache.lucene.util.AttributeSource.AttributeFactory"/>
 /// </summary>
 public ClassicTokenizer(Version matchVersion, AttributeFactory factory, Reader input) : base(factory, input)
 {
     init(matchVersion);
 }
Example #39
 /// <summary>
 /// Builds an analyzer with the given stop words.
 /// </summary>
 /// <param name="matchVersion"> lucene compatibility version </param>
 /// <param name="stopwords"> a stopword set </param>
 public EnglishAnalyzer(Version matchVersion, CharArraySet stopwords) : this(matchVersion, stopwords, CharArraySet.EMPTY_SET)
 {
 }
         protected internal virtual void doTestTokenizerFactoryArguments(Version ver, Type delegatorClass)
        protected internal virtual void doTestTokenizerFactoryArguments(Version ver, Type delegatorClass)
        {
             // note: .NET Type.FullName will not always match Java's Class.getName() exactly
            string clazz = typeof(PatternTokenizerFactory).FullName;
            TokenFilterFactory factory = null;

            // simple arg form
            factory = tokenFilterFactory("Synonym", ver, "synonyms", "synonyms.txt", "tokenizerFactory", clazz, "pattern", "(.*)", "group", "0");
            assertDelegator(factory, delegatorClass);

            // prefix
            factory = tokenFilterFactory("Synonym", ver, "synonyms", "synonyms.txt", "tokenizerFactory", clazz, "tokenizerFactory.pattern", "(.*)", "tokenizerFactory.group", "0");
            assertDelegator(factory, delegatorClass);

            // sanity check that sub-PatternTokenizerFactory fails w/o pattern
            try
            {
              factory = tokenFilterFactory("Synonym", ver, "synonyms", "synonyms.txt", "tokenizerFactory", clazz);
              fail("tokenizerFactory should have complained about missing pattern arg");
            }
            catch (Exception)
            {
              // :NOOP:
            }

             // sanity check that sub-PatternTokenizerFactory fails on an unexpected parameter
            try
            {
              factory = tokenFilterFactory("Synonym", ver, "synonyms", "synonyms.txt", "tokenizerFactory", clazz, "tokenizerFactory.pattern", "(.*)", "tokenizerFactory.bogusbogusbogus", "bogus", "tokenizerFactory.group", "0");
              fail("tokenizerFactory should have complained about missing pattern arg");
            }
            catch (Exception)
            {
              // :NOOP:
            }
        }
Example #41
 /// <summary>
 /// Builds an analyzer with the given stop words 
 /// </summary>
 /// <param name="version"> lucene compatibility version </param>
 /// <param name="stopwords"> a stopword set </param>
 public HindiAnalyzer(Version version, CharArraySet stopwords)
     : this(version, stopwords, CharArraySet.EMPTY_SET)
 {
 }