Example #1
0
 /// <summary>
 /// Create a new UpperCaseFilter, that normalizes token text to upper case.
 /// </summary>
 /// <param name="matchVersion"> See <a href="#version">above</a> </param>
 /// <param name="in"> TokenStream to filter </param>
 public UpperCaseFilter(LuceneVersion matchVersion, TokenStream @in)
     : base(@in)
 {
     // Fixed: the attribute was registered/assigned twice; once is sufficient.
     termAtt = AddAttribute<ICharTermAttribute>();
     charUtils = CharacterUtils.GetInstance(matchVersion);
 }
Example #2
0
 /// <summary>
 /// Creates a new <see cref="StandardFilter"/> around the given stream.
 /// </summary>
 /// <param name="matchVersion"> Lucene compatibility version </param>
 /// <param name="in"> the <see cref="TokenStream"/> to filter </param>
 public StandardFilter(LuceneVersion matchVersion, TokenStream @in)
     : base(@in)
 {
     this.matchVersion = matchVersion;
     typeAtt = AddAttribute<ITypeAttribute>();
     termAtt = AddAttribute<ICharTermAttribute>();
 }
        /// <summary>
        /// Creates NGramTokenFilter with given min and max n-grams. </summary>
        /// <param name="version"> Lucene version to enable correct position increments.
        ///                See <a href="#version">above</a> for details. </param>
        /// <param name="input"> <seealso cref="TokenStream"/> holding the input to be tokenized </param>
        /// <param name="minGram"> the smallest n-gram to generate </param>
        /// <param name="maxGram"> the largest n-gram to generate </param>
        /// <exception cref="System.ArgumentException"> if minGram is less than 1 or greater than maxGram </exception>
        public NGramTokenFilter(LuceneVersion version, TokenStream input, int minGram, int maxGram)
            : base(new CodepointCountFilter(version, input, minGram, int.MaxValue))
        {
            this.version = version;
            // 4.4+ uses version-appropriate CharacterUtils; older versions keep the
            // legacy Java4Instance. The pragmas silence the [Obsolete] warning on LUCENE_44.
            this.charUtils = version.OnOrAfter(
#pragma warning disable 612, 618
                LuceneVersion.LUCENE_44) ?
#pragma warning restore 612, 618
                CharacterUtils.GetInstance(version) : CharacterUtils.Java4Instance;
            if (minGram < 1)
            {
                throw new System.ArgumentException("minGram must be greater than zero");
            }
            if (minGram > maxGram)
            {
                throw new System.ArgumentException("minGram must not be greater than maxGram");
            }
            this.minGram = minGram;
            this.maxGram = maxGram;
            // For 4.4+ the position attributes are registered normally; for older versions
            // private stub implementations are substituted instead (presumably to reproduce
            // the legacy token stream — see the anonymous helper classes).
#pragma warning disable 612, 618
            if (version.OnOrAfter(LuceneVersion.LUCENE_44))
#pragma warning restore 612, 618
            {
                posIncAtt = AddAttribute<IPositionIncrementAttribute>();
                posLenAtt = AddAttribute<IPositionLengthAttribute>();
            }
            else
            {
                posIncAtt = new PositionIncrementAttributeAnonymousInnerClassHelper(this);
                posLenAtt = new PositionLengthAttributeAnonymousInnerClassHelper(this);
            }
            termAtt = AddAttribute<ICharTermAttribute>();
            offsetAtt = AddAttribute<IOffsetAttribute>();
        }
 /// <summary>
 /// Create a new <seealso cref="CodepointCountFilter"/>. This will filter out tokens whose
 /// <seealso cref="CharTermAttribute"/> is either too short (<seealso cref="Character#CodePointCount(char[], int, int)"/>
 /// &lt; min) or too long (<seealso cref="Character#codePointCount(char[], int, int)"/> &gt; max). </summary>
 /// <param name="version"> the Lucene match version </param>
 /// <param name="in">      the <seealso cref="TokenStream"/> to consume </param>
 /// <param name="min">     the minimum length </param>
 /// <param name="max">     the maximum length </param>
 /// <exception cref="System.ArgumentOutOfRangeException"> if <paramref name="min"/> is negative or greater than <paramref name="max"/> </exception>
 public CodepointCountFilter(LuceneVersion version, TokenStream @in, int min, int max)
     : base(version, @in)
 {
     // Guard clauses added for consistency with the other copy of this constructor
     // in the codebase, which validates its arguments; invalid ranges now fail fast.
     if (min < 0)
     {
         throw new System.ArgumentOutOfRangeException(nameof(min), "minimum length must be greater than or equal to zero");
     }
     if (min > max)
     {
         throw new System.ArgumentOutOfRangeException(nameof(min), "minimum length must not be greater than maximum length");
     }
     this.min = min;
     this.max = max;
     termAtt = AddAttribute<ICharTermAttribute>();
 }
        /// <summary>
        /// Sole constructor: wires up the attributes used when emitting decompounded
        /// tokens and records the decompounding parameters.
        /// </summary>
        /// <param name="matchVersion"> Lucene compatibility version </param>
        /// <param name="input"> the <see cref="TokenStream"/> to decompound </param>
        /// <param name="dictionary"> the word dictionary to match against </param>
        /// <param name="minWordSize"> only words longer than this get processed </param>
        /// <param name="minSubwordSize"> only subwords longer than this get to the output stream </param>
        /// <param name="maxSubwordSize"> only subwords shorter than this get to the output stream </param>
        /// <param name="onlyLongestMatch"> add only the longest matching subword to the stream </param>
        /// <exception cref="System.ArgumentException"> if any of the size arguments is negative </exception>
        protected CompoundWordTokenFilterBase(LuceneVersion matchVersion, TokenStream input, CharArraySet dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, bool onlyLongestMatch)
            : base(input)
        {
            // NOTE(review): the "as" cast yields null if the factory returns a different
            // ICharTermAttribute implementation — confirm this is intentional.
            termAtt = AddAttribute<ICharTermAttribute>() as CharTermAttribute;
            offsetAtt = AddAttribute<IOffsetAttribute>();
            posIncAtt = AddAttribute<IPositionIncrementAttribute>();

            this.matchVersion = matchVersion;
            this.tokens = new LinkedList<CompoundToken>();
            if (minWordSize < 0)
            {
                throw new System.ArgumentException("minWordSize cannot be negative");
            }
            this.minWordSize = minWordSize;
            if (minSubwordSize < 0)
            {
                throw new System.ArgumentException("minSubwordSize cannot be negative");
            }
            this.minSubwordSize = minSubwordSize;
            if (maxSubwordSize < 0)
            {
                throw new System.ArgumentException("maxSubwordSize cannot be negative");
            }
            this.maxSubwordSize = maxSubwordSize;
            this.onlyLongestMatch = onlyLongestMatch;
            this.dictionary = dictionary;
        }
 /// <summary>
 /// Create a new <seealso cref="FilteringTokenFilter"/>. </summary>
 /// <param name="version"> the Lucene match version </param>
 /// <param name="in">      the <seealso cref="TokenStream"/> to consume </param>
 public FilteringTokenFilter(LuceneVersion version, TokenStream @in)
     : base(@in)
 {
     posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
     this.version = version;
     // Position increments are preserved by default.
     this.enablePositionIncrements = true;
 }
        /// <summary>
        /// Creates an <see cref="EdgeNGramTokenFilter"/> that generates n-grams in the given size range.
        /// </summary>
        /// <param name="version"> the Lucene match version </param>
        /// <param name="input"> <see cref="TokenStream"/> holding the input to be tokenized </param>
        /// <param name="side"> the side from which to generate n-grams; Side.BACK is rejected for 4.4+ </param>
        /// <param name="minGram"> the smallest n-gram to generate </param>
        /// <param name="maxGram"> the largest n-gram to generate </param>
        /// <exception cref="System.ArgumentException"> on invalid version/side/gram arguments </exception>
        public EdgeNGramTokenFilter(LuceneVersion version, TokenStream input, Side side, int minGram, int maxGram)
            : base(input)
        {
            if (version == null)
            {
                throw new System.ArgumentException("version must not be null");
            }

            if (version.OnOrAfter(LuceneVersion.LUCENE_44) && side == Side.BACK)
            {
                throw new System.ArgumentException("Side.BACK is not supported anymore as of Lucene 4.4, use ReverseStringFilter up-front and afterward");
            }

            if (side == null)
            {
                throw new System.ArgumentException("sideLabel must be either front or back");
            }

            if (minGram < 1)
            {
                throw new System.ArgumentException("minGram must be greater than zero");
            }

            if (minGram > maxGram)
            {
                throw new System.ArgumentException("minGram must not be greater than maxGram");
            }

            this.version = version;
            // Fixed: the member names were left in Java casing (onOrAfter/getInstance),
            // which does not compile in C#; use the ported PascalCase members, matching
            // the identical expression in NGramTokenFilter.
            this.charUtils = version.OnOrAfter(LuceneVersion.LUCENE_44) ? CharacterUtils.GetInstance(version) : CharacterUtils.Java4Instance;
            this.minGram = minGram;
            this.maxGram = maxGram;
            this.side = side;
        }
 /// <summary>
 /// Creates a <seealso cref="TypeTokenFilter"/> that filters tokens by their type. </summary>
 /// <param name="version">      the Lucene match version </param>
 /// <param name="input">        the <seealso cref="TokenStream"/> to consume </param>
 /// <param name="stopTypes">    the types to filter </param>
 /// <param name="useWhiteList"> if true, then tokens whose type is in stopTypes will
 ///                     be kept, otherwise they will be filtered out </param>
 public TypeTokenFilter(LuceneVersion version, TokenStream input, IEnumerable<string> stopTypes, bool useWhiteList)
     : base(version, input)
 {
     this.useWhiteList = useWhiteList;
     this.stopTypes = new HashSet<string>(stopTypes);
     typeAttribute = AddAttribute<ITypeAttribute>();
 }
 /// <summary>
 /// Creates a new <seealso cref="DictionaryCompoundWordTokenFilter"/>
 /// </summary>
 /// <param name="matchVersion">
 ///          Lucene version to enable correct Unicode 4.0 behavior in the
 ///          dictionaries if Version > 3.0. See <a
 ///          href="CompoundWordTokenFilterBase.html#version"
 ///          >CompoundWordTokenFilterBase</a> for details. </param>
 /// <param name="input">
 ///          the <seealso cref="TokenStream"/> to process </param>
 /// <param name="dictionary">
 ///          the word dictionary to match against. </param>
 /// <param name="minWordSize">
 ///          only words longer than this get processed </param>
 /// <param name="minSubwordSize">
 ///          only subwords longer than this get to the output stream </param>
 /// <param name="maxSubwordSize">
 ///          only subwords shorter than this get to the output stream </param>
 /// <param name="onlyLongestMatch">
 ///          Add only the longest matching subword to the stream </param>
 /// <exception cref="System.ArgumentNullException"> if <paramref name="dictionary"/> is null </exception>
 public DictionaryCompoundWordTokenFilter(LuceneVersion matchVersion, TokenStream input, CharArraySet dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, bool onlyLongestMatch)
     : base(matchVersion, input, dictionary, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch)
 {
     if (dictionary == null)
     {
         // Use the more specific ArgumentNullException (a subclass of ArgumentException,
         // so existing catch blocks keep working) and name the offending parameter.
         throw new System.ArgumentNullException(nameof(dictionary), "dictionary cannot be null");
     }
 }
 /// <summary>
 /// Creates a new <seealso cref="DictionaryCompoundWordTokenFilter"/>
 /// </summary>
 /// <param name="matchVersion">
 ///          Lucene version to enable correct Unicode 4.0 behavior in the
 ///          dictionaries if Version > 3.0. See <a
 ///          href="CompoundWordTokenFilterBase.html#version"
 ///          >CompoundWordTokenFilterBase</a> for details. </param>
 /// <param name="input">
 ///          the <seealso cref="TokenStream"/> to process </param>
 /// <param name="dictionary">
 ///          the word dictionary to match against. </param>
 /// <exception cref="System.ArgumentNullException"> if <paramref name="dictionary"/> is null </exception>
 public DictionaryCompoundWordTokenFilter(LuceneVersion matchVersion, TokenStream input, CharArraySet dictionary)
     : base(matchVersion, input, dictionary)
 {
     if (dictionary == null)
     {
         // Use the more specific ArgumentNullException (a subclass of ArgumentException,
         // so existing catch blocks keep working) and name the offending parameter.
         throw new System.ArgumentNullException(nameof(dictionary), "dictionary cannot be null");
     }
 }
Example #11
0
 /// <summary>
 /// Creates an index upgrader on the given directory, using an <see cref="IndexWriter"/> with the
 /// given <paramref name="matchVersion"/>. Indexes with multiple commit points can be upgraded
 /// by removing all older ones via <paramref name="deletePriorCommits"/>. If
 /// <paramref name="infoStream"/> is non-null, all logging output is sent to that stream.
 /// </summary>
 public IndexUpgrader(Directory dir, LuceneVersion matchVersion, TextWriter infoStream, bool deletePriorCommits)
     : this(dir, new IndexWriterConfig(matchVersion, null), deletePriorCommits)
 {
     if (infoStream != null)
     {
         this.Iwc.SetInfoStream(infoStream);
     }
 }
Example #12
0
        /// <summary>
        /// Creates a new <seealso cref="CharTokenizer"/> instance
        /// </summary>
        /// <param name="matchVersion">
        ///          Lucene version to match </param>
        /// <param name="input">
        ///          the input to split up into tokens </param>
        protected CharTokenizer(LuceneVersion matchVersion, TextReader input)
            : base(input)
        {
            termAtt = AddAttribute<ICharTermAttribute>();
            offsetAtt = AddAttribute<IOffsetAttribute>();

            // Version-appropriate CharacterUtils implementation.
            charUtils = CharacterUtils.GetInstance(matchVersion);
        }
Example #13
0
 /// <summary>
 /// Construct a token stream filtering the given input using a Set of common
 /// words to create bigrams. Outputs both unigrams with position increment and
 /// bigrams with position increment 0 type=gram where one or both of the words
 /// in a potential bigram are in the set of common words .
 /// </summary>
 /// <param name="matchVersion"> Lucene compatibility version (not used in this constructor's visible body) </param>
 /// <param name="input"> TokenStream input in filter chain </param>
 /// <param name="commonWords"> The set of common words. </param>
 public CommonGramsFilter(LuceneVersion matchVersion, TokenStream input, CharArraySet commonWords)
     : base(input)
 {
     termAttribute = AddAttribute<ICharTermAttribute>();
     offsetAttribute = AddAttribute<IOffsetAttribute>();
     typeAttribute = AddAttribute<ITypeAttribute>();
     posIncAttribute = AddAttribute<IPositionIncrementAttribute>();
     posLenAttribute = AddAttribute<IPositionLengthAttribute>();
     this.commonWords = commonWords;
 }
Example #14
0
 /// <summary>
 /// Create a new <see cref="TrimFilter"/>.
 /// </summary>
 /// <param name="version"> the Lucene match version </param>
 /// <param name="in"> the <see cref="TokenStream"/> to consume </param>
 /// <param name="updateOffsets"> whether to adjust offsets when trimming; must be false for match versions 4.4+ </param>
 /// <exception cref="System.ArgumentException"> if updateOffsets is true with a 4.4+ match version </exception>
 public TrimFilter(LuceneVersion version, TokenStream @in, bool updateOffsets)
     : base(@in)
 {
     if (updateOffsets && version.OnOrAfter(LuceneVersion.LUCENE_44))
     {
         throw new System.ArgumentException("updateOffsets=true is not supported anymore as of Lucene 4.4");
     }
     termAtt = AddAttribute<ICharTermAttribute>();
     offsetAtt = AddAttribute<IOffsetAttribute>();
     this.updateOffsets = updateOffsets;
 }
        private bool hasIllegalOffsets = false; // only if the length changed before this filter

        /// <summary>
        /// Creates a new ThaiWordFilter with the specified match version. </summary>
        /// <param name="matchVersion"> Lucene compatibility version; pre-3.1 input is wrapped in a LowerCaseFilter first </param>
        /// <param name="input"> the <see cref="TokenStream"/> to consume </param>
        /// <exception cref="System.NotSupportedException"> if the runtime lacks Thai segmentation support </exception>
        public ThaiWordFilter(LuceneVersion matchVersion, TokenStream input)
              : base(matchVersion.OnOrAfter(LuceneVersion.LUCENE_31) ? input : new LowerCaseFilter(matchVersion, input))
        {
            if (!DBBI_AVAILABLE)
            {
                throw new System.NotSupportedException("This JRE does not have support for Thai segmentation");
            }
            handlePosIncr = matchVersion.OnOrAfter(LuceneVersion.LUCENE_31);
            termAtt = AddAttribute<ICharTermAttribute>();
            offsetAtt = AddAttribute<IOffsetAttribute>();
            posAtt = AddAttribute<IPositionIncrementAttribute>();
        }
        /// <summary>
        /// Initialize this factory via a set of key-value pairs.
        /// </summary>
        /// <param name="args"> factory arguments; the class-name entry is consumed (removed) here,
        /// while an unmodifiable view of the original map is retained </param>
        protected internal AbstractAnalysisFactory(IDictionary<string, string> args)
        {
            ExplicitLuceneMatchVersion = false;
            originalArgs = Collections.UnmodifiableMap(args);
            string version = Get(args, LUCENE_MATCH_VERSION_PARAM);
            // LUCENENET TODO: What should we do if the version is null?
            //luceneMatchVersion = version == null ? (LuceneVersion?)null : LuceneVersionHelpers.ParseLeniently(version);
            // A missing version currently falls back to LUCENE_CURRENT (an obsolete
            // constant, hence the pragmas) rather than being left null as upstream does.
            luceneMatchVersion = version == null ?
#pragma warning disable 612, 618
                LuceneVersion.LUCENE_CURRENT :
#pragma warning restore 612, 618
                LuceneVersionHelpers.ParseLeniently(version);
            args.Remove(CLASS_NAME); // consume the class arg
        }
Example #17
0
 /// <summary>
 /// Create a new <seealso cref="LengthFilter"/>. This will filter out tokens whose
 /// <seealso cref="CharTermAttribute"/> is either too short (<seealso cref="CharTermAttribute#length()"/>
 /// &lt; min) or too long (<seealso cref="CharTermAttribute#length()"/> &gt; max). </summary>
 /// <param name="version"> the Lucene match version </param>
 /// <param name="in">      the <seealso cref="TokenStream"/> to consume </param>
 /// <param name="min">     the minimum length </param>
 /// <param name="max">     the maximum length </param>
 /// <exception cref="ArgumentOutOfRangeException"> if <paramref name="min"/> is negative or greater than <paramref name="max"/> </exception>
 public LengthFilter(LuceneVersion version, TokenStream @in, int min, int max)
     : base(version, @in)
 {
     if (min < 0)
     {
         // Use the (paramName, message) overload: the single-string constructor of
         // ArgumentOutOfRangeException treats its argument as the parameter name, not the message.
         throw new ArgumentOutOfRangeException(nameof(min), "minimum length must be greater than or equal to zero");
     }
     if (min > max)
     {
         // Fixed: the original message had min and max swapped.
         throw new ArgumentOutOfRangeException(nameof(min), "minimum length must not be greater than maximum length");
     }
     this.min = min;
     this.max = max;
     this.termAtt = AddAttribute<ICharTermAttribute>();
 }
        /// <summary>
        /// Create a new <seealso cref="CodepointCountFilter"/>. This will filter out tokens whose
        /// <seealso cref="CharTermAttribute"/> is either too short (<seealso cref="Character#CodePointCount(char[], int, int)"/>
        /// &lt; min) or too long (<seealso cref="Character#codePointCount(char[], int, int)"/> &gt; max). </summary>
        /// <param name="version"> the Lucene match version </param>
        /// <param name="in">      the <seealso cref="TokenStream"/> to consume </param>
        /// <param name="min">     the minimum length </param>
        /// <param name="max">     the maximum length </param>
        /// <exception cref="ArgumentOutOfRangeException"> if <paramref name="min"/> is negative or greater than <paramref name="max"/> </exception>
        public CodepointCountFilter(LuceneVersion version, TokenStream @in, int min, int max)
            : base(version, @in)
        {
            // LUCENENET: The guard clauses were copied here from the version of Lucene.
            // Apparently, the tests were not ported from 4.8.0 because they expected this and the
            // original tests did not. Adding them anyway because there is no downside to this.
            if (min < 0)
            {
                // Use the (paramName, message) overload: the single-string constructor of
                // ArgumentOutOfRangeException treats its argument as the parameter name.
                throw new ArgumentOutOfRangeException(nameof(min), "minimum length must be greater than or equal to zero");
            }
            if (min > max)
            {
                // Fixed: the original message had min and max swapped.
                throw new ArgumentOutOfRangeException(nameof(min), "minimum length must not be greater than maximum length");
            }

            this.min = min;
            this.max = max;
            termAtt = AddAttribute<ICharTermAttribute>();
        }
Example #19
0
 /// <summary>
 /// Builds an analyzer with the stop words from the given reader. </summary>
 /// <seealso cref="WordlistLoader.GetWordSet(TextReader, LuceneVersion)"/>
 /// <param name="matchVersion"> Lucene compatibility version - See <see cref="StandardAnalyzer"/> </param>
 /// <param name="stopwords"> <see cref="TextReader"/> to read stop words from  </param>
 public StandardAnalyzer(LuceneVersion matchVersion, TextReader stopwords)
     : this(matchVersion, LoadStopwordSet(stopwords, matchVersion))
 {
     // Delegates to the CharArraySet overload after loading the stop word set.
 }
Example #20
0
 /// <summary>
 /// Creates a <see cref="MultiFieldQueryParser"/>. Allows passing of a map with term to
 /// Boost, and the boost to apply to each term.
 ///
 /// <para/>
 /// It will, when <see cref="QueryParserBase.Parse(string)"/> is called, construct a query like this
 /// (assuming the query consists of two terms and you specify the two fields
 /// <c>title</c> and <c>body</c>):
 /// <para/>
 ///
 /// <code>
 /// (title:term1 body:term1) (title:term2 body:term2)
 /// </code>
 ///
 /// <para/>
 /// When <see cref="QueryParserBase.DefaultOperator"/> is set to <see cref="QueryParserBase.AND_OPERATOR"/>, the result will be:
 /// <para/>
 ///
 /// <code>
 /// +(title:term1 body:term1) +(title:term2 body:term2)
 /// </code>
 ///
 /// <para/>
 /// When you pass a boost (title=>5 body=>10) you can get
 /// <para/>
 ///
 /// <code>
 /// +(title:term1^5.0 body:term1^10.0) +(title:term2^5.0 body:term2^10.0)
 /// </code>
 ///
 /// <para/>
 /// In other words, all the query's terms must appear, but it doesn't matter
 /// in what fields they appear.
 /// <para/>
 /// </summary>
 public MultiFieldQueryParser(LuceneVersion matchVersion, string[] fields, Analyzer analyzer, IDictionary <string, float> boosts)
     : this(matchVersion, fields, analyzer)
 {
     // Per-field boosts applied when expanding query terms across the fields.
     this.m_boosts = boosts;
 }
 /// <summary>
 /// Creates a <see cref="Lucene43EdgeNGramTokenizer"/>, resolving the side from its
 /// string label (e.g. "front"/"back") via <c>GetSide</c>.
 /// </summary>
 public Lucene43EdgeNGramTokenizer(LuceneVersion version, AttributeFactory factory, TextReader input, string sideLabel, int minGram, int maxGram)
     : this(version, factory, input, GetSide(sideLabel), minGram, maxGram)
 {
 }
 /// <summary>
 /// Creates a <see cref="Lucene43EdgeNGramTokenizer"/> over <paramref name="input"/>;
 /// parameter validation and setup are deferred to <c>Init</c>.
 /// </summary>
 public Lucene43EdgeNGramTokenizer(LuceneVersion version, TextReader input, Side side, int minGram, int maxGram)
     : base(input)
 {
     Init(version, side, minGram, maxGram);
 }
Example #23
0
 /// <summary>
 /// Creates a new <see cref="SimpleAnalyzer"/> </summary>
 /// <param name="matchVersion"> <see cref="LuceneVersion"/> to match </param>
 public SimpleAnalyzer(LuceneVersion matchVersion)
 {
     // Retained for use elsewhere in the class.
     this.matchVersion = matchVersion;
 }
Example #24
0
 /// <summary>
 /// Builds an analyzer with the default stop words (<see cref="STOP_WORDS_SET"/>). </summary>
 /// <param name="matchVersion"> Lucene compatibility version - See <see cref="StandardAnalyzer"/> </param>
 public StandardAnalyzer(LuceneVersion matchVersion)
     : this(matchVersion, STOP_WORDS_SET)
 {
     // Delegates to the stop-word-set overload.
 }
Example #25
0
 /// <summary>
 /// Creates a new instance of the <see cref="StandardTokenizer"/>.  Attaches
 /// the <paramref name="input"/> to the newly created JFlex-generated (then ported to .NET) scanner.
 /// </summary>
 /// <param name="matchVersion"> Lucene compatibility version - See <see cref="StandardTokenizer"/> </param>
 /// <param name="input"> The input reader
 ///
 /// See http://issues.apache.org/jira/browse/LUCENE-1068 </param>
 public StandardTokenizer(LuceneVersion matchVersion, TextReader input)
     : base(input)
 {
     // Scanner/attribute setup is shared with the factory-based overload via Init.
     Init(matchVersion);
 }
Example #26
0
 /// <summary>
 /// Builds an analyzer with the default stop words: <see cref="DEFAULT_STOPWORD_FILE"/>.
 /// </summary>
 /// <param name="matchVersion"> Lucene compatibility version </param>
 public FinnishAnalyzer(LuceneVersion matchVersion)
     : this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET)
 {
 }
Example #27
0
 /// <summary>
 /// Builds an analyzer with the given stop words and an empty stem exclusion set.
 /// </summary>
 /// <param name="matchVersion"> lucene compatibility version </param>
 /// <param name="stopwords"> a stopword set </param>
 public FinnishAnalyzer(LuceneVersion matchVersion, CharArraySet stopwords)
     : this(matchVersion, stopwords, CharArraySet.EMPTY_SET)
 {
 }
Example #28
0
 /// <summary>
 /// Builds an analyzer with the given stop words
 /// </summary>
 /// <param name="matchVersion">
 ///          lucene compatibility version </param>
 /// <param name="stopwords">
 ///          a stopword set </param>
 /// <param name="stemExclutionSet">
 ///          a stemming exclusion set (NOTE(review): parameter name is a typo for
 ///          "stemExclusionSet"; renaming would break named-argument callers) </param>
 public FrenchAnalyzer(LuceneVersion matchVersion, CharArraySet stopwords, CharArraySet stemExclutionSet)
     : base(matchVersion, stopwords)
 {
     // Defensive copy, frozen so later external mutation cannot affect the analyzer.
     this.excltable = CharArraySet.UnmodifiableSet(CharArraySet.Copy(matchVersion, stemExclutionSet));
 }
Example #29
0
 /// <summary>
 /// Create a new <see cref="KeepWordFilter"/>.
 /// <para><c>NOTE</c>: The words set passed to this constructor will be directly
 /// used by this filter and should not be modified.
 /// </para>
 /// </summary>
 /// <param name="version"> the Lucene match version </param>
 /// <param name="in">      the <see cref="TokenStream"/> to consume </param>
 /// <param name="words">   the words to keep </param>
 public KeepWordFilter(LuceneVersion version, TokenStream @in, CharArraySet words)
     : base(version, @in)
 {
     termAtt = AddAttribute<ICharTermAttribute>();
     this.words = words;
 }
Example #30
0
 /// <summary>
 /// Creates a <see cref="KeepWordFilter"/> with explicit position-increment behavior.
 /// The words set is used directly by this filter and should not be modified.
 /// </summary>
 public KeepWordFilter(LuceneVersion version, bool enablePositionIncrements, TokenStream @in, CharArraySet words)
     : base(version, enablePositionIncrements, @in)
 {
     termAtt = AddAttribute<ICharTermAttribute>();
     this.words = words;
 }
 /// <summary>
 /// Creates a <see cref="Lucene43EdgeNGramTokenizer"/> using the given attribute
 /// <paramref name="factory"/>; parameter validation and setup are deferred to <c>Init</c>.
 /// </summary>
 public Lucene43EdgeNGramTokenizer(LuceneVersion version, AttributeFactory factory, TextReader input, Side side, int minGram, int maxGram)
     : base(factory, input)
 {
     Init(version, side, minGram, maxGram);
 }
Example #32
0
 /// <summary>
 /// Builds the named analyzer with no stop words.
 /// </summary>
 public SnowballAnalyzer(LuceneVersion matchVersion, string name)
 {
     this.matchVersion = matchVersion;
     this.name = name;
 }
Example #33
0
 /// <summary>
 /// Create a new instance, loading from a previously built
 /// <see cref="AnalyzingInfixSuggester"/> directory, if it exists.
 /// This directory must be
 /// private to the infix suggester (i.e., not an external
 /// Lucene index).  Note that <see cref="Dispose()"/>
 /// will also dispose the provided directory.
 /// </summary>
 /// <param name="matchVersion"> Lucene compatibility version </param>
 /// <param name="dir"> directory holding (or to hold) the suggester's index </param>
 /// <param name="analyzer"> analyzer passed as both analyzers to the main constructor </param>
 public AnalyzingInfixSuggester(LuceneVersion matchVersion, Directory dir, Analyzer analyzer)
     : this(matchVersion, dir, analyzer, analyzer, DEFAULT_MIN_PREFIX_CHARS)
 {
 }
 /// <summary>
 /// Creates <see cref="Lucene43EdgeNGramTokenizer"/> that can generate n-grams in the sizes of the given range
 /// </summary>
 /// <remarks> Delegates to the <c>Side</c>-taking overload with <c>Side.FRONT</c>. </remarks>
 /// <param name="version"> the Lucene match version - See <see cref="LuceneVersion"/> </param>
 /// <param name="input"> <see cref="TextReader"/> holding the input to be tokenized </param>
 /// <param name="minGram"> the smallest n-gram to generate </param>
 /// <param name="maxGram"> the largest n-gram to generate </param>
 public Lucene43EdgeNGramTokenizer(LuceneVersion version, TextReader input, int minGram, int maxGram)
     : this(version, input, Side.FRONT, minGram, maxGram)
 {
 }
Example #35
0
 /// <summary>
 /// Builds an analyzer with the given stop words. </summary>
 /// <param name="matchVersion"> Lucene compatibility version - See <see cref="StandardAnalyzer"/> </param>
 /// <param name="stopWords"> stop words  </param>
 public StandardAnalyzer(LuceneVersion matchVersion, CharArraySet stopWords)
     : base(matchVersion, stopWords)
 {
     // All construction work is done by the base constructor.
 }
Example #36
0
 /// <summary>
 /// Construct a new LetterTokenizer using a given
 /// <see cref="AttributeSource.AttributeFactory"/>.
 /// </summary>
 /// <param name="matchVersion">
 ///          Lucene version to match - see <a href="#version">above</a> </param>
 /// <param name="factory">
 ///          the attribute factory to use for this <see cref="Tokenizer"/> </param>
 /// <param name="in">
 ///          the input to split up into tokens </param>
 public LetterTokenizer(LuceneVersion matchVersion, AttributeSource.AttributeFactory factory, TextReader @in)
     : base(matchVersion, factory, @in)
 {
 }
Example #37
0
 /// <summary>
 /// Builds the named analyzer with the given stop words. </summary>
 /// <param name="matchVersion"> Lucene compatibility version </param>
 /// <param name="name"> the analyzer name, passed through to the primary constructor </param>
 /// <param name="stopWords"> stop words to use; copied and frozen below </param>
 public SnowballAnalyzer(LuceneVersion matchVersion, string name, CharArraySet stopWords) : this(matchVersion, name)
 {
     // Defensive, unmodifiable copy so the caller's set cannot change the analyzer.
     stopSet = CharArraySet.UnmodifiableSet(CharArraySet.Copy(matchVersion, stopWords));
 }
Example #38
0
 /// <summary>
 /// Builds an analyzer which removes words in
 /// <see cref="ENGLISH_STOP_WORDS_SET"/>. </summary>
 /// <param name="matchVersion"> See <see cref="LuceneVersion"/> </param>
 public StopAnalyzer(LuceneVersion matchVersion)
     : this(matchVersion, ENGLISH_STOP_WORDS_SET)
 {
 }
Example #39
0
 /// <summary>
 /// Creates a new <see cref="StandardTokenizer"/> with a given <see cref="AttributeSource.AttributeFactory"/>
 /// </summary>
 /// <param name="matchVersion"> Lucene compatibility version </param>
 /// <param name="factory"> attribute factory to use for this tokenizer </param>
 /// <param name="input"> the reader to tokenize </param>
 public StandardTokenizer(LuceneVersion matchVersion, AttributeFactory factory, TextReader input)
     : base(factory, input)
 {
     Init(matchVersion);
 }
 /// <summary>
 /// Creates <see cref="Lucene43EdgeNGramTokenizer"/> that can generate n-grams in the sizes of the given range
 /// </summary>
 /// <remarks> Delegates to the <c>Side</c>-taking overload with <c>Side.FRONT</c>. </remarks>
 /// <param name="version"> the Lucene match version - See <see cref="LuceneVersion"/> </param>
 /// <param name="factory"> <see cref="AttributeSource.AttributeFactory"/> to use </param>
 /// <param name="input"> <see cref="TextReader"/> holding the input to be tokenized </param>
 /// <param name="minGram"> the smallest n-gram to generate </param>
 /// <param name="maxGram"> the largest n-gram to generate </param>
 public Lucene43EdgeNGramTokenizer(LuceneVersion version, AttributeFactory factory, TextReader input, int minGram, int maxGram)
     : this(version, factory, input, Side.FRONT, minGram, maxGram)
 {
 }
Example #41
0
        // NOTE: This was moved into the QueryParserBase class.

        // * The default operator for parsing queries.
        // * Use <see cref="QueryParser.DefaultOperator"/> to change it.
        // */

        //public enum Operator
        //{
        //    OR,
        //    AND
        //}

        /// <summary>
        /// Constructs a query parser.
        /// </summary>
        /// <param name="matchVersion">Lucene version to match.</param>
        /// <param name="f">the default field for query terms.</param>
        /// <param name="a">used to find terms in the query text.</param>
        public QueryParser(LuceneVersion matchVersion, string f, Analyzer a)
            : this(new FastCharStream(new StringReader("")))
        {
            // The chained constructor requires a char stream, so an empty one is
            // supplied here; actual configuration happens in Init.
            Init(matchVersion, f, a);
        }
 /// <summary>
 /// Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
 /// </summary>
 /// <param name="version"> the <a href="#version">Lucene match version</a> </param>
 /// <param name="input"> <seealso cref="Reader"/> holding the input to be tokenized </param>
 /// <param name="minGram"> the smallest n-gram to generate </param>
 /// <param name="maxGram"> the largest n-gram to generate </param>
 public EdgeNGramTokenizer(LuceneVersion version, TextReader input, int minGram, int maxGram)
     : base(version, input, minGram, maxGram, true)
 {
     // NOTE(review): the trailing boolean presumably selects edge-only mode — confirm against the base class.
 }
Example #43
0
 /// <summary>
 /// Builds an analyzer with the stop words from the given file. </summary>
 /// <seealso cref="WordlistLoader.GetWordSet(TextReader, LuceneVersion)"/>
 /// <param name="matchVersion"> See <see cref="LuceneVersion"/> </param>
 /// <param name="stopwordsFile"> File to load stop words from  </param>
 public StopAnalyzer(LuceneVersion matchVersion, FileInfo stopwordsFile)
     : this(matchVersion, LoadStopwordSet(stopwordsFile, matchVersion))
 {
     // Delegates to the stop-word-set overload after loading from the file.
 }
Example #44
0
 /// <summary>
 /// create a new index writer config with random defaults </summary>
 /// <param name="v"> Lucene compatibility version </param>
 /// <param name="a"> analyzer for the config </param>
 public static IndexWriterConfig NewIndexWriterConfig(LuceneVersion v, Analyzer a)
 {
     // Delegates to the overload that takes an explicit random source.
     return NewIndexWriterConfig(Random(), v, a);
 }
Example #45
0
 /// <summary>
 /// Construct a new LetterTokenizer.
 /// </summary>
 /// <param name="matchVersion">
 ///          Lucene version to match - see <a href="#version">above</a> </param>
 /// <param name="in">
 ///          the input to split up into tokens </param>
 public LetterTokenizer(LuceneVersion matchVersion, TextReader @in)
     : base(matchVersion, @in)
 {
 }
 /// <summary>
 /// Builds an analyzer with the default stop words: <see cref="DEFAULT_STOPWORD_FILE"/>.
 /// </summary>
 /// <param name="matchVersion"> Lucene compatibility version </param>
 public DanishAnalyzer(LuceneVersion matchVersion)
     : this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET)
 {
 }
 /// <summary>
 /// Rejects <paramref name="enablePositionIncrements"/>=false when the match version
 /// is 4.4 or later, where that setting is no longer supported.
 /// </summary>
 private static void CheckPositionIncrement(LuceneVersion version, bool enablePositionIncrements)
 {
     if (!enablePositionIncrements)
     {
         // Pragmas silence the [Obsolete] warning on the legacy LUCENE_44 constant.
#pragma warning disable 612, 618
         if (version.OnOrAfter(LuceneVersion.LUCENE_44))
#pragma warning restore 612, 618
         {
             throw new System.ArgumentException("enablePositionIncrements=false is not supported anymore as of Lucene 4.4 as it can create broken token streams");
         }
     }
 }
 /// <summary>
 /// Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
 /// provided this analyzer will add a <seealso cref="SetKeywordMarkerFilter"/> before
 /// stemming.
 /// </summary>
 /// <param name="matchVersion"> lucene compatibility version </param>
 /// <param name="stopwords"> a stopword set </param>
 /// <param name="stemExclusionSet"> a set of terms not to be stemmed </param>
 public DanishAnalyzer(LuceneVersion matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet)
     : base(matchVersion, stopwords)
 {
     // Defensive copy, frozen so later external mutation cannot affect the analyzer.
     this.stemExclusionSet = CharArraySet.UnmodifiableSet(CharArraySet.Copy(matchVersion, stemExclusionSet));
 }
Example #49
0
 // NOTE: This was moved into the QueryParserBase class.
 ///* The default operator_Renamed for parsing queries. 
 // * Use {@link QueryParser#setDefaultOperator} to change it.
 // */
 //public enum Operator
 //{
 //    OR,
 //    AND
 //}
 /// <summary>
 /// Constructs a query parser.
 /// </summary>
 /// <param name="matchVersion">Lucene version to match.</param>
 /// <param name="f">the default field for query terms.</param>
 /// <param name="a">used to find terms in the query text.</param>
 public QueryParser(LuceneVersion matchVersion, string f, Analyzer a)
     : this(new FastCharStream(new StringReader("")))
 {
     // The chained constructor requires a char stream, so an empty one is
     // supplied here; actual configuration happens in Init.
     Init(matchVersion, f, a);
 }
Example #50
0
 /// <summary>
 /// Create a new instance, loading from a previously built
 /// directory, if it exists. Uses position-linear blending and the
 /// default num factor.
 /// </summary>
 public BlendedInfixSuggester(LuceneVersion matchVersion, Directory dir, Analyzer analyzer)
     : base(matchVersion, dir, analyzer)
 {
     this.numFactor = DEFAULT_NUM_FACTOR;
     this.blenderType = BlenderType.POSITION_LINEAR;
 }
Example #51
0
 /// <summary>
 /// Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
 /// provided this analyzer will add a <see cref="SetKeywordMarkerFilter"/> before
 /// stemming.
 /// </summary>
 /// <param name="matchVersion"> lucene compatibility version </param>
 /// <param name="stopwords"> a stopword set </param>
 /// <param name="stemExclusionSet"> a set of terms not to be stemmed </param>
 public NorwegianAnalyzer(LuceneVersion matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet)
     : base(matchVersion, stopwords)
 {
     // Defensive copy, frozen so later external mutation cannot affect the analyzer.
     this.stemExclusionSet = CharArraySet.UnmodifiableSet(CharArraySet.Copy(matchVersion, stemExclusionSet));
 }
Example #52
0
 /// <summary>
 /// Creates a MultiFieldQueryParser.
 ///
 /// <para/>
 /// It will, when <see cref="QueryParserBase.Parse(string)"/> is called, construct a query like this
 /// (assuming the query consists of two terms and you specify the two fields
 /// <c>title</c> and <c>body</c>):
 /// <para/>
 ///
 /// <code>
 /// (title:term1 body:term1) (title:term2 body:term2)
 /// </code>
 ///
 /// <para/>
 /// When <see cref="QueryParserBase.DefaultOperator"/> is set to <see cref="QueryParserBase.AND_OPERATOR"/>, the result will be:
 /// <para/>
 ///
 /// <code>
 /// +(title:term1 body:term1) +(title:term2 body:term2)
 /// </code>
 ///
 /// <para/>
 /// In other words, all the query's terms must appear, but it doesn't matter
 /// in what fields they appear.
 /// <para/>
 /// </summary>
 /// <param name="matchVersion"> Lucene version to match </param>
 /// <param name="fields"> the fields every query term is expanded across </param>
 /// <param name="analyzer"> used to find terms in the query text </param>
 public MultiFieldQueryParser(LuceneVersion matchVersion, string[] fields, Analyzer analyzer)
     : base(matchVersion, null, analyzer)
 {
     // NOTE: the array is stored as-is (no defensive copy), so later
     // mutation of the caller's array is visible to this parser.
     this.m_fields = fields;
 }
 /// <summary>
 /// Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
 /// </summary>
 /// <param name="version"> the <a href="#version">Lucene match version</a> </param>
 /// <param name="factory"> the <see cref="AttributeSource.AttributeFactory"/> to use </param>
 /// <param name="input"> <see cref="TextReader"/> holding the input to be tokenized </param>
 /// <param name="minGram"> the smallest n-gram to generate </param>
 /// <param name="maxGram"> the largest n-gram to generate </param>
 public EdgeNGramTokenizer(LuceneVersion version, AttributeSource.AttributeFactory factory, TextReader input, int minGram, int maxGram)
     : base(version, factory, input, minGram, maxGram, true)
 {
     // All work happens in the base constructor; the trailing 'true' flag
     // presumably selects edge (front-of-token) n-gram mode — confirm
     // against the base class.
 }
Example #54
0
 /// <summary>
 /// Builds an analyzer with the default stop words: <see cref="DEFAULT_STOPWORD_FILE"/>.
 /// </summary>
 /// <param name="matchVersion"> lucene compatibility version </param>
 public NorwegianAnalyzer(LuceneVersion matchVersion)
     : this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET)
 {
 }
Example #55
0
        /// <summary>
        /// Create a new index writer config with random defaults using the specified random.
        /// </summary>
        /// <param name="r"> source of randomness; every random choice below is drawn from
        ///                  it so that a given <paramref name="r"/> yields a reproducible config </param>
        /// <param name="v"> Lucene version compatibility for the config </param>
        /// <param name="a"> analyzer to install on the config </param>
        /// <returns> a randomized <see cref="IndexWriterConfig"/> ready for use in tests </returns>
        public static IndexWriterConfig NewIndexWriterConfig(Random r, LuceneVersion v, Analyzer a)
        {
            IndexWriterConfig c = new IndexWriterConfig(v, a);
            c.SetSimilarity(ClassEnvRule.Similarity);
            if (VERBOSE)
            {
                // Even though TestRuleSetupAndRestoreClassEnv calls
                // InfoStream.setDefault, we do it again here so that
                // the PrintStreamInfoStream.messageID increments so
                // that when there are separate instances of
                // IndexWriter created we see "IW 0", "IW 1", "IW 2",
                // ... instead of just always "IW 0":
                c.InfoStream = new TestRuleSetupAndRestoreClassEnv.ThreadNameFixingPrintStreamInfoStream(Console.Out);
            }

            if (r.NextBoolean())
            {
                c.SetMergeScheduler(new SerialMergeScheduler());
            }
            else if (Rarely(r))
            {
                // BUGFIX: these two values previously came from the ambient
                // Random() rather than the caller-supplied r, which broke
                // reproducibility for a fixed r; every other choice in this
                // method uses r.
                int maxThreadCount = TestUtil.NextInt(r, 1, 4);
                int maxMergeCount = TestUtil.NextInt(r, maxThreadCount, maxThreadCount + 4);
                ConcurrentMergeScheduler cms = new ConcurrentMergeScheduler();
                cms.SetMaxMergesAndThreads(maxMergeCount, maxThreadCount);
                c.SetMergeScheduler(cms);
            }
            if (r.NextBoolean())
            {
                if (Rarely(r))
                {
                    // crazy value
                    c.SetMaxBufferedDocs(TestUtil.NextInt(r, 2, 15));
                }
                else
                {
                    // reasonable value
                    c.SetMaxBufferedDocs(TestUtil.NextInt(r, 16, 1000));
                }
            }
            if (r.NextBoolean())
            {
                if (Rarely(r))
                {
                    // crazy value
                    c.SetTermIndexInterval(r.NextBoolean() ? TestUtil.NextInt(r, 1, 31) : TestUtil.NextInt(r, 129, 1000));
                }
                else
                {
                    // reasonable value
                    c.SetTermIndexInterval(TestUtil.NextInt(r, 32, 128));
                }
            }
            if (r.NextBoolean())
            {
                int maxNumThreadStates = Rarely(r) ? TestUtil.NextInt(r, 5, 20) : TestUtil.NextInt(r, 1, 4); // reasonable value -  crazy value

                if (Rarely(r))
                {
                    // SetIndexerThreadPool is internal, so reach it via
                    // reflection to install a random thread pool:
                    MethodInfo setIndexerThreadPoolMethod = typeof(IndexWriterConfig).GetMethod("SetIndexerThreadPool", new Type[] { typeof(DocumentsWriterPerThreadPool) });
                    //setIndexerThreadPoolMethod.setAccessible(true);
                    Type clazz = typeof(RandomDocumentsWriterPerThreadPool);
                    ConstructorInfo ctor = clazz.GetConstructor(new[] { typeof(int), typeof(Random) });
                    //ctor.Accessible = true;
                    // random thread pool
                    setIndexerThreadPoolMethod.Invoke(c, new[] { ctor.Invoke(new object[] { maxNumThreadStates, r }) });
                }
                else
                {
                    // random thread pool
                    c.SetMaxThreadStates(maxNumThreadStates);
                }
            }

            c.SetMergePolicy(NewMergePolicy(r));

            if (Rarely(r))
            {
                c.SetMergedSegmentWarmer(new SimpleMergedSegmentWarmer(c.InfoStream));
            }
            c.SetUseCompoundFile(r.NextBoolean());
            c.SetReaderPooling(r.NextBoolean());
            c.SetReaderTermsIndexDivisor(TestUtil.NextInt(r, 1, 4));
            c.SetCheckIntegrityAtMerge(r.NextBoolean());
            return c;
        }
Example #56
0
 /// <summary>
 /// Builds an analyzer with the given stop words.
 /// </summary>
 /// <param name="matchVersion"> lucene compatibility version </param>
 /// <param name="stopwords"> a stopword set </param>
 public NorwegianAnalyzer(LuceneVersion matchVersion, CharArraySet stopwords)
     : this(matchVersion, stopwords, CharArraySet.EMPTY_SET)
 {
     // Delegates with an empty stem exclusion set (no terms excluded).
 }
 /// <summary>
 /// Builds an analyzer with the given stop words.
 /// </summary>
 /// <param name="matchVersion"> lucene compatibility version </param>
 /// <param name="stopwords"> a stopword set </param>
 public DanishAnalyzer(LuceneVersion matchVersion, CharArraySet stopwords)
     : this(matchVersion, stopwords, CharArraySet.EMPTY_SET)
 {
     // Delegates with an empty stem exclusion set (no terms excluded).
 }
Example #58
0
 /// <summary>
 /// Test-support constructor; forwards straight to the base tokenizer
 /// constructor with no extra behavior (presumably a
 /// <c>LetterTokenizer</c> subclass used only by tests — confirm against
 /// the enclosing test class).
 /// </summary>
 public LetterTokenizerAnonymousInnerClassHelper(LuceneVersion TEST_VERSION_CURRENT, TextReader reader)
     : base(TEST_VERSION_CURRENT, reader)
 {
 }
 /// <summary>
 /// Returns the <see cref="CharacterUtils"/> implementation appropriate for
 /// the given <see cref="LuceneVersion"/>.
 /// </summary>
 /// <param name="matchVersion">
 ///          a version instance </param>
 /// <returns> the version-matched <see cref="CharacterUtils"/> singleton </returns>
 public static CharacterUtils GetInstance(LuceneVersion matchVersion)
 {
     // 3.1+ gets the full (Java 5 semantics) implementation; older
     // versions keep the legacy (Java 4 semantics) one for back-compat.
     if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_31))
     {
         return JAVA_5;
     }
     return JAVA_4;
 }
Example #60
0
 /// <summary>
 /// Create a new instance, loading from a previously built
 /// directory, if it exists.
 /// </summary>
 /// <param name="matchVersion"> Lucene version to match </param>
 /// <param name="dir"> directory holding the previously built suggester index, if any </param>
 /// <param name="indexAnalyzer"> analyzer forwarded to the base suggester for indexing </param>
 /// <param name="queryAnalyzer"> analyzer forwarded to the base suggester for lookups </param>
 /// <param name="minPrefixChars"> forwarded to the base suggester unchanged </param>
 /// <param name="blenderType"> Type of blending strategy, see BlenderType for more precisions </param>
 /// <param name="numFactor"> Factor to multiply the number of searched elements before ponderate </param>
 /// <exception cref="IOException"> If there are problems opening the underlying Lucene index. </exception>
 public BlendedInfixSuggester(LuceneVersion matchVersion, Directory dir, Analyzer indexAnalyzer, Analyzer queryAnalyzer, int minPrefixChars, BlenderType blenderType, int numFactor)
     : base(matchVersion, dir, indexAnalyzer, queryAnalyzer, minPrefixChars)
 {
     // Independent field initializations; order is irrelevant.
     this.numFactor = numFactor;
     this.blenderType = blenderType;
 }