/// <summary>
        /// Creates the URL/email scanner implementation that corresponds to
        /// <paramref name="matchVersion"/>. Thresholds are probed from newest to oldest.
        /// </summary>
        /// <param name="matchVersion"> compatibility version used to pick the scanner </param>
        /// <returns> a scanner reading from <c>m_input</c> </returns>
        private IStandardTokenizerInterface GetScannerFor(LuceneVersion matchVersion)
        {
            // best effort NPE if you dont call reset
#pragma warning disable 612, 618
            if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_47))
            {
                return new UAX29URLEmailTokenizerImpl(m_input);
            }

            if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_40))
            {
                return new UAX29URLEmailTokenizerImpl40(m_input);
            }

            if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_36))
            {
                return new UAX29URLEmailTokenizerImpl36(m_input);
            }

            if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_34))
            {
                return new UAX29URLEmailTokenizerImpl34(m_input);
            }

            // Everything older than 3.4 falls back to the 3.1 scanner.
            return new UAX29URLEmailTokenizerImpl31(m_input);
#pragma warning restore 612, 618
        }
Example #2
0
        /// <summary>
        /// Builds the analysis chain: a <see cref="StandardTokenizer"/> wrapped in a
        /// <see cref="StandardFilter"/>, then (depending on version and stemmer name) an
        /// <see cref="EnglishPossessiveFilter"/>, a lowercasing filter, an optional
        /// <see cref="StopFilter"/>, and finally a <see cref="SnowballFilter"/>.
        /// </summary>
        protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
        {
            Tokenizer source = new StandardTokenizer(matchVersion, reader);
            TokenStream chain = new StandardFilter(matchVersion, source);

            bool atLeast31 = matchVersion.OnOrAfter(LuceneVersion.LUCENE_31);

            // English stemmers expect the possessive 's to be stripped beforehand.
            if (atLeast31
                && (name.Equals("English", StringComparison.Ordinal)
                    || name.Equals("Porter", StringComparison.Ordinal)
                    || name.Equals("Lovins", StringComparison.Ordinal)))
            {
                chain = new EnglishPossessiveFilter(chain);
            }

            // The Turkish stemmer expects Turkish-specific lowercasing.
            bool turkishLowercase = atLeast31 && name.Equals("Turkish", StringComparison.Ordinal);
            if (turkishLowercase)
            {
                chain = new TurkishLowerCaseFilter(chain);
            }
            else
            {
                chain = new LowerCaseFilter(matchVersion, chain);
            }

            // Stop word removal is skipped when no stop set was configured.
            if (stopSet != null)
            {
                chain = new StopFilter(matchVersion, chain, stopSet);
            }

            chain = new SnowballFilter(chain, name);
            return new TokenStreamComponents(source, chain);
        }
Example #3
0
        /// <summary>
        /// Creates an <see cref="NGramTokenFilter"/> producing n-grams whose sizes lie
        /// between <paramref name="minGram"/> and <paramref name="maxGram"/>. </summary>
        /// <param name="version"> Lucene version; selects the character utilities and how
        ///                position attributes are obtained. </param>
        /// <param name="input"> <see cref="TokenStream"/> holding the input to be tokenized </param>
        /// <param name="minGram"> the smallest n-gram to generate; must be at least 1 </param>
        /// <param name="maxGram"> the largest n-gram to generate; must not be below <paramref name="minGram"/> </param>
        /// <exception cref="ArgumentException"> if the gram sizes are invalid </exception>
        public NGramTokenFilter(LuceneVersion version, TokenStream input, int minGram, int maxGram)
            : base(new CodepointCountFilter(version, input, minGram, int.MaxValue))
        {
            this.version = version;

#pragma warning disable 612, 618
            bool isLucene44OrLater = version.OnOrAfter(LuceneVersion.LUCENE_44);
#pragma warning restore 612, 618

            if (isLucene44OrLater)
            {
                this.charUtils = CharacterUtils.GetInstance(version);
            }
            else
            {
                this.charUtils = CharacterUtils.GetJava4Instance(version);
            }

            if (minGram < 1)
            {
                throw new ArgumentException("minGram must be greater than zero");
            }
            if (minGram > maxGram)
            {
                throw new ArgumentException("minGram must not be greater than maxGram");
            }

            this.minGram = minGram;
            this.maxGram = maxGram;

            if (isLucene44OrLater)
            {
                // Use the attributes registered on this stream.
                posIncAtt = AddAttribute<IPositionIncrementAttribute>();
                posLenAtt = AddAttribute<IPositionLengthAttribute>();
            }
            else
            {
                // Older versions use standalone helper attribute instances instead.
                posIncAtt = new PositionIncrementAttributeAnonymousInnerClassHelper();
                posLenAtt = new PositionLengthAttributeAnonymousInnerClassHelper();
            }

            termAtt = AddAttribute<ICharTermAttribute>();
            offsetAtt = AddAttribute<IOffsetAttribute>();
        }
Example #4
0
        /// <summary>
        /// Selects the scanner implementation matching <paramref name="matchVersion"/>
        /// and registers the term, position increment, offset and type attributes.
        /// </summary>
        /// <param name="matchVersion"> compatibility version used to pick the scanner </param>
        private void Init(LuceneVersion matchVersion)
        {
            // Thresholds are checked from newest to oldest; anything older than 3.1
            // gets the classic scanner.
#pragma warning disable 612, 618
            if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_47))
            {
                scanner = new StandardTokenizerImpl(m_input);
            }
            else if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_40))
            {
                scanner = new StandardTokenizerImpl40(m_input);
            }
            else if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_34))
            {
                scanner = new StandardTokenizerImpl34(m_input);
            }
            else if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_31))
            {
                scanner = new StandardTokenizerImpl31(m_input);
            }
            else
            {
                scanner = new ClassicTokenizerImpl(m_input);
            }
#pragma warning restore 612, 618

            termAtt = AddAttribute<ICharTermAttribute>();
            posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
            offsetAtt = AddAttribute<IOffsetAttribute>();
            typeAtt = AddAttribute<ITypeAttribute>();
        }
Example #5
0
        /// <summary>
        /// Validates the arguments, registers the attributes and allocates the buffers
        /// used for n-gram generation.
        /// </summary>
        /// <param name="version"> Lucene version; must be 4.4 or later </param>
        /// <param name="minGram"> the smallest n-gram to generate; must be at least 1 </param>
        /// <param name="maxGram"> the largest n-gram to generate; must not be below <paramref name="minGram"/> </param>
        /// <param name="edgesOnly"> stored as-is in the <c>edgesOnly</c> field </param>
        /// <exception cref="ArgumentException"> if the version is older than 4.4 or the gram sizes are invalid </exception>
        private void Init(LuceneVersion version, int minGram, int maxGram, bool edgesOnly)
        {
#pragma warning disable 612, 618
            if (!version.OnOrAfter(LuceneVersion.LUCENE_44))
#pragma warning restore 612, 618
            {
                throw new ArgumentException("This class only works with Lucene 4.4+. To emulate the old (broken) behavior of NGramTokenizer, use Lucene43NGramTokenizer/Lucene43EdgeNGramTokenizer");
            }

            // The guard above guarantees version >= 4.4, so the former Java-4 fallback
            // branch of the ternary could never be taken and has been removed.
            charUtils = CharacterUtils.GetInstance(version);

            if (minGram < 1)
            {
                throw new ArgumentException("minGram must be greater than zero");
            }
            if (minGram > maxGram)
            {
                throw new ArgumentException("minGram must not be greater than maxGram");
            }

            termAtt        = AddAttribute <ICharTermAttribute>();
            posIncAtt      = AddAttribute <IPositionIncrementAttribute>();
            posLenAtt      = AddAttribute <IPositionLengthAttribute>();
            offsetAtt      = AddAttribute <IOffsetAttribute>();
            this.minGram   = minGram;
            this.maxGram   = maxGram;
            this.edgesOnly = edgesOnly;

            // 2 * maxGram in case all code points require 2 chars and + 1024 for buffering to not keep polling the Reader
            charBuffer     = CharacterUtils.NewCharacterBuffer(2 * maxGram + 1024);
            buffer         = new int[charBuffer.Buffer.Length];

            // Make the term att large enough
            termAtt.ResizeBuffer(2 * maxGram);
        }
Example #6
0
        /// <summary>
        /// Initializes a <see cref="ThaiWordFilter"/> over <paramref name="input"/>.
        /// For match versions older than 3.1 the input is first wrapped in a
        /// <see cref="LowerCaseFilter"/>.
        /// </summary>
        /// <param name="matchVersion"> Lucene compatibility version </param>
        /// <param name="input"> the stream to filter </param>
        public ThaiWordFilter(LuceneVersion matchVersion, TokenStream input)
            : base(matchVersion.OnOrAfter(LuceneVersion.LUCENE_31)
                   ? input
                   : new LowerCaseFilter(matchVersion, input))
        {
            // LUCENENET specific - DBBI_AVAILABLE removed because ICU always has a dictionary-based BreakIterator

            // Position increments are only handled from 3.1 onwards.
            handlePosIncr = matchVersion.OnOrAfter(LuceneVersion.LUCENE_31);

            termAtt = AddAttribute<ICharTermAttribute>();
            offsetAtt = AddAttribute<IOffsetAttribute>();
            posAtt = AddAttribute<IPositionIncrementAttribute>();
        }
Example #7
0
        // only if the length changed before this filter
        private bool hasIllegalOffsets = false;

        /// <summary>
        /// Initializes a <see cref="ThaiWordFilter"/> over <paramref name="input"/>.
        /// For match versions older than 3.1 the input is first wrapped in a
        /// <see cref="LowerCaseFilter"/>.
        /// </summary>
        /// <param name="matchVersion"> Lucene compatibility version </param>
        /// <param name="input"> the stream to filter </param>
        /// <exception cref="System.NotSupportedException"> if <c>DBBI_AVAILABLE</c> is false </exception>
        public ThaiWordFilter(LuceneVersion matchVersion, TokenStream input)
            : base(matchVersion.OnOrAfter(LuceneVersion.LUCENE_31)
                   ? input
                   : new LowerCaseFilter(matchVersion, input))
        {
            // Refuse to construct the filter when Thai segmentation is flagged as unavailable.
            if (DBBI_AVAILABLE == false)
            {
                throw new System.NotSupportedException("This JRE does not have support for Thai segmentation");
            }

            // Position increments are only handled from 3.1 onwards.
            handlePosIncr = matchVersion.OnOrAfter(LuceneVersion.LUCENE_31);

            termAtt = AddAttribute<ICharTermAttribute>();
            offsetAtt = AddAttribute<IOffsetAttribute>();
            posAtt = AddAttribute<IPositionIncrementAttribute>();
        }
        // only if the length changed before this filter
        private bool hasIllegalOffsets = false;

        /// <summary>
        /// Builds a ThaiWordFilter for the given match version. Before 3.1 the input
        /// is lowercased through a <see cref="LowerCaseFilter"/> first.
        /// </summary>
        /// <param name="matchVersion"> Lucene compatibility version </param>
        /// <param name="input"> the stream to filter </param>
        /// <exception cref="System.NotSupportedException"> if <c>DBBI_AVAILABLE</c> is false </exception>
        public ThaiWordFilter(LuceneVersion matchVersion, TokenStream input)
            : base(
                matchVersion.OnOrAfter(LuceneVersion.LUCENE_31)
                    ? input
                    : new LowerCaseFilter(matchVersion, input))
        {
            bool segmentationAvailable = DBBI_AVAILABLE;
            if (!segmentationAvailable)
            {
                throw new System.NotSupportedException("This JRE does not have support for Thai segmentation");
            }

            handlePosIncr = matchVersion.OnOrAfter(LuceneVersion.LUCENE_31);

            // Register the attributes this filter works with.
            termAtt   = AddAttribute <ICharTermAttribute>();
            offsetAtt = AddAttribute <IOffsetAttribute>();
            posAtt    = AddAttribute <IPositionIncrementAttribute>();
        }
        /// <summary>
        /// Builds the analysis chain. From 4.8 on a single <see cref="HMMChineseTokenizer"/>
        /// is used; earlier versions use a <see cref="SentenceTokenizer"/> followed by a
        /// <see cref="WordTokenFilter"/>. The result is then passed through a
        /// <see cref="PorterStemFilter"/> and, when stop words are configured, a
        /// <see cref="StopFilter"/>.
        /// </summary>
        protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
        {
            Tokenizer   tokenizer;
            TokenStream result;

            if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_48))
            {
                tokenizer = new HMMChineseTokenizer(reader);
                result    = tokenizer;
            }
            else
            {
#pragma warning disable 612, 618
                tokenizer = new SentenceTokenizer(reader);
                result    = new WordTokenFilter(tokenizer);
#pragma warning restore 612, 618
            }
            // result = new LowerCaseFilter(result);
            // LowerCaseFilter is not needed, as SegTokenFilter lowercases Basic Latin text.
            // The porter stemming is too strict, this is not a bug, this is a feature:)
            result = new PorterStemFilter(result);

            // Use the collection's Count instead of LINQ Any(): no enumerator is
            // created just to test for emptiness, and no System.Linq is required.
            if (stopWords.Count > 0)
            {
                result = new StopFilter(matchVersion, result, stopWords);
            }
            return(new TokenStreamComponents(tokenizer, result));
        }
Example #10
0
        /// <summary>
        /// Returns the Java 1.4 compatible <see cref="CharacterUtils"/> instance:
        /// <c>JAVA_4</c> for 3.1 and later, otherwise <c>JAVA_4_BW_COMPAT</c>.
        /// </summary>
        /// <param name="matchVersion"> Lucene compatibility version </param>
        public static CharacterUtils GetJava4Instance(LuceneVersion matchVersion) // LUCENENET specific - added matchVersion parameter so we can support backward compatible Unicode support
        {
#pragma warning disable 612, 618
            if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_31))
            {
                return JAVA_4;
            }
#pragma warning restore 612, 618

            return JAVA_4_BW_COMPAT;
        }
Example #11
0
        /// <summary>
        /// Creates an analyzer that uses the default stop words for the given version:
        /// <c>DEFAULT_STOP_SET</c> from 3.1 on, <c>DEFAULT_STOP_SET_30</c> before that.
        /// </summary>
        /// <param name="matchVersion"> Lucene compatibility version </param>
        public FrenchAnalyzer(LuceneVersion matchVersion)
#pragma warning disable 612, 618
            : this(
                matchVersion,
                matchVersion.OnOrAfter(LuceneVersion.LUCENE_31)
                    ? DefaultSetHolder.DEFAULT_STOP_SET
                    : DefaultSetHolder.DEFAULT_STOP_SET_30)
#pragma warning restore 612, 618
        {
            // All work is delegated to the two-argument constructor.
        }
Example #12
0
        /// <summary>
        /// Creates an <see cref="EdgeNGramTokenFilter"/> that generates n-grams with sizes
        /// between <paramref name="minGram"/> and <paramref name="maxGram"/>.
        /// </summary>
        /// <param name="version"> Lucene compatibility version </param>
        /// <param name="input"> <see cref="TokenStream"/> holding the input to be tokenized </param>
        /// <param name="side"> the side from which to chop off an n-gram; <c>Side.BACK</c> is rejected from 4.4 on </param>
        /// <param name="minGram"> the smallest n-gram to generate; must be at least 1 </param>
        /// <param name="maxGram"> the largest n-gram to generate; must not be below <paramref name="minGram"/> </param>
        /// <exception cref="System.ArgumentException"> if any argument is invalid </exception>
        public EdgeNGramTokenFilter(LuceneVersion version, TokenStream input, Side side, int minGram, int maxGram) : base(input)
        {
            // NOTE(review): LuceneVersion and Side appear to be enums here (value types),
            // so the former "== null" comparisons could never be true. The version check
            // was dropped and the side check now validates the enum value instead.

            if (version.OnOrAfter(LuceneVersion.LUCENE_44) && side == Side.BACK)
            {
                throw new System.ArgumentException("Side.BACK is not supported anymore as of Lucene 4.4, use ReverseStringFilter up-front and afterward");
            }

            if (!System.Enum.IsDefined(typeof(Side), side))
            {
                throw new System.ArgumentException("sideLabel must be either front or back");
            }

            if (minGram < 1)
            {
                throw new System.ArgumentException("minGram must be greater than zero");
            }

            if (minGram > maxGram)
            {
                throw new System.ArgumentException("minGram must not be greater than maxGram");
            }

            this.version   = version;
            // Fixed Java-style member names (onOrAfter/getInstance) left over from the port.
            this.charUtils = version.OnOrAfter(LuceneVersion.LUCENE_44) ? CharacterUtils.GetInstance(version) : CharacterUtils.Java4Instance;
            this.minGram   = minGram;
            this.maxGram   = maxGram;
            this.side      = side;
        }
Example #13
0
        /// <summary>
        /// Picks the <see cref="CharacterUtils"/> implementation for the given
        /// <see cref="LuceneVersion"/>: <c>JAVA_5</c> for 3.1 and later, otherwise <c>JAVA_4</c>.
        /// </summary>
        /// <param name="matchVersion"> a version instance </param>
        /// <returns> the <see cref="CharacterUtils"/> implementation selected for
        ///         <paramref name="matchVersion"/>. </returns>
        public static CharacterUtils GetInstance(LuceneVersion matchVersion)
        {
#pragma warning disable 612, 618
            if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_31))
            {
                return JAVA_5;
            }
#pragma warning restore 612, 618

            return JAVA_4;
        }
Example #14
0
 /// <summary>
 /// Builds the token stream components for the text supplied by <paramref name="aReader"/>.
 /// </summary>
 /// <returns> Components built from a <see cref="StandardTokenizer"/> filtered with
 ///   <see cref="StandardFilter"/> and <see cref="StopFilter"/>, plus
 ///   <see cref="SetKeywordMarkerFilter"/> if the stem exclusion set is not empty.
 ///   From 3.1 on the chain also contains <see cref="LowerCaseFilter"/>, an optional
 ///   <see cref="StemmerOverrideFilter"/> and a <see cref="SnowballFilter"/>; older
 ///   versions end with a <see cref="DutchStemFilter"/> instead. </returns>
 public override TokenStreamComponents CreateComponents(string fieldName, TextReader aReader)
 {
     bool useSnowball = matchVersion.OnOrAfter(LuceneVersion.LUCENE_31);

     // Both variants start with the same tokenizer and standard filter.
     Tokenizer tokenizer = new StandardTokenizer(matchVersion, aReader);
     TokenStream chain = new StandardFilter(matchVersion, tokenizer);

     // Lowercasing is only part of the 3.1+ chain.
     if (useSnowball)
     {
         chain = new LowerCaseFilter(matchVersion, chain);
     }

     chain = new StopFilter(matchVersion, chain, stoptable);

     if (excltable.Count > 0)
     {
         chain = new SetKeywordMarkerFilter(chain, excltable);
     }

     if (useSnowball)
     {
         if (stemdict != null)
         {
             chain = new StemmerOverrideFilter(chain, stemdict);
         }
         chain = new SnowballFilter(chain, new Tartarus.Snowball.Ext.DutchStemmer());
     }
     else
     {
         chain = new DutchStemFilter(chain, origStemdict);
     }

     return new TokenStreamComponents(tokenizer, chain);
 }
        /// <summary>
        /// Creates an NGramTokenFilter producing n-grams whose sizes lie between
        /// <paramref name="minGram"/> and <paramref name="maxGram"/>. </summary>
        /// <param name="version"> Lucene version; selects the character utilities and how
        ///                position attributes are obtained. </param>
        /// <param name="input"> <see cref="TokenStream"/> holding the input to be tokenized </param>
        /// <param name="minGram"> the smallest n-gram to generate; must be at least 1 </param>
        /// <param name="maxGram"> the largest n-gram to generate; must not be below <paramref name="minGram"/> </param>
        /// <exception cref="System.ArgumentException"> if the gram sizes are invalid </exception>
        public NGramTokenFilter(LuceneVersion version, TokenStream input, int minGram, int maxGram)
            : base(new CodepointCountFilter(version, input, minGram, int.MaxValue))
        {
            this.version = version;

#pragma warning disable 612, 618
            bool isLucene44OrLater = version.OnOrAfter(LuceneVersion.LUCENE_44);
#pragma warning restore 612, 618

            if (isLucene44OrLater)
            {
                this.charUtils = CharacterUtils.GetInstance(version);
            }
            else
            {
                this.charUtils = CharacterUtils.Java4Instance;
            }

            if (minGram < 1)
            {
                throw new System.ArgumentException("minGram must be greater than zero");
            }
            if (minGram > maxGram)
            {
                throw new System.ArgumentException("minGram must not be greater than maxGram");
            }

            this.minGram = minGram;
            this.maxGram = maxGram;

            if (isLucene44OrLater)
            {
                // Use the attributes registered on this stream.
                posIncAtt = AddAttribute <IPositionIncrementAttribute>();
                posLenAtt = AddAttribute <IPositionLengthAttribute>();
            }
            else
            {
                // Older versions use helper attribute instances bound to this filter.
                posIncAtt = new PositionIncrementAttributeAnonymousInnerClassHelper(this);
                posLenAtt = new PositionLengthAttributeAnonymousInnerClassHelper(this);
            }

            termAtt   = AddAttribute <ICharTermAttribute>();
            offsetAtt = AddAttribute <IOffsetAttribute>();
        }
Example #16
0
        /// <summary>
        /// Creates a <see cref="DutchAnalyzer"/> with the given stop words, stem exclusion
        /// table and stem override dictionary.
        /// </summary>
        /// <param name="matchVersion"> Lucene compatibility version </param>
        /// <param name="stopwords"> stop words; stored as an unmodifiable copy </param>
        /// <param name="stemExclusionTable"> terms excluded from stemming; stored as an unmodifiable copy </param>
        /// <param name="stemOverrideDict"> stem overrides. When it is empty or the version is
        ///     older than 3.1 it is kept as an unmodifiable copy in <c>origStemdict</c>;
        ///     otherwise it is compiled into <c>stemdict</c> through a
        ///     <see cref="StemmerOverrideFilter.Builder"/>. </param>
        public DutchAnalyzer(LuceneVersion matchVersion, CharArraySet stopwords, CharArraySet stemExclusionTable, CharArrayMap <string> stemOverrideDict)
        {
            this.matchVersion = matchVersion;
            this.stoptable    = CharArraySet.UnmodifiableSet(CharArraySet.Copy(matchVersion, stopwords));
            this.excltable    = CharArraySet.UnmodifiableSet(CharArraySet.Copy(matchVersion, stemExclusionTable));
#pragma warning disable 612, 618
            if (stemOverrideDict.Count == 0 || !matchVersion.OnOrAfter(LuceneVersion.LUCENE_31))
#pragma warning restore 612, 618
            {
                this.stemdict     = null;
                this.origStemdict = CharArrayMap.UnmodifiableMap(CharArrayMap.Copy(matchVersion, stemOverrideDict));
            }
            else
            {
                this.origStemdict = null;
                // we don't need to ignore case here since we lowercase in this analyzer anyway
                StemmerOverrideFilter.Builder        builder = new StemmerOverrideFilter.Builder(false);
                CharArrayMap <string> .EntryIterator iter    = (CharArrayMap <string> .EntryIterator)stemOverrideDict.EntrySet().GetEnumerator();
                while (iter.HasNext)
                {
                    char[] nextKey = iter.NextKey();
                    // Build the key directly from the key array. The previous code copied
                    // the key into a reused CharsRef and then stringified its whole backing
                    // array, ignoring the copied length; if that array is ever longer than
                    // the current key, stale trailing characters would leak into the key.
                    builder.Add(new string(nextKey), iter.CurrentValue);
                }
                try
                {
                    this.stemdict = builder.Build();
                }
                catch (IOException ex)
                {
                    throw new Exception("can not build stem dict", ex);
                }
            }
        }
Example #17
0
 /// <summary>
 /// Creates a DutchAnalyzer with the given stop words and stem exclusion table.
 /// The default stem dictionary is supplied only for versions 3.6 and later;
 /// older versions get an empty map.
 /// </summary>
 public DutchAnalyzer(LuceneVersion matchVersion, CharArraySet stopwords, CharArraySet stemExclusionTable)
     : this(
         matchVersion,
         stopwords,
         stemExclusionTable,
         matchVersion.OnOrAfter(LuceneVersion.LUCENE_36)
             ? DefaultSetHolder.DEFAULT_STEM_DICT
             : CharArrayMap <string> .EmptyMap())
 {
     // Historically this constructor never used the stem dictionary,
     // so it is populated only for >= 3.6.
 }
 /// <summary>
 /// Rejects <c>enablePositionIncrements=false</c> for versions 4.4 and later.
 /// </summary>
 /// <exception cref="System.ArgumentException"> if position increments are disabled on 4.4+ </exception>
 private static void CheckPositionIncrement(LuceneVersion version, bool enablePositionIncrements)
 {
     // Enabled position increments are always acceptable.
     if (enablePositionIncrements)
     {
         return;
     }

     if (version.OnOrAfter(LuceneVersion.LUCENE_44))
     {
         throw new System.ArgumentException("enablePositionIncrements=false is not supported anymore as of Lucene 4.4 as it can create broken token streams");
     }
 }
Example #19
0
        /// <summary>
        /// Creates an analyzer with the default stop words for the given version:
        /// <c>DefaultSetHolder.DEFAULT_STOP_SET</c> from 3.6 on, otherwise
        /// <c>StopAnalyzer.ENGLISH_STOP_WORDS_SET</c>.
        /// </summary>
        /// <param name="matchVersion"> lucene compatibility version </param>
        public ThaiAnalyzer(LuceneVersion matchVersion)
#pragma warning disable 612, 618
            : this(
                matchVersion,
                matchVersion.OnOrAfter(LuceneVersion.LUCENE_36)
                    ? DefaultSetHolder.DEFAULT_STOP_SET
                    : StopAnalyzer.ENGLISH_STOP_WORDS_SET)
#pragma warning restore 612, 618
        {
            // All work is delegated to the two-argument constructor.
        }
Example #20
0
        /// <summary>
        /// Creates a DutchAnalyzer with the given stop words and an empty stem exclusion set.
        /// The default stem dictionary is supplied only for versions 3.6 and later;
        /// older versions get an empty map.
        /// </summary>
        public DutchAnalyzer(LuceneVersion matchVersion, CharArraySet stopwords)
#pragma warning disable 612, 618
            : this(
                matchVersion,
                stopwords,
                CharArraySet.EMPTY_SET,
                matchVersion.OnOrAfter(LuceneVersion.LUCENE_36)
                    ? DefaultSetHolder.DEFAULT_STEM_DICT
                    : CharArrayMap <string> .EmptyMap())
#pragma warning restore 612, 618
        {
            // Historically this constructor never used the stem dictionary,
            // so it is populated only for >= 3.6.
        }
Example #21
0
        /// <summary>
        /// Rejects <c>enablePositionIncrements=false</c> for versions 4.4 and later.
        /// </summary>
        /// <exception cref="ArgumentException"> if position increments are disabled on 4.4+ </exception>
        private static void CheckPositionIncrement(LuceneVersion version, bool enablePositionIncrements)
        {
            // Enabled position increments are always acceptable.
            if (enablePositionIncrements)
            {
                return;
            }

#pragma warning disable 612, 618
            bool isLucene44OrLater = version.OnOrAfter(LuceneVersion.LUCENE_44);
#pragma warning restore 612, 618

            if (isLucene44OrLater)
            {
                throw new ArgumentException("enablePositionIncrements=false is not supported anymore as of Lucene 4.4 as it can create broken token streams");
            }
        }
Example #22
0
 /// <summary>
 /// Advances to the next token. From 3.1 on this simply forwards to the wrapped
 /// input; older versions go through <c>IncrementTokenClassic</c>.
 /// </summary>
 public override bool IncrementToken()
 {
     // TODO: add some niceties for the new grammar
     return matchVersion.OnOrAfter(LuceneVersion.LUCENE_31)
         ? input.IncrementToken()
         : IncrementTokenClassic();
 }
Example #23
0
 /// <summary>
 /// Creates a TrimFilter over <paramref name="in"/>.
 /// </summary>
 /// <param name="version"> Lucene compatibility version </param>
 /// <param name="in"> the stream to filter </param>
 /// <param name="updateOffsets"> stored in the <c>updateOffsets</c> field; must be false on 4.4+ </param>
 /// <exception cref="System.ArgumentException"> if <paramref name="updateOffsets"/> is true on 4.4+ </exception>
 public TrimFilter(LuceneVersion version, TokenStream @in, bool updateOffsets)
     : base(@in)
 {
     if (updateOffsets)
     {
         // Offset updating was dropped in 4.4; only older versions may request it.
         if (version.OnOrAfter(LuceneVersion.LUCENE_44))
         {
             throw new System.ArgumentException("updateOffsets=true is not supported anymore as of Lucene 4.4");
         }
     }

     termAtt = AddAttribute<ICharTermAttribute>();
     offsetAtt = AddAttribute<IOffsetAttribute>();
     this.updateOffsets = updateOffsets;
 }
Example #24
0
 /// <summary>
 /// Creates a TrimFilter over <paramref name="in"/>.
 /// </summary>
 /// <param name="version"> Lucene compatibility version </param>
 /// <param name="in"> the stream to filter </param>
 /// <param name="updateOffsets"> stored in the <c>updateOffsets</c> field; must be false on 4.4+ </param>
 /// <exception cref="System.ArgumentException"> if <paramref name="updateOffsets"/> is true on 4.4+ </exception>
 public TrimFilter(LuceneVersion version, TokenStream @in, bool updateOffsets)
     : base(@in)
 {
     bool offsetsRequestedOnNewVersion =
         updateOffsets && version.OnOrAfter(LuceneVersion.LUCENE_44);

     if (offsetsRequestedOnNewVersion)
     {
         throw new System.ArgumentException("updateOffsets=true is not supported anymore as of Lucene 4.4");
     }

     termAtt            = AddAttribute <ICharTermAttribute>();
     offsetAtt          = AddAttribute <IOffsetAttribute>();
     this.updateOffsets = updateOffsets;
 }
Example #25
0
        /// <summary>
        /// Advances to the next token. From 3.1 on this simply forwards to the wrapped
        /// input; older versions go through <c>IncrementTokenClassic</c>.
        /// </summary>
        public override sealed bool IncrementToken()
        {
#pragma warning disable 612, 618
            bool useNewGrammar = matchVersion.OnOrAfter(LuceneVersion.LUCENE_31);
#pragma warning restore 612, 618

            // TODO: add some niceties for the new grammar
            return useNewGrammar ? input.IncrementToken() : IncrementTokenClassic();
        }
Example #26
0
 /// <summary>
 /// Validates the arguments and allocates the buffers used for n-gram generation.
 /// </summary>
 /// <param name="version"> Lucene version; must be 4.4 or later </param>
 /// <param name="minGram"> the smallest n-gram to generate; must be at least 1 </param>
 /// <param name="maxGram"> the largest n-gram to generate; must not be below <paramref name="minGram"/> </param>
 /// <param name="edgesOnly"> stored as-is in the <c>edgesOnly</c> field </param>
 /// <exception cref="System.ArgumentException"> if the version is older than 4.4 or the gram sizes are invalid </exception>
 private void init(LuceneVersion version, int minGram, int maxGram, bool edgesOnly)
 {
     if (!version.OnOrAfter(LuceneVersion.LUCENE_44))
     {
         throw new System.ArgumentException("This class only works with Lucene 4.4+. To emulate the old (broken) behavior of NGramTokenizer, use Lucene43NGramTokenizer/Lucene43EdgeNGramTokenizer");
     }

     // The guard above guarantees version >= 4.4, so the former Java-4 fallback
     // branch of the ternary could never be taken and has been removed.
     charUtils = CharacterUtils.GetInstance(version);

     if (minGram < 1)
     {
         throw new System.ArgumentException("minGram must be greater than zero");
     }
     if (minGram > maxGram)
     {
         throw new System.ArgumentException("minGram must not be greater than maxGram");
     }
     this.minGram   = minGram;
     this.maxGram   = maxGram;
     this.edgesOnly = edgesOnly;

     // 2 * maxGram in case all code points require 2 chars and + 1024 for buffering to not keep polling the Reader
     charBuffer     = CharacterUtils.NewCharacterBuffer(2 * maxGram + 1024);
     buffer         = new int[charBuffer.Buffer.Length];

     // Make the term att large enough
     termAtt.ResizeBuffer(2 * maxGram);
 }
Example #27
0
        /// <summary>
        /// Sets up the query parser: stores the analyzer and default field, and derives
        /// <c>AutoGeneratePhraseQueries</c> from the match version (true only before 3.1).
        /// </summary>
        /// <param name="matchVersion">Lucene version to match.</param>
        /// <param name="f">the default field for query terms.</param>
        /// <param name="a">used to find terms in the query text.</param>
        public virtual void Init(LuceneVersion matchVersion, string f, Analyzer a)
        {
            Analyzer = a;
            m_field = f;

#pragma warning disable 612, 618
            bool isLucene31OrLater = matchVersion.OnOrAfter(LuceneVersion.LUCENE_31);
#pragma warning restore 612, 618

            // Phrase queries are auto-generated only for versions older than 3.1.
            AutoGeneratePhraseQueries = !isLucene31OrLater;
        }
 /// <summary>
 /// Creates the URL/email scanner implementation that corresponds to
 /// <paramref name="matchVersion"/>. Thresholds are probed from newest to oldest.
 /// </summary>
 /// <param name="matchVersion"> compatibility version used to pick the scanner </param>
 /// <returns> a scanner reading from <c>input</c> </returns>
 private IStandardTokenizerInterface GetScannerFor(LuceneVersion matchVersion)
 {
     // best effort NPE if you dont call reset
     if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_47))
     {
         return new UAX29URLEmailTokenizerImpl(input);
     }

     if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_40))
     {
         return new UAX29URLEmailTokenizerImpl40(input);
     }

     if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_36))
     {
         return new UAX29URLEmailTokenizerImpl36(input);
     }

     if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_34))
     {
         return new UAX29URLEmailTokenizerImpl34(input);
     }

     // Everything older than 3.4 falls back to the 3.1 scanner.
     return new UAX29URLEmailTokenizerImpl31(input);
 }
        /// <summary>
        /// Advances the input and, when the term ends in an apostrophe followed by
        /// 's' or 'S', shortens the term by those two characters.
        /// </summary>
        /// <returns> false when the input is exhausted, true otherwise </returns>
        public override bool IncrementToken()
        {
            if (!input.IncrementToken())
            {
                return false;
            }

            char[] chars = termAtt.Buffer();
            int length = termAtt.Length;

            // Terms shorter than two characters cannot carry a possessive suffix.
            if (length < 2)
            {
                return true;
            }

            char apostrophe = chars[length - 2];
            char suffix = chars[length - 1];

            // U+2019 and U+FF07 are only recognized from 3.6 on.
            bool isApostrophe = apostrophe == '\''
                || (matchVersion.OnOrAfter(LuceneVersion.LUCENE_36)
                    && (apostrophe == '\u2019' || apostrophe == '\uFF07'));

            if (isApostrophe && (suffix == 's' || suffix == 'S'))
            {
                termAtt.Length = length - 2; // Strip last 2 characters off
            }

            return true;
        }
        /// <summary>
        /// Builds the components around a <see cref="KeywordTokenizer"/>. From 4.0 on
        /// the tokenizer is created with <c>factory</c> and used directly; older versions
        /// wrap a plain tokenizer in an <see cref="ICUCollationKeyFilter"/> using <c>collator</c>.
        /// </summary>
        protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
        {
#pragma warning disable 612, 618
            bool useFactory = matchVersion.OnOrAfter(LuceneVersion.LUCENE_40);
#pragma warning restore 612, 618

            if (!useFactory)
            {
                // Pre-4.0 path: the collation key filter sits on top of the tokenizer.
                KeywordTokenizer legacySource = new KeywordTokenizer(reader);
#pragma warning disable 612, 618
                return new TokenStreamComponents(legacySource, new ICUCollationKeyFilter(legacySource, collator));
#pragma warning restore 612, 618
            }

            // 4.0+ path: the tokenizer serves as both source and sink.
            KeywordTokenizer source = new KeywordTokenizer(factory, reader, KeywordTokenizer.DEFAULT_BUFFER_SIZE);
            return new TokenStreamComponents(source, source);
        }
Example #31
0
        /// <summary>
        /// Validates the arguments, stores the gram settings and registers the term,
        /// offset and position increment attributes.
        /// </summary>
        /// <param name="version"> Lucene compatibility version </param>
        /// <param name="side"> the side from which to chop off an n-gram; <c>Side.BACK</c> is rejected from 4.4 on </param>
        /// <param name="minGram"> the smallest n-gram to generate; must be at least 1 </param>
        /// <param name="maxGram"> the largest n-gram to generate; capped at 1024 before 4.4 </param>
        /// <exception cref="ArgumentException"> if any argument is invalid </exception>
        private void Init(LuceneVersion version, Side side, int minGram, int maxGram)
        {
            // No null check on version here; the original check is disabled in this port.

            bool sideIsKnown = Enum.IsDefined(typeof(Side), side);
            if (!sideIsKnown)
            {
                throw new ArgumentException("sideLabel must be either front or back");
            }

            if (minGram < 1)
            {
                throw new ArgumentException("minGram must be greater than zero");
            }

            if (minGram > maxGram)
            {
                throw new ArgumentException("minGram must not be greater than maxGram");
            }

            bool isLucene44OrLater = version.OnOrAfter(LuceneVersion.LUCENE_44);

            if (isLucene44OrLater && side == Side.BACK)
            {
                throw new ArgumentException("Side.BACK is not supported anymore as of Lucene 4.4");
            }

            if (!isLucene44OrLater)
            {
                // Versions before 4.4 cap the maximum gram size.
                maxGram = Math.Min(maxGram, 1024);
            }

            this.minGram = minGram;
            this.maxGram = maxGram;
            this.side = side;
            this.termAtt = AddAttribute<ICharTermAttribute>();
            this.offsetAtt = AddAttribute<IOffsetAttribute>();
            this.posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
        }
Example #32
0
        /// <summary>
        /// Creates a new WordDelimiterFilter
        /// </summary>
        /// <param name="matchVersion"> lucene compatibility version; must be 4.8 or later </param>
        /// <param name="in"> TokenStream to be filtered </param>
        /// <param name="charTypeTable"> table containing character types </param>
        /// <param name="configurationFlags"> Flags configuring the filter </param>
        /// <param name="protWords"> If not null is the set of tokens to protect from being delimited </param>
        /// <exception cref="System.ArgumentException"> if <paramref name="matchVersion"/> is older than 4.8 </exception>
        public WordDelimiterFilter(LuceneVersion matchVersion, TokenStream @in, sbyte[] charTypeTable, int configurationFlags, CharArraySet protWords)
            : base(@in)
        {
            // Run the one-time instance field initialization if it has not happened yet.
            if (!InstanceFieldsInitialized)
            {
                InitializeInstanceFields();
                InstanceFieldsInitialized = true;
            }

            bool isLucene48OrLater = matchVersion.OnOrAfter(LuceneVersion.LUCENE_48);
            if (!isLucene48OrLater)
            {
                throw new System.ArgumentException("This class only works with Lucene 4.8+. To emulate the old (broken) behavior of WordDelimiterFilter, use Lucene47WordDelimiterFilter");
            }

            this.flags = configurationFlags;
            this.protWords = protWords;

            // The flags must be stored before Has(...) is queried.
            bool splitOnCaseChange = Has(SPLIT_ON_CASE_CHANGE);
            bool splitOnNumerics = Has(SPLIT_ON_NUMERICS);
            bool stemEnglishPossessive = Has(STEM_ENGLISH_POSSESSIVE);
            this.iterator = new WordDelimiterIterator(charTypeTable, splitOnCaseChange, splitOnNumerics, stemEnglishPossessive);

            this.termAttribute   = AddAttribute <ICharTermAttribute>();
            this.offsetAttribute = AddAttribute <IOffsetAttribute>();
            this.posIncAttribute = AddAttribute <IPositionIncrementAttribute>();
            this.typeAttribute   = AddAttribute <ITypeAttribute>();
        }
Example #33
0
        /// <summary>
        /// Creates a new WordDelimiterFilter
        /// </summary>
        /// <param name="matchVersion"> lucene compatibility version; must be 4.8 or later </param>
        /// <param name="in"> TokenStream to be filtered </param>
        /// <param name="charTypeTable"> table containing character types </param>
        /// <param name="configurationFlags"> Flags configuring the filter </param>
        /// <param name="protWords"> If not null is the set of tokens to protect from being delimited </param>
        /// <exception cref="ArgumentException"> if <paramref name="matchVersion"/> is older than 4.8 </exception>
        public WordDelimiterFilter(LuceneVersion matchVersion, TokenStream @in, byte[] charTypeTable, WordDelimiterFlags configurationFlags, CharArraySet protWords)
            : base(@in)
        {
            // Attributes and helper objects are set up before the version check.
            this.termAttribute = AddAttribute<ICharTermAttribute>();
            this.offsetAttribute = AddAttribute<IOffsetAttribute>();
            this.posIncAttribute = AddAttribute<IPositionIncrementAttribute>();
            this.typeAttribute = AddAttribute<ITypeAttribute>();
            concat = new WordDelimiterConcatenation(this);
            concatAll = new WordDelimiterConcatenation(this);
            sorter = new OffsetSorter(this);

            bool isLucene48OrLater = matchVersion.OnOrAfter(LuceneVersion.LUCENE_48);
            if (!isLucene48OrLater)
            {
                throw new ArgumentException("This class only works with Lucene 4.8+. To emulate the old (broken) behavior of WordDelimiterFilter, use Lucene47WordDelimiterFilter");
            }

            this.flags = configurationFlags;
            this.protWords = protWords;

            // The flags must be stored before Has(...) is queried.
            bool splitOnCaseChange = Has(WordDelimiterFlags.SPLIT_ON_CASE_CHANGE);
            bool splitOnNumerics = Has(WordDelimiterFlags.SPLIT_ON_NUMERICS);
            bool stemEnglishPossessive = Has(WordDelimiterFlags.STEM_ENGLISH_POSSESSIVE);
            this.iterator = new WordDelimiterIterator(charTypeTable, splitOnCaseChange, splitOnNumerics, stemEnglishPossessive);
        }
 /// <summary>
 /// Creates a DutchAnalyzer with the given stop words, stem exclusion table and
 /// stem override dictionary.
 /// </summary>
 /// <param name="matchVersion"> Lucene compatibility version </param>
 /// <param name="stopwords"> stop words; stored as an unmodifiable copy </param>
 /// <param name="stemExclusionTable"> terms excluded from stemming; stored as an unmodifiable copy </param>
 /// <param name="stemOverrideDict"> stem overrides. When it is empty or the version is
 ///     older than 3.1 it is kept as an unmodifiable copy in <c>origStemdict</c>;
 ///     otherwise it is compiled into <c>stemdict</c> through a
 ///     <see cref="StemmerOverrideFilter.Builder"/>. </param>
 public DutchAnalyzer(LuceneVersion matchVersion, CharArraySet stopwords, CharArraySet stemExclusionTable, CharArrayMap<string> stemOverrideDict)
 {
     this.matchVersion = matchVersion;
     this.stoptable = CharArraySet.UnmodifiableSet(CharArraySet.Copy(matchVersion, stopwords));
     this.excltable = CharArraySet.UnmodifiableSet(CharArraySet.Copy(matchVersion, stemExclusionTable));
     #pragma warning disable 612, 618
     if (stemOverrideDict.Count == 0 || !matchVersion.OnOrAfter(LuceneVersion.LUCENE_31))
     #pragma warning restore 612, 618
     {
         this.stemdict = null;
         this.origStemdict = CharArrayMap.UnmodifiableMap(CharArrayMap.Copy(matchVersion, stemOverrideDict));
     }
     else
     {
         this.origStemdict = null;
         // we don't need to ignore case here since we lowercase in this analyzer anyway
         StemmerOverrideFilter.Builder builder = new StemmerOverrideFilter.Builder(false);
         CharArrayMap<string>.EntryIterator iter = (CharArrayMap<string>.EntryIterator)stemOverrideDict.EntrySet().GetEnumerator();
         while (iter.HasNext)
         {
             char[] nextKey = iter.NextKey();
             // Build the key directly from the key array. The previous code copied
             // the key into a reused CharsRef and then stringified its whole backing
             // array, ignoring the copied length; if that array is ever longer than
             // the current key, stale trailing characters would leak into the key.
             builder.Add(new string(nextKey), iter.CurrentValue);
         }
         try
         {
             this.stemdict = builder.Build();
         }
         catch (IOException ex)
         {
             throw new Exception("can not build stem dict", ex);
         }
     }
 }
        /// <summary>
        /// Creates a DutchAnalyzer with the given stop words and an empty stem exclusion set.
        /// The default stem dictionary is supplied only for versions 3.6 and later;
        /// older versions get an empty map.
        /// </summary>
        public DutchAnalyzer(LuceneVersion matchVersion, CharArraySet stopwords)
#pragma warning disable 612, 618
            : this(
                matchVersion,
                stopwords,
                CharArraySet.EMPTY_SET,
                matchVersion.OnOrAfter(LuceneVersion.LUCENE_36)
                    ? DefaultSetHolder.DEFAULT_STEM_DICT
                    : CharArrayMap<string>.EmptyMap())
#pragma warning restore 612, 618
        {
            // Historically this constructor never used the stem dictionary,
            // so it is populated only for >= 3.6.
        }
 /// <summary>
 /// Rejects <c>enablePositionIncrements == false</c> for versions on or after <c>LUCENE_44</c>.
 /// </summary>
 /// <param name="version"> Lucene compatibility version </param>
 /// <param name="enablePositionIncrements"> the requested position-increment setting </param>
 /// <exception cref="System.ArgumentException"> if position increments are disabled
 ///         and <paramref name="version"/> is on or after <c>LUCENE_44</c> </exception>
 private static void CheckPositionIncrement(LuceneVersion version, bool enablePositionIncrements)
 {
     // Enabled position increments are always acceptable.
     if (enablePositionIncrements)
     {
         return;
     }

     #pragma warning disable 612, 618
     bool rejectsDisabledIncrements = version.OnOrAfter(LuceneVersion.LUCENE_44);
     #pragma warning restore 612, 618

     if (rejectsDisabledIncrements)
     {
         throw new System.ArgumentException("enablePositionIncrements=false is not supported anymore as of Lucene 4.4 as it can create broken token streams");
     }
 }
        /// <summary>
        /// Creates an <see cref="EdgeNGramTokenFilter"/> over <paramref name="input"/>.
        /// </summary>
        /// <param name="version"> Lucene compatibility version </param>
        /// <param name="input"> the token stream to wrap </param>
        /// <param name="side"> the side to generate n-grams from; <c>Side.BACK</c> is rejected on or after <c>LUCENE_44</c> </param>
        /// <param name="minGram"> smallest n-gram size; must be at least 1 </param>
        /// <param name="maxGram"> largest n-gram size; must not be smaller than <paramref name="minGram"/> </param>
        /// <exception cref="System.ArgumentException"> if any argument fails the checks described above </exception>
        public EdgeNGramTokenFilter(LuceneVersion version, TokenStream input, Side side, int minGram, int maxGram)
            : base(input)
        {
            // NOTE(review): there is no null check on 'version' here; presumably
            // LuceneVersion cannot be null in this port - confirm against its declaration.
            bool isLucene44OrLater = version.OnOrAfter(LuceneVersion.LUCENE_44);

            // Validation order matters: it decides which message the caller sees.
            if (isLucene44OrLater && Side.BACK == side)
            {
                throw new System.ArgumentException("Side.BACK is not supported anymore as of Lucene 4.4, use ReverseStringFilter up-front and afterward");
            }

            bool sideIsKnown = Enum.IsDefined(typeof(Side), side);
            if (!sideIsKnown)
            {
                throw new System.ArgumentException("sideLabel must be either front or back");
            }

            if (minGram < 1)
            {
                throw new System.ArgumentException("minGram must be greater than zero");
            }

            if (maxGram < minGram)
            {
                throw new System.ArgumentException("minGram must not be greater than maxGram");
            }

            this.version = version;
            if (isLucene44OrLater)
            {
                this.charUtils = CharacterUtils.GetInstance(version);
            }
            else
            {
                this.charUtils = CharacterUtils.Java4Instance;
            }
            this.minGram = minGram;
            this.maxGram = maxGram;
            this.side = side;

            // Register the attributes this filter works with.
            this.termAtt = AddAttribute<ICharTermAttribute>();
            this.offsetAtt = AddAttribute<IOffsetAttribute>();
            this.posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
            this.posLenAtt = AddAttribute<IPositionLengthAttribute>();
        }
        /// <summary>
        /// Creates an <see cref="EdgeNGramTokenFilter"/> over <paramref name="input"/>.
        /// </summary>
        /// <param name="version"> Lucene compatibility version </param>
        /// <param name="input"> the token stream to wrap </param>
        /// <param name="side"> the side to generate n-grams from; <c>Side.BACK</c> is rejected on or after <c>LUCENE_44</c> </param>
        /// <param name="minGram"> smallest n-gram size; must be at least 1 </param>
        /// <param name="maxGram"> largest n-gram size; must not be smaller than <paramref name="minGram"/> </param>
        /// <exception cref="System.ArgumentException"> if any argument fails the checks described above </exception>
        public EdgeNGramTokenFilter(LuceneVersion version, TokenStream input, Side side, int minGram, int maxGram)
            : base(input)
        {
            // NOTE(review): the two null checks below are kept as-is; whether they can
            // ever be true depends on how LuceneVersion and Side are declared - confirm.
            if (version == null)
            {
                throw new System.ArgumentException("version must not be null");
            }

            if (version.OnOrAfter(LuceneVersion.LUCENE_44) && side == Side.BACK)
            {
                throw new System.ArgumentException("Side.BACK is not supported anymore as of Lucene 4.4, use ReverseStringFilter up-front and afterward");
            }

            if (side == null)
            {
                throw new System.ArgumentException("sideLabel must be either front or back");
            }

            if (minGram < 1)
            {
                throw new System.ArgumentException("minGram must be greater than zero");
            }

            if (minGram > maxGram)
            {
                throw new System.ArgumentException("minGram must not be greater than maxGram");
            }

            this.version = version;
            // Use the PascalCase member names (OnOrAfter / GetInstance), matching the
            // call a few lines above; the Java-cased names do not match them.
            this.charUtils = version.OnOrAfter(LuceneVersion.LUCENE_44) ? CharacterUtils.GetInstance(version) : CharacterUtils.Java4Instance;
            this.minGram = minGram;
            this.maxGram = maxGram;
            this.side = side;
        }
 /// <summary>
 /// Selects the <see cref="CharacterUtils"/> implementation that matches the given
 /// <see cref="LuceneVersion"/>.
 /// </summary>
 /// <param name="matchVersion"> the version to select an implementation for </param>
 /// <returns> <c>JAVA_5</c> when <paramref name="matchVersion"/> is on or after
 ///         <c>LUCENE_31</c>, otherwise <c>JAVA_4</c> </returns>
 public static CharacterUtils GetInstance(LuceneVersion matchVersion)
 {
     #pragma warning disable 612, 618
     if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_31))
     #pragma warning restore 612, 618
     {
         return JAVA_5;
     }

     return JAVA_4;
 }
        /// <summary>
        /// Validates the arguments, stores the gram settings and registers the
        /// term, offset and position-increment attributes.
        /// </summary>
        /// <param name="version"> Lucene compatibility version </param>
        /// <param name="side"> the side to generate n-grams from; <c>Side.BACK</c> is rejected on or after <c>LUCENE_44</c> </param>
        /// <param name="minGram"> smallest n-gram size; must be at least 1 </param>
        /// <param name="maxGram"> largest n-gram size; capped at 1024 for versions before <c>LUCENE_44</c> </param>
        /// <exception cref="System.ArgumentException"> if any argument fails the checks described above </exception>
        private void Init(LuceneVersion version, Side side, int minGram, int maxGram)
        {
            // NOTE(review): there is no null check on 'version' here; presumably
            // LuceneVersion cannot be null in this port - confirm against its declaration.

            bool sideIsKnown = Enum.IsDefined(typeof(Side), side);
            if (!sideIsKnown)
            {
                throw new System.ArgumentException("sideLabel must be either front or back");
            }

            if (minGram < 1)
            {
                throw new System.ArgumentException("minGram must be greater than zero");
            }

            if (maxGram < minGram)
            {
                throw new System.ArgumentException("minGram must not be greater than maxGram");
            }

            if (!version.OnOrAfter(LuceneVersion.LUCENE_44))
            {
                // Older versions silently cap the maximum gram size.
                maxGram = Math.Min(maxGram, 1024);
            }
            else if (Side.BACK == side)
            {
                throw new System.ArgumentException("Side.BACK is not supported anymore as of Lucene 4.4");
            }

            this.minGram = minGram;
            this.maxGram = maxGram;
            this.side = side;

            this.termAtt = AddAttribute<ICharTermAttribute>();
            this.offsetAtt = AddAttribute<IOffsetAttribute>();
            this.posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
        }
        /// <summary>
        /// Partially reverses the given input buffer in-place from the given offset
        /// up to the given length. </summary>
        /// <remarks>
        /// For versions before <c>LUCENE_31</c> the work is delegated to
        /// <c>ReverseUnicode3</c>. Otherwise the range is reversed with a
        /// surrogate-aware swap: pairs detected by
        /// <see cref="char.IsSurrogatePair(char, char)"/> are written back in
        /// high/low order. Ranges shorter than two chars are left untouched.
        /// </remarks>
        /// <param name="matchVersion"> Lucene compatibility version; selects between
        ///        <c>ReverseUnicode3</c> and the surrogate-aware algorithm </param>
        /// <param name="buffer"> the input char array to reverse </param>
        /// <param name="start"> the offset from where to reverse the buffer </param>
        /// <param name="len"> the length in the buffer up to where the
        ///        buffer should be reversed </param>
        public static void Reverse(LuceneVersion matchVersion, char[] buffer, int start, int len)
        {
#pragma warning disable 612, 618
            if (!matchVersion.OnOrAfter(LuceneVersion.LUCENE_31))

            {
                // Pre-3.1 versions use the separate ReverseUnicode3 routine.
                ReverseUnicode3(buffer, start, len);
#pragma warning restore 612, 618
                return;
            }
            /* modified version of Apache Harmony AbstractStringBuilder reverse0() */
            if (len < 2)
            {
                // Zero or one char: nothing to reverse.
                return;
            }
            // 'end' walks backwards from the last char of the range while 'i'
            // walks forwards from 'start'; they meet at 'mid'.
            int end = (start + len) - 1;
            // frontHigh / endLow carry the chars still to be placed from the
            // front and back of the range between iterations.
            char frontHigh = buffer[start];
            char endLow = buffer[end];
            // Cleared for one iteration after half of a pair has been moved, so
            // the remaining half is not paired with its neighbour.
            bool allowFrontSur = true, allowEndSur = true;
            int mid = start + (len >> 1);
            for (int i = start; i < mid; ++i, --end)
            {
                char frontLow = buffer[i + 1];
                char endHigh = buffer[end - 1];
                bool surAtFront = allowFrontSur && char.IsSurrogatePair(frontHigh, frontLow);
                if (surAtFront && (len < 3))
                {
                    // nothing to do since surAtFront is allowed and 1 char left
                    return;
                }
                bool surAtEnd = allowEndSur && char.IsSurrogatePair(endHigh, endLow);
                allowFrontSur = allowEndSur = true;
                if (surAtFront == surAtEnd)
                {
                    if (surAtFront)
                    {
                        // both surrogates
                        // Swap two chars on each side, keeping high before low,
                        // and advance both cursors one extra position.
                        buffer[end] = frontLow;
                        buffer[--end] = frontHigh;
                        buffer[i] = endHigh;
                        buffer[++i] = endLow;
                        frontHigh = buffer[i + 1];
                        endLow = buffer[end - 1];
                    }
                    else
                    {
                        // neither surrogates
                        // Plain one-for-one swap; carry the inner neighbours forward.
                        buffer[end] = frontHigh;
                        buffer[i] = endLow;
                        frontHigh = frontLow;
                        endLow = endHigh;
                    }
                }
                else
                {
                    if (surAtFront)
                    {
                        // surrogate only at the front
                        // Place the pair's low half at the end now; frontHigh stays
                        // pending and is written on a later step.
                        buffer[end] = frontLow;
                        buffer[i] = endLow;
                        endLow = endHigh;
                        allowFrontSur = false;
                    }
                    else
                    {
                        // surrogate only at the end
                        // Place the pair's high half at the front now; endLow stays
                        // pending and is written on a later step.
                        buffer[end] = frontHigh;
                        buffer[i] = endHigh;
                        frontHigh = frontLow;
                        allowEndSur = false;
                    }
                }
            }
            if ((len & 0x01) == 1 && !(allowFrontSur && allowEndSur))
            {
                // only if odd length
                // The last iteration left one half of a pair pending: write it
                // into the middle slot.
                buffer[end] = allowFrontSur ? endLow : frontHigh;
            }
        }
        /// <summary>
        /// Builds an analyzer with the default stop words:
        /// <c>DefaultSetHolder.DEFAULT_SET</c> when <paramref name="matchVersion"/> is on or
        /// after <c>LUCENE_31</c>, otherwise <c>DefaultSetHolder.DEFAULT_SET_30</c>.
        /// </summary>
        /// <param name="matchVersion"> Lucene compatibility version </param>
        public GermanAnalyzer(LuceneVersion matchVersion)
#pragma warning disable 612, 618
            : this(
                  matchVersion,
                  !matchVersion.OnOrAfter(LuceneVersion.LUCENE_31)
                      ? DefaultSetHolder.DEFAULT_SET_30
                      : DefaultSetHolder.DEFAULT_SET)
#pragma warning restore 612, 618
        {
        }
        /// <summary>
        /// Validates the arguments, registers the token attributes and allocates
        /// the working buffers.
        /// </summary>
        /// <param name="version"> Lucene compatibility version; must be on or after <c>LUCENE_44</c> </param>
        /// <param name="minGram"> smallest n-gram size; must be at least 1 </param>
        /// <param name="maxGram"> largest n-gram size; must not be smaller than <paramref name="minGram"/> </param>
        /// <param name="edgesOnly"> stored as-is in the <c>edgesOnly</c> field </param>
        /// <exception cref="System.ArgumentException"> if <paramref name="version"/> is before
        ///         <c>LUCENE_44</c> or the gram sizes are invalid </exception>
        private void Init(LuceneVersion version, int minGram, int maxGram, bool edgesOnly)
        {
#pragma warning disable 612, 618
            if (!version.OnOrAfter(LuceneVersion.LUCENE_44))
#pragma warning restore 612, 618
            {
                throw new System.ArgumentException("This class only works with Lucene 4.4+. To emulate the old (broken) behavior of NGramTokenizer, use Lucene43NGramTokenizer/Lucene43EdgeNGramTokenizer");
            }
            // The version is known to be >= 4.4 past this point, so the former
            // fallback to CharacterUtils.Java4Instance could never be taken.
            charUtils = CharacterUtils.GetInstance(version);
            if (minGram < 1)
            {
                throw new System.ArgumentException("minGram must be greater than zero");
            }
            if (minGram > maxGram)
            {
                throw new System.ArgumentException("minGram must not be greater than maxGram");
            }
            termAtt = AddAttribute<ICharTermAttribute>();
            posIncAtt = AddAttribute<IPositionIncrementAttribute>();
            posLenAtt = AddAttribute<IPositionLengthAttribute>();
            offsetAtt = AddAttribute<IOffsetAttribute>();
            this.minGram = minGram;
            this.maxGram = maxGram;
            this.edgesOnly = edgesOnly;
            charBuffer = CharacterUtils.NewCharacterBuffer(2 * maxGram + 1024); // 2 * maxGram in case all code points require 2 chars and + 1024 for buffering to not keep polling the Reader
            buffer = new int[charBuffer.Buffer.Length];

            // Make the term att large enough
            termAtt.ResizeBuffer(2 * maxGram);
        }
 /// <summary>
 /// Sets up the parser's analyzer, default field and phrase-query behavior.
 /// Called by the QueryParser constructor.
 /// </summary>
 /// <param name="matchVersion">Lucene version to match; phrase queries are
 /// auto-generated only for versions before <c>LUCENE_31</c>.</param>
 /// <param name="f">the default field for query terms.</param>
 /// <param name="a">used to find terms in the query text.</param>
 public virtual void Init(LuceneVersion matchVersion, string f, Analyzer a)
 {
     Analyzer = a;
     field = f;

     #pragma warning disable 612, 618
     bool isLucene31OrLater = matchVersion.OnOrAfter(LuceneVersion.LUCENE_31);
     #pragma warning restore 612, 618

     AutoGeneratePhraseQueries = !isLucene31OrLater;
 }
 /// <summary>
 /// Picks the <see cref="CharacterUtils"/> implementation for the supplied
 /// <see cref="LuceneVersion"/>.
 /// </summary>
 /// <param name="matchVersion"> the version to pick an implementation for </param>
 /// <returns> <c>JAVA_5</c> for versions on or after <c>LUCENE_31</c>,
 ///         <c>JAVA_4</c> for anything older </returns>
 public static CharacterUtils GetInstance(LuceneVersion matchVersion)
 {
     bool isLucene31OrLater = matchVersion.OnOrAfter(LuceneVersion.LUCENE_31);
     if (!isLucene31OrLater)
     {
         return JAVA_4;
     }

     return JAVA_5;
 }
Example #46
0
	  /// <summary>
	  /// Validates the arguments, stores the gram settings and allocates the
	  /// working buffers.
	  /// </summary>
	  /// <param name="version"> Lucene compatibility version; must be on or after <c>LUCENE_44</c> </param>
	  /// <param name="minGram"> smallest n-gram size; must be at least 1 </param>
	  /// <param name="maxGram"> largest n-gram size; must not be smaller than <paramref name="minGram"/> </param>
	  /// <param name="edgesOnly"> stored as-is in the <c>edgesOnly</c> field </param>
	  /// <exception cref="System.ArgumentException"> if <paramref name="version"/> is before
	  ///         <c>LUCENE_44</c> or the gram sizes are invalid </exception>
	  private void init(LuceneVersion version, int minGram, int maxGram, bool edgesOnly)
	  {
		if (!version.OnOrAfter(LuceneVersion.LUCENE_44))
		{
		  throw new System.ArgumentException("This class only works with Lucene 4.4+. To emulate the old (broken) behavior of NGramTokenizer, use Lucene43NGramTokenizer/Lucene43EdgeNGramTokenizer");
		}
		// The version is known to be >= 4.4 past this point, so the former
		// fallback to CharacterUtils.Java4Instance could never be taken.
		charUtils = CharacterUtils.GetInstance(version);
		if (minGram < 1)
		{
		  throw new System.ArgumentException("minGram must be greater than zero");
		}
		if (minGram > maxGram)
		{
		  throw new System.ArgumentException("minGram must not be greater than maxGram");
		}
		this.minGram = minGram;
		this.maxGram = maxGram;
		this.edgesOnly = edgesOnly;
		charBuffer = CharacterUtils.NewCharacterBuffer(2 * maxGram + 1024); // 2 * maxGram in case all code points require 2 chars and + 1024 for buffering to not keep polling the Reader
		buffer = new int[charBuffer.Buffer.Length];
		// Make the term att large enough
		// NOTE(review): termAtt is not assigned in this method; presumably it is initialized elsewhere - confirm.
		termAtt.ResizeBuffer(2 * maxGram);
	  }