Example #1
0
 /// <summary>
 /// Creates an NGramTokenFilter producing n-grams whose sizes lie between
 /// <paramref name="minGram"/> and <paramref name="maxGram"/>.
 /// </summary>
 /// <param name="version"> Lucene version to enable correct position increments; it also
 ///        selects the character utilities and the attribute implementations used </param>
 /// <param name="input"> <see cref="TokenStream"/> holding the input to be tokenized; it is wrapped
 ///        in a <c>CodepointCountFilter</c> built with <paramref name="minGram"/> and <c>int.MaxValue</c> </param>
 /// <param name="minGram"> the smallest n-gram to generate; must be at least 1 </param>
 /// <param name="maxGram"> the largest n-gram to generate; must not be smaller than <paramref name="minGram"/> </param>
 /// <exception cref="System.ArgumentException"> if <paramref name="minGram"/> is below 1
 ///            or greater than <paramref name="maxGram"/> </exception>
 public NGramTokenFilter(Version version, TokenStream input, int minGram, int maxGram)
     : base(new CodepointCountFilter(version, input, minGram, int.MaxValue))
 {
     this.version = version;

     // The same version test drives both the character utilities and the attributes below.
     bool atLeastLucene44 = version.OnOrAfter(Version.LUCENE_44);
     if (atLeastLucene44)
     {
         this.charUtils = CharacterUtils.GetInstance(version);
     }
     else
     {
         this.charUtils = CharacterUtils.Java4Instance;
     }

     if (minGram < 1)
     {
         throw new System.ArgumentException("minGram must be greater than zero");
     }
     if (minGram > maxGram)
     {
         throw new System.ArgumentException("minGram must not be greater than maxGram");
     }
     this.minGram = minGram;
     this.maxGram = maxGram;

     if (atLeastLucene44)
     {
         // 4.4 and later: use the attributes registered on this stream
         posIncAtt = AddAttribute(typeof(PositionIncrementAttribute));
         posLenAtt = AddAttribute(typeof(PositionLengthAttribute));
     }
     else
     {
         // older versions: use the private helper implementations instead
         posIncAtt = new PositionIncrementAttributeAnonymousInnerClassHelper(this);
         posLenAtt = new PositionLengthAttributeAnonymousInnerClassHelper(this);
     }
 }
Example #2
0
 /// <summary>
 /// Creates a ReverseStringFilter that reverses and marks all tokens of the
 /// supplied <see cref="TokenStream"/>.
 /// <para>
 /// The reversed tokens will be prepended (marked) by the <c>marker</c> character.
 /// </para>
 /// </summary>
 /// <param name="matchVersion"> Lucene match version, stored for later use by the filter </param>
 /// <param name="in"> <see cref="TokenStream"/> to filter; handed to the base constructor </param>
 /// <param name="marker"> A character used to mark reversed tokens </param>
 public ReverseStringFilter(Version matchVersion, TokenStream @in, char marker)
     : base(@in)
 {
     this.marker = marker;
     this.matchVersion = matchVersion;
 }
Example #3
0
        /// <summary>
        /// Reverses the whole input buffer in place by delegating to the
        /// offset/length overload with offset 0 and the buffer's full length.
        /// </summary>
        /// <param name="matchVersion"> Lucene match version, passed through unchanged </param>
        /// <param name="buffer"> the input char array to reverse </param>
        public static void reverse(Version matchVersion, char[] buffer)
        {
            int wholeLength = buffer.Length;
            reverse(matchVersion, buffer, 0, wholeLength);
        }
Example #4
0
        /// <summary>
        /// Reverses, in place, the <paramref name="len"/> chars of <paramref name="buffer"/> that begin
        /// at <paramref name="start"/>. For match versions on or after <c>LUCENE_31</c>, two adjacent
        /// chars that form a surrogate pair are moved together so the pair keeps its high/low order;
        /// earlier versions are handed to <c>reverseUnicode3</c> instead.
        /// </summary>
        /// <param name="matchVersion"> Lucene match version selecting which reversal routine runs </param>
        /// <param name="buffer"> the input char array to reverse </param>
        /// <param name="start"> the offset from where to reverse the buffer </param>
        /// <param name="len"> the length in the buffer up to where the
        ///        buffer should be reversed </param>
        public static void reverse(Version matchVersion, char[] buffer, int start, int len)
        {
            // Versions before 3.1 use the separate reverseUnicode3 routine.
            if (!matchVersion.OnOrAfter(Version.LUCENE_31))
            {
                reverseUnicode3(buffer, start, len);
                return;
            }
            /* modified version of Apache Harmony AbstractStringBuilder reverse0() */
            // Zero or one char: nothing to swap.
            if (len < 2)
            {
                return;
            }
            // 'end' walks backwards from the last char of the range while 'i' walks forwards.
            int  end = (start + len) - 1;
            // frontHigh / endLow hold the outermost chars that have not been written back yet.
            char frontHigh = buffer[start];
            char endLow = buffer[end];
            // A flag is cleared for one iteration after a pair was found on that side only,
            // so the char carried over from that pair is not tested as the start of a new pair.
            bool allowFrontSur = true, allowEndSur = true;
            // Index at which the forward walk stops (half of the range, rounded down).
            int mid = start + (len >> 1);

            for (int i = start; i < mid; ++i, --end)
            {
                // Neighbours of the current front and end chars, used for the pair tests.
                char frontLow = buffer[i + 1];
                char endHigh = buffer[end - 1];
                bool surAtFront = allowFrontSur && char.IsSurrogatePair(frontHigh, frontLow);
                if (surAtFront && (len < 3))
                {
                    // len is 2 here (len < 2 returned above) and both chars form a single
                    // surrogate pair, so the range is already in its final order
                    return;
                }
                bool surAtEnd = allowEndSur && char.IsSurrogatePair(endHigh, endLow);
                // Re-arm both flags; a one-sided case below clears one of them again.
                allowFrontSur = allowEndSur = true;
                if (surAtFront == surAtEnd)
                {
                    if (surAtFront)
                    {
                        // both surrogates: swap the two pairs as units, which consumes
                        // two positions on each side (hence the extra --end / ++i)
                        buffer[end]   = frontLow;
                        buffer[--end] = frontHigh;
                        buffer[i]     = endHigh;
                        buffer[++i]   = endLow;
                        frontHigh     = buffer[i + 1];
                        endLow        = buffer[end - 1];
                    }
                    else
                    {
                        // neither surrogates: plain swap, then advance both carried chars
                        buffer[end] = frontHigh;
                        buffer[i]   = endLow;
                        frontHigh   = frontLow;
                        endLow      = endHigh;
                    }
                }
                else
                {
                    if (surAtFront)
                    {
                        // surrogate only at the front: the pair's low half goes to the end
                        // slot now; its high half stays in frontHigh and is written by a later step
                        buffer[end]   = frontLow;
                        buffer[i]     = endLow;
                        endLow        = endHigh;
                        allowFrontSur = false;
                    }
                    else
                    {
                        // surrogate only at the end: the pair's high half goes to the front
                        // slot now; its low half stays in endLow and is written by a later step
                        buffer[end] = frontHigh;
                        buffer[i]   = endHigh;
                        frontHigh   = frontLow;
                        allowEndSur = false;
                    }
                }
            }
            if ((len & 0x01) == 1 && !(allowFrontSur && allowEndSur))
            {
                // only if odd length: a half of a pair is still carried over from the last
                // iteration; store it in the middle slot (endLow when the end side is pending,
                // frontHigh when the front side is pending)
                buffer[end] = allowFrontSur ? endLow : frontHigh;
            }
        }
Example #5
0
 /// <summary>
 /// Builds a fully initialized TokenFilterFactory for the given name and key-value arguments.
 /// A <see cref="ClasspathResourceLoader"/> created from this instance's runtime type is used for
 /// loading resources, so any required ones should be on the test classpath.
 /// </summary>
 /// <param name="name"> factory name, forwarded unchanged </param>
 /// <param name="version"> match version, forwarded unchanged </param>
 /// <param name="keysAndValues"> factory arguments, forwarded unchanged </param>
 /// <returns> the result of the overload that takes an explicit resource loader </returns>
 protected internal virtual TokenFilterFactory TokenFilterFactory(string name, Version version, params string[] keysAndValues)
 {
     ClasspathResourceLoader typeLoader = new ClasspathResourceLoader(this.GetType());
     return TokenFilterFactory(name, version, typeLoader, keysAndValues);
 }
 /// <summary>
 /// Returns a fully initialized CharFilterFactory with the specified name, version, resource loader,
 /// and key-value arguments.
 /// </summary>
 /// <param name="name"> name passed to <c>CharFilterFactory.LookupClass</c> to find the factory class </param>
 /// <param name="matchVersion"> match version forwarded to <c>AnalysisFactory</c> </param>
 /// <param name="loader"> resource loader forwarded to <c>AnalysisFactory</c> </param>
 /// <param name="keysAndValues"> factory arguments forwarded to <c>AnalysisFactory</c> </param>
 /// <returns> the object produced by <c>AnalysisFactory</c>, cast to CharFilterFactory </returns>
 protected internal virtual CharFilterFactory CharFilterFactory(string name, Version matchVersion, IResourceLoader loader, params string[] keysAndValues)
 {
     // The type is fully qualified because this method carries the same simple name.
     return (CharFilterFactory)AnalysisFactory(Lucene.Net.Analysis.Util.CharFilterFactory.LookupClass(name), matchVersion, loader, keysAndValues);
 }
 /// <summary>
 /// Produces a reversed copy of the given string. The string's chars are copied into
 /// an array, reversed in place by the offset/length overload, and turned back into a string.
 /// </summary>
 /// <param name="matchVersion"> Lucene match version, passed through unchanged </param>
 /// <param name="input"> the string to reverse </param>
 /// <returns> the given input string in reversed order </returns>
 public static string reverse(Version matchVersion, string input)
 {
     char[] scratch = input.ToCharArray();
     int count = scratch.Length;
     reverse(matchVersion, scratch, 0, count);
     return new string(scratch);
 }
Example #8
0
	  /// <summary>
	  /// Reads lines from a Reader and adds every non-comment line as an entry to a new CharArraySet
	  /// (omitting leading and trailing whitespace). Every line of the Reader should contain only
	  /// one word. The words need to be in lowercase if you make use of an
	  /// Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
	  /// </summary>
	  /// <param name="reader"> Reader containing the wordlist </param>
	  /// <param name="comment"> The string representing a comment. </param>
	  /// <param name="matchVersion"> the Lucene <see cref="Version"/> used to create the set </param>
	  /// <returns> A CharArraySet with the reader's words </returns>
	  public static CharArraySet GetWordSet(TextReader reader, string comment, Version matchVersion)
	  {
		// Fresh set created with ignoreCase = false; the overload taking a set fills it.
		CharArraySet words = new CharArraySet(matchVersion, INITIAL_CAPACITY, false);
		return GetWordSet(reader, comment, words);
	  }
 /// <summary>
 /// Creates a Lucene43EdgeNGramTokenizer whose side is given as a label. The label is
 /// converted with <c>GetSide</c> and construction is delegated to the overload taking a <c>Side</c>.
 /// </summary>
 /// <param name="version"> Lucene match version, forwarded unchanged </param>
 /// <param name="input"> reader holding the input, forwarded unchanged </param>
 /// <param name="sideLabel"> label of the side, resolved through <c>GetSide</c> </param>
 /// <param name="minGram"> minimum gram size, forwarded unchanged </param>
 /// <param name="maxGram"> maximum gram size, forwarded unchanged </param>
 public Lucene43EdgeNGramTokenizer(Version version, Reader input, string sideLabel, int minGram, int maxGram)
     : this(version, input, GetSide(sideLabel), minGram, maxGram)
 {
 }
 /// <summary>
 /// Creates a ReverseStringFilter that reverses and marks all tokens of the
 /// supplied <see cref="TokenStream"/>.
 /// <para>
 /// The reversed tokens will be prepended (marked) by the <c>marker</c> character.
 /// </para>
 /// </summary>
 /// <param name="matchVersion"> Lucene match version, stored for later use by the filter </param>
 /// <param name="in"> <see cref="TokenStream"/> to filter; handed to the base constructor </param>
 /// <param name="marker"> A character used to mark reversed tokens </param>
 public ReverseStringFilter(Version matchVersion, TokenStream @in, char marker) : base(@in)
 {
     this.marker = marker;
     this.matchVersion = matchVersion;
 }
 /// <summary>
 /// Creates a Lucene43EdgeNGramTokenizer using the given attribute factory. The factory and
 /// the reader go to the base constructor; the remaining settings are handled by <c>init</c>.
 /// </summary>
 /// <param name="version"> Lucene match version, passed to <c>init</c> </param>
 /// <param name="factory"> attribute factory handed to the base constructor </param>
 /// <param name="input"> reader holding the input, handed to the base constructor </param>
 /// <param name="side"> side setting, passed to <c>init</c> </param>
 /// <param name="minGram"> minimum gram size, passed to <c>init</c> </param>
 /// <param name="maxGram"> maximum gram size, passed to <c>init</c> </param>
 public Lucene43EdgeNGramTokenizer(Version version, AttributeFactory factory, Reader input, Side side, int minGram, int maxGram)
     : base(factory, input)
 {
     init(version, side, minGram, maxGram);
 }
 /// <summary>
 /// Creates a Lucene43EdgeNGramTokenizer over the given reader. The reader goes to the
 /// base constructor; the remaining settings are handled by <c>init</c>.
 /// </summary>
 /// <param name="version"> Lucene match version, passed to <c>init</c> </param>
 /// <param name="input"> reader holding the input, handed to the base constructor </param>
 /// <param name="side"> side setting, passed to <c>init</c> </param>
 /// <param name="minGram"> minimum gram size, passed to <c>init</c> </param>
 /// <param name="maxGram"> maximum gram size, passed to <c>init</c> </param>
 public Lucene43EdgeNGramTokenizer(Version version, Reader input, Side side, int minGram, int maxGram)
     : base(input)
 {
     init(version, side, minGram, maxGram);
 }
        /// <summary>
        /// Validates the constructor arguments and stores them in the instance fields.
        /// For match versions before <c>LUCENE_44</c> the stored maximum gram size is capped at 1024;
        /// from <c>LUCENE_44</c> on, <c>Side.BACK</c> is rejected.
        /// </summary>
        /// <param name="version"> Lucene match version; must not be null </param>
        /// <param name="side"> side setting; must not be null </param>
        /// <param name="minGram"> minimum gram size; must be at least 1 </param>
        /// <param name="maxGram"> maximum gram size; must not be smaller than <paramref name="minGram"/> </param>
        /// <exception cref="System.ArgumentException"> if any of the checks above fails </exception>
        private void init(Version version, Side side, int minGram, int maxGram)
        {
            // The checks run in this fixed order; the first failing one decides the message.
            if (version == null)
            {
                throw new System.ArgumentException("version must not be null");
            }
            if (side == null)
            {
                throw new System.ArgumentException("sideLabel must be either front or back");
            }
            if (minGram < 1)
            {
                throw new System.ArgumentException("minGram must be greater than zero");
            }
            if (minGram > maxGram)
            {
                throw new System.ArgumentException("minGram must not be greater than maxGram");
            }

            bool atLeastLucene44 = version.OnOrAfter(Version.LUCENE_44);
            if (atLeastLucene44 && side == Side.BACK)
            {
                throw new System.ArgumentException("Side.BACK is not supported anymore as of Lucene 4.4");
            }

            this.minGram = minGram;
            // Only pre-4.4 versions cap the maximum gram size.
            this.maxGram = atLeastLucene44 ? maxGram : Math.Min(maxGram, 1024);
            this.side = side;
        }
 /// <summary>
 /// Creates a CharArraySet from the words supplied by a reader. The reader is always
 /// passed to <c>IOUtils.Close</c> afterwards, whether or not loading succeeded.
 /// </summary>
 /// <param name="stopwords">
 ///          the stopwords reader to load
 /// </param>
 /// <param name="matchVersion">
 ///          the Lucene version for cross version compatibility </param>
 /// <returns> a CharArraySet containing the distinct stopwords from the given
 ///         reader </returns>
 /// <exception cref="IOException">
 ///           if loading the stopwords throws an <see cref="IOException"/> </exception>
 protected internal static CharArraySet loadStopwordSet(Reader stopwords, Version matchVersion)
 {
     CharArraySet loaded;
     try
     {
         loaded = WordlistLoader.GetWordSet(stopwords, matchVersion);
     }
     finally
     {
         IOUtils.Close(stopwords);
     }
     return loaded;
 }
 /// <summary>
 /// Reverses the whole input buffer in place by delegating to the
 /// offset/length overload with offset 0 and the buffer's full length.
 /// </summary>
 /// <param name="matchVersion"> Lucene match version, passed through unchanged </param>
 /// <param name="buffer"> the input char array to reverse </param>
 public static void reverse(Version matchVersion, char[] buffer)
 {
     int wholeLength = buffer.Length;
     reverse(matchVersion, buffer, 0, wholeLength);
 }
 /// <summary>
 /// Creates a new Analyzer with an empty stopword set by delegating to the
 /// two-argument constructor with a <c>null</c> stopword set.
 /// </summary>
 /// <param name="version">
 ///          the Lucene version for cross version compatibility </param>
 protected internal StopwordAnalyzerBase(Version version)
     : this(version, null)
 {
 }
 /// <summary>
 /// Partially reverses the given input buffer in place, covering the range that starts
 /// at offset 0 and spans the given length.
 /// </summary>
 /// <param name="matchVersion"> Lucene match version, passed through unchanged </param>
 /// <param name="buffer"> the input char array to reverse </param>
 /// <param name="len"> the length in the buffer up to where the
 ///        buffer should be reversed </param>
 public static void reverse(Version matchVersion, char[] buffer, int len)
 {
     const int fromStart = 0;
     reverse(matchVersion, buffer, fromStart, len);
 }
 /// <summary>
 /// Creates a new StandardTokenizer with a given attribute factory.
 /// </summary>
 /// <param name="matchVersion"> Lucene match version, passed to <c>Init</c> </param>
 /// <param name="factory"> attribute factory handed to the base constructor </param>
 /// <param name="input"> the input reader, handed to the base constructor </param>
 public StandardTokenizer(Version matchVersion, AttributeFactory factory, Reader input)
     : base(factory, input)
 {
     Init(matchVersion);
 }
 /// <summary>
 /// Reverses, in place, the <paramref name="len"/> chars of <paramref name="buffer"/> that begin
 /// at <paramref name="start"/>. For match versions on or after <c>LUCENE_31</c>, two adjacent
 /// chars that form a surrogate pair are moved together so the pair keeps its high/low order;
 /// earlier versions are handed to <c>reverseUnicode3</c> instead.
 /// </summary>
 /// <param name="matchVersion"> Lucene match version selecting which reversal routine runs </param>
 /// <param name="buffer"> the input char array to reverse </param>
 /// <param name="start"> the offset from where to reverse the buffer </param>
 /// <param name="len"> the length in the buffer up to where the
 ///        buffer should be reversed </param>
 public static void reverse(Version matchVersion, char[] buffer, int start, int len)
 {
     // Versions before 3.1 use the separate reverseUnicode3 routine.
     if (!matchVersion.OnOrAfter(Version.LUCENE_31))
     {
       reverseUnicode3(buffer, start, len);
       return;
     }
     /* modified version of Apache Harmony AbstractStringBuilder reverse0() */
     // Zero or one char: nothing to swap.
     if (len < 2)
     {
       return;
     }
     // 'end' walks backwards from the last char of the range while 'i' walks forwards.
     int end = (start + len) - 1;
     // frontHigh / endLow hold the outermost chars that have not been written back yet.
     char frontHigh = buffer[start];
     char endLow = buffer[end];
     // A flag is cleared for one iteration after a pair was found on that side only,
     // so the char carried over from that pair is not tested as the start of a new pair.
     bool allowFrontSur = true, allowEndSur = true;
     // Index at which the forward walk stops (half of the range, rounded down).
     int mid = start + (len >> 1);
     for (int i = start; i < mid; ++i, --end)
     {
       // Neighbours of the current front and end chars, used for the pair tests.
       char frontLow = buffer[i + 1];
       char endHigh = buffer[end - 1];
       bool surAtFront = allowFrontSur && char.IsSurrogatePair(frontHigh, frontLow);
       if (surAtFront && (len < 3))
       {
         // len is 2 here (len < 2 returned above) and both chars form a single
         // surrogate pair, so the range is already in its final order
     return;
       }
       bool surAtEnd = allowEndSur && char.IsSurrogatePair(endHigh, endLow);
       // Re-arm both flags; a one-sided case below clears one of them again.
       allowFrontSur = allowEndSur = true;
       if (surAtFront == surAtEnd)
       {
     if (surAtFront)
     {
       // both surrogates: swap the two pairs as units, which consumes
       // two positions on each side (hence the extra --end / ++i)
       buffer[end] = frontLow;
       buffer[--end] = frontHigh;
       buffer[i] = endHigh;
       buffer[++i] = endLow;
       frontHigh = buffer[i + 1];
       endLow = buffer[end - 1];
     }
     else
     {
       // neither surrogates: plain swap, then advance both carried chars
       buffer[end] = frontHigh;
       buffer[i] = endLow;
       frontHigh = frontLow;
       endLow = endHigh;
     }
       }
       else
       {
     if (surAtFront)
     {
       // surrogate only at the front: the pair's low half goes to the end
       // slot now; its high half stays in frontHigh and is written by a later step
       buffer[end] = frontLow;
       buffer[i] = endLow;
       endLow = endHigh;
       allowFrontSur = false;
     }
     else
     {
       // surrogate only at the end: the pair's high half goes to the front
       // slot now; its low half stays in endLow and is written by a later step
       buffer[end] = frontHigh;
       buffer[i] = endHigh;
       frontHigh = frontLow;
       allowEndSur = false;
     }
       }
     }
     if ((len & 0x01) == 1 && !(allowFrontSur && allowEndSur))
     {
       // only if odd length: a half of a pair is still carried over from the last
       // iteration; store it in the middle slot (endLow when the end side is pending,
       // frontHigh when the front side is pending)
       buffer[end] = allowFrontSur ? endLow : frontHigh;
     }
 }
Example #20
0
 /// <summary>
 /// Create set with enough capacity to hold startSize terms
 /// </summary>
 /// <param name="matchVersion">
 ///          compatibility match version see <a href="#version">Version
 ///          note</a> above for details. </param>
 /// <param name="startSize">
 ///          the initial capacity </param>
 /// <param name="ignoreCase">
 ///          <code>false</code> if and only if the set should be case sensitive
 ///          otherwise <code>true</code>. </param>
 public CharArraySet(Lucene.Net.Util.LuceneVersion matchVersion, int startSize, bool ignoreCase) : this(new CharArrayMap <>(matchVersion, startSize, ignoreCase))
 /// <summary>
 /// Creates a ReverseStringFilter that reverses all tokens of the
 /// supplied <see cref="TokenStream"/>.
 /// <para>
 /// The reversed tokens will not be marked: construction is delegated to the
 /// three-argument constructor with <c>NOMARKER</c> as the marker.
 /// </para>
 /// </summary>
 /// <param name="matchVersion"> Lucene match version, forwarded unchanged </param>
 /// <param name="in"> <see cref="TokenStream"/> to filter </param>
 public ReverseStringFilter(Version matchVersion, TokenStream @in)
     : this(matchVersion, @in, NOMARKER)
 {
 }
 /// <summary>
 /// Creates an edge n-gram tokenizer that can generate n-grams in the sizes of the given range.
 /// Construction is delegated to the overload taking a side, with <c>Side.FRONT</c>.
 /// </summary>
 /// <param name="version"> the Lucene match version </param>
 /// <param name="factory"> attribute factory to use </param>
 /// <param name="input"> reader holding the input to be tokenized </param>
 /// <param name="minGram"> the smallest n-gram to generate </param>
 /// <param name="maxGram"> the largest n-gram to generate </param>
 public Lucene43EdgeNGramTokenizer(Version version, AttributeFactory factory, Reader input, int minGram, int maxGram)
     : this(version, factory, input, Side.FRONT, minGram, maxGram)
 {
 }
Example #23
0
 /// <summary>
 /// Creates a new instance initialized with the given stopword set. A <c>null</c> set is
 /// replaced by <c>CharArraySet.EMPTY_SET</c>; any other set is copied and the copy is
 /// wrapped as unmodifiable before being stored.
 /// </summary>
 /// <param name="version">
 ///          the Lucene version for cross version compatibility </param>
 /// <param name="stopwords">
 ///          the analyzer's stopword set, may be <c>null</c> </param>
 protected internal StopwordAnalyzerBase(Version version, CharArraySet stopwords)
 {
     matchVersion = version;

     // analyzers should use char array set for stopwords!
     if (stopwords == null)
     {
         this.stopwords = CharArraySet.EMPTY_SET;
     }
     else
     {
         this.stopwords = CharArraySet.UnmodifiableSet(CharArraySet.Copy(version, stopwords));
     }
 }
Example #24
0
        /// <summary>
        /// Partially reverses the given input buffer in place, covering the range that starts
        /// at offset 0 and spans the given length.
        /// </summary>
        /// <param name="matchVersion"> Lucene match version, passed through unchanged </param>
        /// <param name="buffer"> the input char array to reverse </param>
        /// <param name="len"> the length in the buffer up to where the
        ///        buffer should be reversed </param>
        public static void reverse(Version matchVersion, char[] buffer, int len)
        {
            const int fromStart = 0;
            reverse(matchVersion, buffer, fromStart, len);
        }
Example #25
0
 /// <summary>
 /// Creates a new Analyzer with an empty stopword set by delegating to the
 /// two-argument constructor with a <c>null</c> stopword set.
 /// </summary>
 /// <param name="version">
 ///          the Lucene version for cross version compatibility </param>
 protected internal StopwordAnalyzerBase(Version version) : this(version, null)
 {
 }
Example #26
0
 /// <summary>
 /// Creates a ReverseStringFilter that reverses all tokens of the
 /// supplied <see cref="TokenStream"/>.
 /// <para>
 /// The reversed tokens will not be marked: construction is delegated to the
 /// three-argument constructor with <c>NOMARKER</c> as the marker.
 /// </para>
 /// </summary>
 /// <param name="matchVersion"> Lucene match version, forwarded unchanged </param>
 /// <param name="in"> <see cref="TokenStream"/> to filter </param>
 public ReverseStringFilter(Version matchVersion, TokenStream @in) : this(matchVersion, @in, NOMARKER)
 {
 }
Example #27
0
 /// <summary>
 /// Creates a new instance of the <see cref="StandardTokenizer"/>.  Attaches
 /// the <c>input</c> to the newly created JFlex scanner.
 /// <para>
 /// See http://issues.apache.org/jira/browse/LUCENE-1068
 /// </para>
 /// </summary>
 /// <param name="matchVersion"> Lucene match version, passed to <c>Init</c> </param>
 /// <param name="input"> The input reader, handed to the base constructor </param>
 public StandardTokenizer(Version matchVersion, Reader input)
     : base(input)
 {
     Init(matchVersion);
 }
Example #28
0
 /// <summary>
 /// Creates NGramTokenFilter with default min and max n-grams, i.e. delegates to the
 /// four-argument constructor with <c>DEFAULT_MIN_NGRAM_SIZE</c> and <c>DEFAULT_MAX_NGRAM_SIZE</c>.
 /// </summary>
 /// <param name="version"> Lucene version to enable correct position increments. </param>
 /// <param name="input"> <see cref="TokenStream"/> holding the input to be tokenized </param>
 public NGramTokenFilter(Version version, TokenStream input)
     : this(version, input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE)
 {
 }
Example #29
0
 /// <summary>
 /// Creates a new StandardTokenizer with a given attribute factory.
 /// </summary>
 /// <param name="matchVersion"> Lucene match version, passed to <c>Init</c> </param>
 /// <param name="factory"> attribute factory handed to the base constructor </param>
 /// <param name="input"> the input reader, handed to the base constructor </param>
 public StandardTokenizer(Version matchVersion, AttributeFactory factory, Reader input)
     : base(factory, input)
 {
     Init(matchVersion);
 }
 /// <summary>
 /// Creates a CharArraySet from a file. The file is opened through a UTF-8 decoding reader,
 /// and that reader is always passed to <c>IOUtils.Close</c> afterwards, even when opening
 /// or loading failed.
 /// </summary>
 /// <param name="stopwords">
 ///          the stopwords file to load
 /// </param>
 /// <param name="matchVersion">
 ///          the Lucene version for cross version compatibility </param>
 /// <returns> a CharArraySet containing the distinct stopwords from the given
 ///         file </returns>
 /// <exception cref="IOException">
 ///           if loading the stopwords throws an <see cref="IOException"/> </exception>
 protected internal static CharArraySet LoadStopwordSet(File stopwords, Version matchVersion)
 {
     // Stays null when opening the file fails; IOUtils.Close then receives null.
     Reader source = null;
     CharArraySet loaded;
     try
     {
         source = IOUtils.GetDecodingReader(stopwords, StandardCharsets.UTF_8);
         loaded = WordlistLoader.GetWordSet(source, matchVersion);
     }
     finally
     {
         IOUtils.Close(source);
     }
     return loaded;
 }
Example #31
0
        /// <summary>
        /// Creates a new instance of the <see cref="StandardTokenizer"/>.  Attaches
        /// the <c>input</c> to the newly created JFlex scanner.
        /// <para>
        /// See http://issues.apache.org/jira/browse/LUCENE-1068
        /// </para>
        /// </summary>
        /// <param name="matchVersion"> Lucene match version, passed to <c>Init</c> </param>
        /// <param name="input"> The input reader, handed to the base constructor </param>
        public StandardTokenizer(Version matchVersion, Reader input)
            : base(input)
        {
            // Register the term, position-increment, offset and type attributes before Init runs.
            termAtt = AddAttribute<ICharTermAttribute>();
            posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
            offsetAtt = AddAttribute<IOffsetAttribute>();
            typeAtt = AddAttribute<ITypeAttribute>();

            Init(matchVersion);
        }
 /// <summary>
 /// Creates a new instance initialized with the given stopword set. A <c>null</c> set is
 /// replaced by <c>CharArraySet.EMPTY_SET</c>; any other set is copied and the copy is
 /// wrapped as unmodifiable before being stored.
 /// </summary>
 /// <param name="version">
 ///          the Lucene version for cross version compatibility </param>
 /// <param name="stopwords">
 ///          the analyzer's stopword set, may be <c>null</c> </param>
 protected internal StopwordAnalyzerBase(Version version, CharArraySet stopwords)
 {
     matchVersion = version;

     // analyzers should use char array set for stopwords!
     if (stopwords == null)
     {
         this.stopwords = CharArraySet.EMPTY_SET;
     }
     else
     {
         this.stopwords = CharArraySet.UnmodifiableSet(CharArraySet.Copy(version, stopwords));
     }
 }
Example #33
0
 /// <summary>
 /// Chooses the scanner implementation matching <paramref name="matchVersion"/> and binds it
 /// to the current input. The version thresholds are tested from newest to oldest and the
 /// first match wins; versions before <c>LUCENE_31</c> get the classic scanner.
 /// </summary>
 /// <param name="matchVersion"> Lucene match version selecting the scanner </param>
 private void Init(Version matchVersion)
 {
     if (matchVersion.OnOrAfter(Version.LUCENE_47))
     {
         scanner = new StandardTokenizerImpl(input);
         return;
     }
     if (matchVersion.OnOrAfter(Version.LUCENE_40))
     {
         scanner = new StandardTokenizerImpl40(input);
         return;
     }
     if (matchVersion.OnOrAfter(Version.LUCENE_34))
     {
         scanner = new StandardTokenizerImpl34(input);
         return;
     }
     if (matchVersion.OnOrAfter(Version.LUCENE_31))
     {
         scanner = new StandardTokenizerImpl31(input);
         return;
     }

     // Anything older than 3.1
     scanner = new ClassicTokenizerImpl(input);
 }
Example #34
0
	  /// <summary>
	  /// Reads stopwords from a stopword list in Snowball format into a new set.
	  /// <para>
	  /// The snowball format is the following:
	  /// <ul>
	  /// <li>Lines may contain multiple words separated by whitespace.</li>
	  /// <li>The comment character is the vertical line (&#124;).</li>
	  /// <li>Lines may contain trailing comments.</li>
	  /// </ul>
	  /// </para>
	  /// </summary>
	  /// <param name="reader"> Reader containing a Snowball stopword list </param>
	  /// <param name="matchVersion"> the Lucene <see cref="Version"/> used to create the set </param>
	  /// <returns> A <see cref="CharArraySet"/> with the reader's words </returns>
	  public static CharArraySet getSnowballWordSet(Reader reader, Version matchVersion)
	  {
		// Fresh set created with ignoreCase = false; the overload taking a set fills it.
		CharArraySet words = new CharArraySet(matchVersion, INITIAL_CAPACITY, false);
		return getSnowballWordSet(reader, words);
	  }
Example #35
0
 /// <summary>
 /// Creates NGramTokenFilter with default min and max n-grams, i.e. delegates to the
 /// four-argument constructor with <c>DEFAULT_MIN_NGRAM_SIZE</c> and <c>DEFAULT_MAX_NGRAM_SIZE</c>.
 /// </summary>
 /// <param name="version"> Lucene version to enable correct position increments. </param>
 /// <param name="input"> <see cref="TokenStream"/> holding the input to be tokenized </param>
 public NGramTokenFilter(Version version, TokenStream input)
     : this(version, input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE)
 {
 }
 /// <summary>
 /// Creates a new instance of the <see cref="StandardTokenizer"/>.  Attaches
 /// the <c>input</c> to the newly created JFlex scanner.
 /// <para>
 /// See http://issues.apache.org/jira/browse/LUCENE-1068
 /// </para>
 /// </summary>
 /// <param name="matchVersion"> Lucene match version, passed to <c>Init</c> </param>
 /// <param name="input"> The input reader, handed to the base constructor </param>
 public StandardTokenizer(Version matchVersion, Reader input)
     : base(input)
 {
     Init(matchVersion);
 }
 /// <summary>
 /// Builds a fully initialized TokenFilterFactory for the given name and key-value arguments,
 /// using the resource loader returned by <c>GetCurrentTypeResourceLoader()</c>.
 /// </summary>
 /// <param name="name"> factory name, forwarded unchanged </param>
 /// <param name="version"> match version, forwarded unchanged </param>
 /// <param name="keysAndValues"> factory arguments, forwarded unchanged </param>
 /// <returns> the result of the overload that takes an explicit resource loader </returns>
 protected internal virtual TokenFilterFactory TokenFilterFactory(string name, Version version, params string[] keysAndValues)
 {
     return TokenFilterFactory(
         name,
         version,
         GetCurrentTypeResourceLoader(),
         keysAndValues);
 }
        /// <summary>
        /// Chooses the scanner implementation matching <paramref name="matchVersion"/>, binds it to
        /// the current input, and then registers the term, position-increment, offset and type
        /// attributes. The version thresholds are tested from newest to oldest; versions before
        /// <c>LUCENE_31</c> get the classic scanner.
        /// </summary>
        /// <param name="matchVersion"> Lucene match version selecting the scanner </param>
        private void Init(Version matchVersion)
        {
            // CS0612/CS0618 are the compiler's obsolete-member warnings; presumably some of the
            // version constants or legacy scanner types used below are marked obsolete.
#pragma warning disable 612, 618
            if (matchVersion.OnOrAfter(Version.LUCENE_47))
            {
                this.scanner = new StandardTokenizerImpl(input);
            }
            else if (matchVersion.OnOrAfter(Version.LUCENE_40))
            {
                this.scanner = new StandardTokenizerImpl40(input);
            }
            else if (matchVersion.OnOrAfter(Version.LUCENE_34))
            {
                this.scanner = new StandardTokenizerImpl34(input);
            }
            else if (matchVersion.OnOrAfter(Version.LUCENE_31))
            {
                this.scanner = new StandardTokenizerImpl31(input);
            }
            // The suppression ends here, so the final fallback branch is compiled with warnings on.
#pragma warning restore 612, 618
            else
            {
                this.scanner = new ClassicTokenizerImpl(input);
            }

            // Attribute registration happens after the scanner has been chosen.
            termAtt = AddAttribute<ICharTermAttribute>();
            posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
            offsetAtt = AddAttribute<IOffsetAttribute>();
            typeAtt = AddAttribute<ITypeAttribute>();
        }
Example #39
0
        /// <summary>
        /// Reads stopwords from a stopword list in Snowball format into a new set.
        /// <para>
        /// The snowball format is the following:
        /// <ul>
        /// <li>Lines may contain multiple words separated by whitespace.</li>
        /// <li>The comment character is the vertical line (&#124;).</li>
        /// <li>Lines may contain trailing comments.</li>
        /// </ul>
        /// </para>
        /// </summary>
        /// <param name="reader"> Reader containing a Snowball stopword list </param>
        /// <param name="matchVersion"> the Lucene <see cref="Version"/> used to create the set </param>
        /// <returns> A <see cref="CharArraySet"/> with the reader's words </returns>
        public static CharArraySet getSnowballWordSet(Reader reader, Version matchVersion)
        {
            // Fresh set created with ignoreCase = false; the overload taking a set fills it.
            CharArraySet words = new CharArraySet(matchVersion, INITIAL_CAPACITY, false);
            return getSnowballWordSet(reader, words);
        }
Example #40
0
 /// <summary>
 /// Builds a fully initialized CharFilterFactory from a name, match version, resource loader
 /// and key-value arguments. The factory class is looked up by name and instantiated through
 /// <c>AnalysisFactory</c>; the result is cast to CharFilterFactory.
 /// </summary>
 /// <param name="name"> name passed to <c>CharFilterFactory.LookupClass</c> </param>
 /// <param name="matchVersion"> match version forwarded to <c>AnalysisFactory</c> </param>
 /// <param name="loader"> resource loader forwarded to <c>AnalysisFactory</c> </param>
 /// <param name="keysAndValues"> factory arguments forwarded to <c>AnalysisFactory</c> </param>
 protected internal virtual CharFilterFactory CharFilterFactory(string name, Version matchVersion, IResourceLoader loader, params string[] keysAndValues)
 {
     // The type is fully qualified because this method carries the same simple name.
     return (CharFilterFactory)AnalysisFactory(
         Lucene.Net.Analysis.Util.CharFilterFactory.LookupClass(name),
         matchVersion,
         loader,
         keysAndValues);
 }
Example #41
0
 /// <summary>
 /// Reads lines from a Reader and adds every non-comment line as an entry to a new CharArraySet
 /// (omitting leading and trailing whitespace). Every line of the Reader should contain only
 /// one word. The words need to be in lowercase if you make use of an
 /// Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
 /// </summary>
 /// <param name="reader"> Reader containing the wordlist </param>
 /// <param name="comment"> The string representing a comment. </param>
 /// <param name="matchVersion"> the Lucene <see cref="Version"/> used to create the set </param>
 /// <returns> A CharArraySet with the reader's words </returns>
 public static CharArraySet GetWordSet(TextReader reader, string comment, Version matchVersion)
 {
     // Fresh set created with ignoreCase = false; the overload taking a set fills it.
     CharArraySet words = new CharArraySet(matchVersion, INITIAL_CAPACITY, false);
     return GetWordSet(reader, comment, words);
 }
        /// <summary>
        /// Instantiates the analysis factory type <paramref name="clazz"/> through reflection, passing
        /// it a string dictionary built from <paramref name="keysAndValues"/> plus a
        /// <c>luceneMatchVersion</c> entry. If the created factory implements
        /// <c>IResourceLoaderAware</c>, it is informed of <paramref name="loader"/> before being returned.
        /// </summary>
        /// <param name="clazz"> factory type to instantiate; its constructor receives the argument dictionary </param>
        /// <param name="matchVersion"> stored, via <c>ToString()</c>, under the key <c>luceneMatchVersion</c> </param>
        /// <param name="loader"> resource loader handed to resource-loader-aware factories </param>
        /// <param name="keysAndValues"> alternating keys and values; the count must be even </param>
        /// <returns> the newly created factory </returns>
        /// <exception cref="System.ArgumentException"> if <paramref name="keysAndValues"/> has an odd
        ///            number of entries, or if the factory constructor itself threw one </exception>
        private AbstractAnalysisFactory AnalysisFactory(Type clazz, Version matchVersion, IResourceLoader loader, params string[] keysAndValues)
        {
            if (keysAndValues.Length % 2 == 1)
            {
                throw new System.ArgumentException("invalid keysAndValues map");
            }

            IDictionary<string, string> args = new Dictionary<string, string>();
            for (int i = 0; i < keysAndValues.Length; i += 2)
            {
                if (args.ContainsKey(keysAndValues[i]))
                {
                    fail("duplicate values for key: " + keysAndValues[i]);
                }
                args[keysAndValues[i]] = keysAndValues[i + 1];
            }

            // The match version is supplied by this helper, so callers must not pass it themselves.
            if (args.ContainsKey("luceneMatchVersion"))
            {
                fail("duplicate values for key: luceneMatchVersion");
            }
            args["luceneMatchVersion"] = matchVersion.ToString();

            AbstractAnalysisFactory factory = null;
            try
            {
                // Non-public constructors are included in the search.
                factory = (AbstractAnalysisFactory)Activator.CreateInstance(clazz,
                    BindingFlags.Public | BindingFlags.NonPublic | BindingFlags.Instance,
                    null, new object[] { args }, CultureInfo.InvariantCulture);
            }
            catch (TargetInvocationException e)
            {
                // to simplify tests that check for illegal parameters: surface the
                // constructor's own ArgumentException instead of the reflection wrapper
                if (e.InnerException is System.ArgumentException)
                {
                    throw (System.ArgumentException)e.InnerException;
                }

                // Plain rethrow keeps the original stack trace ("throw e;" would reset it).
                throw;
            }

            IResourceLoaderAware loaderAware = factory as IResourceLoaderAware;
            if (loaderAware != null)
            {
                loaderAware.Inform(loader);
            }
            return factory;
        }
 /// <summary>
 /// Creates a FilteringTokenFilter with an explicit position-increment setting. Construction is
 /// first delegated to the two-argument constructor; the setting is then checked against the
 /// version by <c>CheckPositionIncrement</c> and stored.
 /// </summary>
 /// <param name="version"> Lucene match version </param>
 /// <param name="enablePositionIncrements"> value stored in the <c>enablePositionIncrements</c> field </param>
 /// <param name="input"> <see cref="TokenStream"/> to filter </param>
 public FilteringTokenFilter(Lucene.Net.Util.LuceneVersion version, bool enablePositionIncrements, TokenStream input)
     : this(version, input)
 {
     // NOTE(review): presumably rejects combinations not allowed for this version - confirm in CheckPositionIncrement.
     CheckPositionIncrement(version, enablePositionIncrements);
     this.enablePositionIncrements = enablePositionIncrements;
 }
 /// <summary>
 /// Returns a fully initialized TokenFilterFactory with the specified name and key-value arguments.
 /// The resource loader returned by <c>GetCurrentTypeResourceLoader()</c> is used for loading
 /// resources, so any required ones should be on the test classpath.
 /// </summary>
 /// <param name="name"> factory name, forwarded unchanged </param>
 /// <param name="version"> match version, forwarded unchanged </param>
 /// <param name="keysAndValues"> factory arguments, forwarded unchanged </param>
 /// <returns> the result of the overload that takes an explicit resource loader </returns>
 protected internal virtual TokenFilterFactory TokenFilterFactory(string name, Version version, params string[] keysAndValues)
 {
     return TokenFilterFactory(name, version, GetCurrentTypeResourceLoader(), keysAndValues);
 }
 /// <summary>
 /// Creates an edge n-gram tokenizer that can generate n-grams in the sizes of the given range.
 /// Construction is delegated to the overload taking a side, with <c>Side.FRONT</c>.
 /// </summary>
 /// <param name="version"> the Lucene match version </param>
 /// <param name="input"> reader holding the input to be tokenized </param>
 /// <param name="minGram"> the smallest n-gram to generate </param>
 /// <param name="maxGram"> the largest n-gram to generate </param>
 public Lucene43EdgeNGramTokenizer(Version version, Reader input, int minGram, int maxGram)
     : this(version, input, Side.FRONT, minGram, maxGram)
 {
 }