Inheritance: System.MarshalByRefObject, IDisposable
        /// <summary>
        /// Creates a KeywordTokenizer over <paramref name="input"/> whose term buffer is
        /// resized to <paramref name="bufferSize"/> characters.
        /// </summary>
        /// <param name="input"> reader handed to the base constructor </param>
        /// <param name="bufferSize"> size passed to the term attribute's ResizeBuffer; must be greater than zero </param>
        /// <exception cref="System.ArgumentOutOfRangeException"> if <paramref name="bufferSize"/> is zero or negative </exception>
        public KeywordTokenizer(TextReader input, int bufferSize)
            : base(input)
        {
            // Validate before registering any attributes so a bad argument fails fast.
            // ArgumentOutOfRangeException derives from ArgumentException, so callers that
            // catch ArgumentException keep working.
            if (bufferSize <= 0)
            {
                throw new System.ArgumentOutOfRangeException("bufferSize", "bufferSize must be > 0");
            }

            termAtt = AddAttribute<ICharTermAttribute>();
            offsetAtt = AddAttribute<IOffsetAttribute>();
            termAtt.ResizeBuffer(bufferSize);
        }
	  /// <summary>
	  /// Fills <paramref name="result"/> with the lines of <paramref name="reader"/>.
	  /// Each line is trimmed of leading and trailing whitespace and added as one entry,
	  /// so the reader is expected to hold one word per line. No case conversion is done
	  /// here; supply lowercase words if the consuming analyzer lowercases its tokens.
	  /// </summary>
	  /// <param name="reader"> source of the word list, one word per line </param>
	  /// <param name="result"> the <see cref="CharArraySet"/> that receives the words </param>
	  /// <returns> the same <see cref="CharArraySet"/> instance that was passed in </returns>
	  public static CharArraySet GetWordSet(TextReader reader, CharArraySet result)
	  {
		BufferedReader br = null;
		try
		{
		  br = getBufferedReader(reader);
		  // Pull lines until the reader reports end of input (null).
		  for (string line = br.readLine(); line != null; line = br.readLine())
		  {
			result.add(line.Trim());
		  }
		}
		finally
		{
		  // Runs on both success and failure; br is still null if wrapping the reader threw.
		  IOUtils.close(br);
		}
		return result;
	  }
 /// <summary>
 /// Builds a Lucene43NGramTokenizer over <paramref name="input"/> using explicit
 /// n-gram size bounds, which are forwarded to <c>init</c>.
 /// </summary>
 /// <param name="input"> <see cref="TextReader"/> supplying the text to tokenize </param>
 /// <param name="minGram"> smallest n-gram size to produce </param>
 /// <param name="maxGram"> largest n-gram size to produce </param>
 public Lucene43NGramTokenizer(TextReader input, int minGram, int maxGram) : base(input)
 {
     // Gram-size handling lives in init.
     init(minGram, maxGram);
 }
Exemple #4
0
		/// <summary>
		/// Creates a lexer that reads from <paramref name="r"/> by wrapping it in a
		/// new CharBuffer and delegating to the constructor that accepts that buffer.
		/// </summary>
		/// <param name="r"> reader providing the characters to lex </param>
		public iCalLexer(TextReader r)
			: this(new CharBuffer(r))
		{
		}
 /// <summary>
 /// Builds the analysis components for a field: a PathHierarchyTokenizer over
 /// <paramref name="reader"/>, passed as both constructor arguments of the
 /// returned TokenStreamComponents (no additional filters are applied).
 /// </summary>
 /// <param name="fieldName"> name of the field being analyzed; not used by this implementation </param>
 /// <param name="reader"> reader supplying the text to tokenize </param>
 /// <returns> components whose source and result are the same tokenizer instance </returns>
 public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
 {
     Tokenizer pathTokenizer = new PathHierarchyTokenizer(reader);

     // The tokenizer is used unfiltered, so it is passed for both arguments.
     TokenStreamComponents components = new TokenStreamComponents(pathTokenizer, pathTokenizer);
     return components;
 }
	  /// <summary>
	  /// Points the scanner at a new input reader and resets its state.
	  /// The previous reader is not closed by this method.
	  /// 
	  /// All position and flag fields are reset, and the lexical state is set to
	  /// <c>YYINITIAL</c>. If the scan buffer is longer than <c>ZZ_BUFFERSIZE</c> it is
	  /// replaced by a fresh buffer of that size, discarding its contents.
	  /// </summary>
	  /// <param name="reader"> the new input reader </param>
	  public void yyreset(Reader reader)
	  {
		zzReader = reader;

		// Scanner flags.
		zzAtBOL = true;
		zzAtEOF = false;
		zzEOFDone = false;

		// Buffer positions.
		zzStartRead = 0;
		zzEndRead = 0;
		zzMarkedPos = 0;
		zzCurrentPos = 0;

		// Line / character / column counters.
		yycolumn = 0;
		yychar_Renamed = 0;
		yyline = 0;

		zzLexicalState = YYINITIAL;

		// Shrink the buffer back to its initial length only if it has grown.
		bool bufferHasGrown = zzBuffer.Length > ZZ_BUFFERSIZE;
		if (bufferHasGrown)
		{
		  zzBuffer = new char[ZZ_BUFFERSIZE];
		}
	  }
	  /// <summary>
	  /// Creates a new scanner that reads from the given reader.
	  /// </summary>
	  /// <param name="in"> the reader stored as this scanner's input </param>
	  internal ClassicTokenizerImpl(Reader @in)
	  {
		// Only the input reader is stored here.
		zzReader = @in;
	  }
	    // TODO: refactor to a shared readFully somewhere
	  // (NGramTokenizer does this too):
	  /// <summary>
	  /// Reads characters from <paramref name="input"/> into <paramref name="buffer"/>,
	  /// starting at <paramref name="offset"/>, until <paramref name="length"/> characters
	  /// have been read or the reader is exhausted (a "read fully" loop).
	  /// </summary>
	  /// <param name="input"> reader to pull characters from </param>
	  /// <param name="buffer"> destination array </param>
	  /// <param name="offset"> index in <paramref name="buffer"/> at which the first character is stored </param>
	  /// <param name="length"> maximum number of characters to read; must not be negative </param>
	  /// <returns> the number of characters actually read, which is less than
	  /// <paramref name="length"/> only when the end of the input was reached </returns>
	  private static int Read(TextReader input, char[] buffer, int offset, int length)
	  {
		Debug.Assert(length >= 0, "length must not be negative: " + length);

		int remaining = length;
		while (remaining > 0)
		{
		  int location = length - remaining;
		  int count = input.Read(buffer, offset + location, remaining);
		  // TextReader.Read returns 0 at end of input (Java's Reader returns -1);
		  // checking only for -1 would spin forever once the reader is exhausted.
		  if (count <= 0) // EOF
		  {
			break;
		  }
		  remaining -= count;
		}
		return length - remaining;
	  }
	  /// <summary>
	  /// Builds an NGramTokenizer that uses the default gram sizes
	  /// (<c>DEFAULT_MIN_NGRAM_SIZE</c> and <c>DEFAULT_MAX_NGRAM_SIZE</c>).
	  /// </summary>
	  /// <param name="version"> the Lucene compatibility version </param>
	  /// <param name="input"> <see cref="TextReader"/> supplying the text to tokenize </param>
	  public NGramTokenizer(LuceneVersion version, TextReader input)
		: this(version, input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE)
	  {
	  }
Exemple #10
0
	  /// <summary>
	  /// Fills <paramref name="result"/> with the non-comment lines of <paramref name="reader"/>.
	  /// A line is skipped when it starts with <paramref name="comment"/> (ordinal comparison,
	  /// checked before trimming); every other line is trimmed of leading and trailing
	  /// whitespace and added as one entry. No case conversion is done here; supply
	  /// lowercase words if the consuming analyzer lowercases its tokens.
	  /// </summary>
	  /// <param name="reader"> source of the word list, one word per line </param>
	  /// <param name="comment"> prefix that marks a line as a comment </param>
	  /// <param name="result"> the <see cref="CharArraySet"/> that receives the words </param>
	  /// <returns> the same <see cref="CharArraySet"/> instance that was passed in </returns>
	  public static CharArraySet GetWordSet(TextReader reader, string comment, CharArraySet result)
	  {
		BufferedReader br = null;
		try
		{
		  br = getBufferedReader(reader);
		  while (true)
		  {
			string line = br.ReadLine();
			if (line == null)
			{
			  break; // end of input
			}
			if (line.StartsWith(comment, StringComparison.Ordinal))
			{
			  continue; // comment line
			}
			result.add(line.Trim());
		  }
		}
		finally
		{
		  // Runs on both success and failure; br is still null if wrapping the reader threw.
		  IOUtils.Close(br);
		}
		return result;
	  }
 /// <summary>
 /// Creates a KeywordTokenizer over <paramref name="input"/> using
 /// <c>DEFAULT_BUFFER_SIZE</c> as the buffer size.
 /// </summary>
 /// <param name="input"> reader supplying the text to tokenize </param>
 public KeywordTokenizer(TextReader input) : this(input, DEFAULT_BUFFER_SIZE)
 {
 }
Exemple #12
0
	  /// <summary>
	  /// Internal constructor taking an attribute factory. Hands <paramref name="factory"/>
	  /// and <paramref name="input"/> to the base class and forwards the remaining
	  /// arguments to <c>init</c>.
	  /// </summary>
	  /// <param name="version"> the Lucene compatibility version </param>
	  /// <param name="factory"> attribute factory passed to the base constructor </param>
	  /// <param name="input"> <see cref="TextReader"/> supplying the text to tokenize </param>
	  /// <param name="minGram"> smallest n-gram size to produce </param>
	  /// <param name="maxGram"> largest n-gram size to produce </param>
	  /// <param name="edgesOnly"> flag forwarded unchanged to <c>init</c> </param>
	  internal NGramTokenizer(LuceneVersion version, AttributeFactory factory, TextReader input, int minGram, int maxGram, bool edgesOnly)
		: base(factory, input)
	  {
		init(version, minGram, maxGram, edgesOnly);
	  }
Exemple #13
0
	  /// <summary>
	  /// Builds an NGramTokenizer with explicit gram size bounds. Delegates to the
	  /// five-argument constructor with <c>edgesOnly</c> set to <c>false</c>.
	  /// </summary>
	  /// <param name="version"> the Lucene compatibility version </param>
	  /// <param name="input"> <see cref="TextReader"/> supplying the text to tokenize </param>
	  /// <param name="minGram"> smallest n-gram size to produce </param>
	  /// <param name="maxGram"> largest n-gram size to produce </param>
	  public NGramTokenizer(LuceneVersion version, TextReader input, int minGram, int maxGram)
		: this(version, input, minGram, maxGram, false)
	  {
	  }
Exemple #14
0
	  /// <summary>
	  /// Internal constructor without an attribute factory. Hands <paramref name="input"/>
	  /// to the base class and forwards the remaining arguments to <c>init</c>.
	  /// </summary>
	  /// <param name="version"> the Lucene compatibility version </param>
	  /// <param name="input"> <see cref="TextReader"/> supplying the text to tokenize </param>
	  /// <param name="minGram"> smallest n-gram size to produce </param>
	  /// <param name="maxGram"> largest n-gram size to produce </param>
	  /// <param name="edgesOnly"> flag forwarded unchanged to <c>init</c> </param>
	  internal NGramTokenizer(LuceneVersion version, TextReader input, int minGram, int maxGram, bool edgesOnly)
		: base(input)
	  {
		init(version, minGram, maxGram, edgesOnly);
	  }
 /// <summary>
 /// Builds a Lucene43NGramTokenizer with explicit n-gram size bounds and a
 /// caller-supplied attribute factory.
 /// </summary>
 /// <param name="factory"> attribute factory passed to the base constructor </param>
 /// <param name="input"> <see cref="TextReader"/> supplying the text to tokenize </param>
 /// <param name="minGram"> smallest n-gram size to produce </param>
 /// <param name="maxGram"> largest n-gram size to produce </param>
 public Lucene43NGramTokenizer(AttributeFactory factory, TextReader input, int minGram, int maxGram) : base(factory, input)
 {
     // Gram-size handling lives in init.
     init(minGram, maxGram);
 }
Exemple #16
0
	  /// <summary>
	  /// Reads the non-comment lines of <paramref name="reader"/> into a newly created
	  /// CharArraySet. The set is constructed with <paramref name="matchVersion"/>,
	  /// <c>INITIAL_CAPACITY</c> and <c>false</c>, then filled by the overload that
	  /// takes an existing set.
	  /// </summary>
	  /// <param name="reader"> source of the word list, one word per line </param>
	  /// <param name="comment"> prefix that marks a line as a comment </param>
	  /// <param name="matchVersion"> version passed to the CharArraySet constructor </param>
	  /// <returns> the new CharArraySet containing the reader's words </returns>
	  public static CharArraySet GetWordSet(TextReader reader, string comment, Version matchVersion)
	  {
		CharArraySet words = new CharArraySet(matchVersion, INITIAL_CAPACITY, false);
		return GetWordSet(reader, comment, words);
	  }
 /// <summary>
 /// Builds a Lucene43NGramTokenizer that uses the default gram sizes
 /// (<c>DEFAULT_MIN_NGRAM_SIZE</c> and <c>DEFAULT_MAX_NGRAM_SIZE</c>).
 /// </summary>
 /// <param name="input"> <see cref="TextReader"/> supplying the text to tokenize </param>
 public Lucene43NGramTokenizer(TextReader input) : this(input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE)
 {
 }
Exemple #18
0
	  /// <summary>
	  /// Builds an NGramTokenizer with explicit gram size bounds and a caller-supplied
	  /// attribute factory. Delegates to the six-argument constructor with
	  /// <c>edgesOnly</c> set to <c>false</c>.
	  /// </summary>
	  /// <param name="version"> the Lucene compatibility version </param>
	  /// <param name="factory"> attribute factory to use </param>
	  /// <param name="input"> <see cref="TextReader"/> supplying the text to tokenize </param>
	  /// <param name="minGram"> smallest n-gram size to produce </param>
	  /// <param name="maxGram"> largest n-gram size to produce </param>
	  public NGramTokenizer(LuceneVersion version, AttributeFactory factory, TextReader input, int minGram, int maxGram)
		: this(version, factory, input, minGram, maxGram, false)
	  {
	  }