/// <summary>
/// Creates a KeywordTokenizer over <paramref name="input"/> and resizes its term
/// buffer to <paramref name="bufferSize"/>.
/// </summary>
/// <param name="input"> reader handed to the base constructor </param>
/// <param name="bufferSize"> size requested for the term buffer; must be greater than zero </param>
/// <exception cref="System.ArgumentException"> if <paramref name="bufferSize"/> is zero or negative </exception>
public KeywordTokenizer(TextReader input, int bufferSize)
    : base(input)
{
    // Both attributes are registered first; the size check follows.
    this.termAtt = AddAttribute<ICharTermAttribute>();
    this.offsetAtt = AddAttribute<IOffsetAttribute>();

    if (bufferSize < 1)
    {
        throw new System.ArgumentException("bufferSize must be > 0");
    }

    this.termAtt.ResizeBuffer(bufferSize);
}
/// <summary>
/// Reads <paramref name="reader"/> line by line and adds every line, stripped of
/// leading and trailing whitespace, to <paramref name="result"/>. Each line is
/// expected to hold a single word. The words need to be in lowercase if you make
/// use of an Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
/// </summary>
/// <param name="reader"> reader containing the word list </param>
/// <param name="result"> the <see cref="CharArraySet"/> to fill with the reader's words </param>
/// <returns> the given <see cref="CharArraySet"/>, now containing the reader's words </returns>
public static CharArraySet GetWordSet(TextReader reader, CharArraySet result)
{
    BufferedReader lineSource = null;
    try
    {
        lineSource = getBufferedReader(reader);

        // A null line marks the end of the input.
        for (string line = lineSource.readLine(); line != null; line = lineSource.readLine())
        {
            result.add(line.Trim());
        }
    }
    finally
    {
        // Runs whether or not reading succeeded; lineSource may still be null here.
        IOUtils.close(lineSource);
    }

    return result;
}
/// <summary>
/// Creates a Lucene43NGramTokenizer with the given minimum and maximum n-gram sizes.
/// </summary>
/// <param name="input"> <see cref="TextReader"/> holding the input to be tokenized </param>
/// <param name="minGram"> the smallest n-gram to generate </param>
/// <param name="maxGram"> the largest n-gram to generate </param>
public Lucene43NGramTokenizer(TextReader input, int minGram, int maxGram)
    : base(input)
{
    // The gram sizes are handed to init unchanged.
    init(minGram, maxGram);
}
/// <summary>
/// Creates a lexer that reads from <paramref name="r"/> by wrapping it in a new
/// <see cref="CharBuffer"/> and delegating to the constructor that accepts one.
/// </summary>
/// <param name="r"> the reader supplying the text to lex </param>
public iCalLexer(TextReader r)
    : this(new CharBuffer(r))
{
}
/// <summary>
/// Builds the analysis components for a field: a <see cref="PathHierarchyTokenizer"/>
/// over <paramref name="reader"/>, passed as both arguments of the returned
/// <see cref="TokenStreamComponents"/>.
/// </summary>
/// <param name="fieldName"> name of the field; not used by this implementation </param>
/// <param name="reader"> reader supplying the text to tokenize </param>
/// <returns> components built from the single tokenizer instance </returns>
public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    Tokenizer source = new PathHierarchyTokenizer(reader);

    // The same instance is supplied for both constructor arguments.
    return new TokenStreamComponents(source, source);
}
/// <summary>
/// Resets the scanner to read from a new input stream.
/// Does not close the old reader.
///
/// All internal variables are reset; the old input stream <b>cannot</b> be reused
/// (the internal buffer is discarded and lost). The lexical state is set to
/// <c>YYINITIAL</c>.
///
/// The internal scan buffer is replaced by one of length <c>ZZ_BUFFERSIZE</c> if it
/// has grown beyond that length.
/// </summary>
/// <param name="reader"> the new input stream </param>
public void yyreset(Reader reader)
{
    zzReader = reader;

    // Stream-state flags.
    zzAtBOL = true;
    zzAtEOF = false;
    zzEOFDone = false;

    // Buffer cursors all go back to the start.
    zzStartRead = 0;
    zzEndRead = 0;
    zzMarkedPos = 0;
    zzCurrentPos = 0;

    // Position counters.
    yycolumn = 0;
    yychar_Renamed = 0;
    yyline = 0;

    zzLexicalState = YYINITIAL;

    // Only shrink: a buffer at or below ZZ_BUFFERSIZE is kept as it is.
    if (zzBuffer.Length > ZZ_BUFFERSIZE)
    {
        zzBuffer = new char[ZZ_BUFFERSIZE];
    }
}
/// <summary>
/// Creates a new scanner that reads its input from the given reader.
/// </summary>
/// <param name="in"> the reader to read input from </param>
internal ClassicTokenizerImpl(Reader @in)
{
    // The reader is only stored here; nothing is read from it in this constructor.
    zzReader = @in;
}
// TODO: refactor to a shared readFully somewhere
// (NGramTokenizer does this too):
/// <summary>
/// Reads characters from <paramref name="input"/> into <paramref name="buffer"/>
/// until <paramref name="length"/> characters have been read or the input is
/// exhausted (commons-io's readFully, but without bugs if offset != 0).
/// </summary>
/// <param name="input"> reader to pull characters from </param>
/// <param name="buffer"> destination array </param>
/// <param name="offset"> index in <paramref name="buffer"/> at which the first character is stored </param>
/// <param name="length"> maximum number of characters to read; must not be negative </param>
/// <returns> the number of characters actually read, which is less than
/// <paramref name="length"/> only when the end of the input was reached </returns>
private static int Read(TextReader input, char[] buffer, int offset, int length)
{
    Debug.Assert(length >= 0, "length must not be negative: " + length);

    int remaining = length;
    while (remaining > 0)
    {
        int location = length - remaining;
        int count = input.Read(buffer, offset + location, remaining);

        // TextReader.Read returns 0 at end of input (not -1 as java.io.Reader does).
        // Testing <= 0 covers both conventions and guarantees the loop terminates.
        if (count <= 0) // EOF
        {
            break;
        }

        remaining -= count;
    }

    return length - remaining;
}
/// <summary>
/// Creates an NGramTokenizer that uses <c>DEFAULT_MIN_NGRAM_SIZE</c> and
/// <c>DEFAULT_MAX_NGRAM_SIZE</c> as its n-gram bounds.
/// </summary>
/// <param name="version"> the Lucene compatibility version </param>
/// <param name="input"> <see cref="TextReader"/> holding the input to be tokenized </param>
public NGramTokenizer(LuceneVersion version, TextReader input)
    : this(version, input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE)
{
}
/// <summary>
/// Reads <paramref name="reader"/> line by line and adds every line that does not
/// start with <paramref name="comment"/> to <paramref name="result"/>, stripped of
/// leading and trailing whitespace. Each line is expected to hold a single word.
/// The words need to be in lowercase if you make use of an Analyzer which uses
/// LowerCaseFilter (like StandardAnalyzer).
/// </summary>
/// <param name="reader"> reader containing the word list </param>
/// <param name="comment"> the string that marks a line as a comment </param>
/// <param name="result"> the <see cref="CharArraySet"/> to fill with the reader's words </param>
/// <returns> the given <see cref="CharArraySet"/>, now containing the reader's words </returns>
public static CharArraySet GetWordSet(TextReader reader, string comment, CharArraySet result)
{
    BufferedReader lineSource = null;
    try
    {
        lineSource = getBufferedReader(reader);

        while (true)
        {
            string line = lineSource.ReadLine();
            if (line == null)
            {
                // End of input.
                break;
            }

            // The comment test is ordinal and runs on the untrimmed line.
            if (line.StartsWith(comment, StringComparison.Ordinal))
            {
                continue;
            }

            result.add(line.Trim());
        }
    }
    finally
    {
        // Runs whether or not reading succeeded; lineSource may still be null here.
        IOUtils.Close(lineSource);
    }

    return result;
}
/// <summary>
/// Creates a KeywordTokenizer over <paramref name="input"/> using
/// <c>DEFAULT_BUFFER_SIZE</c> as the buffer size.
/// </summary>
/// <param name="input"> reader supplying the text to tokenize </param>
public KeywordTokenizer(TextReader input)
    : this(input, DEFAULT_BUFFER_SIZE)
{
}
/// <summary>
/// Creates an NGramTokenizer using the given attribute factory. The factory and
/// input go to the base constructor; all remaining arguments are handed to
/// <c>init</c> unchanged.
/// </summary>
/// <param name="version"> the Lucene compatibility version </param>
/// <param name="factory"> <see cref="AttributeFactory"/> to use </param>
/// <param name="input"> <see cref="TextReader"/> holding the input to be tokenized </param>
/// <param name="minGram"> the smallest n-gram to generate </param>
/// <param name="maxGram"> the largest n-gram to generate </param>
/// <param name="edgesOnly"> flag forwarded to <c>init</c> </param>
internal NGramTokenizer(
    LuceneVersion version,
    AttributeFactory factory,
    TextReader input,
    int minGram,
    int maxGram,
    bool edgesOnly)
    : base(factory, input)
{
    init(version, minGram, maxGram, edgesOnly);
}
/// <summary>
/// Creates an NGramTokenizer with the given minimum and maximum n-gram sizes.
/// Delegates to the overload that also takes an <c>edgesOnly</c> flag, passing
/// <c>false</c> for it.
/// </summary>
/// <param name="version"> the Lucene compatibility version </param>
/// <param name="input"> <see cref="TextReader"/> holding the input to be tokenized </param>
/// <param name="minGram"> the smallest n-gram to generate </param>
/// <param name="maxGram"> the largest n-gram to generate </param>
public NGramTokenizer(LuceneVersion version, TextReader input, int minGram, int maxGram)
    : this(version, input, minGram, maxGram, false)
{
}
/// <summary>
/// Creates an NGramTokenizer over <paramref name="input"/>. The input goes to the
/// base constructor; all remaining arguments are handed to <c>init</c> unchanged.
/// </summary>
/// <param name="version"> the Lucene compatibility version </param>
/// <param name="input"> <see cref="TextReader"/> holding the input to be tokenized </param>
/// <param name="minGram"> the smallest n-gram to generate </param>
/// <param name="maxGram"> the largest n-gram to generate </param>
/// <param name="edgesOnly"> flag forwarded to <c>init</c> </param>
internal NGramTokenizer(
    LuceneVersion version,
    TextReader input,
    int minGram,
    int maxGram,
    bool edgesOnly)
    : base(input)
{
    init(version, minGram, maxGram, edgesOnly);
}
/// <summary>
/// Creates a Lucene43NGramTokenizer with the given minimum and maximum n-gram
/// sizes, using the supplied attribute factory.
/// </summary>
/// <param name="factory"> <see cref="AttributeFactory"/> to use </param>
/// <param name="input"> <see cref="TextReader"/> holding the input to be tokenized </param>
/// <param name="minGram"> the smallest n-gram to generate </param>
/// <param name="maxGram"> the largest n-gram to generate </param>
public Lucene43NGramTokenizer(
    AttributeFactory factory,
    TextReader input,
    int minGram,
    int maxGram)
    : base(factory, input)
{
    // The gram sizes are handed to init unchanged.
    init(minGram, maxGram);
}
/// <summary>
/// Reads lines from a reader and adds every non-comment line as an entry to a newly
/// created CharArraySet (omitting leading and trailing whitespace). Every line of
/// the reader should contain only one word. The words need to be in lowercase if
/// you make use of an Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
/// </summary>
/// <param name="reader"> reader containing the word list </param>
/// <param name="comment"> the string that marks a line as a comment </param>
/// <param name="matchVersion"> the Lucene <see cref="Version"/> passed to the new set </param>
/// <returns> a CharArraySet with the reader's words </returns>
public static CharArraySet GetWordSet(TextReader reader, string comment, Version matchVersion)
{
    // Fresh set built from matchVersion, INITIAL_CAPACITY and false; the overload
    // that takes a target set does the actual reading.
    CharArraySet words = new CharArraySet(matchVersion, INITIAL_CAPACITY, false);
    return GetWordSet(reader, comment, words);
}
/// <summary>
/// Creates a Lucene43NGramTokenizer that uses <c>DEFAULT_MIN_NGRAM_SIZE</c> and
/// <c>DEFAULT_MAX_NGRAM_SIZE</c> as its n-gram bounds.
/// </summary>
/// <param name="input"> <see cref="TextReader"/> holding the input to be tokenized </param>
public Lucene43NGramTokenizer(TextReader input)
    : this(input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE)
{
}
/// <summary>
/// Creates an NGramTokenizer with the given minimum and maximum n-gram sizes, using
/// the supplied attribute factory. Delegates to the overload that also takes an
/// <c>edgesOnly</c> flag, passing <c>false</c> for it.
/// </summary>
/// <param name="version"> the Lucene compatibility version </param>
/// <param name="factory"> <see cref="AttributeFactory"/> to use </param>
/// <param name="input"> <see cref="TextReader"/> holding the input to be tokenized </param>
/// <param name="minGram"> the smallest n-gram to generate </param>
/// <param name="maxGram"> the largest n-gram to generate </param>
public NGramTokenizer(
    LuceneVersion version,
    AttributeFactory factory,
    TextReader input,
    int minGram,
    int maxGram)
    : this(version, factory, input, minGram, maxGram, false)
{
}