/// <summary>
/// Creates a new <c>KeywordTokenizer</c> reading from <paramref name="input"/>,
/// pre-sizing the term attribute's buffer to <paramref name="bufferSize"/> chars.
/// </summary>
/// <param name="input">the reader to consume; passed through to the base tokenizer</param>
/// <param name="bufferSize">initial term buffer capacity in chars; must be &gt; 0</param>
/// <exception cref="System.ArgumentException">if <paramref name="bufferSize"/> is not positive</exception>
public KeywordTokenizer(TextReader input, int bufferSize)
    : base(input)
{
    if (bufferSize <= 0)
    {
        // Include the parameter name so callers can see which argument was invalid.
        throw new System.ArgumentException("bufferSize must be > 0", "bufferSize");
    }
    termAtt.ResizeBuffer(bufferSize);
}
/// <summary>
/// Advances the wrapped stream one token, optionally appends the configured
/// marker character, then reverses the term text in place (so the marker,
/// appended last, ends up at the front after reversal).
/// </summary>
/// <returns><c>true</c> if a token was produced; <c>false</c> when the input is exhausted.</returns>
public override bool IncrementToken()
{
    if (!input.IncrementToken())
    {
        return false;
    }

    int newLength = termAtt.Length;
    if (marker != NOMARKER)
    {
        // Grow the buffer by one and write the marker into the new last slot.
        newLength++;
        termAtt.ResizeBuffer(newLength);
        termAtt.Buffer()[newLength - 1] = marker;
    }
    reverse(matchVersion, termAtt.Buffer(), 0, newLength);
    termAtt.Length = newLength;
    return true;
}
/// <summary>
/// Validates the n-gram bounds and version, then sets up the character buffers
/// shared by the tokenizer.
/// </summary>
/// <param name="version">Lucene compatibility version; must be 4.4 or later.</param>
/// <param name="minGram">smallest n-gram size to emit; must be &gt;= 1</param>
/// <param name="maxGram">largest n-gram size to emit; must be &gt;= <paramref name="minGram"/></param>
/// <param name="edgesOnly">if <c>true</c>, only emit grams anchored at token edges</param>
/// <exception cref="System.ArgumentException">on a pre-4.4 version or invalid gram bounds</exception>
private void init(LuceneVersion version, int minGram, int maxGram, bool edgesOnly)
{
    if (!version.OnOrAfter(LuceneVersion.LUCENE_44))
    {
        throw new System.ArgumentException("This class only works with Lucene 4.4+. To emulate the old (broken) behavior of NGramTokenizer, use Lucene43NGramTokenizer/Lucene43EdgeNGramTokenizer");
    }
    // The guard above guarantees version >= 4.4, so the original conditional's
    // Java4Instance fallback was unreachable dead code; always use the
    // version-aware instance.
    charUtils = CharacterUtils.GetInstance(version);
    if (minGram < 1)
    {
        throw new System.ArgumentException("minGram must be greater than zero");
    }
    if (minGram > maxGram)
    {
        throw new System.ArgumentException("minGram must not be greater than maxGram");
    }
    this.minGram = minGram;
    this.maxGram = maxGram;
    this.edgesOnly = edgesOnly;
    // 2 * maxGram in case all code points require 2 chars, plus 1024 of slack
    // so we don't keep polling the Reader.
    charBuffer = CharacterUtils.NewCharacterBuffer(2 * maxGram + 1024);
    buffer = new int[charBuffer.Buffer.Length];
    // Make the term attribute large enough up front to hold any single gram.
    termAtt.ResizeBuffer(2 * maxGram);
}
/// <summary>
/// Constructs a compound (bigram) token from the text accumulated in
/// <c>buffer</c> plus the current term, and installs it into the attributes.
/// </summary>
private void GramToken()
{
    // Capture the current term text and its end offset before the attributes
    // are wiped by ClearAttributes().
    buffer.Append(termAttribute.Buffer(), 0, termAttribute.Length);
    int gramEndOffset = offsetAttribute.EndOffset();

    ClearAttributes();

    int gramLength = buffer.Length;
    char[] dest = termAttribute.Buffer();
    if (dest.Length < gramLength)
    {
        dest = termAttribute.ResizeBuffer(gramLength);
    }
    buffer.GetChars(0, gramLength, dest, 0);
    termAttribute.Length = gramLength;

    // The gram shares its position with the preceding token (increment 0) and
    // spans two positions.
    posIncAttribute.PositionIncrement = 0;
    posLenAttribute.PositionLength = 2; // bigram
    offsetAttribute.SetOffset(lastStartOffset, gramEndOffset);
    typeAttribute.Type = GRAM_TYPE;

    // Reset the accumulator for the next gram.
    buffer.Length = 0;
}
/// <summary>
/// Scans the input for the next maximal run of token characters (as decided by
/// <c>isTokenChar</c>) and exposes it via the term and offset attributes.
/// Operates on code points so supplementary (surrogate-pair) characters are
/// never split; each accepted code point is passed through <c>Normalize</c>.
/// </summary>
/// <returns><c>true</c> if a token was produced; <c>false</c> at end of input.</returns>
public override bool IncrementToken()
{
    ClearAttributes();
    int length = 0;       // number of chars accumulated in the term buffer
    int start = -1;       // this variable is always initialized
    int end_Renamed = -1; // exclusive end offset of the current token
    char[] buffer = termAtt.Buffer();
    while (true)
    {
        // Refill the I/O buffer once it has been fully consumed.
        if (bufferIndex >= dataLen)
        {
            offset += dataLen;
            charUtils.fill(ioBuffer, input); // read supplementary char aware with CharacterUtils
            if (ioBuffer.Length == 0)
            {
                dataLen = 0; // so next offset += dataLen won't decrement offset
                if (length > 0)
                {
                    // End of input with a partially accumulated token: emit it.
                    break;
                }
                else
                {
                    finalOffset = CorrectOffset(offset);
                    return(false);
                }
            }
            dataLen = ioBuffer.Length;
            bufferIndex = 0;
        }
        // use CharacterUtils here to support < 3.1 UTF-16 code unit behavior if the char based methods are gone
        int c = charUtils.codePointAt(ioBuffer.Buffer, bufferIndex, ioBuffer.Length);
        int charCount = Character.CharCount(c); // 1 for BMP, 2 for supplementary
        bufferIndex += charCount;
        if (isTokenChar(c)) // if it's a token char
        {
            if (length == 0) // start of token
            {
                Debug.Assert(start == -1);
                start = offset + bufferIndex - charCount;
                end_Renamed = start;
            }
            else if (length >= buffer.Length - 1) // check if a supplementary could run out of bounds
            {
                buffer = termAtt.ResizeBuffer(2 + length); // make sure a supplementary fits in the buffer
            }
            end_Renamed += charCount;
            length += Character.ToChars(Normalize(c), buffer, length); // buffer it, normalized
            if (length >= MAX_WORD_LEN) // buffer overflow! make sure to check for >= surrogate pair could break == test
            {
                break;
            }
        }
        else if (length > 0) // at non-Letter w/ chars
        {
            break; // return 'em
        }
    }
    termAtt.Length = length;
    Debug.Assert(start != -1);
    // finalOffset is updated as a side effect so EndOffset reporting stays consistent.
    offsetAtt.SetOffset(CorrectOffset(start), finalOffset = CorrectOffset(end_Renamed));
    return(true);
}