/// <summary>
/// Advances the wrapped stream by one token and normalizes its term in place.
/// <p/>For tokens whose type is <c>APOSTROPHE_TYPE</c>, a trailing <tt>'s</tt> or <tt>'S</tt> is cut off.
/// <p/>For tokens whose type is <c>ACRONYM_TYPE</c>, every dot is removed from the term.
/// </summary>
/// <returns><c>true</c> when a token is available, <c>false</c> when the wrapped stream has no more tokens.</returns>
public override bool IncrementToken()
{
    if (!input.IncrementToken())
    {
        return false;
    }

    char[] termChars = termAtt.TermBuffer();
    int termLen = termAtt.TermLength();
    System.String tokenType = typeAtt.Type();

    // Token types are matched by reference identity, not by string content.
    bool hasPossessiveSuffix = false;
    if (object.ReferenceEquals(tokenType, APOSTROPHE_TYPE) && termLen >= 2 && termChars[termLen - 2] == '\'')
    {
        char lastChar = termChars[termLen - 1];
        hasPossessiveSuffix = lastChar == 's' || lastChar == 'S';
    }

    if (hasPossessiveSuffix)
    {
        // Drop the apostrophe and the trailing s/S.
        termAtt.SetTermLength(termLen - 2);
    }
    else if (object.ReferenceEquals(tokenType, ACRONYM_TYPE))
    {
        // Compact the term in place, skipping dots; 'kept' never runs ahead of 'read'.
        int kept = 0;
        for (int read = 0; read < termLen; read++)
        {
            if (termChars[read] == '.')
            {
                continue;
            }
            termChars[kept] = termChars[read];
            kept++;
        }
        termAtt.SetTermLength(kept);
    }

    return true;
}
/// <summary>
/// On the first call, reads everything available from <c>input</c> into the term
/// buffer and emits it as a single token; every later call returns <c>false</c>.
/// </summary>
/// <returns><c>true</c> for the single token, <c>false</c> once it has been emitted.</returns>
public override bool IncrementToken()
{
    if (done)
    {
        return false;
    }

    ClearAttributes();
    done = true;

    int upto = 0;
    char[] buffer = termAtt.TermBuffer();
    while (true)
    {
        int length = input.Read(buffer, upto, buffer.Length - upto);

        // Stop on any non-positive result. A reader that reports end of input
        // with a negative value would otherwise move 'upto' backwards.
        if (length <= 0)
        {
            break;
        }

        upto += length;
        if (upto == buffer.Length)
        {
            // Buffer is full: request a larger one before reading further.
            buffer = termAtt.ResizeTermBuffer(1 + buffer.Length);
        }
    }

    termAtt.SetTermLength(upto);

    // The token spans the whole input: corrected offsets 0 .. upto.
    finalOffset = CorrectOffset(upto);
    offsetAtt.SetOffset(CorrectOffset(0), finalOffset);
    return true;
}
/// <summary>
/// Advances to the next token reported by the scanner whose length does not
/// exceed <c>maxTokenLength</c>, filling in the term, offset, position-increment
/// and type attributes. Tokens that are too long are skipped, and each skipped
/// token raises the position increment of the next emitted token by one.
/// </summary>
/// <returns><c>true</c> when a token was produced, <c>false</c> when the scanner reports <c>YYEOF</c>.</returns>
public override bool IncrementToken()
{
    ClearAttributes();

    // The increment starts at 1 and grows by one for every over-long token skipped.
    for (int positionIncrement = 1; ; positionIncrement++)
    {
        int tokenType = scanner.GetNextToken();
        if (tokenType == StandardTokenizerImpl.YYEOF)
        {
            return false;
        }

        if (scanner.Yylength() > maxTokenLength)
        {
            // Too long: skip it, but still count its position.
            continue;
        }

        posIncrAtt.SetPositionIncrement(positionIncrement);
        scanner.GetText(termAtt);

        int start = scanner.Yychar();
        int correctedStart = CorrectOffset(start);
        int correctedEnd = CorrectOffset(start + termAtt.TermLength());
        offsetAtt.SetOffset(correctedStart, correctedEnd);

        if (tokenType != StandardTokenizerImpl.ACRONYM_DEP)
        {
            typeAtt.SetType(StandardTokenizerImpl.TOKEN_TYPES[tokenType]);
        }
        else if (replaceInvalidAcronym)
        {
            // Transitional handling, meant to be removed in a later release:
            // an invalid acronym is re-typed as HOST and its extra trailing '.' is dropped.
            typeAtt.SetType(StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.HOST]);
            termAtt.SetTermLength(termAtt.TermLength() - 1);
        }
        else
        {
            typeAtt.SetType(StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.ACRONYM]);
        }

        return true;
    }
}
/// <summary>
/// Emits exactly one token. If the read into <c>buffer</c> returned a single
/// character, that character is the term; otherwise the term is empty.
/// </summary>
/// <returns><c>true</c> on the first call, <c>false</c> on every call after it.</returns>
public override bool IncrementToken()
{
    // The read is issued on every call, including calls made after the token was emitted.
    int charsRead = input.Read((System.Char[])buffer, 0, buffer.Length);

    if (done)
    {
        return false;
    }

    ClearAttributes();
    done = true;

    if (charsRead != 1)
    {
        termAtt.SetTermLength(0);
        return true;
    }

    termAtt.TermBuffer()[0] = buffer[0];
    termAtt.SetTermLength(1);
    return true;
}
/// <summary>
/// Collects the next run of consecutive characters accepted by
/// <c>IsTokenChar</c>, passing each through <c>Normalize</c>, and emits the run
/// as one token. A run is cut off once it reaches <c>MAX_WORD_LEN</c> characters.
/// </summary>
/// <returns><c>true</c> when a token was produced, <c>false</c> when the input ran out before any token character was found.</returns>
public override bool IncrementToken()
{
    ClearAttributes();

    int tokenLen = 0;
    int tokenStart = bufferIndex;
    char[] termChars = termAtt.TermBuffer();

    for (;;)
    {
        if (bufferIndex >= dataLen)
        {
            // The I/O buffer is used up: account for it in the running offset, then refill.
            offset += dataLen;
            dataLen = input.Read((System.Char[])ioBuffer, 0, ioBuffer.Length);
            if (dataLen <= 0)
            {
                // Reset to zero so the next 'offset += dataLen' cannot decrement the offset.
                dataLen = 0;
                if (tokenLen == 0)
                {
                    return false;
                }
                break; // emit whatever was collected before the input ended
            }
            bufferIndex = 0;
        }

        char current = ioBuffer[bufferIndex++];

        if (!IsTokenChar(current))
        {
            if (tokenLen > 0)
            {
                // A non-token character ends the token in progress.
                break;
            }
            // No token started yet: keep skipping.
            continue;
        }

        if (tokenLen == 0)
        {
            // First character of the token: remember its position in the input.
            tokenStart = offset + bufferIndex - 1;
        }
        else if (tokenLen == termChars.Length)
        {
            // Term buffer is full: request a larger one.
            termChars = termAtt.ResizeTermBuffer(1 + tokenLen);
        }

        termChars[tokenLen] = Normalize(current);
        tokenLen++;

        if (tokenLen == MAX_WORD_LEN)
        {
            // Length cap reached: emit what we have.
            break;
        }
    }

    termAtt.SetTermLength(tokenLen);
    offsetAtt.SetOffset(CorrectOffset(tokenStart), CorrectOffset(tokenStart + tokenLen));
    return true;
}