public override bool IncrementToken()
{
    if (input.IncrementToken())
    {
        int newlen = stemmer.Stem(termAtt.TermBuffer(), termAtt.TermLength());
        termAtt.SetTermLength(newlen);
        return true;
    }
    else
    {
        return false;
    }
}
public override bool IncrementToken()
{
    if (_buffer.Any())
    {
        var nextStem = _buffer.Dequeue();
        RestoreState(_savedState);
        _posIncAtt.PositionIncrement = 0;
        _termAtt.SetTermBuffer(nextStem.Stem, 0, nextStem.StemLength);
        return true;
    }

    if (!input.IncrementToken())
    {
        return false;
    }

    var newTerms = _dedup
        ? _stemmer.UniqueStems(_termAtt.Term)
        : _stemmer.Stem(_termAtt.Term);
    foreach (var newTerm in newTerms)
    {
        _buffer.Enqueue(newTerm);
    }

    if (_buffer.Count == 0)
    {
        // originally: we do not know this word, return it unchanged
        // changed: apply SlovakStemmer to words not found in the dictionary (possible named entities)
        var currentTerm = new string(_termAtt.TermBuffer(), 0, _termAtt.TermLength());
        if (!string.IsNullOrEmpty(currentTerm))
        {
            _slovakStemmer.Stem(_termAtt.TermBuffer(), _termAtt.TermLength(), out char[] newTerm, out var newLength);
            _termAtt.SetTermBuffer(newTerm, 0, newLength);
        }
        return true;
    }

    var stem = _buffer.Dequeue();
    _termAtt.SetTermBuffer(stem.Stem, 0, stem.StemLength);

    if (_buffer.Count > 0)
    {
        _savedState = CaptureState();
    }
    return true;
}
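// Usage sketch (assumed wiring; DictionaryStemFilter and its constructor are stand-ins
// for whatever class hosts the IncrementToken() above): when the dictionary returns
// several stems, the first is emitted in place and the rest are queued; CaptureState()
// snapshots the attributes so each queued stem can be replayed via RestoreState() with
// PositionIncrement = 0, stacking all stems at the original token's position.
TokenStream stream = new DictionaryStemFilter(
    new WhitespaceTokenizer(new StringReader("slovami")), dictionary, dedup: true);
ITermAttribute termAtt = stream.GetAttribute<ITermAttribute>();
IPositionIncrementAttribute posIncAtt = stream.GetAttribute<IPositionIncrementAttribute>();
while (stream.IncrementToken())
{
    Console.WriteLine($"{termAtt.Term} (posInc={posIncAtt.PositionIncrement})");
}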
private void SetTermText(string token)
{
    // Record the term string
    if (termAtt.TermLength() < token.Length)
    {
        termAtt.SetTermBuffer(token);
    }
    else // Perform a copy to save on memory operations
    {
        char[] buf = termAtt.TermBuffer();
        token.CopyTo(0, buf, 0, token.Length);
    }
    termAtt.SetTermLength(token.Length);
}
/// <summary>
/// <see cref="Lucene.Net.Analysis.TokenStream.IncrementToken()" />
/// </summary>
public override bool IncrementToken()
{
    ClearAttributes();
    int posIncr = 1;

    while (true)
    {
        int tokenType = scanner.GetNextToken();

        if (tokenType == StandardTokenizerImpl.YYEOF)
        {
            return false;
        }

        if (scanner.Yylength() <= maxTokenLength)
        {
            posIncrAtt.PositionIncrement = posIncr;
            scanner.GetText(termAtt);
            int start = scanner.Yychar();
            offsetAtt.SetOffset(CorrectOffset(start), CorrectOffset(start + termAtt.TermLength()));

            // This 'if' should be removed in the next release. For now, it converts
            // invalid acronyms to HOST. When removed, only the 'else' part should remain.
            if (tokenType == StandardTokenizerImpl.ACRONYM_DEP)
            {
                if (replaceInvalidAcronym)
                {
                    typeAtt.Type = StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.HOST];
                    termAtt.SetTermLength(termAtt.TermLength() - 1); // remove extra '.'
                }
                else
                {
                    typeAtt.Type = StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.ACRONYM];
                }
            }
            else
            {
                typeAtt.Type = StandardTokenizerImpl.TOKEN_TYPES[tokenType];
            }
            return true;
        }
        else
        {
            // When we skip a too-long term, we still increment the position increment.
            posIncr++;
        }
    }
}
public override bool IncrementToken()
{
    if (input.IncrementToken())
    {
        char[] buffer = termAtt.TermBuffer();
        int bufferLength = termAtt.TermLength();

        if (bufferLength >= 2 &&
            (buffer[bufferLength - 2] == '\'' ||
             buffer[bufferLength - 2] == '\u2019' ||  // right single quotation mark
             buffer[bufferLength - 2] == '\uFF07') && // full-width apostrophe
            (buffer[bufferLength - 1] == 's' || buffer[bufferLength - 1] == 'S'))
        {
            termAtt.SetTermLength(bufferLength - 2); // Strip last 2 characters off
        }
        return true;
    }
    return false;
}
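// Example (hypothetical host class name EnglishPossessiveFilter; any tokenizer works):
// the check above trims a trailing "'s" / "'S", accepting the ASCII apostrophe as well
// as the U+2019 and U+FF07 variants, so "John's" is indexed as "John".
TokenStream stream = new EnglishPossessiveFilter(
    new WhitespaceTokenizer(new StringReader("John's book")));
ITermAttribute termAtt = stream.GetAttribute<ITermAttribute>();
while (stream.IncrementToken())
{
    Console.WriteLine(termAtt.Term); // prints "John", then "book"
}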
public override bool IncrementToken()
{
    // Check the done flag before reading, so exhausted streams aren't read again.
    if (done)
    {
        return false;
    }
    ClearAttributes();
    done = true;

    // Read once; emit a single-character token if exactly one char was read.
    int count = input.Read(buffer, 0, buffer.Length);
    if (count == 1)
    {
        termAtt.TermBuffer()[0] = buffer[0];
        termAtt.SetTermLength(1);
    }
    else
    {
        termAtt.SetTermLength(0);
    }
    return true;
}
public override bool IncrementToken()
{
    if (input.IncrementToken())
    {
        var currentTerm = new string(termAtt.TermBuffer(), 0, termAtt.TermLength());
        if (!string.IsNullOrEmpty(currentTerm))
        {
            // SetTermBuffer(char[], int, int) also updates the term length.
            stemmer.Stem(termAtt.TermBuffer(), termAtt.TermLength(), out char[] newTerm, out var newLength);
            termAtt.SetTermBuffer(newTerm, 0, newLength);
        }
        return true;
    }
    else
    {
        return false;
    }
}
public override bool IncrementToken()
{
    if (input.IncrementToken())
    {
        int len = termAtt.TermLength();
        if (marker != NOMARKER)
        {
            len++;
            termAtt.ResizeTermBuffer(len);
            termAtt.TermBuffer()[len - 1] = marker;
        }
        Reverse(termAtt.TermBuffer(), len);
        termAtt.SetTermLength(len);
        return true;
    }
    else
    {
        return false;
    }
}
public override bool IncrementToken()
{
    if (!input.IncrementToken())
    {
        // reached EOS -- return false
        return false;
    }

    // TODO: Limit this check to Hebrew Tokens only
    char[] buffer = termAtt.TermBuffer();
    int length = termAtt.TermLength(), j = 0;
    for (int i = 0; i < length; i++)
    {
        if (buffer[i] < 1455 || buffer[i] > 1476) // current position is not a Niqqud character (U+05AF..U+05C4)
        {
            buffer[j++] = buffer[i];
        }
    }
    termAtt.SetTermLength(j);
    return true;
}
public override bool IncrementToken()
{
    ClearAttributes();
    int length = 0;
    int start = bufferIndex;
    char[] ioBuffer = bufferPool.Allocate();
    try
    {
        char[] buffer = termAtt.TermBuffer();
        while (true)
        {
            if (bufferIndex >= dataLen)
            {
                offset += dataLen;
                dataLen = input.Read(ioBuffer, 0, ioBuffer.Length);
                if (dataLen <= 0)
                {
                    dataLen = 0; // so next offset += dataLen won't decrement offset
                    if (length > 0)
                    {
                        break;
                    }
                    return false;
                }
                bufferIndex = 0;
            }

            char c = ioBuffer[bufferIndex++];

            if (IsTokenChar(c))
            {
                // if it's a token char
                if (length == 0)
                {
                    // start of token
                    start = offset + bufferIndex - 1;
                }
                else if (length == buffer.Length)
                {
                    buffer = termAtt.ResizeTermBuffer(1 + length);
                }
                buffer[length++] = Normalize(c); // buffer it, normalized
            }
            else if (length > 0)
            {
                // at non-Letter w/ chars
                break; // return 'em
            }
        }
        termAtt.SetTermLength(length);
        offsetAtt.SetOffset(CorrectOffset(start), CorrectOffset(start + length));
    }
    finally
    {
        if (ioBuffer != null)
        {
            bufferPool.Free(ioBuffer);
        }
    }
    return true;
}
public override bool IncrementToken()
{
    ClearAttributes();

    string nextToken;
    HebMorph.Tokenizer.TokenType tokenType;

    // Used to loop over certain noise cases
    while (true)
    {
        tokenType = hebMorphTokenizer.NextToken(out nextToken);
        if (tokenType == 0)
        {
            return false; // EOS
        }

        // Ignore "words" which are actually only prefixes in a single word.
        // This first case is easy to spot, since the prefix and the following word will be
        // separated by a dash marked as a construct (סמיכות) by the Tokenizer
        if ((tokenType & HebMorph.Tokenizer.TokenType.Construct) > 0)
        {
            if (IsLegalPrefix(nextToken))
            {
                continue;
            }
        }

        // This second case is a bit more complex. We take a risk of splitting a valid acronym or
        // abbreviated word into two, so we send it to an external function to analyze the word, and
        // get a possibly corrected word. Examples for words we expect to simplify by this operation
        // are ה"שטיח", ש"המידע.
        if ((tokenType & HebMorph.Tokenizer.TokenType.Acronym) > 0)
        {
            nextToken = TryStrippingPrefix(nextToken);

            // Re-detect acronym, in case it was a false positive: clear the flag
            // if no quote mark remains in the token.
            if (nextToken.IndexOf('"') == -1)
            {
                tokenType &= ~HebMorph.Tokenizer.TokenType.Acronym;
            }
        }

        break;
    }

    // Record the term string
    if (termAtt.TermLength() < nextToken.Length)
    {
        termAtt.SetTermBuffer(nextToken);
    }
    else // Perform a copy to save on memory operations
    {
        char[] buf = termAtt.TermBuffer();
        nextToken.CopyTo(0, buf, 0, nextToken.Length);
    }
    termAtt.SetTermLength(nextToken.Length);

    offsetAtt.SetOffset(CorrectOffset(hebMorphTokenizer.Offset),
        CorrectOffset(hebMorphTokenizer.Offset + hebMorphTokenizer.LengthInSource));

    if ((tokenType & HebMorph.Tokenizer.TokenType.Hebrew) > 0)
    {
        if ((tokenType & HebMorph.Tokenizer.TokenType.Acronym) > 0)
        {
            typeAtt.Type = TokenTypeSignature(TOKEN_TYPES.Acronym);
        }
        else if ((tokenType & HebMorph.Tokenizer.TokenType.Construct) > 0)
        {
            typeAtt.Type = TokenTypeSignature(TOKEN_TYPES.Construct);
        }
        else
        {
            typeAtt.Type = TokenTypeSignature(TOKEN_TYPES.Hebrew);
        }
    }
    else if ((tokenType & HebMorph.Tokenizer.TokenType.Numeric) > 0)
    {
        typeAtt.Type = TokenTypeSignature(TOKEN_TYPES.Numeric);
    }
    else
    {
        typeAtt.Type = TokenTypeSignature(TOKEN_TYPES.NonHebrew);
    }

    return true;
}
public override bool IncrementToken()
{
    ClearAttributes();
    int length = 0;
    int start = _bufferIndex;
    char[] buffer = _termAtt.TermBuffer();
    while (true)
    {
        if (_bufferIndex >= _dataLen)
        {
            _offset += _dataLen;
            _dataLen = input.Read(_ioBuffer, 0, _ioBuffer.Length);
            if (_dataLen <= 0)
            {
                _dataLen = 0; // so next offset += dataLen won't decrement offset
                if (length > 0)
                {
                    break;
                }
                else
                {
                    return false;
                }
            }
            _bufferIndex = 0;
        }

        char c = _ioBuffer[_bufferIndex++];

        if (IsTokenChar(c))
        {
            // if it's a token char
            if (length == 0)
            {
                // start of token
                start = _offset + _bufferIndex - 1;
            }
            else if (length == buffer.Length)
            {
                buffer = _termAtt.ResizeTermBuffer(1 + length);
            }

            buffer[length++] = Normalize(c); // buffer it, normalized

            if (length == MaxWordLen)
            {
                // buffer overflow!
                break;
            }
        }
        else if (length > 0)
        {
            // at non-Letter w/ chars
            break; // return 'em
        }
    }

    _termAtt.SetTermLength(length);
    _offsetAtt.SetOffset(CorrectOffset(start), CorrectOffset(start + length));
    return true;
}
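// Consumption sketch (not from the source; assumes the Lucene.Net 3.0.x attribute API
// used throughout these snippets, with analyzer/text as placeholders): every
// IncrementToken() implementation above is driven the same way -- fetch the shared
// attributes once, then pull tokens until the stream is exhausted.
TokenStream stream = analyzer.TokenStream("field", new StringReader(text));
ITermAttribute termAtt = stream.GetAttribute<ITermAttribute>();
IOffsetAttribute offsetAtt = stream.GetAttribute<IOffsetAttribute>();
while (stream.IncrementToken())
{
    Console.WriteLine($"{termAtt.Term} [{offsetAtt.StartOffset}-{offsetAtt.EndOffset}]");
}
stream.End(); // finalizes offset state after the last token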