/// <summary>
/// Combined standard/lowercase/stop filtering in one pass: apostrophe-typed
/// tokens lose a trailing 's, acronym-typed tokens lose their dots, and all
/// other tokens are lower-cased and checked against the stop-word set,
/// consuming further input tokens until a non-stop word is found.
/// NOTE(review): the apostrophe and acronym branches are never stop-word
/// checked — confirm that is intended.
/// </summary>
/// <returns>true if a token was produced; false at end of stream.</returns>
public override bool IncrementToken()
{
    if (!input.IncrementToken())
    {
        return(false);
    }
    // Tracks whether buffer/bufferLength still describe the current token;
    // only the stop-word loop below can invalidate them by consuming input.
    bool bufferUpdated = true;
    char[] buffer = termAtt.TermBuffer();
    int bufferLength = termAtt.TermLength();
    String type = typeAtt.Type;
    if (type == APOSTROPHE_TYPE && bufferLength >= 2 && buffer[bufferLength - 2] == '\'' && (buffer[bufferLength - 1] == 's' || buffer[bufferLength - 1] == 'S'))
    {
        // Lower-case everything that precedes the possessive suffix.
        for (int i = 0; i < bufferLength - 2; i++)
        {
            buffer[i] = ToLower(buffer[i]);
        }
        // Strip last 2 characters off
        termAtt.SetTermLength(bufferLength - 2);
    }
    else if (type == ACRONYM_TYPE)
    {
        // remove dots, compacting and lower-casing the buffer in place
        int upto = 0;
        for (int i = 0; i < bufferLength; i++)
        {
            char c = buffer[i];
            if (c != '.')
            {
                buffer[upto++] = ToLower(c);
            }
        }
        termAtt.SetTermLength(upto);
    }
    else
    {
        // Lower-case the token and skip it if it is a stop word, pulling
        // more tokens from the input until a keeper or end of stream.
        do
        {
            //If we consumed a stop word we need to update the buffer and its length.
            if (!bufferUpdated)
            {
                bufferLength = termAtt.TermLength();
                buffer = termAtt.TermBuffer();
            }
            for (int i = 0; i < bufferLength; i++)
            {
                buffer[i] = ToLower(buffer[i]);
            }
            if (!stopWords.Contains(buffer, 0, bufferLength))
            {
                return(true);
            }
            bufferUpdated = false;
        }
        while (input.IncrementToken());
        return(false);
    }
    return(true);
}
/// <summary>
/// Replaces the current term with its configured "best bets" word form,
/// when one exists in the <c>BestBetsWordForms</c> resource table;
/// otherwise the token passes through unchanged.
/// Called once per token of the wrapped stream.
/// </summary>
/// <returns>true while the wrapped stream produces tokens; false at EOS.</returns>
public override bool IncrementToken()
{
    if (!input.IncrementToken())
    {
        return false;
    }

    // Look up the current term in the resource table of replacement forms.
    string term = new String(termAtt.TermBuffer(), 0, termAtt.TermLength());
    string replaceterm = BestBetsWordForms.ResourceManager.GetString(term);
    if (!String.IsNullOrEmpty(replaceterm))
    {
        //Replace Term was a match, so we need to replace this item.
        termAtt.SetTermBuffer(replaceterm);
    }
    // Removed: an unused local copy of the term buffer and a commented-out
    // lower-casing loop that were dead code.
    return true;
}
/// <summary>
/// Test filter: after every real token it emits a buffered "synonym" token
/// ("b") at the same position; a leading digit on a token encodes that
/// token's position increment, and only the very first position carries a
/// payload.
/// </summary>
public override bool IncrementToken()
{
    if (state != null)
    {
        // Emit the synonym for the previously returned token: restore its
        // attributes, clear the payload, keep the same position (increment
        // 0), and overwrite the term with "b".
        RestoreState(state);
        payloadAtt.Payload = null;
        posIncrAtt.PositionIncrement = 0;
        termAtt.SetTermBuffer(new char[]{'b'}, 0, 1);
        state = null;
        return true;
    }
    bool hasNext = input.IncrementToken();
    if (!hasNext)
    {
        return false;
    }
    // A leading digit encodes the token's position increment.
    if (System.Char.IsDigit(termAtt.TermBuffer()[0]))
    {
        posIncrAtt.PositionIncrement = termAtt.TermBuffer()[0] - '0';
    }
    if (first)
    {
        // set payload on first position only
        payloadAtt.Payload = new Payload(new byte[]{100});
        first = false;
    }
    // index a "synonym" for every token
    state = CaptureState();
    return true;
}
/// <summary>
/// Dictionary stem filter with a Slovak fallback: stems queued from the
/// previous token are emitted first, one per call, at the same position
/// (increment 0). When the dictionary stemmer yields nothing for a new
/// token, the SlovakStemmer is applied to the raw term instead.
/// </summary>
public override Boolean IncrementToken()
{
    // Emit one queued stem per call, restoring the originating token's
    // captured attribute state so offsets/type match.
    if (_buffer.Any())
    {
        var nextStem = _buffer.Dequeue();
        RestoreState(_savedState);
        _posIncAtt.PositionIncrement = 0;
        _termAtt.SetTermBuffer(nextStem.Stem, 0, nextStem.StemLength);
        return(true);
    }
    if (!input.IncrementToken())
    {
        return(false);
    }
    // _dedup chooses between unique stems and all stems for the term.
    var newTerms = _dedup ? _stemmer.UniqueStems(_termAtt.Term) : _stemmer.Stem(_termAtt.Term);
    foreach (var newTerm in newTerms)
    {
        _buffer.Enqueue(newTerm);
    }
    if (_buffer.Count == 0)
    {
        // Originally: we do not know this word, return it unchanged.
        // Changed: apply SlovakStemmer on words not found in the dictionary
        // (possible named entities).
        var currentTerm = new string(_termAtt.TermBuffer(), 0, _termAtt.TermLength());
        if (!string.IsNullOrEmpty(currentTerm))
        {
            _slovakStemmer.Stem(_termAtt.TermBuffer(), _termAtt.TermLength(), out char[] newTerm, out var newLength);
            _termAtt.SetTermBuffer(newTerm, 0, newLength);
            _termAtt.SetTermLength(newLength);
        }
        return(true);
    }
    // Emit the first stem now; capture the token state only when more
    // stems remain to be emitted on later calls.
    var stem = _buffer.Dequeue();
    _termAtt.SetTermBuffer(stem.Stem, 0, stem.StemLength);
    if (_buffer.Count > 0)
    {
        _savedState = CaptureState();
    }
    return(true);
}
/// <summary>
/// Splits each input token into single-character tokens, advancing the
/// stream offset by one position per emitted character.
/// </summary>
public override bool IncrementToken()
{
    int positionIncrement = 0;
    // Refill from the wrapped stream once the current term is exhausted.
    if (_buffer == null || _offset >= _length)
    {
        if (!_input.IncrementToken())
        {
            return(false);
        }
        _offset = 0;
        _buffer = _termAttribute.TermBuffer();
        _length = _termAttribute.TermLength();
        positionIncrement++;
        _offsetInStream++;
    }
    _offsetAttribute.SetOffset(_offsetInStream, _offsetInStream + 1);
    _offsetInStream++;
    // NOTE(review): positionIncrement is bumped here AND in the refill
    // branch above, so the first character of a fresh token gets increment
    // 2 while subsequent characters get 1 — confirm this is intended.
    positionIncrement++;
    _positionIncrementAttribute.PositionIncrement = positionIncrement;
    _termAttribute.SetTermLength(1);
    _termAttribute.SetTermBuffer(_buffer[_offset++].ToString());
    return(true);
}
/* Returns the next token in the stream, or null at EOS. */
/// <summary>
/// Emits every n-gram (sizes minGram..maxGram) of each input token:
/// smallest gram size first, sliding left-to-right within each size.
/// State (curTermBuffer/curGramSize/curPos) persists across calls so the
/// enumeration resumes where the previous call returned.
/// </summary>
public override bool IncrementToken()
{
    while (true)
    {
        // No token in progress: pull the next one from the input and make
        // a private copy of its term buffer (the attribute gets reused).
        if (curTermBuffer == null)
        {
            if (!input.IncrementToken())
            {
                return(false);
            }
            else
            {
                curTermBuffer = (char[])termAtt.TermBuffer().Clone();
                curTermLength = termAtt.TermLength();
                curGramSize = minGram;
                curPos = 0;
                tokStart = offsetAtt.StartOffset;
            }
        }
        while (curGramSize <= maxGram)
        {
            while (curPos + curGramSize <= curTermLength)
            {
                // while there is input
                ClearAttributes();
                termAtt.SetTermBuffer(curTermBuffer, curPos, curGramSize);
                offsetAtt.SetOffset(tokStart + curPos, tokStart + curPos + curGramSize);
                curPos++;
                return(true);
            }
            curGramSize++; // increase n-gram size
            curPos = 0;
        }
        // All gram sizes emitted for this token; fetch the next one.
        curTermBuffer = null;
    }
}
/// <summary>
/// Keeps single-character CJK tokens and Latin-letter tokens longer than
/// one character; discards stop words and everything else.
/// </summary>
/// <returns>true when a kept token is available; false at end of stream.</returns>
public override bool IncrementToken()
{
    for (;;)
    {
        if (!input.IncrementToken())
        {
            return(false);
        }

        char[] chars = termAtt.TermBuffer();
        int len = termAtt.TermLength();

        // why not key off token type here assuming ChineseTokenizer comes first?
        if (stopTable.Contains(chars, 0, len))
        {
            continue;
        }

        UnicodeCategory category = char.GetUnicodeCategory(chars[0]);
        if (category == UnicodeCategory.LowercaseLetter || category == UnicodeCategory.UppercaseLetter)
        {
            // English word/token should larger than 1 char.
            if (len > 1)
            {
                return(true);
            }
        }
        else if (category == UnicodeCategory.OtherLetter)
        {
            // One Chinese char as one Chinese word.
            // Chinese word extraction to be added later here.
            return(true);
        }
    }
}
/// <summary>Returns the next token in the stream, or null at EOS.
/// <p/>Removes <tt>'s</tt> from the end of words.
/// <p/>Removes dots from acronyms.
/// </summary>
public override bool IncrementToken()
{
    if (!input.IncrementToken())
    {
        return(false);
    }

    char[] term = termAtt.TermBuffer();
    int len = termAtt.TermLength();
    System.String tokenType = typeAtt.Type;

    // Token types are interned constants, so reference equality suffices.
    if (ReferenceEquals(tokenType, APOSTROPHE_TYPE)
        && len >= 2
        && term[len - 2] == '\''
        && (term[len - 1] == 's' || term[len - 1] == 'S'))
    {
        // Strip last 2 characters off
        termAtt.SetTermLength(len - 2);
    }
    else if (ReferenceEquals(tokenType, ACRONYM_TYPE))
    {
        // Compact the buffer in place, skipping every '.'.
        int dst = 0;
        for (int src = 0; src < len; src++)
        {
            char ch = term[src];
            if (ch != '.')
            {
                term[dst++] = ch;
            }
        }
        termAtt.SetTermLength(dst);
    }
    return(true);
}
/// <summary>
/// Splits each token at the first occurrence of the delimiter: the text
/// before it becomes the term, the text after it is encoded as the token's
/// payload. Tokens with no delimiter pass through with a null payload.
/// </summary>
public override bool IncrementToken()
{
    if (!input.IncrementToken())
    {
        return(false);
    }

    char[] chars = termAtt.TermBuffer();
    int len = termAtt.TermLength();

    // Locate the first delimiter, if any.
    int delimiterPos = -1;
    for (int i = 0; i < len; i++)
    {
        if (chars[i] == delimiter)
        {
            delimiterPos = i;
            break;
        }
    }

    if (delimiterPos >= 0)
    {
        // Term = prefix before the delimiter; payload = encoded suffix.
        termAtt.SetTermBuffer(chars, 0, delimiterPos);
        payAtt.Payload = encoder.Encode(chars, delimiterPos + 1, len - (delimiterPos + 1));
    }
    else
    {
        // No delimiter: keep the term as-is and clear any stale payload.
        payAtt.Payload = null;
    }
    return(true);
}
/// <summary>
/// Character-class tokenizer: accumulates consecutive token characters
/// (per IsTokenChar) from the reader, normalizing each via Normalize, and
/// emits the run as one token with corrected offsets. Tokens are capped at
/// MAX_WORD_LEN characters.
/// </summary>
public override bool IncrementToken()
{
    ClearAttributes();
    int length = 0;
    int start = bufferIndex;
    char[] buffer = termAtt.TermBuffer();
    while (true)
    {
        // Refill the IO buffer when it is exhausted.
        if (bufferIndex >= dataLen)
        {
            offset += dataLen;
            dataLen = input.Read(ioBuffer, 0, ioBuffer.Length);
            if (dataLen <= 0)
            {
                dataLen = 0; // so next offset += dataLen won't decrement offset
                if (length > 0)
                {
                    break; // flush the token accumulated so far
                }
                return(false); // true end of stream
            }
            bufferIndex = 0;
        }
        char c = ioBuffer[bufferIndex++];
        if (IsTokenChar(c))
        {
            // if it's a token char
            if (length == 0)
            {
                // start of token
                start = offset + bufferIndex - 1;
            }
            else if (length == buffer.Length)
            {
                buffer = termAtt.ResizeTermBuffer(1 + length);
            }
            buffer[length++] = Normalize(c); // buffer it, normalized
            if (length == MAX_WORD_LEN)
            {
                // buffer overflow!
                break;
            }
        }
        else if (length > 0)
        {
            // at non-Letter w/ chars
            break; // return 'em
        }
    }
    termAtt.SetTermLength(length);
    offsetAtt.SetOffset(CorrectOffset(start), CorrectOffset(start + length));
    return(true);
}
/// <summary>
/// Decomposing filter: sub-tokens produced by Decompose for an earlier
/// token are emitted first, one per call; then the next input token is
/// snapshotted into the reusable wrapper and decomposed.
/// </summary>
public sealed override bool IncrementToken()
{
    // Drain queued sub-tokens from a previous decomposition first.
    if (tokens.Count > 0)
    {
        Token queued = tokens.First.Value;
        tokens.RemoveFirst();
        setToken(queued);
        return(true);
    }

    if (!input.IncrementToken())
    {
        return(false);
    }

    // Snapshot the current token's attributes into the wrapper.
    wrapper.SetTermBuffer(termAtt.TermBuffer(), 0, termAtt.TermLength());
    wrapper.StartOffset = offsetAtt.StartOffset;
    wrapper.EndOffset = offsetAtt.EndOffset;
    wrapper.Flags = flagsAtt.Flags;
    wrapper.Type = typeAtt.Type;
    wrapper.PositionIncrement = posIncAtt.PositionIncrement;
    wrapper.Payload = payloadAtt.Payload;

    Decompose(wrapper);

    // Emit the first produced sub-token, or end if nothing was produced.
    if (tokens.Count == 0)
    {
        return(false);
    }
    Token head = tokens.First.Value;
    tokens.RemoveFirst();
    setToken(head);
    return(true);
}
/// <summary>
/// Synonym filter: synonyms of the previous token are emitted one per
/// call at the same position (increment 0); then the next input token is
/// read and its synonyms queued.
/// </summary>
/// <returns>true while a token (original or synonym) is available.</returns>
public override bool IncrementToken()
{
    // Emit pending synonyms first, restoring the originating token's
    // captured state so offsets/type match.
    if (splittedQueue.Count > 0)
    {
        string splitted = splittedQueue.Dequeue();
        RestoreState(currentState);
        _termAttr.SetTermBuffer(splitted);
        _posAttr.PositionIncrement = 0;
        return(true);
    }

    if (!input.IncrementToken())
    {
        return(false);
    }

    var currentTerm = new string(_termAttr.TermBuffer(), 0, _termAttr.TermLength());
    IEnumerable<string> synonyms = SynonymEngine.GetSynonyms(currentTerm);
    if (synonyms == null)
    {
        // BUGFIX: a term with no synonyms must not end the whole stream
        // (the old code returned false here, dropping this token and all
        // that follow); pass the original token through unchanged instead.
        return(true);
    }

    foreach (string syn in synonyms)
    {
        if (!currentTerm.Equals(syn))
        {
            splittedQueue.Enqueue(syn);
        }
    }

    if (splittedQueue.Count > 0)
    {
        // BUGFIX: capture the current token's state so the queued synonyms
        // restore the right attributes on later calls (the old code called
        // RestoreState without ever capturing).
        currentState = CaptureState();
    }
    return(true);
}
/// <summary>
/// Keyword-style tokenizer: reads the entire reader contents into a single
/// token, growing the term buffer as needed. Emits exactly one token; all
/// later calls return false until reset.
/// </summary>
public override bool IncrementToken()
{
    if (!done)
    {
        ClearAttributes();
        done = true;
        int upto = 0;
        char[] buffer = termAtt.TermBuffer();
        while (true)
        {
            int length = input.Read(buffer, upto, buffer.Length - upto);
            if (length == 0)
            {
                break; // reader exhausted
            }
            upto += length;
            // Buffer full: grow it before the next read.
            if (upto == buffer.Length)
            {
                buffer = termAtt.ResizeTermBuffer(1 + buffer.Length);
            }
        }
        termAtt.SetTermLength(upto);
        finalOffset = CorrectOffset(upto);
        offsetAtt.SetOffset(CorrectOffset(0), finalOffset);
        return(true);
    }
    return(false);
}
/// <summary>
/// Appends a per-token-type suffix (from suffixByTokenType) to each term;
/// tokens whose type has no configured suffix pass through unchanged.
/// </summary>
/// <returns>true while the wrapped stream produces tokens; false at EOS.</returns>
public override bool IncrementToken()
{
    if (!input.IncrementToken())
    {
        // reached EOS -- return null
        return(false);
    }
    if (suffixByTokenType == null)
    {
        return(true);
    }
    char[] suffix;
    if (!suffixByTokenType.TryGetValue(typeAtt.Type, out suffix))
    {
        return(true);
    }
    char[] buffer = termAtt.TermBuffer();
    int length = termAtt.TermLength();
    // BUGFIX: the buffer must hold length + suffix.Length characters. The
    // old guard (buffer.Length <= length) skipped the resize whenever the
    // buffer was merely larger than the term, so Array.Copy could overflow
    // when length < buffer.Length < length + suffix.Length.
    if (buffer.Length < length + suffix.Length)
    {
        buffer = termAtt.ResizeTermBuffer(length + suffix.Length);
    }
    Array.Copy(suffix, 0, buffer, length, suffix.Length);
    termAtt.SetTermLength(length + suffix.Length);
    return(true);
}
/// <summary>
/// Word-splitting filter: fragments queued from a previous token are
/// emitted first, one per call, at the same position (increment 0); then
/// the next input token is read and split via GetSplittedWord.
/// </summary>
public override bool IncrementToken()
{
    // While fragments remain queued, emit one per call.
    if (splittedQueue.Count > 0)
    {
        var fragment = splittedQueue.Dequeue();
        RestoreState(currentState);
        termAtt.SetTermBuffer(fragment);
        posAtt.PositionIncrement = 0;
        return(true);
    }

    // End of the underlying stream: stop immediately.
    if (!input.IncrementToken())
    {
        return(false);
    }

    var term = new string(termAtt.TermBuffer(), 0, termAtt.TermLength());
    if (!string.IsNullOrEmpty(term))
    {
        var parts = GetSplittedWord(term);
        // Nothing to split: pass the token through untouched.
        if (parts == null || parts.Length == 0)
        {
            return(true);
        }
        foreach (var part in parts)
        {
            splittedQueue.Enqueue(part);
        }
    }
    // Remember this token's state so the queued fragments restore it.
    currentState = CaptureState();
    return(true);
}
/// <summary>
/// Character-class tokenizer variant: accumulates consecutive token
/// characters (per Helper.IsTokenChar) from the reader, normalizing each
/// via Helper.Normalize, and emits the run as one token with corrected
/// offsets.
/// </summary>
public override bool IncrementToken()
{
    ClearAttributes();
    int length = 0;
    int start = _bufferIndex;
    char[] buffer = _termAtt.TermBuffer();
    while (true)
    {
        // Refill the IO buffer when it is exhausted.
        if (_bufferIndex >= _dataLen)
        {
            _offset += _dataLen;
            _dataLen = input.Read(_ioBuffer, 0, _ioBuffer.Length);
            if (_dataLen <= 0)
            {
                _dataLen = 0; // so next offset += dataLen won't decrement offset
                if (length > 0)
                {
                    break; // flush the token accumulated so far
                }
                return(false);
            }
            _bufferIndex = 0;
        }
        char c = _ioBuffer[_bufferIndex++];
        if (Helper.IsTokenChar(c))
        {
            // if it's a token char
            if (length == 0)
            {
                // start of token
                start = _offset + _bufferIndex - 1;
            }
            else if (length == buffer.Length)
            {
                buffer = _termAtt.ResizeTermBuffer(1 + length);
            }
            // NOTE(review): unlike the classic CharTokenizer there is no
            // MAX_WORD_LEN cap here, so very long runs grow the buffer
            // without bound — confirm intended.
            buffer[length++] = Helper.Normalize(c); // buffer it, normalized
        }
        else if (length > 0)
        {
            // at non-Letter w/ chars
            break; // return 'em
        }
    }
    _termAtt.SetTermLength(length);
    _offsetAtt.SetOffset(CorrectOffset(start), CorrectOffset(start + length));
    return(true);
}
/// <summary>
/// Stems each non-empty term via the stemmer, replacing the term buffer
/// and length with the stemmed form.
/// </summary>
public override bool IncrementToken()
{
    if (!input.IncrementToken())
    {
        return(false);
    }

    var term = new string(termAtt.TermBuffer(), 0, termAtt.TermLength());
    if (string.IsNullOrEmpty(term))
    {
        // Empty term: nothing to stem, pass it through.
        return(true);
    }

    stemmer.Stem(termAtt.TermBuffer(), termAtt.TermLength(), out char[] stemmed, out var stemmedLength);
    termAtt.SetTermBuffer(stemmed, 0, stemmedLength);
    termAtt.SetTermLength(stemmedLength);
    return(true);
}
/// <summary>
/// Normalizes each term in place and updates the term length to the value
/// the normalizer reports.
/// </summary>
public override bool IncrementToken()
{
    if (!input.IncrementToken())
    {
        return(false);
    }
    int normalizedLength = normalizer.Normalize(termAtt.TermBuffer(), termAtt.TermLength());
    termAtt.SetTermLength(normalizedLength);
    return(true);
}
/// <summary>
/// Stems each term in place and updates the term length to the value the
/// stemmer reports.
/// </summary>
public override bool IncrementToken()
{
    if (!input.IncrementToken())
    {
        return(false);
    }
    var stemmedLength = _stemmer.Stem(_termAttr.TermBuffer(), _termAttr.TermLength());
    _termAttr.SetTermLength(stemmedLength);
    return(true);
}
/// <summary>
/// Reverses each term in place; when a marker is configured it is appended
/// first, so it ends up at the front of the reversed term.
/// </summary>
public override bool IncrementToken()
{
    if (!input.IncrementToken())
    {
        return(false);
    }

    int length = termAtt.TermLength();
    if (marker != NOMARKER)
    {
        // Grow by one and append the marker before reversing.
        length++;
        termAtt.ResizeTermBuffer(length);
        termAtt.TermBuffer()[length - 1] = marker;
    }
    Reverse(termAtt.TermBuffer(), length);
    termAtt.SetTermLength(length);
    return(true);
}
/// <summary>
/// Runs the stemmer on each term; the term is replaced only when Stem()
/// reports success, otherwise it passes through unchanged.
/// </summary>
public override bool IncrementToken()
{
    if (!input.IncrementToken())
    {
        return(false);
    }
    bool stemmed = stemmer.Stem(termAtt.TermBuffer(), termAtt.TermLength());
    if (stemmed)
    {
        termAtt.SetTermBuffer(stemmer.ToString());
    }
    return(true);
}
/// <summary>
/// Sets each token's type to the upper-cased first character of its term.
/// </summary>
public override bool IncrementToken()
{
    if (!input.IncrementToken())
    {
        return(false);
    }
    char firstChar = termAtt.TermBuffer()[0];
    typeAtt.Type = char.ToUpper(firstChar).ToString();
    return(true);
}
/// <summary>
/// Stems each term in place and updates the term length to the value the
/// stemmer reports.
/// </summary>
public override bool IncrementToken()
{
    if (!input.IncrementToken())
    {
        return(false);
    }
    int stemmedLength = stemmer.Stem(termAtt.TermBuffer(), termAtt.TermLength());
    termAtt.SetTermLength(stemmedLength);
    return(true);
}
/// <summary>
/// Runs the stemmer on each term; when Stem() reports success the term is
/// replaced by the stemmer's result buffer, otherwise it passes through.
/// </summary>
public override bool IncrementToken()
{
    if (!input.IncrementToken())
    {
        return(false);
    }
    bool stemmed = stemmer.Stem(termAtt.TermBuffer(), 0, termAtt.TermLength());
    if (stemmed)
    {
        termAtt.SetTermBuffer(stemmer.ResultBuffer, 0, stemmer.ResultLength);
    }
    return(true);
}
/// <summary>
/// Advances the prefix stream and copies its current attribute values into
/// <paramref name="token"/>.
/// </summary>
/// <returns>The populated token, or null when the prefix stream is exhausted.</returns>
private Token GetNextPrefixInputToken(Token token)
{
    if (!Prefix.IncrementToken())
    {
        return(null);
    }
    token.SetTermBuffer(_pTermAtt.TermBuffer(), 0, _pTermAtt.TermLength());
    token.SetOffset(_pOffsetAtt.StartOffset, _pOffsetAtt.EndOffset);
    token.PositionIncrement = _pPosIncrAtt.PositionIncrement;
    token.Flags = _pFlagsAtt.Flags;
    token.Type = _pTypeAtt.Type;
    token.Payload = _pPayloadAtt.Payload;
    return(token);
}
/// <summary>
/// Advances this stream and copies the current attribute values into
/// <paramref name="token"/>.
/// </summary>
/// <returns>The populated token, or null at end of stream.</returns>
private Token GetNextToken(Token token)
{
    if (!this.IncrementToken())
    {
        return(null);
    }
    token.SetTermBuffer(_termAtt.TermBuffer(), 0, _termAtt.TermLength());
    token.SetOffset(_offsetAtt.StartOffset, _offsetAtt.EndOffset);
    token.PositionIncrement = _posIncrAtt.PositionIncrement;
    token.Flags = _flagsAtt.Flags;
    token.Type = _typeAtt.Type;
    token.Payload = _payloadAtt.Payload;
    return(token);
}
/// <summary>
/// Strips a trailing possessive ("'s" / "'S", where the apostrophe may be
/// ASCII ', right single quote U+2019, or fullwidth U+FF07) from each term.
/// </summary>
public override bool IncrementToken()
{
    if (!input.IncrementToken())
    {
        return(false);
    }

    char[] term = termAtt.TermBuffer();
    int len = termAtt.TermLength();
    if (len >= 2)
    {
        char penultimate = term[len - 2];
        char last = term[len - 1];
        bool hasApostrophe = penultimate == '\'' || penultimate == '\u2019' || penultimate == '\uFF07';
        bool hasS = last == 's' || last == 'S';
        if (hasApostrophe && hasS)
        {
            // Strip last 2 characters off
            termAtt.SetTermLength(len - 2);
        }
    }
    return(true);
}
/// <summary>
/// Lower-cases every character of each term in place.
/// </summary>
public override bool IncrementToken()
{
    if (!input.IncrementToken())
    {
        return(false);
    }
    char[] term = termAtt.TermBuffer();
    for (int i = 0, n = termAtt.TermLength(); i < n; i++)
    {
        term[i] = System.Char.ToLower(term[i]);
    }
    return(true);
}
/*
 * Increments the {@link TokenStream} with a {@link TermAttribute} without elisioned start
 */
/// <summary>
/// Elision filter: finds the earliest occurrence of any configured
/// apostrophe character in the term and, when the prefix before it is a
/// known article, strips the article and the apostrophe from the term.
/// </summary>
public override sealed bool IncrementToken()
{
    if (input.IncrementToken())
    {
        char[] termBuffer = termAtt.TermBuffer();
        int termLength = termAtt.TermLength();
        // Position of the earliest apostrophe of any accepted kind;
        // int.MaxValue means none was found.
        int minPoz = int.MaxValue;
        for (int i = 0; i < apostrophes.Length; i++)
        {
            char apos = apostrophes[i];
            // The equivalent of String.indexOf(ch)
            for (int poz = 0; poz < termLength; poz++)
            {
                if (termBuffer[poz] == apos)
                {
                    minPoz = Math.Min(poz, minPoz);
                    break;
                }
            }
        }
        // An apostrophe has been found. If the prefix is an article strip it off.
        if (minPoz != int.MaxValue && articles.Contains(termAtt.TermBuffer(), 0, minPoz))
        {
            // Re-set the term from its own buffer, shifted past the
            // article and the apostrophe.
            termAtt.SetTermBuffer(termAtt.TermBuffer(), minPoz + 1, termAtt.TermLength() - (minPoz + 1));
        }
        return(true);
    }
    else
    {
        return(false);
    }
}
/// <summary>
/// Advances the wrapped input stream and copies its current attribute
/// values into <paramref name="token"/>.
/// </summary>
/// <returns>The populated token, or null when the input is exhausted.</returns>
private Token GetNextInputToken(Token token)
{
    if (!_input.IncrementToken())
    {
        return(null);
    }
    token.SetTermBuffer(_inTermAtt.TermBuffer(), 0, _inTermAtt.TermLength());
    token.SetOffset(_inOffsetAtt.StartOffset, _inOffsetAtt.EndOffset);
    token.PositionIncrement = _inPosIncrAtt.PositionIncrement;
    token.Flags = _inFlagsAtt.Flags;
    token.Type = _inTypeAtt.Type;
    token.Payload = _inPayloadAtt.Payload;
    return(token);
}