/// <summary>
/// Advances to the next token produced by the Jieba segmenter.
/// </summary>
/// <returns>true if a token was emitted; false once the token list is exhausted.</returns>
public override bool IncrementToken()
{
    ClearAttributes();
    position++;
    if (position >= tokens.Count)
    {
        End();
        return false;
    }
    var current = tokens[position];
    termAtt.SetTermBuffer(current.Word);
    offsetAtt.SetOffset(current.StartIndex, current.EndIndex);
    typeAtt.Type = "Jieba";
    return true;
}
/// <summary>
/// Returns the next token in the stream, stemmed in place.
/// The term buffer is only rewritten when stemming actually changed the text.
/// </summary>
/// <returns>true if a token is available; false at end of stream.</returns>
public sealed override bool IncrementToken()
{
    if (!input.IncrementToken())
    {
        return false;
    }
    String original = termAtt.Term;
    String stemmed = stemmer.Stem(original);
    if (stemmed != null && !stemmed.Equals(original))
    {
        termAtt.SetTermBuffer(stemmed);
    }
    return true;
}
/// <summary>
/// Copies the next token produced by <c>NextToken</c> into the attributes.
/// </summary>
/// <returns>true if a token was emitted; false at end of stream.</returns>
public override bool IncrementToken()
{
    ClearAttributes();
    Token token = NextToken(reusableToken);
    // BUG FIX: the guard previously tested `tokenQueue != null`, which does not
    // protect the dereferences of `token` below. NextToken can return null at
    // end of stream (compare the sibling MMSeg/Word implementations, which test
    // the returned object), so test the token itself.
    if (token != null)
    {
        termAtt.SetTermBuffer(token.TermBuffer(), 0, token.TermLength());
        offsetAtt.SetOffset(token.StartOffset, token.EndOffset);
        typeAtt.Type = token.Type;
        return true;
    }
    End();
    return false;
}
/// <summary>
/// Replays the prepared test tokens one at a time, copying term text,
/// offsets and position increment onto the attributes.
/// </summary>
/// <returns>true while test tokens remain; false afterwards.</returns>
public override bool IncrementToken()
{
    ClearAttributes();
    if (_index >= _testToken.Length)
    {
        return false;
    }
    Token current = _testToken[_index++];
    _termAtt.SetTermBuffer(current.TermBuffer(), 0, current.TermLength());
    _offsetAtt.SetOffset(current.StartOffset, current.EndOffset);
    _posIncrAtt.PositionIncrement = current.PositionIncrement;
    // Every replayed token gets the default type, regardless of the source token.
    _typeAtt.Type = TypeAttribute.DEFAULT_TYPE;
    return true;
}
/// <summary>
/// Stems the current term via the stemmer's buffer-based API and replaces
/// the term buffer with the stemmer's output. Empty terms pass through.
/// </summary>
/// <returns>true if a token is available; false at end of stream.</returns>
public override bool IncrementToken()
{
    if (!input.IncrementToken())
    {
        return false;
    }
    var term = new string(termAtt.TermBuffer(), 0, termAtt.TermLength());
    if (term.Length > 0)
    {
        char[] stemmed;
        int stemmedLength;
        stemmer.Stem(termAtt.TermBuffer(), termAtt.TermLength(), out stemmed, out stemmedLength);
        termAtt.SetTermBuffer(stemmed, 0, stemmedLength);
        termAtt.SetTermLength(stemmedLength);
    }
    return true;
}
/// <summary>
/// Pulls the next word from the MMSeg segmenter and exposes it as a token.
/// </summary>
/// <returns>true if a word was produced; false once the segmenter is exhausted.</returns>
public override bool IncrementToken()
{
    ClearAttributes();
    Word next = mmSeg.Next();
    if (next == null)
    {
        End();
        return false;
    }
    termAtt.SetTermBuffer(next.Sen, next.WordOffset, next.Length);
    offsetAtt.SetOffset(next.StartOffset, next.EndOffset);
    typeAtt.Type = next.Type;
    return true;
}
/*
 * Get the next token from the input stream and push it on the token buffer.
 * If we encounter a token with position increment > 1, we put filler tokens
 * on the token buffer.
 * <p/>
 * Returns null when the end of the input stream is reached.
 * @return the next token, or null if at end of input stream
 * @throws IOException if the input stream has a problem
 */
private bool GetNextToken()
{
    // State machine with three phases per real input token:
    //   1. drain any pending filler tokens (position-increment gaps),
    //   2. emit the buffered "current" token,
    //   3. pull the next token from input and note how many fillers it needs.
    while (true)
    {
        if (numFillerTokensToInsert > 0)
        {
            // First filler: snapshot the attribute state so it can be replayed
            // for each subsequent filler. Later fillers restore that snapshot.
            if (currentToken == null)
            {
                currentToken = CaptureState();
            }
            else
            {
                RestoreState(currentToken);
            }
            numFillerTokensToInsert--;
            // A filler token occupies no space
            offsetAtt.SetOffset(offsetAtt.StartOffset, offsetAtt.StartOffset);
            termAtt.SetTermBuffer(FILLER_TOKEN, 0, FILLER_TOKEN.Length);
            return true;
        }
        if (hasCurrentToken)
        {
            // Emit the real token whose fillers (if any) were just drained.
            if (currentToken != null)
            {
                RestoreState(currentToken);
                currentToken = null;
            }
            hasCurrentToken = false;
            return true;
        }
        if (!input.IncrementToken())
        {
            return false;
        }
        hasCurrentToken = true;
        // A position increment of N > 1 means N-1 "holes" before this token;
        // schedule that many filler tokens to be emitted first.
        if (posIncrAtt.PositionIncrement > 1)
        {
            numFillerTokensToInsert = posIncrAtt.PositionIncrement - 1;
        }
    }
}
/// <summary>Returns the next input Token, after being stemmed.</summary>
/// <returns>true if a token is available; false at end of stream.</returns>
public sealed override bool IncrementToken()
{
    if (!input.IncrementToken())
    {
        return false;
    }
    String before = termAtt.Term;
    stemmer.SetCurrent(before);
    stemmer.Stem();
    String after = stemmer.GetCurrent();
    // Don't bother updating, if it is unchanged.
    if (!before.Equals(after))
    {
        termAtt.SetTermBuffer(after);
    }
    return true;
}
/// <summary>
/// Rewrites each term as a normalized NuGet version string when it parses as
/// a version; terms that do not parse are passed through unchanged.
/// </summary>
/// <returns>true if a token is available; false at end of stream.</returns>
public override bool IncrementToken()
{
    if (!input.IncrementToken())
    {
        return false;
    }
    string term = _termAttribute.Term;
    NuGetVersion parsed;
    string normalized = NuGetVersion.TryParse(term, out parsed)
        ? parsed.ToNormalizedString()
        : term;
    _termAttribute.SetTermBuffer(normalized);
    return true;
}
/// <summary>
/// Emits any pending synonyms (at position increment 0, sharing the captured
/// state of the triggering token) before advancing the underlying stream.
/// When a fresh token has aliases, its state is captured for later replay.
/// </summary>
/// <returns>true if a token (original or synonym) was emitted; false at end of stream.</returns>
public override bool IncrementToken()
{
    if (synonymStack.Count > 0)
    {
        RestoreState(current);
        termAtt.SetTermBuffer(synonymStack.Pop());
        // Synonyms stack on the same position as the original token.
        posIncrAtt.PositionIncrement = 0;
        return true;
    }
    if (!input.IncrementToken())
    {
        return false;
    }
    if (addAliasesToStack())
    {
        current = CaptureState();
    }
    return true;
}
/// <summary>
/// Returns the next token in the stream, stemmed unless the term appears in
/// the exclusion table.
/// </summary>
/// <returns>true if a token is available; false at end of stream.</returns>
public override bool IncrementToken()
{
    if (!input.IncrementToken())
    {
        return false;
    }
    string term = termAtt.Term;
    bool excluded = exclusions != null && exclusions.Contains(term);
    if (!excluded)
    {
        string stemmed = stemmer.Stem(term);
        // If not stemmed, don't waste the time adjusting the token.
        if (stemmed != null && !stemmed.Equals(term))
        {
            termAtt.SetTermBuffer(stemmed);
        }
    }
    return true;
}
/// <summary>
/// Produces the next shingle token from the matrix, lazily building the
/// matrix on first call. Token production is driven iteratively rather than
/// recursively so a large matrix cannot blow the stack.
/// </summary>
/// <returns>true if a shingle was produced; false once the matrix is exhausted.</returns>
public override sealed bool IncrementToken()
{
    if (Matrix == null)
    {
        Matrix = new Matrix.Matrix();
        // Pre-fill the matrix with up to MaximumShingleSize columns.
        while (Matrix.Columns.Count < MaximumShingleSize && ReadColumn())
        {
            // intentionally empty: ReadColumn does the work
        }
    }

    // Keep asking until the producer yields a real token (or null). The
    // sentinel _requestNextToken means "call me again"; looping here avoids
    // recursion whose depth would scale with matrix size.
    Token token = ProduceNextToken(_reusableToken);
    while (token == _requestNextToken)
    {
        token = ProduceNextToken(_reusableToken);
    }

    if (token == null)
    {
        return false;
    }

    ClearAttributes();
    _termAtt.SetTermBuffer(token.TermBuffer(), 0, token.TermLength());
    _posIncrAtt.PositionIncrement = token.PositionIncrement;
    _flagsAtt.Flags = token.Flags;
    _offsetAtt.SetOffset(token.StartOffset, token.EndOffset);
    _typeAtt.Type = token.Type;
    _payloadAtt.Payload = token.Payload;
    return true;
}
/// <summary>
/// Strips accents from the current term. The buffer is scanned first and only
/// rewritten when a character in the accent-candidate range is present, so
/// pure-ASCII tokens pass through untouched.
/// </summary>
/// <returns>true if a token is available; false at end of stream.</returns>
public override bool IncrementToken()
{
    if (!input.IncrementToken())
    {
        return false;
    }
    char[] buffer = termAtt.TermBuffer();
    int length = termAtt.TermLength();
    for (int i = 0; i < length; i++)
    {
        char c = buffer[i];
        // '\u00c0'..'\uFB06' brackets the characters RemoveAccents can rewrite.
        if (c >= '\u00c0' && c <= '\uFB06')
        {
            RemoveAccents(buffer, length);
            termAtt.SetTermBuffer(output, 0, outputPos);
            break;
        }
    }
    return true;
}
/// <summary>
/// Emits each element of the iterator as a whole-keyword token, assigning
/// synthetic offsets as if the keywords were separated by single blanks.
/// </summary>
/// <returns>true if a keyword was emitted; false once the iterator is exhausted.</returns>
/// <exception cref="ArgumentException">if the iterator yields a null element.</exception>
public override bool IncrementToken()
{
    if (!iter.MoveNext())
    {
        return false;
    }
    T current = iter.Current;
    if (current == null)
    {
        throw new ArgumentException("keyword must not be null");
    }
    ClearAttributes();
    String word = current.ToString();
    termAtt.SetTermBuffer(word);
    offsetAtt.SetOffset(start, start + termAtt.TermLength());
    // separate words by 1 (blank) character
    start += word.Length + 1;
    return true;
}
/// <summary>
/// Test stream that injects extra "multi&lt;N&gt;" tokens (at position increment 0,
/// reusing the previous token's offsets/type) after seeing the trigger terms
/// "triplemulti" (2 extras) or "multi" (1 extra).
/// </summary>
/// <returns>true if a token was emitted; false at end of stream.</returns>
public override bool IncrementToken()
{
    if (Lucene.Net.QueryParsers.TestMultiAnalyzer.multiToken > 0)
    {
        // Synthetic token stacked on the same position as the trigger token.
        termAtt.SetTermBuffer("multi" + (Lucene.Net.QueryParsers.TestMultiAnalyzer.multiToken + 1));
        offsetAtt.SetOffset(prevStartOffset, prevEndOffset);
        typeAtt.Type = prevType;
        posIncrAtt.PositionIncrement = 0;
        Lucene.Net.QueryParsers.TestMultiAnalyzer.multiToken--;
        return true;
    }
    if (!input.IncrementToken())
    {
        return false;
    }
    prevType = typeAtt.Type;
    prevStartOffset = offsetAtt.StartOffset;
    prevEndOffset = offsetAtt.EndOffset;
    System.String text = termAtt.Term;
    if (text.Equals("triplemulti"))
    {
        Lucene.Net.QueryParsers.TestMultiAnalyzer.multiToken = 2;
    }
    else if (text.Equals("multi"))
    {
        Lucene.Net.QueryParsers.TestMultiAnalyzer.multiToken = 1;
    }
    return true;
}
/// <summary>
/// Emits the text between successive regex matches as tokens (the pattern
/// acts as a delimiter). Leading, trailing and adjacent (zero-length) spans
/// are skipped rather than emitted as empty tokens.
/// </summary>
/// <returns>true if a non-empty token was produced; false at end of input.</returns>
public sealed override bool IncrementToken()
{
    if (matcher == null)
    {
        return false;
    }
    ClearAttributes();
    while (true)
    { // loop takes care of leading and trailing boundary cases
        int start = pos;
        int end;
        // BUG FIX: the original body returned from both branches, so the loop
        // never iterated — a zero-length span (e.g. a leading match) ended the
        // stream instead of being skipped. We now continue past empty spans,
        // and guard matcher for the iterations this enables.
        bool isMatch = matcher != null && matcher.Success;
        if (isMatch)
        {
            end = matcher.Index;
            pos = matcher.Index + matcher.Length;
            matcher = matcher.NextMatch();
        }
        else
        {
            end = str.Length;
            matcher = null; // we're finished
        }
        if (start != end)
        { // non-empty match (header/trailer)
            String text = str.Substring(start, end - start);
            if (toLowerCase)
            {
                text = text.ToLower(locale);
            }
            termAtt.SetTermBuffer(text);
            offsetAtt.SetOffset(start, end);
            return true;
        }
        if (matcher == null)
        {
            return false; // empty trailing span and no more matches
        }
        // zero-length span between adjacent matches: skip it and try again
    }
}
/// <summary>
/// Replays the enclosing instance's prepared test tokens, computing each
/// position increment as the delta from the previous token's position
/// (the first token uses its absolute position + 1).
/// </summary>
/// <returns>true while test tokens remain; false afterwards.</returns>
public override bool IncrementToken()
{
    if (tokenUpto >= Enclosing_Instance.tokens.Length)
    {
        return false;
    }
    TestToken testToken = Enclosing_Instance.tokens[tokenUpto++];
    ClearAttributes();
    termAtt.SetTermBuffer(testToken.text);
    offsetAtt.SetOffset(testToken.startOffset, testToken.endOffset);
    posIncrAtt.PositionIncrement = tokenUpto > 1
        ? testToken.pos - Enclosing_Instance.tokens[tokenUpto - 2].pos
        : testToken.pos + 1;
    return true;
}
/// <summary>
/// Emits edge n-grams of each input term, from minGram up to maxGram
/// characters, taken from the front or the back depending on <c>side</c>.
/// Terms shorter than the current gram size contribute no further grams.
/// </summary>
/// <returns>true if an n-gram was emitted; false once input is exhausted.</returns>
public override bool IncrementToken()
{
    while (true)
    {
        if (curTermBuffer == null)
        {
            // Pull the next term and reset the gram-size cursor.
            if (!input.IncrementToken())
            {
                return false;
            }
            curTermBuffer = (char[])termAtt.TermBuffer().Clone();
            curTermLength = termAtt.TermLength();
            curGramSize = minGram;
            tokStart = offsetAtt.StartOffset;
        }
        // Equivalent to the original double-negated condition: emit only while
        // the gram fits both the configured range and the term itself.
        if (curGramSize <= maxGram && curGramSize <= curTermLength)
        {
            // grab gramSize chars from front or back
            int start = side == Side.FRONT ? 0 : curTermLength - curGramSize;
            int end = start + curGramSize;
            ClearAttributes();
            offsetAtt.SetOffset(tokStart + start, tokStart + end);
            termAtt.SetTermBuffer(curTermBuffer, start, curGramSize);
            curGramSize++;
            return true;
        }
        curTermBuffer = null; // this term is done; fetch the next one
    }
}
/// <summary>
/// Increments the TokenStream, removing an elided article prefix
/// (e.g. French "l'", "d'") from the start of the term when present.
/// </summary>
/// <returns>true if a token is available; false at end of stream.</returns>
public override sealed bool IncrementToken()
{
    if (!input.IncrementToken())
    {
        return false;
    }
    char[] buffer = termAtt.TermBuffer();
    int length = termAtt.TermLength();

    // Locate the earliest occurrence of any accepted apostrophe variant.
    int firstApos = int.MaxValue;
    foreach (char apos in apostrophes)
    {
        // Equivalent of String.indexOf(ch) over the term buffer.
        for (int i = 0; i < length; i++)
        {
            if (buffer[i] == apos)
            {
                firstApos = Math.Min(i, firstApos);
                break;
            }
        }
    }

    // An apostrophe was found: strip the prefix (and the apostrophe itself)
    // only when the prefix is a known article.
    if (firstApos != int.MaxValue && articles.Contains(termAtt.TermBuffer(), 0, firstApos))
    {
        termAtt.SetTermBuffer(termAtt.TermBuffer(), firstApos + 1, termAtt.TermLength() - (firstApos + 1));
    }
    return true;
}
/// <summary>
/// Emits edge n-grams of each input term, from _minGram up to _maxGram
/// characters, taken from the front or the back depending on <c>_side</c>.
/// Terms shorter than the current gram size contribute no further grams.
/// </summary>
/// <returns>true if an n-gram was emitted; false once input is exhausted.</returns>
public override bool IncrementToken()
{
    while (true)
    {
        if (_curTermBuffer == null)
        {
            // Pull the next term and reset the gram-size cursor.
            if (!input.IncrementToken())
            {
                return false;
            }
            _curTermBuffer = (char[])_termAtt.TermBuffer().Clone();
            _curTermLength = _termAtt.TermLength();
            _curGramSize = _minGram;
            _tokStart = _offsetAtt.StartOffset;
        }
        // Equivalent to the original double-negated condition: emit only while
        // the gram fits both the configured range and the term itself.
        if (_curGramSize <= _maxGram && _curGramSize <= _curTermLength)
        {
            // grab gramSize chars from front or back
            int start = _side == Side.Front ? 0 : _curTermLength - _curGramSize;
            int end = start + _curGramSize;
            ClearAttributes();
            _offsetAtt.SetOffset(_tokStart + start, _tokStart + end);
            _termAtt.SetTermBuffer(_curTermBuffer, start, _curGramSize);
            _curGramSize++;
            return true;
        }
        _curTermBuffer = null; // this term is done; fetch the next one
    }
}
/// <summary>
/// Emits queued synonyms (lower-cased, at position increment 0, replaying the
/// captured state of the originating token) before advancing the underlying
/// stream. When a fresh term has synonyms, they are queued and the current
/// attribute state is captured for later replay.
/// </summary>
/// <returns>true if a token was emitted; false at end of stream.</returns>
public override bool IncrementToken()
{
    if (splittedQueue.Count > 0)
    {
        string queued = splittedQueue.Dequeue();
        RestoreState(currentState);
        termAtt.SetTermBuffer(queued);
        posAtt.PositionIncrement = 0;
        return true;
    }
    if (!input.IncrementToken())
    {
        return false;
    }
    string term = termAtt.Term;
    if (term != null)
    {
        var synonyms = SynonymEngine.GetSynonyms(term);
        // No synonyms: pass the token through without capturing state.
        if (synonyms == null || !synonyms.Any())
        {
            return true;
        }
        foreach (var synonym in synonyms)
        {
            splittedQueue.Enqueue(synonym.ToLower());
        }
    }
    currentState = CaptureState();
    return true;
}
/// <summary>
/// Folds non-ASCII characters in the current term to their ASCII equivalents.
/// The buffer is scanned first and only rewritten when a character at or
/// above U+0080 is present, so pure-ASCII tokens pass through untouched.
/// </summary>
/// <returns>true if a token is available; false at end of stream.</returns>
public override bool IncrementToken()
{
    if (!input.IncrementToken())
    {
        return false;
    }
    char[] buffer = termAtt.TermBuffer();
    int length = termAtt.TermLength();
    for (int i = 0; i < length; ++i)
    {
        if (buffer[i] >= '\u0080')
        {
            // At least one non-ASCII character: fold the whole buffer once.
            FoldToASCII(buffer, length);
            termAtt.SetTermBuffer(output, 0, outputPos);
            break;
        }
    }
    return true;
}
/// <summary>
/// CJK-style tokenizer loop. ASCII / half-width letters and digits (plus '_',
/// '+', '#') accumulate into single-byte tokens (SINGLE_TOKEN_TYPE, lower-cased);
/// other letters are emitted as overlapping bigrams (DOUBLE_TOKEN_TYPE).
/// `preIsTokened` marks that the previous bigram was already emitted so the
/// shared character is not emitted twice as a standalone token.
/// </summary>
/// <returns>true when a non-empty token was produced; false at end of input.</returns>
public override bool IncrementToken()
{
    ClearAttributes();
    /* how many character(s) has been stored in buffer */
    while (true)
    { // loop until we find a non-empty token
        int length = 0;
        /* the position used to create Token */
        int start = offset;
        while (true)
        { // loop until we've found a full token
            /* current character */
            char c;
            offset++;
            if (bufferIndex >= dataLen)
            {
                // Refill the I/O buffer from the underlying reader.
                dataLen = input.Read(ioBuffer, 0, ioBuffer.Length);
                bufferIndex = 0;
            }
            if (dataLen == 0) // input.Read returns 0 when its empty, not -1, as in java
            {
                if (length > 0)
                {
                    if (preIsTokened == true)
                    {
                        // The buffered char was already part of an emitted bigram.
                        length = 0;
                        preIsTokened = false;
                    }
                    else
                    {
                        offset--;
                    }
                    break;
                }
                else
                {
                    offset--;
                    return(false);
                }
            }
            else
            {
                //get current character
                c = ioBuffer[bufferIndex++];
            }
            //TODO: Using a Regex to determine the UnicodeCategory is probably slower than
            // If we just created a small class that would look it up for us, which
            // would likely be trivial, however time-consuming. I can't imagine a Regex
            // being fast for this, considering we have to pull a char from the buffer,
            // and convert it to a string before we run a regex on it.
            bool isHalfFullForm = isHalfWidthAndFullWidthForms.Match(c.ToString()).Success;
            //if the current character is ASCII or Extend ASCII
            if ((isBasicLatin.Match(c.ToString()).Success) || (isHalfFullForm))
            {
                if (isHalfFullForm)
                {
                    int i = (int)c;
                    if (i >= 65281 && i <= 65374)
                    {
                        // convert certain HALFWIDTH_AND_FULLWIDTH_FORMS to BASIC_LATIN
                        i = i - 65248;
                        c = (char)i;
                    }
                }
                // if the current character is a letter or "_" "+" "#"
                if (char.IsLetterOrDigit(c) || ((c == '_') || (c == '+') || (c == '#')))
                {
                    if (length == 0)
                    {
                        // "javaC1C2C3C4linux" <br>
                        //      ^--: the current character begin to token the ASCII
                        // letter
                        start = offset - 1;
                    }
                    else if (tokenType == DOUBLE_TOKEN_TYPE)
                    {
                        // "javaC1C2C3C4linux" <br>
                        //              ^--: the previous non-ASCII
                        //                 : the current character
                        offset--;
                        bufferIndex--;
                        if (preIsTokened == true)
                        {
                            // there is only one non-ASCII has been stored
                            length = 0;
                            preIsTokened = false;
                            break;
                        }
                        else
                        {
                            break;
                        }
                    }
                    // store the LowerCase(c) in the buffer
                    buffer[length++] = char.ToLower(c); // TODO: is java invariant? If so, this should be ToLowerInvariant()
                    tokenType = SINGLE_TOKEN_TYPE;
                    // break the procedure if buffer overflowed!
                    if (length == MAX_WORD_LEN)
                    {
                        break;
                    }
                }
                else if (length > 0)
                {
                    // Non-token ASCII character ends the current token.
                    if (preIsTokened)
                    {
                        length = 0;
                        preIsTokened = false;
                    }
                    else
                    {
                        break;
                    }
                }
            }
            else
            {
                // non-ASCII letter, e.g."C1C2C3C4"
                if (char.IsLetter(c))
                {
                    if (length == 0)
                    {
                        start = offset - 1;
                        buffer[length++] = c;
                        tokenType = DOUBLE_TOKEN_TYPE;
                    }
                    else
                    {
                        if (tokenType == SINGLE_TOKEN_TYPE)
                        {
                            offset--;
                            bufferIndex--;
                            //return the previous ASCII characters
                            break;
                        }
                        else
                        {
                            buffer[length++] = c;
                            tokenType = DOUBLE_TOKEN_TYPE;
                            if (length == 2)
                            {
                                // Bigram complete: step back one char so the
                                // next bigram overlaps with this one.
                                offset--;
                                bufferIndex--;
                                preIsTokened = true;
                                break;
                            }
                        }
                    }
                }
                else if (length > 0)
                {
                    if (preIsTokened == true)
                    {
                        // empty the buffer
                        length = 0;
                        preIsTokened = false;
                    }
                    else
                    {
                        break;
                    }
                }
            }
        }
        if (length > 0)
        {
            termAtt.SetTermBuffer(buffer, 0, length);
            offsetAtt.SetOffset(CorrectOffset(start), CorrectOffset(start + length));
            typeAtt.Type = TOKEN_TYPE_NAMES[tokenType];
            return(true);
        }
        else if (dataLen == 0)
        {
            offset--;
            return(false);
        }
        // Cycle back and try for the next token (don't
        // return an empty string)
    }
}
/// <summary>
/// Appends a single character to the attribute's current term text.
/// </summary>
/// <param name="termAtt">The term attribute to modify.</param>
/// <param name="ch">The character to append.</param>
public static void Append(this ITermAttribute termAtt, char ch)
{
    // string + char concatenation replaces the previous
    // `new string(new[] { ch })` form: same result, one less allocation.
    termAtt.SetTermBuffer(termAtt.Term + ch); // TODO: Not optimal, but works
}
/// <summary>
/// Appends <paramref name="str"/> to the attribute's current term text.
/// </summary>
/// <param name="termAtt">The term attribute to modify.</param>
/// <param name="str">The text to append.</param>
public static void Append(this ITermAttribute termAtt, string str)
{
    var combined = termAtt.Term + str; // TODO: Not optimal, but works
    termAtt.SetTermBuffer(combined);
}
/// <summary>
/// Resets this token's term text and start/end offsets in a single call.
/// </summary>
/// <param name="stringValue">The new term text.</param>
/// <param name="startOffset">The new start offset.</param>
/// <param name="endOffset">The new end offset.</param>
public void Reinit(string stringValue, int startOffset, int endOffset)
{
    offsetAttribute.SetOffset(startOffset, endOffset);
    termAttribute.SetTermBuffer(stringValue);
}
/// <summary>Fills TermAttribute with the current token text.</summary>
/// <param name="t">The attribute receiving the scanner's current token span.</param>
internal void GetText(ITermAttribute t)
{
    int tokenLength = zzMarkedPos - zzStartRead;
    t.SetTermBuffer(zzBuffer, zzStartRead, tokenLength);
}
/// <summary>
/// Asserts that <paramref name="ts"/> produces exactly the given terms, and —
/// for each non-null expectation array — the given offsets, types and position
/// increments. Also verifies that ClearAttributes() is called for every token,
/// that the stream ends after <paramref name="output"/>.Length tokens, and
/// (when given) the final offset after End().
/// </summary>
public static void AssertTokenStreamContents(TokenStream ts, System.String[] output, int[] startOffsets, int[] endOffsets, System.String[] types, int[] posIncrements, int?finalOffset)
{
    Assert.IsNotNull(output);
    ICheckClearAttributesAttribute checkClearAtt = ts.AddAttribute <ICheckClearAttributesAttribute>();

    // Only look up the optional attributes actually being checked.
    Assert.IsTrue(ts.HasAttribute <ITermAttribute>(), "has no TermAttribute");
    ITermAttribute termAtt = ts.GetAttribute <ITermAttribute>();

    IOffsetAttribute offsetAtt = null;
    if (startOffsets != null || endOffsets != null || finalOffset != null)
    {
        Assert.IsTrue(ts.HasAttribute <IOffsetAttribute>(), "has no OffsetAttribute");
        offsetAtt = ts.GetAttribute <IOffsetAttribute>();
    }

    ITypeAttribute typeAtt = null;
    if (types != null)
    {
        Assert.IsTrue(ts.HasAttribute <ITypeAttribute>(), "has no TypeAttribute");
        typeAtt = ts.GetAttribute <ITypeAttribute>();
    }

    IPositionIncrementAttribute posIncrAtt = null;
    if (posIncrements != null)
    {
        Assert.IsTrue(ts.HasAttribute <IPositionIncrementAttribute>(), "has no PositionIncrementAttribute");
        posIncrAtt = ts.GetAttribute <IPositionIncrementAttribute>();
    }

    ts.Reset();
    for (int i = 0; i < output.Length; i++)
    {
        // extra safety to enforce, that the state is not preserved and also assign bogus values
        ts.ClearAttributes();
        termAtt.SetTermBuffer("bogusTerm");
        if (offsetAtt != null)
        {
            offsetAtt.SetOffset(14584724, 24683243);
        }
        if (typeAtt != null)
        {
            typeAtt.Type = "bogusType";
        }
        if (posIncrAtt != null)
        {
            posIncrAtt.PositionIncrement = 45987657;
        }

        checkClearAtt.GetAndResetClearCalled(); // reset it, because we called clearAttribute() before
        Assert.IsTrue(ts.IncrementToken(), "token " + i + " does not exist");
        // The stream itself must have called ClearAttributes() for this token.
        Assert.IsTrue(checkClearAtt.GetAndResetClearCalled(), "clearAttributes() was not called correctly in TokenStream chain");

        Assert.AreEqual(output[i], termAtt.Term, "term " + i);
        if (startOffsets != null)
        {
            Assert.AreEqual(startOffsets[i], offsetAtt.StartOffset, "startOffset " + i);
        }
        if (endOffsets != null)
        {
            Assert.AreEqual(endOffsets[i], offsetAtt.EndOffset, "endOffset " + i);
        }
        if (types != null)
        {
            Assert.AreEqual(types[i], typeAtt.Type, "type " + i);
        }
        if (posIncrements != null)
        {
            Assert.AreEqual(posIncrements[i], posIncrAtt.PositionIncrement, "posIncrement " + i);
        }
    }
    // The stream must be exhausted exactly after the expected tokens.
    Assert.IsFalse(ts.IncrementToken(), "end of stream");
    ts.End();
    if (finalOffset.HasValue)
    {
        Assert.AreEqual(finalOffset, offsetAtt.EndOffset, "finalOffset ");
    }
    ts.Close();
}
/// <summary>
/// Produces the next Hebrew-aware token from the HebMorph tokenizer: skips
/// standalone prefixes, strips prefixes from suspected acronyms, copies the
/// term into the attribute (reusing the buffer when it is large enough), and
/// assigns a token type (Acronym/Construct/Hebrew/Numeric/NonHebrew).
/// </summary>
/// <returns>true if a token was produced; false at end of stream.</returns>
public override bool IncrementToken()
{
    ClearAttributes();

    string nextToken;
    HebMorph.Tokenizer.TokenType tokenType;

    // Used to loop over certain noise cases
    while (true)
    {
        tokenType = hebMorphTokenizer.NextToken(out nextToken);
        if (tokenType == 0)
        {
            return false; // EOS
        }

        // Ignore "words" which are actually only prefixes in a single word.
        // This first case is easy to spot, since the prefix and the following word will be
        // separated by a dash marked as a construct (סמיכות) by the Tokenizer
        if ((tokenType & HebMorph.Tokenizer.TokenType.Construct) > 0)
        {
            if (IsLegalPrefix(nextToken))
            {
                continue;
            }
        }

        // This second case is a bit more complex. We take a risk of splitting a valid acronym or
        // abbrevated word into two, so we send it to an external function to analyze the word, and
        // get a possibly corrected word. Examples for words we expect to simplify by this operation
        // are ה"שטיח", ש"המידע.
        if ((tokenType & HebMorph.Tokenizer.TokenType.Acronym) > 0)
        {
            nextToken = TryStrippingPrefix(nextToken);

            // Re-detect acronym, in case it was a false positive
            // BUG FIX: this previously used `|= ~Acronym`, which sets every bit
            // EXCEPT Acronym; the intent (per the comment above) is to CLEAR
            // the Acronym flag when no quote mark remains.
            if (nextToken.IndexOf('"') == -1)
            {
                tokenType &= ~HebMorph.Tokenizer.TokenType.Acronym;
            }
        }

        break;
    }

    // Record the term string. Reuse the existing buffer when it can hold the
    // new token to save on memory operations.
    if (termAtt.TermLength() < nextToken.Length)
    {
        termAtt.SetTermBuffer(nextToken);
    }
    else
    {
        char[] buf = termAtt.TermBuffer();
        nextToken.CopyTo(0, buf, 0, nextToken.Length);
    }
    termAtt.SetTermLength(nextToken.Length);

    offsetAtt.SetOffset(CorrectOffset(hebMorphTokenizer.Offset), CorrectOffset(hebMorphTokenizer.Offset + hebMorphTokenizer.LengthInSource));

    if ((tokenType & HebMorph.Tokenizer.TokenType.Hebrew) > 0)
    {
        // BUG FIX: these were two independent if/else statements, so a token
        // flagged Acronym (but not Construct) had its type immediately
        // overwritten with Hebrew by the unconditional else. Chain them so
        // Acronym takes precedence over Construct, which beats plain Hebrew.
        if ((tokenType & HebMorph.Tokenizer.TokenType.Acronym) > 0)
        {
            typeAtt.Type = TokenTypeSignature(TOKEN_TYPES.Acronym);
        }
        else if ((tokenType & HebMorph.Tokenizer.TokenType.Construct) > 0)
        {
            typeAtt.Type = TokenTypeSignature(TOKEN_TYPES.Construct);
        }
        else
        {
            typeAtt.Type = TokenTypeSignature(TOKEN_TYPES.Hebrew);
        }
    }
    else if ((tokenType & HebMorph.Tokenizer.TokenType.Numeric) > 0)
    {
        typeAtt.Type = TokenTypeSignature(TOKEN_TYPES.Numeric);
    }
    else
    {
        typeAtt.Type = TokenTypeSignature(TOKEN_TYPES.NonHebrew);
    }

    return true;
}
/// <summary>
/// Copies a (term, startOffset, endOffset, positionIncrement) tuple onto the
/// corresponding token attributes.
/// </summary>
/// <param name="next">Item1 = term text, Item2 = start offset, Item3 = end offset, Item4 = position increment.</param>
private void SetAttributes(Tuple <string, int, int, int> next)
{
    string term = next.Item1;
    _termAttribute.SetTermBuffer(term);
    _offsetAttribute.SetOffset(next.Item2, next.Item3);
    _positionIncrementAttribute.PositionIncrement = next.Item4;
}