public override sealed bool IncrementToken()
{
    if (multiToken > 0)
    {
        termAtt.SetEmpty().Append("multi" + (multiToken + 1));
        offsetAtt.SetOffset(prevStartOffset, prevEndOffset);
        typeAtt.Type = prevType;
        posIncrAtt.PositionIncrement = 0;
        multiToken--;
        return true;
    }
    else
    {
        bool next = m_input.IncrementToken();
        if (!next)
        {
            return false;
        }
        prevType = typeAtt.Type;
        prevStartOffset = offsetAtt.StartOffset;
        prevEndOffset = offsetAtt.EndOffset;
        string text = termAtt.ToString();
        if (text.Equals("triplemulti"))
        {
            multiToken = 2;
            return true;
        }
        else if (text.Equals("multi"))
        {
            multiToken = 1;
            return true;
        }
        else
        {
            return true;
        }
    }
}
public override bool IncrementToken()
{
    if (TestMultiAnalyzer.multiToken > 0)
    {
        termAtt.SetTermBuffer("multi" + (TestMultiAnalyzer.multiToken + 1));
        offsetAtt.SetOffset(prevStartOffset, prevEndOffset);
        typeAtt.Type = prevType;
        posIncrAtt.PositionIncrement = 0;
        TestMultiAnalyzer.multiToken--;
        return true;
    }
    else
    {
        bool next = input.IncrementToken();
        if (!next)
        {
            return false;
        }
        prevType = typeAtt.Type;
        prevStartOffset = offsetAtt.StartOffset;
        prevEndOffset = offsetAtt.EndOffset;
        string text = termAtt.Term;
        if (text.Equals("triplemulti"))
        {
            TestMultiAnalyzer.multiToken = 2;
            return true;
        }
        else if (text.Equals("multi"))
        {
            TestMultiAnalyzer.multiToken = 1;
            return true;
        }
        else
        {
            return true;
        }
    }
}
public sealed override bool IncrementToken()
{
    if (matcher == null)
    {
        return false;
    }
    ClearAttributes();
    while (true)
    {
        // loop takes care of leading and trailing boundary cases
        int start = pos;
        int end;
        bool isMatch = matcher.Success;
        if (isMatch)
        {
            end = matcher.Index;
            pos = matcher.Index + matcher.Length;
            matcher = matcher.NextMatch();
        }
        else
        {
            end = str.Length;
            matcher = null; // we're finished
        }
        if (start != end)
        {
            // non-empty match (header/trailer)
            string text = str.Substring(start, end - start);
            if (toLowerCase)
            {
                text = text.ToLower(locale);
            }
            termAtt.SetTermBuffer(text);
            offsetAtt.SetOffset(start, end);
            return true;
        }
        if (!isMatch)
        {
            return false; // zero-length trailer: stream exhausted
        }
        // zero-length match: loop again to skip it
    }
}
public override bool IncrementToken()
{
    if (currentToken >= tokens.Length)
    {
        return false;
    }
    Token token = tokens[currentToken++];
    ClearAttributes();
    termAtt.SetEmpty().Append(token);
    offsetAtt.SetOffset(token.StartOffset, token.EndOffset);
    BytesRef payload = token.Payload;
    if (payload != null)
    {
        payloadAtt.Payload = payload;
    }
    posincAtt.PositionIncrement =
        (currentToken <= 1 || tokens[currentToken - 1].StartOffset > tokens[currentToken - 2].StartOffset) ? 1 : 0;
    return true;
}
public sealed override bool IncrementToken()
{
    if (TokenUpto >= OuterInstance.Tokens.Length)
    {
        return false;
    }
    else
    {
        TestToken testToken = OuterInstance.Tokens[TokenUpto++];
        ClearAttributes();
        TermAtt.Append(testToken.Text);
        OffsetAtt.SetOffset(testToken.StartOffset, testToken.EndOffset);
        if (TokenUpto > 1)
        {
            PosIncrAtt.PositionIncrement = testToken.Pos - OuterInstance.Tokens[TokenUpto - 2].Pos;
        }
        else
        {
            PosIncrAtt.PositionIncrement = testToken.Pos + 1;
        }
        return true;
    }
}
public override bool IncrementToken()
{
    if (tokenUpto >= Enclosing_Instance.tokens.Length)
    {
        return false;
    }
    else
    {
        TestToken testToken = Enclosing_Instance.tokens[tokenUpto++];
        ClearAttributes();
        termAtt.SetTermBuffer(testToken.text);
        offsetAtt.SetOffset(testToken.startOffset, testToken.endOffset);
        if (tokenUpto > 1)
        {
            posIncrAtt.PositionIncrement = testToken.pos - Enclosing_Instance.tokens[tokenUpto - 2].pos;
        }
        else
        {
            posIncrAtt.PositionIncrement = testToken.pos + 1;
        }
        return true;
    }
}
/// <summary>
/// Constructs a compound token.
/// </summary>
private void GramToken()
{
    buffer.Append(termAttribute.Buffer(), 0, termAttribute.Length);
    int endOffset = offsetAttribute.EndOffset();

    ClearAttributes();

    var length = buffer.Length;
    var termText = termAttribute.Buffer();
    if (length > termText.Length)
    {
        termText = termAttribute.ResizeBuffer(length);
    }

    buffer.GetChars(0, length, termText, 0);
    termAttribute.Length = length;
    posIncAttribute.PositionIncrement = 0;
    posLenAttribute.PositionLength = 2; // bigram
    offsetAttribute.SetOffset(lastStartOffset, endOffset);
    typeAttribute.Type = GRAM_TYPE;
    buffer.Length = 0;
}
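// GramToken() above matches the compound-token step of Lucene.NET's
// CommonGramsFilter, which stacks a common-word bigram on the same position
// as the word that starts it (posIncr = 0, posLen = 2). A minimal usage
// sketch, assuming the Lucene.Net 4.8 CommonGrams API; the common-word set
// and sample text are illustrative only:
using System;
using System.IO;
using Lucene.Net.Analysis.CommonGrams;
using Lucene.Net.Analysis.Core;
using Lucene.Net.Analysis.TokenAttributes;
using Lucene.Net.Analysis.Util;
using Lucene.Net.Util;

static void CommonGramsDemo()
{
    var common = new CharArraySet(LuceneVersion.LUCENE_48, new[] { "the", "of" }, true);
    using var source = new WhitespaceTokenizer(LuceneVersion.LUCENE_48, new StringReader("the quick fox"));
    using var stream = new CommonGramsFilter(LuceneVersion.LUCENE_48, source, common);
    var term = stream.AddAttribute<ICharTermAttribute>();
    var posInc = stream.AddAttribute<IPositionIncrementAttribute>();
    stream.Reset();
    while (stream.IncrementToken())
    {
        // expected shape: "the" (+1), "the_quick" (+0), "quick" (+1), "fox" (+1)
        Console.WriteLine($"{term} (+{posInc.PositionIncrement})");
    }
    stream.End();
}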
public override bool IncrementToken()
{
    while (true)
    {
        if (_curTermBuffer == null)
        {
            if (!input.IncrementToken())
            {
                return false;
            }
            else
            {
                _curTermBuffer = (char[])_termAtt.TermBuffer().Clone();
                _curTermLength = _termAtt.TermLength();
                _curGramSize = _minGram;
                _tokStart = _offsetAtt.StartOffset;
            }
        }
        if (_curGramSize <= _maxGram)
        {
            if (!(_curGramSize > _curTermLength ||  // if the remaining input is too short, we can't generate any n-grams
                  _curGramSize > _maxGram))         // if we have hit the end of our n-gram size range, quit
            {
                // grab gramSize chars from front or back
                int start = _side == Side.Front ? 0 : _curTermLength - _curGramSize;
                int end = start + _curGramSize;
                ClearAttributes();
                _offsetAtt.SetOffset(_tokStart + start, _tokStart + end);
                _termAtt.SetTermBuffer(_curTermBuffer, start, _curGramSize);
                _curGramSize++;
                return true;
            }
        }
        _curTermBuffer = null;
    }
}
protected override bool IncrementWord()
{
    int start, end;
    UninterruptableMonitor.Enter(syncLock);
    try
    {
        start = wordBreaker.Current;
        if (start == BreakIterator.Done)
        {
            return false; // BreakIterator exhausted
        }

        // find the next set of boundaries, skipping over non-tokens
        end = wordBreaker.Next();
        while (end != BreakIterator.Done &&
               !Character.IsLetterOrDigit(Character.CodePointAt(m_buffer, sentenceStart + start, sentenceEnd)))
        {
            start = end;
            end = wordBreaker.Next();
        }

        if (end == BreakIterator.Done)
        {
            return false; // BreakIterator exhausted
        }

        ClearAttributes();
        termAtt.CopyBuffer(m_buffer, sentenceStart + start, end - start);
        offsetAtt.SetOffset(CorrectOffset(m_offset + sentenceStart + start),
                            CorrectOffset(m_offset + sentenceStart + end));
        return true;
    }
    finally
    {
        UninterruptableMonitor.Exit(syncLock);
    }
}
public override bool IncrementToken()
{
    while (true)
    {
        if (curTermBuffer == null)
        {
            if (!input.IncrementToken())
            {
                return false;
            }
            else
            {
                curTermBuffer = (char[])termAtt.TermBuffer().Clone();
                curTermLength = termAtt.TermLength();
                curGramSize = minGram;
                tokStart = offsetAtt.StartOffset;
            }
        }
        if (curGramSize <= maxGram)
        {
            if (!(curGramSize > curTermLength ||  // if the remaining input is too short, we can't generate any n-grams
                  curGramSize > maxGram))         // if we have hit the end of our n-gram size range, quit
            {
                // grab gramSize chars from front or back
                int start = side == Side.FRONT ? 0 : curTermLength - curGramSize;
                int end = start + curGramSize;
                ClearAttributes();
                offsetAtt.SetOffset(tokStart + start, tokStart + end);
                termAtt.SetTermBuffer(curTermBuffer, start, curGramSize);
                curGramSize++;
                return true;
            }
        }
        curTermBuffer = null;
    }
}
public override sealed bool IncrementToken()
{
    ClearAttributes();
    int length = 0;
    int start = -1; // this variable is always initialized
    int end = -1;
    char[] buffer = termAtt.Buffer;
    while (true)
    {
        if (bufferIndex >= dataLen)
        {
            offset += dataLen;
            charUtils.Fill(ioBuffer, m_input); // read supplementary char aware with CharacterUtils
            if (ioBuffer.Length == 0)
            {
                dataLen = 0; // so next offset += dataLen won't decrement offset
                if (length > 0)
                {
                    break;
                }
                else
                {
                    finalOffset = CorrectOffset(offset);
                    return false;
                }
            }
            dataLen = ioBuffer.Length;
            bufferIndex = 0;
        }
        // use CharacterUtils here to support < 3.1 UTF-16 code unit behavior if the char based methods are gone
        int c = charUtils.CodePointAt(ioBuffer.Buffer, bufferIndex, ioBuffer.Length);
        int charCount = Character.CharCount(c);
        bufferIndex += charCount;

        if (IsTokenChar(c)) // if it's a token char
        {
            if (length == 0) // start of token
            {
                Debug.Assert(start == -1);
                start = offset + bufferIndex - charCount;
                end = start;
            }
            else if (length >= buffer.Length - 1) // check if a supplementary could run out of bounds
            {
                buffer = termAtt.ResizeBuffer(2 + length); // make sure a supplementary fits in the buffer
            }
            end += charCount;
            length += Character.ToChars(Normalize(c), buffer, length); // buffer it, normalized
            if (length >= MAX_WORD_LEN) // buffer overflow! make sure to check for >= surrogate pair could break == test
            {
                break;
            }
        }
        else if (length > 0) // at non-Letter w/ chars
        {
            break; // return 'em
        }
    }

    termAtt.Length = length;
    Debug.Assert(start != -1);
    offsetAtt.SetOffset(CorrectOffset(start), finalOffset = CorrectOffset(end));
    return true;
}
public override bool IncrementToken()
{
    ClearAttributes();
    buffer.Length = 0;
    int ci;
    char ch, pch;
    bool atBegin = true;
    tokenStart = tokenEnd;
    ci = m_input.Read();
    ch = (char)ci;
    while (true)
    {
        if (ci == -1)
        {
            break;
        }
        else if (PUNCTION.IndexOf(ch) != -1)
        {
            // End of a sentence
            buffer.Append(ch);
            tokenEnd++;
            break;
        }
        else if (atBegin && Utility.SPACES.IndexOf(ch) != -1)
        {
            tokenStart++;
            tokenEnd++;
            ci = m_input.Read();
            ch = (char)ci;
        }
        else
        {
            buffer.Append(ch);
            atBegin = false;
            tokenEnd++;
            pch = ch;
            ci = m_input.Read();
            ch = (char)ci;
            // Two consecutive whitespace characters, such as CR, LF
            if (Utility.SPACES.IndexOf(ch) != -1 && Utility.SPACES.IndexOf(pch) != -1)
            {
                // buffer.Append(ch);
                tokenEnd++;
                break;
            }
        }
    }
    if (buffer.Length == 0)
    {
        return false;
    }
    else
    {
        termAtt.SetEmpty().Append(buffer);
        offsetAtt.SetOffset(CorrectOffset(tokenStart), CorrectOffset(tokenEnd));
        typeAtt.Type = "sentence";
        return true;
    }
}
public override bool IncrementToken()
{
    ClearAttributes();
    int length = 0;
    int start = bufferIndex;
    char[] ioBuffer = bufferPool.Allocate();
    try
    {
        char[] buffer = termAtt.TermBuffer();
        while (true)
        {
            if (bufferIndex >= dataLen)
            {
                offset += dataLen;
                dataLen = input.Read(ioBuffer, 0, ioBuffer.Length);
                if (dataLen <= 0)
                {
                    dataLen = 0; // so next offset += dataLen won't decrement offset
                    if (length > 0)
                    {
                        break;
                    }
                    return false;
                }
                bufferIndex = 0;
            }

            char c = ioBuffer[bufferIndex++];

            if (IsTokenChar(c)) // if it's a token char
            {
                if (length == 0) // start of token
                {
                    start = offset + bufferIndex - 1;
                }
                else if (length == buffer.Length)
                {
                    buffer = termAtt.ResizeTermBuffer(1 + length);
                }
                buffer[length++] = Normalize(c); // buffer it, normalized
            }
            else if (length > 0) // at non-Letter w/ chars
            {
                break; // return 'em
            }
        }

        termAtt.SetTermLength(length);
        offsetAtt.SetOffset(CorrectOffset(start), CorrectOffset(start + length));
    }
    finally
    {
        if (ioBuffer != null)
        {
            bufferPool.Free(ioBuffer);
        }
    }
    return true;
}
/// <summary>
/// Returns true for the next token in the stream, or false at EOS.
/// </summary>
public override bool IncrementToken()
{
    ClearAttributes();
    if (!started)
    {
        started = true;
        gramSize = minGram;
        char[] chars = new char[1024];
        charsRead = 0;
        // TODO: refactor to a shared readFully somewhere:
        while (charsRead < chars.Length)
        {
            int inc = m_input.Read(chars, charsRead, chars.Length - charsRead);
            if (inc == -1)
            {
                break;
            }
            charsRead += inc;
        }
        inStr = new string(chars, 0, charsRead).Trim(); // trim leading/trailing whitespace

        if (charsRead == chars.Length)
        {
            // Read extra throwaway chars so that on End() we
            // report the correct offset:
            var throwaway = new char[1024];
            while (true)
            {
                int inc = m_input.Read(throwaway, 0, throwaway.Length);
                if (inc == -1)
                {
                    break;
                }
                charsRead += inc;
            }
        }

        inLen = inStr.Length;
        if (inLen == 0)
        {
            return false;
        }
    }

    if (pos + gramSize > inLen) // if we hit the end of the string
    {
        pos = 0;    // reset to beginning of string
        gramSize++; // increase n-gram size
        if (gramSize > maxGram) // we are done
        {
            return false;
        }
        if (pos + gramSize > inLen)
        {
            return false;
        }
    }

    int oldPos = pos;
    pos++;
    termAtt.SetEmpty().Append(inStr, oldPos, gramSize); // LUCENENET: Corrected 3rd parameter
    offsetAtt.SetOffset(CorrectOffset(oldPos), CorrectOffset(oldPos + gramSize));
    return true;
}
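// A minimal sketch of driving an n-gram tokenizer like the one above,
// assuming the Lucene.Net 4.8 NGram API; the input text and gram sizes are
// illustrative only:
using System;
using System.IO;
using Lucene.Net.Analysis.NGram;
using Lucene.Net.Analysis.TokenAttributes;
using Lucene.Net.Util;

static void NGramDemo()
{
    using var tokenizer = new NGramTokenizer(LuceneVersion.LUCENE_48, new StringReader("abcd"), 1, 2);
    var term = tokenizer.AddAttribute<ICharTermAttribute>();
    var offset = tokenizer.AddAttribute<IOffsetAttribute>();
    tokenizer.Reset();
    while (tokenizer.IncrementToken())
    {
        // emits "a", "ab", "b", "bc", ... (enumeration order depends on the match version)
        Console.WriteLine($"{term} [{offset.StartOffset},{offset.EndOffset})");
    }
    tokenizer.End();
}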
public override void CopyTo(Attribute target)
{
    IOffsetAttribute t = (IOffsetAttribute)target;
    t.SetOffset(startOffset, endOffset);
}
public override sealed bool IncrementToken()
{
    ClearAttributes();
    termAtt.Append(resultToken);
    if (resultToken.Length == 0)
    {
        posAtt.PositionIncrement = 1;
    }
    else
    {
        posAtt.PositionIncrement = 0;
    }
    int length = 0;
    bool added = false;
    if (endDelimiter)
    {
        termAtt.Append(replacement);
        length++;
        endDelimiter = false;
        added = true;
    }

    while (true)
    {
        int c = m_input.Read();
        if (c >= 0)
        {
            charsRead++;
        }
        else
        {
            if (skipped > skip)
            {
                length += resultToken.Length;
                termAtt.Length = length;
                offsetAtt.SetOffset(CorrectOffset(startPosition), CorrectOffset(startPosition + length));
                if (added)
                {
                    resultToken.Length = 0;
                    resultToken.Append(termAtt.Buffer, 0, length);
                }
                return added;
            }
            else
            {
                return false;
            }
        }
        if (!added)
        {
            added = true;
            skipped++;
            if (skipped > skip)
            {
                termAtt.Append(c == delimiter ? replacement : (char)c);
                length++;
            }
            else
            {
                startPosition++;
            }
        }
        else
        {
            if (c == delimiter)
            {
                if (skipped > skip)
                {
                    endDelimiter = true;
                    break;
                }
                skipped++;
                if (skipped > skip)
                {
                    termAtt.Append(replacement);
                    length++;
                }
                else
                {
                    startPosition++;
                }
            }
            else
            {
                if (skipped > skip)
                {
                    termAtt.Append((char)c);
                    length++;
                }
                else
                {
                    startPosition++;
                }
            }
        }
    }

    length += resultToken.Length;
    termAtt.Length = length;
    offsetAtt.SetOffset(CorrectOffset(startPosition), CorrectOffset(startPosition + length));
    resultToken.Length = 0;
    resultToken.Append(termAtt.Buffer, 0, length);
    return true;
}
/*
 * Need to worry about multiple scenarios:
 *  - need to go for the longest match
 *    a b => foo      #shouldn't match if "a b" is followed by "c d"
 *    a b c d => bar
 *  - need to backtrack - retry matches for tokens already read
 *    a b c d => foo
 *    b c => bar
 *    If the input stream is "a b c x", one will consume "a b c d"
 *    trying to match the first rule... all but "a" should be
 *    pushed back so a match may be made on "b c".
 *  - don't try and match generated tokens (thus need separate queue)
 *    matching is not recursive.
 *  - handle optional generation of original tokens in all these cases,
 *    merging token streams to preserve token positions.
 *  - preserve original positionIncrement of first matched token
 */
public override bool IncrementToken()
{
    while (true)
    {
        // if there are any generated tokens, return them... don't try any
        // matches against them, as we specifically don't want recursion.
        if (replacement != null && replacement.MoveNext())
        {
            Copy(this, replacement.Current);
            return true;
        }

        // common case fast-path of first token not matching anything
        AttributeSource firstTok = NextTok();
        if (firstTok == null)
        {
            return false;
        }
        var termAtt = firstTok.AddAttribute<ICharTermAttribute>();
        SlowSynonymMap result = map.Submap != null ? map.Submap.Get(termAtt.Buffer, 0, termAtt.Length) : null;
        if (result == null)
        {
            Copy(this, firstTok);
            return true;
        }

        // fast-path failed, clone ourselves if needed
        if (firstTok == this)
        {
            firstTok = CloneAttributes();
        }
        // OK, we matched a token, so find the longest match.
        matched = new LinkedList<AttributeSource>();
        result = Match(result);
        if (result == null)
        {
            // no match, simply return the first token read.
            Copy(this, firstTok);
            return true;
        }

        // reuse, or create new one each time?
        IList<AttributeSource> generated = new JCG.List<AttributeSource>(result.Synonyms.Length + matched.Count + 1);

        //
        // there was a match... let's generate the new tokens, merging
        // in the matched tokens (position increments need adjusting)
        //
        AttributeSource lastTok = matched.Count == 0 ? firstTok : matched.Last.Value;
        bool includeOrig = result.IncludeOrig;

        AttributeSource origTok = includeOrig ? firstTok : null;
        IPositionIncrementAttribute firstPosIncAtt = firstTok.AddAttribute<IPositionIncrementAttribute>();
        int origPos = firstPosIncAtt.PositionIncrement; // position of origTok in the original stream
        int repPos = 0; // curr position in replacement token stream
        int pos = 0;    // current position in merged token stream

        for (int i = 0; i < result.Synonyms.Length; i++)
        {
            Token repTok = result.Synonyms[i];
            AttributeSource newTok = firstTok.CloneAttributes();
            ICharTermAttribute newTermAtt = newTok.AddAttribute<ICharTermAttribute>();
            IOffsetAttribute newOffsetAtt = newTok.AddAttribute<IOffsetAttribute>();
            IPositionIncrementAttribute newPosIncAtt = newTok.AddAttribute<IPositionIncrementAttribute>();

            IOffsetAttribute lastOffsetAtt = lastTok.AddAttribute<IOffsetAttribute>();

            newOffsetAtt.SetOffset(newOffsetAtt.StartOffset, lastOffsetAtt.EndOffset);
            newTermAtt.CopyBuffer(repTok.Buffer, 0, repTok.Length);
            repPos += repTok.PositionIncrement;
            if (i == 0) // make position of first token equal to original
            {
                repPos = origPos;
            }

            // if necessary, insert original tokens and adjust position increment
            while (origTok != null && origPos <= repPos)
            {
                IPositionIncrementAttribute origPosInc = origTok.AddAttribute<IPositionIncrementAttribute>();
                origPosInc.PositionIncrement = origPos - pos;
                generated.Add(origTok);
                pos += origPosInc.PositionIncrement;
                //origTok = matched.Count == 0 ? null : matched.RemoveFirst();
                if (matched.Count == 0)
                {
                    origTok = null;
                }
                else
                {
                    origTok = matched.First.Value;
                    matched.Remove(origTok);
                }
                if (origTok != null)
                {
                    origPosInc = origTok.AddAttribute<IPositionIncrementAttribute>();
                    origPos += origPosInc.PositionIncrement;
                }
            }

            newPosIncAtt.PositionIncrement = repPos - pos;
            generated.Add(newTok);
            pos += newPosIncAtt.PositionIncrement;
        }

        // finish up any leftover original tokens
        while (origTok != null)
        {
            IPositionIncrementAttribute origPosInc = origTok.AddAttribute<IPositionIncrementAttribute>();
            origPosInc.PositionIncrement = origPos - pos;
            generated.Add(origTok);
            pos += origPosInc.PositionIncrement;
            if (matched.Count == 0)
            {
                origTok = null;
            }
            else
            {
                origTok = matched.First.Value;
                matched.Remove(origTok);
            }
            if (origTok != null)
            {
                origPosInc = origTok.AddAttribute<IPositionIncrementAttribute>();
                origPos += origPosInc.PositionIncrement;
            }
        }

        // what if we replaced a longer sequence with a shorter one?
        // a/0 b/5 => foo/0
        // should I re-create the gap on the next buffered token?

        replacement = generated.GetEnumerator();

        // Now return to the top of the loop to read and return the first
        // generated token. The reason this is done is that we may have generated
        // nothing at all, and may need to continue with more matching logic.
    }
}
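// The scenarios in the comment block above are easiest to verify by dumping
// a stream's terms together with their position increments. A small helper
// sketch using only core Lucene.Net 4.8 attribute APIs (the method name and
// output format are illustrative only):
using System;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.TokenAttributes;

static void DumpStream(TokenStream stream)
{
    var term = stream.AddAttribute<ICharTermAttribute>();
    var posInc = stream.AddAttribute<IPositionIncrementAttribute>();
    var offset = stream.AddAttribute<IOffsetAttribute>();
    stream.Reset();
    int pos = -1;
    while (stream.IncrementToken())
    {
        pos += posInc.PositionIncrement;
        // tokens with increment 0 stack on the same position, which is how
        // synonyms and originals are merged by the filter above
        Console.WriteLine($"pos={pos} term={term} [{offset.StartOffset},{offset.EndOffset})");
    }
    stream.End();
}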
public override bool IncrementToken()
{
    if (hasMoreTokensInClone)
    {
        int start = breaker.Current;
        int end = breaker.Next();
        if (end != BreakIterator.Done)
        {
            clonedToken.CopyTo(this);
            termAtt.CopyBuffer(clonedTermAtt.Buffer, start, end - start);
            if (hasIllegalOffsets)
            {
                offsetAtt.SetOffset(clonedOffsetAtt.StartOffset, clonedOffsetAtt.EndOffset);
            }
            else
            {
                offsetAtt.SetOffset(clonedOffsetAtt.StartOffset + start, clonedOffsetAtt.StartOffset + end);
            }
            if (handlePosIncr)
            {
                posAtt.PositionIncrement = 1;
            }
            return true;
        }
        hasMoreTokensInClone = false;
    }

    if (!m_input.IncrementToken())
    {
        return false;
    }

    if (termAtt.Length == 0 || !thaiPattern.IsMatch(string.Empty + termAtt[0]))
    {
        return true;
    }

    hasMoreTokensInClone = true;

    // if length by start + end offsets doesn't match the term text then assume
    // this is a synonym and don't adjust the offsets.
    hasIllegalOffsets = offsetAtt.EndOffset - offsetAtt.StartOffset != termAtt.Length;

    // we lazy init the cloned token, as in ctor not all attributes may be added
    if (clonedToken == null)
    {
        clonedToken = CloneAttributes();
        clonedTermAtt = clonedToken.GetAttribute<ICharTermAttribute>();
        clonedOffsetAtt = clonedToken.GetAttribute<IOffsetAttribute>();
    }
    else
    {
        this.CopyTo(clonedToken);
    }

    // reinit CharacterIterator
    charIterator.SetText(clonedTermAtt.Buffer, 0, clonedTermAtt.Length);
    breaker.SetText(new string(charIterator.Text, charIterator.Start, charIterator.Length));
    int end2 = breaker.Next();
    if (end2 != BreakIterator.Done)
    {
        termAtt.Length = end2;
        if (hasIllegalOffsets)
        {
            offsetAtt.SetOffset(clonedOffsetAtt.StartOffset, clonedOffsetAtt.EndOffset);
        }
        else
        {
            offsetAtt.SetOffset(clonedOffsetAtt.StartOffset, clonedOffsetAtt.StartOffset + end2);
        }
        // position increment keeps as it is for first token
        return true;
    }
    return false;
}
public override bool IncrementToken()
{
    ClearAttributes();
    if (delimitersCount == -1)
    {
        int length = 0;
        delimiterPositions.Add(0);
        while (true)
        {
            int c = input.Read();
            if (c < 0)
            {
                break;
            }
            length++;
            if (c == delimiter)
            {
                delimiterPositions.Add(length);
                resultToken.Append(replacement);
            }
            else
            {
                resultToken.Append((char)c);
            }
        }
        delimitersCount = delimiterPositions.Count;
        if (delimiterPositions[delimitersCount - 1] < length)
        {
            delimiterPositions.Add(length);
            delimitersCount++;
        }
        if (resultTokenBuffer.Length < resultToken.Length)
        {
            resultTokenBuffer = new char[resultToken.Length];
        }
        resultToken.GetChars(0, resultToken.Length, resultTokenBuffer, 0);
        resultToken.Length = 0;
        int idx = delimitersCount - 1 - skip;
        if (idx >= 0)
        {
            // otherwise it's ok, because we will skip and return false
            endPosition = delimiterPositions[idx];
        }
        finalOffset = CorrectOffset(length);
        posAtt.PositionIncrement = 1;
    }
    else
    {
        posAtt.PositionIncrement = 0;
    }

    while (skipped < delimitersCount - skip - 1)
    {
        var start = delimiterPositions[skipped] ?? 0;
        termAtt.CopyBuffer(resultTokenBuffer, start, endPosition - start);
        offsetAtt.SetOffset(CorrectOffset(start), CorrectOffset(endPosition));
        skipped++;
        return true;
    }
    return false;
}
//~ Methods ----------------------------------------------------------------

/// <summary>
/// Returns true for the next token in the stream, or false at EOS.
/// See http://java.sun.com/j2se/1.3/docs/api/java/lang/Character.UnicodeBlock.html
/// for detail.
/// </summary>
/// <returns> false for end of stream, true otherwise </returns>
/// <exception cref="IOException"> when a read error occurs in the input stream </exception>
public override bool IncrementToken()
{
    ClearAttributes();

    while (true) // loop until we find a non-empty token
    {
        int length = 0;     // how many character(s) have been stored in buffer
        int start = offset; // the position used to create Token

        while (true) // loop until we've found a full token
        {
            char c; // current character

            offset++;
            if (bufferIndex >= dataLen)
            {
                dataLen = m_input.Read(ioBuffer, 0, ioBuffer.Length);
                bufferIndex = 0;
            }
            if (dataLen <= 0)
            {
                if (length > 0)
                {
                    if (preIsTokened)
                    {
                        length = 0;
                        preIsTokened = false;
                    }
                    else
                    {
                        offset--;
                    }
                    break;
                }
                else
                {
                    offset--;
                    return false;
                }
            }
            else
            {
                // get current character
                c = ioBuffer[bufferIndex++];
            }

            // if the current character is ASCII or Extended ASCII
            // LUCENENET Port Reference: https://msdn.microsoft.com/en-us/library/20bw873z.aspx#SupportedNamedBlocks
            string charAsString = new string(new char[] { c });
            bool isHalfwidthAndFullwidthForms = Regex.IsMatch(charAsString, @"\p{IsHalfwidthandFullwidthForms}");
            if (Regex.IsMatch(charAsString, @"\p{IsBasicLatin}") || isHalfwidthAndFullwidthForms)
            {
                if (isHalfwidthAndFullwidthForms)
                {
                    int i = (int)c;
                    if (i >= 65281 && i <= 65374)
                    {
                        // convert certain HALFWIDTH_AND_FULLWIDTH_FORMS to BASIC_LATIN
                        i = i - 65248;
                        c = (char)i;
                    }
                }

                // if the current character is a letter or "_" "+" "#"
                if (char.IsLetterOrDigit(c) || c == '_' || c == '+' || c == '#')
                {
                    if (length == 0)
                    {
                        // "javaC1C2C3C4linux"
                        //  ^--: the current character begins to token the ASCII letter
                        start = offset - 1;
                    }
                    else if (tokenType == DOUBLE_TOKEN_TYPE)
                    {
                        // "javaC1C2C3C4linux"
                        //      ^--: the previous non-ASCII
                        //         : the current character
                        offset--;
                        bufferIndex--;
                        if (preIsTokened)
                        {
                            // there is only one non-ASCII character stored
                            length = 0;
                            preIsTokened = false;
                        }
                        break;
                    }

                    // store the LowerCase(c) in the buffer
                    buffer[length++] = char.ToLowerInvariant(c);
                    tokenType = SINGLE_TOKEN_TYPE;

                    // break the procedure if buffer overflowed!
                    if (length == MAX_WORD_LEN)
                    {
                        break;
                    }
                }
                else if (length > 0)
                {
                    if (preIsTokened)
                    {
                        length = 0;
                        preIsTokened = false;
                    }
                    else
                    {
                        break;
                    }
                }
            }
            else
            {
                // non-ASCII letter, e.g. "C1C2C3C4"
                if (Character.IsLetter(c))
                {
                    if (length == 0)
                    {
                        start = offset - 1;
                        buffer[length++] = c;
                        tokenType = DOUBLE_TOKEN_TYPE;
                    }
                    else
                    {
                        if (tokenType == SINGLE_TOKEN_TYPE)
                        {
                            offset--;
                            bufferIndex--;
                            // return the previous ASCII characters
                            break;
                        }
                        else
                        {
                            buffer[length++] = c;
                            tokenType = DOUBLE_TOKEN_TYPE;
                            if (length == 2)
                            {
                                offset--;
                                bufferIndex--;
                                preIsTokened = true;
                                break;
                            }
                        }
                    }
                }
                else if (length > 0)
                {
                    if (preIsTokened)
                    {
                        // empty the buffer
                        length = 0;
                        preIsTokened = false;
                    }
                    else
                    {
                        break;
                    }
                }
            }
        }

        if (length > 0)
        {
            termAtt.CopyBuffer(buffer, 0, length);
            offsetAtt.SetOffset(CorrectOffset(start), CorrectOffset(start + length));
            typeAtt.Type = TOKEN_TYPE_NAMES[tokenType];
            return true;
        }
        else if (dataLen <= 0)
        {
            offset--;
            return false;
        }

        // Cycle back and try for the next token (don't return an empty string)
    }
}
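// A minimal consumption sketch for a CJK-style tokenizer like the one above,
// assuming the (deprecated) Lucene.Net 4.8 CJKTokenizer; the sample text is
// illustrative only:
using System;
using System.IO;
using Lucene.Net.Analysis.Cjk;
using Lucene.Net.Analysis.TokenAttributes;

static void CjkDemo()
{
    using var tokenizer = new CJKTokenizer(new StringReader("学生 java"));
    var term = tokenizer.AddAttribute<ICharTermAttribute>();
    var type = tokenizer.AddAttribute<ITypeAttribute>();
    tokenizer.Reset();
    while (tokenizer.IncrementToken())
    {
        // CJK bigrams come out typed "double"; Latin runs come out typed "single"
        Console.WriteLine($"{term} ({type.Type})");
    }
    tokenizer.End();
}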
/// <summary>
/// Returns true for the next token in the stream, or false at EOS.
/// </summary>
public override sealed bool IncrementToken()
{
    while (true)
    {
        if (curTermBuffer == null)
        {
            if (!m_input.IncrementToken())
            {
                return false;
            }
            else
            {
                curTermBuffer = (char[])termAtt.Buffer.Clone();
                curTermLength = termAtt.Length;
                curCodePointCount = charUtils.CodePointCount(termAtt.ToString());
                curGramSize = minGram;
                curPos = 0;
                curPosInc = posIncAtt.PositionIncrement;
                curPosLen = posLenAtt.PositionLength;
                tokStart = offsetAtt.StartOffset;
                tokEnd = offsetAtt.EndOffset;
                // if length by start + end offsets doesn't match the term text then assume
                // this is a synonym and don't adjust the offsets.
                hasIllegalOffsets = (tokStart + curTermLength) != tokEnd;
            }
        }
#pragma warning disable 612, 618
        if (version.OnOrAfter(LuceneVersion.LUCENE_44))
#pragma warning restore 612, 618
        {
            if (curGramSize > maxGram || (curPos + curGramSize) > curCodePointCount)
            {
                ++curPos;
                curGramSize = minGram;
            }
            if ((curPos + curGramSize) <= curCodePointCount)
            {
                ClearAttributes();
                int start = charUtils.OffsetByCodePoints(curTermBuffer, 0, curTermLength, 0, curPos);
                int end = charUtils.OffsetByCodePoints(curTermBuffer, 0, curTermLength, start, curGramSize);
                termAtt.CopyBuffer(curTermBuffer, start, end - start);
                posIncAtt.PositionIncrement = curPosInc;
                curPosInc = 0;
                posLenAtt.PositionLength = curPosLen;
                offsetAtt.SetOffset(tokStart, tokEnd);
                curGramSize++;
                return true;
            }
        }
        else
        {
            while (curGramSize <= maxGram)
            {
                while (curPos + curGramSize <= curTermLength) // while there is input
                {
                    ClearAttributes();
                    termAtt.CopyBuffer(curTermBuffer, curPos, curGramSize);
                    if (hasIllegalOffsets)
                    {
                        offsetAtt.SetOffset(tokStart, tokEnd);
                    }
                    else
                    {
                        offsetAtt.SetOffset(tokStart + curPos, tokStart + curPos + curGramSize);
                    }
                    curPos++;
                    return true;
                }
                curGramSize++; // increase n-gram size
                curPos = 0;
            }
        }
        curTermBuffer = null;
    }
}
public void Reinit(string stringValue, int startOffset, int endOffset)
{
    termAttribute.SetTermBuffer(stringValue);
    offsetAttribute.SetOffset(startOffset, endOffset);
}
public override bool IncrementToken()
{
    //System.out.println("\nS: incrToken inputSkipCount=" + inputSkipCount + " nextRead=" + nextRead + " nextWrite=" + nextWrite);

    while (true)
    {
        // First play back any buffered future inputs/outputs
        // w/o running parsing again:
        while (inputSkipCount != 0)
        {
            // At each position, we first output the original
            // token

            // TODO: maybe just a PendingState class, holding
            // both input & outputs?
            PendingInput input = futureInputs[nextRead];
            PendingOutputs outputs = futureOutputs[nextRead];

            //System.out.println("  cycle nextRead=" + nextRead + " nextWrite=" + nextWrite + " inputSkipCount=" + inputSkipCount + " input.keepOrig=" + input.keepOrig + " input.consumed=" + input.consumed + " input.state=" + input.state);

            if (!input.consumed && (input.keepOrig || !input.matched))
            {
                if (input.state != null)
                {
                    // Return a previously saved token (because we
                    // had to lookahead):
                    RestoreState(input.state);
                }
                else
                {
                    // Pass-through case: return token we just pulled
                    // but didn't capture:
                    if (Debugging.AssertsEnabled)
                    {
                        Debugging.Assert(inputSkipCount == 1, "inputSkipCount={0} nextRead={1}", inputSkipCount, nextRead);
                    }
                }
                input.Reset();
                if (outputs.count > 0)
                {
                    outputs.posIncr = 0;
                }
                else
                {
                    nextRead = RollIncr(nextRead);
                    inputSkipCount--;
                }
                //System.out.println("  return token=" + termAtt.toString());
                return true;
            }
            else if (outputs.upto < outputs.count)
            {
                // Still have pending outputs to replay at this
                // position
                input.Reset();
                int posIncr = outputs.posIncr;
                CharsRef output = outputs.PullNext();
                ClearAttributes();
                termAtt.CopyBuffer(output.Chars, output.Offset, output.Length);
                typeAtt.Type = TYPE_SYNONYM;
                int endOffset = outputs.LastEndOffset;
                if (endOffset == -1)
                {
                    endOffset = input.endOffset;
                }
                offsetAtt.SetOffset(input.startOffset, endOffset);
                posIncrAtt.PositionIncrement = posIncr;
                posLenAtt.PositionLength = outputs.LastPosLength;
                if (outputs.count == 0)
                {
                    // Done with the buffered input and all outputs at
                    // this position
                    nextRead = RollIncr(nextRead);
                    inputSkipCount--;
                }
                //System.out.println("  return token=" + termAtt.toString());
                return true;
            }
            else
            {
                // Done with the buffered input and all outputs at
                // this position
                input.Reset();
                nextRead = RollIncr(nextRead);
                inputSkipCount--;
            }
        }

        if (finished && nextRead == nextWrite)
        {
            // End case: if any output syns went beyond end of
            // input stream, enumerate them now:
            PendingOutputs outputs = futureOutputs[nextRead];
            if (outputs.upto < outputs.count)
            {
                int posIncr = outputs.posIncr;
                CharsRef output = outputs.PullNext();
                futureInputs[nextRead].Reset();
                if (outputs.count == 0)
                {
                    nextWrite = nextRead = RollIncr(nextRead);
                }
                ClearAttributes();
                // Keep offset from last input token:
                offsetAtt.SetOffset(lastStartOffset, lastEndOffset);
                termAtt.CopyBuffer(output.Chars, output.Offset, output.Length);
                typeAtt.Type = TYPE_SYNONYM;
                //System.out.println("  set posIncr=" + outputs.posIncr + " outputs=" + outputs);
                posIncrAtt.PositionIncrement = posIncr;
                //System.out.println("  return token=" + termAtt.toString());
                return true;
            }
            else
            {
                return false;
            }
        }

        // Find new synonym matches:
        Parse();
    }
}
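// The buffering logic above belongs to the FST-based SynonymFilter. A usage
// sketch, assuming the Lucene.Net 4.8 synonym API; the map entry and sample
// text are illustrative only:
using System.IO;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Core;
using Lucene.Net.Analysis.Synonym;
using Lucene.Net.Util;

static TokenStream BuildSynonymStream(string text)
{
    var builder = new SynonymMap.Builder(true); // true = dedup identical rules
    // map "dns" to "domain name service", keeping the original token
    builder.Add(new CharsRef("dns"),
                SynonymMap.Builder.Join(new[] { "domain", "name", "service" }, new CharsRef()),
                true); // includeOrig
    SynonymMap map = builder.Build();

    var source = new WhitespaceTokenizer(LuceneVersion.LUCENE_48, new StringReader(text));
    return new SynonymFilter(source, map, true); // true = ignore case when matching
}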
/// <summary>
/// Returns true for the next token in the stream, or false at EOS.
/// </summary>
public override bool IncrementToken()
{
    ClearAttributes();
    // if we are just starting, read the whole input
    if (!started)
    {
        started = true;
        gramSize = minGram;
        int limit = side == Side.FRONT ? maxGram : 1024;
        char[] chars = new char[Math.Min(1024, limit)];
        charsRead = 0;
        // TODO: refactor to a shared readFully somewhere:
        bool exhausted = false;
        while (charsRead < limit)
        {
            int inc = m_input.Read(chars, charsRead, chars.Length - charsRead);
            if (inc <= 0)
            {
                exhausted = true;
                break;
            }
            charsRead += inc;
            if (charsRead == chars.Length && charsRead < limit)
            {
                chars = ArrayUtil.Grow(chars);
            }
        }

        inStr = new string(chars, 0, charsRead);
        inStr = inStr.Trim();

        if (!exhausted)
        {
            // Read extra throwaway chars so that on End() we
            // report the correct offset:
            var throwaway = new char[1024];
            while (true)
            {
                int inc = m_input.Read(throwaway, 0, throwaway.Length);
                if (inc <= 0)
                {
                    break;
                }
                charsRead += inc;
            }
        }

        inLen = inStr.Length;
        if (inLen == 0)
        {
            return false;
        }
        posIncrAtt.PositionIncrement = 1;
    }
    else
    {
        posIncrAtt.PositionIncrement = 0;
    }

    // if the remaining input is too short, we can't generate any n-grams
    if (gramSize > inLen)
    {
        return false;
    }

    // if we have hit the end of our n-gram size range, quit
    if (gramSize > maxGram || gramSize > inLen)
    {
        return false;
    }

    // grab gramSize chars from front or back
    int start = side == Side.FRONT ? 0 : inLen - gramSize;
    int end = start + gramSize;
    termAtt.SetEmpty().Append(inStr, start, end - start); // LUCENENET: Corrected 3rd parameter
    offsetAtt.SetOffset(CorrectOffset(start), CorrectOffset(end));
    gramSize++;
    return true;
}
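// An edge n-gram usage sketch, assuming the Lucene.Net 4.8 NGram API (the
// 4.4+ constructor takes only minGram/maxGram and always anchors at the
// front); input text and gram sizes are illustrative only:
using System;
using System.IO;
using Lucene.Net.Analysis.NGram;
using Lucene.Net.Analysis.TokenAttributes;
using Lucene.Net.Util;

static void EdgeNGramDemo()
{
    using var tokenizer = new EdgeNGramTokenizer(LuceneVersion.LUCENE_48, new StringReader("lucene"), 2, 4);
    var term = tokenizer.AddAttribute<ICharTermAttribute>();
    tokenizer.Reset();
    while (tokenizer.IncrementToken())
    {
        Console.WriteLine(term); // "lu", "luc", "luce"
    }
    tokenizer.End();
}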
public sealed override bool IncrementToken()
{
    Debug.Assert(!enableChecks || (streamState == State.RESET || streamState == State.INCREMENT),
                 "IncrementToken() called while in wrong state: " + streamState);
    ClearAttributes();
    for (;;)
    {
        int startOffset;
        int cp;
        if (bufferedCodePoint >= 0)
        {
            cp = bufferedCodePoint;
            startOffset = bufferedOff;
            bufferedCodePoint = -1;
        }
        else
        {
            startOffset = off;
            cp = ReadCodePoint();
        }
        if (cp < 0)
        {
            break;
        }
        else if (IsTokenChar(cp))
        {
            int endOffset;
            do
            {
                char[] chars = Character.ToChars(Normalize(cp));
                for (int i = 0; i < chars.Length; i++)
                {
                    termAtt.Append(chars[i]);
                }
                endOffset = off;
                if (termAtt.Length >= maxTokenLength)
                {
                    break;
                }
                cp = ReadCodePoint();
            } while (cp >= 0 && IsTokenChar(cp));

            if (termAtt.Length < maxTokenLength)
            {
                // buffer up, in case the "rejected" char can start a new word of its own
                bufferedCodePoint = cp;
                bufferedOff = endOffset;
            }
            else
            {
                // otherwise, it's because we hit the term limit.
                bufferedCodePoint = -1;
            }
            int correctedStartOffset = CorrectOffset(startOffset);
            int correctedEndOffset = CorrectOffset(endOffset);
            Assert.True(correctedStartOffset >= 0);
            Assert.True(correctedEndOffset >= 0);
            Assert.True(correctedStartOffset >= lastOffset);
            lastOffset = correctedStartOffset;
            Assert.True(correctedEndOffset >= correctedStartOffset);
            offsetAtt.SetOffset(correctedStartOffset, correctedEndOffset);
            if (state == -1 || runAutomaton.IsAccept(state))
            {
                // either we hit a reject state (longest match), or end-of-text, but in an accept state
                streamState = State.INCREMENT;
                return true;
            }
        }
    }
    streamState = State.INCREMENT_FALSE;
    return false;
}
public override bool IncrementToken()
{
    ClearAttributes();

    string nextToken;
    HebMorph.Tokenizer.TokenType tokenType;

    // Used to loop over certain noise cases
    while (true)
    {
        tokenType = hebMorphTokenizer.NextToken(out nextToken);
        if (tokenType == 0)
        {
            return false; // EOS
        }

        // Ignore "words" which are actually only prefixes in a single word.
        // This first case is easy to spot, since the prefix and the following word will be
        // separated by a dash marked as a construct (סמיכות) by the Tokenizer
        if ((tokenType & HebMorph.Tokenizer.TokenType.Construct) > 0)
        {
            if (IsLegalPrefix(nextToken))
            {
                continue;
            }
        }

        // This second case is a bit more complex. We take a risk of splitting a valid acronym or
        // abbreviated word into two, so we send it to an external function to analyze the word, and
        // get a possibly corrected word. Examples of words we expect to simplify by this operation
        // are ה"שטיח", ש"המידע.
        if ((tokenType & HebMorph.Tokenizer.TokenType.Acronym) > 0)
        {
            nextToken = TryStrippingPrefix(nextToken);

            // Re-detect acronym, in case it was a false positive
            if (nextToken.IndexOf('"') == -1)
            {
                tokenType &= ~HebMorph.Tokenizer.TokenType.Acronym; // clear the acronym flag
            }
        }

        break;
    }

    // Record the term string
    if (termAtt.TermLength() < nextToken.Length)
    {
        termAtt.SetTermBuffer(nextToken);
    }
    else // Perform a copy to save on memory operations
    {
        char[] buf = termAtt.TermBuffer();
        nextToken.CopyTo(0, buf, 0, nextToken.Length);
    }
    termAtt.SetTermLength(nextToken.Length);

    offsetAtt.SetOffset(CorrectOffset(hebMorphTokenizer.Offset),
                        CorrectOffset(hebMorphTokenizer.Offset + hebMorphTokenizer.LengthInSource));

    if ((tokenType & HebMorph.Tokenizer.TokenType.Hebrew) > 0)
    {
        if ((tokenType & HebMorph.Tokenizer.TokenType.Acronym) > 0)
        {
            typeAtt.Type = TokenTypeSignature(TOKEN_TYPES.Acronym);
        }
        if ((tokenType & HebMorph.Tokenizer.TokenType.Construct) > 0)
        {
            typeAtt.Type = TokenTypeSignature(TOKEN_TYPES.Construct);
        }
        else
        {
            typeAtt.Type = TokenTypeSignature(TOKEN_TYPES.Hebrew);
        }
    }
    else if ((tokenType & HebMorph.Tokenizer.TokenType.Numeric) > 0)
    {
        typeAtt.Type = TokenTypeSignature(TOKEN_TYPES.Numeric);
    }
    else
    {
        typeAtt.Type = TokenTypeSignature(TOKEN_TYPES.NonHebrew);
    }

    return true;
}
public override bool IncrementToken()
{
    if (index >= str.Length)
    {
        return false;
    }
    ClearAttributes();
    if (group >= 0)
    {
        // match a specific group
        if (matcher.Success)
        {
            do
            {
                // We have already parsed from this index, go to the next token.
                if (!isReset && matcher.Groups[group].Index == index)
                {
                    continue;
                }
                isReset = false;

                index = matcher.Groups[group].Index;
                int endIndex = matcher.Groups[group].Index + matcher.Groups[group].Length;
                if (index == endIndex)
                {
                    continue;
                }
                termAtt.SetEmpty().Append(str.ToString(), index, endIndex - index); // LUCENENET: Corrected 3rd parameter
                offsetAtt.SetOffset(CorrectOffset(index), CorrectOffset(endIndex));
                return true;
            } while ((matcher = matcher.NextMatch()).Success);
        }
        index = int.MaxValue; // mark exhausted
        return false;
    }
    else
    {
        // String.split() functionality
        if (matcher.Success)
        {
            do
            {
                if (matcher.Index - index > 0)
                {
                    // found a non-zero-length token
                    termAtt.SetEmpty().Append(str.ToString(), index, matcher.Index - index); // LUCENENET: Corrected 3rd parameter
                    offsetAtt.SetOffset(CorrectOffset(index), CorrectOffset(matcher.Index));
                    index = matcher.Index + matcher.Length;
                    return true;
                }
                isReset = false;
                index = matcher.Index + matcher.Length;
            } while ((matcher = matcher.NextMatch()).Success);
        }

        if (str.Length - index == 0)
        {
            index = int.MaxValue; // mark exhausted
            return false;
        }

        termAtt.SetEmpty().Append(str.ToString(), index, str.Length - index); // LUCENENET: Corrected 3rd parameter
        offsetAtt.SetOffset(CorrectOffset(index), CorrectOffset(str.Length));
        index = int.MaxValue; // mark exhausted
        return true;
    }
}
public override void End()
{
    base.End();
    posIncrAtt.PositionIncrement = finalPosInc;
    offsetAtt.SetOffset(finalOffset, finalOffset);
}
/// <summary>
/// <para>Get the next token from the input stream.
/// </para>
/// <para>If the next token has <c>positionIncrement > 1</c>,
/// <c>positionIncrement - 1</c> <see cref="fillerToken"/>s are
/// inserted first.
/// </para>
/// </summary>
/// <param name="target"> Where to put the new token; if null, a new instance is created. </param>
/// <returns> On success, the populated token; null otherwise </returns>
/// <exception cref="IOException"> if the input stream has a problem </exception>
private InputWindowToken GetNextToken(InputWindowToken target)
{
    InputWindowToken newTarget = target;
    if (numFillerTokensToInsert > 0)
    {
        if (null == target)
        {
            newTarget = new InputWindowToken(this, nextInputStreamToken.CloneAttributes());
        }
        else
        {
            nextInputStreamToken.CopyTo(target.attSource);
        }
        // A filler token occupies no space
        newTarget.offsetAtt.SetOffset(newTarget.offsetAtt.StartOffset, newTarget.offsetAtt.StartOffset);
        newTarget.termAtt.CopyBuffer(fillerToken, 0, fillerToken.Length);
        newTarget.isFiller = true;
        --numFillerTokensToInsert;
    }
    else if (isNextInputStreamToken)
    {
        if (null == target)
        {
            newTarget = new InputWindowToken(this, nextInputStreamToken.CloneAttributes());
        }
        else
        {
            nextInputStreamToken.CopyTo(target.attSource);
        }
        isNextInputStreamToken = false;
        newTarget.isFiller = false;
    }
    else if (!exhausted)
    {
        if (m_input.IncrementToken())
        {
            if (null == target)
            {
                newTarget = new InputWindowToken(this, CloneAttributes());
            }
            else
            {
                this.CopyTo(target.attSource);
            }
            if (posIncrAtt.PositionIncrement > 1)
            {
                // Each output shingle must contain at least one input token,
                // so no more than (maxShingleSize - 1) filler tokens will be inserted.
                numFillerTokensToInsert = Math.Min(posIncrAtt.PositionIncrement - 1, maxShingleSize - 1);
                // Save the current token as the next input stream token
                if (null == nextInputStreamToken)
                {
                    nextInputStreamToken = CloneAttributes();
                }
                else
                {
                    this.CopyTo(nextInputStreamToken);
                }
                isNextInputStreamToken = true;
                // A filler token occupies no space
                newTarget.offsetAtt.SetOffset(offsetAtt.StartOffset, offsetAtt.StartOffset);
                newTarget.termAtt.CopyBuffer(fillerToken, 0, fillerToken.Length);
                newTarget.isFiller = true;
                --numFillerTokensToInsert;
            }
            else
            {
                newTarget.isFiller = false;
            }
        }
        else
        {
            exhausted = true;
            m_input.End();
            endState = CaptureState();
            numFillerTokensToInsert = Math.Min(posIncrAtt.PositionIncrement, maxShingleSize - 1);
            if (numFillerTokensToInsert > 0)
            {
                nextInputStreamToken = new AttributeSource(this.GetAttributeFactory());
                nextInputStreamToken.AddAttribute<ICharTermAttribute>();
                IOffsetAttribute newOffsetAtt = nextInputStreamToken.AddAttribute<IOffsetAttribute>();
                newOffsetAtt.SetOffset(offsetAtt.EndOffset, offsetAtt.EndOffset);
                // Recurse/loop just once:
                return GetNextToken(target);
            }
            else
            {
                newTarget = null;
            }
        }
    }
    else
    {
        newTarget = null;
    }
    return newTarget;
}
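// A shingling usage sketch for the filler-token behavior documented above,
// assuming the Lucene.Net 4.8 Shingle API; the filler string and shingle
// sizes are illustrative only:
using System;
using System.IO;
using Lucene.Net.Analysis.Core;
using Lucene.Net.Analysis.Shingle;
using Lucene.Net.Analysis.TokenAttributes;
using Lucene.Net.Util;

static void ShingleDemo()
{
    var source = new WhitespaceTokenizer(LuceneVersion.LUCENE_48, new StringReader("please divide this sentence"));
    using var shingles = new ShingleFilter(source, 2, 3); // emit 2- and 3-word shingles
    shingles.SetFillerToken("_"); // stands in for skipped positions (e.g. removed stop words)
    var term = shingles.AddAttribute<ICharTermAttribute>();
    shingles.Reset();
    while (shingles.IncrementToken())
    {
        Console.WriteLine(term); // "please", "please divide", "please divide this", ...
    }
    shingles.End();
}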
public override void End()
{
    base.End();
    PosIncrAtt.PositionIncrement = FinalPosInc;
    OffsetAtt.SetOffset(FinalOffset, FinalOffset);
}