Example #1
 public override sealed bool IncrementToken()
 {
     if (multiToken > 0)
     {
         termAtt.SetEmpty().Append("multi" + (multiToken + 1));
         offsetAtt.SetOffset(prevStartOffset, prevEndOffset);
         typeAtt.Type = prevType;
         posIncrAtt.PositionIncrement = 0;
         multiToken--;
         return(true);
     }
     else
     {
         bool next = m_input.IncrementToken();
         if (!next)
         {
             return(false);
         }
         prevType        = typeAtt.Type;
         prevStartOffset = offsetAtt.StartOffset;
         prevEndOffset   = offsetAtt.EndOffset;
         String text = termAtt.ToString();
         if (text.Equals("triplemulti"))
         {
             multiToken = 2;
             return(true);
         }
         else if (text.Equals("multi"))
         {
             multiToken = 1;
             return(true);
         }
         else
         {
             return(true);
         }
     }
 }
Example #2
 public override bool IncrementToken()
 {
     if (Lucene.Net.QueryParsers.TestMultiAnalyzer.multiToken > 0)
     {
         termAtt.SetTermBuffer("multi" + (Lucene.Net.QueryParsers.TestMultiAnalyzer.multiToken + 1));
         offsetAtt.SetOffset(prevStartOffset, prevEndOffset);
         typeAtt.Type = prevType;
         posIncrAtt.PositionIncrement = 0;
         Lucene.Net.QueryParsers.TestMultiAnalyzer.multiToken--;
         return(true);
     }
     else
     {
         bool next = input.IncrementToken();
         if (!next)
         {
             return(false);
         }
         prevType        = typeAtt.Type;
         prevStartOffset = offsetAtt.StartOffset;
         prevEndOffset   = offsetAtt.EndOffset;
         System.String text = termAtt.Term;
         if (text.Equals("triplemulti"))
         {
             Lucene.Net.QueryParsers.TestMultiAnalyzer.multiToken = 2;
             return(true);
         }
         else if (text.Equals("multi"))
         {
             Lucene.Net.QueryParsers.TestMultiAnalyzer.multiToken = 1;
             return(true);
         }
         else
         {
             return(true);
         }
     }
 }
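Both variants above set the same three things per token: term text, offsets, and position increment. To see the offsets a filter like this produces, here is a minimal consumer sketch (assuming Lucene.NET 4.8; the analyzer, field name, and input text are illustrative, not taken from the example):

 using System;
 using System.IO;
 using Lucene.Net.Analysis;
 using Lucene.Net.Analysis.Standard;
 using Lucene.Net.Analysis.TokenAttributes;
 using Lucene.Net.Util;

 public static class OffsetDump
 {
     public static void Main()
     {
         var analyzer = new StandardAnalyzer(LuceneVersion.LUCENE_48);
         using (TokenStream ts = analyzer.GetTokenStream("body", new StringReader("multi triplemulti end")))
         {
             ICharTermAttribute termAtt = ts.AddAttribute<ICharTermAttribute>();
             IOffsetAttribute offsetAtt = ts.AddAttribute<IOffsetAttribute>();
             ts.Reset(); // required before the first IncrementToken()
             while (ts.IncrementToken())
             {
                 // StartOffset/EndOffset hold whatever the producer passed to SetOffset()
                 Console.WriteLine("{0} [{1},{2})", termAtt, offsetAtt.StartOffset, offsetAtt.EndOffset);
             }
             ts.End(); // reports the final offset state
         }
     }
 }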
Example #3
            public sealed override bool IncrementToken()
            {
                if (matcher == null)
                {
                    return(false);
                }
                ClearAttributes();
                while (true)
                { // loop takes care of leading and trailing boundary cases
                    int  start = pos;
                    int  end;
                    bool isMatch = matcher.Success;
                    if (isMatch)
                    {
                        end     = matcher.Index;
                        pos     = matcher.Index + matcher.Length;
                        matcher = matcher.NextMatch();
                    }
                    else
                    {
                        end     = str.Length;
                        matcher = null; // we're finished
                    }

                    if (start != end)
                    { // non-empty match (header/trailer)
                        String text = str.Substring(start, end - start);
                        if (toLowerCase)
                        {
                            text = text.ToLower(locale);
                        }
                        termAtt.SetTermBuffer(text);
                        offsetAtt.SetOffset(start, end);
                        return(true);
                    }
                    if (matcher == null)
                    {
                        return(false); // input exhausted; only an empty trailing match remained
                    }
                    // empty leading/inner match: loop again to try the next span
                }
            }
Example #4
            public override bool IncrementToken()
            {
                if (currentToken >= tokens.Length)
                {
                    return(false);
                }
                Token token = tokens[currentToken++];

                ClearAttributes();
                termAtt.SetEmpty().Append(token);
                offsetAtt.SetOffset(token.StartOffset, token.EndOffset);
                BytesRef payload = token.Payload;

                if (payload != null)
                {
                    payloadAtt.Payload = payload;
                }
                posincAtt.PositionIncrement =
                    (currentToken <= 1 ||
                     tokens[currentToken - 1].StartOffset > tokens[currentToken - 2].StartOffset
                    ? 1 : 0);
                return(true);
            }
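The tokens array consumed above is usually canned by the test itself. A plausible construction sketch (assuming Lucene.NET 4.8's Token(string, int, int) constructor and BytesRef; the terms, offsets, and payload bytes are illustrative):

 using Lucene.Net.Analysis;
 using Lucene.Net.Util;

 public static class CannedTokens
 {
     // Term text plus the start/end offsets that IncrementToken() above
     // replays through offsetAtt.SetOffset(...).
     public static Token[] Build() => new[]
     {
         new Token("quick", 0, 5),
         new Token("brown", 6, 11),
         new Token("fox", 12, 15) { Payload = new BytesRef(new byte[] { 1 }) },
     };
 }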
Example #5
 public sealed override bool IncrementToken()
 {
     if (TokenUpto >= OuterInstance.Tokens.Length)
     {
         return(false);
     }
     else
     {
         TestToken testToken = OuterInstance.Tokens[TokenUpto++];
         ClearAttributes();
         TermAtt.Append(testToken.Text);
         OffsetAtt.SetOffset(testToken.StartOffset, testToken.EndOffset);
         if (TokenUpto > 1)
         {
             PosIncrAtt.PositionIncrement = testToken.Pos - OuterInstance.Tokens[TokenUpto - 2].Pos;
         }
         else
         {
             PosIncrAtt.PositionIncrement = testToken.Pos + 1;
         }
         return(true);
     }
 }
Example #6
 public override bool IncrementToken()
 {
     if (tokenUpto >= Enclosing_Instance.tokens.Length)
     {
         return(false);
     }
     else
     {
         TestToken testToken = Enclosing_Instance.tokens[tokenUpto++];
         ClearAttributes();
         termAtt.SetTermBuffer(testToken.text);
         offsetAtt.SetOffset(testToken.startOffset, testToken.endOffset);
         if (tokenUpto > 1)
         {
             posIncrAtt.PositionIncrement = testToken.pos - Enclosing_Instance.tokens[tokenUpto - 2].pos;
         }
         else
         {
             posIncrAtt.PositionIncrement = testToken.pos + 1;
         }
         return(true);
     }
 }
Example #7
        /// <summary>
        /// Constructs a compound token.
        /// </summary>
        private void GramToken()
        {
            buffer.Append(termAttribute.Buffer(), 0, termAttribute.Length);
            int endOffset = offsetAttribute.EndOffset();

            ClearAttributes();

            var length   = buffer.Length;
            var termText = termAttribute.Buffer();

            if (length > termText.Length)
            {
                termText = termAttribute.ResizeBuffer(length);
            }

            buffer.GetChars(0, length, termText, 0);
            termAttribute.Length = length;
            posIncAttribute.PositionIncrement = 0;
            posLenAttribute.PositionLength    = 2; // bigram
            offsetAttribute.SetOffset(lastStartOffset, endOffset);
            typeAttribute.Type = GRAM_TYPE;
            buffer.Length      = 0;
        }
Example #8
 public override bool IncrementToken()
 {
     while (true)
     {
         if (_curTermBuffer == null)
         {
             if (!input.IncrementToken())
             {
                 return(false);
             }
             else
             {
                 _curTermBuffer = (char[])_termAtt.TermBuffer().Clone();
                 _curTermLength = _termAtt.TermLength();
                 _curGramSize   = _minGram;
                 _tokStart      = _offsetAtt.StartOffset;
             }
         }
         if (_curGramSize <= _maxGram)
         {
             if (!(_curGramSize > _curTermLength || // if the remaining input is too short, we can't generate any n-grams
                   _curGramSize > _maxGram))
             {                                      // if we have hit the end of our n-gram size range, quit
                 // grab gramSize chars from front or back
                 int start = _side == Side.Front ? 0 : _curTermLength - _curGramSize;
                 int end   = start + _curGramSize;
                 ClearAttributes();
                 _offsetAtt.SetOffset(_tokStart + start, _tokStart + end);
                 _termAtt.SetTermBuffer(_curTermBuffer, start, _curGramSize);
                 _curGramSize++;
                 return(true);
             }
         }
         _curTermBuffer = null;
     }
 }
Example #9
        protected override bool IncrementWord()
        {
            int start, end;

            UninterruptableMonitor.Enter(syncLock);
            try
            {
                start = wordBreaker.Current;
                if (start == BreakIterator.Done)
                {
                    return(false); // BreakIterator exhausted
                }

                // find the next set of boundaries, skipping over non-tokens
                end = wordBreaker.Next();
                while (end != BreakIterator.Done && !Character.IsLetterOrDigit(Character.CodePointAt(m_buffer, sentenceStart + start, sentenceEnd)))
                {
                    start = end;
                    end   = wordBreaker.Next();
                }

                if (end == BreakIterator.Done)
                {
                    return(false); // BreakIterator exhausted
                }

                ClearAttributes();
                termAtt.CopyBuffer(m_buffer, sentenceStart + start, end - start);
                offsetAtt.SetOffset(CorrectOffset(m_offset + sentenceStart + start), CorrectOffset(m_offset + sentenceStart + end));
                return(true);
            }
            finally
            {
                UninterruptableMonitor.Exit(syncLock);
            }
        }
Example #10
 public override bool IncrementToken()
 {
     while (true)
     {
         if (curTermBuffer == null)
         {
             if (!input.IncrementToken())
             {
                 return(false);
             }
             else
             {
                 curTermBuffer = (char[])termAtt.TermBuffer().Clone();
                 curTermLength = termAtt.TermLength();
                 curGramSize   = minGram;
                 tokStart      = offsetAtt.StartOffset;
             }
         }
         if (curGramSize <= maxGram)
         {
             if (!(curGramSize > curTermLength || // if the remaining input is too short, we can't generate any n-grams
                   curGramSize > maxGram))
             {                                    // if we have hit the end of our n-gram size range, quit
                 // grab gramSize chars from front or back
                 int start = side == Side.FRONT ? 0 : curTermLength - curGramSize;
                 int end   = start + curGramSize;
                 ClearAttributes();
                 offsetAtt.SetOffset(tokStart + start, tokStart + end);
                 termAtt.SetTermBuffer(curTermBuffer, start, curGramSize);
                 curGramSize++;
                 return(true);
             }
         }
         curTermBuffer = null;
     }
 }
Example #11
        public override sealed bool IncrementToken()
        {
            ClearAttributes();
            int length = 0;
            int start  = -1; // this variable is always initialized
            int end    = -1;

            char[] buffer = termAtt.Buffer;
            while (true)
            {
                if (bufferIndex >= dataLen)
                {
                    offset += dataLen;
                    charUtils.Fill(ioBuffer, m_input); // read supplementary char aware with CharacterUtils
                    if (ioBuffer.Length == 0)
                    {
                        dataLen = 0; // so next offset += dataLen won't decrement offset
                        if (length > 0)
                        {
                            break;
                        }
                        else
                        {
                            finalOffset = CorrectOffset(offset);
                            return(false);
                        }
                    }
                    dataLen     = ioBuffer.Length;
                    bufferIndex = 0;
                }
                // use CharacterUtils here to support < 3.1 UTF-16 code unit behavior if the char based methods are gone
                int c         = charUtils.CodePointAt(ioBuffer.Buffer, bufferIndex, ioBuffer.Length);
                int charCount = Character.CharCount(c);
                bufferIndex += charCount;

                if (IsTokenChar(c))  // if it's a token char
                {
                    if (length == 0) // start of token
                    {
                        Debug.Assert(start == -1);
                        start = offset + bufferIndex - charCount;
                        end   = start;
                    } // check if a supplementary could run out of bounds
                    else if (length >= buffer.Length - 1)
                    {
                        buffer = termAtt.ResizeBuffer(2 + length); // make sure a supplementary fits in the buffer
                    }
                    end    += charCount;
                    length += Character.ToChars(Normalize(c), buffer, length); // buffer it, normalized
                    if (length >= MAX_WORD_LEN)                                // buffer overflow! make sure to check for >= surrogate pair could break == test
                    {
                        break;
                    }
                } // at non-Letter w/ chars
                else if (length > 0)
                {
                    break; // return 'em
                }
            }

            termAtt.Length = length;
            Debug.Assert(start != -1);
            offsetAtt.SetOffset(CorrectOffset(start), finalOffset = CorrectOffset(end));
            return(true);
        }
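The CorrectOffset calls above matter as soon as a CharFilter sits under the tokenizer: they map offsets in the filtered text back to positions in the raw input. A small sketch of that interaction (assuming Lucene.NET 4.8's HTMLStripCharFilter and WhitespaceTokenizer; the input text is illustrative):

 using System;
 using System.IO;
 using Lucene.Net.Analysis.CharFilters;
 using Lucene.Net.Analysis.Core;
 using Lucene.Net.Analysis.TokenAttributes;
 using Lucene.Net.Util;

 public static class CorrectOffsetDemo
 {
     public static void Main()
     {
         const string raw = "<b>hello</b> world";
         var tokenizer = new WhitespaceTokenizer(LuceneVersion.LUCENE_48,
             new HTMLStripCharFilter(new StringReader(raw)));
         IOffsetAttribute offsetAtt = tokenizer.AddAttribute<IOffsetAttribute>();
         tokenizer.Reset();
         while (tokenizer.IncrementToken())
         {
             // offsets point into the raw markup because the tokenizer ran them
             // through CorrectOffset(), which consults the underlying CharFilter
             Console.WriteLine("[{0},{1})", offsetAtt.StartOffset, offsetAtt.EndOffset);
         }
         tokenizer.End();
     }
 }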
Example #12
        public override bool IncrementToken()
        {
            ClearAttributes();
            buffer.Length = 0;
            int  ci;
            char ch, pch;
            bool atBegin = true;

            tokenStart = tokenEnd;
            ci         = m_input.Read();
            ch         = (char)ci;

            while (true)
            {
                if (ci == -1)
                {
                    break;
                }
                else if (PUNCTION.IndexOf(ch) != -1)
                {
                    // End of a sentence
                    buffer.Append(ch);
                    tokenEnd++;
                    break;
                }
                else if (atBegin && Utility.SPACES.IndexOf(ch) != -1)
                {
                    tokenStart++;
                    tokenEnd++;
                    ci = m_input.Read();
                    ch = (char)ci;
                }
                else
                {
                    buffer.Append(ch);
                    atBegin = false;
                    tokenEnd++;
                    pch = ch;
                    ci  = m_input.Read();
                    ch  = (char)ci;
                    // Two consecutive whitespace chars, such as CR followed by LF
                    if (Utility.SPACES.IndexOf(ch) != -1 &&
                        Utility.SPACES.IndexOf(pch) != -1)
                    {
                        // buffer.append(ch);
                        tokenEnd++;
                        break;
                    }
                }
            }
            if (buffer.Length == 0)
            {
                return(false);
            }
            else
            {
                termAtt.SetEmpty().Append(buffer);
                offsetAtt.SetOffset(CorrectOffset(tokenStart), CorrectOffset(tokenEnd));
                typeAtt.Type = "sentence";
                return(true);
            }
        }
Example #13
        public override bool IncrementToken()
        {
            ClearAttributes();

            int length = 0;
            int start  = bufferIndex;

            char[] ioBuffer = bufferPool.Allocate();
            try
            {
                char[] buffer = termAtt.TermBuffer();
                while (true)
                {
                    if (bufferIndex >= dataLen)
                    {
                        offset += dataLen;
                        dataLen = input.Read(ioBuffer, 0, ioBuffer.Length);
                        if (dataLen <= 0)
                        {
                            dataLen = 0; // so next offset += dataLen won't decrement offset
                            if (length > 0)
                            {
                                break;
                            }
                            return(false);
                        }
                        bufferIndex = 0;
                    }

                    char c = ioBuffer[bufferIndex++];

                    if (IsTokenChar(c))
                    {
                        // if it's a token char

                        if (length == 0)
                        {
                            // start of token
                            start = offset + bufferIndex - 1;
                        }
                        else if (length == buffer.Length)
                        {
                            buffer = termAtt.ResizeTermBuffer(1 + length);
                        }

                        buffer[length++] = Normalize(c); // buffer it, normalized
                    }
                    else if (length > 0)
                    {
                        // at non-Letter w/ chars
                        break; // return 'em
                    }
                }

                termAtt.SetTermLength(length);
                offsetAtt.SetOffset(CorrectOffset(start), CorrectOffset(start + length));
            }
            finally
            {
                if (ioBuffer != null)
                {
                    bufferPool.Free(ioBuffer);
                }
            }

            return(true);
        }
Example #14
        /// <summary>
        /// Returns true for the next token in the stream, or false at EOS. </summary>
        public override bool IncrementToken()
        {
            ClearAttributes();
            if (!started)
            {
                started  = true;
                gramSize = minGram;
                char[] chars = new char[1024];
                charsRead = 0;
                // TODO: refactor to a shared readFully somewhere:
                while (charsRead < chars.Length)
                {
                    int inc = m_input.Read(chars, charsRead, chars.Length - charsRead);
                    if (inc == -1)
                    {
                        break;
                    }
                    charsRead += inc;
                }
                inStr = (new string(chars, 0, charsRead)).Trim(); // trim leading/trailing whitespace

                if (charsRead == chars.Length)
                {
                    // Read extra throwaway chars so that on end() we
                    // report the correct offset:
                    var throwaway = new char[1024];
                    while (true)
                    {
                        int inc = m_input.Read(throwaway, 0, throwaway.Length);
                        if (inc == -1)
                        {
                            break;
                        }
                        charsRead += inc;
                    }
                }

                inLen = inStr.Length;
                if (inLen == 0)
                {
                    return(false);
                }
            }

            if (pos + gramSize > inLen) // if we hit the end of the string
            {
                pos = 0;                // reset to beginning of string
                gramSize++;             // increase n-gram size
                if (gramSize > maxGram) // we are done
                {
                    return(false);
                }
                if (pos + gramSize > inLen)
                {
                    return(false);
                }
            }

            int oldPos = pos;

            pos++;
            termAtt.SetEmpty().Append(inStr, oldPos, gramSize); // LUCENENET: Corrected 3rd parameter
            offsetAtt.SetOffset(CorrectOffset(oldPos), CorrectOffset(oldPos + gramSize));
            return(true);
        }
Example #15
        public override void  CopyTo(Attribute target)
        {
            IOffsetAttribute t = (IOffsetAttribute)target;

            t.SetOffset(startOffset, endOffset);
        }
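CopyTo like this is what CaptureState/RestoreState rely on to snapshot offsets between attribute sources. For a self-contained flavor of the SetOffset pattern itself, here is a hypothetical filter (not taken from any example on this page; it assumes Lucene.NET 4.8's TokenFilter with its protected m_input field) that strips one leading '#' while keeping offsets anchored to the original text:

 using System;
 using Lucene.Net.Analysis;
 using Lucene.Net.Analysis.TokenAttributes;

 public sealed class HashStripFilter : TokenFilter
 {
     private readonly ICharTermAttribute termAtt;
     private readonly IOffsetAttribute offsetAtt;

     public HashStripFilter(TokenStream input)
         : base(input)
     {
         termAtt = AddAttribute<ICharTermAttribute>();
         offsetAtt = AddAttribute<IOffsetAttribute>();
     }

     public override bool IncrementToken()
     {
         if (!m_input.IncrementToken())
         {
             return false;
         }
         if (termAtt.Length > 0 && termAtt.Buffer[0] == '#')
         {
             // Shift the term text left by one char; Array.Copy handles the overlap.
             Array.Copy(termAtt.Buffer, 1, termAtt.Buffer, 0, termAtt.Length - 1);
             termAtt.Length--;
             // Only the start offset moves; the end still points at the raw text.
             offsetAtt.SetOffset(offsetAtt.StartOffset + 1, offsetAtt.EndOffset);
         }
         return true;
     }
 }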
Example #16
        public override sealed bool IncrementToken()
        {
            ClearAttributes();
            termAtt.Append(resultToken);
            if (resultToken.Length == 0)
            {
                posAtt.PositionIncrement = 1;
            }
            else
            {
                posAtt.PositionIncrement = 0;
            }
            int  length = 0;
            bool added  = false;

            if (endDelimiter)
            {
                termAtt.Append(replacement);
                length++;
                endDelimiter = false;
                added        = true;
            }

            while (true)
            {
                int c = m_input.Read();
                if (c >= 0)
                {
                    charsRead++;
                }
                else
                {
                    if (skipped > skip)
                    {
                        length        += resultToken.Length;
                        termAtt.Length = length;
                        offsetAtt.SetOffset(CorrectOffset(startPosition), CorrectOffset(startPosition + length));
                        if (added)
                        {
                            resultToken.Length = 0;
                            resultToken.Append(termAtt.Buffer, 0, length);
                        }
                        return(added);
                    }
                    else
                    {
                        return(false);
                    }
                }
                if (!added)
                {
                    added = true;
                    skipped++;
                    if (skipped > skip)
                    {
                        termAtt.Append(c == delimiter ? replacement : (char)c);
                        length++;
                    }
                    else
                    {
                        startPosition++;
                    }
                }
                else
                {
                    if (c == delimiter)
                    {
                        if (skipped > skip)
                        {
                            endDelimiter = true;
                            break;
                        }
                        skipped++;
                        if (skipped > skip)
                        {
                            termAtt.Append(replacement);
                            length++;
                        }
                        else
                        {
                            startPosition++;
                        }
                    }
                    else
                    {
                        if (skipped > skip)
                        {
                            termAtt.Append((char)c);
                            length++;
                        }
                        else
                        {
                            startPosition++;
                        }
                    }
                }
            }
            length        += resultToken.Length;
            termAtt.Length = length;
            offsetAtt.SetOffset(CorrectOffset(startPosition), CorrectOffset(startPosition + length));
            resultToken.Length = 0;
            resultToken.Append(termAtt.Buffer, 0, length);
            return(true);
        }
Example #17
        /*
         * Need to worry about multiple scenarios:
         *  - need to go for the longest match
         *    a b => foo      #shouldn't match if "a b" is followed by "c d"
         *    a b c d => bar
         *  - need to backtrack - retry matches for tokens already read
         *     a b c d => foo
         *       b c => bar
         *     If the input stream is "a b c x", one will consume "a b c d"
         *     trying to match the first rule... all but "a" should be
         *     pushed back so a match may be made on "b c".
         *  - don't try and match generated tokens (thus need separate queue)
         *    matching is not recursive.
         *  - handle optional generation of original tokens in all these cases,
         *    merging token streams to preserve token positions.
         *  - preserve original positionIncrement of first matched token
         */
        public override bool IncrementToken()
        {
            while (true)
            {
                // if there are any generated tokens, return them... don't try any
                // matches against them, as we specifically don't want recursion.
                if (replacement != null && replacement.MoveNext())
                {
                    Copy(this, replacement.Current);
                    return(true);
                }

                // common case fast-path of first token not matching anything
                AttributeSource firstTok = NextTok();
                if (firstTok == null)
                {
                    return(false);
                }
                var            termAtt = firstTok.AddAttribute <ICharTermAttribute>();
                SlowSynonymMap result  = map.Submap != null ? map.Submap.Get(termAtt.Buffer, 0, termAtt.Length) : null;

                if (result == null)
                {
                    Copy(this, firstTok);
                    return(true);
                }

                // fast-path failed, clone ourselves if needed
                if (firstTok == this)
                {
                    firstTok = CloneAttributes();
                }
                // OK, we matched a token, so find the longest match.

                matched = new LinkedList <AttributeSource>();

                result = Match(result);

                if (result == null)
                {
                    // no match, simply return the first token read.
                    Copy(this, firstTok);
                    return(true);
                }

                // reuse, or create new one each time?
                IList <AttributeSource> generated = new JCG.List <AttributeSource>(result.Synonyms.Length + matched.Count + 1);

                //
                // there was a match... let's generate the new tokens, merging
                // in the matched tokens (position increments need adjusting)
                //
                AttributeSource lastTok     = matched.Count == 0 ? firstTok : matched.Last.Value;
                bool            includeOrig = result.IncludeOrig;

                AttributeSource             origTok        = includeOrig ? firstTok : null;
                IPositionIncrementAttribute firstPosIncAtt = firstTok.AddAttribute <IPositionIncrementAttribute>();
                int origPos = firstPosIncAtt.PositionIncrement; // position of origTok in the original stream
                int repPos  = 0;                                // curr position in replacement token stream
                int pos     = 0;                                // current position in merged token stream

                for (int i = 0; i < result.Synonyms.Length; i++)
                {
                    Token                       repTok       = result.Synonyms[i];
                    AttributeSource             newTok       = firstTok.CloneAttributes();
                    ICharTermAttribute          newTermAtt   = newTok.AddAttribute <ICharTermAttribute>();
                    IOffsetAttribute            newOffsetAtt = newTok.AddAttribute <IOffsetAttribute>();
                    IPositionIncrementAttribute newPosIncAtt = newTok.AddAttribute <IPositionIncrementAttribute>();

                    IOffsetAttribute lastOffsetAtt = lastTok.AddAttribute <IOffsetAttribute>();

                    newOffsetAtt.SetOffset(newOffsetAtt.StartOffset, lastOffsetAtt.EndOffset);
                    newTermAtt.CopyBuffer(repTok.Buffer, 0, repTok.Length);
                    repPos += repTok.PositionIncrement;
                    if (i == 0) // make position of first token equal to original
                    {
                        repPos = origPos;
                    }

                    // if necessary, insert original tokens and adjust position increment
                    while (origTok != null && origPos <= repPos)
                    {
                        IPositionIncrementAttribute origPosInc = origTok.AddAttribute <IPositionIncrementAttribute>();
                        origPosInc.PositionIncrement = origPos - pos;
                        generated.Add(origTok);
                        pos += origPosInc.PositionIncrement;
                        //origTok = matched.Count == 0 ? null : matched.RemoveFirst();
                        if (matched.Count == 0)
                        {
                            origTok = null;
                        }
                        else
                        {
                            origTok = matched.First.Value;
                            matched.Remove(origTok);
                        }
                        if (origTok != null)
                        {
                            origPosInc = origTok.AddAttribute <IPositionIncrementAttribute>();
                            origPos   += origPosInc.PositionIncrement;
                        }
                    }

                    newPosIncAtt.PositionIncrement = repPos - pos;
                    generated.Add(newTok);
                    pos += newPosIncAtt.PositionIncrement;
                }

                // finish up any leftover original tokens
                while (origTok != null)
                {
                    IPositionIncrementAttribute origPosInc = origTok.AddAttribute <IPositionIncrementAttribute>();
                    origPosInc.PositionIncrement = origPos - pos;
                    generated.Add(origTok);
                    pos += origPosInc.PositionIncrement;
                    if (matched.Count == 0)
                    {
                        origTok = null;
                    }
                    else
                    {
                        origTok = matched.First.Value;
                        matched.Remove(origTok);
                    }
                    if (origTok != null)
                    {
                        origPosInc = origTok.AddAttribute <IPositionIncrementAttribute>();
                        origPos   += origPosInc.PositionIncrement;
                    }
                }

                // what if we replaced a longer sequence with a shorter one?
                // a/0 b/5 =>  foo/0
                // should I re-create the gap on the next buffered token?

                replacement = generated.GetEnumerator();
                // Now return to the top of the loop to read and return the first
                // generated token.. The reason this is done is that we may have generated
                // nothing at all, and may need to continue with more matching logic.
            }
        }
Example #18
        public override bool IncrementToken()
        {
            if (hasMoreTokensInClone)
            {
                int start = breaker.Current;
                int end   = breaker.Next();
                if (end != BreakIterator.Done)
                {
                    clonedToken.CopyTo(this);
                    termAtt.CopyBuffer(clonedTermAtt.Buffer, start, end - start);
                    if (hasIllegalOffsets)
                    {
                        offsetAtt.SetOffset(clonedOffsetAtt.StartOffset, clonedOffsetAtt.EndOffset);
                    }
                    else
                    {
                        offsetAtt.SetOffset(clonedOffsetAtt.StartOffset + start, clonedOffsetAtt.StartOffset + end);
                    }
                    if (handlePosIncr)
                    {
                        posAtt.PositionIncrement = 1;
                    }
                    return(true);
                }
                hasMoreTokensInClone = false;
            }

            if (!m_input.IncrementToken())
            {
                return(false);
            }

            if (termAtt.Length == 0 || !thaiPattern.IsMatch(string.Empty + termAtt[0]))
            {
                return(true);
            }

            hasMoreTokensInClone = true;

            // if length by start + end offsets doesn't match the term text then assume
            // this is a synonym and don't adjust the offsets.
            hasIllegalOffsets = offsetAtt.EndOffset - offsetAtt.StartOffset != termAtt.Length;

            // we lazy init the cloned token, as in ctor not all attributes may be added
            if (clonedToken == null)
            {
                clonedToken     = CloneAttributes();
                clonedTermAtt   = clonedToken.GetAttribute <ICharTermAttribute>();
                clonedOffsetAtt = clonedToken.GetAttribute <IOffsetAttribute>();
            }
            else
            {
                this.CopyTo(clonedToken);
            }

            // reinit CharacterIterator
            charIterator.SetText(clonedTermAtt.Buffer, 0, clonedTermAtt.Length);
            breaker.SetText(new string(charIterator.Text, charIterator.Start, charIterator.Length));
            int end2 = breaker.Next();

            if (end2 != BreakIterator.Done)
            {
                termAtt.Length = end2;
                if (hasIllegalOffsets)
                {
                    offsetAtt.SetOffset(clonedOffsetAtt.StartOffset, clonedOffsetAtt.EndOffset);
                }
                else
                {
                    offsetAtt.SetOffset(clonedOffsetAtt.StartOffset, clonedOffsetAtt.StartOffset + end2);
                }
                // position increment keeps as it is for first token
                return(true);
            }
            return(false);
        }
Example #19
        public override bool IncrementToken()
        {
            ClearAttributes();
            if (delimitersCount == -1)
            {
                int length = 0;
                delimiterPositions.Add(0);
                while (true)
                {
                    int c = input.Read();
                    if (c < 0)
                    {
                        break;
                    }
                    length++;
                    if (c == delimiter)
                    {
                        delimiterPositions.Add(length);
                        resultToken.Append(replacement);
                    }
                    else
                    {
                        resultToken.Append((char)c);
                    }
                }
                delimitersCount = delimiterPositions.Count;
                if (delimiterPositions[delimitersCount - 1] < length)
                {
                    delimiterPositions.Add(length);
                    delimitersCount++;
                }
                if (resultTokenBuffer.Length < resultToken.Length)
                {
                    resultTokenBuffer = new char[resultToken.Length];
                }
                resultToken.GetChars(0, resultToken.Length, resultTokenBuffer, 0);
                resultToken.Length = 0;
                int idx = delimitersCount - 1 - skip;
                if (idx >= 0)
                {
                    // otherwise it's OK, because we will skip and return false
                    endPosition = delimiterPositions[idx];
                }
                finalOffset = CorrectOffset(length);
                posAtt.PositionIncrement = 1;
            }
            else
            {
                posAtt.PositionIncrement = 0;
            }

            while (skipped < delimitersCount - skip - 1)
            {
                var start = delimiterPositions[skipped] ?? 0;
                termAtt.CopyBuffer(resultTokenBuffer, start, endPosition - start);
                offsetAtt.SetOffset(CorrectOffset(start), CorrectOffset(endPosition));
                skipped++;
                return(true);
            }

            return(false);
        }
Example #20
        //~ Methods ----------------------------------------------------------------

        /// <summary>
        /// Returns true for the next token in the stream, or false at EOS.
        /// See http://java.sun.com/j2se/1.3/docs/api/java/lang/Character.UnicodeBlock.html
        /// for detail.
        /// </summary>
        /// <returns> false for end of stream, true otherwise
        /// </returns>
        /// <exception cref="IOException"> when a read error
        ///         occurs in the input stream
        ///  </exception>
        public override bool IncrementToken()
        {
            ClearAttributes();

            // how many characters have been stored in the buffer

            while (true) // loop until we find a non-empty token
            {
                int length = 0;

                // the position used to create Token
                int start = offset;

                while (true) // loop until we've found a full token
                {
                    // current character
                    char c;

                    offset++;

                    if (bufferIndex >= dataLen)
                    {
                        dataLen     = m_input.Read(ioBuffer, 0, ioBuffer.Length);
                        bufferIndex = 0;
                    }

                    if (dataLen <= 0)
                    {
                        if (length > 0)
                        {
                            if (preIsTokened)
                            {
                                length       = 0;
                                preIsTokened = false;
                            }
                            else
                            {
                                offset--;
                            }

                            break;
                        }
                        else
                        {
                            offset--;
                            return(false);
                        }
                    }
                    else
                    {
                        //get current character
                        c = ioBuffer[bufferIndex++];
                    }

                    //if the current character is ASCII or Extend ASCII
                    // LUCENENET Port Reference: https://msdn.microsoft.com/en-us/library/20bw873z.aspx#SupportedNamedBlocks
                    string charAsString = new string(new char[] { c });
                    bool   isHalfwidthAndFullwidthForms = Regex.IsMatch(charAsString, @"\p{IsHalfwidthandFullwidthForms}");
                    if (Regex.IsMatch(charAsString, @"\p{IsBasicLatin}") || isHalfwidthAndFullwidthForms)
                    {
                        if (isHalfwidthAndFullwidthForms)
                        {
                            int i = (int)c;
                            if (i >= 65281 && i <= 65374)
                            {
                                // convert certain HALFWIDTH_AND_FULLWIDTH_FORMS to BASIC_LATIN
                                i = i - 65248;
                                c = (char)i;
                            }
                        }

                        // if the current character is a letter or "_" "+" "#"
                        if (char.IsLetterOrDigit(c) || ((c == '_') || (c == '+') || (c == '#')))
                        {
                            if (length == 0)
                            {
                                // "javaC1C2C3C4linux" <br>
                                //      ^--: the current character begin to token the ASCII
                                // letter
                                start = offset - 1;
                            }
                            else if (tokenType == DOUBLE_TOKEN_TYPE)
                            {
                                // "javaC1C2C3C4linux" <br>
                                //              ^--: the previous non-ASCII
                                // : the current character
                                offset--;
                                bufferIndex--;

                                if (preIsTokened)
                                {
                                    // there is only one non-ASCII has been stored
                                    length       = 0;
                                    preIsTokened = false;
                                    break;
                                }
                                else
                                {
                                    break;
                                }
                            }

                            // store the LowerCase(c) in the buffer
                            buffer[length++] = char.ToLowerInvariant(c);
                            tokenType        = SINGLE_TOKEN_TYPE;

                            // break the procedure if buffer overflowed!
                            if (length == MAX_WORD_LEN)
                            {
                                break;
                            }
                        }
                        else if (length > 0)
                        {
                            if (preIsTokened)
                            {
                                length       = 0;
                                preIsTokened = false;
                            }
                            else
                            {
                                break;
                            }
                        }
                    }
                    else
                    {
                        // non-ASCII letter, e.g."C1C2C3C4"
                        if (Character.IsLetter(c))
                        {
                            if (length == 0)
                            {
                                start            = offset - 1;
                                buffer[length++] = c;
                                tokenType        = DOUBLE_TOKEN_TYPE;
                            }
                            else
                            {
                                if (tokenType == SINGLE_TOKEN_TYPE)
                                {
                                    offset--;
                                    bufferIndex--;

                                    //return the previous ASCII characters
                                    break;
                                }
                                else
                                {
                                    buffer[length++] = c;
                                    tokenType        = DOUBLE_TOKEN_TYPE;

                                    if (length == 2)
                                    {
                                        offset--;
                                        bufferIndex--;
                                        preIsTokened = true;

                                        break;
                                    }
                                }
                            }
                        }
                        else if (length > 0)
                        {
                            if (preIsTokened)
                            {
                                // empty the buffer
                                length       = 0;
                                preIsTokened = false;
                            }
                            else
                            {
                                break;
                            }
                        }
                    }
                }

                if (length > 0)
                {
                    termAtt.CopyBuffer(buffer, 0, length);
                    offsetAtt.SetOffset(CorrectOffset(start), CorrectOffset(start + length));
                    typeAtt.Type = TOKEN_TYPE_NAMES[tokenType];
                    return(true);
                }
                else if (dataLen <= 0)
                {
                    offset--;
                    return(false);
                }

                // Cycle back and try for the next token (don't
                // return an empty string)
            }
        }
Example #21
        /// <summary>
        /// Returns true for the next token in the stream, or false at EOS.
        /// </summary>
        public override sealed bool IncrementToken()
        {
            while (true)
            {
                if (curTermBuffer == null)
                {
                    if (!m_input.IncrementToken())
                    {
                        return(false);
                    }
                    else
                    {
                        curTermBuffer     = (char[])termAtt.Buffer.Clone();
                        curTermLength     = termAtt.Length;
                        curCodePointCount = charUtils.CodePointCount(termAtt.ToString());
                        curGramSize       = minGram;
                        curPos            = 0;
                        curPosInc         = posIncAtt.PositionIncrement;
                        curPosLen         = posLenAtt.PositionLength;
                        tokStart          = offsetAtt.StartOffset;
                        tokEnd            = offsetAtt.EndOffset;
                        // if length by start + end offsets doesn't match the term text then assume
                        // this is a synonym and don't adjust the offsets.
                        hasIllegalOffsets = (tokStart + curTermLength) != tokEnd;
                    }
                }
#pragma warning disable 612, 618
                if (version.OnOrAfter(LuceneVersion.LUCENE_44))
#pragma warning restore 612, 618
                {
                    if (curGramSize > maxGram || (curPos + curGramSize) > curCodePointCount)
                    {
                        ++curPos;
                        curGramSize = minGram;
                    }
                    if ((curPos + curGramSize) <= curCodePointCount)
                    {
                        ClearAttributes();
                        int start = charUtils.OffsetByCodePoints(curTermBuffer, 0, curTermLength, 0, curPos);
                        int end   = charUtils.OffsetByCodePoints(curTermBuffer, 0, curTermLength, start, curGramSize);
                        termAtt.CopyBuffer(curTermBuffer, start, end - start);
                        posIncAtt.PositionIncrement = curPosInc;
                        curPosInc = 0;
                        posLenAtt.PositionLength = curPosLen;
                        offsetAtt.SetOffset(tokStart, tokEnd);
                        curGramSize++;
                        return(true);
                    }
                }
                else
                {
                    while (curGramSize <= maxGram)
                    {
                        while (curPos + curGramSize <= curTermLength) // while there is input
                        {
                            ClearAttributes();
                            termAtt.CopyBuffer(curTermBuffer, curPos, curGramSize);
                            if (hasIllegalOffsets)
                            {
                                offsetAtt.SetOffset(tokStart, tokEnd);
                            }
                            else
                            {
                                offsetAtt.SetOffset(tokStart + curPos, tokStart + curPos + curGramSize);
                            }
                            curPos++;
                            return(true);
                        }
                        curGramSize++; // increase n-gram size
                        curPos = 0;
                    }
                }
                curTermBuffer = null;
            }
        }
Example #22
 public void  Reinit(string stringValue, int startOffset, int endOffset)
 {
     termAttribute.SetTermBuffer(stringValue);
     offsetAttribute.SetOffset(startOffset, endOffset);
 }
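The same reset-in-place idea is available on Lucene's reusable Token class, which implements both attribute interfaces. A minimal sketch (assuming Lucene.NET 4.8; the values are illustrative):

 using Lucene.Net.Analysis;

 public static class TokenReuse
 {
     public static Token Demo()
     {
         var token = new Token();
         token.SetEmpty().Append("first");
         token.SetOffset(0, 5);
         // Reuse the same instance for the next value: term text and offsets
         // are overwritten in place, mirroring Reinit(...) above.
         token.SetEmpty().Append("second");
         token.SetOffset(6, 12);
         return token;
     }
 }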
Example #23
        public override bool IncrementToken()
        {
            //System.out.println("\nS: incrToken inputSkipCount=" + inputSkipCount + " nextRead=" + nextRead + " nextWrite=" + nextWrite);

            while (true)
            {
                // First play back any buffered future inputs/outputs
                // w/o running parsing again:
                while (inputSkipCount != 0)
                {
                    // At each position, we first output the original
                    // token

                    // TODO: maybe just a PendingState class, holding
                    // both input & outputs?
                    PendingInput   input   = futureInputs[nextRead];
                    PendingOutputs outputs = futureOutputs[nextRead];

                    //System.out.println("  cycle nextRead=" + nextRead + " nextWrite=" + nextWrite + " inputSkipCount="+ inputSkipCount + " input.keepOrig=" + input.keepOrig + " input.consumed=" + input.consumed + " input.state=" + input.state);

                    if (!input.consumed && (input.keepOrig || !input.matched))
                    {
                        if (input.state != null)
                        {
                            // Return a previously saved token (because we
                            // had to lookahead):
                            RestoreState(input.state);
                        }
                        else
                        {
                            // Pass-through case: return token we just pulled
                            // but didn't capture:
                            if (Debugging.AssertsEnabled)
                            {
                                Debugging.Assert(inputSkipCount == 1, "inputSkipCount={0} nextRead={1}", inputSkipCount, nextRead);
                            }
                        }
                        input.Reset();
                        if (outputs.count > 0)
                        {
                            outputs.posIncr = 0;
                        }
                        else
                        {
                            nextRead = RollIncr(nextRead);
                            inputSkipCount--;
                        }
                        //System.out.println("  return token=" + termAtt.toString());
                        return(true);
                    }
                    else if (outputs.upto < outputs.count)
                    {
                        // Still have pending outputs to replay at this
                        // position
                        input.Reset();
                        int      posIncr = outputs.posIncr;
                        CharsRef output  = outputs.PullNext();
                        ClearAttributes();
                        termAtt.CopyBuffer(output.Chars, output.Offset, output.Length);
                        typeAtt.Type = TYPE_SYNONYM;
                        int endOffset = outputs.LastEndOffset;
                        if (endOffset == -1)
                        {
                            endOffset = input.endOffset;
                        }
                        offsetAtt.SetOffset(input.startOffset, endOffset);
                        posIncrAtt.PositionIncrement = posIncr;
                        posLenAtt.PositionLength     = outputs.LastPosLength;
                        if (outputs.count == 0)
                        {
                            // Done with the buffered input and all outputs at
                            // this position
                            nextRead = RollIncr(nextRead);
                            inputSkipCount--;
                        }
                        //System.out.println("  return token=" + termAtt.toString());
                        return(true);
                    }
                    else
                    {
                        // Done with the buffered input and all outputs at
                        // this position
                        input.Reset();
                        nextRead = RollIncr(nextRead);
                        inputSkipCount--;
                    }
                }

                if (finished && nextRead == nextWrite)
                {
                    // End case: if any output syns went beyond end of
                    // input stream, enumerate them now:
                    PendingOutputs outputs = futureOutputs[nextRead];
                    if (outputs.upto < outputs.count)
                    {
                        int      posIncr = outputs.posIncr;
                        CharsRef output  = outputs.PullNext();
                        futureInputs[nextRead].Reset();
                        if (outputs.count == 0)
                        {
                            nextWrite = nextRead = RollIncr(nextRead);
                        }
                        ClearAttributes();
                        // Keep offset from last input token:
                        offsetAtt.SetOffset(lastStartOffset, lastEndOffset);
                        termAtt.CopyBuffer(output.Chars, output.Offset, output.Length);
                        typeAtt.Type = TYPE_SYNONYM;
                        //System.out.println("  set posIncr=" + outputs.posIncr + " outputs=" + outputs);
                        posIncrAtt.PositionIncrement = posIncr;
                        //System.out.println("  return token=" + termAtt.toString());
                        return(true);
                    }
                    else
                    {
                        return(false);
                    }
                }

                // Find new synonym matches:
                Parse();
            }
        }
Example #24
        /// <summary>
        /// Returns true for the next token in the stream, or false at EOS. </summary>
        public override bool IncrementToken()
        {
            ClearAttributes();
            // if we are just starting, read the whole input
            if (!started)
            {
                started  = true;
                gramSize = minGram;
                int    limit = side == Side.FRONT ? maxGram : 1024;
                char[] chars = new char[Math.Min(1024, limit)];
                charsRead = 0;
                // TODO: refactor to a shared readFully somewhere:
                bool exhausted = false;
                while (charsRead < limit)
                {
                    int inc = m_input.Read(chars, charsRead, chars.Length - charsRead);
                    if (inc <= 0)
                    {
                        exhausted = true;
                        break;
                    }
                    charsRead += inc;
                    if (charsRead == chars.Length && charsRead < limit)
                    {
                        chars = ArrayUtil.Grow(chars);
                    }
                }

                inStr = new string(chars, 0, charsRead);
                inStr = inStr.Trim();

                if (!exhausted)
                {
                    // Read extra throwaway chars so that on end() we
                    // report the correct offset:
                    var throwaway = new char[1024];
                    while (true)
                    {
                        int inc = m_input.Read(throwaway, 0, throwaway.Length);
                        if (inc <= 0)
                        {
                            break;
                        }
                        charsRead += inc;
                    }
                }

                inLen = inStr.Length;
                if (inLen == 0)
                {
                    return(false);
                }
                posIncrAtt.PositionIncrement = 1;
            }
            else
            {
                posIncrAtt.PositionIncrement = 0;
            }

            // if the remaining input is too short, we can't generate any n-grams
            if (gramSize > inLen)
            {
                return(false);
            }

            // if we have hit the end of our n-gram size range, quit
            if (gramSize > maxGram || gramSize > inLen)
            {
                return(false);
            }

            // grab gramSize chars from front or back
            int start = side == Side.FRONT ? 0 : inLen - gramSize;
            int end   = start + gramSize;

            termAtt.SetEmpty().Append(inStr, start, end - start); // LUCENENET: Corrected 3rd parameter
            offsetAtt.SetOffset(CorrectOffset(start), CorrectOffset(end));
            gramSize++;
            return(true);
        }
Example #25
        public sealed override bool IncrementToken()
        {
            Debug.Assert(!enableChecks || (streamState == State.RESET || streamState == State.INCREMENT), "IncrementToken() called while in wrong state: " + streamState);
            ClearAttributes();
            for (; ;)
            {
                int startOffset;
                int cp;
                if (bufferedCodePoint >= 0)
                {
                    cp                = bufferedCodePoint;
                    startOffset       = bufferedOff;
                    bufferedCodePoint = -1;
                }
                else
                {
                    startOffset = off;
                    cp          = ReadCodePoint();
                }
                if (cp < 0)
                {
                    break;
                }
                else if (IsTokenChar(cp))
                {
                    int endOffset;
                    do
                    {
                        char[] chars = Character.ToChars(Normalize(cp));
                        for (int i = 0; i < chars.Length; i++)
                        {
                            termAtt.Append(chars[i]);
                        }
                        endOffset = off;
                        if (termAtt.Length >= maxTokenLength)
                        {
                            break;
                        }
                        cp = ReadCodePoint();
                    } while (cp >= 0 && IsTokenChar(cp));

                    if (termAtt.Length < maxTokenLength)
                    {
                        // buffer up, in case the "rejected" char can start a new word of its own
                        bufferedCodePoint = cp;
                        bufferedOff       = endOffset;
                    }
                    else
                    {
                        // otherwise, it's because we hit the term length limit.
                        bufferedCodePoint = -1;
                    }
                    int correctedStartOffset = CorrectOffset(startOffset);
                    int correctedEndOffset   = CorrectOffset(endOffset);
                    Assert.True(correctedStartOffset >= 0);
                    Assert.True(correctedEndOffset >= 0);
                    Assert.True(correctedStartOffset >= lastOffset);
                    lastOffset = correctedStartOffset;
                    Assert.True(correctedEndOffset >= correctedStartOffset);
                    offsetAtt.SetOffset(correctedStartOffset, correctedEndOffset);
                    if (state == -1 || runAutomaton.IsAccept(state))
                    {
                        // either we hit a reject state (longest match), or end-of-text, but in an accept state
                        streamState = State.INCREMENT;
                        return(true);
                    }
                }
            }
            streamState = State.INCREMENT_FALSE;
            return(false);
        }
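The streamState assertions above guard the canonical TokenStream lifecycle. A consumer-side sketch of that contract, assuming the Lucene.Net 4.8 API:

using System;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.TokenAttributes;

static class TokenStreamContract
{
    // Reset() must precede the first IncrementToken(); End() runs once the
    // stream is exhausted; Dispose() releases the underlying reader.
    public static void ConsumeAll(TokenStream stream)
    {
        var termAtt = stream.AddAttribute<ICharTermAttribute>();
        stream.Reset();
        while (stream.IncrementToken())
        {
            Console.WriteLine(termAtt.ToString());
        }
        stream.End();
        stream.Dispose();
    }
}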
Example No. 26
        public override bool IncrementToken()
        {
            ClearAttributes();

            string nextToken;

            HebMorph.Tokenizer.TokenType tokenType;

            // Used to loop over certain noise cases
            while (true)
            {
                tokenType = hebMorphTokenizer.NextToken(out nextToken);
                if (tokenType == 0)
                {
                    return(false); // EOS
                }
                // Ignore "words" which are actually only prefixes in a single word.
                // This first case is easy to spot, since the prefix and the following word will be
                // separated by a dash marked as a construct (סמיכות) by the Tokenizer
                if ((tokenType & HebMorph.Tokenizer.TokenType.Construct) > 0)
                {
                    if (IsLegalPrefix(nextToken))
                    {
                        continue;
                    }
                }

                // This second case is a bit more complex. We take a risk of splitting a valid acronym or
                // abbreviated word into two, so we send it to an external function to analyze the word, and
                // get a possibly corrected word. Examples for words we expect to simplify by this operation
                // are ה"שטיח", ש"המידע.
                if ((tokenType & HebMorph.Tokenizer.TokenType.Acronym) > 0)
                {
                    nextToken = TryStrippingPrefix(nextToken);

                    // If no quote remains, the acronym was a false positive; clear the flag
                    if (nextToken.IndexOf('"') == -1)
                    {
                        tokenType &= ~HebMorph.Tokenizer.TokenType.Acronym;
                    }
                }

                break;
            }

            // Record the term string
            if (termAtt.TermLength() < nextToken.Length)
            {
                termAtt.SetTermBuffer(nextToken);
            }
            else // Perform a copy to save on memory operations
            {
                char[] buf = termAtt.TermBuffer();
                nextToken.CopyTo(0, buf, 0, nextToken.Length);
            }
            termAtt.SetTermLength(nextToken.Length);

            offsetAtt.SetOffset(CorrectOffset(hebMorphTokenizer.Offset), CorrectOffset(hebMorphTokenizer.Offset + hebMorphTokenizer.LengthInSource));

            if ((tokenType & HebMorph.Tokenizer.TokenType.Hebrew) > 0)
            {
                if ((tokenType & HebMorph.Tokenizer.TokenType.Acronym) > 0)
                {
                    typeAtt.Type = TokenTypeSignature(TOKEN_TYPES.Acronym);
                }
                else if ((tokenType & HebMorph.Tokenizer.TokenType.Construct) > 0)
                {
                    typeAtt.Type = TokenTypeSignature(TOKEN_TYPES.Construct);
                }
                else
                {
                    typeAtt.Type = TokenTypeSignature(TOKEN_TYPES.Hebrew);
                }
            }
            else if ((tokenType & HebMorph.Tokenizer.TokenType.Numeric) > 0)
            {
                typeAtt.Type = TokenTypeSignature(TOKEN_TYPES.Numeric);
            }
            else
            {
                typeAtt.Type = TokenTypeSignature(TOKEN_TYPES.NonHebrew);
            }

            return(true);
        }
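The tokenType tests above are plain [Flags] bit operations. A minimal sketch with a hypothetical DemoTokenType enum; note that the clearing idiom is x &= ~flag (x |= ~flag would instead set every other bit):

using System;

[Flags]
enum DemoTokenType
{
    None = 0,
    Hebrew = 1,
    Acronym = 2,
    Construct = 4,
    Numeric = 8,
}

static class FlagDemo
{
    static void Main()
    {
        var t = DemoTokenType.Hebrew | DemoTokenType.Acronym;
        Console.WriteLine((t & DemoTokenType.Acronym) > 0); // True: the flag is set
        t &= ~DemoTokenType.Acronym;                        // clear only the Acronym bit
        Console.WriteLine((t & DemoTokenType.Acronym) > 0); // False
    }
}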
Example No. 27
        public override bool IncrementToken()
        {
            if (index >= str.Length)
            {
                return(false);
            }
            ClearAttributes();

            if (group >= 0)
            {
                // match a specific group
                if (matcher.Success)
                {
                    do
                    {
                        // We have already parsed from this index; go to the next match.
                        if (!isReset && matcher.Groups[group].Index == index)
                        {
                            continue;
                        }
                        isReset = false;

                        index = matcher.Groups[group].Index;
                        int endIndex = matcher.Groups[group].Index + matcher.Groups[group].Length;

                        if (index == endIndex)
                        {
                            continue;
                        }

                        termAtt.SetEmpty().Append(str.ToString(), index, endIndex - index); // LUCENENET: Corrected 3rd parameter
                        offsetAtt.SetOffset(CorrectOffset(index), CorrectOffset(endIndex));
                        return(true);
                    } while ((matcher = matcher.NextMatch()).Success);
                }

                index = int.MaxValue; // mark exhausted
                return(false);
            }
            else
            {
                // String.split() functionality
                if (matcher.Success)
                {
                    do
                    {
                        if (matcher.Index - index > 0)
                        {
                            // found a non-zero-length token
                            termAtt.SetEmpty().Append(str.ToString(), index, matcher.Index - index); // LUCENENET: Corrected 3rd parameter
                            offsetAtt.SetOffset(CorrectOffset(index), CorrectOffset(matcher.Index));
                            index = matcher.Index + matcher.Length;
                            return(true);
                        }

                        isReset = false;
                        index   = matcher.Index + matcher.Length;
                    } while ((matcher = matcher.NextMatch()).Success);
                }

                if (str.Length - index == 0)
                {
                    index = int.MaxValue; // mark exhausted
                    return(false);
                }

                termAtt.SetEmpty().Append(str.ToString(), index, str.Length - index); // LUCENENET: Corrected 3rd parameter
                offsetAtt.SetOffset(CorrectOffset(index), CorrectOffset(str.Length));
                index = int.MaxValue;                                                 // mark exhausted
                return(true);
            }
        }
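The two branches above mirror familiar System.Text.RegularExpressions idioms: group >= 0 emits the matched group text, group < 0 splits on the pattern like String.Split. A hypothetical standalone sketch of both modes:

using System;
using System.Text.RegularExpressions;

static class PatternModes
{
    static void Main()
    {
        string str = "aaa bbb,ccc";

        // group >= 0: emit the text of each match (here group 0, the whole match)
        foreach (Match m in Regex.Matches(str, @"\w+"))
        {
            Console.WriteLine(m.Groups[0].Value); // aaa, bbb, ccc
        }

        // group < 0: String.Split-style, emit the text between matches
        foreach (string piece in Regex.Split(str, @"[\s,]+"))
        {
            if (piece.Length > 0)
            {
                Console.WriteLine(piece); // aaa, bbb, ccc
            }
        }
    }
}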
Example No. 28
 public override void End()
 {
     base.End();
     posIncrAtt.PositionIncrement = finalPosInc;
     offsetAtt.SetOffset(finalOffset, finalOffset);
 }
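End() runs after the last IncrementToken() and is where a stream reports its true final offset and any trailing position increment (for example, holes left by trailing stopwords). Values like finalOffset and finalPosInc are typically captured when the wrapped stream is exhausted. A minimal sketch of that capture-and-replay shape, assuming the Lucene.Net 4.8 API; TrailingStateFilter is hypothetical:

using Lucene.Net.Analysis;
using Lucene.Net.Analysis.TokenAttributes;

public sealed class TrailingStateFilter : TokenFilter
{
    private readonly IOffsetAttribute offsetAtt;
    private readonly IPositionIncrementAttribute posIncrAtt;
    private int finalOffset;
    private int finalPosInc;

    public TrailingStateFilter(TokenStream input) : base(input)
    {
        offsetAtt = AddAttribute<IOffsetAttribute>();
        posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
    }

    public override bool IncrementToken()
    {
        if (m_input.IncrementToken())
        {
            return true;
        }
        // hypothetical: a real filter would compute these from its own state
        finalOffset = offsetAtt.EndOffset;
        finalPosInc = 0;
        return false;
    }

    public override void End()
    {
        base.End(); // lets the wrapped stream report its final state first
        posIncrAtt.PositionIncrement = finalPosInc;
        offsetAtt.SetOffset(finalOffset, finalOffset);
    }
}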
Example No. 29
        /// <summary>
        /// <para>Get the next token from the input stream.
        /// </para>
        /// <para>If the next token has <c>positionIncrement > 1</c>,
        /// <c>positionIncrement - 1</c> <see cref="fillerToken"/>s are
        /// inserted first.
        /// </para>
        /// </summary>
        /// <param name="target"> Where to put the new token; if null, a new instance is created. </param>
        /// <returns> On success, the populated token; null otherwise </returns>
        /// <exception cref="IOException"> if the input stream has a problem </exception>
        private InputWindowToken GetNextToken(InputWindowToken target)
        {
            InputWindowToken newTarget = target;

            if (numFillerTokensToInsert > 0)
            {
                if (null == target)
                {
                    newTarget = new InputWindowToken(this, nextInputStreamToken.CloneAttributes());
                }
                else
                {
                    nextInputStreamToken.CopyTo(target.attSource);
                }
                // A filler token occupies no space
                newTarget.offsetAtt.SetOffset(newTarget.offsetAtt.StartOffset, newTarget.offsetAtt.StartOffset);
                newTarget.termAtt.CopyBuffer(fillerToken, 0, fillerToken.Length);
                newTarget.isFiller = true;
                --numFillerTokensToInsert;
            }
            else if (isNextInputStreamToken)
            {
                if (null == target)
                {
                    newTarget = new InputWindowToken(this, nextInputStreamToken.CloneAttributes());
                }
                else
                {
                    nextInputStreamToken.CopyTo(target.attSource);
                }
                isNextInputStreamToken = false;
                newTarget.isFiller     = false;
            }
            else if (!exhausted)
            {
                if (m_input.IncrementToken())
                {
                    if (null == target)
                    {
                        newTarget = new InputWindowToken(this, CloneAttributes());
                    }
                    else
                    {
                        this.CopyTo(target.attSource);
                    }
                    if (posIncrAtt.PositionIncrement > 1)
                    {
                        // Each output shingle must contain at least one input token,
                        // so no more than (maxShingleSize - 1) filler tokens will be inserted.
                        numFillerTokensToInsert = Math.Min(posIncrAtt.PositionIncrement - 1, maxShingleSize - 1);
                        // Save the current token as the next input stream token
                        if (null == nextInputStreamToken)
                        {
                            nextInputStreamToken = CloneAttributes();
                        }
                        else
                        {
                            this.CopyTo(nextInputStreamToken);
                        }
                        isNextInputStreamToken = true;
                        // A filler token occupies no space
                        newTarget.offsetAtt.SetOffset(offsetAtt.StartOffset, offsetAtt.StartOffset);
                        newTarget.termAtt.CopyBuffer(fillerToken, 0, fillerToken.Length);
                        newTarget.isFiller = true;
                        --numFillerTokensToInsert;
                    }
                    else
                    {
                        newTarget.isFiller = false;
                    }
                }
                else
                {
                    exhausted = true;
                    m_input.End();
                    endState = CaptureState();
                    numFillerTokensToInsert = Math.Min(posIncrAtt.PositionIncrement, maxShingleSize - 1);
                    if (numFillerTokensToInsert > 0)
                    {
                        nextInputStreamToken = new AttributeSource(this.GetAttributeFactory());
                        nextInputStreamToken.AddAttribute <ICharTermAttribute>();
                        IOffsetAttribute newOffsetAtt = nextInputStreamToken.AddAttribute <IOffsetAttribute>();
                        newOffsetAtt.SetOffset(offsetAtt.EndOffset, offsetAtt.EndOffset);
                        // Recurse/loop just once:
                        return(GetNextToken(target));
                    }
                    else
                    {
                        newTarget = null;
                    }
                }
            }
            else
            {
                newTarget = null;
            }
            return(newTarget);
        }
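A usage sketch of the filler behavior this method implements, assuming Lucene.Net 4.8 components: removing a stopword leaves a position hole, which surfaces as "_" filler tokens inside shingles (the exact output depends on the shingle settings):

using System;
using System.IO;
using Lucene.Net.Analysis.Core;
using Lucene.Net.Analysis.Shingle;
using Lucene.Net.Analysis.TokenAttributes;
using Lucene.Net.Util;

static class ShingleFillerDemo
{
    static void Main()
    {
        var source = new WhitespaceTokenizer(LuceneVersion.LUCENE_48,
            new StringReader("please divide the sentence"));
        var stopped = new StopFilter(LuceneVersion.LUCENE_48, source,
            StopFilter.MakeStopSet(LuceneVersion.LUCENE_48, "the"));
        var shingled = new ShingleFilter(stopped, 2, 2); // bigrams (plus unigrams by default)
        var termAtt = shingled.GetAttribute<ICharTermAttribute>();
        shingled.Reset();
        while (shingled.IncrementToken())
        {
            // expect unigrams plus bigrams such as "divide _" and "_ sentence"
            Console.WriteLine(termAtt.ToString());
        }
        shingled.End();
        shingled.Dispose();
    }
}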
Example No. 30
 public override void End()
 {
     base.End();
     PosIncrAtt.PositionIncrement = FinalPosInc;
     OffsetAtt.SetOffset(FinalOffset, FinalOffset);
 }