Beispiel #1
0
        /// <summary>
        /// Produces the next token from <c>str</c> using <c>matcher</c>.
        /// </summary>
        /// <remarks>
        /// With a non-negative <c>group</c>, every non-empty occurrence of that capture group is
        /// emitted as a token. Otherwise the pattern is treated as a delimiter and the text
        /// between matches (plus any trailing remainder) is emitted, skipping empty pieces.
        /// </remarks>
        /// <returns><c>true</c> when the term and offset attributes were filled; <c>false</c> when no input remains.</returns>
        public override bool incrementToken()
        {
            if (index >= str.Length)
            {
                return false;
            }

            clearAttributes();

            if (group < 0)
            {
                // Delimiter mode, comparable to String.split().
                while (matcher.find())
                {
                    int delimiterStart = matcher.start();
                    bool hasText = delimiterStart > index;
                    if (hasText)
                    {
                        // Text between the previous delimiter and this one is the token.
                        termAtt.setEmpty().append(str, index, delimiterStart);
                        offsetAtt.setOffset(correctOffset(index), correctOffset(delimiterStart));
                    }

                    // Always continue scanning after the delimiter that was just matched.
                    index = matcher.end();
                    if (hasText)
                    {
                        return true;
                    }
                }

                // No more delimiters: whatever is left over is the final token.
                bool hasRemainder = index != str.Length;
                if (hasRemainder)
                {
                    termAtt.setEmpty().append(str, index, str.Length);
                    offsetAtt.setOffset(correctOffset(index), correctOffset(str.Length));
                }

                index = int.MaxValue; // mark exhausted
                return hasRemainder;
            }

            // Group mode: each non-empty occurrence of the selected group is a token.
            while (matcher.find())
            {
                index = matcher.start(group);
                int groupEnd = matcher.end(group);
                if (index != groupEnd)
                {
                    termAtt.setEmpty().append(str, index, groupEnd);
                    offsetAtt.setOffset(correctOffset(index), correctOffset(groupEnd));
                    return true;
                }
            }

            index = int.MaxValue; // mark exhausted
            return false;
        }
Beispiel #2
0
            /// <summary>
            /// Scans <c>str</c> from <c>pos</c> for the next run of token characters that is not
            /// a stop word and publishes it through the term and offset attributes.
            /// </summary>
            /// <returns><c>true</c> if a token was produced; <c>false</c> when the end of the text was reached.</returns>
            /// <exception cref="System.InvalidOperationException">Thrown when <c>str</c> is still null.</exception>
            public override bool incrementToken()
            {
                if (str == null)
                {
                    throw new System.InvalidOperationException("Consumer did not call reset().");
                }

                clearAttributes();

                // Work on local copies of the fields inside the scanning loops.
                string source = str;
                int length = source.Length;
                int cursor = pos;
                bool lettersOnly = isLetter;

                while (true)
                {
                    // Skip everything that cannot be part of a token.
                    while (cursor < length && !isTokenChar(source[cursor], lettersOnly))
                    {
                        cursor++;
                    }

                    if (cursor >= length)
                    {
                        // Ran off the end without finding another token.
                        pos = cursor;
                        return false;
                    }

                    // Consume the run of token characters.
                    int tokenStart = cursor;
                    while (cursor < length && isTokenChar(source[cursor], lettersOnly))
                    {
                        cursor++;
                    }

                    string candidate = source.Substring(tokenStart, cursor - tokenStart);
                    if (toLowerCase)
                    {
                        candidate = candidate.ToLower(locale);
                    }

                    if (!isStopWord(candidate))
                    {
                        pos = cursor;
                        termAtt.setEmpty().append(candidate);
                        offsetAtt.setOffset(correctOffset(tokenStart), correctOffset(cursor));
                        return true;
                    }

                    // Stop word: discard it and keep scanning.
                }
            }
//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
//ORIGINAL LINE: private void collapseAndSaveTokens(int tokenType, String type) throws java.io.IOException
        /// <summary>
        /// Merges consecutive scanner tokens of type <paramref name="tokenType"/> into one
        /// untokenized term, while also capturing the state of each individual token into
        /// <c>tokens</c>.
        /// </summary>
        /// <param name="tokenType">Scanner token type that is being collapsed.</param>
        /// <param name="type">Type passed to <c>setupSavedToken</c> for every captured token.</param>
        private void collapseAndSaveTokens(int tokenType, string type)
        {
            StringBuilder collapsed = new StringBuilder(32);
            int appended = scanner.setText(collapsed);
            //TODO: how to know how much whitespace to add
            int firstStart = scanner.yychar();
            int previousEnd = firstStart + appended;
            int seenSoFar = 0;
            IList<AttributeSource.State> saved = new List<AttributeSource.State>();

            setupSavedToken(0, type);
            saved.Add(captureState());

            // Keep pulling tokens while they have the requested type and still belong to the
            // same wiki item.
            int nextType;
            while (true)
            {
                nextType = scanner.NextToken;
                if (nextType == WikipediaTokenizerImpl.YYEOF
                    || nextType != tokenType
                    || scanner.NumWikiTokensSeen <= seenSoFar)
                {
                    break;
                }

                int currentStart = scanner.yychar();

                // Pad with one space per position between the previous token and this one.
                int gap = currentStart - previousEnd;
                if (gap > 0)
                {
                    collapsed.Append(' ', gap);
                }

                appended = scanner.setText(collapsed);
                setupSavedToken(scanner.PositionIncrement, type);
                saved.Add(captureState());
                seenSoFar++;
                previousEnd = currentStart + appended;
            }

            // TODO: trimming through an intermediate string is inefficient
            string merged = collapsed.ToString().Trim();

            termAtt.setEmpty().append(merged);
            offsetAtt.setOffset(correctOffset(firstStart), correctOffset(firstStart + merged.Length));
            flagsAtt.Flags = UNTOKENIZED_TOKEN_FLAG;

            // The loop has already read one token past the collapsed run; hand it back to the
            // scanner unless that token was the end of input.
            if (nextType != WikipediaTokenizerImpl.YYEOF)
            {
                scanner.yypushback(scanner.yylength());
            }

            tokens = saved.GetEnumerator();
        }
Beispiel #4
0
 /// <summary>
 /// Fills every attribute with a fixed value on each call.
 /// </summary>
 /// <returns>Always <c>true</c>.</returns>
 public override bool incrementToken()
 {
     clearAttributes();

     termAtt.setEmpty().append("accents");
     offsetAtt.setOffset(2, 7);
     typeAtt.Type = "wrd";
     posIncAtt.PositionIncrement = 3;

     sbyte[] payloadBytes = { 0, 1, 2, 3 };
     payloadAtt.Payload = new BytesRef(payloadBytes);

     flagsAtt.Flags = 77;

     return true;
 }
Beispiel #5
0
//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
//ORIGINAL LINE: @Override public boolean incrementToken() throws java.io.IOException
        /// <summary>
        /// Pulls the next token from <c>input</c> and strips leading and trailing whitespace
        /// from its term, optionally adjusting the offsets to match.
        /// </summary>
        /// <returns><c>true</c> if a token is available (even an empty one); <c>false</c> when <c>input</c> is exhausted.</returns>
        public override bool incrementToken()
        {
            if (!input.incrementToken())
            {
                return false;
            }

            char[] chars = termAtt.buffer();
            int length = termAtt.length();

            //TODO: Is this the right behavior or should we return false?  Currently, "  ", returns true, so I think this should
            //also return true
            if (length == 0)
            {
                return true;
            }

            // Index of the first non-whitespace character (== length if there is none).
            int first = 0;
            while (first < length && char.IsWhiteSpace(chars[first]))
            {
                first++;
            }

            // Walk backwards over trailing whitespace, counting how many characters are dropped.
            int limit = length;
            int trailing = 0;
            while (limit >= first && char.IsWhiteSpace(chars[limit - 1]))
            {
                limit--;
                trailing++;
            }

            bool nothingTrimmed = first == 0 && limit == length;
            if (nothingTrimmed)
            {
                return true;
            }

            bool hasContent = first < limit;
            if (hasContent)
            {
                termAtt.copyBuffer(chars, first, limit - first);
            }
            else
            {
                termAtt.setEmpty();
            }

            // Offsets are only adjusted when they span exactly as many characters as the term had.
            if (updateOffsets && length == offsetAtt.endOffset() - offsetAtt.startOffset())
            {
                int adjustedStart = offsetAtt.startOffset() + first;
                int adjustedEnd = offsetAtt.endOffset() - (hasContent ? trailing : 0);
                offsetAtt.setOffset(adjustedStart, adjustedEnd);
            }

            return true;
        }
        /// <summary>
        /// Consumes <paramref name="in"/> and returns its terms joined by single spaces.
        /// TODO: rewrite tests not to use string comparison.
        /// </summary>
        /// <remarks>
        /// The stream is closed before this method returns, including when resetting or
        /// advancing the stream throws. The Java original declared <c>throws IOException</c>.
        /// </remarks>
        /// <param name="in">Token stream to drain.</param>
        /// <returns>The space-separated term texts, or an empty string if no token was produced.</returns>
        private static string tsToString(TokenStream @in)
        {
            StringBuilder     @out    = new StringBuilder();
            CharTermAttribute termAtt = @in.addAttribute(typeof(CharTermAttribute));

            try
            {
                // extra safety to enforce, that the state is not preserved and also
                // assign bogus values
                @in.clearAttributes();
                termAtt.setEmpty().append("bogusTerm");
                @in.reset();
                while (@in.incrementToken())
                {
                    if (@out.Length > 0)
                    {
                        @out.Append(' ');
                    }
                    @out.Append(termAtt.ToString());

                    // Poison the attributes again so a stream that forgets to clear or set
                    // them shows up in the output.
                    @in.clearAttributes();
                    termAtt.setEmpty().append("bogusTerm");
                }
            }
            finally
            {
                // Release the stream even if reset()/incrementToken() failed.
                @in.close();
            }

            return @out.ToString();
        }
            /// <summary>
            /// Emits the next <c>Token</c> from <c>toks</c>, copying its term text, offsets and
            /// position increment into the stream attributes.
            /// </summary>
            /// <returns><c>true</c> if a token was emitted; <c>false</c> when <c>toks</c> has no more elements.</returns>
            public override bool incrementToken()
            {
                // NOTE: hasNext()/next() are Java iterator calls that the Java-to-C# converter
                // left untranslated (it only converts iterators inside 'while' and 'for' loops).
                if (!toks.hasNext())
                {
                    return false;
                }

                clearAttributes();

                Token current = toks.next();
                termAtt.setEmpty().append(current);
                offsetAtt.setOffset(current.startOffset(), current.endOffset());
                posIncAtt.PositionIncrement = current.PositionIncrement;
                return true;
            }
Beispiel #8
0
            /// <summary>
            /// Emits the next non-empty stretch of <c>str</c> that lies between two matches of
            /// <c>matcher</c> (or between a match and either end of the text).
            /// </summary>
            /// <returns><c>true</c> if a token was produced; <c>false</c> once the text is used up.</returns>
            /// <exception cref="System.InvalidOperationException">Thrown when <c>initialized</c> is false.</exception>
            public override bool incrementToken()
            {
                if (!initialized)
                {
                    throw new System.InvalidOperationException("Consumer did not call reset().");
                }

                if (matcher == null)
                {
                    return false;
                }

                clearAttributes();

                // Repeats while separators keep matching, so empty pieces at the start, at the
                // end, or between adjacent separators are skipped.
                bool found;
                do
                {
                    int tokenStart = pos;
                    int tokenEnd;

                    found = matcher.find();
                    if (found)
                    {
                        tokenEnd = matcher.start();
                        pos = matcher.end();
                    }
                    else
                    {
                        tokenEnd = str.Length;
                        matcher = null; // we're finished
                    }

                    if (tokenStart != tokenEnd)
                    {
                        string piece = str.Substring(tokenStart, tokenEnd - tokenStart);
                        if (toLowerCase)
                        {
                            piece = piece.ToLower(locale);
                        }

                        termAtt.setEmpty().append(piece);
                        offsetAtt.setOffset(correctOffset(tokenStart), correctOffset(tokenEnd));
                        return true;
                    }
                }
                while (found);

                return false;
            }
Beispiel #9
0
//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
//ORIGINAL LINE: @Override public boolean incrementToken() throws java.io.IOException
            /// <summary>
            /// Emits the element of <c>tokens</c> at <c>index</c>, copying all of its attributes
            /// into the stream, and advances <c>index</c>.
            /// </summary>
            /// <returns><c>true</c> if a token was emitted; <c>false</c> when the array has been fully consumed.</returns>
            public override bool incrementToken()
            {
                if (index >= tokens.Length)
                {
                    return false;
                }

                clearAttributes();

                Token current = tokens[index];
                index++;

                termAtt.setEmpty().append(current);
                offsetAtt.setOffset(current.startOffset(), current.endOffset());
                posIncAtt.PositionIncrement = current.PositionIncrement;
                flagsAtt.Flags = current.Flags;
                typeAtt.Type = current.type();
                payloadAtt.Payload = current.Payload;
                return true;
            }
Beispiel #10
0
        /// <summary>
        /// Advances <c>input</c> and, unless the token is flagged as a keyword, replaces its
        /// term with the stemmer's output when that output differs from the term.
        /// The Java original declared <c>throws IOException</c>.
        /// </summary>
        /// <returns><c>true</c> for the next token in the stream, or <c>false</c> at end of stream.</returns>
        public override bool incrementToken()
        {
            if (!input.incrementToken())
            {
                return false;
            }

            string original = termAtt.ToString();

            // Tokens marked as keywords are excluded from stemming.
            if (keywordAttr.Keyword)
            {
                return true;
            }

            string stemmed = stemmer.stem(original);

            // Only touch the term attribute when stemming actually changed something.
            bool changed = stemmed != null && stemmed != original;
            if (changed)
            {
                termAtt.setEmpty().append(stemmed);
            }

            return true;
        }
Beispiel #11
0
//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
//ORIGINAL LINE: @Override public boolean incrementToken() throws java.io.IOException
            /// <summary>
            /// Passes tokens from <c>input</c> through and, after every token whose term is
            /// "the", emits one extra token "hte" with a position increment of 0.
            /// </summary>
            /// <returns><c>true</c> if a token (original or injected) is available; <c>false</c> when <c>input</c> is exhausted.</returns>
            public override bool incrementToken()
            {
                if (bufferedState == null)
                {
                    if (!input.incrementToken())
                    {
                        return false;
                    }

                    // Remember the state of a "the" token so the next call can emit "hte".
                    if (termAtt.ToString().Equals("the"))
                    {
                        bufferedState = captureState();
                    }

                    return true;
                }

                // A "the" token was seen on the previous call: replay it as "hte".
                restoreState(bufferedState);
                posIncAtt.PositionIncrement = 0;
                termAtt.setEmpty().append("hte");
                bufferedState = null;
                return true;
            }
Beispiel #12
0
        /// <summary>
        /// Advances to the next n-gram of the input.
        /// </summary>
        /// <remarks>
        /// On the first call up to 1024 characters are read from <c>input</c> and trimmed to form
        /// the text that is tokenized; any further characters are read and discarded, and only
        /// counted in <c>charsRead</c>. Grams are produced size by size: every gram of
        /// <c>minGram</c> characters first, then each larger size up to <c>maxGram</c>.
        /// The Java original declared <c>throws IOException</c>.
        /// </remarks>
        /// <returns><c>true</c> if a gram was written to the term and offset attributes; <c>false</c> at end of stream.</returns>
        public override bool incrementToken()
        {
            clearAttributes();
            if (!started)
            {
                started  = true;
                gramSize = minGram;
                char[] chars = new char[1024];
                charsRead = 0;
                // TODO: refactor to a shared readFully somewhere:
                while (charsRead < chars.Length)
                {
                    int inc = input.read(chars, charsRead, chars.Length - charsRead);
                    // -1 is the Java end-of-stream marker; 0 is also treated as "no more
                    // input" so a reader that reports end of input with 0 cannot spin forever.
                    if (inc <= 0)
                    {
                        break;
                    }
                    charsRead += inc;
                }
                inStr = (new string(chars, 0, charsRead)).Trim();   // remove any trailing empty strings

                if (charsRead == chars.Length)
                {
                    // Read extra throwaway chars so that on end() we
                    // report the correct offset:
                    char[] throwaway = new char[1024];
                    while (true)
                    {
                        int inc = input.read(throwaway, 0, throwaway.Length);
                        if (inc <= 0)
                        {
                            break;
                        }
                        charsRead += inc;
                    }
                }

                inLen = inStr.Length;
                if (inLen == 0)
                {
                    return(false);
                }
            }

            if (pos + gramSize > inLen) // if we hit the end of the string
            {
                pos = 0;                // reset to beginning of string
                gramSize++;             // increase n-gram size
                if (gramSize > maxGram) // we are done
                {
                    return(false);
                }
                if (pos + gramSize > inLen) // the text is shorter than the new gram size
                {
                    return(false);
                }
            }

            int oldPos = pos;

            pos++;
            termAtt.setEmpty().append(inStr, oldPos, oldPos + gramSize);
            offsetAtt.setOffset(correctOffset(oldPos), correctOffset(oldPos + gramSize));
            return(true);
        }