Esempio n. 1
0
        // Advances the wrapped stream by one token and replaces the term text with
        // its stemmed form, unless the term is listed in protWords.
        //
        // Returns false once the wrapped stream has no more tokens, true otherwise.
        public override bool incrementToken()
        {
            bool hasToken = input.incrementToken();

            if (hasToken)
            {
                char[] chars  = termAtt.termBuffer();
                int    length = termAtt.termLength();

                // Protected words pass through untouched; this is the hook for
                // avoiding stemming collisions.
                bool isProtected = protWords != null && protWords.contains(chars, 0, length);

                if (!isProtected)
                {
                    // The stemmer is fed a string, so the term has to be copied
                    // out of the attribute's char buffer first.
                    stemmer.setCurrent(new string(chars, 0, length));
                    stemmer.stem();

                    string stemmed      = stemmer.getCurrent();
                    char[] stemmedChars = stemmed.ToCharArray();

                    termAtt.setTermBuffer(stemmedChars, 0, stemmed.Length);
                }
            }

            return hasToken;
        }
Esempio n. 2
0
#pragma warning disable 672
        // Returns the next token to emit, or null once the wrapped stream is exhausted.
        //
        // Subwords buffered in queue by an earlier call are handed out first, one per
        // call (queuePos is the read cursor).  Otherwise tokens are pulled from input
        // until one produces output.  Each term is scanned once, left to right, and
        // split wherever the charType() of two adjacent characters does not overlap,
        // subject to splitOnCaseChange, splitOnNumerics and the optional removal of a
        // trailing "'s" (stemEnglishPossessive).  The resulting subwords are buffered
        // in queue and, depending on the generate*/catenate* flags, rebuilt through
        // addCombos().
        //
        // Parameter @in: reusable token that is passed straight to input.next(Token).
        //
        // Returns: the token obtained from input (unchanged, or with its term, start
        // offset and/or position increment adjusted), a token taken from queue, or null.
        public override Token next(Token @in)
        {
            // check the queue first
            if (queuePos < queue.size())
            {
                return((Token)queue.get(queuePos++));
            }

            // reset the queue if it had been previously used
            if (queuePos != 0)
            {
                queuePos = 0;
                queue.clear();
            }


            // optimize for the common case: assume there will be
            // no subwords (just a simple word)
            //
            // Would it actually be faster to check for the common form
            // of isLetter() isLower()*, and then backtrack if it doesn't match?

            // Sum of the position increments of every input token consumed during
            // this call; applied to whichever token is finally returned so that
            // dropped tokens do not lose their gaps.
            int   origPosIncrement = 0;
            Token t;

            while (true)
            {
                // t is either returned, or a new token is made from it, so it should
                // be safe to use the next(Token) method.
#pragma warning disable 612
                t = input.next(@in);
#pragma warning restore 612
                if (t == null)
                {
                    return(null);
                }

                char [] termBuffer = t.termBuffer();
                int     len        = t.termLength();
                int     start      = 0;
                if (len == 0)
                {
                    // empty terms are skipped before their position increment is read
                    continue;
                }

                int posInc = t.getPositionIncrement();
                origPosIncrement += posInc;

                //skip protected tokens
                if (protWords != null && protWords.contains(termBuffer, 0, len))
                {
                    t.setPositionIncrement(origPosIncrement);
                    return(t);
                }

                // Avoid calling charType more than once for each char (basically
                // avoid any backtracking).
                // makes code slightly more difficult, but faster.
                int ch   = termBuffer[start];
                int type = charType(ch);

                // number of queued subwords whose first character is ALPHA
                int numWords = 0;

                while (start < len)
                {
                    // first eat delimiters at the start of this subword
                    while ((type & SUBWORD_DELIM) != 0 && ++start < len)
                    {
                        ch   = termBuffer[start];
                        type = charType(ch);
                    }

                    int pos = start;

                    // save the type of the first char of the subword
                    // as a way to tell what type of subword token this is (number, word, etc)
                    int firstType = type;
                    int lastType  = type; // type of the previously read char


                    while (pos < len)
                    {
                        if ((type & lastType) == 0) // no overlap in character type
                        {
                            // check and remove "'s" from the end of a token.
                            // the pattern to check for is
                            //   ALPHA "'" ("s"|"S") (SUBWORD_DELIM | END)
                            if (stemEnglishPossessive != 0 && ((lastType & ALPHA) != 0))
                            {
                                if (ch == '\'' && pos + 1 < len &&
                                    (termBuffer[pos + 1] == 's' || termBuffer[pos + 1] == 'S'))
                                {
                                    // remember where the subword ends before pos is
                                    // moved past the "'s"
                                    int subWordEnd = pos;
                                    if (pos + 2 >= len)
                                    {
                                        // end of string detected after "'s"
                                        pos += 2;
                                    }
                                    else
                                    {
                                        // make sure that a delimiter follows "'s"
                                        int ch2   = termBuffer[pos + 2];
                                        int type2 = charType(ch2);
                                        if ((type2 & SUBWORD_DELIM) != 0)
                                        {
                                            // if delimiter, move position pointer
                                            // to it (skipping over "'s"
                                            ch   = ch2;
                                            type = type2;
                                            pos += 2;
                                        }
                                    }

                                    queue.add(newTok(t, start, subWordEnd));
                                    if ((firstType & ALPHA) != 0)
                                    {
                                        numWords++;
                                    }
                                    break;
                                }
                            }

                            // For case changes, only split on a transition from
                            // lower to upper case, not vice-versa.
                            // That will correctly handle the
                            // case of a word starting with a capital (won't split).
                            // It will also handle pluralization of
                            // an uppercase word such as FOOs (won't split).

                            if (splitOnCaseChange == 0 &&
                                (lastType & ALPHA) != 0 && (type & ALPHA) != 0)
                            {
                                // ALPHA->ALPHA: always ignore if case isn't considered.
                            }
                            else if ((lastType & UPPER) != 0 && (type & ALPHA) != 0)
                            {
                                // UPPER->letter: Don't split
                            }
                            else if (splitOnNumerics == 0 &&
                                     (((lastType & ALPHA) != 0 && (type & DIGIT) != 0) || ((lastType & DIGIT) != 0 && (type & ALPHA) != 0)))
                            {
                                // ALPHA->NUMERIC, NUMERIC->ALPHA :Don't split
                            }
                            else
                            {
                                // NOTE: this code currently assumes that only one flag
                                // is set for each character now, so we don't have
                                // to explicitly check for all the classes of transitions
                                // listed below.

                                // LOWER->UPPER
                                // ALPHA->NUMERIC
                                // NUMERIC->ALPHA
                                // *->DELIMITER
                                queue.add(newTok(t, start, pos));
                                if ((firstType & ALPHA) != 0)
                                {
                                    numWords++;
                                }
                                break;
                            }
                        }

                        if (++pos >= len)
                        {
                            if (start == 0)
                            {
                                // the subword is the whole original token, so
                                // return it unchanged.
                                t.setPositionIncrement(origPosIncrement);
                                return(t);
                            }

                            // optimization... if this is the only token,
                            // return it immediately.
                            if (queue.size() == 0 && preserveOriginal == 0)
                            {
                                // just adjust the text w/o changing the rest
                                // of the original token
                                t.setTermBuffer(termBuffer, start, len - start);
                                t.setStartOffset(t.startOffset() + start);
                                t.setPositionIncrement(origPosIncrement);
                                return(t);
                            }

                            Token newtok = newTok(t, start, pos);

                            queue.add(newtok);
                            if ((firstType & ALPHA) != 0)
                            {
                                numWords++;
                            }
                            break;
                        }

                        lastType = type;
                        ch       = termBuffer[pos];
                        type     = charType(ch);
                    }

                    // start of the next subword is the current position
                    start = pos;
                }

                // System.out.println("##########TOKEN=" + s + " ######### WORD DELIMITER QUEUE=" + str(queue));

                int numtok = queue.size();

                // We reached the end of the current token.
                // If the queue is empty, we should continue by reading
                // the next token
                if (numtok == 0)
                {
                    // the token might have been all delimiters, in which
                    // case return it if we're meant to preserve it
                    // NOTE(review): unlike the other early returns, origPosIncrement
                    // is not applied to t here - confirm this is intended.
                    if (preserveOriginal != 0)
                    {
                        return(t);
                    }

                    // if this token had a "normal" gap of 1, remove it.
                    if (posInc == 1)
                    {
                        origPosIncrement -= 1;
                    }
                    continue;
                }

                // if number of tokens is 1, there are no catenations to be done.
                if (numtok == 1)
                {
                    break;
                }


                // subwords that were not counted as words
                int numNumbers = numtok - numWords;

                // check conditions under which the current token
                // queue may be used as-is (no catenations needed)
                if (catenateAll == 0 &&                            // no "everything" to catenate
                    (catenateWords == 0 || numWords <= 1) &&       // no words to catenate
                    (catenateNumbers == 0 || numNumbers <= 1) &&   // no numbers to catenate
                    (generateWordParts != 0 || numWords == 0) && // word generation is on
                    (generateNumberParts != 0 || numNumbers == 0)) // number generation is on
                {
                    break;
                }


                // swap queue and the temporary working list, then clear the
                // queue in preparation for adding all combinations back to it.
                // (addCombos is defined elsewhere; the queue.size() checks below
                // presume it appends its output to queue.)
                ArrayList /*<Token>*/ tmp = tlist;
                tlist = queue;
                queue = tmp;
                queue.clear();

                if (numWords == 0)
                {
                    // all numbers
                    addCombos(tlist, 0, numtok, generateNumberParts != 0, catenateNumbers != 0 || catenateAll != 0, 1);
                    if (queue.size() > 0 || preserveOriginal != 0)
                    {
                        break;
                    }
                    else
                    {
                        // nothing to emit for this token: read the next one
                        continue;
                    }
                }
                else if (numNumbers == 0)
                {
                    // all words
                    addCombos(tlist, 0, numtok, generateWordParts != 0, catenateWords != 0 || catenateAll != 0, 1);
                    if (queue.size() > 0 || preserveOriginal != 0)
                    {
                        break;
                    }
                    else
                    {
                        // nothing to emit for this token: read the next one
                        continue;
                    }
                }
                else if (generateNumberParts == 0 && generateWordParts == 0 && catenateNumbers == 0 && catenateWords == 0)
                {
                    // catenate all *only*
                    // OPT:could be optimized to add to current queue...
                    addCombos(tlist, 0, numtok, false, catenateAll != 0, 1);
                    if (queue.size() > 0 || preserveOriginal != 0)
                    {
                        break;
                    }
                    else
                    {
                        // nothing to emit for this token: read the next one
                        continue;
                    }
                }

                //
                // Find all adjacent tokens of the same type.
                //
                // isWord always describes the token at the start of the next run: it
                // is seeded from the first token and, whenever the inner loop breaks,
                // it already holds the type of the token that ended the run.
                Token tok    = (Token)tlist.get(0);
                bool  isWord = (tokType(tok) & ALPHA) != 0;

                for (int i = 0; i < numtok;)
                {
                    // Latch the type of the run starting at i before scanning ahead.
                    // A run made of a single trailing token never enters the inner
                    // loop body, so its type must not be derived from state that is
                    // only updated inside that loop; otherwise it would be handled
                    // with the flags of the preceding run.
                    bool runIsWord = isWord;

                    int j;
                    for (j = i + 1; j < numtok; j++)
                    {
                        tok    = (Token)tlist.get(j);
                        isWord = (tokType(tok) & ALPHA) != 0;
                        if (isWord != runIsWord)
                        {
                            break;
                        }
                    }
                    if (runIsWord)
                    {
                        addCombos(tlist, i, j, generateWordParts != 0, catenateWords != 0, 1);
                    }
                    else
                    {
                        addCombos(tlist, i, j, generateNumberParts != 0, catenateNumbers != 0, 1);
                    }
                    i = j;
                }

                // take care catenating all subwords
                if (catenateAll != 0)
                {
                    addCombos(tlist, 0, numtok, false, true, 0);
                }

                // NOTE: in certain cases, queue may be empty (for instance, if catenate
                // and generate are both set to false).  Only exit the loop if the queue
                // is not empty.
                if (queue.size() > 0 || preserveOriginal != 0)
                {
                    break;
                }
            }

            // System.out.println("##########AFTER COMBINATIONS:"+ str(queue));

            if (preserveOriginal != 0)
            {
                // the original goes out now; everything in queue is served by
                // later calls, starting from index 0
                queuePos = 0;
                if (queue.size() > 0)
                {
                    // overlap first token with the original
                    ((Token)queue.get(0)).setPositionIncrement(0);
                }
                return(t); // return the original
            }
            else
            {
                // the first queued token goes out now; later calls resume at index 1
                queuePos = 1;
                Token tok = (Token)queue.get(0);
                tok.setPositionIncrement(origPosIncrement);
                return(tok);
            }
        }