/// <summary>
/// Advances the wrapped stream by one token and replaces the term text with
/// its stemmed form, unless the term is listed in the protected-word set.
/// </summary>
/// <returns>
/// false when the wrapped stream has no more tokens; true otherwise
/// (whether or not the term was actually stemmed).
/// </returns>
public override bool incrementToken()
{
    if (!input.incrementToken())
    {
        return false;
    }

    char[] chars = termAtt.termBuffer();
    int count = termAtt.termLength();

    // Protected words are passed through untouched; this is the hook for
    // avoiding stemming collisions.
    bool isProtected = protWords != null && protWords.contains(chars, 0, count);
    if (!isProtected)
    {
        // The stemmer only accepts a string, so the term's char range has to
        // be copied into one before stemming.
        stemmer.setCurrent(new string(chars, 0, count));
        stemmer.stem();

        string stemmed = stemmer.getCurrent();
        termAtt.setTermBuffer(stemmed.ToCharArray(), 0, stemmed.Length);
    }

    return true;
}
/// <summary>
/// Returns the next token, splitting each upstream term into subwords at
/// character-type boundaries and optionally emitting catenations of them.
/// </summary>
/// <remarks>
/// Flow, in the order the code executes it:
///
/// 1. Tokens still waiting in <c>queue</c> (from a previous call) are handed
///    out first; once the queue has been drained (<c>queuePos</c> is non-zero
///    and has reached the queue size) it is cleared and <c>queuePos</c> is
///    reset to 0.
/// 2. Upstream tokens are pulled with <c>input.next(@in)</c>. A null upstream
///    token ends the stream (null is returned). Zero-length terms are skipped.
///    Position increments of every token read are summed into
///    <c>origPosIncrement</c> so that skipped tokens still leave a gap.
/// 3. A term found in <c>protWords</c> is returned as-is, with the accumulated
///    position increment.
/// 4. The term buffer is scanned once, calling <c>charType</c> a single time
///    per character. Leading <c>SUBWORD_DELIM</c> characters are skipped, then
///    a subword runs until the character type stops overlapping with the
///    previous one. Exceptions that do NOT split: any ALPHA-to-ALPHA change
///    when <c>splitOnCaseChange</c> is 0; UPPER followed by any letter; and
///    ALPHA/DIGIT transitions when <c>splitOnNumerics</c> is 0. When
///    <c>stemEnglishPossessive</c> is non-zero, an apostrophe plus 's' or 'S'
///    after a letter, followed by a delimiter or end of term, is dropped from
///    the subword.
/// 5. Fast paths: if the scan reaches the end with <c>start == 0</c> the
///    original token is returned unchanged; if the final subword is the only
///    one (queue empty) and <c>preserveOriginal</c> is 0, the original token
///    is trimmed in place (term buffer and start offset adjusted) and returned.
/// 6. If no subword was produced, the original token is returned when
///    <c>preserveOriginal</c> is non-zero; otherwise a position increment of
///    exactly 1 is removed from the running total and the next upstream token
///    is read.
/// 7. With more than one subword, the generate*/catenate* flags decide whether
///    the queue can be used as-is. If not, <c>queue</c> and <c>tlist</c> are
///    swapped and <c>addCombos</c> repopulates the queue: over the whole list
///    when all parts are words, all are numbers, or only <c>catenateAll</c> is
///    enabled; otherwise per run of adjacent same-type parts, plus a final
///    whole-list call when <c>catenateAll</c> is set. (The exact tokens
///    <c>addCombos</c> emits are defined outside this method.) If that leaves
///    the queue empty and <c>preserveOriginal</c> is 0, the next upstream
///    token is read.
/// 8. On exit from the loop: with <c>preserveOriginal</c> set, the original
///    token is returned, <c>queuePos</c> is 0 and the first queued token gets
///    position increment 0 so it overlaps the original; otherwise the first
///    queued token is returned with the accumulated increment and
///    <c>queuePos</c> is set to 1.
///
/// NOTE(review): in step 6 and in the <c>preserveOriginal</c> exit of step 8
/// the original token is returned without <c>origPosIncrement</c> being
/// applied to it - confirm whether increments accumulated from earlier
/// skipped tokens are meant to be dropped there.
/// </remarks>
/// <param name="in">Reusable token instance forwarded to the upstream
/// <c>input.next</c> call.</param>
/// <returns>The next token to emit, or null when upstream is exhausted.</returns>
#pragma warning disable 672 public override Token next(Token @in) { // check the queue first if (queuePos < queue.size()) { return((Token)queue.get(queuePos++)); } // reset the queue if it had been previously used if (queuePos != 0) { queuePos = 0; queue.clear(); } // optimize for the common case: assume there will be // no subwords (just a simple word) // // Would it actually be faster to check for the common form // of isLetter() isLower()*, and then backtrack if it doesn't match? int origPosIncrement = 0; Token t; while (true) { // t is either returned, or a new token is made from it, so it should // be safe to use the next(Token) method. #pragma warning disable 612 t = input.next(@in); #pragma warning restore 612 if (t == null) { return(null); } char [] termBuffer = t.termBuffer(); int len = t.termLength(); int start = 0; if (len == 0) { continue; } int posInc = t.getPositionIncrement(); origPosIncrement += posInc; //skip protected tokens if (protWords != null && protWords.contains(termBuffer, 0, len)) { t.setPositionIncrement(origPosIncrement); return(t); } // Avoid calling charType more than once for each char (basically // avoid any backtracking). // makes code slightly more difficult, but faster. int ch = termBuffer[start]; int type = charType(ch); int numWords = 0; while (start < len) { // first eat delimiters at the start of this subword while ((type & SUBWORD_DELIM) != 0 && ++start < len) { ch = termBuffer[start]; type = charType(ch); } int pos = start; // save the type of the first char of the subword // as a way to tell what type of subword token this is (number, word, etc) int firstType = type; int lastType = type; // type of the previously read char while (pos < len) { if ((type & lastType) == 0) // no overlap in character type // check and remove "'s" from the end of a token. 
// the pattern to check for is // ALPHA "'" ("s"|"S") (SUBWORD_DELIM | END) { if (stemEnglishPossessive != 0 && ((lastType & ALPHA) != 0)) { if (ch == '\'' && pos + 1 < len && (termBuffer[pos + 1] == 's' || termBuffer[pos + 1] == 'S')) { int subWordEnd = pos; if (pos + 2 >= len) { // end of string detected after "'s" pos += 2; } else { // make sure that a delimiter follows "'s" int ch2 = termBuffer[pos + 2]; int type2 = charType(ch2); if ((type2 & SUBWORD_DELIM) != 0) { // if delimiter, move position pointer // to it (skipping over "'s" ch = ch2; type = type2; pos += 2; } } queue.add(newTok(t, start, subWordEnd)); if ((firstType & ALPHA) != 0) { numWords++; } break; } } // For case changes, only split on a transition from // lower to upper case, not vice-versa. // That will correctly handle the // case of a word starting with a capital (won't split). // It will also handle pluralization of // an uppercase word such as FOOs (won't split). if (splitOnCaseChange == 0 && (lastType & ALPHA) != 0 && (type & ALPHA) != 0) { // ALPHA->ALPHA: always ignore if case isn't considered. } else if ((lastType & UPPER) != 0 && (type & ALPHA) != 0) { // UPPER->letter: Don't split } else if (splitOnNumerics == 0 && (((lastType & ALPHA) != 0 && (type & DIGIT) != 0) || ((lastType & DIGIT) != 0 && (type & ALPHA) != 0))) { // ALPHA->NUMERIC, NUMERIC->ALPHA :Don't split } else { // NOTE: this code currently assumes that only one flag // is set for each character now, so we don't have // to explicitly check for all the classes of transitions // listed below. // LOWER->UPPER // ALPHA->NUMERIC // NUMERIC->ALPHA // *->DELIMITER queue.add(newTok(t, start, pos)); if ((firstType & ALPHA) != 0) { numWords++; } break; } } if (++pos >= len) { if (start == 0) { // the subword is the whole original token, so // return it unchanged. t.setPositionIncrement(origPosIncrement); return(t); } // optimization... if this is the only token, // return it immediately. 
if (queue.size() == 0 && preserveOriginal == 0) { // just adjust the text w/o changing the rest // of the original token t.setTermBuffer(termBuffer, start, len - start); t.setStartOffset(t.startOffset() + start); t.setPositionIncrement(origPosIncrement); return(t); } Token newtok = newTok(t, start, pos); queue.add(newtok); if ((firstType & ALPHA) != 0) { numWords++; } break; } lastType = type; ch = termBuffer[pos]; type = charType(ch); } // start of the next subword is the current position start = pos; } // System.out.println("##########TOKEN=" + s + " ######### WORD DELIMITER QUEUE=" + str(queue)); int numtok = queue.size(); // We reached the end of the current token. // If the queue is empty, we should continue by reading // the next token if (numtok == 0) { // the token might have been all delimiters, in which // case return it if we're meant to preserve it if (preserveOriginal != 0) { return(t); } // if this token had a "normal" gap of 1, remove it. if (posInc == 1) { origPosIncrement -= 1; } continue; } // if number of tokens is 1, there are no catenations to be done. if (numtok == 1) { break; } int numNumbers = numtok - numWords; // check conditions under which the current token // queue may be used as-is (no catenations needed) if (catenateAll == 0 && // no "everything" to catenate (catenateWords == 0 || numWords <= 1) && // no words to catenate (catenateNumbers == 0 || numNumbers <= 1) && // no numbers to catenate (generateWordParts != 0 || numWords == 0) && // word generation is on (generateNumberParts != 0 || numNumbers == 0)) // number generation is on { break; } // swap queue and the temporary working list, then clear the // queue in preparation for adding all combinations back to it. 
ArrayList /*<Token>*/ tmp = tlist; tlist = queue; queue = tmp; queue.clear(); if (numWords == 0) { // all numbers addCombos(tlist, 0, numtok, generateNumberParts != 0, catenateNumbers != 0 || catenateAll != 0, 1); if (queue.size() > 0 || preserveOriginal != 0) { break; } else { continue; } } else if (numNumbers == 0) { // all words addCombos(tlist, 0, numtok, generateWordParts != 0, catenateWords != 0 || catenateAll != 0, 1); if (queue.size() > 0 || preserveOriginal != 0) { break; } else { continue; } } else if (generateNumberParts == 0 && generateWordParts == 0 && catenateNumbers == 0 && catenateWords == 0) { // catenate all *only* // OPT:could be optimized to add to current queue... addCombos(tlist, 0, numtok, false, catenateAll != 0, 1); if (queue.size() > 0 || preserveOriginal != 0) { break; } else { continue; } } // // Find all adjacent tokens of the same type. // Token tok = (Token)tlist.get(0); bool isWord = (tokType(tok) & ALPHA) != 0; bool wasWord = isWord; for (int i = 0; i < numtok;) { int j; for (j = i + 1; j < numtok; j++) { wasWord = isWord; tok = (Token)tlist.get(j); isWord = (tokType(tok) & ALPHA) != 0; if (isWord != wasWord) { break; } } if (wasWord) { addCombos(tlist, i, j, generateWordParts != 0, catenateWords != 0, 1); } else { addCombos(tlist, i, j, generateNumberParts != 0, catenateNumbers != 0, 1); } i = j; } // take care catenating all subwords if (catenateAll != 0) { addCombos(tlist, 0, numtok, false, true, 0); } // NOTE: in certain cases, queue may be empty (for instance, if catenate // and generate are both set to false). Only exit the loop if the queue // is not empty. 
if (queue.size() > 0 || preserveOriginal != 0) { break; } } // System.out.println("##########AFTER COMBINATIONS:"+ str(queue)); if (preserveOriginal != 0) { queuePos = 0; if (queue.size() > 0) { // overlap first token with the original ((Token)queue.get(0)).setPositionIncrement(0); } return(t); // return the original } else { queuePos = 1; Token tok = (Token)queue.get(0); tok.setPositionIncrement(origPosIncrement); return(tok); } }