#pragma warning restore 672

        // index "a","b","c" as pos0="a", pos1="b", pos2="c", pos2="abc"
        private void addCombos(List /*<Token>*/ lst, int start, int end, bool generateSubwords, bool catenateSubwords, int posOffset)
        {
            if (end - start == 1)
            {
                // always generate a word alone, even if generateSubwords=0 because
                // the catenation of all the subwords *is* the subword.
                queue.add(lst.get(start));
                return;
            }

            StringBuilder sb = null;
            if (catenateSubwords)
            {
                sb = new StringBuilder();
            }

            Token firstTok = null;
            Token tok = null;
            for (int i = start; i < end; i++)
            {
                tok = (Token)lst.get(i);
                if (catenateSubwords)
                {
                    if (i == start)
                    {
                        firstTok = tok;
                    }
                    sb.append(tok.termBuffer(), 0, tok.termLength());
                }
                if (generateSubwords)
                {
                    queue.add(tok);
                }
            }

            if (catenateSubwords)
            {
                Token concatTok = new Token(sb.toString(), firstTok.startOffset(), tok.endOffset(), firstTok.type());
                // if we indexed some other tokens, then overlap concatTok with the last.
                // Otherwise, use the value passed in as the position offset.
                concatTok.setPositionIncrement(generateSubwords ? 0 : posOffset);
                queue.add(concatTok);
            }
        }
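        // Illustrative sketch (not part of the original source): for the subword tokens
        // "Wi" and "Fi" produced from "Wi-Fi", a call such as
        //
        //     addCombos(parts, 0, 2, /*generateSubwords*/ true, /*catenateSubwords*/ true, 1);
        //
        // would enqueue "Wi" and "Fi" and then the catenation "WiFi" with a position
        // increment of 0, so "WiFi" overlaps the last subword exactly as in the
        // "a","b","c"/"abc" example above. The list name "parts" is hypothetical.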
        /**
         * Converts the original query string to a collection of Lucene Tokens.
         * @param original the original query string
         * @return a Collection of Lucene Tokens
         */
        public override Collection /*<Token>*/ convert(string original)
        {
            if (original == null) // this can happen with q.alt = and no query
            {
                return(Collections.emptyList());
            }

            Collection /*<Token>*/ result = new ArrayList /*<Token>*/ ();
            //TODO: Extract the words using a simple regex, but not query stuff, and then analyze them to produce the token stream
            Matcher matcher = QUERY_REGEX.matcher(original);
            TokenStream stream;
            while (matcher.find())
            {
                string word = matcher.group(0);
                if (word.Equals("AND") == false && word.Equals("OR") == false)
                {
                    try
                    {
                        stream = analyzer.reusableTokenStream("", new StringReader(word));
                        // TODO: support custom attributes
                        TermAttribute termAtt = (TermAttribute)stream.addAttribute(typeof(TermAttribute));
                        FlagsAttribute flagsAtt = (FlagsAttribute)stream.addAttribute(typeof(FlagsAttribute));
                        TypeAttribute typeAtt = (TypeAttribute)stream.addAttribute(typeof(TypeAttribute));
                        PayloadAttribute payloadAtt = (PayloadAttribute)stream.addAttribute(typeof(PayloadAttribute));
                        PositionIncrementAttribute posIncAtt = (PositionIncrementAttribute)stream.addAttribute(typeof(PositionIncrementAttribute));
                        stream.reset();
                        while (stream.incrementToken())
                        {
                            Token token = new Token();
                            token.setTermBuffer(termAtt.termBuffer(), 0, termAtt.termLength());
                            token.setStartOffset(matcher.start());
                            token.setEndOffset(matcher.end());
                            token.setFlags(flagsAtt.getFlags());
                            token.setType(typeAtt.type());
                            token.setPayload(payloadAtt.getPayload());
                            token.setPositionIncrement(posIncAtt.getPositionIncrement());
                            result.add(token);
                        }
                    }
#pragma warning disable 168
                    catch (IOException e)
                    {
                    }
#pragma warning restore 168
                }
            }
            return(result);
        }
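        // Illustrative sketch (not part of the original source): driving convert()
        // directly. The converter instance name below is hypothetical; the analyzer
        // used is whatever was configured on this converter.
        //
        //     Collection /*<Token>*/ tokens = converter.convert("wi fi AND camera");
        //     // "AND" and "OR" are skipped as boolean operators; every other word is
        //     // run through the analyzer, and each emitted Token carries the start/end
        //     // offsets of its word within the original query string.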
#pragma warning disable 672
        public override Token next(Token @in)
        {
            // check the queue first
            if (queuePos < queue.size())
            {
                return((Token)queue.get(queuePos++));
            }

            // reset the queue if it had been previously used
            if (queuePos != 0)
            {
                queuePos = 0;
                queue.clear();
            }

            // optimize for the common case: assume there will be
            // no subwords (just a simple word)
            //
            // Would it actually be faster to check for the common form
            // of isLetter() isLower()*, and then backtrack if it doesn't match?

            int origPosIncrement = 0;
            Token t;
            while (true)
            {
                // t is either returned, or a new token is made from it, so it should
                // be safe to use the next(Token) method.
#pragma warning disable 612
                t = input.next(@in);
#pragma warning restore 612
                if (t == null)
                {
                    return(null);
                }

                char [] termBuffer = t.termBuffer();
                int len = t.termLength();
                int start = 0;
                if (len == 0)
                {
                    continue;
                }

                int posInc = t.getPositionIncrement();
                origPosIncrement += posInc;

                // skip protected tokens
                if (protWords != null && protWords.contains(termBuffer, 0, len))
                {
                    t.setPositionIncrement(origPosIncrement);
                    return(t);
                }

                // Avoid calling charType more than once for each char (basically
                // avoid any backtracking).
                // makes code slightly more difficult, but faster.
                int ch = termBuffer[start];
                int type = charType(ch);
                int numWords = 0;

                while (start < len)
                {
                    // first eat delimiters at the start of this subword
                    while ((type & SUBWORD_DELIM) != 0 && ++start < len)
                    {
                        ch = termBuffer[start];
                        type = charType(ch);
                    }

                    int pos = start;

                    // save the type of the first char of the subword
                    // as a way to tell what type of subword token this is (number, word, etc)
                    int firstType = type;
                    int lastType = type; // type of the previously read char

                    while (pos < len)
                    {
                        if ((type & lastType) == 0) // no overlap in character type
                        {
                            // check and remove "'s" from the end of a token.
                            // the pattern to check for is
                            //   ALPHA "'" ("s"|"S") (SUBWORD_DELIM | END)
                            if (stemEnglishPossessive != 0 && ((lastType & ALPHA) != 0))
                            {
                                if (ch == '\'' && pos + 1 < len && (termBuffer[pos + 1] == 's' || termBuffer[pos + 1] == 'S'))
                                {
                                    int subWordEnd = pos;
                                    if (pos + 2 >= len)
                                    {
                                        // end of string detected after "'s"
                                        pos += 2;
                                    }
                                    else
                                    {
                                        // make sure that a delimiter follows "'s"
                                        int ch2 = termBuffer[pos + 2];
                                        int type2 = charType(ch2);
                                        if ((type2 & SUBWORD_DELIM) != 0)
                                        {
                                            // if delimiter, move position pointer
                                            // to it (skipping over "'s")
                                            ch = ch2;
                                            type = type2;
                                            pos += 2;
                                        }
                                    }

                                    queue.add(newTok(t, start, subWordEnd));
                                    if ((firstType & ALPHA) != 0)
                                    {
                                        numWords++;
                                    }
                                    break;
                                }
                            }

                            // For case changes, only split on a transition from
                            // lower to upper case, not vice-versa.
                            // That will correctly handle the
                            // case of a word starting with a capital (won't split).
                            // It will also handle pluralization of
                            // an uppercase word such as FOOs (won't split).
                            if (splitOnCaseChange == 0 && (lastType & ALPHA) != 0 && (type & ALPHA) != 0)
                            {
                                // ALPHA->ALPHA: always ignore if case isn't considered.
                            }
                            else if ((lastType & UPPER) != 0 && (type & ALPHA) != 0)
                            {
                                // UPPER->letter: Don't split
                            }
                            else if (splitOnNumerics == 0 &&
                                     (((lastType & ALPHA) != 0 && (type & DIGIT) != 0) ||
                                      ((lastType & DIGIT) != 0 && (type & ALPHA) != 0)))
                            {
                                // ALPHA->NUMERIC, NUMERIC->ALPHA: Don't split
                            }
                            else
                            {
                                // NOTE: this code currently assumes that only one flag
                                // is set for each character now, so we don't have
                                // to explicitly check for all the classes of transitions
                                // listed below.
                                // LOWER->UPPER
                                // ALPHA->NUMERIC
                                // NUMERIC->ALPHA
                                // *->DELIMITER
                                queue.add(newTok(t, start, pos));
                                if ((firstType & ALPHA) != 0)
                                {
                                    numWords++;
                                }
                                break;
                            }
                        }

                        if (++pos >= len)
                        {
                            if (start == 0)
                            {
                                // the subword is the whole original token, so
                                // return it unchanged.
                                t.setPositionIncrement(origPosIncrement);
                                return(t);
                            }

                            // optimization... if this is the only token,
                            // return it immediately.
                            if (queue.size() == 0 && preserveOriginal == 0)
                            {
                                // just adjust the text w/o changing the rest
                                // of the original token
                                t.setTermBuffer(termBuffer, start, len - start);
                                t.setStartOffset(t.startOffset() + start);
                                t.setPositionIncrement(origPosIncrement);
                                return(t);
                            }

                            Token newtok = newTok(t, start, pos);
                            queue.add(newtok);
                            if ((firstType & ALPHA) != 0)
                            {
                                numWords++;
                            }
                            break;
                        }

                        lastType = type;
                        ch = termBuffer[pos];
                        type = charType(ch);
                    }

                    // start of the next subword is the current position
                    start = pos;
                }

                // System.out.println("##########TOKEN=" + s + " ######### WORD DELIMITER QUEUE=" + str(queue));

                int numtok = queue.size();

                // We reached the end of the current token.
                // If the queue is empty, we should continue by reading
                // the next token
                if (numtok == 0)
                {
                    // the token might have been all delimiters, in which
                    // case return it if we're meant to preserve it
                    if (preserveOriginal != 0)
                    {
                        return(t);
                    }
                    // if this token had a "normal" gap of 1, remove it.
                    if (posInc == 1)
                    {
                        origPosIncrement -= 1;
                    }
                    continue;
                }

                // if number of tokens is 1, there are no catenations to be done.
                if (numtok == 1)
                {
                    break;
                }

                int numNumbers = numtok - numWords;

                // check conditions under which the current token
                // queue may be used as-is (no catenations needed)
                if (catenateAll == 0 &&                                // no "everything" to catenate
                    (catenateWords == 0 || numWords <= 1) &&           // no words to catenate
                    (catenateNumbers == 0 || numNumbers <= 1) &&       // no numbers to catenate
                    (generateWordParts != 0 || numWords == 0) &&       // word generation is on
                    (generateNumberParts != 0 || numNumbers == 0))     // number generation is on
                {
                    break;
                }

                // swap queue and the temporary working list, then clear the
                // queue in preparation for adding all combinations back to it.
                ArrayList /*<Token>*/ tmp = tlist;
                tlist = queue;
                queue = tmp;
                queue.clear();

                if (numWords == 0)
                {
                    // all numbers
                    addCombos(tlist, 0, numtok, generateNumberParts != 0, catenateNumbers != 0 || catenateAll != 0, 1);
                    if (queue.size() > 0 || preserveOriginal != 0)
                    {
                        break;
                    }
                    else
                    {
                        continue;
                    }
                }
                else if (numNumbers == 0)
                {
                    // all words
                    addCombos(tlist, 0, numtok, generateWordParts != 0, catenateWords != 0 || catenateAll != 0, 1);
                    if (queue.size() > 0 || preserveOriginal != 0)
                    {
                        break;
                    }
                    else
                    {
                        continue;
                    }
                }
                else if (generateNumberParts == 0 && generateWordParts == 0 && catenateNumbers == 0 && catenateWords == 0)
                {
                    // catenate all *only*
                    // OPT: could be optimized to add to current queue...
                    addCombos(tlist, 0, numtok, false, catenateAll != 0, 1);
                    if (queue.size() > 0 || preserveOriginal != 0)
                    {
                        break;
                    }
                    else
                    {
                        continue;
                    }
                }

                //
                // Find all adjacent tokens of the same type.
                //
                Token tok = (Token)tlist.get(0);
                bool isWord = (tokType(tok) & ALPHA) != 0;
                bool wasWord = isWord;

                for (int i = 0; i < numtok;)
                {
                    int j;
                    for (j = i + 1; j < numtok; j++)
                    {
                        wasWord = isWord;
                        tok = (Token)tlist.get(j);
                        isWord = (tokType(tok) & ALPHA) != 0;
                        if (isWord != wasWord)
                        {
                            break;
                        }
                    }
                    if (wasWord)
                    {
                        addCombos(tlist, i, j, generateWordParts != 0, catenateWords != 0, 1);
                    }
                    else
                    {
                        addCombos(tlist, i, j, generateNumberParts != 0, catenateNumbers != 0, 1);
                    }
                    i = j;
                }

                // take care catenating all subwords
                if (catenateAll != 0)
                {
                    addCombos(tlist, 0, numtok, false, true, 0);
                }

                // NOTE: in certain cases, queue may be empty (for instance, if catenate
                // and generate are both set to false). Only exit the loop if the queue
                // is not empty.
                if (queue.size() > 0 || preserveOriginal != 0)
                {
                    break;
                }
            }

            // System.out.println("##########AFTER COMBINATIONS:"+ str(queue));

            if (preserveOriginal != 0)
            {
                queuePos = 0;
                if (queue.size() > 0)
                {
                    // overlap first token with the original
                    ((Token)queue.get(0)).setPositionIncrement(0);
                }
                return(t); // return the original
            }
            else
            {
                queuePos = 1;
                Token tok = (Token)queue.get(0);
                tok.setPositionIncrement(origPosIncrement);
                return(tok);
            }
        }
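        // Illustrative sketch (not part of the original source): with generateWordParts,
        // generateNumberParts, catenateWords, splitOnCaseChange and splitOnNumerics all
        // enabled, repeatedly calling next() on an input token "PowerShot500" would
        // yield roughly the subwords "Power", "Shot" and "500", plus the catenated word
        // "PowerShot" emitted with a position increment of 0 so that it overlaps "Shot".
        // The exact tokens and ordering depend on how this filter's flags (preserveOriginal,
        // catenateAll, etc.) are configured.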