/**
 * Inserts bigrams for common words into a token stream. Every input token is
 * passed through as-is. When the token and/or the token that follows it is in
 * the list of common words, a bigram of type "gram" with position increment 0
 * is additionally queued for output.
 *
 * @param token the current token from the input stream
 * @return the very same token that was passed in; the unigram is always kept
 */
/*
 * TODO: implement the new Lucene 2.9 API incrementToken() instead of the
 * deprecated Token.next().
 * TODO: consider adding an option to not emit unigram stopwords, as in CDL XTF
 * BigramStopFilter; CommonGramsQueryFilter would need to be changed to work
 * with this.
 * TODO: consider optimizing for the case of three commongrams, i.e.
 * "man of the year" normally produces 3 bigrams: "man-of", "of-the",
 * "the-year", but with proper management of positions we could eliminate the
 * middle bigram "of-the" and save a disk seek and a whole set of position
 * lookups.
 */
protected override Token process(Token token)
{
    // Look one token ahead without consuming it.
    Token following = peek(1);

    // A null lookahead means this is the last token: there is nothing to pair
    // it with, and any commongram ending on it would already have been queued
    // during the previous call.
    bool hasFollowing = following != null;

    // When either side of the pair is a common word, build the bigram
    // (type="gram", position increment 0) and put it in the output queue.
    // It is expected to be returned when super.next() is called, before this
    // method is invoked with a new token from the input stream; see the
    // implementation of next() in BufferedTokenStream.
    if (hasFollowing && (isCommon(token) || isCommon(following)))
    {
        write(gramToken(token, following));
    }

    // The unigram token is always returned.
    return token;
}
/**
 * Constructs a compound ("gram") token out of two adjacent tokens.
 *
 * The term text is the first token's text, then SEPARATOR, then the second
 * token's text. The result spans from the start offset of the first token to
 * the end offset of the second, has type "gram" and a position increment of 0.
 *
 * The shared buffer field is cleared and reused as scratch space on each call.
 *
 * @param first  the token supplying the leading text and the start offset
 * @param second the token supplying the trailing text and the end offset
 * @return a newly created token representing the bigram
 */
private Token gramToken(Token first, Token second)
{
    // Start from an empty scratch buffer.
    buffer.setLength(0);

    // Warning 612 is the compiler's "member is obsolete" diagnostic; it is
    // silenced here for the termText() calls (presumably the obsolete member).
#pragma warning disable 612
    buffer.append(first.termText());
    buffer.append(SEPARATOR);
    buffer.append(second.termText());
#pragma warning restore 612

    Token gram = new Token(
        buffer.toString(),
        first.startOffset(),
        second.endOffset(),
        "gram");

    // Position increment 0 is set explicitly on every bigram.
    gram.setPositionIncrement(0);
    return gram;
}
/**
 * Tells whether the given token is a common term.
 *
 * @param token the token whose term characters are looked up
 * @return true when a common-words set is present and it contains the token's
 *         term (the first termLength() characters of its term buffer);
 *         false otherwise, including when no set is configured
 */
private bool isCommon(Token token)
{
    // Without a common-words set nothing can be common.
    if (commonWords == null)
    {
        return false;
    }

    // Look the term up directly in the token's character buffer.
    return commonWords.contains(token.termBuffer(), 0, token.termLength());
}