Ejemplo n.º 1
0
        /**
         * Inserts bigrams for common words into a token stream. For each input token,
         * output the token. If the token and/or the following token are in the list
         * of common words also output a bigram with position increment 0 and
         * type="gram"
         */
        /*
         * TODO: Implement the new Lucene 2.9 API incrementToken() instead of the
         * deprecated Token.next().
         *
         * TODO: Consider adding an option to not emit unigram stopwords, as in
         * CDL XTF BigramStopFilter. CommonGramsQueryFilter would need to be
         * changed to work with this.
         *
         * TODO: Consider optimizing for the case of three commongrams, i.e.
         * "man of the year" normally produces 3 bigrams: "man-of", "of-the",
         * "the-year", but with proper management of positions we could eliminate
         * the middle bigram "of-the" and save a disk seek and a whole set of
         * position lookups.
         */

        /**
         * Passes the given unigram through unchanged and, when it or the token
         * that follows it is a common word, additionally writes a bigram built
         * from the pair.
         *
         * @param token the current token from the input stream
         * @return the same token that was passed in; the unigram is always emitted
         */
        protected override Token process(Token token)
        {
            Token following = peek(1);

            // A bigram needs a successor. On the last token peek(1) yields null,
            // and any bigram ending at this token was already written during the
            // previous call, so nothing extra is produced here.
            //
            // Otherwise, pair the two tokens whenever either one is a common
            // word. The bigram is handed to write(); per the original author's
            // note on BufferedTokenStream, it is returned by next() before this
            // method is invoked with another input token.
            if (following != null && (isCommon(token) || isCommon(following)))
            {
                write(gramToken(token, following));
            }

            return token;
        }
Ejemplo n.º 2
0
        /**
         * Builds the compound (bigram) token for two adjacent tokens.
         *
         * The term text is first's text, SEPARATOR, then second's text. The
         * result spans from the start offset of first to the end offset of
         * second, has type "gram", and has a position increment of 0.
         *
         * @param first  the leading token of the pair
         * @param second the trailing token of the pair
         * @return a newly created token representing the bigram
         */
        private Token gramToken(Token first, Token second)
        {
            // The buffer is declared outside this method, so clear whatever a
            // previous call left in it before assembling the new term text.
            buffer.setLength(0);

            // Warning 612 (use of an obsolete member) is suppressed around the
            // termText() calls.
#pragma warning disable 612
            buffer.append(first.termText());
            buffer.append(SEPARATOR);
            buffer.append(second.termText());
#pragma warning restore 612

            Token gram = new Token(buffer.toString(), first.startOffset(), second.endOffset(), "gram");
            gram.setPositionIncrement(0);
            return gram;
        }
Ejemplo n.º 3
0
 /**
  * Tests whether the given token's term is one of the common words.
  *
  * @param token the token whose term buffer is looked up
  * @return true if commonWords is set and contains the first termLength()
  *         characters of the token's term buffer; false otherwise
  */
 private bool isCommon(Token token)
 {
     // Without a common-words collection no token can be common.
     if (commonWords == null)
     {
         return false;
     }

     return commonWords.contains(token.termBuffer(), 0, token.termLength());
 }