A ShingleMatrixFilter constructs shingles (token n-grams) from a token stream. In other words, it creates combinations of tokens as a single token.

For example, the sentence "please divide this sentence into shingles" might be tokenized into shingles "please divide", "divide this", "this sentence", "sentence into", and "into shingles".

Using a shingle filter at both index and query time can, in some instances, replace phrase queries, especially those with 0 slop.

Without a spacer character it can be used to handle composition and decomposition of words such as searching for "multi dimensional" instead of "multidimensional". It is a rather common human problem at query time in several languages, notably the northern Germanic branch.

Shingles are amongst many things also known to solve problems in spell checking, language detection and document clustering.

This filter is backed by a three dimensional, column oriented matrix. It creates permutations of the second dimension, the rows, and leaves the third, the z-axis, for multi token synonyms.

In order to use this filter you need to define a way of positioning the input stream tokens in the matrix. This is done using a ShingleMatrixFilter.TokenSettingsCodec. There are three simple implementations for demonstration purposes, see ShingleMatrixFilter.OneDimensionalNonWeightedTokenSettingsCodec, ShingleMatrixFilter.TwoDimensionalNonWeightedSynonymTokenSettingsCodec and ShingleMatrixFilter.SimpleThreeDimensionalTokenSettingsCodec.

Consider this token matrix:

 Token[column][row][z-axis]{ {{hello}, {greetings, and, salutations}}, {{world}, {earth}, {tellus}} }; 
It would produce the following 2-3 gram sized shingles:
 "hello_world" "greetings_and" "greetings_and_salutations" "and_salutations" "and_salutations_world" "salutations_world" "hello_earth" "and_salutations_earth" "salutations_earth" "hello_tellus" "and_salutations_tellus" "salutations_tellus" 

This implementation can be rather heap demanding if (maximum shingle size - minimum shingle size) is a great number and the stream contains many columns, or if each column contains a great number of rows.

The problem is that, in order to avoid producing duplicates, the filter needs to keep track of every shingle already produced and returned to the consumer.

There is a bit of resource management to handle this but it would of course be much better if the filter was written so it never created the same shingle more than once in the first place.

The filter also has basic support for calculating weights for the shingles based on the weights of the tokens from the input stream, output shingle size, etc. See CalculateShingleWeight.

NOTE: This filter might not behave correctly if used with custom Attributes, i.e. Attributes other than the ones located in org.apache.lucene.analysis.tokenattributes.

Inheritance: Lucene.Net.Analysis.TokenStream
Ejemplo n.º 1
0
 /// <summary>
 /// Builds the analysis chain for a field: standard tokenization, shingling into
 /// 2- to 8-token shingles joined by a single space, lower-casing and finally
 /// stop-word filtering.
 /// </summary>
 /// <param name="fieldName">Name of the field being analyzed; not read by this chain.</param>
 /// <param name="reader">Source of the text to analyze.</param>
 /// <returns>The stop-filtered end of the chain.</returns>
 public override TokenStream TokenStream(string fieldName, TextReader reader)
 {
     // Each stage wraps the previous one, so a single variable tracks the chain's tail.
     TokenStream chain = new StandardTokenizer(Version.LUCENE_30, reader);
     chain = new ShingleMatrixFilter(chain, 2, 8, ' ');
     chain = new LowerCaseFilter(chain);

     // NOTE(review): the stop filter runs after shingling, so it is matched against
     // whole shingles rather than individual words - confirm this ordering is intended.
     return new StopFilter(true, chain, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
 }
Ejemplo n.º 2
0
        /// <summary>
        /// Builds the analysis chain for a field: standard tokenization, shingling,
        /// lower-casing and finally stop-word filtering.
        /// </summary>
        /// <param name="fieldName">Name of the field being analyzed; not read by this chain.</param>
        /// <param name="reader">Source of the text to analyze.</param>
        /// <returns>The stop-filtered end of the chain.</returns>
        public override TokenStream TokenStream(string fieldName, TextReader reader)
        {
            // Shingle configuration: fewest and most tokens allowed in one shingle, and the
            // character placed between the texts of the token parts in a shingle.
            const int minTokensPerShingle = 2;
            const int maxTokensPerShingle = 8;
            const char shingleSeparator = ' ';

            // StandardTokenizer should be a good tokenizer for most European-language documents:
            //  - splits words at punctuation characters, removing punctuation;
            //  - splits words at hyphens, unless there's a number in the token;
            //  - recognizes email addresses and internet hostnames as one token.
            var tokenizer = new StandardTokenizer(Version.LUCENE_30, reader);

            // Combine neighbouring tokens into shingles, e.g.
            // "2010 Audi RS5 Quattro Coupe" => "2010 Audi", "Audi RS5", "RS5 Quattro", "Quattro Coupe"
            // (plus longer shingles, since up to maxTokensPerShingle tokens are allowed).
            var shingles = new ShingleMatrixFilter(tokenizer,
                                                   minTokensPerShingle,
                                                   maxTokensPerShingle,
                                                   shingleSeparator);

            // Normalize the shingle text to lower case.
            var lowerCased = new LowerCaseFilter(shingles);

            // Drop entries found in the English stop word set.
            return new StopFilter(true, lowerCased, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
        }
        /// <summary>
        /// Checks that bi-gram shingling of five whitespace-separated words yields four
        /// shingles, and that an exhausted filter keeps reporting "no more tokens".
        /// </summary>
        public void TestIterator()
        {
            var source = new WhitespaceTokenizer(new StringReader("one two three four five"));
            var filter = new ShingleMatrixFilter(source, 2, 2, '_', false,
                                                 new OneDimensionalNonWeightedTokenSettingsCodec());

            // Drain the filter, counting how many shingles it hands out.
            var shingleCount = 0;
            while (filter.IncrementToken())
            {
                shingleCount++;
            }

            Assert.AreEqual(4, shingleCount);

            // Asking again after exhaustion must return false rather than throw (LUCENE-1939).
            Assert.IsFalse(filter.IncrementToken());
        }
        /// <summary>
        /// Checks the filter with the one-dimensional, non-weighted codec: an empty input
        /// yields nothing, and a six-word sentence yields the expected uni-grams and
        /// bi-grams with their start and end offsets.
        /// </summary>
        public void TestBehavingAsShingleFilter()
        {
            ShingleMatrixFilter.DefaultSettingsCodec = null;

            // An empty input stream must not produce any shingle.
            TokenStream emptyShingles = new ShingleMatrixFilter(
                new EmptyTokenStream(), 1, 2, ' ', false,
                new OneDimensionalNonWeightedTokenSettingsCodec());
            Assert.IsFalse(emptyShingles.IncrementToken());

            // A plain token stream: one token per word of the sentence.
            var sentence = new LinkedList<Token>();
            sentence.AddLast(CreateToken("please", 0, 6));
            sentence.AddLast(CreateToken("divide", 7, 13));
            sentence.AddLast(CreateToken("this", 14, 18));
            sentence.AddLast(CreateToken("sentence", 19, 27));
            sentence.AddLast(CreateToken("into", 28, 32));
            sentence.AddLast(CreateToken("shingles", 33, 39));

            var sentenceStream = new TokenListStream(sentence);

            // Shingle sizes 1 to 2: every single token plus every pair of neighbours.
            TokenStream shingles = new ShingleMatrixFilter(
                sentenceStream, 1, 2, ' ', false,
                new OneDimensionalNonWeightedTokenSettingsCodec());

            string[] expectedTerms =
                {
                    "please", "please divide", "divide", "divide this",
                    "this", "this sentence", "sentence", "sentence into", "into",
                    "into shingles", "shingles"
                };
            int[] expectedStartOffsets = {0, 0, 7, 7, 14, 14, 19, 19, 28, 28, 33};
            int[] expectedEndOffsets = {6, 13, 13, 18, 18, 27, 27, 32, 32, 39, 39};

            AssertTokenStreamContents(shingles, expectedTerms, expectedStartOffsets, expectedEndOffsets);
        }
        /// <summary>
        /// Builds a token matrix by hand - including one column that holds both a
        /// single-token row and a three-token row - and checks, in order, every
        /// 2- to 4-token shingle the filter produces from it.
        /// </summary>
        public void TestMatrix()
        {
            // Weights this test expects for shingles made of two, three and four tokens.
            const float twoTokenWeight = 1.4142135f;
            const float threeTokenWeight = 1.7320508f;
            const float fourTokenWeight = 2.0f;

            // Some other tests set the default codec to null;
            // set it here in case tests are run out of the usual order.
            ShingleMatrixFilter.DefaultSettingsCodec = new SimpleThreeDimensionalTokenSettingsCodec();

            var matrix = new Matrix.Matrix();

            // Leading single-token columns.
            foreach (var word in new[] {"no", "surprise", "to", "see", "england", "manager"})
            {
                new Column(TokenFactory(word, 1), matrix);
            }

            // "sven göran eriksson" is a multi token synonym to "svennis":
            // both live in the same column, each in a row of its own.
            var synonymColumn = new Column(matrix);

            var nicknameRow = new Row(synonymColumn);
            nicknameRow.Tokens.AddLast(TokenFactory("svennis", 1));

            var fullNameRow = new Row(synonymColumn);
            foreach (var namePart in new[] {"sven", "göran", "eriksson"})
            {
                fullNameRow.Tokens.AddLast(TokenFactory(namePart, 1));
            }

            // Trailing single-token columns.
            foreach (var word in new[] {"in", "the", "croud"})
            {
                new Column(TokenFactory(word, 1), matrix);
            }

            TokenStream shingles = new ShingleMatrixFilter(matrix, 2, 4, '_', true,
                                                           new SimpleThreeDimensionalTokenSettingsCodec());

            // Arguments: stream, shingle, position increment, weight, start offset, end offset.
            AssertNext(shingles, "no_surprise", 1, twoTokenWeight, 0, 0);
            AssertNext(shingles, "no_surprise_to", 1, threeTokenWeight, 0, 0);
            AssertNext(shingles, "no_surprise_to_see", 1, fourTokenWeight, 0, 0);
            AssertNext(shingles, "surprise_to", 1, twoTokenWeight, 0, 0);
            AssertNext(shingles, "surprise_to_see", 1, threeTokenWeight, 0, 0);
            AssertNext(shingles, "surprise_to_see_england", 1, fourTokenWeight, 0, 0);
            AssertNext(shingles, "to_see", 1, twoTokenWeight, 0, 0);
            AssertNext(shingles, "to_see_england", 1, threeTokenWeight, 0, 0);
            AssertNext(shingles, "to_see_england_manager", 1, fourTokenWeight, 0, 0);
            AssertNext(shingles, "see_england", 1, twoTokenWeight, 0, 0);
            AssertNext(shingles, "see_england_manager", 1, threeTokenWeight, 0, 0);
            AssertNext(shingles, "see_england_manager_svennis", 1, fourTokenWeight, 0, 0);
            AssertNext(shingles, "england_manager", 1, twoTokenWeight, 0, 0);
            AssertNext(shingles, "england_manager_svennis", 1, threeTokenWeight, 0, 0);
            AssertNext(shingles, "england_manager_svennis_in", 1, fourTokenWeight, 0, 0);
            AssertNext(shingles, "manager_svennis", 1, twoTokenWeight, 0, 0);
            AssertNext(shingles, "manager_svennis_in", 1, threeTokenWeight, 0, 0);
            AssertNext(shingles, "manager_svennis_in_the", 1, fourTokenWeight, 0, 0);
            AssertNext(shingles, "svennis_in", 1, twoTokenWeight, 0, 0);
            AssertNext(shingles, "svennis_in_the", 1, threeTokenWeight, 0, 0);
            AssertNext(shingles, "svennis_in_the_croud", 1, fourTokenWeight, 0, 0);
            AssertNext(shingles, "in_the", 1, twoTokenWeight, 0, 0);
            AssertNext(shingles, "in_the_croud", 1, threeTokenWeight, 0, 0);
            AssertNext(shingles, "the_croud", 1, twoTokenWeight, 0, 0);

            // Shingles that go through the three-token synonym row.
            AssertNext(shingles, "see_england_manager_sven", 1, fourTokenWeight, 0, 0);
            AssertNext(shingles, "england_manager_sven", 1, threeTokenWeight, 0, 0);
            AssertNext(shingles, "england_manager_sven_göran", 1, fourTokenWeight, 0, 0);
            AssertNext(shingles, "manager_sven", 1, twoTokenWeight, 0, 0);
            AssertNext(shingles, "manager_sven_göran", 1, threeTokenWeight, 0, 0);
            AssertNext(shingles, "manager_sven_göran_eriksson", 1, fourTokenWeight, 0, 0);
            AssertNext(shingles, "sven_göran", 1, twoTokenWeight, 0, 0);
            AssertNext(shingles, "sven_göran_eriksson", 1, threeTokenWeight, 0, 0);
            AssertNext(shingles, "sven_göran_eriksson_in", 1, fourTokenWeight, 0, 0);
            AssertNext(shingles, "göran_eriksson", 1, twoTokenWeight, 0, 0);
            AssertNext(shingles, "göran_eriksson_in", 1, threeTokenWeight, 0, 0);
            AssertNext(shingles, "göran_eriksson_in_the", 1, fourTokenWeight, 0, 0);
            AssertNext(shingles, "eriksson_in", 1, twoTokenWeight, 0, 0);
            AssertNext(shingles, "eriksson_in_the", 1, threeTokenWeight, 0, 0);
            AssertNext(shingles, "eriksson_in_the_croud", 1, fourTokenWeight, 0, 0);

            // Nothing may follow the last expected shingle.
            Assert.IsFalse(shingles.IncrementToken());
        }
        /// <summary>
        /// Runs ShingleMatrixFilter over token-list input in several configurations and
        /// checks the exact sequence of shingles produced: bi-grams with and without a
        /// spacer character, bi-grams with weighted prefix/suffix boundary tokens,
        /// unbounded shingle sizes with the fifth constructor argument set to false and
        /// to true, and finally 2-3 grams over a multi-token synonym.
        /// </summary>
        /// <remarks>
        /// The sections depend on one another: they reuse and Reset() the same streams and
        /// reassign the static default codec, so statement order matters.
        /// </remarks>
        public void TestTokenStream()
        {
            // No default codec for the first two sections; each filter below is handed
            // its codec explicitly.
            ShingleMatrixFilter.DefaultSettingsCodec = null;

            // test a plain old token stream with synonyms translated to rows.
            // presumably TokenFactory(text, position increment, start offset, end offset) -
            // TODO confirm; the expected shingles below treat the tokens whose second
            // argument is 0 as alternatives of the token before them.

            var tokens = new LinkedList<Token>();
            tokens.AddLast(TokenFactory("hello", 1, 0, 4));
            tokens.AddLast(TokenFactory("greetings", 0, 0, 4));
            tokens.AddLast(TokenFactory("world", 1, 5, 10));
            tokens.AddLast(TokenFactory("earth", 0, 5, 10));
            tokens.AddLast(TokenFactory("tellus", 0, 5, 10));

            TokenStream tls = new TokenListStream(tokens);

            // bi-grams

            TokenStream ts = new ShingleMatrixFilter(tls, 2, 2, '_', false,
                                                     new TwoDimensionalNonWeightedSynonymTokenSettingsCodec());

            AssertNext(ts, "hello_world");
            AssertNext(ts, "greetings_world");
            AssertNext(ts, "hello_earth");
            AssertNext(ts, "greetings_earth");
            AssertNext(ts, "hello_tellus");
            AssertNext(ts, "greetings_tellus");
            Assert.IsFalse(ts.IncrementToken());

            // bi-grams with no spacer character, start offset, end offset
            // (the same token list is replayed after Reset(); a null spacer joins the
            // token texts directly, as the expected terms show)

            tls.Reset();
            ts = new ShingleMatrixFilter(tls, 2, 2, null, false,
                                         new TwoDimensionalNonWeightedSynonymTokenSettingsCodec());
            AssertNext(ts, "helloworld", 0, 10);
            AssertNext(ts, "greetingsworld", 0, 10);
            AssertNext(ts, "helloearth", 0, 10);
            AssertNext(ts, "greetingsearth", 0, 10);
            AssertNext(ts, "hellotellus", 0, 10);
            AssertNext(ts, "greetingstellus", 0, 10);
            Assert.IsFalse(ts.IncrementToken());


            // add ^_prefix_and_suffix_$
            //
            // using 3d codec as it supports weights
            // (installed as the default because the filters below are constructed
            // without an explicit codec argument)

            ShingleMatrixFilter.DefaultSettingsCodec =
                new SimpleThreeDimensionalTokenSettingsCodec();

            // presumably TokenFactory(text, position increment, weight, start offset,
            // end offset, positioner) - TODO confirm against the helper's definition.
            tokens = new LinkedList<Token>();
            tokens.AddLast(TokenFactory("hello", 1, 1f, 0, 4, TokenPositioner.NewColumn));
            tokens.AddLast(TokenFactory("greetings", 0, 1f, 0, 4, TokenPositioner.NewRow));
            tokens.AddLast(TokenFactory("world", 1, 1f, 5, 10, TokenPositioner.NewColumn));
            tokens.AddLast(TokenFactory("earth", 0, 1f, 5, 10, TokenPositioner.NewRow));
            tokens.AddLast(TokenFactory("tellus", 0, 1f, 5, 10, TokenPositioner.NewRow));

            tls = new TokenListStream(tokens);

            // bi-grams, position increment, weight, start offset, end offset

            // Surround the token list with a "^" prefix token and a "$" suffix token.
            ts = new PrefixAndSuffixAwareTokenFilter(
                new SingleTokenTokenStream(TokenFactory("^", 1, 100f, 0, 0)),
                tls,
                new SingleTokenTokenStream(TokenFactory("$", 1, 50f, 0, 0))
                );
            // NOTE(review): presumably cached so the prefixed/suffixed stream can be
            // Reset() and replayed by the filters created further down - confirm.
            tls = new CachingTokenFilter(ts);

            ts = new ShingleMatrixFilter(tls, 2, 2, '_', false);

            AssertNext(ts, "^_hello", 1, 10.049875f, 0, 4);
            AssertNext(ts, "^_greetings", 1, 10.049875f, 0, 4);
            AssertNext(ts, "hello_world", 1, 1.4142135f, 0, 10);
            AssertNext(ts, "greetings_world", 1, 1.4142135f, 0, 10);
            AssertNext(ts, "hello_earth", 1, 1.4142135f, 0, 10);
            AssertNext(ts, "greetings_earth", 1, 1.4142135f, 0, 10);
            AssertNext(ts, "hello_tellus", 1, 1.4142135f, 0, 10);
            AssertNext(ts, "greetings_tellus", 1, 1.4142135f, 0, 10);
            AssertNext(ts, "world_$", 1, 7.1414285f, 5, 10);
            AssertNext(ts, "earth_$", 1, 7.1414285f, 5, 10);
            AssertNext(ts, "tellus_$", 1, 7.1414285f, 5, 10);
            Assert.IsFalse(ts.IncrementToken());

            // test unlimited size and allow single boundary token as shingle
            // (fifth constructor argument false: "^" and "$" also appear on their own below)
            tls.Reset();

            ts = new ShingleMatrixFilter(tls, 1, Int32.MaxValue, '_', false);


            AssertNext(ts, "^", 1, 10.0f, 0, 0);
            AssertNext(ts, "^_hello", 1, 10.049875f, 0, 4);
            AssertNext(ts, "^_hello_world", 1, 10.099504f, 0, 10);
            AssertNext(ts, "^_hello_world_$", 1, 12.328828f, 0, 10);
            AssertNext(ts, "hello", 1, 1.0f, 0, 4);
            AssertNext(ts, "hello_world", 1, 1.4142135f, 0, 10);
            AssertNext(ts, "hello_world_$", 1, 7.2111025f, 0, 10);
            AssertNext(ts, "world", 1, 1.0f, 5, 10);
            AssertNext(ts, "world_$", 1, 7.1414285f, 5, 10);
            AssertNext(ts, "$", 1, 7.071068f, 10, 10);
            AssertNext(ts, "^_greetings", 1, 10.049875f, 0, 4);
            AssertNext(ts, "^_greetings_world", 1, 10.099504f, 0, 10);
            AssertNext(ts, "^_greetings_world_$", 1, 12.328828f, 0, 10);
            AssertNext(ts, "greetings", 1, 1.0f, 0, 4);
            AssertNext(ts, "greetings_world", 1, 1.4142135f, 0, 10);
            AssertNext(ts, "greetings_world_$", 1, 7.2111025f, 0, 10);
            AssertNext(ts, "^_hello_earth", 1, 10.099504f, 0, 10);
            AssertNext(ts, "^_hello_earth_$", 1, 12.328828f, 0, 10);
            AssertNext(ts, "hello_earth", 1, 1.4142135f, 0, 10);
            AssertNext(ts, "hello_earth_$", 1, 7.2111025f, 0, 10);
            AssertNext(ts, "earth", 1, 1.0f, 5, 10);
            AssertNext(ts, "earth_$", 1, 7.1414285f, 5, 10);
            AssertNext(ts, "^_greetings_earth", 1, 10.099504f, 0, 10);
            AssertNext(ts, "^_greetings_earth_$", 1, 12.328828f, 0, 10);
            AssertNext(ts, "greetings_earth", 1, 1.4142135f, 0, 10);
            AssertNext(ts, "greetings_earth_$", 1, 7.2111025f, 0, 10);
            AssertNext(ts, "^_hello_tellus", 1, 10.099504f, 0, 10);
            AssertNext(ts, "^_hello_tellus_$", 1, 12.328828f, 0, 10);
            AssertNext(ts, "hello_tellus", 1, 1.4142135f, 0, 10);
            AssertNext(ts, "hello_tellus_$", 1, 7.2111025f, 0, 10);
            AssertNext(ts, "tellus", 1, 1.0f, 5, 10);
            AssertNext(ts, "tellus_$", 1, 7.1414285f, 5, 10);
            AssertNext(ts, "^_greetings_tellus", 1, 10.099504f, 0, 10);
            AssertNext(ts, "^_greetings_tellus_$", 1, 12.328828f, 0, 10);
            AssertNext(ts, "greetings_tellus", 1, 1.4142135f, 0, 10);
            AssertNext(ts, "greetings_tellus_$", 1, 7.2111025f, 0, 10);

            Assert.IsFalse(ts.IncrementToken());

            // test unlimited size but don't allow single boundary token as shingle
            // (fifth constructor argument true: the lone "^" and "$" shingles are gone,
            // the rest of the expected sequence is the same as above)

            tls.Reset();
            ts = new ShingleMatrixFilter(tls, 1, Int32.MaxValue, '_', true);

            AssertNext(ts, "^_hello", 1, 10.049875f, 0, 4);
            AssertNext(ts, "^_hello_world", 1, 10.099504f, 0, 10);
            AssertNext(ts, "^_hello_world_$", 1, 12.328828f, 0, 10);
            AssertNext(ts, "hello", 1, 1.0f, 0, 4);
            AssertNext(ts, "hello_world", 1, 1.4142135f, 0, 10);
            AssertNext(ts, "hello_world_$", 1, 7.2111025f, 0, 10);
            AssertNext(ts, "world", 1, 1.0f, 5, 10);
            AssertNext(ts, "world_$", 1, 7.1414285f, 5, 10);
            AssertNext(ts, "^_greetings", 1, 10.049875f, 0, 4);
            AssertNext(ts, "^_greetings_world", 1, 10.099504f, 0, 10);
            AssertNext(ts, "^_greetings_world_$", 1, 12.328828f, 0, 10);
            AssertNext(ts, "greetings", 1, 1.0f, 0, 4);
            AssertNext(ts, "greetings_world", 1, 1.4142135f, 0, 10);
            AssertNext(ts, "greetings_world_$", 1, 7.2111025f, 0, 10);
            AssertNext(ts, "^_hello_earth", 1, 10.099504f, 0, 10);
            AssertNext(ts, "^_hello_earth_$", 1, 12.328828f, 0, 10);
            AssertNext(ts, "hello_earth", 1, 1.4142135f, 0, 10);
            AssertNext(ts, "hello_earth_$", 1, 7.2111025f, 0, 10);
            AssertNext(ts, "earth", 1, 1.0f, 5, 10);
            AssertNext(ts, "earth_$", 1, 7.1414285f, 5, 10);
            AssertNext(ts, "^_greetings_earth", 1, 10.099504f, 0, 10);
            AssertNext(ts, "^_greetings_earth_$", 1, 12.328828f, 0, 10);
            AssertNext(ts, "greetings_earth", 1, 1.4142135f, 0, 10);
            AssertNext(ts, "greetings_earth_$", 1, 7.2111025f, 0, 10);
            AssertNext(ts, "^_hello_tellus", 1, 10.099504f, 0, 10);
            AssertNext(ts, "^_hello_tellus_$", 1, 12.328828f, 0, 10);
            AssertNext(ts, "hello_tellus", 1, 1.4142135f, 0, 10);
            AssertNext(ts, "hello_tellus_$", 1, 7.2111025f, 0, 10);
            AssertNext(ts, "tellus", 1, 1.0f, 5, 10);
            AssertNext(ts, "tellus_$", 1, 7.1414285f, 5, 10);
            AssertNext(ts, "^_greetings_tellus", 1, 10.099504f, 0, 10);
            AssertNext(ts, "^_greetings_tellus_$", 1, 12.328828f, 0, 10);
            AssertNext(ts, "greetings_tellus", 1, 1.4142135f, 0, 10);
            AssertNext(ts, "greetings_tellus_$", 1, 7.2111025f, 0, 10);


            Assert.IsFalse(ts.IncrementToken());

            // multi-token synonyms
            //
            // Token[][][] {
            //    {{hello}, {greetings, and, salutations}},
            //    {{world}, {earth}, {tellus}}
            // }
            //
            // "and" and "salutations" use SameRow, i.e. they continue the row started
            // by "greetings" instead of opening a new row or column.


            tokens = new LinkedList<Token>();
            tokens.AddLast(TokenFactory("hello", 1, 1f, 0, 4, TokenPositioner.NewColumn));
            tokens.AddLast(TokenFactory("greetings", 1, 1f, 0, 4, TokenPositioner.NewRow));
            tokens.AddLast(TokenFactory("and", 1, 1f, 0, 4, TokenPositioner.SameRow));
            tokens.AddLast(TokenFactory("salutations", 1, 1f, 0, 4, TokenPositioner.SameRow));
            tokens.AddLast(TokenFactory("world", 1, 1f, 5, 10, TokenPositioner.NewColumn));
            tokens.AddLast(TokenFactory("earth", 1, 1f, 5, 10, TokenPositioner.NewRow));
            tokens.AddLast(TokenFactory("tellus", 1, 1f, 5, 10, TokenPositioner.NewRow));

            tls = new TokenListStream(tokens);

            // 2-3 grams

            ts = new ShingleMatrixFilter(tls, 2, 3, '_', false);

            // shingle, position increment, weight, start offset, end offset

            AssertNext(ts, "hello_world", 1, 1.4142135f, 0, 10);
            AssertNext(ts, "greetings_and", 1, 1.4142135f, 0, 4);
            AssertNext(ts, "greetings_and_salutations", 1, 1.7320508f, 0, 4);
            AssertNext(ts, "and_salutations", 1, 1.4142135f, 0, 4);
            AssertNext(ts, "and_salutations_world", 1, 1.7320508f, 0, 10);
            AssertNext(ts, "salutations_world", 1, 1.4142135f, 0, 10);
            AssertNext(ts, "hello_earth", 1, 1.4142135f, 0, 10);
            AssertNext(ts, "and_salutations_earth", 1, 1.7320508f, 0, 10);
            AssertNext(ts, "salutations_earth", 1, 1.4142135f, 0, 10);
            AssertNext(ts, "hello_tellus", 1, 1.4142135f, 0, 10);
            AssertNext(ts, "and_salutations_tellus", 1, 1.7320508f, 0, 10);
            AssertNext(ts, "salutations_tellus", 1, 1.4142135f, 0, 10);

            // Nothing may follow the last expected shingle.
            Assert.IsFalse(ts.IncrementToken());
        }