Beispiel #1
0
        public override TokenStream TokenStream(string fieldName, TextReader reader)
        {
            // This should be a good tokenizer for most European-language documents:
            // Splits words at punctuation characters, removing punctuation.
            // Splits words at hyphens, unless there's a number in the token...
            // Recognizes email addresses and internet hostnames as one token.
            var intput = new StandardTokenizer(Version.LUCENE_30, reader);

            // A ShingleMatrixFilter constructs shingles from a token stream.
            // "2010 Audi RS5 Quattro Coupe" => "2010 Audi", "Audi RS5", "RS5 Quattro", "Quattro Coupe"
            var shingleMatrixOutput = new ShingleMatrixFilter(
                // stream from which to construct the matrix
                intput,
                // minimum number of tokens in any shingle
                2,
                // maximum number of tokens in any shingle.
                8,
                // character to use between texts of the token parts in a shingle.
                ' ');

            // Normalizes token text to lower case.
            var lowerCaseFilter = new LowerCaseFilter(shingleMatrixOutput);

            // Removes stop words from a token stream.
            return(new StopFilter(true, lowerCaseFilter, StopAnalyzer.ENGLISH_STOP_WORDS_SET));
        }
        public override TokenStream TokenStream(string fieldName, TextReader reader)
        {
            var tokenizer       = new StandardTokenizer(Version.LUCENE_30, reader);
            var shingleMatrix   = new ShingleMatrixFilter(tokenizer, 2, 8, ' ');
            var lowerCaseFilter = new LowerCaseFilter(shingleMatrix);

            return(new StopFilter(true, lowerCaseFilter, StopAnalyzer.ENGLISH_STOP_WORDS_SET));
        }
        public void TestBehavingAsShingleFilter()
        {
            ShingleMatrixFilter.DefaultSettingsCodec = null;

            TokenStream ts = new ShingleMatrixFilter(new EmptyTokenStream(), 1, 2, ' ', false,
                                                     new OneDimensionalNonWeightedTokenSettingsCodec
                                                         ());

            Assert.IsFalse(ts.IncrementToken());

            // test a plain old token stream with synonyms translated to rows.

            var tokens = new LinkedList <Token>();

            tokens.AddLast(CreateToken("please", 0, 6));
            tokens.AddLast(CreateToken("divide", 7, 13));
            tokens.AddLast(CreateToken("this", 14, 18));
            tokens.AddLast(CreateToken("sentence", 19, 27));
            tokens.AddLast(CreateToken("into", 28, 32));
            tokens.AddLast(CreateToken("shingles", 33, 39));

            var tls = new TokenListStream(tokens);

            // bi-grams

            ts = new ShingleMatrixFilter(tls, 1, 2, ' ', false, new OneDimensionalNonWeightedTokenSettingsCodec());

            //for (Token token = ts.Next(new Token()); token != null; token = ts.Next(token))
            //{
            //    Console.Out.WriteLine("AssertNext(ts, \"" + token.Term() + "\", " + token.GetPositionIncrement() + ", " + (token.GetPayload() == null ? "1.0" : PayloadHelper.decodeFloat(token.GetPayload().GetData()).ToString()) + "f, " + token.StartOffset() + ", " + token.EndOffset() + ");");
            //    token.Clear();
            //}

            AssertTokenStreamContents(ts,
                                      new[]
            {
                "please", "please divide", "divide", "divide this",
                "this", "this sentence", "sentence", "sentence into", "into",
                "into shingles", "shingles"
            },
                                      new[] { 0, 0, 7, 7, 14, 14, 19, 19, 28, 28, 33 },
                                      new[] { 6, 13, 13, 18, 18, 27, 27, 32, 32, 39, 39 });
        }
        public void TestIterator()
        {
            var wst = new WhitespaceTokenizer(new StringReader("one two three four five"));
            var smf = new ShingleMatrixFilter(wst, 2, 2, '_', false,
                                              new OneDimensionalNonWeightedTokenSettingsCodec());

            int i;

            for (i = 0; smf.IncrementToken(); i++)
            {
            }

            Assert.AreEqual(4, i);

            // call next once more. this should return false again rather than throwing an exception (LUCENE-1939)
            Assert.IsFalse(smf.IncrementToken());

            //System.DateTime.Now;
        }
        public override TokenStream TokenStream(string fieldName, TextReader reader)
        {
            TokenStream result = new ShingleMatrixFilter(new ASCIIFoldingFilter(new LowerCaseFilter(new SpecialNoneWhiteSpaceFilter(new CDRWhitespaceTokenizer(reader)))), minShingles, 2, '_', false, new Lucene.Net.Analysis.Shingle.Codec.TwoDimensionalNonWeightedSynonymTokenSettingsCodec());

            return(result);
        }
        public void TestMatrix()
        {
            // some other tests set this to null.
            // set it here in case tests are run out of the usual order.
            ShingleMatrixFilter.DefaultSettingsCodec = new SimpleThreeDimensionalTokenSettingsCodec();

            var matrix = new Matrix();

            new Column(TokenFactory("no", 1), matrix);
            new Column(TokenFactory("surprise", 1), matrix);
            new Column(TokenFactory("to", 1), matrix);
            new Column(TokenFactory("see", 1), matrix);
            new Column(TokenFactory("england", 1), matrix);
            new Column(TokenFactory("manager", 1), matrix);

            var col = new Column(matrix);

            // sven göran eriksson is a multi token synonym to svennis
            new Row(col).Tokens.AddLast(TokenFactory("svennis", 1));

            var row = new Row(col);

            row.Tokens.AddLast(TokenFactory("sven", 1));
            row.Tokens.AddLast(TokenFactory("göran", 1));
            row.Tokens.AddLast(TokenFactory("eriksson", 1));

            new Column(TokenFactory("in", 1), matrix);
            new Column(TokenFactory("the", 1), matrix);
            new Column(TokenFactory("croud", 1), matrix);

            TokenStream ts = new ShingleMatrixFilter(matrix, 2, 4, '_', true,
                                                     new SimpleThreeDimensionalTokenSettingsCodec());

            //  for (Token token = ts.next(new Token()); token != null; token = ts.next(token)) {
            //      System.out.println("assertNext(ts, \"" + token.term() + "\", " + token.getPositionIncrement() + ", " + (token.getPayload() == null ? "1.0" : PayloadHelper.decodeFloat(token.getPayload().getData())) + "f, " + token.startOffset() + ", " + token.endOffset() + ");");
            //      token.clear();
            //    }

            AssertNext(ts, "no_surprise", 1, 1.4142135f, 0, 0);
            AssertNext(ts, "no_surprise_to", 1, 1.7320508f, 0, 0);
            AssertNext(ts, "no_surprise_to_see", 1, 2.0f, 0, 0);
            AssertNext(ts, "surprise_to", 1, 1.4142135f, 0, 0);
            AssertNext(ts, "surprise_to_see", 1, 1.7320508f, 0, 0);
            AssertNext(ts, "surprise_to_see_england", 1, 2.0f, 0, 0);
            AssertNext(ts, "to_see", 1, 1.4142135f, 0, 0);
            AssertNext(ts, "to_see_england", 1, 1.7320508f, 0, 0);
            AssertNext(ts, "to_see_england_manager", 1, 2.0f, 0, 0);
            AssertNext(ts, "see_england", 1, 1.4142135f, 0, 0);
            AssertNext(ts, "see_england_manager", 1, 1.7320508f, 0, 0);
            AssertNext(ts, "see_england_manager_svennis", 1, 2.0f, 0, 0);
            AssertNext(ts, "england_manager", 1, 1.4142135f, 0, 0);
            AssertNext(ts, "england_manager_svennis", 1, 1.7320508f, 0, 0);
            AssertNext(ts, "england_manager_svennis_in", 1, 2.0f, 0, 0);
            AssertNext(ts, "manager_svennis", 1, 1.4142135f, 0, 0);
            AssertNext(ts, "manager_svennis_in", 1, 1.7320508f, 0, 0);
            AssertNext(ts, "manager_svennis_in_the", 1, 2.0f, 0, 0);
            AssertNext(ts, "svennis_in", 1, 1.4142135f, 0, 0);
            AssertNext(ts, "svennis_in_the", 1, 1.7320508f, 0, 0);
            AssertNext(ts, "svennis_in_the_croud", 1, 2.0f, 0, 0);
            AssertNext(ts, "in_the", 1, 1.4142135f, 0, 0);
            AssertNext(ts, "in_the_croud", 1, 1.7320508f, 0, 0);
            AssertNext(ts, "the_croud", 1, 1.4142135f, 0, 0);
            AssertNext(ts, "see_england_manager_sven", 1, 2.0f, 0, 0);
            AssertNext(ts, "england_manager_sven", 1, 1.7320508f, 0, 0);
            AssertNext(ts, "england_manager_sven_göran", 1, 2.0f, 0, 0);
            AssertNext(ts, "manager_sven", 1, 1.4142135f, 0, 0);
            AssertNext(ts, "manager_sven_göran", 1, 1.7320508f, 0, 0);
            AssertNext(ts, "manager_sven_göran_eriksson", 1, 2.0f, 0, 0);
            AssertNext(ts, "sven_göran", 1, 1.4142135f, 0, 0);
            AssertNext(ts, "sven_göran_eriksson", 1, 1.7320508f, 0, 0);
            AssertNext(ts, "sven_göran_eriksson_in", 1, 2.0f, 0, 0);
            AssertNext(ts, "göran_eriksson", 1, 1.4142135f, 0, 0);
            AssertNext(ts, "göran_eriksson_in", 1, 1.7320508f, 0, 0);
            AssertNext(ts, "göran_eriksson_in_the", 1, 2.0f, 0, 0);
            AssertNext(ts, "eriksson_in", 1, 1.4142135f, 0, 0);
            AssertNext(ts, "eriksson_in_the", 1, 1.7320508f, 0, 0);
            AssertNext(ts, "eriksson_in_the_croud", 1, 2.0f, 0, 0);

            Assert.IsFalse(ts.IncrementToken());
        }
        public void TestTokenStream()
        {
            ShingleMatrixFilter.DefaultSettingsCodec = null;
            //new ShingleMatrixFilter.SimpleThreeDimensionalTokenSettingsCodec();

            // test a plain old token stream with synonyms tranlated to rows.

            var tokens = new LinkedList <Token>();

            tokens.AddLast(TokenFactory("hello", 1, 0, 4));
            tokens.AddLast(TokenFactory("greetings", 0, 0, 4));
            tokens.AddLast(TokenFactory("world", 1, 5, 10));
            tokens.AddLast(TokenFactory("earth", 0, 5, 10));
            tokens.AddLast(TokenFactory("tellus", 0, 5, 10));

            TokenStream tls = new TokenListStream(tokens);

            // bi-grams

            TokenStream ts = new ShingleMatrixFilter(tls, 2, 2, '_', false,
                                                     new TwoDimensionalNonWeightedSynonymTokenSettingsCodec());

            AssertNext(ts, "hello_world");
            AssertNext(ts, "greetings_world");
            AssertNext(ts, "hello_earth");
            AssertNext(ts, "greetings_earth");
            AssertNext(ts, "hello_tellus");
            AssertNext(ts, "greetings_tellus");
            Assert.IsFalse(ts.IncrementToken());

            // bi-grams with no spacer character, start offset, end offset

            tls.Reset();
            ts = new ShingleMatrixFilter(tls, 2, 2, null, false,
                                         new TwoDimensionalNonWeightedSynonymTokenSettingsCodec());
            AssertNext(ts, "helloworld", 0, 10);
            AssertNext(ts, "greetingsworld", 0, 10);
            AssertNext(ts, "helloearth", 0, 10);
            AssertNext(ts, "greetingsearth", 0, 10);
            AssertNext(ts, "hellotellus", 0, 10);
            AssertNext(ts, "greetingstellus", 0, 10);
            Assert.IsFalse(ts.IncrementToken());


            // add ^_prefix_and_suffix_$
            //
            // using 3d codec as it supports weights

            ShingleMatrixFilter.DefaultSettingsCodec =
                new SimpleThreeDimensionalTokenSettingsCodec();

            tokens = new LinkedList <Token>();
            tokens.AddLast(TokenFactory("hello", 1, 1f, 0, 4, TokenPositioner.NewColumn));
            tokens.AddLast(TokenFactory("greetings", 0, 1f, 0, 4, TokenPositioner.NewRow));
            tokens.AddLast(TokenFactory("world", 1, 1f, 5, 10, TokenPositioner.NewColumn));
            tokens.AddLast(TokenFactory("earth", 0, 1f, 5, 10, TokenPositioner.NewRow));
            tokens.AddLast(TokenFactory("tellus", 0, 1f, 5, 10, TokenPositioner.NewRow));

            tls = new TokenListStream(tokens);

            // bi-grams, position incrememnt, weight, start offset, end offset

            ts = new PrefixAndSuffixAwareTokenFilter(
                new SingleTokenTokenStream(TokenFactory("^", 1, 100f, 0, 0)),
                tls,
                new SingleTokenTokenStream(TokenFactory("$", 1, 50f, 0, 0))
                );
            tls = new CachingTokenFilter(ts);

            ts = new ShingleMatrixFilter(tls, 2, 2, '_', false);

            //for (Token token = ts.Next(new Token()); token != null; token = ts.Next(token)) {
            //    Console.Out.WriteLine("AssertNext(ts, \"" + token.Term() + "\", " + token.GetPositionIncrement() + ", " + (token.GetPayload() == null ? "1.0" : PayloadHelper.decodeFloat(token.GetPayload().GetData()).ToString()) + "f, " + token.StartOffset() + ", " + token.EndOffset() + ");");
            //    token.Clear();
            //}

            AssertNext(ts, "^_hello", 1, 10.049875f, 0, 4);
            AssertNext(ts, "^_greetings", 1, 10.049875f, 0, 4);
            AssertNext(ts, "hello_world", 1, 1.4142135f, 0, 10);
            AssertNext(ts, "greetings_world", 1, 1.4142135f, 0, 10);
            AssertNext(ts, "hello_earth", 1, 1.4142135f, 0, 10);
            AssertNext(ts, "greetings_earth", 1, 1.4142135f, 0, 10);
            AssertNext(ts, "hello_tellus", 1, 1.4142135f, 0, 10);
            AssertNext(ts, "greetings_tellus", 1, 1.4142135f, 0, 10);
            AssertNext(ts, "world_$", 1, 7.1414285f, 5, 10);
            AssertNext(ts, "earth_$", 1, 7.1414285f, 5, 10);
            AssertNext(ts, "tellus_$", 1, 7.1414285f, 5, 10);
            Assert.IsFalse(ts.IncrementToken());

            // test unlimited size and allow single boundary token as shingle
            tls.Reset();

            ts = new ShingleMatrixFilter(tls, 1, Int32.MaxValue, '_', false);


            //for (Token token = ts.Next(new Token()); token != null; token = ts.Next(token))
            //{
            //    Console.Out.WriteLine("AssertNext(ts, \"" + token.Term() + "\", " + token.GetPositionIncrement() + ", " + (token.GetPayload() == null ? "1.0" : PayloadHelper.decodeFloat(token.GetPayload().GetData()).ToString()) + "f, " + token.StartOffset() + ", " + token.EndOffset() + ");");
            //    token.Clear();
            //}

            AssertNext(ts, "^", 1, 10.0f, 0, 0);
            AssertNext(ts, "^_hello", 1, 10.049875f, 0, 4);
            AssertNext(ts, "^_hello_world", 1, 10.099504f, 0, 10);
            AssertNext(ts, "^_hello_world_$", 1, 12.328828f, 0, 10);
            AssertNext(ts, "hello", 1, 1.0f, 0, 4);
            AssertNext(ts, "hello_world", 1, 1.4142135f, 0, 10);
            AssertNext(ts, "hello_world_$", 1, 7.2111025f, 0, 10);
            AssertNext(ts, "world", 1, 1.0f, 5, 10);
            AssertNext(ts, "world_$", 1, 7.1414285f, 5, 10);
            AssertNext(ts, "$", 1, 7.071068f, 10, 10);
            AssertNext(ts, "^_greetings", 1, 10.049875f, 0, 4);
            AssertNext(ts, "^_greetings_world", 1, 10.099504f, 0, 10);
            AssertNext(ts, "^_greetings_world_$", 1, 12.328828f, 0, 10);
            AssertNext(ts, "greetings", 1, 1.0f, 0, 4);
            AssertNext(ts, "greetings_world", 1, 1.4142135f, 0, 10);
            AssertNext(ts, "greetings_world_$", 1, 7.2111025f, 0, 10);
            AssertNext(ts, "^_hello_earth", 1, 10.099504f, 0, 10);
            AssertNext(ts, "^_hello_earth_$", 1, 12.328828f, 0, 10);
            AssertNext(ts, "hello_earth", 1, 1.4142135f, 0, 10);
            AssertNext(ts, "hello_earth_$", 1, 7.2111025f, 0, 10);
            AssertNext(ts, "earth", 1, 1.0f, 5, 10);
            AssertNext(ts, "earth_$", 1, 7.1414285f, 5, 10);
            AssertNext(ts, "^_greetings_earth", 1, 10.099504f, 0, 10);
            AssertNext(ts, "^_greetings_earth_$", 1, 12.328828f, 0, 10);
            AssertNext(ts, "greetings_earth", 1, 1.4142135f, 0, 10);
            AssertNext(ts, "greetings_earth_$", 1, 7.2111025f, 0, 10);
            AssertNext(ts, "^_hello_tellus", 1, 10.099504f, 0, 10);
            AssertNext(ts, "^_hello_tellus_$", 1, 12.328828f, 0, 10);
            AssertNext(ts, "hello_tellus", 1, 1.4142135f, 0, 10);
            AssertNext(ts, "hello_tellus_$", 1, 7.2111025f, 0, 10);
            AssertNext(ts, "tellus", 1, 1.0f, 5, 10);
            AssertNext(ts, "tellus_$", 1, 7.1414285f, 5, 10);
            AssertNext(ts, "^_greetings_tellus", 1, 10.099504f, 0, 10);
            AssertNext(ts, "^_greetings_tellus_$", 1, 12.328828f, 0, 10);
            AssertNext(ts, "greetings_tellus", 1, 1.4142135f, 0, 10);
            AssertNext(ts, "greetings_tellus_$", 1, 7.2111025f, 0, 10);

            Assert.IsFalse(ts.IncrementToken());

            // test unlimited size but don't allow single boundary token as shingle

            tls.Reset();
            ts = new ShingleMatrixFilter(tls, 1, Int32.MaxValue, '_', true);

            //  for (Token token = ts.next(new Token()); token != null; token = ts.next(token)) {
            //      System.out.println("assertNext(ts, \"" + token.term() + "\", " + token.getPositionIncrement() + ", " + (token.getPayload() == null ? "1.0" : PayloadHelper.decodeFloat(token.getPayload().getData())) + "f, " + token.startOffset() + ", " + token.endOffset() + ");");
            //      token.clear();
            //    }

            AssertNext(ts, "^_hello", 1, 10.049875f, 0, 4);
            AssertNext(ts, "^_hello_world", 1, 10.099504f, 0, 10);
            AssertNext(ts, "^_hello_world_$", 1, 12.328828f, 0, 10);
            AssertNext(ts, "hello", 1, 1.0f, 0, 4);
            AssertNext(ts, "hello_world", 1, 1.4142135f, 0, 10);
            AssertNext(ts, "hello_world_$", 1, 7.2111025f, 0, 10);
            AssertNext(ts, "world", 1, 1.0f, 5, 10);
            AssertNext(ts, "world_$", 1, 7.1414285f, 5, 10);
            AssertNext(ts, "^_greetings", 1, 10.049875f, 0, 4);
            AssertNext(ts, "^_greetings_world", 1, 10.099504f, 0, 10);
            AssertNext(ts, "^_greetings_world_$", 1, 12.328828f, 0, 10);
            AssertNext(ts, "greetings", 1, 1.0f, 0, 4);
            AssertNext(ts, "greetings_world", 1, 1.4142135f, 0, 10);
            AssertNext(ts, "greetings_world_$", 1, 7.2111025f, 0, 10);
            AssertNext(ts, "^_hello_earth", 1, 10.099504f, 0, 10);
            AssertNext(ts, "^_hello_earth_$", 1, 12.328828f, 0, 10);
            AssertNext(ts, "hello_earth", 1, 1.4142135f, 0, 10);
            AssertNext(ts, "hello_earth_$", 1, 7.2111025f, 0, 10);
            AssertNext(ts, "earth", 1, 1.0f, 5, 10);
            AssertNext(ts, "earth_$", 1, 7.1414285f, 5, 10);
            AssertNext(ts, "^_greetings_earth", 1, 10.099504f, 0, 10);
            AssertNext(ts, "^_greetings_earth_$", 1, 12.328828f, 0, 10);
            AssertNext(ts, "greetings_earth", 1, 1.4142135f, 0, 10);
            AssertNext(ts, "greetings_earth_$", 1, 7.2111025f, 0, 10);
            AssertNext(ts, "^_hello_tellus", 1, 10.099504f, 0, 10);
            AssertNext(ts, "^_hello_tellus_$", 1, 12.328828f, 0, 10);
            AssertNext(ts, "hello_tellus", 1, 1.4142135f, 0, 10);
            AssertNext(ts, "hello_tellus_$", 1, 7.2111025f, 0, 10);
            AssertNext(ts, "tellus", 1, 1.0f, 5, 10);
            AssertNext(ts, "tellus_$", 1, 7.1414285f, 5, 10);
            AssertNext(ts, "^_greetings_tellus", 1, 10.099504f, 0, 10);
            AssertNext(ts, "^_greetings_tellus_$", 1, 12.328828f, 0, 10);
            AssertNext(ts, "greetings_tellus", 1, 1.4142135f, 0, 10);
            AssertNext(ts, "greetings_tellus_$", 1, 7.2111025f, 0, 10);


            Assert.IsFalse(ts.IncrementToken());

            //System.currentTimeMillis();

            // multi-token synonyms
            //
            // Token[][][] {
            //    {{hello}, {greetings, and, salutations},
            //    {{world}, {earth}, {tellus}}
            // }
            //


            tokens = new LinkedList <Token>();
            tokens.AddLast(TokenFactory("hello", 1, 1f, 0, 4, TokenPositioner.NewColumn));
            tokens.AddLast(TokenFactory("greetings", 1, 1f, 0, 4, TokenPositioner.NewRow));
            tokens.AddLast(TokenFactory("and", 1, 1f, 0, 4, TokenPositioner.SameRow));
            tokens.AddLast(TokenFactory("salutations", 1, 1f, 0, 4, TokenPositioner.SameRow));
            tokens.AddLast(TokenFactory("world", 1, 1f, 5, 10, TokenPositioner.NewColumn));
            tokens.AddLast(TokenFactory("earth", 1, 1f, 5, 10, TokenPositioner.NewRow));
            tokens.AddLast(TokenFactory("tellus", 1, 1f, 5, 10, TokenPositioner.NewRow));

            tls = new TokenListStream(tokens);

            // 2-3 grams

            ts = new ShingleMatrixFilter(tls, 2, 3, '_', false);

            //  for (Token token = ts.next(new Token()); token != null; token = ts.next(token)) {
            //      System.out.println("assertNext(ts, \"" + token.term() + "\", " + token.getPositionIncrement() + ", " + (token.getPayload() == null ? "1.0" : PayloadHelper.decodeFloat(token.getPayload().getData())) + "f, " + token.startOffset() + ", " + token.endOffset() + ");");
            //      token.clear();
            //    }

            // shingle, position increment, weight, start offset, end offset

            AssertNext(ts, "hello_world", 1, 1.4142135f, 0, 10);
            AssertNext(ts, "greetings_and", 1, 1.4142135f, 0, 4);
            AssertNext(ts, "greetings_and_salutations", 1, 1.7320508f, 0, 4);
            AssertNext(ts, "and_salutations", 1, 1.4142135f, 0, 4);
            AssertNext(ts, "and_salutations_world", 1, 1.7320508f, 0, 10);
            AssertNext(ts, "salutations_world", 1, 1.4142135f, 0, 10);
            AssertNext(ts, "hello_earth", 1, 1.4142135f, 0, 10);
            AssertNext(ts, "and_salutations_earth", 1, 1.7320508f, 0, 10);
            AssertNext(ts, "salutations_earth", 1, 1.4142135f, 0, 10);
            AssertNext(ts, "hello_tellus", 1, 1.4142135f, 0, 10);
            AssertNext(ts, "and_salutations_tellus", 1, 1.7320508f, 0, 10);
            AssertNext(ts, "salutations_tellus", 1, 1.4142135f, 0, 10);

            Assert.IsFalse(ts.IncrementToken());

            //System.currentTimeMillis();
        }