/// <summary>
/// Checks that a "^" prefix token and a "$" suffix token are emitted around
/// the tokens a whitespace-mode MockTokenizer produces for "hello world".
/// </summary>
public virtual void Test()
{
    // Build the three component streams in the same order as the filter's arguments.
    var prefixStream = new SingleTokenTokenStream(CreateToken("^", 0, 0));
    var inputStream = new MockTokenizer(new StringReader("hello world"), MockTokenizer.WHITESPACE, false);
    var suffixStream = new SingleTokenTokenStream(CreateToken("$", 0, 0));

    var filter = new PrefixAndSuffixAwareTokenFilter(prefixStream, inputStream, suffixStream);

    // Expected terms, followed by two parallel int arrays
    // (presumably start and end offsets -- confirm against AssertTokenStreamContents).
    string[] expectedTerms = { "^", "hello", "world", "$" };
    int[] expectedFirst = { 0, 0, 6, 11 };
    int[] expectedSecond = { 0, 5, 11, 11 };

    AssertTokenStreamContents(filter, expectedTerms, expectedFirst, expectedSecond);
}
/// <summary>
/// Checks that a "^" prefix token and a "$" suffix token are emitted around
/// the tokens a WhitespaceTokenizer produces for "hello world".
/// </summary>
public void TestTokenStreamContents()
{
    // Component streams, created in the same order as the filter's arguments.
    SingleTokenTokenStream prefixStream = new SingleTokenTokenStream(CreateToken("^", 0, 0));
    WhitespaceTokenizer inputStream = new WhitespaceTokenizer(new StringReader("hello world"));
    SingleTokenTokenStream suffixStream = new SingleTokenTokenStream(CreateToken("$", 0, 0));

    PrefixAndSuffixAwareTokenFilter filter =
        new PrefixAndSuffixAwareTokenFilter(prefixStream, inputStream, suffixStream);

    // Expected terms, followed by two parallel int arrays
    // (presumably start and end offsets -- confirm against AssertTokenStreamContents).
    string[] expectedTerms = { "^", "hello", "world", "$" };
    int[] expectedFirst = { 0, 0, 6, 11 };
    int[] expectedSecond = { 0, 5, 11, 11 };

    AssertTokenStreamContents(filter, expectedTerms, expectedFirst, expectedSecond);
}
/// <summary>
/// Runs "hello world" through a whitespace-mode MockTokenizer wrapped by a
/// PrefixAndSuffixAwareTokenFilter and asserts that "^" and "$" bracket the output.
/// </summary>
public virtual void Test()
{
    // Streams are created in prefix, input, suffix order to match the constructor call.
    var head = new SingleTokenTokenStream(CreateToken("^", 0, 0));
    var body = new MockTokenizer(new StringReader("hello world"), MockTokenizer.WHITESPACE, false);
    var tail = new SingleTokenTokenStream(CreateToken("$", 0, 0));

    var stream = new PrefixAndSuffixAwareTokenFilter(head, body, tail);

    // The two int arrays are parallel to the term array
    // (presumably start/end offsets -- confirm against AssertTokenStreamContents).
    AssertTokenStreamContents(
        stream,
        new[] { "^", "hello", "world", "$" },
        new[] { 0, 0, 6, 11 },
        new[] { 0, 5, 11, 11 });
}
/// <summary>
/// Exercises ShingleMatrixFilter over hand-built token lists: plain synonym
/// rows, shingles without a spacer character, prefix/suffix-decorated input
/// with weights, unlimited shingle sizes, and multi-token synonyms.
/// </summary>
/// <remarks>
/// The test overwrites the static <c>ShingleMatrixFilter.DefaultSettingsCodec</c>;
/// the value found on entry is restored in a <c>finally</c> block so the change
/// does not leak into other tests, whatever the outcome of the assertions.
/// </remarks>
public void TestTokenStream()
{
    // Remember the process-wide default so it can be put back afterwards.
    var originalCodec = ShingleMatrixFilter.DefaultSettingsCodec;
    try
    {
        ShingleMatrixFilter.DefaultSettingsCodec = null;

        // test a plain old token stream with synonyms translated to rows.

        var tokens = new LinkedList<Token>();
        tokens.AddLast(TokenFactory("hello", 1, 0, 4));
        tokens.AddLast(TokenFactory("greetings", 0, 0, 4));
        tokens.AddLast(TokenFactory("world", 1, 5, 10));
        tokens.AddLast(TokenFactory("earth", 0, 5, 10));
        tokens.AddLast(TokenFactory("tellus", 0, 5, 10));

        TokenStream tls = new TokenListStream(tokens);

        // bi-grams

        TokenStream ts = new ShingleMatrixFilter(tls, 2, 2, '_', false, new TwoDimensionalNonWeightedSynonymTokenSettingsCodec());

        AssertNext(ts, "hello_world");
        AssertNext(ts, "greetings_world");
        AssertNext(ts, "hello_earth");
        AssertNext(ts, "greetings_earth");
        AssertNext(ts, "hello_tellus");
        AssertNext(ts, "greetings_tellus");
        Assert.IsFalse(ts.IncrementToken());

        // bi-grams with no spacer character, start offset, end offset

        tls.Reset();
        ts = new ShingleMatrixFilter(tls, 2, 2, null, false, new TwoDimensionalNonWeightedSynonymTokenSettingsCodec());
        AssertNext(ts, "helloworld", 0, 10);
        AssertNext(ts, "greetingsworld", 0, 10);
        AssertNext(ts, "helloearth", 0, 10);
        AssertNext(ts, "greetingsearth", 0, 10);
        AssertNext(ts, "hellotellus", 0, 10);
        AssertNext(ts, "greetingstellus", 0, 10);
        Assert.IsFalse(ts.IncrementToken());

        // add ^_prefix_and_suffix_$
        //
        // using 3d codec as it supports weights

        ShingleMatrixFilter.DefaultSettingsCodec = new SimpleThreeDimensionalTokenSettingsCodec();

        tokens = new LinkedList<Token>();
        tokens.AddLast(TokenFactory("hello", 1, 1f, 0, 4, TokenPositioner.NewColumn));
        tokens.AddLast(TokenFactory("greetings", 0, 1f, 0, 4, TokenPositioner.NewRow));
        tokens.AddLast(TokenFactory("world", 1, 1f, 5, 10, TokenPositioner.NewColumn));
        tokens.AddLast(TokenFactory("earth", 0, 1f, 5, 10, TokenPositioner.NewRow));
        tokens.AddLast(TokenFactory("tellus", 0, 1f, 5, 10, TokenPositioner.NewRow));

        tls = new TokenListStream(tokens);

        // bi-grams, position increment, weight, start offset, end offset

        ts = new PrefixAndSuffixAwareTokenFilter(
            new SingleTokenTokenStream(TokenFactory("^", 1, 100f, 0, 0)),
            tls,
            new SingleTokenTokenStream(TokenFactory("$", 1, 50f, 0, 0))
        );
        // Cache the decorated stream so it can be Reset() and replayed below.
        tls = new CachingTokenFilter(ts);

        ts = new ShingleMatrixFilter(tls, 2, 2, '_', false);

        AssertNext(ts, "^_hello", 1, 10.049875f, 0, 4);
        AssertNext(ts, "^_greetings", 1, 10.049875f, 0, 4);
        AssertNext(ts, "hello_world", 1, 1.4142135f, 0, 10);
        AssertNext(ts, "greetings_world", 1, 1.4142135f, 0, 10);
        AssertNext(ts, "hello_earth", 1, 1.4142135f, 0, 10);
        AssertNext(ts, "greetings_earth", 1, 1.4142135f, 0, 10);
        AssertNext(ts, "hello_tellus", 1, 1.4142135f, 0, 10);
        AssertNext(ts, "greetings_tellus", 1, 1.4142135f, 0, 10);
        AssertNext(ts, "world_$", 1, 7.1414285f, 5, 10);
        AssertNext(ts, "earth_$", 1, 7.1414285f, 5, 10);
        AssertNext(ts, "tellus_$", 1, 7.1414285f, 5, 10);
        Assert.IsFalse(ts.IncrementToken());

        // test unlimited size and allow single boundary token as shingle

        tls.Reset();
        ts = new ShingleMatrixFilter(tls, 1, Int32.MaxValue, '_', false);

        AssertNext(ts, "^", 1, 10.0f, 0, 0);
        AssertNext(ts, "^_hello", 1, 10.049875f, 0, 4);
        AssertNext(ts, "^_hello_world", 1, 10.099504f, 0, 10);
        AssertNext(ts, "^_hello_world_$", 1, 12.328828f, 0, 10);
        AssertNext(ts, "hello", 1, 1.0f, 0, 4);
        AssertNext(ts, "hello_world", 1, 1.4142135f, 0, 10);
        AssertNext(ts, "hello_world_$", 1, 7.2111025f, 0, 10);
        AssertNext(ts, "world", 1, 1.0f, 5, 10);
        AssertNext(ts, "world_$", 1, 7.1414285f, 5, 10);
        AssertNext(ts, "$", 1, 7.071068f, 10, 10);
        AssertNext(ts, "^_greetings", 1, 10.049875f, 0, 4);
        AssertNext(ts, "^_greetings_world", 1, 10.099504f, 0, 10);
        AssertNext(ts, "^_greetings_world_$", 1, 12.328828f, 0, 10);
        AssertNext(ts, "greetings", 1, 1.0f, 0, 4);
        AssertNext(ts, "greetings_world", 1, 1.4142135f, 0, 10);
        AssertNext(ts, "greetings_world_$", 1, 7.2111025f, 0, 10);
        AssertNext(ts, "^_hello_earth", 1, 10.099504f, 0, 10);
        AssertNext(ts, "^_hello_earth_$", 1, 12.328828f, 0, 10);
        AssertNext(ts, "hello_earth", 1, 1.4142135f, 0, 10);
        AssertNext(ts, "hello_earth_$", 1, 7.2111025f, 0, 10);
        AssertNext(ts, "earth", 1, 1.0f, 5, 10);
        AssertNext(ts, "earth_$", 1, 7.1414285f, 5, 10);
        AssertNext(ts, "^_greetings_earth", 1, 10.099504f, 0, 10);
        AssertNext(ts, "^_greetings_earth_$", 1, 12.328828f, 0, 10);
        AssertNext(ts, "greetings_earth", 1, 1.4142135f, 0, 10);
        AssertNext(ts, "greetings_earth_$", 1, 7.2111025f, 0, 10);
        AssertNext(ts, "^_hello_tellus", 1, 10.099504f, 0, 10);
        AssertNext(ts, "^_hello_tellus_$", 1, 12.328828f, 0, 10);
        AssertNext(ts, "hello_tellus", 1, 1.4142135f, 0, 10);
        AssertNext(ts, "hello_tellus_$", 1, 7.2111025f, 0, 10);
        AssertNext(ts, "tellus", 1, 1.0f, 5, 10);
        AssertNext(ts, "tellus_$", 1, 7.1414285f, 5, 10);
        AssertNext(ts, "^_greetings_tellus", 1, 10.099504f, 0, 10);
        AssertNext(ts, "^_greetings_tellus_$", 1, 12.328828f, 0, 10);
        AssertNext(ts, "greetings_tellus", 1, 1.4142135f, 0, 10);
        AssertNext(ts, "greetings_tellus_$", 1, 7.2111025f, 0, 10);
        Assert.IsFalse(ts.IncrementToken());

        // test unlimited size but don't allow single boundary token as shingle

        tls.Reset();
        ts = new ShingleMatrixFilter(tls, 1, Int32.MaxValue, '_', true);

        AssertNext(ts, "^_hello", 1, 10.049875f, 0, 4);
        AssertNext(ts, "^_hello_world", 1, 10.099504f, 0, 10);
        AssertNext(ts, "^_hello_world_$", 1, 12.328828f, 0, 10);
        AssertNext(ts, "hello", 1, 1.0f, 0, 4);
        AssertNext(ts, "hello_world", 1, 1.4142135f, 0, 10);
        AssertNext(ts, "hello_world_$", 1, 7.2111025f, 0, 10);
        AssertNext(ts, "world", 1, 1.0f, 5, 10);
        AssertNext(ts, "world_$", 1, 7.1414285f, 5, 10);
        AssertNext(ts, "^_greetings", 1, 10.049875f, 0, 4);
        AssertNext(ts, "^_greetings_world", 1, 10.099504f, 0, 10);
        AssertNext(ts, "^_greetings_world_$", 1, 12.328828f, 0, 10);
        AssertNext(ts, "greetings", 1, 1.0f, 0, 4);
        AssertNext(ts, "greetings_world", 1, 1.4142135f, 0, 10);
        AssertNext(ts, "greetings_world_$", 1, 7.2111025f, 0, 10);
        AssertNext(ts, "^_hello_earth", 1, 10.099504f, 0, 10);
        AssertNext(ts, "^_hello_earth_$", 1, 12.328828f, 0, 10);
        AssertNext(ts, "hello_earth", 1, 1.4142135f, 0, 10);
        AssertNext(ts, "hello_earth_$", 1, 7.2111025f, 0, 10);
        AssertNext(ts, "earth", 1, 1.0f, 5, 10);
        AssertNext(ts, "earth_$", 1, 7.1414285f, 5, 10);
        AssertNext(ts, "^_greetings_earth", 1, 10.099504f, 0, 10);
        AssertNext(ts, "^_greetings_earth_$", 1, 12.328828f, 0, 10);
        AssertNext(ts, "greetings_earth", 1, 1.4142135f, 0, 10);
        AssertNext(ts, "greetings_earth_$", 1, 7.2111025f, 0, 10);
        AssertNext(ts, "^_hello_tellus", 1, 10.099504f, 0, 10);
        AssertNext(ts, "^_hello_tellus_$", 1, 12.328828f, 0, 10);
        AssertNext(ts, "hello_tellus", 1, 1.4142135f, 0, 10);
        AssertNext(ts, "hello_tellus_$", 1, 7.2111025f, 0, 10);
        AssertNext(ts, "tellus", 1, 1.0f, 5, 10);
        AssertNext(ts, "tellus_$", 1, 7.1414285f, 5, 10);
        AssertNext(ts, "^_greetings_tellus", 1, 10.099504f, 0, 10);
        AssertNext(ts, "^_greetings_tellus_$", 1, 12.328828f, 0, 10);
        AssertNext(ts, "greetings_tellus", 1, 1.4142135f, 0, 10);
        AssertNext(ts, "greetings_tellus_$", 1, 7.2111025f, 0, 10);
        Assert.IsFalse(ts.IncrementToken());

        // multi-token synonyms
        //
        // Token[][][] {
        //    {{hello}, {greetings, and, salutations},
        //    {{world}, {earth}, {tellus}}
        // }
        //

        tokens = new LinkedList<Token>();
        tokens.AddLast(TokenFactory("hello", 1, 1f, 0, 4, TokenPositioner.NewColumn));
        tokens.AddLast(TokenFactory("greetings", 1, 1f, 0, 4, TokenPositioner.NewRow));
        tokens.AddLast(TokenFactory("and", 1, 1f, 0, 4, TokenPositioner.SameRow));
        tokens.AddLast(TokenFactory("salutations", 1, 1f, 0, 4, TokenPositioner.SameRow));
        tokens.AddLast(TokenFactory("world", 1, 1f, 5, 10, TokenPositioner.NewColumn));
        tokens.AddLast(TokenFactory("earth", 1, 1f, 5, 10, TokenPositioner.NewRow));
        tokens.AddLast(TokenFactory("tellus", 1, 1f, 5, 10, TokenPositioner.NewRow));

        tls = new TokenListStream(tokens);

        // 2-3 grams

        ts = new ShingleMatrixFilter(tls, 2, 3, '_', false);

        // shingle, position increment, weight, start offset, end offset

        AssertNext(ts, "hello_world", 1, 1.4142135f, 0, 10);
        AssertNext(ts, "greetings_and", 1, 1.4142135f, 0, 4);
        AssertNext(ts, "greetings_and_salutations", 1, 1.7320508f, 0, 4);
        AssertNext(ts, "and_salutations", 1, 1.4142135f, 0, 4);
        AssertNext(ts, "and_salutations_world", 1, 1.7320508f, 0, 10);
        AssertNext(ts, "salutations_world", 1, 1.4142135f, 0, 10);
        AssertNext(ts, "hello_earth", 1, 1.4142135f, 0, 10);
        AssertNext(ts, "and_salutations_earth", 1, 1.7320508f, 0, 10);
        AssertNext(ts, "salutations_earth", 1, 1.4142135f, 0, 10);
        AssertNext(ts, "hello_tellus", 1, 1.4142135f, 0, 10);
        AssertNext(ts, "and_salutations_tellus", 1, 1.7320508f, 0, 10);
        AssertNext(ts, "salutations_tellus", 1, 1.4142135f, 0, 10);
        Assert.IsFalse(ts.IncrementToken());
    }
    finally
    {
        // Undo the static mutation even when an assertion above throws.
        ShingleMatrixFilter.DefaultSettingsCodec = originalCodec;
    }
}