/// <summary>
/// Verifies replacement (keepOrig=false) multi-word outputs: one input token
/// expands into several output tokens at successive positions. The keepOrig=true
/// branch is kept (though unreachable at compile time) to document the expected
/// contrast in output.
/// </summary>
public virtual void TestBasic2()
{
    b = new SynonymMap.Builder(true);
    const bool keepOrig = false;
    Add("aaa", "aaaa1 aaaa2 aaaa3", keepOrig);
    Add("bbb", "bbbb1 bbbb2", keepOrig);

    // Exhaust and dispose the seed tokenizer once up front; Verify presumably
    // re-feeds it with new input — TODO confirm against Verify's implementation.
    tokensIn = new MockTokenizer(new StringReader("a"), MockTokenizer.WHITESPACE, true);
    tokensIn.Reset();
    assertTrue(tokensIn.IncrementToken());
    assertFalse(tokensIn.IncrementToken());
    tokensIn.End();
    tokensIn.Dispose();

    tokensOut = new SynonymFilter(tokensIn, b.Build(), true);
    termAtt = tokensOut.AddAttribute<ICharTermAttribute>();
    posIncrAtt = tokensOut.AddAttribute<IPositionIncrementAttribute>();
    posLenAtt = tokensOut.AddAttribute<IPositionLengthAttribute>();
    offsetAtt = tokensOut.AddAttribute<IOffsetAttribute>();

#pragma warning disable 162 // CS0162: unreachable code — keepOrig is a compile-time const
    if (keepOrig)
    {
        Verify("xyzzy bbb pot of gold", "xyzzy bbb/bbbb1 pot/bbbb2 of gold");
        Verify("xyzzy aaa pot of gold", "xyzzy aaa/aaaa1 pot/aaaa2 of/aaaa3 gold");
    }
    else
    {
        Verify("xyzzy bbb pot of gold", "xyzzy bbbb1 pot/bbbb2 of gold");
        Verify("xyzzy aaa pot of gold", "xyzzy aaaa1 pot/aaaa2 of/aaaa3 gold");
    }
    // BUGFIX: previously restored 612/618 (which were never disabled here),
    // leaving 162 disabled for the remainder of the file.
#pragma warning restore 162
}
/// <summary>
/// LimitTokenPositionFilter(maxTokenPosition=3) stacked on a synonym graph:
/// any token whose position exceeds 3 — including trailing words of a
/// multi-word synonym — must not be emitted.
/// </summary>
public virtual void TestMaxPosition3WithSynomyms()
{
    foreach (bool consumeAll in new bool[] { true, false })
    {
        MockTokenizer tokenizer = new MockTokenizer(new StringReader("one two three four five"), MockTokenizer.WHITESPACE, false);
        // Consumer-side checks are only valid when the whole stream is consumed.
        tokenizer.EnableChecks = consumeAll;

        SynonymMap.Builder builder = new SynonymMap.Builder(true);
        builder.Add(new CharsRef("one"), new CharsRef("first"), true);
        builder.Add(new CharsRef("one"), new CharsRef("alpha"), true);
        builder.Add(new CharsRef("one"), new CharsRef("beguine"), true);

        // Multi-word outputs, built by joining word arrays into a reusable CharsRef.
        CharsRef multiWord = new CharsRef();
        SynonymMap.Builder.Join(new string[] { "and", "indubitably", "single", "only" }, multiWord);
        builder.Add(new CharsRef("one"), multiWord, true);
        SynonymMap.Builder.Join(new string[] { "dopple", "ganger" }, multiWord);
        builder.Add(new CharsRef("two"), multiWord, true);
        SynonymMap synonyms = builder.Build();

        TokenStream stream = new SynonymFilter(tokenizer, synonyms, true);
        stream = new LimitTokenPositionFilter(stream, 3, consumeAll);

        // "only", the 4th word of multi-word synonym "and indubitably single only",
        // is dropped because its position is greater than 3.
        AssertTokenStreamContents(
            stream,
            new string[] { "one", "first", "alpha", "beguine", "and", "two", "indubitably", "dopple", "three", "single", "ganger" },
            new int[] { 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0 });
    }
}
/// <summary>
/// Random synonym maps followed by a random graph filter; relies on
/// CheckRandomData to exercise the combined token graph for correctness.
/// </summary>
public virtual void TestRandom2GraphAfter()
{
    int iterations = AtLeast(3);
    Random random = Random;
    for (int iter = 0; iter < iterations; iter++)
    {
        b = new SynonymMap.Builder(random.nextBoolean());
        int entryCount = AtLeast(10);
        for (int entry = 0; entry < entryCount; entry++)
        {
            Add(RandomNonEmptyString(), RandomNonEmptyString(), random.nextBoolean());
        }
        SynonymMap map = b.Build();
        bool ignoreCase = random.nextBoolean();

        // Tokenize -> synonyms -> random graph mutation.
        Analyzer analyzer = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
        {
            Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.SIMPLE, true);
            TokenStream syns = new SynonymFilter(tokenizer, map, ignoreCase);
            TokenStream graph = new MockGraphTokenFilter(Random, syns);
            return new TokenStreamComponents(tokenizer, graph);
        });

        CheckRandomData(random, analyzer, 100);
    }
}
/// <summary>
/// End-to-end check of single- and multi-word synonym matching: overlapping
/// matches, outputs extending past the input, duplicate rules for the same
/// input, mixed keepOrig settings, and the filter's capture-state counter.
/// NOTE: the Verify/CaptureCount sequence below is order-dependent — each
/// assertEquals checks the state left by the immediately preceding Verify.
/// </summary>
public virtual void TestBasic()
{
    b = new SynonymMap.Builder(true);
    Add("a", "foo", true);
    Add("a b", "bar fee", true);
    Add("b c", "dog collar", true);
    Add("c d", "dog harness holder extras", true);
    Add("m c e", "dog barks loudly", false);
    Add("i j k", "feep", true);
    // Two rules with the same input "e f": both outputs are emitted below.
    Add("e f", "foo bar", false);
    Add("e f", "baz bee", false);
    Add("z", "boo", false);
    Add("y", "bee", true);

    // Exhaust and dispose the seed tokenizer; Verify presumably resets it
    // with fresh input — TODO confirm against Verify's implementation.
    tokensIn = new MockTokenizer(new StringReader("a"), MockTokenizer.WHITESPACE, true);
    tokensIn.Reset();
    assertTrue(tokensIn.IncrementToken());
    assertFalse(tokensIn.IncrementToken());
    tokensIn.End();
    tokensIn.Dispose();

    tokensOut = new SynonymFilter(tokensIn, b.Build(), true);
    termAtt = tokensOut.AddAttribute <ICharTermAttribute>();
    posIncrAtt = tokensOut.AddAttribute <IPositionIncrementAttribute>();
    posLenAtt = tokensOut.AddAttribute <IPositionLengthAttribute>();
    offsetAtt = tokensOut.AddAttribute <IOffsetAttribute>();

    Verify("a b c", "a/bar b/fee c");

    // syn output extends beyond input tokens
    Verify("x a b c d", "x a/bar b/fee c/dog d/harness holder extras");

    Verify("a b a", "a/bar b/fee a/foo");

    // outputs that add to one another:
    Verify("c d c d", "c/dog d/harness c/holder/dog d/extras/harness holder extras");

    // two outputs for same input
    Verify("e f", "foo/baz bar/bee");

    // verify multi-word / single-output offsets:
    Verify("g i j k g", "g i/feep:7_3 j k g");

    // mixed keepOrig true/false:
    Verify("a m c e x", "a/foo dog barks loudly x");
    Verify("c d m c e x", "c/dog d/harness holder/dog extras/barks loudly x");
    assertTrue(tokensOut.CaptureCount > 0);

    // no captureStates when no syns matched
    Verify("p q r s t", "p q r s t");
    assertEquals(0, tokensOut.CaptureCount);

    // no captureStates when only single-input syns, w/ no
    // lookahead needed, matched
    Verify("p q z y t", "p q boo y/bee t");
    assertEquals(0, tokensOut.CaptureCount);
}
/// <summary>
/// Builds a SIMPLE tokenizer -> SynonymFilter -> MockGraphTokenFilter chain.
/// </summary>
public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    Tokenizer source = new MockTokenizer(reader, MockTokenizer.SIMPLE, true);
    TokenStream synonyms = new SynonymFilter(source, map, ignoreCase);
    TokenStream graph = new MockGraphTokenFilter(Random(), synonyms);
    return new TokenStreamComponents(source, graph);
}
/// <summary>
/// When a synonym output token extends past the final input token, its end
/// offset must be inherited from the preceding input token.
/// </summary>
public virtual void TestOutputHangsOffEnd()
{
    b = new SynonymMap.Builder(true);
    const bool keepOrig = false;
    // b hangs off the end (no input token under it):
    Add("a", "a b", keepOrig);

    // Exhaust and dispose the seed tokenizer before wrapping it.
    tokensIn = new MockTokenizer(new StringReader("a"), MockTokenizer.WHITESPACE, true);
    tokensIn.Reset();
    assertTrue(tokensIn.IncrementToken());
    assertFalse(tokensIn.IncrementToken());
    tokensIn.End();
    tokensIn.Dispose();

    tokensOut = new SynonymFilter(tokensIn, b.Build(), true);
    termAtt = tokensOut.AddAttribute<ICharTermAttribute>();
    posIncrAtt = tokensOut.AddAttribute<IPositionIncrementAttribute>();
    offsetAtt = tokensOut.AddAttribute<IOffsetAttribute>();
    posLenAtt = tokensOut.AddAttribute<IPositionLengthAttribute>();

    // Make sure endOffset inherits from previous input token:
    Verify("a", "a b:1");
}
/// <summary>
/// Randomized torture test: builds a random synonym map over a small
/// alphabet, runs a long random document through the filter, and compares
/// the output against the SlowSynMatcher reference implementation.
/// Statement order is preserved exactly — each helper call consumes
/// shared Random state.
/// </summary>
public virtual void TestRandom()
{
    int alphaSize = TestUtil.NextInt32(Random, 2, 7);
    int docLen = AtLeast(3000);
    //final int docLen = 50;
    string doc = GetRandomString('a', alphaSize, docLen);
    if (Verbose)
    {
        Console.WriteLine("TEST: doc=" + doc);
    }
    int synCount = AtLeast(5);
    //final int numSyn = 2;

    IDictionary<string, OneSyn> synMap = new Dictionary<string, OneSyn>();
    IList<OneSyn> syns = new JCG.List<OneSyn>();
    bool dedup = Random.nextBoolean();
    if (Verbose)
    {
        Console.WriteLine(" dedup=" + dedup);
    }
    b = new SynonymMap.Builder(dedup);
    for (int idx = 0; idx < synCount; idx++)
    {
        string input = GetRandomString('a', alphaSize, TestUtil.NextInt32(Random, 1, 5)).Trim();
        if (!synMap.TryGetValue(input, out OneSyn entry) || entry is null)
        {
            entry = new OneSyn();
            entry.@in = input;
            syns.Add(entry);
            entry.@out = new JCG.List<string>();
            synMap[input] = entry;
            entry.keepOrig = Random.nextBoolean();
        }
        string output = GetRandomString('0', 10, TestUtil.NextInt32(Random, 1, 5)).Trim();
        [email protected](output);
        Add(input, output, entry.keepOrig);
        if (Verbose)
        {
            Console.WriteLine(" syns[" + idx + "] = " + entry.@in + " -> " + entry.@out + " keepOrig=" + entry.keepOrig);
        }
    }

    // Exhaust and dispose the seed tokenizer before wrapping it.
    tokensIn = new MockTokenizer(new StringReader("a"), MockTokenizer.WHITESPACE, true);
    tokensIn.Reset();
    assertTrue(tokensIn.IncrementToken());
    assertFalse(tokensIn.IncrementToken());
    tokensIn.End();
    tokensIn.Dispose();

    tokensOut = new SynonymFilter(tokensIn, b.Build(), true);
    termAtt = tokensOut.AddAttribute<ICharTermAttribute>();
    posIncrAtt = tokensOut.AddAttribute<IPositionIncrementAttribute>();
    posLenAtt = tokensOut.AddAttribute<IPositionLengthAttribute>();
    offsetAtt = tokensOut.AddAttribute<IOffsetAttribute>();

    if (dedup)
    {
        PruneDups(syns);
    }

    string expected = SlowSynMatcher(doc, syns, 5);

    if (Verbose)
    {
        Console.WriteLine("TEST: expected=" + expected);
    }
    Verify(doc, expected);
}
/// <summary>
/// Builds a SIMPLE tokenizer -> SynonymFilter -> RemoveDuplicatesTokenFilter chain.
/// </summary>
public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    Tokenizer source = new MockTokenizer(reader, MockTokenizer.SIMPLE, true);
    TokenStream synonyms = new SynonymFilter(source, map, ignoreCase);
    TokenStream deduped = new RemoveDuplicatesTokenFilter(synonyms);
    return new TokenStreamComponents(source, deduped);
}
/// <summary>
/// Verifies replacement (keepOrig=false) multi-word outputs: one input token
/// expands into several output tokens at successive positions. The keepOrig=true
/// branch is kept (though unreachable at compile time) to document the expected
/// contrast in output.
/// </summary>
public virtual void TestBasic2()
{
    b = new SynonymMap.Builder(true);
    const bool keepOrig = false;
    Add("aaa", "aaaa1 aaaa2 aaaa3", keepOrig);
    Add("bbb", "bbbb1 bbbb2", keepOrig);

    // Exhaust and dispose the seed tokenizer once up front; Verify presumably
    // re-feeds it with new input — TODO confirm against Verify's implementation.
    tokensIn = new MockTokenizer(new StringReader("a"), MockTokenizer.WHITESPACE, true);
    tokensIn.Reset();
    assertTrue(tokensIn.IncrementToken());
    assertFalse(tokensIn.IncrementToken());
    tokensIn.End();
    tokensIn.Dispose();

    tokensOut = new SynonymFilter(tokensIn, b.Build(), true);
    termAtt = tokensOut.AddAttribute<ICharTermAttribute>();
    posIncrAtt = tokensOut.AddAttribute<IPositionIncrementAttribute>();
    posLenAtt = tokensOut.AddAttribute<IPositionLengthAttribute>();
    offsetAtt = tokensOut.AddAttribute<IOffsetAttribute>();

#pragma warning disable 162 // CS0162: unreachable code — keepOrig is a compile-time const
    if (keepOrig)
    {
        Verify("xyzzy bbb pot of gold", "xyzzy bbb/bbbb1 pot/bbbb2 of gold");
        Verify("xyzzy aaa pot of gold", "xyzzy aaa/aaaa1 pot/aaaa2 of/aaaa3 gold");
    }
    else
    {
        Verify("xyzzy bbb pot of gold", "xyzzy bbbb1 pot/bbbb2 of gold");
        Verify("xyzzy aaa pot of gold", "xyzzy aaaa1 pot/aaaa2 of/aaaa3 gold");
    }
    // BUGFIX: previously restored 612/618 (which were never disabled here),
    // leaving 162 disabled for the remainder of the file.
#pragma warning restore 162
}
/// <summary>
/// Builds a SIMPLE tokenizer -> SynonymFilter -> MockGraphTokenFilter chain.
/// </summary>
public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    Tokenizer src = new MockTokenizer(reader, MockTokenizer.SIMPLE, true);
    TokenStream withSynonyms = new SynonymFilter(src, map, ignoreCase);
    TokenStream withGraph = new MockGraphTokenFilter(Random(), withSynonyms);
    return new TokenStreamComponents(src, withGraph);
}
/// <summary>
/// Randomized torture test: builds a random synonym map over a small
/// alphabet, runs a long random document through the filter, and compares
/// the output against the SlowSynMatcher reference implementation.
/// </summary>
public virtual void TestRandom()
{
    int alphabetSize = TestUtil.NextInt(Random(), 2, 7);
    int docLen = AtLeast(3000);
    //final int docLen = 50;
    string document = GetRandomString('a', alphabetSize, docLen);
    if (VERBOSE)
    {
        Console.WriteLine("TEST: doc=" + document);
    }
    int numSyn = AtLeast(5);
    //final int numSyn = 2;

    IDictionary<string, OneSyn> synMap = new Dictionary<string, OneSyn>();
    IList<OneSyn> syns = new List<OneSyn>();
    bool dedup = Random().nextBoolean();
    if (VERBOSE)
    {
        Console.WriteLine(" dedup=" + dedup);
    }
    b = new SynonymMap.Builder(dedup);
    for (int synIDX = 0; synIDX < numSyn; synIDX++)
    {
        string synIn = GetRandomString('a', alphabetSize, TestUtil.NextInt(Random(), 1, 5)).Trim();
        // Single TryGetValue instead of ContainsKey + indexer (avoids a double lookup).
        if (!synMap.TryGetValue(synIn, out OneSyn s) || s == null)
        {
            s = new OneSyn();
            s.@in = synIn;
            syns.Add(s);
            s.@out = new List<string>();
            synMap[synIn] = s;
            s.keepOrig = Random().nextBoolean();
        }
        string synOut = GetRandomString('0', 10, TestUtil.NextInt(Random(), 1, 5)).Trim();
        [email protected](synOut);
        Add(synIn, synOut, s.keepOrig);
        if (VERBOSE)
        {
            Console.WriteLine(" syns[" + synIDX + "] = " + s.@in + " -> " + s.@out + " keepOrig=" + s.keepOrig);
        }
    }

    // Exhaust and dispose the seed tokenizer before wrapping it.
    tokensIn = new MockTokenizer(new StringReader("a"), MockTokenizer.WHITESPACE, true);
    tokensIn.Reset();
    assertTrue(tokensIn.IncrementToken());
    assertFalse(tokensIn.IncrementToken());
    tokensIn.End();
    tokensIn.Dispose();

    tokensOut = new SynonymFilter(tokensIn, b.Build(), true);
    termAtt = tokensOut.AddAttribute<ICharTermAttribute>();
    posIncrAtt = tokensOut.AddAttribute<IPositionIncrementAttribute>();
    posLenAtt = tokensOut.AddAttribute<IPositionLengthAttribute>();
    offsetAtt = tokensOut.AddAttribute<IOffsetAttribute>();

    if (dedup)
    {
        PruneDups(syns);
    }

    string expected = SlowSynMatcher(document, syns, 5);

    if (VERBOSE)
    {
        Console.WriteLine("TEST: expected=" + expected);
    }
    Verify(document, expected);
}
/// <summary>
/// End-to-end check of single- and multi-word synonym matching: overlapping
/// matches, outputs extending past the input, duplicate rules for the same
/// input, mixed keepOrig settings, and the filter's capture-state counter.
/// NOTE: the Verify/CaptureCount sequence below is order-dependent — each
/// assertEquals checks the state left by the immediately preceding Verify.
/// </summary>
public virtual void TestBasic()
{
    b = new SynonymMap.Builder(true);
    Add("a", "foo", true);
    Add("a b", "bar fee", true);
    Add("b c", "dog collar", true);
    Add("c d", "dog harness holder extras", true);
    Add("m c e", "dog barks loudly", false);
    Add("i j k", "feep", true);
    // Two rules with the same input "e f": both outputs are emitted below.
    Add("e f", "foo bar", false);
    Add("e f", "baz bee", false);
    Add("z", "boo", false);
    Add("y", "bee", true);

    // Exhaust and dispose the seed tokenizer; Verify presumably resets it
    // with fresh input — TODO confirm against Verify's implementation.
    tokensIn = new MockTokenizer(new StringReader("a"), MockTokenizer.WHITESPACE, true);
    tokensIn.Reset();
    assertTrue(tokensIn.IncrementToken());
    assertFalse(tokensIn.IncrementToken());
    tokensIn.End();
    tokensIn.Dispose();

    tokensOut = new SynonymFilter(tokensIn, b.Build(), true);
    termAtt = tokensOut.AddAttribute<ICharTermAttribute>();
    posIncrAtt = tokensOut.AddAttribute<IPositionIncrementAttribute>();
    posLenAtt = tokensOut.AddAttribute<IPositionLengthAttribute>();
    offsetAtt = tokensOut.AddAttribute<IOffsetAttribute>();

    Verify("a b c", "a/bar b/fee c");

    // syn output extends beyond input tokens
    Verify("x a b c d", "x a/bar b/fee c/dog d/harness holder extras");

    Verify("a b a", "a/bar b/fee a/foo");

    // outputs that add to one another:
    Verify("c d c d", "c/dog d/harness c/holder/dog d/extras/harness holder extras");

    // two outputs for same input
    Verify("e f", "foo/baz bar/bee");

    // verify multi-word / single-output offsets:
    Verify("g i j k g", "g i/feep:7_3 j k g");

    // mixed keepOrig true/false:
    Verify("a m c e x", "a/foo dog barks loudly x");
    Verify("c d m c e x", "c/dog d/harness holder/dog extras/barks loudly x");
    assertTrue(tokensOut.CaptureCount > 0);

    // no captureStates when no syns matched
    Verify("p q r s t", "p q r s t");
    assertEquals(0, tokensOut.CaptureCount);

    // no captureStates when only single-input syns, w/ no
    // lookahead needed, matched
    Verify("p q z y t", "p q boo y/bee t");
    assertEquals(0, tokensOut.CaptureCount);
}
/// <summary>
/// A synonym output ("b") hangs past the last input token; the filter must
/// give it an end offset inherited from the previous input token.
/// </summary>
public virtual void TestOutputHangsOffEnd()
{
    b = new SynonymMap.Builder(true);
    const bool keepOrig = false;
    // b hangs off the end (no input token under it):
    Add("a", "a b", keepOrig);

    // Run the seed tokenizer through one full lifecycle, then dispose it.
    tokensIn = new MockTokenizer(new StringReader("a"), MockTokenizer.WHITESPACE, true);
    tokensIn.Reset();
    assertTrue(tokensIn.IncrementToken());
    assertFalse(tokensIn.IncrementToken());
    tokensIn.End();
    tokensIn.Dispose();

    tokensOut = new SynonymFilter(tokensIn, b.Build(), true);
    termAtt = tokensOut.AddAttribute<ICharTermAttribute>();
    posIncrAtt = tokensOut.AddAttribute<IPositionIncrementAttribute>();
    offsetAtt = tokensOut.AddAttribute<IOffsetAttribute>();
    posLenAtt = tokensOut.AddAttribute<IPositionLengthAttribute>();

    // Make sure endOffset inherits from previous input token:
    Verify("a", "a b:1");
}