Example No. 1
        public virtual void TestBasic2()
        {
            b = new SynonymMap.Builder(true);
            const bool keepOrig = false;

            Add("aaa", "aaaa1 aaaa2 aaaa3", keepOrig);
            Add("bbb", "bbbb1 bbbb2", keepOrig);
            tokensIn = new MockTokenizer(new StringReader("a"), MockTokenizer.WHITESPACE, true);
            tokensIn.Reset();
            assertTrue(tokensIn.IncrementToken());
            assertFalse(tokensIn.IncrementToken());
            tokensIn.End();
            tokensIn.Dispose();

            tokensOut  = new SynonymFilter(tokensIn, b.Build(), true);
            termAtt    = tokensOut.AddAttribute <ICharTermAttribute>();
            posIncrAtt = tokensOut.AddAttribute <IPositionIncrementAttribute>();
            posLenAtt  = tokensOut.AddAttribute <IPositionLengthAttribute>();
            offsetAtt  = tokensOut.AddAttribute <IOffsetAttribute>();

#pragma warning disable 162
            if (keepOrig)
            {
                Verify("xyzzy bbb pot of gold", "xyzzy bbb/bbbb1 pot/bbbb2 of gold");
                Verify("xyzzy aaa pot of gold", "xyzzy aaa/aaaa1 pot/aaaa2 of/aaaa3 gold");
            }
            else
            {
                Verify("xyzzy bbb pot of gold", "xyzzy bbbb1 pot/bbbb2 of gold");
                Verify("xyzzy aaa pot of gold", "xyzzy aaaa1 pot/aaaa2 of/aaaa3 gold");
            }
#pragma warning restore 162
        }
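
The expected strings passed to Verify in these tests use a compact notation. The reading below is inferred from the assertions in this listing (for example "feep:7_3" and "b:1" further down), not taken from separate documentation:

            // Inferred Verify notation (an assumption based on the expected strings in these tests):
            //   "tok1 tok2"  -> tokens at consecutive positions (position increment 1)
            //   "tok1/tok2"  -> tokens stacked at the same position (position increment 0 for tok2)
            //   "tok:7"      -> token whose end offset is expected to be 7
            //   "tok:7_3"    -> token with end offset 7 and position length 3
            Verify("xyzzy bbb pot of gold", "xyzzy bbbb1 pot/bbbb2 of gold"); // "bbbb2" is stacked on "pot"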
        public virtual void TestMaxPosition3WithSynomyms()
        {
            foreach (bool consumeAll in new bool[] { true, false })
            {
                MockTokenizer tokenizer = new MockTokenizer(new StringReader("one two three four five"), MockTokenizer.WHITESPACE, false);
                // if we are consuming all tokens, we can use the checks, otherwise we can't
                tokenizer.EnableChecks = consumeAll;

                SynonymMap.Builder builder = new SynonymMap.Builder(true);
                builder.Add(new CharsRef("one"), new CharsRef("first"), true);
                builder.Add(new CharsRef("one"), new CharsRef("alpha"), true);
                builder.Add(new CharsRef("one"), new CharsRef("beguine"), true);
                CharsRef multiWordCharsRef = new CharsRef();
                SynonymMap.Builder.Join(new string[] { "and", "indubitably", "single", "only" }, multiWordCharsRef);
                builder.Add(new CharsRef("one"), multiWordCharsRef, true);
                SynonymMap.Builder.Join(new string[] { "dopple", "ganger" }, multiWordCharsRef);
                builder.Add(new CharsRef("two"), multiWordCharsRef, true);
                SynonymMap synonymMap = builder.Build();
                TokenStream stream = new SynonymFilter(tokenizer, synonymMap, true);
                stream = new LimitTokenPositionFilter(stream, 3, consumeAll);

                // "only", the 4th word of multi-word synonym "and indubitably single only" is not emitted, since its position is greater than 3.
                AssertTokenStreamContents(stream, new string[] { "one", "first", "alpha", "beguine", "and", "two", "indubitably", "dopple", "three", "single", "ganger" }, new int[] { 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0 });
            }
        }
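
For contrast, here is a minimal sketch (not part of the original listing) of the same position limit applied to a plain token stream with no synonyms. It assumes the AssertTokenStreamContents helper from BaseTokenStreamTestCase used elsewhere in these tests; the method name is hypothetical:

        public virtual void TestMaxPosition3NoSynonymsSketch()
        {
            // With no synonym graph in play, limiting to 3 positions simply keeps the
            // first three whitespace-separated tokens.
            MockTokenizer tokenizer = new MockTokenizer(new StringReader("one two three four five"), MockTokenizer.WHITESPACE, false);
            tokenizer.EnableChecks = false; // the stream is not fully consumed, so disable exhaustion checks

            TokenStream stream = new LimitTokenPositionFilter(tokenizer, 3, false);
            AssertTokenStreamContents(stream, new string[] { "one", "two", "three" }, new int[] { 1, 1, 1 });
        }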
Example No. 3
        public virtual void TestRandom2GraphAfter()
        {
            int    numIters = AtLeast(3);
            Random random   = Random;

            for (int i = 0; i < numIters; i++)
            {
                b = new SynonymMap.Builder(random.nextBoolean());
                int numEntries = AtLeast(10);
                for (int j = 0; j < numEntries; j++)
                {
                    Add(RandomNonEmptyString(), RandomNonEmptyString(), random.nextBoolean());
                }
                SynonymMap map        = b.Build();
                bool       ignoreCase = random.nextBoolean();

                Analyzer analyzer = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
                {
                    Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.SIMPLE, true);
                    TokenStream syns    = new SynonymFilter(tokenizer, map, ignoreCase);
                    TokenStream graph   = new MockGraphTokenFilter(Random, syns);
                    return new TokenStreamComponents(tokenizer, graph);
                });

                CheckRandomData(random, analyzer, 100);
            }
        }
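
Outside of CheckRandomData, the tokens such an analyzer produces can be inspected by hand. Below is a minimal sketch (not from the original tests) relying only on the standard TokenStream contract (Reset / IncrementToken / End / Dispose) already used above; the helper name is hypothetical:

        private static void PrintTokens(Analyzer analyzer, string text)
        {
            // Dump each token with its position increment; useful when eyeballing how
            // SynonymFilter stacks outputs onto input positions.
            TokenStream ts = analyzer.GetTokenStream("field", new StringReader(text));
            ICharTermAttribute termAtt = ts.AddAttribute<ICharTermAttribute>();
            IPositionIncrementAttribute posIncrAtt = ts.AddAttribute<IPositionIncrementAttribute>();
            ts.Reset();
            while (ts.IncrementToken())
            {
                Console.WriteLine(termAtt.ToString() + " (posIncr=" + posIncrAtt.PositionIncrement + ")");
            }
            ts.End();
            ts.Dispose();
        }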
Example No. 4
        public virtual void TestBasic()
        {
            b = new SynonymMap.Builder(true);
            Add("a", "foo", true);
            Add("a b", "bar fee", true);
            Add("b c", "dog collar", true);
            Add("c d", "dog harness holder extras", true);
            Add("m c e", "dog barks loudly", false);
            Add("i j k", "feep", true);

            Add("e f", "foo bar", false);
            Add("e f", "baz bee", false);

            Add("z", "boo", false);
            Add("y", "bee", true);

            tokensIn = new MockTokenizer(new StringReader("a"), MockTokenizer.WHITESPACE, true);
            tokensIn.Reset();
            assertTrue(tokensIn.IncrementToken());
            assertFalse(tokensIn.IncrementToken());
            tokensIn.End();
            tokensIn.Dispose();

            tokensOut  = new SynonymFilter(tokensIn, b.Build(), true);
            termAtt    = tokensOut.AddAttribute <ICharTermAttribute>();
            posIncrAtt = tokensOut.AddAttribute <IPositionIncrementAttribute>();
            posLenAtt  = tokensOut.AddAttribute <IPositionLengthAttribute>();
            offsetAtt  = tokensOut.AddAttribute <IOffsetAttribute>();

            Verify("a b c", "a/bar b/fee c");

            // syn output extends beyond input tokens
            Verify("x a b c d", "x a/bar b/fee c/dog d/harness holder extras");

            Verify("a b a", "a/bar b/fee a/foo");

            // outputs that add to one another:
            Verify("c d c d", "c/dog d/harness c/holder/dog d/extras/harness holder extras");

            // two outputs for same input
            Verify("e f", "foo/baz bar/bee");

            // verify multi-word / single-output offsets:
            Verify("g i j k g", "g i/feep:7_3 j k g");

            // mixed keepOrig true/false:
            Verify("a m c e x", "a/foo dog barks loudly x");
            Verify("c d m c e x", "c/dog d/harness holder/dog extras/barks loudly x");
            assertTrue(tokensOut.CaptureCount > 0);

            // no captureStates when no syns matched
            Verify("p q r s t", "p q r s t");
            assertEquals(0, tokensOut.CaptureCount);

            // no captureStates when only single-input syns, w/ no
            // lookahead needed, matched
            Verify("p q z y t", "p q boo y/bee t");
            assertEquals(0, tokensOut.CaptureCount);
        }
            public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
            {
                Tokenizer   tokenizer = new MockTokenizer(reader, MockTokenizer.SIMPLE, true);
                TokenStream syns      = new SynonymFilter(tokenizer, map, ignoreCase);
                TokenStream graph     = new MockGraphTokenFilter(Random(), syns);

                return new TokenStreamComponents(tokenizer, graph);
            }
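
This override is only a fragment; the enclosing analyzer class holding the map and ignoreCase fields is not shown in the listing. Assuming those two values are in scope, the same wiring can also be written inline with Analyzer.NewAnonymous, exactly as Example No. 3 above does (sketch using the newer test API where Random is a property):

            Analyzer analyzer = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
            {
                // Same components as the fragment above: SIMPLE tokenizer -> SynonymFilter -> MockGraphTokenFilter
                Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.SIMPLE, true);
                TokenStream syns = new SynonymFilter(tokenizer, map, ignoreCase);
                TokenStream graph = new MockGraphTokenFilter(Random, syns);
                return new TokenStreamComponents(tokenizer, graph);
            });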
Example No. 6
        public virtual void TestOutputHangsOffEnd()
        {
            b = new SynonymMap.Builder(true);
            const bool keepOrig = false;

            // b hangs off the end (no input token under it):
            Add("a", "a b", keepOrig);
            tokensIn = new MockTokenizer(new StringReader("a"), MockTokenizer.WHITESPACE, true);
            tokensIn.Reset();
            assertTrue(tokensIn.IncrementToken());
            assertFalse(tokensIn.IncrementToken());
            tokensIn.End();
            tokensIn.Dispose();

            tokensOut  = new SynonymFilter(tokensIn, b.Build(), true);
            termAtt    = tokensOut.AddAttribute <ICharTermAttribute>();
            posIncrAtt = tokensOut.AddAttribute <IPositionIncrementAttribute>();
            offsetAtt  = tokensOut.AddAttribute <IOffsetAttribute>();
            posLenAtt  = tokensOut.AddAttribute <IPositionLengthAttribute>();

            // Make sure endOffset inherits from previous input token:
            Verify("a", "a b:1");
        }
Example No. 7
        public virtual void TestRandom()
        {
            int alphabetSize = TestUtil.NextInt32(Random, 2, 7);

            int docLen = AtLeast(3000);
            //final int docLen = 50;

            string document = GetRandomString('a', alphabetSize, docLen);

            if (Verbose)
            {
                Console.WriteLine("TEST: doc=" + document);
            }

            int numSyn = AtLeast(5);
            //final int numSyn = 2;

            IDictionary <string, OneSyn> synMap = new Dictionary <string, OneSyn>();
            IList <OneSyn> syns  = new JCG.List <OneSyn>();
            bool           dedup = Random.nextBoolean();

            if (Verbose)
            {
                Console.WriteLine("  dedup=" + dedup);
            }
            b = new SynonymMap.Builder(dedup);
            for (int synIDX = 0; synIDX < numSyn; synIDX++)
            {
                string synIn = GetRandomString('a', alphabetSize, TestUtil.NextInt32(Random, 1, 5)).Trim();
                if (!synMap.TryGetValue(synIn, out OneSyn s) || s is null)
                {
                    s     = new OneSyn();
                    s.@in = synIn;
                    syns.Add(s);
                    s.@out        = new JCG.List <string>();
                    synMap[synIn] = s;
                    s.keepOrig    = Random.nextBoolean();
                }
                string synOut = GetRandomString('0', 10, TestUtil.NextInt32(Random, 1, 5)).Trim();
                s.@out.Add(synOut);
                Add(synIn, synOut, s.keepOrig);
                if (Verbose)
                {
                    Console.WriteLine("  syns[" + synIDX + "] = " + s.@in + " -> " + s.@out + " keepOrig=" + s.keepOrig);
                }
            }

            tokensIn = new MockTokenizer(new StringReader("a"), MockTokenizer.WHITESPACE, true);
            tokensIn.Reset();
            assertTrue(tokensIn.IncrementToken());
            assertFalse(tokensIn.IncrementToken());
            tokensIn.End();
            tokensIn.Dispose();

            tokensOut  = new SynonymFilter(tokensIn, b.Build(), true);
            termAtt    = tokensOut.AddAttribute <ICharTermAttribute>();
            posIncrAtt = tokensOut.AddAttribute <IPositionIncrementAttribute>();
            posLenAtt  = tokensOut.AddAttribute <IPositionLengthAttribute>();
            offsetAtt  = tokensOut.AddAttribute <IOffsetAttribute>();

            if (dedup)
            {
                PruneDups(syns);
            }

            string expected = SlowSynMatcher(document, syns, 5);

            if (Verbose)
            {
                Console.WriteLine("TEST: expected=" + expected);
            }

            Verify(document, expected);
        }
 public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
 {
     Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.SIMPLE, true);
     TokenStream stream = new SynonymFilter(tokenizer, map, ignoreCase);
     return new TokenStreamComponents(tokenizer, new RemoveDuplicatesTokenFilter(stream));
 }
        public virtual void TestBasic2()
        {
            b = new SynonymMap.Builder(true);
            const bool keepOrig = false;
            Add("aaa", "aaaa1 aaaa2 aaaa3", keepOrig);
            Add("bbb", "bbbb1 bbbb2", keepOrig);
            tokensIn = new MockTokenizer(new StringReader("a"), MockTokenizer.WHITESPACE, true);
            tokensIn.Reset();
            assertTrue(tokensIn.IncrementToken());
            assertFalse(tokensIn.IncrementToken());
            tokensIn.End();
            tokensIn.Dispose();

            tokensOut = new SynonymFilter(tokensIn, b.Build(), true);
            termAtt = tokensOut.AddAttribute<ICharTermAttribute>();
            posIncrAtt = tokensOut.AddAttribute<IPositionIncrementAttribute>();
            posLenAtt = tokensOut.AddAttribute<IPositionLengthAttribute>();
            offsetAtt = tokensOut.AddAttribute<IOffsetAttribute>();

#pragma warning disable 162
            if (keepOrig)
            {
                Verify("xyzzy bbb pot of gold", "xyzzy bbb/bbbb1 pot/bbbb2 of gold");
                Verify("xyzzy aaa pot of gold", "xyzzy aaa/aaaa1 pot/aaaa2 of/aaaa3 gold");
            }
            else
            {
                Verify("xyzzy bbb pot of gold", "xyzzy bbbb1 pot/bbbb2 of gold");
                Verify("xyzzy aaa pot of gold", "xyzzy aaaa1 pot/aaaa2 of/aaaa3 gold");
            }
#pragma warning restore 162
        }
 public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
 {
     Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.SIMPLE, true);
     TokenStream syns = new SynonymFilter(tokenizer, map, ignoreCase);
     TokenStream graph = new MockGraphTokenFilter(Random(), syns);
     return new TokenStreamComponents(tokenizer, graph);
 }
        public virtual void TestRandom()
        {

            int alphabetSize = TestUtil.NextInt(Random(), 2, 7);

            int docLen = AtLeast(3000);
            //final int docLen = 50;

            string document = GetRandomString('a', alphabetSize, docLen);

            if (VERBOSE)
            {
                Console.WriteLine("TEST: doc=" + document);
            }

            int numSyn = AtLeast(5);
            //final int numSyn = 2;

            IDictionary<string, OneSyn> synMap = new Dictionary<string, OneSyn>();
            IList<OneSyn> syns = new List<OneSyn>();
            bool dedup = Random().nextBoolean();
            if (VERBOSE)
            {
                Console.WriteLine("  dedup=" + dedup);
            }
            b = new SynonymMap.Builder(dedup);
            for (int synIDX = 0; synIDX < numSyn; synIDX++)
            {
                string synIn = GetRandomString('a', alphabetSize, TestUtil.NextInt(Random(), 1, 5)).Trim();
                OneSyn s = synMap.ContainsKey(synIn) ? synMap[synIn] : null;
                if (s == null)
                {
                    s = new OneSyn();
                    s.@in = synIn;
                    syns.Add(s);
                    s.@out = new List<string>();
                    synMap[synIn] = s;
                    s.keepOrig = Random().nextBoolean();
                }
                string synOut = GetRandomString('0', 10, TestUtil.NextInt(Random(), 1, 5)).Trim();
                s.@out.Add(synOut);
                Add(synIn, synOut, s.keepOrig);
                if (VERBOSE)
                {
                    Console.WriteLine("  syns[" + synIDX + "] = " + s.@in + " -> " + s.@out + " keepOrig=" + s.keepOrig);
                }
            }

            tokensIn = new MockTokenizer(new StringReader("a"), MockTokenizer.WHITESPACE, true);
            tokensIn.Reset();
            assertTrue(tokensIn.IncrementToken());
            assertFalse(tokensIn.IncrementToken());
            tokensIn.End();
            tokensIn.Dispose();

            tokensOut = new SynonymFilter(tokensIn, b.Build(), true);
            termAtt = tokensOut.AddAttribute<ICharTermAttribute>();
            posIncrAtt = tokensOut.AddAttribute<IPositionIncrementAttribute>();
            posLenAtt = tokensOut.AddAttribute<IPositionLengthAttribute>();
            offsetAtt = tokensOut.AddAttribute<IOffsetAttribute>();

            if (dedup)
            {
                PruneDups(syns);
            }

            string expected = SlowSynMatcher(document, syns, 5);

            if (VERBOSE)
            {
                Console.WriteLine("TEST: expected=" + expected);
            }

            Verify(document, expected);
        }
        public virtual void TestBasic()
        {
            b = new SynonymMap.Builder(true);
            Add("a", "foo", true);
            Add("a b", "bar fee", true);
            Add("b c", "dog collar", true);
            Add("c d", "dog harness holder extras", true);
            Add("m c e", "dog barks loudly", false);
            Add("i j k", "feep", true);

            Add("e f", "foo bar", false);
            Add("e f", "baz bee", false);

            Add("z", "boo", false);
            Add("y", "bee", true);

            tokensIn = new MockTokenizer(new StringReader("a"), MockTokenizer.WHITESPACE, true);
            tokensIn.Reset();
            assertTrue(tokensIn.IncrementToken());
            assertFalse(tokensIn.IncrementToken());
            tokensIn.End();
            tokensIn.Dispose();

            tokensOut = new SynonymFilter(tokensIn, b.Build(), true);
            termAtt = tokensOut.AddAttribute<ICharTermAttribute>();
            posIncrAtt = tokensOut.AddAttribute<IPositionIncrementAttribute>();
            posLenAtt = tokensOut.AddAttribute<IPositionLengthAttribute>();
            offsetAtt = tokensOut.AddAttribute<IOffsetAttribute>();

            Verify("a b c", "a/bar b/fee c");

            // syn output extends beyond input tokens
            Verify("x a b c d", "x a/bar b/fee c/dog d/harness holder extras");

            Verify("a b a", "a/bar b/fee a/foo");

            // outputs that add to one another:
            Verify("c d c d", "c/dog d/harness c/holder/dog d/extras/harness holder extras");

            // two outputs for same input
            Verify("e f", "foo/baz bar/bee");

            // verify multi-word / single-output offsets:
            Verify("g i j k g", "g i/feep:7_3 j k g");

            // mixed keepOrig true/false:
            Verify("a m c e x", "a/foo dog barks loudly x");
            Verify("c d m c e x", "c/dog d/harness holder/dog extras/barks loudly x");
            assertTrue(tokensOut.CaptureCount > 0);

            // no captureStates when no syns matched
            Verify("p q r s t", "p q r s t");
            assertEquals(0, tokensOut.CaptureCount);

            // no captureStates when only single-input syns, w/ no
            // lookahead needed, matched
            Verify("p q z y t", "p q boo y/bee t");
            assertEquals(0, tokensOut.CaptureCount);
        }
        public virtual void TestOutputHangsOffEnd()
        {
            b = new SynonymMap.Builder(true);
            const bool keepOrig = false;
            // b hangs off the end (no input token under it):
            Add("a", "a b", keepOrig);
            tokensIn = new MockTokenizer(new StringReader("a"), MockTokenizer.WHITESPACE, true);
            tokensIn.Reset();
            assertTrue(tokensIn.IncrementToken());
            assertFalse(tokensIn.IncrementToken());
            tokensIn.End();
            tokensIn.Dispose();

            tokensOut = new SynonymFilter(tokensIn, b.Build(), true);
            termAtt = tokensOut.AddAttribute<ICharTermAttribute>();
            posIncrAtt = tokensOut.AddAttribute<IPositionIncrementAttribute>();
            offsetAtt = tokensOut.AddAttribute<IOffsetAttribute>();
            posLenAtt = tokensOut.AddAttribute<IPositionLengthAttribute>();

            // Make sure endOffset inherits from previous input token:
            Verify("a", "a b:1");
        }