        private static void CheckAnalysisConsistency(Random random, Analyzer a, bool useCharFilter, string text, bool offsetsAreCorrect, Field field)
        {
            if (VERBOSE)
            {
                Console.WriteLine(Thread.CurrentThread.Name + ": NOTE: baseTokenStreamTestCase: get first token stream now text=" + text);
            }

            ICharTermAttribute termAtt;
            IOffsetAttribute offsetAtt;
            IPositionIncrementAttribute posIncAtt;
            IPositionLengthAttribute posLengthAtt;
            ITypeAttribute typeAtt;

            IList<string> tokens = new List<string>();
            IList<string> types = new List<string>();
            IList<int> positions = new List<int>();
            IList<int> positionLengths = new List<int>();
            IList<int> startOffsets = new List<int>();
            IList<int> endOffsets = new List<int>();

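            // When useCharFilter is true, the input below is wrapped in a
            // MockCharFilter parameterized by this remainder; varying it across
            // runs should exercise offset correction with different alignments
            // (the exact perturbation is an implementation detail of MockCharFilter).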
            int remainder = random.Next(10);
            StringReader reader = new StringReader(text);

            TokenStream ts;
            using (ts = a.TokenStream("dummy", useCharFilter ? (TextReader) new MockCharFilter(reader, remainder) : reader))
            {
                termAtt = ts.HasAttribute<ICharTermAttribute>()
                    ? ts.GetAttribute<ICharTermAttribute>()
                    : null;
                offsetAtt = ts.HasAttribute<IOffsetAttribute>()
                    ? ts.GetAttribute<IOffsetAttribute>()
                    : null;
                posIncAtt = ts.HasAttribute<IPositionIncrementAttribute>()
                    ? ts.GetAttribute<IPositionIncrementAttribute>()
                    : null;
                posLengthAtt = ts.HasAttribute<IPositionLengthAttribute>()
                    ? ts.GetAttribute<IPositionLengthAttribute>()
                    : null;
                typeAtt = ts.HasAttribute<ITypeAttribute>() ? ts.GetAttribute<ITypeAttribute>() : null;

                ts.Reset();

                // First pass: save away "correct" tokens
                while (ts.IncrementToken())
                {
                    Assert.IsNotNull(termAtt, "has no CharTermAttribute");
                    tokens.Add(termAtt.ToString());
                    if (typeAtt != null)
                    {
                        types.Add(typeAtt.Type);
                    }
                    if (posIncAtt != null)
                    {
                        positions.Add(posIncAtt.PositionIncrement);
                    }
                    if (posLengthAtt != null)
                    {
                        positionLengths.Add(posLengthAtt.PositionLength);
                    }
                    if (offsetAtt != null)
                    {
                        startOffsets.Add(offsetAtt.StartOffset());
                        endOffsets.Add(offsetAtt.EndOffset());
                    }
                }
                ts.End();
            }

            // verify reusing is "reproducible" and also get the normal tokenstream sanity checks
            if (tokens.Count > 0)
            {
                // KWTokenizer (for example) can produce a token
                // even when input is length 0:
                if (text.Length != 0)
                {
                    // (Optional) second pass: do something evil:
                    int evilness = random.Next(50);
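                    // Two rare fault-injection paths: evilness == 17 re-runs the
                    // analysis with a reader that throws mid-stream, and
                    // evilness == 7 consumes only a prefix of the tokens. Either
                    // way, the final clean pass below must still reproduce the
                    // tokens captured in the first pass.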
                    if (evilness == 17)
                    {
                        if (VERBOSE)
                        {
                            Console.WriteLine(Thread.CurrentThread.Name + ": NOTE: baseTokenStreamTestCase: re-run analysis w/ exception");
                        }
                        // Throw an errant exception from the Reader:

                        MockReaderWrapper evilReader = new MockReaderWrapper(random, text);
                        evilReader.ThrowExcAfterChar(random.Next(text.Length));
                        reader = evilReader;

                        try
                        {
                            // NOTE: some Tokenizers go and read characters
                            // when you call .SetReader(TextReader), e.g.
                            // PatternTokenizer.  This is a bit
                            // iffy... (really, they should only
                            // pull from the Reader when you call
                            // .IncrementToken(), I think?), but we
                            // currently allow it, so we must call
                            // a.TokenStream inside the try since we may
                            // hit the exception on init:
                            ts = a.TokenStream("dummy", useCharFilter ? (TextReader)new MockCharFilter(evilReader, remainder) : evilReader);
                            ts.Reset();
                            while (ts.IncrementToken()) { /* consume tokens until the evil reader throws */ }
                            Assert.Fail("did not hit exception");
                        }
                        catch (Exception re)
                        {
                            Assert.IsTrue(MockReaderWrapper.IsMyEvilException(re));
                        }

                        try
                        {
                            ts.End();
                        }
                        catch (InvalidOperationException ae)
                        {
                            // Catch & ignore MockTokenizer's
                            // anger...
                            if ("End() called before IncrementToken() returned false!".Equals(ae.Message))
                            {
                                // OK
                            }
                            else
                            {
                                throw; // rethrow, preserving the original stack trace
                            }
                        }
                        finally
                        {
                            ts.Dispose();
                        }
                    }
                    else if (evilness == 7)
                    {
                        // Only consume a subset of the tokens:
                        int numTokensToRead = random.Next(tokens.Count);
                        if (VERBOSE)
                        {
                            Console.WriteLine(Thread.CurrentThread.Name + ": NOTE: baseTokenStreamTestCase: re-run analysis, only consuming " + numTokensToRead + " of " + tokens.Count + " tokens");
                        }

                        reader = new StringReader(text);
                        ts = a.TokenStream("dummy", useCharFilter ? (TextReader)new MockCharFilter(reader, remainder) : reader);
                        ts.Reset();
                        for (int tokenCount = 0; tokenCount < numTokensToRead; tokenCount++)
                        {
                            Assert.IsTrue(ts.IncrementToken());
                        }

                        try
                        {
                            ts.End();
                        }
                        catch (InvalidOperationException ae)
                        {
                            // Catch & ignore MockTokenizer's
                            // anger...
                            if ("End() called before IncrementToken() returned false!".Equals(ae.Message))
                            {
                                // OK
                            }
                            else
                            {
                                throw; // rethrow, preserving the original stack trace
                            }
                        }
                        finally
                        {
                            ts.Dispose();
                        }
                    }
                }
            }

            // Final pass: verify clean tokenization matches
            // results from first pass:

            if (VERBOSE)
            {
                Console.WriteLine(Thread.CurrentThread.Name + ": NOTE: baseTokenStreamTestCase: re-run analysis; " + tokens.Count + " tokens");
            }
            reader = new StringReader(text);

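            // Capture a seed so the spoon-feed decision below can be replayed
            // with an identical Random when the Field's reader is set at the end.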
            long seed = random.Next();
            random = new Random((int)seed);
            if (random.Next(30) == 7)
            {
                if (VERBOSE)
                {
                    Console.WriteLine(Thread.CurrentThread.Name + ": NOTE: baseTokenStreamTestCase: using spoon-feed reader");
                }

                reader = new MockReaderWrapper(random, text);
            }

            ts = a.TokenStream("dummy", useCharFilter ? (TextReader)new MockCharFilter(reader, remainder) : reader);
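            // Assert against the richest attribute combination the stream
            // exposes; null arguments make AssertTokenStreamContents skip
            // those particular checks.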
            if (typeAtt != null && posIncAtt != null && posLengthAtt != null && offsetAtt != null)
            {
                // offset + pos + posLength + type
                AssertTokenStreamContents(ts, tokens.ToArray(), ToIntArray(startOffsets), ToIntArray(endOffsets), types.ToArray(), ToIntArray(positions), ToIntArray(positionLengths), text.Length, offsetsAreCorrect);
            }
            else if (typeAtt != null && posIncAtt != null && offsetAtt != null)
            {
                // offset + pos + type
                AssertTokenStreamContents(ts, tokens.ToArray(), ToIntArray(startOffsets), ToIntArray(endOffsets), types.ToArray(), ToIntArray(positions), null, text.Length, offsetsAreCorrect);
            }
            else if (posIncAtt != null && posLengthAtt != null && offsetAtt != null)
            {
                // offset + pos + posLength
                AssertTokenStreamContents(ts, tokens.ToArray(), ToIntArray(startOffsets), ToIntArray(endOffsets), null, ToIntArray(positions), ToIntArray(positionLengths), text.Length, offsetsAreCorrect);
            }
            else if (posIncAtt != null && offsetAtt != null)
            {
                // offset + pos
                AssertTokenStreamContents(ts, tokens.ToArray(), ToIntArray(startOffsets), ToIntArray(endOffsets), null, ToIntArray(positions), null, text.Length, offsetsAreCorrect);
            }
            else if (offsetAtt != null)
            {
                // offset
                AssertTokenStreamContents(ts, tokens.ToArray(), ToIntArray(startOffsets), ToIntArray(endOffsets), null, null, null, text.Length, offsetsAreCorrect);
            }
            else
            {
                // terms only
                AssertTokenStreamContents(ts, tokens.ToArray());
            }

            if (field != null)
            {
                reader = new StringReader(text);
                random = new Random((int)seed);
                if (random.Next(30) == 7)
                {
                    if (VERBOSE)
                    {
                        Console.WriteLine(Thread.CurrentThread.Name + ": NOTE: baseTokenStreamTestCase: indexing using spoon-feed reader");
                    }

                    reader = new MockReaderWrapper(random, text);
                }

                field.ReaderValue = useCharFilter ? (TextReader)new MockCharFilter(reader, remainder) : reader;
            }
        }
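
        // Hypothetical usage sketch: one way a test might drive
        // CheckAnalysisConsistency. Assumes NUnit ([Test]) and MockAnalyzer
        // from the Lucene.NET test framework; the test name, seed, and sample
        // text are illustrative assumptions, not part of the method above.
        [Test]
        public void TestCheckAnalysisConsistencySketch()
        {
            Random random = new Random(42);
            Analyzer analyzer = new MockAnalyzer(random);

            // Exercise both the plain-reader and CharFilter-wrapped paths:
            CheckAnalysisConsistency(random, analyzer, useCharFilter: false, text: "over the lazy dog", offsetsAreCorrect: true, field: null);
            CheckAnalysisConsistency(random, analyzer, useCharFilter: true, text: "over the lazy dog", offsetsAreCorrect: true, field: null);
        }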