private static void CheckAnalysisConsistency(Random random, Analyzer a, bool useCharFilter, string text, bool offsetsAreCorrect, Field field)
{
    if (VERBOSE)
    {
        Console.WriteLine(Thread.CurrentThread.Name + ": NOTE: baseTokenStreamTestCase: get first token stream now text=" + text);
    }

    ICharTermAttribute termAtt;
    IOffsetAttribute offsetAtt;
    IPositionIncrementAttribute posIncAtt;
    IPositionLengthAttribute posLengthAtt;
    ITypeAttribute typeAtt;

    IList<string> tokens = new List<string>();
    IList<string> types = new List<string>();
    IList<int> positions = new List<int>();
    IList<int> positionLengths = new List<int>();
    IList<int> startOffsets = new List<int>();
    IList<int> endOffsets = new List<int>();

    int remainder = random.Next(10);
    StringReader reader = new StringReader(text);
    TokenStream ts;
    using (ts = a.TokenStream("dummy", useCharFilter ? (TextReader)new MockCharFilter(reader, remainder) : reader))
    {
        termAtt = ts.HasAttribute<ICharTermAttribute>() ? ts.GetAttribute<ICharTermAttribute>() : null;
        offsetAtt = ts.HasAttribute<IOffsetAttribute>() ? ts.GetAttribute<IOffsetAttribute>() : null;
        posIncAtt = ts.HasAttribute<IPositionIncrementAttribute>() ? ts.GetAttribute<IPositionIncrementAttribute>() : null;
        posLengthAtt = ts.HasAttribute<IPositionLengthAttribute>() ? ts.GetAttribute<IPositionLengthAttribute>() : null;
        typeAtt = ts.HasAttribute<ITypeAttribute>() ? ts.GetAttribute<ITypeAttribute>() : null;

        ts.Reset();

        // First pass: save away the "correct" tokens
        while (ts.IncrementToken())
        {
            Assert.IsNotNull(termAtt, "has no CharTermAttribute");
            tokens.Add(termAtt.ToString());
            if (typeAtt != null)
            {
                types.Add(typeAtt.Type);
            }
            if (posIncAtt != null)
            {
                positions.Add(posIncAtt.PositionIncrement);
            }
            if (posLengthAtt != null)
            {
                positionLengths.Add(posLengthAtt.PositionLength);
            }
            if (offsetAtt != null)
            {
                startOffsets.Add(offsetAtt.StartOffset());
                endOffsets.Add(offsetAtt.EndOffset());
            }
        }
        ts.End();
    }

    // Verify that reuse is reproducible, and also run the normal token stream sanity checks
    if (tokens.Count > 0)
    {
        // KWTokenizer (for example) can produce a token
        // even when input is length 0:
        if (text.Length != 0)
        {
            // (Optional) second pass: do something evil:
            int evilness = random.Next(50);
            if (evilness == 17)
            {
                if (VERBOSE)
                {
                    Console.WriteLine(Thread.CurrentThread.Name + ": NOTE: baseTokenStreamTestCase: re-run analysis w/ exception");
                }

                // Throw an errant exception from the Reader:
                MockReaderWrapper evilReader = new MockReaderWrapper(random, text);
                evilReader.ThrowExcAfterChar(random.Next(text.Length));
                reader = evilReader;

                try
                {
                    // NOTE: some Tokenizers go and read characters
                    // when you call .SetReader(Reader), e.g.
                    // PatternTokenizer. This is a bit iffy...
                    // (really, they should only pull from the Reader
                    // when you call .IncrementToken(), I think?), but
                    // we currently allow it, so we must call
                    // a.TokenStream inside the try since we may
                    // hit the exception on init:
                    ts = a.TokenStream("dummy", useCharFilter ? (TextReader)new MockCharFilter(evilReader, remainder) : evilReader);
                    ts.Reset();
                    while (ts.IncrementToken())
                    {
                        // Consume and discard tokens; the evil reader should throw before we run out.
                    }
                    Assert.Fail("did not hit exception");
                }
                catch (Exception re)
                {
                    Assert.IsTrue(MockReaderWrapper.IsMyEvilException(re));
                }

                try
                {
                    ts.End();
                }
                catch (InvalidOperationException ae)
                {
                    // Catch & ignore MockTokenizer's anger...
                    if ("End() called before IncrementToken() returned false!".Equals(ae.Message))
                    {
                        // OK
                    }
                    else
                    {
                        throw;
                    }
                }
                finally
                {
                    ts.Dispose();
                }
            }
            else if (evilness == 7)
            {
                // Only consume a subset of the tokens:
                int numTokensToRead = random.Next(tokens.Count);
                if (VERBOSE)
                {
                    Console.WriteLine(Thread.CurrentThread.Name + ": NOTE: baseTokenStreamTestCase: re-run analysis, only consuming " + numTokensToRead + " of " + tokens.Count + " tokens");
                }

                reader = new StringReader(text);
                ts = a.TokenStream("dummy", useCharFilter ? (TextReader)new MockCharFilter(reader, remainder) : reader);
                ts.Reset();
                for (int tokenCount = 0; tokenCount < numTokensToRead; tokenCount++)
                {
                    Assert.IsTrue(ts.IncrementToken());
                }

                try
                {
                    ts.End();
                }
                catch (InvalidOperationException ae)
                {
                    // Catch & ignore MockTokenizer's anger...
                    if ("End() called before IncrementToken() returned false!".Equals(ae.Message))
                    {
                        // OK
                    }
                    else
                    {
                        throw;
                    }
                }
                finally
                {
                    ts.Dispose();
                }
            }
        }
    }

    // Final pass: verify clean tokenization matches
    // results from first pass:
    if (VERBOSE)
    {
        Console.WriteLine(Thread.CurrentThread.Name + ": NOTE: baseTokenStreamTestCase: re-run analysis; " + tokens.Count + " tokens");
    }

    reader = new StringReader(text);
    long seed = random.Next();
    random = new Random((int)seed);
    if (random.Next(30) == 7)
    {
        if (VERBOSE)
        {
            Console.WriteLine(Thread.CurrentThread.Name + ": NOTE: baseTokenStreamTestCase: using spoon-feed reader");
        }
        reader = new MockReaderWrapper(random, text);
    }

    ts = a.TokenStream("dummy", useCharFilter ? (TextReader)new MockCharFilter(reader, remainder) : reader);
    if (typeAtt != null && posIncAtt != null && posLengthAtt != null && offsetAtt != null)
    {
        // offset + pos + posLength + type
        AssertTokenStreamContents(ts, tokens.ToArray(), ToIntArray(startOffsets), ToIntArray(endOffsets), types.ToArray(), ToIntArray(positions), ToIntArray(positionLengths), text.Length, offsetsAreCorrect);
    }
    else if (typeAtt != null && posIncAtt != null && offsetAtt != null)
    {
        // offset + pos + type
        AssertTokenStreamContents(ts, tokens.ToArray(), ToIntArray(startOffsets), ToIntArray(endOffsets), types.ToArray(), ToIntArray(positions), null, text.Length, offsetsAreCorrect);
    }
    else if (posIncAtt != null && posLengthAtt != null && offsetAtt != null)
    {
        // offset + pos + posLength
        AssertTokenStreamContents(ts, tokens.ToArray(), ToIntArray(startOffsets), ToIntArray(endOffsets), null, ToIntArray(positions), ToIntArray(positionLengths), text.Length, offsetsAreCorrect);
    }
    else if (posIncAtt != null && offsetAtt != null)
    {
        // offset + pos
        AssertTokenStreamContents(ts, tokens.ToArray(), ToIntArray(startOffsets), ToIntArray(endOffsets), null, ToIntArray(positions), null, text.Length, offsetsAreCorrect);
    }
    else if (offsetAtt != null)
    {
        // offset
        AssertTokenStreamContents(ts, tokens.ToArray(), ToIntArray(startOffsets), ToIntArray(endOffsets), null, null, null, text.Length, offsetsAreCorrect);
    }
    else
    {
        // terms only
        AssertTokenStreamContents(ts, tokens.ToArray());
    }

    if (field != null)
    {
        reader = new StringReader(text);
        random = new Random((int)seed);
        if (random.Next(30) == 7)
        {
            if (VERBOSE)
            {
                Console.WriteLine(Thread.CurrentThread.Name + ": NOTE: baseTokenStreamTestCase: indexing using spoon-feed reader");
            }
            reader = new MockReaderWrapper(random, text);
        }

        field.ReaderValue = useCharFilter ? (TextReader)new MockCharFilter(reader, remainder) : reader;
    }
}