public virtual void TestMultipleSources()
{
    TeeSinkTokenFilter tee1 = new TeeSinkTokenFilter(new MockTokenizer(new StringReader(buffer1.ToString()), MockTokenizer.WHITESPACE, false));
    TeeSinkTokenFilter.SinkTokenStream dogDetector = tee1.NewSinkTokenStream(dogFilter);
    TeeSinkTokenFilter.SinkTokenStream theDetector = tee1.NewSinkTokenStream(theFilter);
    tee1.Reset();
    TokenStream source1 = new CachingTokenFilter(tee1);

    tee1.AddAttribute<ICheckClearAttributesAttribute>();
    dogDetector.AddAttribute<ICheckClearAttributesAttribute>();
    theDetector.AddAttribute<ICheckClearAttributesAttribute>();

    // Attach the same sinks to a second tee: each detector now collects
    // matching tokens from both source streams.
    TeeSinkTokenFilter tee2 = new TeeSinkTokenFilter(new MockTokenizer(new StringReader(buffer2.ToString()), MockTokenizer.WHITESPACE, false));
    tee2.AddSinkTokenStream(dogDetector);
    tee2.AddSinkTokenStream(theDetector);
    TokenStream source2 = tee2;

    AssertTokenStreamContents(source1, tokens1);
    AssertTokenStreamContents(source2, tokens2);

    AssertTokenStreamContents(theDetector, new string[] { "The", "the", "The", "the" });
    AssertTokenStreamContents(dogDetector, new string[] { "Dogs", "Dogs" });

    // The cached source can be consumed again, e.g. through a LowerCaseFilter.
    source1.Reset();
    TokenStream lowerCasing = new LowerCaseFilter(TEST_VERSION_CURRENT, source1);
    string[] lowerCaseTokens = new string[tokens1.Length];
    for (int i = 0; i < tokens1.Length; i++)
    {
        lowerCaseTokens[i] = CultureInfo.InvariantCulture.TextInfo.ToLower(tokens1[i]);
    }
    AssertTokenStreamContents(lowerCasing, lowerCaseTokens);
}
public virtual void TestGeneral()
{
    TeeSinkTokenFilter source = new TeeSinkTokenFilter(new MockTokenizer(new StringReader(buffer1.ToString()), MockTokenizer.WHITESPACE, false));
    TokenStream sink1 = source.NewSinkTokenStream();
    TokenStream sink2 = source.NewSinkTokenStream(theFilter);

    source.AddAttribute<ICheckClearAttributesAttribute>();
    sink1.AddAttribute<ICheckClearAttributesAttribute>();
    sink2.AddAttribute<ICheckClearAttributesAttribute>();

    AssertTokenStreamContents(source, tokens1);
    AssertTokenStreamContents(sink1, tokens1);
    AssertTokenStreamContents(sink2, new string[] { "The", "the" });
}
public virtual void Test()
{
    TokenRangeSinkFilter sinkFilter = new TokenRangeSinkFilter(2, 4);
    string test = "The quick red fox jumped over the lazy brown dogs";
    TeeSinkTokenFilter tee = new TeeSinkTokenFilter(new MockTokenizer(new StringReader(test), MockTokenizer.WHITESPACE, false));
    TeeSinkTokenFilter.SinkTokenStream rangeToks = tee.NewSinkTokenStream(sinkFilter);

    int count = 0;
    tee.Reset();
    while (tee.IncrementToken())
    {
        count++;
    }

    // The sink only receives tokens 2 (inclusive) through 4 (exclusive),
    // counting from zero: "red" and "fox".
    int sinkCount = 0;
    rangeToks.Reset();
    while (rangeToks.IncrementToken())
    {
        sinkCount++;
    }

    assertTrue(count + " does not equal: " + 10, count == 10);
    assertTrue("rangeToks Size: " + sinkCount + " is not: " + 2, sinkCount == 2);
}
public virtual void TestLooseDateFormat()
{
    DateRecognizerSinkFilter sinkFilter = new DateRecognizerSinkFilter(CultureInfo.InvariantCulture);
    string test = "The quick red fox jumped over the lazy brown dogs on 7/11/2006 The dogs finally reacted on 7/2/2006";
    TeeSinkTokenFilter tee = new TeeSinkTokenFilter(new MockTokenizer(new StringReader(test), MockTokenizer.WHITESPACE, false));
    TeeSinkTokenFilter.SinkTokenStream sink = tee.NewSinkTokenStream(sinkFilter);

    // The tee itself still emits every token.
    int count = 0;
    tee.Reset();
    while (tee.IncrementToken())
    {
        count++;
    }
    assertTrue(count + " does not equal: " + 18, count == 18);

    // Only the two date-like tokens are copied to the sink.
    int sinkCount = 0;
    sink.Reset();
    while (sink.IncrementToken())
    {
        sinkCount++;
    }
    assertTrue("sink Size: " + sinkCount + " is not: " + 2, sinkCount == 2);
}
public virtual void TestEndOffsetPositionWithTeeSinkTokenFilter()
{
    Store.Directory dir = NewDirectory();
    Analyzer analyzer = new MockAnalyzer(Random(), MockTokenizer.WHITESPACE, false);
    IndexWriter w = new IndexWriter(dir, NewIndexWriterConfig(TEST_VERSION_CURRENT, analyzer));
    Document doc = new Document();
    // The trailing spaces make the field value 7 chars long, so with the default
    // offset gap of 1 the second field instance's token starts at offset 8.
    TokenStream tokenStream = analyzer.GetTokenStream("field", "abcd   ");
    TeeSinkTokenFilter tee = new TeeSinkTokenFilter(tokenStream);
    TokenStream sink = tee.NewSinkTokenStream();
    FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
    ft.StoreTermVectors = true;
    ft.StoreTermVectorOffsets = true;
    ft.StoreTermVectorPositions = true;
    Field f1 = new Field("field", tee, ft);
    Field f2 = new Field("field", sink, ft);
    doc.Add(f1);
    doc.Add(f2);
    w.AddDocument(doc);
    w.Dispose();

    IndexReader r = DirectoryReader.Open(dir);
    Terms vector = r.GetTermVectors(0).GetTerms("field");
    assertEquals(1, vector.Count);
    TermsEnum termsEnum = vector.GetIterator(null);
    termsEnum.Next();
    assertEquals(2, termsEnum.TotalTermFreq);
    DocsAndPositionsEnum positions = termsEnum.DocsAndPositions(null, null);
    assertTrue(positions.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
    assertEquals(2, positions.Freq);
    positions.NextPosition();
    assertEquals(0, positions.StartOffset);
    assertEquals(4, positions.EndOffset);
    positions.NextPosition();
    assertEquals(8, positions.StartOffset);
    assertEquals(12, positions.EndOffset);
    assertEquals(DocIdSetIterator.NO_MORE_DOCS, positions.NextDoc());
    r.Dispose();
    dir.Dispose();
}
public virtual void Test()
{
    TokenTypeSinkFilter sinkFilter = new TokenTypeSinkFilter("D");
    string test = "The quick red fox jumped over the lazy brown dogs";

    // WordTokenFilter tags "dogs" with type "D"; everything else keeps type "word".
    TeeSinkTokenFilter ttf = new TeeSinkTokenFilter(new WordTokenFilter(new MockTokenizer(new StringReader(test), MockTokenizer.WHITESPACE, false)));
    TeeSinkTokenFilter.SinkTokenStream sink = ttf.NewSinkTokenStream(sinkFilter);

    bool seenDogs = false;
    ICharTermAttribute termAtt = ttf.AddAttribute<ICharTermAttribute>();
    ITypeAttribute typeAtt = ttf.AddAttribute<ITypeAttribute>();
    ttf.Reset();
    while (ttf.IncrementToken())
    {
        if (termAtt.ToString().Equals("dogs", StringComparison.Ordinal))
        {
            seenDogs = true;
            assertTrue(typeAtt.Type + " is not equal to " + "D", typeAtt.Type.Equals("D", StringComparison.Ordinal));
        }
        else
        {
            assertTrue(typeAtt.Type + " is not equal to " + "word", typeAtt.Type.Equals("word", StringComparison.Ordinal));
        }
    }
    assertTrue(seenDogs + " does not equal: " + true, seenDogs);

    // Only the single "D"-typed token should reach the sink.
    int sinkCount = 0;
    sink.Reset();
    while (sink.IncrementToken())
    {
        sinkCount++;
    }
    assertTrue("sink Size: " + sinkCount + " is not: " + 1, sinkCount == 1);
}
/// <summary>
/// Not an explicit test, just useful to print out some info on performance
/// </summary>
public virtual void Performance()
{
    int[] tokCount = new int[] { 100, 500, 1000, 2000, 5000, 10000 };
    int[] modCounts = new int[] { 1, 2, 5, 10, 20, 50, 100, 200, 500 };
    for (int k = 0; k < tokCount.Length; k++)
    {
        StringBuilder buffer = new StringBuilder();
        Console.WriteLine("-----Tokens: " + tokCount[k] + "-----");
        for (int i = 0; i < tokCount[k]; i++)
        {
            //buffer.Append(English.intToEnglish(i).toUpperCase(Locale.ROOT)).Append(' ');
            buffer.Append(i.ToString(CultureInfo.InvariantCulture)).Append(' ');
        }

        // make sure we produce the same tokens
        TeeSinkTokenFilter teeStream = new TeeSinkTokenFilter(new StandardFilter(TEST_VERSION_CURRENT, new StandardTokenizer(TEST_VERSION_CURRENT, new StringReader(buffer.ToString()))));
        TokenStream sink = teeStream.NewSinkTokenStream(new ModuloSinkFilter(this, 100));
        teeStream.ConsumeAllTokens();
        TokenStream stream = new ModuloTokenFilter(this, new StandardFilter(TEST_VERSION_CURRENT, new StandardTokenizer(TEST_VERSION_CURRENT, new StringReader(buffer.ToString()))), 100);
        ICharTermAttribute tfTok = stream.AddAttribute<ICharTermAttribute>();
        ICharTermAttribute sinkTok = sink.AddAttribute<ICharTermAttribute>();
        for (int i = 0; stream.IncrementToken(); i++)
        {
            assertTrue(sink.IncrementToken());
            assertTrue(tfTok + " is not equal to " + sinkTok + " at token: " + i, tfTok.Equals(sinkTok));
        }

        // simulate two fields, each being analyzed once, for 20 documents
        for (int j = 0; j < modCounts.Length; j++)
        {
            int tfPos = 0;
            long start = Environment.TickCount;
            for (int i = 0; i < 20; i++)
            {
                stream = new StandardFilter(TEST_VERSION_CURRENT, new StandardTokenizer(TEST_VERSION_CURRENT, new StringReader(buffer.ToString())));
                IPositionIncrementAttribute posIncrAtt = stream.GetAttribute<IPositionIncrementAttribute>();
                while (stream.IncrementToken())
                {
                    tfPos += posIncrAtt.PositionIncrement;
                }
                stream = new ModuloTokenFilter(this, new StandardFilter(TEST_VERSION_CURRENT, new StandardTokenizer(TEST_VERSION_CURRENT, new StringReader(buffer.ToString()))), modCounts[j]);
                posIncrAtt = stream.GetAttribute<IPositionIncrementAttribute>();
                while (stream.IncrementToken())
                {
                    tfPos += posIncrAtt.PositionIncrement;
                }
            }
            long finish = Environment.TickCount;
            Console.WriteLine("ModCount: " + modCounts[j] + " Two fields took " + (finish - start) + " ms");

            // simulate one field with one sink
            int sinkPos = 0;
            start = Environment.TickCount;
            for (int i = 0; i < 20; i++)
            {
                teeStream = new TeeSinkTokenFilter(new StandardFilter(TEST_VERSION_CURRENT, new StandardTokenizer(TEST_VERSION_CURRENT, new StringReader(buffer.ToString()))));
                sink = teeStream.NewSinkTokenStream(new ModuloSinkFilter(this, modCounts[j]));
                IPositionIncrementAttribute posIncrAtt = teeStream.GetAttribute<IPositionIncrementAttribute>();
                while (teeStream.IncrementToken())
                {
                    sinkPos += posIncrAtt.PositionIncrement;
                }
                posIncrAtt = sink.GetAttribute<IPositionIncrementAttribute>();
                while (sink.IncrementToken())
                {
                    sinkPos += posIncrAtt.PositionIncrement;
                }
            }
            finish = Environment.TickCount;
            Console.WriteLine("ModCount: " + modCounts[j] + " Tee fields took " + (finish - start) + " ms");

            assertTrue(sinkPos + " does not equal: " + tfPos, sinkPos == tfPos);
        }
        Console.WriteLine("- End Tokens: " + tokCount[k] + "-----");
    }
}