An Analyzer used primarily at query time to wrap another analyzer and provide a layer of protection that prevents very common words from being passed into queries.

For very large indexes, the cost of reading TermDocs for a very common word can be high. This analyzer was created after experience with a 38-million-document index that contained a term in around 50% of its documents, which was causing TermQueries for that term to take 2 seconds.

Inheritance: AnalyzerWrapper
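A minimal query-time usage sketch, assuming the Lucene.NET 4.8 API; the "body" field, the threshold of 10 documents, and the ParseWithAutoStopWords helper are illustrative choices, not part of the library:

    using Lucene.Net.Analysis;
    using Lucene.Net.Analysis.Query;
    using Lucene.Net.Analysis.Standard;
    using Lucene.Net.Index;
    using Lucene.Net.QueryParsers.Classic;
    using Lucene.Net.Search;
    using Lucene.Net.Util;

    // Hypothetical helper: wrap the application analyzer so that terms whose
    // document frequency in `reader` exceeds 10 are dropped from parsed queries.
    // The index itself is not modified; filtering happens only at query time.
    static Query ParseWithAutoStopWords(IndexReader reader, string userQuery)
    {
        Analyzer appAnalyzer = new StandardAnalyzer(LuceneVersion.LUCENE_48);
        Analyzer queryAnalyzer = new QueryAutoStopWordAnalyzer(
            LuceneVersion.LUCENE_48, appAnalyzer, reader, 10);
        var parser = new QueryParser(LuceneVersion.LUCENE_48, "body", queryAnalyzer);
        return parser.Parse(userQuery);
    }

Because the stop-word sets are computed from the supplied IndexReader at construction time, the wrapper should be rebuilt whenever the reader is reopened so the frequencies stay current.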
Example #1
        public virtual void TestTokenStream()
        {
            QueryAutoStopWordAnalyzer a = new QueryAutoStopWordAnalyzer(TEST_VERSION_CURRENT, new MockAnalyzer(Random, MockTokenizer.WHITESPACE, false), reader, 10);
            TokenStream ts = a.GetTokenStream("repetitiveField", "this boring");
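            // "boring" appears in far more than 10 documents, so it is filtered; "this" survives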

            AssertTokenStreamContents(ts, new string[] { "this" });
        }
Example #2
        public virtual void TestNoStopwords()
        {
            // Note: an empty list of fields is passed in, so no stop words are computed
            protectedAnalyzer = new QueryAutoStopWordAnalyzer(TEST_VERSION_CURRENT, appAnalyzer, reader, Enumerable.Empty<string>(), 1);
            TokenStream protectedTokenStream = protectedAnalyzer.GetTokenStream("variedField", "quick");
            AssertTokenStreamContents(protectedTokenStream, new string[] { "quick" });

            protectedTokenStream = protectedAnalyzer.GetTokenStream("repetitiveField", "boring");
            AssertTokenStreamContents(protectedTokenStream, new string[] { "boring" });
        }
Example #3
        public virtual void TestStopwordsPerFieldMaxDocFreq()
        {
            protectedAnalyzer = new QueryAutoStopWordAnalyzer(TEST_VERSION_CURRENT, appAnalyzer, reader, new string[] { "repetitiveField" }, 10);
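            // Stop words are computed only for the listed fields; terms with docFreq above 10 qualify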
            int numStopWords = protectedAnalyzer.GetStopWords("repetitiveField").Length;

            assertTrue("Should have identified stop words", numStopWords > 0);

            protectedAnalyzer = new QueryAutoStopWordAnalyzer(TEST_VERSION_CURRENT, appAnalyzer, reader, new string[] { "repetitiveField", "variedField" }, 10);
            int numNewStopWords = protectedAnalyzer.GetStopWords("repetitiveField").Length + protectedAnalyzer.GetStopWords("variedField").Length;

            assertTrue("Should have identified more stop words", numNewStopWords > numStopWords);
        }
Example #4
        public virtual void TestNoFieldNamePollution()
        {
            protectedAnalyzer = new QueryAutoStopWordAnalyzer(TEST_VERSION_CURRENT, appAnalyzer, reader, new string[] { "repetitiveField" }, 10);

            TokenStream protectedTokenStream = protectedAnalyzer.GetTokenStream("repetitiveField", "boring");

            // Check filter set up OK
            AssertTokenStreamContents(protectedTokenStream, new string[0]);

            protectedTokenStream = protectedAnalyzer.GetTokenStream("variedField", "boring");
            // Filter should not prevent stopwords in one field being used in another
            AssertTokenStreamContents(protectedTokenStream, new string[] { "boring" });
        }
Example #5
        public virtual void TestStopwordsPerFieldMaxPercentDocs()
        {
            protectedAnalyzer = new QueryAutoStopWordAnalyzer(TEST_VERSION_CURRENT, appAnalyzer, reader, new string[] { "variedField" }, 1f / 2f);
            TokenStream protectedTokenStream = protectedAnalyzer.GetTokenStream("repetitiveField", "boring");

            // A filter on one Field should not affect queries on another
            AssertTokenStreamContents(protectedTokenStream, new string[] { "boring" });

            protectedAnalyzer = new QueryAutoStopWordAnalyzer(TEST_VERSION_CURRENT, appAnalyzer, reader, new string[] { "variedField", "repetitiveField" }, 1f / 2f);
            protectedTokenStream = protectedAnalyzer.GetTokenStream("repetitiveField", "boring");
            // A filter on the right Field should affect queries on it
            AssertTokenStreamContents(protectedTokenStream, new string[0]);
        }
        public virtual void TestStopwordsAllFieldsMaxPercentDocs()
        {
            protectedAnalyzer = new QueryAutoStopWordAnalyzer(TEST_VERSION_CURRENT, appAnalyzer, reader, 1f / 2f);

            TokenStream protectedTokenStream = protectedAnalyzer.GetTokenStream("repetitiveField", "boring");
            // A filter on terms in > half of docs should remove "boring"
            AssertTokenStreamContents(protectedTokenStream, new string[0]);

            protectedTokenStream = protectedAnalyzer.GetTokenStream("repetitiveField", "vaguelyboring");
            // A filter on terms in > half of docs should not remove "vaguelyboring"
            AssertTokenStreamContents(protectedTokenStream, new string[] { "vaguelyboring" });

            protectedAnalyzer = new QueryAutoStopWordAnalyzer(TEST_VERSION_CURRENT, appAnalyzer, reader, 1f / 4f);
            protectedTokenStream = protectedAnalyzer.GetTokenStream("repetitiveField", "vaguelyboring");
            // A filter on terms in > a quarter of docs should remove "vaguelyboring"
            AssertTokenStreamContents(protectedTokenStream, new string[0]);
        }
 public override void SetUp()
 {
     base.SetUp(); // run LuceneTestCase initialization before building the test index
     dir = new RAMDirectory();
     appAnalyzer = new WhitespaceAnalyzer();
     IndexWriter writer = new IndexWriter(dir, appAnalyzer, true, IndexWriter.MaxFieldLength.UNLIMITED);
     int numDocs = 200;
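     // The sample value arrays (fields on the test class) are cycled so that "repetitiveField" terms occur in many documents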
     for (int i = 0; i < numDocs; i++)
     {
         Document doc = new Document();
         String variedFieldValue = variedFieldValues[i % variedFieldValues.Length];
         String repetitiveFieldValue = repetitiveFieldValues[i % repetitiveFieldValues.Length];
         doc.Add(new Field("variedField", variedFieldValue, Field.Store.YES, Field.Index.ANALYZED));
         doc.Add(new Field("repetitiveField", repetitiveFieldValue, Field.Store.YES, Field.Index.ANALYZED));
         writer.AddDocument(doc);
     }
     writer.Close();
     reader = IndexReader.Open(dir, true);
     protectedAnalyzer = new QueryAutoStopWordAnalyzer(Version.LUCENE_CURRENT, appAnalyzer);
 }
 public virtual void TestDefaultStopwordsAllFields()
 {
     protectedAnalyzer = new QueryAutoStopWordAnalyzer(TEST_VERSION_CURRENT, appAnalyzer, reader);
     TokenStream protectedTokenStream = protectedAnalyzer.GetTokenStream("repetitiveField", "boring");
     AssertTokenStreamContents(protectedTokenStream, new string[0]); // Default stop word filtering will remove boring
 }
 public virtual void TestTokenStream()
 {
     QueryAutoStopWordAnalyzer a = new QueryAutoStopWordAnalyzer(TEST_VERSION_CURRENT, new MockAnalyzer(Random, MockTokenizer.WHITESPACE, false), reader, 10);
     TokenStream ts = a.GetTokenStream("repetitiveField", "this boring");
     AssertTokenStreamContents(ts, new string[] { "this" });
 }
        public virtual void TestNoFieldNamePollution()
        {
            protectedAnalyzer = new QueryAutoStopWordAnalyzer(TEST_VERSION_CURRENT, appAnalyzer, reader, Arrays.AsList("repetitiveField"), 10);

            TokenStream protectedTokenStream = protectedAnalyzer.GetTokenStream("repetitiveField", "boring");
            // Check filter set up OK
            AssertTokenStreamContents(protectedTokenStream, new string[0]);

            protectedTokenStream = protectedAnalyzer.GetTokenStream("variedField", "boring");
            // Filter should not prevent stopwords in one field being used in another
            AssertTokenStreamContents(protectedTokenStream, new string[] { "boring" });
        }
        public virtual void TestStopwordsPerFieldMaxDocFreq()
        {
            protectedAnalyzer = new QueryAutoStopWordAnalyzer(TEST_VERSION_CURRENT, appAnalyzer, reader, Arrays.AsList("repetitiveField"), 10);
            int numStopWords = protectedAnalyzer.GetStopWords("repetitiveField").Length;
            assertTrue("Should have identified stop words", numStopWords > 0);

            protectedAnalyzer = new QueryAutoStopWordAnalyzer(TEST_VERSION_CURRENT, appAnalyzer, reader, Arrays.AsList("repetitiveField", "variedField"), 10);
            int numNewStopWords = protectedAnalyzer.GetStopWords("repetitiveField").Length + protectedAnalyzer.GetStopWords("variedField").Length;
            assertTrue("Should have identified more stop words", numNewStopWords > numStopWords);
        }
        public virtual void TestStopwordsPerFieldMaxPercentDocs()
        {
            protectedAnalyzer = new QueryAutoStopWordAnalyzer(TEST_VERSION_CURRENT, appAnalyzer, reader, Arrays.AsList("variedField"), 1f / 2f);
            TokenStream protectedTokenStream = protectedAnalyzer.GetTokenStream("repetitiveField", "boring");
            // A filter on one Field should not affect queries on another
            AssertTokenStreamContents(protectedTokenStream, new string[] { "boring" });

            protectedAnalyzer = new QueryAutoStopWordAnalyzer(TEST_VERSION_CURRENT, appAnalyzer, reader, Arrays.AsList("variedField", "repetitiveField"), 1f / 2f);
            protectedTokenStream = protectedAnalyzer.GetTokenStream("repetitiveField", "boring");
            // A filter on the right Field should affect queries on it
            AssertTokenStreamContents(protectedTokenStream, new string[0]);
        }
 public void TestTokenStream()
 {
     QueryAutoStopWordAnalyzer a = new QueryAutoStopWordAnalyzer(Version.LUCENE_CURRENT, new WhitespaceAnalyzer());
     a.AddStopWords(reader, 10);
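     // 3.x-style API: stop words (docFreq above 10) are added explicitly from the reader after construction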
     TokenStream ts = a.TokenStream("repetitiveField", new StringReader("this boring"));
     ITermAttribute termAtt = ts.GetAttribute<ITermAttribute>();
     Assert.True(ts.IncrementToken());
     Assert.AreEqual("this", termAtt.Term);
     Assert.False(ts.IncrementToken());
 }
 public void TestWrappingNonReusableAnalyzer()
 {
     QueryAutoStopWordAnalyzer a = new QueryAutoStopWordAnalyzer(Version.LUCENE_CURRENT, new NonreusableAnalyzer());
     a.AddStopWords(reader, 10);
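     // Both "boring" and "vaguelyboring" exceed the docFreq threshold, so neither query should match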
     int numHits = Search(a, "repetitiveField:boring");
     Assert.True(numHits == 0);
     numHits = Search(a, "repetitiveField:vaguelyboring");
     Assert.True(numHits == 0);
 }