public virtual void TestIgnoreCase()
        {
            // Build a case-insensitive override map sending "booked" -> "books".
            // The override filter also sets KeywordAttribute on the token, so the
            // downstream PorterStemFilter must leave it untouched.
            var builder = new StemmerOverrideFilter.Builder(true);
            builder.Add("boOkEd", "books");

            var tokenizer = new KeywordTokenizer(new StringReader("BooKeD"));
            TokenStream stream = new PorterStemFilter(new StemmerOverrideFilter(tokenizer, builder.Build()));

            AssertTokenStreamContents(stream, new string[] { "books" });
        }
// Example #2
            protected internal override TokenStreamComponents createComponents(string fieldName, Reader reader)
            {
                // Emit the entire input as a single token.
                Tokenizer tokenizer = new KeywordTokenizer(reader);
                // Randomly toggle TrimFilter's offset-updating mode.
                bool updateOffsets = random().nextBoolean();
                // Offset updating pins the 4.3 compatibility version; otherwise the
                // current test version is used. NOTE(review): presumably later
                // versions reject updateOffsets=true -- confirm.
                Version version = updateOffsets ? Version.LUCENE_43 : TEST_VERSION_CURRENT;

                return(new TokenStreamComponents(tokenizer, new TrimFilter(version, tokenizer, updateOffsets)));
            }
// Example #3
        public override TokenStream TokenStream(string fieldName, TextReader reader)
        {
            // The whole field value becomes one token.
            TokenStream source = new KeywordTokenizer(reader);

            // Replace hyphens with spaces inside the emitted token.
            var replacements = new Dictionary<char, char> { { '-', ' ' } };
            return new MapCharFilter(replacements, source);
        }
// Example #4
 public virtual void TestEmptyTerm()
 {
     // Every Snowball language must tolerate a zero-length term.
     foreach (String language in SNOWBALL_LANGS)
     {
         Analyzer analyzer = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
         {
             var source = new KeywordTokenizer(reader);
             return new TokenStreamComponents(source, new SnowballFilter(source, language));
         });
         CheckOneTerm(analyzer, "", "");
     }
 }
        public virtual void TestEmptyTerm()
        {
            Analyzer analyzer = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
            {
                // Randomly exercise both the offset-updating (LUCENE_43) and the
                // current-version TrimFilter code paths on an empty term.
                var source = new KeywordTokenizer(reader);
                bool fixOffsets = Random.nextBoolean();
                LuceneVersion matchVersion = fixOffsets ? LuceneVersion.LUCENE_43 : TEST_VERSION_CURRENT;
                return new TokenStreamComponents(source, new TrimFilter(matchVersion, source, fixOffsets));
            });

            CheckOneTerm(analyzer, "", "");
        }
 public override Analyzer.TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
 {
     // Before 4.0 the tokenizer output is wrapped in CollationKeyFilter;
     // from 4.0 on the keyword tokenizer is used directly.
     if (!matchVersion.OnOrAfter(Version.LUCENE_40))
     {
         KeywordTokenizer legacy = new KeywordTokenizer(reader);
         return new TokenStreamComponents(legacy, new CollationKeyFilter(legacy, collator));
     }

     KeywordTokenizer source = new KeywordTokenizer(factory, reader, KeywordTokenizer.DEFAULT_BUFFER_SIZE);
     return new TokenStreamComponents(source, source);
 }
        public void TestEmptyTerm()
        {
            Random rnd = Random;
            Analyzer analyzer = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
            {
                var source = new KeywordTokenizer(reader);
#pragma warning disable 612, 618
                // WordTokenFilter is obsolete; pragmas silence the warning.
                return new TokenStreamComponents(source, new WordTokenFilter(source));
#pragma warning restore 612, 618
            });

            CheckAnalysisConsistency(rnd, analyzer, rnd.nextBoolean(), "");
        }
// Example #8
        /// <summary>
        /// Runs the stemmer for <paramref name="snowballLanguage"/> over every
        /// word in voc.txt and checks each result against output.txt.
        /// </summary>
        private void AssertCorrectOutput(string snowballLanguage, string dataDirectory)
        {
            if (Verbose)
            {
                Console.WriteLine("checking snowball language: " + snowballLanguage);
            }

            Analyzer analyzer = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
            {
                var source = new KeywordTokenizer(reader);
                return new TokenStreamComponents(source, new SnowballFilter(source, snowballLanguage));
            });

            VocabularyAssert.AssertVocabulary(analyzer, GetDataFile("TestSnowballVocabData.zip"), dataDirectory + "/voc.txt", dataDirectory + "/output.txt");
        }
// Example #9
        public void TestEmptyTerm()
        {
            // Encoder matrix; RefinedSoundex was commented out in the original.
            IStringEncoder[] encoders =
            {
                new Metaphone(), new DoubleMetaphone(), new Soundex(), new Caverphone2()
            };
            foreach (IStringEncoder encoder in encoders)
            {
                Analyzer analyzer = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
                {
                    var source = new KeywordTokenizer(reader);
                    return new TokenStreamComponents(source, new PhoneticFilter(source, encoder, Random().nextBoolean()));
                });

                CheckOneTerm(analyzer, "", "");
            }
        }
        public GitHubIndex(Directory indexDirectory, string githubApiKey)
        {
            // Authenticated GitHub API client used to fetch repository data.
            github = new GitHubClient(new ProductHeaderValue("LuceneNetDemo"))
            {
                Credentials = new Credentials(githubApiKey)
            };

            analyzer = new PerFieldAnalyzerWrapper(
                // Example of a pre-built custom analyzer
                defaultAnalyzer: new HtmlStripAnalyzer(GitHubIndex.MatchVersion),

                // Example of inline anonymous analyzers
                fieldAnalyzers: new Dictionary <string, Analyzer>
            {
                // Field analyzer for owner: whole value as one token,
                // ASCII-folded, then lowercased.
                {
                    "owner",
                    Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
                    {
                        var source         = new KeywordTokenizer(reader);
                        TokenStream result = new ASCIIFoldingFilter(source);
                        result             = new LowerCaseFilter(GitHubIndex.MatchVersion, result);
                        return(new TokenStreamComponents(source, result));
                    })
                },
                // Field analyzer for name: standard tokenization, word-delimiter
                // splitting (NOTE(review): ~STEM_ENGLISH_POSSESSIVE presumably
                // enables all flags except possessive stemming -- confirm),
                // ASCII folding, lowercasing.
                {
                    "name",
                    Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
                    {
                        var source         = new StandardTokenizer(GitHubIndex.MatchVersion, reader);
                        TokenStream result = new WordDelimiterFilter(GitHubIndex.MatchVersion, source, ~WordDelimiterFlags.STEM_ENGLISH_POSSESSIVE, CharArraySet.EMPTY_SET);
                        result             = new ASCIIFoldingFilter(result);
                        result             = new LowerCaseFilter(GitHubIndex.MatchVersion, result);
                        return(new TokenStreamComponents(source, result));
                    })
                }
            });

            // Query parser searching name/description/readme with the same analyzer.
            queryParser = new MultiFieldQueryParser(GitHubIndex.MatchVersion,
                                                    new[] { "name", "description", "readme" }, analyzer);


            indexWriter     = new IndexWriter(indexDirectory, new IndexWriterConfig(GitHubIndex.MatchVersion, analyzer));
            // Searcher manager over the live writer. NOTE(review): the second
            // argument (true) is presumably applyAllDeletes -- confirm.
            searcherManager = new SearcherManager(indexWriter, true, null);
        }
// Example #11
        protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
        {
            // Numeric / not-analyzed fields: the whole value as one lowercased token.
            if (_adapter.IsNumericField(fieldName) || _adapter.IsNotAnalyzed(fieldName))
            {
                var source = new KeywordTokenizer(reader);
                return new TokenStreamComponents(source, new LowerCaseFilter(LuceneVersion.LUCENE_48, source));
            }

            // All other fields: MTG tokenization, lowercasing, then alphabet replacement.
            var mtgTokenizer = new MtgTokenizer(reader);
            var lowered = new LowerCaseFilter(LuceneVersion.LUCENE_48, mtgTokenizer);
            return new TokenStreamComponents(mtgTokenizer, new ReplaceFilter(lowered, MtgAplhabet.Replacements));
        }
        protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
        {
            // From 4.0 the keyword tokenizer is used as-is; older versions wrap
            // it in the (obsolete) CollationKeyFilter. Pragmas silence the
            // obsolete-member warnings on both paths.
#pragma warning disable 612, 618
            bool modern = this.matchVersion.OnOrAfter(LuceneVersion.LUCENE_40);
#pragma warning restore 612, 618
            if (modern)
            {
                var source = new KeywordTokenizer(this.factory, reader, KeywordTokenizer.DEFAULT_BUFFER_SIZE);
                return new TokenStreamComponents(source, source);
            }

            var legacyTokenizer = new KeywordTokenizer(reader);
#pragma warning disable 612, 618
            return new TokenStreamComponents(legacyTokenizer, new CollationKeyFilter(legacyTokenizer, this.collator));
#pragma warning restore 612, 618
        }
// Example #13
        public MySearch(string indexPath)
        {
            //_analyzer = new EnhEnglishAnalyzer(MATCH_LUCENE_VERSION);

            _analyzer = new MultiFieldAnalyzerWrapper(
                defaultAnalyzer: new EnhEnglishAnalyzer(MATCH_LUCENE_VERSION, true),
                new[]
            {
                (
                    new[] { "genre", "year" },
                    Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
                {
                    var source = new KeywordTokenizer(reader);
                    TokenStream result = new ASCIIFoldingFilter(source);
                    result = new LowerCaseFilter(MATCH_LUCENE_VERSION, result);
                    return(new TokenStreamComponents(source, result));
                })
                )
            });
        /// <summary>
        /// NGramTokenFilter over a random Unicode string (which may contain
        /// supplementary characters): every [minGram, maxGram] code-point
        /// window must be emitted, with offsets spanning the whole input.
        /// </summary>
        public virtual void testSupplementaryCharacters()
        {
            string s = TestUtil.randomUnicodeString(random(), 10);
            // Count code points, not chars: surrogate pairs count once.
            int codePointCount = s.codePointCount(0, s.Length);
            int minGram = TestUtil.Next(random(), 1, 3);
            int         maxGram = TestUtil.Next(random(), minGram, 10);
            TokenStream tk      = new KeywordTokenizer(new StringReader(s));

            tk = new NGramTokenFilter(TEST_VERSION_CURRENT, tk, minGram, maxGram);
            CharTermAttribute termAtt = tk.addAttribute(typeof(CharTermAttribute));
            OffsetAttribute offsetAtt = tk.addAttribute(typeof(OffsetAttribute));

            tk.reset();
            for (int start = 0; start < codePointCount; ++start)
            {
                for (int end = start + minGram; end <= Math.Min(codePointCount, start + maxGram); ++end)
                {
                    assertTrue(tk.incrementToken());
                    // Every n-gram of the single keyword token keeps the full
                    // original offsets [0, s.Length].
                    assertEquals(0, offsetAtt.startOffset());
                    assertEquals(s.Length, offsetAtt.endOffset());
                    // Translate code-point positions into char indices for Substring.
                    int startIndex = char.offsetByCodePoints(s, 0, start);
                    int endIndex = char.offsetByCodePoints(s, 0, end);
                    assertEquals(s.Substring(startIndex, endIndex - startIndex), termAtt.ToString());
                }
            }
            assertFalse(tk.incrementToken());
        }
		public override TokenStream ReusableTokenStream(System.String fieldName, System.IO.TextReader reader)
		{
			// LUCENE-1678: a subclass that overrides TokenStream() but not
			// ReusableTokenStream() must still have its override called.
			if (overridesTokenStreamMethod)
			{
				return TokenStream(fieldName, reader);
			}

			var cached = (Tokenizer) PreviousTokenStream;
			if (cached != null)
			{
				// Reuse the saved tokenizer on the new reader.
				cached.Reset(reader);
				return cached;
			}

			cached = new KeywordTokenizer(reader);
			PreviousTokenStream = cached;
			return cached;
		}
        public override TokenStream TokenStream(string fieldName, System.IO.TextReader reader)
        {
            // One token per field value, lowercased. Stop-word removal and
            // synonym injection are intentionally not applied here.
            TokenStream chain = new KeywordTokenizer(reader);
            chain = new LowerCaseFilter(chain);
            return chain;
        }
// Example #17
        public void TestCustomAttribute()
        {
            // Mark every token as a keyword, then verify that BeiderMorseFilter
            // carries the flag through to each emitted phonetic token.
            TokenStream stream = new KeywordTokenizer(new StringReader("D'Angelo"));
            stream = new PatternKeywordMarkerFilter(stream, new Regex(".*"));
            stream = new BeiderMorseFilter(stream, new PhoneticEngine(NameType.GENERIC, RuleType.EXACT, true));
            IKeywordAttribute keyAtt = stream.AddAttribute<IKeywordAttribute>();

            stream.Reset();
            int emitted = 0;
            while (stream.IncrementToken())
            {
                assertTrue(keyAtt.IsKeyword);
                emitted++;
            }
            assertEquals(12, emitted);
            stream.End();
            stream.Dispose();
        }
        public virtual void TestIgnoreCaseNoSideEffects()
        {
            System.IO.Stream affixStream = typeof(TestStemmer).getResourceAsStream("simple.aff");
            System.IO.Stream dictStream  = typeof(TestStemmer).getResourceAsStream("simple.dic");
            Dictionary d;
            try
            {
                // ignoreCase = true
                d = new Dictionary(affixStream, new Stream[] { dictStream }, true);
            }
            finally
            {
                // Close both streams even if dictionary loading failed.
                IOUtils.DisposeWhileHandlingException(affixStream, dictStream);
            }

            Analyzer analyzer = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
            {
                var source = new KeywordTokenizer(reader);
                return new TokenStreamComponents(source, new HunspellStemFilter(source, d));
            });

            // Case-insensitive stemming must not alter the term's original casing.
            CheckOneTerm(analyzer, "NoChAnGy", "NoChAnGy");
        }
// Example #19
        public override TokenStream ReusableTokenStream(string fieldName, System.IO.TextReader reader)
        {
            // LUCENE-1678: subclasses overriding TokenStream() but not this
            // method must still get their override honored.
            if (overridesTokenStreamMethod)
            {
                return TokenStream(fieldName, reader);
            }

            var cached = (Tokenizer)PreviousTokenStream;
            if (cached != null)
            {
                // Reuse the saved tokenizer on the new reader.
                cached.Reset(reader);
                return cached;
            }

            cached = new KeywordTokenizer(reader);
            PreviousTokenStream = cached;
            return cached;
        }
 public virtual void TestRandomStrings()
 {
     // Random (text, min, max) triples: CodepointCountFilter must keep the
     // token iff min <= codePointCount(text) <= max.
     for (int i = 0; i < 10000; i++)
     {
         string text = TestUtil.RandomUnicodeString(Random, 100);
         int a = TestUtil.NextInt32(Random, 0, 100);
         int b = TestUtil.NextInt32(Random, 0, 100);
         int lower = Math.Min(a, b);
         int upper = Math.Max(a, b);

         int count = text.CodePointCount(0, text.Length);
         bool expected = count >= lower && count <= upper;

         TokenStream stream = new KeywordTokenizer(new StringReader(text));
         stream = new CodepointCountFilter(TEST_VERSION_CURRENT, stream, lower, upper);
         stream.Reset();
         assertEquals(expected, stream.IncrementToken());
         stream.End();
         stream.Dispose();
     }
 }
        public virtual void TestSupplementaryCharacters()
        {
            // Random Unicode input (may include surrogate pairs); edge n-grams
            // are measured in code points and offsets always span the whole token.
            string input = TestUtil.RandomUnicodeString(Random(), 10);
            int codePoints = input.CodePointCount(0, input.Length);
            int minGram = TestUtil.NextInt(Random(), 1, 3);
            int maxGram = TestUtil.NextInt(Random(), minGram, 10);

            TokenStream tk = new KeywordTokenizer(new StringReader(input));
            tk = new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, tk, minGram, maxGram);
            ICharTermAttribute termAtt = tk.AddAttribute<ICharTermAttribute>();
            IOffsetAttribute offsetAtt = tk.AddAttribute<IOffsetAttribute>();

            tk.Reset();
            int largest = Math.Min(codePoints, maxGram);
            for (int size = minGram; size <= largest; ++size)
            {
                assertTrue(tk.IncrementToken());
                assertEquals(0, offsetAtt.StartOffset);
                assertEquals(input.Length, offsetAtt.EndOffset);
                // Convert the code-point length to a char index before slicing.
                int end = Character.OffsetByCodePoints(input, 0, size);
                assertEquals(input.Substring(0, end), termAtt.ToString());
            }
            assertFalse(tk.IncrementToken());
        }
// Example #22
        /// <summary>
        /// Builds a StemmerOverrideFilter from a random map of realistic
        /// Unicode keys and checks that overridden terms bypass Porter stemming.
        /// </summary>
        public virtual void testRandomRealisticKeyword()
        {
            IDictionary <string, string> map = new Dictionary <string, string>();
            int numTerms = atLeast(50);

            for (int i = 0; i < numTerms; i++)
            {
                string randomRealisticUnicodeString = TestUtil.randomRealisticUnicodeString(random());
                if (randomRealisticUnicodeString.Length > 0)
                {
                    // Replacement values must be non-empty; fall back to "a".
                    string value = TestUtil.randomSimpleString(random());
                    map[randomRealisticUnicodeString] = value.Length == 0 ? "a" : value;
                }
            }
            if (map.Count == 0)
            {
                // Guarantee at least one entry so the filter is always exercised.
                map["booked"] = "books";
            }
            // ignoreCase is chosen at random to cover both builder code paths.
            StemmerOverrideFilter.Builder         builder  = new StemmerOverrideFilter.Builder(random().nextBoolean());
            ISet <KeyValuePair <string, string> > entrySet = map.SetOfKeyValuePairs();

            foreach (KeyValuePair <string, string> entry in entrySet)
            {
                builder.add(entry.Key, entry.Value);
            }
            StemmerOverrideMap build = builder.build();

            // Spot-check a random subset of the overrides end-to-end.
            foreach (KeyValuePair <string, string> entry in entrySet)
            {
                if (random().nextBoolean())
                {
                    Tokenizer   tokenizer = new KeywordTokenizer(new StringReader(entry.Key));
                    TokenStream stream    = new PorterStemFilter(new StemmerOverrideFilter(tokenizer, build));
                    assertTokenStreamContents(stream, new string[] { entry.Value });
                }
            }
        }
// Example #23
            protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
            {
                // Whole input as one token, run through the Porter stemmer.
                var source = new KeywordTokenizer(reader);
                return new TokenStreamComponents(source, new PorterStemFilter(source));
            }
// Example #24
            public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
            {
                // Single-token stream fed into the Snowball stemmer for `lang`.
                var source = new KeywordTokenizer(reader);
                return new Analyzer.TokenStreamComponents(source, new SnowballFilter(source, lang));
            }
            protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
            {
                // Keyword tokenizer feeding the Norwegian minimal stemmer.
                var source = new KeywordTokenizer(reader);
                return new TokenStreamComponents(source, new NorwegianMinimalStemFilter(source));
            }
// Example #26
            protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
            {
                // Lowercase the single keyword token before German stemming.
                var source = new KeywordTokenizer(reader);
                return new TokenStreamComponents(source, new GermanStemFilter(new LowerCaseFilter(TEST_VERSION_CURRENT, source)));
            }
        public override Analyzer.TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
        {
            // Before 4.0 the tokenizer is wrapped in CollationKeyFilter;
            // from 4.0 on it is used directly.
            if (!matchVersion.OnOrAfter(Version.LUCENE_40))
            {
                KeywordTokenizer legacy = new KeywordTokenizer(reader);
                return new TokenStreamComponents(legacy, new CollationKeyFilter(legacy, collator));
            }

            KeywordTokenizer source = new KeywordTokenizer(factory, reader, KeywordTokenizer.DEFAULT_BUFFER_SIZE);
            return new TokenStreamComponents(source, source);
        }
// Example #28
            protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
            {
                // Keyword tokenizer followed by Irish-specific lowercasing.
                var source = new KeywordTokenizer(reader);
                return new TokenStreamComponents(source, new IrishLowerCaseFilter(source));
            }
// Example #29
            public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
            {
                // Keyword tokenizer feeding the Arabic stemmer.
                var source = new KeywordTokenizer(reader);
                return new TokenStreamComponents(source, new ArabicStemFilter(source));
            }
// Example #30
            public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
            {
                // Keep only tokens of length 0..5.
                var source = new KeywordTokenizer(reader);
                return new TokenStreamComponents(source, new LengthFilter(TEST_VERSION_CURRENT, source, 0, 5));
            }
// Example #31
            protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
            {
                // Emit n-grams of length 2..15 from the single keyword token.
                var source = new KeywordTokenizer(reader);
                return new TokenStreamComponents(source, new NGramTokenFilter(TEST_VERSION_CURRENT, source, 2, 15));
            }
            protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
            {
                // Split the keyword token using the configured delimiter flags
                // and protected-words set.
                var source = new KeywordTokenizer(reader);
                return new TokenStreamComponents(source, new WordDelimiterFilter(TEST_VERSION_CURRENT, source, flags, protectedWords));
            }
 public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
 {
     // Keyword tokenizer followed by duplicate-token removal.
     var source = new KeywordTokenizer(reader);
     return new TokenStreamComponents(source, new RemoveDuplicatesTokenFilter(source));
 }