public virtual void TestIgnoreCase()
{
    // Let's make "booked" stem to "books": the override filter will convert
    // "booked" to "books", but also mark it with KeywordAttribute so Porter
    // will not change it.
    StemmerOverrideFilter.Builder builder = new StemmerOverrideFilter.Builder(true);
    builder.Add("boOkEd", "books");
    Tokenizer tokenizer = new KeywordTokenizer(new StringReader("BooKeD"));
    TokenStream stream = new PorterStemFilter(new StemmerOverrideFilter(tokenizer, builder.Build()));
    AssertTokenStreamContents(stream, new string[] { "books" });
}
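For reference, the assertion in the test above can be reproduced by hand with the standard Lucene.NET consumer loop (Reset, IncrementToken, End, Dispose). This is a minimal sketch against the same stream variable, assuming the 4.8 attribute API used in the other snippets on this page:

// Sketch: hand-rolled equivalent of AssertTokenStreamContents for the stream above.
ICharTermAttribute termAtt = stream.AddAttribute<ICharTermAttribute>();
stream.Reset();
while (stream.IncrementToken())
{
    Console.WriteLine(termAtt.ToString()); // prints "books": overridden, then left alone by Porter
}
stream.End();
stream.Dispose();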
protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    Tokenizer tokenizer = new KeywordTokenizer(reader);
    bool updateOffsets = Random.nextBoolean();
    Version version = updateOffsets ? Version.LUCENE_43 : TEST_VERSION_CURRENT;
    return new TokenStreamComponents(tokenizer, new TrimFilter(version, tokenizer, updateOffsets));
}
public override TokenStream TokenStream(string fieldName, TextReader reader)
{
    // Emits the entire input as a single token.
    TokenStream source = new KeywordTokenizer(reader);
    var map = new Dictionary<char, char> { { '-', ' ' } };
    // Replaces the specified characters in the token stream and lowercases the result.
    return new MapCharFilter(map, source);
}
public virtual void TestEmptyTerm()
{
    foreach (String lang in SNOWBALL_LANGS)
    {
        Analyzer a = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
        {
            Tokenizer tokenizer = new KeywordTokenizer(reader);
            return new TokenStreamComponents(tokenizer, new SnowballFilter(tokenizer, lang));
        });
        CheckOneTerm(a, "", "");
    }
}
public virtual void TestEmptyTerm()
{
    Analyzer a = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
    {
        Tokenizer tokenizer = new KeywordTokenizer(reader);
        bool updateOffsets = Random.nextBoolean();
        LuceneVersion version = updateOffsets ? LuceneVersion.LUCENE_43 : TEST_VERSION_CURRENT;
        return new TokenStreamComponents(tokenizer, new TrimFilter(version, tokenizer, updateOffsets));
    });
    CheckOneTerm(a, "", "");
}
public override Analyzer.TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    if (matchVersion.OnOrAfter(Version.LUCENE_40))
    {
        KeywordTokenizer tokenizer = new KeywordTokenizer(factory, reader, KeywordTokenizer.DEFAULT_BUFFER_SIZE);
        return new TokenStreamComponents(tokenizer, tokenizer);
    }
    else
    {
        KeywordTokenizer tokenizer = new KeywordTokenizer(reader);
        return new TokenStreamComponents(tokenizer, new CollationKeyFilter(tokenizer, collator));
    }
}
public void TestEmptyTerm()
{
    Random random = Random;
    Analyzer a = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
    {
        Tokenizer tokenizer = new KeywordTokenizer(reader);
#pragma warning disable 612, 618
        return new TokenStreamComponents(tokenizer, new WordTokenFilter(tokenizer));
#pragma warning restore 612, 618
    });
    CheckAnalysisConsistency(random, a, random.nextBoolean(), "");
}
/// <summary>
/// For the supplied language, run the stemmer against all strings in voc.txt.
/// The output should be the same as the strings in output.txt.
/// </summary>
private void AssertCorrectOutput(string snowballLanguage, string dataDirectory)
{
    if (Verbose)
    {
        Console.WriteLine("checking snowball language: " + snowballLanguage);
    }
    Analyzer a = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
    {
        Tokenizer t = new KeywordTokenizer(reader);
        return new TokenStreamComponents(t, new SnowballFilter(t, snowballLanguage));
    });
    VocabularyAssert.AssertVocabulary(a, GetDataFile("TestSnowballVocabData.zip"),
        dataDirectory + "/voc.txt", dataDirectory + "/output.txt");
}
public void TestEmptyTerm()
{
    IStringEncoder[] encoders = new IStringEncoder[]
    {
        new Metaphone(), new DoubleMetaphone(), new Soundex() /*, new RefinedSoundex()*/, new Caverphone2()
    };
    foreach (IStringEncoder e in encoders)
    {
        Analyzer a = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
        {
            Tokenizer tokenizer = new KeywordTokenizer(reader);
            return new TokenStreamComponents(tokenizer, new PhoneticFilter(tokenizer, e, Random().nextBoolean()));
        });
        CheckOneTerm(a, "", "");
    }
}
public GitHubIndex(Directory indexDirectory, string githubApiKey)
{
    github = new GitHubClient(new ProductHeaderValue("LuceneNetDemo"))
    {
        Credentials = new Credentials(githubApiKey)
    };

    analyzer = new PerFieldAnalyzerWrapper(
        // Example of a pre-built custom analyzer
        defaultAnalyzer: new HtmlStripAnalyzer(GitHubIndex.MatchVersion),
        // Example of inline anonymous analyzers
        fieldAnalyzers: new Dictionary<string, Analyzer>
        {
            // Field analyzer for owner
            {
                "owner",
                Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
                {
                    var source = new KeywordTokenizer(reader);
                    TokenStream result = new ASCIIFoldingFilter(source);
                    result = new LowerCaseFilter(GitHubIndex.MatchVersion, result);
                    return new TokenStreamComponents(source, result);
                })
            },
            // Field analyzer for name
            {
                "name",
                Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
                {
                    var source = new StandardTokenizer(GitHubIndex.MatchVersion, reader);
                    TokenStream result = new WordDelimiterFilter(GitHubIndex.MatchVersion, source,
                        ~WordDelimiterFlags.STEM_ENGLISH_POSSESSIVE, CharArraySet.EMPTY_SET);
                    result = new ASCIIFoldingFilter(result);
                    result = new LowerCaseFilter(GitHubIndex.MatchVersion, result);
                    return new TokenStreamComponents(source, result);
                })
            }
        });

    queryParser = new MultiFieldQueryParser(GitHubIndex.MatchVersion,
        new[] { "name", "description", "readme" }, analyzer);

    indexWriter = new IndexWriter(indexDirectory, new IndexWriterConfig(GitHubIndex.MatchVersion, analyzer));
    searcherManager = new SearcherManager(indexWriter, true, null);
}
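The constructor above only wires up the analyzer, writer, and searcherManager; no search method is shown. The sketch below is a hypothetical companion method (not part of the original class) illustrating how the queryParser and searcherManager fields would typically be used in Lucene.NET 4.8:

// Hypothetical companion method (sketch), not from the original GitHubIndex source.
public List<string> Search(string queryText)
{
    var names = new List<string>();
    searcherManager.MaybeRefreshBlocking();         // pick up recent commits from indexWriter
    IndexSearcher searcher = searcherManager.Acquire();
    try
    {
        Query query = queryParser.Parse(queryText); // parsed against name/description/readme
        TopDocs hits = searcher.Search(query, 10);
        foreach (ScoreDoc sd in hits.ScoreDocs)
        {
            Document doc = searcher.Doc(sd.Doc);
            names.Add(doc.Get("name"));
        }
    }
    finally
    {
        searcherManager.Release(searcher);          // acquired searchers must always be released
    }
    return names;
}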
protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    if (_adapter.IsNumericField(fieldName) || _adapter.IsNotAnalyzed(fieldName))
    {
        var tokenizer = new KeywordTokenizer(reader);
        var filter = new LowerCaseFilter(LuceneVersion.LUCENE_48, tokenizer);
        return new TokenStreamComponents(tokenizer, filter);
    }
    else
    {
        var tokenizer = new MtgTokenizer(reader);
        var lowerCaseFilter = new LowerCaseFilter(LuceneVersion.LUCENE_48, tokenizer);
        var replacementFilter = new ReplaceFilter(lowerCaseFilter, MtgAplhabet.Replacements);
        return new TokenStreamComponents(tokenizer, replacementFilter);
    }
}
protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
#pragma warning disable 612, 618
    if (this.matchVersion.OnOrAfter(LuceneVersion.LUCENE_40))
#pragma warning restore 612, 618
    {
        var tokenizer = new KeywordTokenizer(this.factory, reader, KeywordTokenizer.DEFAULT_BUFFER_SIZE);
        return new TokenStreamComponents(tokenizer, tokenizer);
    }
    else
    {
        var tokenizer = new KeywordTokenizer(reader);
        return new TokenStreamComponents(tokenizer,
#pragma warning disable 612, 618
            new CollationKeyFilter(tokenizer, this.collator));
#pragma warning restore 612, 618
    }
}
public MySearch(string indexPath)
{
    //_analyzer = new EnhEnglishAnalyzer(MATCH_LUCENE_VERSION);
    _analyzer = new MultiFieldAnalyzerWrapper(
        defaultAnalyzer: new EnhEnglishAnalyzer(MATCH_LUCENE_VERSION, true),
        new[]
        {
            (
                new[] { "genre", "year" },
                Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
                {
                    var source = new KeywordTokenizer(reader);
                    TokenStream result = new ASCIIFoldingFilter(source);
                    result = new LowerCaseFilter(MATCH_LUCENE_VERSION, result);
                    return new TokenStreamComponents(source, result);
                })
            )
        });
}
public virtual void TestSupplementaryCharacters()
{
    string s = TestUtil.RandomUnicodeString(Random(), 10);
    int codePointCount = s.CodePointCount(0, s.Length);
    int minGram = TestUtil.NextInt(Random(), 1, 3);
    int maxGram = TestUtil.NextInt(Random(), minGram, 10);
    TokenStream tk = new KeywordTokenizer(new StringReader(s));
    tk = new NGramTokenFilter(TEST_VERSION_CURRENT, tk, minGram, maxGram);
    ICharTermAttribute termAtt = tk.AddAttribute<ICharTermAttribute>();
    IOffsetAttribute offsetAtt = tk.AddAttribute<IOffsetAttribute>();
    tk.Reset();
    for (int start = 0; start < codePointCount; ++start)
    {
        for (int end = start + minGram; end <= Math.Min(codePointCount, start + maxGram); ++end)
        {
            assertTrue(tk.IncrementToken());
            assertEquals(0, offsetAtt.StartOffset);
            assertEquals(s.Length, offsetAtt.EndOffset);
            int startIndex = Character.OffsetByCodePoints(s, 0, start);
            int endIndex = Character.OffsetByCodePoints(s, 0, end);
            assertEquals(s.Substring(startIndex, endIndex - startIndex), termAtt.ToString());
        }
    }
    assertFalse(tk.IncrementToken());
}
public override TokenStream ReusableTokenStream(System.String fieldName, System.IO.TextReader reader)
{
    if (overridesTokenStreamMethod)
    {
        // LUCENE-1678: force fallback to tokenStream() if we
        // have been subclassed and that subclass overrides
        // tokenStream but not reusableTokenStream
        return TokenStream(fieldName, reader);
    }
    var tokenizer = (Tokenizer)PreviousTokenStream;
    if (tokenizer == null)
    {
        tokenizer = new KeywordTokenizer(reader);
        PreviousTokenStream = tokenizer;
    }
    else
    {
        tokenizer.Reset(reader);
    }
    return tokenizer;
}
public override TokenStream TokenStream(string fieldName, System.IO.TextReader reader)
{
    TokenStream result = new KeywordTokenizer(reader);

    // Add in filters.
    // Make sure everything is lower case.
    result = new LowerCaseFilter(result);

    // Use the default list of stop words, provided by the StopAnalyzer class.
    //result = new StopFilter(result, StopAnalyzer.ENGLISH_STOP_WORDS);

    // Inject the synonyms.
    //result = new SynonymFilter(result, SynonymEngine);

    // Return the built token stream.
    return result;
}
public void TestCustomAttribute()
{
    TokenStream stream = new KeywordTokenizer(new StringReader("D'Angelo"));
    stream = new PatternKeywordMarkerFilter(stream, new Regex(".*"));
    stream = new BeiderMorseFilter(stream, new PhoneticEngine(NameType.GENERIC, RuleType.EXACT, true));
    IKeywordAttribute keyAtt = stream.AddAttribute<IKeywordAttribute>();
    stream.Reset();
    int i = 0;
    while (stream.IncrementToken())
    {
        assertTrue(keyAtt.IsKeyword);
        i++;
    }
    assertEquals(12, i);
    stream.End();
    stream.Dispose();
}
public virtual void TestIgnoreCaseNoSideEffects()
{
    Dictionary d;
    System.IO.Stream affixStream = typeof(TestStemmer).getResourceAsStream("simple.aff");
    System.IO.Stream dictStream = typeof(TestStemmer).getResourceAsStream("simple.dic");
    try
    {
        d = new Dictionary(affixStream, new Stream[] { dictStream }, true);
    }
    finally
    {
        IOUtils.DisposeWhileHandlingException(affixStream, dictStream);
    }
    Analyzer a = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
    {
        Tokenizer tokenizer = new KeywordTokenizer(reader);
        return new TokenStreamComponents(tokenizer, new HunspellStemFilter(tokenizer, d));
    });
    CheckOneTerm(a, "NoChAnGy", "NoChAnGy");
}
public virtual void TestRandomStrings()
{
    for (int i = 0; i < 10000; i++)
    {
        string text = TestUtil.RandomUnicodeString(Random, 100);
        int min = TestUtil.NextInt32(Random, 0, 100);
        int max = TestUtil.NextInt32(Random, 0, 100);
        int count = text.CodePointCount(0, text.Length);
        if (min > max)
        {
            int temp = min;
            min = max;
            max = temp;
        }
        bool expected = count >= min && count <= max;
        TokenStream stream = new KeywordTokenizer(new StringReader(text));
        stream = new CodepointCountFilter(TEST_VERSION_CURRENT, stream, min, max);
        stream.Reset();
        assertEquals(expected, stream.IncrementToken());
        stream.End();
        stream.Dispose();
    }
}
public virtual void TestSupplementaryCharacters()
{
    string s = TestUtil.RandomUnicodeString(Random(), 10);
    int codePointCount = s.CodePointCount(0, s.Length);
    int minGram = TestUtil.NextInt(Random(), 1, 3);
    int maxGram = TestUtil.NextInt(Random(), minGram, 10);
    TokenStream tk = new KeywordTokenizer(new StringReader(s));
    tk = new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, tk, minGram, maxGram);
    ICharTermAttribute termAtt = tk.AddAttribute<ICharTermAttribute>();
    IOffsetAttribute offsetAtt = tk.AddAttribute<IOffsetAttribute>();
    tk.Reset();
    for (int i = minGram; i <= Math.Min(codePointCount, maxGram); ++i)
    {
        assertTrue(tk.IncrementToken());
        assertEquals(0, offsetAtt.StartOffset);
        assertEquals(s.Length, offsetAtt.EndOffset);
        int end = Character.OffsetByCodePoints(s, 0, i);
        assertEquals(s.Substring(0, end), termAtt.ToString());
    }
    assertFalse(tk.IncrementToken());
}
public virtual void TestRandomRealisticKeyword()
{
    IDictionary<string, string> map = new Dictionary<string, string>();
    int numTerms = AtLeast(50);
    for (int i = 0; i < numTerms; i++)
    {
        string randomRealisticUnicodeString = TestUtil.RandomRealisticUnicodeString(Random());
        if (randomRealisticUnicodeString.Length > 0)
        {
            string value = TestUtil.RandomSimpleString(Random());
            map[randomRealisticUnicodeString] = value.Length == 0 ? "a" : value;
        }
    }
    if (map.Count == 0)
    {
        map["booked"] = "books";
    }
    StemmerOverrideFilter.Builder builder = new StemmerOverrideFilter.Builder(Random().nextBoolean());
    foreach (KeyValuePair<string, string> entry in map)
    {
        builder.Add(entry.Key, entry.Value);
    }
    StemmerOverrideMap build = builder.Build();
    foreach (KeyValuePair<string, string> entry in map)
    {
        if (Random().nextBoolean())
        {
            Tokenizer tokenizer = new KeywordTokenizer(new StringReader(entry.Key));
            TokenStream stream = new PorterStemFilter(new StemmerOverrideFilter(tokenizer, build));
            AssertTokenStreamContents(stream, new string[] { entry.Value });
        }
    }
}
protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    Tokenizer tokenizer = new KeywordTokenizer(reader);
    return new TokenStreamComponents(tokenizer, new PorterStemFilter(tokenizer));
}
public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    Tokenizer tokenizer = new KeywordTokenizer(reader);
    return new Analyzer.TokenStreamComponents(tokenizer, new SnowballFilter(tokenizer, lang));
}
protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    Tokenizer tokenizer = new KeywordTokenizer(reader);
    return new TokenStreamComponents(tokenizer, new NorwegianMinimalStemFilter(tokenizer));
}
protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    Tokenizer t = new KeywordTokenizer(reader);
    return new TokenStreamComponents(t, new GermanStemFilter(new LowerCaseFilter(TEST_VERSION_CURRENT, t)));
}
protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    Tokenizer tokenizer = new KeywordTokenizer(reader);
    return new TokenStreamComponents(tokenizer, new IrishLowerCaseFilter(tokenizer));
}
public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    Tokenizer tokenizer = new KeywordTokenizer(reader);
    return new TokenStreamComponents(tokenizer, new ArabicStemFilter(tokenizer));
}
public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    Tokenizer tokenizer = new KeywordTokenizer(reader);
    return new TokenStreamComponents(tokenizer, new LengthFilter(TEST_VERSION_CURRENT, tokenizer, 0, 5));
}
protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    Tokenizer tokenizer = new KeywordTokenizer(reader);
    return new TokenStreamComponents(tokenizer, new NGramTokenFilter(TEST_VERSION_CURRENT, tokenizer, 2, 15));
}
protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    Tokenizer tokenizer = new KeywordTokenizer(reader);
    return new TokenStreamComponents(tokenizer, new WordDelimiterFilter(TEST_VERSION_CURRENT, tokenizer, flags, protectedWords));
}
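The snippet above reads flags and protectedWords from the enclosing test class without showing them. For orientation, one plausible setup is sketched below; the values are illustrative assumptions, not taken from the original source (WordDelimiterFlags and CharArraySet are the Lucene.NET 4.8 types, with CharArraySet.EMPTY_SET as used in the GitHubIndex example above):

// Illustrative values only; the original test derives these fields elsewhere.
WordDelimiterFlags flags = WordDelimiterFlags.GENERATE_WORD_PARTS
                         | WordDelimiterFlags.SPLIT_ON_CASE_CHANGE
                         | WordDelimiterFlags.STEM_ENGLISH_POSSESSIVE;
CharArraySet protectedWords = CharArraySet.EMPTY_SET; // no protected terms; a populated set bypasses splitting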
public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    Tokenizer tokenizer = new KeywordTokenizer(reader);
    return new TokenStreamComponents(tokenizer, new RemoveDuplicatesTokenFilter(tokenizer));
}
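Most of the CreateComponents overrides above share one pattern: a KeywordTokenizer, which emits the entire input as a single token, wrapped in exactly one filter. A minimal end-to-end sketch of exercising such an analyzer, assuming Lucene.NET 4.8's Analyzer.NewAnonymous and GetTokenStream (the field name and input text are placeholders):

// Sketch: run a single-filter KeywordTokenizer analyzer over a string and print its tokens.
Analyzer analyzer = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
{
    Tokenizer tokenizer = new KeywordTokenizer(reader);
    return new TokenStreamComponents(tokenizer, new RemoveDuplicatesTokenFilter(tokenizer));
});
using (TokenStream ts = analyzer.GetTokenStream("field", new StringReader("some input")))
{
    ICharTermAttribute termAtt = ts.AddAttribute<ICharTermAttribute>();
    ts.Reset();
    while (ts.IncrementToken())
    {
        Console.WriteLine(termAtt.ToString()); // exactly one token: "some input"
    }
    ts.End();
}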