/// <summary>
/// Constructs a <seealso cref="StandardTokenizer"/> filtered by a
/// StandardFilter, a <seealso cref="LowerCaseFilter"/>, a <seealso cref="StopFilter"/>,
/// and a <seealso cref="SnowballFilter"/> for this analyzer's language (<c>name</c>).
/// </summary>
public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    Tokenizer tokenizer = new StandardTokenizer(matchVersion, reader);
    TokenStream result = new StandardFilter(matchVersion, tokenizer);
    // remove the possessive 's for english stemmers
    if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_31) && (name.Equals("English") || name.Equals("Porter") || name.Equals("Lovins")))
    {
        result = new EnglishPossessiveFilter(result);
    }
    // Use a special lowercase filter for turkish, the stemmer expects it.
    if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_31) && name.Equals("Turkish"))
    {
        result = new TurkishLowerCaseFilter(result);
    }
    else
    {
        result = new LowerCaseFilter(matchVersion, result);
    }
    // Stop-word removal is optional: only applied when a stop set was configured.
    if (stopSet != null)
    {
        result = new StopFilter(matchVersion, result, stopSet);
    }
    // Stemmer selected by language name; runs last so it sees normalized tokens.
    result = new SnowballFilter(result, name);
    return new TokenStreamComponents(tokenizer, result);
}
/// <summary>
/// Builds the analysis chain: standard tokenization, 2–8 token shingles
/// joined with spaces, lower-casing, then English stop-word removal.
/// </summary>
public override TokenStream TokenStream(string fieldName, TextReader reader)
{
    // StandardTokenizer splits at punctuation and hyphens (unless the token
    // contains a number) and keeps email addresses/hostnames as single tokens.
    TokenStream chain = new StandardTokenizer(Version.LUCENE_30, reader);

    // Build shingles (token n-grams) of 2..8 tokens separated by a space,
    // e.g. "2010 Audi RS5 Quattro Coupe" => "2010 Audi", "Audi RS5", ...
    chain = new ShingleMatrixFilter(chain, 2, 8, ' ');

    // Normalize to lower case before stop-word matching.
    chain = new LowerCaseFilter(chain);

    // Remove common English stop words.
    return new StopFilter(true, chain, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
}
/// <summary>
/// Standard tokenizer → 2–8 token shingles (space-joined) → lower case →
/// English stop-word removal.
/// </summary>
public override TokenStream TokenStream(string fieldName, TextReader reader)
{
    TokenStream stream = new StandardTokenizer(Version.LUCENE_30, reader);
    stream = new ShingleMatrixFilter(stream, 2, 8, ' ');
    stream = new LowerCaseFilter(stream);
    return new StopFilter(true, stream, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
}
/// <summary>
/// Standard tokenizer followed by a StandardFilter and a stop-word filter
/// built from this analyzer's configured <c>_stopWords</c> set.
/// </summary>
public override TokenStream TokenStream(string fieldName, TextReader reader)
{
    TokenStream chain = new StandardTokenizer(Version.LUCENE_30, reader);
    chain = new StandardFilter(chain);
    // NOTE(review): the trailing flag selects a StopFilter overload —
    // presumably case-insensitive stop-word matching; confirm the overload.
    return new StopFilter(true, chain, _stopWords, true);
}
/// <summary>Constructs a <see cref="StandardTokenizer"/> filtered by a
/// StandardFilter, a LowerCaseFilter and a StopFilter over <c>stopSet</c>.
/// </summary>
public override TokenStream TokenStream(System.String fieldName, System.IO.TextReader reader)
{
    TokenStream chain = new StandardTokenizer(reader);
    chain = new StandardFilter(chain);
    chain = new LowerCaseFilter(chain);
    return new StopFilter(chain, stopSet);
}
/// <summary>
/// Standard tokenization and normalization followed by Russian snowball
/// stemming (no stop-word removal).
/// </summary>
public override TokenStream TokenStream(string fieldName, TextReader reader)
{
    TokenStream chain = new StandardTokenizer(reader);
    chain = new StandardFilter(chain);
    chain = new LowerCaseFilter(chain);
    return new RuSnowballFilter(chain);
}
/// <summary>
/// Standard tokenizer → StandardFilter → lower case → Croatian stemming.
/// Stop-word filtering is currently disabled (left commented out below).
/// </summary>
public override TokenStream TokenStream(String fieldName, TextReader reader)
{
    Tokenizer source = new StandardTokenizer(matchVersion, reader);
    TokenStream chain = new StandardFilter(source);
    chain = new LowerCaseFilter(chain);
    //chain = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion), chain, stopSet);
    return new CroatianStemFilter(chain, stemmer);
}
/// <summary>
/// Constructs a StandardTokenizer filtered by a StandardFilter, a
/// LowerCaseFilter, a StopFilter and a SpanishStemFilter.
/// </summary>
public override TokenStream TokenStream(string fieldName, TextReader reader)
{
    TokenStream chain = new StandardTokenizer(Version.LUCENE_24, reader);
    chain = new StandardFilter(chain);
    chain = new LowerCaseFilter(chain);
    // Position increments are preserved where stop words are removed.
    chain = new StopFilter(true, chain, stopTable);
    return new SpanishStemFilter(chain);
}
/// <summary>
/// Standard analysis chain (tokens capped at 255 chars) with standard stop
/// words removed, expanded into 2–6 character n-grams.
/// </summary>
public override TokenStream TokenStream(string fieldName, TextReader reader)
{
    var source = new StandardTokenizer(Version.LUCENE_29, reader) { MaxTokenLength = 255 };
    TokenStream chain = new StandardFilter(source);
    chain = new LowerCaseFilter(chain);
    // false: position increments are not preserved for removed stop words.
    chain = new StopFilter(false, chain, StandardAnalyzer.STOP_WORDS_SET);
    return new NGramTokenFilter(chain, 2, 6);
}
/// <summary>
/// Thai analysis: standard tokenization and normalization, ThaiWordFilter
/// segmentation, then English stop-word removal.
/// </summary>
public override TokenStream TokenStream(String fieldName, TextReader reader)
{
    TokenStream chain = new StandardTokenizer(matchVersion, reader);
    chain = new StandardFilter(chain);
    chain = new ThaiWordFilter(chain);
    // Position-increment behavior follows the configured match version's default.
    return new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion), chain, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
}
/// <summary>
/// Persian analysis: lower-casing, Persian normalization, stop-word removal,
/// then Persian stemming.
/// </summary>
public override TokenStream TokenStream(string fieldname, TextReader reader)
{
    TokenStream chain = new StandardTokenizer(_version, reader);
    chain = new LowerCaseFilter(chain);
    chain = new PersianNormalizationFilter(chain);
    chain = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(_version), chain, _stoptable);
    return new PersianStemFilter(chain);
}
/// <summary>
/// Bulgarian analysis: standard filter, lower case, stop-word removal,
/// then Bulgarian stemming.
/// </summary>
public override TokenStream TokenStream(string fieldName, TextReader reader)
{
    TokenStream chain = new StandardTokenizer(matchVersion, reader);
    chain = new StandardFilter(chain);
    chain = new LowerCaseFilter(chain);
    chain = new StopFilter(this.enableStopPositionIncrements, chain, stoptable);
    return new BulgarianStemFilter(chain);
}
/// <summary>
/// Tokenizes with a StandardTokenizer (tokens capped at 255 chars),
/// lower-cases, then expands each token into 2–255 character n-grams.
/// No stop-word removal.
/// </summary>
public override TokenStream TokenStream(string fieldName, TextReader reader)
{
    var source = new StandardTokenizer(Version.LUCENE_30, reader) { MaxTokenLength = 255 };
    TokenStream chain = new StandardFilter(source);
    chain = new LowerCaseFilter(chain);
    return new NGramTokenFilter(chain, 2, 255);
}
/// <summary>Constructs a <see cref="StandardTokenizer"/> filtered by a
/// StandardFilter, a <see cref="LowerCaseFilter"/>, an optional
/// <see cref="StopFilter"/> and a snowball stemmer selected by this
/// analyzer's language name.
/// </summary>
public override TokenStream TokenStream(System.String fieldName, System.IO.TextReader reader)
{
    TokenStream chain = new StandardTokenizer(matchVersion, reader);
    chain = new StandardFilter(chain);
    chain = new LowerCaseFilter(chain);
    // Stop-word removal only happens when a stop set was configured.
    if (stopSet != null)
    {
        chain = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion), chain, stopSet);
    }
    return new SnowballFilter(chain, name);
}
public virtual void TestElision_()
{
    // Elided articles ("l'", "M'") must be stripped from the following word;
    // "O'brian" survives intact because "O" is not in the article set.
    const string text = "Plop, juste pour voir l'embrouille avec O'brian. M'enfin.";
    Tokenizer source = new StandardTokenizer(TEST_VERSION_CURRENT, new StringReader(text));
    CharArraySet articles = new CharArraySet(TEST_VERSION_CURRENT, AsSet("l", "M"), false);
    IList<string> terms = Filter(new ElisionFilter(source, articles));
    assertEquals("embrouille", terms[4]);
    assertEquals("O'brian", terms[6]);
    assertEquals("enfin", terms[7]);
}
public virtual void TestHugeDoc()
{
    // Prefix the document with ~4K of whitespace to exercise the tokenizer's
    // internal buffering; only the two trailing tokens should be produced.
    var sb = new StringBuilder();
    sb.Append(new string(' ', 4094));
    sb.Append("testing 1234");
    StandardTokenizer tokenizer = new StandardTokenizer(TEST_VERSION_CURRENT, new StringReader(sb.ToString()));
    BaseTokenStreamTestCase.AssertTokenStreamContents(tokenizer, new string[] { "testing", "1234" });
}
public void TokenizingReturnsExpectedTerms(string text, TokenAttributes[] expected)
{
    // Arrange: run the acronym-expansion filter over a standard token stream.
    var source = new StandardTokenizer(Version.LUCENE_30, new StringReader(text));
    var filter = new ExpandAcronymsFilter(source, NuGetAcronymExpansionProvider.Instance);

    // Act
    var actual = filter.Tokenize().ToArray();

    // Assert
    Assert.Equal(expected, actual);
}
/// <summary>
/// Autocomplete chain: standard filter, lower case, ASCII folding,
/// stop-word removal, then front edge n-grams of 1–20 characters.
/// </summary>
public override TokenStream TokenStream(string fieldName, System.IO.TextReader reader)
{
    TokenStream chain = new StandardTokenizer(kLuceneVersion, reader);
    chain = new StandardFilter(chain);
    chain = new LowerCaseFilter(chain);
    // Fold accented characters to ASCII before stop-word matching.
    chain = new ASCIIFoldingFilter(chain);
    chain = new StopFilter(false, chain, StopFilter.MakeStopSet(kEnglishStopWords));
    // Emit leading-edge n-grams so prefix queries can match partial words.
    return new EdgeNGramTokenFilter(chain, Lucene.Net.Analysis.NGram.EdgeNGramTokenFilter.Side.FRONT, 1, 20);
}
/// <summary>
/// French analysis: standard filter, French stop-word removal, French
/// stemming, then lower-casing and ASCII folding. Note the stemmer runs
/// BEFORE lower-casing — the inline comment marks this as deliberate.
/// </summary>
public override TokenStream TokenStream(string fieldName, TextReader reader)
{
    TokenStream result = new StandardTokenizer(this._luceneVersion, reader);
    result = new StandardFilter(result);
    // Stop words are wrapped in an unmodifiable, case-sensitive CharArraySet.
    result = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(this._luceneVersion), result, CharArraySet.UnmodifiableSet(new CharArraySet((IEnumerable<string>)FRENCH_STOP_WORDS, false)) );
    // Empty exclusion set: no terms are protected from stemming.
    result = new FrenchStemFilter(result, CharArraySet.EMPTY_SET);
    // Convert to lowercase after stemming!
    result = new LowerCaseFilter(result);
    result = new ASCIIFoldingFilter(result);
    return result;
}
public void TestElision2()
{
    // "l'" and "M'" are elided articles and must be stripped from the word
    // that follows; "O'brian" is untouched because "O" is not listed.
    String text = "Plop, juste pour voir l'embrouille avec O'brian. M'enfin.";
    Tokenizer source = new StandardTokenizer(Version.LUCENE_CURRENT, new StringReader(text));
    HashSet<String> articles = new HashSet<String> { "l", "M" };
    TokenFilter filter = new ElisionFilter(source, articles);
    List<string> terms = Filtre(filter);
    Assert.AreEqual("embrouille", terms[4]);
    Assert.AreEqual("O'brian", terms[6]);
    Assert.AreEqual("enfin", terms[7]);
}
/// <summary>
/// English chain: lower case, optional stop words, ASCII folding, then
/// English snowball stemming. The StandardFilter step is currently disabled
/// (left commented out below).
/// </summary>
public override TokenStream TokenStream(string fieldName, System.IO.TextReader reader)
{
    TokenStream chain = new StandardTokenizer(Lucene.Net.Util.Version.LUCENE_29, reader);
    //chain = new StandardFilter(chain);
    chain = new LowerCaseFilter(chain);
    if (STOP_WORDS != null)
        chain = new StopFilter(false, chain, STOP_WORDS);
    chain = new ASCIIFoldingFilter(chain);
    return new SnowballFilter(chain, "English");
}
/// <summary>
/// Builds the token stream for a field: standard tokenization, standard
/// normalization, lower-casing, default English stop-word removal, and
/// finally synonym injection via the configured <c>SynonymEngine</c>.
/// </summary>
/// <param name="fieldName">Name of the field being analyzed (not used here).</param>
/// <param name="reader">Source of the text to tokenize.</param>
/// <returns>The fully composed token stream.</returns>
public override TokenStream TokenStream(string fieldName, System.IO.TextReader reader)
{
    //create the tokenizer
    TokenStream result = new StandardTokenizer(Lucene.Net.Util.Version.LUCENE_30, reader);

    //add in filters
    result = new StandardFilter(result); // first normalize the StandardTokenizer
    result = new LowerCaseFilter(result);// makes sure everything is lower case
    result = new StopFilter(true, result, StopAnalyzer.ENGLISH_STOP_WORDS_SET); // use the default list of Stop Words, provided by the StopAnalyzer class.
    result = new SynonymFilter(result, SynonymEngine); // injects the synonyms.

    //return the built token stream.
    return result;
}
/// <summary>
/// Tokenizes input, applies standard/lower-case/English-stop-word/snowball
/// stemming filters, then emits front edge n-grams of _mingram.._maxgram
/// characters for prefix matching.
/// </summary>
public override TokenStream TokenStream(string fieldName, TextReader reader)
{
    // Tokenize the raw text.
    TokenStream chain = new StandardTokenizer(_version, reader);

    // Normalize, lower-case, drop English stop words, then stem.
    chain = new StandardFilter(chain);
    chain = new LowerCaseFilter(chain);
    chain = new StopFilter(true, chain, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
    chain = new SnowballFilter(chain, new EnglishStemmer());

    // Front-edge n-grams, sized by the configured min/max gram lengths.
    return new EdgeNGramTokenFilter(chain, Side.FRONT, _mingram, _maxgram);
}
/// <summary>
/// Spanish chain: lower case, optional stop words, ASCII folding, then
/// snowball stemming. The StandardFilter step is currently disabled.
/// </summary>
public override TokenStream TokenStream(string fieldName, System.IO.TextReader reader)
{
    TokenStream result = new StandardTokenizer(Lucene.Net.Util.Version.LUCENE_29, reader);
    //result = new StandardFilter(result);
    result = new LowerCaseFilter(result);
    if (STOP_WORDS != null)
        result = new StopFilter(false, result, STOP_WORDS);
    result = new ASCIIFoldingFilter(result);
    // NOTE(review): the original comment claims a customized "Spanish2"
    // stemmer (expected under Snowball\SF\Snowball\Ext\) is in use, but the
    // stemmer actually requested below is the stock "Spanish" one — confirm
    // which implementation is intended before relying on either.
    result = new SnowballFilter(result, "Spanish");
    return result;
}
/// <summary>Constructs a tokenizer (TagsTokenizer in tags mode, otherwise a
/// StandardTokenizer) filtered by a StandardFilter, a LowerCaseFilter and a
/// StopFilter.
/// </summary>
public override TokenStream TokenStream(System.String fieldName, System.IO.TextReader reader)
{
    // Tags mode swaps in a dedicated tokenizer; the downstream filters are shared.
    TokenStream chain = tagsMode
        ? (TokenStream)new TagsTokenizer(reader)
        : new StandardTokenizer(reader);
    chain = new StandardFilter(chain);
    chain = new LowerCaseFilter(chain);
    return new StopFilter(chain, stopSet);
}
/// <summary>Constructs a StandardTokenizer (with the configured max token
/// length and invalid-acronym handling) filtered by a StandardFilter, a
/// LowerCaseFilter and a StopFilter.
/// </summary>
public override TokenStream TokenStream(System.String fieldName, System.IO.TextReader reader)
{
    StandardTokenizer source = new StandardTokenizer(reader, replaceInvalidAcronym);
    source.SetMaxTokenLength(maxTokenLength);
    TokenStream chain = new StandardFilter(source);
    chain = new LowerCaseFilter(chain);
    // Pick the StopFilter overload depending on whether the version-default
    // position-increment behavior was requested.
    return useDefaultStopPositionIncrements
        ? new StopFilter(chain, stopSet)
        : new StopFilter(enableStopPositionIncrements, chain, stopSet);
}
/// <summary>
/// English chain with an extended stop-word list: snowball stemming,
/// lower-casing, ASCII folding, then stop-word removal.
/// </summary>
/// <param name="fieldName">Name of the field being analyzed (not used here).</param>
/// <param name="reader">Source of the text to tokenize.</param>
/// <returns>The composed token stream.</returns>
public override TokenStream TokenStream(string fieldName, TextReader reader)
{
    // Extend the default English stop words with domain-specific noise terms.
    HashSet<string> stopWords = new HashSet<string>()
    {
        "said", "have", "the", "more", "from", "who", "he", "than", "it", "were",
        "use", "has", "also", "been", "we", "which", "had", "you", "us", "them",
        "so", "in", "i", "our", "his", "to", "of", "a", "st", "ad", "co", "re",
        "ve", "1", "2", "3", "4", "5", "6", "7", "8", "9", "0", "f", "g", "it"
    };
    foreach (var word in StopAnalyzer.ENGLISH_STOP_WORDS_SET)
    {
        stopWords.Add(word);
    }

    // BUGFIX: the original also built a StandardTokenizer + StandardFilter
    // pair over the same reader ("src"/"filter") that was never consumed;
    // that dead chain has been removed. The returned chain is unchanged.
    TokenStream result = new StandardTokenizer(Version.LUCENE_30, reader);
    // NOTE(review): stemming runs before lower-casing here; the English
    // snowball stemmer normally expects lower-cased input — confirm this
    // ordering is intentional before changing it.
    result = new SnowballFilter(result, new EnglishStemmer());
    result = new LowerCaseFilter(result);
    result = new ASCIIFoldingFilter(result);
    result = new StopFilter(true, result, stopWords);
    return result;
}
/// <summary>
/// Constructs the anonymous TokenStreamComponents helper: delegates the
/// (source tokenizer, filtered stream) pair to the base constructor and
/// stores the outer analyzer, reader and source tokenizer in fields.
/// </summary>
public TokenStreamComponentsAnonymousInnerClassHelper(StandardAnalyzer outerInstance, StandardTokenizer src, TokenStream tok, TextReader reader) : base(src, tok)
{
    this.outerInstance = outerInstance;
    this.reader = reader;
    this.src = src;
}
/// <summary>
/// Creates the <c>TokenStreamComponents</c> used to tokenize all the text
/// in the provided reader.
/// </summary>
/// <returns> Components built from a <seealso cref="StandardTokenizer"/> filtered with
/// <seealso cref="StandardFilter"/>, <seealso cref="LowerCaseFilter"/>, <seealso cref="StopFilter"/>,
/// <seealso cref="SetKeywordMarkerFilter"/> (when a stem-exclusion set is provided)
/// and a Danish <seealso cref="SnowballFilter"/>. </returns>
public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    Tokenizer tokenizer = new StandardTokenizer(matchVersion, reader);
    TokenStream chain = new StandardFilter(matchVersion, tokenizer);
    chain = new LowerCaseFilter(matchVersion, chain);
    chain = new StopFilter(matchVersion, chain, stopwords);
    // Mark excluded terms as keywords so the stemmer leaves them alone.
    if (stemExclusionSet.Count > 0)
    {
        chain = new SetKeywordMarkerFilter(chain, stemExclusionSet);
    }
    chain = new SnowballFilter(chain, new DanishStemmer());
    return new TokenStreamComponents(tokenizer, chain);
}
/// <summary>
/// Components consisting of a bare StandardTokenizer with no filters.
/// </summary>
public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    return new TokenStreamComponents(new StandardTokenizer(TEST_VERSION_CURRENT, reader));
}
/// <summary>
/// A StandardTokenizer wrapped in a MockGraphTokenFilter driven by the
/// test's random source.
/// </summary>
public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    Tokenizer source = new StandardTokenizer(TEST_VERSION_CURRENT, reader);
    return new TokenStreamComponents(source, new MockGraphTokenFilter(Random(), source));
}
/// <summary>
/// A bare StandardTokenizer pinned to the deprecated LUCENE_40 rules.
/// </summary>
public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    // LUCENE_40 is obsolete; suppress the deprecation warnings locally.
#pragma warning disable 612, 618
    Tokenizer source = new StandardTokenizer(LuceneVersion.LUCENE_40, reader);
#pragma warning restore 612, 618
    return new TokenStreamComponents(source);
}
/// <summary>
/// Creates the <c>TokenStreamComponents</c> used to tokenize all the text
/// in the provided reader.
/// </summary>
/// <returns> Components built from a <seealso cref="StandardTokenizer"/> filtered with
/// <seealso cref="StandardFilter"/>, <seealso cref="ApostropheFilter"/> (4.8+ only),
/// <seealso cref="TurkishLowerCaseFilter"/>, <seealso cref="StopFilter"/>,
/// <seealso cref="SetKeywordMarkerFilter"/> (when a stem-exclusion set is provided)
/// and a Turkish <seealso cref="SnowballFilter"/>. </returns>
public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    Tokenizer tokenizer = new StandardTokenizer(matchVersion, reader);
    TokenStream chain = new StandardFilter(matchVersion, tokenizer);
    // 4.8+ strips apostrophe suffixes before lower-casing.
    if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_48))
    {
        chain = new ApostropheFilter(chain);
    }
    // Turkish needs its own lower-casing rules (dotted vs. dotless i).
    chain = new TurkishLowerCaseFilter(chain);
    chain = new StopFilter(matchVersion, chain, stopwords);
    // Mark excluded terms as keywords so the stemmer leaves them alone.
    if (stemExclusionSet.Any())
    {
        chain = new SetKeywordMarkerFilter(chain, stemExclusionSet);
    }
    chain = new SnowballFilter(chain, new TurkishStemmer());
    return new TokenStreamComponents(tokenizer, chain);
}