A grammar-based tokenizer constructed with JFlex

This should be a good tokenizer for most European-language documents:

  • Splits words at punctuation characters, removing punctuation. However, a dot that's not followed by whitespace is considered part of a token.
  • Splits words at hyphens, unless there's a number in the token, in which case the whole token is interpreted as a product number and is not split.
  • Recognizes email addresses and internet hostnames as one token.

Many applications have specific tokenizer needs. If this tokenizer does not suit your application, please consider copying this source code directory to your project and maintaining your own grammar-based tokenizer.
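A minimal sketch of these rules in action (assuming the Lucene.Net 3.0.3 API; the sample text and class name are illustrative, not from the original):

 using System;
 using System.IO;
 using Lucene.Net.Analysis.Standard;
 using Lucene.Net.Analysis.Tokenattributes;
 using Version = Lucene.Net.Util.Version;

 public static class StandardTokenizerDemo
 {
     public static void Main()
     {
         // "dev@example.com" should come through as a single token, "AB-1234"
         // should stay whole as a product number, and the comma and final
         // period should be stripped.
         const string text = "Email dev@example.com about part AB-1234, please.";
         var tokenizer = new StandardTokenizer(Version.LUCENE_30, new StringReader(text));
         var term = tokenizer.AddAttribute<ITermAttribute>();
         while (tokenizer.IncrementToken())
         {
             Console.WriteLine(term.Term);
         }
         tokenizer.End();
         tokenizer.Close();
     }
 }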

Inheritance: Lucene.Net.Analysis.Tokenizer
Example #1
 /// <summary>
 /// Constructs a <seealso cref="StandardTokenizer"/> filtered by a <seealso cref="StandardFilter"/>,
 /// a <seealso cref="LowerCaseFilter"/>, a <seealso cref="StopFilter"/>,
 /// and a <seealso cref="SnowballFilter"/>.
 /// </summary>
 public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
 {
     Tokenizer tokenizer = new StandardTokenizer(matchVersion, reader);
     TokenStream result = new StandardFilter(matchVersion, tokenizer);
     // remove the possessive 's for english stemmers
     if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_31) && (name.Equals("English") || name.Equals("Porter") || name.Equals("Lovins")))
     {
         result = new EnglishPossessiveFilter(result);
     }
     // Use a special lowercase filter for turkish, the stemmer expects it.
     if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_31) && name.Equals("Turkish"))
     {
         result = new TurkishLowerCaseFilter(result);
     }
     else
     {
         result = new LowerCaseFilter(matchVersion, result);
     }
     if (stopSet != null)
     {
         result = new StopFilter(matchVersion, result, stopSet);
     }
     result = new SnowballFilter(result, name);
     return new TokenStreamComponents(tokenizer, result);
 }
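A hedged usage sketch (assuming the Lucene.Net 4.8 API, where a CreateComponents override like the one above belongs to SnowballAnalyzer, deprecated but still available; the field name and sample text are placeholders):

 // Requires Lucene.Net.Analysis, Lucene.Net.Analysis.Snowball,
 // Lucene.Net.Analysis.TokenAttributes and Lucene.Net.Util.
 using (Analyzer analyzer = new SnowballAnalyzer(LuceneVersion.LUCENE_48, "English"))
 using (TokenStream ts = analyzer.GetTokenStream("body", new StringReader("the dogs' bones")))
 {
     var term = ts.AddAttribute<ICharTermAttribute>();
     ts.Reset();
     while (ts.IncrementToken())
     {
         Console.WriteLine(term.ToString()); // lowercased, stop-filtered, stemmed terms
     }
     ts.End();
 }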
Example #2
        public override TokenStream TokenStream(string fieldName, TextReader reader)
        {
            // This should be a good tokenizer for most European-language documents:
            // Splits words at punctuation characters, removing punctuation.
            // Splits words at hyphens, unless there's a number in the token...
            // Recognizes email addresses and internet hostnames as one token.
            var input = new StandardTokenizer(Version.LUCENE_30, reader);

            // A ShingleMatrixFilter constructs shingles from a token stream.
            // "2010 Audi RS5 Quattro Coupe" => "2010 Audi", "Audi RS5", "RS5 Quattro", "Quattro Coupe"
            var shingleMatrixOutput = new ShingleMatrixFilter(
                                                // stream from which to construct the matrix
                                                input,
                                                // minimum number of tokens in any shingle
                                                2,
                                                // maximum number of tokens in any shingle.
                                                8,
                                                // character to use between texts of the token parts in a shingle.
                                                ' ');

            // Normalizes token text to lower case.
            var lowerCaseFilter = new LowerCaseFilter(shingleMatrixOutput);

            // Removes stop words from a token stream.
            return new StopFilter(true, lowerCaseFilter, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
        }
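To see what this chain emits, a small hedged sketch (assuming the same Lucene.Net 3.0.3 API as above; the sample phrase comes from the comment in the example, and the expected output is indicative):

 var reader = new StringReader("2010 Audi RS5 Quattro Coupe");
 TokenStream stream = new StopFilter(true,
     new LowerCaseFilter(
         new ShingleMatrixFilter(
             new StandardTokenizer(Version.LUCENE_30, reader), 2, 8, ' ')),
     StopAnalyzer.ENGLISH_STOP_WORDS_SET);
 var term = stream.AddAttribute<ITermAttribute>();
 while (stream.IncrementToken())
 {
     // Expect shingles of 2..8 tokens: "2010 audi", "2010 audi rs5", ...,
     // "quattro coupe".
     Console.WriteLine(term.Term);
 }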
Example #3
 public override TokenStream TokenStream(string fieldName, TextReader reader)
 {
     var tokenizer = new StandardTokenizer(Version.LUCENE_30, reader);
     var shingleMatrix = new ShingleMatrixFilter(tokenizer, 2, 8, ' ');
     var lowerCaseFilter = new LowerCaseFilter(shingleMatrix);
     return new StopFilter(true, lowerCaseFilter, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
 }
Example #4

 public override TokenStream TokenStream(string fieldName, TextReader reader)
 {
     var tokenizer = new StandardTokenizer(Version.LUCENE_30, reader);
     TokenStream filterStream = new StandardFilter(tokenizer);
     TokenStream stream = new StopFilter(true, filterStream, _stopWords, true);
     return stream;
 }
Example #5
 /// <summary>Constructs a <seealso cref="StandardTokenizer"/> filtered by a
 /// <seealso cref="StandardFilter"/>, a <seealso cref="LowerCaseFilter"/> and a <seealso cref="StopFilter"/>.
 /// </summary>
 public override TokenStream TokenStream(System.String fieldName, System.IO.TextReader reader)
 {
     TokenStream result = new StandardTokenizer(reader);
     result = new StandardFilter(result);
     result = new LowerCaseFilter(result);
     result = new StopFilter(result, stopSet);
     return result;
 }
Example #6
 public override TokenStream TokenStream(string fieldName, TextReader reader)
 {
     TokenStream result = new StandardTokenizer(reader);
     result = new StandardFilter(result);
     result = new LowerCaseFilter(result);
     result = new RuSnowballFilter(result);
     return result;
 }
Example #7
 public override TokenStream TokenStream(String fieldName, TextReader reader)
 {
     Tokenizer source = new StandardTokenizer(matchVersion, reader);
     TokenStream sink = new StandardFilter(source);
     sink = new LowerCaseFilter(sink);
     //sink = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion), sink, stopSet);
     sink = new CroatianStemFilter(sink, stemmer);
     return sink;
 }
Example #8
 /// <summary>Constructs a <seealso cref="StandardTokenizer"/> filtered by a
 /// <seealso cref="StandardFilter"/>, a <seealso cref="LowerCaseFilter"/>, a <seealso cref="StopFilter"/>
 /// and a <seealso cref="SpanishStemFilter"/>.</summary>
 public override TokenStream TokenStream(string fieldName, TextReader reader)
 {
     TokenStream result = new StandardTokenizer(Version.LUCENE_24,reader);
     result = new StandardFilter(result);
     result = new LowerCaseFilter(result);
     result = new StopFilter(true,result, stopTable);
     result = new SpanishStemFilter(result);
     return result;
 }
Example #9
 public override TokenStream TokenStream(string fieldName, TextReader reader)
 {
     var tokenizer = new StandardTokenizer(Version.LUCENE_29, reader);
     tokenizer.MaxTokenLength = 255;
     TokenStream filter = new StandardFilter(tokenizer);
     filter = new LowerCaseFilter(filter);
     filter = new StopFilter(false, filter, StandardAnalyzer.STOP_WORDS_SET);
     return new NGramTokenFilter(filter, 2, 6);
 }
Example #10
 public override TokenStream TokenStream(String fieldName, TextReader reader)
 {
     TokenStream ts = new StandardTokenizer(matchVersion, reader);
     ts = new StandardFilter(ts);
     ts = new ThaiWordFilter(ts);
     ts = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion),
                         ts, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
     return ts;
 }
Example #11

 public override TokenStream TokenStream(string fieldname, TextReader reader)
 {
     TokenStream result = new StandardTokenizer(_version, reader);
     result = new LowerCaseFilter(result);
     result = new PersianNormalizationFilter(result);
     result = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(_version), result, _stoptable);
     result = new PersianStemFilter(result);
     return result;
 }
Example #12

        public override TokenStream TokenStream(string fieldName, TextReader reader)
        {
            TokenStream result = new StandardTokenizer(matchVersion, reader);
            result = new StandardFilter(result);
            result = new LowerCaseFilter(result);
            result = new StopFilter(this.enableStopPositionIncrements, result, stoptable);
            result = new BulgarianStemFilter(result);

            return result;
        }
Example #13
        public override TokenStream TokenStream(string fieldName, TextReader reader)
        {
            var tokenizer = new StandardTokenizer(Version.LUCENE_30, reader);
            tokenizer.MaxTokenLength = 255;
            TokenStream filter = new StandardFilter(tokenizer);
            filter = new LowerCaseFilter(filter);
            filter = new NGramTokenFilter(filter, 2, 255);

            return filter;
        }
Example #14
 /// <summary>Constructs a <see cref="StandardTokenizer"/> filtered by a
 /// <see cref="StandardFilter"/>, a <see cref="LowerCaseFilter"/> and a <see cref="StopFilter"/>. 
 /// </summary>
 public override TokenStream TokenStream(System.String fieldName, System.IO.TextReader reader)
 {
     TokenStream result = new StandardTokenizer(matchVersion, reader);
     result = new StandardFilter(result);
     result = new LowerCaseFilter(result);
     if (stopSet != null)
         result = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion),
                                 result, stopSet);
     result = new SnowballFilter(result, name);
     return result;
 }
Example #15
 public virtual void TestElision_()
 {
     string test = "Plop, juste pour voir l'embrouille avec O'brian. M'enfin.";
     Tokenizer tokenizer = new StandardTokenizer(TEST_VERSION_CURRENT, new StringReader(test));
     CharArraySet articles = new CharArraySet(TEST_VERSION_CURRENT, AsSet("l", "M"), false);
     TokenFilter filter = new ElisionFilter(tokenizer, articles);
     IList<string> tas = Filter(filter);
     assertEquals("embrouille", tas[4]);
     assertEquals("O'brian", tas[6]);
     assertEquals("enfin", tas[7]);
 }
Example #16

 public virtual void TestHugeDoc()
 {
     StringBuilder sb = new StringBuilder();
     char[] whitespace = new char[4094];
     Arrays.Fill(whitespace, ' ');
     sb.Append(whitespace);
     sb.Append("testing 1234");
     string input = sb.ToString();
     StandardTokenizer tokenizer = new StandardTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
     BaseTokenStreamTestCase.AssertTokenStreamContents(tokenizer, new string[] { "testing", "1234" });
 }
Example #17

        public void TokenizingReturnsExpectedTerms(string text, TokenAttributes[] expected)
        {
            // Arrange
            var tokenStream = new StandardTokenizer(Version.LUCENE_30, new StringReader(text));
            var filter = new ExpandAcronymsFilter(tokenStream, NuGetAcronymExpansionProvider.Instance);

            // Act
            var actual = filter.Tokenize().ToArray();

            // Assert
            Assert.Equal(expected, actual);
        }
Example #18

        public override TokenStream TokenStream(string fieldName, System.IO.TextReader reader)
        {
            TokenStream result = new StandardTokenizer(kLuceneVersion, reader);

            result = new StandardFilter(result);
            result = new LowerCaseFilter(result);
            result = new ASCIIFoldingFilter(result);
            result = new StopFilter(false, result, StopFilter.MakeStopSet(kEnglishStopWords));
            result = new EdgeNGramTokenFilter(
                result, Lucene.Net.Analysis.NGram.EdgeNGramTokenFilter.Side.FRONT, 1, 20);

            return result;
        }
Example #19

 public override TokenStream TokenStream(string fieldName, TextReader reader) {
     TokenStream result = new StandardTokenizer(this._luceneVersion, reader);
     result = new StandardFilter(result);
     result = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(this._luceneVersion),
         result,
         CharArraySet.UnmodifiableSet(new CharArraySet((IEnumerable<string>)FRENCH_STOP_WORDS, false))
     );
     result = new FrenchStemFilter(result, CharArraySet.EMPTY_SET);
     // Convert to lowercase after stemming!
     result = new LowerCaseFilter(result);
     result = new ASCIIFoldingFilter(result);
     return result;
 }
Example #20
 public void TestElision2()
 {
     String test = "Plop, juste pour voir l'embrouille avec O'brian. M'enfin.";
     Tokenizer tokenizer = new StandardTokenizer(Version.LUCENE_CURRENT, new StringReader(test));
     HashSet<String> articles = new HashSet<String>();
     articles.Add("l");
     articles.Add("M");
     TokenFilter filter = new ElisionFilter(tokenizer, articles);
     List<string> tas = Filtre(filter);
     Assert.AreEqual("embrouille", tas[4]);
     Assert.AreEqual("O'brian", tas[6]);
     Assert.AreEqual("enfin", tas[7]);
 }
Example #21
        public override TokenStream TokenStream(string fieldName, System.IO.TextReader reader)
        {
            TokenStream result = new StandardTokenizer(Lucene.Net.Util.Version.LUCENE_29, reader);
            //result = new StandardFilter(result);
            result = new LowerCaseFilter(result);

            if (STOP_WORDS != null)
                result = new StopFilter(false, result, STOP_WORDS);
            result = new ASCIIFoldingFilter(result);
            result = new SnowballFilter(result, "English");

            return result;
        }
Example #22
        /// <summary>
        /// Constructs a <seealso cref="StandardTokenizer"/> filtered by a <seealso cref="StandardFilter"/>,
        /// a <seealso cref="LowerCaseFilter"/>, a <seealso cref="StopFilter"/> and a <seealso cref="SynonymFilter"/>.
        /// </summary>
        /// <param name="fieldName">Name of the field being analyzed.</param>
        /// <param name="reader">Reader supplying the text to tokenize.</param>
        /// <returns>The assembled token stream.</returns>
        public override TokenStream TokenStream(string fieldName, System.IO.TextReader reader)
        {
            //create the tokenizer
            TokenStream result = new StandardTokenizer(Lucene.Net.Util.Version.LUCENE_30, reader);

            //add in filters
            result = new StandardFilter(result); // first normalize the StandardTokenizer
            result = new LowerCaseFilter(result);// makes sure everything is lower case
            result = new StopFilter(true, result, StopAnalyzer.ENGLISH_STOP_WORDS_SET); // use the default list of Stop Words, provided by the StopAnalyzer class.
            result = new SynonymFilter(result, SynonymEngine); // injects the synonyms.

            //return the built token stream.
            return result;
        }
Example #23

        public override TokenStream TokenStream(string fieldName, TextReader reader)
        {
            //Apply standard tokenizer to input
            var tokenizedInput = new StandardTokenizer(_version, reader);

            //Apply standard, lowercase and English stop words filters to input
            var filteredInput = new SnowballFilter(new StopFilter(true, new LowerCaseFilter(new StandardFilter(tokenizedInput)),
                StopAnalyzer.ENGLISH_STOP_WORDS_SET), new EnglishStemmer());

            //Apply EdgeNGram filter to front of words
            //Min size of grams max size of grams
            var grammedInput = new EdgeNGramTokenFilter(filteredInput, Side.FRONT, _mingram, _maxgram);

            return grammedInput;
        }
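The front-gram behavior is easy to check in isolation; a hedged sketch (Lucene.Net 3.0.3 API assumed, literals illustrative):

 // Front edge n-grams of sizes 1..4, e.g. "search" -> "s", "se", "sea", "sear".
 TokenStream grams = new EdgeNGramTokenFilter(
     new StandardTokenizer(Version.LUCENE_30, new StringReader("search")),
     Side.FRONT, 1, 4);
 var term = grams.AddAttribute<ITermAttribute>();
 while (grams.IncrementToken())
 {
     Console.WriteLine(term.Term);
 }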
Example #24
        public override TokenStream TokenStream(string fieldName, System.IO.TextReader reader)
        {
            TokenStream result = new StandardTokenizer(Lucene.Net.Util.Version.LUCENE_29, reader);

            //result = new StandardFilter(result);
            result = new LowerCaseFilter(result);
            if (STOP_WORDS != null)
                result = new StopFilter(false, result, STOP_WORDS);
            result = new ASCIIFoldingFilter(result);

            // This uses a distinct version of the Spanish stemmer, called Spanish2.
            // Check that the class exists in the Snowball library; its relative path
            // should be Snowball\SF\Snowball\Ext\. A copy of the class is kept in
            // this project just in case.
            result = new SnowballFilter(result, "Spanish");

            return result;
        }
Example #25
        /// <summary>Constructs a <seealso cref="StandardTokenizer"/> filtered by a
        /// <seealso cref="StandardFilter"/>, a <seealso cref="LowerCaseFilter"/> and a <seealso cref="StopFilter"/>.
        /// </summary>
        public override TokenStream TokenStream(System.String fieldName, System.IO.TextReader reader)
        {
            TokenStream result;

            if (tagsMode)
            {
                result = new TagsTokenizer(reader);
            }
            else
            {
                result = new StandardTokenizer(reader);
            }

            result = new StandardFilter(result);
            result = new LowerCaseFilter(result);
            result = new StopFilter(result, stopSet);
            return(result);
        }
Example #26
        /// <summary>Constructs a <seealso cref="StandardTokenizer"/> filtered by a
        /// <seealso cref="StandardFilter"/>, a <seealso cref="LowerCaseFilter"/> and a <seealso cref="StopFilter"/>.
        /// </summary>
        public override TokenStream TokenStream(System.String fieldName, System.IO.TextReader reader)
        {
            StandardTokenizer tokenStream = new StandardTokenizer(reader, replaceInvalidAcronym);

            tokenStream.SetMaxTokenLength(maxTokenLength);
            TokenStream result = new StandardFilter(tokenStream);

            result = new LowerCaseFilter(result);
            if (useDefaultStopPositionIncrements)
            {
                result = new StopFilter(result, stopSet);
            }
            else
            {
                result = new StopFilter(enableStopPositionIncrements, result, stopSet);
            }
            return(result);
        }
Example #27
        public override TokenStream TokenStream(string fieldName, TextReader reader)
        {
            HashSet<string> newWords = new HashSet<string>()
            {
                "said", "have", "the", "more", "from", "who", "he", "than", "it", "were", "use", "has", "also",
                "been", "we", "which", "had", "you", "us", "them", "so", "in", "i", "our", "his", "to", "of", "a",
                "st", "ad", "co", "re", "ve", "1", "2", "3", "4", "5", "6", "7", "8", "9", "0", "f", "g", "it"
            };
            foreach (var stopWord in StopAnalyzer.ENGLISH_STOP_WORDS_SET)
            {
                newWords.Add(stopWord);
            }

            TokenStream result = new StandardTokenizer(Version.LUCENE_30, reader);
            result = new SnowballFilter(result, new EnglishStemmer());
            result = new LowerCaseFilter(result);
            result = new ASCIIFoldingFilter(result);
            result = new StopFilter(true, result, newWords);
            return result;
        }
Example #28
 public TokenStreamComponentsAnonymousInnerClassHelper(StandardAnalyzer outerInstance, StandardTokenizer src, TokenStream tok, TextReader reader)
     : base(src, tok)
 {
     this.outerInstance = outerInstance;
     this.reader        = reader;
     this.src           = src;
 }
Example #29
 /// <summary>
 /// Creates a <seealso cref="TokenStreamComponents"/>
 /// which tokenizes all the text in the provided <seealso cref="TextReader"/>.
 /// </summary>
 /// <returns> A <seealso cref="TokenStreamComponents"/>
 ///         built from a <seealso cref="StandardTokenizer"/> filtered with
 ///         <seealso cref="StandardFilter"/>, <seealso cref="LowerCaseFilter"/>, <seealso cref="StopFilter"/>,
 ///         <seealso cref="SetKeywordMarkerFilter"/> if a stem exclusion set is
 ///         provided, and <seealso cref="SnowballFilter"/>. </returns>
 public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
 {
     Tokenizer source = new StandardTokenizer(matchVersion, reader);
     TokenStream result = new StandardFilter(matchVersion, source);
     result = new LowerCaseFilter(matchVersion, result);
     result = new StopFilter(matchVersion, result, stopwords);
     if (stemExclusionSet.Count > 0)
     {
         result = new SetKeywordMarkerFilter(result, stemExclusionSet);
     }
     result = new SnowballFilter(result, new DanishStemmer());
     return new TokenStreamComponents(source, result);
 }
Example #30

 public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
 {
     Tokenizer tokenizer = new StandardTokenizer(TEST_VERSION_CURRENT, reader);
     return new TokenStreamComponents(tokenizer);
 }
Example #31

 public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
 {
     Tokenizer tokenizer = new StandardTokenizer(TEST_VERSION_CURRENT, reader);
     TokenStream tokenStream = new MockGraphTokenFilter(Random(), tokenizer);
     return new TokenStreamComponents(tokenizer, tokenStream);
 }
Example #32

            public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
            {
#pragma warning disable 612, 618
                Tokenizer tokenizer = new StandardTokenizer(LuceneVersion.LUCENE_40, reader);
#pragma warning restore 612, 618
                return new TokenStreamComponents(tokenizer);
            }
Example #33
 /// <summary>
 /// Creates a <seealso cref="TokenStreamComponents"/>
 /// which tokenizes all the text in the provided <seealso cref="TextReader"/>.
 /// </summary>
 /// <returns> A <seealso cref="TokenStreamComponents"/>
 ///         built from a <seealso cref="StandardTokenizer"/> filtered with
 ///         <seealso cref="StandardFilter"/>, <seealso cref="TurkishLowerCaseFilter"/>,
 ///         <seealso cref="StopFilter"/>, <seealso cref="SetKeywordMarkerFilter"/> if a stem
 ///         exclusion set is provided, and <seealso cref="SnowballFilter"/>. </returns>
 public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
 {
     Tokenizer source = new StandardTokenizer(matchVersion, reader);
     TokenStream result = new StandardFilter(matchVersion, source);
     if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_48))
     {
         result = new ApostropheFilter(result);
     }
     result = new TurkishLowerCaseFilter(result);
     result = new StopFilter(matchVersion, result, stopwords);
     if (stemExclusionSet.Any())
     {
         result = new SetKeywordMarkerFilter(result, stemExclusionSet);
     }
     result = new SnowballFilter(result, new TurkishStemmer());
     return new TokenStreamComponents(source, result);
 }