// Builds the token stream used to analyze the contents of one field.
//
//   fieldName - name of the field being analyzed; selects the pipeline below.
//   reader    - source of the field's text.  It may be partially or fully
//               consumed here before the resulting stream is returned.
//
// Dispatch, in order:
//   * "prop:..." fields: optionally skip the leading type prefix of the
//     value; text properties ("prop:t...") fall through to the full
//     pipeline, every other property becomes a single token.
//   * "PropertyKeyword": a single lower-cased token.
//   * "Properties" / "TextLinks": split on whitespace only.
//   * everything else: the base analyzer's stream, additionally wrapped in
//     NoiseEmailHostFilter and SnowballFilter for "Text", "HotText",
//     "PropertyText" and text properties.
public override TokenStream TokenStream (string fieldName, TextReader reader)
{
	bool is_text_prop = false;

	// Field names are fixed identifiers, not natural-language text, so the
	// prefix checks are ordinal rather than culture-sensitive.
	if (fieldName.StartsWith ("prop:", System.StringComparison.Ordinal)) {

		if (strip_extra_property_info) {
			// Type information is stored in front of the value and we
			// don't want to index it: skip everything up to and
			// including the first ':' (or stop at end of input).
			int c;
			do {
				c = reader.Read ();
			} while (c != -1 && c != ':');
		}

		is_text_prop = fieldName.StartsWith ("prop:t", System.StringComparison.Ordinal);

		// If this is a non-text property, just return one token
		// containing the entire string.  We do this to avoid
		// tokenizing keywords.
		if (! is_text_prop) {
			TokenStream singleton_stream = new SingletonTokenStream (reader.ReadToEnd ());

			// Keyword properties in the private namespace keep their
			// case; every other non-text property is lower-cased.
			if (fieldName.StartsWith ("prop:k:" + Property.PrivateNamespace, System.StringComparison.Ordinal))
				return singleton_stream;
			else
				return new LowerCaseFilter (singleton_stream);
		}
	} else if (fieldName == "PropertyKeyword") {
		return new LowerCaseFilter (new SingletonTokenStream (reader.ReadToEnd ()));
	} else if (fieldName == "Properties" || fieldName == "TextLinks") {
		// Both fields are tokenized on whitespace only.
		return new WhitespaceTokenizer (new StringReader (reader.ReadToEnd ()));
	}

	TokenStream outstream;
	outstream = base.TokenStream (fieldName, reader);

	// Only the "Text" field gets the link callback; for every other field
	// the filter below receives null.
	// NOTE(review): lock (this) can be contended by any outside code that
	// holds a reference to this analyzer; a private lock object would be
	// safer, but that needs a field declared outside this method.
	NoiseEmailHostFilter.LinkCallback add_link_callback = null;
	lock (this) {
		if (fieldName == "Text")
			add_link_callback = add_link;
	}

	if (fieldName == "Text"
	    || fieldName == "HotText"
	    || fieldName == "PropertyText"
	    || is_text_prop) {
		outstream = new NoiseEmailHostFilter (outstream, tokenize_email_hostname, add_link_callback);

		// Sharing Stemmer is not thread safe.
		// Currently our underlying lucene indexing is not done in multiple threads.
		StemmerInfo stemmer_info = GetStemmer (DEFAULT_STEMMER_LANGUAGE);
		outstream = new SnowballFilter (outstream, stemmer_info.Stemmer, stemmer_info.StemMethod);
	}

	return outstream;
}
// Returns the base analyzer's token stream for the given field, wrapped
// first in a NoiseEmailHostFilter and then in an "English" SnowballFilter.
// The same pipeline is applied regardless of fieldName.
public override TokenStream TokenStream (string fieldName, TextReader reader)
{
	TokenStream base_tokens = base.TokenStream (fieldName, reader);

	// NOTE(review): the 'true' flag presumably enables email hostname
	// tokenization -- confirm against the NoiseEmailHostFilter constructor.
	TokenStream noise_filtered = new NoiseEmailHostFilter (base_tokens, true);

	return new SnowballFilter (noise_filtered, "English");
}