Exemplo n.º 1
0
			// Builds the token stream used to index the contents of `reader`
			// for the field called `fieldName`.
			//
			//   * "prop:" fields that are not text properties ("prop:t...") and
			//     the "PropertyKeyword" field are emitted as one single token
			//     holding the whole remaining input.
			//   * "Properties" and "TextLinks" are split on whitespace only.
			//   * Everything else goes through the base analyzer; "Text",
			//     "HotText", "PropertyText" and text properties are additionally
			//     run through NoiseEmailHostFilter and a SnowballFilter.
			public override TokenStream TokenStream (string fieldName, TextReader reader)
			{
				bool text_property = false;

				if (fieldName.StartsWith ("prop:")) {

					// When strip_extra_property_info is set, consume the input
					// up to and including the first ':' (or until the reader is
					// exhausted).  Per the original note, that leading part
					// carries type information which should not be indexed.
					if (strip_extra_property_info) {
						int ch = reader.Read ();
						while (ch != -1 && ch != ':')
							ch = reader.Read ();
					}

					text_property = fieldName.StartsWith ("prop:t");

					// Non-text properties are returned as one token holding the
					// entire remaining string, so that keywords do not get
					// tokenized.
					if (! text_property) {
						TokenStream whole_value = new SingletonTokenStream (reader.ReadToEnd ());

						// Keyword properties whose field name carries the
						// private namespace prefix keep their case; every
						// other non-text property is lower-cased.
						bool private_keyword = fieldName.StartsWith ("prop:k:" + Property.PrivateNamespace);
						if (! private_keyword)
							return new LowerCaseFilter (whole_value);

						return whole_value;
					}
				} else {
					switch (fieldName) {
					case "PropertyKeyword":
						// One lower-cased token for the whole value.
						return new LowerCaseFilter (new SingletonTokenStream (reader.ReadToEnd ()));

					case "Properties":
					case "TextLinks":
						// Whitespace-separated values, no further analysis.
						return new WhitespaceTokenizer (new StringReader (reader.ReadToEnd ()));
					}
				}

				TokenStream result = base.TokenStream (fieldName, reader);

				// Only the "Text" field gets the link callback; every other
				// field passes null.  The assignment happens while holding the
				// lock on this instance.
				NoiseEmailHostFilter.LinkCallback link_handler = null;
				lock (this) {
					if (fieldName == "Text")
						link_handler = add_link;
				}

				bool wants_filters = text_property
					|| fieldName == "Text"
					|| fieldName == "HotText"
					|| fieldName == "PropertyText";

				if (! wants_filters)
					return result;

				TokenStream filtered = new NoiseEmailHostFilter (result, tokenize_email_hostname, link_handler);

				// Sharing a stemmer is not thread safe.  Per the original note,
				// the underlying lucene indexing is currently not done from
				// multiple threads.
				StemmerInfo stemmer = GetStemmer (DEFAULT_STEMMER_LANGUAGE);
				return new SnowballFilter (filtered, stemmer.Stemmer, stemmer.StemMethod);
			}
Exemplo n.º 2
0
		// Builds the token stream for `fieldName` in three stages: the base
		// analyzer's stream, wrapped in a NoiseEmailHostFilter, wrapped in a
		// SnowballFilter created with the name "English".
		public override TokenStream TokenStream (string fieldName, TextReader reader)
		{
			TokenStream base_tokens = base.TokenStream (fieldName, reader);

			// NOTE(review): the `true` flag presumably turns on tokenizing of
			// email hostnames -- confirm against NoiseEmailHostFilter.
			TokenStream without_noise = new NoiseEmailHostFilter (base_tokens, true);

			return new SnowballFilter (without_noise, "English");
		}