Example #1
0
                        // Chooses the token stream used for a field.  Non-text "prop:" fields
                        // and PropertyKeyword are wrapped in a SingletonTokenStream, Properties
                        // and TextLinks go through a WhitespaceTokenizer, and everything else
                        // is delegated to the base analyzer.
                        public override TokenStream TokenStream (string fieldName, TextReader reader)
                        {
                                if (fieldName.StartsWith ("prop:")) {
                                        // When stripping is enabled, consume the reader up to and
                                        // including the first ':' (or to end of input if there is
                                        // none) so that leading part of the value is not indexed.
                                        if (strip_extra_property_info) {
                                                int ch = reader.Read ();
                                                while (ch != -1 && ch != ':')
                                                        ch = reader.Read ();
                                        }

                                        // Text properties are tokenized by the base analyzer.
                                        if (fieldName.StartsWith ("prop:t"))
                                                return base.TokenStream (fieldName, reader);

                                        // Any other property is emitted as one token holding the
                                        // whole remaining string, so keywords are not tokenized.
                                        TokenStream whole_value = new SingletonTokenStream (reader.ReadToEnd ());

                                        // Keyword properties in the private namespace keep their
                                        // case; every other token is lower-cased.
                                        if (! fieldName.StartsWith ("prop:k:" + Property.PrivateNamespace))
                                                return new LowerCaseFilter (whole_value);

                                        return whole_value;
                                }

                                switch (fieldName) {
                                case "PropertyKeyword":
                                        return new LowerCaseFilter (new SingletonTokenStream (reader.ReadToEnd ()));
                                case "Properties":
                                case "TextLinks":
                                        return new WhitespaceTokenizer (new StringReader (reader.ReadToEnd ()));
                                default:
                                        return base.TokenStream (fieldName, reader);
                                }
                        }
Example #2
0
			// Chooses the token stream used for a field.  Non-text "prop:" fields
			// and PropertyKeyword are wrapped in a SingletonTokenStream, Properties
			// and TextLinks go through a WhitespaceTokenizer, and everything else
			// is delegated to the base analyzer.  Text, HotText, PropertyText and
			// text properties are additionally wrapped in a NoiseEmailHostFilter
			// and a SnowballFilter.
			public override TokenStream TokenStream (string fieldName, TextReader reader)
			{
				bool text_property = false;

				if (fieldName.StartsWith ("prop:")) {
					// When stripping is enabled, consume the reader up to and
					// including the first ':' (or to end of input if there is
					// none) so that leading part of the value is not indexed.
					if (strip_extra_property_info) {
						int ch = reader.Read ();
						while (ch != -1 && ch != ':')
							ch = reader.Read ();
					}

					text_property = fieldName.StartsWith ("prop:t");

					if (! text_property) {
						// A non-text property is emitted as one token holding the
						// whole remaining string, so keywords are not tokenized.
						TokenStream whole_value = new SingletonTokenStream (reader.ReadToEnd ());

						// Keyword properties in the private namespace keep their
						// case; every other token is lower-cased.
						if (! fieldName.StartsWith ("prop:k:" + Property.PrivateNamespace))
							return new LowerCaseFilter (whole_value);

						return whole_value;
					}
				} else {
					switch (fieldName) {
					case "PropertyKeyword":
						return new LowerCaseFilter (new SingletonTokenStream (reader.ReadToEnd ()));
					case "Properties":
					case "TextLinks":
						return new WhitespaceTokenizer (new StringReader (reader.ReadToEnd ()));
					}
				}

				TokenStream result = base.TokenStream (fieldName, reader);

				// Only the "Text" field passes the link callback on to the filter;
				// add_link is read while holding the lock on this instance.
				NoiseEmailHostFilter.LinkCallback link_callback = null;
				lock (this) {
					if (fieldName == "Text")
						link_callback = add_link;
				}

				bool wants_text_filters = text_property
					|| fieldName == "Text"
					|| fieldName == "HotText"
					|| fieldName == "PropertyText";

				if (! wants_text_filters)
					return result;

				result = new NoiseEmailHostFilter (result, tokenize_email_hostname, link_callback);

				// Sharing a stemmer is not thread safe; this relies on the
				// underlying lucene indexing not running in multiple threads.
				StemmerInfo stemmer = GetStemmer (DEFAULT_STEMMER_LANGUAGE);
				return new SnowballFilter (result, stemmer.Stemmer, stemmer.StemMethod);
			}