Esempio n. 1
0
		// Prepares the sub-tokens of an e-mail token.
		//
		// Marks the current token type as e-mail, splits the token text on the
		// characters in replace_array and stores the pieces in `parts`.  When the
		// split produced more than one piece, the pieces are shifted one slot to
		// the right (which discards the last piece - the final TLD, according to
		// the original author) and slot 0 is set to everything before the '@'.
		private void ProcessEmailToken (Lucene.Net.Analysis.Token token)
		{
			token_type = tokentype_email;

			string address = token.TermText ();
			string[] pieces = address.Split (replace_array);
			parts = pieces;

			// A single piece means nothing was split off, so there is nothing
			// to rearrange (safety check).
			if (pieces.Length != 1) {
				// Shift every piece one slot to the right.  The last piece falls
				// off the end and slot 0 becomes free for the user name.
				// Array.Copy copes with the overlapping source/destination.
				Array.Copy (pieces, 0, pieces, 1, pieces.Length - 1);

				// NOTE(review): assumes the text contains '@'.  IndexOf returns -1
				// otherwise and Substring would throw - confirm that tokens of
				// this type always carry one.
				int at_position = address.IndexOf ('@');
				pieces [0] = address.Substring (0, at_position);
#if ENABLE_RDF_ADAPTER
				if (link_call_back != null)
					link_call_back ("mailto://" + address, true);
#endif
			}
		}
Esempio n. 2
0
		// Prepares the sub-tokens of a host-name token.
		//
		// Marks the current token type as host, splits the token text on '.'
		// and stores the resulting labels in `parts`.  A leading "www" label is
		// dropped; every other label is kept as is.
		private void ProcessURLToken (Lucene.Net.Analysis.Token token)
		{
			token_type = tokentype_host;

			// Split never returns an empty array, so labels [0] always exists.
			string[] labels = token.TermText ().Split ('.');

			if (labels [0] == "www") {
				// Copy everything after the initial "www" into a shorter array.
				string[] without_www = new string [labels.Length - 1];
				Array.Copy (labels, 1, without_www, 0, without_www.Length);
				labels = without_www;
			}

			parts = labels;

			// FIXME: Remove final tld
			// Any string of the form "(<alnum> '.')+ <alnum>" has type HOST.
			// Removing the last label might therefore drop important words from
			// strings of that form which are not really host names.  Doing this
			// properly requires matching against the huge list of TLDs.
		}
Esempio n. 3
0
		// Inspects a token by type and decides whether it should be kept.
		//
		// token: the token to inspect.  Passed by reference because an
		//        all-digit alphanum token with leading zeros is replaced by a
		//        new token whose text has those zeros stripped.
		//
		// Returns false for number tokens longer than 20 characters and for
		// text that IsNoise () rejects; returns true otherwise.
		//
		// E-mail and host tokens are always accepted.  When
		// tokenize_email_hostname is set they are additionally handed to
		// ProcessEmailToken () / ProcessURLToken ().
		private bool ProcessToken (ref Lucene.Net.Analysis.Token token)
		{
			string type = token.Type ();

			if (type == tokentype_number) {
				// nobody will remember more than 20 digits
				return (token.TermText ().Length <= 20);
			} else if (type == tokentype_alphanum) {
				string text = token.TermText ();
				int begin = 0;
				bool found = false;
				// Check if number, in that case strip 0's from beginning.
				// `begin` counts the leading zeros; any non-digit character
				// means this is not a number, so nothing is stripped.
				foreach (char c in text) {
					if (! Char.IsDigit (c)) {
						begin = 0;
						break;
					} else if (! found) {
						if (c == '0')
							begin ++;
						else
							found = true;
					}
				}

				// The text consists of zeros only: keep the last one, otherwise
				// stripping would leave a token with an empty term.
				if (begin > 0 && begin == text.Length)
					begin --;

				if (begin == 0)
					return ! IsNoise (text);

				// The stripped characters sit at the front of the token, so the
				// new start offset is the old start offset moved forward by the
				// number of characters removed; the end offset is unchanged.
				token = new Lucene.Net.Analysis.Token (
					text.Remove (0, begin),
					token.StartOffset () + begin,
					token.EndOffset (),
					type);
				return true;
			} else if (type == tokentype_email) {
				if (tokenize_email_hostname)
					ProcessEmailToken (token);
				return true;
			} else if (type == tokentype_host) {
				if (tokenize_email_hostname)
					ProcessURLToken (token);
				return true;
			} else
				// FIXME: Noise should be only tested on token type alphanum
				return ! IsNoise (token.TermText ());
		}