// Prepares the sub-tokens of an e-mail token.
// Marks the current token type as tokentype_email and stores in `parts`
// the pieces obtained by splitting the address on replace_array. When the
// split produced more than one piece, the pieces are rearranged so that
// slot 0 holds everything before the '@' and the last piece is discarded.
private void ProcessEmailToken (Lucene.Net.Analysis.Token token)
{
	token_type = tokentype_email;

	string address = token.TermText ();
	string[] pieces = address.Split (replace_array);
	parts = pieces;

	// safety check: nothing was split off, so there is nothing to rearrange
	if (pieces.Length == 1) {
		return;
	}

	// Shift every piece one slot to the right. The last piece (the final
	// tld part) falls off the end, and slot 0 becomes free for the
	// username part, which is stored as one large token.
	int kept = pieces.Length - 1;
	Array.Copy (pieces, 0, pieces, 1, kept);
	pieces [0] = address.Substring (0, address.IndexOf ('@'));

#if ENABLE_RDF_ADAPTER
	if (link_call_back != null) {
		link_call_back ("mailto://" + address, true);
	}
#endif
}
// Prepares the sub-tokens of a host-name token.
// Marks the current token type as tokentype_host and stores in `parts`
// the labels obtained by splitting the host name on '.'. A leading "www"
// label is dropped; every other label is kept as is.
private void ProcessURLToken (Lucene.Net.Analysis.Token token)
{
	token_type = tokentype_host;

	string[] labels = token.TermText ().Split ('.');
	parts = labels;

	if (labels [0] == "www") {
		// remove initial www: keep only the labels that follow it
		string[] remaining = new string [labels.Length - 1];
		Array.Copy (labels, 1, remaining, 0, remaining.Length);
		parts = remaining;
	}

	// FIXME: Remove final tld
	// Any string of form "(<alnum> '.')+<alnum>" has type HOST
	// Removing last token might remove important words from non-host
	// string of that form. To fix that, we need to match against the
	// huge list of TLDs.
}
// Decides whether a token should be kept, rewriting it first when needed.
//
// token: the token to examine. For an all-digit alphanum token with leading
//        zeros it is replaced by a new token with those zeros stripped.
//
// Returns true if the token should be kept and false if it should be dropped.
//  - number tokens are kept only when they are at most 20 characters long
//  - alphanum tokens that are rewritten are always kept; all others are
//    kept unless IsNoise reports them as noise
//  - email and host tokens are always kept; when tokenize_email_hostname
//    is set they are also handed to ProcessEmailToken / ProcessURLToken
//  - any other token type is kept unless IsNoise reports it as noise
private bool ProcessToken (ref Lucene.Net.Analysis.Token token)
{
	string type = token.Type ();

	if (type == tokentype_number) {
		// nobody will remember more than 20 digits
		return token.TermText ().Length <= 20;
	}

	if (type == tokentype_alphanum) {
		string text = token.TermText ();
		int begin = 0;
		bool found = false;

		// Check if number, in that case strip 0's from beginning.
		// `begin` counts the leading zeros; it is reset to 0 as soon as a
		// non-digit shows the text is not a plain number.
		foreach (char c in text) {
			if (! Char.IsDigit (c)) {
				begin = 0;
				break;
			}

			if (! found) {
				if (c == '0') {
					begin ++;
				} else {
					found = true;
				}
			}
		}

		// The text consists solely of zeros: keep the last one so the
		// rewritten token is "0" rather than an empty term.
		if (begin > 0 && begin == text.Length) {
			begin --;
		}

		if (begin == 0) {
			return ! IsNoise (text);
		}

		// The stripped zeros shift the start of the term forward relative
		// to the original token's start offset in the source text.
		token = new Lucene.Net.Analysis.Token (
			text.Remove (0, begin),
			token.StartOffset () + begin,
			token.EndOffset (),
			type);
		return true;
	}

	if (type == tokentype_email) {
		if (tokenize_email_hostname) {
			ProcessEmailToken (token);
		}
		return true;
	}

	if (type == tokentype_host) {
		if (tokenize_email_hostname) {
			ProcessURLToken (token);
		}
		return true;
	}

	// FIXME: Noise should be only tested on token type alphanum
	return ! IsNoise (token.TermText ());
}