/// <summary>
/// Builds a new <see cref="CrawlJob"/> for a link discovered on the current page,
/// resolving relative URLs against the current page URL.
/// </summary>
/// <param name="forLink">The parsed link to build a crawl job for.</param>
/// <returns>A job pointing at the link's absolute URL, inheriting this job's link options.</returns>
public CrawlJob CreateJob(ParsedLink forLink)
{
    // Relative links are an expected case, not an error: use TryCreate instead of
    // catching UriFormatException for control flow.
    Uri url;
    if (!Uri.TryCreate(forLink.Url, UriKind.Absolute, out url))
    {
        // Relative URL: resolve it against the current page URL. The Uri(Uri, string)
        // constructor handles "../", query strings and trailing slashes correctly,
        // unlike the previous manual Substring/LastIndexOf('/') splicing (which threw
        // when the URL contained no '/' after the scheme).
        url = new Uri(new Uri(Url.ToString()), forLink.Url);
    }

    CrawlJob job = new CrawlJob(url);
    job.LinkOpts = LinkOpts;
    // TODO: rest of the params plus maybe some checkings
    return job;
}
/// <summary>
/// Downloads the content at the job's URL, parses it with every parser registered
/// for its MIME content type, indexes the results in the search engine, and
/// recursively follows discovered links up to the configured depth.
/// </summary>
/// <param name="job">The crawl job describing the URL and link-following options.</param>
void Index(CrawlJob job)
{
    Logger.Debug(string.Format("Parsing URL {0}", job.Url));

    // download content
    IDictionary<string, string> httpHeaders;
    string rawContent;
    try
    {
        rawContent = Download(job, out httpHeaders);
    }
    catch (WebException ex)
    {
        Logger.Error("Error downloading " + job.Url, ex);
        return;
    }
    // TODO: treat exceptions

    // Some servers omit the Content-Type header entirely; without it we cannot
    // pick a parser, so skip the document instead of throwing KeyNotFoundException.
    string mimeContentType;
    if (!httpHeaders.TryGetValue("Content-Type", out mimeContentType))
    {
        Logger.Debug(string.Format("No Content-Type header for {0}; skipping", job.Url));
        return;
    }

    // Strip any parameters from the media type (e.g. "text/html; charset=utf-8").
    int paramsStart = mimeContentType.IndexOf(';');
    if (paramsStart > 0)
        mimeContentType = mimeContentType.Substring(0, paramsStart);
    // Fixed format string: the original had no {0} placeholder, so the type was never logged.
    Logger.Debug(string.Format(" > MIME Content Type: {0}", mimeContentType));

    IList<IContentParser> parsersByMimeType = FindParser.ByMimeContentType(mimeContentType);
    foreach (IContentParser parser in parsersByMimeType)
    {
        foreach (ParsedContent parsed in parser.ParseRaw(rawContent, job.LinkOpts))
        {
            // fill in the rest of the data
            parsed.Location = job.Url.ToString();
            parsed.Sources = new List<string>() { job.Url.Host };
            SbApp.Instance.SearchEngine.Index(parsed);

            // if it has links, index them too (MaxDepth == -1 means unlimited depth)
            if (parsed.LinkOpts.Follow
                && (parsed.LinkOpts.MaxDepth == -1 || parsed.LinkOpts.CurrentDepth <= parsed.LinkOpts.MaxDepth))
            {
                foreach (var link in parsed.Links)
                {
                    Index(job.CreateJob(link));
                }
            }
        }
    }
}
/// <summary>
/// Downloads the raw content at the job's URL over HTTP.
/// </summary>
/// <param name="opts">Job carrying the target URL and the request timeout (in seconds).</param>
/// <param name="httpHeaders">Receives every HTTP response header, keyed by header name.</param>
/// <returns>The response body with leading/trailing whitespace trimmed.</returns>
/// <exception cref="WebException">On network errors, timeouts, or non-success HTTP status codes.</exception>
string Download(CrawlJob opts, out IDictionary<string, string> httpHeaders)
{
    // Some servers mishandle the "Expect: 100-continue" header; disable it.
    System.Net.ServicePointManager.Expect100Continue = false;

    HttpWebRequest httpRequest = (HttpWebRequest)WebRequest.Create(opts.Url);
    httpRequest.Timeout = opts.TimeoutSec * 1000; // Timeout property is in milliseconds

    // using-blocks guarantee the response and its stream are released even when
    // ReadToEnd throws; the original leaked both on that path and never disposed
    // the StreamReader at all.
    using (HttpWebResponse response = (HttpWebResponse)httpRequest.GetResponse())
    {
        // Copy headers before the response is disposed (the original read them
        // after Close(), which only worked because headers happen to be buffered).
        httpHeaders = new Dictionary<string, string>();
        foreach (string header in response.Headers.AllKeys)
        {
            httpHeaders[header] = response.Headers[header];
        }

        using (System.IO.StreamReader reader = new System.IO.StreamReader(response.GetResponseStream()))
        {
            return reader.ReadToEnd().Trim();
        }
    }
}