/// <summary>
/// Called when the spider is ready to process an HTML URL.
/// </summary>
/// <param name="url">The URL that the spider is about to process.</param>
/// <param name="parse">An object that will allow you to parse the HTML on this page.</param>
public void SpiderProcessURL(Uri url, SpiderParseHTML parse)
{
    try
    {
        // Consume the whole page so every link on it is discovered.
        parse.ReadAll();
    }
    catch (IOException)
    {
        // A read failure on a single page must not stop the crawl;
        // record it and move on.
        spider.Logging.Log(Logger.Level.INFO, "Error reading page:" + url.ToString());
    }
}
/// <summary>
/// This method is called by the thread pool to process one single URL:
/// fetch it, hand it to the report (as parsed HTML or as a raw stream),
/// and update the workload manager with the outcome.
/// </summary>
/// <param name="stateInfo">The URL to process, boxed as an Object by the thread pool.</param>
private void SpiderWorkerProc(Object stateInfo)
{
    Stream istream = null;
    HttpWebResponse response = null;
    Uri url = null;
    Uri responseUri = null;
    try
    {
        url = (Uri)stateInfo;
        logging.Log(Logger.Level.INFO, "Processing: " + url);

        // Get the URL's contents.
        WebRequest http = WebRequest.Create(url);
        http.Timeout = this.options.Timeout;
        if (this.options.UserAgent != null)
        {
            // User-Agent is a restricted header on HttpWebRequest;
            // assigning it through Headers throws an ArgumentException.
            // It must be set via the dedicated property.
            ((HttpWebRequest)http).UserAgent = this.options.UserAgent;
        }
        response = (HttpWebResponse)http.GetResponse();

        // Capture the final URI now, so the redirect bookkeeping below
        // does not have to touch the response after it is closed.
        responseUri = response.ResponseUri;

        // Read the URL.
        istream = response.GetResponseStream();

        // Parse the URL. The content type frequently carries a charset
        // suffix (e.g. "text/html; charset=utf-8"), so an exact equality
        // check would mis-route HTML pages; test the prefix instead.
        if (response.ContentType != null
            && response.ContentType.StartsWith("text/html", StringComparison.OrdinalIgnoreCase))
        {
            SpiderParseHTML parse = new SpiderParseHTML(responseUri,
                new SpiderInputStream(istream, null), this);
            this.report.SpiderProcessURL(url, parse);
        }
        else
        {
            this.report.SpiderProcessURL(url, istream);
        }
    }
    catch (IOException e)
    {
        logging.Log(Logger.Level.INFO, "I/O error on URL:" + url);
        try
        {
            this.workloadManager.MarkError(url);
        }
        catch (WorkloadException)
        {
            logging.Log(Logger.Level.ERROR, "Error marking workload(1).", e);
        }
        this.report.SpiderURLError(url);
        return;
    }
    catch (WebException e)
    {
        logging.Log(Logger.Level.INFO, "Web error on URL:" + url);
        try
        {
            this.workloadManager.MarkError(url);
        }
        catch (WorkloadException)
        {
            logging.Log(Logger.Level.ERROR, "Error marking workload(2).", e);
        }
        this.report.SpiderURLError(url);
        return;
    }
    catch (Exception e)
    {
        try
        {
            this.workloadManager.MarkError(url);
        }
        catch (WorkloadException)
        {
            logging.Log(Logger.Level.ERROR, "Error marking workload(3).", e);
        }
        // Concatenate url directly: if the (Uri)stateInfo cast failed,
        // url is still null and ToString() would throw inside the handler.
        logging.Log(Logger.Level.ERROR, "Caught exception at URL:" + url, e);
        this.report.SpiderURLError(url);
        return;
    }
    finally
    {
        if (istream != null)
        {
            istream.Close();
        }
        // Closing the stream alone does not release the HttpWebResponse;
        // close it explicitly so the connection is returned to the pool.
        if (response != null)
        {
            response.Close();
        }
    }

    try
    {
        // Mark URL as complete.
        this.workloadManager.MarkProcessed(url);
        logging.Log(Logger.Level.INFO, "Complete: " + url);
        if (!url.Equals(responseUri))
        {
            // Save the final URL as well, so redirect targets
            // are not crawled a second time.
            this.workloadManager.Add(responseUri, url,
                this.workloadManager.GetDepth(responseUri));
            this.workloadManager.MarkProcessed(responseUri);
        }
    }
    catch (WorkloadException e)
    {
        logging.Log(Logger.Level.ERROR, "Error marking workload(4).", e);
    }
}
/// <summary>
/// Called when the spider is ready to process an HTML URL.
/// Download the contents of the URL to a local file.
/// </summary>
/// <param name="url">The URL that the spider is about to process.</param>
/// <param name="parse">An object that will allow you to parse the HTML on this page.</param>
public void SpiderProcessURL(Uri url, SpiderParseHTML parse)
{
    String filename = URLUtility.convertFilename(this.path, url, true);
    // using guarantees the file handle is released even when
    // ReadAll throws; the original leaked the stream on a parse error.
    using (Stream os = new FileStream(filename, FileMode.Create))
    {
        parse.Stream.OutputStream = os;
        parse.ReadAll();
    }
}