/// <summary>
/// Called by main to check a link. After spidering through the
/// site, the final list of bad links is displayed.
/// </summary>
/// <param name="url">The URL to check for bad links.</param>
public void check(Uri url)
{
    // Keep the workload in memory; no database manager is needed for
    // a one-shot link check.
    SpiderOptions options = new SpiderOptions();
    options.WorkloadManager = typeof(MemoryWorkloadManager).FullName;

    LinkReport report = new LinkReport();
    Spider spider = new Spider(options, report);
    spider.AddURL(url, null, 1);
    spider.Process();

    Console.WriteLine(spider.Status);

    // Report the results collected by the LinkReport.
    if (report.Bad.Count == 0)
    {
        Console.WriteLine("No bad links were found.");
    }
    else
    {
        Console.WriteLine("Bad Links Found:");
        foreach (String badLink in report.Bad)
        {
            Console.WriteLine(badLink);
        }
    }
}
/// <summary>
/// Construct a SpiderParseHTML object. This object allows you to
/// parse HTML while the spider collects link information in the
/// background.
/// </summary>
/// <param name="baseURL">The URL that is being parsed; used to resolve relative links.</param>
/// <param name="istream">The InputStream being parsed.</param>
/// <param name="spider">The Spider that is parsing.</param>
public SpiderParseHTML(Uri baseURL, SpiderInputStream istream, Spider spider)
    : base(istream)
{
    this.spider = spider;
    this.baseURL = baseURL;
    this.stream = istream;
    // Record how deep this page is so that discovered links can be
    // queued one level deeper.
    this.depth = spider.Workload.GetDepth(baseURL);
}
/// <summary>
/// Download an entire site.
/// </summary>
/// <param name="config">The spider configuration file to use.</param>
/// <param name="baseHost">The URL to start from.</param>
/// <param name="local">The local path to save files to.</param>
public void Download(String config, Uri baseHost, String local)
{
    // The report receives each page and writes it under the local path.
    WorldSpiderReport siteReport = new WorldSpiderReport(local);

    SpiderOptions spiderOptions = new SpiderOptions();
    spiderOptions.Load(config);

    Spider siteSpider = new Spider(spiderOptions, siteReport);
    siteSpider.AddURL(baseHost, null, 1);
    siteSpider.Process();

    Console.WriteLine(siteSpider.Status);
}
/// <summary>
/// Download an entire site.
/// </summary>
/// <param name="config">The spider configuration file to use.</param>
/// <param name="baseURL">The URL to start from.</param>
/// <param name="local">The local path to save files to.</param>
public void Download(String config, Uri baseURL, String local)
{
    SpiderReport pageReport = new SpiderReport(local);

    SpiderOptions spiderOptions = new SpiderOptions();
    spiderOptions.Load(config);

    Spider siteSpider = new Spider(spiderOptions, pageReport);

    // Log to the console and to a file, starting from a fresh log.
    // NOTE(review): log path is hard-coded to the C: drive — consider
    // making this configurable.
    siteSpider.Logging.Console = true;
    siteSpider.Logging.Filename = "c:\\spider.log";
    siteSpider.Logging.Clear();

    siteSpider.AddURL(baseURL, null, 1);
    siteSpider.Process();

    Console.WriteLine(siteSpider.Status);
}
/// <summary>
/// Called when the spider finds a URL.
/// </summary>
/// <param name="url">The URL that was found.</param>
/// <param name="source">Where the URL was found.</param>
/// <param name="type">What sort of tag produced this URL.</param>
/// <returns>True if the URL should be processed.</returns>
public bool SpiderFoundURL(Uri url, Uri source, Spider.URLType type)
{
    // Accept the URL when no base host filter is set, or when the
    // URL's host matches the base host (case-insensitive).
    return (this.baseHost == null)
        || (String.Compare(this.baseHost, url.Host, true) == 0);
}
/// <summary>
/// Called when the spider is starting up. Gives this
/// SpiderReportable implementation a reference to the spider
/// that owns it.
/// </summary>
/// <param name="spider">The spider that will be working with this object.</param>
public void Init(Spider spider)
{
    // Keep the spider so later callbacks can reference it.
    this.spider = spider;
}
/// <summary>
/// Used internally, to add a URL to the spider's workload.
/// A malformed or invalid URL is logged and skipped rather than
/// aborting the parse.
/// </summary>
/// <param name="u">The URL to add; ignored when null.</param>
/// <param name="type">What type of link this is.</param>
/// <exception cref="IOException">
/// Thrown when the workload manager rejects the URL while queuing it.
/// </exception>
private void AddURL(String u, Spider.URLType type)
{
    if (u == null)
    {
        return;
    }

    try
    {
        // Resolve the (possibly relative) link against the page's base
        // URL, then let the workload normalize it.
        Uri url = URLUtility.constructURL(this.baseURL, u, true);
        url = this.spider.Workload.ConvertURL(url.ToString());

        // Only http/https links are spidered (case-insensitive scheme check).
        if ((String.Compare(url.Scheme, "http", true) == 0)
            || (String.Compare(url.Scheme, "https", true) == 0))
        {
            // Give the report a chance to veto the URL before queuing it.
            if (this.spider.Report.SpiderFoundURL(url, this.baseURL, type))
            {
                try
                {
                    // Queue the link one level deeper than the current page.
                    this.spider.AddURL(url, this.baseURL, this.depth + 1);
                }
                catch (WorkloadException e)
                {
                    // Fix: preserve the original exception as the inner
                    // exception so the root cause and stack trace survive.
                    throw new IOException(e.Message, e);
                }
            }
        }
    }
    catch (UriFormatException)
    {
        spider.Logging.Log(Logger.Level.INFO, "Malformed URL found:" + u);
    }
    catch (WorkloadException)
    {
        spider.Logging.Log(Logger.Level.INFO, "Invalid URL found:" + u);
    }
}
/// <summary>
/// Called when the spider encounters a URL. Always returns true:
/// because this spider will theoretically visit every URL on the
/// Internet, every URL is processed.
/// </summary>
/// <param name="url">The URL that the spider found.</param>
/// <param name="source">The page that the URL was found on.</param>
/// <param name="type">The type of link this URL is.</param>
/// <returns>True if the spider should scan for links on this page.</returns>
public bool SpiderFoundURL(Uri url, Uri source, Spider.URLType type)
{
    // No filtering — accept everything.
    return true;
}
/// <summary>
/// Not used. This implementation does not need a reference to the
/// spider, so the callback is intentionally a no-op.
/// </summary>
/// <param name="spider">Not used.</param>
public void Init(Spider spider)
{
    // Intentionally empty.
}