/// <summary>
/// Sample entry point: crawls http://books.toscrape.com/ with a <c>SimpleSpider</c>,
/// lets the item callbacks process each fetched page, and finally prints every
/// entry of <c>items</c> to the console.
/// </summary>
public static void run()
{
    items = new List<BookData>();

    var startAddress = new Uri("http://books.toscrape.com/");
    var spider = new SimpleSpider("BooksToScrape", startAddress);

    // Link-gathering callback.
    // Kept for demonstration purposes only: it can be replaced by
    //     spider.Configuration.Auto_AnchorsLinks = true;
    // (which is enabled by default).
    spider.FetchCompleted += (sender, fetched) =>
    {
        var crawler = sender as SimpleSpider;

        // Collect the "<a>" links of the fetched page (simple SubString-based split).
        var anchors = AnchorHelper.GetAnchors(fetched.Link.Uri, fetched.Html);

        // Queue the collected links, recording the page they were found on.
        crawler.AddPages(anchors, fetched.Link);
    };

    // Item-gathering callbacks.
    spider.FetchCompleted += fetchCompleted_items_XPath;   // sample using XPath
    spider.FetchCompleted += fetchCompleted_items_HObject; // sample using HObject

    // Cancel (ignore) every page whose address contains "/reviews/".
    spider.ShouldFetch += (sender, candidate) =>
    {
        string address = candidate.Link.Uri.ToString();
        candidate.Cancel = address.Contains("/reviews/");
    };

    // Start crawling from the first page.
    spider.Execute();

    // List all collected books.
    foreach (var book in items)
    {
        Console.WriteLine($" > {book.Price:C2} {book.Title}");
    }
}
/// <summary>
/// Markup that is cut off in the middle of an anchor's opening tag is expected
/// to produce no links.
/// </summary>
public void AnchorHelper_GetAnchorsTests_BrokenHtmlAnchor()
{
    // Arrange: the second anchor's opening tag is never closed.
    string truncatedMarkup = "<html><div><a></a><li><a href=\"TEST\" id= \"A\"";
    var baseAddress = new Uri("http://foo.bar");

    // Act
    var found = AnchorHelper.GetAnchors(baseAddress, truncatedMarkup).ToArray();

    // Assert
    Assert.Empty(found);
}
/// <summary>
/// An anchor that carries no <c>href</c> attribute is expected to produce no links.
/// </summary>
public void AnchorHelper_GetAnchorsTests_Empty()
{
    // Arrange: the only anchor has an id but no href.
    string markupWithoutHref = "<a id=\"TEST\"> test </a>";
    var baseAddress = new Uri("http://foo.bar");

    // Act
    var found = AnchorHelper.GetAnchors(baseAddress, markupWithoutHref).ToArray();

    // Assert
    Assert.Empty(found);
}
/// <summary>
/// A single anchor with a relative <c>href</c> is expected to be returned as one
/// link resolved against the base address.
/// </summary>
public void AnchorHelper_GetAnchorsTests_Base()
{
    // Arrange
    string markup = "<a href=\"TEST\"> test </a>";
    var baseAddress = new Uri("http://foo.bar");
    var expected = new Uri[] { new Uri("http://foo.bar/TEST") };

    // Act
    var found = AnchorHelper.GetAnchors(baseAddress, markup).ToArray();

    // Assert
    Assert.Equal(expected, found);
}
/// <summary>
/// With unclosed surrounding tags, an anchor without <c>href</c> and two anchors
/// with relative <c>href</c> values, the two links are expected in document order,
/// each resolved against the base address.
/// </summary>
public void AnchorHelper_GetAnchorsTests_MoreComplex()
{
    // Arrange
    string markup = "<html><div><a></a><li><a href=\"TEST\"> test </a><a href=\"T2\"> test2 </a>";
    var baseAddress = new Uri("http://foo.bar");
    var expected = new Uri[]
    {
        new Uri("http://foo.bar/TEST"),
        new Uri("http://foo.bar/T2"),
    };

    // Act
    var found = AnchorHelper.GetAnchors(baseAddress, markup).ToArray();

    // Assert
    Assert.Equal(expected, found);
}
/// <summary>
/// Collects the links contained in a fetched resource.
/// Content starting with <c>&lt;?xml</c> is treated as an RSS-style feed: the text
/// of every <c>&lt;link&gt;</c> element that starts with "http" is returned.
/// Any other content is handed to <c>AnchorHelper.GetAnchors</c>.
/// </summary>
/// <param name="args">Fetch result providing the content (<c>Html</c>) and its origin (<c>Link</c>).</param>
/// <returns>
/// A lazily evaluated sequence of links. Feed entries that are incomplete or that do
/// not form a valid absolute URI are skipped instead of aborting the enumeration.
/// </returns>
public IEnumerable<Uri> GetLinks(FetchCompleteEventArgs args)
{
    string htmlContent = args.Html;

    // Nothing to scan (also avoids a NullReferenceException on StartsWith below).
    if (string.IsNullOrEmpty(htmlContent))
    {
        yield break;
    }

    if (htmlContent.StartsWith("<?xml", StringComparison.Ordinal))
    {
        // rss
        foreach (var link in htmlContent.Split("<link"))
        {
            // Too short to hold "...>http" at all.
            if (link.Length < 5)
            {
                continue;
            }

            // The chunk preceding the first "<link" begins with the "<?xml" declaration.
            if (link[1] == '?')
            {
                continue;
            }

            // Text between the end of the opening tag and the next tag.
            var content = link.Substring(link.IndexOf('>') + 1);
            int idxNextTag = content.IndexOf('<');
            if (idxNextTag < 0)
            {
                // No following tag (e.g. self-closing or truncated element):
                // previously this raised ArgumentOutOfRangeException.
                continue;
            }

            content = content.Substring(0, idxNextTag);

            // TryCreate instead of the constructor so that one malformed entry
            // does not abort the whole enumeration.
            if (content.StartsWith("http", StringComparison.Ordinal)
                && Uri.TryCreate(content, UriKind.Absolute, out Uri feedLink))
            {
                yield return feedLink;
            }
        }
    }
    else
    {
        foreach (var l in AnchorHelper.GetAnchors(args.Link, htmlContent))
        {
            yield return l;
        }
    }
}
/// <summary>
/// Fetch-completed handler that, while <c>Configuration.Auto_AnchorsLinks</c> is on
/// and the fetched page has content, collects the page's anchors and queues them.
/// Any failure switches auto-collection off and is logged.
/// </summary>
/// <param name="Sender">Event source (not used).</param>
/// <param name="args">Fetch result providing the page content and its link.</param>
private void fetchCompleted_AutoCollect(object Sender, FetchCompleteEventArgs args)
{
    try
    {
        if (Configuration.Auto_AnchorsLinks && !string.IsNullOrEmpty(args.Html))
        {
            // Add the collected links to the queue
            AddPages(AnchorHelper.GetAnchors(args.Link.Uri, args.Html), args.Link);
        }
    }
    catch (Exception failure)
    {
        // Turn the feature off so the same failure does not repeat on every page.
        Configuration.Auto_AnchorsLinks = false;
        log.Error(failure, "Failed while auto-collecting links. Auto-collection disabled");
    }
}
/// <summary>
/// Collects links from a fetched page: first every link reported by
/// <c>AnchorHelper.GetAnchors</c>, then the first quoted argument of each
/// <c>window.open(...)</c> call found in a <c>&lt;button onclick="..."&gt;</c> attribute.
/// </summary>
/// <param name="args">Fetch result providing the document and the page link used as base address.</param>
/// <returns>A lazily evaluated sequence of links; button URLs are resolved against <c>args.Link</c>.</returns>
public IEnumerable<Uri> GetLinks(FetchCompleteEventArgs args)
{
    var doc = args.GetDocument();

    // <a
    foreach (var uri in AnchorHelper.GetAnchors(args.Link, doc))
    {
        yield return uri;
    }

    // <button onClick
    var buttonNodes = doc.DocumentNode.SelectNodes(".//button");
    if (buttonNodes == null)
    {
        // NOTE(review): HtmlAgilityPack's SelectNodes returns null (not an empty
        // collection) when nothing matches; without this guard a page with no
        // buttons threw on the Select below.
        yield break;
    }

    var onClickAttributes = buttonNodes
        .Select(x => x.Attributes["onclick"])
        .Where(att => att != null);

    foreach (var btn in onClickAttributes)
    {
        if (!TryExtractWindowOpenUrl(btn.Value, out string url))
        {
            continue;
        }

        yield return new Uri(args.Link, url);
    }
}

/// <summary>
/// Extracts the first quoted argument of a <c>window.open(...)</c> call from an
/// onclick script, using explicit index checks instead of exceptions.
/// </summary>
/// <param name="onClick">Raw value of the onclick attribute.</param>
/// <param name="url">The extracted, non-empty URL text on success; otherwise <c>null</c>.</param>
/// <returns><c>true</c> when a non-empty quoted argument was found.</returns>
private static bool TryExtractWindowOpenUrl(string onClick, out string url)
{
    const string marker = "window.open";
    url = null;

    if (string.IsNullOrEmpty(onClick))
    {
        return false;
    }

    int idxCall = onClick.IndexOf(marker, StringComparison.Ordinal);
    if (idxCall < 0)
    {
        return false;
    }

    // Look for the parentheses of the window.open call itself, so that an earlier
    // call in the same script (e.g. "confirm('x')") is not mistaken for it.
    int idxOpen = onClick.IndexOf('(', idxCall + marker.Length);
    if (idxOpen < 0)
    {
        return false;
    }

    int idxClose = onClick.IndexOf(')', idxOpen + 1);
    if (idxClose < 0)
    {
        return false;
    }

    string arguments = onClick.Substring(idxOpen + 1, idxClose - idxOpen - 1);

    // Use whichever quote character appears first; default to the double quote.
    int idxQuote = arguments.IndexOf('\'');
    int idxDoubleQuote = arguments.IndexOf('"');
    char delimiter = '"';
    if (idxDoubleQuote < 0 || (idxQuote >= 0 && idxQuote < idxDoubleQuote))
    {
        delimiter = '\'';
    }

    int idxDelimiter = arguments.IndexOf(delimiter);
    if (idxDelimiter < 0)
    {
        // No string literal at all (e.g. window.open(someVariable)).
        return false;
    }

    int idxStart = idxDelimiter + 1; // skip quote
    int idxEnd = arguments.IndexOf(delimiter, idxStart);
    if (idxEnd <= idxStart)
    {
        // Unterminated literal (-1) or empty URL (closing quote right at idxStart).
        return false;
    }

    url = arguments.Substring(idxStart, idxEnd - idxStart);
    return true;
}
/// <summary>
/// Returns the links that <c>AnchorHelper.GetAnchors</c> finds in the fetched
/// page's document, using the page link as the base address.
/// </summary>
/// <param name="args">Fetch result providing the page link and its document.</param>
/// <returns>The sequence produced by <c>AnchorHelper.GetAnchors</c>.</returns>
public IEnumerable<Uri> GetLinks(FetchCompleteEventArgs args)
{
    // Read the link before the document, matching the original evaluation order.
    var pageLink = args.Link;
    var pageDocument = args.GetDocument();

    return AnchorHelper.GetAnchors(pageLink, pageDocument);
}