Example #1
0
        /// <summary>
        /// Crawls http://books.toscrape.com/ with a <c>SimpleSpider</c>, collecting book data
        /// into <c>items</c>, and prints the price and title of every collected book.
        /// </summary>
        public static void run()
        {
            items = new List<BookData>();
            var spider = new SimpleSpider("BooksToScrape", new Uri("http://books.toscrape.com/"));

            // Link-gathering callback.
            // Shown for demonstration only; it can be replaced by
            //   spider.Configuration.Auto_AnchorsLinks = true;   (enabled by default)
            spider.FetchCompleted += (sender, fetched) =>
            {
                // Pull every "<a>" target out of the page using a simple SubString-based split ...
                var anchors = AnchorHelper.GetAnchors(fetched.Link.Uri, fetched.Html);

                // ... and enqueue them, recording the current page as their source.
                (sender as SimpleSpider).AddPages(anchors, fetched.Link);
            };

            // Item-gathering callbacks: the same extraction demonstrated twice.
            spider.FetchCompleted += fetchCompleted_items_XPath;   // XPath flavour
            spider.FetchCompleted += fetchCompleted_items_HObject; // HObject flavour

            // Cancel any fetch whose address contains "/reviews/".
            spider.ShouldFetch += (sender, candidate) =>
                candidate.Cancel = candidate.Link.Uri.ToString().Contains("/reviews/");

            // Start crawling from the first page.
            spider.Execute();

            // Print every collected book.
            foreach (var book in items)
            {
                Console.WriteLine($" > {book.Price:C2} {book.Title}");
            }
        }
Example #2
0
        /// <summary>
        /// Markup that ends in the middle of an anchor's opening tag must produce no links.
        /// </summary>
        public void AnchorHelper_GetAnchorsTests_BrokenHtmlAnchor()
        {
            // Arrange: the second anchor is never closed with '>'
            var baseAddress = new Uri("http://foo.bar");
            const string truncatedMarkup = "<html><div><a></a><li><a href=\"TEST\" id= \"A\"";

            // Act
            var anchors = AnchorHelper.GetAnchors(baseAddress, truncatedMarkup).ToArray();

            // Assert
            Assert.Empty(anchors);
        }
Example #3
0
        /// <summary>
        /// An anchor that carries only an <c>id</c> attribute (no <c>href</c>) must produce no links.
        /// </summary>
        public void AnchorHelper_GetAnchorsTests_Empty()
        {
            // Arrange
            var baseAddress = new Uri("http://foo.bar");
            const string markupWithoutHref = "<a id=\"TEST\"> test </a>";

            // Act
            var anchors = AnchorHelper.GetAnchors(baseAddress, markupWithoutHref).ToArray();

            // Assert
            Assert.Empty(anchors);
        }
Example #4
0
        /// <summary>
        /// A single anchor with a relative <c>href</c> is resolved against the page address.
        /// </summary>
        public void AnchorHelper_GetAnchorsTests_Base()
        {
            // Arrange
            var baseAddress = new Uri("http://foo.bar");
            const string markup = "<a href=\"TEST\"> test </a>";
            var expected = new Uri[] { new Uri("http://foo.bar/TEST") };

            // Act
            var anchors = AnchorHelper.GetAnchors(baseAddress, markup).ToArray();

            // Assert
            Assert.Equal(expected, anchors);
        }
Example #5
0
        /// <summary>
        /// Several anchors mixed with other tags: the href-less anchor is ignored and the
        /// two anchors with an <c>href</c> are returned in document order.
        /// </summary>
        public void AnchorHelper_GetAnchorsTests_MoreComplex()
        {
            // Arrange
            var baseAddress = new Uri("http://foo.bar");
            const string markup = "<html><div><a></a><li><a href=\"TEST\"> test </a><a href=\"T2\"> test2 </a>";
            var expected = new Uri[]
            {
                new Uri("http://foo.bar/TEST"),
                new Uri("http://foo.bar/T2")
            };

            // Act
            var anchors = AnchorHelper.GetAnchors(baseAddress, markup).ToArray();

            // Assert
            Assert.Equal(expected, anchors);
        }
Example #6
0
        /// <summary>
        /// Extracts links from a fetched resource. Content that starts with <c>&lt;?xml</c> is
        /// treated as a feed and the text of each <c>&lt;link&gt;</c> element is returned;
        /// anything else is delegated to <c>AnchorHelper.GetAnchors</c>.
        /// </summary>
        /// <param name="args">Fetch result supplying the content (<c>Html</c>) and its origin (<c>Link</c>).</param>
        /// <returns>
        /// The links found, lazily enumerated. Feed entries that are malformed, or whose text
        /// does not start with "http", are skipped instead of aborting the enumeration.
        /// </returns>
        public IEnumerable <Uri> GetLinks(FetchCompleteEventArgs args)
        {
            string htmlContent = args.Html;

            // Nothing to parse; mirrors the guard used before auto-collecting anchors.
            if (string.IsNullOrEmpty(htmlContent))
            {
                yield break;
            }

            if (htmlContent.StartsWith("<?xml", StringComparison.Ordinal))
            {
                // rss
                foreach (var segment in htmlContent.Split("<link"))
                {
                    // Too short to hold a URL, or a segment whose second character is '?'.
                    // The text before the first "<link" always begins with "<?xml", so this
                    // also discards the XML prolog.
                    if (segment.Length < 5 || segment[1] == '?')
                    {
                        continue;
                    }

                    // The element text sits between the first '>' and the next '<'.
                    int textStart = segment.IndexOf('>') + 1;
                    int textEnd   = segment.IndexOf('<', textStart);
                    if (textEnd < 0)
                    {
                        // No following tag (e.g. truncated feed): previously this made
                        // Substring throw ArgumentOutOfRangeException.
                        continue;
                    }

                    string content = segment.Substring(textStart, textEnd - textStart);

                    // TryCreate instead of the constructor so that one malformed entry
                    // does not end the whole enumeration with a UriFormatException.
                    if (content.StartsWith("http", StringComparison.Ordinal) &&
                        Uri.TryCreate(content, UriKind.Absolute, out Uri feedLink))
                    {
                        yield return feedLink;
                    }
                }
            }
            else
            {
                foreach (var anchor in AnchorHelper.GetAnchors(args.Link, htmlContent))
                {
                    yield return anchor;
                }
            }
        }
Example #7
0
        /// <summary>
        /// Fetch-completed handler that queues every anchor found in the fetched HTML when
        /// <c>Configuration.Auto_AnchorsLinks</c> is on. Any failure is logged and switches
        /// auto-collection off.
        /// </summary>
        /// <param name="Sender">Event source (not used).</param>
        /// <param name="args">Fetch result supplying the HTML and the originating link.</param>
        private void fetchCompleted_AutoCollect(object Sender, FetchCompleteEventArgs args)
        {
            try
            {
                // Only act when the feature is enabled and there is HTML to scan.
                if (Configuration.Auto_AnchorsLinks && !string.IsNullOrEmpty(args.Html))
                {
                    // Collect the anchors and add them to the queue, tagged with their source page.
                    AddPages(AnchorHelper.GetAnchors(args.Link.Uri, args.Html), args.Link);
                }
            }
            catch (Exception failure)
            {
                // Disable the feature so the same failure is not repeated on later pages.
                Configuration.Auto_AnchorsLinks = false;
                log.Error(failure, "Failed while auto-collecting links. Auto-collection disabled");
            }
        }
Example #8
0
        /// <summary>
        /// Collects link targets from a fetched page: first every anchor reported by
        /// <c>AnchorHelper.GetAnchors</c>, then the first quoted argument of each
        /// <c>&lt;button&gt;</c> whose <c>onclick</c> attribute contains <c>window.open</c>.
        /// </summary>
        /// <param name="args">Fetch result supplying the parsed document and the page's own link.</param>
        /// <returns>
        /// The links found, lazily enumerated. Buttons whose script cannot be parsed, or whose
        /// target cannot be combined with the page address, are skipped.
        /// </returns>
        public IEnumerable <Uri> GetLinks(FetchCompleteEventArgs args)
        {
            var doc = args.GetDocument();

            // <a
            foreach (var uri in AnchorHelper.GetAnchors(args.Link, doc))
            {
                yield return uri;
            }

            // <button onClick
            var buttonNodes = doc.DocumentNode.SelectNodes(".//button");
            if (buttonNodes == null)
            {
                // NOTE(review): assuming doc is an HtmlAgilityPack document, SelectNodes returns
                // null (not an empty collection) when nothing matches; feeding that to Select
                // would throw ArgumentNullException on pages without buttons.
                yield break;
            }

            var onclickAttributes = buttonNodes
                                    .Select(x => x.Attributes["onclick"])
                                    .Where(att => att != null);

            foreach (var onclick in onclickAttributes)
            {
                if (!TryExtractWindowOpenUrl(onclick.Value, out string url))
                {
                    continue;
                }

                // TryCreate instead of the constructor: a malformed script argument must not
                // abort the enumeration with a UriFormatException.
                if (Uri.TryCreate(args.Link, url, out Uri target))
                {
                    yield return target;
                }
            }
        }

        // Pulls the first quoted string out of the first "(...)" group of an onclick script
        // that mentions window.open; returns false when the script does not have that shape.
        private static bool TryExtractWindowOpenUrl(string script, out string url)
        {
            url = null;

            if (string.IsNullOrEmpty(script) || !script.Contains("window.open"))
            {
                return false;
            }

            // Text between the first '(' and the next ')'.
            string arguments = script.Substring(script.IndexOf('(') + 1);
            int idxClose = arguments.IndexOf(')');
            if (idxClose < 0)
            {
                return false;
            }
            arguments = arguments.Substring(0, idxClose);

            // Use whichever quote character appears first; fall back to the single quote
            // when there is no double quote at all.
            int idxQuote       = arguments.IndexOf('\'');
            int idxDoubleQuote = arguments.IndexOf('"');
            bool singleQuoted  = idxDoubleQuote < 0 || (idxQuote >= 0 && idxQuote < idxDoubleQuote);
            char delimiter     = singleQuoted ? '\'' : '"';

            int idxOpenQuote = arguments.IndexOf(delimiter);
            if (idxOpenQuote < 0)
            {
                return false;
            }

            int idxStart = idxOpenQuote + 1; // skip quote

            // The closing quote is searched from idxStart + 1, so an empty string ('' or "")
            // is never accepted as a target; guard the start index so IndexOf cannot throw.
            if (idxStart + 1 > arguments.Length)
            {
                return false;
            }

            int idxEnd = arguments.IndexOf(delimiter, idxStart + 1);
            if (idxEnd < 0)
            {
                return false;
            }

            url = arguments.Substring(idxStart, idxEnd - idxStart);
            return true;
        }
Example #9
0
 /// <summary>
 /// Returns the anchors that <c>AnchorHelper.GetAnchors</c> finds in the fetched document,
 /// using the fetched page's link as the first argument.
 /// </summary>
 public IEnumerable<Uri> GetLinks(FetchCompleteEventArgs args)
 {
     var pageLink = args.Link;
     var document = args.GetDocument();

     return AnchorHelper.GetAnchors(pageLink, document);
 }