Exemplo n.º 1
0
        /// <summary>
        /// Runs the crawl loop: repeatedly takes one pending URI from the database, attempts to
        /// scrape it, queues every link on the page that passes validation, and stops once no
        /// pending URI is left.
        /// </summary>
        /// <remarks>
        /// A URI is pending while it is not flagged as scraped and has fewer than three attempts.
        /// A fresh <c>ScraperContext</c> is created for each processed URI.
        /// NOTE(review): the method is <c>async void</c>, so callers cannot await its completion
        /// or observe exceptions thrown after the first await; consider exposing a Task-returning
        /// variant.
        /// </remarks>
        public async void Begin()
        {
            Console.WriteLine("Scraper Started");

            while (true)
            {
                using (var db = new ScraperContext())
                {
                    // Pick any URI that still needs work (not scraped, under the attempt limit).
                    var pending = db.ScrapeUri.Where(candidate => candidate.Scraped == false && candidate.ScrapeAttempts < 3);
                    ScrapedUri current = await pending.FirstOrDefaultAsync();

                    if (current == default)
                    {
                        Console.WriteLine("No more Uris to scrape");
                        break;
                    }

                    WebPage page = await TryScrapeWebPage(current.AbsoluteUri);

                    if (page != null)
                    {
                        foreach (HtmlNode anchor in page.Html.CssSelect("a").ToArray())
                        {
                            string href = anchor.GetAttributeValue("href");
                            if (string.IsNullOrEmpty(href))
                            {
                                continue;
                            }

                            // Validate receives the current page's URI alongside the raw href;
                            // a null result means the link is skipped.
                            Uri resolved = LinkValidation.Validate(href, current.AbsoluteUri);
                            if (resolved == null)
                            {
                                continue;
                            }

                            // The file type is derived from the last path segment of the link.
                            string[] segments = resolved.Segments;
                            string lastSegment = segments[segments.Length - 1];

                            var discovered = new ScrapedUri
                            {
                                AbsoluteUri    = resolved.AbsoluteUri,
                                Scheme         = resolved.Scheme,
                                Host           = resolved.Host,
                                QueryParams    = resolved.Query,
                                FileType       = GetFileType(lastSegment),
                                ScrapeDataTime = DateTime.UtcNow
                            };

                            db.Add(discovered);
                            Console.WriteLine($"Adding:{discovered.AbsoluteUri}");
                        }

                        // Only a successfully fetched page marks the target as done.
                        current.Scraped = true;
                    }

                    // Every attempt counts, successful or not, so failing URIs are eventually dropped.
                    current.ScrapeAttempts++;
                    await db.SaveChangesAsync();
                }
            }

            Console.WriteLine("Scraper Finished");
        }
Exemplo n.º 2
0
 /// <summary>
 /// Creates the scraper and, when the <c>ScrapeUri</c> set is empty, seeds it with the
 /// initial start URI so the crawl has somewhere to begin.
 /// </summary>
 public Scraper()
 {
     using (var dbContext = new ScraperContext())
     {
         // Any() only checks for existence instead of counting every row.
         if (!dbContext.ScrapeUri.Any())
         {
             Console.WriteLine("Adding start point");
             ScrapedUri scrapeUri = new ScrapedUri
             {
                 AbsoluteUri = "http://demo.com",
                 Scheme      = "http",
                 Host        = "demo.com",
                 QueryParams = ""
             };
             dbContext.Add(scrapeUri);

             // Add() only tracks the entity; without saving, the seed is discarded
             // when the context is disposed and the start point never reaches the database.
             dbContext.SaveChanges();
         }
     }
 }