/// <summary>
/// Fire-and-forget entry point kept for existing callers. It simply awaits
/// <see cref="BeginAsync"/>.
/// </summary>
/// <remarks>
/// Because this method is <c>async void</c>, callers cannot await its completion
/// or catch exceptions thrown by the crawl. Prefer <see cref="BeginAsync"/> when
/// either of those matters.
/// </remarks>
public async void Begin()
{
    await BeginAsync();
}

/// <summary>
/// Runs the crawl loop until no eligible <c>ScrapedUri</c> row remains.
/// </summary>
/// <remarks>
/// Each iteration opens a fresh <c>ScraperContext</c>, picks the first row with
/// <c>Scraped == false</c> and <c>ScrapeAttempts &lt; 3</c>, and tries to download it
/// with <c>TryScrapeWebPage</c>. When a page is returned, every <c>a</c> element's
/// <c>href</c> is passed through <c>LinkValidation.Validate</c>; each link that
/// validates is added as a new <c>ScrapedUri</c> and the target is marked as scraped.
/// The attempt counter is incremented whether or not a page was returned, and the
/// changes are saved before the next iteration.
/// </remarks>
/// <returns>A task that completes when no eligible row is left.</returns>
public async System.Threading.Tasks.Task BeginAsync()
{
    Console.WriteLine("Scraper Started");

    while (true)
    {
        using (var dbContext = new ScraperContext())
        {
            ScrapedUri nextTarget = await dbContext.ScrapeUri
                .Where(s => s.Scraped == false && s.ScrapeAttempts < 3)
                .FirstOrDefaultAsync();

            if (nextTarget == null)
            {
                Console.WriteLine("No more Uris to scrape");
                break;
            }

            WebPage webpage = await TryScrapeWebPage(nextTarget.AbsoluteUri);
            if (webpage != null)
            {
                foreach (HtmlNode linkNode in webpage.Html.CssSelect("a"))
                {
                    string link = linkNode.GetAttributeValue("href");
                    if (string.IsNullOrEmpty(link))
                    {
                        continue;
                    }

                    // Validate receives the page's own URI alongside the raw href;
                    // a null result means the link is skipped.
                    Uri uri = LinkValidation.Validate(link, nextTarget.AbsoluteUri);
                    if (uri == null)
                    {
                        continue;
                    }

                    ScrapedUri scrapeUri = new ScrapedUri
                    {
                        AbsoluteUri = uri.AbsoluteUri,
                        Scheme = uri.Scheme,
                        Host = uri.Host,
                        QueryParams = uri.Query,
                        // The file type is derived from the last path segment.
                        FileType = GetFileType(uri.Segments[uri.Segments.Length - 1]),
                        ScrapeDataTime = DateTime.UtcNow
                    };

                    dbContext.Add(scrapeUri);
                    Console.WriteLine("Adding:" + scrapeUri.AbsoluteUri);
                }

                // Only a page that was actually returned marks the target as scraped.
                nextTarget.Scraped = true;
            }

            // Count the attempt even when no page was returned, so the
            // ScrapeAttempts < 3 filter eventually stops selecting this row.
            nextTarget.ScrapeAttempts++;
            await dbContext.SaveChangesAsync();
        }
    }

    Console.WriteLine("Scraper Finished");
}
/// <summary>
/// Creates the scraper and, when the <c>ScrapeUri</c> set is empty, seeds it with
/// the <c>http://demo.com</c> start point.
/// </summary>
public Scraper()
{
    using (var dbContext = new ScraperContext())
    {
        // Any() only has to establish whether a row exists, unlike Count().
        if (!dbContext.ScrapeUri.Any())
        {
            Console.WriteLine("Adding start point");

            ScrapedUri scrapeUri = new ScrapedUri
            {
                AbsoluteUri = "http://demo.com",
                Scheme = "http",
                Host = "demo.com",
                QueryParams = ""
            };

            dbContext.Add(scrapeUri);

            // Persist the seed row; without this call the added entity is dropped
            // when the context is disposed.
            dbContext.SaveChanges();
        }
    }
}