/// <summary>
/// Requests the page at <paramref name="url"/> and loads the response body into a new
/// <c>HtmlDocument</c>, decoding it as UTF-8.
/// </summary>
/// <param name="url">Address handed to <c>CreateRequest</c> to build the web request.</param>
/// <returns>
/// The loaded document. If a <see cref="WebException"/> is raised, the failure is written to
/// the file log and the document is returned as constructed, with nothing loaded into it.
/// </returns>
public HtmlDocument LoadDocument(string url)
{
    var htmlDoc = new HtmlDocument();

    try
    {
        // Dispose the response as well as its stream: if GetResponseStream() throws, the
        // stream's using block is never entered and the response would otherwise stay open.
        using (var response = CreateRequest(url).GetResponse())
        using (var responseStream = response.GetResponseStream())
        {
            htmlDoc.Load(responseStream, Encoding.UTF8);
        }
    }
    catch (WebException webEx)
    {
        // NOTE(review): the message says "retrying...", but this method itself does not
        // retry; confirm whether a caller is expected to.
        AppLogHelper.Log(AppLoggerBase.LogTarget.File, "Error connecting to specified URL: " + webEx.Message + ", retrying...");
    }

    return htmlDoc;
}
/// <summary>
/// Walks the site page by page, starting at page 1, and adds one <c>MangaList</c> entry to
/// <c>ScraperRepo</c> for every row whose fields could all be extracted.
/// </summary>
/// <remarks>
/// The loop stops when a page yields no rows, or after the first page when
/// <c>HasOnePageOnly()</c> returns true. A row is logged and skipped as soon as any of its
/// title, URL, author, image path or publishing status comes back null. Repository changes
/// are saved once per page.
/// </remarks>
protected virtual void StartScraping()
{
    AppLogHelper.Log(AppLoggerBase.LogTarget.File, "Started WebScraper @(" + BaseURL + ")...");

    for (var nextPage = 1; ; nextPage++)
    {
        var nextURL = CreateNextURL(nextPage);
        var doc = HtmlLoader.LoadDocument(nextURL);
        AppLogHelper.Log(AppLoggerBase.LogTarget.File, "Processing page [" + nextPage + "] @(" + nextURL + ")");

        // Materialize the rows once: they are both counted and iterated below, and
        // enumerating a deferred sequence twice would run the underlying query twice.
        // NOTE(review): GetMangaRows is not visible here; a null result is treated as
        // "no rows" instead of throwing while counting - confirm that is acceptable.
        var rowSource = GetMangaRows(doc);
        var rows = rowSource == null ? null : rowSource.ToList();
        var rowCount = rows == null ? 0 : rows.Count;
        AppLogHelper.Log(AppLoggerBase.LogTarget.File, "[" + rowCount + "] rows found. Processing rows...");

        if (rowCount == 0)
        {
            AppLogHelper.Log(AppLoggerBase.LogTarget.File, "No more titles found, exiting main loop...");
            break;
        }

        // Rows that fail extraction are skipped, so the number of stored entries can be
        // lower than rowCount; track it separately for the summary log.
        var addedCount = 0;

        foreach (var row in rows)
        {
            var title = GetMangaTitle(row);
            if (title == null)
            {
                AppLogHelper.Log(AppLoggerBase.LogTarget.File, "Failed in extracting title, skipping...");
                continue;
            }

            var titleURL = GetMangaURL(row);
            if (titleURL == null)
            {
                AppLogHelper.Log(AppLoggerBase.LogTarget.File, "Failed in extracting URL, skipping...");
                continue;
            }

            var author = GetMangaAuthor(row);
            if (author == null)
            {
                AppLogHelper.Log(AppLoggerBase.LogTarget.File, "Failed in extracting author, skipping...");
                continue;
            }

            var imagePath = GetMangaImagePath(row);
            if (imagePath == null)
            {
                AppLogHelper.Log(AppLoggerBase.LogTarget.File, "Failed in extracting image URL, skipping...");
                continue;
            }

            var pubStatus = GetMangaPublishingStatus(row);
            if (pubStatus == null)
            {
                AppLogHelper.Log(AppLoggerBase.LogTarget.File, "Failed in extracting publishing status, skipping...");
                continue;
            }

            AppLogHelper.Log(AppLoggerBase.LogTarget.File, title + ", " + titleURL);

            var mangaEntry = new MangaList
            {
                Title = title,
                Site = titleURL,
                Author = author,
                ImagePath = imagePath,
                PubStatus = pubStatus
            };
            ScraperRepo.AddEntry(mangaEntry);
            addedCount++;
        }

        ScraperRepo.SaveChanges();
        AppLogHelper.Log(AppLoggerBase.LogTarget.File, "Successfully added [" + addedCount + "] records to repository");

        // Single-page sites are done after the first pass; this exit skips the
        // per-page "Finished scraping page" log below.
        if (HasOnePageOnly())
        {
            break;
        }

        AppLogHelper.Log(AppLoggerBase.LogTarget.File, "Finished scraping page [" + nextPage + "]");
    }

    AppLogHelper.Log(AppLoggerBase.LogTarget.File, "Finished scraping @(" + BaseURL + ")");
}