示例#1
0
        /// <summary>
        /// Requests the page at <paramref name="url"/> and parses the response body,
        /// decoded as UTF-8, into a new <see cref="HtmlDocument"/>.
        /// </summary>
        /// <param name="url">Address of the page to fetch; handed to <c>CreateRequest</c>.</param>
        /// <returns>
        /// The parsed document. If a <see cref="WebException"/> is raised while fetching,
        /// the error is logged and the still-empty document is returned instead.
        /// Other exception types are not caught here and propagate to the caller.
        /// </returns>
        public HtmlDocument LoadDocument(string url)
        {
            var htmlDoc = new HtmlDocument();

            try {
                // Dispose the response itself, not only its stream, so the connection is
                // released even when obtaining or reading the stream fails part-way.
                using (var response = CreateRequest(url).GetResponse())
                using (var responseStream = response.GetResponseStream()) {
                    htmlDoc.Load(responseStream, Encoding.UTF8);
                }
            } catch (WebException webEx) {
                // NOTE(review): the message says "retrying..." but this method makes no
                // retry attempt; it falls through and returns the empty document.
                AppLogHelper.Log(AppLoggerBase.LogTarget.File, "Error connecting to specified URL: " + webEx.Message + ", retrying...");
            }

            return htmlDoc;
        }
示例#2
0
        /// <summary>
        /// Scrapes the listing page by page, starting at page 1. For every row found on a
        /// page it extracts title, URL, author, image path and publishing status, adds a
        /// <c>MangaList</c> entry to <c>ScraperRepo</c>, and saves once per page.
        /// </summary>
        /// <remarks>
        /// The loop ends when a page yields no rows, or after the first page when
        /// <c>HasOnePageOnly()</c> returns true. A row is skipped (and the reason logged)
        /// as soon as any one of its five fields comes back <c>null</c>.
        /// </remarks>
        protected virtual void StartScraping()
        {
            AppLogHelper.Log(AppLoggerBase.LogTarget.File, "Started WebScraper @(" + BaseURL + ")...");

            for (var nextPage = 1; ; nextPage++)
            {
                var nextURL = CreateNextURL(nextPage);
                var doc     = HtmlLoader.LoadDocument(nextURL);

                AppLogHelper.Log(AppLoggerBase.LogTarget.File, "Processing page [" + nextPage + "] @(" + nextURL + ")");

                // Materialize once: the rows are counted and then iterated, and a deferred
                // sequence would otherwise be evaluated twice.
                var rows     = GetMangaRows(doc).ToList();
                var rowCount = rows.Count;

                AppLogHelper.Log(AppLoggerBase.LogTarget.File, "[" + rowCount + "] rows found. Processing rows...");

                if (rowCount == 0)
                {
                    AppLogHelper.Log(AppLoggerBase.LogTarget.File, "No more titles found, exiting main loop...");
                    break;
                }

                // Number of entries actually handed to the repository for this page;
                // skipped rows must not be reported as added.
                var addedCount = 0;

                foreach (var row in rows)
                {
                    var title = GetMangaTitle(row);
                    if (title == null)
                    {
                        AppLogHelper.Log(AppLoggerBase.LogTarget.File, "Failed in extracting title, skipping...");
                        continue;
                    }

                    var titleURL = GetMangaURL(row);
                    if (titleURL == null)
                    {
                        AppLogHelper.Log(AppLoggerBase.LogTarget.File, "Failed in extracting URL, skipping...");
                        continue;
                    }

                    var author = GetMangaAuthor(row);
                    if (author == null)
                    {
                        AppLogHelper.Log(AppLoggerBase.LogTarget.File, "Failed in extracting author, skipping...");
                        continue;
                    }

                    var imagePath = GetMangaImagePath(row);
                    if (imagePath == null)
                    {
                        AppLogHelper.Log(AppLoggerBase.LogTarget.File, "Failed in extracting image URL, skipping...");
                        continue;
                    }

                    var pubStatus = GetMangaPublishingStatus(row);
                    if (pubStatus == null)
                    {
                        AppLogHelper.Log(AppLoggerBase.LogTarget.File, "Failed in extracting publishing status, skipping...");
                        continue;
                    }

                    AppLogHelper.Log(AppLoggerBase.LogTarget.File, title + ", " + titleURL);

                    var mangaEntry = new MangaList {
                        Title     = title,
                        Site      = titleURL,
                        Author    = author,
                        ImagePath = imagePath,
                        PubStatus = pubStatus
                    };

                    ScraperRepo.AddEntry(mangaEntry);
                    addedCount++;
                }

                // Persist the whole page in one call rather than once per row.
                ScraperRepo.SaveChanges();
                AppLogHelper.Log(AppLoggerBase.LogTarget.File, "Successfully added [" + addedCount + "] records to repository");

                var hasOnePageOnly = HasOnePageOnly();
                if (hasOnePageOnly)
                {
                    break;
                }

                AppLogHelper.Log(AppLoggerBase.LogTarget.File, "Finished scraping page [" + nextPage + "]");
            }
            AppLogHelper.Log(AppLoggerBase.LogTarget.File, "Finished scraping @(" + BaseURL + ")");
        }