/// <summary>
/// Renders every link stored in <c>CrawledLinks</c> in a Chromium instance,
/// extracts the inner HTML of the <c>.sidebar_1</c> element and stores it as
/// a page record, saving after each article.
/// </summary>
/// <returns>A task that completes when all links have been processed.</returns>
public static async Task Crawler()
{
    // Download chromium browser revision package
    await new BrowserFetcher().DownloadAsync(ChromiumRevision);
    using var browser = await Puppeteer.LaunchAsync(new LaunchOptions
    {
        // Switch to "Headless = true" to run without a visible browser window.
        Headless = false
    });

    var optionsBuilder = new DbContextOptionsBuilder<ContentContext>();
    var options = optionsBuilder.UseSqlite(
        "Data Source=D:\\WebCrawlerPrj\\Crawler\\DB\\content.db;",
        providerOptions => providerOptions.CommandTimeout(60));
    await using var _context = new ContentContext(options.Options);

    // Only the id and URL are needed, so project them instead of loading full entities.
    var articles = _context.CrawledLinks.Select(c => new { c.Id, c.Url }).ToList();

    // The parser does not depend on the article, so build it once for the whole run.
    var context = BrowsingContext.New(Configuration.Default);
    var parser = context.GetService<IHtmlParser>();

    foreach (var article in articles)
    {
        var page = await browser.NewPageAsync();
        try
        {
            var pageContent = await GetPageContent(page, article.Url, 1024, 768);
            var document = parser.ParseDocument(pageContent);

            // QuerySelector returns null when nothing matches; skip the article
            // instead of failing the whole run with a NullReferenceException.
            var sidebar = document.QuerySelector(".sidebar_1");
            if (sidebar == null)
            {
                Console.WriteLine("No .sidebar_1 element found on {0}", article.Url);
                continue;
            }

            var articleModel = CreatePageModel(article.Url, sidebar.InnerHtml, article.Id);
            var articleEntity = PageProfile.MapCreateModelToEntity(articleModel);
            await _context.Pages.AddAsync(articleEntity);
            await _context.SaveChangesAsync();
        }
        finally
        {
            // Always close the tab, even when rendering, parsing or saving throws,
            // so a failure does not leave an open page behind.
            await page.CloseAsync();
        }
    }

    await browser.CloseAsync();
}
/// <summary>
/// Handles a completed page crawl. Failed or empty pages are logged and ignored.
/// Otherwise the page is dispatched on its URL: the daily schedule page yields
/// webtoon records, URLs containing "list" yield episode records, and URLs
/// containing "viewer" yield a page record holding the image links as JSON.
/// Pending inserts are saved once at the end and the running counter is logged.
/// </summary>
/// <param name="e">Crawl event arguments carrying the crawled page.</param>
/// <param name="_context">Database context used for lookups and inserts.</param>
private static async Task OnPageCrawlCompleted(PageCrawlCompletedArgs e, ContentContext _context)
{
    var crawledPage = e.CrawledPage;
    var pageUrl = crawledPage.Uri.AbsoluteUri;

    // "?." makes a missing response compare unequal to OK, so it is reported as a
    // failed crawl instead of throwing a NullReferenceException.
    if (crawledPage.HttpRequestException != null
        || crawledPage.HttpResponseMessage?.StatusCode != HttpStatusCode.OK)
    {
        Console.WriteLine("Crawl of page failed {0}", pageUrl);
        return;
    }

    var rawPageText = crawledPage.Content?.Text;
    if (string.IsNullOrEmpty(rawPageText))
    {
        Console.WriteLine("Page had no content {0}", pageUrl);
        return;
    }

    // NOTE(review): plain increment of shared state; confirm this handler is never
    // invoked concurrently.
    count++;

    if (pageUrl == "https://www.webtoons.com/en/dailySchedule")
    {
        await SaveScheduleCardsAsync(rawPageText, _context);
    }
    else if (pageUrl.Contains("list"))
    {
        await SaveEpisodeListAsync(rawPageText, pageUrl, _context);
    }
    else if (pageUrl.Contains("viewer"))
    {
        await SaveViewerPageAsync(rawPageText, pageUrl, _context);
    }

    await _context.SaveChangesAsync();
    Program.LogInfo(count.ToString());
}

/// <summary>
/// Parses the daily schedule page and queues an insert for every comic card whose
/// <c>title_no</c> is not already known. Malformed cards are logged and skipped.
/// </summary>
private static async Task SaveScheduleCardsAsync(string html, ContentContext dbContext)
{
    var browsingContext = BrowsingContext.New(Configuration.Default);
    var parser = browsingContext.GetService<IHtmlParser>();
    var document = parser.ParseDocument(html);

    foreach (var card in document.QuerySelectorAll(".daily_card_item"))
    {
        var imageSrc = card.QuerySelector("img")?.GetAttribute("src");
        var link = card.GetAttribute("href");
        var genre = card.QuerySelector(".genre")?.InnerHtml;
        var subject = card.QuerySelector(".subj")?.InnerHtml;
        var author = card.QuerySelector(".author")?.InnerHtml;

        // A missing element or non-absolute URL skips only this card; previously it
        // threw and discarded every card already queued for the page.
        if (!Uri.TryCreate(imageSrc, UriKind.Absolute, out var imageUri)
            || !Uri.TryCreate(link, UriKind.Absolute, out var linkUri)
            || genre == null || subject == null || author == null)
        {
            Console.WriteLine("Skipping malformed schedule card {0}", link);
            continue;
        }

        var titleNo = HttpUtility.ParseQueryString(linkUri.Query).Get("title_no");
        if (string.IsNullOrEmpty(titleNo))
        {
            Console.WriteLine("Skipping schedule card without title_no {0}", link);
            continue;
        }

        // Check for an existing record first so duplicates cost no model/entity mapping.
        var existingWebtoon = await dbContext.WebToons.FindAsync(titleNo);
        if (existingWebtoon != null)
        {
            continue;
        }

        var cardModel = createWebToonModel(link, titleNo, imageUri.PathAndQuery, genre, subject, author);
        var cardEntity = WebtoonProfile.MapCreateModelToEntity(cardModel);
        await dbContext.WebToons.AddAsync(cardEntity);
    }
}

/// <summary>
/// Parses an episode list page and queues an insert for every episode whose link
/// hash is not already known. Malformed list items are logged and skipped.
/// </summary>
private static async Task SaveEpisodeListAsync(string html, string pageUrl, ContentContext dbContext)
{
    var browsingContext = BrowsingContext.New(Configuration.Default);
    var parser = browsingContext.GetService<IHtmlParser>();
    var document = parser.ParseDocument(html);

    var items = document.QuerySelectorAll("#_listUl li");
    Console.WriteLine(items.Length);

    // The title number comes from the page URL, so it is the same for every item.
    var titleNo = HttpUtility.ParseQueryString(new Uri(pageUrl).Query).Get("title_no");

    foreach (var item in items)
    {
        var episodeName = item.QuerySelector("a .subj")?.InnerHtml;
        var thumbSrc = item.QuerySelector("a .thmb img")?.GetAttribute("src");
        var episodeDate = item.QuerySelector("a .date")?.InnerHtml;
        var episodeLink = item.QuerySelector("a")?.GetAttribute("href");

        if (episodeName == null || episodeDate == null || string.IsNullOrEmpty(episodeLink)
            || !Uri.TryCreate(thumbSrc, UriKind.Absolute, out var thumbUri))
        {
            Console.WriteLine("Skipping malformed episode item on {0}", pageUrl);
            continue;
        }

        var episodeLinkHash = ComputeSha256Hash(episodeLink);
        var existingEpisode = await dbContext.Episodes.FindAsync(episodeLinkHash);
        if (existingEpisode != null)
        {
            continue;
        }

        var episodeModel = createEpisodeModel(
            titleNo, episodeName, thumbUri.PathAndQuery, episodeDate, episodeLink, episodeLinkHash);
        var episodeEntity = EpisodeProfile.MapCreateModelToEntity(episodeModel);
        await dbContext.AddAsync(episodeEntity);
    }
}

/// <summary>
/// Parses a viewer page, serialises the path-and-query of each image's
/// <c>data-url</c> attribute to JSON and queues a page insert unless a page with
/// identical content already exists.
/// </summary>
private static async Task SaveViewerPageAsync(string html, string pageUrl, ContentContext dbContext)
{
    var browsingContext = BrowsingContext.New(Configuration.Default);
    var parser = browsingContext.GetService<IHtmlParser>();
    var document = parser.ParseDocument(html);

    // Images without a valid absolute data-url are ignored rather than aborting the page.
    var contentLinks = document.QuerySelectorAll(".viewer_img img")
        .Select(img => Uri.TryCreate(img.GetAttribute("data-url"), UriKind.Absolute, out var imageUri)
            ? imageUri.PathAndQuery
            : null)
        .Where(path => path != null)
        .ToList();

    var contentJson = JsonSerializer.Serialize(contentLinks);

    // Only existence matters, so ask the database for a boolean instead of loading
    // every matching row.
    var alreadyStored = await dbContext.Pages.AnyAsync(p => p.Content == contentJson);
    if (alreadyStored)
    {
        return;
    }

    var episodeLinkHash = ComputeSha256Hash(pageUrl);
    var pageModel = createPageModel(episodeLinkHash, contentJson);
    var pageEntity = PageProfile.MapCreateModelToEntity(pageModel);
    await dbContext.AddAsync(pageEntity);
}