/// <summary>
/// Extracts data from a crawled page and queues the entries that are not yet stored.
/// The daily schedule page yields <c>WebToon</c> rows, a "/list?title_no" page yields
/// <c>Episode</c> rows, and a "viewer" page yields a single <c>Page</c> row. Pending
/// changes are saved once at the end, whichever kind of page was processed.
/// </summary>
/// <param name="document">Parsed HTML of the crawled page.</param>
/// <param name="requestUri">Address the document was fetched from; selects the page kind.</param>
public async Task ProcessPageContent(IHtmlDocument document, Uri requestUri)
{
    var uri = requestUri.ToString();

    if (uri == "https://www.webtoons.com/en/dailySchedule")
    {
        await AddNewWebToonsAsync(document);
    }
    else if (uri.Contains("/list?title_no"))
    {
        await AddNewEpisodesAsync(document, requestUri);
    }
    else if (uri.Contains("viewer"))
    {
        await AddPageIfNewAsync(document, uri);
    }

    // Saved unconditionally, also for pages that matched none of the branches.
    await _context.SaveChangesAsync();
}

/// <summary>
/// Queues a <c>WebToon</c> for every ".daily_card_item" whose title number is not stored yet.
/// Cards with missing elements or unparsable links are skipped instead of aborting the page.
/// </summary>
private async Task AddNewWebToonsAsync(IHtmlDocument document)
{
    foreach (var comicCard in document.QuerySelectorAll(".daily_card_item"))
    {
        var link = comicCard.GetAttribute("href");
        var author = comicCard.QuerySelector(".author")?.InnerHtml;
        var genre = comicCard.QuerySelector(".genre")?.InnerHtml;
        var subject = comicCard.QuerySelector(".subj")?.InnerHtml;
        var imageLink = GetPathAndQueryOrNull(comicCard.QuerySelector("img")?.GetAttribute("src"));

        // Uri.TryCreate returns false for a null or non-absolute link instead of throwing.
        if (!Uri.TryCreate(link, UriKind.Absolute, out var linkUri)
            || author == null || genre == null || subject == null || imageLink == null)
        {
            Console.WriteLine("Skipping malformed comic card {0}", link);
            continue;
        }

        var titleNo = HttpUtility.ParseQueryString(linkUri.Query).Get("title_no");
        if (string.IsNullOrEmpty(titleNo))
        {
            // The title number is the lookup key below; without it the card cannot be stored.
            Console.WriteLine("Skipping comic card without title_no {0}", link);
            continue;
        }

        var existingWeb = await _context.WebToons.FindAsync(titleNo);
        if (existingWeb != null)
        {
            continue;
        }

        await _context.WebToons.AddAsync(new WebToon
        {
            Author = author,
            ContentHash = "",
            Genre = genre,
            ImageLink = imageLink,
            Subject = subject,
            TitleNo = titleNo,
            Updated = DateTime.UtcNow.ToString(Tools.StrDateTimeFormat),
            Uri = link
        });
    }
}

/// <summary>
/// Queues an <c>Episode</c> for every "#_listUl li" item whose link hash is not stored yet.
/// Items with missing elements or an unparsable thumbnail address are skipped.
/// </summary>
private async Task AddNewEpisodesAsync(IHtmlDocument document, Uri requestUri)
{
    // Same for every item on the page, so it is read from the request address once.
    var titleNo = HttpUtility.ParseQueryString(requestUri.Query).Get("title_no");

    var list = document.QuerySelectorAll("#_listUl li");
    Console.WriteLine(list.Length);

    foreach (var item in list)
    {
        var episodeLink = item.QuerySelector("a")?.GetAttribute("href");
        var episodeDate = item.QuerySelector("a .date")?.InnerHtml;
        var episodeName = item.QuerySelector("a .subj")?.InnerHtml;
        var episodeThumbnail = GetPathAndQueryOrNull(item.QuerySelector("a .thmb img")?.GetAttribute("src"));

        if (episodeLink == null || episodeDate == null || episodeName == null || episodeThumbnail == null)
        {
            Console.WriteLine("Skipping malformed episode item {0}", episodeLink);
            continue;
        }

        var episodeLinkHash = Tools.ComputeSha256Hash(episodeLink);
        var existed = await _context.Episodes.FindAsync(episodeLinkHash);
        if (existed != null)
        {
            continue;
        }

        await _context.AddAsync(new Episode
        {
            ContentHash = "",
            EpisodeDate = episodeDate,
            EpisodeLink = episodeLink,
            EpisodeName = episodeName,
            EpisodeThumbnail = episodeThumbnail,
            TitleNo = titleNo,
            Updated = DateTime.UtcNow.ToString(Tools.StrDateTimeFormat),
            EpisodeLinkHash = episodeLinkHash
        });
    }
}

/// <summary>
/// Collects the "data-url" of every ".viewer_img img" element, serializes the resulting
/// paths to JSON and queues a <c>Page</c> unless a page with the same content exists.
/// </summary>
private async Task AddPageIfNewAsync(IHtmlDocument document, string episodeLink)
{
    // Images without a usable absolute "data-url" are left out.
    var contentLinks = document.QuerySelectorAll(".viewer_img img")
        .Select(image => GetPathAndQueryOrNull(image.GetAttribute("data-url")))
        .Where(path => path != null)
        .ToList();
    var content = JsonSerializer.Serialize(contentLinks);

    // NOTE(review): pages are de-duplicated by serialized content, not by EpisodeLinkHash —
    // confirm this is intended.
    // Any() only asks whether a match exists instead of loading every matching row.
    if (_context.Pages.Any(p => p.Content == content))
    {
        return;
    }

    var page = new Page
    {
        Content = content,
        EpisodeLink = episodeLink,
        Updated = DateTime.UtcNow.ToString(Tools.StrDateTimeFormat)
    };
    page.EpisodeLinkHash = Tools.ComputeSha256Hash(page.EpisodeLink);

    await _context.AddAsync(page);
}

/// <summary>
/// Returns the path-and-query part of <paramref name="address"/>, or <c>null</c> when the
/// value is missing or is not an absolute URI.
/// </summary>
private static string GetPathAndQueryOrNull(string address)
{
    return Uri.TryCreate(address, UriKind.Absolute, out var parsed)
        ? parsed.PathAndQuery
        : null;
}
/// <summary>
/// Handles a completed page crawl. Failed or empty responses are reported on the console
/// and ignored. Otherwise the page counter is incremented, the HTML is parsed according to
/// the kind of page (daily schedule, episode list or viewer), entries that are not stored
/// yet are queued, pending changes are saved and the counter is logged.
/// </summary>
/// <param name="e">Crawl result carrying the page address, response and content.</param>
/// <param name="_context">Database context that receives the extracted entities.</param>
private static async Task OnPageCrawlCompleted(PageCrawlCompletedArgs e, ContentContext _context)
{
    var crawledPage = e.CrawledPage;
    var pageAddress = crawledPage.Uri.AbsoluteUri;

    // A missing response compares unequal to OK and is therefore reported as a failure
    // instead of raising a NullReferenceException.
    if (crawledPage.HttpRequestException != null
        || crawledPage.HttpResponseMessage?.StatusCode != HttpStatusCode.OK)
    {
        Console.WriteLine("Crawl of page failed {0}", pageAddress);
        return;
    }

    var rawPageText = crawledPage.Content?.Text;
    if (string.IsNullOrEmpty(rawPageText))
    {
        Console.WriteLine("Page had no content {0}", pageAddress);
        return;
    }

    count++;

    if (pageAddress == "https://www.webtoons.com/en/dailySchedule")
    {
        await SaveScheduleCardsAsync(rawPageText, _context);
    }
    // Matches the test used by ProcessPageContent; a bare "list" would also match any
    // address whose title slug merely contains that word.
    else if (pageAddress.Contains("/list?title_no"))
    {
        await SaveEpisodeListAsync(rawPageText, crawledPage.Uri, _context);
    }
    else if (pageAddress.Contains("viewer"))
    {
        await SaveViewerPageAsync(rawPageText, pageAddress, _context);
    }

    await _context.SaveChangesAsync();
    Program.LogInfo(count.ToString());
}

/// <summary>Creates the HTML parser used to turn raw page text into a document.</summary>
private static IHtmlParser CreateHtmlParser()
{
    return BrowsingContext.New(Configuration.Default).GetService<IHtmlParser>();
}

/// <summary>
/// Returns the path-and-query part of <paramref name="address"/>, or <c>null</c> when the
/// value is missing or is not an absolute URI.
/// </summary>
private static string ToPathAndQueryOrNull(string address)
{
    return Uri.TryCreate(address, UriKind.Absolute, out var parsed)
        ? parsed.PathAndQuery
        : null;
}

/// <summary>
/// Queues a web toon entity for every ".daily_card_item" whose title number is not stored
/// yet. Cards with missing elements or unparsable links are skipped.
/// </summary>
private static async Task SaveScheduleCardsAsync(string rawPageText, ContentContext dbContext)
{
    var document = CreateHtmlParser().ParseDocument(rawPageText);

    foreach (var comicCard in document.QuerySelectorAll(".daily_card_item"))
    {
        var link = comicCard.GetAttribute("href");
        var imageLink = ToPathAndQueryOrNull(comicCard.QuerySelector("img")?.GetAttribute("src"));
        var genre = comicCard.QuerySelector(".genre")?.InnerHtml;
        var subject = comicCard.QuerySelector(".subj")?.InnerHtml;
        var author = comicCard.QuerySelector(".author")?.InnerHtml;

        // Uri.TryCreate returns false for a null or non-absolute link instead of throwing.
        if (!Uri.TryCreate(link, UriKind.Absolute, out var linkUri)
            || imageLink == null || genre == null || subject == null || author == null)
        {
            Console.WriteLine("Skipping malformed comic card {0}", link);
            continue;
        }

        var titleNo = HttpUtility.ParseQueryString(linkUri.Query).Get("title_no");
        if (string.IsNullOrEmpty(titleNo))
        {
            // The title number is the lookup key below; without it the card cannot be stored.
            Console.WriteLine("Skipping comic card without title_no {0}", link);
            continue;
        }

        var cardModel = createWebToonModel(link, titleNo, imageLink, genre, subject, author);
        var cardEntity = WebtoonProfile.MapCreateModelToEntity(cardModel);

        var existingWeb = await dbContext.WebToons.FindAsync(titleNo);
        if (existingWeb != null)
        {
            continue;
        }

        await dbContext.WebToons.AddAsync(cardEntity);
    }
}

/// <summary>
/// Queues an episode entity for every "#_listUl li" item whose link hash is not stored yet.
/// Items with missing elements or an unparsable thumbnail address are skipped.
/// </summary>
private static async Task SaveEpisodeListAsync(string rawPageText, Uri pageUri, ContentContext dbContext)
{
    var document = CreateHtmlParser().ParseDocument(rawPageText);

    // Same for every item on the page, so it is read from the page address once.
    var titleNo = HttpUtility.ParseQueryString(pageUri.Query).Get("title_no");

    var list = document.QuerySelectorAll("#_listUl li");
    Console.WriteLine(list.Length);

    foreach (var item in list)
    {
        var episodeName = item.QuerySelector("a .subj")?.InnerHtml;
        var episodeThumb = ToPathAndQueryOrNull(item.QuerySelector("a .thmb img")?.GetAttribute("src"));
        var episodeDate = item.QuerySelector("a .date")?.InnerHtml;
        var episodeLink = item.QuerySelector("a")?.GetAttribute("href");

        if (episodeName == null || episodeThumb == null || episodeDate == null || episodeLink == null)
        {
            Console.WriteLine("Skipping malformed episode item {0}", episodeLink);
            continue;
        }

        var episodeLinkHash = ComputeSha256Hash(episodeLink);
        var episodeModel = createEpisodeModel(titleNo, episodeName, episodeThumb, episodeDate, episodeLink, episodeLinkHash);
        var episodeEntity = EpisodeProfile.MapCreateModelToEntity(episodeModel);

        var existed = await dbContext.Episodes.FindAsync(episodeLinkHash);
        if (existed != null)
        {
            continue;
        }

        await dbContext.AddAsync(episodeEntity);
    }
}

/// <summary>
/// Collects the "data-url" of every ".viewer_img img" element, serializes the resulting
/// paths to JSON and queues a page entity unless a page with the same content exists.
/// </summary>
private static async Task SaveViewerPageAsync(string rawPageText, string episodeLink, ContentContext dbContext)
{
    var document = CreateHtmlParser().ParseDocument(rawPageText);

    // Images without a usable absolute "data-url" are left out, as ProcessPageContent
    // does; previously such an image made new Uri(null) throw and lost the whole page.
    var contentLinks = document.QuerySelectorAll(".viewer_img img")
        .Select(image => ToPathAndQueryOrNull(image.GetAttribute("data-url")))
        .Where(path => path != null)
        .ToList();

    var episodeLinkHash = ComputeSha256Hash(episodeLink);
    var contentJson = JsonSerializer.Serialize(contentLinks);
    var pageModel = createPageModel(episodeLinkHash, contentJson);
    var pageEntity = PageProfile.MapCreateModelToEntity(pageModel);

    // NOTE(review): pages are de-duplicated by serialized content, not by the link hash —
    // confirm this is intended.
    // Any() only asks whether a match exists instead of loading every matching row.
    if (!dbContext.Pages.Any(p => p.Content == contentJson))
    {
        await dbContext.AddAsync(pageEntity);
    }
}