/// <summary>
/// Extracts records from a crawled webtoons.com document and stores the ones that are
/// not persisted yet. The kind of record depends on the request URI: the daily schedule
/// page yields <c>WebToon</c> rows, a <c>/list?title_no</c> page yields <c>Episode</c>
/// rows and a <c>viewer</c> page yields a single <c>Page</c> row.
/// </summary>
/// <param name="document">Parsed HTML of the crawled page.</param>
/// <param name="requestUri">Address the document was fetched from; selects the extraction branch.</param>
/// <returns>A task that completes after <c>SaveChangesAsync</c> has run on the context.</returns>
public async Task ProcessPageContent(IHtmlDocument document, Uri requestUri)
        {
            var uri = requestUri.ToString();

            if (uri == "https://www.webtoons.com/en/dailySchedule")
            {
                await AddNewWebToonsAsync(document);
            }
            else if (uri.Contains("/list?title_no"))
            {
                await AddNewEpisodesAsync(document, requestUri);
            }
            else if (uri.Contains("viewer"))
            {
                await AddPageIfNewAsync(document, uri);
            }

            // Saved unconditionally, including for URIs that matched no branch.
            await _context.SaveChangesAsync();
        }

        /// <summary>
        /// Adds one <c>WebToon</c> per <c>.daily_card_item</c> element whose title number
        /// is not already known to the context.
        /// </summary>
        private async Task AddNewWebToonsAsync(IHtmlDocument document)
        {
            foreach (var comicCard in document.QuerySelectorAll(".daily_card_item"))
            {
                var link    = comicCard.GetAttribute("href");
                var webToon = new WebToon
                {
                    Author      = comicCard.QuerySelector(".author").InnerHtml,
                    ContentHash = "",
                    Genre       = comicCard.QuerySelector(".genre").InnerHtml,
                    ImageLink   = new Uri(comicCard.QuerySelector("img").GetAttribute("src")).PathAndQuery,
                    Subject     = comicCard.QuerySelector(".subj").InnerHtml,
                    TitleNo     = HttpUtility.ParseQueryString(new Uri(link).Query).Get("title_no"),
                    Updated     = DateTime.UtcNow.ToString(Tools.StrDateTimeFormat),
                    // Reuse the attribute value read above instead of querying the element again.
                    Uri         = link
                };

                var existingWeb = await _context.WebToons.FindAsync(webToon.TitleNo);
                if (existingWeb == null)
                {
                    await _context.WebToons.AddAsync(webToon);
                }
            }
        }

        /// <summary>
        /// Adds one <c>Episode</c> per <c>#_listUl li</c> element whose link hash is not
        /// already known to the context.
        /// </summary>
        private async Task AddNewEpisodesAsync(IHtmlDocument document, Uri requestUri)
        {
            var list = document.QuerySelectorAll("#_listUl li");
            Console.WriteLine(list.Length);

            // The title number comes from the request URI, not from the individual item,
            // so it only needs to be parsed once per page.
            var titleNo = HttpUtility.ParseQueryString(requestUri.Query).Get("title_no");

            foreach (var item in list)
            {
                var episode = new Episode
                {
                    ContentHash      = "",
                    EpisodeDate      = item.QuerySelector("a .date").InnerHtml,
                    EpisodeLink      = item.QuerySelector("a").GetAttribute("href"),
                    EpisodeName      = item.QuerySelector("a .subj").InnerHtml,
                    EpisodeThumbnail = new Uri(item.QuerySelector("a .thmb img").GetAttribute("src")).PathAndQuery,
                    TitleNo          = titleNo,
                    Updated          = DateTime.UtcNow.ToString(Tools.StrDateTimeFormat)
                };
                episode.EpisodeLinkHash = Tools.ComputeSha256Hash(episode.EpisodeLink);

                var existed = await _context.Episodes.FindAsync(episode.EpisodeLinkHash);
                if (existed == null)
                {
                    await _context.AddAsync(episode);
                }
            }
        }

        /// <summary>
        /// Serializes the path-and-query of every <c>.viewer_img img</c> element that has a
        /// <c>data-url</c> attribute and adds a <c>Page</c> for it unless a page with the
        /// same serialized content already exists.
        /// </summary>
        private async Task AddPageIfNewAsync(IHtmlDocument document, string uri)
        {
            var contentLinks = document.QuerySelectorAll(".viewer_img img")
                                       .Select(img => img.GetAttribute("data-url"))
                                       .Where(dataUrl => dataUrl != null)
                                       .Select(dataUrl => new Uri(dataUrl).PathAndQuery)
                                       .ToList();

            var page = new Page
            {
                Content     = JsonSerializer.Serialize(contentLinks),
                EpisodeLink = uri,
                Updated     = DateTime.UtcNow.ToString(Tools.StrDateTimeFormat)
            };
            page.EpisodeLinkHash = Tools.ComputeSha256Hash(page.EpisodeLink);

            // Only existence matters, so ask for that instead of loading every matching row.
            // NOTE(review): this is still a synchronous query; AnyAsync would avoid blocking
            // if the Microsoft.EntityFrameworkCore namespace is imported in this file.
            var content = page.Content;
            if (!_context.Pages.Any(p => p.Content == content))
            {
                await _context.AddAsync(page);
            }
        }
Beispiel #2
0
        /// <summary>
        /// Handles a completed page crawl. Failed or empty pages are reported on the console
        /// and ignored. Otherwise the page is parsed according to its URL: the daily schedule
        /// page yields web toon entities, a <c>/list?title_no</c> page yields episode entities
        /// and a <c>viewer</c> page yields a single page entity. Entities that are not stored
        /// yet are added to the context and the changes are saved.
        /// </summary>
        /// <param name="e">Crawl result carrying the crawled page, its response and its content.</param>
        /// <param name="_context">Database context used for look-ups and inserts.</param>
        /// <returns>A task that completes once changes are saved and the running count is logged.</returns>
        private static async Task OnPageCrawlCompleted(PageCrawlCompletedArgs e, ContentContext _context)
        {
            var crawledPage = e.CrawledPage;
            var pageUri     = crawledPage.Uri.AbsoluteUri;

            if (crawledPage.HttpRequestException != null ||
                crawledPage.HttpResponseMessage.StatusCode != HttpStatusCode.OK)
            {
                Console.WriteLine("Crawl of page failed {0}", pageUri);
                return;
            }

            var rawPageText = crawledPage.Content.Text;
            if (string.IsNullOrEmpty(rawPageText))
            {
                Console.WriteLine("Page had no content {0}", pageUri);
                return;
            }

            count++;

            if (pageUri == "https://www.webtoons.com/en/dailySchedule")
            {
                var context    = BrowsingContext.New(Configuration.Default);
                var parser     = context.GetService <IHtmlParser>();
                var document   = parser.ParseDocument(rawPageText);
                var comicCards = document.QuerySelectorAll(".daily_card_item");

                foreach (var comicCard in comicCards)
                {
                    var imageLink = new Uri(comicCard.QuerySelector("img").GetAttribute("src")).PathAndQuery;
                    var link      = comicCard.GetAttribute("href");
                    var titleNo   = HttpUtility.ParseQueryString(new Uri(link).Query).Get("title_no");
                    var genre     = comicCard.QuerySelector(".genre").InnerHtml;
                    var subject   = comicCard.QuerySelector(".subj").InnerHtml;
                    var author    = comicCard.QuerySelector(".author").InnerHtml;

                    var cardModel   = createWebToonModel(link, titleNo, imageLink, genre, subject, author);
                    var cardEntity  = WebtoonProfile.MapCreateModelToEntity(cardModel);
                    var existingWeb = await _context.WebToons.FindAsync(titleNo);

                    if (existingWeb == null)
                    {
                        await _context.WebToons.AddAsync(cardEntity);
                    }
                }
            }
            // Match the "/list?title_no" path-and-query marker rather than the bare word
            // "list", so that a viewer URL that merely contains "list" somewhere (for
            // example in a title slug) still reaches the viewer branch below.
            else if (pageUri.Contains("/list?title_no"))
            {
                var context  = BrowsingContext.New(Configuration.Default);
                var parser   = context.GetService <IHtmlParser>();
                var document = parser.ParseDocument(rawPageText);
                var list     = document.QuerySelectorAll("#_listUl li");
                Console.WriteLine(list.Length);

                // The title number comes from the page URL, not from the individual item,
                // so it is parsed once instead of once per list entry.
                var titleNo = HttpUtility.ParseQueryString(crawledPage.Uri.Query).Get("title_no");

                foreach (var item in list)
                {
                    var episodeName     = item.QuerySelector("a .subj").InnerHtml;
                    var episodeThumb    = new Uri(item.QuerySelector("a .thmb img").GetAttribute("src")).PathAndQuery;
                    var episodeDate     = item.QuerySelector("a .date").InnerHtml;
                    var episodeLink     = item.QuerySelector("a").GetAttribute("href");
                    var episodeLinkHash = ComputeSha256Hash(episodeLink);
                    var episodeModel    = createEpisodeModel(titleNo, episodeName, episodeThumb, episodeDate, episodeLink,
                                                             episodeLinkHash);
                    var episodeEntity = EpisodeProfile.MapCreateModelToEntity(episodeModel);
                    var existed       = await _context.Episodes.FindAsync(episodeLinkHash);

                    if (existed == null)
                    {
                        await _context.AddAsync(episodeEntity);
                    }
                }
            }
            else if (pageUri.Contains("viewer"))
            {
                var context  = BrowsingContext.New(Configuration.Default);
                var parser   = context.GetService <IHtmlParser>();
                var document = parser.ParseDocument(rawPageText);

                // Images without a data-url attribute are skipped; passing the resulting
                // null to the Uri constructor would throw and abort the whole page.
                var contentLinks = document.QuerySelectorAll(".viewer_img img")
                                           .Select(img => img.GetAttribute("data-url"))
                                           .Where(dataUrl => dataUrl != null)
                                           .Select(dataUrl => new Uri(dataUrl).PathAndQuery)
                                           .ToList();

                var episodeLinkHash = ComputeSha256Hash(pageUri);
                var contentJson     = JsonSerializer.Serialize(contentLinks);
                var pageModel       = createPageModel(episodeLinkHash, contentJson);
                var pageEntity      = PageProfile.MapCreateModelToEntity(pageModel);

                // Only existence matters, so ask for that instead of loading every matching row.
                if (!_context.Pages.Any(p => p.Content == contentJson))
                {
                    await _context.AddAsync(pageEntity);
                }
            }

            // Saved unconditionally, including for URLs that matched no branch.
            await _context.SaveChangesAsync();

            Program.LogInfo(count.ToString());
        }