示例#1
0
        /// <summary>
        /// Renders every stored crawled link in a Chromium browser, extracts the inner HTML of the
        /// first <c>.sidebar_1</c> element and stores it as a page entity.
        /// </summary>
        /// <remarks>
        /// Changes are saved after each article, so articles stored before a failure are kept.
        /// Articles whose rendered HTML has no <c>.sidebar_1</c> element are logged and skipped.
        /// </remarks>
        /// <returns>A task that completes when all links have been processed.</returns>
        public static async Task Crawler()
        {
            // Make sure the Chromium revision identified by ChromiumRevision is available locally.
            await new BrowserFetcher().DownloadAsync(ChromiumRevision);
            using var browser = await Puppeteer.LaunchAsync(new LaunchOptions
            {
                // Headless = true
                Headless = false
            });

            var dbOptions = new DbContextOptionsBuilder<ContentContext>()
                            .UseSqlite("Data Source=D:\\WebCrawlerPrj\\Crawler\\DB\\content.db;",
                                       providerOptions => providerOptions.CommandTimeout(60))
                            .Options;

            await using var db = new ContentContext(dbOptions);

            // Only the id and URL are needed, so project them instead of loading whole entities.
            var articles = db.CrawledLinks.Select(c => new { c.Id, c.Url }).ToList();

            // The parser does not depend on the article, so build it once for the whole run.
            var parser = BrowsingContext.New(Configuration.Default).GetService<IHtmlParser>();

            foreach (var article in articles)
            {
                var page = await browser.NewPageAsync();
                try
                {
                    // NOTE(review): 1024 and 768 are presumably the viewport width/height - confirm in GetPageContent.
                    var pageContent = await GetPageContent(page, article.Url, 1024, 768);
                    var document    = parser.ParseDocument(pageContent);

                    // QuerySelector returns null when nothing matches; skip instead of crashing the run.
                    var sidebar = document.QuerySelector(".sidebar_1");
                    if (sidebar == null)
                    {
                        System.Console.WriteLine("No .sidebar_1 element found at {0}", article.Url);
                        continue;
                    }

                    var articleModel  = CreatePageModel(article.Url, sidebar.InnerHtml, article.Id);
                    var articleEntity = PageProfile.MapCreateModelToEntity(articleModel);
                    await db.Pages.AddAsync(articleEntity);

                    await db.SaveChangesAsync();
                }
                finally
                {
                    // Always release the tab, even when loading, parsing or saving throws.
                    await page.CloseAsync();
                }
            }
            await browser.CloseAsync();
        }
示例#2
0
        /// <summary>
        /// Handles a completed page crawl by extracting data from the page HTML and queuing new
        /// entities on <paramref name="_context"/>, then saving them in a single batch.
        /// </summary>
        /// <remarks>
        /// Three kinds of page are recognised by URL:
        /// the daily schedule page (one web-toon per <c>.daily_card_item</c>),
        /// URLs containing "list" (one episode per <c>#_listUl li</c>), and
        /// URLs containing "viewer" (the <c>data-url</c> of every <c>.viewer_img img</c>, stored as JSON).
        /// Failed or empty pages are logged and ignored. Nodes that lack a usable link or image
        /// URL are logged and skipped so that one malformed node does not discard the whole page.
        /// </remarks>
        /// <param name="e">Event data carrying the crawled page.</param>
        /// <param name="_context">Database context used for duplicate checks and inserts.</param>
        /// <returns>A task that completes once pending changes have been saved.</returns>
        private static async Task OnPageCrawlCompleted(PageCrawlCompletedArgs e, ContentContext _context)
        {
            var crawledPage = e.CrawledPage;
            var pageUrl     = crawledPage.Uri.AbsoluteUri;

            // A missing response is treated like a non-OK status instead of dereferencing null.
            if (crawledPage.HttpRequestException != null ||
                crawledPage.HttpResponseMessage?.StatusCode != HttpStatusCode.OK)
            {
                Console.WriteLine("Crawl of page failed {0}", pageUrl);
                return;
            }

            var rawPageText = crawledPage.Content?.Text;
            if (string.IsNullOrEmpty(rawPageText))
            {
                Console.WriteLine("Page had no content {0}", pageUrl);
                return;
            }

            count++;

            // Returns the path-and-query part of an absolute URL, or null when the value is
            // missing or not an absolute URL (the cases in which new Uri(string) would throw).
            string ToPathAndQuery(string rawUrl) =>
                Uri.TryCreate(rawUrl, UriKind.Absolute, out var parsedUri) ? parsedUri.PathAndQuery : null;

            var parser = BrowsingContext.New(Configuration.Default).GetService<IHtmlParser>();

            if (pageUrl == "https://www.webtoons.com/en/dailySchedule")
            {
                var document = parser.ParseDocument(rawPageText);

                foreach (var comicCard in document.QuerySelectorAll(".daily_card_item"))
                {
                    var link      = comicCard.GetAttribute("href");
                    var imageLink = ToPathAndQuery(comicCard.QuerySelector("img")?.GetAttribute("src"));

                    if (imageLink == null || !Uri.TryCreate(link, UriKind.Absolute, out var linkUri))
                    {
                        Console.WriteLine("Skipping card without a usable link or image on {0}", pageUrl);
                        continue;
                    }

                    // The title number is passed to FindAsync below, so it must be present.
                    var titleNo = HttpUtility.ParseQueryString(linkUri.Query).Get("title_no");
                    if (string.IsNullOrEmpty(titleNo))
                    {
                        Console.WriteLine("Skipping card without title_no: {0}", link);
                        continue;
                    }

                    // Check for an existing row before building the model and entity.
                    var existingWeb = await _context.WebToons.FindAsync(titleNo);
                    if (existingWeb != null)
                    {
                        continue;
                    }

                    var genre   = comicCard.QuerySelector(".genre")?.InnerHtml ?? string.Empty;
                    var subject = comicCard.QuerySelector(".subj")?.InnerHtml ?? string.Empty;
                    var author  = comicCard.QuerySelector(".author")?.InnerHtml ?? string.Empty;

                    var cardModel = createWebToonModel(link, titleNo, imageLink, genre, subject, author);
                    await _context.WebToons.AddAsync(WebtoonProfile.MapCreateModelToEntity(cardModel));
                }
            }
            else if (pageUrl.Contains("list"))
            {
                var document = parser.ParseDocument(rawPageText);
                var list     = document.QuerySelectorAll("#_listUl li");
                Console.WriteLine(list.Length);

                // The title number comes from the page URL, so it is the same for every item.
                var titleNo = HttpUtility.ParseQueryString(new Uri(pageUrl).Query).Get("title_no");

                foreach (var item in list)
                {
                    var episodeLink  = item.QuerySelector("a")?.GetAttribute("href");
                    var episodeThumb = ToPathAndQuery(item.QuerySelector("a .thmb img")?.GetAttribute("src"));

                    if (string.IsNullOrEmpty(episodeLink) || episodeThumb == null)
                    {
                        Console.WriteLine("Skipping episode without a usable link or thumbnail on {0}", pageUrl);
                        continue;
                    }

                    // The hash of the episode link is the value used to look up existing episodes.
                    var episodeLinkHash = ComputeSha256Hash(episodeLink);
                    var existed         = await _context.Episodes.FindAsync(episodeLinkHash);
                    if (existed != null)
                    {
                        continue;
                    }

                    var episodeName  = item.QuerySelector("a .subj")?.InnerHtml ?? string.Empty;
                    var episodeDate  = item.QuerySelector("a .date")?.InnerHtml ?? string.Empty;
                    var episodeModel = createEpisodeModel(titleNo, episodeName, episodeThumb, episodeDate, episodeLink,
                                                          episodeLinkHash);
                    await _context.AddAsync(EpisodeProfile.MapCreateModelToEntity(episodeModel));
                }
            }
            else if (pageUrl.Contains("viewer"))
            {
                var document = parser.ParseDocument(rawPageText);

                // Images without a usable data-url are left out of the stored list.
                var contentLinks = document.QuerySelectorAll(".viewer_img img")
                                           .Select(img => ToPathAndQuery(img.GetAttribute("data-url")))
                                           .Where(path => path != null)
                                           .ToList();

                var episodeLinkHash = ComputeSha256Hash(pageUrl);
                var contentJson     = JsonSerializer.Serialize(contentLinks);

                // Only existence matters, so ask for that instead of loading every matching row.
                var alreadyStored = _context.Pages.Any(p => p.Content == contentJson);
                if (!alreadyStored)
                {
                    var pageModel = createPageModel(episodeLinkHash, contentJson);
                    await _context.AddAsync(PageProfile.MapCreateModelToEntity(pageModel));
                }
            }

            // Persist everything queued for this page in one batch.
            await _context.SaveChangesAsync();

            Program.LogInfo(count.ToString());
        }