示例#1
0
        /// <summary>
        /// Handles the crawler's page-completed event: logs the outcome of the crawl and, when the
        /// page contains elements matching the "pw article" classes, builds an <c>Article</c> from
        /// them and saves it through <c>articleDao</c>.
        /// </summary>
        /// <remarks>
        /// The method is intentionally synchronous (it never awaited anything), so an exception
        /// propagates to whoever raised the event instead of being rethrown unobserved from an
        /// <c>async void</c> state machine.
        /// </remarks>
        /// <param name="sender">Event source; not used.</param>
        /// <param name="e">Event data carrying the crawled page.</param>
        private void Crawler_ProcessPageCrawlCompleted(object sender, PageCrawlCompletedArgs e)
        {
            CrawledPage crawledPage = e.CrawledPage;
            string url = crawledPage.Uri.AbsoluteUri;

            // Null-conditional: a missing response counts as a failure instead of throwing.
            bool crawlFailed = crawledPage.HttpRequestException != null
                || crawledPage.HttpResponseMessage?.StatusCode != HttpStatusCode.OK;

            Console.WriteLine(crawlFailed
                ? $"Crawl of page failed {url}"
                : $"Crawl of page succeeded {url}");

            bool hasNoContent = string.IsNullOrEmpty(crawledPage.Content?.Text);
            if (hasNoContent)
            {
                Console.WriteLine($"Page had no content {url}");
            }

            // Nothing trustworthy to extract from a failed or empty page.
            if (crawlFailed || hasNoContent)
            {
                return;
            }

            var angleSharpHtmlDocument = crawledPage.AngleSharpHtmlDocument;
            var allArticle = angleSharpHtmlDocument.GetElementsByClassName("pw article");

            if (allArticle.Length == 0)
            {
                return;
            }

            // string.Concat treats null entries as empty, so an article element without an <h1>
            // simply contributes nothing to the title instead of throwing.
            string title = string.Concat(allArticle.Select(a => a.GetElementsByTagName("h1").FirstOrDefault()?.TextContent));
            string text = string.Concat(allArticle.Select(a => a.TextContent));
            string html = string.Concat(allArticle.Select(a => a.InnerHtml));

            // Stays at default(DateTime) when the page has no parseable date element.
            DateTime date = new DateTime();

            foreach (var dateElement in angleSharpHtmlDocument.GetElementsByClassName("news-date-time news_date"))
            {
                // As before, the last date element wins; unparseable text is skipped rather than
                // aborting the whole handler with a FormatException.
                DateTime parsed;
                if (DateTime.TryParse(dateElement.TextContent, out parsed))
                {
                    date = parsed;
                }
            }

            Article article = new Article(title, url, date, html, text);

            articleDao.Save(article);
        }