Example #1
0
        /// <summary>
        /// Runs one crawl cycle: downloads the page at <c>BlogDataUrl</c>, extracts the
        /// post items, appends the posts that were not seen in the previous cycle to the
        /// daily text file, records this cycle's posts as the new "previous" set, and
        /// calls <c>SendMail</c> once at least 24 hours have passed since <c>_recordTime</c>.
        /// </summary>
        /// <remarks>
        /// If the page contains no post items the method logs the fact and returns without
        /// touching <c>PreviousBlogs</c> or the temp file, so the de-duplication baseline
        /// is not wiped by an empty or malformed response. Items whose publish time cannot
        /// be extracted are skipped instead of aborting the whole cycle. Any other
        /// exception propagates to the caller; the stopwatch is stopped in every case.
        /// </remarks>
        static void Work()
        {
            try
            {
                Sw.Reset();
                Sw.Start();

                // Number of fetched posts that were already present in the previous cycle.
                int repeatCount = 0;
                // Number of items dropped because no publish time could be parsed.
                int skippedCount = 0;

                string html = HttpUtil.GetString(BlogDataUrl);

                List<Blog> blogs = new List<Blog>();

                HtmlDocument doc = new HtmlDocument();
                doc.LoadHtml(html);

                // Select every post item container on the page.
                var itemBodys = doc.DocumentNode.SelectNodes("//div[@class='post_item_body']");

                // SelectNodes yields null (not an empty collection) when nothing matches.
                if (itemBodys == null)
                {
                    _logger.Info("Get data finished: no post items found, previous crawl state left unchanged");
                    return;
                }

                foreach (var itemBody in itemBodys)
                {
                    // Title link: its text is the title, its href is the post URL.
                    var titleElem = itemBody.SelectSingleNode("h3/a");
                    var title = titleElem?.InnerText;
                    var url = titleElem?.Attributes["href"]?.Value;

                    // Summary paragraph, flattened to a single trimmed line.
                    var summaryElem = itemBody.SelectSingleNode("p[@class='post_item_summary']");
                    var summary = summaryElem?.InnerText.Replace("\r\n", "").Trim();

                    // Footer holds the author link and the publish timestamp text.
                    var footElem = itemBody.SelectSingleNode("div[@class='post_item_foot']");
                    var author = footElem?.SelectSingleNode("a")?.InnerText;

                    // Regex.Match rejects null input and the match may be empty, so guard
                    // both; invariant culture keeps parsing independent of machine locale.
                    string footText = footElem?.InnerText;
                    DateTime publishTime;
                    if (footText == null ||
                        !DateTime.TryParse(
                            Regex.Match(footText, "\\d+-\\d+-\\d+ \\d+:\\d+").Value,
                            System.Globalization.CultureInfo.InvariantCulture,
                            System.Globalization.DateTimeStyles.None,
                            out publishTime))
                    {
                        skippedCount++;
                        continue;
                    }

                    blogs.Add(new Blog()
                    {
                        Title       = title,
                        Url         = url,
                        Summary     = summary,
                        Author      = author,
                        PublishTime = publishTime
                    });
                }

                if (skippedCount > 0)
                {
                    _logger.Info($"Skipped {skippedCount} post item(s) without a parsable publish time");
                }

                // Make sure the output folder exists; FileMode.Append does not create directories.
                string blogDir = Path.Combine(_baseDir, "Blogs");
                Directory.CreateDirectory(blogDir);

                string blogFileName = $"cnblogs-{DateTime.Now:yyyy-MM-dd}.txt";
                string blogFilePath = Path.Combine(blogDir, blogFileName);

                // URLs seen in the previous cycle, for constant-time duplicate checks.
                HashSet<string> previousUrls = new HashSet<string>(PreviousBlogs.Select(b => b.Url));

                // using guarantees the file handle is released even if a write fails.
                using (FileStream fs = new FileStream(blogFilePath, FileMode.Append, FileAccess.Write))
                using (StreamWriter sw = new StreamWriter(fs, Encoding.UTF8))
                {
                    foreach (var blog in blogs)
                    {
                        if (previousUrls.Contains(blog.Url))
                        {
                            repeatCount++;
                            continue;
                        }

                        sw.WriteLine($"标题:{blog.Title}");
                        sw.WriteLine($"网址:{blog.Url}");
                        sw.WriteLine($"摘要:{blog.Summary}");
                        sw.WriteLine($"作者:{blog.Author}");
                        sw.WriteLine($"发布时间:{blog.PublishTime:yyyy-MM-dd HH:mm}");
                        sw.WriteLine("--------------华丽的分割线---------------");
                    }
                }

                // Replace the previous cycle's records with this cycle's.
                PreviousBlogs.Clear();
                PreviousBlogs.AddRange(blogs);

                // Persist this cycle's posts so a restart after an abnormal exit
                // does not produce duplicate output.
                File.WriteAllText(_tmpFilePath, JsonConvert.SerializeObject(blogs));

                Sw.Stop();

                // Cycle statistics.
                _logger.Info($"Get data success,Time:{Sw.ElapsedMilliseconds}ms,Data Count:{blogs.Count},Repeat:{repeatCount},Effective:{blogs.Count - repeatCount}");

                // Send the mail once 24 hours have elapsed since the recorded time.
                // Read the clock once so the date parts cannot straddle midnight.
                DateTime now = DateTime.Now;
                if ((now - _recordTime).TotalHours >= 24)
                {
                    _sendLogger.Info($"准备发送邮件,记录时间:{_recordTime:yyyy-MM-dd HH:mm:ss}");
                    SendMail();
                    _recordTime = new DateTime(now.Year, now.Month, now.Day, 9, 0, 0);
                    _sendLogger.Info($"记录时间已更新:{_recordTime:yyyy-MM-dd HH:mm:ss}");
                }
            }
            finally
            {
                // Stop is harmless if the stopwatch is already stopped.
                Sw.Stop();
            }
        }