/// <summary>
/// Runs one crawl cycle: downloads the blog list page, parses every post item,
/// appends the posts that were not present in the previous cycle to today's text
/// file, persists the current batch for crash recovery, and sends the mail once at
/// least 24 hours have passed since the recorded time.
/// </summary>
/// <exception cref="InvalidOperationException">
/// The fetched page contains no <c>post_item_body</c> elements. The previous crawl
/// state is left untouched in that case.
/// </exception>
static void Work()
{
    try
    {
        Sw.Reset();
        Sw.Start();

        // Number of fetched posts that were already present in the previous cycle.
        int repeatCount = 0;
        string html = HttpUtil.GetString(BlogDataUrl);
        List<Blog> blogs = new List<Blog>();
        HtmlDocument doc = new HtmlDocument();
        doc.LoadHtml(html);

        // All post items on the page. SelectNodes yields null (not an empty
        // collection) when nothing matches, so fail with a clear message instead
        // of a NullReferenceException in the loop below.
        var itemBodys = doc.DocumentNode.SelectNodes("//div[@class='post_item_body']");
        if (itemBodys == null)
        {
            throw new InvalidOperationException(
                "No 'post_item_body' elements were found in the fetched page.");
        }

        foreach (var itemBody in itemBodys)
        {
            // Title link: carries both the title text and the post URL.
            var titleElem = itemBody.SelectSingleNode("h3/a");
            var title = titleElem?.InnerText;
            var url = titleElem?.Attributes["href"]?.Value;

            // Summary paragraph, flattened to a single trimmed line.
            var summaryElem = itemBody.SelectSingleNode("p[@class='post_item_summary']");
            var summary = summaryElem?.InnerText.Replace("\r\n", "").Trim();

            // Footer: holds the author link and the publish time text.
            var footElem = itemBody.SelectSingleNode("div[@class='post_item_foot']");
            var author = footElem?.SelectSingleNode("a")?.InnerText;

            // Regex.Match rejects null input, so a missing footer is treated as
            // empty text (which simply produces no match).
            string footText = footElem?.InnerText ?? string.Empty;
            string publishTimeText = Regex.Match(footText, "\\d+-\\d+-\\d+ \\d+:\\d+").Value;

            // One malformed item must not abort the whole cycle: skip it.
            DateTime publishTime;
            if (!DateTime.TryParse(publishTimeText, out publishTime))
            {
                _logger.Info($"Skip item without a valid publish time,Title:{title},Url:{url}");
                continue;
            }

            blogs.Add(new Blog()
            {
                Title = title,
                Url = url,
                Summary = summary,
                Author = author,
                PublishTime = publishTime
            });
        }

        string blogFileName = $"cnblogs-{DateTime.Now:yyyy-MM-dd}.txt";
        string blogFilePath = Path.Combine(_baseDir, "Blogs", blogFileName);

        // URLs seen in the previous cycle, collected once so each membership
        // check below is O(1) instead of a scan over PreviousBlogs.
        HashSet<string> previousUrls = new HashSet<string>(PreviousBlogs.Select(b => b.Url));

        // The using blocks release the file handle even when a write throws.
        using (FileStream fs = new FileStream(blogFilePath, FileMode.Append, FileAccess.Write))
        using (StreamWriter sw = new StreamWriter(fs, Encoding.UTF8))
        {
            // De-duplicate against the previous cycle; only new posts are written.
            foreach (var blog in blogs)
            {
                if (previousUrls.Contains(blog.Url))
                {
                    repeatCount++;
                    continue;
                }

                sw.WriteLine($"标题:{blog.Title}");
                sw.WriteLine($"网址:{blog.Url}");
                sw.WriteLine($"摘要:{blog.Summary}");
                sw.WriteLine($"作者:{blog.Author}");
                sw.WriteLine($"发布时间:{blog.PublishTime:yyyy-MM-dd HH:mm}");
                sw.WriteLine("--------------华丽的分割线---------------");
            }
        }

        // Replace the previous cycle's records with the current batch.
        PreviousBlogs.Clear();
        PreviousBlogs.AddRange(blogs);

        // Persist the current batch so that a restart after an abnormal exit does
        // not write the same posts again.
        File.WriteAllText(_tmpFilePath, JsonConvert.SerializeObject(blogs));

        Sw.Stop();

        // Statistics for this cycle.
        _logger.Info($"Get data success,Time:{Sw.ElapsedMilliseconds}ms,Data Count:{blogs.Count},Repeat:{repeatCount},Effective:{blogs.Count - repeatCount}");

        // Send the mail once at least 24 hours have elapsed since the recorded time.
        // The clock is read once so the date parts below cannot straddle midnight.
        DateTime now = DateTime.Now;
        if ((now - _recordTime).TotalHours >= 24)
        {
            _sendLogger.Info($"准备发送邮件,记录时间:{_recordTime:yyyy-MM-dd HH:mm:ss}");
            SendMail();
            // Re-anchor the record time to 09:00 of the current day.
            _recordTime = new DateTime(now.Year, now.Month, now.Day, 9, 0, 0);
            _sendLogger.Info($"记录时间已更新:{_recordTime:yyyy-MM-dd HH:mm:ss}");
        }
    }
    finally
    {
        // Make sure the stopwatch is stopped on every exit path; exceptions
        // propagate to the caller unchanged.
        Sw.Stop();
    }
}