/// <summary>
/// Downloads the page at <c>BlogDataUrl</c>, extracts the article items, appends the ones whose URL
/// was not present in the previous crawl to today's text file, records this crawl as the new
/// "previous" set (in memory and in the temp file), and sends the mail once at least 24 hours have
/// passed since the recorded time.
/// </summary>
/// <exception cref="InvalidOperationException">
/// The downloaded page contains no article items. Thrown before any dedupe state is modified.
/// </exception>
static void Work()
{
    try
    {
        Sw.Reset();
        Sw.Start();

        // Number of fetched items that were already seen in the previous crawl.
        int repeatCount = 0;

        string html = HttpUtil.GetString(BlogDataUrl);
        HtmlDocument doc = new HtmlDocument();
        doc.LoadHtml(html);

        // All article items on the page. HtmlAgilityPack's SelectNodes yields null (not an empty
        // collection) when nothing matches, so fail with a clear message instead of a bare
        // NullReferenceException, and before PreviousBlogs is cleared.
        var itemBodys = doc.DocumentNode.SelectNodes("//div[@class='post_item_body']");
        if (itemBodys == null)
        {
            throw new InvalidOperationException("No article items found in the downloaded page.");
        }

        List<Blog> blogs = new List<Blog>();
        foreach (var itemBody in itemBodys)
        {
            // Title element: carries both the title text and the article URL.
            var titleElem = itemBody.SelectSingleNode("h3/a");
            var title = titleElem?.InnerText;
            var url = titleElem?.Attributes["href"]?.Value;

            // Summary text with line breaks removed.
            var summaryElem = itemBody.SelectSingleNode("p[@class='post_item_summary']");
            var summary = summaryElem?.InnerText.Replace("\r\n", "").Trim();

            // Footer element: holds the author link and the publish time text.
            var footElem = itemBody.SelectSingleNode("div[@class='post_item_foot']");
            var author = footElem?.SelectSingleNode("a")?.InnerText;

            // Regex.Match rejects a null input, so substitute an empty string when the footer is missing.
            string publishText = Regex.Match(footElem?.InnerText ?? string.Empty, "\\d+-\\d+-\\d+ \\d+:\\d+").Value;

            // A single malformed item must not abort the whole crawl: skip it and leave a trace.
            DateTime publishTime;
            if (!DateTime.TryParse(publishText, out publishTime))
            {
                _logger.Info($"Skip item without a parsable publish time,Title:{title},Url:{url}");
                continue;
            }

            blogs.Add(new Blog()
            {
                Title = title,
                Url = url,
                Summary = summary,
                Author = author,
                PublishTime = publishTime
            });
        }

        string blogFileName = $"cnblogs-{DateTime.Now:yyyy-MM-dd}.txt";
        string blogFilePath = Path.Combine(_baseDir, "Blogs", blogFileName);

        // URLs seen in the previous crawl; a set keeps the duplicate check O(1) per item.
        var previousUrls = new HashSet<string>(PreviousBlogs.Select(b => b.Url));

        // using guarantees the file handle is released even if a write throws.
        using (FileStream fs = new FileStream(blogFilePath, FileMode.Append, FileAccess.Write))
        using (StreamWriter sw = new StreamWriter(fs, Encoding.UTF8))
        {
            foreach (var blog in blogs)
            {
                if (previousUrls.Contains(blog.Url))
                {
                    repeatCount++;
                    continue;
                }

                sw.WriteLine($"标题:{blog.Title}");
                sw.WriteLine($"网址:{blog.Url}");
                sw.WriteLine($"摘要:{blog.Summary}");
                sw.WriteLine($"作者:{blog.Author}");
                sw.WriteLine($"发布时间:{blog.PublishTime:yyyy-MM-dd HH:mm}");
                sw.WriteLine("--------------华丽的分割线---------------");
            }
        }

        // Replace the previous crawl's records with this crawl's.
        PreviousBlogs.Clear();
        PreviousBlogs.AddRange(blogs);

        // Persist this crawl so that a restart after an abnormal exit does not produce duplicates.
        File.WriteAllText(_tmpFilePath, JsonConvert.SerializeObject(blogs));

        // Stop before logging so the reported time excludes mail sending.
        Sw.Stop();

        _logger.Info($"Get data success,Time:{Sw.ElapsedMilliseconds}ms,Data Count:{blogs.Count},Repeat:{repeatCount},Effective:{blogs.Count - repeatCount}");

        // Send the mail once at least 24 hours have passed since the recorded time.
        if ((DateTime.Now - _recordTime).TotalHours >= 24)
        {
            _sendLogger.Info($"准备发送邮件,记录时间:{_recordTime:yyyy-MM-dd HH:mm:ss}");
            SendMail();

            // Sample the clock once so year/month/day cannot straddle midnight; reset to 09:00 today.
            DateTime now = DateTime.Now;
            _recordTime = new DateTime(now.Year, now.Month, now.Day, 9, 0, 0);
            _sendLogger.Info($"记录时间已更新:{_recordTime:yyyy-MM-dd HH:mm:ss}");
        }
    }
    finally
    {
        // Make sure the shared stopwatch is stopped on every exit path; exceptions propagate unchanged.
        Sw.Stop();
    }
}
/// <summary>
/// Crawls one blog source: downloads <c>source.BlogDataUrl</c>, extracts the article items using the
/// XPath expressions in <c>source.DicXPath</c>, appends the items whose URL was not present in the
/// source's previous crawl to today's text file, records this crawl as the source's new "previous"
/// set, sends the mail for the source once at least 24 hours have passed since its recorded time,
/// and finally calls <c>SaveData</c>.
/// </summary>
/// <param name="source">The blog source to crawl; its PreviousBlogs and RecordTime are updated.</param>
/// <exception cref="InvalidOperationException">
/// The downloaded page contains no article items. Thrown before any dedupe state is modified.
/// </exception>
static void Work(BlogSource source)
{
    try
    {
        Sw.Reset();
        Sw.Start();

        // Number of fetched items that were already seen in the previous crawl.
        int repeatCount = 0;

        string html = HttpUtil.GetString(source.BlogDataUrl);
        HtmlDocument doc = new HtmlDocument();
        doc.LoadHtml(html);

        // All article items on the page. HtmlAgilityPack's SelectNodes yields null (not an empty
        // collection) when nothing matches, so fail with a clear message instead of a bare
        // NullReferenceException, and before source.PreviousBlogs is cleared.
        var itemBodys = doc.DocumentNode.SelectNodes(source.DicXPath["item"]);
        if (itemBodys == null)
        {
            throw new InvalidOperationException($"No article items found for source {source.Name}.");
        }

        // This source exposes the publish time in its own node; the others embed it in the footer text.
        bool dateHasOwnNode = source.Path == "WoshiPm";

        List<Blog> blogs = new List<Blog>();
        foreach (var itemBody in itemBodys)
        {
            // Title element: carries both the title text and the article URL.
            var titleElem = itemBody.SelectSingleNode(source.DicXPath["title"]);
            var title = titleElem?.InnerText;
            var url = titleElem?.Attributes["href"]?.Value;

            // Summary text with line breaks removed.
            var summaryElem = itemBody.SelectSingleNode(source.DicXPath["summary"]);
            var summary = summaryElem?.InnerText.Replace("\r\n", "").Trim();

            // Footer element: holds the author and the publish time.
            var footElem = itemBody.SelectSingleNode(source.DicXPath["foot"]);
            var author = footElem?.SelectSingleNode(source.DicXPath["author"])?.InnerText;

            // Regex.Match rejects a null input, so substitute an empty string when the footer is missing.
            string publishText = dateHasOwnNode
                ? footElem?.SelectSingleNode(source.DicXPath["date"])?.InnerText
                : Regex.Match(footElem?.InnerText ?? string.Empty, "\\d+-\\d+-\\d+ \\d+:\\d+").Value;

            // A single malformed item must not abort the whole crawl: skip it and leave a trace.
            // TryParse returns false for null or empty text.
            DateTime publishTime;
            if (!DateTime.TryParse(publishText, out publishTime))
            {
                _logger.Info($"Skip {source.Name} item without a parsable publish time,Title:{title},Url:{url}");
                continue;
            }

            blogs.Add(new Blog()
            {
                Title = title,
                Url = url,
                Summary = summary,
                Author = author,
                PublishTime = publishTime
            });
        }

        string blogFileName = $"{source.FileName}-{DateTime.Now:yyyy-MM-dd}.txt";
        string blogFilePath = Path.Combine(_baseDataPath, source.Path, blogFileName);

        // URLs seen in the previous crawl; a set keeps the duplicate check O(1) per item.
        var previousUrls = new HashSet<string>(source.PreviousBlogs.Select(b => b.Url));

        // using guarantees the file handle is released even if a write throws.
        using (FileStream fs = new FileStream(blogFilePath, FileMode.Append, FileAccess.Write))
        using (StreamWriter sw = new StreamWriter(fs, Encoding.UTF8))
        {
            foreach (var blog in blogs)
            {
                if (previousUrls.Contains(blog.Url))
                {
                    repeatCount++;
                    continue;
                }

                sw.WriteLine($"标题:{blog.Title}");
                sw.WriteLine($"网址:{blog.Url}");
                sw.WriteLine($"摘要:{blog.Summary}");
                sw.WriteLine($"作者:{blog.Author}");
                sw.WriteLine($"发布时间:{blog.PublishTime:yyyy-MM-dd HH:mm}");
                sw.WriteLine("--------------华丽的分割线---------------");
            }
        }

        // Replace the previous crawl's records with this crawl's.
        source.PreviousBlogs.Clear();
        source.PreviousBlogs.AddRange(blogs);

        // Stop before logging so the reported time excludes mail sending and saving.
        Sw.Stop();

        _logger.Info($"Get {source.Name} data success,Time:{Sw.ElapsedMilliseconds}ms,Data Count:{blogs.Count},Repeat:{repeatCount},Effective:{blogs.Count - repeatCount}");

        // Send the mail once at least 24 hours have passed since the source's recorded time,
        // then advance the recorded time by exactly one day.
        if ((DateTime.Now - source.RecordTime).TotalHours >= 24)
        {
            _sendLogger.Info($"准备发送{source.Name}聚合邮件,记录时间:{source.RecordTime:yyyy-MM-dd HH:mm:ss}");
            SendMail(source);
            source.RecordTime = source.RecordTime.AddDays(1);
            _sendLogger.Info($"{source.Name}记录时间已更新:{source.RecordTime:yyyy-MM-dd HH:mm:ss}");
        }

        SaveData();
    }
    catch (Exception ex)
    {
        // Echo the failure to the console, then rethrow with the original stack trace.
        System.Console.WriteLine(ex.Message);
        throw;
    }
    finally
    {
        // Make sure the shared stopwatch is stopped on every exit path.
        Sw.Stop();
    }
}