/// <summary>
/// Repeatedly drains the buffered comments into the Excel workbook until the
/// <c>exit</c> flag is set and the buffer is empty, then saves the workbook and returns.
/// </summary>
/// <remarks>
/// The loop wakes every 5 seconds. A failed insert is logged together with the
/// serialized rows; those rows have already been removed from the buffer and are not retried.
/// NOTE(review): <c>comments</c> appears to be filled by the crawler while this loop runs;
/// if that happens on another thread, access to the list is unsynchronized — confirm.
/// </remarks>
public void SaveComments()
{
    int count = 0; // number of successful inserts so far
    while (true)
    {
        bool inserted = false;
        if (comments.Count > 0)
        {
            // Snapshot the buffer so items appended while inserting are left for the next pass.
            var list = comments.Select(x => x).ToList();
            // Write the snapshot into the workbook.
            try
            {
                comments.RemoveAll(x => { return (list.Any(i => i.Equals(x))); });
                Log4Net.LogInfo($"本次保存{list.Count}条评论");
                excel.Insert(list);
                count++;
                inserted = true;
            }
            catch (Exception e)
            {
                Log4Net.ErrorInfo($"本次保存到Excel失败,行数:{list.Count},值:{JsonConvert.SerializeObject(list)}。异常信息:", e);
            }
        }
        else
        {
            if (exit)
            {
                Log4Net.LogInfo($"数据导出完成,保存Excel文件");
                excel.Save();
                Log4Net.LogInfo($"本次任务已完成,程序退出");
                break;
            }
        }

        // Flush the workbook after every 10th successful insert. The remainder operator is
        // required here: integer division (count / 10 == 0) is true for 1..9 and never afterwards.
        // Gating on `inserted` avoids re-saving on idle passes while count sits on a multiple of 10.
        if (inserted && count % 10 == 0)
        {
            excel.Save();
        }

        System.Threading.Thread.Sleep(5000);
    }
}
/// <summary>
/// Creates a spider for one Weibo URL and prepares the browsing context, the comment
/// buffer, the HTTP service and the Excel workbook the comments are written to.
/// </summary>
/// <param name="_url">Absolute URL of the Weibo post; its scheme and host become the HTTP base address.</param>
/// <param name="filename">File name of the workbook, placed under the configured "filedir" + "/out/".</param>
public Spider(string _url, string filename)
{
    context = BrowsingContext.New(Configuration.Default);
    comments = new List<Models.Comment>();
    weibo_url = _url;

    var weiboUri = new Uri(weibo_url);
    var settings = System.Configuration.ConfigurationManager.AppSettings;
    cookies = settings["cookie"];

    // All requests are issued relative to the post's scheme://host, with a 30 second timeout.
    var client = new HttpClient();
    client.BaseAddress = new Uri($"{weiboUri.Scheme}://{weiboUri.Host}");
    client.Timeout = TimeSpan.FromSeconds(30);

    // The output path is the application base path followed by the configured directory.
    string filedir = settings["filedir"];
    string basePath = PlatformServices.Default.Application.ApplicationBasePath;
    excel_path = $"{basePath}{filedir}/out/{filename}";
    excel = new Excel(excel_path);
    Log4Net.LogInfo($"Excel文件保存在{excel_path}");

    httpService = new HttpService(client);
}
/// <summary>
/// Fetches a comment page, parses its top-level comments into the shared buffer and
/// keeps following the next-page link until no further page exists.
/// </summary>
/// <param name="uri">Address of the first comment page; only its path and query are sent to the HTTP service.</param>
/// <param name="weibo_link">Link handed to <c>Link</c> to build the next-page URL when the response carries paging data.</param>
/// <returns>A task that completes once the last page is processed or an empty response ends the crawl.</returns>
/// <remarks>
/// A page whose fetch or parsing throws is retried after 5 seconds, without limit.
/// The <c>exit</c> flag is set whenever the crawl ends so the saving loop can finish.
/// </remarks>
private async Task CommentHandle(Uri uri, string weibo_link)
{
    // Loop instead of recursing: one iteration per page (or per retry), so a long thread
    // or a run of failures does not build an ever-deeper chain of nested async calls.
    while (true)
    {
        Log4Net.LogInfo($"正在抓取评论链接[{uri}]");

        string res_str = null;
        string link = "";
        Uri next_uri = null;
        try
        {
            // The fetch is inside the try so a failed request retries this page.
            res_str = await httpService.GetAsync(uri.PathAndQuery, cookies);
            if (string.IsNullOrEmpty(res_str))
            {
                // Nothing to parse: stop crawling and signal completion, otherwise the
                // saving loop would wait forever for an exit flag that never arrives.
                Log4Net.LogInfo($"评论链接[{uri}]返回内容为空,停止抓取");
                exit = true;
                return;
            }

            Log4Net.LogInfo($"评论链接[{uri}]内容获取成功,开始处理数据");
            var response = JsonConvert.DeserializeObject<Models.Response>(res_str);
            var document = await context.OpenAsync(req => req.Content(response.data.html));

            // DOM node that holds the comment list.
            var document_comment_list = document.QuerySelectorAll("*")
                .Where(node => "comment_list".Equals(node.GetAttribute("node-type")))
                .FirstOrDefault();
            if (document_comment_list == null)
            {
                // Handled by the catch below and retried, like any other parse failure.
                throw new InvalidOperationException("comment_list node not found in response html");
            }
            Log4Net.LogInfo($"取得评论区");

            // The list contains: top-level comments, links to child-comment lists and the
            // link to the next page of top-level comments.
            // Comments are collected locally and published only after the whole page
            // (including the next link) succeeded, so a retry cannot add them twice.
            var page_comments = new List<Models.Comment>();

            // 1. Walk the top-level comments.
            var comment_nodes = document_comment_list
                .QuerySelectorAll("div")
                .Where(node => "list_li S_line1 clearfix".Equals(node.ClassName))
                .ToList();
            foreach (var item in comment_nodes)
            {
                var commentId = item.GetAttribute("comment_id"); // top-level comment id
                var comment_body_dom = item.GetElementsByClassName("WB_text").FirstOrDefault(); // comment body DOM

                // User id, nickname and text come from the body; image and child comments live in other nodes.
                // The first <a> carries the user id (usercard attribute) and the nickname.
                var a_dom = comment_body_dom?.QuerySelectorAll("a");
                var first_a_dom = a_dom?.FirstOrDefault();
                if (first_a_dom == null)
                {
                    // A malformed item would otherwise fail the page on every retry.
                    Log4Net.LogInfo($"评论[{commentId}]缺少正文或用户信息节点,已跳过");
                    continue;
                }

                var user_id = first_a_dom.GetAttribute("usercard")?.Replace("id=", ""); // user id
                var nick = first_a_dom.TextContent; // the anchor text is the nickname
                var datetime = item.GetElementsByClassName("WB_from S_txt2").FirstOrDefault()?.TextContent;

                // Remove every <a> so only the comment text remains in the body.
                foreach (var a in a_dom)
                {
                    a.Remove();
                }

                // 2. Comment text: replace each emoticon image with its title text.
                var faces = comment_body_dom.QuerySelectorAll("img")
                    .Where(img => "face".Equals(img.GetAttribute("type")))
                    .ToList();
                foreach (var face in faces)
                {
                    comment_body_dom.InnerHtml = comment_body_dom.InnerHtml.Replace(face.OuterHtml, face.GetAttribute("title"));
                }
                var body = comment_body_dom.TextContent.Trim().TrimStart(':'); // comment text

                var comment = new Models.Comment
                {
                    CommentId = commentId,
                    UserId = user_id,
                    Nick = nick,
                    DateAndTime = datetime,
                    Body = body,
                    CommentUrl = $"{uri}"
                };

                // 3. Comment image (single image).
                var image_dom = item.QuerySelectorAll("div")
                    .Where(node => "comment_media_prev".Equals(node.GetAttribute("node-type")))
                    .FirstOrDefault();
                if (image_dom != null)
                {
                    comment.ImageUrl = CommentImageHandle(image_dom);
                }

                page_comments.Add(comment);
            }

            // Work out the next page: from paging data when present, otherwise from the DOM.
            var page = response.data.page;
            if (page != null)
            {
                Log4Net.LogInfo($"处理完第[{page.pagenum}]页评论数据");
                if (page.pagenum < page.totalpage)
                {
                    link = $"{Link(weibo_link)}&page={(page.pagenum + 1)}";
                }
            }
            else
            {
                link = await NextLinkHandle(document_comment_list);
                link = $"{Link(link)}";
            }

            if (!string.IsNullOrEmpty(link))
            {
                next_uri = new Uri(link);
            }

            // NOTE(review): the buffer looks like it is drained concurrently by SaveComments;
            // access is unsynchronized here as it was before — confirm.
            comments.AddRange(page_comments);
        }
        catch (Exception e)
        {
            Log4Net.ErrorInfo($"处理评论数据异常,URL:[{uri}],网页返回:{res_str}", e);
            Log4Net.LogInfo($"5秒后异常重试URL[{uri}]");
            // Task.Delay waits without blocking a thread, unlike Thread.Sleep.
            await Task.Delay(5000);
            continue;
        }

        if (next_uri == null)
        {
            exit = true; // signal completion
            Log4Net.LogInfo($"评论数据处理完成");
            Log4Net.LogInfo($"最后一页内容:{res_str}");
            return;
        }

        Log4Net.LogInfo($"5秒后处理URL[{link}]的评论");
        uri = next_uri;
        await Task.Delay(5000);
    }
}