/// <summary>
/// Parses a novel index page: reads the novel name and author from the
/// element with id "info", asks GetArticles to handle the links under the
/// element with id "list", and stores the resulting novel under the
/// "novel" result key.
/// </summary>
/// <param name="pagePathogen">
/// Page holder supplying <c>PageSource</c> and <c>Url</c>; receives the parsed novel via <c>AddResult</c>.
/// </param>
public void PageProcess(PagePathogen pagePathogen)
{
    var selector = new XPathSelector(pagePathogen.PageSource);

    // The novel being assembled; its article list starts out empty.
    Novel novel = new Novel();
    novel.Articles = new List<Article>();

    // Novel name.
    var nameEle = selector.SelectSingleNode("//*[@id='info']/h1");
    if (nameEle != null)
    {
        novel.Name = nameEle.InnerText;
    }

    // Author: the first paragraph is expected to look like "<label>:<author>".
    var authorEle = selector.SelectSingleNode("//*[@id='info']/p[1]");
    if (authorEle != null)
    {
        string pStr = authorEle.InnerText ?? string.Empty;

        string[] parts = pStr.Split(':');
        if (parts.Length < 2)
        {
            // No ASCII colon found: fall back to the full-width colon (U+FF1A)
            // instead of indexing past the end of the array.
            parts = pStr.Split('\uFF1A');
        }

        // When no separator is present at all the author is simply left unset,
        // so the rest of the page is still processed.
        if (parts.Length > 1)
        {
            novel.Author = parts[1];
        }
    }

    // Article information for this novel (GetArticles is defined outside this block).
    GetArticles(selector, novel, pagePathogen.Url, "//*[@id='list']/dl/dd/a");

    // Hand the scraped data on to whoever consumes the page results.
    pagePathogen.AddResult("novel", novel);
}
/// <summary>
/// Page parsing: records the request URL under the "requestUrl" result key and,
/// when an element with id "content" exists, stores its inner HTML under the
/// "article" result key. A missing element is logged as a warning; any
/// exception is logged as an error and not rethrown.
/// </summary>
/// <param name="pagePathogen">
/// Page holder supplying <c>PageSource</c> and <c>Url</c>; receives the results via <c>AddResult</c>.
/// </param>
public void PageProcess(PagePathogen pagePathogen)
{
    try
    {
        // Record the request address.
        pagePathogen.AddResult("requestUrl", pagePathogen.Url);

        var selector = new XPathSelector(pagePathogen.PageSource);
        var node = selector.SelectSingleNode("//*[@id='content']");
        if (node != null)
        {
            pagePathogen.AddResult("article", node.InnerHtml);
        }
        else
        {
            // Crawl log entry: nothing could be parsed from this page.
            _loggerService.WriteLog(new Log()
            {
                DateTime = DateTime.Now.ToString("yyyy-MM-dd HH:mm:ss"),
                Msg = pagePathogen.Url + "---未解析到数据!",
                ClassName = "",
                ActionName = "",
                Duration = 0,
                LogLevel = (int)LCore.Logger.LogLevel.Warn
            });
        }
    }
    catch (Exception e)
    {
        // Exception.TargetSite may be null, and pagePathogen itself may be the
        // null that caused the exception; guard both so the error handler
        // cannot throw and lose the log entry.
        string actionName = e.TargetSite != null ? e.TargetSite.Name : "";
        string url = pagePathogen != null ? pagePathogen.Url : null;

        // Record the error details.
        _loggerService.WriteLog(new Log()
        {
            DateTime = DateTime.Now.ToString("yyyy-MM-dd HH:mm:ss"),
            LogLevel = (int)LCore.Logger.LogLevel.Error,
            ClassName = this.GetType().FullName,
            ActionName = actionName,
            Msg = url + "---" + e.Message + "---" + e.StackTrace
        });
    }
}
/// <summary>
/// Executes the request and returns a <c>PagePathogen</c> carrying the request
/// URL, host and, when the download succeeds, the response body decoded as UTF-8.
/// A response whose Content-Encoding mentions gzip is decompressed first.
/// </summary>
/// <param name="request">The prepared HTTP request to execute.</param>
/// <returns>
/// The page holder. If any exception occurs while fetching or reading the
/// response, the holder is still returned but <c>PageSource</c> is not assigned
/// by this method.
/// </returns>
public static PagePathogen GetResponse(HttpWebRequest request)
{
    var pagePathogen = new PagePathogen();
    pagePathogen.Url = request.Address.AbsoluteUri;
    pagePathogen.Host = request.Address.Host;

    try
    {
        using (var response = (HttpWebResponse)request.GetResponse())
        {
            // Ordinal, case-insensitive match: ToLower() is culture-sensitive and
            // turns "GZIP" into "gzıp" under Turkish cultures, missing the match.
            string contentEncoding = response.ContentEncoding;
            bool isGzip = contentEncoding != null
                && contentEncoding.IndexOf("gzip", StringComparison.OrdinalIgnoreCase) >= 0;

            Stream body = response.GetResponseStream();
            if (isGzip)
            {
                // Compressed payload: wrap the raw stream in a decompressor.
                body = new GZipStream(body, CompressionMode.Decompress);
            }

            // Disposing the reader also disposes the stream(s) it wraps.
            using (StreamReader reader = new StreamReader(body, Encoding.UTF8))
            {
                pagePathogen.PageSource = reader.ReadToEnd();
            }
        }
    }
    catch (Exception)
    {
        // Deliberate best-effort: a failed download still yields the holder with
        // Url/Host filled in, so the caller can decide what to do with it.
    }

    return pagePathogen;
}