/// <summary>
/// Downloads the list page described by a spider template and, when the template
/// provides a list-body CSS selector, narrows the page to the first matching node.
/// </summary>
/// <param name="s">Template supplying the list URL, the optional page encoding and the list-body selector.</param>
/// <param name="sb">Builder passed through to <c>logInfo</c> together with the template.</param>
/// <returns>
/// The download result unchanged when it is null or empty; <c>null</c> when a selector is
/// configured but matches nothing; the trimmed outer HTML of the first matching node when
/// the selector matches; otherwise the trimmed whole page.
/// </returns>
private static string downloadListPageBody(SpiderTemplate s, StringBuilder sb)
{
    // An empty encoding string is passed when the template does not specify one.
    String encoding = strUtil.HasText(s.ListEncoding) ? s.ListEncoding : "";
    String target = PageLoader.Download(s.ListUrl, SpiderConfig.UserAgent, encoding);

    if (strUtil.IsNullOrEmpty(target))
    {
        logInfo("error=原始页面没有内容: " + s.ListUrl, s, sb);
        return target;
    }

    // Read the selector once instead of calling the getter for every use.
    String pattern = s.GetListBodyPattern();
    if (strUtil.IsNullOrEmpty(pattern))
    {
        // No selector configured: the whole page is the list body.
        return target.Trim();
    }

    HtmlDocument htmlDoc = new HtmlDocument
    {
        OptionAddDebuggingAttributes = false,
        OptionAutoCloseOnEnd = true,
        OptionFixNestedTags = true,
        OptionReadEncoding = true
    };
    htmlDoc.LoadHtml(target);

    // FirstOrDefault walks the selector result a single time; the previous
    // Count() followed by ToArray()[0] evaluated the query twice.
    HtmlNode firstNode = htmlDoc.DocumentNode.QuerySelectorAll(pattern).FirstOrDefault();
    if (firstNode == null)
    {
        logInfo("error=没有匹配的页面内容:" + s.ListUrl, s, sb);
        return null;
    }

    return firstNode.OuterHtml.Trim();
}
/// <summary>
/// Downloads the list page described by a spider template and narrows it to the
/// part matched by the template's list-body regular expression.
/// </summary>
/// <param name="s">Template supplying the list URL, the optional page encoding and the list-body pattern.</param>
/// <param name="sb">Builder passed through to <c>logInfo</c> together with the template.</param>
/// <returns>
/// The download result unchanged when it is null or empty; the trimmed whole page when no
/// pattern is configured; the trimmed match when the pattern matches; otherwise an empty string.
/// </returns>
private static string downloadListPageBody(SpiderTemplate s, StringBuilder sb)
{
    // An empty encoding string is passed when the template does not specify one.
    String encoding = strUtil.HasText(s.ListEncoding) ? s.ListEncoding : "";
    String target = PageLoader.Download(s.ListUrl, SpiderConfig.UserAgent, encoding);

    if (strUtil.IsNullOrEmpty(target))
    {
        logInfo("error=原始页面没有内容: " + s.ListUrl, s, sb);
        return target;
    }

    String pattern = s.GetListBodyPattern();
    if (strUtil.IsNullOrEmpty(pattern))
    {
        // Regex.Match throws ArgumentNullException for a null pattern, and an empty
        // pattern always "succeeds" with a zero-length match that would silently
        // reduce the page to "". Without a pattern the whole page is the list body.
        return target.Trim();
    }

    // Singleline lets '.' match newlines so the pattern can span several lines of HTML.
    Match match = Regex.Match(target, pattern, RegexOptions.Singleline);
    if (!match.Success)
    {
        logInfo("error=没有匹配的页面内容:" + s.ListUrl, s, sb);
        return "";
    }

    return match.Value.Trim();
}
/// <summary>
/// Downloads the list page described by a spider template, logs each stage through
/// <c>logInfo</c>, and narrows the page to the first node matching the template's
/// list-body CSS selector when one is configured.
/// </summary>
/// <param name="s">Template supplying the list URL, the optional page encoding and the list-body selector.</param>
/// <param name="sb">Builder passed through to <c>logInfo</c> together with the template.</param>
/// <returns>
/// The download result unchanged when it is null or empty; <c>null</c> when a selector is
/// configured but matches nothing or evaluating it throws; the trimmed outer HTML of the
/// first matching node when the selector matches; otherwise the trimmed whole page.
/// </returns>
private static string downloadListPageBody( SpiderTemplate s, StringBuilder sb )
{
    // An empty encoding string is passed when the template does not specify one.
    String encoding = strUtil.HasText( s.ListEncoding ) ? s.ListEncoding : "";
    String target = PageLoader.Download( s.ListUrl, SpiderConfig.UserAgent, encoding );

    if (strUtil.IsNullOrEmpty( target ))
    {
        logInfo( "error=原始页面没有内容: " + s.ListUrl, s, sb );
        return target;
    }

    logInfo( "抓取列表内容成功", s, sb );

    // Read the selector once instead of calling the getter for every use.
    String pattern = s.GetListBodyPattern();
    if (!strUtil.HasText( pattern ))
    {
        // No selector configured: the whole page is the list body.
        return target.Trim();
    }

    HtmlDocument htmlDoc = new HtmlDocument
    {
        OptionAddDebuggingAttributes = false,
        OptionAutoCloseOnEnd = true,
        OptionFixNestedTags = true,
        OptionReadEncoding = true
    };
    htmlDoc.LoadHtml( target );

    try
    {
        // FirstOrDefault walks the selector result a single time; the previous
        // Count() followed by ToArray()[0] evaluated the query twice.
        HtmlNode firstNode = htmlDoc.DocumentNode.QuerySelectorAll( pattern ).FirstOrDefault();
        if (firstNode == null)
        {
            logInfo( "error=没有匹配的页面内容:" + s.ListUrl, s, sb );
            return null;
        }

        logInfo( "匹配列表内容成功", s, sb );
        return firstNode.OuterHtml.Trim();
    }
    catch (Exception ex)
    {
        // Deliberately best-effort: a failure while evaluating the selector is logged
        // and reported to the caller as "no body" instead of propagating.
        logInfo( "htmlDoc QuerySelectorAll解析出错=" + ex.Message, s, sb );
        return null;
    }
}
/// <summary>
/// Downloads the list page described by a spider template and narrows it to the
/// part matched by the template's list-body regular expression.
/// </summary>
/// <param name="s">Template supplying the list URL, the optional page encoding and the list-body pattern.</param>
/// <param name="sb">Builder passed through to <c>logInfo</c> together with the template.</param>
/// <returns>
/// The download result unchanged when it is null or empty; the trimmed whole page when no
/// pattern is configured; the trimmed match when the pattern matches; otherwise an empty string.
/// </returns>
private static string downloadListPageBody( SpiderTemplate s, StringBuilder sb )
{
    String target;
    if (strUtil.HasText( s.ListEncoding ))
    {
        target = PageLoader.Download( s.ListUrl, SpiderConfig.UserAgent, s.ListEncoding );
    }
    else
    {
        // An empty encoding string is passed when the template does not specify one.
        target = PageLoader.Download( s.ListUrl, SpiderConfig.UserAgent, "" );
    }

    if (strUtil.IsNullOrEmpty( target ))
    {
        logInfo( "error=原始页面没有内容: " + s.ListUrl, s, sb );
        return target;
    }

    String bodyPattern = s.GetListBodyPattern();
    if (strUtil.IsNullOrEmpty( bodyPattern ))
    {
        // A null pattern makes Regex.Match throw ArgumentNullException and an empty one
        // yields a zero-length match that would blank the page, so with no pattern
        // configured the whole page is treated as the list body.
        return target.Trim();
    }

    // Singleline lets '.' match newlines so the pattern can span several lines of HTML.
    Match match = Regex.Match( target, bodyPattern, RegexOptions.Singleline );
    if (match.Success)
    {
        return match.Value.Trim();
    }

    logInfo( "error=没有匹配的页面内容:" + s.ListUrl, s, sb );
    return "";
}