/// <summary>
/// Pulls the detail body out of a raw page using the template's regular expression.
/// </summary>
/// <param name="page">Page text to search.</param>
/// <param name="s">Template whose <c>GetDetailPattern()</c> supplies the regex pattern.</param>
/// <param name="sb">Passed through to <c>logInfo</c> when nothing matches.</param>
/// <returns>
/// The value of capture group 1, or <c>null</c> when the pattern does not match
/// or the overall match is an empty string.
/// </returns>
protected string getMatchedBody(string page, SpiderTemplate s, StringBuilder sb)
{
    string detailPattern = s.GetDetailPattern();

    // Singleline lets '.' match across line breaks.
    Match found = Regex.Match(page, detailPattern, RegexOptions.Singleline);

    bool hasContent = found != null && found.Success && !string.IsNullOrEmpty(found.Value);
    if (hasContent)
    {
        return found.Groups[1].Value;
    }

    // Log text reads "no matching page content" followed by the current URL.
    logInfo("error=没有匹配的页面内容:" + _url, this._url, s, sb);
    return null;
}
/// <summary>
/// Extracts the detail body from a raw page with the template's regular expression
/// and runs the captured text through <c>HtmlFilter.Filter</c>.
/// </summary>
/// <param name="page">Page text to search.</param>
/// <param name="s">Template whose <c>GetDetailPattern()</c> supplies the regex pattern.</param>
/// <param name="sb">Passed through to <c>logInfo</c> when nothing matches.</param>
/// <returns>
/// The filtered value of capture group 1, or <c>null</c> when the pattern does not match
/// or the overall match is an empty string.
/// </returns>
protected string getMatchedBody( string page, SpiderTemplate s, StringBuilder sb )
{
    // Singleline lets '.' match across line breaks.
    Match hit = Regex.Match( page, s.GetDetailPattern(), RegexOptions.Singleline );

    bool matched = hit != null && hit.Success && !string.IsNullOrEmpty( hit.Value );
    if (matched)
    {
        String captured = hit.Groups[1].Value;

        // Filter out ads.
        String filtered = HtmlFilter.Filter( captured );
        return filtered;
    }

    // Log text reads "no matching page content" followed by the current URL.
    logInfo( "error=没有匹配的页面内容:" + _url, this._url, s, sb );
    return null;
}
/// <summary>
/// Extracts the detail-page content using a CSS selector.
/// </summary>
/// <param name="htmlDoc">Parsed document whose <c>DocumentNode</c> is queried.</param>
/// <param name="s">Template whose <c>GetDetailPattern()</c> supplies the selector.</param>
/// <param name="sb">Passed through to <c>logInfo</c> when nothing matches.</param>
/// <returns>
/// The <c>OuterHtml</c> of the first node matched by the selector,
/// or <c>null</c> when no node matches.
/// </returns>
protected string getMatchedBody(HtmlDocument htmlDoc, SpiderTemplate s, StringBuilder sb)
{
    // Only the first hit is needed, so walk the result sequence once instead of
    // counting it and then copying every node into an array.
    HtmlNode firstNode = htmlDoc.DocumentNode
        .QuerySelectorAll(s.GetDetailPattern())
        .FirstOrDefault();

    if (firstNode == null)
    {
        // Log text reads "no matching page content" followed by the current URL.
        logInfo("error=没有匹配的页面内容:" + _url, this._url, s, sb);
        return null;
    }

    return firstNode.OuterHtml;
}
/// <summary>
/// Extracts the detail-page content using a CSS selector.
/// </summary>
/// <param name="htmlDoc">Parsed document whose <c>DocumentNode</c> is queried.</param>
/// <param name="s">Template whose <c>GetDetailPattern()</c> supplies the selector.</param>
/// <param name="sb">Passed through to <c>logInfo</c> when nothing matches.</param>
/// <returns>
/// The <c>OuterHtml</c> of the first node matched by the selector,
/// or <c>null</c> when no node matches.
/// </returns>
protected string getMatchedBody( HtmlDocument htmlDoc, SpiderTemplate s, StringBuilder sb )
{
    IEnumerable<HtmlNode> matches = htmlDoc.DocumentNode.QuerySelectorAll( s.GetDetailPattern() );

    // FirstOrDefault reads at most one element; the previous Count() + ToArray()[0]
    // enumerated the sequence twice and allocated an array for a single lookup.
    HtmlNode head = matches.FirstOrDefault();
    if (head != null)
    {
        return head.OuterHtml;
    }

    // Log text reads "no matching page content" followed by the current URL.
    logInfo( "error=没有匹配的页面内容:" + _url, this._url, s, sb );
    return null;
}