Esempio n. 1
0
        /// <summary>
        /// Extracts the detail-page body from <paramref name="page"/> using the regular
        /// expression returned by <c>s.GetDetailPattern()</c>, matched with
        /// <see cref="RegexOptions.Singleline"/>.
        /// </summary>
        /// <param name="page">Raw page text to search; may be null or empty.</param>
        /// <param name="s">Template that supplies the detail pattern.</param>
        /// <param name="sb">Buffer forwarded to <c>logInfo</c> when nothing matches.</param>
        /// <returns>
        /// The value of capture group 1, or <c>null</c> (after logging) when the page is
        /// null/empty or the pattern yields no non-empty match.
        /// </returns>
        protected string getMatchedBody(string page, SpiderTemplate s, StringBuilder sb)
        {
            // Regex.Match throws ArgumentNullException for a null input, and an empty page can
            // only ever produce an empty match, so both cases are routed to the "no match" path.
            Match match = string.IsNullOrEmpty(page)
                ? Match.Empty
                : Regex.Match(page, s.GetDetailPattern(), RegexOptions.Singleline);

            // Regex.Match never returns null; Success plus a non-empty value is the full check.
            if (!match.Success || string.IsNullOrEmpty(match.Value))
            {
                logInfo("error=没有匹配的页面内容:" + _url, this._url, s, sb);
                return null;
            }

            return match.Groups[1].Value;
        }
Esempio n. 2
0
        /// <summary>
        /// Extracts the detail-page body from <paramref name="page"/> using the regular
        /// expression returned by <c>s.GetDetailPattern()</c> (matched with
        /// <see cref="RegexOptions.Singleline"/>) and passes it through
        /// <c>HtmlFilter.Filter</c>.
        /// </summary>
        /// <param name="page">Raw page text to search; may be null or empty.</param>
        /// <param name="s">Template that supplies the detail pattern.</param>
        /// <param name="sb">Buffer forwarded to <c>logInfo</c> when nothing matches.</param>
        /// <returns>
        /// The filtered value of capture group 1, or <c>null</c> (after logging) when the
        /// page is null/empty or the pattern yields no non-empty match.
        /// </returns>
        protected string getMatchedBody( string page, SpiderTemplate s, StringBuilder sb )
        {
            // Regex.Match throws ArgumentNullException for a null input, and an empty page can
            // only ever produce an empty match, so both cases are routed to the "no match" path.
            Match match = string.IsNullOrEmpty( page )
                ? Match.Empty
                : Regex.Match( page, s.GetDetailPattern(), RegexOptions.Singleline );

            // Regex.Match never returns null; Success plus a non-empty value is the full check.
            if (!match.Success || string.IsNullOrEmpty( match.Value )) {
                logInfo( "error=没有匹配的页面内容:"+_url, this._url, s, sb );
                return null;
            }

            String body = match.Groups[1].Value;

            // Filter out advertisements (per the original comment on this call).
            return HtmlFilter.Filter( body );
        }
Esempio n. 3
0
        /// <summary>
        /// Extracts the detail-page content using a CSS selector: the selector returned by
        /// <c>s.GetDetailPattern()</c> is run against the document's root node and the
        /// first matching node is used.
        /// </summary>
        /// <param name="htmlDoc">Parsed HTML document to query.</param>
        /// <param name="s">Template that supplies the CSS selector.</param>
        /// <param name="sb">Buffer forwarded to <c>logInfo</c> when nothing matches.</param>
        /// <returns>
        /// The <c>OuterHtml</c> of the first matching node, or <c>null</c> (after logging)
        /// when the selector matches nothing.
        /// </returns>
        protected string getMatchedBody(HtmlDocument htmlDoc, SpiderTemplate s, StringBuilder sb)
        {
            // Take only the first match: avoids enumerating the query twice (Count + ToArray)
            // and materializing every matching node just to read element 0.
            HtmlNode firstNode = htmlDoc.DocumentNode
                .QuerySelectorAll(s.GetDetailPattern())
                .FirstOrDefault();

            if (firstNode == null)
            {
                logInfo("error=没有匹配的页面内容:" + _url, this._url, s, sb);
                return null;
            }

            return firstNode.OuterHtml;
        }
Esempio n. 4
0
 /// <summary>
 /// Extracts the detail-page content using a CSS selector: the selector returned by
 /// <c>s.GetDetailPattern()</c> is run against the document's root node and the
 /// first matching node is used.
 /// </summary>
 /// <param name="htmlDoc">Parsed HTML document to query.</param>
 /// <param name="s">Template that supplies the CSS selector.</param>
 /// <param name="sb">Buffer forwarded to <c>logInfo</c> when nothing matches.</param>
 /// <returns>
 /// The <c>OuterHtml</c> of the first matching node, or <c>null</c> (after logging)
 /// when the selector matches nothing.
 /// </returns>
 protected string getMatchedBody( HtmlDocument htmlDoc, SpiderTemplate s, StringBuilder sb )
 {
     // Take only the first match: avoids enumerating the query twice (Count + ToArray)
     // and materializing every matching node just to read element 0.
     HtmlNode firstNode = htmlDoc.DocumentNode.QuerySelectorAll( s.GetDetailPattern() ).FirstOrDefault();
     if (firstNode == null) {
         logInfo( "error=没有匹配的页面内容:" + _url, this._url, s, sb );
         return null;
     }

     return firstNode.OuterHtml;
 }