Beispiel #1
0
        /// <summary>
        /// Downloads the list page of the template and, when the template supplies a
        /// list-body pattern, reduces the page to the first node selected by that pattern.
        /// </summary>
        /// <param name="s">Template providing the list URL, the optional encoding and the optional list-body pattern.</param>
        /// <param name="sb">Log buffer passed through to <c>logInfo</c>.</param>
        /// <returns>
        /// The download result unchanged when it is null or empty; the trimmed outer HTML of the
        /// first selected node when a pattern is set; <c>null</c> when the pattern selects nothing;
        /// otherwise the whole downloaded page, trimmed.
        /// </returns>
        private static string downloadListPageBody(SpiderTemplate s, StringBuilder sb)
        {
            // An empty encoding string is handed to the loader when the template has none.
            String encoding = strUtil.HasText(s.ListEncoding) ? s.ListEncoding : "";
            String target   = PageLoader.Download(s.ListUrl, SpiderConfig.UserAgent, encoding);

            if (strUtil.IsNullOrEmpty(target))
            {
                logInfo("error=原始页面没有内容: " + s.ListUrl, s, sb);

                return(target);
            }

            // Read the pattern once; it is used both for the check and for the query.
            String pattern = s.GetListBodyPattern();
            if (strUtil.IsNullOrEmpty(pattern))
            {
                // No pattern configured: the whole page is the list body.
                return(target.Trim());
            }

            HtmlDocument htmlDoc = new HtmlDocument {
                OptionAddDebuggingAttributes = false,
                OptionAutoCloseOnEnd         = true,
                OptionFixNestedTags          = true,
                OptionReadEncoding           = true
            };
            htmlDoc.LoadHtml(target);

            // Only the first selected node is needed, so take it in a single pass instead of
            // counting the sequence and then copying all of it into an array.
            HtmlNode firstNode = htmlDoc.DocumentNode.QuerySelectorAll(pattern).FirstOrDefault();
            if (firstNode == null)
            {
                logInfo("error=没有匹配的页面内容:" + s.ListUrl, s, sb);
                return(null);
            }

            return(firstNode.OuterHtml.Trim());
        }
Beispiel #2
0
        /// <summary>
        /// Downloads the list page of the template and cuts it down to the part matched by
        /// the template's list-body regular expression.
        /// </summary>
        /// <param name="s">Template providing the list URL, the optional encoding and the list-body pattern.</param>
        /// <param name="sb">Log buffer passed through to <c>logInfo</c>.</param>
        /// <returns>
        /// The download result unchanged when it is null or empty; the trimmed regex match when
        /// the pattern matches; an empty string when it does not.
        /// </returns>
        private static string downloadListPageBody(SpiderTemplate s, StringBuilder sb)
        {
            // An empty encoding string is handed to the loader when the template has none.
            String encoding = strUtil.HasText(s.ListEncoding) ? s.ListEncoding : "";
            String page     = PageLoader.Download(s.ListUrl, SpiderConfig.UserAgent, encoding);

            if (strUtil.IsNullOrEmpty(page))
            {
                logInfo("error=原始页面没有内容: " + s.ListUrl, s, sb);
                return(page);
            }

            // Singleline lets '.' in the pattern span line breaks of the page.
            Match found = Regex.Match(page, s.GetListBodyPattern(), RegexOptions.Singleline);
            if (!found.Success)
            {
                logInfo("error=没有匹配的页面内容:" + s.ListUrl, s, sb);
                return("");
            }

            return(found.Value.Trim());
        }
Beispiel #3
0
        /// <summary>
        /// Downloads the list page of the template and, when the template supplies a
        /// list-body pattern, reduces the page to the first node selected by that pattern.
        /// Progress and failures are reported through <c>logInfo</c>.
        /// </summary>
        /// <param name="s">Template providing the list URL, the optional encoding and the optional list-body pattern.</param>
        /// <param name="sb">Log buffer passed through to <c>logInfo</c>.</param>
        /// <returns>
        /// The download result unchanged when it is null or empty; the trimmed outer HTML of the
        /// first selected node when a pattern is set; <c>null</c> when the pattern selects nothing
        /// or the query throws; otherwise the whole downloaded page, trimmed.
        /// </returns>
        private static string downloadListPageBody( SpiderTemplate s, StringBuilder sb )
        {
            // An empty encoding string is handed to the loader when the template has none.
            String encoding = strUtil.HasText( s.ListEncoding ) ? s.ListEncoding : "";
            String target = PageLoader.Download( s.ListUrl, SpiderConfig.UserAgent, encoding );

            if (strUtil.IsNullOrEmpty( target )) {
                logInfo( "error=原始页面没有内容: " + s.ListUrl, s, sb );
                return target;
            }

            logInfo( "抓取列表内容成功", s, sb );

            // Read the pattern once; it is used both for the check and for the query.
            String pattern = s.GetListBodyPattern();
            if (!strUtil.HasText( pattern )) {
                // No pattern configured: the whole page is the list body.
                return target.Trim();
            }

            HtmlDocument htmlDoc = new HtmlDocument {
                OptionAddDebuggingAttributes = false,
                OptionAutoCloseOnEnd = true,
                OptionFixNestedTags = true,
                OptionReadEncoding = true
            };

            htmlDoc.LoadHtml( target );
            try {
                // Only the first selected node is needed, so take it in a single pass instead of
                // counting the sequence and then copying all of it into an array. The enumeration
                // stays inside the try block so query errors are still caught below.
                HtmlNode firstNode = htmlDoc.DocumentNode.QuerySelectorAll( pattern ).FirstOrDefault();

                if (firstNode == null) {
                    logInfo( "error=没有匹配的页面内容:" + s.ListUrl, s, sb );
                    return null;
                }

                logInfo( "匹配列表内容成功", s, sb );
                return firstNode.OuterHtml.Trim();
            }
            catch (Exception ex) {
                // Any failure of the query is logged and reported to the caller as "no content".
                logInfo( "htmlDoc QuerySelectorAll解析出错=" + ex.Message, s, sb );
                return null;
            }
        }
Beispiel #4
0
        /// <summary>
        /// Fetches the template's list page and returns the portion matched by the
        /// template's list-body regular expression.
        /// </summary>
        /// <param name="s">Template providing the list URL, the optional encoding and the list-body pattern.</param>
        /// <param name="sb">Log buffer passed through to <c>logInfo</c>.</param>
        /// <returns>
        /// The download result unchanged when it is null or empty; the trimmed regex match when
        /// the pattern matches; an empty string when it does not.
        /// </returns>
        private static string downloadListPageBody( SpiderTemplate s, StringBuilder sb )
        {
            // Fall back to an empty encoding string when the template does not carry one.
            bool hasEncoding = strUtil.HasText( s.ListEncoding );
            String html = PageLoader.Download( s.ListUrl, SpiderConfig.UserAgent, hasEncoding ? s.ListEncoding : "" );

            if (strUtil.IsNullOrEmpty( html ) == false) {
                // Singleline lets '.' in the pattern span line breaks of the page.
                Match hit = Regex.Match( html, s.GetListBodyPattern(), RegexOptions.Singleline );
                if (hit.Success) {
                    return hit.Value.Trim();
                }

                logInfo( "error=没有匹配的页面内容:" + s.ListUrl, s, sb );
                return "";
            }

            logInfo( "error=原始页面没有内容: " + s.ListUrl, s, sb );
            return html;
        }