Example #1
0
        /// <summary>
        /// Builds a <see cref="DetailLink"/> from one link match of a list page, or returns
        /// null when the link must be skipped.
        /// </summary>
        /// <param name="match">Match whose group 1 is used as the url, group 2 as the title and
        /// (when the pattern defines it) group 3 as the summary.</param>
        /// <param name="s">Template providing the list url pattern and the site root url.</param>
        /// <returns>The populated link, or null when the url or title is filtered out.</returns>
        private static DetailLink getDetailLink(Match match, SpiderTemplate s)
        {
            string url   = match.Groups[1].Value;
            string title = match.Groups[2].Value;

            // The url has to satisfy the user-defined (wildcard style) list pattern.
            // IsMatch stops at the first hit instead of enumerating every match.
            if (!Regex.IsMatch(url, ParseUrl(s.ListPattern), RegexOptions.Singleline))
            {
                return null;
            }

            // Script pseudo-links and in-page anchors are not pages. URL schemes are
            // case-insensitive, so "JavaScript:" must be rejected as well.
            if (url.IndexOf("javascript:", System.StringComparison.OrdinalIgnoreCase) >= 0)
            {
                return null;
            }
            if (url.StartsWith("#", System.StringComparison.Ordinal))
            {
                return null;
            }

            // Drop any markup nested inside the anchor text.
            title = Regex.Replace(title, "<.+?>", "");
            if (strUtil.IsNullOrEmpty(title))
            {
                return null;
            }

            // "more" style navigation links are not articles.
            if (title == "更多" || title == "more" || title == "更多&gt;&gt;")
            {
                return null;
            }

            // Groups[0] is the whole match, so a third capture exists only when Count > 3.
            string summary = "";
            if (match.Groups.Count > 3)
            {
                summary = match.Groups[3].Value;
            }

            // Relative urls are resolved against the site root; "HTTP://..." counts as absolute.
            if (!url.StartsWith("http", System.StringComparison.OrdinalIgnoreCase))
            {
                url = strUtil.Join(s.SiteUrl, url);
            }

            DetailLink lnk = new DetailLink();

            lnk.Template = s;
            lnk.Url      = url;
            lnk.Title    = title;
            lnk.Abstract = summary;

            return lnk;
        }
Example #2
0
        // Downloads the detail page and parses it into an HtmlAgilityPack HtmlDocument.
        // Any exception is logged as an "error=" entry and turned into a null result.
        protected HtmlDocument getDetailPageBodyHtmlDocument( string detailUrl, SpiderTemplate template, StringBuilder sb )
        {
            try
            {
                sb.AppendLine( "抓取详细页..." + detailUrl );

                HtmlDocument parsed = new HtmlDocument();
                parsed.OptionAddDebuggingAttributes = false;
                parsed.OptionAutoCloseOnEnd = true;
                parsed.OptionFixNestedTags = true;
                parsed.OptionReadEncoding = true;

                // An empty encoding name is passed when the template does not specify one.
                String encoding = strUtil.HasText( template.DetailEncoding ) ? template.DetailEncoding : "";
                String html = PageLoader.Download( detailUrl, SpiderConfig.UserAgent, encoding );

                parsed.LoadHtml( html );
                return parsed;
            }
            catch (Exception ex)
            {
                logInfo( "error=抓取" + detailUrl + "发生错误:" + ex.Message, detailUrl, template, sb );
                return null;
            }
        }
 /// <summary>
 /// Stores the template, url and log buffer on this instance and returns whatever
 /// getPageContentEx() produces for them.
 /// </summary>
 public override String GetContent( String url, SpiderTemplate s, StringBuilder sb )
 {
     _template = s;
     _url = url;
     _log = sb;

     String content = getPageContentEx();
     return content;
 }
Example #4
0
        /// <summary>
        /// Downloads the raw html of a detail page and records the page's site root on the template.
        /// </summary>
        /// <returns>The downloaded text (possibly null/empty, which is logged), or null when
        /// an exception was thrown.</returns>
        protected string getDetailPageBody(string detailUrl, SpiderTemplate template, StringBuilder sb)
        {
            try
            {
                sb.AppendLine("抓取详细页..." + detailUrl);

                // An empty encoding name is passed when the template does not specify one.
                String encoding = strUtil.HasText(template.DetailEncoding) ? template.DetailEncoding : "";
                String html = PageLoader.Download(detailUrl, SpiderConfig.UserAgent, encoding);

                UrlInfo info = new UrlInfo(detailUrl);
                template.SiteUrl = info.SiteUrl;

                if (strUtil.IsNullOrEmpty(html))
                {
                    logInfo("error=原始页面没有内容:" + detailUrl, detailUrl, template, sb);
                }

                return html;
            }
            catch (Exception ex)
            {
                logInfo("error=抓取" + detailUrl + "发生错误:" + ex.Message, detailUrl, template, sb);
                return null;
            }
        }
Example #5
0
 /// <summary>
 /// Remembers the template, url and log buffer on this instance, then returns the result
 /// of getPageContent().
 /// </summary>
 public virtual String GetContent(String url, SpiderTemplate s, StringBuilder sb)
 {
     _template = s;
     _url = url;
     _log = sb;

     String content = getPageContent();
     return content;
 }
Example #6
0
        /// <summary>
        /// Fetches the raw html of a detail page; the template's SiteUrl is updated from the
        /// detail url as a side effect.
        /// </summary>
        /// <returns>The downloaded text (an empty result is logged), or null after an exception.</returns>
        protected string getDetailPageBody( string detailUrl, SpiderTemplate template, StringBuilder sb )
        {
            try
            {
                sb.AppendLine( "抓取详细页..." + detailUrl );

                // Default to an empty encoding name unless the template names one.
                String encoding = "";
                if (strUtil.HasText( template.DetailEncoding ))
                {
                    encoding = template.DetailEncoding;
                }

                String body = PageLoader.Download( detailUrl, SpiderConfig.UserAgent, encoding );

                template.SiteUrl = new UrlInfo( detailUrl ).SiteUrl;

                bool nothingDownloaded = strUtil.IsNullOrEmpty( body );
                if (nothingDownloaded)
                {
                    logInfo( "error=原始页面没有内容:" + detailUrl, detailUrl, template, sb );
                }

                return body;
            }
            catch (Exception ex)
            {
                logInfo( "error=抓取" + detailUrl + "发生错误:" + ex.Message, detailUrl, template, sb );
                return null;
            }
        }
Example #7
0
        /// <summary>
        /// Returns the detail links gathered for a list page; an empty list when the page is
        /// null or empty.
        /// </summary>
        public static List<DetailLink> getListItem(SpiderTemplate s, string page, StringBuilder sb)
        {
            List<DetailLink> links = new List<DetailLink>();

            if (!strUtil.IsNullOrEmpty(page))
            {
                // Link collection is delegated to SaveUrlToDB, which is handed the result list
                // (presumably it appends the links it finds - confirm against its definition).
                // The earlier inline regex loop over SpiderConfig.ListLinkPattern is disabled.
                SaveUrlToDB(page, s, links);

                logInfo("共抓取到链接:" + links.Count, s, sb);
            }

            return links;
        }
Example #8
0
        /// <summary>
        /// Resolves the spider for a template: the type named by SpiderType when it can be
        /// obtained as an ISpiderTool, otherwise the default spider.
        /// </summary>
        private ISpiderTool getSpider( SpiderTemplate s )
        {
            ISpiderTool custom = null;

            if (!strUtil.IsNullOrEmpty( s.SpiderType ))
            {
                custom = ObjectContext.GetByType( s.SpiderType ) as ISpiderTool;
            }

            return custom ?? defaultSpider;
        }
Example #9
0
        /// <summary>
        /// Creates a template for <paramref name="listUrl"/> whose list body pattern spans
        /// lazily from <paramref name="beginCode"/> to <paramref name="endCode"/> and whose
        /// link pattern is the configured default.
        /// </summary>
        private static SpiderTemplate getTemplate( String listUrl, String beginCode, String endCode )
        {
            return new SpiderTemplate
            {
                ListUrl = listUrl,
                ListBodyPattern = beginCode + ".+?" + endCode,
                ListPattern = SpiderConfig.ListLinkPattern
            };
        }
Example #10
0
        /// <summary>
        /// Builds a template for the given list url. The list body pattern is
        /// beginCode + ".+?" + endCode; the link pattern comes from SpiderConfig.
        /// </summary>
        private static SpiderTemplate getTemplate(String listUrl, String beginCode, String endCode)
        {
            String bodyPattern = String.Concat(beginCode, ".+?", endCode);

            SpiderTemplate template = new SpiderTemplate();
            template.ListUrl = listUrl;
            template.ListBodyPattern = bodyPattern;
            template.ListPattern = SpiderConfig.ListLinkPattern;
            return template;
        }
Example #11
0
        /// <summary>
        /// Downloads the list page and, when the template defines a list body selector, narrows
        /// the result to the outer html of the first node matching that selector.
        /// </summary>
        /// <param name="s">Template providing the list url, encoding and body selector.</param>
        /// <param name="sb">Log buffer passed through to logInfo.</param>
        /// <returns>
        /// The trimmed page (or matched fragment); the untouched download result when it is
        /// null/empty; null when a selector is configured but matches nothing.
        /// </returns>
        private static string downloadListPageBody(SpiderTemplate s, StringBuilder sb)
        {
            // An empty encoding name is passed when the template does not specify one.
            String encoding = strUtil.HasText(s.ListEncoding) ? s.ListEncoding : "";
            String target = PageLoader.Download(s.ListUrl, SpiderConfig.UserAgent, encoding);

            if (strUtil.IsNullOrEmpty(target))
            {
                logInfo("error=原始页面没有内容: " + s.ListUrl, s, sb);
                return target;
            }

            // Read the selector once; without one the whole page is the list body.
            String selector = s.GetListBodyPattern();
            if (strUtil.IsNullOrEmpty(selector))
            {
                return target.Trim();
            }

            HtmlDocument htmlDoc = new HtmlDocument {
                OptionAddDebuggingAttributes = false,
                OptionAutoCloseOnEnd         = true,
                OptionFixNestedTags          = true,
                OptionReadEncoding           = true
            };
            htmlDoc.LoadHtml(target);

            // FirstOrDefault enumerates the selector query a single time and stops at the
            // first node; Count() followed by ToArray() evaluated the whole query twice.
            HtmlNode listBody = htmlDoc.DocumentNode.QuerySelectorAll(selector).FirstOrDefault();
            if (listBody == null)
            {
                logInfo("error=没有匹配的页面内容:" + s.ListUrl, s, sb);
                return null;
            }

            return listBody.OuterHtml.Trim();
        }
Example #12
0
        /// <summary>
        /// Converts one link match of a list page into a <see cref="DetailLink"/>, or returns
        /// null when the link must be skipped.
        /// </summary>
        /// <param name="match">Match whose group 1 is used as the url, group 2 as the title and
        /// (when the pattern defines it) group 3 as the summary.</param>
        /// <param name="s">Template stored on the link; its SiteUrl resolves relative urls.</param>
        /// <returns>The populated link, or null when the url or title is filtered out.</returns>
        private static DetailLink getDetailLink(Match match, SpiderTemplate s)
        {
            string url   = match.Groups[1].Value;
            string title = match.Groups[2].Value;

            // Script pseudo-links and in-page anchors are not pages. URL schemes are
            // case-insensitive, so "JavaScript:" must be rejected as well.
            if (url.IndexOf("javascript:", System.StringComparison.OrdinalIgnoreCase) >= 0)
            {
                return null;
            }
            if (url.StartsWith("#", System.StringComparison.Ordinal))
            {
                return null;
            }

            // Drop any markup nested inside the anchor text.
            title = Regex.Replace(title, "<.+?>", "");
            if (strUtil.IsNullOrEmpty(title))
            {
                return null;
            }

            // "more" style navigation links are not articles.
            if (title == "更多" || title == "more" || title == "更多&gt;&gt;")
            {
                return null;
            }

            // Groups[0] is the whole match, so a third capture exists only when Count > 3.
            string summary = "";
            if (match.Groups.Count > 3)
            {
                summary = match.Groups[3].Value;
            }

            // Relative urls are resolved against the site root; "HTTP://..." counts as absolute.
            if (!url.StartsWith("http", System.StringComparison.OrdinalIgnoreCase))
            {
                url = strUtil.Join(s.SiteUrl, url);
            }

            DetailLink lnk = new DetailLink();

            lnk.Template = s;
            lnk.Url      = url;
            lnk.Title    = title;
            lnk.Abstract = summary;

            return lnk;
        }
Example #13
0
        /// <summary>
        /// Applies the template's detail pattern to the page and returns capture group 1.
        /// </summary>
        /// <returns>Group 1 of the match, or null (after logging) when nothing usable matched.</returns>
        protected string getMatchedBody(string page, SpiderTemplate s, StringBuilder sb)
        {
            Match hit = Regex.Match(page, s.GetDetailPattern(), RegexOptions.Singleline);

            bool usable = hit != null && hit.Success && !string.IsNullOrEmpty(hit.Value);
            if (usable)
            {
                return hit.Groups[1].Value;
            }

            logInfo("error=没有匹配的页面内容:" + _url, this._url, s, sb);
            return null;
        }
Example #14
0
        /// <summary>
        /// Downloads the template's list page and returns the detail links extracted from it.
        /// When ListUrl has text, the template's SiteUrl is derived from it first.
        /// </summary>
        public static List<DetailLink> GetDataList( SpiderTemplate s, StringBuilder sb )
        {
            if (strUtil.HasText( s.ListUrl ))
            {
                UrlInfo listUrlInfo = new UrlInfo( s.ListUrl );
                s.SiteUrl = listUrlInfo.SiteUrl;
            }

            // Step 1: fetch the list page content.
            string listPage = downloadListPage( s, sb );

            // Step 2: collect the title and url of every article.
            return getListItem( s, listPage, sb );
        }
Example #15
0
        /// <summary>
        /// Runs one crawl pass: builds a template from each of the first 1000 users (ordered by
        /// Hits, then id, descending) that has a profile address, crawls it, and finally stores
        /// every "error=" log line as a SpiderLog record.
        /// </summary>
        public void Execute()
        {
            // The former source of templates, SpiderTemplate.find( "IsDelete=0" ), is disabled;
            // templates are now assembled from user records below.
            DbContext.closeConnectionAll();

            StringBuilder log = new StringBuilder();

            IList rankedUsers = User.find("order by Hits desc, id desc").list(1000);
            logger.Info("begin SpiderJob=" + rankedUsers.Count);

            foreach (User user in rankedUsers)
            {
                if (string.IsNullOrEmpty(user.Profile.Address))
                {
                    continue;
                }

                // The crawl settings are read from these user/profile fields.
                SpiderTemplate s = new SpiderTemplate
                {
                    ListUrl = user.Profile.Address,
                    ListEncoding = user.QQ,
                    ListBodyPattern = user.Profile.Tel,
                    ListPattern = user.Profile.WebSite,
                    DetailPattern = user.MSN,
                    IsDelete = user.Id,
                    SiteName = user.Url
                };

                // Pause bounds come from SpiderConfig (original note: 2~6 seconds).
                int[] pauseRange = new int[] { SpiderConfig.SuspendFrom, SpiderConfig.SuspendTo };
                getSpider(s).DownloadPage(s, log, pauseRange);
                DbContext.closeConnectionAll();

                Thread.Sleep(rd.Next(SpiderConfig.SuspendFrom, SpiderConfig.SuspendTo));
            }

            // Keep only the lines that were logged as errors.
            StringBuilder errorLog = new StringBuilder();
            foreach (String rawLine in log.ToString().Split( '\n' ))
            {
                String line = rawLine.Trim();
                if (line.StartsWith( "error=" ))
                {
                    errorLog.AppendLine( line );
                }
            }

            SpiderLog record = new SpiderLog();
            record.Msg = errorLog.ToString();
            record.insert();
            DbContext.closeConnectionAll();
        }
Example #16
0
        /// <summary>
        /// Fetches every url returned by getPagedUrl for this page and concatenates the
        /// contents, each preceded by an &lt;hr&gt; line.
        /// </summary>
        private string getPagedContent(string page, string url, SpiderTemplate s, StringBuilder sb)
        {
            StringBuilder merged = new StringBuilder();

            foreach (String pagedUrl in getPagedUrl(page, url))
            {
                merged.AppendLine("<hr>");

                String part = new DetailSpider().GetContent(pagedUrl, s, sb);
                merged.Append(part);
            }

            return merged.ToString();
        }
Example #17
0
 // Intended to extract the detail-page content with a CSS selector.
 // The selector-based implementation is currently disabled, so this overload ignores its
 // arguments and always returns an empty string.
 protected string getMatchedBody(HtmlDocument htmlDoc, SpiderTemplate s, StringBuilder sb)
 {
     // Disabled logic, for reference: run QuerySelectorAll( s.GetDetailPattern() ) on
     // htmlDoc.DocumentNode and return the first node's OuterHtml; when nothing matched,
     // log an "error=" entry for _url and return null.
     string extracted = "";
     return extracted;
 }
Example #18
0
        /// <summary>
        /// Crawls two live list pages and expects more than one link from each.
        /// </summary>
        public void testSpider()
        {
            StringBuilder log = new StringBuilder();

            // First site: news.163.com
            SpiderTemplate firstTemplate = getTemplate("http://news.163.com", "<div class=\"content\" style=\"zoom:1;\">", "<h2>图片新闻</h2>");
            List<DetailLink> firstLinks = SpiderTool.GetDataList(firstTemplate, log);
            Assert.Greater(firstLinks.Count, 1);

            // Second site: women.sohu.com
            SpiderTemplate secondTemplate = getTemplate("http://women.sohu.com/love-story/", "<div class=\"f14list\">", "<div class=\"pages\">");
            List<DetailLink> secondLinks = SpiderTool.GetDataList(secondTemplate, log);
            Assert.Greater(secondLinks.Count, 1);
        }
Example #19
0
        /// <summary>
        /// Applies the template's detail pattern to the page and returns capture group 1 after
        /// running it through HtmlFilter.Filter.
        /// </summary>
        /// <returns>The filtered fragment, or null (after logging) when nothing usable matched.</returns>
        protected string getMatchedBody( string page, SpiderTemplate s, StringBuilder sb )
        {
            Match hit = Regex.Match( page, s.GetDetailPattern(), RegexOptions.Singleline );

            bool usable = hit != null && hit.Success && !string.IsNullOrEmpty( hit.Value );
            if (!usable)
            {
                logInfo( "error=没有匹配的页面内容:"+_url, this._url, s, sb );
                return null;
            }

            // Filter out advertisements.
            String filtered = HtmlFilter.Filter( hit.Groups[1].Value );
            return filtered;
        }
Example #20
0
        /// <summary>
        /// Exception-safe wrapper around downloadListPageBody: a failure is logged as an
        /// "error=" entry and reported to the caller as null.
        /// </summary>
        private static string downloadListPage(SpiderTemplate s, StringBuilder sb)
        {
            try
            {
                return downloadListPageBody(s, sb);
            }
            catch (Exception ex)
            {
                logInfo("error=抓取" + s.ListUrl + "发生错误:" + ex.Message, s, sb);
                return null;
            }
        }
Example #21
0
        /// <summary>
        /// Downloads the template's list page and returns the detail links extracted from it.
        /// </summary>
        /// <param name="s">Template to crawl; its SiteUrl is derived from ListUrl when that has text.</param>
        /// <param name="sb">Log buffer passed on to the download and extraction steps.</param>
        /// <returns>The list produced by getListItem for the downloaded page.</returns>
        public static List<DetailLink> GetDataList( SpiderTemplate s, StringBuilder sb )
        {
            if (strUtil.HasText( s.ListUrl ))
            {
                s.SiteUrl = new UrlInfo( s.ListUrl ).SiteUrl;
            }

            // Step 1: fetch the list page content.
            string page = downloadListPage( s, sb );
            if (strUtil.IsNullOrEmpty( page ))
            {
                // Report the url that was actually requested; SiteUrl is only the site root
                // and is not refreshed at all when ListUrl is blank.
                logger.Error( "list page is empty, url=" + s.ListUrl );
            }

            // Step 2: collect the title and url of every article.
            List<DetailLink> list = getListItem( s, page, sb );
            return list;
        }
Example #22
0
        /// <summary>
        /// Fetches the list page described by the template and returns its detail links.
        /// SiteUrl is refreshed from ListUrl beforehand when ListUrl has text.
        /// </summary>
        public static List<DetailLink> GetDataList(SpiderTemplate s, StringBuilder sb)
        {
            bool hasListUrl = strUtil.HasText(s.ListUrl);
            if (hasListUrl)
            {
                UrlInfo parsed = new UrlInfo(s.ListUrl);
                s.SiteUrl = parsed.SiteUrl;
            }

            // 1) fetch the list page, 2) pull out the title and url of each article
            string listPage = downloadListPage(s, sb);
            return getListItem(s, listPage, sb);
        }
Example #23
0
        /// <summary>
        /// Calls downloadListPageBody and converts any exception into a logged "error=" entry
        /// plus a null return value.
        /// </summary>
        private static string downloadListPage( SpiderTemplate s, StringBuilder sb )
        {
            string listPage;

            try
            {
                listPage = downloadListPageBody( s, sb );
            }
            catch (Exception ex)
            {
                logInfo( "error=抓取" + s.ListUrl + "发生错误:" + ex.Message, s, sb );
                listPage = null;
            }

            return listPage;
        }
Example #24
0
        /// <summary>
        /// Picks the spider for a template: the object obtained for SpiderType when it is an
        /// ISpiderTool, and the default spider in every other case.
        /// </summary>
        private ISpiderTool getSpider(SpiderTemplate s)
        {
            ISpiderTool resolved = strUtil.IsNullOrEmpty(s.SpiderType)
                ? null
                : ObjectContext.GetByType(s.SpiderType) as ISpiderTool;

            return resolved != null ? resolved : defaultSpider;
        }
Example #25
0
        /// <summary>
        /// Removes the tags listed in the template's DetailClearTag (comma separated) from
        /// <paramref name="input"/>.
        /// </summary>
        /// <remarks>
        /// font/span/a are passed to RegPattern.ReplaceHtml with false, all other tags with true;
        /// per the original notes, true also removes the tag's inner content while false strips
        /// the tag only. Tag names are trimmed and blank entries skipped, so "div, span" or a
        /// trailing comma are handled as intended.
        /// </remarks>
        /// <param name="input">Html to clean.</param>
        /// <param name="spiderTemplate">Template whose DetailClearTag lists the tags.</param>
        /// <returns>The cleaned html, or the input unchanged when no tags are configured.</returns>
        private string filterPage(string input, SpiderTemplate spiderTemplate)
        {
            if (strUtil.IsNullOrEmpty(spiderTemplate.DetailClearTag))
            {
                return input;
            }

            // Invariant lower-casing keeps tag names stable under cultures such as tr-TR.
            String[] arrTag = spiderTemplate.DetailClearTag.ToLowerInvariant().Split(',');

            List<String> rTag = new List<String>();

            logger.Info("filterTag, input=" + input);

            // First pass: tags removed together with their content.
            foreach (String rawTag in arrTag)
            {
                String tag = rawTag.Trim();
                if (tag.Length == 0)
                {
                    continue;
                }

                // font/span/a lose only the tag itself; they are handled in the second pass.
                if (tag == "font" || tag == "span" || tag == "a")
                {
                    rTag.Add(tag);
                    continue;
                }

                logger.Info("tag=" + tag);

                input = RegPattern.ReplaceHtml(input, tag, true);
            }

            logger.Info("filterTag, clear tag1=" + input);

            // Second pass: strip the tag but keep its content.
            foreach (String tag in rTag)
            {
                logger.Info("tag=" + tag);
                input = RegPattern.ReplaceHtml(input, tag, false);
            }

            logger.Info("filterTag, clear tag2=" + input);

            return input;
        }
Example #26
0
        /// <summary>
        /// Extracts the detail links of a list page using the template's ListPattern.
        /// The number of raw matches is appended to the log before filtering.
        /// </summary>
        public static List<DetailLink> getListItem( SpiderTemplate s, string page, StringBuilder sb )
        {
            List<DetailLink> links = new List<DetailLink>();
            if (strUtil.IsNullOrEmpty( page ))
            {
                return links;
            }

            MatchCollection found = Regex.Matches( page, s.ListPattern, RegexOptions.Singleline );
            sb.AppendLine( "共抓取到链接:" + found.Count );

            // Walk the matches from last to first.
            int index = found.Count;
            while (--index >= 0)
            {
                DetailLink candidate = getDetailLink( found[index], s );

                // Keep links that passed the filters and whose url is at most 100 characters.
                if (candidate != null && candidate.Url.Length <= 100)
                {
                    links.Add( candidate );
                }
            }

            return links;
        }
Example #27
0
        /// <summary>
        /// Crawls one template: collects the links of its list page and saves each detail
        /// page, sleeping a random time within [arrSleep[0], arrSleep[1]) after every link.
        /// </summary>
        public void DownloadPage( SpiderTemplate s, StringBuilder log, int[] arrSleep )
        {
            String banner = "抓取列表页..." + s.SiteName + "_" + s.ListUrl;
            logger.Info( banner );
            log.AppendLine( banner );

            List<DetailLink> links = GetDataList( s, log );

            foreach (DetailLink link in links)
            {
                savePageDetail( link, log );

                // Pause between requests; TODO: make this configurable.
                Thread.Sleep( rd.Next( arrSleep[0], arrSleep[1] ) );
            }

            log.AppendLine( "抓取完毕。" );
        }
Example #28
0
        /// <summary>
        /// Downloads everything a template points at: the list page links are gathered and
        /// each one is saved, with a random pause taken from arrSleep after every link.
        /// </summary>
        public void DownloadPage(SpiderTemplate s, StringBuilder log, int[] arrSleep)
        {
            String startMessage = "抓取列表页..." + s.SiteName + "_" + s.ListUrl;

            logger.Info(startMessage);
            log.AppendLine(startMessage);

            List<DetailLink> pending = GetDataList(s, log);

            for (int i = 0; i < pending.Count; i++)
            {
                savePageDetail(pending[i], log);

                // Pause for a few moments; TODO: make this configurable.
                int pauseMs = rd.Next(arrSleep[0], arrSleep[1]);
                Thread.Sleep(pauseMs);
            }

            log.AppendLine("抓取完毕。");
        }
        /// <summary>
        /// Test-fetches one detail page with the posted pattern and encoding. Echoes the
        /// extracted body, or a diagnostic report when the crawl log contains "error=".
        /// </summary>
        public void GetDetail()
        {
            String detailUrl = ctx.Post("detailUrl");

            // The pattern is posted as a whole (the begin/end-code variant is disabled).
            SpiderTemplate template = new SpiderTemplate();
            template.DetailPattern = ctx.PostHtmlAll("DetailPattern");

            logger.Info("DetailPattern=" + template.DetailPattern);

            template.DetailEncoding = ctx.Post("detailEncoding");
            template.IsSavePic = 0;

            StringBuilder log = new StringBuilder();
            string body = new PagedDetailSpider().GetContent(detailUrl, template, log);

            bool failed = log.ToString().IndexOf("error=") >= 0;
            if (!failed)
            {
                echoText(body);
                return;
            }

            StringBuilder report = new StringBuilder();
            report.AppendLine("detailUrl=" + detailUrl);
            report.AppendLine("detailPattern=" + template.DetailPattern);
            report.Append(log);
            echoText(report.ToString());
        }
        /// <summary>
        /// Test-crawls a list page with the posted template settings. Renders the links as
        /// json, or a json object with IsValid=false and the patterns used when none were found.
        /// </summary>
        public void GetList()
        {
            SpiderTemplate template = ctx.PostValue<SpiderTemplate>();

            // Patterns are posted as a whole (the begin/end-code variant is disabled).
            String bodyPattern = ctx.PostHtmlAll("ListBodyPattern");
            String linkPattern = ctx.PostHtmlAll("ListPattern");

            template.ListBodyPattern = bodyPattern;
            // Fall back to the configured default when no link pattern was posted.
            template.ListPattern = strUtil.IsNullOrEmpty(linkPattern) ? SpiderConfig.ListLinkPattern : linkPattern;
            template.ListEncoding = ctx.Post("listEncoding");

            StringBuilder log = new StringBuilder();
            List<DetailLink> links = SpiderTool.GetDataList(template, log);

            if (links.Count != 0)
            {
                renderJson(links);
                return;
            }

            Dictionary<String, Object> failure = new Dictionary<String, Object>
            {
                { "IsValid", false },
                { "listUrl", template.ListUrl },
                { "patternBody", template.ListBodyPattern },
                { "patternLinks", template.ListPattern }
            };

            echoJson(JsonString.Convert(failure));
        }
        /// <summary>
        /// Prepares the template editor view: the template as json (or "{Id:0}" when id is not
        /// positive) plus the action links and the default link pattern.
        /// </summary>
        public void SetTemplate(int id)
        {
            target(GetList);

            String templateJson = "{Id:0}";
            if (id > 0)
            {
                SpiderTemplate s = templateService.GetById(id);
                // Thanks to sgzwiz (http://www.wojilu.com/sgzwiz) for contributing this code.
                templateJson = JsonString.ConvertObject(s).Replace("<", "&lt;").Replace(">", "&gt;");
            }
            set("objTemplate", templateJson);

            set("detailAction", to(GetDetail));
            set("saveLink", to(Save));

            set("listPattern", SpiderConfig.ListLinkPattern);
            set("returnUrl", to(List));
        }
        /// <summary>
        /// Starts a refresh of the given template on a new thread, handing it the template and
        /// a fresh log buffer keyed by the current viewer's id.
        /// </summary>
        public void DoRefresh(int id)
        {
            if (id <= 0)
            {
                echoRedirect("请先选择模板");
                return;
            }

            set("processLink", to(Process, id));

            SpiderTemplate template = templateService.GetById(id);

            TemplateAndLog payload = new TemplateAndLog();
            payload.Template = template;
            payload.log = LogCacher.GetNewSpiderLog("log" + ctx.viewer.Id);

            Thread worker = new Thread(beginRefresh);
            worker.Start(payload);
        }
Example #33
0
        /// <summary>
        /// Extracts the detail links of a list page using SpiderConfig.ListLinkPattern and
        /// appends the number of accepted links to the log.
        /// </summary>
        public static List<DetailLink> getListItem( SpiderTemplate s, string page, StringBuilder sb )
        {
            List<DetailLink> links = new List<DetailLink>();
            if (strUtil.IsNullOrEmpty( page ))
            {
                return links;
            }

            // Collect every link on the page.
            MatchCollection found = Regex.Matches( page, SpiderConfig.ListLinkPattern, RegexOptions.Singleline );
            if (found.Count == 0)
            {
                logger.Error( "list link match count=0" );
            }

            // Walk the matches from last to first.
            int index = found.Count;
            while (--index >= 0)
            {
                DetailLink candidate = getDetailLink( found[index], s );

                // Keep links that passed the filters and whose url is at most 100 characters.
                if (candidate != null && candidate.Url.Length <= 100)
                {
                    links.Add( candidate );
                }
            }

            sb.AppendLine( "共抓取到链接:" + links.Count );
            return links;
        }
Example #34
0
        /// <summary>
        /// Downloads the detail page behind a link and stores it as a SpiderArticle. Nothing
        /// is stored when the page already exists or no content could be obtained.
        /// </summary>
        private static void savePageDetail(DetailLink lnk, StringBuilder sb)
        {
            SpiderTemplate template = lnk.Template;
            string pageUrl = lnk.Url;
            string pageTitle = lnk.Title;
            string pageSummary = lnk.Abstract;

            if (isPageExist(pageUrl, sb))
            {
                return;
            }

            String body = new PagedDetailSpider().GetContent(pageUrl, template, sb);
            if (body == null)
            {
                return;
            }

            SpiderArticle article = new SpiderArticle
            {
                Title = pageTitle,
                Url = strUtil.SubString(pageUrl, 200),
                Abstract = pageSummary,
                Body = body,
                SpiderTemplate = template
            };

            // The first image found in the body, if any, is recorded as the article picture.
            Match firstImage = Regex.Match(body, RegPattern.Img, RegexOptions.Singleline);
            if (firstImage.Success)
            {
                article.IsPic = 1;
                article.PicUrl = firstImage.Groups[1].Value;
            }

            article.insert();

            sb.AppendLine("保存成功..." + lnk.Title + "_" + lnk.Url);
        }
Example #35
0
        /// <summary>
        /// Runs one crawl pass over every template with IsDelete=0, pausing between templates,
        /// and stores the "error=" lines of the combined log as a SpiderLog record.
        /// </summary>
        public void Execute()
        {
            List<SpiderTemplate> templates = SpiderTemplate.find("IsDelete=0").list();
            DbContext.closeConnectionAll();

            logger.Info("begin SpiderJob=" + templates.Count);

            StringBuilder log = new StringBuilder();

            foreach (SpiderTemplate template in templates)
            {
                // Pause bounds come from SpiderConfig (original note: 2~6 seconds).
                int[] pauseRange = new int[] { SpiderConfig.SuspendFrom, SpiderConfig.SuspendTo };
                getSpider(template).DownloadPage(template, log, pauseRange);
                DbContext.closeConnectionAll();

                Thread.Sleep(rd.Next(SpiderConfig.SuspendFrom, SpiderConfig.SuspendTo));
            }

            // Keep only the lines that were logged as errors.
            StringBuilder errorLog = new StringBuilder();
            foreach (String rawLine in log.ToString().Split('\n'))
            {
                String line = rawLine.Trim();
                if (line.StartsWith("error="))
                {
                    errorLog.AppendLine(line);
                }
            }

            SpiderLog record = new SpiderLog();
            record.Msg = errorLog.ToString();
            record.insert();
            DbContext.closeConnectionAll();
        }
        /// <summary>
        /// Moves the posted template up or down in the ordering according to the posted
        /// "cmd" value; any other command yields an error response.
        /// </summary>
        public virtual void SaveSort()
        {
            int id = ctx.PostInt("id");
            String cmd = ctx.Post("cmd");

            SpiderTemplate s = templateService.GetById(id);
            List<SpiderTemplate> list = templateService.GetAll();

            switch (cmd)
            {
                case "up":
                    new SortUtil<SpiderTemplate>(s, list).MoveUp();
                    echoRedirect("ok");
                    break;

                case "down":
                    new SortUtil<SpiderTemplate>(s, list).MoveDown();
                    echoRedirect("ok");
                    break;

                default:
                    echoError(lang("exUnknowCmd"));
                    break;
            }
        }
        /// <summary>
        /// Removes the tags listed in the template's DetailClearTag (comma separated) from
        /// <paramref name="input"/>.
        /// </summary>
        /// <remarks>
        /// font/span/a are passed to RegPattern.ReplaceHtml with false, all other tags with true;
        /// per the original notes, true also removes the tag's inner content while false strips
        /// the tag only. Tag names are trimmed and blank entries skipped, so "div, span" or a
        /// trailing comma are handled as intended.
        /// </remarks>
        /// <param name="input">Html to clean.</param>
        /// <param name="spiderTemplate">Template whose DetailClearTag lists the tags.</param>
        /// <returns>The cleaned html, or the input unchanged when no tags are configured.</returns>
        private string filterPage( string input, SpiderTemplate spiderTemplate )
        {
            if (strUtil.IsNullOrEmpty( spiderTemplate.DetailClearTag ))
            {
                return input;
            }

            // Invariant lower-casing keeps tag names stable under cultures such as tr-TR.
            String[] arrTag = spiderTemplate.DetailClearTag.ToLowerInvariant().Split( ',' );

            List<String> rTag = new List<String>();

            logger.Info( "filterTag, input=" + input );

            // First pass: tags removed together with their content.
            foreach (String rawTag in arrTag)
            {
                String tag = rawTag.Trim();
                if (tag.Length == 0)
                {
                    continue;
                }

                // font/span/a lose only the tag itself; they are handled in the second pass.
                if (tag == "font" || tag == "span" || tag == "a")
                {
                    rTag.Add( tag );
                    continue;
                }

                logger.Info( "tag=" + tag );

                input = RegPattern.ReplaceHtml( input, tag, true );
            }

            logger.Info( "filterTag, clear tag1=" + input );

            // Second pass: strip the tag but keep its content.
            foreach (String tag in rTag)
            {
                logger.Info( "tag=" + tag );
                input = RegPattern.ReplaceHtml( input, tag, false );
            }

            logger.Info( "filterTag, clear tag2=" + input );

            return input;
        }
Example #38
0
        /// <summary>
        /// Returns the detail links gathered for a list page, or an empty list when the page
        /// is null or empty.
        /// </summary>
        public static List<DetailLink> getListItem(SpiderTemplate s, string page, StringBuilder sb)
        {
            List<DetailLink> collected = new List<DetailLink>();

            bool hasPage = !strUtil.IsNullOrEmpty( page );
            if (hasPage)
            {
                // SaveUrlToDB is given the page, the template and the result list (presumably
                // it fills the list - confirm against its definition). The previous inline
                // regex loop over SpiderConfig.ListLinkPattern is disabled.
                SaveUrlToDB(page, s, collected);

                logInfo( "共抓取到链接:" + collected.Count, s, sb );
            }

            return collected;
        }
        /// <summary>
        /// Fetches a single detail page using the posted url, pattern and encoding, then
        /// echoes either the extracted body or, if the crawl log holds an "error=" entry,
        /// the url, pattern and full log.
        /// </summary>
        public void GetDetail()
        {
            String targetUrl = ctx.Post( "detailUrl" );

            SpiderTemplate s = new SpiderTemplate();

            // The complete pattern is posted (the begin/end-code variant is disabled).
            String postedPattern = ctx.PostHtmlAll( "DetailPattern" );
            s.DetailPattern = postedPattern;
            logger.Info( "DetailPattern=" + s.DetailPattern );

            String postedEncoding = ctx.Post( "detailEncoding" );
            s.DetailEncoding = postedEncoding;
            s.IsSavePic = 0;

            StringBuilder crawlLog = new StringBuilder();
            string extracted = new PagedDetailSpider().GetContent( targetUrl, s, crawlLog );

            String logText = crawlLog.ToString();
            if (logText.IndexOf( "error=" ) < 0)
            {
                echoText( extracted );
            }
            else
            {
                StringBuilder diagnostics = new StringBuilder();
                diagnostics.AppendLine( "detailUrl=" + targetUrl );
                diagnostics.AppendLine( "detailPattern=" + s.DetailPattern );
                diagnostics.Append( crawlLog );
                echoText( diagnostics.ToString() );
            }
        }
 /// <summary>
 /// Queries spider templates with only an ordering clause: OrderId descending, then Id ascending.
 /// </summary>
 /// <returns>The list produced by the query.</returns>
 public List<SpiderTemplate> GetAll()
 {
     List<SpiderTemplate> templates = SpiderTemplate.find( "order by OrderId desc, Id asc" ).list();
     return templates;
 }
 /// <summary>
 /// Delegates to the template's own <c>update()</c> method.
 /// </summary>
 /// <param name="s">Template to update.</param>
 public void Update( SpiderTemplate s )
 {
     s.update();
 }
 /// <summary>
 /// Delegates to the template's own <c>insert()</c> method.
 /// </summary>
 /// <param name="s">Template to insert.</param>
 public void Insert( SpiderTemplate s )
 {
     s.insert();
 }
 /// <summary>
 /// Looks up a spider template by its id via <c>SpiderTemplate.findById</c>.
 /// </summary>
 /// <param name="id">Template id.</param>
 /// <returns>Whatever <c>findById</c> returns for that id.</returns>
 public SpiderTemplate GetById( int id )
 {
     SpiderTemplate found = SpiderTemplate.findById( id );
     return found;
 }
Example #44
0
        /// <summary>
        /// Downloads the template's list page and cuts out the part matched by the
        /// template's list-body regular expression.
        /// </summary>
        /// <param name="s">Template supplying the list URL, encoding and body pattern.</param>
        /// <param name="sb">Log buffer; failures are recorded with an "error=" prefix.</param>
        /// <returns>
        /// The trimmed matched text; an empty string when the pattern does not match; or the
        /// empty download result itself when nothing was downloaded.
        /// </returns>
        private static string downloadListPageBody( SpiderTemplate s, StringBuilder sb )
        {
            // An empty encoding name is passed when the template does not specify one.
            String encoding = strUtil.HasText( s.ListEncoding ) ? s.ListEncoding : "";
            String html = PageLoader.Download( s.ListUrl, SpiderConfig.UserAgent, encoding );

            if (strUtil.IsNullOrEmpty( html )) {
                logInfo( "error=原始页面没有内容: " + s.ListUrl, s, sb );
                return html;
            }

            Match bodyMatch = Regex.Match( html, s.GetListBodyPattern(), RegexOptions.Singleline );
            if (!bodyMatch.Success) {
                logInfo( "error=没有匹配的页面内容:" + s.ListUrl, s, sb );
                return "";
            }

            return bodyMatch.Value.Trim();
        }
Example #45
0
        /// <summary>
        /// Converts one list-page regex match into a <c>DetailLink</c>, or returns null when
        /// the match should be skipped.
        /// </summary>
        /// <remarks>
        /// Group 1 is read as the URL, group 2 as the title and group 3 as the summary.
        /// A match is skipped when the URL does not satisfy the template's list pattern, is a
        /// javascript: or fragment-only link, or when the tag-stripped title is empty or one of
        /// the "more" placeholders.
        /// </remarks>
        /// <param name="match">Match produced by the list-link pattern.</param>
        /// <param name="s">Template supplying the list pattern and the site URL for relative links.</param>
        /// <returns>The populated link, or null when the match is filtered out.</returns>
        private static DetailLink getDetailLink( Match match, SpiderTemplate s )
        {
            string url = match.Groups[1].Value;
            string title = match.Groups[2].Value;

            // Check whether the URL satisfies the user-defined wildcard pattern.
            if (!Regex.IsMatch( url, ParseUrl( s.ListPattern ), RegexOptions.Singleline )) {
                return null;
            }

            // URL schemes are case-insensitive, so compare ordinally and ignore case.
            if (url.IndexOf( "javascript:", System.StringComparison.OrdinalIgnoreCase ) >= 0) return null;
            if (url.StartsWith( "#", System.StringComparison.Ordinal )) return null;

            // Strip any markup nested inside the anchor text.
            title = Regex.Replace( title, "<.+?>", "" );
            if (strUtil.IsNullOrEmpty( title )) return null;
            if (title == "更多") return null;
            if (title == "more") return null;
            if (title == "更多&gt;&gt;") return null;

            // The Groups indexer yields an empty group (Value == "") for a group that does not
            // exist or did not participate, so no count guard is needed here.
            string summary = match.Groups[3].Value;

            // Anything that does not start with "http" is joined onto the site URL.
            if (!url.StartsWith( "http", System.StringComparison.OrdinalIgnoreCase )) {
                url = strUtil.Join( s.SiteUrl, url );
            }

            DetailLink lnk = new DetailLink();
            lnk.Template = s;
            lnk.Url = url;
            lnk.Title = title;
            lnk.Abstract = summary;

            return lnk;
        }
Example #46
0
        /// <summary>
        /// Creates or updates a spider template from the posted form.
        /// </summary>
        /// <remarks>
        /// A posted "tid" greater than zero selects an existing template to update; otherwise a
        /// new one is inserted. The list URL is the only validated field. The method answers
        /// with an error echo on validation failure or unknown template id, and with an ajax
        /// OK echo on success.
        /// </remarks>
        public void Save()
        {
            int templateId = ctx.PostInt( "tid" );
            bool isUpdate = templateId > 0;

            String listUrl = ctx.Post( "listUrl" );
            String listBodyPattern = ctx.PostHtmlAll( "ListBodyPattern" );
            String listPattern = ctx.PostHtmlAll( "ListPattern" );
            String detailPattern = ctx.PostHtmlAll( "DetailPattern" );

            if (strUtil.IsNullOrEmpty( listUrl )) {
                errors.Add( "请填写列表页的网址" );
            }

            if (ctx.HasErrors) {
                echoError();
                return;
            }

            // Start from a blank template and swap in the stored one when editing.
            SpiderTemplate s = new SpiderTemplate();
            if (isUpdate) {
                s = templateService.GetById( templateId );
            }

            if (s == null) {
                echoError( "采集模板不存在" );
                return;
            }

            s.ListUrl = listUrl;
            s.ListPattern = listPattern;
            s.ListBodyPattern = listBodyPattern;
            s.DetailPattern = detailPattern;
            s.SiteName = ctx.Post( "siteName" );

            s.ListEncoding = ctx.Post( "listEncoding" );
            s.DetailEncoding = ctx.Post( "detailEncoding" );

            // The checkbox is stored as a 0/1 flag.
            s.IsSavePic = cvt.ToBool( ctx.Post( "checkPic" ) ) ? 1 : 0;

            if (isUpdate) {
                templateService.Update( s );
            }
            else {
                templateService.Insert( s );
            }

            echoAjaxOk();
        }
Example #47
0
 /// <summary>
 /// Delegates to the template's own <c>insert()</c> method.
 /// </summary>
 /// <param name="s">Template to insert.</param>
 public void Insert(SpiderTemplate s)
 {
     s.insert();
 }
Example #48
0
        /// <summary>
        /// Downloads the detail page behind a link, stores it as a <c>SpiderArticle</c>, and
        /// additionally adds it as a <c>BlogPost</c> with an appended "original link" footer.
        /// </summary>
        /// <remarks>
        /// Nothing is saved when <c>isPageExist</c> reports the URL as already present or when
        /// the spider returns a null body. The template's <c>IsDelete</c> value is used here as
        /// the AppId filter for the category lookup and as the owner, app and creator id of the
        /// blog post. NOTE(review): presumably it has been repurposed to carry a user id;
        /// confirm against the caller that builds the template.
        /// </remarks>
        /// <param name="lnk">Link carrying the template, URL, title and abstract.</param>
        /// <param name="sb">Log buffer passed to the spider; receives the success message.</param>
        private static void savePageDetail(DetailLink lnk, StringBuilder sb)
        {
            SpiderTemplate template = lnk.Template;
            string         url      = lnk.Url;
            string         title    = lnk.Title;
            string         summary  = lnk.Abstract;

            if (isPageExist(url, sb))
            {
                return;
            }

            String pageBody = new PagedDetailSpider().GetContent(url, template, sb);

            if (pageBody == null)
            {
                return;
            }

            SpiderArticle pd = new SpiderArticle();

            pd.Title          = title;
            pd.Url            = strUtil.SubString(url, 250);
            pd.Abstract       = summary;
            pd.Body           = pageBody;
            pd.SpiderTemplate = template;

            // The first image found in the body, if any, becomes the article picture.
            MatchCollection matchs = Regex.Matches(pageBody, RegPattern.Img, RegexOptions.Singleline);

            if (matchs.Count > 0)
            {
                pd.IsPic  = 1;
                pd.PicUrl = matchs[0].Groups[1].Value;
            }

            pd.insert();

            sb.AppendLine("保存成功..." + lnk.Title + "_" + lnk.Url);

            // The blog copy drops every "font-size" token and gets a link back to the source.
            pageBody = Regex.Replace(pageBody, "font-size", "", RegexOptions.IgnoreCase);

            // Quote the href value and escape embedded double quotes so URLs containing '=',
            // whitespace or '>' cannot break out of the attribute.
            string hrefValue      = (pd.Url ?? "").Replace("\"", "&quot;");
            string strArcitleLink = "<div class=\"ArcitleLink\"><a href=\"" + hrefValue + "\">原文链接</a></div>";

            pageBody = pageBody + strArcitleLink;

            // Use the first category whose AppId equals the template's IsDelete value;
            // category 1 is the fallback when none is found.
            Maticsoft.BLL.BlogCategory bllBlogCategory = new Maticsoft.BLL.BlogCategory();
            DataSet ds      = bllBlogCategory.GetList("AppId = '" + template.IsDelete.ToString() + "'");
            int     nCateID = 1;

            if (ds.Tables[0].Rows.Count > 0)
            {
                nCateID = (int)ds.Tables[0].Rows[0]["Id"];
            }

            BlogPost data = new BlogPost();

            data.CategoryId       = nCateID;
            data.Title            = title;
            data.Abstract         = summary;
            data.Content          = pageBody;
            data.AccessStatus     = 0;
            data.CommentCondition = 0;
            data.SaveStatus       = 1; // draft
            data.Created          = System.DateTime.Now.Date;
            data.IsTop            = 0;
            data.IsPick           = 0;
            data.IsPic            = 0;
            data.Ip         = "";
            data.OwnerId    = template.IsDelete;
            data.OwnerUrl   = template.SiteName;
            data.OwnerType  = "wojilu.Members.Users.Domain.User";
            data.CreatorUrl = template.SiteName;
            data.AppId      = template.IsDelete;
            data.CreatorId  = template.IsDelete;

            Maticsoft.BLL.BlogPost bll = new Maticsoft.BLL.BlogPost();
            bll.Add(data);
        }
 /// <summary>
 /// Fetches every page URL returned by <c>getPagedUrl</c> and concatenates the results,
 /// each one preceded by an "&lt;hr&gt;" line.
 /// </summary>
 /// <param name="page">Page content handed to <c>getPagedUrl</c>.</param>
 /// <param name="url">URL handed to <c>getPagedUrl</c>.</param>
 /// <param name="s">Template passed to each detail fetch.</param>
 /// <param name="sb">Log buffer passed to each detail fetch.</param>
 /// <returns>The concatenated content; an empty string when there are no page URLs.</returns>
 private string getPagedContent( string page, string url, SpiderTemplate s, StringBuilder sb )
 {
     StringBuilder merged = new StringBuilder();

     foreach (String pageUrl in getPagedUrl( page, url )) {
         merged.AppendLine( "<hr>" );

         // A fresh DetailSpider is used for every page.
         String content = new DetailSpider().GetContent( pageUrl, s, sb );
         merged.Append( content );
     }

     return merged.ToString();
 }
Example #50
0
        /// <summary>
        /// Runs the spider once for each of the first 1000 users (ordered by Hits, then id,
        /// both descending) that have a profile address, then stores the collected error
        /// lines in a <c>SpiderLog</c> record.
        /// </summary>
        /// <remarks>
        /// The template for each run is assembled from user fields: Profile.Address is the
        /// list URL, QQ the list encoding, Profile.Tel the list-body pattern, Profile.WebSite
        /// the list pattern, MSN the detail pattern, Id goes into IsDelete and Url into SiteName.
        /// </remarks>
        public void Execute()
        {
            DbContext.closeConnectionAll();

            StringBuilder log = new StringBuilder();

            IList users = User.find("order by Hits desc, id desc").list(1000);
            logger.Info("begin SpiderJob=" + users.Count);

            foreach (User user in users)
            {
                // Users without a list URL have nothing to crawl.
                if (string.IsNullOrEmpty(user.Profile.Address))
                {
                    continue;
                }

                SpiderTemplate s = new SpiderTemplate
                {
                    ListUrl         = user.Profile.Address,
                    ListEncoding    = user.QQ,
                    ListBodyPattern = user.Profile.Tel,
                    ListPattern     = user.Profile.WebSite,
                    DetailPattern   = user.MSN,
                    IsDelete        = user.Id,
                    SiteName        = user.Url
                };

                // The suspend range is handed to the spider (original note: a 2-6 second pause).
                int[] suspendRange = new int[] { SpiderConfig.SuspendFrom, SpiderConfig.SuspendTo };
                getSpider(s).DownloadPage(s, log, suspendRange);
                DbContext.closeConnectionAll();

                // Random pause between two users.
                Thread.Sleep(rd.Next(SpiderConfig.SuspendFrom, SpiderConfig.SuspendTo));
            }

            // Keep only the lines that carry the "error=" marker.
            StringBuilder errorLog = new StringBuilder();

            foreach (String line in log.ToString().Split('\n'))
            {
                String trimmed = line.Trim();
                if (trimmed.StartsWith("error="))
                {
                    errorLog.AppendLine(trimmed);
                }
            }

            SpiderLog sg = new SpiderLog();
            sg.Msg = errorLog.ToString();
            sg.insert();

            DbContext.closeConnectionAll();
        }
Example #51
0
 /// <summary>
 /// Delegates to the template's own <c>update()</c> method.
 /// </summary>
 /// <param name="s">Template to update.</param>
 public void Update(SpiderTemplate s)
 {
     s.update();
 }
Example #52
0
        /// <summary>
        /// Extracts the links of a downloaded page, keeps those whose normalized URL matches
        /// the template's list pattern, and appends one <c>DetailLink</c> per distinct URL to
        /// <paramref name="list"/>.
        /// </summary>
        /// <remarks>
        /// The list pattern is applied as a regular expression to the normalized URL. The link
        /// title is taken from the document's link-to-text map: first from any key that
        /// contains the normalized URL, otherwise from the raw link (or its lower-cased form).
        /// </remarks>
        /// <param name="strReturnPage">HTML of the page to scan.</param>
        /// <param name="s">Template supplying the list pattern and the list URL used as link base.</param>
        /// <param name="list">Receives the accepted links; existing entries are left untouched.</param>
        protected static void SaveUrlToDB(string strReturnPage, SpiderTemplate s, List<DetailLink> list)
        {
            string strUrlFilterRule = s.ListPattern;
            HtmlAgilityPack.HtmlDocument htmlDoc = GetHtmlDocument(strReturnPage);

            string baseUrl = GetUrlLeftPart(s.ListUrl);
            DocumentWithLinks links = htmlDoc.GetLinks();

            // Normalized URLs already accepted during this call; a set keeps the duplicate
            // check constant-time instead of scanning a list for every link.
            HashSet<string> seenLinks = new HashSet<string>();

            foreach (string link in links.Links.Union(links.References))
            {
                if (string.IsNullOrEmpty(link))
                {
                    continue;
                }

                string normalizedLink = GetNormalizedLink(baseUrl, link);
                if (string.IsNullOrEmpty(normalizedLink))
                {
                    continue;
                }

                if (!Regex.IsMatch(normalizedLink, strUrlFilterRule, RegexOptions.Singleline))
                {
                    continue;
                }

                // Reject duplicates before doing the link-text lookup below.
                if (!seenLinks.Add(normalizedLink))
                {
                    continue;
                }

                // First choice: any map key that contains the normalized URL.
                string strLinkText = "";
                foreach (string strTemp in links.m_dicLink2Text.Keys)
                {
                    if (strTemp.Contains(normalizedLink))
                    {
                        strLinkText = links.m_dicLink2Text[strTemp];
                        break;
                    }
                }

                // Fallback: the raw link, then its lower-cased form (the latter wins when
                // both are present).
                if (strLinkText == "")
                {
                    if (links.m_dicLink2Text.Keys.Contains(link))
                    {
                        strLinkText = links.m_dicLink2Text[link].Trim();
                    }
                    if (links.m_dicLink2Text.Keys.Contains(link.ToLower()))
                    {
                        strLinkText = links.m_dicLink2Text[link.ToLower()].Trim();
                    }
                }

                DetailLink lnk = new DetailLink();
                lnk.Template = s;
                lnk.Url = normalizedLink;
                lnk.Title = strLinkText;
                list.Add(lnk);
            }
        }
Example #53
0
        /// <summary>
        /// Downloads the template's list page and, when the template defines a list-body
        /// pattern, narrows the result to the first element matched by that CSS selector.
        /// </summary>
        /// <param name="s">Template supplying the list URL, encoding and list-body selector.</param>
        /// <param name="sb">Log buffer; every failure is recorded with an "error=" prefix.</param>
        /// <returns>
        /// The trimmed outer HTML of the first matched element; the trimmed whole page when no
        /// selector is configured; the empty download result when nothing was downloaded; or
        /// null when the selector matches nothing or the query throws.
        /// </returns>
        private static string downloadListPageBody( SpiderTemplate s, StringBuilder sb )
        {
            // An empty encoding name is passed when the template does not specify one.
            String encoding = strUtil.HasText( s.ListEncoding ) ? s.ListEncoding : "";
            String target = PageLoader.Download( s.ListUrl, SpiderConfig.UserAgent, encoding );

            if (strUtil.IsNullOrEmpty( target )) {
                logInfo( "error=原始页面没有内容: " + s.ListUrl, s, sb );
                return target;
            }

            logInfo( "抓取列表内容成功", s, sb );

            // Without a selector the whole page is the list body.
            String selector = s.GetListBodyPattern();
            if (!strUtil.HasText( selector )) {
                return target.Trim();
            }

            HtmlDocument htmlDoc = new HtmlDocument {
                OptionAddDebuggingAttributes = false,
                OptionAutoCloseOnEnd = true,
                OptionFixNestedTags = true,
                OptionReadEncoding = true
            };
            htmlDoc.LoadHtml( target );

            try {
                // Take the first match directly so the selector query is enumerated only once.
                HtmlNode firstNode = htmlDoc.DocumentNode.QuerySelectorAll( selector ).FirstOrDefault();

                if (firstNode == null) {
                    logInfo( "error=没有匹配的页面内容:" + s.ListUrl, s, sb );
                    return null;
                }

                logInfo( "匹配列表内容成功", s, sb );
                return firstNode.OuterHtml.Trim();
            }
            catch (Exception ex) {
                // Carries the same "error=" marker as the other failure branches so that
                // log consumers filtering on it also see selector failures.
                logInfo( "error=htmlDoc QuerySelectorAll解析出错=" + ex.Message, s, sb );
                return null;
            }
        }
Example #54
0
        /// <summary>
        /// Converts one list-page regex match into a <c>DetailLink</c>, or returns null when
        /// the match should be skipped.
        /// </summary>
        /// <remarks>
        /// Group 1 is read as the URL, group 2 as the title and group 3 as the summary.
        /// A match is skipped when the URL is a javascript: or fragment-only link, or when the
        /// tag-stripped title is empty or one of the "more" placeholders.
        /// </remarks>
        /// <param name="match">Match produced by the list-link pattern.</param>
        /// <param name="s">Template supplying the site URL for relative links.</param>
        /// <returns>The populated link, or null when the match is filtered out.</returns>
        private static DetailLink getDetailLink( Match match, SpiderTemplate s )
        {
            string url = match.Groups[1].Value;
            string title = match.Groups[2].Value;

            // URL schemes are case-insensitive, so compare ordinally and ignore case.
            if (url.IndexOf( "javascript:", System.StringComparison.OrdinalIgnoreCase ) >= 0) return null;
            if (url.StartsWith( "#", System.StringComparison.Ordinal )) return null;

            // Strip any markup nested inside the anchor text.
            title = Regex.Replace( title, "<.+?>", "" );
            if (strUtil.IsNullOrEmpty( title )) return null;
            if (title == "更多") return null;
            if (title == "more") return null;
            if (title == "更多&gt;&gt;") return null;

            // The Groups indexer yields an empty group (Value == "") for a group that does not
            // exist or did not participate, so no count guard is needed here.
            string summary = match.Groups[3].Value;

            // Anything that does not start with "http" is joined onto the site URL.
            if (!url.StartsWith( "http", System.StringComparison.OrdinalIgnoreCase )) {
                url = strUtil.Join( s.SiteUrl, url );
            }

            DetailLink lnk = new DetailLink();
            lnk.Template = s;
            lnk.Url = url;
            lnk.Title = title;
            lnk.Abstract = summary;

            return lnk;
        }
Example #55
0
 /// <summary>
 /// Extracts the detail-page content with the template's CSS selector.
 /// </summary>
 /// <param name="htmlDoc">Parsed detail page.</param>
 /// <param name="s">Template supplying the detail selector.</param>
 /// <param name="sb">Log buffer; a missing match is recorded with an "error=" prefix.</param>
 /// <returns>Outer HTML of the first matched element, or null when nothing matches.</returns>
 protected string getMatchedBody( HtmlDocument htmlDoc, SpiderTemplate s, StringBuilder sb )
 {
     // Take the first match directly so the selector query is enumerated only once.
     HtmlNode firstNode = htmlDoc.DocumentNode.QuerySelectorAll( s.GetDetailPattern() ).FirstOrDefault();

     if (firstNode == null) {
         logInfo( "error=没有匹配的页面内容:" + _url, this._url, s, sb );
         return null;
     }

     return firstNode.OuterHtml;
 }
Example #56
0
 /// <summary>
 /// Writes the message to the logger at Info level and appends it as a line to the log buffer.
 /// </summary>
 /// <param name="msg">Message to record.</param>
 /// <param name="s">Not used by this method.</param>
 /// <param name="sb">Log buffer that receives the message.</param>
 private static void logInfo( String msg, SpiderTemplate s, StringBuilder sb )
 {
     logger.Info( msg );
     sb.AppendLine( msg );
 }
Example #57
0
 /// <summary>
 /// Writes the message to the logger at Info level and appends it as a line to the log buffer.
 /// </summary>
 /// <param name="msg">Message to record.</param>
 /// <param name="s">Not used by this method.</param>
 /// <param name="sb">Log buffer that receives the message.</param>
 private static void logInfo(String msg, SpiderTemplate s, StringBuilder sb)
 {
     logger.Info(msg);
     sb.AppendLine(msg);
 }