Esempio n. 1
0
        /// <summary>
        /// Single-threaded crawl of a single page: selects a site-specific strategy from
        /// <c>project.ImgInputData.Url</c>, queues every image URL the strategy produced
        /// onto <c>project.URLData.ImgUrls</c>, and then starts the download step.
        /// </summary>
        /// <param name="project">Project holding the input URL and the image-URL queue to fill.</param>
        /// <returns>Always <c>true</c> when the method runs to completion.</returns>
        public static bool StartCrawl(Project project)
        {
            // NOTE(review): if anything below throws, CrawlFinish() is never reached.
            // Confirm whether CrawlStart()/CrawlFinish() must be paired via try/finally.
            CrawlStart();

            // Initialize the crawler before the input URL is read.
            InitCrawl(project);

            string url = project.ImgInputData.Url;

            // Page URLs reported by the Tieba/default strategies; not consumed further here.
            List<string> urls;
            // Image URLs reported by whichever strategy is selected below.
            List<string> imgUrls;

            if (TiebaHTMLService.isTiebaSite(url))
            {
                // Tieba address: use the Tieba crawling strategy.
                TibaCrawl(project, out urls, out imgUrls);
            }
            else if (BaiduHTMLService.IsBaiduImgUrl(url))
            {
                // Baidu image address: use the Baidu image crawling strategy.
                CrawlBaiduImg(url, out imgUrls);
            }
            else if (WeiboHTMLService.IsWeiboUrl(url))
            {
                // Weibo address: use the Weibo image crawling strategy.
                CrawlWeiboImg(url, out imgUrls);
            }
            else
            {
                // Any other address: use the default crawling strategy.
                DefaultCrawl(project, out urls, out imgUrls);
            }

            // A strategy may hand back null through its out parameter; treat that as
            // "no images found" instead of failing with a NullReferenceException.
            if (imgUrls != null)
            {
                foreach (string imgUrl in imgUrls)
                {
                    // Hook for future filtering conditions; every URL is currently accepted.
                    project.URLData.ImgUrls.Enqueue(imgUrl);
                }
            }

            ThreadCrawlDownload(project);
            CrawlFinish();
            return true;
        }
Esempio n. 2
0
        /// <summary>
        /// Convenience wrapper for crawling images from a Weibo page: downloads the page
        /// through <c>WeiboHTMLService.DownloadUrl</c> and passes the result to
        /// <c>WeiboImgParse.GetImgUrls</c>.
        /// </summary>
        /// <param name="weiboImgUrl">Weibo URL to crawl.</param>
        /// <param name="imgUrls">
        /// Receives the list returned by <c>WeiboImgParse.GetImgUrls</c>.
        /// NOTE(review): the original comment says these are all absolute URLs; that is
        /// not visible here, so confirm against the parser.
        /// </param>
        public static void CrawlWeiboImg(string weiboImgUrl, out List<string> imgUrls)
        {
            // Download and parse in one step; the downloaded page is only needed by the parser.
            imgUrls = WeiboImgParse.GetImgUrls(WeiboHTMLService.DownloadUrl(weiboImgUrl));
        }