/// <summary>
/// Runs one crawl for the given project (described by the original author as a
/// single-threaded, single-page crawl): picks a site-specific strategy based on
/// <c>project.ImgInputData.Url</c>, queues every resulting image URL on
/// <c>project.URLData.ImgUrls</c>, and then hands the project to
/// <c>ThreadCrawlDownload</c>.
/// </summary>
/// <param name="project">Project whose input URL is crawled and whose image-URL queue is filled.</param>
/// <returns>Always <c>true</c>; no failure path returns a different value.</returns>
public static bool StartCrawl(Project project)
{
    CrawlStart();

    // Page URLs and image URLs produced by whichever strategy runs below.
    // Only the Tieba and default strategies write to pageUrls; it is not read afterwards.
    List<string> pageUrls = new List<string>();
    List<string> foundImgUrls = new List<string>();

    // Initialize the crawler for this project.
    InitCrawl(project);

    string startUrl = project.ImgInputData.Url;

    if (TiebaHTMLService.isTiebaSite(startUrl))
    {
        // Tieba address: use the Tieba crawling strategy.
        TibaCrawl(project, out pageUrls, out foundImgUrls);
    }
    else if (BaiduHTMLService.IsBaiduImgUrl(startUrl))
    {
        // Baidu image address: use the Baidu image crawling strategy.
        CrawlBaiduImg(startUrl, out foundImgUrls);
    }
    else if (WeiboHTMLService.IsWeiboUrl(startUrl))
    {
        // Weibo address: use the Weibo image crawling strategy.
        CrawlWeiboImg(startUrl, out foundImgUrls);
    }
    else
    {
        // Any other address: fall back to the default crawling strategy.
        DefaultCrawl(project, out pageUrls, out foundImgUrls);
    }

    // No filtering is applied at the moment: every discovered image URL is queued.
    foreach (string imgUrl in foundImgUrls)
    {
        project.URLData.ImgUrls.Enqueue(imgUrl);
    }

    ThreadCrawlDownload(project);
    CrawlFinish();

    return true;
}
/// <summary>
/// Wrapper for crawling images from a Weibo page: fetches the page through
/// <c>WeiboHTMLService.DownloadUrl</c> and passes the result to
/// <c>WeiboImgParse.GetImgUrls</c>.
/// </summary>
/// <param name="weiboImgUrl">The Weibo URL to crawl.</param>
/// <param name="imgUrls">
/// Receives the list returned by <c>WeiboImgParse.GetImgUrls</c>. The original
/// author's note states these are all absolute addresses; that is not
/// verifiable from this method alone.
/// </param>
public static void CrawlWeiboImg(string weiboImgUrl, out List<string> imgUrls)
{
    imgUrls = WeiboImgParse.GetImgUrls(WeiboHTMLService.DownloadUrl(weiboImgUrl));
}