/// <summary>
/// Single-threaded crawl of a single web page. Picks a crawl strategy from the
/// project's input URL, queues every image URL that strategy returns on
/// <c>project.URLData.ImgUrls</c>, and then hands the project to
/// <c>ThreadCrawlDownload</c>.
/// </summary>
/// <param name="project">Project whose <c>ImgInputData.Url</c> is the page to crawl.</param>
/// <returns>Always <c>true</c>; no failure path returns <c>false</c>.</returns>
public static bool StartCrawl(Project project /*, CrawlerService crawler*/)
{
    CrawlStart();

    // Initialise the crawler before the input URL is inspected.
    InitCrawl(project);

    string startUrl = project.ImgInputData.Url;

    // Page URLs reported by the Tieba/default strategies; not consumed here.
    List<string> pageUrls;
    // Image URLs produced by whichever strategy is selected below.
    List<string> foundImgUrls;

    if (TiebaHTMLService.isTiebaSite(startUrl))
    {
        // Tieba address: use the Tieba strategy.
        TibaCrawl(project, out pageUrls, out foundImgUrls);
    }
    else if (BaiduHTMLService.IsBaiduImgUrl(startUrl))
    {
        // Baidu image-search address: use the Baidu image strategy.
        CrawlBaiduImg(startUrl, out foundImgUrls);
    }
    else if (WeiboHTMLService.IsWeiboUrl(startUrl))
    {
        // Weibo address: use the Weibo strategy.
        CrawlWeiboImg(startUrl, out foundImgUrls);
    }
    else
    {
        // Any other address: fall back to the default strategy.
        DefaultCrawl(project, out pageUrls, out foundImgUrls);
    }

    // No filtering is applied yet, so every discovered image URL is queued.
    foreach (string foundImgUrl in foundImgUrls)
    {
        project.URLData.ImgUrls.Enqueue(foundImgUrl);
    }

    ThreadCrawlDownload(project /*, urls, imgUrls)*/);
    CrawlFinish();

    return true;
}
/// <summary>
/// Crawls a Tieba address: resolves the page URLs behind the project's input URL,
/// downloads each page in turn, and collects the image URLs found on it.
/// </summary>
/// <param name="project">
/// Project supplying <c>ImgInputData.Url</c>; each visited page URL is also
/// enqueued on <c>project.URLData.HTMLUrls</c>.
/// </param>
/// <param name="urls">Receives the page URLs returned by <c>TiebaHTMLService.TiebaParse</c>.</param>
/// <param name="imgUrls">Receives the image URLs gathered from all pages, in visit order.</param>
public static void TibaCrawl(Project project, out List<string> urls, out List<string> imgUrls)
{
    // Resolve the page URLs to visit.
    urls = TiebaHTMLService.TiebaParse(project.ImgInputData.Url);
    imgUrls = new List<string>();

    // One generator for the whole crawl: constructing a new Random on every
    // iteration can repeat values on runtimes whose default seed is clock-based.
    Random delayGenerator = new Random();

    foreach (string pageUrl in urls)
    {
        // Wait a random 0-999 ms before each request to avoid tripping the
        // site's anti-crawling measures.
        Thread.Sleep(delayGenerator.Next(0, 1000));

        project.URLData.HTMLUrls.Enqueue(pageUrl);

        // Download the page and extract the image URLs it contains.
        string htmlCode = TiebaHTMLService.DownloadUrl(pageUrl);
        List<string> pageImgUrls = TiebaImgParse.GetImgUrls(htmlCode);

        // No filtering is applied yet; a page that yields no list is skipped.
        if (pageImgUrls != null)
        {
            imgUrls.AddRange(pageImgUrls);
        }
    }
}