Пример #1
0
        // Single-threaded crawl of a single web page.
        /// <summary>
        /// Runs one crawl for <paramref name="project"/>: picks a site-specific
        /// strategy from the project's input URL, queues every image URL that
        /// strategy returns, and then starts the download step.
        /// </summary>
        /// <param name="project">
        /// Project whose <c>ImgInputData.Url</c> is the address to crawl and whose
        /// <c>URLData.ImgUrls</c> queue receives the image URLs that were found.
        /// </param>
        /// <returns>Always <c>true</c>.</returns>
        public static bool StartCrawl(Project project /*, CrawlerService crawler*/)
        {
            CrawlStart();

            // Initialise the crawler for this project before touching its URL.
            InitCrawl(project);

            string startUrl = project.ImgInputData.Url;

            // Page URLs are only produced by the Tieba and default strategies.
            List<string> pageUrls;
            // Image URLs are produced by every strategy.
            List<string> foundImages;

            if (TiebaHTMLService.isTiebaSite(startUrl))
            {
                // Tieba address: use the Tieba strategy.
                TibaCrawl(project, out pageUrls, out foundImages);
            }
            else if (BaiduHTMLService.IsBaiduImgUrl(startUrl))
            {
                // Baidu image address: use the Baidu image strategy.
                CrawlBaiduImg(startUrl, out foundImages);
            }
            else if (WeiboHTMLService.IsWeiboUrl(startUrl))
            {
                // Weibo address: use the Weibo strategy.
                CrawlWeiboImg(startUrl, out foundImages);
            }
            else
            {
                // Any other address: use the default strategy.
                DefaultCrawl(project, out pageUrls, out foundImages);
            }

            // No filtering is applied yet; every discovered image URL is queued.
            foreach (string imageUrl in foundImages)
            {
                project.URLData.ImgUrls.Enqueue(imageUrl);
            }

            ThreadCrawlDownload(project /*, urls, imgUrls)*/);
            CrawlFinish();
            return true;
        }
Пример #2
0
 /// <summary>
 /// Crawls a Tieba address: resolves the page URLs for the project's input
 /// URL, downloads each page in turn and collects the image URLs found on it.
 /// </summary>
 /// <param name="project">
 /// Project whose <c>ImgInputData.Url</c> is parsed for page URLs; each visited
 /// page URL is also enqueued on <c>URLData.HTMLUrls</c>.
 /// </param>
 /// <param name="urls">
 /// Receives the page URLs returned by <c>TiebaHTMLService.TiebaParse</c>
 /// (an empty list if that call returns <c>null</c>).
 /// </param>
 /// <param name="imgUrls">
 /// Receives the image URLs gathered from all pages, in page order.
 /// </param>
 public static void TibaCrawl(Project project, out List<string> urls, out List<string> imgUrls)
 {
     // Resolve the page URLs; fall back to an empty list so the loop below
     // cannot fail with a NullReferenceException.
     urls = TiebaHTMLService.TiebaParse(project.ImgInputData.Url) ?? new List<string>();
     imgUrls = new List<string>();

     // One generator for the whole crawl instead of a new instance per page.
     Random delayGenerator = new Random();

     foreach (string pageUrl in urls)
     {
         // Wait a random 0-999 ms before each request to avoid tripping the
         // site's anti-crawling mechanism.
         Thread.Sleep(delayGenerator.Next(0, 1000));

         project.URLData.HTMLUrls.Enqueue(pageUrl);

         // Download the page and extract the image URLs it contains.
         string htmlCode = TiebaHTMLService.DownloadUrl(pageUrl);
         List<string> pageImgUrls = TiebaImgParse.GetImgUrls(htmlCode);

         // No filtering is applied yet; keep every image URL from the page.
         if (pageImgUrls != null)
         {
             imgUrls.AddRange(pageImgUrls);
         }
     }
 }