/// <summary>
/// Downloads the page at <paramref name="BaiduImgUrl"/> and collects every
/// value found in an <c>"objURL":"..."</c> field of the returned text.
/// </summary>
/// <param name="BaiduImgUrl">Address of the page to download.</param>
/// <param name="imgUrls">
/// Receives the captured <c>objURL</c> values in the order they occur in the
/// downloaded text. Duplicates are kept.
/// </param>
public static void CrawlBaiduImg(string BaiduImgUrl, out List <string> imgUrls)
{
    string pageSource = HTMLService.DownloadUrl(BaiduImgUrl);

    // Lazily captures the text between "objURL":" and the next double quote.
    string objUrlPattern = @"""objURL"":""(?<url>.*?)""";

    imgUrls = new List <string>();

    MatchCollection hits = Regex.Matches(pageSource, objUrlPattern);
    for (int index = 0; index < hits.Count; index++)
    {
        imgUrls.Add(hits[index].Groups["url"].Value);
    }
}
/// <summary>
/// Crawls the page referenced by <c>project.ImgInputData.Url</c>: downloads its
/// HTML, queues the links parsed from it, and parses its image URLs.
/// </summary>
/// <param name="project">
/// Project whose <c>HTMLData.HTMLCodes</c> and <c>URLData.HTMLUrls</c> queues are updated.
/// </param>
/// <param name="urls">Receives the result of <c>HTMLService.Parse</c> for the page.</param>
/// <param name="imgUrls">Receives the result of <c>ImgParseService.Parse</c> for the page.</param>
public static void DefaultCrawl(Project project, out List <string> urls, out List <string> imgUrls)
{
    // Download the HTML of the current URL and store it in the project's HTML queue.
    project.HTMLData.HTMLCodes.Enqueue(HTMLService.DownloadUrl(project.ImgInputData.Url));

    // NOTE(review): presumably HTMLCodes is a FIFO queue; if it already holds
    // entries, the item taken here is the oldest one rather than the page that
    // was just downloaded - confirm this is intended.
    string pageHtml = project.HTMLData.HTMLCodes.Dequeue();

    // Parse the links that lead from this page to other pages.
    urls = HTMLService.Parse(pageHtml);

    // Every parsed link is queued; no filtering conditions are applied yet.
    urls.ForEach(link => project.URLData.HTMLUrls.Enqueue(link));

    // Parse the image resources found on this page.
    imgUrls = ImgParseService.Parse(pageHtml);
}
/// <summary>
/// Extracts the image URLs from an HTML document.
/// </summary>
/// <remarks>
/// Selects every <c>img</c> element that has a <c>src</c> attribute. For each one
/// the <c>data-src</c> attribute is preferred when present (lazy-loaded images keep
/// the real address there); otherwise <c>src</c> is used. URLs that contain no
/// <c>/</c> or that contain <c>svg</c> are skipped. The remaining URLs are passed
/// through <c>HTMLService.TransferUrl</c> and <c>HTMLService.ReplaceChar</c> and
/// de-duplicated, keeping first-seen order.
/// </remarks>
/// <param name="htmlCode">HTML source to scan.</param>
/// <returns>
/// The distinct processed image URLs; an empty list when the document contains no
/// matching <c>img</c> element.
/// </returns>
public static /*override*/ List <string> Parse(string htmlCode)
{
    HtmlDocument htmlDocument = new HtmlDocument();
    htmlDocument.LoadHtml(htmlCode);

    // Find the img tags that carry a src attribute.
    HtmlNodeCollection imgList = htmlDocument.DocumentNode.SelectNodes(".//img[@src]");
    List <string> imgUrlList = new List <string>();

    // SelectNodes yields null (not an empty collection) when nothing matches,
    // so the node collection - not the freshly created result list - must be checked.
    if (imgList == null)
    {
        return imgUrlList;
    }

    // Tracks URLs already added so de-duplication does not rescan the list each time.
    HashSet <string> seenUrls = new HashSet <string>();

    foreach (HtmlNode img in imgList)
    {
        // With lazy loading, the real URL is stored in data-src.
        var lazySource = img.Attributes["data-src"];
        string url = lazySource != null ? lazySource.Value : img.Attributes["src"].Value;

        // SVG icons are not supported for download yet; values without '/' are skipped too.
        if (!url.Contains("/") || url.Contains("svg"))
        {
            continue;
        }

        // Convert a relative address into an absolute one.
        url = HTMLService.TransferUrl(url);
        // Replace escaped characters.
        url = HTMLService.ReplaceChar(url);

        // HashSet.Add returns false for a URL that was already collected.
        if (seenUrls.Add(url))
        {
            imgUrlList.Add(url);
        }
    }

    return imgUrlList;
}