/// <summary>
/// Downloads the page at <paramref name="BaiduImgUrl"/> and collects every value
/// that appears as an <c>"objURL":"..."</c> entry in the returned text.
/// </summary>
/// <param name="BaiduImgUrl">Address of the page to download (presumably a Baidu image search result page).</param>
/// <param name="imgUrls">
/// Receives the captured <c>objURL</c> values in the order they occur in the page.
/// Never null; empty when the download yields no text or nothing matches.
/// </param>
public static void CrawlBaiduImg(string BaiduImgUrl, out List<string> imgUrls)
{
    // Assign the out parameter up front so every exit path hands back a usable list.
    imgUrls = new List<string>();

    string htmlCode = HTMLService.DownloadUrl(BaiduImgUrl);

    // Regex.Matches throws ArgumentNullException on a null input; treat a missing
    // or empty download as "no images found" instead of crashing the crawl.
    if (string.IsNullOrEmpty(htmlCode))
    {
        return;
    }

    // Lazy capture of everything between the quotes that follow "objURL":
    string pattern = @"""objURL"":""(?<url>.*?)""";

    foreach (Match match in Regex.Matches(htmlCode, pattern))
    {
        imgUrls.Add(match.Groups["url"].Value);
    }
}
/// <summary>
/// Performs one crawl step for <paramref name="project"/>: downloads the page at
/// <c>project.ImgInputData.Url</c>, pushes it through the project's HTML queue,
/// queues the links parsed from the page, and parses its image URLs.
/// </summary>
/// <param name="project">Project whose input URL is downloaded and whose HTML and URL queues are updated.</param>
/// <param name="urls">Receives the links returned by <c>HTMLService.Parse</c> for the processed page.</param>
/// <param name="imgUrls">Receives the image URLs returned by <c>ImgParseService.Parse</c> for the processed page.</param>
public static void DefaultCrawl(Project project, out List<string> urls, out List<string> imgUrls)
{
    // Download the HTML of the current URL and store it in the project's HTML queue.
    var htmlQueue = project.HTMLData.HTMLCodes;
    htmlQueue.Enqueue(HTMLService.DownloadUrl(project.ImgInputData.Url));

    // NOTE(review): if HTMLCodes is a FIFO queue that already held entries, this
    // takes the oldest entry rather than the page just downloaded - confirm intent.
    string pageHtml = htmlQueue.Dequeue();

    // Parse the links from this page that lead to further pages.
    urls = HTMLService.Parse(pageHtml);
    foreach (string link in urls)
    {
        // Placeholder for filter conditions: none are applied yet, so every link is queued.
        project.URLData.HTMLUrls.Enqueue(link);
    }

    // Parse the image resources found on this page.
    imgUrls = ImgParseService.Parse(pageHtml);
}