Esempio n. 1
0
        /// <summary>
        /// Downloads the page at <paramref name="BaiduImgUrl"/> and collects every value
        /// that appears in an <c>"objURL":"..."</c> pair of the downloaded text.
        /// </summary>
        /// <param name="BaiduImgUrl">Address of the page to download.</param>
        /// <param name="imgUrls">
        /// Receives the captured values in match order. Duplicates are not removed.
        /// </param>
        public static void CrawlBaiduImg(string BaiduImgUrl, out List <string> imgUrls)
        {
            // Lazy capture: everything between "objURL":" and the next double quote.
            const string objUrlPattern = @"""objURL"":""(?<url>.*?)""";

            var pageSource = HTMLService.DownloadUrl(BaiduImgUrl);

            imgUrls = new List<string>();

            MatchCollection hits = Regex.Matches(pageSource, objUrlPattern);
            for (int i = 0; i < hits.Count; i++)
            {
                imgUrls.Add(hits[i].Groups["url"].Value);
            }
        }
Esempio n. 2
0
        /// <summary>
        /// Performs one crawl step for <paramref name="project"/>: downloads the page at
        /// <c>project.ImgInputData.Url</c>, extracts the links it contains and the image
        /// addresses it references.
        /// </summary>
        /// <param name="project">
        /// Crawl state. Its <c>HTMLData.HTMLCodes</c> and <c>URLData.HTMLUrls</c> collections
        /// are modified by this call.
        /// </param>
        /// <param name="urls">
        /// Receives the result of <c>HTMLService.Parse</c>; every entry is also enqueued on
        /// <c>project.URLData.HTMLUrls</c>.
        /// </param>
        /// <param name="imgUrls">
        /// Receives the result of <c>ImgParseService.Parse</c> for the same html.
        /// </param>
        public static void DefaultCrawl(Project project, out List <string> urls, out List <string> imgUrls)
        {
            // Store the downloaded html on the project's collection, then take the next
            // entry from it as the page to parse.
            // NOTE(review): assuming HTMLCodes is a FIFO queue, the dequeued entry is the
            // page just downloaded only when the queue was empty beforehand - confirm.
            var pendingHtml = project.HTMLData.HTMLCodes;
            pendingHtml.Enqueue(HTMLService.DownloadUrl(project.ImgInputData.Url));
            string pageHtml = pendingHtml.Dequeue();

            // Links from this page that lead to further pages.
            urls = HTMLService.Parse(pageHtml);
            for (int i = 0; i < urls.Count; i++)
            {
                // No filtering is applied yet: every extracted link is scheduled.
                project.URLData.HTMLUrls.Enqueue(urls[i]);
            }

            // Image resources referenced by this page.
            imgUrls = ImgParseService.Parse(pageHtml);
        }
Esempio n. 3
0
        /// <summary>
        /// Extracts the image addresses referenced by <c>img</c> elements that carry a
        /// <c>src</c> attribute in the given HTML.
        /// </summary>
        /// <param name="htmlCode">HTML markup to scan.</param>
        /// <returns>
        /// The distinct addresses in document order, each passed through
        /// <c>HTMLService.TransferUrl</c> and <c>HTMLService.ReplaceChar</c>.
        /// Returns an empty list (never <c>null</c>) when no matching element exists.
        /// </returns>
        public static /*override*/ List <string> Parse(string htmlCode)
        {
            HtmlDocument htmlDocument = new HtmlDocument();
            htmlDocument.LoadHtml(htmlCode);

            List<string> imgUrlList = new List<string>();

            // Find the image links: look for img tags that have a src attribute.
            HtmlNodeCollection imgList = htmlDocument.DocumentNode.SelectNodes(".//img[@src]");

            // SelectNodes yields null rather than an empty collection when nothing matches,
            // so the node collection (not the freshly created result list) must be checked.
            if (imgList == null)
            {
                return(imgUrlList);
            }

            // Tracks addresses already emitted so duplicates are dropped without rescanning the list.
            HashSet<string> seen = new HashSet<string>();

            foreach (HtmlNode img in imgList)
            {
                // With lazy loading, the real address is stored in data-src; otherwise use src.
                var lazySource = img.Attributes["data-src"];
                string url = lazySource != null
                    ? lazySource.Value
                    : img.Attributes["src"].Value;

                // Skip values without any '/' and anything mentioning "svg"
                // (downloading svg icons is not supported yet).
                if (!url.Contains("/") || url.Contains("svg"))
                {
                    continue;
                }

                // Convert a relative address into an absolute one (done by HTMLService).
                url = HTMLService.TransferUrl(url);
                // Character escaping (done by HTMLService).
                url = HTMLService.ReplaceChar(url);

                if (seen.Add(url))
                {
                    imgUrlList.Add(url);
                }
            }

            return(imgUrlList);
        }