/// <summary>
/// Guesses candidate "next page" URLs for <paramref name="baseUrl"/> using two simple strategies
/// applied to the last path segment only:
/// 1. increment the number found in the segment's name (the part before its last '.');
/// 2. append "_2" to the segment's name.
/// </summary>
/// <param name="baseUrl">URL of the current page, e.g. http://host/news/list_3.html.</param>
/// <returns>
/// An empty list when <paramref name="baseUrl"/> is null or empty; otherwise exactly two entries:
/// [0] the URL with the number in the last segment's name incremented by one,
/// [1] the URL with "_2" appended to the last segment's name.
/// </returns>
public static List<string> GetPageDownUrlAuto(string baseUrl)
{
    List<string> candidates = new List<string>();
    if (string.IsNullOrEmpty(baseUrl))
    {
        return candidates;
    }

    // Split "prefix/name.ext" explicitly. Splicing the pieces back together (instead of
    // string.Replace on the whole URL) guarantees that only the last segment is rewritten,
    // even when the same text also appears in the host or in a directory name, and it
    // avoids Replace throwing on an empty segment (URL ending in '/').
    int segmentStart = baseUrl.LastIndexOf('/') + 1;
    string prefix = baseUrl.Substring(0, segmentStart);
    string lastSegment = baseUrl.Substring(segmentStart);

    // A segment without a dot simply has no extension; LastIndexOf returns -1 in that case.
    int dotIndex = lastSegment.LastIndexOf('.');
    string name = dotIndex < 0 ? lastSegment : lastSegment.Substring(0, dotIndex);
    string extension = dotIndex < 0 ? "" : lastSegment.Substring(dotIndex);

    // Strategy 1: bump the number inside the name.
    // RegexUtil.ExtractDigit is defined elsewhere; it is assumed to return the digit run of
    // the given text (or nothing parsable when there is none) - TODO confirm.
    string pageStr = RegexUtil.ExtractDigit(name);
    string incrementedName = "";
    int page;
    if (int.TryParse(pageStr, out page))
    {
        incrementedName = name.Replace(pageStr, (page + 1).ToString());
    }

    // NOTE(review): when no number can be parsed the name is dropped from this candidate
    // (prefix + extension). That mirrors the long-standing output and keeps the result at
    // two entries for callers that may index into it; consider skipping the entry instead
    // once the callers have been checked.
    candidates.Add(prefix + incrementedName + extension);

    // Strategy 2: "name_2.ext".
    candidates.Add(prefix + name + "_2" + extension);

    return candidates;
}
/// <summary>
/// Tells whether <paramref name="url"/> looks like a direct link to a file.
/// </summary>
/// <remarks>
/// Two URL shapes are recognised:
/// absolute URLs such as https://qd.myapp.com/myapp/qqteam/pcqq/PCQQ2019.exe or
/// http://qzonestyle.gtimg.cn/qzone/qzactStatics/imgs/20190626150615_2860ae.png,
/// and site-relative URLs such as /img/2019/flower.jpg.
/// The actual patterns live in RegexPattern (defined elsewhere); per the original notes they
/// cover only the handful of file types this project needs (jpg, png, bmp, mp4, exe, rar, zip)
/// - confirm against RegexPattern before relying on that list.
/// </remarks>
/// <param name="url">URL to test. Null or empty is treated as "not a file URL".</param>
/// <returns>
/// True when the URL starts with "http" (any casing) or "/" and the corresponding
/// RegexPattern matches; otherwise false.
/// </returns>
public static bool IsAvailableFileUrl(string url)
{
    if (string.IsNullOrEmpty(url))
    {
        return false;
    }

    // Ordinal, case-insensitive prefix test: no upper-cased copy of the URL and no
    // dependence on the current culture. "https" also starts with "http", so a single
    // check covers both schemes.
    if (url.StartsWith("http", StringComparison.OrdinalIgnoreCase))
    {
        var absoluteMatch = RegexUtil.RegexMatch(url, RegexPattern.MatchFileUrlWithHttpPattern);
        return absoluteMatch.Success;
    }

    if (url.StartsWith("/", StringComparison.Ordinal))
    {
        var relativeMatch = RegexUtil.RegexMatch(url, RegexPattern.MatchFileUrlWithForwardSlash);
        return relativeMatch.Success;
    }

    return false;
}
/// <summary>
/// Parses <paramref name="html"/> on a thread-pool thread (Task.Run) and builds one
/// <c>TagImg</c> for every accepted <c>img</c> element.
/// </summary>
/// <param name="html">HTML markup to scan. Null or empty yields an empty list.</param>
/// <param name="isHotspot">
/// When true every <c>img</c> element is returned. When false only the elements for which
/// RegexUtil.ExtractBingImage reports success (Item1) are returned.
/// </param>
/// <returns>
/// The accepted images with Alt, Src, DetailUrl, Width and Height filled in.
/// Never null; empty when the markup contains no <c>img</c> element.
/// </returns>
public static async Task<List<TagImg>> GetImgFromHtml(string html, bool isHotspot = false)
{
    Task<List<TagImg>> task = Task.Run(() =>
    {
        List<TagImg> list = new List<TagImg>();
        if (string.IsNullOrEmpty(html))
        {
            return list;
        }

        HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
        doc.LoadHtml(html);

        // SelectNodes returns null - not an empty collection - when nothing matches.
        var imgNodes = doc.DocumentNode.SelectNodes("//img");
        if (imgNodes == null)
        {
            return list;
        }

        foreach (var img in imgNodes)
        {
            TagImg tagImg = new TagImg();

            var altAttribute = img.Attributes["alt"];
            tagImg.Alt = altAttribute == null ? "" : altAttribute.Value;

            var srcAttribute = img.Attributes["src"];
            tagImg.Src = srcAttribute == null ? "" : srcAttribute.Value;

            // Dimensions are scoped to the current image so that an element without the
            // attribute reports 0 instead of inheriting the previous image's size.
            // NOTE(review): the Bing sample markup this code was written against carries
            // height="..."/width="..." on the img and h="ID=..." on the anchor; confirm that
            // "h"/"w" are the intended attribute names.
            int h = 0;
            int w = 0;
            var heightAttribute = img.Attributes["h"];
            if (heightAttribute != null)
            {
                int.TryParse(heightAttribute.Value, out h);
            }
            var widthAttribute = img.Attributes["w"];
            if (widthAttribute != null)
            {
                int.TryParse(widthAttribute.Value, out w);
            }

            // Bing image-search results are laid out as
            //   <a class="iusc" m="{... "murl":"<image url>" ...}" href="/images/search?view=detailV2&...">
            //     <div class="img_cont hoff">
            //       <img class="mimg" height="208" width="299" src="https://tse3-mm.cn.bing.net/th?id=..." alt="..." />
            //     </div>
            //   </a>
            // so the img's grandparent is the anchor holding the result metadata. Its outer
            // HTML is handed to RegexUtil.ExtractBingImage (defined elsewhere; presumably it
            // pulls the detail URL out of that metadata - confirm).
            var parent = img.ParentNode;
            var container = parent == null ? null : parent.ParentNode;

            bool isBingResult = false;
            string detailUrl = "";
            if (container != null)
            {
                Tuple<bool, string> extractResult = RegexUtil.ExtractBingImage(container.OuterHtml);
                if (extractResult != null)
                {
                    isBingResult = extractResult.Item1;
                    detailUrl = extractResult.Item2;
                }
            }

            if (isBingResult || isHotspot)
            {
                tagImg.DetailUrl = detailUrl;
                tagImg.Width = w;
                tagImg.Height = h;
                list.Add(tagImg);
            }
        }

        return list;
    });

    return await task;
}
/// <summary>
/// Derives the URL of the page after <paramref name="pageDownUrl"/> by comparing it with
/// <paramref name="baseUrl"/>, locating the page number in the part that differs and
/// incrementing that number by one.
/// </summary>
/// <remarks>
/// Only simple pagination schemes are handled (a single number that changes between pages);
/// anything more elaborate needs case-by-case treatment. Leading zeros of the page number
/// are not preserved.
/// </remarks>
/// <param name="baseUrl">URL of the first page, e.g. http://host/list.html. Null is treated as empty.</param>
/// <param name="pageDownUrl">URL of a known following page, e.g. http://host/list_2.html.</param>
/// <returns>
/// <paramref name="pageDownUrl"/> with its page number incremented, e.g. http://host/list_3.html.
/// <paramref name="pageDownUrl"/> is returned unchanged when it is null or empty, when it does
/// not differ from the end of <paramref name="baseUrl"/>, or when no usable number is found
/// at or after the right-most difference.
/// </returns>
public static string GetPageDownUrlManual(string baseUrl, string pageDownUrl)
{
    if (string.IsNullOrEmpty(pageDownUrl))
    {
        return pageDownUrl;
    }
    if (baseUrl == null)
    {
        baseUrl = string.Empty;
    }

    // Walk both URLs from their ends (right-aligned) to find the right-most character of
    // pageDownUrl that differs from baseUrl. Running out of baseUrl counts as a difference.
    int baseIndex = baseUrl.Length - 1;
    int diffIndex = pageDownUrl.Length - 1;
    while (baseIndex >= 0 && diffIndex >= 0 && baseUrl[baseIndex] == pageDownUrl[diffIndex])
    {
        baseIndex--;
        diffIndex--;
    }
    if (diffIndex < 0)
    {
        // pageDownUrl equals baseUrl (or its tail): there is nothing to derive a page from.
        return pageDownUrl;
    }

    // The page number is the first digit run at or after the difference.
    int numberStart = diffIndex;
    while (numberStart < pageDownUrl.Length
           && (pageDownUrl[numberStart] < '0' || pageDownUrl[numberStart] > '9'))
    {
        numberStart++;
    }
    if (numberStart == pageDownUrl.Length)
    {
        return pageDownUrl;
    }

    int numberEnd = numberStart + 1;
    while (numberEnd < pageDownUrl.Length
           && pageDownUrl[numberEnd] >= '0' && pageDownUrl[numberEnd] <= '9')
    {
        numberEnd++;
    }

    // Digits shared with baseUrl still belong to the same number: for list_18 vs list_19
    // only the '9' differs, yet the page is 19, so extend the run to the left as well.
    while (numberStart > 0
           && pageDownUrl[numberStart - 1] >= '0' && pageDownUrl[numberStart - 1] <= '9')
    {
        numberStart--;
    }

    string pageStr = pageDownUrl.Substring(numberStart, numberEnd - numberStart);
    long page;
    if (!long.TryParse(pageStr, out page) || page == long.MaxValue)
    {
        // Too long to be a page number (or cannot be incremented): leave the URL alone
        // rather than inventing one.
        return pageDownUrl;
    }

    // Splice by position so that identical digits elsewhere in the URL are never touched.
    return pageDownUrl.Substring(0, numberStart)
           + (page + 1).ToString()
           + pageDownUrl.Substring(numberEnd);
}