/// <summary>
/// Downloads every image referenced by <paramref name="myImgInfo"/> into a directory named
/// "&lt;picIndex&gt;-&lt;title&gt;" under <c>savePath</c>, and raises <c>OnPageFinished</c>
/// once the last image of the page has been processed (successfully or not).
/// </summary>
/// <param name="myImgInfo">Page descriptor; nothing happens when its <c>imgUrl</c> array is null or empty.</param>
/// <remarks>
/// The method returns after all downloads have been started; completion is reported through
/// <c>OnPageFinished</c>. Entries of <c>myImgInfo.imgUrl</c> are rewritten in place when a scheme
/// has to be prepended.
/// </remarks>
private void downloadPageImg(ImgInfo myImgInfo)
{
    if (myImgInfo.imgUrl == null || myImgInfo.imgUrl.Length == 0)
    {
        return;
    }

    int totalCount = myImgInfo.imgUrl.Length;
    int downloadCount = 0; // images processed for this page; only touched under finishListLocker
    myWriteLine("Download Task Created:" + myImgInfo.picIndex, ConsoleColor.Yellow);

    // Target directory for this page.
    string myImgPath = savePath + myImgInfo.picIndex + "-" + myImgInfo.title + @"\";
    if (!Directory.Exists(myImgPath))
    {
        Directory.CreateDirectory(myImgPath);
    }

    // A Task-returning delegate (instead of an async-void lambda handed to Parallel.For) so that
    // a failure after the first await is caught here rather than escaping as an unhandled exception.
    Func<int, Task> downloadOne = async i =>
    {
        try
        {
            // URL correction: prepend a scheme when the entry does not contain one.
            myImgInfo.imgUrl[i] = myImgInfo.imgUrl[i].IndexOf("http") != -1
                ? myImgInfo.imgUrl[i]
                : "https:" + myImgInfo.imgUrl[i];

            string saveRe = await MyHttp.getImg(
                myImgInfo.imgUrl[i],
                myImgPath,
                (i + 1) + "-" + myImgInfo.picIndex,
                myImgInfo.HP);

            // The helper reports failures through an "Error:" marker in its result string.
            myWriteLine(
                "Img Downloaded Result:" + saveRe,
                saveRe.IndexOf("Error:") != -1 ? ConsoleColor.Red : ConsoleColor.Green);
        }
        catch (Exception ex)
        {
            // A failed image must still be counted below, otherwise the page would never finish.
            myWriteLine("Img Downloaded Result:Error:" + ex.Message, ConsoleColor.Red);
        }

        // Counters are shared between concurrently running downloads, so update them under the lock.
        lock (finishListLocker)
        {
            finishImgCount++;
            if (++downloadCount == totalCount) // whole page done
            {
                finishPageCount++;
                OnPageFinished(null, new OnFinishedEventArgs(myImgInfo));
            }
        }
    };

    // Start all downloads in parallel; the returned tasks handle their own errors, so they are not awaited.
    Parallel.For(0, totalCount, i => { downloadOne(i); });
}
/// <summary>
/// Loads one content page, extracts its image URLs and, when at least one is found,
/// updates the fetch counters and raises <c>OnPageFetched</c> with the collected data.
/// </summary>
/// <param name="pageUrl">URL of the content page to load; also passed on in the request parameters.</param>
/// <param name="picIndex">Identifier of the page, used in log output and stored in the resulting <c>ImgInfo</c>.</param>
private void fetchImgFromContentPage(string pageUrl, string picIndex)
{
    string cookies;
    string pageHtml = MyHttp.getHtml(pageUrl, out cookies);

    // Image URLs filtered by the configured keys.
    string[] imgUrl = WebSiteConverter.imgReg(pageHtml, ws.imgKeys);

    // Turn protocol-relative and site-relative entries into absolute URLs.
    for (int n = 0; n < imgUrl.Length; n++)
    {
        string candidate = imgUrl[n];
        if (candidate.StartsWith("//"))
        {
            imgUrl[n] = "http:" + candidate;
        }
        else if (!candidate.StartsWith("http"))
        {
            imgUrl[n] = domain + candidate;
        }
    }

    // Page title made safe for use as a file name.
    string myTitle = MyHttp.getValidFileName(MyHttp.getHtmlTitle(pageHtml));

    if (imgUrl.Length <= 0)
    {
        myWriteLine("Fetch Page Invalid:Index[" + picIndex + "]Title:" + myTitle, ConsoleColor.Red);
        return;
    }

    myWriteLine("Fetch Page Successful:Index[" + picIndex + "]Title:" + myTitle);
    lock (addListLocker)
    {
        MyHttp.httpParameter requestParameter = new MyHttp.httpParameter(cookies, "", pageUrl);
        ImgInfo fetchedInfo = new ImgInfo(imgUrl, myTitle, picIndex, requestParameter);
        fetchPageCount++;               // pages fetched so far
        fetchImgCount += imgUrl.Length; // images discovered so far
        OnPageFetched(null, new OnFetchedEventArgs(fetchedInfo));
    }
}
/// <summary>
/// Walks the given content-page URLs, throttling while too many fetched pages are still
/// unfinished, and fetches each page unless it is recognised as already saved.
/// </summary>
/// <param name="page">Content-page URLs; entries without "://" are prefixed with <c>domain</c>.</param>
private void fetchImgUrlFromContentPage(string[] page)
{
    foreach (string pageUrl in page)
    {
        // Pause at least once, then keep pausing while the backlog is at its limit
        // and no stop has been requested.
        System.Threading.Thread.Sleep(1);
        while (fetchPageCount - finishPageCount >= MAX_WAIT_COUNT && !stopWorkFlag)
        {
            System.Threading.Thread.Sleep(1);
        }

        // Stop requested: abandon the remaining pages.
        if (stopWorkFlag)
        {
            break;
        }

        // Absolute URL of the page.
        string fullUrl;
        if (pageUrl.IndexOf(@"://") != -1)
        {
            fullUrl = pageUrl;
        }
        else
        {
            fullUrl = domain + pageUrl;
        }

        // Page identifier: last path segment without its extension, or an MD5-based
        // string of the URL when that segment contains no dot.
        string picIndex = pageUrl.Substring(pageUrl.LastIndexOf("/") + 1);
        int dotPosition = picIndex.LastIndexOf(".");
        if (dotPosition != -1)
        {
            picIndex = picIndex.Substring(0, dotPosition);
        }
        else
        {
            picIndex = MyHttp.get16bitMd5Str(pageUrl);
        }

        // A directory whose name starts with "<picIndex>-" marks the page as a repeat.
        bool isRepeat = JUMP_REPEATED_PAGE == true
                        && allDirectiories.IndexOf(@"\" + picIndex + "-") != -1;
        if (!isRepeat)
        {
            fetchImgFromContentPage(fullUrl, picIndex);
            continue;
        }

        myWriteLine("Fetch Page Jumped:Index[" + picIndex + "]", ConsoleColor.Red);
    }
}
/// <summary>
/// Builds the URL of a listing page, downloads it and returns the content-page links found in it.
/// </summary>
/// <param name="domain">Not read by this method; the URL is built from <c>ws.domain</c>.</param>
/// <param name="imgType">Not read by this method; the URL is built from <c>ws.imgType</c>.</param>
/// <param name="pageIndex">Index of the listing page to fetch.</param>
/// <returns>The links extracted from the listing page with <c>ws.pageRegex</c>.</returns>
private string[] fetchContentPageUrl(string domain, string imgType, long pageIndex)
{
    // Listing-page URL derived from the site configuration.
    string listingUrl = WebSiteConverter.getUrl(
        ws.urlPattern,
        ws.firstPageUrlPattern,
        ws.domain,
        ws.imgType,
        pageIndex);

    // Download the listing; the cookies it returns are stored in `cookies`.
    string listingHtml = MyHttp.getHtml(listingUrl, out cookies);

    // Extract the content-page links.
    string[] contentPageLinks = WebSiteConverter.pageReg(listingHtml, ws.pageRegex);
    return contentPageLinks;
}
/// <summary>
/// Extracts page links from <paramref name="html"/> using this instance's <c>pagePattern</c>.
/// </summary>
/// <param name="html">HTML text to search.</param>
/// <returns>The result of <c>MyHttp.regArr</c> for the given HTML and pattern.</returns>
public string[] pageReg(string html)
{
    string[] pageLinks = MyHttp.regArr(html, pagePattern);
    return pageLinks;
}
/// <summary>
/// Extracts image URLs from <paramref name="html"/> and filters them with this instance's <c>imgKeys</c>.
/// </summary>
/// <param name="html">HTML text to search.</param>
/// <returns>The result of <c>MyHttp.getHtmlImgWithKey</c> applied to the images found in the HTML.</returns>
public string[] imgReg(string html)
{
    // First collect the image entries, then filter them by key.
    var allImages = MyHttp.getHtmlImg(html);
    return MyHttp.getHtmlImgWithKey(allImages, imgKeys);
}
/// <summary>
/// Downloads the page at <paramref name="uri"/>, collects its anchor links, applies the
/// filters configured in <c>Settings</c> and writes every accepted absolute URL via <c>myWriteLine</c>.
/// </summary>
/// <param name="uri">Page to scan; also the base against which relative links are resolved.</param>
/// <remarks>
/// Links that cannot be turned into a valid URI are skipped instead of aborting the scan.
/// </remarks>
private void parseLinks(Uri uri)
{
    string html = MyHttp.getHtml(uri.AbsoluteUri);
    if (string.IsNullOrEmpty(html))
    {
        // Nothing to scan (Regex.Match would throw on null).
        return;
    }

    // href -> link text (tags stripped); a later duplicate href overwrites an earlier one.
    var urlDictionary = new Dictionary<string, string>();
    Match match = Regex.Match(html, "(?i)<a .*?href=\"([^\"]+)\"[^>]*>(.*?)</a>");
    while (match.Success)
    {
        string urlKey = match.Groups[1].Value;
        string urlValue = Regex.Replace(match.Groups[2].Value, "(?i)<.*?>", string.Empty);
        urlDictionary[urlKey] = urlValue;
        match = match.NextMatch();
    }

    foreach (var item in urlDictionary)
    {
        string href = item.Key;
        string text = item.Value;
        if (string.IsNullOrEmpty(href))
        {
            continue;
        }

        // Reject links ending with an excluded suffix.
        if (Settings.EscapeLinks != null && Settings.EscapeLinks.Count > 0
            && Settings.EscapeLinks.Any(suffix => href.EndsWith(suffix, StringComparison.OrdinalIgnoreCase)))
        {
            continue;
        }

        // When href keywords are configured, at least one must occur in the href.
        if (Settings.HrefKeywords != null && Settings.HrefKeywords.Count > 0
            && !Settings.HrefKeywords.Any(href.Contains))
        {
            continue;
        }

        // When text keywords are configured, at least one must occur in the link text.
        if (Settings.TextKeywords != null && Settings.TextKeywords.Count > 0
            && !Settings.TextKeywords.Any(text.Contains))
        {
            continue;
        }

        // Undo the escapes commonly found in hrefs; "&amp;" is the HTML entity for "&".
        string url = href.Replace("%3f", "?")
                         .Replace("%3d", "=")
                         .Replace("%2f", "/")
                         .Replace("&amp;", "&");

        if (string.IsNullOrEmpty(url)
            || url.StartsWith("#")
            || url.StartsWith("mailto:", StringComparison.OrdinalIgnoreCase)
            || url.StartsWith("javascript:", StringComparison.OrdinalIgnoreCase))
        {
            continue;
        }

        // Resolves relative links against the page URI and accepts absolute ones as they are;
        // a malformed href is skipped rather than throwing UriFormatException.
        Uri currentUri;
        if (!Uri.TryCreate(uri, url, out currentUri))
        {
            continue;
        }

        myWriteLine(currentUri.AbsoluteUri);
    }
}