/// <summary>
/// Worker loop of a download thread. Takes image URLs from <c>downloadList</c>, downloads
/// each image, records its size on the matching <c>DownloadInfo</c>, and saves it to the
/// path returned by <c>GetImageSavePath</c> when both dimensions are larger than 400 pixels.
/// </summary>
/// <remarks>
/// The loop ends when <c>AppRunning</c> is false, or on the seventh time it finds both
/// <c>workList</c> and <c>downloadList</c> empty. Download and decode failures are logged and
/// the URL is dropped; a failed save is logged and the URL is put back on <c>downloadList</c>.
/// </remarks>
public void DownloadImageLoop()
{
    WriteLog("Download Thread is starting. id is:" + Thread.CurrentThread.ManagedThreadId);
    if (!Directory.Exists("D:\\WebDownload"))
    {
        Directory.CreateDirectory("D:\\WebDownload");
    }

    string downurl = "";
    DownloadInfo downinfo = null;
    // NOTE(review): this idle counter is never reset after a successful download, so the
    // empty polls are counted over the whole life of the thread -- confirm this is intended.
    int trytag = 0;

    while (true)
    {
        if (!AppRunning)
        {
            break;
        }
        if (!bIsStartWork)
        {
            // NOTE(review): Thread.Suspend is obsolete in .NET. It is kept because whatever
            // resumes this thread is outside this method.
            Thread.CurrentThread.Suspend();
        }

        try
        {
            if (workList.Count <= 0 && downloadList.Count <= 0)
            {
                if (trytag <= 5)
                {
                    trytag++;
                    Thread.Sleep(500);
                    continue;
                }
                else
                {
                    break;
                }
            }

            bool hasItem = false;
            lock (downloadList)
            {
                if (downloadList.Count > 0)
                {
                    downurl = downloadList.Dequeue();
                    downinfo = imageList[downurl];
                    hasItem = true;
                }
            }
            if (!hasItem)
            {
                // Sleep only after the lock is released so code that enqueues is not blocked.
                Thread.Sleep(500);
                continue;
            }

            var filepath = GetImageSavePath(downinfo.imageUrl);
            // If the file already exists, treat it as already downloaded and skip it.
            if (File.Exists(filepath))
            {
                continue;
            }

            try
            {
                Byte[] imgdata;
                using (WebClient mywebclient = new WebClient())
                {
                    imgdata = mywebclient.DownloadData(downinfo.imageUrl);
                }

                // The stream has to stay open for as long as the Image is in use, so the
                // image is the inner using and is disposed first.
                using (MemoryStream ms = new MemoryStream(imgdata))
                using (Image img = Image.FromStream(ms))
                {
                    downinfo.imgSize = img.Size;
                    if (img.Width > 400 && img.Height > 400)
                    {
                        var savefolder = GetImageSaveFolder(downinfo.imageUrl);
                        if (!Directory.Exists(savefolder))
                        {
                            Directory.CreateDirectory(savefolder);
                        }

                        try
                        {
                            img.Save(filepath);
                        }
                        catch (Exception exp)
                        {
                            WriteLog("Save Image error:" + exp.Message.ToString() + "\r\nThe url is :" + downinfo.workUrl + "\r\nThe save path is:" + filepath);
                            // Put the URL back so the save is attempted again later.
                            lock (downloadList)
                            {
                                downloadList.Enqueue(downurl);
                            }
                        }
                    }
                }
            }
            catch (Exception ex)
            {
                WriteLog("Download image failed:" + ex.Message.ToString() + "\r\nThe URL is :" + downurl);
            }
        }
        catch (Exception e)
        {
            WriteLog("Download image failed:" + e.Message.ToString());
        }
    }

    WriteLog("Download Thread is end. id is:" + Thread.CurrentThread.ManagedThreadId);
}
/// <summary>
/// Worker loop of a parser thread. Takes page URLs from <c>workList</c>, fetches each page
/// with <c>GetHtml</c>, and scans its tags: image URLs are registered in <c>imageList</c> and
/// queued on <c>downloadList</c>; link URLs that pass the site / page / keyword filters are
/// registered in <c>linkList</c> and queued on <c>workList</c>.
/// </summary>
/// <remarks>
/// The loop ends when <c>AppRunning</c> is false. Any exception raised while handling a page
/// is logged and the rest of that page is skipped.
/// </remarks>
public void ParserLoop()
{
    WriteLog("Parser Thread is starting. id is:" + Thread.CurrentThread.ManagedThreadId);
    while (true)
    {
        if (!AppRunning)
        {
            break;
        }
        if (!bIsStartWork)
        {
            // NOTE(review): Thread.Suspend is obsolete in .NET. It is kept because whatever
            // resumes this thread is outside this method.
            Thread.CurrentThread.Suspend();
        }

        string workurl = "";
        bool hasWork = false;
        lock (workList)
        {
            if (workList.Count > 0)
            {
                workurl = workList.Dequeue();
                hasWork = true;
            }
        }
        if (!hasWork)
        {
            // Sleep only after the lock is released so code that enqueues is not blocked.
            Thread.Sleep(500);
            continue;
        }

        // Root of the current URL, used to resolve site-relative paths. The search starts
        // at index 8 to step over the "scheme://" prefix; URLs shorter than that are used
        // as-is instead of letting IndexOf throw outside the try block.
        int rootEnd = workurl.Length >= 8 ? workurl.IndexOf('/', 8) : -1;
        string orgbaseurl = rootEnd == -1 ? workurl : workurl.Substring(0, rootEnd);

        try
        {
            string pageHtml = GetHtml(workurl, null);

            // Put every tag on its own line, then keep the non-trivial lines.
            pageHtml = pageHtml.Replace("<", "\n<");
            List<string> list = new List<string>();
            foreach (var rawLine in pageHtml.Split('\n'))
            {
                var line = rawLine.Replace("\t", "").Trim();
                if (line.Length > 5)
                {
                    list.Add(line);
                }
            }

            // The configured start URL without its last ".xxx" part; it does not change
            // while a page is scanned, so it is computed once per page.
            int lastDot = config.workUrl.LastIndexOf('.');
            string workpage = lastDot == -1 ? config.workUrl : config.workUrl.Substring(0, lastDot);

            string title = "no title";
            foreach (var item in list)
            {
                string splitstring;
                bool bParserIMG;
                if (-1 != item.IndexOf("<img", StringComparison.OrdinalIgnoreCase))
                {
                    splitstring = "src=";
                    bParserIMG = true;
                }
                else if (-1 != item.IndexOf("<a", StringComparison.OrdinalIgnoreCase))
                {
                    splitstring = "href=";
                    bParserIMG = false;
                }
                else if (-1 != item.IndexOf("<title", StringComparison.OrdinalIgnoreCase))
                {
                    // Take the text after the end of the opening tag, so that a tag with
                    // attributes (<title lang="en">) is handled as well.
                    int tagOpen = item.IndexOf("<title", StringComparison.OrdinalIgnoreCase);
                    int tagClose = item.IndexOf('>', tagOpen);
                    if (tagClose != -1)
                    {
                        int start = tagClose + 1;
                        int end = item.IndexOf(@"</title>", start, StringComparison.OrdinalIgnoreCase);
                        if (end == -1)
                        {
                            end = item.Length;
                        }
                        title = item.Substring(start, end - start);
                    }
                    continue;
                }
                else
                {
                    continue;
                }

                // Parse the URL out of the src= / href= attribute.
                if (-1 == item.IndexOf(splitstring, StringComparison.OrdinalIgnoreCase))
                {
                    continue;
                }

                string url = "";
                foreach (var subitem in item.Split(' '))
                {
                    if (-1 != subitem.IndexOf(splitstring, StringComparison.OrdinalIgnoreCase))
                    {
                        url = subitem.Substring(subitem.IndexOf('=') + 1);
                        break;
                    }
                }
                url = GetRealURL(url, orgbaseurl, workurl);

                // Links that do not belong to the current site are skipped.
                if (!bParserIMG && url.IndexOf(baseurl, StringComparison.OrdinalIgnoreCase) == -1)
                {
                    continue;
                }
                if (!bParserIMG && config.OnlyCurPage && url.IndexOf(workpage) == -1)
                {
                    continue;
                }

                // Some <a> tags point straight at an image; treat those as images.
                // EndsWith is safe for URLs shorter than four characters.
                if (!bParserIMG &&
                    (url.EndsWith("jpeg", StringComparison.OrdinalIgnoreCase) ||
                     url.EndsWith(".jpg", StringComparison.OrdinalIgnoreCase) ||
                     url.EndsWith(".png", StringComparison.OrdinalIgnoreCase) ||
                     url.EndsWith(".bmp", StringComparison.OrdinalIgnoreCase) ||
                     url.EndsWith(".gif", StringComparison.OrdinalIgnoreCase)))
                {
                    // Keep only the last absolute URL embedded in the link, starting at its
                    // scheme. Both "https" and "http" are recognised; a URL without "://"
                    // is left unchanged.
                    int schemeSep = url.LastIndexOf("://", StringComparison.Ordinal);
                    if (schemeSep >= 5 &&
                        string.Compare(url, schemeSep - 5, "https", 0, 5, StringComparison.OrdinalIgnoreCase) == 0)
                    {
                        url = url.Substring(schemeSep - 5);
                    }
                    else if (schemeSep >= 4)
                    {
                        url = url.Substring(schemeSep - 4);
                    }
                    bParserIMG = true;
                }

                if (bParserIMG)
                {
                    if (!imageList.ContainsKey(url))
                    {
                        // Process the title: strip the first configured page-split pattern
                        // that matches.
                        string samiltitle = title;
                        foreach (var split in config.pageSplit)
                        {
                            Regex rg = new Regex(split);
                            if (rg.Match(title).Value != "")
                            {
                                samiltitle = rg.Replace(title, "");
                                break;
                            }
                        }
                        // Remove the separator commonly used for multi-page titles.
                        samiltitle = samiltitle.Replace("-", "").Trim();

                        // Prepare the download info.
                        DownloadInfo info = new DownloadInfo();
                        info.imageUrl = url;
                        info.baseUrl = orgbaseurl;
                        info.title = title;
                        info.samilTitle = samiltitle;
                        info.workUrl = workurl;

                        // DownloadImageLoop reads imageList and dequeues downloadList under
                        // this same lock, so register and enqueue under it too.
                        lock (downloadList)
                        {
                            imageList.Add(url, info);
                            downloadList.Enqueue(url);
                        }
                        lock (addToViewList)
                        {
                            addToViewList.Add(url);
                        }
                    }
                }
                else
                {
                    // NOTE(review): linkList is accessed without a lock here -- confirm that
                    // only one parser thread runs.
                    if (config.UrlKeyWords != null && -1 == url.IndexOf(config.UrlKeyWords))
                    {
                        continue;
                    }
                    if (!linkList.ContainsKey(url))
                    {
                        linkList.Add(url, null);
                        lock (workList)
                        {
                            workList.Enqueue(url);
                        }
                    }
                }
            }
        }
        catch (Exception e)
        {
            WriteLog("Parser fialed:" + e.ToString());
        }
    }

    WriteLog("Parser Thread is end. id is:" + Thread.CurrentThread.ManagedThreadId);
}
/// <summary>
/// Worker loop of a parser thread. Takes page URLs from <c>workList</c>, fetches each page
/// with <c>GetHtml</c>, and scans its tags: image URLs are registered in <c>imageList</c> and
/// queued on <c>downloadList</c>; link URLs that pass the site / page / keyword filters are
/// registered in <c>linkList</c> and queued on <c>workList</c>.
/// </summary>
/// <remarks>
/// The loop ends when <c>AppRunning</c> is false. Any exception raised while handling a page
/// is logged and the rest of that page is skipped.
/// </remarks>
public void ParserLoop()
{
    WriteLog("Parser Thread is starting. id is:" + Thread.CurrentThread.ManagedThreadId);
    while (true)
    {
        if (!AppRunning)
        {
            break;
        }
        if (!bIsStartWork)
        {
            // NOTE(review): Thread.Suspend is obsolete in .NET. It is kept because whatever
            // resumes this thread is outside this method.
            Thread.CurrentThread.Suspend();
        }

        string workurl = "";
        bool hasWork = false;
        lock (workList)
        {
            if (workList.Count > 0)
            {
                workurl = workList.Dequeue();
                hasWork = true;
            }
        }
        if (!hasWork)
        {
            // Sleep only after the lock is released so code that enqueues is not blocked.
            Thread.Sleep(500);
            continue;
        }

        // Root of the current URL, used to resolve site-relative paths. The search starts
        // at index 8 to step over the "scheme://" prefix; URLs shorter than that are used
        // as-is instead of letting IndexOf throw outside the try block.
        int rootEnd = workurl.Length >= 8 ? workurl.IndexOf('/', 8) : -1;
        string orgbaseurl = rootEnd == -1 ? workurl : workurl.Substring(0, rootEnd);

        try
        {
            string pageHtml = GetHtml(workurl, null);

            // Put every tag on its own line, then keep the non-trivial lines.
            pageHtml = pageHtml.Replace("<", "\n<");
            List<string> list = new List<string>();
            foreach (var rawLine in pageHtml.Split('\n'))
            {
                var line = rawLine.Replace("\t", "").Trim();
                if (line.Length > 5)
                {
                    list.Add(line);
                }
            }

            // The configured start URL without its last ".xxx" part; it does not change
            // while a page is scanned, so it is computed once per page.
            int lastDot = config.workUrl.LastIndexOf('.');
            string workpage = lastDot == -1 ? config.workUrl : config.workUrl.Substring(0, lastDot);

            string title = "no title";
            foreach (var item in list)
            {
                string splitstring;
                bool bParserIMG;
                if (-1 != item.IndexOf("<img", StringComparison.OrdinalIgnoreCase))
                {
                    splitstring = "src=";
                    bParserIMG = true;
                }
                else if (-1 != item.IndexOf("<a", StringComparison.OrdinalIgnoreCase))
                {
                    splitstring = "href=";
                    bParserIMG = false;
                }
                else if (-1 != item.IndexOf("<title", StringComparison.OrdinalIgnoreCase))
                {
                    // Take the text after the end of the opening tag, so that a tag with
                    // attributes (<title lang="en">) is handled as well.
                    int tagOpen = item.IndexOf("<title", StringComparison.OrdinalIgnoreCase);
                    int tagClose = item.IndexOf('>', tagOpen);
                    if (tagClose != -1)
                    {
                        int start = tagClose + 1;
                        int end = item.IndexOf(@"</title>", start, StringComparison.OrdinalIgnoreCase);
                        if (end == -1)
                        {
                            end = item.Length;
                        }
                        title = item.Substring(start, end - start);
                    }
                    continue;
                }
                else
                {
                    continue;
                }

                // Parse the URL out of the src= / href= attribute.
                if (-1 == item.IndexOf(splitstring, StringComparison.OrdinalIgnoreCase))
                {
                    continue;
                }

                string url = "";
                foreach (var subitem in item.Split(' '))
                {
                    if (-1 != subitem.IndexOf(splitstring, StringComparison.OrdinalIgnoreCase))
                    {
                        url = subitem.Substring(subitem.IndexOf('=') + 1);
                        break;
                    }
                }
                url = GetRealURL(url, orgbaseurl, workurl);

                // Links that do not belong to the current site are skipped.
                if (!bParserIMG && url.IndexOf(baseurl, StringComparison.OrdinalIgnoreCase) == -1)
                {
                    continue;
                }
                if (!bParserIMG && config.OnlyCurPage && url.IndexOf(workpage) == -1)
                {
                    continue;
                }

                // Some <a> tags point straight at an image; treat those as images.
                // EndsWith is safe for URLs shorter than four characters.
                if (!bParserIMG &&
                    (url.EndsWith("jpeg", StringComparison.OrdinalIgnoreCase) ||
                     url.EndsWith(".jpg", StringComparison.OrdinalIgnoreCase) ||
                     url.EndsWith(".png", StringComparison.OrdinalIgnoreCase) ||
                     url.EndsWith(".bmp", StringComparison.OrdinalIgnoreCase) ||
                     url.EndsWith(".gif", StringComparison.OrdinalIgnoreCase)))
                {
                    // Keep only the last absolute URL embedded in the link, starting at its
                    // scheme. Both "https" and "http" are recognised; a URL without "://"
                    // is left unchanged.
                    int schemeSep = url.LastIndexOf("://", StringComparison.Ordinal);
                    if (schemeSep >= 5 &&
                        string.Compare(url, schemeSep - 5, "https", 0, 5, StringComparison.OrdinalIgnoreCase) == 0)
                    {
                        url = url.Substring(schemeSep - 5);
                    }
                    else if (schemeSep >= 4)
                    {
                        url = url.Substring(schemeSep - 4);
                    }
                    bParserIMG = true;
                }

                if (bParserIMG)
                {
                    if (!imageList.ContainsKey(url))
                    {
                        // Process the title: strip the first configured page-split pattern
                        // that matches.
                        string samiltitle = title;
                        foreach (var split in config.pageSplit)
                        {
                            Regex rg = new Regex(split);
                            if (rg.Match(title).Value != "")
                            {
                                samiltitle = rg.Replace(title, "");
                                break;
                            }
                        }
                        // Remove the separator commonly used for multi-page titles.
                        samiltitle = samiltitle.Replace("-", "").Trim();

                        // Prepare the download info.
                        DownloadInfo info = new DownloadInfo();
                        info.imageUrl = url;
                        info.baseUrl = orgbaseurl;
                        info.title = title;
                        info.samilTitle = samiltitle;
                        info.workUrl = workurl;

                        // DownloadImageLoop reads imageList and dequeues downloadList under
                        // this same lock, so register and enqueue under it too.
                        lock (downloadList)
                        {
                            imageList.Add(url, info);
                            downloadList.Enqueue(url);
                        }
                        lock (addToViewList)
                        {
                            addToViewList.Add(url);
                        }
                    }
                }
                else
                {
                    // NOTE(review): linkList is accessed without a lock here -- confirm that
                    // only one parser thread runs.
                    if (config.UrlKeyWords != null && -1 == url.IndexOf(config.UrlKeyWords))
                    {
                        continue;
                    }
                    if (!linkList.ContainsKey(url))
                    {
                        linkList.Add(url, null);
                        lock (workList)
                        {
                            workList.Enqueue(url);
                        }
                    }
                }
            }
        }
        catch (Exception e)
        {
            WriteLog("Parser fialed:" + e.ToString());
        }
    }

    WriteLog("Parser Thread is end. id is:" + Thread.CurrentThread.ManagedThreadId);
}