/// <summary>
/// Worker loop of a download thread: while <c>isDownloadingUris</c> is set, polls
/// <c>uriQueue</c>, dequeues captured resources one at a time and downloads each of them.
/// </summary>
/// <param name="_hunter">
/// The <c>HunterDownloadThread</c> descriptor of the calling thread, boxed as
/// <c>object</c>; it is cast back and forwarded to <c>DownloadFile</c>.
/// </param>
private void threadDownloadUris(object _hunter)
{
    HunterDownloadThread hunterDownloadThread = (HunterDownloadThread)_hunter;

    while (isDownloadingUris)
    {
        Thread.Sleep(100); // Sleep between polls to save CPU.

        while (uriQueue.Count > 0)
        {
            // Blocks while hunterSwitch is unsignalled (presumably the pause/resume gate — confirm).
            hunterSwitch.WaitOne();

            // Use a thread-local reference for the actual download. The shared field is
            // still updated for any code that reads it, but another worker overwriting
            // the field can no longer redirect this download to a different instance.
            HunterDownload downloader = new HunterDownload(this);
            hDownload = downloader;

            UriResource u;
            lock (this)
            {
                // Re-check under the lock: another worker may have emptied the queue
                // between the unsynchronised Count test above and this point.
                if (uriQueue.Count > 0)
                {
                    u = uriQueue.Dequeue();
                }
                else
                {
                    break;
                }
            }

            DownloadInfo downloadFile = downloader.DownloadFile(u, this, hunterDownloadThread);

            if (downloadFile == null)
            {
                // Nothing was downloaded for this resource; nothing to clean up.
                continue;
            }

            if (downloadFile.Remove)
            {
                // The file was not accepted into the store: delete the partial/rejected download.
                try
                {
                    File.Delete(downloadFile.Filepath);
                }
                catch (Exception e)
                {
                    mHunterConsole.WriteException(e);
                }
            }
            else
            {
                // The file was stored successfully: record the progress.
                projectInfo.strategy.RefreshProgress(downloadFile.Index, downloadFile.Keyword);
            }
        }
    }

    // If this is the only hunter thread still alive, report completion.
    lock (this)
    {
        if (AliveHunterThreadsCount == 1)
        {
            mHunterConsole.Done();
        }
    }
}
/// <summary>
/// Downloads the file referenced by <paramref name="uriRes"/>, rejects it when its link or
/// MD5 is already known, and otherwise records it (and, in network mode, moves it together
/// with its XML descriptor to the shared cache folder).
/// </summary>
/// <param name="uriRes">The resource (URL, keyword and page index) to download.</param>
/// <param name="h">The owning hunter; supplies the project info and the cancellation flag.</param>
/// <param name="thisThread">Descriptor of the calling download thread; not used by this method.</param>
/// <returns>
/// <c>null</c> when the link is already known or an unexpected exception was caught;
/// otherwise a <c>DownloadInfo</c>. The sixth constructor argument (presumably the
/// removal flag — verify against <c>DownloadInfo</c>) is <c>false</c> for a stored file and
/// <c>true</c> for an MD5 duplicate or a timed-out/cancelled download.
/// </returns>
public DownloadInfo DownloadFile(UriResource uriRes, Hunter h, HunterDownloadThread thisThread)
{
    Database db = null;
    if (h.projectInfo.DatabaseHelper != null)
    {
        db = h.projectInfo.DatabaseHelper.GetDatabaseInstance();
    }

    flowCalculator.Interval = 1000;
    // Detach before attaching so repeated calls on the same instance never stack handlers.
    flowCalculator.Elapsed -= new ElapsedEventHandler(flowCalculator_Elapsed);
    flowCalculator.Elapsed += new ElapsedEventHandler(flowCalculator_Elapsed);

    HunterWebClient wc = null;
    bool clientDisposed = false; // true once the regular post-download Dispose has run
    bool flowStarted = false;    // true while this call has the flow timer running

    try
    {
        wc = new HunterWebClient();

        // ---- Is the link already known? ----
        // Check the local store first.
        bool isExist = database.LinkExists(uriRes.Url);

        // Network mode additionally checks the remote database.
        if (!isExist && (h.projectInfo.CurrentMode == ProjectInfo.HunterMode.network))
        {
            if (db == null)
            {
                hunterConsole.WriteException(new Exception("数据库连接失败,使用本地模式判重:" + uriRes.Url));
            }
            else
            {
                bool openFailed = false;
                try
                {
                    db.DbOpen();
                }
                catch (Exception ex)
                {
                    openFailed = true; // previously never set, so the guard below was dead
                    hunterConsole.WriteException(ex);
                }

                bool queryOk = false;
                if (!openFailed)
                {
                    queryOk = db.IsRecordExists(
                        "tb_file_infos",
                        out isExist,
                        new FieldValue("file_link", uriRes.Url.Replace("'", "\\'").Replace("\"", "\\\"")));
                }

                if (!queryOk || openFailed)
                {
                    // Remote check unavailable: fall back to the local (negative) result.
                    isExist = false;
                }

                try
                {
                    db.DbClose();
                }
                catch (Exception ex)
                {
                    hunterConsole.WriteException(ex);
                }
            }
        }

        if (isExist)
        {
            // Duplicate link: skip the download (the client is released in finally).
            hunterConsole.ReportAbandonURI(uriRes, "链接重复");
            return null;
        }

        // ---- Download ----
        try
        {
            hunterConsole.WriteDownload("正在下载文件:" + uriRes.Url);
            hunterConsole.WriteDownload(
                "线程ID:" + Thread.CurrentThread.ManagedThreadId + Environment.NewLine +
                "正在下载的文件:" + Environment.NewLine +
                "下载地址:" + uriRes.Url + Environment.NewLine +
                "下载的关键字:" + strategy.GetKeyword(uriRes.Keyword) + Environment.NewLine +
                "下载的页面页码:" + uriRes.index);
        }
        catch (Exception ex)
        {
            hunterConsole.WriteException(ex);
        }

        // Hook up the download-completed and progress-changed events.
        wc.DownloadFileCompleted += new System.ComponentModel.AsyncCompletedEventHandler(wc_DownloadFileCompleted);
        wc.DownloadProgressChanged += new DownloadProgressChangedEventHandler(wc_DownloadProgressChanged);

        // Pick a suitable local file name for the download.
        string filepath = HunterUtilities.GetFilenameFromUrl(pInfo, strategy, uriRes);
        int timeout = 0;

        flowCalculator.Start(); // start measuring throughput
        flowStarted = true;
        receive1 = 0;           // nothing received yet

        wc.DownloadKeyword = strategy.GetKeyword(uriRes.Keyword);
        wc.DownloadSource = uriRes.Url;
        wc.DownloadDestination = filepath;

        string targetDirectory = Path.GetDirectoryName(filepath);
        if (!Directory.Exists(targetDirectory))
        {
            Directory.CreateDirectory(targetDirectory);
        }

        wc.DownloadFileAsync(new Uri(uriRes.Url), filepath); // start the download

        bool downloadProblem = false;
        while (wc.IsBusy)
        {
            Thread.Sleep(1000); // poll once per second; 'timeout' counts the polls
            timeout++;

            if (timeout >= pInfo.timeout || h.DownloadCancelled)
            {
                // Timed out or cancelled. As in the original, cancel/dispose are repeated
                // on every poll until the client reports that it is no longer busy.
                wc.CancelAsync();
                wc.Dispose();

                if (!downloadProblem)
                {
                    // Report the abandonment only once.
                    hunterConsole.WriteDownload(
                        "线程ID:" + Thread.CurrentThread.ManagedThreadId + Environment.NewLine +
                        "下载超时,取消下载。");
                    try
                    {
                        hunterConsole.ReportAbandonDownloadInfo(
                            new DownloadInfo("", uriRes.Keyword, uriRes.index, uriRes.Url, "", true, "超时",
                                hunterProject.projectInfo.strategy.GetKeyword(uriRes.Keyword)),
                            "超时");
                    }
                    catch (Exception ex)
                    {
                        // Reporting is best effort, but the failure is no longer swallowed silently.
                        hunterConsole.WriteException(ex);
                    }
                }

                downloadProblem = true;
            }
        }

        flowCalculator.Stop();
        flowStarted = false;
        receive1 = 0;
        receive2 = 0;                                   // clear the flow counters
        hunterConsole.outputSpeedInfo(DateTime.Now, 0); // reset the displayed speed
        wc.Dispose();
        clientDisposed = true;

        if (!downloadProblem)
        {
            string MD5 = string.Empty;
            try
            {
                MD5 = HunterUtilities.GetMD5Hash(filepath);
            }
            catch (Exception e)
            {
                hunterConsole.WriteException(new Exception("无法获取MD5。"));
                hunterConsole.WriteException(e);
            }

            // ---- Is the file content (MD5) already known? ----
            // Check the local store first.
            bool isDuplicate = database.isDuplicate(uriRes.Url, MD5);

            // Network mode additionally checks the remote database.
            if (!isDuplicate && (h.projectInfo.CurrentMode == ProjectInfo.HunterMode.network))
            {
                if (db == null)
                {
                    hunterConsole.WriteException(new Exception("数据库连接失败,使用本地模式判重:" + MD5));
                }
                else
                {
                    bool openFailed = false;
                    try
                    {
                        db.DbOpen();
                    }
                    catch (Exception ex)
                    {
                        openFailed = true; // previously never set, so the guard below was dead
                        hunterConsole.WriteException(ex);
                    }

                    bool queryOk = false;
                    if (!openFailed)
                    {
                        queryOk = db.IsFileExists("tb_file_infos", out isDuplicate, MD5);
                    }

                    if (!queryOk || openFailed)
                    {
                        hunterConsole.WriteException(new Exception("数据库连接失败,使用本地模式判重:" + MD5));
                        isDuplicate = false;
                    }

                    try
                    {
                        db.DbClose();
                    }
                    catch (Exception ex)
                    {
                        hunterConsole.WriteException(ex);
                    }
                }
            }

            // ---- Act on the duplicate / non-duplicate result ----
            if (!isDuplicate)
            {
                // Not a duplicate: write the XML descriptor and record the file.
                wc.XMLFile = Path.Combine(hunterProject.projectInfo.filefolder, "$__" + Path.GetFileName(wc.DownloadDestination)) + ".xml";
                try
                {
                    HunterUtilities.WriteDownloadFileXML(
                        wc.DownloadSource,
                        wc.DownloadKeyword,
                        Path.GetFileName(wc.DownloadDestination),
                        (hunterProject.projectInfo.search_language == ProjectInfo.Language.none ? null : hunterProject.projectInfo.search_language.ToString()),
                        wc.XMLFile);
                }
                catch (Exception ex)
                {
                    hunterConsole.WriteException(ex);
                }

                database.addNewRecord(uriRes.Url, wc.DownloadKeyword, filepath, MD5);
                DownloadInfo d = new DownloadInfo(filepath, uriRes.Keyword, uriRes.index, uriRes.Url, MD5, false, "已下载",
                    hunterProject.projectInfo.strategy.GetKeyword(uriRes.Keyword));

                // Network mode: upload the sample to the shared cache folder.
                if (h.projectInfo.CurrentMode == ProjectInfo.HunterMode.network)
                {
                    // Recomputed as in the original; a failure here is handled by the outer catch.
                    String t_md5 = HunterUtilities.GetMD5Hash(filepath);
                    String filename = t_md5 + "_" + Path.GetFileName(filepath);
                    String combinedPath = Path.Combine(pInfo.share_remote_path, "cache", ProjectInfo.IP_ADDRESS + " (" + pInfo.name + ")");
                    String combinedFullPath = Path.Combine(combinedPath, filename);

                    if (!Directory.Exists(combinedPath))
                    {
                        Directory.CreateDirectory(combinedPath);
                    }

                    // Move the XML descriptor, replacing any stale copy at the destination.
                    try
                    {
                        if (File.Exists(wc.XMLFile))
                        {
                            String remoteXmlPath = Path.Combine(combinedPath, Path.GetFileName(wc.XMLFile));
                            File.Delete(remoteXmlPath);
                            File.Move(wc.XMLFile, remoteXmlPath);
                        }
                    }
                    catch (Exception ex)
                    {
                        hunterConsole.WriteException(ex);
                    }

                    // Move the downloaded file; up to maxMoveCount + 1 attempts, each failure logged.
                    const int maxMoveCount = 5;
                    for (int moveCount = 0; moveCount <= maxMoveCount; moveCount++)
                    {
                        try
                        {
                            File.Move(filepath, combinedFullPath);
                            break;
                        }
                        catch (Exception ex)
                        {
                            hunterConsole.WriteException(ex);
                        }
                    }
                }

                count++;
                hunterConsole.outputDownloadedFileNum(DateTime.Now, count);
                hunterConsole.ReportDownloadInfo(d);
                return d;
            }
            else
            {
                // Duplicate content: the returned info asks for the file to be removed.
                DownloadInfo d = new DownloadInfo(filepath, uriRes.Keyword, uriRes.index, uriRes.Url, MD5, true, "MD5重复",
                    hunterProject.projectInfo.strategy.GetKeyword(uriRes.Keyword));
                hunterConsole.ReportAbandonDownloadInfo(d, "MD5重复");
                return d;
            }
        }

        // Timed out or cancelled: the returned info asks for the partial file to be removed.
        // NOTE(review): the status text says "重复" (duplicate) although this path is only
        // reached after a timeout/cancel — confirm whether that is intended.
        return new DownloadInfo(filepath, uriRes.Keyword, uriRes.index, uriRes.Url, null, true, "重复",
            hunterProject.projectInfo.strategy.GetKeyword(uriRes.Keyword));
    }
    catch (Exception e)
    {
        // Placeholder for dedicated error handling: log and report "nothing downloaded".
        hunterConsole.WriteException(e);
        return null;
    }
    finally
    {
        // Release what the early-return and exception paths would otherwise leak.
        if (flowStarted)
        {
            flowCalculator.Stop();
        }
        if (wc != null && !clientDisposed)
        {
            wc.Dispose();
        }
    }
}
/// <summary>
/// Creates a hunter for a project: reloads the project from disk, optionally reads the
/// proxy list, opens the XML database and prepares (but does not start) the download,
/// proxy and URI-hunting threads.
/// </summary>
/// <param name="oh">Console that receives messages and exceptions.</param>
/// <param name="config">Hunter configuration (proxy usage and proxy filter keywords).</param>
/// <param name="_pj">Project whose paths are used to reload the project info.</param>
/// <param name="main">The main form, stored in <c>MainForm</c>.</param>
/// <remarks>
/// The constructor never throws for failures inside its body; it logs them and sets
/// <c>Error</c> to <c>true</c> so callers can detect an unusable instance.
/// </remarks>
public Hunter(HunterConsole oh, HunterConfig config, ProjectInfo _pj, HunterForm main)
{
    try
    {
        MainForm = main;
        Error = false;
        mHunterConsole = oh;
        projectInfo = _pj;
        mHunterConfig = config;
        ProxyFetcher = new HunterProxyFetcher(AvailableProxies);
        projectInfo = ProjectInfo.LoadProject(_pj.mHunterConsole, _pj.projectPath, _pj.strategyPath, true);

        downloadThreadNum = int.Parse(projectInfo.threadnum);

        // A negative count used to blow up in the array allocation below (OverflowException)
        // before the validation further down could report it; size the arrays with a
        // non-negative value and let the validation produce the proper message.
        bool threadCountValid = downloadThreadNum > 0;
        int slotCount = threadCountValid ? downloadThreadNum : 0;
        hunterThreads = new HunterDownloadThread[slotCount];
        // Threads that fetch usable proxies.
        ProxyGetThreads = new Thread[slotCount];

        if (mHunterConfig.UseProxy)
        {
            // 'using' closes the file even when reading fails.
            using (FileStream fs = new FileStream("proxy.hip", FileMode.Open, FileAccess.Read))
            using (StreamReader sr = new StreamReader(fs))
            {
                ProxyText = sr.ReadToEnd();
            }
            AllProxies = HunterProxy.GetProxy(ProxyText, mHunterConfig.ProxyFilterKeywords);
        }

        mHunterConsole.WriteMessage(projectInfo.ConfigInformation());
        mHunterConsole.WriteMessage("");
        mHunterConsole.WriteMessage(projectInfo.strategy.GetStrategyInformation());

        xmlDatabase = new XMLDatabase(projectInfo.database, mHunterConsole);
        xmlDatabase.openDatabase();

        if (!threadCountValid)
        {
            mHunterConsole.WriteMessage("配置错误:下载线程数不能小于0。");
            Error = true; // consistent with the missing-keywords failure below
            return;
        }

        for (int i = 0; i < hunterThreads.Length; i++)
        {
            hunterThreads[i] = new HunterDownloadThread();
            hunterThreads[i].downloadThread = new Thread(threadDownloadUris);
        }

        if (mHunterConfig.UseProxy)
        {
            for (int i = 0; i < ProxyGetThreads.Length; i++)
            {
                ProxyGetThreads[i] = new Thread(GetAvaliableProxies);
            }
        }

        if (projectInfo.strategy.Keywords.Count <= 0)
        {
            projectInfo.mHunterConsole.WriteMessage("没有找到关键字,任务取消。");
            Error = true;
            return;
        }

        thHuntUris = new Thread(threadHuntUris);
        thHuntUris.SetApartmentState(ApartmentState.STA);
        hUri = new HunterUri(this);
        projectInfo.strategy.RecordFirstWord();

        mHunterConsole.WriteMessage("下载线程总数:" + hunterThreads.Length);
        mHunterConsole.WriteMessage("读取配置完毕。");
        mHunterConsole.WriteMessage("正在运行任务...");
    }
    catch (Exception e)
    {
        // Construction was aborted part-way: flag the instance as unusable before logging.
        Error = true;
        mHunterConsole.WriteException(e);
    }
}