/// <summary>
/// Records one abandoned link: increments the abandoned-link counter, refreshes the
/// counter label, appends an entry to the in-memory list (which is capped at 100
/// entries, oldest removed first) and writes the entry out via WriteAbandonUri.
/// </summary>
/// <param name="u">The resource that was abandoned.</param>
/// <param name="reason">Why the resource was abandoned.</param>
void ReportAbandonURI(UriResource u, string reason)
{
    const int maxKeptEntries = 100;

    try
    {
        AbandonLinks += 1;
        lbAbandonLinks.Text = strAbandonLinks + AbandonLinks;

        AbandonUri entry = new AbandonUri { Info = u, Reason = reason };
        AbandonLinkList.Add(entry);

        // Keep the list bounded by dropping the oldest entry.
        if (AbandonLinkList.Count > maxKeptEntries)
        {
            AbandonLinkList.RemoveAt(0);
        }

        // Resolving the keyword text is best-effort: any failure yields a placeholder.
        try
        {
            entry.Keyword = hunter.projectInfo.strategy.GetKeyword(u.Keyword);
        }
        catch
        {
            entry.Keyword = "[获取出错]";
        }

        WriteAbandonUri(entry);
    }
    catch (Exception failure)
    {
        // Anything else that goes wrong is handed to the exception writer.
        WriteException(failure);
    }
}
/// <summary>
/// Forwards an abandoned-link report to the handler attached to onReportAbandonURI.
/// Does nothing when no handler is attached.
/// </summary>
/// <param name="u">The resource that was abandoned.</param>
/// <param name="reason">Why the resource was abandoned.</param>
public void ReportAbandonURI(UriResource u, string reason)
{
    // Take a snapshot first: testing the field for null and then reading it again
    // for the call lets a concurrent detach slip in between and cause a
    // NullReferenceException.
    var handler = onReportAbandonURI;
    if (handler != null)
    {
        handler(u, reason);
    }
}
/// <summary>
/// Downloads the file behind <paramref name="uriRes"/> unless its link or its MD5 hash is already known.
/// </summary>
/// <remarks>
/// The link is checked against the local store first and, in network mode, against the
/// "tb_file_infos" table. After a completed download the same two-step check is repeated
/// with the file's MD5 hash. A new file is recorded locally, described by a side-car XML
/// file and, in network mode, moved (together with the XML) into the shared "cache" folder.
/// The calling thread is blocked, polling once per second, until the download finishes,
/// reaches the project timeout or is cancelled.
/// </remarks>
/// <param name="uriRes">The resource to download (URL, keyword reference, page index).</param>
/// <param name="h">The hunter whose project settings and cancellation flag are consulted.</param>
/// <param name="thisThread">Not used by this method.</param>
/// <returns>
/// <c>null</c> when the link is already known or an unexpected exception occurred;
/// otherwise a DownloadInfo whose status text is "已下载" (stored), "MD5重复"
/// (content already known) or "重复" (download timed out or was cancelled).
/// </returns>
public DownloadInfo DownloadFile(UriResource uriRes, Hunter h, HunterDownloadThread thisThread)
{
    Database db = null;
    if (h.projectInfo.DatabaseHelper != null)
    {
        db = h.projectInfo.DatabaseHelper.GetDatabaseInstance();
    }

    flowCalculator.Interval = 1000;
    // Detach before attaching so that repeated calls do not stack up duplicate handlers
    // (removing a handler that is not attached is a no-op).
    flowCalculator.Elapsed -= new ElapsedEventHandler(flowCalculator_Elapsed);
    flowCalculator.Elapsed += new ElapsedEventHandler(flowCalculator_Elapsed);

    HunterWebClient wc = null;
    try
    {
        wc = new HunterWebClient();

        // ---- Is the link already known? ----
        // Check the local store first.
        bool isExist = database.LinkExists(uriRes.Url);

        // Network mode additionally compares against the database.
        if (!isExist && (h.projectInfo.CurrentMode == ProjectInfo.HunterMode.network))
        {
            if (db == null)
            {
                hunterConsole.WriteException(new Exception("数据库连接失败,使用本地模式判重:" + uriRes.Url));
            }
            else
            {
                bool openFailed = false;
                try
                {
                    db.DbOpen();
                }
                catch (Exception ex)
                {
                    openFailed = true;
                    hunterConsole.WriteException(ex);
                }

                // Only query when the connection could be opened; otherwise fall back to
                // the local result (link treated as not present in the database).
                bool querySucceeded = false;
                if (!openFailed)
                {
                    // NOTE(review): the URL is escaped by hand before being passed to the
                    // database layer; a parameterised query would be safer if Database offers one.
                    querySucceeded = db.IsRecordExists(
                        "tb_file_infos",
                        out isExist,
                        new FieldValue("file_link", uriRes.Url.Replace("'", "\\'").Replace("\"", "\\\"")));
                }
                if (!querySucceeded)
                {
                    isExist = false;
                }

                try
                {
                    db.DbClose();
                }
                catch (Exception ex)
                {
                    hunterConsole.WriteException(ex);
                }
            }
        }

        if (isExist)
        {
            // Known link: report it as abandoned and skip the download.
            hunterConsole.ReportAbandonURI(uriRes, "链接重复");
            return null;
        }

        // ---- Download ----
        try
        {
            hunterConsole.WriteDownload("正在下载文件:" + uriRes.Url);
            hunterConsole.WriteDownload(
                "线程ID:" + Thread.CurrentThread.ManagedThreadId + Environment.NewLine
                + "正在下载的文件:" + Environment.NewLine
                + "下载地址:" + uriRes.Url + Environment.NewLine
                + "下载的关键字:" + strategy.GetKeyword(uriRes.Keyword) + Environment.NewLine
                + "下载的页面页码:" + uriRes.index);
        }
        catch (Exception ex)
        {
            hunterConsole.WriteException(ex);
        }

        // Hook the completion and progress events of the web client.
        wc.DownloadFileCompleted += new System.ComponentModel.AsyncCompletedEventHandler(wc_DownloadFileCompleted);
        wc.DownloadProgressChanged += new DownloadProgressChangedEventHandler(wc_DownloadProgressChanged);

        // Pick a target file name that does not collide with an existing file.
        string filepath = HunterUtilities.GetFilenameFromUrl(pInfo, strategy, uriRes);
        int timeout = 0;

        flowCalculator.Start(); // start measuring throughput
        receive1 = 0;           // nothing received yet

        wc.DownloadKeyword = strategy.GetKeyword(uriRes.Keyword);
        wc.DownloadSource = uriRes.Url;
        wc.DownloadDestination = filepath;

        if (!Directory.Exists(Path.GetDirectoryName(filepath)))
        {
            Directory.CreateDirectory(Path.GetDirectoryName(filepath));
        }

        wc.DownloadFileAsync(new Uri(uriRes.Url), filepath);

        bool downloadProblem = false;
        while (wc.IsBusy)
        {
            Thread.Sleep(1000); // poll once per second
            timeout++;

            if (timeout >= pInfo.timeout || h.DownloadCancelled)
            {
                // Timed out or cancelled: abort the transfer.
                wc.CancelAsync();
                wc.Dispose();

                if (!downloadProblem)
                {
                    // Report only once, even if the client stays busy for further iterations.
                    hunterConsole.WriteDownload(
                        "线程ID:" + Thread.CurrentThread.ManagedThreadId + Environment.NewLine
                        + "下载超时,取消下载。");
                    try
                    {
                        hunterConsole.ReportAbandonDownloadInfo(
                            new DownloadInfo("", uriRes.Keyword, uriRes.index, uriRes.Url, "", true, "超时",
                                hunterProject.projectInfo.strategy.GetKeyword(uriRes.Keyword)),
                            "超时");
                    }
                    catch
                    {
                        // Deliberately best-effort: a failure to report must not break the cancel path.
                    }
                }
                downloadProblem = true;
            }
        }

        flowCalculator.Stop();
        receive1 = 0;
        receive2 = 0; // reset the throughput counters
        hunterConsole.outputSpeedInfo(DateTime.Now, 0); // show zero throughput
        wc.Dispose();

        if (downloadProblem)
        {
            // NOTE(review): the returned status text is "重复" although the console was told "超时" - confirm intent.
            return new DownloadInfo(filepath, uriRes.Keyword, uriRes.index, uriRes.Url, null, true, "重复",
                hunterProject.projectInfo.strategy.GetKeyword(uriRes.Keyword));
        }

        // ---- Download completed: hash the file ----
        string MD5 = string.Empty;
        try
        {
            MD5 = HunterUtilities.GetMD5Hash(filepath);
        }
        catch (Exception e)
        {
            hunterConsole.WriteException(new Exception("无法获取MD5。"));
            hunterConsole.WriteException(e);
        }

        // ---- Is the content (MD5) already known? ----
        // Check the local store first.
        bool isDuplicate = database.isDuplicate(uriRes.Url, MD5);

        // Network mode additionally compares against the database.
        if (!isDuplicate && (h.projectInfo.CurrentMode == ProjectInfo.HunterMode.network))
        {
            if (db == null)
            {
                hunterConsole.WriteException(new Exception("数据库连接失败,使用本地模式判重:" + MD5));
            }
            else
            {
                bool openFailed = false;
                try
                {
                    db.DbOpen();
                }
                catch (Exception ex)
                {
                    openFailed = true;
                    hunterConsole.WriteException(ex);
                }

                // Same fallback as for the link check: no usable connection means the
                // local result stands.
                bool querySucceeded = false;
                if (!openFailed)
                {
                    querySucceeded = db.IsFileExists("tb_file_infos", out isDuplicate, MD5);
                }
                if (!querySucceeded)
                {
                    hunterConsole.WriteException(new Exception("数据库连接失败,使用本地模式判重:" + MD5));
                    isDuplicate = false;
                }

                try
                {
                    db.DbClose();
                }
                catch (Exception ex)
                {
                    hunterConsole.WriteException(ex);
                }
            }
        }

        if (isDuplicate)
        {
            DownloadInfo duplicateInfo = new DownloadInfo(filepath, uriRes.Keyword, uriRes.index, uriRes.Url, MD5, true, "MD5重复",
                hunterProject.projectInfo.strategy.GetKeyword(uriRes.Keyword));
            hunterConsole.ReportAbandonDownloadInfo(duplicateInfo, "MD5重复");
            return duplicateInfo;
        }

        // ---- New file: write the side-car XML and record it ----
        // The custom properties of wc are still read after Dispose(), as before.
        wc.XMLFile = Path.Combine(hunterProject.projectInfo.filefolder, "$__" + Path.GetFileName(wc.DownloadDestination)) + ".xml";
        try
        {
            HunterUtilities.WriteDownloadFileXML(
                wc.DownloadSource,
                wc.DownloadKeyword,
                Path.GetFileName(wc.DownloadDestination),
                (hunterProject.projectInfo.search_language == ProjectInfo.Language.none
                    ? null
                    : hunterProject.projectInfo.search_language.ToString()),
                wc.XMLFile);
        }
        catch (Exception ex)
        {
            hunterConsole.WriteException(ex);
        }

        database.addNewRecord(uriRes.Url, wc.DownloadKeyword, filepath, MD5);
        DownloadInfo d = new DownloadInfo(filepath, uriRes.Keyword, uriRes.index, uriRes.Url, MD5, false, "已下载",
            hunterProject.projectInfo.strategy.GetKeyword(uriRes.Keyword));

        // Network mode: move the file and its XML into the shared cache folder.
        if (h.projectInfo.CurrentMode == ProjectInfo.HunterMode.network)
        {
            String t_md5 = HunterUtilities.GetMD5Hash(filepath);
            String filename = t_md5 + "_" + Path.GetFileName(filepath);
            String combinedPath = Path.Combine(pInfo.share_remote_path, "cache", ProjectInfo.IP_ADDRESS + " (" + pInfo.name + ")");
            String combinedFullPath = Path.Combine(combinedPath, filename);

            if (!Directory.Exists(combinedPath))
            {
                Directory.CreateDirectory(combinedPath);
            }

            bool fileMoveSuccess = false;
            const int maxMoveCount = 5;
            int moveCount = 0;

            try
            {
                if (File.Exists(wc.XMLFile))
                {
                    // Replace any XML of the same name that is already in the cache folder.
                    File.Delete(Path.Combine(combinedPath, Path.GetFileName(wc.XMLFile)));
                    File.Move(wc.XMLFile, Path.Combine(combinedPath, Path.GetFileName(wc.XMLFile)));
                }
            }
            catch (Exception e)
            {
                hunterConsole.WriteException(e);
            }

            // Retry the move until it succeeds or the failure count exceeds maxMoveCount.
            while (!fileMoveSuccess)
            {
                try
                {
                    if (moveCount > maxMoveCount)
                    {
                        break;
                    }
                    File.Move(filepath, combinedFullPath);
                    fileMoveSuccess = true;
                }
                catch (Exception e)
                {
                    moveCount++;
                    hunterConsole.WriteException(e);
                }
            }
        }

        count++;
        hunterConsole.outputDownloadedFileNum(DateTime.Now, count);
        hunterConsole.ReportDownloadInfo(d);
        return d;
    }
    catch (Exception e)
    {
        // Placeholder for more specific error handling.
        hunterConsole.WriteException(e);
        return null;
    }
    finally
    {
        // Release the web client on every path, including the duplicate-link return and
        // the exception path, which previously left it undisposed.
        if (wc != null)
        {
            wc.Dispose();
        }
    }
}
/// <summary>
/// Fetches the page at urlAddress, matches it against the strategy's configured link regex
/// and queues every link whose target ends with "." + the strategy's file type.
/// </summary>
/// <param name="proxy">Proxy to route the page request through; <c>null</c> means no proxy.</param>
/// <param name="main">Passed through to GetPageHtml.</param>
/// <returns>
/// The link texts matched on this page (whether or not they were queued). When the page
/// request fails with a WebException the list instead holds a single placeholder entry
/// ("{/WebException/}" followed by a random number) so that it is not empty.
/// </returns>
public List<string> HuntUris(HunterProxy proxy, HunterForm main)
{
    Regex linkReg = null;
    String htmlCode = null;
    List<string> thisURL = new List<string>(); // every link text matched during this call

    try
    {
        // The configured pattern is expected to expose the groups "url" and "text".
        linkReg = new Regex(strategy.StrategyData.configuration.Regex);

        WebProxy webproxy = null;
        if (proxy != null)
        {
            webproxy = new WebProxy(proxy.IPAndPort);
            mHunterConsole.WriteDetails("正在使用代理:" + proxy.IPAndPort + "(" + proxy.Description + ")");
        }

        mHunterConsole.WriteDetails("准备分析页面:" + urlAddress);
        htmlCode = GetPageHtml(webproxy, main);
        mHunterConsole.WriteHTML(htmlCode);
    }
    catch (WebException ex)
    {
        // Request failed (e.g. timed out): hand back a non-empty placeholder result.
        thisURL.Add("{/WebException/}" + new Random().Next().ToString());
        mHunterConsole.WriteDetails("页面" + urlAddress + "请求失败。原因:" + ex.Message);
        mHunterConsole.ReportAbandonURI(
            new UriResource(urlAddress, strategy.CurrentKeywordProgress, strategy.CurrentSearchProgress, null),
            ex.Message);
        return thisURL;
    }
    catch (Exception ex)
    {
        mHunterConsole.WriteException(ex);
    }

    // Without a regex or without page content there is nothing to match. Returning here
    // avoids a second, misleading null-related exception being logged by the block below.
    if (linkReg == null || htmlCode == null)
    {
        return thisURL;
    }

    try
    {
        // Loop-invariant helpers, built once instead of once per match.
        Regex tagStripper = new Regex("<(.*?)>");
        bool followRedirect = string.Equals(
            strategy.StrategyData.configuration.Redirect, "true", StringComparison.OrdinalIgnoreCase);

        for (Match m = linkReg.Match(htmlCode); m.Success; m = m.NextMatch())
        {
            allCount++;
            mHunterConsole.outputAnalysedUris(DateTime.Now, allCount);

            // Remember the raw link text of every match.
            string linkText = m.Result("${text}");
            thisURL.Add(linkText);

            if (strategy.HasForbiddenWord(linkText))
            {
                continue; // contains a forbidden word: do not download
            }

            // Strip anything in angle brackets from the link text.
            linkText = tagStripper.Replace(linkText, "");

            string rawUrl = m.Result("${url}");
            string uri;
            try
            {
                uri = followRedirect ? GetTheRedirectUrl(rawUrl) : rawUrl;
            }
            catch (WebException)
            {
                mHunterConsole.WriteDetails("链接" + rawUrl + "重定向超时。");
                mHunterConsole.ReportAbandonURI(
                    new UriResource(rawUrl, strategy.CurrentKeywordProgress, strategy.CurrentSearchProgress, null),
                    "重定向超时");
                continue;
            }

            // A null redirect result is skipped instead of aborting the whole page.
            if (uri != null && uri.EndsWith("." + strategy.Filetype, StringComparison.Ordinal))
            {
                availableCount++;
                mHunterConsole.outputAvailableUris(DateTime.Now, availableCount);

                UriResource u = new UriResource(
                    uri, strategy.CurrentKeywordProgress, strategy.CurrentSearchProgress, linkText);

                // Other threads may already have queued the same resource.
                if (!uriQueue.Contains(u))
                {
                    uriQueue.Enqueue(u);
                }

                mHunterConsole.outputDownloadingUriInfo(DateTime.Now,
                    "找到的资源的URL:" + u.Url + Environment.NewLine
                    + "标题:" + u.Text + Environment.NewLine
                    + "关键字:" + strategy.GetKeyword(u.Keyword) + Environment.NewLine
                    + "搜索页码:" + u.index + Environment.NewLine
                    + "已列入下载队列。");
                mHunterConsole.WriteDetails("正在获得有效URI:" + uri);
            }
        }
    }
    catch (Exception ex)
    {
        mHunterConsole.WriteException(ex);
    }

    return thisURL;
}
/// <summary>
/// Derives a not-yet-used target file path for a resource.
/// </summary>
/// <remarks>
/// The base name is chosen in this order:
/// 1. the HTML-decoded, legalized link text;
/// 2. if that is empty or flagged by <c>HasConfusionString</c> (e.g. garbled text),
///    the file name taken from the URL;
/// 3. if the URL does not match either, the current time as a file-time number.
/// If a file of that name already exists in the project folder, "_1", "_2", ... is
/// appended until an unused name is found.
/// </remarks>
/// <param name="pInfo">Project settings; supplies the target folder.</param>
/// <param name="strategy">Search strategy; supplies the file type (extension) and the garbled-text check.</param>
/// <param name="uriRes">The resource whose link text and URL are used to build the name.</param>
/// <returns>Full path (folder + name + "." + file type) that did not exist when checked.</returns>
public static string GetFilenameFromUrl(ProjectInfo pInfo, Strategy strategy, UriResource uriRes)
{
    string extension = "." + strategy.Filetype;

    // Strategy 1: link text, HTML-decoded and made legal as a file name.
    string basicName = HunterUtilities.LegalizeFile(WebUtility.HtmlDecode(uriRes.Text));

    // An empty title would otherwise produce a file called just ".ext".
    if (string.IsNullOrWhiteSpace(basicName) || strategy.HasConfusionString(basicName))
    {
        // Strategy 2: file name from the URL. Both http and https are accepted, and the
        // file type is escaped so that it is matched literally.
        Regex regFilename = new Regex("^https?://(.*)/(?<filename>.*)\\." + Regex.Escape(strategy.Filetype));
        Match mFilename = regFilename.Match(uriRes.Url);

        string urlName = null;
        if (mFilename.Success)
        {
            // URL segments may contain characters that are not allowed in file names.
            urlName = HunterUtilities.LegalizeFile(mFilename.Groups["filename"].Value);
        }

        // Strategy 3: fall back to a time-based name.
        basicName = string.IsNullOrWhiteSpace(urlName)
            ? DateTime.Now.ToFileTime().ToString()
            : urlName;
    }

    // Avoid overwriting an existing file by appending a running number.
    string filepath = Path.Combine(pInfo.filefolder, basicName + extension);
    for (int i = 1; File.Exists(filepath); i++)
    {
        filepath = Path.Combine(pInfo.filefolder, basicName + "_" + i.ToString() + extension);
    }

    return filepath;
}