Пример #1
0
        /// <summary>
        /// Records an abandoned URI: increments the abandoned-link counter and refreshes
        /// <c>lbAbandonLinks</c>, appends an entry to <c>AbandonLinkList</c> (removing the oldest
        /// entry once the list holds more than 100 items) and hands the entry to
        /// <c>WriteAbandonUri</c>. Any exception raised on the way is passed to <c>WriteException</c>.
        /// </summary>
        /// <param name="u">The resource that was abandoned.</param>
        /// <param name="reason">Why the resource was abandoned.</param>
        void ReportAbandonURI(UriResource u, string reason)
        {
            try
            {
                // Counter first, then the label that displays it.
                ++AbandonLinks;
                lbAbandonLinks.Text = strAbandonLinks + AbandonLinks;

                AbandonUri entry = new AbandonUri { Info = u, Reason = reason };

                // Append, then drop the head if the list grew past 100 entries.
                AbandonLinkList.Add(entry);
                if (AbandonLinkList.Count > 100)
                {
                    AbandonLinkList.RemoveAt(0);
                }

                // The keyword lookup is best effort: a failure only yields a placeholder text.
                try
                {
                    entry.Keyword = hunter.projectInfo.strategy.GetKeyword(u.Keyword);
                }
                catch
                {
                    entry.Keyword = "[获取出错]";
                }

                WriteAbandonUri(entry);
            }
            catch (Exception failure)
            {
                WriteException(failure);
            }
        }
Пример #2
0
 /// <summary>
 /// Forwards an abandoned-URI notification to the <c>onReportAbandonURI</c> callback.
 /// Does nothing when no callback is attached.
 /// </summary>
 /// <param name="u">The resource that was abandoned.</param>
 /// <param name="reason">Why the resource was abandoned.</param>
 public void ReportAbandonURI(UriResource u, string reason)
 {
     if (onReportAbandonURI == null)
     {
         return;
     }

     onReportAbandonURI(u, reason);
 }
Пример #3
0
        /// <summary>
        /// Downloads the file that <paramref name="uriRes"/> points to, unless its URL is already
        /// known, and then decides by MD5 whether the downloaded file is a duplicate. In network
        /// mode both checks also consult the remote database, and a new file is moved to the
        /// shared cache folder.
        /// </summary>
        /// <param name="uriRes">Resource to download.</param>
        /// <param name="h">Hunter supplying the project settings and the cancellation flag.</param>
        /// <param name="thisThread">Calling download thread. Not used by this method.</param>
        /// <returns>
        /// <c>null</c> when the URL is already known or an unexpected exception was logged;
        /// a <see cref="DownloadInfo"/> with status "已下载" for a newly stored file;
        /// a <see cref="DownloadInfo"/> with status "MD5重复" when the content is a duplicate;
        /// a <see cref="DownloadInfo"/> with status "重复" when the download timed out or was cancelled.
        /// </returns>
        public DownloadInfo DownloadFile(UriResource uriRes, Hunter h, HunterDownloadThread thisThread)
        {
            Database db = null;

            if (h.projectInfo.DatabaseHelper != null)
            {
                db = h.projectInfo.DatabaseHelper.GetDatabaseInstance();
            }

            flowCalculator.Interval = 1000;
            // Detach before attaching so that repeated calls do not pile up one more
            // Elapsed handler each time (removing a handler that is not attached is a no-op).
            flowCalculator.Elapsed -= new ElapsedEventHandler(flowCalculator_Elapsed);
            flowCalculator.Elapsed += new ElapsedEventHandler(flowCalculator_Elapsed);
            try
            {
                HunterWebClient wc = new HunterWebClient();

                #region Is the link already known?
                // Local check first.
                bool isExist = database.LinkExists(uriRes.Url);

                // Network mode: a link unknown locally must also be checked against the remote database.
                if (!isExist && (h.projectInfo.CurrentMode == ProjectInfo.HunterMode.network))
                {
                    if (db == null)
                    {
                        hunterConsole.WriteException(new Exception("数据库连接失败,使用本地模式判重:" + uriRes.Url));
                    }
                    else
                    {
                        bool openFailed = false;
                        try
                        {
                            db.DbOpen();
                        }
                        catch (Exception ex)
                        {
                            openFailed = true;
                            hunterConsole.WriteException(ex);
                        }

                        // Only query when the connection could be opened.
                        // NOTE(review): the URL is escaped by hand before being handed to FieldValue;
                        // if the Database layer supports parameterized values, prefer those.
                        bool queryOk = !openFailed &&
                                       db.IsRecordExists("tb_file_infos", out isExist, new FieldValue("file_link", uriRes.Url.Replace("'", "\\'").Replace("\"", "\\\"")));
                        if (!queryOk)
                        {
                            // Remote check unavailable: fall back to the local result ("not known").
                            isExist = false;
                        }

                        try
                        {
                            db.DbClose();
                        }
                        catch (Exception ex)
                        {
                            hunterConsole.WriteException(ex);
                        }
                    }
                }
                #endregion

                if (isExist)
                {
                    // Known link: report it and skip the download.
                    hunterConsole.ReportAbandonURI(uriRes, "链接重复");
                    return(null);
                }

                #region Download
                try
                {
                    hunterConsole.WriteDownload("正在下载文件:" + uriRes.Url);
                    hunterConsole.WriteDownload(
                        "线程ID:" + Thread.CurrentThread.ManagedThreadId + Environment.NewLine +
                        "正在下载的文件:" + Environment.NewLine +
                        "下载地址:" + uriRes.Url + Environment.NewLine +
                        "下载的关键字:" + strategy.GetKeyword(uriRes.Keyword) + Environment.NewLine +
                        "下载的页面页码:" + uriRes.index);
                }
                catch (Exception ex)
                {
                    // Logging problems must not prevent the download.
                    hunterConsole.WriteException(ex);
                }

                wc.DownloadFileCompleted   += new System.ComponentModel.AsyncCompletedEventHandler(wc_DownloadFileCompleted);
                wc.DownloadProgressChanged += new DownloadProgressChangedEventHandler(wc_DownloadProgressChanged);

                // Pick a target file name that does not collide with an existing file.
                string filepath = HunterUtilities.GetFilenameFromUrl(pInfo, strategy, uriRes);

                int timeout = 0;

                flowCalculator.Start(); // start measuring throughput
                receive1 = 0;           // nothing received yet

                wc.DownloadKeyword     = strategy.GetKeyword(uriRes.Keyword);
                wc.DownloadSource      = uriRes.Url;
                wc.DownloadDestination = filepath;
                if (!Directory.Exists(Path.GetDirectoryName(filepath)))
                {
                    Directory.CreateDirectory(Path.GetDirectoryName(filepath));
                }

                wc.DownloadFileAsync(new Uri(uriRes.Url), filepath);

                // Poll once per second until the client is idle; "timeout" counts the polls.
                bool downloadProblem = false;
                while (wc.IsBusy)
                {
                    Thread.Sleep(1000);
                    timeout++;
                    if (timeout >= pInfo.timeout || h.DownloadCancelled)
                    {
                        wc.CancelAsync();
                        wc.Dispose();

                        // Report the abandoned download only once, on the first pass through here.
                        if (!downloadProblem)
                        {
                            hunterConsole.WriteDownload(
                                "线程ID:" + Thread.CurrentThread.ManagedThreadId + Environment.NewLine +
                                "下载超时,取消下载。");
                            try
                            {
                                hunterConsole.ReportAbandonDownloadInfo(new DownloadInfo("", uriRes.Keyword, uriRes.index, uriRes.Url, "", true, "超时", hunterProject.projectInfo.strategy.GetKeyword(uriRes.Keyword))
                                                                        , "超时");
                            }
                            catch (Exception ex)
                            {
                                // Reporting is best effort, but the failure is logged rather than swallowed.
                                hunterConsole.WriteException(ex);
                            }
                        }
                        downloadProblem = true;
                    }
                }
                flowCalculator.Stop();
                receive1 = 0;
                receive2 = 0;                                   // reset the throughput counters
                hunterConsole.outputSpeedInfo(DateTime.Now, 0); // show a speed of zero

                wc.Dispose();
                #endregion

                if (downloadProblem)
                {
                    // Timed out or cancelled. The original note on this return reads "delete the file";
                    // this method itself does not delete anything.
                    return(new DownloadInfo(filepath, uriRes.Keyword, uriRes.index,
                                            uriRes.Url, null, true, "重复", hunterProject.projectInfo.strategy.GetKeyword(uriRes.Keyword)));
                }

                string MD5 = string.Empty;
                try
                {
                    MD5 = HunterUtilities.GetMD5Hash(filepath);
                }
                catch (Exception e)
                {
                    hunterConsole.WriteException(new Exception("无法获取MD5。"));
                    hunterConsole.WriteException(e);
                }

                #region Is the file content (MD5) already known?
                // Local check first.
                bool isDuplicate = database.isDuplicate(uriRes.Url, MD5);

                // Network mode: also compare against the remote database.
                if (!isDuplicate && (h.projectInfo.CurrentMode == ProjectInfo.HunterMode.network))
                {
                    if (db == null)
                    {
                        hunterConsole.WriteException(new Exception("数据库连接失败,使用本地模式判重:" + MD5));
                    }
                    else
                    {
                        bool openFailed = false;
                        try
                        {
                            db.DbOpen();
                        }
                        catch (Exception ex)
                        {
                            openFailed = true;
                            hunterConsole.WriteException(ex);
                        }

                        // Only query when the connection could be opened.
                        bool queryOk = !openFailed && db.IsFileExists("tb_file_infos", out isDuplicate, MD5);
                        if (!queryOk)
                        {
                            hunterConsole.WriteException(new Exception("数据库连接失败,使用本地模式判重:" + MD5));
                            isDuplicate = false;
                        }

                        try
                        {
                            db.DbClose();
                        }
                        catch (Exception ex)
                        {
                            hunterConsole.WriteException(ex);
                        }
                    }
                }
                #endregion

                #region Act on the duplicate check
                if (!isDuplicate)
                {
                    // New file: write its XML descriptor and register it in the local database.
                    wc.XMLFile = Path.Combine(hunterProject.projectInfo.filefolder, "$__" + Path.GetFileName(wc.DownloadDestination)) + ".xml";
                    try
                    {
                        HunterUtilities.WriteDownloadFileXML(wc.DownloadSource, wc.DownloadKeyword, Path.GetFileName(wc.DownloadDestination), (hunterProject.projectInfo.search_language == ProjectInfo.Language.none ? null : hunterProject.projectInfo.search_language.ToString()),
                                                             wc.XMLFile);
                    }
                    catch (Exception ex)
                    {
                        hunterConsole.WriteException(ex);
                    }
                    database.addNewRecord(uriRes.Url, wc.DownloadKeyword, filepath, MD5);
                    DownloadInfo d = new DownloadInfo(filepath, uriRes.Keyword, uriRes.index, uriRes.Url, MD5, false, "已下载", hunterProject.projectInfo.strategy.GetKeyword(uriRes.Keyword));

                    if (h.projectInfo.CurrentMode == ProjectInfo.HunterMode.network)
                    {
                        #region Network mode: move the file and its XML to the shared cache folder
                        String t_md5            = HunterUtilities.GetMD5Hash(filepath);
                        String filename         = t_md5 + "_" + Path.GetFileName(filepath);
                        String combinedPath     = Path.Combine(pInfo.share_remote_path, "cache", ProjectInfo.IP_ADDRESS + " (" + pInfo.name + ")");
                        String combinedFullPath = Path.Combine(combinedPath, filename);
                        if (!Directory.Exists(combinedPath))
                        {
                            Directory.CreateDirectory(combinedPath);
                        }

                        try
                        {
                            if (File.Exists(wc.XMLFile))
                            {
                                // Replace any descriptor of the same name already in the cache folder.
                                File.Delete(Path.Combine(combinedPath, Path.GetFileName(wc.XMLFile)));
                                File.Move(wc.XMLFile, Path.Combine(combinedPath, Path.GetFileName(wc.XMLFile)));
                            }
                        }
                        catch (Exception e)
                        {
                            hunterConsole.WriteException(e);
                        }

                        // Retry the move until it succeeds or more than maxMoveCount attempts have failed.
                        bool      fileMoveSuccess = false;
                        const int maxMoveCount    = 5;
                        int       moveCount       = 0;
                        while (!fileMoveSuccess && moveCount <= maxMoveCount)
                        {
                            try
                            {
                                File.Move(filepath, combinedFullPath);
                                fileMoveSuccess = true;
                            }
                            catch (Exception e)
                            {
                                moveCount++;
                                hunterConsole.WriteException(e);
                            }
                        }
                        #endregion
                    }

                    count++;
                    hunterConsole.outputDownloadedFileNum(DateTime.Now, count);
                    hunterConsole.ReportDownloadInfo(d);
                    return(d);
                }
                else
                {
                    // Duplicate content. The original note on this return reads "delete the file";
                    // this method itself does not delete anything.
                    DownloadInfo d = new DownloadInfo(filepath, uriRes.Keyword,
                                                      uriRes.index, uriRes.Url, MD5, true, "MD5重复", hunterProject.projectInfo.strategy.GetKeyword(uriRes.Keyword));
                    hunterConsole.ReportAbandonDownloadInfo(d, "MD5重复");
                    return(d);
                }
                #endregion
            }
            catch (Exception e)
            {
                // Placeholder for dedicated error handling: log and report "nothing downloaded".
                hunterConsole.WriteException(e);
                return(null);
            }
        }
Пример #4
0
        /// <summary>
        /// Fetches the page at <c>urlAddress</c>, scans it with the strategy's link regex and
        /// enqueues every link whose URL (after optional redirect resolution) ends with
        /// "." + <c>strategy.Filetype</c>.
        /// </summary>
        /// <param name="proxy">Proxy to fetch the page through; <c>null</c> means no proxy.</param>
        /// <param name="main">Main form, passed on to <c>GetPageHtml</c>.</param>
        /// <returns>
        /// The link texts of all matches found on this page, including matches that were not
        /// enqueued. When the page request fails with a <see cref="WebException"/>, the list holds a
        /// single marker entry ("{/WebException/}" followed by a random number) so that its
        /// Count is greater than zero. When the page could not be obtained for another reason,
        /// the list is empty.
        /// </returns>
        public List <string> HuntUris(HunterProxy proxy, HunterForm main)
        {
            Regex         linkReg  = null;
            String        htmlCode = null;
            List <string> thisURL  = new List <string>(); // link texts matched on this page

            try
            {
                linkReg = new Regex(strategy.StrategyData.configuration.Regex);    // captures ${url} and ${text}

                WebProxy webproxy = null;
                if (proxy != null)
                {
                    webproxy = new WebProxy(proxy.IPAndPort);
                    mHunterConsole.WriteDetails("正在使用代理:" + proxy.IPAndPort + "(" + proxy.Description + ")");
                }

                mHunterConsole.WriteDetails("准备分析页面:" + urlAddress);
                htmlCode = GetPageHtml(webproxy, main);
                mHunterConsole.WriteHTML(htmlCode);
            }
            catch (WebException ex)
            {
                // Request failed (e.g. timed out): return a non-empty marker result.
                thisURL.Add("{/WebException/}" + new Random().Next().ToString());

                mHunterConsole.WriteDetails("页面" + urlAddress + "请求失败。原因:" + ex.Message);
                mHunterConsole.ReportAbandonURI(new UriResource(urlAddress, strategy.CurrentKeywordProgress,
                                                                strategy.CurrentSearchProgress, null), ex.Message);
                return(thisURL);
            }
            catch (Exception ex)
            {
                mHunterConsole.WriteException(ex);
            }

            if (linkReg == null || htmlCode == null)
            {
                // Nothing to parse. Any failure was already logged above, so do not provoke
                // (and log) a second exception by matching against null.
                return(thisURL);
            }

            try
            {
                // Loop-invariant helpers.
                Regex tagReg         = new Regex("<(.*?)>");
                bool  followRedirect = string.Equals(strategy.StrategyData.configuration.Redirect, "true",
                                                     StringComparison.OrdinalIgnoreCase);

                for (Match m = linkReg.Match(htmlCode); m.Success; m = m.NextMatch())
                {
                    allCount++;
                    mHunterConsole.outputAnalysedUris(DateTime.Now, allCount);

                    // Record the raw link text of every match, even if it is rejected below.
                    string linkText = m.Result("${text}");
                    thisURL.Add(linkText);

                    if (strategy.HasForbiddenWord(linkText))
                    {
                        continue;    // contains a forbidden word: do not download
                    }

                    // Strip anything in angle brackets from the link text.
                    linkText = tagReg.Replace(linkText, "");

                    string rawUrl = m.Result("${url}");
                    string uri;
                    try
                    {
                        uri = followRedirect ? GetTheRedirectUrl(rawUrl) : rawUrl;
                    }
                    catch (WebException)
                    {
                        mHunterConsole.WriteDetails("链接" + rawUrl + "重定向超时。");
                        mHunterConsole.ReportAbandonURI(new UriResource(rawUrl, strategy.CurrentKeywordProgress,
                                                                        strategy.CurrentSearchProgress, null), "重定向超时");
                        continue;
                    }

                    // A null URL or one with the wrong extension is skipped without aborting the page.
                    if (uri == null || !uri.EndsWith("." + strategy.Filetype, StringComparison.Ordinal))
                    {
                        continue;
                    }

                    availableCount++;
                    mHunterConsole.outputAvailableUris(DateTime.Now, availableCount);

                    UriResource u = new UriResource(uri, strategy.CurrentKeywordProgress
                                                    , strategy.CurrentSearchProgress, linkText);

                    if (!uriQueue.Contains(u))   // other threads may already have queued the same resource
                    {
                        uriQueue.Enqueue(u);
                    }
                    mHunterConsole.outputDownloadingUriInfo(DateTime.Now,
                                                            "找到的资源的URL:" + u.Url + Environment.NewLine +
                                                            "标题:" + u.Text + Environment.NewLine +
                                                            "关键字:" + strategy.GetKeyword(u.Keyword) + Environment.NewLine +
                                                            "搜索页码:" + u.index + Environment.NewLine +
                                                            "已列入下载队列。");

                    mHunterConsole.WriteDetails("正在获得有效URI:" + uri);
                }
            }
            catch (Exception ex)
            {
                mHunterConsole.WriteException(ex);
            }

            return(thisURL);
        }
Пример #5
0
        /// <summary>
        /// Builds a not-yet-existing file path inside <c>pInfo.filefolder</c> for the resource.
        /// </summary>
        /// <remarks>
        /// The base name is chosen by three fallbacks:
        /// 1. the HTML-decoded, legalized link text;
        /// 2. if <c>strategy.HasConfusionString</c> rejects that text (e.g. garbled characters),
        ///    the file name taken from the URL;
        /// 3. if the URL does not match the expected shape, the current time as a file-time number.
        /// If a file with the resulting name already exists, "_1", "_2", ... is appended until
        /// the name is free.
        /// </remarks>
        /// <param name="pInfo">Project settings; supplies the target folder.</param>
        /// <param name="strategy">Strategy; supplies the file type (extension) and the confusion check.</param>
        /// <param name="uriRes">Resource whose link text and URL are used to derive the name.</param>
        /// <returns>Full path ending in "." + <c>strategy.Filetype</c> for which no file exists yet.</returns>
        public static string GetFilenameFromUrl(ProjectInfo pInfo, Strategy strategy, UriResource uriRes)
        {
            string extension = "." + strategy.Filetype;

            // Fallback 1: link text, HTML-decoded and made legal as a file name.
            string basicName = HunterUtilities.LegalizeFile(
                WebUtility.HtmlDecode(uriRes.Text)
                );

            if (strategy.HasConfusionString(basicName))
            {
                // Fallback 2: file name from the URL. The extension is escaped so that regex
                // metacharacters in it are matched literally, and https URLs are accepted too.
                Regex regFilename = new Regex("^https?://(.*)/(?<filename>.*)" + Regex.Escape(extension));
                Match mFilename   = regFilename.Match(uriRes.Url);

                if (mFilename.Success)
                {
                    // Run the URL-derived name through the same legalization as the link text.
                    basicName = HunterUtilities.LegalizeFile(mFilename.Result("${filename}"));
                }
                else
                {
                    // Fallback 3: name the file after the current time.
                    basicName = DateTime.Now.ToFileTime().ToString();
                }
            }

            // Append a running number while the candidate path is already taken.
            string filepath = Path.Combine(pInfo.filefolder, basicName + extension);
            for (int i = 1; File.Exists(filepath); i++)
            {
                filepath = Path.Combine(pInfo.filefolder, basicName + "_" + i.ToString() + extension);
            }

            return(filepath);
        }