/// <summary>
/// Scans <paramref name="r"/> with the <c>StrRegex</c> pattern and builds a queue of the
/// proxies whose description passes the keyword filter.
/// </summary>
/// <param name="r">Text to search for proxy entries.</param>
/// <param name="pfilter">
/// Filter lines. Everything from the first apostrophe (') in a line onward is treated as a
/// comment and discarded; lines that are blank after that are skipped.
/// </param>
/// <returns>
/// Accepted proxies in match order. When no keyword survives comment removal every match is
/// accepted; otherwise a match is accepted only if its description contains at least one keyword.
/// </returns>
public static Queue<HunterProxy> GetProxy(String r, List<String> pfilter)
{
    Queue<HunterProxy> accepted = new Queue<HunterProxy>();

    // Run the first match before touching the filter list, as the original did.
    Match hit = new Regex(StrRegex).Match(r);

    // Strip the trailing '-comment from every filter line and keep the non-blank keywords.
    List<String> keywords = new List<String>();
    foreach (String line in pfilter)
    {
        int commentStart = line.IndexOf('\'');
        String keyword = (commentStart >= 0) ? line.Substring(0, commentStart) : line;
        keyword = keyword.Trim();
        if (keyword.Length != 0)
        {
            keywords.Add(keyword);
        }
    }

    for (; hit.Success; hit = hit.NextMatch())
    {
        HunterProxy candidate = new HunterProxy
        {
            IPAndPort = hit.Result("${ipandport}"),
            Type = hit.Result("${type}"),
            Speed = hit.Result("${speed}"),
            Description = hit.Result("${description}").Replace("\n", "").Replace("\r", "").Trim()
        };

        // With no keywords there is nothing to filter on; otherwise one hit is enough.
        bool keep = keywords.Count == 0;
        foreach (String keyword in keywords)
        {
            if (candidate.Description.Contains(keyword))
            {
                keep = true;
                break;
            }
        }

        if (keep)
        {
            accepted.Enqueue(candidate);
        }
    }

    return accepted;
}
/// <summary>
/// Downloads the page at <c>urlAddress</c>, extracts every link matched by the strategy's
/// configured regex and queues the links that end with the strategy's file type.
/// </summary>
/// <param name="proxy">Proxy to use for the request; null means no proxy is used.</param>
/// <param name="main">Form that is passed through to <c>GetPageHtml</c>.</param>
/// <returns>
/// The link texts captured on this page. If the page request fails with a
/// <see cref="WebException"/>, the list instead holds one "{/WebException/}"-prefixed random
/// marker so that its Count is still greater than zero.
/// </returns>
public List<string> HuntUris(HunterProxy proxy, HunterForm main)
{
    Regex linkReg = null;
    String htmlCode = null;
    bool setupFailed = false;
    List<string> thisURL = new List<string>(); // every link text matched on this page

    try
    {
        linkReg = new Regex(strategy.StrategyData.configuration.Regex); // hyperlink + hyperlink text

        WebProxy webproxy = null;
        if (proxy != null)
        {
            webproxy = new WebProxy(proxy.IPAndPort);
            mHunterConsole.WriteDetails("正在使用代理:" + proxy.IPAndPort + "(" + proxy.Description + ")");
        }

        mHunterConsole.WriteDetails("准备分析页面:" + urlAddress);
        htmlCode = GetPageHtml(webproxy, main);
        mHunterConsole.WriteHTML(htmlCode);
    }
    catch (WebException ex)
    {
        // Request failed or timed out: return a random marker so the result has Count > 0.
        thisURL.Add("{/WebException/}" + new Random().Next().ToString());
        mHunterConsole.WriteDetails("页面" + urlAddress + "请求失败。原因:" + ex.Message);
        mHunterConsole.ReportAbandonURI(new UriResource(urlAddress, strategy.CurrentKeywordProgress, strategy.CurrentSearchProgress, null), ex.Message);
        return thisURL;
    }
    catch (Exception ex)
    {
        setupFailed = true;
        mHunterConsole.WriteException(ex);
    }

    // The failure above has already been reported. Without a regex or a page there is nothing
    // to parse, and continuing would only log a second, misleading null-related exception.
    if (setupFailed && (linkReg == null || htmlCode == null))
    {
        return thisURL;
    }

    try
    {
        // Loop-invariant: removes anything in angle brackets from the link text.
        Regex tagPattern = new Regex("<(.*?)>");

        // A for-loop advances the match on every path, including each 'continue'.
        for (Match m = linkReg.Match(htmlCode); m.Success; m = m.NextMatch())
        {
            allCount++;
            mHunterConsole.outputAnalysedUris(DateTime.Now, allCount);

            // Record the raw link text before any filtering.
            string linkText = m.Result("${text}");
            thisURL.Add(linkText);

            if (strategy.HasForbiddenWord(linkText))
            {
                continue; // contains a forbidden word: skip this link
            }

            linkText = tagPattern.Replace(linkText, "");

            string rawUrl = m.Result("${url}");
            string uri;
            try
            {
                uri = (strategy.StrategyData.configuration.Redirect.ToLower() == "true")
                    ? GetTheRedirectUrl(rawUrl)
                    : rawUrl;
            }
            catch (WebException)
            {
                mHunterConsole.WriteDetails("链接" + rawUrl + "重定向超时。");
                mHunterConsole.ReportAbandonURI(new UriResource(rawUrl, strategy.CurrentKeywordProgress, strategy.CurrentSearchProgress, null), "重定向超时");
                continue;
            }

            if (uri.EndsWith("." + strategy.Filetype))
            {
                availableCount++;
                mHunterConsole.outputAvailableUris(DateTime.Now, availableCount);

                // Wrap the hit as a URI resource.
                UriResource u = new UriResource(uri, strategy.CurrentKeywordProgress, strategy.CurrentSearchProgress, linkText);

                // Duplicates may show up when several threads feed the queue.
                // NOTE(review): Contains followed by Enqueue is not atomic — confirm uriQueue is
                // synchronised elsewhere if this method runs on more than one thread.
                if (!uriQueue.Contains(u))
                {
                    uriQueue.Enqueue(u);
                }

                mHunterConsole.outputDownloadingUriInfo(DateTime.Now,
                    "找到的资源的URL:" + u.Url + Environment.NewLine
                    + "标题:" + u.Text + Environment.NewLine
                    + "关键字:" + strategy.GetKeyword(u.Keyword) + Environment.NewLine
                    + "搜索页码:" + u.index + Environment.NewLine
                    + "已列入下载队列。");
                mHunterConsole.WriteDetails("正在获得有效URI:" + uri);
            }
        }
    }
    catch (Exception ex)
    {
        mHunterConsole.WriteException(ex);
    }

    return thisURL;
}
/// <summary>
/// Retrieves the HTML of <c>urlAddress</c>, either through the host form's embedded
/// <c>IEBrowser</c> control or through a plain <see cref="HttpWebRequest"/>.
/// </summary>
/// <param name="encoding">Encoding used to decode the HTTP response body (HTTP path only).</param>
/// <param name="proxy">Proxy to use, or null for none.</param>
/// <param name="main">Host form; it is cast to <c>HunterMain</c>.</param>
/// <returns>
/// The page HTML. On the HTTP path, <see cref="string.Empty"/> is returned when the status is
/// not OK or the reported content length is 1 MiB or more.
/// </returns>
/// <exception cref="WebException">
/// Thrown on the browser path when no HTML has been captured after the 15 second wait; HTTP
/// failures from the request also propagate to the caller.
/// </exception>
public string GetPageHtml(Encoding encoding, WebProxy proxy, HunterForm main)
{
    bool strategyWantsIE = strategy.StrategyData.configuration.UseIE;
    HunterMain host = (HunterMain)main;

    // Browser path when the core is forced to IE, or when it is left on Default and the
    // strategy asks for IE.
    bool useBrowser = strategyWantsIE
        ? (host.mHunterConfig.HunterCore == HunterConfig.Core.Default || host.mHunterConfig.HunterCore == HunterConfig.Core.IE)
        : host.mHunterConfig.HunterCore == HunterConfig.Core.IE;

    if (!useBrowser)
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(urlAddress);
        if (proxy != null)
        {
            request.Proxy = proxy;
        }
        strategy.Disguise(request);

        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        {
            bool acceptable = response.StatusCode == HttpStatusCode.OK
                              && response.ContentLength < 1024 * 1024;
            if (!acceptable)
            {
                return string.Empty;
            }

            using (StreamReader reader = new StreamReader(response.GetResponseStream(), encoding))
            {
                return reader.ReadToEnd();
            }
        }
    }

    lock (this)
    {
        // Hand the proxy endpoint (scheme and slashes removed) to InternetSetOption,
        // or an empty string when no proxy is requested.
        string proxyEndpoint = (proxy == null)
            ? String.Empty
            : proxy.Address.ToString().Replace("http://", "").Replace("/", "");
        HunterProxy.InternetSetOption(proxyEndpoint);

        string html = string.Empty;
        AutoResetEvent are = new AutoResetEvent(false);

        // Cancel any pop-up window the page tries to open.
        host.IEBrowser.NewWindow += delegate(object sender, CancelEventArgs e)
        {
            e.Cancel = true;
        };

        // When the document has loaded: capture its text, replace the browser control with a
        // fresh instance and wake the waiting caller.
        host.IEBrowser.DocumentCompleted += delegate(object sender, WebBrowserDocumentCompletedEventArgs e)
        {
            host.IEBrowser.ScriptErrorsSuppressed = true;
            html = host.IEBrowser.DocumentText;
            host.IEBrowser.Dispose();
            host.IEBrowser = new WebBrowser();
            are.Set();
        };

        // Navigation is marshalled through the form's Invoke.
        String[] navigateArgs = { urlAddress };
        main.Invoke(new StringHandler(host.IEBrowser.Navigate), navigateArgs);

        are.WaitOne(15000);

        if (html == String.Empty)
        {
            throw new WebException();
        }
        return html;
    }
}
/// <summary>
/// Builds a hunter for one project: reloads the project, validates the configured download
/// thread count, optionally loads the proxy list from "proxy.hip", opens the XML database and
/// creates (but does not start) the download, proxy and URI-hunting threads.
/// </summary>
/// <param name="oh">Console that receives messages and exceptions.</param>
/// <param name="config">Hunter configuration (proxy usage, proxy filter keywords, ...).</param>
/// <param name="_pj">Project descriptor; its console, project path and strategy path are used to reload the project.</param>
/// <param name="main">Owning form, stored in <c>MainForm</c>.</param>
/// <remarks>
/// The constructor never throws for failures inside its body: they are written to the console.
/// <c>Error</c> is set to true whenever construction is abandoned (invalid thread count, no
/// keywords, or an exception), so callers can tell that the instance is not usable.
/// </remarks>
public Hunter(HunterConsole oh, HunterConfig config, ProjectInfo _pj, HunterForm main)
{
    try
    {
        MainForm = main;
        Error = false;
        mHunterConsole = oh;
        projectInfo = _pj; // kept as a fallback value in case LoadProject below throws
        mHunterConfig = config;
        ProxyFetcher = new HunterProxyFetcher(AvailableProxies);
        projectInfo = ProjectInfo.LoadProject(_pj.mHunterConsole, _pj.projectPath, _pj.strategyPath, true);

        // Validate before sizing any array: a negative count would otherwise make the array
        // allocation throw, and the configuration message would never be shown.
        downloadThreadNum = int.Parse(projectInfo.threadnum);
        if (downloadThreadNum <= 0)
        {
            mHunterConsole.WriteMessage("配置错误:下载线程数不能小于0。");
            Error = true;
            return;
        }

        hunterThreads = new HunterDownloadThread[downloadThreadNum];
        ProxyGetThreads = new Thread[downloadThreadNum]; // threads that fetch proxies

        if (mHunterConfig.UseProxy)
        {
            // 'using' releases the file handle even if reading fails.
            using (FileStream fs = new FileStream("proxy.hip", FileMode.Open, FileAccess.Read))
            using (StreamReader sr = new StreamReader(fs))
            {
                ProxyText = sr.ReadToEnd();
            }
            AllProxies = HunterProxy.GetProxy(ProxyText, mHunterConfig.ProxyFilterKeywords);
        }

        mHunterConsole.WriteMessage(projectInfo.ConfigInformation());
        mHunterConsole.WriteMessage("");
        mHunterConsole.WriteMessage(projectInfo.strategy.GetStrategyInformation());

        xmlDatabase = new XMLDatabase(projectInfo.database, mHunterConsole);
        xmlDatabase.openDatabase();

        for (int i = 0; i < hunterThreads.Length; i++)
        {
            hunterThreads[i] = new HunterDownloadThread();
            hunterThreads[i].downloadThread = new Thread(threadDownloadUris);
        }

        if (mHunterConfig.UseProxy)
        {
            for (int i = 0; i < ProxyGetThreads.Length; i++)
            {
                ProxyGetThreads[i] = new Thread(GetAvaliableProxies);
            }
        }

        if (projectInfo.strategy.Keywords.Count <= 0)
        {
            projectInfo.mHunterConsole.WriteMessage("没有找到关键字,任务取消。");
            Error = true;
            return;
        }

        thHuntUris = new Thread(threadHuntUris);
        thHuntUris.SetApartmentState(ApartmentState.STA);
        hUri = new HunterUri(this);
        projectInfo.strategy.RecordFirstWord();

        mHunterConsole.WriteMessage("下载线程总数:" + hunterThreads.Length);
        mHunterConsole.WriteMessage("读取配置完毕。");
        mHunterConsole.WriteMessage("正在运行任务...");
    }
    catch (Exception e)
    {
        // Initialisation did not finish; flag the instance like the other abort paths do.
        Error = true;
        mHunterConsole.WriteException(e);
    }
}