/// <summary>
/// Worker loop that runs until <c>IS_STOP</c> is set. Each pass takes one link
/// from <c>FrontierURL</c>, downloads the page, and applies every entry of
/// <c>ConfigLinks.configlinks</c> to the HTML: matches of a "SUB_LINK" entry are
/// queued back on <c>FrontierURL</c>, matches of a "DETAIL_LINK" entry are queued
/// on <c>QueueDetailURL</c>. In "UPDATE" mode, once
/// <c>CounterManager.CurrentTryingCount</c> reaches <c>MaxTryingCount</c>, the
/// worker sleeps for <c>UpdateSleep</c> and resets its counters.
/// </summary>
/// <remarks>
/// No exception leaves this method. An unexpected exception ends the loop and is
/// reported through <c>OnSpiderScreenConsole</c>.
/// </remarks>
protected override void ProcessFindLink()
{
    try
    {
        while (!IS_STOP)
        {
            _removelinks = this.RemoveLinks.ToString();

            if (FrontierURL.Count > 0)
            {
                // NOTE: every `continue` at this level also skips the UPDATE check
                // and the THREAD_SLEEP pause at the bottom of the loop.
                ConcreteLink objUrl;
                lock (FrontierURL)
                {
                    try
                    {
                        objUrl = FrontierURL.Dequeue();
                    }
                    catch
                    {
                        // Count was read outside the lock; presumably another worker
                        // drained the queue in between - TODO confirm.
                        continue;
                    }
                }

                if (string.IsNullOrEmpty(objUrl.href))
                {
                    continue;
                }

                // Links without a scheme are resolved against the configured base URL.
                string url = objUrl.href;
                if (!objUrl.href.Contains("https://") && !objUrl.href.Contains("http://"))
                {
                    url = this.BaseURL + objUrl.href;
                }

                string baseURL = string.Empty;
                string html = string.Empty;
                try
                {
                    html = DownloadExpress.getResponseString(url, "", out baseURL);
                }
                catch (Exception _error)
                {
                    string _errorString = string.Format("{0}-ERROR:{1}-URLDownload:{2}", this.ThreadName, _error.ToString(), url);
                    SpiderSingletonEvent.Instance.OnSpiderScreenConsole(new SpiderArgs() { Message = _errorString });

                    // One retry with the same downloader before giving up on this URL.
                    try
                    {
                        html = DownloadExpress.getResponseString(url, "", out baseURL);
                    }
                    catch (Exception ex)
                    {
                        _errorString = string.Format("{0}-ERROR:{1}-URLDownloadBySocket:{2}", this.ThreadName, ex.ToString(), url);
                        SpiderSingletonEvent.Instance.OnSpiderScreenConsole(new SpiderArgs() { Message = _errorString });
                        continue;
                    }
                }

                if (string.IsNullOrEmpty(html))
                {
                    continue;
                }

                foreach (var item in base.ConfigLinks.configlinks)
                {
                    // _removelinks is passed as-is: when it is empty this is the same
                    // value as the "" literal the previous code passed explicitly.
                    FindLinkBase findLink = null;
                    if (item.pattern_type == "REGEX")
                    {
                        findLink = new FindLinkByRegex(this.BaseURL, item.url_pattern, _removelinks);
                    }
                    else if (item.pattern_type == "XPATH")
                    {
                        findLink = new FindLinkByXPath(this.BaseURL, item.url_pattern, _removelinks);
                    }

                    if (findLink == null)
                    {
                        // Unknown pattern type: nothing to extract for this entry.
                        continue;
                    }

                    if (item.link_type == "SUB_LINK")
                    {
                        List<string> listLinkSub = findLink.FindLink(html);
                        if (listLinkSub == null || listLinkSub.Count == 0)
                        {
                            continue;
                        }

                        foreach (string linkSub in listLinkSub)
                        {
                            // Scheme-less matches are prefixed with the base URL that the
                            // download call returned through its out parameter.
                            string _url = linkSub;
                            if (!_url.Contains("http://") && !_url.Contains("https://"))
                            {
                                _url = baseURL + linkSub;
                            }

                            if (IsNewUrl(ref _url))
                            {
                                ConcreteLink _concreteLink = new ConcreteLink();
                                _concreteLink.link_type = "SUB_LINK";
                                _concreteLink.href = _url;

                                // Dequeue above runs under this same lock; enqueue under
                                // it as well so the two operations cannot interleave.
                                lock (FrontierURL)
                                {
                                    FrontierURL.Enqueue(_concreteLink);
                                }

                                SpiderArgs args = new SpiderArgs() { Message = string.Format("{0}-SUB-{1}", _url, this.ThreadName) };
                                SpiderSingletonEvent.Instance.OnSpiderInformation(args);
                            }
                        }
                    }
                    else if (item.link_type == "DETAIL_LINK")
                    {
                        List<string> linkDetail = findLink.FindLink(html);
                        if (linkDetail == null || linkDetail.Count == 0)
                        {
                            continue;
                        }

                        foreach (string href in linkDetail)
                        {
                            string _url = href;
                            if (!_url.Contains("http://") && !_url.Contains("https://"))
                            {
                                _url = baseURL + href;
                            }

                            if (IsNewUrl(ref _url))
                            {
                                ConcreteLink _concreteLink = new ConcreteLink();
                                _concreteLink.link_type = "DETAIL_LINK";
                                _concreteLink.href = _url;
                                QueueDetailURL.Enqueue(_concreteLink);
                            }
                        }
                    }
                }
            }

            if (this.Mode == "UPDATE" && CounterManager.CurrentTryingCount >= this.MaxTryingCount)
            {
                string message = string.Format("[{0}] SLEEP FOR UPDATE AFTER {1} seconds, {2}", ThreadName, (this.UpdateSleep / 1000), DateTime.Now);
                SpiderSingletonEvent.Instance.OnSpiderScreenConsole(new BlankSpider.Spider.Events.SpiderArgs() { Message = message });
                SpiderSingletonEvent.Instance.OnSpiderReloadForUpdate(new BlankSpider.Spider.Events.SpiderArgs() { Message = string.Format("{0} RELOAD FOR UPDATE \r\n", ThreadName) });
                Thread.Sleep(this.UpdateSleep);

                // The reset is attempted a second time if the first attempt throws;
                // a second failure propagates to the outer handler and ends the loop.
                try
                {
                    ResetStateForUpdate();
                }
                catch
                {
                    ResetStateForUpdate();
                }
            }

            Thread.Sleep(this.THREAD_SLEEP);
        }
    }
    catch (Exception ex)
    {
        // The loop is over at this point. Report why instead of dropping the
        // exception, so a stopped worker is visible on the console.
        try
        {
            string fatal = string.Format("{0}-ERROR:{1}-ProcessFindLink", this.ThreadName, ex.ToString());
            SpiderSingletonEvent.Instance.OnSpiderScreenConsole(new SpiderArgs() { Message = fatal });
        }
        catch
        {
            // Reporting is best effort: this method has never thrown to its caller
            // and must not start doing so because the report itself failed.
        }
    }
}

/// <summary>
/// Zeroes <c>TotalUpdateLink</c> and <c>TotalInsertLink</c> on
/// <c>_BaseManagement</c>, then calls <c>UrlStoragClear()</c> and
/// <c>CounterManager.Reset(0, 100)</c>.
/// </summary>
private void ResetStateForUpdate()
{
    this._BaseManagement.TotalUpdateLink = 0;
    this._BaseManagement.TotalInsertLink = 0;
    UrlStoragClear();
    CounterManager.Reset(0, 100);
}
/// <summary>
/// Worker loop that runs until <c>IS_STOP</c> is set. Each pass takes one link
/// from <c>FrontierURL</c>, downloads the page (falling back to
/// <c>DownloadExpress.DownloadBySocket</c> when <c>DownloadExpress.Download</c>
/// throws), and applies every "REGEX" entry of <c>ConfigLinks.configlinks</c> to
/// the HTML. New "SUB_LINK" matches are queued on <c>FrontierURL</c> and new
/// "DETAIL_LINK" matches on <c>QueueDetailURL</c>; matches whose URL contains
/// ".html#" or ".htm#" are skipped.
/// </summary>
/// <remarks>
/// No exception leaves this method. An unexpected exception ends the loop and is
/// reported through <c>OnSpiderInformation</c>.
/// </remarks>
protected override void ProcessFindLink()
{
    try
    {
        while (!IS_STOP)
        {
            // The returned number is not used here; the call is kept in case it
            // advances shared counter state - TODO confirm.
            CounterManager.GetNumber();

            if (FrontierURL.Count > 0)
            {
                // NOTE: every `continue` at this level also skips the THREAD_SLEEP
                // pause at the bottom of the loop.
                ConcreteLink objUrl = FrontierURL.Dequeue();
                if (string.IsNullOrEmpty(objUrl.href))
                {
                    continue;
                }

                // Links without a scheme are resolved against the configured base URL.
                string url = objUrl.href;
                if (!objUrl.href.Contains("https://") && !objUrl.href.Contains("http://"))
                {
                    url = this.BaseURL + objUrl.href;
                }

                string html = string.Empty;
                try
                {
                    html = DownloadExpress.Download(url, Utility.GetEncoding(EncodingType.UTF8));
                }
                catch
                {
                    // Second attempt through the socket-based downloader.
                    try
                    {
                        html = DownloadExpress.DownloadBySocket(url, Utility.GetEncoding(EncodingType.UTF8));
                    }
                    catch (Exception)
                    {
                        // Both downloaders failed: give up on this URL only.
                        continue;
                    }
                }

                if (string.IsNullOrEmpty(html))
                {
                    continue;
                }

                foreach (var item in base.ConfigLinks.configlinks)
                {
                    if (item.pattern_type != "REGEX")
                    {
                        // Only regex patterns are handled by this implementation.
                        continue;
                    }

                    // _removelinks is passed as-is: when it is empty this is the same
                    // value as the "" literal the previous code passed explicitly.
                    FindLinkBase findLink = new FindLinkByRegex(this.BaseURL, item.url_pattern, _removelinks);

                    if (item.link_type == "SUB_LINK")
                    {
                        List<string> listLinkSub = findLink.FindLink(html);
                        if (listLinkSub == null || listLinkSub.Count == 0)
                        {
                            continue;
                        }

                        foreach (string linkSub in listLinkSub)
                        {
                            string _url = linkSub;
                            if (IsNewUrl(ref _url) && !_url.Contains(".html#") && !_url.Contains(".htm#"))
                            {
                                // NOTE(review): the original match (linkSub) is queued,
                                // not the value IsNewUrl received by ref - confirm intended.
                                ConcreteLink _concreteLink = new ConcreteLink();
                                _concreteLink.link_type = "SUB_LINK";
                                _concreteLink.href = linkSub;
                                FrontierURL.Enqueue(_concreteLink);

                                SpiderArgs args = new SpiderArgs() { Message = linkSub + "\r\n" };
                                SpiderSingletonEvent.Instance.OnSpiderInformation(args);
                            }
                        }
                    }
                    else if (item.link_type == "DETAIL_LINK")
                    {
                        List<string> linkDetail = findLink.FindLink(html);
                        if (linkDetail == null || linkDetail.Count == 0)
                        {
                            continue;
                        }

                        foreach (string href in linkDetail)
                        {
                            string _url = href;
                            if (IsNewUrl(ref _url) && !_url.Contains(".html#") && !_url.Contains(".htm#"))
                            {
                                ConcreteLink _concreteLink = new ConcreteLink();
                                _concreteLink.link_type = "DETAIL_LINK";
                                _concreteLink.href = href;
                                QueueDetailURL.Enqueue(_concreteLink);

                                SpiderArgs args = new SpiderArgs() { Message = _url + " - DETAIL" + "\r\n" };
                                SpiderSingletonEvent.Instance.OnSpiderInformation(args);
                            }
                        }
                    }
                }
            }

            Thread.Sleep(this.THREAD_SLEEP);
        }
    }
    catch (Exception ex)
    {
        // The loop is over at this point. Report why instead of dropping the
        // exception, so a stopped worker is visible to whoever listens to the spider.
        try
        {
            SpiderArgs args = new SpiderArgs() { Message = "ERROR: ProcessFindLink stopped - " + ex.ToString() + "\r\n" };
            SpiderSingletonEvent.Instance.OnSpiderInformation(args);
        }
        catch
        {
            // Reporting is best effort: this method has never thrown to its caller
            // and must not start doing so because the report itself failed.
        }
    }
}