Ejemplo n.º 1
0
        /// <summary>
        /// Link-discovery worker loop. Until <c>IS_STOP</c> is set it takes one entry from
        /// <c>FrontierURL</c>, downloads the page and applies every configured link pattern to the HTML:
        /// new <c>SUB_LINK</c> matches are queued back onto <c>FrontierURL</c>, new <c>DETAIL_LINK</c>
        /// matches are queued onto <c>QueueDetailURL</c>. In <c>UPDATE</c> mode the loop additionally
        /// sleeps for <c>UpdateSleep</c> and resets its counters once
        /// <c>CounterManager.CurrentTryingCount</c> reaches <c>MaxTryingCount</c>.
        /// </summary>
        /// <remarks>
        /// No exception escapes this method. A failure that ends the loop is reported on the
        /// screen-console event instead of being discarded.
        /// </remarks>
        protected override void ProcessFindLink()
        {
            try
            {
                while (!IS_STOP)
                {
                    _removelinks = this.RemoveLinks.ToString();

                    // Test and dequeue under a single lock: checking Count outside the lock lets
                    // another consumer empty the queue before Dequeue runs.
                    bool         hasUrl = false;
                    ConcreteLink objUrl = default(ConcreteLink);
                    lock (FrontierURL)
                    {
                        if (FrontierURL.Count > 0)
                        {
                            try
                            {
                                objUrl = FrontierURL.Dequeue();
                                hasUrl = true;
                            }
                            catch (Exception)
                            {
                                // Best effort: a failed dequeue is treated like an empty queue, so
                                // the iteration falls through to the normal sleep instead of spinning.
                            }
                        }
                    }

                    if (hasUrl)
                    {
                        if (string.IsNullOrEmpty(objUrl.href))
                        {
                            continue;
                        }

                        // An href without an http(s) scheme is taken as relative to the configured BaseURL.
                        string url = objUrl.href;
                        if (!url.Contains("https://") && !url.Contains("http://"))
                        {
                            url = this.BaseURL + objUrl.href;
                        }

                        string html    = string.Empty;
                        string baseURL = string.Empty;
                        try
                        {
                            html = DownloadExpress.getResponseString(url, "", out baseURL);
                        }
                        catch (Exception _error)
                        {
                            ReportToConsole(string.Format("{0}-ERROR:{1}-URLDownload:{2}", this.ThreadName, _error.ToString(), url));

                            // One retry before giving up on this URL.
                            // NOTE(review): the retry repeats the same getResponseString call although the
                            // message label says "URLDownloadBySocket" - confirm whether a socket-based
                            // download was intended here.
                            try
                            {
                                html = DownloadExpress.getResponseString(url, "", out baseURL);
                            }
                            catch (Exception ex)
                            {
                                ReportToConsole(string.Format("{0}-ERROR:{1}-URLDownloadBySocket:{2}", this.ThreadName, ex.ToString(), url));
                                continue;
                            }
                        }

                        if (string.IsNullOrEmpty(html))
                        {
                            continue;
                        }

                        foreach (var item in base.ConfigLinks.configlinks)
                        {
                            // Pick the finder for the pattern type; any other pattern type is skipped.
                            FindLinkBase findLink = null;
                            if (item.pattern_type == "REGEX")
                            {
                                findLink = new FindLinkByRegex(this.BaseURL, item.url_pattern, _removelinks);
                            }
                            else if (item.pattern_type == "XPATH")
                            {
                                findLink = new FindLinkByXPath(this.BaseURL, item.url_pattern, _removelinks);
                            }

                            if (findLink == null)
                            {
                                continue;
                            }

                            if (item.link_type == "SUB_LINK")
                            {
                                List<string> listLinkSub = findLink.FindLink(html);
                                if (listLinkSub == null || listLinkSub.Count == 0)
                                {
                                    continue;
                                }

                                foreach (string linkSub in listLinkSub)
                                {
                                    if (linkSub == null)
                                    {
                                        // A null entry would otherwise throw and end the whole worker.
                                        continue;
                                    }

                                    // Discovered links are resolved against the base URL reported by the download.
                                    string _url = linkSub;
                                    if (!_url.Contains("http://") && !_url.Contains("https://"))
                                    {
                                        _url = baseURL + linkSub;
                                    }

                                    if (IsNewUrl(ref _url))
                                    {
                                        ConcreteLink _concreteLink = new ConcreteLink();
                                        _concreteLink.link_type = "SUB_LINK";
                                        _concreteLink.href      = _url;

                                        // The dequeue side is guarded by this lock, so the enqueue has to
                                        // take it as well for the lock to protect the queue.
                                        lock (FrontierURL)
                                        {
                                            FrontierURL.Enqueue(_concreteLink);
                                        }

                                        SpiderArgs args = new SpiderArgs()
                                        {
                                            Message = string.Format("{0}-SUB-{1}", _url, this.ThreadName)
                                        };
                                        SpiderSingletonEvent.Instance.OnSpiderInformation(args);
                                    }
                                }
                            }
                            else if (item.link_type == "DETAIL_LINK")
                            {
                                List<string> linkDetail = findLink.FindLink(html);
                                if (linkDetail == null || linkDetail.Count == 0)
                                {
                                    continue;
                                }

                                foreach (string href in linkDetail)
                                {
                                    if (href == null)
                                    {
                                        // A null entry would otherwise throw and end the whole worker.
                                        continue;
                                    }

                                    string _url = href;
                                    if (!_url.Contains("http://") && !_url.Contains("https://"))
                                    {
                                        _url = baseURL + href;
                                    }

                                    if (IsNewUrl(ref _url))
                                    {
                                        ConcreteLink _concreteLink = new ConcreteLink();
                                        _concreteLink.link_type = "DETAIL_LINK";
                                        _concreteLink.href      = _url;
                                        QueueDetailURL.Enqueue(_concreteLink);
                                    }
                                }
                            }
                        }
                    }

                    if (this.Mode == "UPDATE" && CounterManager.CurrentTryingCount >= this.MaxTryingCount)
                    {
                        string message = string.Format("[{0}] SLEEP FOR UPDATE AFTER {1} seconds, {2}", ThreadName, (this.UpdateSleep / 1000), DateTime.Now);
                        ReportToConsole(message);
                        SpiderSingletonEvent.Instance.OnSpiderReloadForUpdate(new BlankSpider.Spider.Events.SpiderArgs()
                        {
                            Message = string.Format("{0} RELOAD FOR UPDATE \r\n", ThreadName)
                        });

                        Thread.Sleep(this.UpdateSleep);

                        try
                        {
                            ResetUpdateState();
                        }
                        catch (Exception)
                        {
                            // The reset is attempted a second time if the first attempt throws; a second
                            // failure propagates to the outer handler and ends the loop.
                            ResetUpdateState();
                        }
                    }

                    Thread.Sleep(this.THREAD_SLEEP);
                }
            }
            catch (Exception ex)
            {
                // Report why the worker stopped instead of discarding the exception.
                try
                {
                    ReportToConsole(string.Format("{0}-ERROR:{1}-ProcessFindLink stopped", this.ThreadName, ex.ToString()));
                }
                catch (Exception reportError)
                {
                    // Reporting is best effort: nothing may escape the worker method.
                    System.Diagnostics.Trace.TraceError("{0}", reportError);
                }
            }
        }

        /// <summary>Raises the spider's screen-console event carrying <paramref name="message"/>.</summary>
        /// <param name="message">Text placed in the event argument's <c>Message</c>.</param>
        private void ReportToConsole(string message)
        {
            SpiderSingletonEvent.Instance.OnSpiderScreenConsole(new SpiderArgs()
            {
                Message = message
            });
        }

        /// <summary>
        /// Zeroes the update/insert link totals, clears the URL storage and resets the counter manager
        /// before the next update pass.
        /// </summary>
        private void ResetUpdateState()
        {
            this._BaseManagement.TotalUpdateLink = 0;
            this._BaseManagement.TotalInsertLink = 0;
            UrlStoragClear();
            CounterManager.Reset(0, 100);
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Link-discovery worker loop. Until <c>IS_STOP</c> is set it takes one entry from
        /// <c>FrontierURL</c>, downloads the page as UTF-8 (retrying once through
        /// <c>DownloadBySocket</c>) and applies every configured <c>REGEX</c> pattern to the HTML:
        /// new <c>SUB_LINK</c> matches are queued back onto <c>FrontierURL</c>, new <c>DETAIL_LINK</c>
        /// matches are queued onto <c>QueueDetailURL</c>. Links containing <c>.html#</c> or
        /// <c>.htm#</c> are not queued.
        /// </summary>
        /// <remarks>
        /// No exception escapes this method. A failure that ends the loop is written to
        /// <see cref="System.Diagnostics.Trace"/> instead of being discarded.
        /// </remarks>
        protected override void ProcessFindLink()
        {
            try
            {
                while (!IS_STOP)
                {
                    // The returned number is not used here; the call is kept because it may
                    // advance shared counter state - TODO confirm.
                    CounterManager.GetNumber();

                    if (FrontierURL.Count > 0)
                    {
                        ConcreteLink objUrl = FrontierURL.Dequeue();

                        if (string.IsNullOrEmpty(objUrl.href))
                        {
                            continue;
                        }

                        // An href without an http(s) scheme is taken as relative to the configured BaseURL.
                        string url = objUrl.href;
                        if (!url.Contains("https://") && !url.Contains("http://"))
                        {
                            url = this.BaseURL + objUrl.href;
                        }

                        string html = string.Empty;
                        try
                        {
                            html = DownloadExpress.Download(url, Utility.GetEncoding(EncodingType.UTF8));
                        }
                        catch
                        {
                            // Second attempt through the socket-based downloader; if that fails too
                            // the URL is dropped and the loop moves on.
                            try
                            {
                                html = DownloadExpress.DownloadBySocket(url, Utility.GetEncoding(EncodingType.UTF8));
                            }
                            catch (Exception)
                            {
                                continue;
                            }
                        }

                        if (string.IsNullOrEmpty(html))
                        {
                            continue;
                        }

                        foreach (var item in base.ConfigLinks.configlinks)
                        {
                            // Only REGEX patterns are handled here; any other pattern type is skipped.
                            if (item.pattern_type != "REGEX")
                            {
                                continue;
                            }

                            FindLinkBase findLink = new FindLinkByRegex(this.BaseURL, item.url_pattern, _removelinks);

                            if (item.link_type == "SUB_LINK")
                            {
                                List<string> listLinkSub = findLink.FindLink(html);
                                if (listLinkSub == null || listLinkSub.Count == 0)
                                {
                                    continue;
                                }

                                foreach (string linkSub in listLinkSub)
                                {
                                    if (linkSub == null)
                                    {
                                        // A null entry would otherwise throw and end the whole worker.
                                        continue;
                                    }

                                    // IsNewUrl runs before the fragment filter, so the call order is kept as is.
                                    string _url = linkSub;
                                    if (IsNewUrl(ref _url) && !_url.Contains(".html#") && !_url.Contains(".htm#"))
                                    {
                                        // NOTE(review): the queued href is the raw match, not the value
                                        // IsNewUrl may have written back through the ref - confirm intent.
                                        ConcreteLink _concreteLink = new ConcreteLink();
                                        _concreteLink.link_type = "SUB_LINK";
                                        _concreteLink.href      = linkSub;
                                        FrontierURL.Enqueue(_concreteLink);

                                        SpiderArgs args = new SpiderArgs()
                                        {
                                            Message = linkSub + "\r\n"
                                        };
                                        SpiderSingletonEvent.Instance.OnSpiderInformation(args);
                                    }
                                }
                            }
                            else if (item.link_type == "DETAIL_LINK")
                            {
                                List<string> linkDetail = findLink.FindLink(html);
                                if (linkDetail == null || linkDetail.Count == 0)
                                {
                                    continue;
                                }

                                foreach (string href in linkDetail)
                                {
                                    if (href == null)
                                    {
                                        // A null entry would otherwise throw and end the whole worker.
                                        continue;
                                    }

                                    string _url = href;
                                    if (IsNewUrl(ref _url) && !_url.Contains(".html#") && !_url.Contains(".htm#"))
                                    {
                                        // NOTE(review): as with sub links, the raw match is queued while the
                                        // message below shows the ref-updated value - confirm intent.
                                        ConcreteLink _concreteLink = new ConcreteLink();
                                        _concreteLink.link_type = "DETAIL_LINK";
                                        _concreteLink.href      = href;
                                        QueueDetailURL.Enqueue(_concreteLink);

                                        SpiderArgs args = new SpiderArgs()
                                        {
                                            Message = _url + " - DETAIL" + "\r\n"
                                        };
                                        SpiderSingletonEvent.Instance.OnSpiderInformation(args);
                                    }
                                }
                            }
                        }
                    }

                    Thread.Sleep(this.THREAD_SLEEP);
                }
            }
            catch (Exception ex)
            {
                // Record why the worker stopped instead of discarding the exception.
                System.Diagnostics.Trace.TraceError("ProcessFindLink stopped: {0}", ex);
            }
        }