Exemple #1
0
        // Drains Program.dbList and executes the database command carried by each queued entity.
        //
        // Entities are handed to Parallel.ForEach, so several commands may run concurrently.
        // A command that throws is recorded through UrlContorl.SaveUrl and the log instead of
        // aborting the batch. Program.clsDBSignal is decremented exactly once per entity,
        // whether the command succeeded or not.
        public void ExecPageDBData()
        {
            DbHelp.Utilities  util       = new DbHelp.Utilities();
            Log.ClsLog        clsLog     = new Log.ClsLog();
            Common.UrlContorl urlContorl = new Common.UrlContorl();

            // Assumes Program.dbList is a BlockingCollection<DBEntity> (TODO confirm against its
            // declaration). The consuming enumerable only ends once the collection is completed
            // and empty; looping unconditionally after that point would spin a CPU core forever,
            // so stop as soon as the collection reports completion.
            while (!Program.dbList.IsCompleted)
            {
                Parallel.ForEach<DBEntity>(Program.dbList.GetConsumingEnumerable(), dbEntity =>
                {
                    try
                    {
                        // Insert SQL: run the command prepared for this entity.
                        util.ExecNonQuery(dbEntity.myCom, "");
                    }
                    catch (Exception ex)
                    {
                        // Keep the failed page so it can be handled later, then log what failed and where.
                        urlContorl.SaveUrl(dbEntity.pageContentEntity, ex.ToString());
                        clsLog.AddLog(DateTime.Now.ToString(), "执行数据失败" + ex.ToString());
                        clsLog.AddLog(DateTime.Now.ToString(), dbEntity.pageContentEntity.SType + ";" + dbEntity.pageContentEntity.PID + ";" + dbEntity.pageContentEntity.Url + "");
                    }
                    finally
                    {
                        // In a finally block so the pending counter stays accurate even if the
                        // failure handling above throws.
                        Interlocked.Decrement(ref Program.clsDBSignal);
                    }
                });
            }
        }
        // Consumes Program.pageContentList and dispatches every downloaded page to the analyser
        // that matches its SType.
        //
        // A page with neither text content (PContent) nor MContent is saved through
        // UrlContorl.SaveUrl with the reason "页面数据为空". An exception raised while analysing
        // is logged and the page is saved together with the exception text.
        // Program.clsContentSignal is decremented once per consumed page in every case.
        public void AnalyseData()
        {
            Log.ClsLog        clsLog      = new Log.ClsLog();
            Common.UrlContorl urlContorl  = new Common.UrlContorl();
            MainContorl       mainContorl = new MainContorl();

            foreach (PageContentEntity pageContentEntity in Program.pageContentList.GetConsumingEnumerable())
            {
                try
                {
                    // Null-safe emptiness test: a null PContent counts as "no text" instead of
                    // raising a NullReferenceException that would be misreported as an analysis failure.
                    bool hasText     = !string.IsNullOrWhiteSpace(pageContentEntity.PContent);
                    bool hasMContent = pageContentEntity.MContent != null;

                    if (hasText || hasMContent)
                    {
                        // Analyse the page according to its source type; unknown types are ignored.
                        switch (pageContentEntity.SType)
                        {
                        case "Portal":
                            mainContorl.HousePortalAnalysis(pageContentEntity);
                            break;

                        default:
                            break;
                        }
                    }
                    else
                    {
                        // Nothing was downloaded for this page: save it as a failed fetch.
                        urlContorl.SaveUrl(pageContentEntity, "页面数据为空");
                        clsLog.AddLog(DateTime.Now.ToString(), "抓取数据失败");
                        clsLog.AddLog(DateTime.Now.ToString(), pageContentEntity.SType + ";" + pageContentEntity.Url);
                    }
                }
                catch (Exception ex)
                {
                    clsLog.AddLog(DateTime.Now.ToString(), "分析数据失败" + ex.ToString());
                    clsLog.AddLog(DateTime.Now.ToString(), pageContentEntity.SType + ";" + pageContentEntity.PID + ";" + pageContentEntity.Url);
                    urlContorl.SaveUrl(pageContentEntity, ex.ToString());
                }
                finally
                {
                    // In a finally block so the pending counter stays accurate even if the
                    // failure handling above throws.
                    Interlocked.Decrement(ref Program.clsContentSignal);
                }
            }
        }
Exemple #3
0
        // Consumes Program.pageUrlList, downloads every queued URL and hands the result to
        // Page.ClsPageContent.AddPageContent.
        //
        // The method runs synchronously: it contains no await, so it is declared as a plain
        // void method and any exception that escapes reaches the caller directly.
        // A failed download is logged and saved through UrlContorl.SaveUrl.
        // Program.clsUrlSignal is decremented once per consumed URL, and the loop pauses for
        // the configured crawl interval between URLs.
        public void SpiderData()
        {
            Log.ClsLog          clsLog         = new Log.ClsLog();
            Page.ClsPageContent clsPageContent = new Page.ClsPageContent();
            UrlContorl          urlContorl     = new UrlContorl();
            CookieContainer     cookies        = new CookieContainer();
            Common.NetContorl   netContorl     = new Common.NetContorl();

            // NOTE(review): HttpClient is the project's own type here (it is constructed with
            // three arguments); the empty host / port 0 presumably means "no proxy" - confirm.
            HttpClient _client = new HttpClient("", 0, false);

            // Walk the URLs waiting to be fetched.
            foreach (PageUrlEntity pageUrlEntity in Program.pageUrlList.GetConsumingEnumerable())
            {
                // If the analysis queue already holds the maximum number of pages, pause for
                // the configured time before fetching more.
                if (Program.clsContentSignal >= Program.sysPara.MaxPage)
                {
                    Thread.Sleep(Program.sysPara.SpiderSleepTime);
                }

                // Reset per URL so a failure report never carries the previous page's content.
                string pageContent = "";

                try
                {
                    if (Program.sysPara.IsProxy == "true")
                    {
                        // Wait until a proxy is available, then route this request through it.
                        ProxyEntity proxy = netContorl.GetProxyEntity_URL();
                        while (proxy == null)
                        {
                            Thread.Sleep(Program.sysPara.NetSleepTime);
                            proxy = netContorl.GetProxyEntity_URL();
                        }

                        // The address is used as "host:port". Parsing happens inside the try so a
                        // malformed address fails this URL only instead of ending the whole loop.
                        string[] hostAndPort = proxy.proxyAddess.Split(':');
                        _client = new HttpClient(hostAndPort[0], Convert.ToInt32(hostAndPort[1]), false);
                    }

                    // Fetch the page.
                    try
                    {
                        pageContent = _client.GetResponse(pageUrlEntity.SourUrl, pageUrlEntity.Url, pageUrlEntity.UrlType, pageUrlEntity.UrlPara, pageUrlEntity.EnCode);
                        pageUrlEntity.CookieContent = cookies;

                        if (pageUrlEntity.UrlType == "MGET")
                        {
                            pageUrlEntity.PContent = "";

                            // NOTE(review): nothing ever assigns the image, so this branch always
                            // throws "空图片". The call that should download the bitmap is not
                            // visible in this file - confirm and wire it in.
                            Bitmap MContent = null;
                            if (MContent == null)
                            {
                                throw new Exception("空图片");
                            }
                            clsPageContent.AddPageContent(pageUrlEntity, MContent);
                        }
                        else
                        {
                            pageUrlEntity.PContent = pageContent;

                            // A null response is treated like an empty one rather than failing
                            // later with a NullReferenceException.
                            if (string.IsNullOrEmpty(pageContent))
                            {
                                throw new Exception("空页面");
                            }
                            if (pageContent.Contains("超时"))
                            {
                                throw new Exception("操作超时");
                            }
                            clsPageContent.AddPageContent(pageUrlEntity, pageContent);
                        }
                    }
                    catch (Exception ex)
                    {
                        clsLog.AddLog(DateTime.Now.ToString(), "抓取数据失败" + ex.ToString());
                        clsLog.AddLog(DateTime.Now.ToString(), pageUrlEntity.SType + ";" + pageUrlEntity.Url + ";");
                        urlContorl.SaveUrl(pageUrlEntity, pageContent, ex.ToString());
                    }
                }
                catch (Exception ex)
                {
                    // Reached when proxy setup fails or when the failure handling above throws.
                    clsLog.AddLog(DateTime.Now.ToString(), "抓取失败" + ex.ToString());
                    clsLog.AddLog(DateTime.Now.ToString(), pageUrlEntity.SType + ";" + pageUrlEntity.Url + ";");
                    urlContorl.SaveUrl(pageUrlEntity, pageContent, ex.ToString());
                }
                finally
                {
                    // In a finally block so the pending counter stays accurate on every path.
                    Interlocked.Decrement(ref Program.clsUrlSignal);
                }

                // Pause between requests. Thread.Sleep takes milliseconds and yields the CPU;
                // Thread.SpinWait takes an iteration count and busy-waits, which is not a delay.
                Thread.Sleep(Program.sysPara.BegSpiderIntervalTime + Program.sysPara.IntervalSpiderIntervalTime * (Program.CurrSpiderTimes - 1));
            }
        }
Exemple #4
0
        // Consumes Program.pageUrlList, downloads every queued URL and hands the result to
        // Page.ClsPageContent.AddPageContent. A failed download is retried up to three times.
        //
        // Every failure is written to the log and published through
        // Program.helper.OntxtviewCompleted. After each URL, Program.clsUrlSignal is decremented,
        // Program.helper.OnAllItemAnalyzeCompleted is raised, and the loop pauses for the
        // configured crawl interval.
        public void SpiderData()
        {
            // Number of re-fetch attempts made after the initial download fails.
            const int MaxRetryCount = 3;

            Log.ClsLog          clsLog         = new Log.ClsLog();
            Page.ClsPageContent clsPageContent = new Page.ClsPageContent();
            CookieContainer     cookies        = new CookieContainer();
            Common.NetContorl   netContorl     = new Common.NetContorl();

            // NOTE(review): HttpClient is the project's own type here (it is constructed with
            // four arguments); the empty host / port 0 presumably means "no proxy" - confirm.
            HttpClient _client = new HttpClient("", 0, false, new CookieContainer());

            // Walk the URLs waiting to be fetched.
            foreach (PageUrlEntity pageUrlEntity in Program.pageUrlList.GetConsumingEnumerable())
            {
                try
                {
                    // A URL that carries its own cookies gets a client bound to that container.
                    CookieContainer entityCookies = pageUrlEntity.CookieContent;
                    if (entityCookies != null)
                    {
                        _client = new HttpClient("", 0, false, entityCookies);
                    }

                    // If the analysis queue already holds the maximum number of pages, pause for
                    // the configured time before fetching more.
                    if (Program.clsContentSignal >= Program.sysPara.MaxPage)
                    {
                        Thread.Sleep(Program.sysPara.SpiderSleepTime);
                    }

                    if (Program.sysPara.IsProxy == "true")
                    {
                        ProxyEntity proxy = WaitForSpiderProxy(netContorl);

                        // A URL with stored cookies keeps the cookie-bound client created above.
                        // Only a URL without cookies is routed through the proxy. The null check
                        // matters: CookieContent may be null, and dereferencing it here used to
                        // end the whole crawl loop.
                        if (entityCookies == null || entityCookies.Count == 0)
                        {
                            _client = CreateSpiderProxyClient(proxy, null);
                        }
                    }

                    // Attempt 0 is the initial fetch; attempts 1..MaxRetryCount are retries.
                    for (int attempt = 0; attempt <= MaxRetryCount; attempt++)
                    {
                        try
                        {
                            if (attempt > 0)
                            {
                                // Re-fetch the failed page, switching to a fresh proxy when enabled.
                                NotifySpiderText("错误页面重抓");
                                if (Program.sysPara.IsProxy == "true")
                                {
                                    _client = CreateSpiderProxyClient(WaitForSpiderProxy(netContorl), pageUrlEntity.CookieContent);
                                }
                                NotifySpiderText("第" + attempt + "次重抓");
                            }

                            FetchAndQueuePage(_client, pageUrlEntity, cookies, clsPageContent);

                            if (attempt > 0)
                            {
                                // A successful retry is followed by one crawl-interval pause.
                                Thread.Sleep(Program.sysPara.BegSpiderIntervalTime + Program.sysPara.IntervalSpiderIntervalTime * (Program.CurrSpiderTimes - 1));
                            }
                            break;
                        }
                        catch (Exception ex)
                        {
                            ReportSpiderFailure(clsLog, pageUrlEntity, attempt == 0 ? "抓取数据失败" : "抓取失败", ex);
                        }
                    }
                }
                catch (Exception ex)
                {
                    // Reached when the setup above fails or when failure reporting itself throws.
                    ReportSpiderFailure(clsLog, pageUrlEntity, "抓取失败", ex);
                }
                finally
                {
                    // In a finally block so the pending counter stays accurate on every path.
                    Interlocked.Decrement(ref Program.clsUrlSignal);
                }

                Program.helper.OnAllItemAnalyzeCompleted(this, new EventControllerArgs()
                {
                    IsSuccess = true
                });
                Thread.Sleep(Program.sysPara.BegSpiderIntervalTime + Program.sysPara.IntervalSpiderIntervalTime * (Program.CurrSpiderTimes - 1));
            }
        }

        // Blocks until NetContorl supplies a proxy, then strips the "http://" prefix from its address.
        private static ProxyEntity WaitForSpiderProxy(Common.NetContorl netContorl)
        {
            ProxyEntity proxy = netContorl.GetProxyEntity_URL2();
            while (proxy == null)
            {
                Thread.Sleep(Program.sysPara.NetSleepTime);
                proxy = netContorl.GetProxyEntity_URL2();
            }
            proxy.proxyAddess = proxy.proxyAddess.Replace("http://", "");
            return proxy;
        }

        // Builds a client for the proxy, whose address is used as "host:port", bound to the given cookie container.
        private static HttpClient CreateSpiderProxyClient(ProxyEntity proxy, CookieContainer cookieContainer)
        {
            string[] hostAndPort = proxy.proxyAddess.Split(':');
            return new HttpClient(hostAndPort[0], Convert.ToInt32(hostAndPort[1]), true, cookieContainer);
        }

        // Downloads one URL, validates the response and queues it for analysis; throws when the response is unusable.
        private static void FetchAndQueuePage(HttpClient client, PageUrlEntity pageUrlEntity, CookieContainer cookies, Page.ClsPageContent clsPageContent)
        {
            string pageContent = client.GetResponse(pageUrlEntity.SourUrl, pageUrlEntity.Url, pageUrlEntity.UrlType, pageUrlEntity.UrlPara, pageUrlEntity.EnCode);

            // NOTE(review): this stores the method-wide container, not the one the client was
            // built with - confirm that is intended.
            pageUrlEntity.CookieContent = cookies;

            if (pageUrlEntity.UrlType == "MGET")
            {
                pageUrlEntity.PContent = "";

                // NOTE(review): nothing ever assigns the image, so this branch always throws
                // "空图片". The call that should download the bitmap is not visible in this
                // file - confirm and wire it in.
                Bitmap MContent = null;
                if (MContent == null)
                {
                    throw new Exception("空图片");
                }
                clsPageContent.AddPageContent(pageUrlEntity, MContent);
            }
            else
            {
                pageUrlEntity.PContent = pageContent;
                if (string.IsNullOrEmpty(pageContent))
                {
                    throw new Exception("空页面");
                }
                if (pageContent.Contains("超时"))
                {
                    throw new Exception("操作超时");
                }
                clsPageContent.AddPageContent(pageUrlEntity, pageContent);
            }
        }

        // Writes a fetch failure to the log and publishes it, with the affected URL, to the text view.
        private void ReportSpiderFailure(Log.ClsLog clsLog, PageUrlEntity pageUrlEntity, string logPrefix, Exception ex)
        {
            string urlInfo = pageUrlEntity.SType + ";" + pageUrlEntity.Url + ";";

            clsLog.AddLog(DateTime.Now.ToString(), logPrefix + ex.ToString());
            clsLog.AddLog(DateTime.Now.ToString(), urlInfo);
            NotifySpiderText("抓取数据失败" + ex.ToString());
            NotifySpiderText(urlInfo);
        }

        // Publishes a status line through Program.helper.OntxtviewCompleted.
        // IsSuccess is always true, including for failure text, as in the original calls.
        private void NotifySpiderText(string message)
        {
            Program.helper.OntxtviewCompleted(this, new EventControllerArgs()
            {
                IsSuccess = true, Msg = message
            });
        }