/// <summary>
        /// Loads the part (block-level) settings stored for a site's crawler configuration.
        /// </summary>
        /// <param name="CrawlerConfigId">Value bound to the <c>@CrawlerConfigId</c> parameter of the query.</param>
        /// <returns>
        /// One <see cref="CrawlerPartConfig"/> per row of <c>dbo.CrawlerPartConfig</c> returned by the query;
        /// an empty list when the query returns no rows.
        /// </returns>
        public List <CrawlerPartConfig> GetCrawlerPartConfig(Guid CrawlerConfigId)
        {
            Database db = new Database();

            try
            {
                const string sql = "select * from dbo.CrawlerPartConfig where CrawlerConfigId = @CrawlerConfigId; ";
                SqlParameter[] parameters =
                {
                    db.MakeParam("CrawlerConfigId", SqlDbType.UniqueIdentifier, 0, ParameterDirection.Input, CrawlerConfigId)
                };
                DataSet ds = db.ExeQueryGetDataSet(sql, parameters);

                List<CrawlerPartConfig> cpc_list = new List<CrawlerPartConfig>();
                foreach (DataRow row in ds.Tables[0].Rows)
                {
                    cpc_list.Add(new CrawlerPartConfig
                    {
                        // Id, CrawlerConfigId and FetchModel are read unconditionally: a NULL there fails loudly.
                        Id                   = new Guid(row["Id"].ToString()),
                        CrawlerConfigId      = new Guid(row["CrawlerConfigId"].ToString()),
                        FetchModel           = Convert.ToInt32(row["FetchModel"]),
                        IndexLink_XPath      = ReadNullableString(row, "IndexLink_XPath"),
                        LoadMoreButton_XPath = ReadNullableString(row, "LoadMoreButton_XPath"),
                        PageNumInput_XPath   = ReadNullableString(row, "PageNumInput_XPath"),
                        GoToButton_XPath     = ReadNullableString(row, "GoToButton_XPath"),
                        FirstPage            = ReadNullableInt32(row, "FirstPage"),
                        LastPage             = ReadNullableInt32(row, "LastPage"),
                        ExtentionData1       = ReadNullableString(row, "ExtentionData1"),
                        ExtentionData2       = ReadNullableString(row, "ExtentionData2"),
                        ExtentionData3       = ReadNullableString(row, "ExtentionData3"),
                        ExtentionData4       = ReadNullableString(row, "ExtentionData4"),
                        ExtentionData5       = ReadNullableString(row, "ExtentionData5"),
                        Title_Xpath          = ReadNullableString(row, "Title_Xpath"),
                        Content_Xpath        = ReadNullableString(row, "Content_Xpath"),
                        Editor_Xpath         = ReadNullableString(row, "Editor_Xpath"),
                        Source_Xpath         = ReadNullableString(row, "Source_Xpath"),
                        PublishTime_Xpath    = ReadNullableString(row, "PublishTime_Xpath")
                    });
                }
                return cpc_list;
            }
            finally
            {
                // No catch block: exceptions propagate with their original stack trace,
                // and the database handle is still released on every path.
                db.Close();
                db.Dispose();
            }
        }

        /// <summary>
        /// Returns the column value as a string, or <c>null</c> when the column holds <see cref="DBNull"/>.
        /// </summary>
        private static string ReadNullableString(DataRow row, string column)
        {
            object value = row[column];
            return value == DBNull.Value ? null : value.ToString();
        }

        /// <summary>
        /// Returns the column value converted to <see cref="int"/>, or <c>null</c> when the column holds <see cref="DBNull"/>.
        /// </summary>
        private static int? ReadNullableInt32(DataRow row, string column)
        {
            object value = row[column];
            return value == DBNull.Value ? null : (int?)Convert.ToInt32(value);
        }
        /// <summary>
        /// Builds a <see cref="News"/> item from the page currently loaded in <paramref name="driver"/>,
        /// using the XPath expressions in <paramref name="cpc"/>.
        /// </summary>
        /// <param name="driver">Driver positioned on the article page.</param>
        /// <param name="cpc">Supplies the title/content XPaths and the optional source, editor and publish-time XPaths.</param>
        /// <param name="uri">Stored as the item's <c>Address</c>.</param>
        /// <param name="threadId">Stored as the item's <c>ThreadId</c>.</param>
        /// <param name="milliseconds">Stored as the item's <c>SpendTime</c>.</param>
        /// <returns>A new <see cref="News"/> with a freshly generated <c>Id</c>.</returns>
        private News GetNews(PhantomJSDriver driver, CrawlerPartConfig cpc, Uri uri, int threadId, int milliseconds)
        {
            // Only the leading part of the publish-time text is parsed.
            // NOTE(review): 16 presumably matches a "yyyy-MM-dd HH:mm" style stamp - confirm against the target sites.
            const int publishTimeLength = 16;

            string title = driver.FindElement(By.XPath(cpc.Title_Xpath)).Text;

            // The content XPath may match several nodes; their texts are concatenated in document order.
            var contentBuilder = new System.Text.StringBuilder();
            foreach (var node in driver.FindElements(By.XPath(cpc.Content_Xpath)))
            {
                contentBuilder.Append(node.Text);
            }

            string    source      = string.Empty;
            string    editor      = string.Empty;
            DateTime? publishtime = null;

            if (!string.IsNullOrEmpty(cpc.Source_Xpath))
            {
                source = driver.FindElement(By.XPath(cpc.Source_Xpath)).Text;
            }
            if (!string.IsNullOrEmpty(cpc.Editor_Xpath))
            {
                editor = driver.FindElement(By.XPath(cpc.Editor_Xpath)).Text;
            }
            if (!string.IsNullOrEmpty(cpc.PublishTime_Xpath))
            {
                string raw = driver.FindElement(By.XPath(cpc.PublishTime_Xpath)).Text.TrimStart();
                // Clamp the length so text shorter than the expected stamp (e.g. a date without a time)
                // no longer throws ArgumentOutOfRangeException; unparsable text still throws from Convert.
                string stamp = raw.Length > publishTimeLength ? raw.Substring(0, publishTimeLength) : raw;
                publishtime = Convert.ToDateTime(stamp);
            }

            return new News
            {
                Id              = Guid.NewGuid(),
                CrawlerConfigId = cpc.CrawlerConfigId,
                Title           = title,
                Content         = contentBuilder.ToString(),
                Editor          = editor,
                Source          = source,
                Address         = uri.ToString(),
                SpendTime       = milliseconds,
                ThreadId        = threadId,
                PublishTime     = publishtime
            };
        }
Beispiel #3
0
 /// <summary>
 /// Opens the index page at <paramref name="Address"/> and appends one <see cref="Target"/> to
 /// <paramref name="List_tar"/> for every element matched by <c>cpc.IndexLink_XPath</c> that has non-empty text.
 /// </summary>
 /// <remarks>
 /// <c>cpc.FetchModel</c> selects what is handed to <c>sc.StartSync</c>:
 /// 1 - neither a script nor an operation;
 /// 3 - an <see cref="Operation"/> that clicks <c>cpc.LoadMoreButton_XPath</c>;
 /// 5 - a <see cref="Script"/> that scrolls the document body.
 /// Any other value makes the method return without opening a page.
 /// </remarks>
 /// <param name="cpc">Part configuration supplying the fetch model and XPath expressions.</param>
 /// <param name="Address">Absolute URL of the index page; an invalid value makes <see cref="Uri"/> construction throw.</param>
 /// <param name="List_tar">List that receives the discovered targets.</param>
 private void StartSync(CrawlerPartConfig cpc, string Address, ref List <Target> List_tar)
 {
     Uri       uri       = new Uri(Address);
     Script    script    = null;
     Operation operation = null;

     switch (cpc.FetchModel)
     {
         case 1:
             break;

         case 3:
             operation = new Operation
             {
                 Action         = x => x.FindElement(By.XPath(cpc.LoadMoreButton_XPath)).Click(),
                 Condition      = x => !x.FindElement(By.XPath(cpc.ExtentionData1)).Displayed,
                 Timeout        = 10000,
                 ExtentionData1 = cpc.ExtentionData1,
                 ExtentionData2 = cpc.ExtentionData2,
                 ExtentionData3 = cpc.ExtentionData3
             };
             break;

         case 5:
             script = new Script
             {
                 Code        = "document.body.scrollTop=10000",
                 Condition   = x => !x.FindElement(By.XPath(cpc.ExtentionData1)).Displayed,
                 ActionTimes = Convert.ToInt32(cpc.ExtentionData4),
                 FetchModel  = 5,
                 Timeout     = 10000
             };
             break;

         default:
             // Unrecognised fetch models are ignored, exactly as before.
             return;
     }

     PhantomJSDriver driver = sc.StartSync(uri, script, operation);
     try
     {
         foreach (var link in driver.FindElements(By.XPath(cpc.IndexLink_XPath)))
         {
             string text = link.Text;
             if (string.IsNullOrEmpty(text))
             {
                 continue;
             }

             List_tar.Add(new Target
             {
                 Name      = text,
                 Uri       = link.GetAttribute("href"),
                 Operation = null,
                 CPC       = cpc
             });
         }
     }
     finally
     {
         // Quit in finally so the driver is not leaked when element lookup throws.
         // Exceptions are not caught here and keep their original stack trace.
         if (driver != null)
         {
             driver.Quit();
         }
     }
 }
        /// <summary>
        /// Crawls one page on a background task: opens it in a new PhantomJS driver, optionally runs a
        /// script and/or an operation, extracts a <see cref="News"/> item and raises <c>OnCompleted</c>.
        /// </summary>
        /// <param name="uri">Address of the page to crawl.</param>
        /// <param name="script">Script executed after navigation; skipped when <c>null</c>.</param>
        /// <param name="operation">Operation passed to <c>ExecuteAction</c> after the script; skipped when <c>null</c>.</param>
        /// <param name="cpc">Part configuration handed to <c>GetNews</c>.</param>
        /// <returns>
        /// A task that completes when the crawl attempt has finished. Exceptions raised during the attempt
        /// are reported through <c>OnError</c>.
        /// </returns>
        public async Task StartAsync(Uri uri, Script script, Operation operation, CrawlerPartConfig cpc)
        {
            await Task.Run(() =>
            {
                // Tracks whether Wait() succeeded so that a failed Wait() never triggers a Release().
                bool acquired = false;
                try
                {
                    Semaphore.Wait();
                    acquired = true;

                    var service        = PhantomJSDriverService.CreateDefaultService();
                    service.LoadImages = false;
                    var options        = new PhantomJSOptions();
                    var driver         = new PhantomJSDriver(service, options);
                    try
                    {
                        // Stopwatch is unaffected by wall-clock adjustments, unlike DateTime.Now.
                        var watch = System.Diagnostics.Stopwatch.StartNew();
                        driver.Navigate().GoToUrl(uri.ToString());

                        if (script != null)
                        {
                            driver.ExecuteScript(script.Code, script.Args);
                        }
                        if (operation != null)
                        {
                            ExecuteAction(operation, driver);
                        }

                        var threadId = Thread.CurrentThread.ManagedThreadId;
                        var seconds  = Convert.ToInt32(watch.Elapsed.TotalSeconds);

                        // Original author's note: when opening the page takes too long the driver may be
                        // garbage collected, so pages that needed 30 seconds or more are skipped.
                        if (seconds < 30)
                        {
                            // NOTE(review): GetNews names this parameter "milliseconds" but receives whole
                            // seconds - confirm the intended unit of News.SpendTime.
                            News news = GetNews(driver, cpc, uri, threadId, seconds);
                            OnCompleted?.Invoke(this, new OnCompletedEventArgs(news));
                        }
                        else
                        {
                            WriteLog.InsertLogs(uri.ToString(), "打开网页超时");
                        }
                    }
                    catch (Exception ex)
                    {
                        OnError?.Invoke(this, new OnErrorEventArgs(uri, ex));
                    }
                    finally
                    {
                        // Single shutdown point for the driver, covering success, timeout and failure.
                        driver.Quit();
                    }
                }
                catch (Exception ex)
                {
                    // Reached when acquiring the semaphore, creating the driver or shutting it down fails.
                    OnError?.Invoke(this, new OnErrorEventArgs(uri, ex));
                }
                finally
                {
                    if (acquired)
                    {
                        Semaphore.Release();
                    }
                }
            });
        }