/// <summary>
/// Gets the part (block) settings stored for a crawler configuration.
/// </summary>
/// <param name="CrawlerConfigId">Value bound to the <c>@CrawlerConfigId</c> parameter of the query against <c>dbo.CrawlerPartConfig</c>.</param>
/// <returns>One <see cref="CrawlerPartConfig"/> per row returned by the query; an empty list when there are no rows.</returns>
public List<CrawlerPartConfig> GetCrawlerPartConfig(Guid CrawlerConfigId)
{
    const string sql = "select * from dbo.CrawlerPartConfig where CrawlerConfigId = @CrawlerConfigId; ";

    Database db = new Database();
    List<CrawlerPartConfig> cpc_list = new List<CrawlerPartConfig>();

    // No catch block: the previous "catch (Exception ex) { throw ex; }" only reset the
    // stack trace. Exceptions now propagate untouched while the finally still releases db.
    try
    {
        SqlParameter[] parameters =
        {
            db.MakeParam("CrawlerConfigId", SqlDbType.UniqueIdentifier, 0, ParameterDirection.Input, CrawlerConfigId)
        };

        DataSet ds = db.ExeQueryGetDataSet(sql, parameters);
        foreach (DataRow item in ds.Tables[0].Rows)
        {
            CrawlerPartConfig cpc = new CrawlerPartConfig()
            {
                // Id, CrawlerConfigId and FetchModel are converted unconditionally,
                // i.e. they are treated as non-null columns.
                Id = new Guid(item["Id"].ToString()),
                CrawlerConfigId = new Guid(item["CrawlerConfigId"].ToString()),
                FetchModel = Convert.ToInt32(item["FetchModel"]),
                IndexLink_XPath = ReadNullableString(item, "IndexLink_XPath"),
                LoadMoreButton_XPath = ReadNullableString(item, "LoadMoreButton_XPath"),
                PageNumInput_XPath = ReadNullableString(item, "PageNumInput_XPath"),
                GoToButton_XPath = ReadNullableString(item, "GoToButton_XPath"),
                FirstPage = ReadNullableInt32(item, "FirstPage"),
                LastPage = ReadNullableInt32(item, "LastPage"),
                ExtentionData1 = ReadNullableString(item, "ExtentionData1"),
                ExtentionData2 = ReadNullableString(item, "ExtentionData2"),
                ExtentionData3 = ReadNullableString(item, "ExtentionData3"),
                ExtentionData4 = ReadNullableString(item, "ExtentionData4"),
                ExtentionData5 = ReadNullableString(item, "ExtentionData5"),
                Title_Xpath = ReadNullableString(item, "Title_Xpath"),
                Content_Xpath = ReadNullableString(item, "Content_Xpath"),
                Editor_Xpath = ReadNullableString(item, "Editor_Xpath"),
                Source_Xpath = ReadNullableString(item, "Source_Xpath"),
                PublishTime_Xpath = ReadNullableString(item, "PublishTime_Xpath")
            };
            cpc_list.Add(cpc);
        }

        return cpc_list;
    }
    finally
    {
        db.Close();
        db.Dispose();
    }
}

/// <summary>Returns the column value as a string, or null when the column holds <see cref="DBNull.Value"/>.</summary>
private static string ReadNullableString(DataRow row, string column)
{
    object value = row[column];
    return value == DBNull.Value ? null : value.ToString();
}

/// <summary>Returns the column value converted to Int32, or null when the column holds <see cref="DBNull.Value"/>.</summary>
private static int? ReadNullableInt32(DataRow row, string column)
{
    object value = row[column];
    return value == DBNull.Value ? null : (int?)Convert.ToInt32(value);
}
/// <summary>
/// Builds a <see cref="News"/> item from the page currently loaded in <paramref name="driver"/>,
/// locating each field with the XPath expressions held by <paramref name="cpc"/>.
/// </summary>
/// <param name="driver">Driver whose current page is read.</param>
/// <param name="cpc">Supplies the title, content, source, editor and publish-time XPaths and the CrawlerConfigId.</param>
/// <param name="uri">Stored (as a string) in <c>News.Address</c>.</param>
/// <param name="threadId">Stored in <c>News.ThreadId</c>.</param>
/// <param name="milliseconds">Stored in <c>News.SpendTime</c>. NOTE(review): the name suggests milliseconds; confirm the unit callers actually pass.</param>
/// <returns>A new <see cref="News"/> with a freshly generated Id.</returns>
private News GetNews(PhantomJSDriver driver, CrawlerPartConfig cpc, Uri uri, int threadId, int milliseconds)
{
    // Number of leading characters of the publish-time text that are parsed
    // (presumably a "yyyy-MM-dd HH:mm" style prefix — confirm against the target sites).
    const int publishTimeLength = 16;

    string title = driver.FindElement(By.XPath(cpc.Title_Xpath)).Text;

    // Content may be spread over several matched elements; concatenate their text in order.
    var contentBuilder = new System.Text.StringBuilder();
    foreach (var element in driver.FindElements(By.XPath(cpc.Content_Xpath)))
    {
        contentBuilder.Append(element.Text);
    }

    // Source, editor and publish time are optional: they are read only when an XPath is configured.
    string source = string.Empty;
    if (!string.IsNullOrEmpty(cpc.Source_Xpath))
    {
        source = driver.FindElement(By.XPath(cpc.Source_Xpath)).Text;
    }

    string editor = string.Empty;
    if (!string.IsNullOrEmpty(cpc.Editor_Xpath))
    {
        editor = driver.FindElement(By.XPath(cpc.Editor_Xpath)).Text;
    }

    DateTime? publishtime = null;
    if (!string.IsNullOrEmpty(cpc.PublishTime_Xpath))
    {
        string rawTime = driver.FindElement(By.XPath(cpc.PublishTime_Xpath)).Text.TrimStart();

        // Text shorter than the prefix length (e.g. a date without a time) is parsed as is;
        // previously Substring(0, 16) threw ArgumentOutOfRangeException for such text.
        string timePrefix = rawTime.Length > publishTimeLength
            ? rawTime.Substring(0, publishTimeLength)
            : rawTime;
        publishtime = Convert.ToDateTime(timePrefix);
    }

    News news = new News()
    {
        Id = Guid.NewGuid(),
        CrawlerConfigId = cpc.CrawlerConfigId,
        Title = title,
        Content = contentBuilder.ToString(),
        Editor = editor,
        Source = source,
        Address = uri.ToString(),
        SpendTime = milliseconds,
        ThreadId = threadId,
        PublishTime = publishtime
    };
    return news;
}
/// <summary>
/// Opens the index page at <paramref name="Address"/> and appends one <see cref="Target"/> to
/// <paramref name="List_tar"/> for every element matched by <c>cpc.IndexLink_XPath</c> whose text is not empty.
/// </summary>
/// <param name="cpc">Part configuration; <c>FetchModel</c> selects how the page is prepared before links are read
/// (1: no extra step, 3: an <see cref="Operation"/> that clicks <c>LoadMoreButton_XPath</c>,
/// 5: a scroll <see cref="Script"/>). Any other value adds nothing.</param>
/// <param name="Address">Absolute URL of the index page; parsed with <see cref="Uri"/>.</param>
/// <param name="List_tar">List that receives the discovered targets.</param>
private void StartSync(CrawlerPartConfig cpc, string Address, ref List<Target> List_tar)
{
    // Parsed up front so a malformed address fails for every fetch model, as before.
    Uri uri = new Uri(Address);

    Script script = null;
    Operation operation = null;

    switch (cpc.FetchModel)
    {
        case 1:
            // Plain page load: neither a script nor an operation.
            break;

        case 3:
            operation = new Operation
            {
                Action = (x) => { x.FindElement(By.XPath(cpc.LoadMoreButton_XPath)).Click(); },
                // Condition is true once the element located by ExtentionData1 is no longer displayed.
                Condition = (x) => { return !x.FindElement(By.XPath(cpc.ExtentionData1)).Displayed; },
                Timeout = 10000,
                ExtentionData1 = cpc.ExtentionData1,
                ExtentionData2 = cpc.ExtentionData2,
                ExtentionData3 = cpc.ExtentionData3
            };
            break;

        case 5:
            script = new Script()
            {
                Code = "document.body.scrollTop=10000",
                Condition = (x) => { return !x.FindElement(By.XPath(cpc.ExtentionData1)).Displayed; },
                ActionTimes = Convert.ToInt32(cpc.ExtentionData4),
                FetchModel = 5,
                Timeout = 10000
            };
            break;

        default:
            // Unsupported fetch model: nothing to collect.
            return;
    }

    PhantomJSDriver driver = sc.StartSync(uri, script, operation);
    try
    {
        foreach (var link in driver.FindElements(By.XPath(cpc.IndexLink_XPath)))
        {
            string text = link.Text;
            if (!string.IsNullOrEmpty(text))
            {
                List_tar.Add(new Target
                {
                    Name = text,
                    Uri = link.GetAttribute("href"),
                    Operation = null,
                    CPC = cpc
                });
            }
        }
    }
    finally
    {
        // Quit in finally so the driver is shut down even when element lookup throws.
        if (driver != null)
        {
            driver.Quit();
        }
    }
}
/// <summary>
/// Runs one crawl on a background task: opens <paramref name="uri"/> in a new PhantomJS driver,
/// optionally runs a script and/or an operation, then extracts a <see cref="News"/> item and raises
/// <c>OnCompleted</c>. Failures are reported through <c>OnError</c> instead of being thrown.
/// </summary>
/// <param name="uri">URL the crawler navigates to.</param>
/// <param name="script">Optional script; when not null its Code is executed with its Args after navigation.</param>
/// <param name="operation">Optional operation; when not null it is passed to <c>ExecuteAction</c> after navigation.</param>
/// <param name="cpc">Part configuration forwarded to <c>GetNews</c>.</param>
/// <returns>A task that completes when the crawl attempt has finished.</returns>
public async Task StartAsync(Uri uri, Script script, Operation operation, CrawlerPartConfig cpc)
{
    await Task.Run(() =>
    {
        // Tracks whether Wait() succeeded so the finally never releases a slot it did not take.
        bool acquired = false;
        try
        {
            Semaphore.Wait();
            acquired = true;

            var service = PhantomJSDriverService.CreateDefaultService();
            service.LoadImages = false;
            var options = new PhantomJSOptions();
            var driver = new PhantomJSDriver(service, options);
            try
            {
                // Stopwatch is used rather than DateTime.Now so clock adjustments cannot skew the measurement.
                var watch = System.Diagnostics.Stopwatch.StartNew();
                driver.Navigate().GoToUrl(uri.ToString());

                if (script != null)
                {
                    driver.ExecuteScript(script.Code, script.Args);
                }

                if (operation != null)
                {
                    ExecuteAction(operation, driver);
                }

                var threadId = Thread.CurrentThread.ManagedThreadId;
                var seconds = Convert.ToInt32(watch.Elapsed.TotalSeconds);

                // Original author's note: a page that takes too long to open may lead to the driver
                // being garbage collected, so extraction is only attempted under 30 seconds.
                if (seconds < 30)
                {
                    News news = GetNews(driver, cpc, uri, threadId, seconds);
                    OnCompleted?.Invoke(this, new OnCompletedEventArgs(news));
                }
                else
                {
                    WriteLog.InsertLogs(uri.ToString(), "打开网页超时");
                }
            }
            catch (Exception ex)
            {
                OnError?.Invoke(this, new OnErrorEventArgs(uri, ex));
            }
            finally
            {
                // Single shutdown point for the driver, covering success, timeout and failure.
                driver.Quit();
            }
        }
        catch (Exception ex)
        {
            // Covers failures before the driver exists (semaphore wait, service/driver creation)
            // and anything thrown by the inner handlers.
            OnError?.Invoke(this, new OnErrorEventArgs(uri, ex));
        }
        finally
        {
            if (acquired)
            {
                Semaphore.Release();
            }
        }
    });
}