/// <summary>
/// Overwrites stored DataSection rows according to the SectionCode / ItemCode of each entry
/// in <paramref name="newList"/>.
/// </summary>
/// <remarks>
/// Per entry: when no row with the same SectionCode and ItemCode exists, the entry is added.
/// Otherwise the matching rows whose CreateDateTime plus 30 days is earlier than today are deleted,
/// and the entry is added only if at least one row was deleted.
/// The whole loop runs through <c>UseTranAsync</c>; when the returned result reports failure,
/// its error message is logged.
/// </remarks>
/// <param name="newList">Entries to write.</param>
public async Task CoverNewSectionCodeAsync(List<EDataSection> newList)
{
    var tranResult = await base.Db.Ado.UseTranAsync(() =>
    {
        foreach (var es in newList)
        {
            int existing = base.IsExist(
                a => new CountResult { Count = SqlFunc.AggregateCount(a.Id) },
                a => a.SectionCode == es.SectionCode && a.ItemCode == es.ItemCode);

            if (existing != 0)
            {
                // Rows already exist: only the ones older than 30 days are removed.
                int removed = base.DelAll(a => a.SectionCode == es.SectionCode && a.ItemCode == es.ItemCode && a.CreateDateTime.AddDays(30) < DateTime.Today);
                if (removed <= 0)
                {
                    // Nothing was deleted, so the existing rows are kept as they are.
                    continue;
                }
            }

            base.Add(es);
        }
    });

    if (tranResult.IsSuccess)
    {
        return;
    }

    NLogUtil.ErrorTxt($"[CoverNewSectionCode]建立Book和 Section 关系:{tranResult.ErrorMessage}");
}
/// <summary>
/// Resolves the crawler services from the service provider and runs the tag-based plan,
/// retrying when a proxy connection error is raised.
/// </summary>
/// <remarks>
/// The attempt counter only advances when <c>ExceptionProxyConnect</c> is caught, so the loop
/// must be left explicitly after a successful run; otherwise the plan would be re-run forever.
/// </remarks>
private static void RunPlan()
{
    int tryNum = 1;
    while (tryNum < 3)
    {
        try
        {
            _DouBanBookRepository = _ServiceProvider.GetService<DouBanBookService>();
            var list = _ServiceProvider.GetService<IEnumerable<ICrawlerBatchBook>>();
            _TagList = list.FirstOrDefault(a => a.GetType().Name == "TagListCrawler");
            _DetailBook = _ServiceProvider.GetService<ICrawlerBook>();
            _CrawlerTag = _ServiceProvider.GetService<ICrawlerTag>();

            PlanFromTagsTask planFromTagsTask = new PlanFromTagsTask(
                _CrawlerTag, _TagList, _DetailBook, _DouBanBookRepository);
            planFromTagsTask.run();

            // The plan finished without a proxy failure: stop instead of looping again.
            break;
        }
        catch (ExceptionProxyConnect epc)
        {
            NLogUtil.ErrorTxt($"代理连接错误:{epc.Message}");
            NLogUtil.InfoTxt($"开始尝试第{tryNum++}次运行计划");
        }
    }

    NLogUtil.InfoTxt($"第{tryNum}次运行计划后结束");
}
/// <summary>
/// Loads the page at <paramref name="entryUrl"/> and collects the detail-page link of every
/// item in the <c>subject-list</c> list.
/// </summary>
/// <param name="entryUrl">URL of the list page. An empty value is logged as an error.</param>
/// <returns>
/// One <c>BookBatch</c> per list item that has a link; empty when the list is not found.
/// </returns>
public List<BookBatch> CrawlerUrls(string entryUrl)
{
    if (string.IsNullOrEmpty(entryUrl))
    {
        // NOTE(review): processing continues after this log, as before; whether
        // getDocbyEntryUrl can cope with an empty URL is not visible here — confirm.
        NLogUtil.ErrorTxt("TagListCrawler 没有入口Url");
    }

    List<BookBatch> result = new List<BookBatch>();
    var doc = getDocbyEntryUrl(entryUrl);
    var nodes = doc?.DocumentNode?.SelectNodes("//div[@class='article']//ul[@class='subject-list']/li");
    if (nodes == null)
    {
        return result;
    }

    foreach (var n in nodes)
    {
        // A list item without an anchor or without an href is skipped instead of
        // aborting the whole page with a NullReferenceException.
        var url = n.SelectSingleNode("./div/a")?.Attributes["href"]?.Value;
        if (string.IsNullOrEmpty(url))
        {
            continue;
        }

        result.Add(new BookBatch { DetailUrl = url });
    }

    return result;
}
/// <summary>
/// Extracts the book introduction and stores it in the current book's <c>Summery</c>.
/// </summary>
/// <remarks>
/// The intro inside the <c>all hidden</c> span is preferred; when it is absent the first
/// <c>intro</c> div under <c>related_info</c> is used. With neither present the summary is
/// set to an empty string. Exceptions raised while parsing are logged and not rethrown.
/// </remarks>
/// <param name="doc">Parsed book detail page.</param>
public void AnalyContent(HtmlDocument doc)
{
    EBookInfo bookInfo = _bookDetailData.DouBanBookInfo;
    try
    {
        var introNode =
            doc.DocumentNode.SelectSingleNode("//div[@class='related_info']//span[@class='all hidden']//div[@class='intro']")
            ?? doc.DocumentNode.SelectSingleNode("//div[@class='related_info']//div[@class='intro']");

        bookInfo.Summery = introNode != null ? introNode.InnerHtml.Trim() : "";
    }
    catch (Exception error)
    {
        NLogUtil.ErrorTxt("BookDetailCrawler AnalyContent:" + error.Message);
    }
}
/// <summary>
/// Returns the crawl plans built from douban tag URLs, creating and storing them first when
/// no plan exists in the database yet.
/// </summary>
/// <remarks>
/// When no plan exists and <paramref name="tagLists"/> has entries, one plan per tag is created
/// and, for tags whose section name is found in the known sections, a DataSection entry is
/// created too; both are then written through a single <c>UseTran</c> call.
/// When a plan already exists, the stored plans are queried and returned instead.
/// </remarks>
/// <param name="tagLists">Sections with their tags, as produced by the tag crawler.</param>
/// <returns>The newly built plans, the stored plans, or an empty list.</returns>
public List<EPlan_FromDouBanTagUrls> InitPlanFromTagUrl(List<Secction_Tag> tagLists)
{
    List<EPlan_FromDouBanTagUrls> result = new List<EPlan_FromDouBanTagUrls>();

    if (!_PlanFTURepository.IsExistPlan())
    {
        NLogUtil.InfoTxt("豆瓣计划写入到数据库");
        if (tagLists != null && tagLists.Count > 0)
        {
            List<EDataSection> dsList = new List<EDataSection>();
            var allSection = _SectionDb.AllSection();

            foreach (Secction_Tag st in tagLists)
            {
                ESection section = null;
                try
                {
                    section = allSection[st.sectionName];
                }
                catch
                {
                    // An unknown section name is tolerated: its tags still get a plan,
                    // they just do not get a DataSection entry.
                    section = null;
                }

                foreach (var tag in st.TagList)
                {
                    result.Add(PlanFTURepository.NewModelInstance(tag.Name, tag.Url));
                    if (section != null)
                    {
                        dsList.Add(DataSectionRepository.newModelInstance(section.Code, tag.Code));
                    }
                }
            }

            // The delegate must complete synchronously: with an async lambda, UseTran gets
            // control back at the first await, so the transaction could finish before the
            // section write and CoverPlans had run, and their failures would be missed.
            // NOTE(review): blocking on the task assumes no synchronization context
            // (console host) — confirm.
            var rAll = _Db.Ado.UseTran(() =>
            {
                _DataSectionDb.CoverNewSectionCodeAsync(dsList).GetAwaiter().GetResult();
                _PlanFTURepository.CoverPlans(GenCodeHelper.Plan_FromDouBanTagUrls, result);
            });

            if (rAll.IsSuccess)
            {
                NLogUtil.InfoTxt("【成功】豆瓣计划已在数据库初始化");
            }
            else
            {
                NLogUtil.ErrorTxt($"【失败】豆瓣计划:{rAll.ErrorMessage}");
            }
        }
    }
    else
    {
        NLogUtil.InfoTxt("DouBan Plan 已经存在数据库中!");
        // The douban tag list can only be fetched within the first 1000 entries.
        result = _PlanFTURepository.QueryPlan(GenCodeHelper.Plan_FromDouBanTagUrls);
    }

    return result;
}
/// <summary>
/// Writes "Init" to the console and runs <c>InitSystem</c>; any exception is logged.
/// </summary>
private static void Init()
{
    try
    {
        Console.WriteLine("Init");
        InitSystem();
        // Table and data seeding (DbSeed.InitTables / DbSeed.InitData) is disabled here.
    }
    catch (Exception error)
    {
        NLogUtil.ErrorTxt($"Init Error:{error.Message}", true);
    }
}
/// <summary>
/// Starts a single-book task for one hard-coded douban subject URL.
/// </summary>
private static void RunSingle()
{
    // Alternative sample: https://book.douban.com/subject/34845963/
    string url = "https://book.douban.com/subject/2669319/";

    try
    {
        // NOTE(review): the result of runAsync is not awaited here; if it returns a Task,
        // failures raised after its first await will not reach the catch below — confirm.
        new SinglgTask(_DetailBook, _DouBanBookRepository).runAsync(url);
    }
    catch (ExceptionProxyConnect proxyError)
    {
        NLogUtil.ErrorTxt($"代理连接错误:{proxyError.Message}");
    }
}
/// <summary>
/// Builds a <c>LatestTask</c> from the resolved crawlers and repository and runs it.
/// Proxy connection errors and all other exceptions are logged separately.
/// </summary>
private static void RunLatestTask()
{
    try
    {
        var task = new LatestTask(_LatestBatchBook, _DetailBook, _DouBanBookRepository);
        task.Run();
    }
    catch (ExceptionProxyConnect proxyError)
    {
        NLogUtil.ErrorTxt($"代理连接错误:{proxyError.Message}");
    }
    catch (Exception error)
    {
        NLogUtil.ErrorTxt($"RunLatestTask Error:{error.Message}", true);
    }
}
/// <summary>
/// Loads <c>appsettings.json</c> from the assembly directory, registers the application
/// services and resolves the crawler and repository instances into the static fields.
/// </summary>
/// <remarks>
/// Any exception is logged with the "Init Error" prefix and not rethrown.
/// </remarks>
private static void InitSystem()
{
    try
    {
        // typeof gives the same assembly as instantiating Program and calling GetType(),
        // without creating an instance or going through dynamic dispatch.
        _CurrentDirectory = Path.GetDirectoryName(typeof(Program).Assembly.Location);
        _configuration = Configuration.SetBasePath(_CurrentDirectory)
            .AddJsonFile("appsettings.json", optional: false, reloadOnChange: true)
            .Build();

        var serviceCollection = new ServiceCollection();
        serviceCollection.AddSqlSugarSetup(_configuration);
        serviceCollection.AddCrawlers(_configuration);
        serviceCollection.AddRepository(_configuration);
        serviceCollection.AddInitDbSeed(_configuration);
        serviceCollection.AddMemoryCache();

        // NOTE(review): treated as live code because _ServiceProvider is used right below —
        // confirm against the uncollapsed original.
        _ServiceProvider = serviceCollection.BuildServiceProvider();

        _DouBanBookRepository = _ServiceProvider.GetService<DouBanBookService>();
        _Db = _ServiceProvider.GetService<SqlSugarClient>();

        // Batch crawlers are registered under one interface and picked by concrete type name.
        var list = _ServiceProvider.GetService<IEnumerable<ICrawlerBatchBook>>();
        _LatestBatchBook = list.FirstOrDefault(a => a.GetType().Name == "BookLatestCrawler");
        _TagList = list.FirstOrDefault(a => a.GetType().Name == "TagListCrawler");

        _DetailBook = _ServiceProvider.GetService<ICrawlerBook>();
        _CrawlerTag = _ServiceProvider.GetService<ICrawlerTag>();
        _DaReTouCrawler = _ServiceProvider.GetService<DaReTouCrawler>();
    }
    catch (Exception ex)
    {
        NLogUtil.ErrorTxt($"Init Error:{ex.Message}", true);
    }
}
/// <summary>
/// Runs the douban tag crawl plan: fetches the tag list, initialises or loads the plans, then
/// walks every plan page by page, storing the crawled books and the plan progress.
/// </summary>
/// <remarks>
/// Each plan is advanced by <c>CrawlerSetting.DB_TagList_Step</c> until its page index reaches
/// <c>CrawlerSetting.DB_MaxIndex_TagList</c>, and is saved after every page.
/// </remarks>
/// <exception cref="ExceptionProxyConnect">
/// Rethrown unchanged so the caller can react to it; other exceptions raised while
/// processing the plans are logged and end the run.
/// </exception>
public void run()
{
    NLogUtil.InfoTxt("开始豆瓣爬书计划");
    var taglist = _TagsCrawler.getUrls("");
    if (taglist == null || taglist.Count <= 0)
    {
        return;
    }

    var allList = _DouBanBookRepository.InitPlanFromTagUrl(taglist);
    try
    {
        foreach (var plan in allList)
        {
            while (plan.ProcessPageIndex < CrawlerSetting.DB_MaxIndex_TagList)
            {
                NLogUtil.InfoTxt($"豆瓣爬书计划-TagCode:{plan.TagCode},Index:{plan.ProcessPageIndex}");
                var url = $"{DouBanBookBaseCrawlerData.DouBanBookPrefix}/tag/{plan.TagCode}?start={plan.ProcessPageIndex}&type=T";
                List<BookDetail_middle> bnList = _TagListCrawler.Crawler(url);
                HandleBookMiddleList(bnList);

                // Progress is saved after every page.
                plan.ProcessPageIndex += CrawlerSetting.DB_TagList_Step;
                _DouBanBookRepository.UpdatePlan(plan);
            }
        }
    }
    catch (ExceptionProxyConnect)
    {
        // A bare throw keeps the original stack trace; "throw epc;" would reset it.
        throw;
    }
    catch (Exception ex)
    {
        NLogUtil.ErrorTxt($"【错误】豆瓣爬书计划-TagList:{ex.Message}");
    }
}
/// <summary>
/// Stores one crawled book: author, book info, tags and the book-tag relations are written
/// through a single <c>UseTranAsync</c> call.
/// </summary>
/// <remarks>
/// Failures of the author, book and tag writes are each logged and do not stop the remaining
/// steps. The log messages read the book code and title with null-conditional access so that
/// logging (including the one in the outer catch) cannot itself throw when
/// <paramref name="middle"/> or its book info is missing.
/// </remarks>
/// <param name="middle">Crawled intermediate book data.</param>
/// <returns>
/// The <c>IsSuccess</c> flag of the transaction result, or <c>false</c> when an exception is caught.
/// </returns>
public async Task<bool> HandleBookMiddleAsync(BookDetail_middle middle)
{
    try
    {
        NLogUtil.InfoTxt($"开始处理书本到数据库:{middle?.DouBanBookInfo?.Code}-{middle?.DouBanBookInfo?.Title}");
        VerifyBookData(middle);

        var rAll = await _Db.Ado.UseTranAsync(() =>
        {
            try
            {
                // Book author
                _PersonDb.AddOrUpdate_MasterData<EPerson>(middle.Author);
            }
            catch (Exception ex)
            {
                NLogUtil.ErrorTxt($"【错误】写入人物:{ex.Message}");
            }

            try
            {
                // Book information
                _BookDb.AddOrUpdate_MasterData<EBookInfo>(middle.DouBanBookInfo);
            }
            catch (Exception ex)
            {
                NLogUtil.ErrorTxt($"【错误】写入书本:{ex.Message}");
            }

            try
            {
                // Book tags
                _TagDb.AddOrUpdate_MasterData<ETag>(middle.tagList);
            }
            catch (Exception ex)
            {
                NLogUtil.ErrorTxt($"【错误】写入更新Tags:{ex.Message}");
            }

            // Relations between the book and its tags.
            // NOTE(review): treated as live code, with the Section / DataSection steps
            // commented out — confirm against the uncollapsed original.
            HandleBookTag(middle.GetBookTags());
        });

        if (!rAll.IsSuccess)
        {
            NLogUtil.ErrorTxt($"[数据库]录入书本失败:{middle?.DouBanBookInfo?.Code}-{middle?.DouBanBookInfo?.Title} -- {rAll.ErrorMessage}");
        }
        else
        {
            NLogUtil.InfoTxt($"【成功】处理书本到数据库:{middle?.DouBanBookInfo?.Code}-{middle?.DouBanBookInfo?.Title}");
        }

        return rAll.IsSuccess;
    }
    catch (Exception ex)
    {
        NLogUtil.ErrorTxt($"[数据库]录入书本失败:{middle?.DouBanBookInfo?.Code}-{middle?.DouBanBookInfo?.Title}--{ex.Message}");
        return false;
    }
}
/// <summary>
/// Crawls one book detail page and fills the current book detail data: title, cover URLs,
/// the properties in the info box, introduction, author, catalog, score and tags.
/// </summary>
/// <param name="entryUrl">URL of the book detail page.</param>
/// <returns>
/// The filled book detail data, or <c>null</c> when the page has no title node or no
/// info properties.
/// </returns>
public BookDetail_middle Crawler(string entryUrl = "")
{
    InitData(entryUrl);
    // The stray '$' that was printed in front of the URL has been removed.
    NLogUtil.InfoTxt($"Crawler Book:{_entryUrl}");

    var bi = _bookDetailData.DouBanBookInfo;
    HtmlDocument htmlDoc = getDocbyEntryUrl(entryUrl);
    VerifyHeader(htmlDoc);

    var bn = htmlDoc.DocumentNode.SelectSingleNode("//div[@id='wrapper']/h1/span");
    if (bn == null)
    {
        return null;
    }

    bi.Title = bn.InnerText;

    // Cover images are optional: a page without them no longer aborts the crawl.
    var coverLink = htmlDoc.DocumentNode.SelectSingleNode("//div[@id='mainpic']/a");
    if (coverLink != null)
    {
        var href = coverLink.Attributes["href"];
        if (href != null)
        {
            bi.CoverUrl_Big = href.Value; // large image
        }

        // ".//img" searches below the cover link; "//img" would start at the document
        // root and return the first image of the whole page.
        var src = coverLink.SelectSingleNode(".//img")?.Attributes["src"];
        if (src != null)
        {
            bi.CoverUrl = src.Value; // small image
        }
    }

    var info = htmlDoc.DocumentNode.SelectSingleNode("//div[@id='info']");
    var infoAttrs = info?.SelectNodes(".//span");
    if (infoAttrs == null)
    {
        return null;
    }

    foreach (var span in infoAttrs)
    {
        try
        {
            AnalyInfo(span);
        }
        catch (Exception ex)
        {
            // One unparsable property must not stop the remaining ones.
            NLogUtil.ErrorTxt($"BookDetailCrawler Book Property-{_DouBanBookId}:[{span.InnerHtml}]{ex.Message}");
        }
    }

    AnalyContent(htmlDoc); // introduction
    AnalyAuthor(htmlDoc);  // author
    AnalyCatalog(htmlDoc); // catalog
    AnalyScore(htmlDoc);   // score
    AnalyTags(htmlDoc);    // tags

    return _bookDetailData;
}