/// <summary>
/// Crawls the Douban "new books" listing, stores every book whose detail page could be
/// parsed, and finally hands the collected book/section links to <c>CoverSection</c>.
/// </summary>
public async void Run()
{
    NLogUtil.InfoTxt("开始批量【豆瓣新书】", true);

    var latestEntries = _LatestCrawler.CrawlerUrls("");
    var sectionLinks = new List<EDataSection>();

    foreach (var entry in latestEntries)
    {
        var detail = _DetailCrawler.Crawler(entry.DetailUrl);
        if (detail == null)
        {
            // The detail crawler produced nothing for this URL; move on to the next one.
            continue;
        }

        // Carry the fiction / non-fiction flag over from the listing entry.
        detail.DouBanBookInfo.FictionType = entry.FictionType;

        // Link the book to the "new express" (latest) section.
        var newExpress = _DouBanBookRepository.GetSection_NewExpress();
        var link = DataSectionRepository.newModelInstance(newExpress.Code, detail.DouBanBookInfo.Code);
        sectionLinks.Add(link);

        await _DouBanBookRepository.HandleBookMiddleAsync(detail);
    }

    await _DouBanBookRepository.CoverSection(sectionLinks);
    NLogUtil.InfoTxt("结束批量【豆瓣新书】", true);
}
/// <summary>
/// Resolves the crawler services from the service provider and runs the
/// "plan from tags" task, starting the plan over when a run is aborted by an
/// <c>ExceptionProxyConnect</c>.
/// </summary>
/// <remarks>
/// At most two runs are attempted (<c>tryNum</c> 1 and 2). The loop ends as soon as a
/// run completes without a proxy connection failure.
/// </remarks>
private static void RunPlan()
{
    int tryNum = 1;
    while (tryNum < 3)
    {
        try
        {
            _DouBanBookRepository = _ServiceProvider.GetService<DouBanBookService>();
            var list = _ServiceProvider.GetService<IEnumerable<ICrawlerBatchBook>>();
            _TagList = list.FirstOrDefault(a => a.GetType().Name == "TagListCrawler");
            _DetailBook = _ServiceProvider.GetService<ICrawlerBook>();
            _CrawlerTag = _ServiceProvider.GetService<ICrawlerTag>();

            PlanFromTagsTask planFromTagsTask = new PlanFromTagsTask(
                _CrawlerTag, _TagList, _DetailBook, _DouBanBookRepository);
            planFromTagsTask.run();

            // The run finished without a proxy failure. tryNum only changes in the catch
            // below, so without this break a successful run would be repeated forever.
            break;
        }
        catch (ExceptionProxyConnect epc)
        {
            NLogUtil.ErrorTxt($"代理连接错误:{epc.Message}");
            // Logs the number of the attempt that just failed, then advances the counter.
            NLogUtil.InfoTxt($"开始尝试第{tryNum++}次运行计划");
        }
    }

    NLogUtil.InfoTxt($"第{tryNum}次运行计划后结束");
}
/// <summary>
/// Loads the HTML document at <paramref name="entryUrl"/>, retrying once after
/// dropping the cached proxy host when the first attempt fails.
/// </summary>
/// <param name="entryUrl">URL passed to <c>newLoadWeb</c>.</param>
/// <returns>The document returned by <c>newLoadWeb</c>.</returns>
/// <exception cref="ExceptionProxyConnect">
/// Propagated unchanged when the first load throws it, or raised when the retry fails too.
/// </exception>
public HtmlDocument getDocbyEntryUrl(string entryUrl)
{
    HtmlDocument htmlDoc = null;
    try
    {
        htmlDoc = newLoadWeb(entryUrl);
    }
    catch (ExceptionProxyConnect)
    {
        // Plain rethrow keeps the original stack trace ("throw epc;" would reset it).
        throw;
    }
    catch
    {
        NLogUtil.InfoTxt("Connect Error,Auto Try Next Proxy Connect");
        try
        {
            ProxyManager.RemoveProxyHostCache();
            htmlDoc = newLoadWeb(entryUrl);
        }
        catch (Exception ex)
        {
            // Record the underlying cause before it is replaced by the proxy exception.
            NLogUtil.ErrorTxt($"Connect Error while LoadDoc:{ex.Message}");
            throw new ExceptionProxyConnect("Connect Error while LoadDoc");
        }
    }

    return htmlDoc;
}
/// <summary>
/// Returns the Douban tag crawl plans. When plans already exist in the database they
/// are queried and returned; otherwise plans are built from <paramref name="tagLists"/>
/// and written to the database together with the section/tag links.
/// </summary>
/// <param name="tagLists">Sections with their tags, as produced by the tag crawler.</param>
/// <returns>
/// The stored plans, or the newly built plans. The newly built list is returned even
/// when persisting it failed (the failure is only logged). The list is empty when no
/// plan exists and <paramref name="tagLists"/> is null or empty.
/// </returns>
public List<EPlan_FromDouBanTagUrls> InitPlanFromTagUrl(List<Secction_Tag> tagLists)
{
    List<EPlan_FromDouBanTagUrls> result = new List<EPlan_FromDouBanTagUrls>();

    if (_PlanFTURepository.IsExistPlan())
    {
        NLogUtil.InfoTxt("DouBan Plan 已经存在数据库中!");
        // Original note: the Douban tag list can only be fetched within the first 1000 entries.
        result = _PlanFTURepository.QueryPlan(GenCodeHelper.Plan_FromDouBanTagUrls);
        return result;
    }

    NLogUtil.InfoTxt("豆瓣计划写入到数据库");
    if (tagLists == null || tagLists.Count == 0)
    {
        return result;
    }

    List<EDataSection> dsList = new List<EDataSection>();
    var allSection = _SectionDb.AllSection();

    foreach (Secction_Tag st in tagLists)
    {
        ESection section;
        try
        {
            section = allSection[st.sectionName];
        }
        catch
        {
            // Lookup failed (e.g. unknown section name): plans are still created for
            // the tags, they are just not linked to a section.
            section = null;
        }

        foreach (var tag in st.TagList)
        {
            result.Add(PlanFTURepository.NewModelInstance(tag.Name, tag.Url));
            if (section != null)
            {
                dsList.Add(DataSectionRepository.newModelInstance(section.Code, tag.Code));
            }
        }
    }

    // The transaction callback must finish its work before UseTran returns. An async
    // lambda would return at its first await, so the transaction could be committed
    // before CoverNewSectionCodeAsync completes and a later failure would not roll
    // anything back. Waiting synchronously keeps both writes inside the transaction.
    var rAll = _Db.Ado.UseTran(() =>
    {
        _DataSectionDb.CoverNewSectionCodeAsync(dsList).GetAwaiter().GetResult();
        _PlanFTURepository.CoverPlans(GenCodeHelper.Plan_FromDouBanTagUrls, result);
    });

    if (rAll.IsSuccess)
    {
        NLogUtil.InfoTxt("【成功】豆瓣计划已在数据库初始化");
    }
    else
    {
        NLogUtil.ErrorTxt($"【失败】豆瓣计划:{rAll.ErrorMessage}");
    }

    return result;
}
/// <summary>
/// Crawls the Douban "all tags" page and returns its sections with the tags listed
/// under each of them.
/// </summary>
/// <param name="entryUrl">Page to crawl; <c>DouBanTagsUrl</c> is used when null or empty.</param>
/// <returns>
/// The sections parsed so far. Parsing errors other than <c>ExceptionProxyConnect</c>
/// are logged and end the parse, so the list may be empty or partial.
/// </returns>
/// <exception cref="ExceptionProxyConnect">
/// Propagated from loading/verifying the page, or raised when the expected
/// <c>div.article</c> container is missing from the page.
/// </exception>
public List<Secction_Tag> getUrls(string entryUrl)
{
    if (string.IsNullOrEmpty(entryUrl))
    {
        entryUrl = DouBanTagsUrl;
    }

    List<Secction_Tag> result = new List<Secction_Tag>();
    NLogUtil.InfoTxt("开始抓爬豆瓣 All Tag");
    try
    {
        HtmlDocument doc = getDocbyEntryUrl(entryUrl);
        VerifyHeader(doc);

        // Check the collection BEFORE indexing it. Indexing first throws
        // (NullReferenceException on a null collection, ArgumentOutOfRangeException on
        // a short one), which made the original "root == null" check unreachable and
        // turned a missing container into a silently swallowed error.
        var articleDivs = doc.DocumentNode.SelectNodes("//div[@class='article']/div");
        if (articleDivs == null || articleDivs.Count < 2)
        {
            throw new ExceptionProxyConnect("getUrls Null");
        }
        var root = articleDivs[1];

        var secNodes = root.SelectNodes(".//div");
        foreach (var sec in secNodes)
        {
            // The section name is read from the "name" attribute of the section's first anchor.
            var secInfo = sec.SelectSingleNode(".//a");
            Secction_Tag secTag = new Secction_Tag();
            secTag.sectionName = secInfo.Attributes["name"].Value;
            result.Add(secTag);

            var allHref = sec.SelectNodes(".//table[@class='tagCol']//a");
            foreach (var tagnode in allHref)
            {
                secTag.TagList.Add(newTag(tagnode.InnerText.Trim()));
            }
        }
    }
    catch (ExceptionProxyConnect)
    {
        // Plain rethrow keeps the original stack trace ("throw epc;" would reset it).
        throw;
    }
    catch (Exception ex)
    {
        NLogUtil.InfoTxt($"抓爬豆瓣错误:{ex.Message}");
    }

    NLogUtil.InfoTxt("抓爬豆瓣Tag结束");
    return result;
}
/// <summary>
/// Crawls a tag list page: collects the book URLs found via <c>CrawlerUrls</c> and
/// converts each of them with <c>ToDetail</c>.
/// </summary>
/// <param name="entryUrl">URL of the tag list page; forwarded to <c>CrawlerUrls</c>.</param>
/// <returns>The non-null results of <c>ToDetail</c>, in listing order.</returns>
public List<BookDetail_middle> Crawler(string entryUrl = "")
{
    NLogUtil.InfoTxt($"[开始]抓爬TagList{entryUrl}");

    var listedBooks = CrawlerUrls(entryUrl);
    var details = new List<BookDetail_middle>();

    foreach (var listed in listedBooks)
    {
        var detail = ToDetail(listed.DetailUrl, _CrawlerBook);
        if (detail == null)
        {
            // Nothing usable came back for this book; leave it out of the result.
            continue;
        }

        details.Add(detail);
    }

    NLogUtil.InfoTxt($"[结束]抓爬TagList{entryUrl}");
    return details;
}
/// <summary>
/// Crawls a single book page and stores the result through the book repository.
/// </summary>
/// <param name="url">URL of the book detail page.</param>
/// <remarks>
/// Exceptions thrown by the crawler (including <c>ExceptionProxyConnect</c>) are not
/// handled here and propagate out of the method; the former catch block only rethrew.
/// </remarks>
public async void runAsync(string url)
{
    NLogUtil.InfoTxt($"开始抓爬单本书:{url}");

    BookDetail_middle task_midData = await _DetailCrawler.CrawlerAsync(url);

    NLogUtil.InfoTxt($"抓爬结束");

    // Mirror the batch Run(): only hand data to the repository when the crawler
    // actually produced something.
    if (task_midData == null)
    {
        NLogUtil.InfoTxt($"No book data crawled, skip saving:{url}");
        return;
    }

    await _DouBanBookRepository.HandleBookMiddleAsync(task_midData);
}
/// <summary>
/// Runs the Douban crawl plan: fetches all tags, loads (or initialises) the per-tag
/// plans and then crawls every tag list page by page, saving the progress of each
/// plan after every page.
/// </summary>
/// <exception cref="ExceptionProxyConnect">
/// Propagated unchanged so the caller can decide whether to restart the plan.
/// </exception>
public void run()
{
    NLogUtil.InfoTxt("开始豆瓣爬书计划");

    var taglist = _TagsCrawler.getUrls("");
    if (taglist == null || taglist.Count == 0)
    {
        return;
    }

    var allList = _DouBanBookRepository.InitPlanFromTagUrl(taglist);
    try
    {
        foreach (var plan in allList)
        {
            while (plan.ProcessPageIndex < CrawlerSetting.DB_MaxIndex_TagList)
            {
                NLogUtil.InfoTxt($"豆瓣爬书计划-TagCode:{plan.TagCode},Index:{plan.ProcessPageIndex}");
                var url = $"{DouBanBookBaseCrawlerData.DouBanBookPrefix}/tag/{plan.TagCode}?start={plan.ProcessPageIndex}&type=T";

                List<BookDetail_middle> bnList = _TagListCrawler.Crawler(url);
                HandleBookMiddleList(bnList);

                // Persist the new offset after every page.
                plan.ProcessPageIndex += CrawlerSetting.DB_TagList_Step;
                _DouBanBookRepository.UpdatePlan(plan);
            }
        }
    }
    catch (ExceptionProxyConnect)
    {
        // Plain rethrow keeps the original stack trace ("throw epc;" would reset it).
        throw;
    }
    catch (Exception ex)
    {
        NLogUtil.ErrorTxt($"【错误】豆瓣爬书计划-TagList:{ex.Message}");
    }
}
/// <summary>
/// Stores the crawled intermediate ("middle") data of one book: author, book info,
/// tags and the book/tag relations are written inside a single database transaction.
/// </summary>
/// <param name="middle">Crawled book data; must carry a <c>DouBanBookInfo</c>.</param>
/// <returns>
/// <c>true</c> when the transaction reports success; <c>false</c> when the input is
/// missing, the transaction fails, or any exception is raised (all cases are logged).
/// </returns>
public async Task<bool> HandleBookMiddleAsync(BookDetail_middle middle)
{
    // Every log message below dereferences middle.DouBanBookInfo, including the one in
    // the outer catch. Without this guard a null input made that catch throw a second
    // NullReferenceException instead of returning false.
    if (middle == null || middle.DouBanBookInfo == null)
    {
        NLogUtil.ErrorTxt("HandleBookMiddleAsync: no book data to store");
        return false;
    }

    try
    {
        NLogUtil.InfoTxt($"开始处理书本到数据库:{middle.DouBanBookInfo.Code}-{middle.DouBanBookInfo.Title}");
        VerifyBookData(middle);

        var rAll = await _Db.Ado.UseTranAsync(() =>
        {
            // Each master-data write is best effort: a failure is logged and the
            // remaining writes still run.
            try
            {
                // Book author.
                _PersonDb.AddOrUpdate_MasterData<EPerson>(middle.Author);
            }
            catch (Exception ex)
            {
                NLogUtil.ErrorTxt($"【错误】写入人物:{ex.Message}");
            }

            try
            {
                // Book information.
                _BookDb.AddOrUpdate_MasterData<EBookInfo>(middle.DouBanBookInfo);
            }
            catch (Exception ex)
            {
                NLogUtil.ErrorTxt($"【错误】写入书本:{ex.Message}");
            }

            try
            {
                // Book tags.
                _TagDb.AddOrUpdate_MasterData<ETag>(middle.tagList);
            }
            catch (Exception ex)
            {
                NLogUtil.ErrorTxt($"【错误】写入更新Tags:{ex.Message}");
            }

            // Relations between the book and its tags.
            HandleBookTag(middle.GetBookTags());
        });

        if (!rAll.IsSuccess)
        {
            NLogUtil.ErrorTxt($"[数据库]录入书本失败:{middle.DouBanBookInfo.Code}-{middle.DouBanBookInfo.Title} -- {rAll.ErrorMessage}");
        }
        else
        {
            NLogUtil.InfoTxt($"【成功】处理书本到数据库:{middle.DouBanBookInfo.Code}-{middle.DouBanBookInfo.Title}");
        }

        return rAll.IsSuccess;
    }
    catch (Exception ex)
    {
        NLogUtil.ErrorTxt($"[数据库]录入书本失败:{middle.DouBanBookInfo.Code}-{middle.DouBanBookInfo.Title}--{ex.Message}");
        return false;
    }
}
/// <summary>
/// Crawls one Douban book detail page and fills the crawler's book data with the
/// title, cover URLs, the properties of the info box, summary, author, catalog,
/// score and tags.
/// </summary>
/// <param name="entryUrl">URL of the book detail page.</param>
/// <returns>
/// The populated book data, or <c>null</c> when the page has no title node or no
/// info box / info properties.
/// </returns>
public BookDetail_middle Crawler(string entryUrl = "")
{
    InitData(entryUrl);
    // "{...}" is the C# interpolation hole; the former "${...}" printed a stray '$'.
    NLogUtil.InfoTxt($"Crawler Book:{_entryUrl}");

    var bookInfo = _bookDetailData.DouBanBookInfo;
    HtmlDocument htmlDoc = getDocbyEntryUrl(entryUrl);
    VerifyHeader(htmlDoc);

    var titleNode = htmlDoc.DocumentNode.SelectSingleNode("//div[@id='wrapper']/h1/span");
    if (titleNode == null)
    {
        return null;
    }
    bookInfo.Title = titleNode.InnerText;

    var coverLink = htmlDoc.DocumentNode.SelectSingleNode("//div[@id='mainpic']/a");
    if (coverLink != null)
    {
        // Large cover image.
        bookInfo.CoverUrl_Big = coverLink.Attributes["href"].Value;

        // ".//img" searches below the cover link; a leading "//" is evaluated from the
        // document root and would return the first <img> of the whole page.
        var coverImage = coverLink.SelectSingleNode(".//img");
        if (coverImage != null)
        {
            // Small cover image.
            bookInfo.CoverUrl = coverImage.Attributes["src"].Value;
        }
    }

    var info = htmlDoc.DocumentNode.SelectSingleNode("//div[@id='info']");
    if (info == null)
    {
        return null;
    }

    var infoAttrs = info.SelectNodes(".//span");
    if (infoAttrs == null)
    {
        return null;
    }

    foreach (var span in infoAttrs)
    {
        try
        {
            AnalyInfo(span);
        }
        catch (Exception ex)
        {
            // A property that cannot be parsed is logged and skipped.
            NLogUtil.ErrorTxt($"BookDetailCrawler Book Property-{_DouBanBookId}:[{span.InnerHtml}]{ex.Message}");
        }
    }

    AnalyContent(htmlDoc);  // summary
    AnalyAuthor(htmlDoc);   // author
    AnalyCatalog(htmlDoc);  // catalog
    AnalyScore(htmlDoc);    // score
    AnalyTags(htmlDoc);     // tags

    return _bookDetailData;
}