Пример #1
0
        /// <summary>
        /// Crawls the URL list returned by the latest-books crawler, persists every
        /// book whose detail page could be parsed, and finally replaces the content
        /// of the "new express" section with the books collected in this run.
        /// </summary>
        /// <remarks>
        /// NOTE(review): this method is <c>async void</c>, so callers can neither await
        /// it nor catch exceptions it throws. The signature is kept for compatibility;
        /// consider returning <c>Task</c> once all call sites can be updated.
        /// </remarks>
        public async void Run()
        {
            NLogUtil.InfoTxt("开始批量【豆瓣新书】", true);
            var list = _LatestCrawler.CrawlerUrls("");
            List<EDataSection> secList = new List<EDataSection>();

            // The target section does not depend on the book being processed,
            // so look it up once instead of once per crawled book.
            var dataSection = _DouBanBookRepository.GetSection_NewExpress();

            foreach (var url in list)
            {
                var midData = _DetailCrawler.Crawler(url.DetailUrl);
                if (midData == null)
                {
                    // The detail crawler produced no data for this URL; skip it.
                    continue;
                }

                // Carry the fiction / non-fiction flag over from the listing entry.
                midData.DouBanBookInfo.FictionType = url.FictionType;

                // Remember the book for the section update performed after the loop.
                secList.Add(DataSectionRepository.newModelInstance(dataSection.Code, midData.DouBanBookInfo.Code));

                await _DouBanBookRepository.HandleBookMiddleAsync(midData);
            }

            await _DouBanBookRepository.CoverSection(secList);

            NLogUtil.InfoTxt("结束批量【豆瓣新书】", true);
        }
Пример #2
0
        /// <summary>
        /// Resolves the crawler services from the service provider and runs the
        /// tag-based crawl plan. When the plan aborts with an
        /// <c>ExceptionProxyConnect</c>, it is retried while the attempt counter is
        /// below 3.
        /// </summary>
        private static void RunPlan()
        {
            int tryNum = 1;

            while (tryNum < 3)
            {
                try
                {
                    _DouBanBookRepository = _ServiceProvider.GetService<DouBanBookService>();

                    var list = _ServiceProvider.GetService<IEnumerable<ICrawlerBatchBook>>();

                    // Pick the batch crawler by its concrete type name.
                    _TagList    = list.FirstOrDefault(a => a.GetType().Name == "TagListCrawler");
                    _DetailBook = _ServiceProvider.GetService<ICrawlerBook>();
                    _CrawlerTag = _ServiceProvider.GetService<ICrawlerTag>();

                    PlanFromTagsTask planFromTagsTask = new PlanFromTagsTask(
                        _CrawlerTag,
                        _TagList,
                        _DetailBook,
                        _DouBanBookRepository);

                    planFromTagsTask.run();

                    // The plan finished without a proxy failure. Without this break the
                    // counter never changes and the loop would re-run the plan forever.
                    break;
                }
                catch (ExceptionProxyConnect epc)
                {
                    NLogUtil.ErrorTxt($"代理连接错误:{epc.Message}");
                    NLogUtil.InfoTxt($"开始尝试第{tryNum++}次运行计划");
                }
            }
            NLogUtil.InfoTxt($"第{tryNum}次运行计划后结束");
        }
Пример #3
0
        /// <summary>
        /// Loads the HTML document for <paramref name="entryUrl"/>. If the first load
        /// fails with anything other than <c>ExceptionProxyConnect</c>, the cached proxy
        /// host is removed and the load is attempted one more time.
        /// </summary>
        /// <param name="entryUrl">URL of the page to load.</param>
        /// <returns>The document returned by <c>newLoadWeb</c>.</returns>
        /// <exception cref="ExceptionProxyConnect">
        /// Thrown by the first load attempt (passed through unchanged), or raised when
        /// the retry after dropping the cached proxy host fails as well.
        /// </exception>
        public HtmlDocument getDocbyEntryUrl(string entryUrl)
        {
            HtmlDocument htmlDoc = null;

            try
            {
                htmlDoc = newLoadWeb(entryUrl);
            }
            catch (ExceptionProxyConnect)
            {
                // Rethrow with "throw;" so the original stack trace is preserved.
                throw;
            }
            catch
            {
                NLogUtil.InfoTxt("Connect Error,Auto Try Next Proxy Connect");
                try
                {
                    ProxyManager.RemoveProxyHostCache();
                    htmlDoc = newLoadWeb(entryUrl);
                }
                catch (Exception ex)
                {
                    // Record the underlying reason before it is replaced by the
                    // generic proxy exception below; otherwise it is lost.
                    NLogUtil.ErrorTxt($"Connect Error while LoadDoc:{ex.Message}");
                    throw new ExceptionProxyConnect("Connect Error while LoadDoc");
                }
            }

            return(htmlDoc);
        }
        /// <summary>
        /// Returns the crawl plan derived from the Douban tag list. When no plan is
        /// stored yet, one plan entry is created per tag, the section/tag relations are
        /// collected for sections that can be resolved by name, and both are written
        /// inside one transaction. When a plan already exists it is read back from the
        /// repository instead.
        /// </summary>
        /// <param name="tagLists">Sections with their tags; may be null or empty.</param>
        /// <returns>
        /// The newly built plan entries, the stored plan, or an empty list when no plan
        /// exists and <paramref name="tagLists"/> is null or empty.
        /// </returns>
        public List <EPlan_FromDouBanTagUrls> InitPlanFromTagUrl(List <Secction_Tag> tagLists)
        {
            List<EPlan_FromDouBanTagUrls> result = new List<EPlan_FromDouBanTagUrls>();

            if (!_PlanFTURepository.IsExistPlan())
            {
                NLogUtil.InfoTxt("豆瓣计划写入到数据库");
                if (tagLists != null && tagLists.Count > 0)
                {
                    List<EDataSection> dsList = new List<EDataSection>();

                    var allSection = _SectionDb.AllSection();
                    foreach (Secction_Tag st in tagLists)
                    {
                        ESection section = null;
                        try
                        {
                            section = allSection[st.sectionName];
                        }
                        catch
                        {
                            // Any lookup failure means "no matching section"; the tags
                            // are still planned, just without a section relation.
                            section = null;
                        }

                        if (st.TagList == null)
                        {
                            // Nothing to plan for a section that carries no tags.
                            continue;
                        }

                        foreach (var tag in st.TagList)
                        {
                            result.Add(PlanFTURepository.NewModelInstance(tag.Name, tag.Url));
                            if (section != null)
                            {
                                dsList.Add(DataSectionRepository.newModelInstance(section.Code, tag.Code));
                            }
                        }
                    }

                    // The delegate must be synchronous. An async lambda returns to
                    // UseTran at its first await, so the transaction would be
                    // finished (and reported as successful) before the work below
                    // has actually run, and later failures would go unobserved.
                    // NOTE(review): blocking is acceptable without a synchronization
                    // context (console host assumed) - confirm for other hosts.
                    var rAll = _Db.Ado.UseTran(() =>
                    {
                        _DataSectionDb.CoverNewSectionCodeAsync(dsList).GetAwaiter().GetResult();
                        _PlanFTURepository.CoverPlans(GenCodeHelper.Plan_FromDouBanTagUrls, result);
                    });
                    if (rAll.IsSuccess)
                    {
                        NLogUtil.InfoTxt("【成功】豆瓣计划已在数据库初始化");
                    }
                    else
                    {
                        NLogUtil.ErrorTxt($"【失败】豆瓣计划:{rAll.ErrorMessage}");
                    }
                }
            }
            else
            {
                NLogUtil.InfoTxt("DouBan Plan 已经存在数据库中!");
                // The Douban tag list can only be fetched up to 1000 entries.
                result = _PlanFTURepository.QueryPlan(GenCodeHelper.Plan_FromDouBanTagUrls);
            }
            return(result);
        }
Пример #5
0
        /// <summary>
        /// Crawls the Douban tag overview page and returns one <c>Secction_Tag</c> per
        /// section found, each filled with the tags listed in that section.
        /// </summary>
        /// <param name="entryUrl">
        /// Page to crawl; when null or empty, <c>DouBanTagsUrl</c> is used.
        /// </param>
        /// <returns>
        /// The sections parsed so far. Errors other than <c>ExceptionProxyConnect</c>
        /// are logged and the (possibly partial or empty) list is returned.
        /// </returns>
        /// <exception cref="ExceptionProxyConnect">
        /// Passed through from the page load, or thrown when the expected root
        /// container is missing from the page.
        /// </exception>
        public List <Secction_Tag> getUrls(string entryUrl)
        {
            if (string.IsNullOrEmpty(entryUrl))
            {
                entryUrl = DouBanTagsUrl;
            }
            List<Secction_Tag> result = new List<Secction_Tag>();

            NLogUtil.InfoTxt("开始抓爬豆瓣  All Tag");
            try
            {
                HtmlDocument doc = getDocbyEntryUrl(entryUrl);
                VerifyHeader(doc);

                // SelectNodes yields null when nothing matches, and indexing a short
                // collection throws. Check both before taking the second div so the
                // intended ExceptionProxyConnect is raised instead of a generic error
                // that the catch-all below would merely log.
                var articleDivs = doc.DocumentNode.SelectNodes("//div[@class='article']/div");
                if (articleDivs == null || articleDivs.Count < 2 || articleDivs[1] == null)
                {
                    throw new ExceptionProxyConnect("getUrls Null");
                }
                var root = articleDivs[1];

                var secNodes = root.SelectNodes(".//div");
                if (secNodes != null)
                {
                    foreach (var sec in secNodes)
                    {
                        var secInfo  = sec.SelectSingleNode(".//a");
                        var nameAttr = secInfo == null ? null : secInfo.Attributes["name"];
                        if (nameAttr == null)
                        {
                            // Not a section container (no named anchor): skip it rather
                            // than aborting and losing every remaining section.
                            continue;
                        }

                        Secction_Tag secTag = new Secction_Tag();
                        secTag.sectionName = nameAttr.Value;
                        result.Add(secTag);

                        var allHref = sec.SelectNodes(".//table[@class='tagCol']//a");
                        if (allHref == null)
                        {
                            // The section has no tag table; keep it with an empty tag list.
                            continue;
                        }
                        foreach (var tagnode in allHref)
                        {
                            secTag.TagList.Add(newTag(tagnode.InnerText.Trim()));
                        }
                    }
                }
            }
            catch (ExceptionProxyConnect)
            {
                // Rethrow with "throw;" so the original stack trace is preserved.
                throw;
            }
            catch (Exception ex)
            {
                NLogUtil.InfoTxt($"抓爬豆瓣错误:{ex.Message}");
            }
            NLogUtil.InfoTxt("抓爬豆瓣Tag结束");
            return(result);
        }
Пример #6
0
        /// <summary>
        /// Resolves the book URLs for <paramref name="entryUrl"/> via
        /// <c>CrawlerUrls</c> and converts each one to book detail data with
        /// <c>ToDetail</c>, dropping entries for which no detail was produced.
        /// </summary>
        /// <param name="entryUrl">Listing page to crawl; passed to <c>CrawlerUrls</c> unchanged.</param>
        /// <returns>The non-null detail results, in listing order.</returns>
        public List <BookDetail_middle> Crawler(string entryUrl = "")
        {
            NLogUtil.InfoTxt($"[开始]抓爬TagList{entryUrl}");

            var listingEntries = CrawlerUrls(entryUrl);
            var details        = new List<BookDetail_middle>();

            foreach (var entry in listingEntries)
            {
                var detail = ToDetail(entry.DetailUrl, _CrawlerBook);
                if (detail == null)
                {
                    // No detail data for this URL; leave it out of the result.
                    continue;
                }
                details.Add(detail);
            }

            NLogUtil.InfoTxt($"[结束]抓爬TagList{entryUrl}");
            return details;
        }
Пример #7
0
        /// <summary>
        /// Crawls a single book detail page and hands the result to the repository
        /// for persistence.
        /// </summary>
        /// <param name="url">URL of the book detail page.</param>
        /// <remarks>
        /// NOTE(review): this method is <c>async void</c>, so callers can neither await
        /// it nor catch exceptions it throws. The signature is kept for compatibility;
        /// consider returning <c>Task</c> once all call sites can be updated.
        /// </remarks>
        public async void runAsync(string url)
        {
            NLogUtil.InfoTxt($"开始抓爬单本书:{url}");

            // No try/catch here: the previous handler only rethrew the exception with
            // "throw epc;", which discarded the original stack trace.
            BookDetail_middle task_midData = await _DetailCrawler.CrawlerAsync(url);

            NLogUtil.InfoTxt($"抓爬结束");

            if (task_midData == null)
            {
                // Nothing was crawled; do not pass null on to the persistence step.
                NLogUtil.InfoTxt($"抓爬单本书无数据,跳过入库:{url}");
                return;
            }

            await _DouBanBookRepository.HandleBookMiddleAsync(task_midData);
        }
Пример #8
0
        /// <summary>
        /// Runs the Douban crawl plan: fetches all tags, initialises (or loads) the
        /// plan entries, then walks every plan page by page until its page index
        /// reaches <c>CrawlerSetting.DB_MaxIndex_TagList</c>, storing the crawled books
        /// and the updated page index after each page.
        /// </summary>
        /// <exception cref="ExceptionProxyConnect">
        /// Passed through unchanged. All other exceptions raised while processing the
        /// plans are logged and end the run.
        /// </exception>
        public void run()
        {
            NLogUtil.InfoTxt("开始豆瓣爬书计划");

            var taglist = _TagsCrawler.getUrls("");

            if (taglist != null && taglist.Count > 0)
            {
                var allList = _DouBanBookRepository.InitPlanFromTagUrl(taglist);

                try
                {
                    foreach (var plan in allList)
                    {
                        while (plan.ProcessPageIndex < CrawlerSetting.DB_MaxIndex_TagList)
                        {
                            NLogUtil.InfoTxt($"豆瓣爬书计划-TagCode:{plan.TagCode},Index:{plan.ProcessPageIndex}");
                            var url = $"{DouBanBookBaseCrawlerData.DouBanBookPrefix}/tag/{plan.TagCode}?start={plan.ProcessPageIndex}&type=T";
                            List<BookDetail_middle> bnList = _TagListCrawler.Crawler(url);
                            HandleBookMiddleList(bnList);

                            // Persist the progress after every page so the stored plan
                            // reflects the last completed page.
                            plan.ProcessPageIndex += CrawlerSetting.DB_TagList_Step;
                            _DouBanBookRepository.UpdatePlan(plan);
                        }
                    }
                }
                catch (ExceptionProxyConnect)
                {
                    // Rethrow with "throw;" so the original stack trace is preserved.
                    throw;
                }
                catch (Exception ex)
                {
                    NLogUtil.ErrorTxt($"【错误】豆瓣爬书计划-TagList:{ex.Message}");
                }
            }
        }
        /// <summary>
        /// Writes the crawled book data (author, book info, tags and the book/tag
        /// relations) to the database inside a single transaction.
        /// </summary>
        /// <param name="middle">Crawled book data to persist.</param>
        /// <returns>
        /// <c>true</c> when the transaction reports success; <c>false</c> when the
        /// input is missing, the transaction fails, or an exception is raised.
        /// </returns>
        public async Task <bool> HandleBookMiddleAsync(BookDetail_middle middle)
        {
            // Both the processing and the error logging below read
            // middle.DouBanBookInfo. Without this guard a null input would throw a
            // second time from inside the catch block and escape the method.
            if (middle == null || middle.DouBanBookInfo == null)
            {
                NLogUtil.ErrorTxt("[数据库]录入书本失败:书本数据为空");
                return(false);
            }

            try
            {
                NLogUtil.InfoTxt($"开始处理书本到数据库:{middle.DouBanBookInfo.Code}-{middle.DouBanBookInfo.Title}");
                VerifyBookData(middle);

                var rAll = await _Db.Ado.UseTranAsync(() => {
                    // Each master-data write is best effort: a failure is logged and
                    // the remaining writes are still attempted.
                    try
                    {
                        // Book author
                        _PersonDb.AddOrUpdate_MasterData<EPerson>(middle.Author);
                    }
                    catch (Exception ex)
                    {
                        NLogUtil.ErrorTxt($"【错误】写入人物:{ex.Message}");
                    }

                    try
                    {
                        // Book information
                        _BookDb.AddOrUpdate_MasterData<EBookInfo>(middle.DouBanBookInfo);
                    }
                    catch (Exception ex)
                    {
                        NLogUtil.ErrorTxt($"【错误】写入书本:{ex.Message}");
                    }

                    try
                    {
                        // Book tags
                        _TagDb.AddOrUpdate_MasterData<ETag>(middle.tagList);
                    }
                    catch (Exception ex)
                    {
                        NLogUtil.ErrorTxt($"【错误】写入更新Tags:{ex.Message}");
                    }

                    // Relations between the book and its tags
                    HandleBookTag(middle.GetBookTags());
                });

                if (!rAll.IsSuccess)
                {
                    NLogUtil.ErrorTxt($"[数据库]录入书本失败:{middle.DouBanBookInfo.Code}-{middle.DouBanBookInfo.Title} -- {rAll.ErrorMessage}");
                }
                else
                {
                    NLogUtil.InfoTxt($"【成功】处理书本到数据库:{middle.DouBanBookInfo.Code}-{middle.DouBanBookInfo.Title}");
                }
                return(rAll.IsSuccess);
            }
            catch (Exception ex)
            {
                NLogUtil.ErrorTxt($"[数据库]录入书本失败:{middle.DouBanBookInfo.Code}-{middle.DouBanBookInfo.Title}--{ex.Message}");
                return(false);
            }
        }
Пример #10
0
        /// <summary>
        /// Crawls one Douban book detail page and fills the crawler's book detail
        /// data: title, cover URLs, the attributes of the info box, plus the content,
        /// author, catalog, score and tag sections.
        /// </summary>
        /// <param name="entryUrl">URL of the book detail page.</param>
        /// <returns>
        /// The populated book detail data, or <c>null</c> when the page has no title
        /// node, no info box, or no attribute spans inside the info box.
        /// </returns>
        public BookDetail_middle Crawler(string entryUrl = "")
        {
            InitData(entryUrl);
            // The interpolation hole is "{...}"; the former "${...}" printed a stray '$'.
            NLogUtil.InfoTxt($"Crawler Book:{_entryUrl}");

            var bi = _bookDetailData.DouBanBookInfo;

            HtmlDocument htmlDoc = getDocbyEntryUrl(entryUrl);

            VerifyHeader(htmlDoc);

            var bn = htmlDoc.DocumentNode.SelectSingleNode("//div[@id='wrapper']/h1/span");

            if (bn == null)
            {
                // Without a title node the page is not treated as a book page.
                return(null);
            }
            bi.Title = bn.InnerText;

            // Cover images are optional: a missing cover must not abort the crawl.
            var coverLink = htmlDoc.DocumentNode.SelectSingleNode("//div[@id='mainpic']/a");
            if (coverLink != null)
            {
                var bigCover = coverLink.Attributes["href"];
                if (bigCover != null)
                {
                    bi.CoverUrl_Big = bigCover.Value; // large image
                }

                // ".//img" searches below the cover link. A leading "//" would search
                // from the document root and pick the first image on the page.
                var coverImg = coverLink.SelectSingleNode(".//img");
                var smallCover = coverImg == null ? null : coverImg.Attributes["src"];
                if (smallCover != null)
                {
                    bi.CoverUrl = smallCover.Value; // small image
                }
            }

            var info = htmlDoc.DocumentNode.SelectSingleNode("//div[@id='info']");
            if (info == null)
            {
                return(null);
            }

            var infoAttrs = info.SelectNodes(".//span");
            if (infoAttrs == null)
            {
                return(null);
            }
            foreach (var span in infoAttrs)
            {
                try
                {
                    AnalyInfo(span);
                }
                catch (Exception ex)
                {
                    // One unparsable attribute must not discard the rest of the book.
                    NLogUtil.ErrorTxt($"BookDetailCrawler Book Property-{_DouBanBookId}:[{span.InnerHtml}]{ex.Message}");
                }
            }

            AnalyContent(htmlDoc); // introduction

            AnalyAuthor(htmlDoc);  // author

            AnalyCatalog(htmlDoc); // table of contents

            AnalyScore(htmlDoc);   // score

            AnalyTags(htmlDoc);    // tags

            return(_bookDetailData);
        }