コード例 #1
0
        /// <summary>
        /// Writes each entry of <paramref name="newList"/> inside one transaction.
        /// An entry whose SectionCode/ItemCode pair has no stored row is added directly.
        /// Otherwise the matching rows whose creation date plus 30 days lies before today are deleted,
        /// and the entry is added only when that delete removed at least one row.
        /// A transaction reported as failed is written to the error log.
        /// </summary>
        /// <param name="newList">Data-section entries to write.</param>
        public async Task CoverNewSectionCodeAsync(List <EDataSection> newList)
        {
            var tranResult = await base.Db.Ado.UseTranAsync(() =>
            {
                foreach (var entry in newList)
                {
                    int existing = base.IsExist(a => new CountResult {
                        Count = SqlFunc.AggregateCount(a.Id)
                    }, a => a.SectionCode == entry.SectionCode && a.ItemCode == entry.ItemCode);

                    // With no stored row the entry is always inserted.
                    bool insert = existing == 0;
                    if (!insert)
                    {
                        // Rows exist: insert only if the 30-day-old ones were actually removed.
                        int removed = base.DelAll(a => a.SectionCode == entry.SectionCode && a.ItemCode == entry.ItemCode && a.CreateDateTime.AddDays(30) < DateTime.Today);
                        insert = removed > 0;
                    }

                    if (insert)
                    {
                        base.Add(entry);
                    }
                }
            });

            if (tranResult.IsSuccess)
            {
                return;
            }

            NLogUtil.ErrorTxt($"[CoverNewSectionCode]建立Book和 Section 关系:{tranResult.ErrorMessage}");
        }
コード例 #2
0
ファイル: Program.cs プロジェクト: flysnoopy1984/DataCrawler
        /// <summary>
        /// Resolves the crawler services from the service provider and runs the tag-based crawl plan.
        /// A proxy connection error is logged and the plan is started again while the attempt
        /// counter is below 3; a run that completes without that error ends the loop.
        /// </summary>
        private static void RunPlan()
        {
            int tryNum = 1;

            while (tryNum < 3)
            {
                try
                {
                    _DouBanBookRepository = _ServiceProvider.GetService <DouBanBookService>();

                    var list = _ServiceProvider.GetService <IEnumerable <ICrawlerBatchBook> >();

                    // The batch crawler is picked by its concrete type name.
                    _TagList    = list.FirstOrDefault(a => a.GetType().Name == "TagListCrawler");
                    _DetailBook = _ServiceProvider.GetService <ICrawlerBook>();
                    _CrawlerTag = _ServiceProvider.GetService <ICrawlerTag>();

                    PlanFromTagsTask planFromTagsTask = new PlanFromTagsTask(
                        _CrawlerTag,
                        _TagList,
                        _DetailBook,
                        _DouBanBookRepository);

                    planFromTagsTask.run();

                    // The run finished without a proxy failure. Without this exit the counter
                    // never changes and the plan would be restarted forever.
                    break;
                }
                catch (ExceptionProxyConnect epc)
                {
                    NLogUtil.ErrorTxt($"代理连接错误:{epc.Message}");
                    NLogUtil.InfoTxt($"开始尝试第{tryNum}次运行计划");
                    tryNum++;
                }
            }
            NLogUtil.InfoTxt($"第{tryNum}次运行计划后结束");
        }
コード例 #3
0
        /// <summary>
        /// Downloads the list page at <paramref name="entryUrl"/> and collects the detail link
        /// of every entry under the "subject-list" element.
        /// </summary>
        /// <param name="entryUrl">URL of the list page. An empty value is logged as an error.</param>
        /// <returns>
        /// One <c>BookBatch</c> per list item that carries a link; an empty list when the page
        /// has no matching items.
        /// </returns>
        public List <BookBatch> CrawlerUrls(string entryUrl)
        {
            if (string.IsNullOrEmpty(entryUrl))
            {
                // NOTE(review): the request below is still attempted with the empty URL;
                // confirm whether getDocbyEntryUrl falls back to a default address.
                NLogUtil.ErrorTxt("TagListCrawler 没有入口Url");
            }

            List <BookBatch> result = new List <BookBatch>();
            var doc = getDocbyEntryUrl(entryUrl);
            if (doc == null)
            {
                return(result);
            }

            var nodes = doc.DocumentNode.SelectNodes("//div[@class='article']//ul[@class='subject-list']/li");
            if (nodes == null)
            {
                return(result);
            }

            foreach (var n in nodes)
            {
                // A list item without the expected anchor or href is skipped so that one
                // malformed entry does not throw and discard the rest of the page.
                var anchor = n.SelectSingleNode("./div/a");
                var href   = anchor?.Attributes["href"];
                if (href == null)
                {
                    continue;
                }

                result.Add(new BookBatch
                {
                    DetailUrl = href.Value
                });
            }

            return(result);
        }
コード例 #4
0
        /// <summary>
        /// Reads the book introduction from the detail page and stores it in the Summery
        /// field of the current book info. The introduction inside the "all hidden" span is
        /// preferred; otherwise the first "intro" block under "related_info" is used, and an
        /// empty string is stored when neither exists. Any exception is logged, not rethrown.
        /// </summary>
        /// <param name="doc">Parsed detail page.</param>
        public void AnalyContent(HtmlDocument doc)
        {
            EBookInfo bookInfo = _bookDetailData.DouBanBookInfo;

            try
            {
                // The second lookup runs only when the first one finds nothing.
                var introNode =
                    doc.DocumentNode.SelectSingleNode("//div[@class='related_info']//span[@class='all hidden']//div[@class='intro']")
                    ?? doc.DocumentNode.SelectSingleNode("//div[@class='related_info']//div[@class='intro']");

                bookInfo.Summery = introNode == null ? "" : introNode.InnerHtml.Trim();
            }
            catch (Exception ex)
            {
                NLogUtil.ErrorTxt("BookDetailCrawler AnalyContent:" + ex.Message);
            }
        }
コード例 #5
0
        /// <summary>
        /// Returns the Douban tag-URL crawl plans. When no plan is stored yet, one plan per tag
        /// in <paramref name="tagLists"/> is built and written to the database together with
        /// the section/tag links; otherwise the stored plans are queried and returned.
        /// </summary>
        /// <param name="tagLists">Sections with their tags, used only when no plan is stored yet.</param>
        /// <returns>
        /// The newly built plans (returned even if the database write is reported as failed),
        /// the stored plans, or an empty list when there is no stored plan and no tag input.
        /// </returns>
        public List <EPlan_FromDouBanTagUrls> InitPlanFromTagUrl(List <Secction_Tag> tagLists)
        {
            List <EPlan_FromDouBanTagUrls> result = new List <EPlan_FromDouBanTagUrls>();

            if (!_PlanFTURepository.IsExistPlan())
            {
                NLogUtil.InfoTxt("豆瓣计划写入到数据库");
                if (tagLists != null && tagLists.Count > 0)
                {
                    List <EDataSection> dsList = new List <EDataSection>();

                    var allSection = _SectionDb.AllSection();
                    foreach (Secction_Tag st in tagLists)
                    {
                        ESection section = null;
                        try
                        {
                            section = allSection[st.sectionName];
                        }
                        catch
                        {
                            // Any failure of the lookup is treated as "no matching section":
                            // the plans are still created, only the section links are skipped.
                            section = null;
                        }
                        foreach (var tag in st.TagList)
                        {
                            result.Add(PlanFTURepository.NewModelInstance(tag.Name, tag.Url));
                            if (section != null)
                            {
                                dsList.Add(DataSectionRepository.newModelInstance(section.Code, tag.Code));
                            }
                        }
                    }

                    var rAll = _Db.Ado.UseTran(() =>
                    {
                        // UseTran is synchronous. An async lambda here would return at its first
                        // incomplete await, so the transaction would be reported as finished
                        // before the work was done and later failures would go unobserved.
                        // Waiting on the task keeps both steps inside the transaction scope.
                        // NOTE(review): blocking is safe only without a synchronization context
                        // (e.g. a console host) - confirm for other hosts.
                        _DataSectionDb.CoverNewSectionCodeAsync(dsList).GetAwaiter().GetResult();
                        _PlanFTURepository.CoverPlans(GenCodeHelper.Plan_FromDouBanTagUrls, result);
                    });
                    if (rAll.IsSuccess)
                    {
                        NLogUtil.InfoTxt("【成功】豆瓣计划已在数据库初始化");
                    }
                    else
                    {
                        NLogUtil.ErrorTxt($"【失败】豆瓣计划:{rAll.ErrorMessage}");
                    }
                }
            }
            else
            {
                NLogUtil.InfoTxt("DouBan Plan 已经存在数据库中!");
                // A Douban tag list can only be fetched up to an index of 1000.
                result = _PlanFTURepository.QueryPlan(GenCodeHelper.Plan_FromDouBanTagUrls);
            }
            return(result);
        }
コード例 #6
0
ファイル: Program.cs プロジェクト: flysnoopy1984/DataCrawler
 /// <summary>
 /// Writes "Init" to the console and runs InitSystem. Any exception is logged, not rethrown.
 /// </summary>
 private static void Init()
 {
     try
     {
         Console.WriteLine("Init");
         InitSystem();

         // Table creation and data seeding are currently switched off:
         //DbSeed.InitTables(_Db);
         //var needInitSection = Convert.ToBoolean(_configuration["InitTask:NeedInitSection"]);
         //DbSeed.InitData(_DouBanBookRepository, needInitSection);
     }
     catch (Exception initError)
     {
         string message = $"Init Error:{initError.Message}";
         NLogUtil.ErrorTxt(message, true);
     }
 }
コード例 #7
0
ファイル: Program.cs プロジェクト: flysnoopy1984/DataCrawler
        /// <summary>
        /// Starts a SinglgTask for one hard-coded Douban book page. A proxy connection
        /// error raised by the call is logged.
        /// </summary>
        private static void RunSingle()
        {
            // Alternative sample page: https://book.douban.com/subject/34845963/
            const string bookUrl = "https://book.douban.com/subject/2669319/";

            try
            {
                // NOTE(review): whatever runAsync returns is not awaited or observed here.
                // If it returns a Task, failures raised after its first await will not reach
                // the catch below - confirm the signature and wait on the result if so.
                var task = new SinglgTask(_DetailBook, _DouBanBookRepository);
                task.runAsync(bookUrl);
            }
            catch (ExceptionProxyConnect proxyError)
            {
                NLogUtil.ErrorTxt($"代理连接错误:{proxyError.Message}");
            }
        }
コード例 #8
0
ファイル: Program.cs プロジェクト: flysnoopy1984/DataCrawler
 /// <summary>
 /// Builds a LatestTask from the configured crawlers and repository and runs it.
 /// Proxy connection errors and all other exceptions are logged separately; neither is rethrown.
 /// </summary>
 private static void RunLatestTask()
 {
     try
     {
         var task = new LatestTask(_LatestBatchBook, _DetailBook, _DouBanBookRepository);
         task.Run();
     }
     catch (ExceptionProxyConnect proxyError)
     {
         NLogUtil.ErrorTxt($"代理连接错误:{proxyError.Message}");
     }
     catch (Exception unexpected)
     {
         NLogUtil.ErrorTxt($"RunLatestTask Error:{unexpected.Message}", true);
     }
 }
コード例 #9
0
ファイル: Program.cs プロジェクト: flysnoopy1984/DataCrawler
        /// <summary>
        /// Loads appsettings.json from the directory of the Program assembly, registers the
        /// application services, builds the service provider and resolves the repository,
        /// database client and crawler instances into the static fields.
        /// Any exception is logged, not rethrown.
        /// </summary>
        private static void InitSystem()
        {
            try
            {
                // typeof(Program) yields the assembly without constructing a throwaway
                // Program instance and without late-bound (dynamic) member access.
                _CurrentDirectory = Path.GetDirectoryName(typeof(Program).Assembly.Location);

                _configuration = Configuration.SetBasePath(_CurrentDirectory)
                                 .AddJsonFile("appsettings.json", optional: false, reloadOnChange: true)
                                 .Build();

                var serviceCollection = new ServiceCollection();
                serviceCollection.AddSqlSugarSetup(_configuration);
                serviceCollection.AddCrawlers(_configuration);
                serviceCollection.AddRepository(_configuration);
                serviceCollection.AddInitDbSeed(_configuration);
                serviceCollection.AddMemoryCache();

                _ServiceProvider      = serviceCollection.BuildServiceProvider();
                _DouBanBookRepository = _ServiceProvider.GetService <DouBanBookService>();
                _Db = _ServiceProvider.GetService <SqlSugarClient>();

                // Batch crawlers share one interface and are told apart by concrete type name.
                var list = _ServiceProvider.GetService <IEnumerable <ICrawlerBatchBook> >();
                _LatestBatchBook = list.FirstOrDefault(a => a.GetType().Name == "BookLatestCrawler");
                _TagList         = list.FirstOrDefault(a => a.GetType().Name == "TagListCrawler");
                _DetailBook      = _ServiceProvider.GetService <ICrawlerBook>();
                _CrawlerTag      = _ServiceProvider.GetService <ICrawlerTag>();
                _DaReTouCrawler  = _ServiceProvider.GetService <DaReTouCrawler>();
            }
            catch (Exception ex)
            {
                NLogUtil.ErrorTxt($"Init Error:{ex.Message}", true);
            }
        }
コード例 #10
0
        /// <summary>
        /// Runs the Douban tag crawl plan: fetches the tag list, initialises or loads the plans,
        /// then walks each plan's list pages from its current ProcessPageIndex up to
        /// CrawlerSetting.DB_MaxIndex_TagList in steps of CrawlerSetting.DB_TagList_Step.
        /// The plan is saved after every page.
        /// </summary>
        /// <exception cref="ExceptionProxyConnect">
        /// Passed on to the caller unchanged; every other exception is logged and ends the run.
        /// </exception>
        public void run()
        {
            NLogUtil.InfoTxt("开始豆瓣爬书计划");

            var taglist = _TagsCrawler.getUrls("");

            if (taglist != null && taglist.Count > 0)
            {
                var allList = _DouBanBookRepository.InitPlanFromTagUrl(taglist);

                try
                {
                    foreach (var plan in allList)
                    {
                        while (plan.ProcessPageIndex < CrawlerSetting.DB_MaxIndex_TagList)
                        {
                            NLogUtil.InfoTxt($"豆瓣爬书计划-TagCode:{plan.TagCode},Index:{plan.ProcessPageIndex}");
                            var url = $"{DouBanBookBaseCrawlerData.DouBanBookPrefix}/tag/{plan.TagCode}?start={plan.ProcessPageIndex}&type=T";
                            List <BookDetail_middle> bnList = _TagListCrawler.Crawler(url);
                            HandleBookMiddleList(bnList);

                            // Progress is saved after every page.
                            plan.ProcessPageIndex += CrawlerSetting.DB_TagList_Step;
                            _DouBanBookRepository.UpdatePlan(plan);
                        }
                    }
                }
                catch (ExceptionProxyConnect)
                {
                    // "throw;" keeps the original stack trace; "throw epc;" would reset it.
                    throw;
                }
                catch (Exception ex)
                {
                    NLogUtil.ErrorTxt($"【错误】豆瓣爬书计划-TagList:{ex.Message}");
                }
            }
        }
コード例 #11
0
        /// <summary>
        /// Writes one crawled book (author, book info, tags and book/tag links) to the
        /// database inside a single transaction.
        /// </summary>
        /// <param name="middle">Crawled book data; a null value or null book info is rejected.</param>
        /// <returns>
        /// True when the transaction is reported as successful; false when the input is
        /// missing, the transaction fails, or an exception is caught.
        /// </returns>
        public async Task <bool> HandleBookMiddleAsync(BookDetail_middle middle)
        {
            // Reject missing data up front. Otherwise the first log line throws and the catch
            // below, which dereferences the same members, throws again out of the handler.
            if (middle == null || middle.DouBanBookInfo == null)
            {
                NLogUtil.ErrorTxt("[数据库]录入书本失败:书本数据为空");
                return(false);
            }

            try
            {
                NLogUtil.InfoTxt($"开始处理书本到数据库:{middle.DouBanBookInfo.Code}-{middle.DouBanBookInfo.Title}");
                VerifyBookData(middle);

                var rAll = await _Db.Ado.UseTranAsync(() => {
                    // Each master-data write is best effort: a failure is logged and the
                    // remaining steps still run.
                    try
                    {
                        // Book author
                        _PersonDb.AddOrUpdate_MasterData <EPerson>(middle.Author);
                    }
                    catch (Exception ex)
                    {
                        NLogUtil.ErrorTxt($"【错误】写入人物:{ex.Message}");
                    }

                    try
                    {
                        // Book info
                        _BookDb.AddOrUpdate_MasterData <EBookInfo>(middle.DouBanBookInfo);
                    }
                    catch (Exception ex)
                    {
                        NLogUtil.ErrorTxt($"【错误】写入书本:{ex.Message}");
                    }

                    try
                    {
                        // Book tags
                        _TagDb.AddOrUpdate_MasterData <ETag>(middle.tagList);
                    }
                    catch (Exception ex)
                    {
                        NLogUtil.ErrorTxt($"【错误】写入更新Tags:{ex.Message}");
                    }

                    // Book/tag links; this step is not wrapped in its own try/catch.
                    HandleBookTag(middle.GetBookTags());
                });

                if (!rAll.IsSuccess)
                {
                    NLogUtil.ErrorTxt($"[数据库]录入书本失败:{middle.DouBanBookInfo.Code}-{middle.DouBanBookInfo.Title} -- {rAll.ErrorMessage}");
                }
                else
                {
                    NLogUtil.InfoTxt($"【成功】处理书本到数据库:{middle.DouBanBookInfo.Code}-{middle.DouBanBookInfo.Title}");
                }
                return(rAll.IsSuccess);
            }
            catch (Exception ex)
            {
                NLogUtil.ErrorTxt($"[数据库]录入书本失败:{middle.DouBanBookInfo.Code}-{middle.DouBanBookInfo.Title}--{ex.Message}");
                return(false);
            }
        }
コード例 #12
0
        /// <summary>
        /// Downloads a book detail page and fills the current book detail data: title,
        /// cover URLs, the attributes of the "info" block, then introduction, author,
        /// catalog, score and tags.
        /// </summary>
        /// <param name="entryUrl">URL of the detail page; defaults to an empty string.</param>
        /// <returns>
        /// The populated book detail data, or null when the page has no title element or
        /// no attribute spans inside the "info" block.
        /// </returns>
        public BookDetail_middle Crawler(string entryUrl = "")
        {
            InitData(entryUrl);
            NLogUtil.InfoTxt($"Crawler Book:{_entryUrl}");

            var bi = _bookDetailData.DouBanBookInfo;

            HtmlDocument htmlDoc = getDocbyEntryUrl(entryUrl);

            VerifyHeader(htmlDoc);

            var bn = htmlDoc.DocumentNode.SelectSingleNode("//div[@id='wrapper']/h1/span");

            if (bn == null)
            {
                return(null);
            }
            bi.Title = bn.InnerText;

            // The cover is optional: a page without it keeps whatever cover values are already set.
            var coverLink = htmlDoc.DocumentNode.SelectSingleNode("//div[@id='mainpic']/a");
            if (coverLink != null)
            {
                var bigCover = coverLink.Attributes["href"];
                if (bigCover != null)
                {
                    bi.CoverUrl_Big = bigCover.Value; // large image
                }

                // ".//img" is relative to the cover link. A leading "//" is an absolute path
                // and would select the first <img> of the whole document instead.
                var smallCover = coverLink.SelectSingleNode(".//img")?.Attributes["src"];
                if (smallCover != null)
                {
                    bi.CoverUrl = smallCover.Value; // small image
                }
            }

            var info      = htmlDoc.DocumentNode.SelectSingleNode("//div[@id='info']");
            var infoAttrs = info?.SelectNodes(".//span");

            // A missing "info" block is handled like an empty one.
            if (infoAttrs == null)
            {
                return(null);
            }
            foreach (var span in infoAttrs)
            {
                try
                {
                    AnalyInfo(span);
                }
                catch (Exception ex)
                {
                    // One unparsable attribute is logged and does not abort the remaining ones.
                    NLogUtil.ErrorTxt($"BookDetailCrawler Book Property-{_DouBanBookId}:[{span.InnerHtml}]{ex.Message}");
                }
            }

            AnalyContent(htmlDoc); // introduction

            AnalyAuthor(htmlDoc);  // author

            AnalyCatalog(htmlDoc); // catalog

            AnalyScore(htmlDoc);   // score

            AnalyTags(htmlDoc);    // tags

            return(_bookDetailData);
        }