Beispiel #1
0
 /// <summary>
 /// Creates the manual-entry form for the given publisher: stores the publisher,
 /// configures the publish-time picker and the star-level combo box, then resets the inputs.
 /// </summary>
 /// <param name="publisher">Publisher kept in the form's <c>_publisher</c> field.</param>
 public ManualCrawlerForm(InforPublisher publisher)
 {
     InitializeComponent();

     _publisher = publisher;

     // Publish-time picker uses a custom date/time pattern.
     dtp_publish_time.Format = DateTimePickerFormat.Custom;
     dtp_publish_time.CustomFormat = "yyyy年MM月dd HH:mm:ss";

     // Star-level choices: Key is the numeric level, Value is the caption shown to the user.
     var starLevelOptions = new List<KeyValuePair<int, string>>
     {
         new KeyValuePair<int, string>(0, "不指定"),
         new KeyValuePair<int, string>(5, "5星级"),
         new KeyValuePair<int, string>(4, "4星级"),
         new KeyValuePair<int, string>(3, "3星级"),
         new KeyValuePair<int, string>(2, "2星级"),
         new KeyValuePair<int, string>(1, "1星级"),
     };

     // Member names are assigned before the data source, as in the original ordering.
     cmb_star_level.DisplayMember = "Value";
     cmb_star_level.ValueMember = "Key";
     cmb_star_level.DataSource = starLevelOptions;

     ResetInput();
 }
Beispiel #2
0
        /// <summary>
        /// Crawls a publisher's feed page by page and collects the posts that are newer than
        /// the latest post already stored for that publisher.
        /// </summary>
        /// <param name="publisher">Publisher whose feed is crawled. When null, an empty list is returned.</param>
        /// <param name="cookies">Cookie string handed to the crawler constructors.</param>
        /// <returns>
        /// The newly found posts, each with <c>PublisherId</c> set to the publisher's Id and
        /// without entries that share an MD5 with an earlier collected post.
        /// </returns>
        /// <exception cref="NotImplementedException">
        /// Thrown when the crawler type matches none of the handled <c>CrawlerType</c> cases.
        /// </exception>
        public IList<InforHistory> CrawlXueQiuFellowNewPosts(InforPublisher publisher, string cookies)
        {
            // Guard before any member access: previously publisher.Id was read ahead of this
            // check, so a null publisher raised NullReferenceException instead of returning.
            if (publisher == null)
            {
                return new List<InforHistory>();
            }

            // NOTE(review): the crawler type is hard-coded to "nothing". Unless a CrawlerType
            // constant has that value, the switch below always reaches the default branch and
            // throws. Presumably this should come from the publisher - confirm the intended source.
            var publisherGroupType = "nothing";
            var pageIndex = 1;
            var newPosts = new List<InforHistory>();
            var savedLatestPost = _ihRepo.RetrieveTheLatest(publisher.Id);
            var hadScrawledBeforeThisTime = savedLatestPost != null;
            var hasFinished = false;

            // Loop-invariant: publish time of the latest stored post, or null when nothing is stored yet.
            DateTime? latestSavedPublishTime = savedLatestPost == null ? (DateTime?)null : savedLatestPost.PublishTime;

            while (true)
            {
                List<InforHistory> histories;
                NetworkSourceFellowStatusContainerBase container;

                switch (publisherGroupType)
                {
                    case CrawlerType.SnowballFellowCrawler:

                        var snowballCrawler = new SnowballFellowStatusCrawler(publisher.DataUrl, pageIndex, cookies);
                        var snowballContainer = snowballCrawler.CrawlObject();
                        // Ordering by publish time (newest first) is applied once below, after the switch.
                        histories = Mapper.Map<IList<SnowballFellowStatus>, IList<InforHistory>>(snowballContainer.Statuses).ToList();
                        // Keep the concrete container for the paging check.
                        container = snowballContainer;
                        break;

                    case CrawlerType.SinaWeiboFellowCrawler:

                        histories = new List<InforHistory>();

                        // Each Sina Weibo page is loaded in 3 batches; the latest stored publish time is
                        // passed along so every batch can be compared against data that was already fetched.
                        var statusCrawler = new SinaWeiboFellowStatusCrawler(publisher.DataUrl, publisher.CrawlingFrequency * 1000, publisher.ExtraParameter, pageIndex, cookies, latestSavedPublishTime);
                        var statusContainer = statusCrawler.CrawlObject();
                        histories.AddRange(Mapper.Map<IList<SinaWeiboFellowStatus>, IList<InforHistory>>(statusContainer.Statuses));

                        // Also crawl the publisher's long-form essays and merge them into this page's result.
                        var essayCrawler = new SinaWeiboFellowEssayCrawler(publisher.DataUrl, publisher.ExtraParameter, pageIndex, cookies, latestSavedPublishTime);
                        var essayContainer = essayCrawler.CrawlObject();
                        var essayHistories = Mapper.Map<IList<SinaWeiboFellowStatus>, IList<InforHistory>>(essayContainer.Statuses);
                        histories.AddRange(essayHistories);

                        /*
                            > statusContainer: statuses fetched relative to the latest stored record
                            > essayContainer:  essays fetched relative to the latest stored record
                            > container:       whichever of the two may still have further pages; it drives paging
                            > histories:       statuses + essays merged, the full data set of this round
                        */
                        container = statusContainer.PageIndex < statusContainer.PageCount ? statusContainer : essayContainer;

                        break;

                    default: throw new NotImplementedException(string.Format("针对动态:{0} -> {1} -> {2} 的爬虫暂未实现!", publisher.GroupName, publisher.Name, publisher.CrawlerName));
                }

                // An empty page means all data has been visited.
                if (histories.Count == 0)
                {
                    break;
                }

                var orderedHistoryList = histories.OrderByDescending(x => x.PublishTime).ToList();

                // Inspect the freshly fetched records from newest to oldest.
                foreach (var post in orderedHistoryList)
                {
                    // Reached the latest stored post (or something not newer than it): nothing newer remains.
                    if (hadScrawledBeforeThisTime && (post.MD5 == savedLatestPost.MD5 || post.PublishTime <= savedLatestPost.PublishTime))
                    {
                        hasFinished = true;
                        break;
                    }

                    // A new post: attach it to its publisher.
                    post.PublisherId = publisher.Id;

                    // Skip posts whose MD5 matches one that was already collected.
                    if (!newPosts.Any(x => x.MD5 == post.MD5))
                    {
                        newPosts.Add(post);
                    }
                }

                // All new posts have been collected.
                if (hasFinished)
                {
                    break;
                }

                // Last page reached.
                if (container.PageIndex >= container.PageCount)
                {
                    break;
                }

                // Wait between page requests.
                Thread.Sleep(publisher.CrawlingFrequency * 1000);
                pageIndex++;
            }

            return newPosts;
        }
Beispiel #3
0
 /// <summary>
 /// Placeholder: not implemented yet. Every call throws.
 /// </summary>
 /// <param name="publisher">Currently unused.</param>
 /// <param name="cookies">Currently unused.</param>
 /// <returns>Never returns normally.</returns>
 /// <exception cref="NotImplementedException">Always thrown.</exception>
 public IList<InforHistory> CrawlSinaWeiboStockCircleNewPosts(InforPublisher publisher, string cookies)
 {
     throw new NotImplementedException();
 }
Beispiel #4
0
 /// <summary>
 /// Placeholder: not implemented yet. Every call throws.
 /// </summary>
 /// <param name="publisher">Currently unused.</param>
 /// <param name="cookies">Currently unused.</param>
 /// <returns>Never returns normally.</returns>
 /// <exception cref="NotImplementedException">Always thrown.</exception>
 public IList<InforHistory> CrawlBusinessLeaderNewPosts(InforPublisher publisher, string cookies)
 {
     throw new NotImplementedException();
 }
 /// <summary>
 /// Opens the manual-entry dialog for the given publisher and blocks until it is closed.
 /// </summary>
 /// <param name="selectedPublisher">Publisher passed to the <c>ManualCrawlerForm</c> constructor.</param>
 private void AddNewPostManually(InforPublisher selectedPublisher)
 {
     // A form shown with ShowDialog is not disposed when it closes, so release it explicitly.
     using (var form = new ManualCrawlerForm(selectedPublisher))
     {
         form.ShowDialog();
     }
 }
 /// <summary>
 /// Forwards to <c>_bizInfoMedia.CrawlXueQiuFellowNewPosts</c> and copies the result into a list.
 /// </summary>
 /// <param name="publisher">Publisher passed through to the crawl call.</param>
 /// <param name="cookie">Cookie string passed through to the crawl call.</param>
 /// <returns>A new list holding the posts returned by the crawl call.</returns>
 private List<InforHistory> RetrieveNewXueqiuFellowPosts(InforPublisher publisher, string cookie)
 {
     var crawledPosts = _bizInfoMedia.CrawlXueQiuFellowNewPosts(publisher, cookie);
     return crawledPosts.ToList();
 }
 /// <summary>
 /// Forwards to <c>_bizInfoMedia.CrawlSinaWeiboStockCircleNewPosts</c> and copies the result into a list.
 /// </summary>
 /// <param name="publisher">Publisher passed through to the crawl call.</param>
 /// <param name="cookie">Cookie string passed through to the crawl call.</param>
 /// <returns>A new list holding the posts returned by the crawl call.</returns>
 private List<InforHistory> RetrieveNewSinaWeiboStockCirclePosts(InforPublisher publisher, string cookie)
 {
     var crawledPosts = _bizInfoMedia.CrawlSinaWeiboStockCircleNewPosts(publisher, cookie);
     return crawledPosts.ToList();
 }
 /// <summary>
 /// Forwards to <c>_bizInfoMedia.CrawlBusinessLeaderNewPosts</c> and copies the result into a list.
 /// </summary>
 /// <param name="publisher">Publisher passed through to the crawl call.</param>
 /// <param name="cookie">Cookie string passed through to the crawl call.</param>
 /// <returns>A new list holding the posts returned by the crawl call.</returns>
 private List<InforHistory> RetrieveNewBusinessLeaderPosts(InforPublisher publisher, string cookie)
 {
     var crawledPosts = _bizInfoMedia.CrawlBusinessLeaderNewPosts(publisher, cookie);
     return crawledPosts.ToList();
 }