/// <summary>
/// Creates the manual-entry form for the given publisher: configures the
/// publish-time picker format, binds the star-level combo box, and then
/// calls <c>ResetInput</c>.
/// </summary>
/// <param name="publisher">Publisher stored on the form for later use.</param>
public ManualCrawlerForm(InforPublisher publisher)
{
    InitializeComponent();
    _publisher = publisher;

    // Publish-time picker uses a custom date/time pattern.
    dtp_publish_time.Format = DateTimePickerFormat.Custom;
    dtp_publish_time.CustomFormat = "yyyy年MM月dd HH:mm:ss";

    // Star-level choices: key = numeric level (0 is the "not specified" entry),
    // value = caption shown in the combo box.
    var starLevelOptions = new List<KeyValuePair<int, string>>
    {
        new KeyValuePair<int, string>(0, "不指定"),
        new KeyValuePair<int, string>(5, "5星级"),
        new KeyValuePair<int, string>(4, "4星级"),
        new KeyValuePair<int, string>(3, "3星级"),
        new KeyValuePair<int, string>(2, "2星级"),
        new KeyValuePair<int, string>(1, "1星级")
    };

    // Member names are assigned before the data source, as in the original ordering.
    cmb_star_level.DisplayMember = "Value";
    cmb_star_level.ValueMember = "Key";
    cmb_star_level.DataSource = starLevelOptions;

    ResetInput();
}
/// <summary>
/// Crawls a publisher's feed page by page and returns the posts that are newer
/// than the latest post already stored for that publisher.
/// </summary>
/// <param name="publisher">Publisher whose feed is crawled. When <c>null</c>, an empty list is returned.</param>
/// <param name="cookies">Cookie string handed to the underlying crawlers.</param>
/// <returns>
/// Newly discovered posts, de-duplicated by <c>MD5</c>, each with <c>PublisherId</c>
/// set to <paramref name="publisher"/>'s id. Never <c>null</c>.
/// </returns>
/// <exception cref="NotImplementedException">
/// Thrown when the crawler selector matches none of the supported crawler types.
/// </exception>
public IList<InforHistory> CrawlXueQiuFellowNewPosts(InforPublisher publisher, string cookies)
{
    // Guard first: the publisher must not be dereferenced before this check.
    if (publisher == null)
    {
        return new List<InforHistory>();
    }

    // NOTE(review): this selector is hard-coded to a value that matches no case below,
    // so the switch always falls through to `default` and throws. It presumably should
    // come from the publisher (the exception message prints publisher.CrawlerName) —
    // confirm the intended source before changing it.
    var publisherGroupType = "nothing";
    var pageIndex = 1;
    var newPosts = new List<InforHistory>();
    var savedLatestPost = _ihRepo.RetrieveTheLatest(publisher.Id);
    var hadScrawledBeforeThisTime = savedLatestPost != null;
    var hasFinished = false;

    // Publish time of the latest stored post (null when nothing is stored yet);
    // passed to the Sina Weibo crawlers.
    DateTime? latestSavedPublishTime = savedLatestPost == null ? (DateTime?)null : savedLatestPost.PublishTime;

    while (true)
    {
        var histories = (List<InforHistory>)null;
        var container = (NetworkSourceFellowStatusContainerBase)null;

        switch (publisherGroupType)
        {
            case CrawlerType.SnowballFellowCrawler:
                var crawler1 = new SnowballFellowStatusCrawler(publisher.DataUrl, pageIndex, cookies);
                var container1 = crawler1.CrawlObject();
                // Order statuses by publish time, newest first.
                histories = Mapper.Map<IList<SnowballFellowStatus>, IList<InforHistory>>(container1.Statuses)
                    .OrderByDescending(x => x.PublishTime)
                    .ToList();
                // Use the concrete container for the paging check below.
                container = container1;
                break;

            case CrawlerType.SinaWeiboFellowCrawler:
                histories = new List<InforHistory>();
                // Original author's note: each Sina Weibo page is loaded in three batches;
                // to avoid redundant queries every batch is compared against the latest
                // already-stored post, hence the publish time passed to the crawler.
                var crawler2 = new SinaWeiboFellowStatusCrawler(
                    publisher.DataUrl,
                    publisher.CrawlingFrequency * 1000,
                    publisher.ExtraParameter,
                    pageIndex,
                    cookies,
                    latestSavedPublishTime);
                var container2 = crawler2.CrawlObject();
                // Statuses are appended as returned; ordering happens once after the switch.
                histories.AddRange(Mapper.Map<IList<SinaWeiboFellowStatus>, IList<InforHistory>>(container2.Statuses));

                // Also crawl the publisher's long-form essays.
                var crawler3 = new SinaWeiboFellowEssayCrawler(
                    publisher.DataUrl,
                    publisher.ExtraParameter,
                    pageIndex,
                    cookies,
                    latestSavedPublishTime);
                var container3 = crawler3.CrawlObject();
                var essayHistories = Mapper.Map<IList<SinaWeiboFellowStatus>, IList<InforHistory>>(container3.Statuses);
                histories.AddRange(essayHistories);

                // container2: statuses fetched relative to the latest stored post.
                // container3: essays fetched relative to the latest stored post.
                // histories:  statuses + essays, the full data set for this round.
                // container:  the status container while it still has pages left,
                //             otherwise the essay container; drives the paging check below.
                container = container2.PageIndex < container2.PageCount ? container2 : container3;
                break;

            default:
                throw new NotImplementedException(string.Format("针对动态:{0} -> {1} -> {2} 的爬虫暂未实现!", publisher.GroupName, publisher.Name, publisher.CrawlerName));
        }

        // An empty page means everything has been visited.
        if (histories.Count == 0)
        {
            break;
        }

        var orderedHistoryList = histories.OrderByDescending(x => x.PublishTime).ToList();

        // Walk the page newest-first until a post that is already stored is reached.
        foreach (var post in orderedHistoryList)
        {
            // This post is (or is not newer than) the latest one already stored.
            if (hadScrawledBeforeThisTime
                && (post.MD5 == savedLatestPost.MD5 || post.PublishTime <= savedLatestPost.PublishTime))
            {
                hasFinished = true;
                break;
            }

            // New post: attach it to its publisher.
            post.PublisherId = publisher.Id;

            // Skip posts whose content hash is already collected (unlikely but possible).
            if (!newPosts.Any(x => x.MD5 == post.MD5))
            {
                newPosts.Add(post);
            }
        }

        // All new posts have been collected.
        if (hasFinished)
        {
            break;
        }

        // Last page reached.
        if (container.PageIndex >= container.PageCount)
        {
            break;
        }

        // Throttle between page requests; CrawlingFrequency is multiplied by 1000 to get milliseconds.
        Thread.Sleep(publisher.CrawlingFrequency * 1000);
        pageIndex++;
    }

    return newPosts;
}
/// <summary>
/// Placeholder for crawling new Sina Weibo stock-circle posts; not implemented yet.
/// </summary>
/// <param name="publisher">Publisher whose posts would be crawled (currently unused).</param>
/// <param name="cookies">Cookie string for the crawl (currently unused).</param>
/// <returns>Never returns; the method always throws.</returns>
/// <exception cref="NotImplementedException">Always thrown.</exception>
public IList<InforHistory> CrawlSinaWeiboStockCircleNewPosts(InforPublisher publisher, string cookies)
{
    throw new NotImplementedException();
}
/// <summary>
/// Placeholder for crawling new business-leader posts; not implemented yet.
/// </summary>
/// <param name="publisher">Publisher whose posts would be crawled (currently unused).</param>
/// <param name="cookies">Cookie string for the crawl (currently unused).</param>
/// <returns>Never returns; the method always throws.</returns>
/// <exception cref="NotImplementedException">Always thrown.</exception>
public IList<InforHistory> CrawlBusinessLeaderNewPosts(InforPublisher publisher, string cookies)
{
    throw new NotImplementedException();
}
/// <summary>
/// Opens the manual-entry dialog for the selected publisher and blocks until it is closed.
/// </summary>
/// <param name="selectedPublisher">Publisher passed to the <c>ManualCrawlerForm</c> constructor.</param>
private void AddNewPostManually(InforPublisher selectedPublisher)
{
    // A form shown with ShowDialog() is not disposed when it closes,
    // so dispose it explicitly to release its window resources.
    using (var form = new ManualCrawlerForm(selectedPublisher))
    {
        form.ShowDialog();
    }
}
/// <summary>
/// Fetches new posts through <c>_bizInfoMedia.CrawlXueQiuFellowNewPosts</c>
/// and returns them as a new <see cref="List{T}"/>.
/// </summary>
/// <param name="publisher">Publisher forwarded to the crawler call.</param>
/// <param name="cookie">Cookie string forwarded to the crawler call.</param>
/// <returns>A list copy of the crawled posts.</returns>
private List<InforHistory> RetrieveNewXueqiuFellowPosts(InforPublisher publisher, string cookie)
{
    var crawledPosts = _bizInfoMedia.CrawlXueQiuFellowNewPosts(publisher, cookie);
    return crawledPosts.ToList();
}
/// <summary>
/// Fetches new posts through <c>_bizInfoMedia.CrawlSinaWeiboStockCircleNewPosts</c>
/// and returns them as a new <see cref="List{T}"/>.
/// </summary>
/// <param name="publisher">Publisher forwarded to the crawler call.</param>
/// <param name="cookie">Cookie string forwarded to the crawler call.</param>
/// <returns>A list copy of the crawled posts.</returns>
private List<InforHistory> RetrieveNewSinaWeiboStockCirclePosts(InforPublisher publisher, string cookie)
{
    var crawledPosts = _bizInfoMedia.CrawlSinaWeiboStockCircleNewPosts(publisher, cookie);
    return crawledPosts.ToList();
}
/// <summary>
/// Fetches new posts through <c>_bizInfoMedia.CrawlBusinessLeaderNewPosts</c>
/// and returns them as a new <see cref="List{T}"/>.
/// </summary>
/// <param name="publisher">Publisher forwarded to the crawler call.</param>
/// <param name="cookie">Cookie string forwarded to the crawler call.</param>
/// <returns>A list copy of the crawled posts.</returns>
private List<InforHistory> RetrieveNewBusinessLeaderPosts(InforPublisher publisher, string cookie)
{
    var crawledPosts = _bizInfoMedia.CrawlBusinessLeaderNewPosts(publisher, cookie);
    return crawledPosts.ToList();
}