// Inheritor pages and intangible-heritage project pages share the same page
// structure, so the project detail page generator is reused here.
/// <summary>
/// Generates and stores the detail document for the page at <paramref name="url"/>,
/// unless the checker reports that it is already stored.
/// </summary>
/// <param name="url">Link of the page to process.</param>
public static void GenerateCCRDetail(string url)
{
    if (!MongodbChecker.CheckHeritageProjectInheritatePeopleExist(url))
    {
        var detailDocument = GenerateHeritageProjectDetailPage(url, false);

        // The generator may return null; in that case nothing is saved.
        if (detailDocument != null)
        {
            MongodbSaver.SaveHeritageProjectInheritatePeople(detailDocument);
        }
    }
}
/// <summary>
/// Collects the banner anchors (class "slick-link p-show") from the document fetched
/// for <c>MainPage</c>, posts each banner's original image path to
/// <paramref name="imageTargetBlock"/>, saves the new entries through
/// <c>MongodbSaver.SaveMainpageNewsList</c> and prints them to the console.
/// </summary>
/// <param name="imageTargetBlock">Block that receives the original image path of every new banner.</param>
private static void GetBanner(BufferBlock<string> imageTargetBlock)
{
    var doc = WebpageHelper.GetHttpRequestDocument(MainPage);

    // Materialise the query exactly once. A deferred query would re-run the whole
    // filter, including MongodbChecker.CheckMainNewsList, on every enumeration
    // (posting, saving and printing each enumerate the result).
    var banners = (from link in doc.DocumentNode.Descendants()
                   where link.Name == "a"
                         && link.Attributes["href"] != null
                         && link.Attributes["class"] != null
                         && link.Attributes["class"].Value.Equals("slick-link p-show")
                         && link.Attributes["style"] != null
                   let originImage = ExtractBannerImagePath(link.Attributes["style"].Value)
                   // Anchors whose style holds no usable path are skipped instead of
                   // throwing; the existence check runs last because it is the costly one.
                   where originImage != null
                         && !MongodbChecker.CheckMainNewsList(link.Attributes["href"].Value)
                   let imageUrl = WebpageHelper.GetSubUrl(originImage)
                   select new BsonDocument()
                       .Add("link", link.Attributes["href"].Value)
                       .Add("img", imageUrl)
                       .Add("compressImg", WebImageSaver.Instance.GetComressImageName(imageUrl))
                       .Add("originImage", originImage))
                  .ToArray();

    foreach (var banner in banners)
    {
        imageTargetBlock.Post(banner["originImage"].AsBsonValue.ToString());
    }

    if (banners.Length == 0)
    {
        return;
    }

    MongodbSaver.SaveMainpageNewsList(banners);

    foreach (var banner in banners)
    {
        Console.WriteLine(banner["link"].AsBsonValue + " " + banner["img"].AsBsonValue);
    }
}

/// <summary>
/// Returns the part of <paramref name="style"/> that starts at its first '/' and ends
/// just before its last ')', or null when the value does not contain such a span.
/// </summary>
private static string ExtractBannerImagePath(string style)
{
    if (string.IsNullOrEmpty(style))
    {
        return null;
    }

    var start = style.IndexOf("/", StringComparison.Ordinal);
    var end = style.LastIndexOf(")", StringComparison.Ordinal);
    if (start < 0 || end <= start)
    {
        return null;
    }

    return style.Substring(start, end - start);
}
/// <summary>
/// Consumes project links from <paramref name="urlSource"/> until it completes,
/// generating and saving a detail document for every link that is not stored yet.
/// </summary>
/// <param name="urlSource">Source block that supplies the project links.</param>
/// <returns>
/// <c>DUPLICATED_NEWS</c> when ten already-stored links have been counted and a further
/// link is available; otherwise <c>PROCESS_SUCCESS</c> once the source is exhausted.
/// </returns>
public static async Task<int> GenerateProjectDetailPage(ISourceBlock<string> urlSource)
{
    var duplicateCount = 0;

    while (await urlSource.OutputAvailableAsync())
    {
        // The limit is evaluated before the next link is taken from the source.
        if (duplicateCount == 10)
        {
            return DUPLICATED_NEWS;
        }

        var projectUrl = urlSource.Receive();

        if (!MongodbChecker.CheckHeritageProjectDetailExist(projectUrl))
        {
            var detailDocument = GenerateHeritageProjectDetailPage(projectUrl, true);
            if (detailDocument != null)
            {
                MongodbSaver.SaveHeritageProjectDetail(detailDocument);
            }
        }
        else
        {
            duplicateCount++;
        }
    }

    return PROCESS_SUCCESS;
}
/// <summary>
/// Walks the paged project list at <c>REQUEST_URL</c>, stores every project that is not
/// yet known and feeds its detail link to
/// <c>GetHeritageProjectDetailWorker.GenerateProjectDetailPage</c>, then waits for that
/// worker to finish.
/// </summary>
/// <param name="imageTargetBlock">Not used by this method; kept so the signature stays unchanged.</param>
private static void GetAllProjectList(BufferBlock<string> imageTargetBlock)
{
    short errorTime = 0;
    var totalPages = DebugHelper.DebugHelperTools.IsDebugMode() ? 2 : 10;
    var block = new BufferBlock<string>();
    var task = GetHeritageProjectDetailWorker.GenerateProjectDetailPage(block);

    // The property set is the same for every item, so it is looked up once.
    var properties = typeof(HeritageProject).GetProperties();

    try
    {
        // NOTE(review): the loop stops at totalPages - 1; confirm whether the last page
        // reported by Total_pages is meant to be skipped.
        for (int i = 1; i < totalPages; i++)
        {
            if (errorTime > 10)
            {
                Console.WriteLine("GetAllProjectList: reach the limitation of error time");
                break;
            }

            var currentPage = String.Format(REQUEST_URL, i);
            Console.WriteLine("Starting process: " + currentPage);

            var requestResult = WebpageHelper.GetRequest(currentPage);
            if (string.IsNullOrEmpty(requestResult))
            {
                errorTime++;
                continue;
            }

            var jsonObject = JsonConvert.DeserializeObject<HeritageProjectRequest>(requestResult);
            if (jsonObject == null)
            {
                // A response that deserialises to nothing counts as a failed request.
                errorTime++;
                continue;
            }

            if (jsonObject.Links != null
                && jsonObject.Links.Total_pages != 0
                && jsonObject.Links.Total_pages != totalPages
                && !DebugHelper.DebugHelperTools.IsDebugMode())
            {
                totalPages = jsonObject.Links.Total_pages;
            }

            var list = jsonObject.List;
            if (list == null || list.Length == 0)
            {
                continue;
            }

            var bsonArray = new List<BsonDocument>();
            for (int j = 0; j < list.Length; j++)
            {
                var project = list[j];
                project.Link = "/project_details/" + project.Id;

                // Check for duplicates before building the document so no reflection
                // work is spent on entries that are discarded anyway.
                if (MongodbChecker.CheckHeritageProjectExist(project.Link))
                {
                    Console.WriteLine("Duplicated Heritage Project Link: {0}", project.Link);
                    errorTime++;
                    continue;
                }

                var bsonDocument = new BsonDocument();
                foreach (var property in properties)
                {
                    // Every HeritageProject property is stored under its lower-cased
                    // name (culture-invariant, so keys do not depend on the machine
                    // locale); markup tags are stripped and null becomes "".
                    var rawValue = property.GetValue(project);
                    var text = rawValue == null ? string.Empty : rawValue.ToString();
                    bsonDocument.Add(property.Name.ToLowerInvariant(),
                                     Regex.Replace(text, "<.*?>", string.Empty));
                }

                block.Post(project.Link);
                bsonArray.Add(bsonDocument);
            }

            if (bsonArray.Count != 0)
            {
                MongodbSaver.SaveHeritageProjectNewsList(bsonArray);
            }

            if (jsonObject.More != 1)
            {
                Console.WriteLine("GetAllProjectList: current page is {0}, more equals 0", i);
                break;
            }
        }
    }
    finally
    {
        // Always signal completion, even when the loop throws; otherwise the detail
        // worker would wait for further links forever.
        block.Complete();
    }

    task.Wait();
}
/// <summary>
/// Walks the paged news list (category 9) at <c>GetIhChina.NewsListUrl</c>, stores every
/// list entry that is not yet known, posts its link to the news-detail worker and its
/// image to <paramref name="imageTargetBlock"/>, then awaits the detail worker.
/// </summary>
/// <param name="imageSaverTask">Not used by this method; kept so the signature stays unchanged.</param>
/// <param name="imageTargetBlock">Block that receives the image URLs found in the list.</param>
/// <returns>The result of the task returned by <c>GetNewsDetail.GenerateNewsDetail</c>.</returns>
public static async Task<int> GetNewsList(Task<int> imageSaverTask, BufferBlock<string> imageTargetBlock)
{
    const int batchSize = 10;

    BufferBlock<string> newsDetailTargetBlock = new BufferBlock<string>();
    int errorTime = 0;
    var newsDetailPageGenerate = GetNewsDetail.GenerateNewsDetail(newsDetailTargetBlock, imageTargetBlock);
    int pageNumber = DebugHelperTools.IsDebugMode() ? 2 : 255;

    try
    {
        for (int page = 1; page < pageNumber; page++)
        {
            if (errorTime > 10)
            {
                Console.WriteLine("reach the limitation of error time");
                break;
            }

            Console.WriteLine("starting process {0} page", page);
            string pageURL = String.Format("{0}?category_id=9&page={1}&limit=0", GetIhChina.NewsListUrl, page);

            var result = WebpageHelper.GetRequest(pageURL);
            if (String.IsNullOrEmpty(result))
            {
                // Nothing to deserialise; count it as a failed request and move on.
                errorTime++;
                continue;
            }

            var jsonObject = JsonConvert.DeserializeObject<NewsListResponse>(result);
            if (jsonObject == null)
            {
                errorTime++;
                continue;
            }

            if (String.IsNullOrEmpty(jsonObject.data))
            {
                if (jsonObject.more != 1)
                {
                    Console.WriteLine("GetNewsList: There is no more here, break from the loop");
                    break;
                }

                // More pages are announced but this one carries no markup to parse.
                Console.WriteLine("GetNewsList: page {0} returned no data, skipping", page);
                errorTime++;
                continue;
            }

            var doc = new HtmlDocument();
            doc.LoadHtml(jsonObject.data);

            var newsListNodes = from links in doc.DocumentNode.Descendants()
                                where links.Name == "div"
                                      && links.Attributes["class"] != null
                                      && links.Attributes["class"].Value == "list-item"
                                select links;

            var newslistBsons = new List<BsonDocument>();
            foreach (var node in newsListNodes)
            {
                var newsBson = new BsonDocument();

                var titleNode = node.SelectSingleNode(".//div[@class='h16']/a");
                if (titleNode != null)
                {
                    var link = titleNode.Attributes["href"].Value;
                    if (MongodbChecker.CheckNewsExist(link))
                    {
                        Console.WriteLine("duplicated url: page {0}", link);
                        errorTime++;
                        continue;
                    }

                    newsDetailTargetBlock.Post(link);
                    newsBson.Add("link", link);
                    newsBson.Add("title", titleNode.Attributes["title"].Value);
                }

                var imgNode = node.SelectSingleNode(".//img");
                if (imgNode != null)
                {
                    var imgUrl = WebpageHelper.GetSubUrl(imgNode.Attributes["src"].Value);
                    newsBson.Add("img", imgUrl);
                    newsBson.Add("compressImg", WebImageSaver.Instance.GetComressImageName(imgUrl));
                    imageTargetBlock.Post(GetIhChina.MainPage + imgNode.Attributes["src"].Value);
                }

                var dataNode = node.SelectSingleNode(".//div[@class='date']/div");
                if (dataNode != null)
                {
                    newsBson.Add("date", dataNode.InnerText);
                }

                var contentNode = node.SelectSingleNode(".//div[@class='p']");
                if (contentNode != null)
                {
                    newsBson.Add("content", contentNode.InnerText);
                }

                if (newsBson.ElementCount != 0)
                {
                    newslistBsons.Add(newsBson);
                }

                // Insert into the database every 10 entries to keep memory usage low.
                if (newslistBsons.Count == batchSize)
                {
                    MongodbSaver.SaveNewsList(newslistBsons);
                    newslistBsons.Clear();
                }
            }

            if (newslistBsons.Count != 0)
            {
                MongodbSaver.SaveNewsList(newslistBsons);
            }
        }
    }
    finally
    {
        // Always signal completion, even when the loop throws; otherwise the detail
        // worker would wait for further links forever.
        newsDetailTargetBlock.Complete();
    }

    return await newsDetailPageGenerate;
}