/// <summary>
/// Downloads <paramref name="pageUrl"/> and returns the largest number shown in its pager list.
/// </summary>
/// <param name="pageUrl">URL of the list page whose pager is inspected.</param>
/// <returns>
/// The highest integer parsed from the <c>li</c> items matched by
/// <c>//div[@class='page-mod']/ul/li</c>; 100 when no such items are found;
/// 0 when none of the items parses to a value greater than zero.
/// </returns>
public static int GetPageLastIndex(string pageUrl)
{
    var document = WebpageHelper.GetHttpRequestDocument(pageUrl);
    var pagerItems = document.DocumentNode.SelectNodes("//div[@class='page-mod']/ul/li");
    if (pagerItems == null)
    {
        // No pager markup on the page: fall back to a fixed value.
        return 100;
    }

    int highest = 0;
    foreach (var item in pagerItems)
    {
        int parsed;
        if (!int.TryParse(item.InnerText, out parsed))
        {
            // Non-numeric pager entries are ignored.
            continue;
        }

        if (parsed > highest)
        {
            highest = parsed;
        }
    }

    return highest;
}
/// <summary>
/// Crawls the forum list pages, saves the entries that the analyzer returns in batches,
/// and posts each entry's link to the block consumed by the forum detail generator.
/// </summary>
/// <param name="imageTargetBlock">
/// Block passed on to the list-item analyzer and to the forum detail generator.
/// </param>
/// <remarks>
/// Crawling stops once 10 errors have been counted. A page without list items and an
/// item for which the analyzer returns null each count as one error.
/// The method blocks until the detail generator task has finished.
/// </remarks>
public static void GetForumsList(BufferBlock<string> imageTargetBlock)
{
    const int maxErrors = 10;
    const int batchSize = 10;

    var block = new BufferBlock<string>();
    var task = GetNewsDetail.GenerateForumDetail(block, imageTargetBlock);
    int errorTime = 0;
    var firstPage = "http://www.ihchina.cn/luntan/p/1.html";
    var lastPageNumber = DebugHelperTools.IsDebugMode() ? 2 : WebpageHelper.GetPageLastIndex(firstPage);

    try
    {
        // lastPageNumber is the number of the last page, so the bound is inclusive;
        // an exclusive bound would never crawl the final page.
        for (int i = 1; i <= lastPageNumber && errorTime < maxErrors; i++)
        {
            var listUrl = string.Format("http://www.ihchina.cn/luntan/p/{0}.html", i);
            Console.WriteLine("starting process page:{0}", listUrl);
            var doc = WebpageHelper.GetHttpRequestDocument(listUrl);
            var listNodes = doc.DocumentNode.SelectNodes("//div[@id='datalist']/div[@class='list-item']");
            if (listNodes == null)
            {
                errorTime++;
                continue;
            }

            List<BsonDocument> result = new List<BsonDocument>();
            foreach (var node in listNodes)
            {
                if (errorTime == maxErrors)
                {
                    break;
                }

                var bson = WebpageHelper.AnalizeGeneralListInformation(node, MongodbChecker.CheckForumsListExist, imageTargetBlock);
                if (bson == null)
                {
                    errorTime++;
                    Console.WriteLine("duplicated url: page {0}", i);
                    continue;
                }

                var link = bson.GetElement("link").Value.ToString();
                block.Post(link);
                result.Add(bson);

                // Insert into the database every 10 items to reduce memory pressure.
                if (result.Count == batchSize)
                {
                    MongodbSaver.SaveForumsList(result);
                    result.Clear();
                }
            }

            // Flush whatever is left of the last, partially filled batch of this page.
            if (result.Count > 0)
            {
                MongodbSaver.SaveForumsList(result);
            }
        }
    }
    finally
    {
        // Always signal completion so the detail generator can finish even when
        // crawling aborts with an exception.
        block.Complete();
    }

    task.Wait();
}
/// <summary>
/// Receives image URLs from <paramref name="source"/> until it has no more output and
/// downloads each image into the "img" directory under the current working directory.
/// </summary>
/// <param name="source">Block that supplies image URLs, either absolute or relative to <c>GetIhChina.MainPage</c>.</param>
/// <returns>Always 1, once <paramref name="source"/> reports that no more output is available.</returns>
/// <remarks>
/// Images whose target file already exists are skipped. Download and post-processing
/// failures are written to the console and do not stop the loop.
/// </remarks>
public async Task<int> SaveFileAsync(ISourceBlock<string> source)
{
    while (await source.OutputAvailableAsync())
    {
        // CreateDirectory does nothing when the directory already exists.
        Directory.CreateDirectory("img");

        var imageUrl = source.Receive();
        // URLs are compared ordinally; culture rules have no meaning for them.
        if (!imageUrl.StartsWith(GetIhChina.MainPage, StringComparison.Ordinal))
        {
            imageUrl = GetIhChina.MainPage + imageUrl;
        }

        var savePath = Path.Combine(Directory.GetCurrentDirectory(), "img", WebpageHelper.GetSubUrl(imageUrl));
        if (File.Exists(savePath))
        {
            continue;
        }

        try
        {
            Console.WriteLine("Starting to save image {0}", imageUrl);

            // WebClient is IDisposable: create it only when a download is actually
            // needed and release it as soon as the download is done.
            using (var wc = new WebClient())
            {
                wc.DownloadFile(imageUrl, savePath);
            }

            // CompressImageWithGuetzli(savePath); -- compression seems to be broken, pending a fix
            CorpImage(savePath);
            Console.WriteLine("Save image {0} completely", imageUrl);
        }
        catch (WebException e)
        {
            Console.WriteLine("Save image {0} error", imageUrl);
            Console.WriteLine(e);
        }
        catch (Exception e)
        {
            Console.WriteLine("unhandled error");
            Console.WriteLine(e);
        }
    }

    return 1;
}
/// <summary>
/// Reads the complete content of the cache file that corresponds to <paramref name="url"/>.
/// </summary>
/// <param name="url">URL whose cache file name is obtained from <c>WebpageHelper.GetSubUrl</c>.</param>
/// <returns>The full text of that file inside the "cache" directory under the current working directory.</returns>
public static string GetSimpleRequestResult(string url)
{
    var cacheDirectory = Path.Combine(Directory.GetCurrentDirectory(), "cache");
    var cachePath = Path.Combine(cacheDirectory, WebpageHelper.GetSubUrl(url));

    using (var reader = new StreamReader(cachePath))
    {
        return reader.ReadToEnd();
    }
}
/// <summary>
/// Builds the local path under which the image at <paramref name="imageURL"/> is stored.
/// </summary>
/// <param name="imageURL">Image URL; its file name part is obtained from <c>WebpageHelper.GetSubUrl</c>.</param>
/// <returns>A path inside the "img" directory under the current working directory.</returns>
public static string GetCacheImageName(string imageURL)
{
    var imageDirectory = Path.Combine(Directory.GetCurrentDirectory(), "img");
    var fileName = WebpageHelper.GetSubUrl(imageURL);
    return Path.Combine(imageDirectory, fileName);
}
/// <summary>
/// Builds the cache file path for <paramref name="url"/>, creating the "cache" directory
/// under the current working directory when it does not exist yet.
/// </summary>
/// <param name="url">URL whose cache file name is obtained from <c>WebpageHelper.GetSubUrl</c>.</param>
/// <returns>The path of the cache file inside the "cache" directory.</returns>
public static string GetCacheFileName(string url)
{
    var cacheDirectory = Path.Combine(Directory.GetCurrentDirectory(), "cache");
    if (!Directory.Exists(cacheDirectory))
    {
        Directory.CreateDirectory(cacheDirectory);
    }

    var fileName = WebpageHelper.GetSubUrl(url);
    return Path.Combine(cacheDirectory, fileName);
}
/// <summary>
/// Tells whether a cache file for <paramref name="url"/> is present.
/// </summary>
/// <param name="url">URL whose cache file name is obtained from <c>WebpageHelper.GetSubUrl</c>.</param>
/// <returns>
/// <c>true</c> when the file exists inside the "cache" directory under the current
/// working directory; otherwise <c>false</c>.
/// </returns>
public static bool CheckCacheFileExist(string url)
{
    var cacheDirectory = Path.Combine(Directory.GetCurrentDirectory(), "cache");
    var cachePath = Path.Combine(cacheDirectory, WebpageHelper.GetSubUrl(url));
    return File.Exists(cachePath);
}
/// <summary>
/// Downloads a detail page and extracts its link, title, sub-title items, content
/// paragraphs, author and related-news list into a BSON document.
/// </summary>
/// <param name="url">URL of the detail page; stored under the "link" key.</param>
/// <param name="imageBlock">Block that receives the full URL of every image found in the content.</param>
/// <returns>
/// The filled document, or <c>null</c> when the title block, the sub-title items or the
/// content paragraphs are not found on the page.
/// </returns>
/// <remarks>
/// NOTE(review): <c>bson</c> is not declared in this method; it is cleared and refilled on
/// every call, so the returned instance appears to be shared between calls. Confirm that
/// callers consume it before the next call.
/// </remarks>
private static BsonDocument processDetailPage(string url, BufferBlock<string> imageBlock)
{
    Console.WriteLine("starting process: " + url + "....");
    var doc = WebpageHelper.GetHttpRequestDocument(url);
    var titleNode = doc.DocumentNode.SelectSingleNode("//div[@class='article-title']");
    if (titleNode == null)
    {
        return null;
    }

    bson.Clear();
    bson.Add("link", url);

    var titleNameNode = titleNode.SelectSingleNode(".//div[@class='h24']");
    if (titleNameNode != null)
    {
        // Normalize the text. TODO: normalization is not finished yet.
        bson.Add("title", titleNameNode.InnerText.Replace("\t", ""));
    }

    var titleSubItemsNodes = titleNode.SelectNodes(".//div[@class='sub']/span[@class='sub-item']");
    if (titleSubItemsNodes == null)
    {
        return null;
    }

    var subTitleList = new BsonArray();
    foreach (var subItemNode in titleSubItemsNodes)
    {
        // Dates need separate handling: they sit in a nested div.en. The node is
        // looked up once with a well-formed XPath and reused, instead of issuing a
        // second query (the previous second query was missing its closing bracket).
        var dateNode = subItemNode.SelectSingleNode(".//div[@class='en']");
        subTitleList.Add(dateNode != null ? dateNode.InnerText : subItemNode.InnerText);
    }
    bson.Add("subtitle", subTitleList);

    // Start reading the article content.
    var contentList = new BsonArray();
    var contentNodes = doc.DocumentNode.SelectNodes("//div[@class='article-cont']/p");
    if (contentNodes == null)
    {
        return null;
    }

    foreach (var node in contentNodes)
    {
        var lineDic = new BsonDocument();
        var picNode = node.SelectSingleNode(".//img");
        if (picNode != null)
        {
            // A paragraph containing an image is stored as an image entry and the
            // image is queued for download.
            var imgUrl = WebpageHelper.GetSubUrl(picNode.Attributes["src"].Value);
            lineDic["type"] = "img";
            lineDic["content"] = imgUrl;
            lineDic["compressImg"] = WebImageSaver.Instance.GetComressImageName(imgUrl);
            imageBlock.Post(GetIhChina.MainPage + picNode.Attributes["src"].Value);
        }
        else
        {
            lineDic["type"] = "text";
            lineDic["content"] = node.InnerText;
        }
        contentList.Add(lineDic);
    }
    bson.Add("content", contentList);

    var authorNode = doc.DocumentNode.SelectSingleNode("//div[@class='author']");
    if (authorNode != null)
    {
        bson.Add("author", authorNode.InnerText);
    }

    // Crawl the related-reading section at the bottom of the page.
    var relativeNews = doc.DocumentNode.SelectNodes("//div[@class='list-mod4']/a");
    if (relativeNews != null)
    {
        var relativeNewsList = new BsonArray();
        foreach (var node in relativeNews)
        {
            var newsDic = new BsonDocument();
            newsDic["link"] = node.Attributes["href"].Value;

            var date = node.SelectSingleNode(".//div[@class='date']");
            if (date != null)
            {
                newsDic["date"] = date.InnerText;
            }

            var title = node.SelectSingleNode(".//div[@class='p']");
            if (title != null)
            {
                newsDic["title"] = title.InnerText;
            }

            relativeNewsList.Add(newsDic);
        }

        if (relativeNewsList.Count > 0)
        {
            bson.Add("relativeNews", relativeNewsList);
        }
    }

    Console.WriteLine(bson);
    return bson;
}
/// <summary>
/// Crawls the news list pages, saves new entries in batches, posts each new entry's
/// link to the block consumed by the news detail generator and queues list images
/// for download.
/// </summary>
/// <param name="imageSaverTask">Not used by this method.</param>
/// <param name="imageTargetBlock">Block that receives the full URL of every list image; also passed to the detail generator.</param>
/// <returns>The result of the news detail generator task.</returns>
/// <remarks>
/// Pages 1 to 254 are requested (page 1 only in debug mode). The loop ends early when
/// more than 10 errors have been counted at the start of a page, or when a response has
/// <c>more != 1</c> and no data. Already stored links and empty responses count as errors.
/// </remarks>
public static async Task<int> GetNewsList(Task<int> imageSaverTask, BufferBlock<string> imageTargetBlock)
{
    BufferBlock<string> newsDetailTargetBlock = new BufferBlock<string>();
    int errorTime = 0;
    var newsDetailPageGenerate = GetNewsDetail.GenerateNewsDetail(newsDetailTargetBlock, imageTargetBlock);
    int pageNumber = DebugHelperTools.IsDebugMode() ? 2 : 255;

    try
    {
        for (int page = 1; page < pageNumber; page++)
        {
            if (errorTime > 10)
            {
                Console.WriteLine("reach the limitation of error time");
                break;
            }

            Console.WriteLine("starting process {0} page", page);
            string pageURL = String.Format("{0}?category_id=9&page={1}&limit=0", GetIhChina.NewsListUrl, page);
            var result = WebpageHelper.GetRequest(pageURL);
            var jsonObject = JsonConvert.DeserializeObject<NewsListResponse>(result);
            if (jsonObject == null)
            {
                // An empty or "null" payload deserializes to null: count the page as
                // failed instead of dereferencing it.
                errorTime++;
                Console.WriteLine("GetNewsList: empty response for page {0}", page);
                continue;
            }

            if (jsonObject.more != 1 && String.IsNullOrEmpty(jsonObject.data))
            {
                Console.WriteLine("GetNewsList: There is no more here, break from the loop");
                break;
            }

            if (String.IsNullOrEmpty(jsonObject.data))
            {
                // More pages are announced but this one has no markup. LoadHtml rejects
                // null, and an empty string yields no items, so skip to the next page.
                continue;
            }

            var doc = new HtmlDocument();
            doc.LoadHtml(jsonObject.data);
            var newsListNodes = from links in doc.DocumentNode.Descendants()
                                where links.Name == "div" &&
                                      links.Attributes["class"] != null &&
                                      links.Attributes["class"].Value == "list-item"
                                select links;

            var newslistBsons = new List<BsonDocument>();
            foreach (var node in newsListNodes)
            {
                var newsBson = new BsonDocument();

                var titleNode = node.SelectSingleNode(".//div[@class='h16']/a");
                if (titleNode != null)
                {
                    var link = titleNode.Attributes["href"].Value;
                    if (MongodbChecker.CheckNewsExist(link))
                    {
                        Console.WriteLine("duplicated url: page {0}", link);
                        errorTime++;
                        continue;
                    }

                    newsDetailTargetBlock.Post(link);
                    newsBson.Add("link", link);
                    newsBson.Add("title", titleNode.Attributes["title"].Value);
                }

                var imgNode = node.SelectSingleNode(".//img");
                if (imgNode != null)
                {
                    var imgUrl = WebpageHelper.GetSubUrl(imgNode.Attributes["src"].Value);
                    newsBson.Add("img", imgUrl);
                    newsBson.Add("compressImg", WebImageSaver.Instance.GetComressImageName(imgUrl));
                    imageTargetBlock.Post(GetIhChina.MainPage + imgNode.Attributes["src"].Value);
                }

                var dataNode = node.SelectSingleNode(".//div[@class='date']/div");
                if (dataNode != null)
                {
                    newsBson.Add("date", dataNode.InnerText);
                }

                var contentNode = node.SelectSingleNode(".//div[@class='p']");
                if (contentNode != null)
                {
                    newsBson.Add("content", contentNode.InnerText);
                }

                if (newsBson.Count() != 0)
                {
                    newslistBsons.Add(newsBson);
                }

                // Insert into the database every 10 items to reduce memory pressure.
                if (newslistBsons.Count == 10)
                {
                    MongodbSaver.SaveNewsList(newslistBsons);
                    newslistBsons.Clear();
                }
            }

            // Flush whatever is left of the last, partially filled batch of this page.
            if (newslistBsons.Count != 0)
            {
                MongodbSaver.SaveNewsList(newslistBsons);
            }
        }
    }
    finally
    {
        // Always signal completion so the detail generator can finish even when
        // crawling aborts with an exception.
        newsDetailTargetBlock.Complete();
    }

    return await newsDetailPageGenerate;
}