/// <summary>
/// Downloads one Ed2000 list page and processes every resource row of the
/// <c>CommonListArea</c> table: rows whose resource is already recorded are
/// refreshed when their update column differs from their publish column, and
/// unseen resources are fetched, saved as blogs and recorded.
/// </summary>
/// <param name="url">Address of the list page to download.</param>
/// <param name="category">Category handed to <c>SaveRecomment</c> for newly saved blogs.</param>
private static void GetListPage(string url, string category)
{
    string html = GetGeneralContent(url);
    // Treat a null page the same as an empty one; LoadHtml would throw on null.
    if (string.IsNullOrEmpty(html))
    {
        return;
    }

    HtmlDocument document = new HtmlDocument();
    document.LoadHtml(html);
    HtmlNode tableNode = document.DocumentNode.SelectSingleNode("//table[@class='CommonListArea']");
    if (tableNode == null)
    {
        return;
    }

    int rowIndex = 0;
    foreach (HtmlNode rowNode in tableNode.Elements("tr"))
    {
        // Declared per row so the catch block never reports the URL of a previous row.
        string detailUrl = "";
        try
        {
            // The first two rows are skipped (presumably header rows - confirm against the page markup).
            if (++rowIndex <= 2)
            {
                continue;
            }

            var tdArray = rowNode.Elements("td").ToArray();
            if (tdArray.Length < 4)
            {
                continue;
            }

            var blog = new Blog();
            var titleNode = tdArray[0];
            var title = titleNode.InnerText;
            if (title == "")
            {
                continue;
            }

            // Stored titles are capped at 200 characters; the full title is still traced.
            blog.Title = title.Length > 200 ? title.Substring(0, 200) : title;
            Trace(title);

            // The detail link is the second anchor inside the title cell.
            var link = titleNode.Elements("a").Skip(1).FirstOrDefault();
            if (link == null)
            {
                continue;
            }

            detailUrl = link.GetAttributeValue("href", "");
            if (detailUrl == "")
            {
                continue;
            }

            Trace(detailUrl);
            string resourceId = GetResourceId(detailUrl);

            if (CrawlerUtility.ExistRecord(resourceId, ResourceType.Ed2000))
            {
                _existResource++;
                var distributeDate = tdArray[1].InnerText;
                var updateTime = tdArray[2].InnerText;

                // When the update column still contains the publish date text,
                // the resource is treated as unchanged since it was published.
                if (updateTime.Contains(distributeDate))
                {
                    Info("Resource Existed,resourceId:{0}", resourceId);
                }
                else
                {
                    _updateResource++;
                    int blogId = CrawlerUtility.ExistContinutedRecord(resourceId, ResourceType.Ed2000);
                    Info("resourceId:{0} exist,to be updated blogId:{1}", resourceId, blogId);
                    if (AppendBlogLinks(resourceId, blogId))
                    {
                        Info("Blog Updated:" + blogId);
                    }
                    else
                    {
                        // Report the id of the existing blog; the local 'blog' has not been saved.
                        Info("Blog Sync Fail,resourceId:{0},blogId:{1}", resourceId, blogId);
                    }

                    _fetchCount++;
                }

                continue;
            }

            // NOTE(review): assumes detailUrl is site-relative (starts with '/') - confirm.
            string imageUrl = "";
            var urlList = GetIntroDetail("http://www.ed2000.com" + detailUrl, blog, out imageUrl);
            if (urlList.Count == 0)
            {
                continue;
            }

            if (!SaveBlog(blog, urlList))
            {
                Info("Blog Sync Fail,detailUrl:{0},blogId:{1}", detailUrl, blog.Id);
                continue;
            }

            // A failed recommendation is only logged; the blog itself is still recorded below.
            if (!SaveRecomment(blog, imageUrl, category))
            {
                Info("Recomment Sync Fail,blogId:{0},imageUrl:{1}", blog.Id, imageUrl);
            }

            CrawlerUtility.AddResourceRecord(blog.Id, resourceId, ResourceType.Ed2000, true);
            Info("Blog Added,blogId:{0},resourceId:{1}", blog.Id, resourceId);
            _fetchCount++;
        }
        catch (Exception ex)
        {
            // One broken row must not abort the rest of the page.
            Logger.Error(ex, detailUrl);
        }
    }
}
/// <summary>
/// Downloads one Zimuzu list page and processes every <c>li</c> entry of the
/// <c>resource-showlist</c> list: resources already recorded get their links
/// appended (and are marked finished when no longer flagged as ongoing), and
/// unseen resources are fetched, saved as blogs and recorded.
/// </summary>
/// <param name="url">Address of the list page to download.</param>
private static void GetListPage(string url)
{
    string html = GetGeneralContent(url);
    // Treat a null page the same as an empty one; LoadHtml would throw on null.
    if (string.IsNullOrEmpty(html))
    {
        return;
    }

    HtmlDocument document = new HtmlDocument();
    document.LoadHtml(html);
    HtmlNode listNode = document.DocumentNode.SelectSingleNode("//div[@class='resource-showlist']//ul");
    if (listNode == null)
    {
        return;
    }

    foreach (HtmlNode itemNode in listNode.Elements("li"))
    {
        try
        {
            bool continued = false;
            string statusText = "";

            // The second <div> of each item holds the title and status markup.
            var nodeArray = itemNode.Elements("div").ToArray();
            if (nodeArray.Length < 2)
            {
                continue;
            }

            var infoNode = nodeArray[1];

            // Walk dl > dt step by step; Element() returns null for a missing child,
            // so every hop is checked instead of chaining the calls.
            HtmlNode dtNode = infoNode.Element("dl");
            if (dtNode != null)
            {
                dtNode = dtNode.Element("dt");
            }

            if (dtNode == null)
            {
                continue;
            }

            HtmlNode detailNode = dtNode.Element("strong");
            if (detailNode != null)
            {
                detailNode = detailNode.Element("a");
            }

            if (detailNode == null)
            {
                continue;
            }

            HtmlNode statusNode = dtNode.Element("font");

            string title = detailNode.InnerText;
            string detailUrl = detailNode.GetAttributeValue("href", "");
            if (detailUrl == "")
            {
                continue;
            }

            Console.WriteLine(title);
            if (statusNode != null)
            {
                statusText = statusNode.InnerText;
                Console.WriteLine(statusText);
            }

            // "[尚未开播]" = not yet aired: nothing to crawl for this entry.
            if (statusText.Contains("[尚未开播]"))
            {
                continue;
            }

            // "连载中]" = still airing, "季完结]" = season finished; both keep the
            // record flagged as continued so later runs append new links.
            if (statusText.Contains("连载中]") || statusText.Contains("季完结]"))
            {
                continued = true;
            }

            Console.WriteLine(detailUrl);
            string resourceId = GetResourceId(detailUrl);

            if (CrawlerUtility.ExistRecord(resourceId, ResourceType.Zimuzu))
            {
                int blogId = CrawlerUtility.ExistContinutedRecord(resourceId, ResourceType.Zimuzu);
                if (blogId > 0)
                {
                    Console.WriteLine("resourceId:{0} exist,to be updated blogId:{1}", resourceId, blogId);
                    if (AppendBlogLinks(resourceId, blogId))
                    {
                        Console.WriteLine("Blog Updated:" + blogId);
                    }
                    else
                    {
                        Console.WriteLine("Blog Sync Fail,resourceId:{0},blogId:{1}", resourceId, blogId);
                    }

                    // The entry is no longer flagged as ongoing: mark the record as finished.
                    if (!continued)
                    {
                        CrawlerUtility.UpdateRecordOver(blogId);
                    }
                }
                else
                {
                    Console.WriteLine("resourceId:{0} exist", resourceId);
                }

                continue;
            }

            var blog = new Blog
            {
                // Stored titles are capped at 200 characters.
                Title = title.Length > 200 ? title.Substring(0, 200) : title
            };

            string blogImgUrl = "";
            string coverImgUrl = "";
            GetBlogContent(detailUrl, blog, out blogImgUrl, out coverImgUrl);

            // NOTE(review): Replace rewrites every "resource" occurrence in the URL,
            // not only the path segment - confirm the detail URLs never repeat it.
            string linkUrl = detailUrl.Replace("resource", "resource/list");
            List<Link> linkList = GetBlogLink(linkUrl);
            if (string.IsNullOrEmpty(blog.Content) || linkList.Count == 0)
            {
                continue;
            }

            ImageUrl imageUrl = DownloadBlogImgToLocal(blogImgUrl);
            if (!SaveBlog(blog, imageUrl, linkList))
            {
                Console.WriteLine("Blog Sync Fail,detailUrl:{0},blogId:{1}", detailUrl, blog.Id);
                continue;
            }

            // A failed recommendation is only logged; the blog itself is still recorded below.
            if (!SaveRecomment(blog, coverImgUrl))
            {
                // Log the cover URL that was actually passed to SaveRecomment.
                Console.WriteLine("Recomment Sync Fail,blogId:{0},imageUrl:{1}", blog.Id, coverImgUrl);
            }

            CrawlerUtility.AddResourceRecord(blog.Id, resourceId, ResourceType.Zimuzu, continued);
            Console.WriteLine("Blog Added:" + blog.Id);
        }
        catch (Exception ex)
        {
            // One broken entry must not abort the rest of the page.
            Logger.Error(ex);
        }
    }
}