示例#1
0
        /// <summary>
        /// Fetches one listing page, walks the rows of its <c>CommonListArea</c> table and, for each row,
        /// either refreshes the links of an already-recorded resource or saves a new blog entry.
        /// </summary>
        /// <param name="url">Address of the listing page; passed to <c>GetGeneralContent</c>.</param>
        /// <param name="category">Category forwarded to <c>SaveRecomment</c> for newly saved blogs.</param>
        /// <remarks>
        /// An exception raised while handling a single row is logged together with that row's detail URL
        /// and does not stop the remaining rows from being processed.
        /// </remarks>
        private static void GetListPage(string url, string category)
        {
            string html = GetGeneralContent(url);

            // Treat null like empty: there is nothing to parse in either case.
            if (string.IsNullOrEmpty(html))
            {
                return;
            }

            var document = new HtmlDocument();
            document.LoadHtml(html);

            HtmlNode listTable = document.DocumentNode.SelectSingleNode("//table[@class='CommonListArea']");
            if (listTable == null)
            {
                return;
            }

            int rowIndex = 0;

            foreach (HtmlNode row in listTable.Elements("tr"))
            {
                // Declared per row (outside the try) so the catch block reports this row's URL,
                // never a stale value left over from a previous row.
                string detailUrl = "";

                try
                {
                    // The first two rows are skipped (presumably header rows; confirm against the page markup).
                    if (++rowIndex <= 2)
                    {
                        continue;
                    }

                    var cells = row.Elements("td").ToArray();
                    if (cells.Length < 4)
                    {
                        continue;
                    }

                    HtmlNode titleCell = cells[0];
                    string   title     = titleCell.InnerText;
                    if (string.IsNullOrEmpty(title))
                    {
                        continue;
                    }

                    // Titles are capped at 200 characters before being stored on the blog.
                    var blog = new Blog();
                    blog.Title = title.Length > 200 ? title.Substring(0, 200) : title;
                    Trace(title);

                    // The detail link is the second <a> directly inside the title cell.
                    HtmlNode link = titleCell.Elements("a").Skip(1).FirstOrDefault();
                    if (link == null)
                    {
                        continue;
                    }

                    detailUrl = link.GetAttributeValue("href", "");
                    if (detailUrl == "")
                    {
                        continue;
                    }
                    Trace(detailUrl);

                    string resourceId = GetResourceId(detailUrl);
                    if (CrawlerUtility.ExistRecord(resourceId, ResourceType.Ed2000))
                    {
                        _existResource++;
                        string distributeDate = cells[1].InnerText;
                        string updateTime     = cells[2].InnerText;

                        // When the update-time cell contains the text of the distribute-date cell,
                        // the row is treated as unchanged; otherwise its links are re-synchronised.
                        if (updateTime.Contains(distributeDate))
                        {
                            Info("Resource Existed,resourceId:{0}", resourceId);
                        }
                        else
                        {
                            _updateResource++;
                            int blogId = CrawlerUtility.ExistContinutedRecord(resourceId, ResourceType.Ed2000);
                            Info("resourceId:{0} exist,to be updated blogId:{1}", resourceId, blogId);
                            if (AppendBlogLinks(resourceId, blogId))
                            {
                                Info("Blog Updated:" + blogId);
                            }
                            else
                            {
                                // Report the id of the blog being updated; the local 'blog' is a new,
                                // unsaved object whose Id says nothing about the failing record.
                                Info("Blog Sync Fail,resourceId:{0},blogId:{1}", resourceId, blogId);
                            }
                            _fetchCount++;
                        }
                        continue;
                    }

                    string imageUrl = "";
                    var    urlList  = GetIntroDetail("http://www.ed2000.com" + detailUrl, blog, out imageUrl);
                    if (urlList == null || urlList.Count == 0)
                    {
                        continue;
                    }

                    if (!SaveBlog(blog, urlList))
                    {
                        Info("Blog Sync Fail,detailUrl:{0},blogId:{1}", detailUrl, blog.Id);
                        continue;
                    }

                    // A failed recommendation is only logged; the resource record is still added below.
                    if (!SaveRecomment(blog, imageUrl, category))
                    {
                        Info("Recomment Sync Fail,blogId:{0},imageUrl:{1}", blog.Id, imageUrl);
                    }

                    CrawlerUtility.AddResourceRecord(blog.Id, resourceId, ResourceType.Ed2000, true);
                    Info("Blog Added,blogId:{0},resourceId:{1}", blog.Id, resourceId);
                    _fetchCount++;
                }
                catch (Exception ex)
                {
                    Logger.Error(ex, detailUrl);
                }
            }
        }
示例#2
0
        /// <summary>
        /// Fetches one listing page, walks the <c>li</c> items of the <c>resource-showlist</c> list and,
        /// for each item, either appends links to an already-recorded resource or saves a new blog entry.
        /// </summary>
        /// <param name="url">Address of the listing page; passed to <c>GetGeneralContent</c>.</param>
        /// <remarks>
        /// Items whose markup does not have the expected shape are skipped. An exception raised while
        /// handling a single item is logged and does not stop the remaining items from being processed.
        /// </remarks>
        private static void GetListPage(string url)
        {
            string html = GetGeneralContent(url);

            // Treat null like empty: there is nothing to parse in either case.
            if (string.IsNullOrEmpty(html))
            {
                return;
            }

            var document = new HtmlDocument();
            document.LoadHtml(html);

            HtmlNode listNode = document.DocumentNode.SelectSingleNode("//div[@class='resource-showlist']//ul");
            if (listNode == null)
            {
                return;
            }

            foreach (HtmlNode item in listNode.Elements("li"))
            {
                try
                {
                    // The info block is the second <div> of the item; skip items that do not have one
                    // instead of failing on the index.
                    var divs = item.Elements("div").ToArray();
                    if (divs.Length < 2)
                    {
                        continue;
                    }
                    HtmlNode infoNode = divs[1];

                    // Walk dl > dt step by step so a missing intermediate element skips the item
                    // rather than raising a NullReferenceException.
                    HtmlNode dlNode = infoNode.Element("dl");
                    HtmlNode dtNode = dlNode == null ? null : dlNode.Element("dt");
                    if (dtNode == null)
                    {
                        continue;
                    }

                    HtmlNode strongNode = dtNode.Element("strong");
                    HtmlNode detailNode = strongNode == null ? null : strongNode.Element("a");
                    HtmlNode statusNode = dtNode.Element("font");
                    if (detailNode == null)
                    {
                        continue;
                    }

                    string title     = detailNode.InnerText;
                    string detailUrl = detailNode.GetAttributeValue("href", "");
                    if (detailUrl == "")
                    {
                        continue;
                    }
                    Console.WriteLine(title);

                    string statusText = "";
                    if (statusNode != null)
                    {
                        statusText = statusNode.InnerText;
                        Console.WriteLine(statusText);
                    }

                    // "[尚未开播]" = "not yet aired": such items are skipped.
                    if (statusText.Contains("[尚未开播]"))
                    {
                        continue;
                    }

                    // "连载中]" = "ongoing", "季完结]" = "season finished": both mark the record as continued.
                    bool isContinued = statusText.Contains("连载中]") || statusText.Contains("季完结]");

                    Console.WriteLine(detailUrl);
                    string resourceId = GetResourceId(detailUrl);
                    if (CrawlerUtility.ExistRecord(resourceId, ResourceType.Zimuzu))
                    {
                        int blogId = CrawlerUtility.ExistContinutedRecord(resourceId, ResourceType.Zimuzu);
                        if (blogId > 0)
                        {
                            Console.WriteLine("resourceId:{0} exist,to be updated blogId:{1}", resourceId, blogId);
                            if (AppendBlogLinks(resourceId, blogId))
                            {
                                Console.WriteLine("Blog Updated:" + blogId);
                            }
                            else
                            {
                                Console.WriteLine("Blog Sync Fail,resourceId:{0},blogId:{1}", resourceId, blogId);
                            }

                            // Once the item no longer carries a "continued" status, mark its record as over.
                            if (!isContinued)
                            {
                                CrawlerUtility.UpdateRecordOver(blogId);
                            }
                        }
                        else
                        {
                            Console.WriteLine("resourceId:{0} exist", resourceId);
                        }
                        continue;
                    }

                    // Titles are capped at 200 characters before being stored on the blog.
                    var blog = new Blog {
                        Title = title.Length > 200 ? title.Substring(0, 200) : title
                    };
                    string blogImgUrl  = "";
                    string coverImgUrl = "";
                    GetBlogContent(detailUrl, blog, out blogImgUrl, out coverImgUrl);

                    // The link list address is derived from the detail URL by textual replacement.
                    string      linkUrl  = detailUrl.Replace("resource", "resource/list");
                    List <Link> linkList = GetBlogLink(linkUrl);
                    if (string.IsNullOrEmpty(blog.Content) || linkList == null || linkList.Count == 0)
                    {
                        continue;
                    }

                    ImageUrl imageUrl = DownloadBlogImgToLocal(blogImgUrl);
                    if (!SaveBlog(blog, imageUrl, linkList))
                    {
                        Console.WriteLine("Blog Sync Fail,detailUrl:{0},blogId:{1}", detailUrl, blog.Id);
                        continue;
                    }

                    // A failed recommendation is only logged; the resource record is still added below.
                    if (!SaveRecomment(blog, coverImgUrl))
                    {
                        // Log the cover image URL that was actually handed to SaveRecomment,
                        // not the ImageUrl object used for the blog itself.
                        Console.WriteLine("Recomment Sync Fail,blogId:{0},imageUrl:{1}", blog.Id, coverImgUrl);
                    }

                    CrawlerUtility.AddResourceRecord(blog.Id, resourceId, ResourceType.Zimuzu, isContinued);
                    Console.WriteLine("Blog Added:" + blog.Id);
                }
                catch (Exception ex)
                {
                    Logger.Error(ex);
                }
            }
        }