Example #1
0
        /// <summary>
        /// Crawls the category page at <c>WebDomain + cateUrl</c>, stores every top-level
        /// category and each of its sub-categories through <c>cateResposity</c>, and
        /// returns all of them.
        /// </summary>
        /// <param name="cateUrl">Path of the category page; it is appended to <c>WebDomain</c>.</param>
        /// <returns>
        /// The stored categories, each parent immediately followed by its children.
        /// Empty when the page cannot be loaded or contains no category rows.
        /// </returns>
        public static List <ArticleCategory> CrawCategories(string cateUrl)
        {
            List <ArticleCategory> categories = new List <ArticleCategory>();
            var rootnode = HtmlNoder.GetHtmlRoot(WebDomain + cateUrl);

            // Other callers of GetHtmlRoot in this file treat a null result as "page not available".
            if (rootnode == null)
            {
                return(categories);
            }

            // Every matching table row comes back in a single collection (null when nothing matches).
            HtmlNodeCollection cateHtml = rootnode.SelectNodes("//table[@class='catetable']/tr");
            if (cateHtml == null)
            {
                return(categories);
            }

            foreach (var p1 in cateHtml)
            {
                var catePNode = p1.SelectSingleNode("td/div/a[@class='f1']");
                if (catePNode == null)
                {
                    // Row without a top-level category link: nothing to build a category from.
                    continue;
                }

                var category = GetCategory(catePNode);
                cateResposity.AddCategory(category);
                Console.WriteLine(category.Title + ":" + category.ShortTitle);
                categories.Add(category);

                // SelectNodes yields null (not an empty collection) when the row has no sub-category links.
                var cate2Nodes = p1.SelectNodes("td/div/a[@class='f2']");
                if (cate2Nodes != null)
                {
                    foreach (var p2Node in cate2Nodes)
                    {
                        var child = GetCategory(p2Node);
                        child.ParentId = category.Id;
                        cateResposity.AddCategory(child);
                        categories.Add(child);
                        Console.WriteLine(">>>>>>" + child.Title + ":" + child.ShortTitle);
                    }
                }

                Console.WriteLine("--------------------------------------------");
            }

            return(categories);
        }
Example #2
0
        /// <summary>
        /// Crawls every page of the article list that starts at <paramref name="listUrl"/>,
        /// stores the articles whose titles are not yet in the article repository, and
        /// returns everything that was crawled.
        /// </summary>
        /// <param name="cateId">Category id assigned to every crawled article.</param>
        /// <param name="listUrl">Absolute URL of the first list page.</param>
        /// <returns>
        /// All crawled articles (already-stored ones included), or <c>null</c> when the first
        /// page cannot be loaded or exposes no link after the current-page marker.
        /// </returns>
        public static List <Article> CrawlArticles(string cateId, string listUrl)
        {
            var articleQueues = new ConcurrentQueue <Article>();

            // The first page is only used to discover the paging information.
            var rootNode = HtmlNoder.GetHtmlRoot(listUrl);
            if (rootNode == null)
            {
                return(null);
            }

            // Total number of pages as shown by the pager.
            var spanPager = rootNode.SelectSingleNode("//td[@class='pageTotal']/span[2]").InnerText();
            var totalPage = spanPager.ToInt();

            // The anchor following the current-page marker supplies the URL pattern of the other pages.
            var pageNode = rootNode.SelectSingleNode("//div[@class='paginator']/span[@class='cpb']");
            var pageUrl  = pageNode == null ? null : HtmlTag.GetAnchor(pageNode.NextSibling).Href;

            if (string.IsNullOrWhiteSpace(pageUrl))
            {
                return(null);
            }

            // Site-relative list path with the ".html" extension removed.
            var lurl = listUrl.Replace(WebDomain, "/").Replace(".html", "");

            // Parallel.For's upper bound is exclusive, so totalPage + 1 is needed to reach the last page.
            Parallel.For(1, totalPage + 1, curr => {
                try
                {
                    // Page URLs look like "c-0001500001-1-5404-0-0-0-0-9-0-0.html".
                    // NOTE(review): "+ 2" presumably skips "-" plus a one-character page number
                    // in the pager link — confirm against the site's URL scheme.
                    var currUrl = lurl + "-" + curr + pageUrl.Substring(lurl.Length + 2);
                    CrawlArticleList(articleQueues, currUrl, cateId);
                }
                catch (Exception ex)
                {
                    // One failing page must not abort the remaining pages.
                    Console.WriteLine(ex.Message);
                }
            });

            var crawled          = articleQueues.ToList();
            var articleResposity = m_ServiceProvide.GetService <ICrawlDbResposity <Article> >();
            var allTitles        = articleResposity.Get(u => u.Title, u => true).ToList();

            // Hash lookup instead of repeated list scans; selects the same articles as a
            // "title not in allTitles" filter.
            var knownTitles    = new HashSet <string>(allTitles);
            var newArticleList = crawled.Where(w => !knownTitles.Contains(w.Title)).ToList();

            if (newArticleList.Any())
            {
                articleResposity.AddBetch(newArticleList, 100);
            }

            return(crawled);
        }
Example #3
0
        /// <summary>
        /// Loads the list page at <c>WebDomain + currUrl</c>, builds an <c>Article</c> for every
        /// list entry (fetching each article's own page for its description) and enqueues it.
        /// </summary>
        /// <param name="articleQueues">Queue that receives every successfully parsed article.</param>
        /// <param name="currUrl">Path of the list page; it is appended to <c>WebDomain</c>.</param>
        /// <param name="cateId">Category id assigned to each article.</param>
        public static void CrawlArticleList(ConcurrentQueue <Article> articleQueues, string currUrl, string cateId)
        {
            var rootNode = HtmlNoder.GetHtmlRoot(WebDomain + currUrl);

            if (rootNode == null)
            {
                return;
            }

            // Entries of the list page.
            var articleNodes = rootNode.SelectNodes("//div[@class='doc-list']/ul/li");

            if (articleNodes == null || articleNodes.Count == 0)
            {
                Console.WriteLine($" >>>{currUrl}  :{cateId}");
                // Nothing to process; Parallel.ForEach would throw on a null source.
                return;
            }

            Parallel.ForEach(articleNodes, articleNode =>
            {
                try
                {
                    var aNode     = articleNode.SelectSingleNode("div[@class='doc-list-title']/h3/a");
                    var art       = HtmlTag.GetAnchor(aNode);
                    var page      = articleNode.SelectSingleNode("div[@class='doc-list-info']/div[@class='page']/strong").InnerText();
                    var readCount = articleNode.SelectSingleNode("div[@class='doc-list-comment']/div[@class='read']").InnerText().Replace("人已阅读", "");
                    var amount    = articleNode.SelectSingleNode("div[@class='doc-list-info']/div[@class='price ticket']/span").InnerText();
                    var type      = articleNode.SelectSingleNode("div[@class='doc-list-title']/h3/img").GetAttributeValue("class");
                    var coverUrl  = articleNode.SelectSingleNode("div[@class='doc-list-img']/a/img").GetAttributeValue("src");

                    var article = new Article()
                    {
                        Id            = Guid.NewGuid().ToString(),
                        Title         = art.Text,
                        ResourceUrl   = WebDomain + art.Href,
                        CategoryId    = cateId,
                        CreatorTime   = DateTime.Now,
                        BookStatus    = 1,
                        SortCode      = 1,
                        Keyword       = art.Text,
                        PageCount     = Convert.ToInt32(page),
                        ReadCount     = Convert.ToInt32(readCount),
                        RequireAmount = Convert.ToInt32(amount),
                        ResourceType  = type,
                        CoverUrl      = coverUrl,
                        EnabledMark   = true
                    };

                    // The description is only available on the article's own page.
                    var articleRootNode = HtmlNoder.GetHtmlRoot(WebDomain + art.Href);
                    if (articleRootNode == null)
                    {
                        // Detail page unavailable: the entry is dropped rather than enqueued without a description.
                        return;
                    }
                    article.Description = articleRootNode.SelectSingleNode("//dl/dd[@class='fLeft wordwrap']").InnerText();

                    articleQueues.Enqueue(article);

                    Console.WriteLine($" >>>{art.Text}  :{WebDomain + art.Href}");
                }
                catch (Exception ex)
                {
                    // A malformed entry (missing node, non-numeric count) is logged and skipped
                    // so the remaining entries are still processed.
                    Console.WriteLine(ex.Message);
                }
            });
        }