コード例 #1
0
        /// <summary>
        /// Determines the highest page number listed in the pagination section of a page.
        /// </summary>
        /// <param name="pageUrl">URL of the page whose pagination list is inspected.</param>
        /// <returns>
        /// The largest integer found among the inner texts of the <c>li</c> nodes matched by
        /// <c>//div[@class='page-mod']/ul/li</c>; 0 when none of them parses as an integer;
        /// 100 when the XPath query matches nothing.
        /// </returns>
        public static int GetPageLastIndex(string pageUrl)
        {
            var pagerItems = WebpageHelper.GetHttpRequestDocument(pageUrl)
                                          .DocumentNode
                                          .SelectNodes("//div[@class='page-mod']/ul/li");

            if (pagerItems == null)
            {
                // No pagination items were matched: fall back to a fixed page count.
                return 100;
            }

            int highest = 0;
            foreach (var item in pagerItems)
            {
                int parsed;
                if (!int.TryParse(item.InnerText, out parsed))
                {
                    // Entries whose text is not a plain integer are ignored.
                    continue;
                }

                if (parsed > highest)
                {
                    highest = parsed;
                }
            }

            return highest;
        }
コード例 #2
0
        /// <summary>
        /// Walks the forum list pages, analyses every list item, posts each item's link to an
        /// internal block handed to <c>GetNewsDetail.GenerateForumDetail</c>, and passes the
        /// analysed items to <c>MongodbSaver.SaveForumsList</c> in batches.
        /// </summary>
        /// <param name="imageTargetBlock">
        /// Block forwarded to the list-item analysis and to the detail generation.
        /// </param>
        /// <remarks>
        /// Crawling stops once 10 errors have been counted. An error is either a list page with
        /// no matching items or a list item for which the analysis returned <c>null</c>
        /// (logged as a duplicated URL). The method blocks until the detail task has finished.
        /// </remarks>
        public static void GetForumsList(BufferBlock<string> imageTargetBlock)
        {
            const int maxErrors = 10;   // crawl is abandoned after this many errors
            const int batchSize = 10;   // number of documents saved per database call

            var block          = new BufferBlock<string>();
            var task           = GetNewsDetail.GenerateForumDetail(block, imageTargetBlock);
            int errorTime      = 0;
            var firstPage      = "http://www.ihchina.cn/luntan/p/1.html";
            var lastPageNumber = DebugHelperTools.IsDebugMode() ? 2 : WebpageHelper.GetPageLastIndex(firstPage);

            try
            {
                // NOTE(review): the upper bound is exclusive, so the page numbered
                // lastPageNumber itself is never requested - confirm this is intended.
                for (int i = 1; i < lastPageNumber && errorTime < maxErrors; i++)
                {
                    var listUrl = string.Format("http://www.ihchina.cn/luntan/p/{0}.html", i);
                    Console.WriteLine("starting process page:{0}", listUrl);
                    var doc       = WebpageHelper.GetHttpRequestDocument(listUrl);
                    var listNodes = doc.DocumentNode.SelectNodes("//div[@id='datalist']/div[@class='list-item']");
                    if (listNodes == null)
                    {
                        errorTime++;
                        continue;
                    }

                    var result = new List<BsonDocument>();
                    foreach (var node in listNodes)
                    {
                        if (errorTime >= maxErrors)
                        {
                            break;
                        }

                        var bson = WebpageHelper.AnalizeGeneralListInformation(node, MongodbChecker.CheckForumsListExist, imageTargetBlock);
                        if (bson == null)
                        {
                            errorTime++;
                            Console.WriteLine("duplicated url: page {0}", i);
                            continue;
                        }

                        var link = bson.GetElement("link").Value.ToString();
                        block.Post(link);
                        result.Add(bson);

                        // Insert into the database every batchSize records to reduce memory pressure.
                        if (result.Count == batchSize)
                        {
                            MongodbSaver.SaveForumsList(result);
                            result.Clear();
                        }
                    }

                    // Save whatever is left over from the last, incomplete batch of this page.
                    if (result.Count > 0)
                    {
                        MongodbSaver.SaveForumsList(result);
                    }
                }
            }
            finally
            {
                // Always signal completion, even when crawling throws, so that the consumer
                // of the block is not left waiting for links that will never arrive.
                block.Complete();
            }

            task.Wait();
        }
コード例 #3
0
        /// <summary>
        /// Downloads an article detail page and extracts its title, subtitle items, content
        /// paragraphs (text and images), author and related-news links into a BSON document.
        /// </summary>
        /// <param name="url">URL of the detail page; stored in the document under "link".</param>
        /// <param name="imageBlock">Block that receives the full URL of every image found in the content.</param>
        /// <returns>
        /// The populated document, or <c>null</c> when the title node, the subtitle items or the
        /// content paragraphs cannot be found on the page.
        /// </returns>
        private static BsonDocument processDetailPage(string url, BufferBlock<string> imageBlock)
        {
            Console.WriteLine("starting process: " + url + "....");
            var doc = WebpageHelper.GetHttpRequestDocument(url);
            var titleNode = doc.DocumentNode.SelectSingleNode("//div[@class='article-title']");
            if (titleNode == null)
            {
                return null;
            }

            // NOTE(review): `bson` is not declared in this method - presumably a field of the
            // enclosing class that is reused across calls. Confirm that callers do not keep a
            // reference to a previously returned document, since it is cleared here.
            bson.Clear();
            bson.Add("link", url);
            var titleNameNode = titleNode.SelectSingleNode(".//div[@class='h24']");
            if (titleNameNode != null)
            {
                // Normalize the title text. TODO: normalization is not finished yet.
                bson.Add("title", titleNameNode.InnerText.Replace("\t", ""));
            }

            var titleSubItemsNodes = titleNode.SelectNodes(".//div[@class='sub']/span[@class='sub-item']");
            if (titleSubItemsNodes == null)
            {
                return null;
            }

            var subTitleList = new BsonArray();
            foreach (var subItemNode in titleSubItemsNodes)
            {
                // Date items wrap their text in a nested div and are handled separately.
                // The node is selected once and reused; the previous second lookup used a
                // malformed XPath expression (missing closing bracket) that failed at runtime.
                var dateNode = subItemNode.SelectSingleNode(".//div[@class='en']");
                if (dateNode != null)
                {
                    subTitleList.Add(dateNode.InnerText);
                }
                else
                {
                    subTitleList.Add(subItemNode.InnerText);
                }
            }
            bson.Add("subtitle", subTitleList);

            // Read the article content, one entry per paragraph.
            var contentList = new BsonArray();
            var contentNodes = doc.DocumentNode.SelectNodes("//div[@class='article-cont']/p");
            if (contentNodes == null)
            {
                return null;
            }

            foreach (var node in contentNodes)
            {
                var lineDic = new BsonDocument();
                var picNode = node.SelectSingleNode(".//img");
                if (picNode != null)
                {
                    // A paragraph containing an image is stored as an image entry and the
                    // image URL is posted to imageBlock.
                    var src = picNode.Attributes["src"].Value;
                    var imgUrl = WebpageHelper.GetSubUrl(src);
                    lineDic["type"] = "img";
                    lineDic["content"] = imgUrl;
                    lineDic["compressImg"] = WebImageSaver.Instance.GetComressImageName(imgUrl);
                    imageBlock.Post(GetIhChina.MainPage + src);
                }
                else
                {
                    lineDic["type"] = "text";
                    lineDic["content"] = node.InnerText;
                }
                contentList.Add(lineDic);
            }
            bson.Add("content", contentList);

            var authorNode = doc.DocumentNode.SelectSingleNode("//div[@class='author']");
            if (authorNode != null)
            {
                bson.Add("author", authorNode.InnerText);
            }

            // Collect the related-reading links at the bottom of the page.
            var relativeNews = doc.DocumentNode.SelectNodes("//div[@class='list-mod4']/a");
            if (relativeNews != null)
            {
                var relativeNewsList = new BsonArray();
                foreach (var node in relativeNews)
                {
                    var newsDic = new BsonDocument();
                    newsDic["link"] = node.Attributes["href"].Value;
                    var date = node.SelectSingleNode(".//div[@class='date']");
                    if (date != null)
                    {
                        newsDic["date"] = date.InnerText;
                    }
                    var title = node.SelectSingleNode(".//div[@class='p']");
                    if (title != null)
                    {
                        newsDic["title"] = title.InnerText;
                    }
                    relativeNewsList.Add(newsDic);
                }

                // The key is only added when at least one related link was collected.
                if (relativeNewsList.Count > 0)
                {
                    bson.Add("relativeNews", relativeNewsList);
                }
            }

            Console.WriteLine(bson);
            return bson;
        }