/// <summary>
/// Determines the highest page number shown in the pager of a listing page.
/// </summary>
/// <param name="pageUrl">URL of the listing page whose pager items are inspected.</param>
/// <returns>
/// The largest integer found among the pager list items, 0 when no item
/// parses as an integer, or 100 when the pager items cannot be located at all.
/// </returns>
public static int GetPageLastIndex(string pageUrl)
        {
            var pagerItems = WebpageHelper.GetHttpRequestDocument(pageUrl)
                             .DocumentNode
                             .SelectNodes("//div[@class='page-mod']/ul/li");

            if (pagerItems == null)
            {
                // No pager items matched: fall back to the fixed value of 100.
                return 100;
            }

            int highest = 0;
            foreach (var item in pagerItems)
            {
                int parsed;
                // Non-numeric items (e.g. navigation labels) fail to parse and are skipped.
                if (int.TryParse(item.InnerText, out parsed) && parsed > highest)
                {
                    highest = parsed;
                }
            }
            return highest;
        }
Exemplo n.º 2
0
        /// <summary>
        /// Walks the forum listing pages, hands every new list entry to
        /// <c>MongodbSaver.SaveForumsList</c> in batches of at most 10 and posts each entry's
        /// link to the block consumed by <c>GetNewsDetail.GenerateForumDetail</c>.
        /// Blocks until the detail task has finished.
        /// </summary>
        /// <param name="imageTargetBlock">
        /// Block forwarded to the list analyser and to the detail generator
        /// (presumably the queue of image URLs to download — confirm against the callees).
        /// </param>
        public static void GetForumsList(BufferBlock <string> imageTargetBlock)
        {
            var block     = new BufferBlock <string>();
            var task      = GetNewsDetail.GenerateForumDetail(block, imageTargetBlock);
            int errorTime = 0;
            var firstPage = "http://www.ihchina.cn/luntan/p/1.html";
            // Inclusive index of the last page to crawl. Debug mode crawls page 1 only.
            int lastPageNumber = DebugHelperTools.IsDebugMode() ? 1 : WebpageHelper.GetPageLastIndex(firstPage);

            // The bound is inclusive: GetPageLastIndex returns the number of the last page,
            // so an exclusive bound would silently skip that page.
            // errorTime counts pages without list items plus entries the analyser rejected;
            // the crawl stops once it reaches 10.
            for (int i = 1; i <= lastPageNumber && errorTime < 10; i++)
            {
                var listUrl = string.Format("http://www.ihchina.cn/luntan/p/{0}.html", i);
                Console.WriteLine("starting process page:{0}", listUrl);
                var doc       = WebpageHelper.GetHttpRequestDocument(listUrl);
                var listNodes = doc.DocumentNode.SelectNodes("//div[@id='datalist']/div[@class='list-item']");
                if (listNodes == null)
                {
                    errorTime++;
                    continue;
                }
                List <BsonDocument> result = new List <BsonDocument>();
                foreach (var node in listNodes)
                {
                    if (errorTime == 10)
                    {
                        break;
                    }
                    var bson = WebpageHelper.AnalizeGeneralListInformation(node, MongodbChecker.CheckForumsListExist, imageTargetBlock);
                    if (bson == null)
                    {
                        errorTime++;
                        Console.WriteLine("duplicated url: page {0}", i);
                        continue;
                    }

                    var link = bson.GetElement("link").Value.ToString();
                    block.Post(link);
                    result.Add(bson);

                    // Insert into the database every 10 items to reduce memory pressure.
                    if (result.Count == 10)
                    {
                        MongodbSaver.SaveForumsList(result);
                        result.Clear();
                    }
                }
                // Flush whatever is left of the current page.
                if (result.Count > 0)
                {
                    MongodbSaver.SaveForumsList(result);
                }
            }
            // Signal that no more links will arrive, then wait for the detail task to drain.
            block.Complete();
            task.Wait();
        }
 /// <summary>
 /// Receives image URLs from <paramref name="source"/> until it completes and downloads
 /// each one into the "img" directory under the current working directory, skipping
 /// files that already exist. Download errors are logged and do not stop the loop.
 /// </summary>
 /// <param name="source">Block that supplies image URLs; URLs not starting with
 /// <c>GetIhChina.MainPage</c> are prefixed with it.</param>
 /// <returns>Always 1, once the source has no more output.</returns>
 public async Task <int> SaveFileAsync(ISourceBlock <string> source)
 {
     while (await source.OutputAvailableAsync())
     {
         // CreateDirectory is a no-op when the directory already exists.
         Directory.CreateDirectory("img");

         var imageUrl = source.Receive();
         if (!imageUrl.StartsWith(GetIhChina.MainPage))
         {
             imageUrl = GetIhChina.MainPage + imageUrl;
         }
         var savePath = Path.Combine(Directory.GetCurrentDirectory(), "img", WebpageHelper.GetSubUrl(imageUrl));
         if (File.Exists(savePath))
         {
             continue;
         }
         try
         {
             Console.WriteLine("Starting to save image {0}", imageUrl);
             // WebClient is IDisposable: create it only when a download actually happens
             // and release it right after, instead of leaking one per received URL.
             using (WebClient wc = new WebClient())
             {
                 wc.DownloadFile(imageUrl, savePath);
             }
             //CompressImageWithGuetzli(savePath); compression seems to have a problem, to be fixed
             CorpImage(savePath);
             Console.WriteLine("Save image {0} completely", imageUrl);
         }
         catch (WebException e)
         {
             Console.WriteLine("Save image {0} error", imageUrl);
             Console.WriteLine(e);
         }
         catch (Exception e)
         {
             Console.WriteLine("unhandled error");
             Console.WriteLine(e);
         }
     }
     return(1);
 }
Exemplo n.º 4
0
        /// <summary>
        /// Reads the cached copy of <paramref name="url"/> from the "cache" directory
        /// under the current working directory.
        /// </summary>
        /// <param name="url">URL whose cache file name is derived via <c>WebpageHelper.GetSubUrl</c>.</param>
        /// <returns>The full text content of the cache file.</returns>
        public static string GetSimpleRequestResult(string url)
        {
            string cacheDirectory = Path.Combine(Directory.GetCurrentDirectory(), "cache");
            string cachedFilePath = Path.Combine(cacheDirectory, WebpageHelper.GetSubUrl(url));

            using (var reader = new StreamReader(cachedFilePath))
            {
                return reader.ReadToEnd();
            }
        }
Exemplo n.º 5
0
 /// <summary>
 /// Builds the local path of the image for <paramref name="imageURL"/> inside the
 /// "img" directory under the current working directory.
 /// </summary>
 /// <param name="imageURL">Image URL; its file name part comes from <c>WebpageHelper.GetSubUrl</c>.</param>
 /// <returns>The combined local file path (the file is not created or checked).</returns>
 public static string GetCacheImageName(string imageURL)
 {
     string imageDirectory = Path.Combine(Directory.GetCurrentDirectory(), "img");
     return Path.Combine(imageDirectory, WebpageHelper.GetSubUrl(imageURL));
 }
Exemplo n.º 6
0
        /// <summary>
        /// Returns the cache file path for <paramref name="url"/>, creating the "cache"
        /// directory under the current working directory when it does not exist yet.
        /// </summary>
        /// <param name="url">URL whose cache file name is derived via <c>WebpageHelper.GetSubUrl</c>.</param>
        /// <returns>The combined path of the cache file (the file itself is not created).</returns>
        public static string GetCacheFileName(string url)
        {
            string cacheDirectory = Path.Combine(Directory.GetCurrentDirectory(), "cache");

            bool cacheDirectoryMissing = !Directory.Exists(cacheDirectory);
            if (cacheDirectoryMissing)
            {
                Directory.CreateDirectory(cacheDirectory);
            }

            string cacheFileName = WebpageHelper.GetSubUrl(url);
            return Path.Combine(cacheDirectory, cacheFileName);
        }
Exemplo n.º 7
0
 /// <summary>
 /// Tells whether a cache file for <paramref name="url"/> exists in the "cache"
 /// directory under the current working directory.
 /// </summary>
 /// <param name="url">URL whose cache file name is derived via <c>WebpageHelper.GetSubUrl</c>.</param>
 /// <returns><c>true</c> when the cache file exists; otherwise <c>false</c>.</returns>
 public static bool CheckCacheFileExist(string url)
 {
     string cacheDirectory = Path.Combine(Directory.GetCurrentDirectory(), "cache");
     string cachedFilePath = Path.Combine(cacheDirectory, WebpageHelper.GetSubUrl(url));
     return File.Exists(cachedFilePath);
 }
        /// <summary>
        /// Downloads an article detail page and extracts link, title, subtitle items,
        /// content lines (text or image), author and related news into a BSON document.
        /// Every image found in the content is posted to <paramref name="imageBlock"/>.
        /// </summary>
        /// <param name="url">URL of the detail page; stored as the "link" element.</param>
        /// <param name="imageBlock">Block that receives the absolute URL of each content image.</param>
        /// <returns>
        /// The populated document, or <c>null</c> when the title block, the subtitle items
        /// or the content paragraphs cannot be found on the page.
        /// </returns>
        private static BsonDocument processDetailPage(string url, BufferBlock<string> imageBlock)
        {
            Console.WriteLine("starting process: " + url + "....");
            var doc = WebpageHelper.GetHttpRequestDocument(url);
            var titleNode = doc.DocumentNode.SelectSingleNode("//div[@class='article-title']");
            if (titleNode == null)
            {
                return null;
            }
            // NOTE(review): `bson` is declared outside this method and is cleared and refilled
            // on every call, so the returned document appears to be shared between calls —
            // confirm callers consume or copy it before the next invocation.
            bson.Clear();
            bson.Add("link", url);
            var titleNameNode = titleNode.SelectSingleNode(".//div[@class='h24']");
            if (titleNameNode != null)
            {
                bson.Add("title", titleNameNode.InnerText.Replace("\t", "")); // normalize the text; TODO: unfinished
            }
            var titleSubItemsNodes = titleNode.SelectNodes(".//div[@class='sub']/span[@class='sub-item']");
            if (titleSubItemsNodes == null)
            {
                return null;
            }
            var subTitleList = new BsonArray();
            foreach (var subItemNode in titleSubItemsNodes)
            {
                // Dates need separate handling: they live in a nested div with class 'en'.
                // The node is selected once and reused; the previous second lookup used a
                // malformed XPath (missing closing ']') that threw at runtime.
                var dateNode = subItemNode.SelectSingleNode(".//div[@class='en']");
                if (dateNode != null)
                {
                    subTitleList.Add(dateNode.InnerText);
                }
                else
                {
                    subTitleList.Add(subItemNode.InnerText);
                }
            }
            bson.Add("subtitle", subTitleList);

            // Start reading the article content.
            var contentList = new BsonArray();
            var contentNodes = doc.DocumentNode.SelectNodes("//div[@class='article-cont']/p");
            if (contentNodes == null)
            {
                return null;
            }
            foreach (var node in contentNodes)
            {
                var lineDic = new BsonDocument();
                var picNode = node.SelectSingleNode(".//img");
                if (picNode != null)
                {
                    // A paragraph containing an image is stored as an image line and the
                    // image is queued for download.
                    var imgUrl = WebpageHelper.GetSubUrl(picNode.Attributes["src"].Value);
                    lineDic["type"] = "img";
                    lineDic["content"] = imgUrl;
                    lineDic["compressImg"] = WebImageSaver.Instance.GetComressImageName(imgUrl);
                    imageBlock.Post(GetIhChina.MainPage + picNode.Attributes["src"].Value);
                }
                else
                {
                    lineDic["type"] = "text";
                    lineDic["content"] = node.InnerText;
                }
                contentList.Add(lineDic);
            }
            bson.Add("content", contentList);
            var authorNode = doc.DocumentNode.SelectSingleNode("//div[@class='author']");
            if (authorNode != null)
            {
                bson.Add("author", authorNode.InnerText);
            }
            // Crawl the related-reading links at the bottom of the page.
            var relativeNews = doc.DocumentNode.SelectNodes("//div[@class='list-mod4']/a");
            if (relativeNews != null)
            {
                var relativeNewsList = new BsonArray();
                foreach (var node in relativeNews)
                {
                    var newsDic = new BsonDocument();
                    newsDic["link"] = node.Attributes["href"].Value;
                    var date = node.SelectSingleNode(".//div[@class='date']");
                    if (date != null)
                    {
                        newsDic["date"] = date.InnerText;
                    }
                    var title = node.SelectSingleNode(".//div[@class='p']");
                    if (title != null)
                    {
                        newsDic["title"] = title.InnerText;
                    }
                    relativeNewsList.Add(newsDic);
                }
                if (relativeNewsList.Count > 0)
                {
                    bson.Add("relativeNews", relativeNewsList);
                }
            }
            Console.WriteLine(bson);
            return bson;
        }
        /// <summary>
        /// Pages through the news list endpoint, hands every new list entry to
        /// <c>MongodbSaver.SaveNewsList</c> in batches of at most 10, posts each new
        /// article link to the detail pipeline and each thumbnail URL to
        /// <paramref name="imageTargetBlock"/>, then awaits the detail task.
        /// </summary>
        /// <param name="imageSaverTask">Not used by this method; kept for caller compatibility.</param>
        /// <param name="imageTargetBlock">Block that receives the absolute URL of each list image
        /// and is also forwarded to <c>GetNewsDetail.GenerateNewsDetail</c>.</param>
        /// <returns>The result of the task returned by <c>GetNewsDetail.GenerateNewsDetail</c>.</returns>
        public static async Task <int> GetNewsList(Task <int> imageSaverTask, BufferBlock <string> imageTargetBlock)
        {
            BufferBlock <string> newsDetailTargetBlock = new BufferBlock <string>();
            int errorTime = 0;
            var newsDetailPageGenerate = GetNewsDetail.GenerateNewsDetail(newsDetailTargetBlock, imageTargetBlock);
            int pageNumber             = DebugHelperTools.IsDebugMode() ? 2 : 255;

            for (int page = 1; page < pageNumber; page++)
            {
                // errorTime counts duplicated links and unusable responses.
                if (errorTime > 10)
                {
                    Console.WriteLine("reach the limitation of error time");
                    break;
                }
                Console.WriteLine("starting process {0} page", page);
                string pageURL = String.Format("{0}?category_id=9&page={1}&limit=0", GetIhChina.NewsListUrl, page);
                var    result  = WebpageHelper.GetRequest(pageURL);

                // An empty response, or one that deserializes to null, used to crash with an
                // ArgumentNullException / NullReferenceException. Count it as an error and
                // move on to the next page instead.
                var jsonObject = String.IsNullOrEmpty(result)
                                 ? null
                                 : JsonConvert.DeserializeObject <NewsListResponse>(result);
                if (jsonObject == null)
                {
                    Console.WriteLine("GetNewsList: empty response for page {0}", page);
                    errorTime++;
                    continue;
                }
                if (jsonObject.more != 1 && String.IsNullOrEmpty(jsonObject.data))
                {
                    Console.WriteLine("GetNewsList: There is no more here, break from the loop");
                    break;
                }
                var doc = new HtmlDocument();
                // data may still be null when more == 1; LoadHtml rejects null, so load an
                // empty document in that case (it simply yields no list items).
                doc.LoadHtml(jsonObject.data ?? String.Empty);
                var newsListNodes = from links in doc.DocumentNode.Descendants()
                                    where links.Name == "div" && links.Attributes["class"] != null && links.Attributes["class"].Value == "list-item"
                                    select links;
                var newslistBsons = new List <BsonDocument>();
                foreach (var node in newsListNodes)
                {
                    var newsBson  = new BsonDocument();
                    var titleNode = node.SelectSingleNode(".//div[@class='h16']/a");
                    if (titleNode != null)
                    {
                        var link = titleNode.Attributes["href"].Value;
                        if (MongodbChecker.CheckNewsExist(link))
                        {
                            Console.WriteLine("duplicated url: page {0}", link);
                            errorTime++;
                            continue;
                        }
                        newsDetailTargetBlock.Post(link);
                        newsBson.Add("link", link);
                        newsBson.Add("title", titleNode.Attributes["title"].Value);
                    }
                    var imgNode = node.SelectSingleNode(".//img");
                    if (imgNode != null)
                    {
                        var imgUrl = WebpageHelper.GetSubUrl(imgNode.Attributes["src"].Value);
                        newsBson.Add("img", imgUrl);
                        newsBson.Add("compressImg", WebImageSaver.Instance.GetComressImageName(imgUrl));
                        imageTargetBlock.Post(GetIhChina.MainPage + imgNode.Attributes["src"].Value);
                    }
                    var dataNode = node.SelectSingleNode(".//div[@class='date']/div");
                    if (dataNode != null)
                    {
                        newsBson.Add("date", dataNode.InnerText);
                    }
                    var contentNode = node.SelectSingleNode(".//div[@class='p']");
                    if (contentNode != null)
                    {
                        newsBson.Add("content", contentNode.InnerText);
                    }
                    if (newsBson.Count() != 0)
                    {
                        newslistBsons.Add(newsBson);
                    }
                    // Insert into the database every 10 items to reduce memory pressure.
                    // Clear() already drops the references, so no per-slot nulling is needed.
                    if (newslistBsons.Count == 10)
                    {
                        MongodbSaver.SaveNewsList(newslistBsons);
                        newslistBsons.Clear();
                    }
                }
                // Flush whatever is left of the current page.
                if (newslistBsons.Count != 0)
                {
                    MongodbSaver.SaveNewsList(newslistBsons);
                }
            }
            // Signal that no more links will arrive, then wait for the detail task.
            newsDetailTargetBlock.Complete();
            return(await newsDetailPageGenerate);
        }