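// Inheritor pages and intangible-heritage project pages share the same page structure, so the detail-page parser can be reused.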
        /// <summary>
        /// Builds the detail document for <paramref name="url"/> and stores it via
        /// <c>MongodbSaver.SaveHeritageProjectInheritatePeople</c>.
        /// </summary>
        /// <remarks>
        /// Nothing is done when <c>MongodbChecker.CheckHeritageProjectInheritatePeopleExist</c>
        /// returns true for the URL, or when the page generator returns null.
        /// </remarks>
        /// <param name="url">URL handed to the checker and to the detail-page generator.</param>
        public static void GenerateCCRDetail(string url)
        {
            var alreadyStored = MongodbChecker.CheckHeritageProjectInheritatePeopleExist(url);
            if (!alreadyStored)
            {
                // The second argument is false here; the project-detail caller passes true.
                var detail = GenerateHeritageProjectDetailPage(url, false);
                if (detail == null)
                {
                    return;
                }

                MongodbSaver.SaveHeritageProjectInheritatePeople(detail);
            }
        }
Exemple #2
0
        /// <summary>
        /// Scrapes the banner anchors (class "slick-link p-show") from <c>MainPage</c>, posts the
        /// image path of every banner that is not yet known to <paramref name="imageTargetBlock"/>,
        /// and saves those banners through <c>MongodbSaver.SaveMainpageNewsList</c>.
        /// </summary>
        /// <param name="imageTargetBlock">Block that receives the "originImage" value of each new banner.</param>
        private static void GetBanner(BufferBlock <string> imageTargetBlock)
        {
            var doc = WebpageHelper.GetHttpRequestDocument(MainPage);

            // Materialize the query exactly once. The filter calls MongodbChecker, so every
            // re-enumeration of a deferred query would hit the database again and could yield a
            // different set once the banners have been saved.
            var banners = (from links in doc.DocumentNode.Descendants()
                           where
                           links.Name == "a" &&
                           links.Attributes["href"] != null &&
                           links.Attributes["class"] != null &&
                           links.Attributes["class"].Value.Equals("slick-link p-show") &&
                           links.Attributes["style"] != null
                           let imagePath = ExtractBannerImagePath(links.Attributes["style"].Value)
                           where
                           imagePath != null &&
                           !MongodbChecker.CheckMainNewsList(links.Attributes["href"].Value)
                           let imageUrl = WebpageHelper.GetSubUrl(imagePath)
                           select new BsonDocument()
                           .Add("link", links.Attributes["href"].Value)
                           .Add("img", imageUrl)
                           .Add("compressImg", WebImageSaver.Instance.GetComressImageName(imageUrl))
                           .Add("originImage", imagePath)).ToArray();

            foreach (var banner in banners)
            {
                imageTargetBlock.Post(banner["originImage"].AsBsonValue.ToString());
            }

            if (banners.Length == 0)
            {
                return;
            }

            MongodbSaver.SaveMainpageNewsList(banners);
            foreach (var banner in banners)
            {
                Console.WriteLine(banner["link"].AsBsonValue + " " + banner["img"].AsBsonValue);
            }
        }

        /// <summary>
        /// Returns the part of an inline style value that starts at the first "/" and ends just
        /// before the last ")", or null when the value does not contain that span.
        /// </summary>
        /// <param name="style">Raw value of an anchor's <c>style</c> attribute.</param>
        private static string ExtractBannerImagePath(string style)
        {
            if (string.IsNullOrEmpty(style))
            {
                return null;
            }

            var start = style.IndexOf("/", StringComparison.Ordinal);
            var end   = style.LastIndexOf(")", StringComparison.Ordinal);

            // A missing "/" or a ")" located before it would make Substring throw.
            if (start < 0 || end <= start)
            {
                return null;
            }

            return style.Substring(start, end - start);
        }
        /// <summary>
        /// Consumes URLs from <paramref name="urlSource"/> until it completes, generating and saving
        /// a heritage-project detail document for each URL that the checker does not report as
        /// existing.
        /// </summary>
        /// <param name="urlSource">Source block that supplies the detail-page URLs.</param>
        /// <returns>
        /// <c>DUPLICATED_NEWS</c> when ten already-existing URLs have been counted and another URL
        /// becomes available; otherwise <c>PROCESS_SUCCESS</c> once the source has no more output.
        /// </returns>
        public static async Task <int> GenerateProjectDetailPage(ISourceBlock <string> urlSource)
        {
            var duplicateCount = 0;

            while (await urlSource.OutputAvailableAsync())
            {
                // The limit is evaluated before the next URL is taken from the block.
                if (duplicateCount == 10)
                {
                    return(DUPLICATED_NEWS);
                }

                var detailUrl = urlSource.Receive();
                if (!MongodbChecker.CheckHeritageProjectDetailExist(detailUrl))
                {
                    var detail = GenerateHeritageProjectDetailPage(detailUrl, true);
                    if (detail != null)
                    {
                        MongodbSaver.SaveHeritageProjectDetail(detail);
                    }
                }
                else
                {
                    duplicateCount++;
                }
            }

            return(PROCESS_SUCCESS);
        }
Exemple #4
0
        /// <summary>
        /// Requests the paged heritage-project list (<c>REQUEST_URL</c> formatted with the page
        /// number), saves every project whose link is not yet known, and posts each new link to a
        /// block consumed by <c>GetHeritageProjectDetailWorker.GenerateProjectDetailPage</c>.
        /// Blocks until that consumer task has finished.
        /// </summary>
        /// <param name="imageTargetBlock">Not used by this method.</param>
        private static void GetAllProjectList(BufferBlock <string> imageTargetBlock)
        {
            short errorTime  = 0;
            var   totalPages = DebugHelper.DebugHelperTools.IsDebugMode() ? 2 : 10;
            var   block      = new BufferBlock <string>();
            var   task       = GetHeritageProjectDetailWorker.GenerateProjectDetailPage(block);
            // The property set is the same for every item, so reflect over the type only once.
            var   properties = typeof(HeritageProject).GetProperties();

            try
            {
                // NOTE(review): the bound is exclusive, so page number `totalPages` itself is
                // never requested - confirm that this is intended.
                for (int i = 1; i < totalPages; i++)
                {
                    if (errorTime > 10)
                    {
                        Console.WriteLine("GetAllProjectList: reach the limitation of error time");
                        break;
                    }
                    var currentPage = String.Format(REQUEST_URL, i);
                    Console.WriteLine("Starting process: " + currentPage);
                    var requestResult = WebpageHelper.GetRequest(currentPage);
                    if (string.IsNullOrEmpty(requestResult))
                    {
                        errorTime++;
                        continue;
                    }
                    var jsonObject = JsonConvert.DeserializeObject <HeritageProjectRequest>(requestResult);
                    if (jsonObject == null)
                    {
                        // A payload that deserializes to null is counted like a failed request.
                        errorTime++;
                        continue;
                    }
                    if (jsonObject.Links.Total_pages != 0 &&
                        jsonObject.Links.Total_pages != totalPages &&
                        !DebugHelper.DebugHelperTools.IsDebugMode())
                    {
                        totalPages = jsonObject.Links.Total_pages;
                    }
                    var list = jsonObject.List;
                    if (list == null || list.Length == 0)
                    {
                        continue;
                    }
                    var bsonArray = new List <BsonDocument>();
                    for (int j = 0; j < list.Length; j++)
                    {
                        list[j].Link = "/project_details/" + list[j].Id;

                        // Check for duplicates first so no document is built for a skipped item.
                        if (MongodbChecker.CheckHeritageProjectExist(list[j].Link))
                        {
                            Console.WriteLine("Duplicated Heritage Project Link: {0}", list[j].Link);
                            errorTime++;
                            continue;
                        }

                        var bsonDocument = new BsonDocument();
                        foreach (var property in properties)
                        {
                            // Every HeritageProject property becomes one field: the lower-cased
                            // property name is the MongoDB key, and the value is the property's
                            // string form with anything matching "<.*?>" removed.
                            // A null property value is stored as an empty string.
                            var rawValue = property.GetValue(list[j]);
                            var text     = rawValue == null
                                ? string.Empty
                                : Regex.Replace(rawValue.ToString(), "<.*?>", string.Empty);
                            // Invariant casing keeps the keys identical under every OS culture.
                            bsonDocument.Add(property.Name.ToLowerInvariant(), text);
                        }

                        block.Post(list[j].Link);
                        bsonArray.Add(bsonDocument);
                    }

                    if (bsonArray.Count != 0)
                    {
                        MongodbSaver.SaveHeritageProjectNewsList(bsonArray);
                    }

                    if (jsonObject.More != 1)
                    {
                        Console.WriteLine("GetAllProjectList: current page is {0}, more equals 0", i);
                        break;
                    }
                }
            }
            finally
            {
                // Always signal completion so the consumer task can finish even when the loop throws.
                block.Complete();
            }
            task.Wait();
        }
        /// <summary>
        /// Walks the paged news list (<c>GetIhChina.NewsListUrl</c>, category 9), saves every list
        /// entry whose link is not yet known in batches of ten, posts each new link to the block
        /// consumed by <c>GetNewsDetail.GenerateNewsDetail</c>, and posts each thumbnail URL to
        /// <paramref name="imageTargetBlock"/>.
        /// </summary>
        /// <param name="imageSaverTask">Not used by this method.</param>
        /// <param name="imageTargetBlock">Block that receives image URLs; also passed to the detail generator.</param>
        /// <returns>The result of the task returned by <c>GetNewsDetail.GenerateNewsDetail</c>.</returns>
        public static async Task <int> GetNewsList(Task <int> imageSaverTask, BufferBlock <string> imageTargetBlock)
        {
            BufferBlock <string> newsDetailTargetBlock = new BufferBlock <string>();
            int errorTime = 0;
            var newsDetailPageGenerate = GetNewsDetail.GenerateNewsDetail(newsDetailTargetBlock, imageTargetBlock);
            int pageNumber             = DebugHelperTools.IsDebugMode() ? 2 : 255;

            try
            {
                for (int page = 1; page < pageNumber; page++)
                {
                    if (errorTime > 10)
                    {
                        Console.WriteLine("reach the limitation of error time");
                        break;
                    }
                    Console.WriteLine("starting process {0} page", page);
                    string pageURL = String.Format("{0}?category_id=9&page={1}&limit=0", GetIhChina.NewsListUrl, page);
                    var    result  = WebpageHelper.GetRequest(pageURL);
                    if (String.IsNullOrEmpty(result))
                    {
                        // A failed or empty response counts as an error instead of crashing the deserializer.
                        errorTime++;
                        continue;
                    }
                    var jsonObject = JsonConvert.DeserializeObject <NewsListResponse>(result);
                    if (jsonObject == null)
                    {
                        errorTime++;
                        continue;
                    }
                    if (String.IsNullOrEmpty(jsonObject.data))
                    {
                        if (jsonObject.more != 1)
                        {
                            Console.WriteLine("GetNewsList: There is no more here, break from the loop");
                            break;
                        }
                        // More pages are announced but this one has no HTML: there is nothing to parse.
                        continue;
                    }
                    var doc = new HtmlDocument();
                    doc.LoadHtml(jsonObject.data);
                    var newsListNodes = from links in doc.DocumentNode.Descendants()
                                        where links.Name == "div" && links.Attributes["class"] != null && links.Attributes["class"].Value == "list-item"
                                        select links;
                    var newslistBsons = new List <BsonDocument>();
                    foreach (var node in newsListNodes)
                    {
                        var newsBson      = new BsonDocument();
                        var titleNode     = node.SelectSingleNode(".//div[@class='h16']/a");
                        var hrefAttribute = titleNode == null ? null : titleNode.Attributes["href"];
                        if (hrefAttribute != null)
                        {
                            var link = hrefAttribute.Value;
                            if (MongodbChecker.CheckNewsExist(link))
                            {
                                Console.WriteLine("duplicated url: page {0}", link);
                                errorTime++;
                                continue;
                            }
                            newsDetailTargetBlock.Post(link);
                            newsBson.Add("link", link);
                            var titleAttribute = titleNode.Attributes["title"];
                            if (titleAttribute != null)
                            {
                                newsBson.Add("title", titleAttribute.Value);
                            }
                        }
                        var imgNode      = node.SelectSingleNode(".//img");
                        var srcAttribute = imgNode == null ? null : imgNode.Attributes["src"];
                        if (srcAttribute != null)
                        {
                            var imgUrl = WebpageHelper.GetSubUrl(srcAttribute.Value);
                            newsBson.Add("img", imgUrl);
                            newsBson.Add("compressImg", WebImageSaver.Instance.GetComressImageName(imgUrl));
                            imageTargetBlock.Post(GetIhChina.MainPage + srcAttribute.Value);
                        }
                        var dataNode = node.SelectSingleNode(".//div[@class='date']/div");
                        if (dataNode != null)
                        {
                            newsBson.Add("date", dataNode.InnerText);
                        }
                        var contentNode = node.SelectSingleNode(".//div[@class='p']");
                        if (contentNode != null)
                        {
                            newsBson.Add("content", contentNode.InnerText);
                        }
                        if (newsBson.Any())
                        {
                            newslistBsons.Add(newsBson);
                        }
                        // Insert into the database every 10 entries to keep the in-memory batch small.
                        if (newslistBsons.Count == 10)
                        {
                            MongodbSaver.SaveNewsList(newslistBsons);
                            newslistBsons.Clear();
                        }
                    }
                    if (newslistBsons.Count != 0)
                    {
                        MongodbSaver.SaveNewsList(newslistBsons);
                    }
                }
            }
            finally
            {
                // Always signal completion so the detail generator can finish even when the loop throws.
                newsDetailTargetBlock.Complete();
            }
            return(await newsDetailPageGenerate);
        }