/// <summary>
/// Downloads a heritage project detail page and converts its title, description
/// table, body text and related-inheritor section into a single BSON document.
/// </summary>
/// <param name="url">Address of the detail page; also stored in the result under "link".</param>
/// <param name="continueReptile">Forwarded unchanged to <c>GetBottomInheritageAndReleventInfo</c>.</param>
/// <returns>
/// The populated document, or <c>null</c> when the page has no <c>x-container</c> div
/// or no title node.
/// </returns>
private static BsonDocument GenerateHeritageProjectDetailPage(string url, bool continueReptile)
        {
            var doc           = WebpageHelper.GetHttpRequestDocument(url);
            var containerNode = doc.DocumentNode.SelectSingleNode("//div[@class='x-container']");

            if (containerNode == null)
            {
                return(null);
            }

            // Header title and table.
            // NOTE(review): this XPath starts with "//" rather than ".//", so it is presumably
            // evaluated against the whole document instead of containerNode — confirm intent.
            var titleNode = containerNode.SelectSingleNode("//div[@class='t_head']/div[@class='h30']");

            if (titleNode == null)
            {
                return(null);
            }
            var bsonDocument = new BsonDocument();

            bsonDocument.Add("title", titleNode.InnerText);
            bsonDocument.Add("link", url);

            // The table div is optional: look it up separately so a page without it
            // simply yields no "desc" element instead of a NullReferenceException.
            var tableNode  = containerNode.SelectSingleNode(".//div[@class='table']");
            var tableNodes = tableNode?.SelectNodes(".//div[@class='p']");

            if (tableNodes != null)
            {
                BsonArray descList = new BsonArray();
                foreach (var node in tableNodes.Descendants())
                {
                    descList.Add(node.InnerText);
                }
                if (descList.Count > 0)
                {
                    bsonDocument.Add("desc", descList);
                }
            }

            // Descriptive text in the central area of the page.
            var contentNode = containerNode.SelectSingleNode(".//div[@class='text']//div[@class='p']");

            if (contentNode != null)
            {
                StringBuilder sb = new StringBuilder();
                foreach (var node in contentNode.Descendants())
                {
                    sb.Append(node.InnerText);
                }
                bsonDocument.Add("text", sb.ToString());
            }

            // Related inheritors listed at the bottom of the page.
            GetBottomInheritageAndReleventInfo(containerNode, bsonDocument, continueReptile);
            Console.WriteLine(bsonDocument);
            return(bsonDocument);
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Walks the paginated people list, saves every new list entry and posts each
        /// entry's link to the detail pipeline started via <c>GetNewsDetail.GeneratePeopleDetail</c>.
        /// Stops early once ten errors (missing list nodes or null entries) have accumulated.
        /// </summary>
        /// <param name="imageTargetBlock">Block handed to the list analyzer and the detail pipeline.</param>
        /// <returns>A task that completes after the detail pipeline has finished.</returns>
        private static async Task GetPeoplePageList(BufferBlock <string> imageTargetBlock)
        {
            const int errorLimit = 10;

            var detailLinks = new BufferBlock <string>();
            var detailTask  = GetNewsDetail.GeneratePeopleDetail(detailLinks, imageTargetBlock);
            var errorCount  = 0;
            var firstPage   = "http://www.ihchina.cn/character/p/1.html";

            // In debug mode only the first page is crawled.
            var pageCount = DebugHelperTools.IsDebugMode() ? 1 : WebpageHelper.GetPageLastIndex(firstPage);

            for (int page = 1; page <= pageCount && errorCount < errorLimit; page++)
            {
                var pageUrl = string.Format("http://www.ihchina.cn/character/p/{0}.html", page);
                Console.WriteLine("starting process people page: {0}", pageUrl);

                var pageDoc   = WebpageHelper.GetHttpRequestDocument(pageUrl);
                var itemNodes = pageDoc.DocumentNode.SelectNodes("//div[@class='list-item']");
                if (itemNodes == null)
                {
                    errorCount++;
                    continue;
                }

                var pageEntries = new List <BsonDocument>();
                foreach (var itemNode in itemNodes)
                {
                    if (errorCount == errorLimit)
                    {
                        break;
                    }

                    var entry = WebpageHelper.AnalizeGeneralListInformation(itemNode, MongodbChecker.CheckPeoplePageListExist, imageTargetBlock);
                    if (entry != null)
                    {
                        // New entry: queue its detail page and remember it for the bulk save.
                        detailLinks.Post(entry.GetElement("link").Value.ToString());
                        pageEntries.Add(entry);
                        continue;
                    }

                    // A null result is counted as an error and logged as a duplicate.
                    errorCount++;
                    Console.WriteLine("duplicated people url page{0}", page);
                }

                if (pageEntries.Count > 0)
                {
                    MongodbSaver.SavePeopleListInformation(pageEntries);
                }
                pageEntries.Clear();
            }

            // Signal that no more links will arrive, then wait for the detail pipeline.
            detailLinks.Complete();
            await detailTask;
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Scrapes the tab images and their captions from the page at <c>MAIN_PAGE</c>,
        /// posts each image URL to <paramref name="imageTargetBlock"/> and stores the
        /// collected table through <c>WebpageHelper.TryToInsertOrUpdateABson</c>.
        /// Returns without storing anything when either node set is missing or their
        /// counts differ.
        /// </summary>
        /// <param name="imageTargetBlock">Block that receives every extracted image URL.</param>
        private static void GetPeopleMainPage(BufferBlock <string> imageTargetBlock)
        {
            var doc  = WebpageHelper.GetHttpRequestDocument(MAIN_PAGE);
            var bson = new BsonDocument();
            // Image nodes and caption nodes are matched up by index.
            var tableImageNodes   = doc.DocumentNode.SelectNodes("//div[@class='tab-cont']/div");
            var tableContentNodes = doc.DocumentNode.SelectNodes("//div[@class='tab-track justify']/div");

            // Both collections must exist before their counts can be compared
            // (the original tested tableContentNodes twice and never tableImageNodes).
            if (tableImageNodes == null || tableContentNodes == null || tableImageNodes.Count != tableContentNodes.Count)
            {
                return;
            }
            var bsonArray = new BsonArray();

            for (int i = 0; i < tableImageNodes.Count; i++)
            {
                var imageNode   = tableImageNodes[i];
                var contentNode = tableContentNodes[i];
                var linkNode    = contentNode.SelectSingleNode(".//a");
                if (linkNode == null)
                {
                    continue;
                }

                // Skip entries whose markup lacks the attributes we need instead of crashing.
                var styleAttribute = imageNode.Attributes["style"];
                var hrefAttribute  = linkNode.Attributes["href"];
                if (styleAttribute == null || hrefAttribute == null)
                {
                    continue;
                }

                // The image URL is the part of the style text from the first "/" up to
                // (not including) the first ")".
                var styleText = styleAttribute.Value;
                var urlStart  = styleText.IndexOf("/", StringComparison.Ordinal);
                var urlEnd    = styleText.IndexOf(")", StringComparison.Ordinal);
                if (urlStart < 0 || urlEnd < urlStart)
                {
                    continue;
                }

                var imageUrl  = styleText.Substring(urlStart, urlEnd - urlStart);
                var imageDesc = contentNode.InnerText;
                var link      = hrefAttribute.Value;

                imageTargetBlock.Post(imageUrl);

                var tableBson = new BsonDocument();
                tableBson.Add("img", WebpageHelper.GetSubUrl(imageUrl));
                tableBson.Add("desc", imageDesc);
                tableBson.Add("link", link);
                bsonArray.Add(tableBson);
            }
            bson.Add("table", bsonArray);
            var success = WebpageHelper.TryToInsertOrUpdateABson(bson, MongodbMain.PeopleMainPage);

            if (success)
            {
                Console.WriteLine("People page Insert or update success");
            }
            else
            {
                Console.WriteLine("Duplicated information in people page");
            }
        }
Ejemplo n.º 4
0
        /// <summary>
        /// Collects the banner links (anchors with class "slick-link p-show") from the page
        /// at <c>MainPage</c> that <c>MongodbChecker.CheckMainNewsList</c> does not report,
        /// posts their image URLs to <paramref name="imageTargetBlock"/>, saves them through
        /// <c>MongodbSaver.SaveMainpageNewsList</c> and logs each saved entry.
        /// </summary>
        /// <param name="imageTargetBlock">Block that receives every extracted banner image URL.</param>
        private static void GetBanner(BufferBlock <string> imageTargetBlock)
        {
            var doc = WebpageHelper.GetHttpRequestDocument(MainPage);

            // Build the result eagerly, exactly once. The original used a deferred LINQ query
            // that was enumerated three times, repeating the database lookup on every pass and
            // handing the save call different document instances than the ones posted/logged.
            var banners = new List <BsonDocument>();
            foreach (var anchor in doc.DocumentNode.Descendants())
            {
                if (anchor.Name != "a")
                {
                    continue;
                }

                var hrefAttribute  = anchor.Attributes["href"];
                var classAttribute = anchor.Attributes["class"];
                var styleAttribute = anchor.Attributes["style"];
                if (hrefAttribute == null ||
                    classAttribute == null ||
                    !classAttribute.Value.Equals("slick-link p-show") ||
                    styleAttribute == null)
                {
                    continue;
                }

                // The image URL is the part of the style text from the first "/" up to
                // (not including) the last ")".
                var styleText = styleAttribute.Value;
                var urlStart  = styleText.IndexOf("/", StringComparison.Ordinal);
                var urlEnd    = styleText.LastIndexOf(")", StringComparison.Ordinal);
                if (urlStart < 0 || urlEnd < urlStart)
                {
                    continue;
                }

                // Existence lookup last, so it only runs for anchors that are otherwise usable.
                if (MongodbChecker.CheckMainNewsList(hrefAttribute.Value))
                {
                    continue;
                }

                var originImage = styleText.Substring(urlStart, urlEnd - urlStart);
                var subUrl      = WebpageHelper.GetSubUrl(originImage);

                banners.Add(new BsonDocument()
                            .Add("link", hrefAttribute.Value)
                            .Add("img", subUrl)
                            .Add("compressImg", WebImageSaver.Instance.GetComressImageName(subUrl))
                            .Add("originImage", originImage));
            }

            foreach (var banner in banners)
            {
                imageTargetBlock.Post(banner["originImage"].AsBsonValue.ToString());
            }

            if (banners.Count == 0)
            {
                return;
            }

            var bannerArray = banners.ToArray();
            MongodbSaver.SaveMainpageNewsList(bannerArray);
            foreach (var banner in bannerArray)
            {
                Console.WriteLine(banner["link"].AsBsonValue + " " + banner["img"].AsBsonValue);
            }
        }
Ejemplo n.º 5
0
        /// <summary>
        /// Reads the map tables from the page at <c>MAIN_PAGE</c> and adds them to
        /// <paramref name="bson"/> as a "mapTables" array. Each entry pairs a fixed
        /// description with the document returned by <c>GetMapInformation</c>; tables
        /// for which that document is null or empty are left out.
        /// Nothing is added when the page contains no matching table nodes.
        /// </summary>
        /// <param name="bson">Document that receives the "mapTables" element.</param>
        public static void GetHeritageMapTableInformation(BsonDocument bson)
        {
            var doc   = WebpageHelper.GetHttpRequestDocument(MAIN_PAGE);
            var nodes = doc.DocumentNode.SelectNodes("//div[@class='tab-cont map_num']/div");

            if (nodes == null)
            {
                return;
            }

            // Descriptions for the table nodes, matched by position.
            string[] tableNames = { "国家级代表项目", "国家级代表性传承人" };

            var tableArray = new BsonArray();

            // Bound the loop by the nodes actually present as well: the original always
            // indexed nodes[0] and nodes[1] and threw when fewer than two were found.
            for (int i = 0; i < tableNames.Length && i < nodes.Count; i++)
            {
                BsonDocument tableContent = GetMapInformation(nodes[i]);
                if (tableContent != null && tableContent.ElementCount > 0)
                {
                    var tableInfo = new BsonDocument();
                    tableInfo.Add("desc", tableNames[i]);
                    tableInfo.Add("content", tableContent);
                    tableArray.Add(tableInfo);
                }
            }
            bson.Add("mapTables", tableArray);
        }
Ejemplo n.º 6
0
        /// <summary>
        /// Scrapes the heritage project main page (<c>PROJECT_MAIN_PAGE</c>): title, intro
        /// text, the numeric counters and the map tables. The result is inserted when no
        /// stored copy exists, written as an update when <c>CheckBsonIsEqual</c> reports a
        /// difference from the stored copy, and otherwise left alone.
        /// Does nothing when the title area is not found on the page.
        /// </summary>
        /// <param name="imageTargetBlock">Not used by this method's body.</param>
        private static void GetMainPageInformation(BufferBlock <string> imageTargetBlock)
        {
            var doc   = WebpageHelper.GetHttpRequestDocument(PROJECT_MAIN_PAGE);
            var nodes = doc.DocumentNode.SelectNodes("//div[@class='x-wrap']/div[@class='title']/div");

            if (nodes != null)
            {
                var bson = new BsonDocument();
                foreach (var node in nodes)
                {
                    // Null-conditional access: a div without a class attribute simply matches
                    // no case (consistent with the counter loop below) instead of throwing.
                    switch (node.Attributes["class"]?.Value)
                    {
                    case "h30":
                        bson.Add("title", node.InnerText);
                        break;

                    case "p":
                        StringBuilder sb = new StringBuilder();
                        foreach (var contentNode in node.ChildNodes)
                        {
                            sb.Append(contentNode.InnerText);
                        }
                        bson.Add("content", sb.ToString());
                        break;
                    }
                }

                // Numeric counters shown on the heritage page.
                nodes = doc.DocumentNode.SelectNodes("//div/div[@class='num-item']");
                if (nodes != null)
                {
                    var bsonArray = new BsonArray();
                    foreach (var node in nodes)
                    {
                        var numBson = new BsonDocument();
                        foreach (var childNode in node.ChildNodes)
                        {
                            switch (childNode.Attributes["class"]?.Value)
                            {
                            case "b":
                                // NOTE(review): assumes every "b" node carries a data-rn
                                // attribute; a missing one would throw here — confirm.
                                numBson.Add("num", childNode.Attributes["data-rn"].Value);
                                break;

                            case "h18":
                                numBson.Add("desc", childNode.InnerText);
                                break;
                            }
                        }
                        bsonArray.Add(numBson);
                    }

                    bson.Add("numItem", bsonArray);
                }

                // Map tables of intangible cultural heritage from the home page.
                GetHeritageMapTableInformation(bson);

                var mongodbBson = MongodbGetter.GetHeritageProjectMainPageDesc();
                if (mongodbBson == null)
                {
                    Console.WriteLine("Insert Heritage Project Content");
                    MongodbSaver.SaveHeritageProjectMainContent(bson);
                }
                else if (!CheckBsonIsEqual(bson, mongodbBson))
                {
                    Console.WriteLine("Update Heritage Project Content");
                    MongodbUpdater.UpdateHeritageProjectMainContent(bson);
                }
                else
                {
                    Console.WriteLine("Not Insert Heritage Project Content");
                }
                Console.WriteLine(bson);
            }
        }