/// <summary>
/// Runs <c>WebpageHelper.GetMetaDescrition</c> on <paramref name="html"/> and asserts that
/// the result is at most 160 characters long and equal to <paramref name="expected"/>.
/// </summary>
/// <param name="html">Input handed to the helper.</param>
/// <param name="expected">Description the helper is expected to return.</param>
private void GetMetaDescrition(String html, String expected)
        {
            String  description = WebpageHelper.GetMetaDescrition(html);
            Boolean fitsLimit   = description.Length <= 160;

            Assert.IsTrue(fitsLimit);
            Assert.AreEqual(expected, description);
        }
Esempio n. 2
0
        /// <summary>
        /// Checks <c>WebpageHelper.GetSubUrl</c> on URLs that contain slashes: a bare host URL
        /// is expected to yield the host name, and a URL with a path is expected to yield
        /// the text after the last slash.
        /// </summary>
        public void GetUrlTestWithSlash()
        {
            var subName           = WebpageHelper.GetSubUrl(@"http://www.baidu.com");
            var subNameMultiSlash = WebpageHelper.GetSubUrl(@"http://www.baidu.com/newsdetail/10204214.html");

            // A test that reaches the end without a failed assertion already passes,
            // so no explicit Assert.Pass() is needed here.
            Assert.AreEqual("www.baidu.com", subName);
            Assert.AreEqual("10204214.html", subNameMultiSlash);
        }
Esempio n. 3
0
        /// <summary>
        /// Loads a fixed URL with <c>HtmlWeb</c>, passes the document to <c>WebPageSaver.SaveHtml</c>,
        /// and asserts that a file named by <c>WebpageHelper.GetSubUrl</c> for that URL exists.
        /// </summary>
        public void TryToSaveHtmlFile()
        {
            const string pageUrl = "http://billie66.github.io/TLCL/book/chap01.html";

            HtmlWeb loader         = new HtmlWeb();
            var     loadedDocument = loader.Load(pageUrl);
            WebPageSaver.SaveHtml(pageUrl, loadedDocument);

            var savedFileName = WebpageHelper.GetSubUrl(pageUrl);
            Assert.IsTrue(File.Exists(savedFileName));
        }
Esempio n. 4
0
        /// <summary>
        /// Checks <c>WebpageHelper.CorrectRequestString</c> against three inputs: a path that
        /// starts with '/', a path with extra characters before its first '/', and a URL that
        /// already begins with <c>MAIN_PAGE</c>.
        /// </summary>
        public void CorrectRequestStringTest()
        {
            string rootedPath   = @"/asdsad/asdasdasd";
            string prefixedPath = @"+23/dasvsa/fcasvsaf";
            string fullUrl      = MAIN_PAGE + @"/asd/asd/asd";

            // The rooted path is expected to come back with MAIN_PAGE prepended.
            string expectedRooted = MAIN_PAGE + rootedPath;
            Assert.AreEqual(expectedRooted, WebpageHelper.CorrectRequestString(rootedPath));

            // The characters before the first '/' are expected to be dropped.
            string expectedPrefixed = MAIN_PAGE + @"/dasvsa/fcasvsaf";
            Assert.AreEqual(expectedPrefixed, WebpageHelper.CorrectRequestString(prefixedPath));

            // A URL that already starts with MAIN_PAGE is expected to be returned unchanged.
            Assert.AreEqual(fullUrl, WebpageHelper.CorrectRequestString(fullUrl));
        }
        /// <summary>
        /// Builds a BSON document from one heritage project detail page: its title, link,
        /// the entries of the header table ("desc"), the central description ("text"), and
        /// whatever <c>GetBottomInheritageAndReleventInfo</c> adds.
        /// </summary>
        /// <param name="url">Address of the detail page; also stored under the "link" key.</param>
        /// <param name="continueReptile">Forwarded unchanged to <c>GetBottomInheritageAndReleventInfo</c>.</param>
        /// <returns>
        /// The populated document, or <c>null</c> when the page has no <c>x-container</c> div
        /// or no title node.
        /// </returns>
        private static BsonDocument GenerateHeritageProjectDetailPage(string url, bool continueReptile)
        {
            var doc           = WebpageHelper.GetHttpRequestDocument(url);
            var containerNode = doc.DocumentNode.SelectSingleNode("//div[@class='x-container']");

            if (containerNode == null)
            {
                return(null);
            }

            // Header title and table.
            // NOTE(review): this XPath starts with "//" rather than ".//", so it presumably
            // matches anywhere in the document, not only below containerNode. Confirm intent.
            var titleNode = containerNode.SelectSingleNode("//div[@class='t_head']/div[@class='h30']");

            if (titleNode == null)
            {
                return(null);
            }
            var bsonDocument = new BsonDocument();

            bsonDocument.Add("title", titleNode.InnerText);
            bsonDocument.Add("link", url);

            // The table div is optional: a page without it simply gets no "desc" entry
            // instead of failing with a NullReferenceException.
            var tableNode  = containerNode.SelectSingleNode(".//div[@class='table']");
            var tableNodes = tableNode?.SelectNodes(".//div[@class='p']");

            if (tableNodes != null)
            {
                BsonArray descList = new BsonArray();
                foreach (var node in tableNodes.Descendants())
                {
                    descList.Add(node.InnerText);
                }
                if (descList.Count > 0)
                {
                    bsonDocument.Add("desc", descList);
                }
            }

            // Description text in the central area.
            var contentNode = containerNode.SelectSingleNode(".//div[@class='text']//div[@class='p']");

            if (contentNode != null)
            {
                StringBuilder sb = new StringBuilder();
                foreach (var node in contentNode.Descendants())
                {
                    sb.Append(node.InnerText);
                }
                bsonDocument.Add("text", sb.ToString());
            }

            // Related inheritors listed at the bottom of the page.
            GetBottomInheritageAndReleventInfo(containerNode, bsonDocument, continueReptile);
            Console.WriteLine(bsonDocument);
            return(bsonDocument);
        }
Esempio n. 6
0
        /// <summary>
        /// Walks the people list pages, saves the entries of each page through
        /// <c>MongodbSaver.SavePeopleListInformation</c>, and posts every entry's link to the
        /// block consumed by <c>GetNewsDetail.GeneratePeopleDetail</c>.
        /// </summary>
        /// <remarks>
        /// <c>errorTime</c> counts pages without any <c>list-item</c> div plus entries for which
        /// <c>AnalizeGeneralListInformation</c> returns null; processing stops once it reaches 10.
        /// In debug mode only the first page is processed.
        /// </remarks>
        /// <param name="imageTargetBlock">Passed on to the detail worker and to the list-entry analysis.</param>
        /// <returns>A task that completes after the detail worker task has finished.</returns>
        private static async Task GetPeoplePageList(BufferBlock <string> imageTargetBlock)
        {
            var block = new BufferBlock <string>();
            var task  = GetNewsDetail.GeneratePeopleDetail(block, imageTargetBlock);

            try
            {
                int errorTime      = 0;
                var firstPage      = "http://www.ihchina.cn/character/p/1.html";
                var lastPageNumber = DebugHelperTools.IsDebugMode() ? 1 : WebpageHelper.GetPageLastIndex(firstPage);

                for (int i = 1; i <= lastPageNumber && errorTime < 10; i++)
                {
                    var listUrl = string.Format("http://www.ihchina.cn/character/p/{0}.html", i);
                    Console.WriteLine("starting process people page: {0}", listUrl);
                    var doc       = WebpageHelper.GetHttpRequestDocument(listUrl);
                    var listNodes = doc.DocumentNode.SelectNodes("//div[@class='list-item']");
                    if (listNodes == null)
                    {
                        errorTime++;
                        continue;
                    }

                    List <BsonDocument> result = new List <BsonDocument>();
                    foreach (var node in listNodes)
                    {
                        if (errorTime == 10)
                        {
                            break;
                        }
                        var bson = WebpageHelper.AnalizeGeneralListInformation(node, MongodbChecker.CheckPeoplePageListExist, imageTargetBlock);
                        if (bson == null)
                        {
                            errorTime++;
                            Console.WriteLine("duplicated people url page{0}", i);
                            continue;
                        }

                        // bson is known to be non-null here, so no second check is needed.
                        block.Post(bson.GetElement("link").Value.ToString());
                        result.Add(bson);
                    }

                    if (result.Count > 0)
                    {
                        MongodbSaver.SavePeopleListInformation(result);
                    }
                }
            }
            finally
            {
                // Always signal completion so the detail worker is not left waiting
                // for more links when the loop above throws.
                block.Complete();
            }

            await task;
        }
Esempio n. 7
0
        /// <summary>
        /// Reads the image/description table from <c>MAIN_PAGE</c>, posts each image URL to
        /// <paramref name="imageTargetBlock"/>, and stores the resulting "table" array through
        /// <c>WebpageHelper.TryToInsertOrUpdateABson</c>.
        /// </summary>
        /// <remarks>
        /// Returns without storing anything when either node list is missing or the two lists
        /// differ in length. Entries without a link, without a <c>style</c> attribute, or whose
        /// style contains no "/…)" span are skipped.
        /// </remarks>
        /// <param name="imageTargetBlock">Receives the image URL extracted from each entry.</param>
        private static void GetPeopleMainPage(BufferBlock <string> imageTargetBlock)
        {
            var doc = WebpageHelper.GetHttpRequestDocument(MAIN_PAGE);
            // Fetch the image nodes and the table content nodes.
            var tableImageNodes   = doc.DocumentNode.SelectNodes("//div[@class='tab-cont']/div");
            var tableContentNodes = doc.DocumentNode.SelectNodes("//div[@class='tab-track justify']/div");

            // Both lists must exist before their counts can be compared.
            if (tableImageNodes == null || tableContentNodes == null || tableImageNodes.Count != tableContentNodes.Count)
            {
                return;
            }

            var bsonArray = new BsonArray();
            for (int i = 0; i < tableImageNodes.Count; i++)
            {
                var contentNode = tableContentNodes[i];
                var link        = contentNode.SelectSingleNode(".//a")?.Attributes["href"]?.Value;
                var style       = tableImageNodes[i].Attributes["style"]?.Value;
                if (link == null || style == null)
                {
                    continue;
                }

                // The image URL is the text from the first '/' up to the first ')'.
                int pathStart = style.IndexOf('/');
                int pathEnd   = style.IndexOf(')');
                if (pathStart < 0 || pathEnd < pathStart)
                {
                    continue;
                }
                var imageUrl = style.Substring(pathStart, pathEnd - pathStart);

                imageTargetBlock.Post(imageUrl);

                var tableBson = new BsonDocument();
                tableBson.Add("img", WebpageHelper.GetSubUrl(imageUrl));
                tableBson.Add("desc", contentNode.InnerText);
                tableBson.Add("link", link);
                bsonArray.Add(tableBson);
            }

            var bson = new BsonDocument();
            bson.Add("table", bsonArray);
            var success = WebpageHelper.TryToInsertOrUpdateABson(bson, MongodbMain.PeopleMainPage);

            if (success)
            {
                Console.WriteLine("People page Insert or update success");
            }
            else
            {
                Console.WriteLine("Duplicated information in people page");
            }
        }
Esempio n. 8
0
        /// <summary>
        /// Collects the banner links of <c>MainPage</c> that <c>MongodbChecker.CheckMainNewsList</c>
        /// does not report, posts their image paths to <paramref name="imageTargetBlock"/>,
        /// saves them through <c>MongodbSaver.SaveMainpageNewsList</c>, and prints each one.
        /// </summary>
        /// <remarks>
        /// The query is materialised once, so the page is scanned and the database check runs
        /// a single time per link. Links without a usable <c>style</c> value are skipped.
        /// </remarks>
        /// <param name="imageTargetBlock">Receives the "originImage" value of every new banner.</param>
        private static void GetBanner(BufferBlock <string> imageTargetBlock)
        {
            var doc     = WebpageHelper.GetHttpRequestDocument(MainPage);
            var banners = (from link in doc.DocumentNode.Descendants()
                           where
                           link.Name == "a" &&
                           link.Attributes["href"] != null &&
                           link.Attributes["class"] != null &&
                           link.Attributes["class"].Value.Equals("slick-link p-show") &&
                           !MongodbChecker.CheckMainNewsList(link.Attributes["href"].Value)
                           let imagePath = ExtractBannerImagePath(link.Attributes["style"]?.Value)
                           where imagePath != null
                           select new BsonDocument()
                           .Add("link", link.Attributes["href"].Value)
                           .Add("img", WebpageHelper.GetSubUrl(imagePath))
                           .Add("compressImg", WebImageSaver.Instance.GetComressImageName(WebpageHelper.GetSubUrl(imagePath)))
                           .Add("originImage", imagePath)).ToArray();

            if (banners.Length == 0)
            {
                return;
            }

            foreach (var banner in banners)
            {
                imageTargetBlock.Post(banner["originImage"].AsBsonValue.ToString());
            }

            MongodbSaver.SaveMainpageNewsList(banners);
            foreach (var banner in banners)
            {
                Console.WriteLine(banner["link"].AsBsonValue + " " + banner["img"].AsBsonValue);
            }
        }

        /// <summary>
        /// Returns the part of <paramref name="style"/> from its first '/' up to, but not
        /// including, its last ')'; returns <c>null</c> when no such span exists.
        /// </summary>
        private static string ExtractBannerImagePath(string style)
        {
            if (string.IsNullOrEmpty(style))
            {
                return null;
            }

            int start = style.IndexOf("/", StringComparison.Ordinal);
            int end   = style.LastIndexOf(")", StringComparison.Ordinal);

            return (start >= 0 && end > start) ? style.Substring(start, end - start) : null;
        }
Esempio n. 9
0
        /// <summary>
        /// Shows the detail view of a single post.
        /// </summary>
        /// <param name="id">Identifier passed to <c>GetPostWithDetails</c> and the neighbour-post lookups.</param>
        /// <returns>
        /// <c>HttpNotFound()</c> when no post is found for <paramref name="id"/>;
        /// otherwise the view rendered with the populated model.
        /// </returns>
        public ActionResult Details(Int32 id)
        {
            using (var db = new DataService())
            {
                // Start the model from the post stored in the database
                Details model = new Details()
                {
                    Post = db.GetPostWithDetails(id)
                };

                // Nothing to show when the post does not exist
                if (model.Post == null)
                {
                    return(HttpNotFound());
                }

                // Meta description derived from the post summary
                model.Description = WebpageHelper.GetMetaDescrition(model.Post.Summary);

                // Neighbouring posts, used for the previous and next buttons
                DateTime createdGmt = model.Post.DateCreatedGmt;

                model.PreviousPost = db.GetPreviousPost(id, createdGmt);
                model.NextPost     = db.GetNextPost(id, createdGmt);

                // A comment form is only offered when UserService returns a user
                UserProfile currentUser = UserService.Get(db);
                if (currentUser != null)
                {
                    model.Comment = new Comment();
                    model.CurrentUserSubscibed = db.HasCurrentUserSubscibed(id, currentUser.Id);
                }
                else
                {
                    model.Comment = null;
                    model.CurrentUserSubscibed = false;
                }

                return(View(model));
            }
        }
Esempio n. 10
0
        /// <summary>
        /// Reads the map tables from <c>MAIN_PAGE</c> and adds them to <paramref name="bson"/>
        /// as the "mapTables" array. Nothing is added when the page has no matching nodes.
        /// </summary>
        /// <remarks>
        /// At most the first two matching nodes are used, one per table name. A page with fewer
        /// than two nodes yields a shorter array instead of an index-out-of-range failure.
        /// Tables for which <c>GetMapInformation</c> returns null or an empty document are omitted.
        /// </remarks>
        /// <param name="bson">Document that receives the "mapTables" element.</param>
        public static void GetHeritageMapTableInformation(BsonDocument bson)
        {
            var doc   = WebpageHelper.GetHttpRequestDocument(MAIN_PAGE);
            var nodes = doc.DocumentNode.SelectNodes("//div[@class='tab-cont map_num']/div");

            if (nodes == null)
            {
                return;
            }

            // Table names by node position: "national-level representative projects",
            // then "national-level representative inheritors".
            string[] tableNames = { "国家级代表项目", "国家级代表性传承人" };

            var tableArray = new BsonArray();
            for (int i = 0; i < tableNames.Length && i < nodes.Count; i++)
            {
                BsonDocument tableContent = GetMapInformation(nodes[i]);
                if (tableContent == null || tableContent.ElementCount == 0)
                {
                    continue;
                }

                var tableInfo = new BsonDocument();
                tableInfo.Add("desc", tableNames[i]);
                tableInfo.Add("content", tableContent);
                tableArray.Add(tableInfo);
            }
            bson.Add("mapTables", tableArray);
        }
Esempio n. 11
0
        /// <summary>
        /// Asserts that <c>WebpageHelper.GetSubUrl</c> returns a slash-free input unchanged.
        /// </summary>
        public void GetUrlTestWithoutSlash()
        {
            const string plainName = "asdsads.txt";

            var resolved = WebpageHelper.GetSubUrl(plainName);

            Assert.AreEqual(plainName, resolved);
        }
Esempio n. 12
0
        /// <summary>
        /// Collects the title, content text, number items and map tables of
        /// <c>PROJECT_MAIN_PAGE</c> into one BSON document. The document is inserted when
        /// <c>MongodbGetter.GetHeritageProjectMainPageDesc</c> returns null, updated when it
        /// differs from the stored one, and otherwise left alone.
        /// </summary>
        /// <remarks>Does nothing when the page has no title nodes.</remarks>
        /// <param name="imageTargetBlock">Not used by this method.</param>
        private static void GetMainPageInformation(BufferBlock <string> imageTargetBlock)
        {
            var doc   = WebpageHelper.GetHttpRequestDocument(PROJECT_MAIN_PAGE);
            var nodes = doc.DocumentNode.SelectNodes("//div[@class='x-wrap']/div[@class='title']/div");

            if (nodes == null)
            {
                return;
            }

            var bson = new BsonDocument();
            foreach (var node in nodes)
            {
                // A div without a class attribute matches no case and is skipped
                // rather than causing a NullReferenceException.
                switch (node.Attributes["class"]?.Value)
                {
                case "h30":
                    bson.Add("title", node.InnerText);
                    break;

                case "p":
                    StringBuilder sb = new StringBuilder();
                    foreach (var contentNode in node.ChildNodes)
                    {
                        sb.Append(contentNode.InnerText);
                    }
                    bson.Add("content", sb.ToString());
                    break;
                }
            }

            // Number items shown on the heritage page.
            nodes = doc.DocumentNode.SelectNodes("//div/div[@class='num-item']");
            if (nodes != null)
            {
                var bsonArray = new BsonArray();
                foreach (var node in nodes)
                {
                    var numBson = new BsonDocument();
                    foreach (var childNode in node.ChildNodes)
                    {
                        switch (childNode.Attributes["class"]?.Value)
                        {
                        case "b":
                        {
                            // Only record the number when the data-rn attribute is present.
                            var numberAttribute = childNode.Attributes["data-rn"];
                            if (numberAttribute != null)
                            {
                                numBson.Add("num", numberAttribute.Value);
                            }
                            break;
                        }

                        case "h18":
                            numBson.Add("desc", childNode.InnerText);
                            break;
                        }
                    }
                    bsonArray.Add(numBson);
                }

                bson.Add("numItem", bsonArray);
            }

            // Fetch the intangible cultural heritage map shown on the home page.
            GetHeritageMapTableInformation(bson);

            var mongodbBson = MongodbGetter.GetHeritageProjectMainPageDesc();
            if (mongodbBson == null)
            {
                Console.WriteLine("Insert Heritage Project Content");
                MongodbSaver.SaveHeritageProjectMainContent(bson);
            }
            else if (!CheckBsonIsEqual(bson, mongodbBson))
            {
                Console.WriteLine("Update Heritage Project Content");
                MongodbUpdater.UpdateHeritageProjectMainContent(bson);
            }
            else
            {
                Console.WriteLine("Not Insert Heritage Project Content");
            }
            Console.WriteLine(bson);
        }
Esempio n. 13
0
        /// <summary>
        /// Requests the heritage project list page by page, stores every project whose link
        /// <c>MongodbChecker.CheckHeritageProjectExist</c> does not report, and posts those links
        /// to the block consumed by <c>GetHeritageProjectDetailWorker.GenerateProjectDetailPage</c>.
        /// Blocks until that worker task has finished.
        /// </summary>
        /// <remarks>
        /// <c>errorTime</c> counts empty or unparsable responses plus already-known links;
        /// paging stops once it exceeds 10, when a response reports <c>More != 1</c>, or when
        /// the page bound is reached.
        /// NOTE(review): the loop runs while <c>i &lt; totalPages</c>, so the page numbered
        /// <c>totalPages</c> itself is never requested. Confirm whether that is intended.
        /// </remarks>
        /// <param name="imageTargetBlock">Not used by this method.</param>
        private static void GetAllProjectList(BufferBlock <string> imageTargetBlock)
        {
            short errorTime  = 0;
            var   totalPages = DebugHelper.DebugHelperTools.IsDebugMode() ? 2 : 10;
            var   block      = new BufferBlock <string>();
            var   task       = GetHeritageProjectDetailWorker.GenerateProjectDetailPage(block);
            // The property set is the same for every item, so reflect over it once.
            var   properties = typeof(HeritageProject).GetProperties();

            try
            {
                for (int i = 1; i < totalPages; i++)
                {
                    if (errorTime > 10)
                    {
                        Console.WriteLine("GetAllProjectList: reach the limitation of error time");
                        break;
                    }
                    var currentPage = String.Format(REQUEST_URL, i);
                    Console.WriteLine("Starting process: " + currentPage);
                    var requestResult = WebpageHelper.GetRequest(currentPage);
                    if (string.IsNullOrEmpty(requestResult))
                    {
                        errorTime++;
                        continue;
                    }
                    var jsonObject = JsonConvert.DeserializeObject <HeritageProjectRequest>(requestResult);
                    if (jsonObject == null)
                    {
                        // A response that deserializes to nothing is treated like an empty one.
                        errorTime++;
                        continue;
                    }
                    if (jsonObject.Links.Total_pages != 0 &&
                        jsonObject.Links.Total_pages != totalPages &&
                        !DebugHelper.DebugHelperTools.IsDebugMode())
                    {
                        totalPages = jsonObject.Links.Total_pages;
                    }
                    var list = jsonObject.List;
                    if (list == null || list.Length == 0)
                    {
                        continue;
                    }

                    var bsonArray = new List <BsonDocument>();
                    for (int j = 0; j < list.Length; j++)
                    {
                        list[j].Link = "/project_details/" + list[j].Id;

                        // Check for duplicates first so no document is built for a known link.
                        if (MongodbChecker.CheckHeritageProjectExist(list[j].Link))
                        {
                            Console.WriteLine("Duplicated Heritage Project Link: {0}", list[j].Link);
                            errorTime++;
                            continue;
                        }

                        var bsonDocument = new BsonDocument();
                        foreach (var property in properties)
                        {
                            // Each HeritageProject property becomes one element: the lower-cased
                            // property name is the MongoDB key, and the value is the item's
                            // property value with anything matching "<.*?>" removed.
                            // ToLowerInvariant keeps the keys independent of the current culture;
                            // a null property value is stored as an empty string.
                            var rawValue = property.GetValue(list[j])?.ToString() ?? string.Empty;
                            bsonDocument.Add(property.Name.ToLowerInvariant(), Regex.Replace(rawValue, "<.*?>", string.Empty));
                        }

                        block.Post(list[j].Link);
                        bsonArray.Add(bsonDocument);
                    }

                    if (bsonArray.Count != 0)
                    {
                        MongodbSaver.SaveHeritageProjectNewsList(bsonArray);
                    }

                    if (jsonObject.More != 1)
                    {
                        Console.WriteLine("GetAllProjectList: current page is {0}, more equals 0", i);
                        break;
                    }
                }
            }
            finally
            {
                // Always signal completion so the detail worker is not left waiting
                // for more links when the loop above throws.
                block.Complete();
            }

            task.Wait();
        }