/// <summary>
/// Shared assertion helper: runs <c>WebpageHelper.GetMetaDescrition</c> on the given markup
/// and verifies both the length limit and the exact text of the result.
/// </summary>
/// <param name="html">Input passed to <c>WebpageHelper.GetMetaDescrition</c>.</param>
/// <param name="expected">The description the helper is expected to produce.</param>
private void GetMetaDescrition(String html, String expected)
{
    string description = WebpageHelper.GetMetaDescrition(html);

    // The length limit is asserted first, so an over-long result fails on the limit
    // before the equality comparison is attempted.
    Assert.IsTrue(description.Length <= 160);
    Assert.AreEqual(expected, description);
}
/// <summary>
/// Checks <c>WebpageHelper.GetSubUrl</c> on URLs that contain slashes: a host-only URL is
/// expected to yield the host name, and a nested URL is expected to yield its last segment.
/// </summary>
public void GetUrlTestWithSlash()
{
    const string hostOnlyUrl = @"http://www.baidu.com";
    const string nestedUrl = @"http://www.baidu.com/newsdetail/10204214.html";

    var hostSegment = WebpageHelper.GetSubUrl(hostOnlyUrl);
    var fileSegment = WebpageHelper.GetSubUrl(nestedUrl);

    Assert.AreEqual("www.baidu.com", hostSegment);
    Assert.AreEqual("10204214.html", fileSegment);
    Assert.Pass();
}
/// <summary>
/// Loads a page over HTTP, saves it through <c>WebPageSaver.SaveHtml</c>, and asserts that a
/// file named after <c>WebpageHelper.GetSubUrl(url)</c> exists afterwards.
/// </summary>
public void TryToSaveHtmlFile()
{
    const string url = "http://billie66.github.io/TLCL/book/chap01.html";

    var document = new HtmlWeb().Load(url);
    WebPageSaver.SaveHtml(url, document);

    // The existence check uses the sub-URL of the page as the file path.
    var savedFileName = WebpageHelper.GetSubUrl(url);
    Assert.IsTrue(File.Exists(savedFileName));
}
/// <summary>
/// Checks <c>WebpageHelper.CorrectRequestString</c> against three inputs: a rooted path
/// (expected to be prefixed with <c>MAIN_PAGE</c>), a path with leading characters before the
/// first slash (expected to have them dropped and <c>MAIN_PAGE</c> prepended), and a URL that
/// already starts with <c>MAIN_PAGE</c> (expected to come back unchanged).
/// </summary>
public void CorrectRequestStringTest()
{
    string rootedPath = @"/asdsad/asdasdasd";
    string prefixedPath = @"+23/dasvsa/fcasvsaf";
    string absoluteUrl = MAIN_PAGE + @"/asd/asd/asd";

    string expectedFromRooted = MAIN_PAGE + rootedPath;
    Assert.AreEqual(expectedFromRooted, WebpageHelper.CorrectRequestString(rootedPath));

    string expectedFromPrefixed = MAIN_PAGE + @"/dasvsa/fcasvsaf";
    Assert.AreEqual(expectedFromPrefixed, WebpageHelper.CorrectRequestString(prefixedPath));

    Assert.AreEqual(absoluteUrl, WebpageHelper.CorrectRequestString(absoluteUrl));
}
/// <summary>
/// Downloads a heritage project detail page and converts it into a <see cref="BsonDocument"/>
/// with the keys "title", "link" and, when present on the page, "desc" and "text".
/// </summary>
/// <param name="url">Address of the detail page; also stored under the "link" key.</param>
/// <param name="continueReptile">Passed through unchanged to <c>GetBottomInheritageAndReleventInfo</c>.</param>
/// <returns>
/// The populated document (also written to the console), or <c>null</c> when the
/// container node or the title node cannot be found.
/// </returns>
private static BsonDocument GenerateHeritageProjectDetailPage(string url, bool continueReptile)
{
    var doc = WebpageHelper.GetHttpRequestDocument(url);
    var containerNode = doc.DocumentNode.SelectSingleNode("//div[@class='x-container']");
    if (containerNode == null)
    {
        return null;
    }

    // Header title and table.
    // NOTE(review): this XPath starts with "//" rather than ".//", so it is not anchored to
    // containerNode like the other queries below — confirm whether that is intended.
    var titleNode = containerNode.SelectSingleNode("//div[@class='t_head']/div[@class='h30']");
    if (titleNode == null)
    {
        return null;
    }

    var bsonDocument = new BsonDocument();
    bsonDocument.Add("title", titleNode.InnerText);
    bsonDocument.Add("link", url);

    // The table div is optional: without the null-conditional a page lacking it would
    // throw instead of simply producing a document with no "desc" entry.
    var tableNodes = containerNode
        .SelectSingleNode(".//div[@class='table']")
        ?.SelectNodes(".//div[@class='p']");
    if (tableNodes != null)
    {
        var descList = new BsonArray();
        foreach (var node in tableNodes.Descendants())
        {
            descList.Add(node.InnerText);
        }

        if (descList.Count > 0)
        {
            bsonDocument.Add("desc", descList);
        }
    }

    // Description text in the central area.
    var contentNode = containerNode.SelectSingleNode(".//div[@class='text']//div[@class='p']");
    if (contentNode != null)
    {
        var sb = new StringBuilder();
        foreach (var node in contentNode.Descendants())
        {
            sb.Append(node.InnerText);
        }

        bsonDocument.Add("text", sb.ToString());
    }

    // Related inheritors listed at the bottom of the page.
    GetBottomInheritageAndReleventInfo(containerNode, bsonDocument, continueReptile);

    Console.WriteLine(bsonDocument);
    return bsonDocument;
}
/// <summary>
/// Walks the paginated people list, saves every page's new entries, and forwards each
/// entry's link to the detail worker started via <c>GetNewsDetail.GeneratePeopleDetail</c>.
/// </summary>
/// <param name="imageTargetBlock">
/// Block handed on to the detail worker and to <c>WebpageHelper.AnalizeGeneralListInformation</c>.
/// </param>
/// <returns>A task that completes once the detail worker's task has completed.</returns>
/// <remarks>
/// The walk stops early once ten errors have been counted; a page without list items and
/// an entry for which the analysis returns <c>null</c> each count as one error.
/// In debug mode only the first page is processed.
/// </remarks>
private static async Task GetPeoplePageList(BufferBlock<string> imageTargetBlock)
{
    var detailLinks = new BufferBlock<string>();
    var detailTask = GetNewsDetail.GeneratePeopleDetail(detailLinks, imageTargetBlock);
    int errorTime = 0;

    var firstPage = "http://www.ihchina.cn/character/p/1.html";
    var lastPageNumber = DebugHelperTools.IsDebugMode() ? 1 : WebpageHelper.GetPageLastIndex(firstPage);

    for (int page = 1; page <= lastPageNumber && errorTime < 10; page++)
    {
        var listUrl = string.Format("http://www.ihchina.cn/character/p/{0}.html", page);
        Console.WriteLine("starting process people page: {0}", listUrl);

        var pageDocument = WebpageHelper.GetHttpRequestDocument(listUrl);
        var listNodes = pageDocument.DocumentNode.SelectNodes("//div[@class='list-item']");
        if (listNodes == null)
        {
            errorTime++;
            continue;
        }

        var pageEntries = new List<BsonDocument>();
        foreach (var node in listNodes)
        {
            // Stop scanning this page as soon as the error budget is used up.
            if (errorTime == 10)
            {
                break;
            }

            var entry = WebpageHelper.AnalizeGeneralListInformation(node, MongodbChecker.CheckPeoplePageListExist, imageTargetBlock);
            if (entry != null)
            {
                var link = entry.GetElement("link").Value.ToString();
                detailLinks.Post(link);
                pageEntries.Add(entry);
            }
            else
            {
                errorTime++;
                Console.WriteLine("duplicated people url page{0}", page);
            }
        }

        if (pageEntries.Count > 0)
        {
            MongodbSaver.SavePeopleListInformation(pageEntries);
        }

        pageEntries.Clear();
    }

    // Signal the detail worker that no more links will arrive, then wait for it.
    detailLinks.Complete();
    await detailTask;
}
/// <summary>
/// Scrapes the tab images and their captions from <c>MAIN_PAGE</c>, posts every image URL to
/// <paramref name="imageTargetBlock"/>, and inserts or updates the resulting "table" document
/// in <c>MongodbMain.PeopleMainPage</c>.
/// </summary>
/// <param name="imageTargetBlock">Receives the URL of every image found.</param>
/// <remarks>
/// Nothing is stored when either node list is missing or the two lists differ in length.
/// Individual entries are skipped when they have no link, no <c>href</c>/<c>style</c>
/// attribute, or a style value that does not contain a "/…)" section.
/// </remarks>
private static void GetPeopleMainPage(BufferBlock<string> imageTargetBlock)
{
    var doc = WebpageHelper.GetHttpRequestDocument(MAIN_PAGE);

    // Fetch the image nodes and the matching table content nodes.
    var tableImageNodes = doc.DocumentNode.SelectNodes("//div[@class='tab-cont']/div");
    var tableContentNodes = doc.DocumentNode.SelectNodes("//div[@class='tab-track justify']/div");

    // Both lists must exist: the original guard tested the content list twice and
    // never the image list, so a missing image list crashed on ".Count".
    if (tableImageNodes == null
        || tableContentNodes == null
        || tableImageNodes.Count != tableContentNodes.Count)
    {
        return;
    }

    var bsonArray = new BsonArray();
    for (int i = 0; i < tableImageNodes.Count; i++)
    {
        var contentNode = tableContentNodes[i];
        var linkNode = contentNode.SelectSingleNode(".//a");
        if (linkNode == null)
        {
            continue;
        }

        var link = linkNode.Attributes["href"]?.Value;
        var style = tableImageNodes[i].Attributes["style"]?.Value;
        if (link == null || style == null)
        {
            continue;
        }

        // The image URL is the part of the style value from the first '/' up to
        // (not including) the first ')'.
        int urlStart = style.IndexOf('/');
        int urlEnd = style.IndexOf(')');
        if (urlStart < 0 || urlEnd <= urlStart)
        {
            continue;
        }

        var imageUrl = style.Substring(urlStart, urlEnd - urlStart);
        imageTargetBlock.Post(imageUrl);

        var tableBson = new BsonDocument();
        tableBson.Add("img", WebpageHelper.GetSubUrl(imageUrl));
        tableBson.Add("desc", contentNode.InnerText);
        tableBson.Add("link", link);
        bsonArray.Add(tableBson);
    }

    var bson = new BsonDocument();
    bson.Add("table", bsonArray);

    var success = WebpageHelper.TryToInsertOrUpdateABson(bson, MongodbMain.PeopleMainPage);
    if (success)
    {
        Console.WriteLine("People page Insert or update success");
    }
    else
    {
        Console.WriteLine("Duplicated information in people page");
    }
}
/// <summary>
/// Collects the banner links (anchors with class "slick-link p-show") from <c>MainPage</c>
/// that <c>MongodbChecker.CheckMainNewsList</c> does not report, posts their image paths to
/// <paramref name="imageTargetBlock"/>, saves them, and prints them to the console.
/// </summary>
/// <param name="imageTargetBlock">Receives the "originImage" path of every new banner.</param>
/// <remarks>
/// The query is materialized exactly once. Previously the deferred query was enumerated
/// three times, so the database check and the image-name computation were repeated and the
/// saver could see a different set than the one that was posted and printed.
/// Anchors without a usable <c>style</c> attribute are skipped.
/// </remarks>
private static void GetBanner(BufferBlock<string> imageTargetBlock)
{
    var doc = WebpageHelper.GetHttpRequestDocument(MainPage);

    var banners = (from anchor in doc.DocumentNode.Descendants()
                   where anchor.Name == "a"
                   let href = anchor.Attributes["href"]?.Value
                   let cssClass = anchor.Attributes["class"]?.Value
                   where href != null && cssClass == "slick-link p-show"
                   let originImage = ExtractBannerImagePath(anchor.Attributes["style"]?.Value)
                   where originImage != null && !MongodbChecker.CheckMainNewsList(href)
                   let imageName = WebpageHelper.GetSubUrl(originImage)
                   select new BsonDocument()
                       .Add("link", href)
                       .Add("img", imageName)
                       .Add("compressImg", WebImageSaver.Instance.GetComressImageName(imageName))
                       .Add("originImage", originImage))
                  .ToArray();

    if (banners.Length == 0)
    {
        return;
    }

    foreach (var banner in banners)
    {
        imageTargetBlock.Post(banner["originImage"].AsBsonValue.ToString());
    }

    MongodbSaver.SaveMainpageNewsList(banners);

    foreach (var banner in banners)
    {
        Console.WriteLine(banner["link"].AsBsonValue + " " + banner["img"].AsBsonValue);
    }
}

/// <summary>
/// Returns the part of a style value from its first '/' up to (not including) its last ')',
/// or <c>null</c> when the value is missing or does not contain such a section.
/// </summary>
private static string ExtractBannerImagePath(string style)
{
    if (string.IsNullOrEmpty(style))
    {
        return null;
    }

    int start = style.IndexOf("/", StringComparison.Ordinal);
    int end = style.LastIndexOf(")", StringComparison.Ordinal);
    if (start < 0 || end <= start)
    {
        return null;
    }

    return style.Substring(start, end - start);
}
/// <summary>
/// Shows the detail page of a single post.
/// </summary>
/// <param name="id">Identifier of the post to display.</param>
/// <returns>
/// The details view for the post, or the result of <c>HttpNotFound()</c> when the data
/// service returns no post for <paramref name="id"/>.
/// </returns>
public ActionResult Details(Int32 id)
{
    using (var db = new DataService())
    {
        // Start the view model from the post stored in the database.
        var model = new Details { Post = db.GetPostWithDetails(id) };
        if (model.Post == null)
        {
            return HttpNotFound();
        }

        // Meta description derived from the post summary.
        model.Description = WebpageHelper.GetMetaDescrition(model.Post.Summary);

        // Neighbouring posts, used to render the previous/next buttons.
        DateTime createdAt = model.Post.DateCreatedGmt;
        model.PreviousPost = db.GetPreviousPost(id, createdAt);
        model.NextPost = db.GetNextPost(id, createdAt);

        // A comment model is only provided when a user profile is available.
        UserProfile currentUser = UserService.Get(db);
        if (currentUser != null)
        {
            model.Comment = new Comment();
            model.CurrentUserSubscibed = db.HasCurrentUserSubscibed(id, currentUser.Id);
        }
        else
        {
            model.Comment = null;
            model.CurrentUserSubscibed = false;
        }

        return View(model);
    }
}
/// <summary>
/// Reads the map tables from <c>MAIN_PAGE</c> and adds them to <paramref name="bson"/>
/// under the "mapTables" key.
/// </summary>
/// <param name="bson">Document that receives the "mapTables" array.</param>
/// <remarks>
/// Nothing is added when the page has no matching nodes. At most the first two nodes are
/// used, each labelled with a fixed description; a node is left out when
/// <c>GetMapInformation</c> returns <c>null</c> or an empty document for it.
/// </remarks>
public static void GetHeritageMapTableInformation(BsonDocument bson)
{
    var doc = WebpageHelper.GetHttpRequestDocument(MAIN_PAGE);
    var nodes = doc.DocumentNode.SelectNodes("//div[@class='tab-cont map_num']/div");
    if (nodes == null)
    {
        return;
    }

    // Descriptions for the first and second table, in page order.
    string[] tableNames = { "国家级代表项目", "国家级代表性传承人" };

    var tableArray = new BsonArray();

    // Bounded by nodes.Count as well: the page may contain fewer than two tables, and
    // indexing past the end of the node list would throw.
    for (int i = 0; i < tableNames.Length && i < nodes.Count; i++)
    {
        BsonDocument tableInformation = GetMapInformation(nodes[i]);
        if (tableInformation == null || tableInformation.ElementCount == 0)
        {
            continue;
        }

        var tableInfo = new BsonDocument();
        tableInfo.Add("desc", tableNames[i]);
        tableInfo.Add("content", tableInformation);
        tableArray.Add(tableInfo);
    }

    bson.Add("mapTables", tableArray);
}
/// <summary>
/// Checks that <c>WebpageHelper.GetSubUrl</c> is expected to return a slash-free input unchanged.
/// </summary>
public void GetUrlTestWithoutSlash()
{
    const string fileName = "asdsads.txt";

    var result = WebpageHelper.GetSubUrl(fileName);

    Assert.AreEqual(fileName, result);
}
/// <summary>
/// Scrapes title, content, number items and map tables from <c>PROJECT_MAIN_PAGE</c> and
/// inserts or updates the stored main-page document when it differs from what is in MongoDB.
/// </summary>
/// <param name="imageTargetBlock">Not used by this method.</param>
/// <remarks>
/// Does nothing when the title area cannot be found. Nodes without a <c>class</c> attribute
/// are ignored, and a number item without a <c>data-rn</c> attribute gets no "num" entry.
/// </remarks>
private static void GetMainPageInformation(BufferBlock<string> imageTargetBlock)
{
    var doc = WebpageHelper.GetHttpRequestDocument(PROJECT_MAIN_PAGE);
    var nodes = doc.DocumentNode.SelectNodes("//div[@class='x-wrap']/div[@class='title']/div");
    if (nodes == null)
    {
        return;
    }

    var bson = new BsonDocument();
    foreach (var node in nodes)
    {
        // Null-conditional access, matching the number-item loop below: a div without
        // a class attribute is skipped instead of throwing.
        switch (node.Attributes["class"]?.Value)
        {
            case "h30":
                bson.Add("title", node.InnerText);
                break;

            case "p":
                var sb = new StringBuilder();
                foreach (var contentNode in node.ChildNodes)
                {
                    sb.Append(contentNode.InnerText);
                }

                bson.Add("content", sb.ToString());
                break;
        }
    }

    // Numbers shown on the intangible-heritage page.
    nodes = doc.DocumentNode.SelectNodes("//div/div[@class='num-item']");
    if (nodes != null)
    {
        var bsonArray = new BsonArray();
        foreach (var node in nodes)
        {
            var numBson = new BsonDocument();
            foreach (var childNode in node.ChildNodes)
            {
                switch (childNode.Attributes["class"]?.Value)
                {
                    case "b":
                        var number = childNode.Attributes["data-rn"]?.Value;
                        if (number != null)
                        {
                            numBson.Add("num", number);
                        }

                        break;

                    case "h18":
                        numBson.Add("desc", childNode.InnerText);
                        break;
                }
            }

            bsonArray.Add(numBson);
        }

        bson.Add("numItem", bsonArray);
    }

    // Intangible cultural heritage map shown on the home page.
    GetHeritageMapTableInformation(bson);

    var mongodbBson = MongodbGetter.GetHeritageProjectMainPageDesc();
    if (mongodbBson == null)
    {
        Console.WriteLine("Insert Heritage Project Content");
        MongodbSaver.SaveHeritageProjectMainContent(bson);
    }
    else if (!CheckBsonIsEqual(bson, mongodbBson))
    {
        Console.WriteLine("Update Heritage Project Content");
        MongodbUpdater.UpdateHeritageProjectMainContent(bson);
    }
    else
    {
        Console.WriteLine("Not Insert Heritage Project Content");
    }

    Console.WriteLine(bson);
}
/// <summary>
/// Pages through the heritage project list API, stores every project that is not yet in
/// MongoDB, and posts its detail link to the worker started via
/// <c>GetHeritageProjectDetailWorker.GenerateProjectDetailPage</c>.
/// </summary>
/// <param name="imageTargetBlock">Not used by this method.</param>
/// <remarks>
/// Empty responses, responses that deserialize to <c>null</c>, and already-stored projects
/// each count as one error; the loop stops once more than ten errors have accumulated or
/// the response's <c>More</c> field is not 1. The method blocks until the detail worker's
/// task has finished.
/// </remarks>
private static void GetAllProjectList(BufferBlock<string> imageTargetBlock)
{
    short errorTime = 0;
    var totalPages = DebugHelper.DebugHelperTools.IsDebugMode() ? 2 : 10;
    var block = new BufferBlock<string>();
    var task = GetHeritageProjectDetailWorker.GenerateProjectDetailPage(block);

    // Reflection metadata does not change between pages, so it is looked up once.
    var properties = typeof(HeritageProject).GetProperties();

    // NOTE(review): the bound is exclusive, so the page numbered totalPages itself is
    // never requested — confirm whether "<=" was intended outside debug mode.
    for (int i = 1; i < totalPages; i++)
    {
        if (errorTime > 10)
        {
            Console.WriteLine("GetAllProjectList: reach the limitation of error time");
            break;
        }

        var currentPage = String.Format(REQUEST_URL, i);
        Console.WriteLine("Starting process: " + currentPage);

        var requestResult = WebpageHelper.GetRequest(currentPage);
        if (string.IsNullOrEmpty(requestResult))
        {
            errorTime++;
            continue;
        }

        var jsonObject = JsonConvert.DeserializeObject<HeritageProjectRequest>(requestResult);
        if (jsonObject == null)
        {
            // Treated like an empty response so the loop cannot crash on it.
            errorTime++;
            continue;
        }

        // Adopt the server-reported page count, except in debug mode.
        if (jsonObject.Links != null
            && jsonObject.Links.Total_pages != 0
            && jsonObject.Links.Total_pages != totalPages
            && !DebugHelper.DebugHelperTools.IsDebugMode())
        {
            totalPages = jsonObject.Links.Total_pages;
        }

        var list = jsonObject.List;
        if (list == null || list.Length == 0)
        {
            continue;
        }

        var bsonArray = new List<BsonDocument>();
        for (int j = 0; j < list.Length; j++)
        {
            list[j].Link = "/project_details/" + list[j].Id;

            // Check for duplicates before doing the reflection work for this item.
            if (MongodbChecker.CheckHeritageProjectExist(list[j].Link))
            {
                Console.WriteLine("Duplicated Heritage Project Link: {0}", list[j].Link);
                errorTime++;
                continue;
            }

            var bsonDocument = new BsonDocument();
            foreach (var property in properties)
            {
                // Every HeritageProject property is stored under its lower-cased name.
                // ToLowerInvariant keeps the keys independent of the current culture, and a
                // null property value is stored as an empty string instead of throwing.
                var rawValue = property.GetValue(list[j])?.ToString() ?? string.Empty;
                bsonDocument.Add(
                    property.Name.ToLowerInvariant(),
                    Regex.Replace(rawValue, "<.*?>", string.Empty));
            }

            block.Post(list[j].Link);
            bsonArray.Add(bsonDocument);
        }

        if (bsonArray.Count != 0)
        {
            MongodbSaver.SaveHeritageProjectNewsList(bsonArray);
        }

        if (jsonObject.More != 1)
        {
            Console.WriteLine("GetAllProjectList: current page is {0}, more equals 0", i);
            break;
        }
    }

    // No more links will be posted; wait for the detail worker to finish.
    block.Complete();
    task.Wait();
}