private Post ScrapePost(HtmlNode contentNode, Post post)
{
    // Fills the given post (or a new one when null) from the post content node:
    // title, subtitle, authors, info line (created/updated time, location),
    // abstract and main text. Missing parts are logged and skipped; the post is
    // always returned.
    post = post ?? new Post();
    contentNode.NullCheck();

    // title
    HtmlNode titleNode = contentNode.SelectSingleNode(PostPageXPath.Title);
    if (titleNode != null)
    {
        post.Title = titleNode.InnerText.SafeTrimAndEscapeHtml();
    }
    else
    {
        this._logger.ErrorFormat("ScrapingService, ScrapePost, Title node is null - URL: {0}, NODE: {1}", post.Url, contentNode.SerializeHtmlNode());
    }

    // subtitle
    HtmlNode subtitleNode = contentNode.SelectSingleNode(PostPageXPath.Subtitle);
    if (subtitleNode != null)
    {
        post.Subtitle = subtitleNode.InnerText.SafeTrimAndEscapeHtml();
    }
    else
    {
        this._logger.ErrorFormat("ScrapingService, ScrapePost, Subtitle node is null - URL: {0}, NODE: {1}", post.Url, contentNode.SerializeHtmlNode());
    }

    // author
    HtmlNode authorNode = contentNode.SelectSingleNode(PostPageXPath.Auhtor);
    if (authorNode != null)
    {
        // author names are the <b> children that do not wrap a link
        IList<HtmlNode> authorNameNodes = authorNode.ChildNodes
            .Where(x => x.Name == "b" && !x.ChildNodes.Any(t => t.Name == "a"))
            .ToList();

        foreach (HtmlNode author in authorNameNodes)
        {
            //TODO http://www.rtvslo.si/mmc-priporoca/dame-niso-sposobne-zmagati-na-dirki-formule-ena/306771
            User authorUser = new User()
            {
                Name = author.InnerText.SafeTrim().Replace(",", string.Empty).Replace("foto:", string.Empty).SafeTrimAndEscapeHtml(),
                Function = UserFunctionEnum.Journalist
            };
            post.Authors.Add(authorUser);
        }
    }

    if (post.Authors.IsEmpty())
    {
        this._logger.WarnFormat("ScrapingService, ScrapePost, Author is empty - URL: {0}", post.Url);
    }

    // info
    HtmlNode infoNode = contentNode.SelectSingleNode(PostPageXPath.InfoContent);
    if (infoNode != null)
    {
        // <div class="info">16. februar 2013 ob 07:22,<br>zadnji poseg: 16. februar 2013 ob 15:16<br>Schladming - MMC RTV SLO</div>
        IList<HtmlNode> textNodes = infoNode.ChildNodes.Where(x => x.Name == "#text").ToList();
        if (textNodes.Count > 1)
        {
            // created datetime: first text node; also the initial "last updated" value
            string createdDateTimeString = textNodes.First().InnerText.SafeTrim();
            DateTime createdDate;
            if (createdDateTimeString.TryParseExactLogging(ParsingHelper.LongDateTimeParseExactPattern, this.cultureInfo, DateTimeStyles.None, out createdDate))
            {
                post.DateCreated = createdDate.ToUniversalTime();
                post.LastUpdated = createdDate.ToUniversalTime();
            }

            // location: last text node, "<location> - <source>"
            string locationString = textNodes.Last().InnerText;
            IList<string> locationList = locationString.Split(new string[] { "-" }, StringSplitOptions.RemoveEmptyEntries).ToList();
            if (locationList.Count > 1)
            {
                post.Location = locationList.First().SafeTrim();
                if (locationList.Last().SafeTrim() != "MMC RTV SLO")
                {
                    this._logger.DebugFormat("ScrapingService, ScrapePost, InfoNode, Location - URL: {0}, LIST: {1}", post.Url, locationList.SerializeObject());
                }
            }
            else
            {
                this._logger.WarnFormat("ScrapingService, ScrapePost, InfoNode, Location - URL: {0}, NODE: {1}", post.Url, infoNode.SerializeHtmlNode());
            }

            if (textNodes.Count == 3)
            {
                // updated datetime: middle text node ("zadnji poseg: ...")
                string updatedDateTimeString = textNodes[1].InnerText.SafeTrim();
                Regex dateTimeRegex = new Regex(@"(?<date>[0-9\.]+[\w+\s+]+[0-9\:]+)", RegexOptions.IgnoreCase); //TODO fix regex
                Match dateTimeMatch = dateTimeRegex.Match(updatedDateTimeString);
                if (dateTimeMatch.Success)
                {
                    updatedDateTimeString = dateTimeMatch.Groups["date"].Value;
                    DateTime updatedDate;
                    if (updatedDateTimeString.TryParseExactLogging(ParsingHelper.LongDateTimeParseExactPattern, this.cultureInfo, DateTimeStyles.None, out updatedDate))
                    {
                        // the update time belongs in LastUpdated; writing it to
                        // DateCreated would overwrite the creation time parsed above
                        post.LastUpdated = updatedDate.ToUniversalTime();
                    }
                }
            }
        }
        else
        {
            this._logger.ErrorFormat("ScrapingService, ScrapePost, InfoNode - URL: {0}, NODE: {1}", post.Url, infoNode.SerializeHtmlNode());
        }
    }

    // main content: from the first <p> up to the author <div>
    IList<HtmlNode> contentNodes = new List<HtmlNode>();
    foreach (HtmlNode node in contentNode.ChildNodes)
    {
        // ends with author
        if (node.Name == "div" && node.Attributes.FirstOrDefault(x => x.Value == "author") != null)
        {
            break;
        }

        // skip empty nodes and nodes whose first child is a <div>
        if (node.FirstChild == null || node.FirstChild.Name == "div")
        {
            continue;
        }

        if (contentNodes.Count == 0)
        {
            // starts with p tag
            if (node.Name == "p")
            {
                contentNodes.Add(node);
            }
        }
        else if (node.Name == "p" || node.Name == "div")
        {
            contentNodes.Add(node);
        }
    }

    if (!contentNodes.IsEmpty())
    {
        // abstract - text inside the strong tag in the first node
        HtmlNode abstractNode = contentNodes.First();
        HtmlNode strongAbstractNode = abstractNode.ChildNodes.FirstOrDefault(x => x.Name == "strong");
        if (strongAbstractNode != null)
        {
            post.Abstract = strongAbstractNode.InnerText.SafeTrimAndEscapeHtml();

            // remove abstract from main content
            abstractNode.ChildNodes.Remove(strongAbstractNode);
        }
        else
        {
            this._logger.WarnFormat("ScrapingService, ScrapePost, Abstract (strong) node is missing - URL: {0}", post.Url);
        }

        // content: join child texts with a space to get white space after paragraph titles
        StringBuilder content = new StringBuilder();
        foreach (HtmlNode node in contentNodes)
        {
            foreach (HtmlNode childNode in node.ChildNodes)
            {
                string text = childNode.InnerText.SafeTrimAndEscapeHtml();
                if (text.Length > 0)
                {
                    content.AppendFormat("{0} ", text);
                }
            }
        }

        post.Content = content.ToString().SafeTrim();
    }
    else
    {
        this._logger.ErrorFormat("ScrapingService, ScrapePost - Post content is null - URL: {0}, NODE: {1}", post.Url, contentNode.SerializeHtmlNode());
    }

    return post;
}
/// <summary>
/// Scrape post statistics:
/// rating, number of comments, number of FB likes, number of tweets.
/// The FB and Twitter counts are fetched with separate web requests built from the post URL.
/// </summary>
/// <param name="rootNode">Root node of the post page.</param>
/// <param name="post">Post to fill; a new instance is created when null.</param>
/// <returns>The post with the statistics that could be parsed.</returns>
private Post ScrapePostStatistics(HtmlNode rootNode, Post post)
{
    post = post ?? new Post();

    // rating
    HtmlNode ratingNode = rootNode.SelectSingleNode(PostPageXPath.RatingContent);
    if (ratingNode != null)
    {
        string ratingContent = ratingNode.InnerText;
        Regex ratingRegex = new Regex(@"\w+\s+(?<rating>[0-9\,]+)\s+\w+\s+(?<numRatings>[0-9]+)", RegexOptions.IgnoreCase);
        Match ratingMatch = ratingRegex.Match(ratingContent);
        if (ratingMatch.Success)
        {
            decimal rating;
            int numRatings;
            if (ratingMatch.Groups["rating"].Value.TryParseLogging(out rating))
            {
                post.AvgRating = rating;
            }

            if (ratingMatch.Groups["numRatings"].Value.TryParseLogging(out numRatings))
            {
                post.NumOfRatings = numRatings;
            }
        }
    }
    else
    {
        this._logger.ErrorFormat("ScrapingService, ScrapePostStatistics, Rating node is null - URL: {0}, NODE: {1}", post.Url, rootNode.SerializeHtmlNode());
    }

    // num of comments, rendered as "(<number>)"
    HtmlNode numOfCommentsNode = rootNode.SelectSingleNode(PostPageXPath.NumOfComments);
    if (numOfCommentsNode != null
        && !string.IsNullOrEmpty(numOfCommentsNode.InnerText)
        && numOfCommentsNode.InnerText.StartsWith("(")
        && numOfCommentsNode.InnerText.EndsWith(")"))
    {
        int numOfComments;
        string numOfCommentsString = numOfCommentsNode.InnerText.Replace("(", string.Empty).Replace(")", string.Empty);
        if (int.TryParse(numOfCommentsString, out numOfComments))
        {
            post.NumOfComments = numOfComments;
        }
        else
        {
            this._logger.WarnFormat("ScrapingService, ScrapePostStatistics, NumOfComments parsing: {2} - URL: {0}, NODE: {1}", post.Url, rootNode.SerializeHtmlNode(), numOfCommentsString);
        }
    }
    else
    {
        this._logger.ErrorFormat("ScrapingService, ScrapePostStatistics, NumOfComments - URL: {0}, NODE: {1}", post.Url, rootNode.SerializeHtmlNode());
    }

    // FB social plugin
    // https://www.facebook.com/plugins/like.php?href=http%3A%2F%2Fwww.rtvslo.si%2Fsport%2Fodbojka%2Fvodebova-in-fabjanova-ubranili-naslov-drzavnih-prvakinj%2F314078&layout=button_count
    string fbUrlPattern = "http://www.facebook.com/plugins/like.php?href={0}&layout=button_count";
    string encodedUrl = HttpUtility.UrlEncode(post.Url);
    string fbUrl = string.Format(fbUrlPattern, encodedUrl);
    string fbPluginPage = this.CreateWebRequest(new Uri(fbUrl));
    if (!string.IsNullOrEmpty(fbPluginPage))
    {
        HtmlNode fbRootNode = fbPluginPage.CreateRootNode();
        if (fbRootNode != null)
        {
            int fbLikes;
            HtmlNode fbLikesNode = fbRootNode.SelectSingleNode(PostPageXPath.FbLikes);
            if (fbLikesNode != null && !string.IsNullOrEmpty(fbLikesNode.InnerText) && int.TryParse(fbLikesNode.InnerText, out fbLikes))
            {
                post.NumOfFbLikes = fbLikes;
            }
            else
            {
                this._logger.ErrorFormat("ScrapingService, ScrapePostStatistics, FbLikes - POST URL: {0}, FB URL: {1}, NODE: {2}", post.Url, fbUrl, fbRootNode.SerializeHtmlNode());
            }
        }
        else
        {
            // the root node is null here, so log the raw page instead of serializing a null node
            this._logger.ErrorFormat("ScrapingService, ScrapePostStatistics, FbLikes Node NULL - POST URL: {0}, FB URL: {1}, NODE: {2}", post.Url, fbUrl, fbPluginPage);
        }
    }

    // Twitter social plugin
    // http://platform.twitter.com/widgets/tweet_button.1375828408.html?url=http%3A%2F%2Fwww.rtvslo.si%2Fzabava%2Fiz-sveta-znanih%2Fboy-george-napadel-isinbajevo-zaradi-homofobnih-izjav%2F315495
    // http://cdn.api.twitter.com/1/urls/count.json?url=http%3A%2F%2Fwww.rtvslo.si%2Fzabava%2Fiz-sveta-znanih%2Fboy-george-napadel-isinbajevo-zaradi-homofobnih-izjav%2F315495&callback=twttr.receiveCount
    string twUrlPattern = "http://cdn.api.twitter.com/1/urls/count.json?url={0}";
    string twUrl = string.Format(twUrlPattern, encodedUrl);
    string twJsonPage = this.CreateWebRequest(new Uri(twUrl));
    if (string.IsNullOrEmpty(twJsonPage))
    {
        // JObject.Parse rejects null input with an exception the catch below does not
        // handle, so a failed request is reported here instead of aborting the scrape
        this._logger.ErrorFormat("ScrapingService, ScrapePostStatistics, NumOfTweets empty response - POST URL: {0}, TW URL: {1}", post.Url, twUrl);
        return post;
    }

    try
    {
        JObject twJson = JObject.Parse(twJsonPage);
        string countString = (string)twJson["count"];
        int numOfTweets;
        if (!string.IsNullOrEmpty(countString) && int.TryParse(countString, out numOfTweets))
        {
            post.NumOfTweets = numOfTweets;
        }
        else
        {
            this._logger.ErrorFormat("ScrapingService, ScrapePostStatistics, NumOfTweets - POST URL: {0}, TW URL: {1}, NODE: {2}", post.Url, twUrl, twJsonPage);
        }
    }
    catch (JsonReaderException ex)
    {
        this._logger.ErrorFormat("ScrapingService, ScrapePostStatistics, NumOfTweets Parse EXCEPTION - POST URL: {0}, TW URL: {1}, NODE: {2}, EX:{3}", post.Url, twUrl, twJsonPage, ex.Message);
    }

    return post;
}
/// <summary>
/// Scrape one page of post comments.
/// Only comment nodes that contain a header-info node are scraped.
/// </summary>
/// <param name="html">HTML of the comments page.</param>
/// <param name="newsPost">Post the comments belong to; its Id is copied to each comment.</param>
/// <returns>The scraped comments; an empty list when the page has none.</returns>
public IList<Comment> ScrapeCommentsPage(string html, Post newsPost)
{
    IList<Comment> scraped = new List<Comment>();
    HtmlNode rootNode = html.CreateRootNode();

    HtmlNodeCollection candidates = rootNode.SelectNodes(CommentsPageXPath.CommentContent);
    if (candidates == null || candidates.Count == 0)
    {
        return scraped;
    }

    // keep only nodes that carry a comment header
    List<HtmlNode> withHeader = candidates
        .Where(x => x.SelectSingleNode(CommentsPageXPath.HeaderInfo) != null)
        .ToList();
    if (withHeader.Count == 0)
    {
        this._logger.InfoFormat("ScrapingService, ScrapeCommentsPage, No comments - POST-URL: {0}, HTML: {1}", newsPost.Url, html);
        return scraped;
    }

    foreach (HtmlNode commentNode in withHeader)
    {
        Comment parsed = this.ScrapeComment(commentNode);
        if (parsed == null)
        {
            continue;
        }

        parsed.PostId = newsPost.Id;
        scraped.Add(parsed);
    }

    return scraped;
}
/// <summary>
/// Download and scrape a post page: content first, then statistics.
/// </summary>
/// <param name="postUrl">URL of the post page to request.</param>
/// <param name="post">Post to fill; a new instance is created when null.</param>
/// <returns>The filled post, or null when the request returned no HTML.</returns>
public Post ScrapePostPage(Uri postUrl, Post post)
{
    post = post ?? new Post();

    string pageHtml = this.CreateWebRequest(postUrl);
    if (string.IsNullOrEmpty(pageHtml))
    {
        return null;
    }

    HtmlNode pageRoot = pageHtml.CreateRootNode();

    // accessed time (UtcNow is already universal time)
    post.AccessedDate = DateTime.UtcNow;

    HtmlNode contentRoot = pageRoot.SelectSingleNode(PostPageXPath.PostContent);
    Post withContent = this.ScrapePost(contentRoot, post);
    return this.ScrapePostStatistics(pageRoot, withContent);
}
/// <summary>
/// Scrape rtvslo.si archive page html.
/// Fills some post properties: Id, Url, Title, Category.
/// </summary>
/// <param name="html">HTML of the archive page.</param>
/// <param name="hasNextPage">Set to true when the pager contains a next-page link with an href.</param>
/// <returns>Posts found on the page; an empty list when there are none.</returns>
public IList<Post> ScrapeArhivePage(string html, out bool hasNextPage)
{
    hasNextPage = false;
    HtmlNode rootNode = html.CreateRootNode();
    IList<Post> result = new List<Post>();

    // SelectNodes yields null (not an empty collection) when nothing matches, so the
    // result must be null-checked before it is enumerated
    HtmlNodeCollection posts = rootNode.SelectNodes(ArchivePageXPath.PostTitlesContent);
    if (posts != null && posts.Count > 0)
    {
        foreach (HtmlNode node in posts)
        {
            // <a href="/sport/zimski-sporti/sp-v-alpskem-smucanju-2013/kdo-je-kriv-za-zgodnjo-upokojitev-mateje-robnik/302384" class="title">Kdo je “kriv” za zgodnjo upokojitev Mateje Robnik?</a>
            HtmlNode linkNode = node.ChildNodes["a"];
            if (linkNode == null)
            {
                continue;
            }

            string href = string.Empty;
            string url = string.Empty;
            Category category = new Category();

            // href="/sport/zimski-sporti/sp-v-alpskem-smucanju-2013/kdo-je-kriv-za-zgodnjo-upokojitev-mateje-robnik/302384"
            if (linkNode.Attributes["href"] != null && linkNode.Attributes["href"].Value.SafeTrim().Length > 5)
            {
                href = linkNode.Attributes["href"].Value.SafeTrim();
                url = href.ToFullRtvSloUrl();

                // scrape categories: every path segment except the last two
                string[] splittedUrl = href.Split(new char[] { '/' }, StringSplitOptions.RemoveEmptyEntries);
                if (splittedUrl.Length > 2)
                {
                    // top level category
                    category.Label = splittedUrl[0];
                    category.Url = string.Format("{0}/", splittedUrl[0].ToFullRtvSloUrl());

                    // nested categories
                    for (int i = 1; i < splittedUrl.Length - 2; i++)
                    {
                        category.SaveChildCategory(new Category() { Label = splittedUrl[i] });
                    }
                }
                else
                {
                    this._logger.FatalFormat("ScrapingService, ScrapeArhivePage, Splitted URL length - URL: {2}, NODE: {0}, HTML: {1}", linkNode.SerializeHtmlNode(), html, href);
                }
            }
            else
            {
                this._logger.FatalFormat("ScrapingService, ScrapeArhivePage, Post link - NODE: {0}, HTML: {1}", linkNode.SerializeHtmlNode(), html);
            }

            string title = linkNode.InnerHtml.SafeTrimAndEscapeHtml();
            Post post = new Post()
            {
                Id = this.GetIdFromUrl(url),
                Url = url,
                Title = title,
                Category = category,
            };

            // incomplete posts are reported but still returned to the caller
            if (post.Id == 0
                || string.IsNullOrEmpty(post.Url)
                || string.IsNullOrEmpty(post.Title)
                || string.IsNullOrEmpty(post.Category.Label))
            {
                this._logger.FatalFormat("ScrapingService, ScrapeArhivePage, Post is not filled right - POST: {0}, HTML: {1}", post.SerializeObject(), html);
            }

            result.Add(post);
        }
    }
    else
    {
        this._logger.FatalFormat("ScrapingService, ScrapeArhivePage, There are no posts - HTML: {0}", html);
    }

    // pager
    HtmlNode pager = rootNode.SelectSingleNode(ArchivePageXPath.PagerContent);
    if (pager != null)
    {
        // <a href="/arhiv/?date_from=2013-02-13&date_to=2013-02-13&section=1.2.16.43.4.5.3.8.129.12.9.28.6.24&page=1">2</a>
        HtmlNode nextPage = pager.SelectSingleNode(ArchivePageXPath.PagerNextPage);
        if (nextPage != null && nextPage.Attributes["href"] != null)
        {
            hasNextPage = true;
        }
    }
    else
    {
        this._logger.FatalFormat("ScrapingService, ScrapeArhivePage, Pager is null - HTML: {0}", html);
    }

    return result;
}
/// <summary>
/// Debug run: scrapes one fixed post page, then walks the filtered archive pages
/// and, for every post, saves the overview, details, authors and comments.
/// </summary>
public void RunDebug()
{
    // debug: scrape one fixed post page up front (the result is not stored)
    Post post1 = new Post()
    {
        Url = "http://www.rtvslo.si/slovenija/jankovic-cestital-bratuskovi-za-napoved-da-bo-kandidirala-za-predsednico-ps-ja/317891"
    };
    post1 = this._scrapingService.ScrapePostPage(new Uri(post1.Url), post1);

    int startPage = 0;
    string html = this._scrapingService.GetFilteredArchivePage(startPage);
    bool hasNextPage = true;
    while (hasNextPage && !string.IsNullOrEmpty(html))
    {
        // scrape archive page
        IList<Post> posts = this._scrapingService.ScrapeArhivePage(html, out hasNextPage);
        if (!posts.IsEmpty())
        {
            foreach (Post post in posts)
            {
                // save post from archive page
                this._repositoryService.SaveOrUpdatePostOverview(post);

                // scrape details page; null means the page could not be downloaded,
                // so the post holds overview data only and its details cannot be saved
                Post newPost = this._scrapingService.ScrapePostPage(new Uri(post.Url), post);
                if (newPost == null)
                {
                    this._logger.WarnFormat("Simulator, RunStep2, ScrapePostPage unsuccessfull - POST_URL: {0}", post.Url);
                    continue;
                }

                this._logger.DebugFormat("Simulator, RunStep2 - POST: {0}", post.SerializeObject());

                // resolve each author to a repository guid, saving unknown authors
                foreach (User author in post.Authors)
                {
                    string authorGuid = this._repositoryService.SearchUserByName(author.Name);
                    if (string.IsNullOrEmpty(authorGuid))
                    {
                        authorGuid = this._repositoryService.SaveAuthor(author);
                    }

                    if (!string.IsNullOrEmpty(authorGuid))
                    {
                        author.RepositoryGuidUrl = authorGuid;
                    }
                }

                // save details page
                string postGuid = this._repositoryService.SavePostDetails(post);

                // check if save was successful
                if (string.IsNullOrEmpty(postGuid))
                {
                    this._logger.FatalFormat("Simulator, RunStep2, SavePostDetails unsuccessfull - POST_GUID: {0}, POST: {1}", postGuid, post.SerializeObject());
                    continue;
                }

                // post comments, page by page until a page is empty
                int commentsPage = 0;
                string commentsHtml = this._scrapingService.GetCommentsPage(post.Id, commentsPage++);
                while (!string.IsNullOrEmpty(commentsHtml))
                {
                    IList<Comment> comments = this._scrapingService.ScrapeCommentsPage(commentsHtml, post);
                    if (comments.IsEmpty())
                    {
                        break;
                    }

                    foreach (Comment comment in comments)
                    {
                        comment.PostGuidUrl = postGuid;

                        string userGuid = this._repositoryService.SearchUserById(comment.UserId);
                        if (string.IsNullOrEmpty(userGuid))
                        {
                            User user = this._scrapingService.ScrapeUserPage(new Uri(comment.UserUrl));
                            if (user == null)
                            {
                                // there is no user object to serialize in this branch
                                this._logger.WarnFormat("Simulator, RunStep2, Scrape user unsuccessfull - USER_URL: {0}", comment.UserUrl);
                                continue;
                            }

                            this._logger.DebugFormat("Simulator, RunStep2 - USER: {0}", user.SerializeObject());
                            userGuid = this._repositoryService.SaveUser(user);
                        }

                        comment.UserGuidUrl = userGuid;

                        this._logger.DebugFormat("Simulator, RunStep2 - COMMENT: {0}", comment.SerializeObject());
                        this._repositoryService.SaveComment(comment);
                    }

                    // get next page of comments
                    commentsHtml = this._scrapingService.GetCommentsPage(post.Id, commentsPage++);
                }
            }
        }

        // load next page: pre-increment, because the current page was requested with
        // the un-incremented index and a post-increment would request it again
        html = this._scrapingService.GetFilteredArchivePage(++startPage);
    }
}
/// <summary>
/// Get posts from selected region in Slovenia.
/// The date filter is applied only when both <paramref name="fromDate"/> and
/// <paramref name="toDate"/> are given.
/// </summary>
/// <param name="region">Region label used in the query; validated with NullCheck().</param>
/// <param name="fromDate">Optional lower bound for the post creation date.</param>
/// <param name="toDate">Optional upper bound for the post creation date.</param>
/// <returns>
/// Matching posts (RepositoryGuidUrl, Url, Location filled); an empty list when the query
/// returns nothing or fails; null when the repository connector is not ready.
/// </returns>
public IList<Post> GetPostsFromRegion(string region, DateTime? fromDate = null, DateTime? toDate = null)
{
    region.NullCheck();
    IList<Post> result = new List<Post>();

    using (SesameHttpProtocolConnector connector = new SesameHttpProtocolConnector(RtvSloConfig.RepositoryUrl, RtvSloConfig.RepositoryName))
    {
        if (!connector.IsReady)
        {
            this._logger.FatalFormat("RepositoryService, GetPostsFromRegion, SesameHttpProtocolConnector is not ready");
            return null;
        }

        // SELECT DISTINCT ?post ?locationName ?seeAlso
        // WHERE {
        //   ?post a sioc:Post .
        //   ?post news:location ?locationName .
        //   ?post rdfs:seeAlso ?seeAlso .
        //   ?post dct:created ?date .
        //
        //   SERVICE <http://dbpedia.org/sparql> {
        //     ?region dbpedia-owl:type dbpedia:Statistical_regions_of_Slovenia .
        //     ?region rdfs:label "Gorenjska"@NL .
        //
        //     { ?city ?x ?region } UNION { ?region ?z ?city }
        //     ?city ?y ?locationName
        //   }
        //   FILTER(langMatches(lang(?locationName ), "EN") && ?date >= "fromDate" && ?date <= "toDate")
        // }
        bool filterByDate = fromDate.HasValue && toDate.HasValue;

        // the region ends up inside a double-quoted SPARQL literal; escape backslashes
        // and quotes so the value cannot terminate the literal
        string escapedRegion = region.Replace("\\", "\\\\").Replace("\"", "\\\"");

        StringBuilder query = new StringBuilder();
        query.Append(
            "SELECT DISTINCT ?post ?locationName ?seeAlso " +
            "WHERE { " +
            "?post rdf:type sioc:Post . " +
            "?post news:location ?locationName . " +
            "?post rdfs:seeAlso ?seeAlso . ");

        if (filterByDate)
        {
            query.AppendFormat("?post {0} ?date . ", Predicate.DctCreated);
        }

        query.AppendFormat(
            "SERVICE <http://dbpedia.org/sparql> {{ " +
            "?region dbpedia-owl:type dbpedia:Statistical_regions_of_Slovenia . " +
            "?region rdfs:label \"{0}\"@NL . " +
            "{{ ?city ?x ?region }} UNION {{ ?region ?z ?city }} " +
            "?city ?y ?locationName " +
            "}} ",
            escapedRegion);

        query.Append("FILTER(langMatches(lang(?locationName ), \"EN\")");
        if (filterByDate)
        {
            query.AppendFormat(
                " && ?date >= \"{0}\" && ?date <= \"{1}\"",
                fromDate.Value.ToString(RepositoryHelper.DateTimeFormat),
                toDate.Value.ToString(RepositoryHelper.DateTimeFormat));
        }

        query.Append(") }");

        SparqlResultSet queryResult = connector.QueryFormat(query.ToString());
        if (queryResult == null)
        {
            this._logger.FatalFormat("RepositoryService, GetPostsFromRegion, Query result is null - QUERY: {0}", query);
        }
        else if (!queryResult.Results.IsEmpty())
        {
            foreach (SparqlResult res in queryResult.Results)
            {
                Post p = new Post();

                UriNode postNode = res.Value("post") as UriNode;
                if (postNode != null)
                {
                    p.RepositoryGuidUrl = postNode.Uri.AbsoluteUri;
                }

                UriNode seeAlsoNode = res.Value("seeAlso") as UriNode;
                if (seeAlsoNode != null)
                {
                    p.Url = seeAlsoNode.Uri.AbsoluteUri;
                }

                LiteralNode locationNode = res.Value("locationName") as LiteralNode;
                if (locationNode != null)
                {
                    p.Location = locationNode.Value;
                }

                result.Add(p);
            }
        }

        return result;
    }
}
/// <summary>
/// Save post details page in RDF format.
/// Updates post title. The post must already exist in the repository (it is looked up by its ID).
/// </summary>
/// <param name="post">Post to save; its RepositoryGuidUrl is set from the stored post.</param>
/// <param name="update">
/// When true, previously stored detail and statistics triples are replaced instead of
/// creating the published-at and statistics links.
/// </param>
/// <returns>
/// Guid url of the post; null when the post has no creation date, the connector is not
/// ready, the post is not found, or (update mode) its statistics node is missing.
/// </returns>
public string SavePostDetails(Post post, bool update = false)
{
    // the creation date is mandatory for the dct:created triple; report and bail out
    // instead of throwing on a missing value
    if (!post.DateCreated.HasValue)
    {
        this._logger.FatalFormat("RepositoryService, SavePostDetails, Post has no creation date - POST_ID: {0}", post.Id);
        return null;
    }

    using (SesameHttpProtocolConnector connector = new SesameHttpProtocolConnector(RtvSloConfig.RepositoryUrl, RtvSloConfig.RepositoryName))
    {
        if (!connector.IsReady)
        {
            this._logger.FatalFormat("RepositoryService, SavePostDetails, SesameHttpProtocolConnector is not ready");
            return null;
        }

        // SELECT ?guidUrl ?predicate ?object
        // WHERE {
        //   ?guidUrl rdf:type sioc:Post
        //   ; news:ID "id"
        //   ; ?predicate ?object
        // }
        string query = string.Format(
            "SELECT ?guidUrl ?predicate ?object WHERE {{ ?guidUrl {0} {1} ; {2} \"{3}\" ; ?predicate ?object }}",
            Predicate.RdfType,
            Predicate.SiocPost,
            Predicate.NewsId,
            post.Id.ToString());
        SparqlResultSet queryResult = connector.QueryFormat(query);
        if (queryResult == null || queryResult.Results.IsEmpty())
        {
            this._logger.FatalFormat("RepositoryService, SavePostDetails, Query result has no results - QUERY: {0}", query);
            return null;
        }

        // save
        using (IGraph g = new Graph())
        {
            g.BaseUri = RepositoryHelper.BaseUrl.ToUri();
            IList<Triple> newTriples = new List<Triple>();
            IList<Triple> removeTriples = new List<Triple>();

            INode postGuid = queryResult.Results.First().Value("guidUrl").CopyNode(g);
            post.RepositoryGuidUrl = ((UriNode)postGuid).Uri.AbsoluteUri;

            #region Post Content

            // remove old title
            this.RemoveTriples(removeTriples, queryResult, g, postGuid, new string[] { Predicate.DctTitle });

            if (!update)
            {
                // published at
                newTriples.Add(new Triple(postGuid, Predicate.NewsPublishedAt.ToUriNode(g), RepositoryHelper.SiteUrl.ToUriNode(g)));
            }

            // date created
            newTriples.Add(new Triple(postGuid, Predicate.DctCreated.ToUriNode(g), post.DateCreated.Value.ToString(RepositoryHelper.DateTimeFormat).ToLiteralNode(g, dataType: RepositoryHelper.DateTimeDataType)));

            // accessed date
            newTriples.Add(new Triple(postGuid, Predicate.NewsAccessed.ToUriNode(g), post.AccessedDate.ToString(RepositoryHelper.DateTimeFormat).ToLiteralNode(g, dataType: RepositoryHelper.DateTimeDataType)));

            // title
            newTriples.Add(new Triple(postGuid, Predicate.DctTitle.ToUriNode(g), post.Title.ToLiteralNode(g)));

            // subtitle
            if (!string.IsNullOrEmpty(post.Subtitle))
            {
                newTriples.Add(new Triple(postGuid, Predicate.MmcSubtitle.ToUriNode(g), post.Subtitle.ToLiteralNode(g)));
            }

            // abstract
            if (!string.IsNullOrEmpty(post.Abstract))
            {
                newTriples.Add(new Triple(postGuid, Predicate.DctAbstract.ToUriNode(g), post.Abstract.ToLiteralNode(g)));
            }

            // last updated
            newTriples.Add(new Triple(postGuid, Predicate.MmcLastUpdated.ToUriNode(g), post.LastUpdated.ToString(RepositoryHelper.DateTimeFormat).ToLiteralNode(g, dataType: RepositoryHelper.DateTimeDataType)));

            // location
            if (!string.IsNullOrEmpty(post.Location))
            {
                // hack to get posts from region: the location is tagged as English
                newTriples.Add(new Triple(postGuid, Predicate.NewsLocation.ToUriNode(g), post.Location.ToLiteralNode(g, language: RepositoryHelper.LanguageEnglish)));
            }

            // content
            if (!string.IsNullOrEmpty(post.Content))
            {
                newTriples.Add(new Triple(postGuid, Predicate.SiocContent.ToUriNode(g), post.Content.ToLiteralNode(g)));
            }

            // authors (only those already resolved to a repository guid)
            if (!post.Authors.IsEmpty())
            {
                foreach (User author in post.Authors)
                {
                    if (!string.IsNullOrEmpty(author.RepositoryGuidUrl))
                    {
                        newTriples.Add(new Triple(postGuid, Predicate.SiocHasCreator.ToUriNode(g), author.RepositoryGuidUrl.ToUriNode(g)));
                    }
                }
            }

            if (update)
            {
                // remove old triples
                this.RemoveTriples(removeTriples, queryResult, g, postGuid, new string[]
                {
                    Predicate.DctCreated,
                    Predicate.NewsAccessed,
                    Predicate.MmcSubtitle,
                    Predicate.DctAbstract,
                    Predicate.MmcLastUpdated,
                    Predicate.NewsLocation,
                    Predicate.SiocContent,
                    Predicate.SiocHasCreator
                });
            }

            #endregion Post Content

            #region Statistics

            string statsGuid = Guid.NewGuid().ToString();
            string statsGuidUrl = string.Format(RepositoryHelper.StatisticsUrlPattern, statsGuid);
            UriNode statsGuidNode = null;

            // read existing statsGuidUrl
            if (update)
            {
                SparqlResult statsLink = queryResult.Results
                    .FirstOrDefault(x => x.Value("predicate").ToSafeString() == Predicate.NewsStatistics.ToFullNamespaceUrl());
                statsGuidNode = statsLink != null ? statsLink.Value("object") as UriNode : null;
                if (statsGuidNode == null)
                {
                    // without the existing statistics node there is nothing to update
                    this._logger.FatalFormat("RepositoryService, SavePostDetails, Update statistics ERROR, statistics node not found - QUERY: {0}", query);
                    return null;
                }

                statsGuidUrl = statsGuidNode.Uri.AbsoluteUri;

                // SELECT ?predicate ?object
                // WHERE {
                //   <guid> rdf:type news:Stat
                //   ; ?predicate ?object
                // }
                query = string.Format("SELECT ?predicate ?object WHERE {{ <{0}> {1} {2} ; ?predicate ?object }}", statsGuidUrl, Predicate.RdfType, Predicate.NewsStat);
                queryResult = connector.QueryFormat(query);
                if (queryResult == null || queryResult.Results.IsEmpty())
                {
                    this._logger.FatalFormat("RepositoryService, SavePostDetails, Update statistics ERROR - QUERY: {0}", query);
                }
            }

            INode statsSubject = statsGuidNode != null ? statsGuidNode.CopyNode(g) : statsGuidUrl.ToUriNode(g);

            if (!update)
            {
                // initialize
                newTriples.Add(new Triple(statsSubject, Predicate.RdfType.ToUriNode(g), Predicate.NewsStat.ToUriNode(g)));
                newTriples.Add(new Triple(postGuid, Predicate.NewsStatistics.ToUriNode(g), statsSubject));
            }

            // number of comments
            if (post.NumOfComments > -1)
            {
                newTriples.Add(new Triple(statsSubject, Predicate.NewsNComments.ToUriNode(g), post.NumOfComments.ToString().ToLiteralNode(g, dataType: RepositoryHelper.IntegerDataType)));
            }

            // average rating; formatted with the invariant culture so the decimal
            // separator is always '.', independent of the machine's regional settings
            if (post.AvgRating > -1)
            {
                string avgRatingText = Convert.ToString(post.AvgRating, System.Globalization.CultureInfo.InvariantCulture);
                newTriples.Add(new Triple(statsSubject, Predicate.NewsAvgRating.ToUriNode(g), avgRatingText.ToLiteralNode(g, dataType: RepositoryHelper.DecimalDataType)));
            }

            // number of ratings
            if (post.NumOfRatings > -1)
            {
                newTriples.Add(new Triple(statsSubject, Predicate.NewsNRatings.ToUriNode(g), post.NumOfRatings.ToString().ToLiteralNode(g, dataType: RepositoryHelper.IntegerDataType)));
            }

            // number of FB likes
            if (post.NumOfFbLikes > -1)
            {
                newTriples.Add(new Triple(statsSubject, Predicate.NewsNFBLikes.ToUriNode(g), post.NumOfFbLikes.ToString().ToLiteralNode(g, dataType: RepositoryHelper.IntegerDataType)));
            }

            // number of tweets
            if (post.NumOfTweets > -1)
            {
                newTriples.Add(new Triple(statsSubject, Predicate.NewsNTweets.ToUriNode(g), post.NumOfTweets.ToString().ToLiteralNode(g, dataType: RepositoryHelper.IntegerDataType)));
            }

            // remove old triples; skipped when the statistics query returned null,
            // because then there is no result set to take the old triples from
            if (update && queryResult != null)
            {
                this.RemoveTriples(removeTriples, queryResult, g, statsSubject, new string[]
                {
                    Predicate.NewsNComments,
                    Predicate.NewsAvgRating,
                    Predicate.NewsNRatings,
                    Predicate.NewsNFBLikes,
                    Predicate.NewsNTweets
                });
            }

            #endregion Statistics

            connector.UpdateGraph(g.BaseUri, newTriples, removeTriples);

            return post.RepositoryGuidUrl;
        }
    }
}
/// <summary>
/// Save post data scraped from the archive page in RDF format (ID, Title, Url, Category).
/// When a post with the same ID already exists, only a missing category or URL link is added.
/// </summary>
/// <param name="post">Post to save; its RepositoryGuidUrl is set to the stored or newly created guid url.</param>
/// <returns>Guid url of the post, or null when the repository connector is not ready.</returns>
public string SaveOrUpdatePostOverview(Post post)
{
    // save category
    string categoryUrl = this.CheckAndSaveCategories(post.Category);
    bool hasCategory = !string.IsNullOrEmpty(categoryUrl);

    using (SesameHttpProtocolConnector connector = new SesameHttpProtocolConnector(RtvSloConfig.RepositoryUrl, RtvSloConfig.RepositoryName))
    {
        if (!connector.IsReady)
        {
            this._logger.FatalFormat("RepositoryService, SavePostOverview, SesameHttpProtocolConnector is not ready");
            return null;
        }

        // SELECT ?guidUrl ?predicate ?object
        // WHERE {
        //   ?guidUrl rdf:type sioc:Post
        //   ; news:ID "id"
        //   ; ?predicate ?object
        // }
        SparqlResultSet queryResult = connector.QueryFormat(
            "SELECT ?guidUrl ?predicate ?object WHERE {{ ?guidUrl {0} {1} ; {2} \"{3}\" ; ?predicate ?object }}",
            Predicate.RdfType,
            Predicate.SiocPost,
            Predicate.NewsId,
            post.Id.ToString(),
            Predicate.SiocTopic);

        using (IGraph g = new Graph())
        {
            g.BaseUri = RepositoryHelper.BaseUrl.ToUri();
            IList<Triple> newTriples = new List<Triple>();

            if (queryResult != null && !queryResult.Results.IsEmpty())
            {
                // update existing
                INode postGuid = queryResult.Results.First().Value("guidUrl").CopyNode(g);
                post.RepositoryGuidUrl = ((UriNode)postGuid).Uri.AbsoluteUri;

                // category: add the link only when a category url exists and is not
                // linked yet (same empty-url guard as for a new post)
                bool categoryLinked = queryResult.Results
                    .Where(x => x.Value("predicate").ToSafeString() == Predicate.SiocTopic.ToFullNamespaceUrl())
                    .Any(x => x.Value("object").ToSafeString() == categoryUrl);
                if (hasCategory && !categoryLinked)
                {
                    newTriples.Add(new Triple(postGuid, Predicate.SiocTopic.ToUriNode(g), categoryUrl.ToUriNode(g)));
                }

                // url: add the link only when this url is not stored yet
                bool urlLinked = queryResult.Results
                    .Where(x => x.Value("predicate").ToSafeString() == Predicate.RdfsSeeAlso.ToFullNamespaceUrl())
                    .Any(x => x.Value("object").ToSafeString() == post.Url);
                if (!urlLinked)
                {
                    newTriples.Add(new Triple(postGuid, Predicate.RdfsSeeAlso.ToUriNode(g), post.Url.ToUriNode(g)));
                }
            }
            else
            {
                // save new
                string guidUrl = string.Format(RepositoryHelper.PostUrlPattern, Guid.NewGuid().ToString());
                INode guidNode = guidUrl.ToUriNode(g);
                post.RepositoryGuidUrl = guidUrl;

                // define post
                newTriples.Add(new Triple(guidNode, Predicate.RdfType.ToUriNode(g), Predicate.SiocPost.ToUriNode(g)));

                // ID
                newTriples.Add(new Triple(guidNode, Predicate.NewsId.ToUriNode(g), post.Id.ToString().ToLiteralNode(g, dataType: RepositoryHelper.IntegerDataType)));

                // title
                newTriples.Add(new Triple(guidNode, Predicate.DctTitle.ToUriNode(g), post.Title.ToLiteralNode(g)));

                // post url
                newTriples.Add(new Triple(guidNode, Predicate.RdfsSeeAlso.ToUriNode(g), post.Url.ToUriNode(g)));

                // category
                if (hasCategory)
                {
                    newTriples.Add(new Triple(guidNode, Predicate.SiocTopic.ToUriNode(g), categoryUrl.ToUriNode(g)));
                }
            }

            connector.UpdateGraph(g.BaseUri, newTriples, new List<Triple>());
        }

        return post.RepositoryGuidUrl;
    }
}