Ejemplo n.º 1
0
        /// <summary>
        /// Fills a post with the data found in the post content node: title, subtitle, authors,
        /// created/updated dates, location, abstract and main text.
        /// Problems found while scraping are reported through the logger.
        /// </summary>
        /// <param name="contentNode">Node holding the post content; passed through NullCheck() before use.</param>
        /// <param name="post">Post to fill; a new instance is created when null.</param>
        /// <returns>The filled post.</returns>
        private Post ScrapePost(HtmlNode contentNode, Post post)
        {
            post = post ?? new Post();

            contentNode.NullCheck();

            // title
            HtmlNode titleNode = contentNode.SelectSingleNode(PostPageXPath.Title);
            if (titleNode != null)
            {
                post.Title = titleNode.InnerText.SafeTrimAndEscapeHtml();
            }
            else
            {
                this._logger.ErrorFormat("ScrapingService, ScrapePost, Title node is null - URL: {0}, NODE: {1}", post.Url, contentNode.SerializeHtmlNode());
            }

            // subtitle
            HtmlNode subtitleNode = contentNode.SelectSingleNode(PostPageXPath.Subtitle);
            if (subtitleNode != null)
            {
                post.Subtitle = subtitleNode.InnerText.SafeTrimAndEscapeHtml();
            }
            else
            {
                this._logger.ErrorFormat("ScrapingService, ScrapePost, Subtitle node is null - URL: {0}, NODE: {1}", post.Url, contentNode.SerializeHtmlNode());
            }

            // author
            HtmlNode authorNode = contentNode.SelectSingleNode(PostPageXPath.Auhtor);
            if (authorNode != null)
            {
                // author names are the <b> children that do not wrap a link
                IList<HtmlNode> authorNameNodes = authorNode.ChildNodes
                    .Where(x => x.Name == "b" && !x.ChildNodes.Any(t => t.Name == "a"))
                    .ToList();

                foreach (HtmlNode author in authorNameNodes)
                {
                    //TODO http://www.rtvslo.si/mmc-priporoca/dame-niso-sposobne-zmagati-na-dirki-formule-ena/306771
                    User authorUser = new User()
                    {
                        Name = author.InnerText.SafeTrim().Replace(",", string.Empty).Replace("foto:", string.Empty).SafeTrimAndEscapeHtml(),
                        Function = UserFunctionEnum.Journalist
                    };

                    post.Authors.Add(authorUser);
                }
            }

            if (post.Authors.IsEmpty())
            {
                this._logger.WarnFormat("ScrapingService, ScrapePost, Author is empty - URL: {0}", post.Url);
            }

            // info
            HtmlNode infoNode = contentNode.SelectSingleNode(PostPageXPath.InfoContent);
            if (infoNode != null)
            {
                // <div class="info">16. februar 2013 ob 07:22,<br>zadnji poseg: 16. februar 2013 ob 15:16<br>Schladming - MMC RTV SLO</div>

                IList<HtmlNode> textNodes = infoNode.ChildNodes.Where(x => x.Name == "#text").ToList();
                if (textNodes.Count > 1)
                {
                    // Created datetime: first text node
                    string createdDateTimeString = textNodes.First().InnerText.SafeTrim();

                    DateTime createdDate;
                    if (createdDateTimeString.TryParseExactLogging(ParsingHelper.LongDateTimeParseExactPattern, this.cultureInfo, DateTimeStyles.None, out createdDate))
                    {
                        // LastUpdated defaults to the creation time; it is overwritten below
                        // when the info block carries a separate "updated" line.
                        post.DateCreated = createdDate.ToUniversalTime();
                        post.LastUpdated = createdDate.ToUniversalTime();
                    }

                    // Location: last text node, "<location> - <source>"
                    string locationString = textNodes.Last().InnerText;
                    IList<string> locationList = locationString.Split(new string[] { "-" }, StringSplitOptions.RemoveEmptyEntries).ToList();
                    if (locationList.Count > 1)
                    {
                        post.Location = locationList.First().SafeTrim();

                        if (locationList.Last().SafeTrim() != "MMC RTV SLO")
                        {
                            this._logger.DebugFormat("ScrapingService, ScrapePost, InfoNode, Location - URL: {0}, LIST: {1}", post.Url, locationList.SerializeObject());
                        }
                    }
                    else
                    {
                        this._logger.WarnFormat("ScrapingService, ScrapePost, InfoNode, Location - URL: {0}, NODE: {1}", post.Url, infoNode.SerializeHtmlNode());
                    }

                    if (textNodes.Count == 3)
                    {
                        // Updated datetime: middle text node
                        string updatedDateTimeString = textNodes[1].InnerText.SafeTrim();

                        Regex dateTimeRegex = new Regex(@"(?<date>[0-9\.]+[\w+\s+]+[0-9\:]+)", RegexOptions.IgnoreCase);

                        //TODO fix regex
                        Match dateTimeMatch = dateTimeRegex.Match(updatedDateTimeString);

                        if (dateTimeMatch.Success)
                        {
                            updatedDateTimeString = dateTimeMatch.Groups["date"].Value;

                            DateTime updatedDate;
                            if (updatedDateTimeString.TryParseExactLogging(ParsingHelper.LongDateTimeParseExactPattern, this.cultureInfo, DateTimeStyles.None, out updatedDate))
                            {
                                // the update time belongs in LastUpdated; DateCreated keeps the creation time
                                post.LastUpdated = updatedDate.ToUniversalTime();
                            }
                        }
                    }
                }
                else
                {
                    this._logger.ErrorFormat("ScrapingService, ScrapePost, InfoNode - URL: {0}, NODE: {1}", post.Url, infoNode.SerializeHtmlNode());
                }
            }

            // Main content: starts with the first <p>, continues with <p>/<div> nodes and
            // ends at the author <div>. Nodes without children or starting with a <div> are skipped.
            IList<HtmlNode> contentNodes = new List<HtmlNode>();
            foreach (HtmlNode node in contentNode.ChildNodes)
            {
                // ends with author
                if (node.Name == "div" && node.Attributes.FirstOrDefault(x => x.Value == "author") != null)
                {
                    break;
                }

                // an empty node has no FirstChild; skip it instead of dereferencing null
                if (node.FirstChild == null || node.FirstChild.Name == "div")
                {
                    continue;
                }

                bool isParagraph = node.Name == "p";
                if (contentNodes.Count == 0)
                {
                    // starts with p tag
                    if (isParagraph)
                    {
                        contentNodes.Add(node);
                    }
                }
                else if (isParagraph || node.Name == "div")
                {
                    contentNodes.Add(node);
                }
            }

            if (!contentNodes.IsEmpty())
            {
                // Abstract - text inside strong tag in first node
                HtmlNode abstractNode = contentNodes.First();
                HtmlNode strongAbstractNode = abstractNode.ChildNodes.FirstOrDefault(x => x.Name == "strong");
                if (strongAbstractNode != null)
                {
                    post.Abstract = strongAbstractNode.InnerText.SafeTrimAndEscapeHtml();

                    // remove abstract from main content
                    abstractNode.ChildNodes.Remove(strongAbstractNode);
                }
                else
                {
                    // no <strong> in the first paragraph: keep the whole paragraph as content
                    this._logger.WarnFormat("ScrapingService, ScrapePost, Abstract node is null - URL: {0}", post.Url);
                }

                // Content
                StringBuilder content = new StringBuilder();

                foreach (HtmlNode node in contentNodes)
                {
                    // child texts are joined with a space to get white space after paragraph title
                    foreach (HtmlNode childNode in node.ChildNodes)
                    {
                        string text = childNode.InnerText.SafeTrimAndEscapeHtml();
                        if (text.Length > 0)
                        {
                            content.AppendFormat("{0} ", text);
                        }
                    }
                }

                post.Content = content.ToString().SafeTrim();
            }
            else
            {
                this._logger.ErrorFormat("ScrapingService, ScrapePost - Post content is null - URL: {0}, NODE: {1}", post.Url, contentNode.SerializeHtmlNode());
            }

            return post;
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Scrape post statistics:
        /// Rating, Number of comments, Number of FB likes, Number of tweets.
        /// Rating and comment count are read from the post page; FB likes and tweet count
        /// are fetched through CreateWebRequest from the Facebook like plugin and the
        /// Twitter count endpoint. Values that cannot be read are logged and left untouched.
        /// </summary>
        /// <param name="rootNode">Root node of the post page.</param>
        /// <param name="post">Post to fill; a new instance is created when null.</param>
        /// <returns>The post with the statistics that could be scraped.</returns>
        private Post ScrapePostStatistics(HtmlNode rootNode, Post post)
        {
            post = post ?? new Post();

            // rating
            HtmlNode ratingNode = rootNode.SelectSingleNode(PostPageXPath.RatingContent);
            if (ratingNode != null)
            {
                string ratingContent = ratingNode.InnerText;
                Regex ratingRegex = new Regex(@"\w+\s+(?<rating>[0-9\,]+)\s+\w+\s+(?<numRatings>[0-9]+)", RegexOptions.IgnoreCase);
                Match ratingMatch = ratingRegex.Match(ratingContent);

                if (ratingMatch.Success)
                {
                    decimal rating;
                    int numRatings;
                    if (ratingMatch.Groups["rating"].Value.TryParseLogging(out rating))
                    {
                        post.AvgRating = rating;
                    }

                    if (ratingMatch.Groups["numRatings"].Value.TryParseLogging(out numRatings))
                    {
                        post.NumOfRatings = numRatings;
                    }
                }
            }
            else
            {
                this._logger.ErrorFormat("ScrapingService, ScrapePostStatistics, Rating node is null - URL: {0}, NODE: {1}", post.Url, rootNode.SerializeHtmlNode());
            }

            // num of comments, rendered as "(123)"
            HtmlNode numOfCommentsNode = rootNode.SelectSingleNode(PostPageXPath.NumOfComments);
            if (numOfCommentsNode != null &&
                !string.IsNullOrEmpty(numOfCommentsNode.InnerText) &&
                numOfCommentsNode.InnerText.StartsWith("(", StringComparison.Ordinal) &&
                numOfCommentsNode.InnerText.EndsWith(")", StringComparison.Ordinal))
            {
                int numOfComments;
                string numOfCommentsString = numOfCommentsNode.InnerText.Replace("(", string.Empty).Replace(")", string.Empty);
                if (int.TryParse(numOfCommentsString, out numOfComments))
                {
                    post.NumOfComments = numOfComments;
                }
                else
                {
                    this._logger.WarnFormat("ScrapingService, ScrapePostStatistics, NumOfComments parsing: {2} - URL: {0}, NODE: {1}", post.Url, rootNode.SerializeHtmlNode(), numOfCommentsString);
                }
            }
            else
            {
                this._logger.ErrorFormat("ScrapingService, ScrapePostStatistics, NumOfComments - URL: {0}, NODE: {1}", post.Url, rootNode.SerializeHtmlNode());
            }

            // FB social plugin
            // https://www.facebook.com/plugins/like.php?href=http%3A%2F%2Fwww.rtvslo.si%2Fsport%2Fodbojka%2Fvodebova-in-fabjanova-ubranili-naslov-drzavnih-prvakinj%2F314078&layout=button_count

            string fbUrlPattern = "http://www.facebook.com/plugins/like.php?href={0}&layout=button_count";
            string encodedUrl = HttpUtility.UrlEncode(post.Url);

            string fbUrl = string.Format(fbUrlPattern, encodedUrl);
            string fbPluginPage = this.CreateWebRequest(new Uri(fbUrl));

            if (!string.IsNullOrEmpty(fbPluginPage))
            {
                HtmlNode fbRootNode = fbPluginPage.CreateRootNode();
                if (fbRootNode != null)
                {
                    int fbLikes;

                    HtmlNode fbLikesNode = fbRootNode.SelectSingleNode(PostPageXPath.FbLikes);
                    if (fbLikesNode != null && !string.IsNullOrEmpty(fbLikesNode.InnerText) && int.TryParse(fbLikesNode.InnerText, out fbLikes))
                    {
                        post.NumOfFbLikes = fbLikes;
                    }
                    else
                    {
                        this._logger.ErrorFormat("ScrapingService, ScrapePostStatistics, FbLikes - POST URL: {0}, FB URL: {1}, NODE: {2}", post.Url, fbUrl, fbRootNode.SerializeHtmlNode());
                    }
                }
                else
                {
                    // fbRootNode is null here, so log the raw response instead of serializing the node
                    this._logger.ErrorFormat("ScrapingService, ScrapePostStatistics, FbLikes Node NULL - POST URL: {0}, FB URL: {1}, NODE: {2}", post.Url, fbUrl, fbPluginPage);
                }
            }

            // Tweeter social plugin
            // http://platform.twitter.com/widgets/tweet_button.1375828408.html?url=http%3A%2F%2Fwww.rtvslo.si%2Fzabava%2Fiz-sveta-znanih%2Fboy-george-napadel-isinbajevo-zaradi-homofobnih-izjav%2F315495
            // http://cdn.api.twitter.com/1/urls/count.json?url=http%3A%2F%2Fwww.rtvslo.si%2Fzabava%2Fiz-sveta-znanih%2Fboy-george-napadel-isinbajevo-zaradi-homofobnih-izjav%2F315495&callback=twttr.receiveCount

            string twUrlPattern = "http://cdn.api.twitter.com/1/urls/count.json?url={0}";

            string twUrl = string.Format(twUrlPattern, encodedUrl);
            string twJsonPage = this.CreateWebRequest(new Uri(twUrl));

            if (string.IsNullOrEmpty(twJsonPage))
            {
                // JObject.Parse throws ArgumentNullException on null, which the catch below
                // would not handle; report the missing response and keep going.
                this._logger.ErrorFormat("ScrapingService, ScrapePostStatistics, NumOfTweets - POST URL: {0}, TW URL: {1}, NODE: {2}", post.Url, twUrl, twJsonPage);
                return post;
            }

            try
            {
                JObject twJson = JObject.Parse(twJsonPage);
                string countString = (string)twJson["count"];

                int numOfTweets;
                if (!string.IsNullOrEmpty(countString) && int.TryParse(countString, out numOfTweets))
                {
                    post.NumOfTweets = numOfTweets;
                }
                else
                {
                    this._logger.ErrorFormat("ScrapingService, ScrapePostStatistics, NumOfTweets - POST URL: {0}, TW URL: {1}, NODE: {2}", post.Url, twUrl, twJsonPage);
                }
            }
            catch (JsonReaderException ex)
            {
                this._logger.ErrorFormat("ScrapingService, ScrapePostStatistics, NumOfTweets Parse EXCEPTION - POST URL: {0}, TW URL: {1}, NODE: {2}, EX:{3}", post.Url, twUrl, twJsonPage, ex.Message);
            }

            return post;
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Turns the HTML of one comments page into comment objects.
        /// Only comment nodes that contain a header info node are scraped; each scraped
        /// comment gets the id of the given post.
        /// </summary>
        /// <param name="html">HTML of the comments page.</param>
        /// <param name="newsPost">Post the comments belong to.</param>
        /// <returns>The scraped comments; an empty list when the page holds none.</returns>
        public IList<Comment> ScrapeCommentsPage(string html, Post newsPost)
        {
            HtmlNode documentRoot = html.CreateRootNode();
            HtmlNodeCollection candidates = documentRoot.SelectNodes(CommentsPageXPath.CommentContent);

            IList<Comment> scraped = new List<Comment>();

            // nothing matched the comment XPath at all
            if (candidates == null || candidates.Count == 0)
            {
                return scraped;
            }

            IList<HtmlNode> withHeader = candidates
                .Where(candidate => candidate.SelectSingleNode(CommentsPageXPath.HeaderInfo) != null)
                .ToList();

            if (withHeader.Count == 0)
            {
                this._logger.InfoFormat("ScrapingService, ScrapeCommentsPage, No comments - POST-URL: {0}, HTML: {1}", newsPost.Url, html);
                return scraped;
            }

            foreach (HtmlNode entry in withHeader)
            {
                Comment parsed = this.ScrapeComment(entry);
                if (parsed == null)
                {
                    continue;
                }

                parsed.PostId = newsPost.Id;
                scraped.Add(parsed);
            }

            return scraped;
        }
Ejemplo n.º 4
0
        /// <summary>
        /// Downloads a post page and scrapes its details and statistics into the post.
        /// </summary>
        /// <param name="postUrl">Address of the post page.</param>
        /// <param name="post">Post to fill; a new instance is created when null.</param>
        /// <returns>The filled post, or null when the request returned no HTML.</returns>
        public Post ScrapePostPage(Uri postUrl, Post post)
        {
            if (post == null)
            {
                post = new Post();
            }

            string pageHtml = this.CreateWebRequest(postUrl);
            if (string.IsNullOrEmpty(pageHtml))
            {
                return null;
            }

            HtmlNode documentRoot = pageHtml.CreateRootNode();

            // accessed time (UtcNow is already universal time)
            post.AccessedDate = DateTime.UtcNow;

            // details first, then the statistics read from the whole page
            HtmlNode postContentNode = documentRoot.SelectSingleNode(PostPageXPath.PostContent);
            Post withDetails = this.ScrapePost(postContentNode, post);

            return this.ScrapePostStatistics(documentRoot, withDetails);
        }
Ejemplo n.º 5
0
        /// <summary>
        /// Scrape rtvslo.si archive page html.
        /// Fills some post properties: Id, Url, Title, Category.
        /// Posts with a missing or malformed link are still returned, after a fatal log entry.
        /// </summary>
        /// <param name="html">HTML of the archive page.</param>
        /// <param name="hasNextPage">Set to true when the pager contains a next-page link with an href.</param>
        /// <returns>Posts found on the page; an empty list when there are none.</returns>
        public IList<Post> ScrapeArhivePage(string html, out bool hasNextPage)
        {
            hasNextPage = false;

            HtmlNode rootNode = html.CreateRootNode();

            IList<Post> result = new List<Post>();

            // SelectNodes yields null when nothing matches, so test it before enumerating
            HtmlNodeCollection posts = rootNode.SelectNodes(ArchivePageXPath.PostTitlesContent);
            if (posts != null && posts.Count > 0)
            {
                // posts
                foreach (HtmlNode node in posts)
                {
                    // <a href="/sport/zimski-sporti/sp-v-alpskem-smucanju-2013/kdo-je-kriv-za-zgodnjo-upokojitev-mateje-robnik/302384" class="title">Kdo je “kriv” za zgodnjo upokojitev Mateje Robnik?</a>
                    HtmlNode linkNode = node.ChildNodes["a"];
                    if (linkNode == null)
                    {
                        continue;
                    }

                    string href = string.Empty;
                    string url = string.Empty;
                    Category category = new Category();

                    // href="/sport/zimski-sporti/sp-v-alpskem-smucanju-2013/kdo-je-kriv-za-zgodnjo-upokojitev-mateje-robnik/302384"
                    if (linkNode.Attributes["href"] != null && linkNode.Attributes["href"].Value.SafeTrim().Length > 5)
                    {
                        href = linkNode.Attributes["href"].Value.SafeTrim();
                        url = href.ToFullRtvSloUrl();

                        // scrape categories: every path segment except the last two
                        string[] splittedUrl = href.Split(new char[] { '/' }, StringSplitOptions.RemoveEmptyEntries);
                        if (splittedUrl.Length > 2)
                        {
                            for (int i = 0; i < splittedUrl.Length - 2; i++)
                            {
                                if (i == 0)
                                {
                                    // top level category
                                    category.Label = splittedUrl[i];
                                    category.Url = string.Format("{0}/", splittedUrl[i].ToFullRtvSloUrl());
                                }
                                else
                                {
                                    category.SaveChildCategory(new Category()
                                    {
                                        Label = splittedUrl[i]
                                    });
                                }
                            }
                        }
                        else
                        {
                            this._logger.FatalFormat("ScrapingService, ScrapeArhivePage, Splitted URL length - URL: {2}, NODE: {0}, HTML: {1}", linkNode.SerializeHtmlNode(), html, href);
                        }
                    }
                    else
                    {
                        this._logger.FatalFormat("ScrapingService, ScrapeArhivePage, Post link - NODE: {0}, HTML: {1}", linkNode.SerializeHtmlNode(), html);
                    }

                    string title = linkNode.InnerHtml.SafeTrimAndEscapeHtml();

                    Post post = new Post()
                    {
                        Id = this.GetIdFromUrl(url),
                        Url = url,
                        Title = title,
                        Category = category,
                    };

                    if (post.Id == 0 ||
                        string.IsNullOrEmpty(post.Url) ||
                        string.IsNullOrEmpty(post.Title) ||
                        string.IsNullOrEmpty(post.Category.Label))
                    {
                        this._logger.FatalFormat("ScrapingService, ScrapeArhivePage, Post is not filled right - POST: {0}, HTML: {1}", post.SerializeObject(), html);
                    }

                    result.Add(post);
                }
            }
            else
            {
                this._logger.FatalFormat("ScrapingService, ScrapeArhivePage, There are no posts - HTML: {0}", html);
            }

            // pager
            HtmlNode pager = rootNode.SelectSingleNode(ArchivePageXPath.PagerContent);
            if (pager != null)
            {
                // <a href="/arhiv/?date_from=2013-02-13&amp;date_to=2013-02-13&amp;section=1.2.16.43.4.5.3.8.129.12.9.28.6.24&amp;page=1">2</a>
                HtmlNode nextPage = pager.SelectSingleNode(ArchivePageXPath.PagerNextPage);

                if (nextPage != null && nextPage.Attributes["href"] != null)
                {
                    hasNextPage = true;
                }
            }
            else
            {
                this._logger.FatalFormat("ScrapingService, ScrapeArhivePage, Pager is null - HTML: {0}", html);
            }

            return result;
        }
Ejemplo n.º 6
0
        /// <summary>
        /// Debug run of the whole pipeline: scrapes one hard-coded post page, then walks the
        /// filtered archive pages starting at page 0 and, for every post found, saves the
        /// overview, scrapes and saves the details and authors, and scrapes and saves all
        /// comment pages together with the commenting users.
        /// </summary>
        public void RunDebug()
        {
            Post post1 = new Post()
            {
                Url = "http://www.rtvslo.si/slovenija/jankovic-cestital-bratuskovi-za-napoved-da-bo-kandidirala-za-predsednico-ps-ja/317891"
            };
            post1 = this._scrapingService.ScrapePostPage(new Uri(post1.Url), post1);

            int startPage = 0;
            string html = this._scrapingService.GetFilteredArchivePage(startPage);

            bool hasNextPage = true;

            while (hasNextPage && !string.IsNullOrEmpty(html))
            {
                // scrape archive page
                IList<Post> posts = this._scrapingService.ScrapeArhivePage(html, out hasNextPage);

                if (!posts.IsEmpty())
                {
                    foreach (Post post in posts)
                    {
                        // save post from archive page
                        this._repositoryService.SaveOrUpdatePostOverview(post);

                        // scrape details page; the return value is not used, the code below
                        // keeps working with the post instance that was passed in
                        this._scrapingService.ScrapePostPage(new Uri(post.Url), post);
                        this._logger.DebugFormat("Simulator, RunStep2 - POST: {0}", post.SerializeObject());

                        // reuse an already stored author, otherwise store a new one
                        foreach (User author in post.Authors)
                        {
                            string authorGuid = this._repositoryService.SearchUserByName(author.Name);
                            if (string.IsNullOrEmpty(authorGuid))
                            {
                                authorGuid = this._repositoryService.SaveAuthor(author);
                            }

                            if (!string.IsNullOrEmpty(authorGuid))
                            {
                                author.RepositoryGuidUrl = authorGuid;
                            }
                        }

                        // save details page
                        string postGuid = this._repositoryService.SavePostDetails(post);

                        // check if save was successful
                        if (string.IsNullOrEmpty(postGuid))
                        {
                            this._logger.FatalFormat("Simulator, RunStep2, SavePostDetails unsuccessfull - POST_GUID: {0}, POST: {1}", postGuid, post.SerializeObject());
                            continue;
                        }

                        // post comments
                        int commentsPage = 0;
                        string commentsHtml = this._scrapingService.GetCommentsPage(post.Id, commentsPage++);

                        while (!string.IsNullOrEmpty(commentsHtml))
                        {
                            IList<Comment> comments = this._scrapingService.ScrapeCommentsPage(commentsHtml, post);
                            if (comments.IsEmpty())
                            {
                                break;
                            }

                            foreach (Comment comment in comments)
                            {
                                comment.PostGuidUrl = postGuid;

                                string userGuid = this._repositoryService.SearchUserById(comment.UserId);
                                if (string.IsNullOrEmpty(userGuid))
                                {
                                    User user = this._scrapingService.ScrapeUserPage(new Uri(comment.UserUrl));
                                    if (user != null)
                                    {
                                        this._logger.DebugFormat("Simulator, RunStep2 - USER: {0}", user.SerializeObject());
                                        userGuid = this._repositoryService.SaveUser(user);
                                    }
                                    else
                                    {
                                        // user is null in this branch, so it must not be dereferenced for logging
                                        this._logger.WarnFormat("Simulator, RunStep2, Scrape user unsuccessfull - USER_URL: {0}, USER: {1}", comment.UserUrl, "null");
                                        continue;
                                    }
                                }

                                comment.UserGuidUrl = userGuid;

                                this._logger.DebugFormat("Simulator, RunStep2 - COMMENT: {0}", comment.SerializeObject());
                                this._repositoryService.SaveComment(comment);
                            }

                            // get next page of comments
                            commentsHtml = this._scrapingService.GetCommentsPage(post.Id, commentsPage++);
                        }
                    }
                }

                // load next page; pre-increment, because page 0 was already loaded before the loop
                html = this._scrapingService.GetFilteredArchivePage(++startPage);
            }
        }
Ejemplo n.º 7
0
        /// <summary>
        /// Get posts from selected region in Slovenia.
        /// Builds a SPARQL query that joins the stored posts with DBpedia (SERVICE clause)
        /// on the location name and runs it against the configured repository.
        /// </summary>
        /// <param name="region">Region label used in the DBpedia lookup; passed through NullCheck().</param>
        /// <param name="fromDate">Lower bound of the creation date; applied only when toDate is also given.</param>
        /// <param name="toDate">Upper bound of the creation date; applied only when fromDate is also given.</param>
        /// <returns>
        /// Posts with RepositoryGuidUrl, Url and Location filled (possibly an empty list),
        /// or null when the repository connector is not ready.
        /// </returns>
        public IList<Post> GetPostsFromRegion(string region, DateTime? fromDate = null, DateTime? toDate = null)
        {
            region.NullCheck();

            IList<Post> result = new List<Post>();

            using (SesameHttpProtocolConnector connector = new SesameHttpProtocolConnector(RtvSloConfig.RepositoryUrl, RtvSloConfig.RepositoryName))
            {
                if (!connector.IsReady)
                {
                    this._logger.FatalFormat("RepositoryService, GetPostsFromRegion, SesameHttpProtocolConnector is not ready");
                    return null;
                }

                // SELECT DISTINCT ?post ?locationName ?seeAlso
                // WHERE {
                //    ?post a sioc:Post .
                //    ?post news:location ?locationName .
                //    ?post rdfs:seeAlso ?seeAlso .
                //    ?post dct:created ?date .
                //
                //    SERVICE <http://dbpedia.org/sparql> {
                //        ?region dbpedia-owl:type dbpedia:Statistical_regions_of_Slovenia .
                //        ?region rdfs:label "Gorenjska"@NL .
                //
                //        { ?city ?x ?region } UNION { ?region ?z ?city }
                //        ?city ?y ?locationName
                //    }
                //    FILTER(langMatches(lang(?locationName ), "EN") && ?date >= "fromDate" && ?date <= "toDate")
                // }

                bool filterByDate = fromDate.HasValue && toDate.HasValue;

                // escape backslash and double quote so the region stays inside its string literal
                string escapedRegion = region.Replace("\\", "\\\\").Replace("\"", "\\\"");

                StringBuilder query = new StringBuilder();
                query.Append(
                    "SELECT DISTINCT ?post ?locationName ?seeAlso " +
                    "WHERE { " +
                    "?post rdf:type sioc:Post . " +
                    "?post news:location ?locationName . " +
                    "?post rdfs:seeAlso ?seeAlso . ");

                if (filterByDate)
                {
                    query.AppendFormat(
                        "?post {0} ?date . ",
                        Predicate.DctCreated);
                }

                query.AppendFormat(
                    "SERVICE <http://dbpedia.org/sparql> {{ " +
                    "?region dbpedia-owl:type dbpedia:Statistical_regions_of_Slovenia . " +
                    "?region rdfs:label \"{0}\"@NL . " +
                    "{{ ?city ?x ?region }} UNION {{ ?region ?z ?city }} " +
                    "?city ?y ?locationName " +
                    "}} ",
                    escapedRegion);

                query.Append(
                    "FILTER(langMatches(lang(?locationName ), \"EN\")");

                if (filterByDate)
                {
                    query.AppendFormat(
                        " && ?date >= \"{0}\" && ?date <= \"{1}\"",
                        fromDate.Value.ToString(RepositoryHelper.DateTimeFormat),
                        toDate.Value.ToString(RepositoryHelper.DateTimeFormat));
                }

                query.Append(
                    ") }");

                string queryText = query.ToString();
                SparqlResultSet queryResult = connector.QueryFormat(queryText);

                if (queryResult == null)
                {
                    this._logger.FatalFormat("RepositoryService, GetPostsFromRegion, Query result is null - QUERY: {0}", queryText);
                }
                else if (!queryResult.Results.IsEmpty())
                {
                    LiteralNode literalNode;
                    UriNode uriNode;
                    foreach (SparqlResult res in queryResult.Results)
                    {
                        Post p = new Post();

                        uriNode = res.Value("post") as UriNode;
                        if (uriNode != null)
                        {
                            p.RepositoryGuidUrl = uriNode.Uri.AbsoluteUri;
                        }

                        uriNode = res.Value("seeAlso") as UriNode;
                        if (uriNode != null)
                        {
                            p.Url = uriNode.Uri.AbsoluteUri;
                        }

                        literalNode = res.Value("locationName") as LiteralNode;
                        if (literalNode != null)
                        {
                            p.Location = literalNode.Value;
                        }

                        result.Add(p);
                    }
                }

                return result;
            }
        }
Ejemplo n.º 8
0
        /// <summary>
        /// Saves the data scraped from a post details page in RDF format.
        /// The post is looked up in the repository by its news ID. Its title triple is always replaced;
        /// when <paramref name="update"/> is true the other content and statistics triples are replaced as well.
        /// </summary>
        /// <param name="post">Post to save; on success its RepositoryGuidUrl is set to the URI found in the repository</param>
        /// <param name="update">
        /// False for the first save of the details (also adds the "published at" triple and creates the statistics node),
        /// true to replace previously saved details
        /// </param>
        /// <returns>Guid url, or null when the post is not found in the repository or the connector is not ready</returns>
        /// <exception cref="ArgumentNullException">post is null</exception>
        public string SavePostDetails(Post post, bool update = false)
        {
            if (post == null)
            {
                throw new ArgumentNullException("post");
            }

            /// typed literals (xsd:decimal, date/time) must not depend on the culture of the current thread;
            /// a culture with a decimal comma would otherwise produce an invalid xsd:decimal lexical form
            System.Globalization.CultureInfo invariant = System.Globalization.CultureInfo.InvariantCulture;

            using (SesameHttpProtocolConnector connector = new SesameHttpProtocolConnector(RtvSloConfig.RepositoryUrl, RtvSloConfig.RepositoryName))
            {
                if (!connector.IsReady)
                {
                    this._logger.FatalFormat("RepositoryService, SavePostDetails, SesameHttpProtocolConnector is not ready");
                    return null;
                }

                /// SELECT ?guidUrl ?predicate ?object
                /// WHERE {
                ///     ?guidUrl rdf:type sioc:Post
                ///     ; news:ID "id"
                ///     ; ?predicate ?object
                /// }
                string query = string.Format("SELECT ?guidUrl ?predicate ?object WHERE {{ ?guidUrl {0} {1} ; {2} \"{3}\" ; ?predicate ?object }}",
                                                        Predicate.RdfType, Predicate.SiocPost, Predicate.NewsId, post.Id.ToString());
                SparqlResultSet queryResult = connector.QueryFormat(query);

                if (queryResult == null || queryResult.Results.IsEmpty())
                {
                    this._logger.FatalFormat("RepositoryService, SavePostDetails, Query result has no results - QUERY: {0}", query);
                    return null;
                }

                /// save
                using (IGraph g = new Graph())
                {
                    g.BaseUri = RepositoryHelper.BaseUrl.ToUri();
                    IList<Triple> newTriples = new List<Triple>();
                    IList<Triple> removeTriples = new List<Triple>();

                    INode postGuid = queryResult.Results.First().Value("guidUrl").CopyNode(g);
                    post.RepositoryGuidUrl = ((UriNode)postGuid).Uri.AbsoluteUri;

                    #region Post Content

                    /// remove old title
                    this.RemoveTriples(removeTriples, queryResult, g, postGuid, new string[] { Predicate.DctTitle });

                    if (!update)
                    {
                        /// published at
                        newTriples.Add(new Triple(postGuid, Predicate.NewsPublishedAt.ToUriNode(g), RepositoryHelper.SiteUrl.ToUriNode(g)));
                    }

                    /// date created - nullable, so it is written only when the scraper found it
                    bool hasDateCreated = post.DateCreated.HasValue;
                    if (hasDateCreated)
                    {
                        newTriples.Add(new Triple(postGuid, Predicate.DctCreated.ToUriNode(g),
                            post.DateCreated.Value.ToString(RepositoryHelper.DateTimeFormat, invariant).ToLiteralNode(g, dataType: RepositoryHelper.DateTimeDataType)));
                    }

                    /// accessed date
                    newTriples.Add(new Triple(postGuid, Predicate.NewsAccessed.ToUriNode(g),
                        post.AccessedDate.ToString(RepositoryHelper.DateTimeFormat, invariant).ToLiteralNode(g, dataType: RepositoryHelper.DateTimeDataType)));

                    /// title
                    newTriples.Add(new Triple(postGuid, Predicate.DctTitle.ToUriNode(g), post.Title.ToLiteralNode(g)));

                    /// subtitle
                    if (!string.IsNullOrEmpty(post.Subtitle))
                    {
                        newTriples.Add(new Triple(postGuid, Predicate.MmcSubtitle.ToUriNode(g), post.Subtitle.ToLiteralNode(g)));
                    }

                    /// abstract
                    if (!string.IsNullOrEmpty(post.Abstract))
                    {
                        newTriples.Add(new Triple(postGuid, Predicate.DctAbstract.ToUriNode(g), post.Abstract.ToLiteralNode(g)));
                    }

                    /// last updated
                    newTriples.Add(new Triple(postGuid, Predicate.MmcLastUpdated.ToUriNode(g),
                        post.LastUpdated.ToString(RepositoryHelper.DateTimeFormat, invariant).ToLiteralNode(g, dataType: RepositoryHelper.DateTimeDataType)));

                    /// location
                    if (!string.IsNullOrEmpty(post.Location))
                    {
                        newTriples.Add(new Triple(postGuid, Predicate.NewsLocation.ToUriNode(g),
                            post.Location.ToLiteralNode(g, language: RepositoryHelper.LanguageEnglish))); /// hack to get posts from region
                    }

                    /// content
                    if (!string.IsNullOrEmpty(post.Content))
                    {
                        newTriples.Add(new Triple(postGuid, Predicate.SiocContent.ToUriNode(g), post.Content.ToLiteralNode(g)));
                    }

                    /// authors
                    if (!post.Authors.IsEmpty())
                    {
                        foreach (User author in post.Authors)
                        {
                            if (!string.IsNullOrEmpty(author.RepositoryGuidUrl))
                            {
                                newTriples.Add(new Triple(postGuid, Predicate.SiocHasCreator.ToUriNode(g), author.RepositoryGuidUrl.ToUriNode(g)));
                            }
                        }
                    }

                    if (update)
                    {
                        /// remove old triples; the stored creation date is kept when there is no new value to replace it
                        List<string> stalePredicates = new List<string>();
                        if (hasDateCreated)
                        {
                            stalePredicates.Add(Predicate.DctCreated);
                        }

                        stalePredicates.Add(Predicate.NewsAccessed);
                        stalePredicates.Add(Predicate.MmcSubtitle);
                        stalePredicates.Add(Predicate.DctAbstract);
                        stalePredicates.Add(Predicate.MmcLastUpdated);
                        stalePredicates.Add(Predicate.NewsLocation);
                        stalePredicates.Add(Predicate.SiocContent);
                        stalePredicates.Add(Predicate.SiocHasCreator);

                        this.RemoveTriples(removeTriples, queryResult, g, postGuid, stalePredicates.ToArray());
                    }

                    #endregion Post Content

                    #region Statistics

                    UriNode statsGuidNode = null;
                    SparqlResultSet statsResult = null;

                    /// read existing statistics node
                    if (update)
                    {
                        /// FirstOrDefault: a post saved without a statistics link must not abort the whole update
                        SparqlResult statsLink = queryResult.Results
                            .FirstOrDefault(x => x.Value("predicate").ToSafeString() == Predicate.NewsStatistics.ToFullNamespaceUrl());

                        if (statsLink != null)
                        {
                            statsGuidNode = statsLink.Value("object") as UriNode;
                        }

                        if (statsGuidNode != null)
                        {
                            /// SELECT ?predicate ?object
                            /// WHERE {
                            ///     <guid> rdf:type news:Stat
                            ///     ; ?predicate ?object
                            /// }
                            query = string.Format("SELECT ?predicate ?object WHERE {{ <{0}> {1} {2} ; ?predicate ?object }}",
                                        statsGuidNode.Uri.AbsoluteUri, Predicate.RdfType, Predicate.NewsStat);

                            statsResult = connector.QueryFormat(query);

                            if (statsResult == null || statsResult.Results.IsEmpty())
                            {
                                this._logger.FatalFormat("RepositoryService, SavePostDetails, Update statistics ERROR - QUERY: {0}", query);
                            }
                        }
                        else
                        {
                            this._logger.FatalFormat("RepositoryService, SavePostDetails, Statistics node not found, creating a new one - POST: {0}", post.RepositoryGuidUrl);
                        }
                    }

                    INode statsSubject;
                    if (statsGuidNode != null)
                    {
                        statsSubject = statsGuidNode.CopyNode(g);
                    }
                    else
                    {
                        /// initialize a new statistics node (first save, or update of a post that has none)
                        string statsGuidUrl = string.Format(RepositoryHelper.StatisticsUrlPattern, Guid.NewGuid().ToString());
                        statsSubject = statsGuidUrl.ToUriNode(g);

                        newTriples.Add(new Triple(statsSubject, Predicate.RdfType.ToUriNode(g), Predicate.NewsStat.ToUriNode(g)));
                        newTriples.Add(new Triple(postGuid, Predicate.NewsStatistics.ToUriNode(g), statsSubject));
                    }

                    /// number of comments
                    if (post.NumOfComments > -1)
                    {
                        newTriples.Add(new Triple(statsSubject, Predicate.NewsNComments.ToUriNode(g),
                            post.NumOfComments.ToString().ToLiteralNode(g, dataType: RepositoryHelper.IntegerDataType)));
                    }

                    /// average rating - formatted with the invariant culture so the decimal separator is always '.'
                    if (post.AvgRating > -1)
                    {
                        newTriples.Add(new Triple(statsSubject, Predicate.NewsAvgRating.ToUriNode(g),
                            Convert.ToString(post.AvgRating, invariant).ToLiteralNode(g, dataType: RepositoryHelper.DecimalDataType)));
                    }

                    /// number of ratings
                    if (post.NumOfRatings > -1)
                    {
                        newTriples.Add(new Triple(statsSubject, Predicate.NewsNRatings.ToUriNode(g),
                            post.NumOfRatings.ToString().ToLiteralNode(g, dataType: RepositoryHelper.IntegerDataType)));
                    }

                    /// number of FB likes
                    if (post.NumOfFbLikes > -1)
                    {
                        newTriples.Add(new Triple(statsSubject, Predicate.NewsNFBLikes.ToUriNode(g),
                            post.NumOfFbLikes.ToString().ToLiteralNode(g, dataType: RepositoryHelper.IntegerDataType)));
                    }

                    /// number of tweets
                    if (post.NumOfTweets > -1)
                    {
                        newTriples.Add(new Triple(statsSubject, Predicate.NewsNTweets.ToUriNode(g),
                            post.NumOfTweets.ToString().ToLiteralNode(g, dataType: RepositoryHelper.IntegerDataType)));
                    }

                    /// statsResult is set only when existing statistics were read; a failed (null) query is never handed on
                    if (statsResult != null)
                    {
                        /// remove old triples
                        this.RemoveTriples(removeTriples, statsResult, g, statsSubject,
                            new string[] { Predicate.NewsNComments, Predicate.NewsAvgRating, Predicate.NewsNRatings, Predicate.NewsNFBLikes, Predicate.NewsNTweets });
                    }

                    #endregion Statistics

                    connector.UpdateGraph(g.BaseUri, newTriples, removeTriples);
                    return post.RepositoryGuidUrl;
                }
            }
        }
Ejemplo n.º 9
0
        /// <summary>
        /// Saves the post data scraped from an archive page in RDF format.
        /// A post that is not in the repository yet is created with its ID, title, URL and category;
        /// for a post that already exists only a missing category or URL triple is added.
        /// </summary>
        /// <param name="post">Post to save; its RepositoryGuidUrl is set to the URI of the stored post</param>
        /// <returns>Guid url, or null when the connector is not ready</returns>
        /// <exception cref="ArgumentNullException">post is null</exception>
        public string SaveOrUpdatePostOverview(Post post)
        {
            if (post == null)
            {
                throw new ArgumentNullException("post");
            }

            /// save category
            string categoryUrl = this.CheckAndSaveCategories(post.Category);
            bool hasCategoryUrl = !string.IsNullOrEmpty(categoryUrl);

            using (SesameHttpProtocolConnector connector = new SesameHttpProtocolConnector(RtvSloConfig.RepositoryUrl, RtvSloConfig.RepositoryName))
            {
                if (!connector.IsReady)
                {
                    this._logger.FatalFormat("RepositoryService, SaveOrUpdatePostOverview, SesameHttpProtocolConnector is not ready");
                    return null;
                }

                /// SELECT ?guidUrl ?predicate ?object
                /// WHERE {
                ///     ?guidUrl rdf:type sioc:Post
                ///     ; news:ID "id"
                ///     ; ?predicate ?object
                /// }
                /// NOTE(review): the last argument (Predicate.SiocTopic) has no placeholder in the query text
                SparqlResultSet queryResult = connector.QueryFormat("SELECT ?guidUrl ?predicate ?object WHERE {{ ?guidUrl {0} {1} ; {2} \"{3}\" ; ?predicate ?object }}",
                                                        Predicate.RdfType, Predicate.SiocPost, Predicate.NewsId, post.Id.ToString(), Predicate.SiocTopic);

                bool postExists = queryResult != null && !queryResult.Results.IsEmpty();

                using (IGraph g = new Graph())
                {
                    g.BaseUri = RepositoryHelper.BaseUrl.ToUri();
                    IList<Triple> newTriples = new List<Triple>();

                    if (postExists)
                    {
                        /// update existing
                        INode postGuid = queryResult.Results.First().Value("guidUrl").CopyNode(g);
                        post.RepositoryGuidUrl = ((UriNode)postGuid).Uri.AbsoluteUri;

                        /// category - added only when it was resolved and is not linked to the post yet
                        /// (same guard as for a new post, so a null url never reaches ToUriNode)
                        if (hasCategoryUrl)
                        {
                            bool categoryLinked = queryResult.Results.Any(x =>
                                x.Value("predicate").ToSafeString() == Predicate.SiocTopic.ToFullNamespaceUrl()
                                && x.Value("object").ToSafeString() == categoryUrl);

                            if (!categoryLinked)
                            {
                                newTriples.Add(new Triple(postGuid, Predicate.SiocTopic.ToUriNode(g), categoryUrl.ToUriNode(g)));
                            }
                        }

                        /// url - added when this url is not linked to the post yet
                        bool urlLinked = queryResult.Results.Any(x =>
                            x.Value("predicate").ToSafeString() == Predicate.RdfsSeeAlso.ToFullNamespaceUrl()
                            && x.Value("object").ToSafeString() == post.Url);

                        if (!urlLinked)
                        {
                            newTriples.Add(new Triple(postGuid, Predicate.RdfsSeeAlso.ToUriNode(g), post.Url.ToUriNode(g)));
                        }
                    }
                    else
                    {
                        /// save new
                        string guidUrl = string.Format(RepositoryHelper.PostUrlPattern, Guid.NewGuid().ToString());
                        INode guidNode = guidUrl.ToUriNode(g);
                        post.RepositoryGuidUrl = guidUrl;

                        /// define post
                        newTriples.Add(new Triple(guidNode, Predicate.RdfType.ToUriNode(g), Predicate.SiocPost.ToUriNode(g)));
                        /// ID
                        newTriples.Add(new Triple(guidNode, Predicate.NewsId.ToUriNode(g),
                            post.Id.ToString().ToLiteralNode(g, dataType: RepositoryHelper.IntegerDataType)));
                        /// title
                        newTriples.Add(new Triple(guidNode, Predicate.DctTitle.ToUriNode(g), post.Title.ToLiteralNode(g)));
                        /// post url
                        newTriples.Add(new Triple(guidNode, Predicate.RdfsSeeAlso.ToUriNode(g), post.Url.ToUriNode(g)));

                        /// category
                        if (hasCategoryUrl)
                        {
                            newTriples.Add(new Triple(guidNode, Predicate.SiocTopic.ToUriNode(g), categoryUrl.ToUriNode(g)));
                        }
                    }

                    connector.UpdateGraph(g.BaseUri, newTriples, new List<Triple>());
                }

                return post.RepositoryGuidUrl;
            }
        }