Пример #1
0
        /// <summary>
        /// Builds a <see cref="Comment"/> from the HTML node of a single comment.
        /// </summary>
        /// <remarks>
        /// Scraping is best effort: when an expected part of the markup (header, links,
        /// created date, content, rating) is missing, the problem is logged and the
        /// matching properties of the returned comment keep their default values.
        /// </remarks>
        /// <param name="commentNode">HTML node of one comment; validated with <c>NullCheck</c>.</param>
        /// <returns>The scraped comment, with <c>AccessedDate</c> set to the current UTC time.</returns>
        private Comment ScrapeComment(HtmlNode commentNode)
        {
            commentNode.NullCheck();

            Comment comment = new Comment()
            {
                AccessedDate = DateTime.UtcNow,
            };

            HtmlNode headerNode = commentNode.SelectSingleNode(CommentsPageXPath.HeaderInfo);

            if (headerNode == null)
            {
                // Without the header there is no user, url, id or created date to read.
                this._logger.ErrorFormat("ScrapingService, ScrapeComment - Header node is null - NODE: {0}", commentNode.SerializeHtmlNode());
            }
            else
            {
                // userUrl, url, username, id: the first link is read as the user, the second as the comment
                IList<HtmlNode> innerHeaderNodes = headerNode.ChildNodes.Where(x => x.Name == "a").ToList();

                if (innerHeaderNodes.Count == 2)
                {
                    if (innerHeaderNodes[0].Attributes["href"] != null)
                    {
                        comment.UserUrl = innerHeaderNodes[0].Attributes["href"].Value.SafeTrim().ToFullRtvSloUrl();
                        comment.UserId = this.GetIdStringFromUrl(comment.UserUrl);
                    }
                    else
                    {
                        this._logger.ErrorFormat("ScrapingService, ScrapeComment - User url is null - NODE: {0}", commentNode.SerializeHtmlNode());
                    }

                    comment.UserName = innerHeaderNodes[0].InnerText.SafeTrim();

                    if (innerHeaderNodes[1].Attributes["href"] != null)
                    {
                        comment.Url = innerHeaderNodes[1].Attributes["href"].Value.SafeTrim().ToFullRtvSloUrl();
                        comment.Id = this.GetIdFromUrl(comment.Url);
                    }
                    else
                    {
                        this._logger.ErrorFormat("ScrapingService, ScrapeComment - Comment url is null - NODE: {0}", commentNode.SerializeHtmlNode());
                    }
                }
                else
                {
                    this._logger.ErrorFormat("ScrapingService, ScrapeComment - Unexpected number of header links - NODE: {0}", commentNode.SerializeHtmlNode());
                }

                // created date time: taken from the text of the last child of the header
                HtmlNode dateCreatedNode = headerNode.LastChild;

                if (dateCreatedNode != null)
                {
                    string dateCreatedString = dateCreatedNode.InnerText.SafeTrim();

                    DateTime created;
                    if (dateCreatedString.TryParseExactLogging(ParsingHelper.ShortDateTimeParseExactPattern, this.cultureInfo, DateTimeStyles.None, out created))
                    {
                        comment.DateCreated = created.ToUniversalTime();
                    }
                }
                else
                {
                    this._logger.ErrorFormat("ScrapingService, ScrapeComment - Created date node is null - NODE: {0}", commentNode.SerializeHtmlNode());
                }
            }

            // content
            HtmlNode contentNode = commentNode.SelectSingleNode(CommentsPageXPath.Content);

            if (contentNode != null)
            {
                comment.Content = contentNode.InnerText.SafeTrimAndEscapeHtml();
            }
            else
            {
                this._logger.ErrorFormat("ScrapingService, ScrapeComment - Comment content is null - URL: {0}", comment.Url);
            }

            // rating: the plus and minus values are added together
            HtmlNode ratingNode = commentNode.SelectSingleNode(CommentsPageXPath.Rating);
            HtmlNode plusRatingNode = ratingNode != null ? ratingNode.SelectSingleNode(CommentsPageXPath.PlusRating) : null;
            HtmlNode minusRatingNode = ratingNode != null ? ratingNode.SelectSingleNode(CommentsPageXPath.MinusRating) : null;

            if (plusRatingNode != null && minusRatingNode != null)
            {
                string plusRatingString = plusRatingNode.InnerText.SafeTrim();
                string minusRatingString = minusRatingNode.InnerText.SafeTrim();

                int plusRating = this.ScrapeCommentRating(plusRatingString, comment.Url);
                int minusRating = this.ScrapeCommentRating(minusRatingString, comment.Url);

                comment.Rating = plusRating + minusRating;
            }
            else
            {
                this._logger.ErrorFormat("ScrapingService, ScrapeComment - Comment rating is null - URL: {0}", comment.Url);
            }

            return comment;
        }
        /// <summary>
        /// Saves a comment to the repository in RDF format, or refreshes an already stored one.
        /// </summary>
        /// <remarks>
        /// The stored comment is looked up by its ID. Without <paramref name="update"/> an
        /// existing comment is left untouched and its guid url is returned. With
        /// <paramref name="update"/> the accessed date, content, created date and rating
        /// triples of the stored comment are replaced; when no stored comment is found the
        /// comment is saved as new. A comment without a created date is saved without that
        /// triple (and, on update, keeps the stored one).
        /// </remarks>
        /// <param name="comment">Comment to save; its <c>RepositoryGuidUrl</c> is set when triples are written.</param>
        /// <param name="update">True to replace the mutable triples of an existing comment.</param>
        /// <returns>Guid url of the comment, or <c>null</c> when the connector is not ready.</returns>
        public string SaveComment(Comment comment, bool update = false)
        {
            using (SesameHttpProtocolConnector connector = new SesameHttpProtocolConnector(RtvSloConfig.RepositoryUrl, RtvSloConfig.RepositoryName))
            {
                if (!connector.IsReady)
                {
                    this._logger.FatalFormat("RepositoryService, SaveComment, SesameHttpProtocolConnector is not ready");
                    return null;
                }

                // Lookup of the stored comment:
                //   SELECT ?guidUrl WHERE { ?guidUrl rdf:type news:Comment ; news:ID "id" }
                // For an update the stored triples are fetched as well, so they can be removed:
                //   SELECT ?guidUrl ?predicate ?object
                //   WHERE { ?guidUrl rdf:type news:Comment ; news:ID "id" ; ?predicate ?object }
                // NOTE(review): the ID is matched as a plain literal here but written below with
                // RepositoryHelper.IntegerDataType - confirm the store treats both as equal,
                // otherwise existing comments are never found.
                string queryPattern = update
                    ? "SELECT ?guidUrl ?predicate ?object WHERE {{ ?guidUrl {0} {1} ; {2} \"{3}\" ; ?predicate ?object }}"
                    : "SELECT ?guidUrl WHERE {{ ?guidUrl {0} {1} ; {2} \"{3}\" }}";
                string query = string.Format(queryPattern,
                    Predicate.RdfType, Predicate.NewsComment, Predicate.NewsId, comment.Id.ToSafeString());

                SparqlResultSet queryResult = connector.QueryFormat(query);

                string guidUrl = null;
                INode guidUrlNode = null;
                if (queryResult != null && !queryResult.Results.IsEmpty())
                {
                    // comment already exists
                    guidUrlNode = queryResult.Results.First().Value("guidUrl");
                    guidUrl = ((UriNode)guidUrlNode).Uri.AbsoluteUri;
                    if (!update)
                    {
                        return guidUrl;
                    }
                }

                // save new or update
                using (IGraph g = new Graph())
                {
                    g.BaseUri = RepositoryHelper.BaseUrl.ToUri();
                    IList<Triple> newTriples = new List<Triple>();
                    IList<Triple> removeTriples = new List<Triple>();

                    // An update of a comment that is not stored yet falls back to a plain save.
                    bool replaceExisting = update;
                    if (string.IsNullOrEmpty(guidUrl))
                    {
                        replaceExisting = false;
                        guidUrl = string.Format(RepositoryHelper.CommentUrlPattern, Guid.NewGuid().ToString());
                    }

                    INode subject = guidUrlNode != null ? guidUrlNode.CopyNode(g) : guidUrl.ToUriNode(g);
                    comment.RepositoryGuidUrl = guidUrl;

                    // triples that are written only once, when the comment is first saved
                    if (!replaceExisting)
                    {
                        // initialize
                        newTriples.Add(new Triple(subject, Predicate.RdfType.ToUriNode(g), Predicate.NewsComment.ToUriNode(g)));

                        // ID
                        newTriples.Add(new Triple(subject, Predicate.NewsId.ToUriNode(g),
                            comment.Id.ToString().ToLiteralNode(g, dataType: RepositoryHelper.IntegerDataType)));

                        // see also
                        newTriples.Add(new Triple(subject, Predicate.RdfsSeeAlso.ToUriNode(g), comment.Url.ToUriNode(g)));

                        // link with post
                        if (!string.IsNullOrEmpty(comment.PostGuidUrl))
                        {
                            newTriples.Add(new Triple(comment.PostGuidUrl.ToUriNode(g), Predicate.SiocHasReply.ToUriNode(g), subject));
                        }
                        else
                        {
                            this._logger.FatalFormat("RepositoryService, SaveComment, PostGuidUrl IS NULL - COMMENT: {0}", comment.Url);
                        }

                        // has creator
                        if (!string.IsNullOrEmpty(comment.UserGuidUrl))
                        {
                            newTriples.Add(new Triple(subject, Predicate.SiocHasCreator.ToUriNode(g), comment.UserGuidUrl.ToUriNode(g)));
                        }
                        else
                        {
                            this._logger.FatalFormat("RepositoryService, SaveComment, UserGuidUrl IS NULL - COMMENT: {0}", comment.Url);
                        }
                    }

                    // accessed date
                    newTriples.Add(new Triple(subject, Predicate.NewsAccessed.ToUriNode(g),
                        comment.AccessedDate.ToString(RepositoryHelper.DateTimeFormat).ToLiteralNode(g, dataType: RepositoryHelper.DateTimeDataType)));

                    // content
                    newTriples.Add(new Triple(subject, Predicate.SiocContent.ToUriNode(g), comment.Content.ToLiteralNode(g)));

                    // date created: may be missing, reading .Value unconditionally would throw
                    bool hasDateCreated = comment.DateCreated != null;
                    if (hasDateCreated)
                    {
                        newTriples.Add(new Triple(subject, Predicate.DctCreated.ToUriNode(g),
                            comment.DateCreated.Value.ToString(RepositoryHelper.DateTimeFormat).ToLiteralNode(g, dataType: RepositoryHelper.DateTimeDataType)));
                    }
                    else
                    {
                        this._logger.FatalFormat("RepositoryService, SaveComment, DateCreated IS NULL - COMMENT: {0}", comment.Url);
                    }

                    // rating
                    newTriples.Add(new Triple(subject, Predicate.MmcRating.ToUriNode(g),
                        comment.Rating.ToString().ToLiteralNode(g, dataType: RepositoryHelper.IntegerDataType)));

                    // remove old triples, but only for predicates that received a new value above
                    if (replaceExisting)
                    {
                        string[] replacedPredicates = hasDateCreated
                            ? new string[] { Predicate.NewsAccessed, Predicate.SiocContent, Predicate.DctCreated, Predicate.MmcRating }
                            : new string[] { Predicate.NewsAccessed, Predicate.SiocContent, Predicate.MmcRating };

                        this.RemoveTriples(removeTriples, queryResult, g, subject, replacedPredicates);
                    }

                    // save
                    connector.UpdateGraph(g.BaseUri, newTriples, removeTriples);
                    return guidUrl;
                }
            }
        }