예제 #1
0
        /// <summary>
        /// Attaches <paramref name="category"/> at the end of this category's child chain
        /// and derives its Url from the Url of the category it gets attached to.
        /// </summary>
        /// <param name="category">Category to append; its Url and ParentCategory are overwritten.</param>
        public void SaveChildCategory(Category category)
        {
            // Find the attachment point: this category itself when it has no child yet,
            // otherwise the last category of the existing chain.
            Category tail = this;
            if (this.HasChild)
            {
                tail = this.ChildCategory;
                while (tail.ChildCategory != null)
                {
                    tail = tail.ChildCategory;
                }
            }

            // The child's Url is the attachment point's Url followed by the child's label and a trailing slash.
            category.Url = string.Format("{0}{1}/", tail.Url, category.Label);

            // Link both directions.
            category.ParentCategory = tail;
            tail.ChildCategory = category;
        }
예제 #2
0
        /// <summary>
        /// Scrapes the HTML of an rtvslo.si archive page and builds one post per post link found.
        /// Fills these post properties: Id, Url, Title and Category (the category chain is
        /// derived from the path segments of the post link).
        /// </summary>
        /// <param name="html">Raw HTML of the archive page.</param>
        /// <param name="hasNextPage">
        /// Set to <c>true</c> when the pager contains a next-page node that carries an href attribute;
        /// otherwise <c>false</c>.
        /// </param>
        /// <returns>The scraped posts; an empty list when no post nodes were matched.</returns>
        public IList<Post> ScrapeArhivePage(string html, out bool hasNextPage)
        {
            hasNextPage = false;

            HtmlNode rootNode = html.CreateRootNode();

            IList<Post> result = new List<Post>();

            // SelectNodes can return null (not an empty collection) when nothing matches.
            // Guard before materializing: calling ToList() on null would throw and the
            // "There are no posts" branch below would never be reached.
            var matchedNodes = rootNode.SelectNodes(ArchivePageXPath.PostTitlesContent);
            IList<HtmlNode> posts = matchedNodes != null ? matchedNodes.ToList() : new List<HtmlNode>();

            if (posts.Count > 0)
            {
                foreach (HtmlNode node in posts)
                {
                    // Expected shape:
                    // <a href="/sport/zimski-sporti/sp-v-alpskem-smucanju-2013/kdo-je-kriv-za-zgodnjo-upokojitev-mateje-robnik/302384" class="title">Kdo je “kriv” za zgodnjo upokojitev Mateje Robnik?</a>
                    HtmlNode linkNode = node.ChildNodes["a"];
                    if (linkNode == null)
                    {
                        // No link inside this node, so there is nothing to build a post from.
                        continue;
                    }

                    string url = string.Empty;
                    Category category = new Category();

                    // href="/sport/zimski-sporti/sp-v-alpskem-smucanju-2013/kdo-je-kriv-za-zgodnjo-upokojitev-mateje-robnik/302384"
                    var hrefAttribute = linkNode.Attributes["href"];
                    string href = hrefAttribute != null ? hrefAttribute.Value.SafeTrim() : string.Empty;
                    if (href.Length > 5)
                    {
                        url = href.ToFullRtvSloUrl();

                        // Every path segment except the last two (post slug and numeric id in the
                        // example above) is treated as a category label.
                        string[] splittedUrl = href.Split(new char[] { '/' }, StringSplitOptions.RemoveEmptyEntries);
                        if (splittedUrl.Length > 2)
                        {
                            for (int i = 0; i < splittedUrl.Length - 2; i++)
                            {
                                if (i == 0)
                                {
                                    // top level category
                                    category.Label = splittedUrl[i];
                                    category.Url = string.Format("{0}/", splittedUrl[i].ToFullRtvSloUrl());
                                }
                                else
                                {
                                    // deeper levels are appended to the end of the chain
                                    category.SaveChildCategory(new Category()
                                    {
                                        Label = splittedUrl[i]
                                    });
                                }
                            }
                        }
                        else
                        {
                            this._logger.FatalFormat("ScrapingService, ScrapeArhivePage, Splitted URL length - URL: {2}, NODE: {0}, HTML: {1}", linkNode.SerializeHtmlNode(), html, href);
                        }
                    }
                    else
                    {
                        this._logger.FatalFormat("ScrapingService, ScrapeArhivePage, Post link - NODE: {0}, HTML: {1}", linkNode.SerializeHtmlNode(), html);
                    }

                    string title = linkNode.InnerHtml.SafeTrimAndEscapeHtml();

                    Post post = new Post()
                    {
                        Id = this.GetIdFromUrl(url),
                        Url = url,
                        Title = title,
                        Category = category,
                    };

                    // An incompletely filled post is only reported; it is still returned to the caller.
                    if (post.Id == 0 ||
                        string.IsNullOrEmpty(post.Url) ||
                        string.IsNullOrEmpty(post.Title) ||
                        string.IsNullOrEmpty(post.Category.Label))
                    {
                        this._logger.FatalFormat("ScrapingService, ScrapeArhivePage, Post is not filled right - POST: {0}, HTML: {1}", post.SerializeObject(), html);
                    }

                    result.Add(post);
                }
            }
            else
            {
                this._logger.FatalFormat("ScrapingService, ScrapeArhivePage, There are no posts - HTML: {0}", html);
            }

            // pager
            HtmlNode pager = rootNode.SelectSingleNode(ArchivePageXPath.PagerContent);
            if (pager != null)
            {
                // <a href="/arhiv/?date_from=2013-02-13&amp;date_to=2013-02-13&amp;section=1.2.16.43.4.5.3.8.129.12.9.28.6.24&amp;page=1">2</a>
                HtmlNode nextPage = pager.SelectSingleNode(ArchivePageXPath.PagerNextPage);

                if (nextPage != null && nextPage.Attributes["href"] != null)
                {
                    hasNextPage = true;
                }
            }
            else
            {
                this._logger.FatalFormat("ScrapingService, ScrapeArhivePage, Pager is null - HTML: {0}", html);
            }

            return result;
        }
        /// <summary>
        /// Looks up the given category chain in the repository and stores every category
        /// of the chain that is not there yet.
        /// </summary>
        /// <param name="category">Top category of the chain to check and save.</param>
        /// <returns>
        /// The repository url of the deepest category when it already exists; otherwise the
        /// repository url of the last category added during this call. <c>null</c> when the
        /// connector is not ready or a query yields no result set.
        /// </returns>
        private string CheckAndSaveCategories(Category category)
        {
            using (SesameHttpProtocolConnector connector = new SesameHttpProtocolConnector(RtvSloConfig.RepositoryUrl, RtvSloConfig.RepositoryName))
            {
                if (!connector.IsReady)
                {
                    this._logger.FatalFormat("RepositoryService, CheckAndSaveCategories, SesameHttpProtocolConnector is not ready");
                    return null;
                }

                // Start with the deepest category: if it is already stored, nothing has to be saved.
                Category current = category.HasChild ? category.LastChild : category;

                // SELECT ?url
                // WHERE {
                //     ?url rdf:type sioc:Category .
                //     ?url rdfs:seeAlso <cat_url> .
                // }
                string queryPattern = "SELECT ?url WHERE {{ ?url {0} {1} . ?url {2} <{3}> . }}";
                SparqlResultSet lookup = connector.QueryFormat(queryPattern, Predicate.RdfType, Predicate.SiocCategory, Predicate.RdfsSeeAlso, current.Url);
                if (lookup == null)
                {
                    return null;
                }

                if (!lookup.Results.IsEmpty())
                {
                    // deepest category already exists
                    return lookup.Results.First().Value("url").ToSafeString();
                }

                string lastSavedUrl = null;

                using (IGraph graph = new Graph())
                {
                    graph.BaseUri = RepositoryHelper.BaseUrl.ToUri();

                    IList<Triple> additions = new List<Triple>();

                    // Walk the chain from the top category down and collect triples for every missing category.
                    bool isTop = true;
                    current = category;
                    while (current != null)
                    {
                        lookup = connector.QueryFormat(queryPattern, Predicate.RdfType, Predicate.SiocCategory, Predicate.RdfsSeeAlso, current.Url);
                        if (lookup == null)
                        {
                            return null;
                        }

                        if (lookup.Results.IsEmpty())
                        {
                            lastSavedUrl = string.Format(RepositoryHelper.CategoryUrlPattern, current.Label);
                            INode subject = lastSavedUrl.ToUriNode(graph);

                            additions.Add(new Triple(subject, Predicate.RdfType.ToUriNode(graph), Predicate.SiocCategory.ToUriNode(graph)));
                            additions.Add(new Triple(subject, Predicate.RdfsSeeAlso.ToUriNode(graph), current.Url.ToUriNode(graph)));
                            additions.Add(new Triple(subject, Predicate.RdfsLabel.ToUriNode(graph), current.Label.ToLiteralNode(graph)));

                            // Every category below the top one is linked to its parent.
                            if (!isTop)
                            {
                                INode parentNode = string.Format(RepositoryHelper.CategoryUrlPattern, current.Parent.Label).ToUriNode(graph);
                                additions.Add(new Triple(subject, Predicate.NewsSubCategoryOf.ToUriNode(graph), parentNode));
                            }
                        }

                        isTop = false;
                        current = current.NextChild;
                    }

                    // Send the collected triples in one update; nothing is removed.
                    connector.UpdateGraph(graph.BaseUri, additions, new List<Triple>());
                }

                return lastSavedUrl;
            }
        }