/// <summary>
/// Appends <paramref name="category"/> to the end of this category's child chain.
/// The new category's Url is built from the Url of the category it is attached to,
/// followed by the new category's Label and a trailing slash.
/// </summary>
/// <param name="category">Category to attach as the last child of the chain.</param>
public void SaveChildCategory(Category category)
{
    // Locate the attachment point: this category when it has no child yet,
    // otherwise the last category reachable through ChildCategory.
    Category tail = this;
    if (this.HasChild)
    {
        tail = this.ChildCategory;
        while (tail.ChildCategory != null)
        {
            tail = tail.ChildCategory;
        }
    }

    // Link the new category below the attachment point.
    category.Url = string.Format("{0}{1}/", tail.Url, category.Label);
    category.ParentCategory = tail;
    tail.ChildCategory = category;
}
/// <summary>
/// Scrapes the HTML of an rtvslo.si archive page.
/// Fills the following properties of each returned post: Id, Url, Title and Category.
/// </summary>
/// <param name="html">HTML of the archive page.</param>
/// <param name="hasNextPage">Set to true when the pager contains a next-page link that has an href attribute; false otherwise.</param>
/// <returns>Posts built from the post title nodes; an empty list when no node matched.</returns>
public IList<Post> ScrapeArhivePage(string html, out bool hasNextPage)
{
    hasNextPage = false;

    HtmlNode rootNode = html.CreateRootNode();
    IList<Post> result = new List<Post>();

    // SelectNodes can hand back null instead of an empty collection when the XPath
    // matches nothing, so test for null BEFORE materializing. Calling ToList() on a
    // null sequence throws, which would make the "no posts" log branch unreachable.
    IEnumerable<HtmlNode> matchedNodes = rootNode.SelectNodes(ArchivePageXPath.PostTitlesContent);
    IList<HtmlNode> posts = matchedNodes != null ? matchedNodes.ToList() : new List<HtmlNode>();

    if (posts.Count > 0)
    {
        // posts
        foreach (HtmlNode node in posts)
        {
            // expected markup:
            // <a href="/sport/zimski-sporti/sp-v-alpskem-smucanju-2013/kdo-je-kriv-za-zgodnjo-upokojitev-mateje-robnik/302384" class="title">post title</a>
            HtmlNode linkNode = node.ChildNodes["a"];
            if (linkNode == null)
            {
                // no link inside this node, so there is nothing to build a post from
                continue;
            }

            string href = string.Empty;
            string url = string.Empty;
            Category category = new Category();

            // href="/sport/zimski-sporti/sp-v-alpskem-smucanju-2013/kdo-je-kriv-za-zgodnjo-upokojitev-mateje-robnik/302384"
            var hrefAttribute = linkNode.Attributes["href"];
            if (hrefAttribute != null && hrefAttribute.Value.SafeTrim().Length > 5)
            {
                href = hrefAttribute.Value.SafeTrim();
                url = href.ToFullRtvSloUrl();

                // scrape categories: every path segment except the last two is treated as a category
                string[] splittedUrl = href.Split(new char[] { '/' }, StringSplitOptions.RemoveEmptyEntries);
                if (splittedUrl.Length > 2)
                {
                    for (int i = 0; i < splittedUrl.Length - 2; i++)
                    {
                        if (i == 0)
                        {
                            // top level category
                            category.Label = splittedUrl[i];
                            category.Url = string.Format("{0}/", splittedUrl[i].ToFullRtvSloUrl());
                        }
                        else
                        {
                            category.SaveChildCategory(new Category() { Label = splittedUrl[i] });
                        }
                    }
                }
                else
                {
                    this._logger.FatalFormat("ScrapingService, ScrapeArhivePage, Splitted URL length - URL: {2}, NODE: {0}, HTML: {1}", linkNode.SerializeHtmlNode(), html, href);
                }
            }
            else
            {
                this._logger.FatalFormat("ScrapingService, ScrapeArhivePage, Post link - NODE: {0}, HTML: {1}", linkNode.SerializeHtmlNode(), html);
            }

            string title = linkNode.InnerHtml.SafeTrimAndEscapeHtml();

            Post post = new Post()
            {
                Id = this.GetIdFromUrl(url),
                Url = url,
                Title = title,
                Category = category,
            };

            // An incomplete post is logged but still returned to the caller.
            if (post.Id == 0 ||
                string.IsNullOrEmpty(post.Url) ||
                string.IsNullOrEmpty(post.Title) ||
                string.IsNullOrEmpty(post.Category.Label))
            {
                this._logger.FatalFormat("ScrapingService, ScrapeArhivePage, Post is not filled right - POST: {0}, HTML: {1}", post.SerializeObject(), html);
            }

            result.Add(post);
        }
    }
    else
    {
        this._logger.FatalFormat("ScrapingService, ScrapeArhivePage, There are no posts - HTML: {0}", html);
    }

    // pager
    HtmlNode pager = rootNode.SelectSingleNode(ArchivePageXPath.PagerContent);
    if (pager != null)
    {
        // expected markup:
        // <a href="/arhiv/?date_from=2013-02-13&date_to=2013-02-13&section=1.2.16.43.4.5.3.8.129.12.9.28.6.24&page=1">2</a>
        HtmlNode nextPage = pager.SelectSingleNode(ArchivePageXPath.PagerNextPage);
        if (nextPage != null && nextPage.Attributes["href"] != null)
        {
            hasNextPage = true;
        }
    }
    else
    {
        this._logger.FatalFormat("ScrapingService, ScrapeArhivePage, Pager is null - HTML: {0}", html);
    }

    return result;
}
/// <summary>
/// Looks up the given category chain in the repository and stores every category
/// of the chain that is not there yet.
/// </summary>
/// <param name="category">Top category of the chain; the chain is walked through NextChild.</param>
/// <returns>
/// Repository URL of the last child category when it already exists; otherwise the URL
/// generated for the last category added by this call. Null when a query returns no
/// result set or when the connector is not ready.
/// </returns>
private string CheckAndSaveCategories(Category category)
{
    using (SesameHttpProtocolConnector connector = new SesameHttpProtocolConnector(RtvSloConfig.RepositoryUrl, RtvSloConfig.RepositoryName))
    {
        if (!connector.IsReady)
        {
            this._logger.FatalFormat("RepositoryService, CheckAndSaveCategories, SesameHttpProtocolConnector is not ready");
            return null;
        }

        // The last child is checked first: when it is already stored nothing has to be saved.
        Category deepest = category.HasChild ? category.LastChild : category;

        // SELECT ?url
        // WHERE {
        //   ?url rdf:type sioc:Category .
        //   ?url rdfs:seeAlso <cat_url> .
        // }
        string queryPattern = "SELECT ?url WHERE {{ ?url {0} {1} . ?url {2} <{3}> . }}";

        SparqlResultSet lookup = connector.QueryFormat(queryPattern, Predicate.RdfType, Predicate.SiocCategory, Predicate.RdfsSeeAlso, deepest.Url);
        if (lookup == null)
        {
            return null;
        }

        if (!lookup.Results.IsEmpty())
        {
            // child category already exists
            return lookup.Results.First().Value("url").ToSafeString();
        }

        string categoryRepositoryUrl = null;

        // save category
        using (IGraph graph = new Graph())
        {
            graph.BaseUri = RepositoryHelper.BaseUrl.ToUri();

            IList<Triple> pendingTriples = new List<Triple>();
            bool isTopLevel = true;

            for (Category current = category; current != null; current = current.NextChild)
            {
                // fetch and check if category exists
                lookup = connector.QueryFormat(queryPattern, Predicate.RdfType, Predicate.SiocCategory, Predicate.RdfsSeeAlso, current.Url);
                if (lookup == null)
                {
                    return null;
                }

                if (lookup.Results.IsEmpty())
                {
                    categoryRepositoryUrl = string.Format(RepositoryHelper.CategoryUrlPattern, current.Label);
                    INode subject = categoryRepositoryUrl.ToUriNode(graph);

                    pendingTriples.Add(new Triple(subject, Predicate.RdfType.ToUriNode(graph), Predicate.SiocCategory.ToUriNode(graph)));
                    pendingTriples.Add(new Triple(subject, Predicate.RdfsSeeAlso.ToUriNode(graph), current.Url.ToUriNode(graph)));
                    pendingTriples.Add(new Triple(subject, Predicate.RdfsLabel.ToUriNode(graph), current.Label.ToLiteralNode(graph)));

                    if (!isTopLevel)
                    {
                        // link the new category to its parent; the parent URL is derived from the parent's label
                        INode parentObject = string.Format(RepositoryHelper.CategoryUrlPattern, current.Parent.Label).ToUriNode(graph);
                        pendingTriples.Add(new Triple(subject, Predicate.NewsSubCategoryOf.ToUriNode(graph), parentObject));
                    }
                }

                isTopLevel = false;
            }

            // save new category
            connector.UpdateGraph(graph.BaseUri, pendingTriples, new List<Triple>());
        }

        return categoryRepositoryUrl;
    }
}