/// <summary>
/// Fetches one page of a publisher's book listing, adds the book ids found on it to
/// <paramref name="publisher"/>, and queues any further listing pages the page links to.
/// </summary>
/// <param name="publisher">Publisher whose <c>Books</c> collection receives the ids found on the page.</param>
/// <param name="url">URL of the listing page to fetch.</param>
/// <param name="visitedLinks">Listing URLs already processed; <paramref name="url"/> is appended to it.</param>
/// <param name="linksToVisit">
/// Pending listing URLs; <paramref name="url"/> is removed from it and newly discovered
/// listing pages are appended.
/// </param>
private void AddBooksToPublisher(Publisher publisher, string url, List<string> visitedLinks, List<string> linksToVisit)
{
    // Book-keeping happens before the request so that a failed download still
    // removes the page from the queue (the caller loops until the queue is empty).
    visitedLinks.Add(url);
    linksToVisit.Remove(url);

    int httpCode = 0;
    string content = GetContent(url, ref httpCode);

    // Only parse the body of a successful response.
    if (httpCode != 200)
    {
        return;
    }

    HtmlDocument doc = new HtmlDocument();
    doc.LoadHtml(content);

    // Every anchor inside the content area except those carrying the 'l11' class.
    HtmlNodeCollection anchors = doc.DocumentNode.SelectNodes("//div[@id='conteudo']//a[not(@class='l11')]");
    if (anchors == null)
    {
        return;
    }

    foreach (HtmlNode anchor in anchors)
    {
        // An anchor without an href (e.g. a named anchor) carries no link; skip it
        // instead of dereferencing a missing attribute.
        string href = anchor.GetAttributeValue("href", string.Empty);
        if (href.Length == 0)
        {
            continue;
        }

        if (href.StartsWith("/editora/livros/", System.StringComparison.Ordinal))
        {
            // Link to another listing page: queue it unless it is already known.
            string pageUrl = "http://www.skoob.com.br" + href;
            if (!visitedLinks.Contains(pageUrl) && !linksToVisit.Contains(pageUrl))
            {
                linksToVisit.Add(pageUrl);
            }
        }
        else
        {
            // Any other link is treated as a book link: drop the "/livro/" segment
            // and keep the text before the first '-' as the book id.
            // NOTE(review): links that are not under "/livro/" also end up here — confirm
            // the XPath above only ever yields book and listing-page links.
            string bookId = href.Replace("/livro/", "").Split('-')[0];
            publisher.Books.Add(bookId);
        }
    }
}
/// <summary>
/// For every publisher id in the supplied parameters, downloads the publisher page,
/// extracts its details (name, description, image, authors, blog, site, social network
/// links), collects its books across all listing pages, and saves the serialized
/// publisher together with its image.
/// </summary>
/// <param name="info">
/// A <c>Parameters</c> instance passed as <c>object</c>; its <c>Ids</c> are the
/// publisher ids to process.
/// </param>
/// <exception cref="System.InvalidCastException">
/// <paramref name="info"/> is not a <c>Parameters</c> instance.
/// </exception>
public void ParsePublishers(object info)
{
    JavaScriptSerializer oSerializer = new JavaScriptSerializer();
    Parameters parameters = (Parameters)info;

    foreach (int id in parameters.Ids)
    {
        // Reset for every id: the status is passed by ref, so a 200 left over from the
        // previous iteration must not be mistaken for the result of this request.
        int httpCode = 0;
        string url = string.Format(baseUrl, "editora", id);
        string content = GetContent(url, ref httpCode);

        // Only parse the body of a successful response.
        if (httpCode != 200)
        {
            continue;
        }

        HtmlDocument doc = new HtmlDocument();
        doc.LoadHtml(content);

        Publisher publisher = new Publisher();
        publisher.Id = id;

        HtmlNode name = doc.DocumentNode.SelectSingleNode("//h1[@class='titulo_editora']");
        publisher.Name = name != null ? name.InnerText : string.Empty;

        HtmlNode description = doc.DocumentNode.SelectSingleNode("//div[@id='historico']");
        publisher.Description = description != null ? description.InnerHtml : string.Empty;

        // The image src is site-relative; an <img> without a src leaves Image empty.
        publisher.Image = string.Empty;
        HtmlNode image = doc.DocumentNode.SelectSingleNode("//div[@id='menu']//img");
        if (image != null)
        {
            string imageSrc = image.GetAttributeValue("src", string.Empty);
            if (imageSrc.Length > 0)
            {
                publisher.Image = "http://www.skoob.com.br" + imageSrc;
            }
        }

        HtmlNodeCollection authors = doc.DocumentNode.SelectNodes("//div[@class='viewport']//ul[@class='overview']//li//img");
        if (authors != null)
        {
            foreach (HtmlNode authorImage in authors)
            {
                // The author entry is the second-to-last path segment of the image src;
                // skip images whose src has fewer than two segments.
                string[] parts = authorImage.GetAttributeValue("src", string.Empty).Split('/');
                if (parts.Length >= 2)
                {
                    publisher.Authors.Add(parts[parts.Length - 2]);
                }
            }
        }

        HtmlNode blog = doc.DocumentNode.SelectSingleNode("//a[text()='Blog']");
        publisher.Blog = blog != null ? blog.GetAttributeValue("href", string.Empty) : string.Empty;

        HtmlNode site = doc.DocumentNode.SelectSingleNode("//a[text()='Site oficial']");
        publisher.Site = site != null ? site.GetAttributeValue("href", string.Empty) : string.Empty;

        // Anchors that wrap a social-network logo image.
        HtmlNodeCollection networks = doc.DocumentNode.SelectNodes("//a[child::img[@class='logoredes']]");
        if (networks != null)
        {
            foreach (HtmlNode networkLink in networks)
            {
                string networkHref = networkLink.GetAttributeValue("href", string.Empty);
                if (networkHref.Length > 0)
                {
                    publisher.AddNetworkLink(networkHref);
                }
            }
        }

        // Walk the book listing: start at page 1 and keep processing the head of the
        // queue; each call removes the page it handled and may append new pages.
        List<string> visitedLinks = new List<string>();
        List<string> linksToVisit = new List<string>();
        string firstPage = string.Concat(string.Format(baseUrl, "editora/livros", id), "/mpage:1");
        AddBooksToPublisher(publisher, firstPage, visitedLinks, linksToVisit);
        while (linksToVisit.Count > 0)
        {
            AddBooksToPublisher(publisher, linksToVisit[0], visitedLinks, linksToVisit);
        }

        // The 0 / 100 arguments are passed through unchanged; their meaning is defined
        // by CustomSave and SaveImage.
        CustomSave("Publishers", oSerializer.Serialize(publisher), 0, 100, id);
        SaveImage(publisher.Image, 0, 100, id, "Publishers");
    }
}