/// <summary>
/// Downloads one page of an author's book listing, adds the book references found on it to
/// <paramref name="author"/>, and queues any further listing pages it links to.
/// </summary>
/// <param name="author">Author whose <c>Books</c> collection receives the extracted values.</param>
/// <param name="url">Address of the listing page to download.</param>
/// <param name="visitedLinks">Pages already processed; <paramref name="url"/> is appended to it.</param>
/// <param name="linksToVisit">Pages still pending; <paramref name="url"/> is removed and newly found pages are appended.</param>
private void AddBooksToAuthor(Author author, string url, List<string> visitedLinks, List<string> linksToVisit)
{
    // Book-keeping happens before the download so the caller's "while pending" loop
    // always makes progress, even when the request fails.
    visitedLinks.Add(url);
    linksToVisit.Remove(url);

    int httpCode = 0;
    string content = GetContent(url, ref httpCode);

    // Check the status before parsing: there is nothing to extract from a failed request,
    // and LoadHtml must not be handed a null string.
    // NOTE(review): assumes GetContent may return null on failure - confirm against its implementation.
    if (httpCode != 200 || content == null)
    {
        return;
    }

    HtmlDocument booksDoc = new HtmlDocument();
    booksDoc.LoadHtml(content);

    HtmlNodeCollection books = booksDoc.DocumentNode.SelectNodes("//div[preceding-sibling::div[@id='menubusca']]//div[position()=3]//a");
    if (books == null)
    {
        return;
    }

    foreach (HtmlNode link in books)
    {
        // An anchor without an href has no target; dereferencing the missing attribute would throw.
        if (link.Attributes["href"] == null)
        {
            continue;
        }

        // Strip the "/livro/" prefix and keep only the text before the first '-'
        // (presumably the book id that leads the slug - confirm).
        string href = link.Attributes["href"].Value.Replace("/livro/", "").Split('-')[0];

        // Ordinal comparison: this is a URL path, not linguistic text.
        if (href.StartsWith("/autor/livros/", System.StringComparison.Ordinal))
        {
            // A link to another listing page: queue it unless it is already known.
            href = "http://www.skoob.com.br" + href;
            if (!visitedLinks.Contains(href) && !linksToVisit.Contains(href))
            {
                linksToVisit.Add(href);
            }
        }
        else
        {
            author.Books.Add(href);
        }
    }
}
/// <summary>
/// Copies one labelled value onto the matching property of <paramref name="author"/>.
/// Colons are removed from <paramref name="field"/> before it is compared; labels other than
/// "Gêneros", "Nascimento" and "Local" are ignored.
/// </summary>
/// <param name="author">Author to update.</param>
/// <param name="field">Label text, with or without colons.</param>
/// <param name="value">Raw value; it is passed through <c>CleanData</c> before being stored.</param>
/// <returns>The same <paramref name="author"/> instance that was passed in.</returns>
private Author SetAuthorData(Author author, string field, string value)
{
    string label = field.Replace(":", "");

    // CleanData is only invoked for a recognised label, exactly once.
    if (label == "Gêneros")
    {
        author.Categories = CleanData(value);
    }
    else if (label == "Nascimento")
    {
        author.BirthDate = CleanData(value);
    }
    else if (label == "Local")
    {
        author.Local = CleanData(value);
    }

    return author;
}
/// <summary>
/// Downloads the author page for every id in the inclusive range <c>StartId</c>..<c>EndId</c>,
/// extracts image, name, biography, labelled details, external links and book references,
/// then hands the serialized author to <c>CustomSave</c> and its image to <c>SaveImage</c>.
/// Ids whose page does not come back with HTTP status 200 are skipped.
/// </summary>
/// <param name="info">A <c>Parameters</c> instance passed as <c>object</c>; it supplies <c>StartId</c> and <c>EndId</c>.</param>
/// <exception cref="System.InvalidCastException"><paramref name="info"/> is not a <c>Parameters</c> instance.</exception>
public void ParseAuthors(object info)
{
    // Shared XPath prefixes; concatenated below they yield the same expressions as before.
    const string headerXPath = "//div[preceding-sibling::div[@id='breadCrumb']]//div[position()=1]";
    const string booksXPath = "//div[preceding-sibling::div[@id='breadCrumb']]//div[position()=3]";

    // NOTE(review): inside the second character class, ",-:" is a range (',' through ':'),
    // not three literals - confirm that is intended before touching the pattern.
    const string detailPattern = @"<strong>([\w:]+)<\/strong><br>([\w\s,-:\/]+)";
    const string linkPattern = "<a href=\"([.\\/\\w:]+)\" class=\"l11out\" target=\"_blank\">([\\w\\/\\s.:!]+)<\\/a>";

    JavaScriptSerializer oSerializer = new JavaScriptSerializer();
    Parameters parameters = (Parameters)info;

    for (int id = parameters.StartId; id <= parameters.EndId; id++)
    {
        string url = string.Format(baseUrl, "autor", id);

        // Reset for every request so a 200 from a previous id can never be mistaken
        // for the status of this one.
        int httpCode = 0;
        string content = GetContent(url, ref httpCode);

        // Check the status before parsing; LoadHtml must not be handed a null string.
        // NOTE(review): assumes GetContent may return null on failure - confirm against its implementation.
        if (httpCode != 200 || content == null)
        {
            continue;
        }

        HtmlDocument doc = new HtmlDocument();
        doc.LoadHtml(content);

        Author author = new Author();
        author.Id = id;

        // Image: stays empty when the <img> or its src attribute is missing.
        author.Image = string.Empty;
        HtmlNode image = doc.DocumentNode.SelectSingleNode(headerXPath + "//img[position()=1]");
        if (image != null && image.Attributes["src"] != null)
        {
            author.Image = image.Attributes["src"].Value;
        }

        author.Name = string.Empty;
        HtmlNode name = doc.DocumentNode.SelectSingleNode(headerXPath + "//h1");
        if (name != null)
        {
            author.Name = name.InnerText;
        }

        // Biography: the first 10 characters of the cleaned text are dropped
        // (presumably a fixed leading label - confirm). Shorter text yields an empty
        // description instead of an ArgumentOutOfRangeException.
        author.Description = string.Empty;
        HtmlNode description = doc.DocumentNode.SelectSingleNode(headerXPath + "//div[@id='biografia']");
        if (description != null)
        {
            string biography = CleanData(description.InnerText);
            if (biography != null && biography.Length >= 10)
            {
                author.Description = biography.Substring(10);
            }
        }

        // Side panel holding the labelled details and the external links. Not every page
        // is guaranteed to contain it, so it is guarded like the other lookups.
        HtmlNode data = doc.DocumentNode.SelectSingleNode(headerXPath + "//div[@style='border:#000 0px solid; float:right; width:175px; color:#666666; font-family:arial; font-size:11px; line-height:20px;']");
        if (data != null)
        {
            string panelHtml = data.InnerHtml;

            // Every Match yielded by Regex.Matches is successful, and both patterns always
            // define exactly two capture groups, so no per-match validation is needed.
            foreach (Match m in Regex.Matches(panelHtml, detailPattern, RegexOptions.IgnoreCase))
            {
                SetAuthorData(author, m.Groups[1].Value, m.Groups[2].Value);
            }

            foreach (Match m in Regex.Matches(panelHtml, linkPattern, RegexOptions.IgnoreCase))
            {
                author.Links.Add(m.Groups[1].Value);
            }
        }

        // Heading of the form "Livros do Autor (N)". When it is absent the count stays 0,
        // which routes to the inline-list branch below.
        int count = 0;
        HtmlNode bookCount = doc.DocumentNode.SelectSingleNode(booksXPath + "//h1[@style='font-size:17px; font-family: trebuchet ms; color:#666666; float:left;']");
        if (bookCount != null)
        {
            count = SafeConvert.ToInt(bookCount.InnerText.Replace("Livros do Autor (", "").Replace(")", ""));
        }

        if (count <= 15)
        {
            // Up to 15 books: read them from the list on the author page itself.
            HtmlNodeCollection books = doc.DocumentNode.SelectNodes(booksXPath + "//div[@style='border:green 0px solid; width:430px; margin: 10px 0px;']//a");
            if (books != null)
            {
                foreach (HtmlNode link in books)
                {
                    // Skip anchors without an href rather than dereferencing a missing attribute.
                    if (link.Attributes["href"] == null)
                    {
                        continue;
                    }

                    author.Books.Add(link.Attributes["href"].Value.Replace("/livro/", "").Split('-')[0]);
                }
            }
        }
        else
        {
            // More than 15 books: walk the dedicated listing pages, starting at page 1 and
            // following every further page each call discovers until none are pending.
            List<string> visitedLinks = new List<string>();
            List<string> linksToVisit = new List<string>();
            string booksFromAuthorUrl = string.Format(baseUrl, "autor/livros", id + "/page:1");

            AddBooksToAuthor(author, booksFromAuthorUrl, visitedLinks, linksToVisit);
            while (linksToVisit.Count > 0)
            {
                AddBooksToAuthor(author, linksToVisit[0], visitedLinks, linksToVisit);
            }
        }

        CustomSave("Authors", oSerializer.Serialize(author), parameters.StartId, parameters.EndId, id);
        SaveImage(author.Image, parameters.StartId, parameters.EndId, id, "Authors");
    }
}