예제 #1
0
        /// <summary>
        /// Fetches one page of an author's book listing, adds every book identifier found on it to
        /// <paramref name="author"/>, and queues any pagination links that have not been seen yet.
        /// </summary>
        /// <param name="author">Author whose <c>Books</c> collection receives the identifiers.</param>
        /// <param name="url">Listing page to fetch. It is recorded as visited and removed from the queue.</param>
        /// <param name="visitedLinks">URLs already fetched; updated in place.</param>
        /// <param name="linksToVisit">URLs still waiting to be fetched; updated in place.</param>
        private void AddBooksToAuthor(Author author, string url, List<string> visitedLinks, List<string> linksToVisit)
        {
            // Bookkeeping happens before the request so the URL leaves the queue even when the
            // download fails; a caller draining linksToVisit therefore always makes progress.
            visitedLinks.Add(url);
            linksToVisit.Remove(url);

            int httpCode = 0;
            string content = GetContent(url, ref httpCode);

            // Only parse successful responses, and never hand a null/empty body to the parser.
            if (httpCode != 200 || string.IsNullOrEmpty(content))
            {
                return;
            }

            HtmlDocument booksDoc = new HtmlDocument();
            booksDoc.LoadHtml(content);

            HtmlNodeCollection books = booksDoc.DocumentNode.SelectNodes("//div[preceding-sibling::div[@id='menubusca']]//div[position()=3]//a");
            if (books == null)
            {
                // No matching anchors on this page.
                return;
            }

            foreach (HtmlNode link in books)
            {
                // Anchors without an href carry no information for us; skip them instead of crashing.
                if (link.Attributes["href"] == null)
                {
                    continue;
                }

                string rawHref = link.Attributes["href"].Value;

                // Classify the link on its untouched value: pagination links must be queued whole,
                // so the book-id trimming below is applied to book links only.
                if (rawHref.StartsWith("/autor/livros/", System.StringComparison.Ordinal))
                {
                    string pageUrl = "http://www.skoob.com.br" + rawHref;
                    if (!visitedLinks.Contains(pageUrl) && !linksToVisit.Contains(pageUrl))
                    {
                        linksToVisit.Add(pageUrl);
                    }
                }
                else
                {
                    // Book link: strip the "/livro/" prefix and keep the part before the first '-'.
                    author.Books.Add(rawHref.Replace("/livro/", "").Split('-')[0]);
                }
            }
        }
예제 #2
0
 /// <summary>
 /// Stores a cleaned value on the author property that corresponds to a page label.
 /// </summary>
 /// <param name="author">Author to update.</param>
 /// <param name="field">Label text; any ':' characters are removed before it is compared.</param>
 /// <param name="value">Raw value, passed through <c>CleanData</c> before being assigned.</param>
 /// <returns>The same <paramref name="author"/> instance. Unrecognised labels leave it unchanged.</returns>
 private Author SetAuthorData(Author author, string field, string value)
 {
     string label = field.Replace(":", "");

     if (label == "Gêneros")
     {
         author.Categories = CleanData(value);
     }
     else if (label == "Nascimento")
     {
         author.BirthDate = CleanData(value);
     }
     else if (label == "Local")
     {
         author.Local = CleanData(value);
     }

     return author;
 }
예제 #3
0
        /// <summary>
        /// Downloads the author page for every id in the requested range, extracts the author's
        /// image, name, biography, labelled data, external links and book identifiers, then passes
        /// the serialized author to <c>CustomSave</c> and its image URL to <c>SaveImage</c>.
        /// </summary>
        /// <param name="info">
        /// A <c>Parameters</c> instance (passed as <see cref="object"/>) whose <c>StartId</c> and
        /// <c>EndId</c> give the inclusive id range to process.
        /// </param>
        /// <exception cref="System.InvalidCastException">
        /// Thrown when <paramref name="info"/> is not a <c>Parameters</c> instance.
        /// </exception>
        public void ParseAuthors(object info)
        {
            JavaScriptSerializer oSerializer = new JavaScriptSerializer();
            Parameters parameters = (Parameters)info;

            for (int id = parameters.StartId; id <= parameters.EndId; id++)
            {
                // Reset for every request: the status is passed by ref, so a 200 left over from the
                // previous id must never be mistaken for the result of this one.
                int httpCode = 0;
                string url = string.Format(baseUrl, "autor", id);
                string content = GetContent(url, ref httpCode);

                // Skip ids that did not return a usable page.
                if (httpCode != 200 || string.IsNullOrEmpty(content))
                {
                    continue;
                }

                HtmlDocument doc = new HtmlDocument();
                doc.LoadHtml(content);

                Author author = new Author();
                author.Id = id;

                HtmlNode image = doc.DocumentNode.SelectSingleNode("//div[preceding-sibling::div[@id='breadCrumb']]//div[position()=1]//img[position()=1]");
                author.Image = string.Empty;
                if (image != null && image.Attributes["src"] != null)
                {
                    author.Image = image.Attributes["src"].Value;
                }

                HtmlNode name = doc.DocumentNode.SelectSingleNode("//div[preceding-sibling::div[@id='breadCrumb']]//div[position()=1]//h1");
                author.Name = string.Empty;
                if (name != null)
                {
                    author.Name = name.InnerText;
                }

                HtmlNode description = doc.DocumentNode.SelectSingleNode("//div[preceding-sibling::div[@id='breadCrumb']]//div[position()=1]//div[@id='biografia']");
                author.Description = string.Empty;
                if (description != null)
                {
                    string biography = CleanData(description.InnerText);

                    // The first 10 characters are discarded (presumably a heading label - confirm
                    // against the page markup). Shorter text yields an empty description instead
                    // of an ArgumentOutOfRangeException.
                    if (biography != null && biography.Length > 10)
                    {
                        author.Description = biography.Substring(10);
                    }
                }

                HtmlNode data = doc.DocumentNode.SelectSingleNode("//div[preceding-sibling::div[@id='breadCrumb']]//div[position()=1]//div[@style='border:#000 0px solid; float:right; width:175px; color:#666666;  font-family:arial; font-size:11px; line-height:20px;']");

                // The side panel is optional; pages without it simply have no labelled data or links.
                if (data != null)
                {
                    // "<strong>Label:</strong><br>value" pairs.
                    // NOTE(review): inside the character class, ",-:" is a range (',' through ':'),
                    // so it also admits '.', '/' and digits. Left unchanged to keep matching as-is.
                    string rx = @"<strong>([\w:]+)<\/strong><br>([\w\s,-:\/]+)";

                    // Regex.Matches only yields successful matches and both patterns always expose
                    // exactly three groups, so no per-match validity check is needed.
                    foreach (Match m in Regex.Matches(data.InnerHtml, rx, RegexOptions.IgnoreCase))
                    {
                        SetAuthorData(author, m.Groups[1].Value, m.Groups[2].Value);
                    }

                    // External links shown in the same panel; only the target URL is kept.
                    rx = "<a href=\"([.\\/\\w:]+)\" class=\"l11out\" target=\"_blank\">([\\w\\/\\s.:!]+)<\\/a>";
                    foreach (Match m in Regex.Matches(data.InnerHtml, rx, RegexOptions.IgnoreCase))
                    {
                        author.Links.Add(m.Groups[1].Value);
                    }
                }

                HtmlNode bookCount = doc.DocumentNode.SelectSingleNode("//div[preceding-sibling::div[@id='breadCrumb']]//div[position()=3]//h1[@style='font-size:17px; font-family: trebuchet ms; color:#666666; float:left;']");

                // A missing heading is treated as "no count available" (0), which takes the
                // single-page branch below; that branch already copes with an empty book list.
                int count = 0;
                if (bookCount != null)
                {
                    count = SafeConvert.ToInt(bookCount.InnerText.Replace("Livros do Autor (", "").Replace(")", ""));
                }

                if (count <= 15)
                {
                    // Up to 15 books: the book links are read straight from the author page.
                    HtmlNodeCollection books = doc.DocumentNode.SelectNodes("//div[preceding-sibling::div[@id='breadCrumb']]//div[position()=3]//div[@style='border:green 0px solid; width:430px; margin: 10px  0px;']//a");

                    if (books != null)
                    {
                        foreach (HtmlNode link in books)
                        {
                            // Skip anchors that have no href rather than failing the whole author.
                            if (link.Attributes["href"] == null)
                            {
                                continue;
                            }

                            // Strip the "/livro/" prefix and keep the part before the first '-'.
                            author.Books.Add(link.Attributes["href"].Value.Replace("/livro/", "").Split('-')[0]);
                        }
                    }
                }
                else
                {
                    // More than 15 books: walk the paginated listing, starting at page 1 and
                    // following every pagination link discovered along the way.
                    List<string> visitedLinks = new List<string>();
                    List<string> linksToVisit = new List<string>();
                    string booksFromAuthorUrl = string.Format(baseUrl, "autor/livros", id + "/page:1");
                    AddBooksToAuthor(author, booksFromAuthorUrl, visitedLinks, linksToVisit);

                    // AddBooksToAuthor removes the URL it is given from the queue, so this drains.
                    while (linksToVisit.Count > 0)
                    {
                        AddBooksToAuthor(author, linksToVisit[0], visitedLinks, linksToVisit);
                    }
                }

                CustomSave("Authors", oSerializer.Serialize(author), parameters.StartId, parameters.EndId, id);
                SaveImage(author.Image, parameters.StartId, parameters.EndId, id, "Authors");
            }
        }