/// <summary>
/// Downloads the page referenced by <paramref name="result"/> and fills a new
/// <see cref="Post"/> with the user name, user mail link, property table and
/// description scraped from it.
/// </summary>
/// <remarks>
/// Nodes are located by fixed child positions below the <c>.lbcContainer</c> element.
/// HtmlAgilityPack exposes text between elements as <c>#text</c> nodes, so
/// <c>ChildNodes[0]</c> is a text node and the elements of interest sit at odd indices.
/// NOTE(review): if the page layout differs from the expected one, the positional
/// indexers will presumably fail with an out-of-range or null-reference error rather
/// than one of the explicit exceptions below - confirm against HtmlAgilityPack.
/// </remarks>
/// <param name="result">Search result whose <c>Link</c> is fetched and whose <c>Title</c> is used for debug output.</param>
/// <returns>The populated post.</returns>
/// <exception cref="Exception">
/// The <c>ContainerMain</c> element, the <c>.content-border</c> node or the
/// <c>.lbcContainer</c> node could not be found.
/// </exception>
public Post GetSinglePost(SearchResult result)
{
    Post post = new Post(result);

    var document = RetrieveDocumentAsync(result.Link);

    // GetElementbyId returns null when the id is absent; fail with a clear message
    // like the other lookups instead of a NullReferenceException on the next line.
    var body = document.GetElementbyId("ContainerMain");
    if (body == null)
    {
        throw new Exception("Couldn't find any element with the ContainerMain id");
    }

    var content = body.ChildNodes.FirstOrDefault(node =>
        node.Attributes.Contains("class") &&
        node.Attributes["class"].Value.Contains("content-border"));
    if (content == null)
    {
        throw new Exception("Couldn't find any div with a .content-border class");
    }

    // HAP parses text as a #text node. ChildNodes[0] is always a #text node.
    // The next one is the one we're interested in.
    content = content.ChildNodes[1].ChildNodes.FirstOrDefault(node =>
        node.Attributes.Contains("class") &&
        node.Attributes["class"].Value.Contains("lbcContainer"));
    if (content == null)
    {
        throw new Exception("Couldn't find .lbcContainer");
    }

    // -- User info: the same node carries both the display name and the mail link.
    var userLink = content.ChildNodes[1].ChildNodes[3].ChildNodes[1];
    post.UserName = StripLBCSpaces(userLink.InnerText);
    post.UserMail = StripLBCSpaces(userLink.Attributes["href"].Value);

    // -- Images
    // If there is no image, there will be an empty div in place of the image container.
    // However, .print-lbcImages won't be there, so the property table sits one element
    // earlier (index 3 instead of 5).
    var details = content.ChildNodes[5];
    int currentChild;
    if (String.IsNullOrWhiteSpace(StripLBCSpaces(details.ChildNodes[1].InnerHtml)))
    {
        currentChild = 3;
        Debug.WriteLine("No images on node :" + result.Title);
    }
    else
    {
        currentChild = 5;
    }

    // -- Properties: every <tr> contributes one (label, value) pair.
    var infos = details.ChildNodes[currentChild].ChildNodes[1];
    foreach (var node in infos.ChildNodes)
    {
        if (node.Name.Equals("tr"))
        {
            post.Properties.Add(new Tuple<string, string>(
                StripLBCSpaces(node.ChildNodes[1].InnerText),
                StripLBCSpaces(node.ChildNodes[3].InnerText)));
        }
    }

    // -- Description: the element right after the property table (skipping the #text node).
    currentChild += 2;
    post.Description = StripLBCSpaces(details.ChildNodes[currentChild].ChildNodes[3].InnerText);

    return post;
}
/// <summary>
/// Downloads the resource at <paramref name="uri"/> and parses it into an
/// <see cref="HtmlDocument"/>.
/// </summary>
/// <remarks>
/// Despite the <c>Async</c> suffix this method is synchronous: it blocks until the
/// whole response body has been read. The name is kept for existing callers.
/// </remarks>
/// <param name="uri">Address of the page to download.</param>
/// <returns>The parsed document, loaded with <c>OptionAutoCloseOnEnd</c> enabled.</returns>
private HtmlDocument RetrieveDocumentAsync(string uri)
{
    Debug.WriteLine("Retrieving " + uri + "...");

    var connection = (HttpWebRequest)WebRequest.Create(uri);

    HtmlDocument doc = new HtmlDocument();
    doc.OptionAutoCloseOnEnd = true;

    // Dispose the response as well as the reader so the connection is released
    // even if reading or parsing throws.
    using (var response = connection.GetResponse())
    using (StreamReader sr = new StreamReader(response.GetResponseStream()))
    {
        string content = sr.ReadToEnd();
        doc.LoadHtml(content);
    }

    Debug.WriteLine("Done.");
    return doc;
}
/// <summary>
/// Downloads the page referenced by <paramref name="result"/> and fills a new
/// <see cref="Post"/> with the user name, user mail link, property table and
/// description scraped from it.
/// </summary>
/// <remarks>
/// Nodes are located by fixed child positions below the <c>.lbcContainer</c> element.
/// HtmlAgilityPack exposes text between elements as <c>#text</c> nodes, so
/// <c>ChildNodes[0]</c> is a text node and the elements of interest sit at odd indices.
/// NOTE(review): if the page layout differs from the expected one, the positional
/// indexers will presumably fail with an out-of-range or null-reference error rather
/// than one of the explicit exceptions below - confirm against HtmlAgilityPack.
/// </remarks>
/// <param name="result">Search result whose <c>Link</c> is fetched and whose <c>Title</c> is used for debug output.</param>
/// <returns>The populated post.</returns>
/// <exception cref="Exception">
/// The <c>ContainerMain</c> element, the <c>.content-border</c> node or the
/// <c>.lbcContainer</c> node could not be found.
/// </exception>
public Post GetSinglePost(SearchResult result)
{
    Post post = new Post(result);

    var document = RetrieveDocumentAsync(result.Link);

    // GetElementbyId returns null when the id is absent; fail with a clear message
    // like the other lookups instead of a NullReferenceException on the next line.
    var body = document.GetElementbyId("ContainerMain");
    if (body == null)
    {
        throw new Exception("Couldn't find any element with the ContainerMain id");
    }

    var content = body.ChildNodes.FirstOrDefault(node =>
        node.Attributes.Contains("class") &&
        node.Attributes["class"].Value.Contains("content-border"));
    if (content == null)
    {
        throw new Exception("Couldn't find any div with a .content-border class");
    }

    // HAP parses text as a #text node. ChildNodes[0] is always a #text node.
    // The next one is the one we're interested in.
    content = content.ChildNodes[1].ChildNodes.FirstOrDefault(node =>
        node.Attributes.Contains("class") &&
        node.Attributes["class"].Value.Contains("lbcContainer"));
    if (content == null)
    {
        throw new Exception("Couldn't find .lbcContainer");
    }

    // -- User info: the same node carries both the display name and the mail link.
    var userLink = content.ChildNodes[1].ChildNodes[3].ChildNodes[1];
    post.UserName = StripLBCSpaces(userLink.InnerText);
    post.UserMail = StripLBCSpaces(userLink.Attributes["href"].Value);

    // -- Images
    // If there is no image, there will be an empty div in place of the image container.
    // However, .print-lbcImages won't be there, so the property table sits one element
    // earlier (index 3 instead of 5).
    var details = content.ChildNodes[5];
    int currentChild;
    if (String.IsNullOrWhiteSpace(StripLBCSpaces(details.ChildNodes[1].InnerHtml)))
    {
        currentChild = 3;
        Debug.WriteLine("No images on node :" + result.Title);
    }
    else
    {
        currentChild = 5;
    }

    // -- Properties: every <tr> contributes one (label, value) pair.
    var infos = details.ChildNodes[currentChild].ChildNodes[1];
    foreach (var node in infos.ChildNodes)
    {
        if (node.Name.Equals("tr"))
        {
            post.Properties.Add(new Tuple<string, string>(
                StripLBCSpaces(node.ChildNodes[1].InnerText),
                StripLBCSpaces(node.ChildNodes[3].InnerText)));
        }
    }

    // -- Description: the element right after the property table (skipping the #text node).
    currentChild += 2;
    post.Description = StripLBCSpaces(details.ChildNodes[currentChild].ChildNodes[3].InnerText);

    return post;
}