/// <summary>
/// Scans the HTML in <paramref name="content"/> for anchors with an href whose last
/// path segment contains one of <c>ValidFileExtensions</c>, downloads each match and
/// returns the downloaded files.
/// </summary>
/// <param name="pageId">Identifier passed to every <c>BinaryFile</c> that is created.</param>
/// <param name="address">Address of the page; used to resolve relative hrefs.</param>
/// <param name="content">HTML markup of the page.</param>
/// <returns>
/// The files that were downloaded successfully. Empty when the page has no matching
/// links or when extraction fails; failures are logged as warnings, not thrown.
/// </returns>
private List<BinaryFile> GetFiles(Guid pageId, string address, string content)
{
    HtmlDocument document = new HtmlDocument();
    document.LoadHtml(content);

    List<BinaryFile> files = new List<BinaryFile>();

    try
    {
        // SelectNodes yields null (not an empty collection) when nothing matches,
        // so check explicitly instead of relying on a swallowed NullReferenceException.
        var links = document.DocumentNode.SelectNodes("//a[@href]");
        if (links == null)
        {
            log.Warn("No files in the document");
            return files;
        }

        // WebClient is disposable; release it once all downloads are done.
        using (WebClient downloadClient = new WebClient())
        {
            foreach (var link in links)
            {
                var hrefAttribute = link.Attributes["href"];
                var urlObject = UrlObject.FromRelativeString(address, hrefAttribute.Value);

                var fileName = urlObject.Path.LastOrDefault();
                if (fileName == null)
                {
                    continue;
                }

                // NOTE(review): substring match is kept from the original; it also accepts
                // names such as "a.pdf.html" and is case-sensitive - confirm this is intended.
                bool hasValidExtension = false;
                foreach (var extension in ValidFileExtensions)
                {
                    if (fileName.Contains("." + extension))
                    {
                        hasValidExtension = true;
                        break;
                    }
                }

                if (!hasValidExtension)
                {
                    continue;
                }

                var fullPath = urlObject.GetFullPath(false);
                try
                {
                    byte[] fileBytes = downloadClient.DownloadData(fullPath);
                    files.Add(new BinaryFile(pageId)
                    {
                        Url = urlObject,
                        Tag = link.OuterHtml,
                        Name = fileName,
                        Contents = new MemoryStream(fileBytes)
                    });
                    Console.WriteLine("Found files: " + fullPath);
                }
                catch (WebException wex)
                {
                    // A single failed download must not abort the remaining links.
                    log.Warn(wex);
                }
            }
        }
    }
    catch (Exception e)
    {
        // Best effort: keep whatever was collected so far, but report the real failure.
        log.Warn("Failed to extract files from the document", e);
    }

    return files;
}
/// <summary>
/// Scans the HTML in <paramref name="content"/> for img tags with a src whose last
/// path segment contains one of <c>ValidImgExtensions</c>, downloads each match and
/// returns the downloaded images.
/// </summary>
/// <param name="pageId">Identifier passed to every <c>BinaryFile</c> that is created.</param>
/// <param name="address">Address of the page; used to resolve relative image sources.</param>
/// <param name="content">HTML markup of the page.</param>
/// <returns>
/// The images that were downloaded successfully. Empty when the page has no matching
/// img tags or when extraction fails; failures are logged as warnings, not thrown.
/// </returns>
private List<BinaryFile> GetImages(Guid pageId, string address, string content)
{
    HtmlDocument document = new HtmlDocument();
    document.LoadHtml(content);

    List<BinaryFile> images = new List<BinaryFile>();

    try
    {
        // SelectNodes yields null (not an empty collection) when nothing matches,
        // so check explicitly instead of relying on a swallowed NullReferenceException.
        var imageNodes = document.DocumentNode.SelectNodes("//img[@src]");
        if (imageNodes == null)
        {
            log.Warn("No image tags present in document");
            return images;
        }

        // WebClient is disposable; release it once all downloads are done.
        using (WebClient downloadClient = new WebClient())
        {
            foreach (HtmlNode image in imageNodes)
            {
                HtmlAttribute imgSrcAttribute = image.Attributes["src"];
                UrlObject urlObject = UrlObject.FromRelativeString(address, imgSrcAttribute.Value);

                var fileName = urlObject.Path.LastOrDefault();
                if (fileName == null)
                {
                    continue;
                }

                // NOTE(review): substring match is kept from the original; it also accepts
                // names such as "a.png.html" and is case-sensitive - confirm this is intended.
                bool hasValidExtension = false;
                foreach (var extension in ValidImgExtensions)
                {
                    if (fileName.Contains("." + extension))
                    {
                        hasValidExtension = true;
                        break;
                    }
                }

                if (!hasValidExtension)
                {
                    continue;
                }

                var fullPath = urlObject.GetFullPath(false);
                try
                {
                    byte[] fileBytes = downloadClient.DownloadData(fullPath);
                    images.Add(new BinaryFile(pageId)
                    {
                        Url = urlObject,
                        Tag = image.OuterHtml,
                        Name = fileName,
                        Contents = new MemoryStream(fileBytes),
                    });
                    Console.WriteLine("Found image: " + fullPath);
                }
                catch (WebException wex)
                {
                    // A single failed download must not abort the remaining images.
                    log.Warn(wex);
                }
            }
        }
    }
    catch (Exception e)
    {
        // Best effort: keep whatever was collected so far, but report the real failure.
        log.Warn("Failed to extract images from the document", e);
    }

    return images;
}
/// <summary>
/// Collects the anchors in <paramref name="content"/> that point to pages rather than
/// files: every href whose last path segment does not contain one of
/// <c>ValidFileExtensions</c> becomes a <c>LinkTag</c>.
/// </summary>
/// <param name="pageId">Identifier passed to every <c>LinkTag</c> that is created.</param>
/// <param name="address">Address of the page; used to resolve relative hrefs.</param>
/// <param name="content">HTML markup of the page.</param>
/// <returns>
/// The link tags found. Empty when the page has no anchors with an href or when
/// extraction fails; failures are logged as warnings, not thrown.
/// </returns>
private List<LinkTag> GetLinks(Guid pageId, string address, string content)
{
    // Attempt to parse the document.
    var document = new HtmlDocument();
    document.LoadHtml(content);

    var tags = new List<LinkTag>();

    try
    {
        // SelectNodes yields null (not an empty collection) when nothing matches,
        // so check explicitly instead of relying on a swallowed NullReferenceException.
        var links = document.DocumentNode.SelectNodes("//a[@href]");
        if (links == null)
        {
            log.Warn("No links available in the document");
            return tags;
        }

        foreach (var link in links)
        {
            var hrefAttribute = link.Attributes["href"];
            var urlObject = UrlObject.FromRelativeString(address, hrefAttribute.Value);

            var lastSegment = urlObject.Path.LastOrDefault();
            if (lastSegment == null)
            {
                continue;
            }

            // Links that look like downloadable files are skipped here.
            // NOTE(review): substring match is kept from the original; it is
            // case-sensitive and also matches names such as "a.pdf.html".
            var pointsToFile = false;
            foreach (var extension in ValidFileExtensions)
            {
                if (lastSegment.Contains("." + extension))
                {
                    pointsToFile = true;
                    break;
                }
            }

            if (pointsToFile)
            {
                continue;
            }

            tags.Add(new LinkTag(pageId)
            {
                Tag = link.OuterHtml,
                Url = urlObject
            });
            Console.WriteLine("Found link: " + urlObject.GetFullPath(false));
        }
    }
    catch (Exception e)
    {
        // Best effort: keep whatever was collected so far, but report the real failure.
        log.Warn("Failed to extract links from the document", e);
    }

    return tags;
}