/// <summary>
/// Resolves every &lt;img src&gt; in <paramref name="content"/> against
/// <paramref name="address"/> and downloads those whose last path segment
/// mentions one of the extensions in ValidImgExtensions.
/// </summary>
/// <param name="pageId">Id of the page the images belong to.</param>
/// <param name="address">Absolute address of the page, used to resolve relative src values.</param>
/// <param name="content">Raw HTML of the page.</param>
/// <returns>The downloaded images; empty when no matching img tags exist.</returns>
private List<BinaryFile> GetImages(Guid pageId, string address, string content)
{
    HtmlDocument document = new HtmlDocument();
    document.LoadHtml(content);

    List<BinaryFile> images = new List<BinaryFile>();

    // SelectNodes returns null (not an empty collection) when nothing matches.
    // Guard explicitly instead of letting a NullReferenceException fall into a
    // blanket catch, as the old code did.
    HtmlNodeCollection imageNodes = document.DocumentNode.SelectNodes("//img[@src]");
    if (imageNodes == null)
    {
        return images;
    }

    // WebClient is IDisposable; the old code leaked it.
    using (WebClient downloadClient = new WebClient())
    {
        try
        {
            foreach (HtmlNode image in imageNodes)
            {
                HtmlAttribute imgSrcAttribute = image.Attributes["src"];
                // Attribute.Value is already a string; no ToString() needed.
                UrlObject urlObject = UrlObject.FromRelativeString(address, imgSrcAttribute.Value);

                string lastSegment = urlObject.Path.LastOrDefault();
                if (lastSegment == null)
                {
                    continue;
                }

                // Accept only URLs whose last path segment mentions a known image
                // extension; Any() short-circuits where the old loop kept scanning.
                bool hasValidExtension =
                    ValidImgExtensions.Any(extension => lastSegment.Contains("." + extension));
                if (!hasValidExtension)
                {
                    continue;
                }

                try
                {
                    byte[] fileBytes = downloadClient.DownloadData(urlObject.GetFullPath(false));
                    images.Add(new BinaryFile(pageId)
                    {
                        Url = urlObject,
                        Tag = image.OuterHtml,
                        Name = lastSegment,
                        Contents = new MemoryStream(fileBytes),
                    });
                    Console.WriteLine("Found image: " + urlObject.GetFullPath(false));
                }
                catch (WebException wex)
                {
                    // One failed download must not abort the rest of the page's images.
                    log.Warn(wex);
                }
            }
        }
        catch (Exception e)
        {
            // Best-effort: unexpected parse/resolve failures are logged, and
            // whatever was collected so far is still returned.
            log.Warn(e);
        }
    }

    return images;
}
/// <summary>
/// Two instances are considered equal when their full paths
/// (via GetFullPath(false)) compare equal.
/// </summary>
/// <param name="o1">Object to compare against this instance.</param>
/// <returns>true when <paramref name="o1"/> is a UrlObject with the same full path.</returns>
/// <remarks>
/// NOTE(review): Equals is overridden here — confirm GetHashCode is overridden
/// consistently elsewhere in this class (compiler warning CS0659 otherwise).
/// </remarks>
public override bool Equals(Object o1)
{
    // Pattern match instead of a hard cast: the previous code threw
    // InvalidCastException for non-UrlObject arguments, where the Equals
    // contract requires returning false.
    if (o1 is UrlObject other)
    {
        return other.GetFullPath(false) == this.GetFullPath(false);
    }

    return false;
}
/// <summary>
/// Downloads the page at <paramref name="address"/> and assembles a Page from
/// its title, link tags, image tags and file tags.
/// </summary>
/// <param name="address">Absolute URL of the page to fetch.</param>
/// <returns>The populated Page, or null when downloading or parsing fails.</returns>
private Page GetPage(UrlObject address)
{
    try
    {
        string fullPath = address.GetFullPath(false);
        string pageContent = StringFromAddress(fullPath); // Client.DownloadString yada yada yada...
        string title = GetTitle(pageContent);
        var pageId = Guid.NewGuid();

        List<LinkTag> links = GetLinks(pageId, fullPath, pageContent);

        // PAGE_ONLY / PING_ONLY jobs skip binary downloads; compute the shared
        // condition once instead of duplicating it (as the old code did).
        bool skipBinaries = this.JobType == SpiderJobType.PAGE_ONLY
                            || this.JobType == SpiderJobType.PING_ONLY;

        List<BinaryFile> images = skipBinaries
            ? new List<BinaryFile>()
            : GetImages(pageId, fullPath, pageContent);

        List<BinaryFile> files = skipBinaries
            ? new List<BinaryFile>()
            : GetFiles(pageId, fullPath, pageContent);

        return new Page
        {
            Content = pageContent,
            Name = title,
            FileTags = files,
            ImageTags = images,
            LinkTags = links,
            Link = address,
            PageId = pageId
        };
    }
    catch (Exception wex)
    {
        // Best-effort: a failing page is logged and reported as null so the
        // caller can continue with the rest of the crawl.
        log.Warn(wex);
        return null;
    }
}
/// <summary>
/// Fetches the document behind <paramref name="address"/> and builds a Page
/// carrying its content, title, links, images and files.
/// </summary>
/// <param name="address">Absolute URL of the page to fetch.</param>
/// <returns>A populated Page, or null if any step throws.</returns>
private Page GetPage(UrlObject address)
{
    try
    {
        var url = address.GetFullPath(false);
        var html = StringFromAddress(url); // downloads the page body
        var pageTitle = GetTitle(html);
        var id = Guid.NewGuid();

        List<LinkTag> linkTags = GetLinks(id, url, html);

        // PAGE_ONLY / PING_ONLY jobs do not fetch binary content.
        List<BinaryFile> imageTags;
        if (this.JobType == SpiderJobType.PAGE_ONLY || this.JobType == SpiderJobType.PING_ONLY)
        {
            imageTags = new List<BinaryFile>();
        }
        else
        {
            imageTags = GetImages(id, url, html);
        }

        List<BinaryFile> fileTags;
        if (this.JobType == SpiderJobType.PAGE_ONLY || this.JobType == SpiderJobType.PING_ONLY)
        {
            fileTags = new List<BinaryFile>();
        }
        else
        {
            fileTags = GetFiles(id, url, html);
        }

        return new Page
        {
            Content = html,
            Name = pageTitle,
            FileTags = fileTags,
            ImageTags = imageTags,
            LinkTags = linkTags,
            Link = address,
            PageId = id
        };
    }
    catch (Exception wex)
    {
        // Failures are logged and surfaced as a null Page.
        log.Warn(wex);
        return null;
    }
}