/// <summary>
/// Downloads every file linked from the page whose last URL path segment
/// contains one of the <c>ValidFileExtensions</c>.
/// </summary>
/// <param name="pageId">Identifier of the owning page; passed to each created <c>BinaryFile</c>.</param>
/// <param name="address">Address of the page; href values are resolved against it.</param>
/// <param name="content">Raw HTML of the page.</param>
/// <returns>
/// The files that were downloaded successfully. Never null; empty when the page
/// has no matching anchors or every download failed.
/// </returns>
private List<BinaryFile> GetFiles(Guid pageId, string address, string content)
{
    HtmlDocument document = new HtmlDocument();
    document.LoadHtml(content);
    List<BinaryFile> files = new List<BinaryFile>();

    // WebClient is IDisposable; dispose it once all downloads for this page are done.
    using (WebClient downloadClient = new WebClient())
    {
        try
        {
            // SelectNodes yields null (not an empty collection) when nothing matches,
            // so check explicitly instead of relying on a NullReferenceException.
            var anchors = document.DocumentNode.SelectNodes("//a[@href]");
            if (anchors == null)
            {
                log.Warn("No files in the document");
                return files;
            }

            foreach (var link in anchors)
            {
                var hrefAttribute = link.Attributes["href"];
                var urlObject = UrlObject.FromRelativeString(address, hrefAttribute.Value.ToString());

                // The last path segment doubles as the file name.
                var fileName = urlObject.Path.LastOrDefault();
                if (fileName == null)
                {
                    continue;
                }

                bool hasValidExtension = false;
                foreach (var extension in ValidFileExtensions)
                {
                    if (fileName.Contains("." + extension))
                    {
                        hasValidExtension = true;
                        break; // one match is enough
                    }
                }

                if (!hasValidExtension)
                {
                    continue;
                }

                string fullPath = urlObject.GetFullPath(false);
                try
                {
                    byte[] fileBytes = downloadClient.DownloadData(fullPath);
                    files.Add(new BinaryFile(pageId)
                    {
                        Url = urlObject,
                        Tag = link.OuterHtml,
                        Name = fileName,
                        Contents = new MemoryStream(fileBytes)
                    });
                    Console.WriteLine("Found files: " + fullPath);
                }
                catch (WebException wex)
                {
                    // A single failed download must not abort the rest of the page.
                    log.Warn(wex);
                }
            }
        }
        catch (Exception e)
        {
            // Best effort: any other failure ends the scan and returns what was collected so far.
            log.Warn("No files in the document", e);
        }
    }

    return files;
}
/// <summary>
/// Downloads every image referenced by an <c>img</c> tag whose last URL path
/// segment contains one of the <c>ValidImgExtensions</c>.
/// </summary>
/// <param name="pageId">Identifier of the owning page; passed to each created <c>BinaryFile</c>.</param>
/// <param name="address">Address of the page; src values are resolved against it.</param>
/// <param name="content">Raw HTML of the page.</param>
/// <returns>
/// The images that were downloaded successfully. Never null; empty when the page
/// has no matching image tags or every download failed.
/// </returns>
private List<BinaryFile> GetImages(Guid pageId, string address, string content)
{
    HtmlDocument document = new HtmlDocument();
    document.LoadHtml(content);
    List<BinaryFile> images = new List<BinaryFile>();

    // WebClient is IDisposable; dispose it once all downloads for this page are done.
    using (WebClient downloadClient = new WebClient())
    {
        try
        {
            // SelectNodes yields null (not an empty collection) when nothing matches,
            // so check explicitly instead of relying on a NullReferenceException.
            var imageNodes = document.DocumentNode.SelectNodes("//img[@src]");
            if (imageNodes == null)
            {
                log.Warn("No image tags present in document");
                return images;
            }

            foreach (HtmlNode image in imageNodes)
            {
                HtmlAttribute imgSrcAttribute = image.Attributes["src"];
                UrlObject urlObject = UrlObject.FromRelativeString(address, imgSrcAttribute.Value.ToString());

                // The last path segment doubles as the image name.
                var imageName = urlObject.Path.LastOrDefault();
                if (imageName == null)
                {
                    continue;
                }

                bool hasValidExtension = false;
                foreach (var extension in ValidImgExtensions)
                {
                    if (imageName.Contains("." + extension))
                    {
                        hasValidExtension = true;
                        break; // one match is enough
                    }
                }

                if (!hasValidExtension)
                {
                    continue;
                }

                string fullPath = urlObject.GetFullPath(false);
                try
                {
                    byte[] fileBytes = downloadClient.DownloadData(fullPath);
                    images.Add(new BinaryFile(pageId)
                    {
                        Url = urlObject,
                        Tag = image.OuterHtml,
                        Name = imageName,
                        Contents = new MemoryStream(fileBytes),
                    });
                    Console.WriteLine("Found image: " + fullPath);
                }
                catch (WebException wex)
                {
                    // A single failed download must not abort the rest of the page.
                    log.Warn(wex);
                }
            }
        }
        catch (Exception e)
        {
            // Best effort: any other failure ends the scan and returns what was collected so far.
            log.Warn("No image tags present in document", e);
        }
    }

    return images;
}
/// <summary>
/// Two <c>UrlObject</c> instances are equal when <c>GetFullPath(false)</c>
/// returns the same string for both.
/// </summary>
/// <param name="o1">Object to compare with this instance; may be null or of any type.</param>
/// <returns>
/// <c>true</c> if <paramref name="o1"/> is a <c>UrlObject</c> with the same full path;
/// <c>false</c> for null, for objects of another type, or for a different path.
/// </returns>
/// <remarks>
/// NOTE(review): no matching <c>GetHashCode</c> override is visible from here —
/// confirm one exists and is based on the same full path.
/// </remarks>
public override bool Equals(Object o1)
{
    // 'as' gives null for both a null argument and a foreign type, so neither case
    // throws (the previous hard cast raised InvalidCastException for foreign types).
    UrlObject other = o1 as UrlObject;

    // ReferenceEquals avoids going through any user-defined == operator on UrlObject.
    if (object.ReferenceEquals(other, null))
    {
        return false;
    }

    return other.GetFullPath(false) == this.GetFullPath(false);
}
/// <summary>
/// Fetches the page at <paramref name="address"/> and assembles a <c>Page</c>
/// from its content, title, links and (depending on the job type) its images and files.
/// </summary>
/// <param name="address">Location of the page to fetch.</param>
/// <returns>
/// The assembled page, or <c>null</c> when any step throws (the exception is logged as a warning).
/// </returns>
private Page GetPage(UrlObject address)
{
    try
    {
        string fullPath = address.GetFullPath(false);
        string pageContent = StringFromAddress(fullPath);
        string title = GetTitle(pageContent);
        Guid pageId = Guid.NewGuid();

        List<LinkTag> links = GetLinks(pageId, fullPath, pageContent);

        // PAGE_ONLY and PING_ONLY jobs do not download images or files.
        bool skipBinaries = this.JobType == SpiderJobType.PAGE_ONLY
                            || this.JobType == SpiderJobType.PING_ONLY;

        List<BinaryFile> images = skipBinaries
            ? new List<BinaryFile>()
            : GetImages(pageId, fullPath, pageContent);

        List<BinaryFile> files = skipBinaries
            ? new List<BinaryFile>()
            : GetFiles(pageId, fullPath, pageContent);

        Page page = new Page
        {
            Content = pageContent,
            Name = title,
            FileTags = files,
            ImageTags = images,
            LinkTags = links,
            Link = address,
            PageId = pageId
        };
        return page;
    }
    catch (Exception wex)
    {
        log.Warn(wex);
        return null;
    }
}
/// <summary>
/// Collects the anchors of a page that point at other pages, i.e. whose last URL
/// path segment does not contain any of the <c>ValidFileExtensions</c>.
/// </summary>
/// <param name="pageId">Identifier of the owning page; passed to each created <c>LinkTag</c>.</param>
/// <param name="address">Address of the page; href values are resolved against it.</param>
/// <param name="content">Raw HTML of the page.</param>
/// <returns>The link tags found. Never null; empty when the page has no usable anchors.</returns>
private List<LinkTag> GetLinks(Guid pageId, string address, string content)
{
    var document = new HtmlDocument();
    document.LoadHtml(content);
    var tags = new List<LinkTag>();

    try
    {
        // SelectNodes yields null (not an empty collection) when nothing matches,
        // so check explicitly instead of relying on a NullReferenceException.
        var anchors = document.DocumentNode.SelectNodes("//a[@href]");
        if (anchors == null)
        {
            log.Warn("No links available in the document");
            return tags;
        }

        foreach (var link in anchors)
        {
            var hrefAttribute = link.Attributes["href"];
            var urlObject = UrlObject.FromRelativeString(address, hrefAttribute.Value.ToString());

            var lastSegment = urlObject.Path.LastOrDefault();
            if (lastSegment == null)
            {
                continue;
            }

            // Anchors that point at a file extension are skipped here.
            var pointsToFile = false;
            foreach (var extension in ValidFileExtensions)
            {
                if (lastSegment.Contains("." + extension))
                {
                    pointsToFile = true;
                    break; // one match is enough
                }
            }

            if (pointsToFile)
            {
                continue;
            }

            tags.Add(new LinkTag(pageId) { Tag = link.OuterHtml, Url = urlObject });
            Console.WriteLine("Found link: " + urlObject.GetFullPath(false));
        }
    }
    catch (Exception e)
    {
        // Best effort: any failure ends the scan and returns what was collected so far.
        log.Warn("No links available in the document", e);
    }

    return tags;
}
/// <summary>
/// Guards every read and write of <c>Unvisited</c> and <c>Visited</c> made from the
/// parallel workers started by <see cref="Start"/>.
/// </summary>
private readonly object crawlStateLock = new object();

/// <summary>
/// Runs the crawl: seeds <c>Unvisited</c> with <c>Frontier</c> when it is empty, then
/// processes the unvisited URLs in parallel batches until none remain or
/// <c>Visited</c> holds <c>MaxAllowedPages</c> entries.
/// </summary>
/// <remarks>
/// Each batch works on a snapshot of <c>Unvisited</c>, because the workers add and
/// remove entries while the batch is running. Page downloads run concurrently;
/// bookkeeping and persistence are serialized under a lock.
/// </remarks>
public void Start()
{
    // NOTE(review): this accepts every TLS certificate for the whole process, not just
    // for this crawler. Kept because the crawl relies on it — confirm that is intended.
    ServicePointManager.ServerCertificateValidationCallback = (obj, certificate, chain, errors) => (true);

    var starter = UrlObject.FromString(Frontier);
    if (!Unvisited.Any())
    {
        Unvisited.Add(starter.GetFullPath(false), starter);
    }

    while (Unvisited.Any() && Visited.Count < MaxAllowedPages)
    {
        // Snapshot: enumerating Unvisited directly while workers mutate it is unsafe.
        var batch = Unvisited.ToList();

        Parallel.ForEach(batch, (urlPair) =>
        {
            try
            {
                try
                {
                    // Network I/O stays outside the lock so downloads run in parallel.
                    var p = PageFromUrl(urlPair.Value);

                    lock (crawlStateLock)
                    {
                        if (p == null)
                        {
                            // The download failed. Drop the URL, otherwise it stays in
                            // Unvisited forever and the outer loop never terminates.
                            Unvisited.Remove(urlPair.Key);
                        }
                        else
                        {
                            ProcessNewPaths(p, urlPair.Value);
                        }
                    }
                }
                catch (ArgumentOutOfRangeException)
                {
                    // Deliberately ignored so the persistence step below still runs.
                    // NOTE(review): presumably raised while handling malformed URLs — confirm.
                }

                lock (crawlStateLock)
                {
                    // Materialize before iterating; pages are marked processed inside the loop.
                    var unprocessed = Visited.Where(x => x.Value.Processed == false).ToList();
                    foreach (var page in unprocessed)
                    {
                        if (this.JobType == SpiderJobType.PAGE_ONLY)
                        {
                            page.Value.LinkTags = new List<LinkTag>();
                        }

                        PersistenceInserter.PersistData(page.Value);
                        page.Value.Processed = true;
                    }
                }
            }
            catch (ArgumentException)
            {
                // Deliberately ignored (best effort), as in the original implementation.
            }
            catch (Exception e)
            {
                Console.WriteLine(e);
            }
        });
    }
}
/// <summary>
/// Records <paramref name="p"/> as visited and queues each of its links that is
/// neither queued nor visited yet and whose domain matches <paramref name="domainObject"/>.
/// </summary>
/// <param name="p">The page that was just fetched; nothing happens when it is null.</param>
/// <param name="domainObject">URL whose domain the crawl is limited to; nothing happens when it is null.</param>
/// <remarks>This method does no locking of its own around <c>Unvisited</c> and <c>Visited</c>.</remarks>
public void ProcessNewPaths(Page p, UrlObject domainObject)
{
    if (p == null || domainObject == null)
    {
        return;
    }

    string pagePath = p.Link.GetFullPath(false);
    Console.WriteLine("Visited: " + pagePath);

    Unvisited.Remove(pagePath);
    if (!Visited.ContainsKey(pagePath))
    {
        Visited.Add(pagePath, p);
    }

    foreach (LinkTag l in p.LinkTags)
    {
        string linkPath = l.Url.GetFullPath(false);

        // Membership tests via ContainsKey rather than catching KeyNotFoundException:
        // most links on a page are new, so the exception path was the common case.
        if (Unvisited.ContainsKey(linkPath) || Visited.ContainsKey(linkPath))
        {
            continue;
        }

        if (l.Url.GetDomain() == domainObject.GetDomain())
        {
            Unvisited.Add(linkPath, l.Url);
        }
    }
}
/// <summary>
/// Public entry point for fetching a page; delegates to <c>GetPage</c>.
/// </summary>
/// <param name="address">Location of the page to fetch.</param>
/// <returns>Whatever <c>GetPage</c> returns for <paramref name="address"/>.</returns>
public Page PageFromUrl(UrlObject address)
{
    Page page = GetPage(address);
    return page;
}
/// <summary>
/// Downloads the page at <paramref name="address"/> and builds a <c>Page</c> holding
/// its content, title and links, plus its images and files unless the job type is
/// <c>PAGE_ONLY</c> or <c>PING_ONLY</c>.
/// </summary>
/// <param name="address">Location of the page to download.</param>
/// <returns>
/// The populated page, or <c>null</c> if anything throws along the way
/// (the exception is logged as a warning).
/// </returns>
private Page GetPage(UrlObject address)
{
    try
    {
        string fullPath = address.GetFullPath(false);
        string pageContent = StringFromAddress(fullPath);
        string title = GetTitle(pageContent);
        Guid pageId = Guid.NewGuid();
        List<LinkTag> links = GetLinks(pageId, fullPath, pageContent);

        // Start with empty lists; only jobs that want binaries replace them.
        List<BinaryFile> images = new List<BinaryFile>();
        List<BinaryFile> files = new List<BinaryFile>();

        bool wantsBinaries = !(this.JobType == SpiderJobType.PAGE_ONLY
                               || this.JobType == SpiderJobType.PING_ONLY);
        if (wantsBinaries)
        {
            images = GetImages(pageId, fullPath, pageContent);
            files = GetFiles(pageId, fullPath, pageContent);
        }

        return new Page
        {
            Content = pageContent,
            Name = title,
            FileTags = files,
            ImageTags = images,
            LinkTags = links,
            Link = address,
            PageId = pageId
        };
    }
    catch (Exception wex)
    {
        log.Warn(wex);
        return null;
    }
}
/// <summary>
/// Moves the fetched page from <c>Unvisited</c> to <c>Visited</c> and enqueues those of
/// its links that are not yet known and share the domain of <paramref name="domainObject"/>.
/// </summary>
/// <param name="p">The fetched page; the call is a no-op when it is null.</param>
/// <param name="domainObject">URL that defines the allowed domain; the call is a no-op when it is null.</param>
public void ProcessNewPaths(Page p, UrlObject domainObject)
{
    if (p != null && domainObject != null)
    {
        // Compute the key once; it is used for logging and both dictionaries.
        string visitedKey = p.Link.GetFullPath(false);

        Console.WriteLine("Visited: " + visitedKey);
        Unvisited.Remove(visitedKey);
        if (!Visited.ContainsKey(visitedKey))
        {
            Visited.Add(visitedKey, p);
        }

        foreach (LinkTag l in p.LinkTags)
        {
            string candidateKey = l.Url.GetFullPath(false);

            // ContainsKey replaces the former indexer + catch (KeyNotFoundException)
            // probes, which threw an exception for every new link.
            bool alreadyKnown = Unvisited.ContainsKey(candidateKey)
                                || Visited.ContainsKey(candidateKey);

            if (!alreadyKnown && l.Url.GetDomain() == domainObject.GetDomain())
            {
                Unvisited.Add(candidateKey, l.Url);
            }
        }
    }
}
/// <summary>
/// Fetches the page at the given address by forwarding the call to <c>GetPage</c>.
/// </summary>
/// <param name="address">Location of the page to fetch.</param>
/// <returns>The result of <c>GetPage</c> for <paramref name="address"/>, unchanged.</returns>
public Page PageFromUrl(UrlObject address)
{
    Page fetched = this.GetPage(address);
    return fetched;
}