/// <summary>
/// Collects the page links referenced by anchor elements of the loaded document.
/// </summary>
/// <returns>
/// The set of URLs taken from the <c>href</c> attribute of every <c>a</c> element, made absolute
/// through <c>Processor.GetAbsoluteLink</c>, kept only when they match <c>PageQuery.PageRegex</c>
/// and <c>HtmlProcessor.IsDownloadable</c> accepts them, and finally passed through
/// <c>HtmlProcessor.SimplifyUrl</c>. Empty when the document has no matching anchors.
/// </returns>
/// <exception cref="InvalidOperationException">
/// <c>Processor</c>, <c>Data</c> or <c>PageQuery</c> is <c>null</c>.
/// </exception>
public ISet<string> ScrapeLinks()
{
    bool isReady = this.Processor != null && this.Data != null && this.PageQuery != null;
    if (!isReady)
    {
        throw new InvalidOperationException();
    }

    var collected = new HashSet<string>();
    Regex pagePattern = new Regex(this.PageQuery.PageRegex);

    var anchors = this.Processor.Document.DocumentNode.SelectNodes("//a[@href]");
    if (anchors == null)
    {
        // The selection yields null (not an empty collection) when nothing matches.
        return collected;
    }

    foreach (var anchor in anchors)
    {
        // The href is entity-decoded before being resolved against the page URL.
        var rawHref = anchor.Attributes["href"].DeEntitizeValue;
        var target = this.Processor.GetAbsoluteLink(rawHref);

        if (!pagePattern.IsMatch(target))
        {
            continue;
        }

        if (!HtmlProcessor.IsDownloadable(target))
        {
            continue;
        }

        collected.Add(HtmlProcessor.SimplifyUrl(target));
    }

    return collected;
}
/// <summary>
/// Loads the input document and rewrites its resource references.
/// </summary>
/// <remarks>
/// Creates a new <c>HtmlProcessor</c> from <c>Input</c> and <c>Url</c>, calls <c>Load</c> on it, and then:
/// 1. replaces every downloadable <c>href</c> value with the result of <c>FixLink</c>; for <c>a</c>
///    elements the absolute form of the original address is additionally stored in the
///    <c>data-websitecacher-link</c> attribute;
/// 2. replaces every downloadable <c>src</c> value with the result of <c>FixLink</c>;
/// 3. appends the injector script and stylesheet elements to <c>head</c>, when a head element exists.
/// </remarks>
public void Process()
{
    this.Processor = new HtmlProcessor(this.Input, this.Url);
    this.Processor.Load();

    var root = this.Processor.Document.DocumentNode;

    // Pass 1: HREF attributes on any element.
    var hrefNodes = root.SelectNodes("//*[@href]");
    if (hrefNodes != null)
    {
        foreach (var node in hrefNodes)
        {
            var hrefAttribute = node.Attributes["href"];
            var originalAddress = hrefAttribute.Value;
            if (!HtmlProcessor.IsDownloadable(originalAddress))
            {
                continue;
            }

            hrefAttribute.Value = this.FixLink(originalAddress);

            if (node.Name == "a")
            {
                // The marker is computed from the address as it was before the rewrite above.
                var absoluteAddress = this.Processor.GetAbsoluteLink(originalAddress);
                node.SetAttributeValue("data-websitecacher-link", absoluteAddress);
            }
        }
    }

    // Pass 2: SRC attributes on any element.
    var srcNodes = root.SelectNodes("//*[@src]");
    if (srcNodes != null)
    {
        foreach (var node in srcNodes)
        {
            var srcAttribute = node.Attributes["src"];
            var source = srcAttribute.Value;
            if (HtmlProcessor.IsDownloadable(source))
            {
                srcAttribute.Value = this.FixLink(source);
            }
        }
    }

    // Pass 3: inject the helper script and stylesheet into the head.
    // This runs after both rewrite passes, so the injected paths are not passed through FixLink.
    var headNode = root.SelectSingleNode("//head");
    if (headNode == null)
    {
        return;
    }

    var injectedScript = this.Processor.Document.CreateElement("script");
    injectedScript.SetAttributeValue("src", "/website-cacher://static-content/webInjector.js");
    headNode.AppendChild(injectedScript);

    var injectedStyle = this.Processor.Document.CreateElement("link");
    injectedStyle.SetAttributeValue("rel", "stylesheet");
    injectedStyle.SetAttributeValue("type", "text/css");
    injectedStyle.SetAttributeValue("href", "/website-cacher://static-content/webInjector.css");
    headNode.AppendChild(injectedStyle);
}
/// <summary>
/// Collects the media/resource URLs referenced by the loaded document.
/// </summary>
/// <returns>
/// The set of URLs taken from the <c>href</c> attribute of <c>link</c> elements and from the
/// <c>src</c> attribute of <c>script</c>, <c>stylesheet</c> and <c>img</c> elements. Each URL is made
/// absolute through <c>Processor.GetAbsoluteLink</c>, kept only when it matches
/// <c>PageQuery.MediaRegex</c> and <c>HtmlProcessor.IsDownloadable</c> accepts it, and finally passed
/// through <c>HtmlProcessor.SimplifyUrl</c>. Empty when nothing qualifies.
/// </returns>
/// <exception cref="InvalidOperationException">
/// <c>Processor</c>, <c>Data</c> or <c>PageQuery</c> is <c>null</c>.
/// </exception>
public ISet<string> ScrapeMedia()
{
    if (this.Processor == null || this.Data == null || this.PageQuery == null)
    {
        throw new InvalidOperationException(
            "Processor, Data and PageQuery must all be set before calling ScrapeMedia.");
    }

    var result = new HashSet<string>();

    // Prepare regex
    Regex rx = new Regex(this.PageQuery.MediaRegex);

    this.AddMatchingUrls("//link[@href]", "href", rx, result);

    // NOTE(review): "stylesheet" is not a standard HTML element (stylesheets normally arrive via
    // link[@href], handled above); the selector is kept unchanged for backward compatibility.
    this.AddMatchingUrls("//script[@src] | //stylesheet[@src] | //img[@src]", "src", rx, result);

    return result;
}

/// <summary>
/// Adds to <paramref name="target"/> the simplified absolute URL of every node selected by
/// <paramref name="xpath"/> whose <paramref name="attributeName"/> value passes the filters.
/// </summary>
/// <param name="xpath">XPath expression selecting the nodes to inspect; every selected node must carry the attribute.</param>
/// <param name="attributeName">Name of the attribute holding the URL.</param>
/// <param name="filter">Pattern the absolute URL has to match.</param>
/// <param name="target">Set receiving the accepted URLs.</param>
private void AddMatchingUrls(string xpath, string attributeName, Regex filter, ISet<string> target)
{
    var nodes = this.Processor.Document.DocumentNode.SelectNodes(xpath);
    if (nodes == null)
    {
        // The selection yields null (not an empty collection) when nothing matches.
        return;
    }

    foreach (var node in nodes)
    {
        // The attribute value is entity-decoded before being resolved against the page URL.
        var absoluteLink = this.Processor.GetAbsoluteLink(node.Attributes[attributeName].DeEntitizeValue);
        if (filter.IsMatch(absoluteLink) && HtmlProcessor.IsDownloadable(absoluteLink))
        {
            target.Add(HtmlProcessor.SimplifyUrl(absoluteLink));
        }
    }
}