Example #1
0
        /// <summary>
        /// Gathers the targets of every anchor element with an <c>href</c> attribute in the
        /// processed document, keeping those whose absolute form matches
        /// <c>PageQuery.PageRegex</c> and passes <c>HtmlProcessor.IsDownloadable</c>.
        /// </summary>
        /// <returns>
        /// A set of URLs as returned by <c>HtmlProcessor.SimplifyUrl</c>; empty when no anchor qualifies.
        /// </returns>
        /// <exception cref="InvalidOperationException">
        /// Thrown when <c>Processor</c>, <c>Data</c> or <c>PageQuery</c> is null.
        /// </exception>
        public ISet <string> ScrapeLinks()
        {
            // Data is not read below, but it is part of the required state.
            bool stateMissing = this.Processor == null || this.Data == null || this.PageQuery == null;
            if (stateMissing)
            {
                throw new InvalidOperationException();
            }

            var pageUrls = new HashSet <string>();

            // The pattern is compiled before the document is queried.
            var pagePattern = new Regex(this.PageQuery.PageRegex);

            var anchors = this.Processor.Document.DocumentNode.SelectNodes("//a[@href]");
            if (anchors == null)
            {
                // A null selection is treated as "no anchors".
                return pageUrls;
            }

            foreach (var anchor in anchors)
            {
                var decodedHref = anchor.Attributes["href"].DeEntitizeValue;
                var target = this.Processor.GetAbsoluteLink(decodedHref);

                if (!pagePattern.IsMatch(target))
                {
                    continue;
                }

                if (!HtmlProcessor.IsDownloadable(target))
                {
                    continue;
                }

                pageUrls.Add(HtmlProcessor.SimplifyUrl(target));
            }

            return pageUrls;
        }
        /// <summary>
        /// Creates and loads a new <c>HtmlProcessor</c> for <c>Input</c> and <c>Url</c>, then
        /// rewrites the document in three steps: downloadable <c>href</c> values are replaced
        /// with the result of <c>FixLink</c>, downloadable <c>src</c> values likewise, and the
        /// web-injector script and stylesheet references are appended to the head element
        /// when the document has one.
        /// </summary>
        public void Process()
        {
            this.Processor = new HtmlProcessor(this.Input, this.Url);
            this.Processor.Load();

            // Step 1: every element that carries an href attribute.
            var hrefNodes = this.Processor.Document.DocumentNode.SelectNodes("//*[@href]");
            if (hrefNodes != null)
            {
                foreach (var node in hrefNodes)
                {
                    var hrefAttribute = node.Attributes["href"];
                    var originalHref = hrefAttribute.Value;

                    if (!HtmlProcessor.IsDownloadable(originalHref))
                    {
                        continue;
                    }

                    hrefAttribute.Value = this.FixLink(originalHref);

                    // Anchors additionally record the absolute form of the original target.
                    if (node.Name == "a")
                    {
                        var absoluteTarget = this.Processor.GetAbsoluteLink(originalHref);
                        node.SetAttributeValue("data-websitecacher-link", absoluteTarget);
                    }
                }
            }

            // Step 2: every element that carries a src attribute.
            var srcNodes = this.Processor.Document.DocumentNode.SelectNodes("//*[@src]");
            if (srcNodes != null)
            {
                foreach (var node in srcNodes)
                {
                    var srcAttribute = node.Attributes["src"];
                    var originalSrc = srcAttribute.Value;

                    if (!HtmlProcessor.IsDownloadable(originalSrc))
                    {
                        continue;
                    }

                    srcAttribute.Value = this.FixLink(originalSrc);
                }
            }

            // Step 3: inject the helper assets. This runs after the rewrites above, so the
            // injected src/href values are not passed through FixLink.
            var headNode = this.Processor.Document.DocumentNode.SelectSingleNode("//head");
            if (headNode == null)
            {
                return;
            }

            var injectorScript = this.Processor.Document.CreateElement("script");
            injectorScript.SetAttributeValue("src", "/website-cacher://static-content/webInjector.js");
            headNode.AppendChild(injectorScript);

            var injectorStyle = this.Processor.Document.CreateElement("link");
            injectorStyle.SetAttributeValue("rel", "stylesheet");
            injectorStyle.SetAttributeValue("type", "text/css");
            injectorStyle.SetAttributeValue("href", "/website-cacher://static-content/webInjector.css");
            headNode.AppendChild(injectorStyle);
        }
Example #3
0
        /// <summary>
        /// Gathers media URLs from the processed document: the <c>href</c> of <c>link</c>
        /// elements first, then the <c>src</c> of the elements selected by the second XPath
        /// below. A URL is kept when its absolute form matches <c>PageQuery.MediaRegex</c>
        /// and passes <c>HtmlProcessor.IsDownloadable</c>.
        /// </summary>
        /// <returns>
        /// A set of URLs as returned by <c>HtmlProcessor.SimplifyUrl</c>; empty when nothing qualifies.
        /// </returns>
        /// <exception cref="InvalidOperationException">
        /// Thrown when <c>Processor</c>, <c>Data</c> or <c>PageQuery</c> is null.
        /// </exception>
        public ISet <string> ScrapeMedia()
        {
            // Data is not read below, but it is part of the required state.
            bool stateMissing = this.Processor == null || this.Data == null || this.PageQuery == null;
            if (stateMissing)
            {
                throw new InvalidOperationException();
            }

            var mediaUrls = new HashSet <string>();

            // The pattern is compiled before the document is queried.
            var mediaPattern = new Regex(this.PageQuery.MediaRegex);

            // Each entry is { XPath selection, attribute holding the URL }; entries are
            // handled strictly in this order, one selection at a time.
            // NOTE(review): "stylesheet" is not a standard HTML element name, so
            // //stylesheet[@src] presumably matches nothing on ordinary pages — confirm
            // whether a different element was intended.
            var selections = new[]
            {
                new[] { "//link[@href]", "href" },
                new[] { "//script[@src] | //stylesheet[@src] | //img[@src]", "src" },
            };

            foreach (var selection in selections)
            {
                var xpath = selection[0];
                var attributeName = selection[1];

                var nodes = this.Processor.Document.DocumentNode.SelectNodes(xpath);
                if (nodes == null)
                {
                    // A null selection is treated as "no matching elements".
                    continue;
                }

                foreach (var node in nodes)
                {
                    var decodedValue = node.Attributes[attributeName].DeEntitizeValue;
                    var target = this.Processor.GetAbsoluteLink(decodedValue);

                    if (mediaPattern.IsMatch(target) && HtmlProcessor.IsDownloadable(target))
                    {
                        mediaUrls.Add(HtmlProcessor.SimplifyUrl(target));
                    }
                }
            }

            return mediaUrls;
        }