Example #1
0
        /// <summary>
        /// Gathers the targets of every <c>a</c> element carrying an <c>href</c>
        /// attribute in the processor's document. A target is kept when its
        /// absolute form matches <c>PageQuery.PageRegex</c> and
        /// <c>HtmlProcessor.IsDownloadable</c> accepts it.
        /// </summary>
        /// <returns>
        /// The accepted links, each passed through <c>HtmlProcessor.SimplifyUrl</c>.
        /// Empty when the node query yields nothing.
        /// </returns>
        /// <exception cref="InvalidOperationException">
        /// <c>Processor</c>, <c>Data</c> or <c>PageQuery</c> is null.
        /// </exception>
        public ISet<string> ScrapeLinks()
        {
            bool ready = this.Processor != null && this.Data != null && this.PageQuery != null;
            if (!ready)
            {
                throw new InvalidOperationException();
            }

            var found = new HashSet<string>();
            var pagePattern = new Regex(this.PageQuery.PageRegex);

            var anchors = this.Processor.Document.DocumentNode.SelectNodes("//a[@href]");
            if (anchors == null)
            {
                // The query result is null-checked rather than assumed to be an empty collection.
                return found;
            }

            foreach (var anchor in anchors)
            {
                var rawHref = anchor.Attributes["href"].DeEntitizeValue;
                var url = this.Processor.GetAbsoluteLink(rawHref);

                // Same order as before: regex filter first, then the downloadable check.
                if (!pagePattern.IsMatch(url) || !HtmlProcessor.IsDownloadable(url))
                {
                    continue;
                }

                found.Add(HtmlProcessor.SimplifyUrl(url));
            }

            return found;
        }
Example #2
0
        /// <summary>
        /// Collects media/resource URLs referenced by the processor's document:
        /// first the <c>href</c> of <c>link</c> elements, then the <c>src</c> of the
        /// elements selected by the second query. A URL is kept when its absolute
        /// form matches <c>PageQuery.MediaRegex</c> and
        /// <c>HtmlProcessor.IsDownloadable</c> accepts it.
        /// </summary>
        /// <returns>
        /// The accepted URLs, each passed through <c>HtmlProcessor.SimplifyUrl</c>.
        /// Empty when neither query selects any node.
        /// </returns>
        /// <exception cref="InvalidOperationException">
        /// <c>Processor</c>, <c>Data</c> or <c>PageQuery</c> is null.
        /// </exception>
        public ISet<string> ScrapeMedia()
        {
            if (this.Processor == null || this.Data == null || this.PageQuery == null)
            {
                throw new InvalidOperationException();
            }

            var result = new HashSet<string>();

            // Build the filter once; both passes below share it.
            var mediaPattern = new Regex(this.PageQuery.MediaRegex);

            // Pass 1: resources referenced through href.
            this.CollectMatchingUrls("//link[@href]", "href", mediaPattern, result);

            // Pass 2: resources referenced through src.
            // NOTE(review): "stylesheet" is not a standard HTML element, so that branch
            // of the query presumably never matches real pages. It is kept verbatim
            // to preserve existing behavior - confirm before removing.
            this.CollectMatchingUrls("//script[@src] | //stylesheet[@src] | //img[@src]", "src", mediaPattern, result);

            return result;
        }

        /// <summary>
        /// Selects nodes with <paramref name="xpath"/>, resolves the named attribute of
        /// each node to an absolute link, and adds the simplified link to
        /// <paramref name="target"/> when it matches <paramref name="filter"/> and is
        /// downloadable.
        /// </summary>
        /// <param name="xpath">
        /// Node query; it must only select nodes that carry <paramref name="attributeName"/>,
        /// because the attribute is dereferenced without a null check.
        /// </param>
        /// <param name="attributeName">Attribute holding the URL (e.g. "href" or "src").</param>
        /// <param name="filter">Regex the absolute link has to match.</param>
        /// <param name="target">Set that receives the accepted, simplified links.</param>
        private void CollectMatchingUrls(string xpath, string attributeName, Regex filter, ISet<string> target)
        {
            var nodes = this.Processor.Document.DocumentNode.SelectNodes(xpath);
            if (nodes == null)
            {
                // Nothing selected; the query result is null-checked rather than assumed empty.
                return;
            }

            foreach (var node in nodes)
            {
                var absoluteLink = this.Processor.GetAbsoluteLink(node.Attributes[attributeName].DeEntitizeValue);
                if (filter.IsMatch(absoluteLink) && HtmlProcessor.IsDownloadable(absoluteLink))
                {
                    target.Add(HtmlProcessor.SimplifyUrl(absoluteLink));
                }
            }
        }