/// <summary>
/// Downloads the page named by <paramref name="configuration"/>, parses it with CsQuery,
/// and returns every image / link / video source whose file extension is in
/// <c>PeruserConfig.Current.AllowedFileTypes</c> as an <see cref="ImageData"/> entry.
/// Side effects: sets <c>_pageTitle</c> and <c>_sourceUrl</c> for the scraped page.
/// </summary>
/// <param name="configuration">Page URL, user agent, cookies, and scraping options.</param>
/// <returns>Distinct, resolvable media URLs found on the page.</returns>
public IEnumerable<ImageData> ScrapePage(WebpageConfig configuration)
{
    Uri url = new Uri(configuration.PageUrl);
    CQ dom;

    // BUG FIX: wrap WebClient in a using block. The original called Dispose()
    // manually after DownloadString, which leaked the client whenever the
    // download threw (timeouts, 404s, DNS failures).
    using (WebClient w = new WebClient())
    {
        w.Headers.Add(HttpRequestHeader.UserAgent, configuration.UserAgent);
        foreach (var entry in configuration.RealCookies)
        {
            w.Headers.Add(HttpRequestHeader.Cookie, String.Format("{0}={1}", entry.Key, entry.Value));
        }
        dom = w.DownloadString(url);
    }

    // When SkipImagesInsideLinks is set, exclude images wrapped in anchors
    // (i.e. thumbnails). The original ran the plain "img" query first and
    // threw the result away; query only once.
    var allImages = configuration.SkipImagesInsideLinks
        ? dom[":not(a) > img"].ToList()
        : dom["img"].ToList();
    var allLinks = dom["a"].ToList();
    var allVideos = dom["video > source:first-child"].ToList();

    _pageTitle = dom["title"].Text().Trim();
    // BUG FIX: the original condition was inverted — it replaced the scraped
    // title only when the override was null/blank, which both discarded a
    // real override AND blanked the title when no override was configured.
    if (!String.IsNullOrWhiteSpace(configuration.TitleOverride))
    {
        _pageTitle = configuration.TitleOverride;
    }
    _sourceUrl = url.ToString();

    var imageLinks = allImages.Select(d => d.GetAttribute("src"));
    var textLinks = allLinks.Select(d => d.GetAttribute("href"));
    var vidLinks = allVideos.Select(d => d.GetAttribute("src"));

    // Base URI = scheme + host + all path segments except the last (the page
    // itself), so relative links resolve against the page's directory.
    Uri baseUri = new Uri(url.Scheme + "://" + url.Host + String.Join("", url.Segments.Reverse().Skip(1).Reverse()));
    Uri newUri;

    // Merge every candidate URL, keep only those with an allowed file
    // extension, resolve them against the base URI, and drop any that fail
    // IsOkayUri. This can probably be made a bit cleaner, but it works.
    List<ImageData> genericLinks = imageLinks.Union(textLinks).Union(vidLinks)
        .Distinct()
        .Where(d => !String.IsNullOrWhiteSpace(Path.GetExtension(d)))
        .Where(d => PeruserConfig.Current.AllowedFileTypes.Contains(Path.GetExtension(d).Substring(1).ToLower()))
        .Select(d => IsOkayUri(d, baseUri, out newUri) ? newUri : null)
        .Where(d => d != null)
        .Select(d => new ImageData
        {
            FileName = Path.GetFileName(d.ToString()),
            LastModified = DateTime.Now,
            Path = (d.IsAbsoluteUri ? d.ToString() : new Uri(baseUri, d).ToString())
        }).ToList();

    return genericLinks;
}
/// <summary>
/// Builds the library by scraping the configured page once and loading the
/// results into the observable Images collection.
/// </summary>
/// <param name="configuration">Page URL and scraping options to use.</param>
private WebpageLibrary(WebpageConfig configuration)
{
    var scrapedImages = ScrapePage(configuration);
    Images = new ObservableCollection<ImageData>(scrapedImages);
}
/// <summary>
/// Initializes the page-chooser window with a fresh, empty WebpageConfig.
/// </summary>
public ChoosePage()
{
    // Create the config before InitializeComponent so any XAML bindings that
    // resolve during initialization see a non-null Config.
    Config = new WebpageConfig();
    InitializeComponent();
    // NOTE(review): AllowsTransparency must be set before the window's HWND
    // is created, and presumably WindowStyle=None is set in the XAML (WPF
    // requires it for transparency) — verify against the .xaml file.
    AllowsTransparency = true;
}