/// <summary>
/// Collects the Favicons and Apple Touch Icons of a page.
/// </summary>
/// <param name="uri">Uri at which extraction starts.</param>
/// <param name="settings">Extraction settings; defaults to <c>null</c> when omitted.</param>
/// <returns>The extracted Favicons and Apple Touch Icons as instances of <see cref="WebImage"/>.</returns>
public static async Task<IEnumerable<WebImage>> GetAllIcons(string uri, ExtractionSettings settings = null)
{
    IEnumerable<WebImage> icons = await Extract(
        uri,
        settings,
        async (Uri pageUri, HtmlDocument document, ExtractionSettings options) =>
        {
            // Favicons are gathered first; the Apple Touch Icons are appended to the same list.
            List<WebImage> collected = await HtmlUtilities.GetFavicons(pageUri, document, options);
            List<WebImage> touchIcons = await HtmlUtilities.GetAppleTouchIcons(pageUri, document, options);
            collected.AddRange(touchIcons);
            return collected;
        });

    return icons;
}
/// <summary>
/// Common entry point for the public extraction methods: substitutes default settings when none
/// are supplied, stores the effective settings in <c>ExtractionSettings</c> and delegates to
/// <c>Recurser.Recurse</c>.
/// </summary>
/// <param name="uri">Uri to start extracting from.</param>
/// <param name="settings">Extraction settings, or <c>null</c> to use a new default instance.</param>
/// <param name="extractionMethod">Method invoked by the recurser to extract images.</param>
/// <returns>The images returned by <c>Recurser.Recurse</c>.</returns>
private static async Task<IEnumerable<WebImage>> Extract(string uri, ExtractionSettings settings, Recurser.ExtractionMethod extractionMethod)
{
    settings = settings ?? new ExtractionSettings();
    ExtractionSettings = settings;

    IEnumerable<WebImage> extracted = await Recurser.Recurse(extractionMethod, uri, settings);
    return extracted;
}
/// <summary>
/// Collects every image of a page, Favicons and Apple Touch Icons included.
/// </summary>
/// <param name="uri">Uri at which extraction starts.</param>
/// <param name="settings">Extraction settings; defaults to <c>null</c> when omitted.</param>
/// <param name="cancellationToken">Token forwarded to the extraction.</param>
/// <returns>The extracted images as instances of <see cref="WebImage"/>.</returns>
public static async Task<IEnumerable<WebImage>> GetAllImages(string uri, ExtractionSettings settings = null, CancellationToken cancellationToken = default(CancellationToken))
{
    IEnumerable<WebImage> everything = await Extract(
        uri,
        settings,
        cancellationToken,
        async (Uri pageUri, HtmlDocument document, ExtractionSettings options) =>
        {
            // All three sources are queried before anything is merged; the result list is the
            // favicon list with Apple Touch Icons and then page images appended.
            List<WebImage> collected = await HtmlUtilities.GetFavicons(pageUri, document, options);
            List<WebImage> touchIcons = await HtmlUtilities.GetAppleTouchIcons(pageUri, document, options);
            List<WebImage> pageImages = await HtmlUtilities.GetPageImages(pageUri, document, options);

            collected.AddRange(touchIcons);
            collected.AddRange(pageImages);
            return collected;
        });

    return everything;
}
/// <summary>
/// Returns images for a Uri (excluding Favicons and Apple Touch Icons).
/// </summary>
/// <remarks>
/// Sources are visited in this order: link tags, meta tags and inline <c>style</c> background images
/// (each only when enabled in <paramref name="settings"/>), followed by img tags (always).
/// A candidate is added only when it is not yet listed in <c>Recurser.FoundUris</c> and
/// <c>ImageDownloader.IsOkUri</c> accepts it; accepted Uris are recorded in <c>Recurser.FoundUris</c>.
/// </remarks>
/// <param name="uri">Uri to explore; relative references are resolved against it.</param>
/// <param name="doc">Instance of <see cref="HtmlDocument"/> containing parsed Html for the Uri. When <c>null</c>, an empty list is returned.</param>
/// <param name="settings">Extraction settings.</param>
/// <returns>List of images as instances of <see cref="WebImage"/>.</returns>
public static async Task<List<WebImage>> GetPageImages(Uri uri, HtmlDocument doc, ExtractionSettings settings)
{
    List<WebImage> arr = new List<WebImage>();
    if (doc == null)
    {
        return arr;
    }

    if (settings.GetLinkTagImages)
    {
        IEnumerable<HtmlLinkTag> htmlLinkTags = HtmlExtractor.ExtractLinks(doc);
        if (htmlLinkTags != null)
        {
            foreach (HtmlLinkTag link in htmlLinkTags)
            {
                // Icons are handled by the dedicated favicon / apple touch icon extractors.
                if (link.IsFavicon() || link.IsAppleTouchIcon())
                {
                    continue;
                }

                Uri absUri = uri.AddHtmlLink(link.Link);
                bool extensionOk = settings.SvgOnly ? absUri.HasSvgExtension() : absUri.HasImageExtension();
                if (!extensionOk)
                {
                    continue;
                }

                string key = absUri.ToString();
                if (Recurser.FoundUris.Contains(key))
                {
                    continue;
                }

                if (await ImageDownloader.IsOkUri(absUri))
                {
                    Recurser.FoundUris.Add(key);
                    arr.Add(new WebImage(key, false, false));
                }
            }
        }
    }

    if (settings.GetMetaTagImages)
    {
        IEnumerable<HtmlMetaTag> htmlMetaTags = HtmlExtractor.ExtractMetadata(doc);
        if (htmlMetaTags != null)
        {
            foreach (HtmlMetaTag meta in htmlMetaTags)
            {
                Uri absUri = uri.AddHtmlLink(meta.Content);
                bool extensionOk = settings.SvgOnly ? absUri.HasSvgExtension() : absUri.HasImageExtension();
                if (!extensionOk)
                {
                    continue;
                }

                string key = absUri.ToString();
                if (Recurser.FoundUris.Contains(key))
                {
                    continue;
                }

                if (await ImageDownloader.IsOkUri(absUri))
                {
                    Recurser.FoundUris.Add(key);
                    arr.Add(new WebImage(key, false, false));
                }
            }
        }
    }

    if (settings.GetInlineBackgroundImages)
    {
        // Inline background images
        HtmlNode bodyNode = doc.DocumentNode.SelectSingleNode("//body");
        if (bodyNode != null)
        {
            await TraverseNode(bodyNode, async (HtmlNode node) =>
            {
                string styleValue = node.GetAttributeValue("style", null);
                if (string.IsNullOrEmpty(styleValue))
                {
                    return;
                }

                // Each url(...) is matched on its own: the argument is either a quoted string
                // (which may itself contain ')') or a run of characters up to the first ')'.
                // A greedy ".*" here would merge "url(a.png), url(b.png)" into one bogus match.
                MatchCollection matches = Regex.Matches(styleValue, @"(?<=url\()\s*(?:""[^""]*""|'[^']*'|[^)]*)\s*(?=\))");
                foreach (Match match in matches)
                {
                    // Whitespace between the parentheses and the argument is not part of the reference.
                    string value = match.Value.Trim();
                    if (string.IsNullOrEmpty(value))
                    {
                        continue;
                    }

                    // Strip one pair of matching quotes. The length check keeps a lone quote
                    // character from reaching Substring with a negative length.
                    if (value.Length >= 2)
                    {
                        char first = value[0];
                        char last = value[value.Length - 1];
                        if ((first == '\'' && last == '\'') || (first == '\"' && last == '\"'))
                        {
                            value = value.Substring(1, value.Length - 2);
                        }
                    }

                    // url("") and url('') reference nothing.
                    if (string.IsNullOrEmpty(value))
                    {
                        continue;
                    }

                    Uri link = uri.AddHtmlLink(value);
                    bool extensionOk = settings.SvgOnly ? link.HasSvgExtension() : link.HasImageExtension();
                    if (!extensionOk)
                    {
                        continue;
                    }

                    string key = link.ToString();
                    if (Recurser.FoundUris.Contains(key))
                    {
                        continue;
                    }

                    if (await ImageDownloader.IsOkUri(link))
                    {
                        Recurser.FoundUris.Add(key);
                        arr.Add(new WebImage(key, false, false, true));
                    }
                }
            });
        }

        // TODO: Add Option to get background images from CSS files
    }

    IEnumerable<HtmlImgTag> imgTags = HtmlExtractor.ExtractImages(doc);
    if (imgTags != null)
    {
        foreach (HtmlImgTag img in imgTags)
        {
            Uri absUri = uri.AddHtmlLink(img.Src);

            // Unlike the sources above, img tags are not filtered by image extension;
            // only the SvgOnly restriction and the IsBadMagickType exclusion apply.
            if (settings.SvgOnly && !absUri.HasSvgExtension())
            {
                continue;
            }

            if (absUri.IsBadMagickType())
            {
                continue;
            }

            string key = absUri.ToString();
            if (Recurser.FoundUris.Contains(key))
            {
                continue;
            }

            if (await ImageDownloader.IsOkUri(absUri))
            {
                Recurser.FoundUris.Add(key);
                arr.Add(new WebImage(key, false, false));
            }
        }
    }

    return arr;
}
/// <summary>
/// Returns Apple Touch Icons for a Uri.
/// </summary>
/// <remarks>
/// Unless <c>SvgOnly</c> is set, the default location (<c>Constants.DefaultAppleTouchIconPath</c>
/// resolved against <paramref name="uri"/>) is checked first; afterwards every link tag of
/// <paramref name="doc"/> that identifies itself as an Apple Touch Icon is considered.
/// A candidate is added only when it is not yet listed in <c>Recurser.FoundUris</c> and
/// <c>ImageDownloader.IsOkUri</c> accepts it.
/// </remarks>
/// <param name="uri">Uri to explore.</param>
/// <param name="doc">Instance of <see cref="HtmlDocument"/> containing parsed Html for the Uri. When <c>null</c>, only the default location is checked.</param>
/// <param name="settings">Extraction settings.</param>
/// <returns>List of Apple Touch Icons as instances of <see cref="WebImage"/>.</returns>
public static async Task<List<WebImage>> GetAppleTouchIcons(Uri uri, HtmlDocument doc, ExtractionSettings settings)
{
    List<WebImage> arr = new List<WebImage>();

    // Check default apple touch icon location
    if (!settings.SvgOnly)
    {
        Uri atiUri = uri.AddHtmlLink(Constants.DefaultAppleTouchIconPath);
        string atiKey = atiUri.ToString();

        // Consult the de-duplication list before IsOkUri, as the link-tag loop below does,
        // so a Uri that was already recorded is not checked again.
        if (!Recurser.FoundUris.Contains(atiKey) && await ImageDownloader.IsOkUri(atiUri))
        {
            Recurser.FoundUris.Add(atiKey);
            arr.Add(new WebImage(atiKey, false, true));
        }
    }

    // Parse HTML Tree
    if (doc == null)
    {
        return arr;
    }

    // Get icons through link info
    IEnumerable<HtmlLinkTag> htmlLinkTags = HtmlExtractor.ExtractLinks(doc);
    if (htmlLinkTags == null)
    {
        return arr;
    }

    foreach (HtmlLinkTag link in htmlLinkTags)
    {
        if (!link.IsAppleTouchIcon())
        {
            continue;
        }

        Uri absUri = uri.AddHtmlLink(link.Link);
        bool extensionOk = settings.SvgOnly ? absUri.HasSvgExtension() : absUri.HasImageExtension();
        if (!extensionOk)
        {
            continue;
        }

        string key = absUri.ToString();
        if (Recurser.FoundUris.Contains(key))
        {
            continue;
        }

        if (await ImageDownloader.IsOkUri(absUri))
        {
            Recurser.FoundUris.Add(key);
            arr.Add(new WebImage(key, false, true));
        }
    }

    return arr;
}
/// <summary>
/// Collects the images of a page, leaving out Favicons and Apple Touch Icons.
/// </summary>
/// <param name="uri">Uri at which extraction starts.</param>
/// <param name="settings">Extraction settings; defaults to <c>null</c> when omitted.</param>
/// <returns>The extracted images as instances of <see cref="WebImage"/>.</returns>
public static async Task<IEnumerable<WebImage>> GetPageImages(string uri, ExtractionSettings settings = null)
{
    // The per-page work is done by HtmlUtilities.GetPageImages, passed on as the extraction method.
    IEnumerable<WebImage> pageImages = await Extract(uri, settings, HtmlUtilities.GetPageImages);
    return pageImages;
}
/// <summary>
/// Recurses through pages where images need to be extracted. Invokes an extraction method on each Uri found.
/// </summary>
/// <remarks>
/// Resets the recursion state (<c>stopAlg</c>, <c>exploredUris</c>, <c>FoundUris</c>) on every call and
/// normalises <c>settings.HyperlinkRecursionDepth</c> to 0 when hyperlink recursion is disabled or the
/// depth is negative. Note that this writes to the caller's settings instance.
/// </remarks>
/// <param name="method">Method which extracts images from a specific Uri.</param>
/// <param name="uri">Uri to start recursion from.</param>
/// <param name="settings">Extraction Settings.</param>
/// <param name="cancellationToken">Cancellation Token.</param>
/// <returns>
/// Images extracted from explored Uris, or <c>null</c> when cancellation was observed.
/// When <c>LazyDownload</c> is off, only images whose download succeeded are returned.
/// </returns>
public static async Task<IEnumerable<WebImage>> Recurse(ExtractionMethod method, string uri, ExtractionSettings settings, CancellationToken cancellationToken)
{
    stopAlg = false;
    exploredUris = new List<string>();
    FoundUris = new List<string>();

    if (!settings.RecurseHyperlinks || settings.HyperlinkRecursionDepth < 0)
    {
        settings.HyperlinkRecursionDepth = 0;
    }

    // Extract images for start Uri and linked pages.
    Uri extractUri = new Uri(uri);
    List<WebImage> images = await HyperlinkRecurse(method, extractUri, settings, 0, cancellationToken);
    if (images == null)
    {
        // HyperlinkRecurse reports cancellation with null; pass that on instead of
        // dereferencing it below.
        return null;
    }

    // If enabled, recurse through Uris by removing segments from the end
    if (settings.RecurseUri)
    {
        // AbsolutePath (not AbsoluteUri) is "/" at the site root. AbsoluteUri always carries
        // scheme and host, so comparing it with "/" could never end the loop.
        while (extractUri.AbsolutePath != "/" && !stopAlg)
        {
            if (cancellationToken.IsCancellationRequested)
            {
                return null;
            }

            Uri parentUri = extractUri.RemoveLastSegment();
            if (parentUri == null || parentUri.AbsoluteUri == extractUri.AbsoluteUri)
            {
                // No further segment could be removed; stop instead of looping on the same Uri.
                break;
            }

            extractUri = parentUri;
            List<WebImage> moreImages = await HyperlinkRecurse(method, extractUri, settings, 0, cancellationToken);
            if (moreImages == null)
            {
                return null;
            }

            images.AddRange(moreImages);
        }
    }

    if (!settings.LazyDownload)
    {
        // Images have already been downloaded, so remove any which failed
        return images.Where(i => i.GetImageIfDownloaded() != null);
    }

    return images;
}
/// <summary>
/// Extracts images for a page and linked pages.
/// </summary>
/// <remarks>
/// A Uri that is already listed in <c>exploredUris</c> is not extracted again, but its hyperlinks are
/// still followed while <paramref name="depth"/> is below <c>settings.HyperlinkRecursionDepth</c>.
/// </remarks>
/// <param name="method">Method which extracts images from a specific Uri.</param>
/// <param name="uri">Uri to extract images and start hyperlink recursion from.</param>
/// <param name="settings">Extraction Settings.</param>
/// <param name="depth">Current hyperlink depth; recursion continues while it is below <c>settings.HyperlinkRecursionDepth</c>.</param>
/// <param name="cancellationToken">Cancellation Token.</param>
/// <returns>Extracted images for the given Uri and linked pages, or <c>null</c> when cancellation was observed.</returns>
public static async Task<List<WebImage>> HyperlinkRecurse(ExtractionMethod method, Uri uri, ExtractionSettings settings, int depth, CancellationToken cancellationToken)
{
    HtmlDocument doc = null;
    bool gotDoc = false;
    List<WebImage> images = new List<WebImage>();

    if (!exploredUris.Contains(uri.ToString()))
    {
        exploredUris.Add(uri.ToString());

        if (settings.OnStartNewPage != null)
        {
            if (cancellationToken.IsCancellationRequested)
            {
                return null;
            }

            await settings.OnStartNewPage.Invoke(uri.ToString());
        }

        if (cancellationToken.IsCancellationRequested)
        {
            return null;
        }

        gotDoc = true;
        doc = await GetDocumnent(uri, cancellationToken);
        images = await method.Invoke(uri, doc, settings);

        if (cancellationToken.IsCancellationRequested)
        {
            return null;
        }

        if (settings.OnEndNewPage != null)
        {
            await settings.OnEndNewPage.Invoke(uri.ToString(), images);
        }

        if (settings.ShouldStopOnFoundImage != null)
        {
            // index is the position of the first image the predicate asks to stop on.
            // Everything after it is dropped; the triggering image itself is kept.
            int index = images.TakeWhile(i => !settings.ShouldStopOnFoundImage.Invoke(i)).Count();
            if (index != images.Count)
            {
                images.RemoveRange(index + 1, images.Count - index - 1);
                stopAlg = true;
            }
        }

        if (!settings.LazyDownload)
        {
            if (cancellationToken.IsCancellationRequested)
            {
                return null;
            }

            await Task.WhenAll(images.Select(i => i.GetImageAsync(cancellationToken)).ToArray());
        }

        if (settings.OnFoundImage != null)
        {
            images.ForEach(i => settings.OnFoundImage.Invoke(i));
        }
    }

    if (!stopAlg && settings.RecurseHyperlinks && depth < settings.HyperlinkRecursionDepth)
    {
        if (!gotDoc)
        {
            if (cancellationToken.IsCancellationRequested)
            {
                return null;
            }

            doc = await GetDocumnent(uri, cancellationToken);
        }

        if (doc != null)
        {
            IEnumerable<HtmlATag> aTags = HtmlExtractor.ExtractATags(doc);

            // Null is tolerated here in the same way the other HtmlExtractor results are
            // null-checked before enumeration.
            if (aTags != null)
            {
                foreach (HtmlATag aTag in aTags)
                {
                    if (cancellationToken.IsCancellationRequested)
                    {
                        return null;
                    }

                    Uri newUri = uri.AddHtmlLink(aTag.Href);
                    List<WebImage> moreImages = await HyperlinkRecurse(method, newUri, settings, depth + 1, cancellationToken);
                    if (moreImages == null)
                    {
                        // The nested call observed cancellation; AddRange(null) would throw.
                        return null;
                    }

                    images.AddRange(moreImages);
                }
            }
        }
    }

    return images;
}
/// <summary>
/// Collects the images of a page, leaving out Favicons and Apple Touch Icons.
/// </summary>
/// <param name="uri">Uri at which extraction starts.</param>
/// <param name="settings">Extraction settings; defaults to <c>null</c> when omitted.</param>
/// <param name="cancellationToken">Token forwarded to the extraction.</param>
/// <returns>The extracted images as instances of <see cref="WebImage"/>.</returns>
public static async Task<IEnumerable<WebImage>> GetPageImages(string uri, ExtractionSettings settings = null, CancellationToken cancellationToken = default(CancellationToken))
{
    // The per-page work is done by HtmlUtilities.GetPageImages, passed on as the extraction method.
    IEnumerable<WebImage> pageImages = await Extract(uri, settings, cancellationToken, HtmlUtilities.GetPageImages);
    return pageImages;
}