예제 #1
0
 /// <summary>
 /// Collects the Favicons and Apple Touch Icons of a page.
 /// </summary>
 /// <param name="uri">Uri at which extraction begins.</param>
 /// <param name="settings">Extraction Settings; defaults to <c>null</c> when omitted.</param>
 /// <returns>Extracted Favicons and Apple Touch Icons as instances of <see cref="WebImage"/>.</returns>
 public static async Task <IEnumerable <WebImage> > GetAllIcons(string uri, ExtractionSettings settings = null)
 {
     IEnumerable <WebImage> icons = await Extract(uri, settings, async (Uri pageUri, HtmlDocument pageDoc, ExtractionSettings pageSettings) =>
     {
         // Favicons are fetched first; the Apple Touch Icons are appended to that same list.
         List <WebImage> found = await HtmlUtilities.GetFavicons(pageUri, pageDoc, pageSettings);
         found.AddRange(await HtmlUtilities.GetAppleTouchIcons(pageUri, pageDoc, pageSettings));
         return found;
     });

     return icons;
 }
예제 #2
0
        /// <summary>
        /// Runs an extraction method through <c>Recurser.Recurse</c>, starting at the given Uri.
        /// </summary>
        /// <param name="uri">Uri to start extracting from.</param>
        /// <param name="settings">Extraction Settings; a new default instance is used when <c>null</c>.</param>
        /// <param name="extractionMethod">Method handed to the recurser to extract images.</param>
        /// <returns>The images returned by <c>Recurser.Recurse</c>.</returns>
        private static async Task <IEnumerable <WebImage> > Extract(string uri, ExtractionSettings settings, Recurser.ExtractionMethod extractionMethod)
        {
            // Fall back to default settings when the caller supplied none.
            settings = settings ?? new ExtractionSettings();

            // Store the settings in use on the ExtractionSettings member before recursing.
            ExtractionSettings = settings;

            IEnumerable <WebImage> extracted = await Recurser.Recurse(extractionMethod, uri, settings);
            return extracted;
        }
예제 #3
0
 /// <summary>
 /// Collects every image of a page, Favicons and Apple Touch Icons included.
 /// </summary>
 /// <param name="uri">Uri at which extraction begins.</param>
 /// <param name="settings">Extraction Settings; defaults to <c>null</c> when omitted.</param>
 /// <param name="cancellationToken">Cancellation Token passed on to the extraction run.</param>
 /// <returns>Extracted images as instances of <see cref="WebImage"/>.</returns>
 public static async Task <IEnumerable <WebImage> > GetAllImages(string uri, ExtractionSettings settings = null, CancellationToken cancellationToken = default(CancellationToken))
 {
     IEnumerable <WebImage> everything = await Extract(uri, settings, cancellationToken, async (Uri pageUri, HtmlDocument pageDoc, ExtractionSettings pageSettings) =>
     {
         // Gather the three sources in a fixed order: Favicons, Apple Touch Icons, page images.
         List <WebImage> combined   = await HtmlUtilities.GetFavicons(pageUri, pageDoc, pageSettings);
         List <WebImage> touchIcons = await HtmlUtilities.GetAppleTouchIcons(pageUri, pageDoc, pageSettings);
         List <WebImage> pageImages = await HtmlUtilities.GetPageImages(pageUri, pageDoc, pageSettings);

         // Everything is merged into the Favicon list, which is the list handed back.
         combined.AddRange(touchIcons);
         combined.AddRange(pageImages);
         return combined;
     });

     return everything;
 }
        /// <summary>
        /// Returns images for a Uri (excluding Favicons and Apple Touch Icons).
        /// </summary>
        /// <remarks>
        /// Candidates are gathered, in order, from link tags (when <c>settings.GetLinkTagImages</c> is set),
        /// meta tags (<c>settings.GetMetaTagImages</c>), inline <c>style</c> attributes below the body
        /// (<c>settings.GetInlineBackgroundImages</c>) and finally img tags. A candidate is added only when it
        /// passes the extension filter, is not yet listed in <c>Recurser.FoundUris</c> and
        /// <c>ImageDownloader.IsOkUri</c> accepts it; every accepted Uri is recorded in <c>Recurser.FoundUris</c>.
        /// </remarks>
        /// <param name="uri">Uri to explore.</param>
        /// <param name="doc">Instance of <see cref="HtmlDocument"/> containing parsed Html for the Uri.</param>
        /// <param name="settings">Extraction settings.</param>
        /// <returns>List of images as instances of <see cref="WebImage"/>; empty when <paramref name="doc"/> is <c>null</c>.</returns>
        public static async Task <List <WebImage> > GetPageImages(Uri uri, HtmlDocument doc, ExtractionSettings settings)
        {
            // Matches one CSS url(...) value at a time and captures its argument without surrounding
            // quotes or whitespace. The three alternatives are: single-quoted, double-quoted, unquoted.
            // A greedy "(.*)" would instead swallow everything up to the LAST ')' in the style text,
            // merging several url() values into one bogus link.
            const string CssUrlPattern = @"url\(\s*(?:'(?<link>[^']*)'|""(?<link>[^""]*)""|(?<link>[^)]*?))\s*\)";

            List <WebImage> arr = new List <WebImage>();

            if (doc == null)
            {
                return(arr);
            }

            if (settings.GetLinkTagImages)
            {
                IEnumerable <HtmlLinkTag> htmlLinkTags = HtmlExtractor.ExtractLinks(doc);
                if (htmlLinkTags != null)
                {
                    foreach (HtmlLinkTag link in htmlLinkTags)
                    {
                        // Favicons and Apple Touch Icons are excluded from this method's result.
                        if (link.IsFavicon() || link.IsAppleTouchIcon())
                        {
                            continue;
                        }

                        Uri absUri = uri.AddHtmlLink(link.Link);

                        // In SvgOnly mode only svg extensions pass, otherwise any image extension does.
                        bool extensionOk = settings.SvgOnly ? absUri.HasSvgExtension() : absUri.HasImageExtension();
                        if (!extensionOk)
                        {
                            continue;
                        }

                        if (Recurser.FoundUris.Contains(absUri.ToString()))
                        {
                            continue;
                        }

                        if (await ImageDownloader.IsOkUri(absUri))
                        {
                            Recurser.FoundUris.Add(absUri.ToString());
                            arr.Add(new WebImage(absUri.ToString(), false, false));
                        }
                    }
                }
            }

            if (settings.GetMetaTagImages)
            {
                IEnumerable <HtmlMetaTag> htmlMetaTags = HtmlExtractor.ExtractMetadata(doc);
                if (htmlMetaTags != null)
                {
                    foreach (HtmlMetaTag meta in htmlMetaTags)
                    {
                        Uri absUri = uri.AddHtmlLink(meta.Content);

                        bool extensionOk = settings.SvgOnly ? absUri.HasSvgExtension() : absUri.HasImageExtension();
                        if (!extensionOk)
                        {
                            continue;
                        }

                        if (Recurser.FoundUris.Contains(absUri.ToString()))
                        {
                            continue;
                        }

                        if (await ImageDownloader.IsOkUri(absUri))
                        {
                            Recurser.FoundUris.Add(absUri.ToString());
                            arr.Add(new WebImage(absUri.ToString(), false, false));
                        }
                    }
                }
            }

            if (settings.GetInlineBackgroundImages)
            {
                // Inline background images
                HtmlNode bodyNode = doc.DocumentNode.SelectSingleNode("//body");
                if (bodyNode != null)
                {
                    await TraverseNode(bodyNode, async (HtmlNode node) =>
                    {
                        string styleValue = node.GetAttributeValue("style", null);
                        if (!string.IsNullOrEmpty(styleValue))
                        {
                            MatchCollection matches = Regex.Matches(styleValue, CssUrlPattern);
                            foreach (Match match in matches)
                            {
                                // The named group already excludes quotes and padding; url() yields "".
                                string value = match.Groups["link"].Value;
                                if (string.IsNullOrEmpty(value))
                                {
                                    continue;
                                }

                                Uri link = uri.AddHtmlLink(value);

                                bool extensionOk = settings.SvgOnly ? link.HasSvgExtension() : link.HasImageExtension();
                                if (!extensionOk)
                                {
                                    continue;
                                }

                                if (Recurser.FoundUris.Contains(link.ToString()))
                                {
                                    continue;
                                }

                                if (await ImageDownloader.IsOkUri(link))
                                {
                                    Recurser.FoundUris.Add(link.ToString());

                                    // Only this branch passes the fourth constructor argument (true).
                                    arr.Add(new WebImage(link.ToString(), false, false, true));
                                }
                            }
                        }
                    });
                }

                // TODO: Add Option to get background images from CSS files
            }

            IEnumerable <HtmlImgTag> imgTags = HtmlExtractor.ExtractImages(doc);

            if (imgTags != null)
            {
                foreach (HtmlImgTag img in imgTags)
                {
                    Uri absUri = uri.AddHtmlLink(img.Src);
                    if (settings.SvgOnly && !absUri.HasSvgExtension())
                    {
                        continue;
                    }

                    // img sources are not filtered by image extension; only IsBadMagickType rejects them.
                    if (absUri.IsBadMagickType())
                    {
                        continue;
                    }

                    if (Recurser.FoundUris.Contains(absUri.ToString()))
                    {
                        continue;
                    }

                    if (await ImageDownloader.IsOkUri(absUri))
                    {
                        Recurser.FoundUris.Add(absUri.ToString());
                        arr.Add(new WebImage(absUri.ToString(), false, false));
                    }
                }
            }

            return(arr);
        }
        /// <summary>
        /// Returns Apple Touch Icons for a Uri.
        /// </summary>
        /// <remarks>
        /// Unless <c>settings.SvgOnly</c> is set, the default location
        /// (<c>Constants.DefaultAppleTouchIconPath</c> resolved against <paramref name="uri"/>) is checked first,
        /// even when <paramref name="doc"/> is <c>null</c>. Link tags flagged as Apple Touch Icons are examined
        /// afterwards. Every accepted Uri is recorded in <c>Recurser.FoundUris</c>.
        /// </remarks>
        /// <param name="uri">Uri to explore.</param>
        /// <param name="doc">Instance of <see cref="HtmlDocument"/> containing parsed Html for the Uri.</param>
        /// <param name="settings">Extraction settings.</param>
        /// <returns>List of Apple Touch Icons as instances of <see cref="WebImage"/>.</returns>
        public static async Task <List <WebImage> > GetAppleTouchIcons(Uri uri, HtmlDocument doc, ExtractionSettings settings)
        {
            List <WebImage> arr = new List <WebImage>();

            // Check default apple touch icon location
            if (!settings.SvgOnly)
            {
                Uri atiUri = uri.AddHtmlLink(Constants.DefaultAppleTouchIconPath);

                // Look at the already-found list before awaiting IsOkUri, as the link-tag loop below does,
                // so a Uri that is already recorded does not trigger another check.
                if (!Recurser.FoundUris.Contains(atiUri.ToString()) && await ImageDownloader.IsOkUri(atiUri))
                {
                    Recurser.FoundUris.Add(atiUri.ToString());
                    arr.Add(new WebImage(atiUri.ToString(), false, true));
                }
            }

            // Parse HTML Tree
            if (doc == null)
            {
                return(arr);
            }

            // Get icons through link info
            IEnumerable <HtmlLinkTag> htmlLinkTags = HtmlExtractor.ExtractLinks(doc);

            if (htmlLinkTags != null)
            {
                foreach (HtmlLinkTag link in htmlLinkTags)
                {
                    if (!link.IsAppleTouchIcon())
                    {
                        continue;
                    }

                    Uri absUri = uri.AddHtmlLink(link.Link);

                    // In SvgOnly mode only svg extensions pass, otherwise any image extension does.
                    bool extensionOk = settings.SvgOnly ? absUri.HasSvgExtension() : absUri.HasImageExtension();
                    if (!extensionOk)
                    {
                        continue;
                    }

                    if (Recurser.FoundUris.Contains(absUri.ToString()))
                    {
                        continue;
                    }

                    if (await ImageDownloader.IsOkUri(absUri))
                    {
                        Recurser.FoundUris.Add(absUri.ToString());
                        arr.Add(new WebImage(absUri.ToString(), false, true));
                    }
                }
            }

            return(arr);
        }
예제 #6
0
 /// <summary>
 /// Collects the images of a page, leaving out Favicons and Apple Touch Icons.
 /// </summary>
 /// <param name="uri">Uri at which extraction begins.</param>
 /// <param name="settings">Extraction Settings; defaults to <c>null</c> when omitted.</param>
 /// <returns>Extracted images as instances of <see cref="WebImage"/>.</returns>
 public static async Task <IEnumerable <WebImage> > GetPageImages(string uri, ExtractionSettings settings = null)
 {
     // HtmlUtilities.GetPageImages is the extraction method handed to Extract.
     IEnumerable <WebImage> pageImages = await Extract(uri, settings, HtmlUtilities.GetPageImages);
     return pageImages;
 }
예제 #7
0
        /// <summary>
        /// Recurses through pages where images need to be extracted. Invokes an extraction method on each Uri found.
        /// </summary>
        /// <remarks>
        /// Resets the shared <c>stopAlg</c>, <c>exploredUris</c> and <c>FoundUris</c> state on every call and
        /// forces <c>settings.HyperlinkRecursionDepth</c> to 0 when hyperlink recursion is disabled or the
        /// depth is negative.
        /// </remarks>
        /// <param name="method">Method which extracts images from a specific Uri.</param>
        /// <param name="uri">Uri to start recursion from.</param>
        /// <param name="settings">Extraction Settings.</param>
        /// <param name="cancellationToken">Cancellation Token.</param>
        /// <returns>
        /// Images extracted from explored Uris, or <c>null</c> when cancellation was requested. When
        /// <c>settings.LazyDownload</c> is off, images whose <c>GetImageIfDownloaded()</c> is <c>null</c> are filtered out.
        /// </returns>
        public static async Task <IEnumerable <WebImage> > Recurse(ExtractionMethod method, string uri, ExtractionSettings settings, CancellationToken cancellationToken)
        {
            stopAlg      = false;
            exploredUris = new List <string>();
            FoundUris    = new List <string>();

            // Depth 0 both when hyperlink recursion is off and when a negative depth was configured.
            if (!settings.RecurseHyperlinks || settings.HyperlinkRecursionDepth < 0)
            {
                settings.HyperlinkRecursionDepth = 0;
            }

            // Extract images for start Uri and linked pages.
            Uri             extractUri = new Uri(uri);
            List <WebImage> images     = await HyperlinkRecurse(method, extractUri, settings, 0, cancellationToken);

            if (images == null)
            {
                // HyperlinkRecurse returns null once cancellation was requested; pass that on
                // instead of dereferencing it below.
                return(null);
            }

            // If enabled, recurse through Uris by removing segments from the end
            if (settings.RecurseUri)
            {
                // AbsolutePath (not AbsoluteUri) is compared: AbsoluteUri always carries scheme and host,
                // so it can never equal "/" and the loop would have no natural end.
                while (extractUri.AbsolutePath != "/" && !stopAlg)
                {
                    if (cancellationToken.IsCancellationRequested)
                    {
                        return(null);
                    }

                    extractUri = extractUri.RemoveLastSegment();
                    List <WebImage> moreImages = await HyperlinkRecurse(method, extractUri, settings, 0, cancellationToken);

                    if (moreImages == null)
                    {
                        // Cancelled part-way through; AddRange(null) would throw.
                        return(null);
                    }

                    images.AddRange(moreImages);
                }
            }

            if (!settings.LazyDownload)
            {
                // Images have already been downloaded, so remove any which failed
                return(images.Where(i => i.GetImageIfDownloaded() != null));
            }

            return(images);
        }
예제 #8
0
        /// <summary>
        /// Extracts images for a page and linked pages.
        /// </summary>
        /// <remarks>
        /// A Uri already present in <c>exploredUris</c> is not extracted again, but its hyperlinks are still
        /// followed while <paramref name="depth"/> is below <c>settings.HyperlinkRecursionDepth</c>.
        /// Sets the shared <c>stopAlg</c> flag when <c>settings.ShouldStopOnFoundImage</c> accepts an image.
        /// </remarks>
        /// <param name="method">Method which extracts images from a specific Uri.</param>
        /// <param name="uri">Uri to extract images and start hyperlink recursion from.</param>
        /// <param name="settings">Extraction Settings.</param>
        /// <param name="depth">Current hyperlink depth; recursion continues while it is below <c>settings.HyperlinkRecursionDepth</c>.</param>
        /// <param name="cancellationToken">Cancellation Token.</param>
        /// <returns>Extracted images for the given Uri and linked pages, or <c>null</c> when cancellation was requested.</returns>
        public static async Task <List <WebImage> > HyperlinkRecurse(ExtractionMethod method, Uri uri, ExtractionSettings settings, int depth, CancellationToken cancellationToken)
        {
            HtmlDocument    doc    = null;
            bool            gotDoc = false;
            List <WebImage> images = new List <WebImage>();

            if (!exploredUris.Contains(uri.ToString()))
            {
                exploredUris.Add(uri.ToString());

                if (settings.OnStartNewPage != null)
                {
                    if (cancellationToken.IsCancellationRequested)
                    {
                        return(null);
                    }

                    await settings.OnStartNewPage.Invoke(uri.ToString());
                }

                if (cancellationToken.IsCancellationRequested)
                {
                    return(null);
                }

                // Remember that the document was requested so the link-following step does not fetch it again.
                gotDoc = true;
                doc    = await GetDocumnent(uri, cancellationToken);

                images = await method.Invoke(uri, doc, settings);

                if (cancellationToken.IsCancellationRequested)
                {
                    return(null);
                }

                if (settings.OnEndNewPage != null)
                {
                    await settings.OnEndNewPage.Invoke(uri.ToString(), images);
                }

                if (settings.ShouldStopOnFoundImage != null)
                {
                    // Take all images up to the point where should stop
                    int index = images.TakeWhile(i => !settings.ShouldStopOnFoundImage.Invoke(i)).Count();
                    if (index != images.Count)
                    {
                        // index is the first image that triggered the stop; keep it, drop everything after it.
                        images.RemoveRange(index + 1, images.Count - index - 1);
                        stopAlg = true;
                    }
                }

                if (!settings.LazyDownload)
                {
                    if (cancellationToken.IsCancellationRequested)
                    {
                        return(null);
                    }

                    await Task.WhenAll(images.Select(i => i.GetImageAsync(cancellationToken)).ToArray());
                }

                if (settings.OnFoundImage != null)
                {
                    images.ForEach(i => settings.OnFoundImage.Invoke(i));
                }
            }

            if (!stopAlg && settings.RecurseHyperlinks && depth < settings.HyperlinkRecursionDepth)
            {
                if (!gotDoc)
                {
                    if (cancellationToken.IsCancellationRequested)
                    {
                        return(null);
                    }

                    doc = await GetDocumnent(uri, cancellationToken);
                }

                if (doc != null)
                {
                    IEnumerable <HtmlATag> aTags = HtmlExtractor.ExtractATags(doc);

                    // Guard against a null result before enumerating.
                    if (aTags != null)
                    {
                        foreach (HtmlATag aTag in aTags)
                        {
                            if (cancellationToken.IsCancellationRequested)
                            {
                                return(null);
                            }

                            if (stopAlg)
                            {
                                // A nested page asked to stop; do not explore the remaining sibling links.
                                break;
                            }

                            Uri             newUri     = uri.AddHtmlLink(aTag.Href);
                            List <WebImage> moreImages = await HyperlinkRecurse(method, newUri, settings, depth + 1, cancellationToken);

                            if (moreImages == null)
                            {
                                // The nested call was cancelled; AddRange(null) would throw, so pass null on.
                                return(null);
                            }

                            images.AddRange(moreImages);
                        }
                    }
                }
            }

            return(images);
        }
예제 #9
0
 /// <summary>
 /// Collects the images of a page, leaving out Favicons and Apple Touch Icons.
 /// </summary>
 /// <param name="uri">Uri at which extraction begins.</param>
 /// <param name="settings">Extraction Settings; defaults to <c>null</c> when omitted.</param>
 /// <param name="cancellationToken">Cancellation Token passed on to the extraction run.</param>
 /// <returns>Extracted images as instances of <see cref="WebImage"/>.</returns>
 public static async Task <IEnumerable <WebImage> > GetPageImages(string uri, ExtractionSettings settings = null, CancellationToken cancellationToken = default(CancellationToken))
 {
     // HtmlUtilities.GetPageImages is the extraction method handed to Extract.
     IEnumerable <WebImage> pageImages = await Extract(uri, settings, cancellationToken, HtmlUtilities.GetPageImages);
     return pageImages;
 }