Exemplo n.º 1
0
        /// <summary>
        /// Optionally sanitizes the HTML (when <paramref name="sanitizeHtml"/> is set to <c>true</c>) and
        /// validates all images: if they are rotated correctly (when <paramref name="rotate"/> is set
        /// to <c>true</c>) and if they fit on the given <paramref name="pageSettings"/> (when
        /// <paramref name="resize"/> is set to <c>true</c>).
        /// If anything had to be changed then a local copy is made of the <paramref name="inputUri"/> file
        /// and all images that could be loaded are saved to local temp files.
        /// </summary>
        /// <param name="inputUri">The uri of the webpage</param>
        /// <param name="resize">When set to <c>true</c> then an image is resized when needed</param>
        /// <param name="rotate">When set to <c>true</c> then the EXIF information of an
        ///     image is read and when needed the image is automatically rotated</param>
        /// <param name="sanitizeHtml">When set to <c>true</c> then the HTML will get sanitized</param>
        /// <param name="pageSettings"><see cref="PageSettings"/></param>
        /// <param name="outputUri">The uri of the changed webpage when this method returns <c>false</c>,
        ///     otherwise <c>null</c></param>
        /// <returns>Returns <c>false</c> when the webpage had to be changed and the changed copy was written
        ///     to <paramref name="outputUri"/>. Returns <c>true</c> when the webpage can be used as is; this
        ///     is also returned when the webpage could not be parsed or the changed copy could not be written</returns>
        /// <exception cref="WebException">Raised when the webpage from <paramref name="inputUri"/> could not be downloaded</exception>
        public bool Validate(ConvertUri inputUri,
                             bool resize,
                             bool rotate,
                             bool sanitizeHtml,
                             PageSettings pageSettings,
                             out ConvertUri outputUri)
        {
            outputUri = null;

            string localDirectory = null;

            if (inputUri.IsFile)
            {
                localDirectory = Path.GetDirectoryName(inputUri.OriginalString);
            }

            using (var webpage = inputUri.IsFile
                ? File.OpenRead(inputUri.OriginalString)
                : DownloadStream(inputUri))
            {
                // The area that is left on the paper after subtracting the margins.
                // NOTE(review): the factor 96 looks like an inch to pixel conversion - confirm the unit of PageSettings
                var maxWidth  = (pageSettings.PaperWidth - pageSettings.MarginLeft - pageSettings.MarginRight) * 96.0;
                var maxHeight = (pageSettings.PaperHeight - pageSettings.MarginTop - pageSettings.MarginBottom) * 96.0;

                var htmlChanged = false;
                var config      = Configuration.Default.WithCss();
                var context     = BrowsingContext.New(config);

                IDocument document;

                try
                {
                    // ReSharper disable AccessToDisposedClosure
                    document = inputUri.Encoding != null
                        ? context.OpenAsync(m =>
                                            m.Content(webpage).Header("Content-Type",
                                                                      $"text/html; charset={inputUri.Encoding.WebName}"))
                               .Result
                        : context.OpenAsync(m => m.Content(webpage)).Result;

                    // ReSharper restore AccessToDisposedClosure
                }
                catch (Exception exception)
                {
                    // When the page cannot be parsed it is left untouched
                    WriteToLog($"Exception occured in AngleSharp: {ExceptionHelpers.GetInnerException(exception)}");
                    return true;
                }

                if (sanitizeHtml)
                {
                    WriteToLog("Sanitizing HTML");
                    new HtmlSanitizer().DoSanitize(document as IHtmlDocument, document.DocumentElement);
                    htmlChanged = true;
                    WriteToLog("HTML sanitized");
                }

                WriteToLog("Validating all images if they need to be rotated and if they fit the page");

                // Images that still point to their original source; these are copied to a local
                // temp file further on when a changed webpage has to be written
                var unchangedImages = new List<IHtmlImageElement>();

                // ReSharper disable once PossibleInvalidCastExceptionInForeachLoop
                foreach (var htmlImage in document.Images)
                {
                    var imageChanged = false;

                    if (string.IsNullOrWhiteSpace(htmlImage.Source))
                    {
                        WriteToLog($"HTML image tag '{htmlImage.TagName}' has no image source '{htmlImage.Source}'");
                        continue;
                    }

                    Image image = null;

                    try
                    {
                        // The local width and height attributes always go before css width and height
                        var width  = htmlImage.DisplayWidth;
                        var height = htmlImage.DisplayHeight;

                        if (rotate)
                        {
                            image = GetImage(htmlImage.Source, localDirectory);

                            if (image == null)
                            {
                                continue;
                            }

                            if (RotateImageByExifOrientationData(image))
                            {
                                // A temp file is only requested when there really is something to save
                                var source = htmlImage.Source.Contains("?")
                                    ? htmlImage.Source.Split('?')[0]
                                    : htmlImage.Source;

                                var extension = Path.GetExtension(FileManager.RemoveInvalidFileNameChars(source));
                                var fileName  = GetTempFile(extension);

                                image.Save(fileName);
                                WriteToLog($"Image rotated and saved to location '{fileName}'");

                                htmlImage.DisplayWidth  = image.Width;
                                htmlImage.DisplayHeight = image.Height;
                                htmlImage.SetStyle(string.Empty);
                                htmlImage.Source = new Uri(fileName).ToString();
                                htmlChanged      = true;
                                imageChanged     = true;
                            }

                            width  = image.Width;
                            height = image.Height;
                        }

                        if (resize)
                        {
                            if (height == 0 && width == 0)
                            {
                                var style = context.Current.GetComputedStyle(htmlImage);
                                if (style != null)
                                {
                                    width  = ParseValue(style.GetPropertyValue("width"));
                                    height = ParseValue(style.GetPropertyValue("height"));
                                }
                            }

                            // If we don't know the image size then get it from the image itself
                            if (width <= 0 || height <= 0)
                            {
                                if (image == null)
                                {
                                    image = GetImage(htmlImage.Source, localDirectory);
                                }

                                if (image == null)
                                {
                                    continue;
                                }

                                width  = image.Width;
                                height = image.Height;
                            }

                            if (width > maxWidth || height > maxHeight)
                            {
                                // If we did not load the image already then load it
                                if (image == null)
                                {
                                    image = GetImage(htmlImage.Source, localDirectory);
                                }

                                if (image == null)
                                {
                                    continue;
                                }

                                // Only the display size in the HTML is changed, the image data itself is left as is
                                ScaleImage(image, (int)maxWidth, out var newWidth, out var newHeight);
                                WriteToLog($"Image rescaled to width {newWidth} and height {newHeight}");
                                htmlImage.DisplayWidth  = newWidth;
                                htmlImage.DisplayHeight = newHeight;
                                htmlImage.SetStyle(string.Empty);
                                htmlChanged = true;
                            }
                        }
                    }
                    finally
                    {
                        image?.Dispose();
                    }

                    if (!imageChanged)
                    {
                        unchangedImages.Add(htmlImage);
                    }
                }

                if (!htmlChanged)
                {
                    return true;
                }

                // The changed webpage is written to a temp file, so every image that still has its
                // original source is saved to a temp file and referenced from there
                foreach (var unchangedImage in unchangedImages)
                {
                    using (var image = GetImage(unchangedImage.Source, localDirectory))
                    {
                        if (image == null)
                        {
                            WriteToLog($"Could not load unchanged image from location '{unchangedImage.Source}'");
                            continue;
                        }

                        var extension = Path.GetExtension(unchangedImage.Source.Contains("?")
                            ? unchangedImage.Source.Split('?')[0]
                            : unchangedImage.Source);
                        var fileName = GetTempFile(extension);

                        WriteToLog($"Unchanged image saved to location '{fileName}'");
                        image.Save(fileName);
                        unchangedImage.Source = new Uri(fileName).ToString();
                    }
                }

                var outputFile = GetTempFile(".htm");
                outputUri = new ConvertUri(outputFile, inputUri.Encoding);

                try
                {
                    using (var fileStream = new FileStream(outputFile, FileMode.CreateNew, FileAccess.Write))
                    using (var textWriter = inputUri.Encoding != null
                        ? new StreamWriter(fileStream, inputUri.Encoding)
                        : new StreamWriter(fileStream))
                    {
                        document.ToHtml(textWriter, new HtmlMarkupFormatter());
                    }

                    return false;
                }
                catch (Exception exception)
                {
                    WriteToLog($"Could not generate new html file '{outputFile}', error: {ExceptionHelpers.GetInnerException(exception)}");

                    // The changed webpage was not written, so do not hand out a uri to it
                    outputUri = null;
                    return true;
                }
            }
        }
Exemplo n.º 2
0
        /// <summary>
        /// Returns the <see cref="Image"/> for the given <paramref name="imageSource"/>
        /// </summary>
        /// <param name="imageSource">A base64 encoded <c>data:</c> uri, the uri of a local file or
        ///     a http(s) uri</param>
        /// <param name="localDirectory">The directory that is used to look up an image that is not found at
        ///     its own location, or <c>null</c> when there is no such directory</param>
        /// <returns>The image (the caller has to dispose it) or <c>null</c> when the image could not be
        ///     loaded; the reason is written to the log</returns>
        private Image GetImage(string imageSource, string localDirectory)
        {
            if (imageSource.StartsWith("data:", StringComparison.OrdinalIgnoreCase))
            {
                WriteToLog("Decoding image from base64 string");

                try
                {
                    var base64Data = Regex.Match(imageSource, @"data:image/(?<type>.+?),(?<data>.+)").Groups["data"].Value;
                    var binaryData = Convert.FromBase64String(base64Data);

                    // Image.FromStream requires the stream to stay open for the lifetime of the image,
                    // so the memory stream is deliberately not disposed here
                    var image = Image.FromStream(new MemoryStream(binaryData));
                    WriteToLog("Image decoded");
                    return image;
                }
                catch (Exception exception)
                {
                    WriteToLog($"Error decoding image: {ExceptionHelpers.GetInnerException(exception)}");
                    return null;
                }
            }

            try
            {
                WriteToLog($"Getting image from uri '{imageSource}'");

                var imageUri = new Uri(imageSource);

                if (imageUri.IsLoopback || imageUri.IsFile)
                {
                    // LocalPath is the unescaped file system path, so file uris with for example
                    // %20 in them are also found
                    var fileName = imageUri.IsFile ? imageUri.LocalPath : imageUri.OriginalString;

                    // Without a local directory there is nothing to combine with; in that case
                    // fall through so that a http(s) loopback uri is still downloaded
                    if (!File.Exists(fileName) && localDirectory != null)
                    {
                        fileName = Path.Combine(localDirectory,
                                                Uri.UnescapeDataString(imageUri.AbsolutePath).Trim('/'));
                    }

                    if (File.Exists(fileName))
                    {
                        // The file is read into memory so that no file handle is kept open while
                        // the stream still lives as long as the image needs it
                        var fileStream = new MemoryStream(File.ReadAllBytes(fileName));
                        return Image.FromStream(fileStream, true, false);
                    }
                }

                switch (imageUri.Scheme)
                {
                case "https":
                case "http":
                    using (var webStream = WebClient.OpenReadTaskAsync(imageUri).Timeout(_timeout).GetAwaiter().GetResult())
                    {
                        if (webStream != null)
                        {
                            // Copy the download so that the image does not depend on the web stream
                            // that is disposed when leaving this block
                            var imageStream = new MemoryStream();
                            webStream.CopyTo(imageStream);
                            imageStream.Position = 0;
                            return Image.FromStream(imageStream, true, false);
                        }
                    }
                    break;

                default:
                    WriteToLog($"Unsupported scheme {imageUri.Scheme} to get image");
                    return null;
                }
            }
            catch (Exception exception)
            {
                WriteToLog("Getting image failed with exception: " + ExceptionHelpers.GetInnerException(exception));
            }

            return null;
        }
Exemplo n.º 3
0
        /// <summary>
        /// Adds style and script elements to the webpage so that the size of the printed page follows
        /// the size of the content, and writes the result to a new local file.
        /// </summary>
        /// <param name="inputUri">The uri of the webpage</param>
        /// <param name="outputUri">The uri of the changed webpage when this method returns <c>true</c>,
        ///     otherwise <c>null</c></param>
        /// <returns>Returns <c>true</c> when the changed webpage was written to <paramref name="outputUri"/>
        ///     and <c>false</c> when the webpage could not be parsed, changed or written</returns>
        public bool FitPageToContent(ConvertUri inputUri, out ConvertUri outputUri)
        {
            outputUri = null;

            using (var webpage = inputUri.IsFile
                ? File.OpenRead(inputUri.OriginalString)
                : DownloadStream(inputUri))
            {
                var config  = Configuration.Default.WithCss();
                var context = BrowsingContext.New(config);

                IDocument document;

                try
                {
                    // ReSharper disable AccessToDisposedClosure
                    document = inputUri.Encoding != null
                        ? context.OpenAsync(m => m.Content(webpage).Header("Content-Type", $"text/html; charset={inputUri.Encoding.WebName}").Address(inputUri.ToString())).Result
                        : context.OpenAsync(m => m.Content(webpage).Address(inputUri.ToString())).Result;

                    // ReSharper restore AccessToDisposedClosure

                    // Let html and body shrink to their content and remove margin and padding
                    var styleElement = new HtmlElement(document as Document, "style")
                    {
                        InnerHtml = "html, body " + Environment.NewLine +
                                    "{" + Environment.NewLine +
                                    "   width: fit-content;" + Environment.NewLine +
                                    "   height: fit-content;" + Environment.NewLine +
                                    "   margin: 0px;" + Environment.NewLine +
                                    "   padding: 0px;" + Environment.NewLine +
                                    "}" + Environment.NewLine
                    };

                    document.Head.AppendElement(styleElement);

                    // Initial page size; the script below replaces the content of this element
                    // (it is looked up by its id) once the page has been loaded
                    var pageStyleElement = new HtmlElement(document as Document, "style")
                    {
                        Id        = "pagestyle",
                        InnerHtml = "@page " + Environment.NewLine +
                                    "{ " + Environment.NewLine +
                                    "   size: 595px 842px ; " + Environment.NewLine +
                                    "   margin: 0px " + Environment.NewLine +
                                    "}" + Environment.NewLine
                    };

                    document.Head.AppendElement(pageStyleElement);

                    // On load: take the computed width and height (+ 10 pixels) of the html element
                    // and use that as the new @page size
                    var pageElement = new HtmlElement(document as Document, "script")
                    {
                        InnerHtml = "window.onload = function () {" + Environment.NewLine +
                                    "" + Environment.NewLine +
                                    "   var page = document.getElementsByTagName('html')[0];" + Environment.NewLine +
                                    "   var pageInfo = window.getComputedStyle(page);" + Environment.NewLine +
                                    "" + Environment.NewLine +
                                    "    var height = parseInt(pageInfo.height) + 10 + 'px';" +
                                    Environment.NewLine +
                                    "" + Environment.NewLine +
                                    "    var pageCss = '@page { size: ' + pageInfo.width + ' ' + height + '; margin: 0; }'" +
                                    Environment.NewLine +
                                    "    document.getElementById('pagestyle').innerHTML = pageCss;" + Environment.NewLine +
                                    "}" + Environment.NewLine
                    };

                    document.Body.AppendElement(pageElement);
                }
                catch (Exception exception)
                {
                    WriteToLog($"Exception occured in AngleSharp: {ExceptionHelpers.GetInnerException(exception)}");
                    return false;
                }

                var outputFile = GetTempFile(".htm");
                outputUri = new ConvertUri(outputFile, inputUri.Encoding);

                try
                {
                    WriteToLog($"Writing changed webpage to '{outputFile}'");

                    using (var fileStream = new FileStream(outputFile, FileMode.CreateNew, FileAccess.Write))
                    using (var textWriter = inputUri.Encoding != null
                        ? new StreamWriter(fileStream, inputUri.Encoding)
                        : new StreamWriter(fileStream))
                    {
                        document.ToHtml(textWriter, new HtmlMarkupFormatter());
                    }

                    WriteToLog("Changed webpage written");
                    return true;
                }
                catch (Exception exception)
                {
                    WriteToLog($"Could not write new html file '{outputFile}', error: {ExceptionHelpers.GetInnerException(exception)}");

                    // The changed webpage was not written, so do not hand out a uri to it
                    outputUri = null;
                    return false;
                }
            }
        }
Exemplo n.º 4
0
        /// <summary>
        /// Validates all images if they are rotated correctly (when <paramref name="rotate"/> is set
        /// to <c>true</c>) and fit on the given <paramref name="pageSettings"/> (when
        /// <paramref name="resize"/> is set to <c>true</c>).
        /// If an image does need to be rotated or does not fit then the changed image is saved to a
        /// local temp file and a local copy is made of the <paramref name="inputUri"/> file.
        /// </summary>
        /// <param name="inputUri">The uri of the webpage</param>
        /// <param name="resize">When set to <c>true</c> then an image is resized when needed</param>
        /// <param name="rotate">When set to <c>true</c> then the EXIF information of an
        /// image is read and when needed the image is automatically rotated</param>
        /// <param name="pageSettings"><see cref="PageSettings"/></param>
        /// <param name="outputUri">The uri of the changed webpage when this method returns <c>false</c>,
        ///     otherwise <c>null</c></param>
        /// <returns>Returns <c>false</c> when at least one image had to be changed and the changed webpage
        ///     was written to <paramref name="outputUri"/>. Returns <c>true</c> when the webpage can be used
        ///     as is; this is also returned when the changed webpage could not be written</returns>
        /// <exception cref="WebException">Raised when the webpage from <paramref name="inputUri"/> could not be downloaded</exception>
        public bool ValidateImages(ConvertUri inputUri,
                                   bool resize,
                                   bool rotate,
                                   PageSettings pageSettings,
                                   out ConvertUri outputUri)
        {
            WriteToLog("Validating all images if they need to be rotated and if they fit the page");
            outputUri = null;

            string localDirectory = null;

            if (inputUri.IsFile)
            {
                localDirectory = Path.GetDirectoryName(inputUri.OriginalString);
            }

            var webpage = inputUri.IsFile
                ? inputUri.Encoding != null
                    ? File.ReadAllText(inputUri.OriginalString, inputUri.Encoding)
                    : File.ReadAllText(inputUri.OriginalString)
                : DownloadString(inputUri);

            // NOTE(review): the factor 96 looks like an inch to pixel conversion - confirm the unit of PageSettings
            var maxWidth  = pageSettings.PaperWidth * 96.0;
            var maxHeight = pageSettings.PaperHeight * 96.0;

            var changed = false;
            var config  = Configuration.Default.WithCss();
            var context = BrowsingContext.New(config);

            var document = inputUri.Encoding != null
                ? context.OpenAsync(m => m.Content(webpage).Header("Content-Type", $"text/html; charset={inputUri.Encoding.WebName}")).Result
                : context.OpenAsync(m => m.Content(webpage)).Result;

            // ReSharper disable once PossibleInvalidCastExceptionInForeachLoop
            foreach (var htmlImage in document.Images)
            {
                if (string.IsNullOrWhiteSpace(htmlImage.Source))
                {
                    WriteToLog($"HTML image tag '{htmlImage.TagName}' has no image source '{htmlImage.Source}'");
                    continue;
                }

                Image image = null;

                // The image as it was before scaling; kept so that it can be disposed when
                // ScaleImage hands back a new instance
                Image unscaledImage = null;

                try
                {
                    // The local width and height attributes always go before css width and height
                    var width  = htmlImage.DisplayWidth;
                    var height = htmlImage.DisplayHeight;

                    var rotated = false;
                    var scaled  = false;

                    if (rotate)
                    {
                        image = GetImage(new Uri(htmlImage.Source), localDirectory);

                        if (image == null)
                        {
                            continue;
                        }

                        rotated = RotateImageByExifOrientationData(image);
                        width   = image.Width;
                        height  = image.Height;
                    }

                    if (resize)
                    {
                        if (height == 0 && width == 0)
                        {
                            var style = context.Current.GetComputedStyle(htmlImage);
                            if (style != null)
                            {
                                width  = ParseValue(style.Width);
                                height = ParseValue(style.Height);
                            }
                        }

                        // If we don't know the image size then get it from the image itself
                        if (width <= 0 || height <= 0)
                        {
                            if (image == null)
                            {
                                image = GetImage(new Uri(htmlImage.Source), localDirectory);
                            }

                            if (image == null)
                            {
                                continue;
                            }

                            width  = image.Width;
                            height = image.Height;
                        }

                        if (width > maxWidth || height > maxHeight)
                        {
                            // If we did not load the image already then load it
                            if (image == null)
                            {
                                image = GetImage(new Uri(htmlImage.Source), localDirectory);
                            }

                            if (image == null)
                            {
                                continue;
                            }

                            unscaledImage = image;
                            image         = ScaleImage(unscaledImage, (int)maxWidth);
                            WriteToLog($"Image resized to width {image.Width} and height {image.Height}");
                            scaled = true;
                        }
                    }

                    if (!rotated && !scaled)
                    {
                        continue;
                    }

                    // A rotated image has to be saved just like a resized one, otherwise the HTML
                    // would get the rotated size while it still points to the not rotated image
                    var extension = Path.GetExtension(htmlImage.Source.Contains("?")
                        ? htmlImage.Source.Split('?')[0]
                        : htmlImage.Source);

                    var fileName = GetTempFile(extension);

                    image.Save(fileName);

                    if (rotated)
                    {
                        WriteToLog($"Image rotated and saved to location '{fileName}'");
                    }

                    htmlImage.DisplayWidth  = image.Width;
                    htmlImage.DisplayHeight = image.Height;
                    htmlImage.Source        = new Uri(fileName).ToString();
                    changed = true;
                }
                finally
                {
                    if (!ReferenceEquals(unscaledImage, image))
                    {
                        unscaledImage?.Dispose();
                    }

                    image?.Dispose();
                }
            }

            if (!changed)
            {
                return true;
            }

            var outputFile = GetTempFile(".htm");

            outputUri = new ConvertUri(outputFile, inputUri.Encoding);

            try
            {
                using (var fileStream = new FileStream(outputFile, FileMode.CreateNew, FileAccess.Write))
                using (var textWriter = inputUri.Encoding != null
                    ? new StreamWriter(fileStream, inputUri.Encoding)
                    : new StreamWriter(fileStream))
                {
                    document.ToHtml(textWriter, new AutoSelectedMarkupFormatter());
                }

                return false;
            }
            catch (Exception exception)
            {
                WriteToLog($"Could not generate new html file '{outputFile}', error: {ExceptionHelpers.GetInnerException(exception)}");

                // The changed webpage was not written, so do not hand out a uri to it
                outputUri = null;
                return true;
            }
        }
Exemplo n.º 5
0
        /// <summary>
        /// Sanitizes the HTML with the given <paramref name="sanitizer"/> and, when the sanitizer reported
        /// that it changed or removed something, writes the sanitized webpage to a new local file
        /// </summary>
        /// <param name="inputUri">The uri of the webpage</param>
        /// <param name="sanitizer"><see cref="HtmlSanitizer"/>, when <c>null</c> a new
        ///     <see cref="HtmlSanitizer"/> is created</param>
        /// <param name="outputUri">The uri of the sanitized webpage when this method returns <c>true</c>,
        ///     otherwise <c>null</c></param>
        /// <returns>Returns <c>true</c> when the webpage was changed by the sanitizer and written to
        ///     <paramref name="outputUri"/>. Returns <c>false</c> when nothing was changed or when the
        ///     webpage could not be parsed or written</returns>
        public bool SanitizeHtml(
            ConvertUri inputUri,
            HtmlSanitizer sanitizer,
            out ConvertUri outputUri)
        {
            outputUri = null;

            using (var webpage = inputUri.IsFile
                ? File.OpenRead(inputUri.OriginalString)
                : DownloadStream(inputUri))
            {
                var htmlChanged = false;
                var config      = Configuration.Default.WithCss();
                var context     = BrowsingContext.New(config);

                IDocument document;

                try
                {
                    // ReSharper disable AccessToDisposedClosure
                    document = inputUri.Encoding != null
                        ? context.OpenAsync(m => m.Content(webpage).Header("Content-Type", $"text/html; charset={inputUri.Encoding.WebName}").Address(inputUri.ToString())).Result
                        : context.OpenAsync(m => m.Content(webpage).Address(inputUri.ToString())).Result;

                    // ReSharper restore AccessToDisposedClosure
                }
                catch (Exception exception)
                {
                    WriteToLog($"Exception occured in AngleSharp: {ExceptionHelpers.GetInnerException(exception)}");
                    return false;
                }

                WriteToLog("Sanitizing HTML");

                if (sanitizer == null)
                {
                    sanitizer = new HtmlSanitizer();
                }

                // The handlers are local functions so that the very same handlers can be removed again;
                // the sanitizer may be supplied (and reused) by the caller and would otherwise collect
                // a new set of handlers on every call
                void OnFilterUrl(object sender, FilterUrlEventArgs args)
                {
                    if (args.OriginalUrl != args.SanitizedUrl)
                    {
                        WriteToLog($"URL sanitized from '{args.OriginalUrl}' to '{args.SanitizedUrl}'");
                        htmlChanged = true;
                    }
                }

                void OnRemovingAtRule(object sender, RemovingAtRuleEventArgs args)
                {
                    WriteToLog($"Removing CSS at-rule '{args.Rule.CssText}' from tag '{args.Tag.TagName}'");
                    htmlChanged = true;
                }

                void OnRemovingAttribute(object sender, RemovingAttributeEventArgs args)
                {
                    WriteToLog(
                        $"Removing attribute '{args.Attribute.Name}' from tag '{args.Tag.TagName}', reason '{args.Reason}'");
                    htmlChanged = true;
                }

                void OnRemovingComment(object sender, RemovingCommentEventArgs args)
                {
                    WriteToLog($"Removing comment '{args.Comment.TextContent}'");
                    htmlChanged = true;
                }

                void OnRemovingCssClass(object sender, RemovingCssClassEventArgs args)
                {
                    WriteToLog(
                        $"Removing CSS class '{args.CssClass}' from tag '{args.Tag.TagName}', reason '{args.Reason}'");
                    htmlChanged = true;
                }

                void OnRemovingStyle(object sender, RemovingStyleEventArgs args)
                {
                    WriteToLog(
                        $"Removing style '{args.Style.Name}' from tag '{args.Tag.TagName}', reason '{args.Reason}'");
                    htmlChanged = true;
                }

                void OnRemovingTag(object sender, RemovingTagEventArgs args)
                {
                    WriteToLog($"Removing tag '{args.Tag.TagName}', reason '{args.Reason}'");
                    htmlChanged = true;
                }

                sanitizer.FilterUrl         += OnFilterUrl;
                sanitizer.RemovingAtRule    += OnRemovingAtRule;
                sanitizer.RemovingAttribute += OnRemovingAttribute;
                sanitizer.RemovingComment   += OnRemovingComment;
                sanitizer.RemovingCssClass  += OnRemovingCssClass;
                sanitizer.RemovingStyle     += OnRemovingStyle;
                sanitizer.RemovingTag       += OnRemovingTag;

                try
                {
                    sanitizer.SanitizeDom(document as IHtmlDocument);
                }
                finally
                {
                    sanitizer.FilterUrl         -= OnFilterUrl;
                    sanitizer.RemovingAtRule    -= OnRemovingAtRule;
                    sanitizer.RemovingAttribute -= OnRemovingAttribute;
                    sanitizer.RemovingComment   -= OnRemovingComment;
                    sanitizer.RemovingCssClass  -= OnRemovingCssClass;
                    sanitizer.RemovingStyle     -= OnRemovingStyle;
                    sanitizer.RemovingTag       -= OnRemovingTag;
                }

                WriteToLog("HTML sanitized");

                if (!htmlChanged)
                {
                    return false;
                }

                var sanitizedOutputFile = GetTempFile(".htm");
                outputUri = new ConvertUri(sanitizedOutputFile, inputUri.Encoding);

                try
                {
                    WriteToLog($"Writing sanitized webpage to '{sanitizedOutputFile}'");

                    using (var fileStream =
                               new FileStream(sanitizedOutputFile, FileMode.CreateNew, FileAccess.Write))
                    using (var textWriter = inputUri.Encoding != null
                        ? new StreamWriter(fileStream, inputUri.Encoding)
                        : new StreamWriter(fileStream))
                    {
                        document.ToHtml(textWriter, new HtmlMarkupFormatter());
                    }

                    WriteToLog("Sanitized webpage written");
                    return true;
                }
                catch (Exception exception)
                {
                    WriteToLog($"Could not write new html file '{sanitizedOutputFile}', error: {ExceptionHelpers.GetInnerException(exception)}");

                    // The sanitized webpage was not written, so do not hand out a uri to it
                    outputUri = null;
                    return false;
                }
            }
        }
Exemplo n.º 6
0
        /// <summary>
        /// Optionally sanitizes the webpage and validates all images: they are rotated according to
        /// their EXIF orientation (when <paramref name="rotate"/> is set to <c>true</c>) and scaled down
        /// when they do not fit on the given <paramref name="pageSettings"/> (when
        /// <paramref name="resize"/> is set to <c>true</c>).
        /// When anything had to be changed a new local copy of the webpage is written.
        /// </summary>
        /// <param name="inputUri">The uri of the webpage</param>
        /// <param name="sanitize">When set to <c>true</c> then the HTML is sanitized before the images are processed</param>
        /// <param name="resize">When set to <c>true</c> then an image is resized when needed</param>
        /// <param name="rotate">When set to <c>true</c> then the EXIF information of an
        ///     image is read and when needed the image is automatically rotated</param>
        /// <param name="pageSettings"><see cref="PageSettings"/></param>
        /// <param name="outputUri">The uri of the rewritten webpage when this method returns <c>false</c>,
        ///     otherwise <c>null</c></param>
        /// <returns>Returns <c>false</c> when the webpage had to be changed and the new version has been written
        ///     to <paramref name="outputUri"/>. Returns <c>true</c> when nothing had to be changed or when
        ///     the new webpage could not be written</returns>
        /// <exception cref="WebException">Raised when the webpage from <paramref name="inputUri"/> could not be downloaded</exception>
        public bool Cleanup(ConvertUri inputUri,
                            bool sanitize,
                            bool resize,
                            bool rotate,
                            PageSettings pageSettings,
                            out ConvertUri outputUri)
        {
            outputUri = null;

            string localDirectory = null;

            if (inputUri.IsFile)
            {
                localDirectory = Path.GetDirectoryName(inputUri.OriginalString);
            }

            var webpage = inputUri.IsFile
                ? inputUri.Encoding != null
                    ? File.ReadAllText(inputUri.OriginalString, inputUri.Encoding)
                    : File.ReadAllText(inputUri.OriginalString)
                : DownloadString(inputUri);

            // Becomes true as soon as the HTML differs from the input and has to be written out again
            var changed = false;

            if (sanitize)
            {
                var sanitizer = new HtmlSanitizer();
                sanitizer.AllowedSchemes.Add("mailto");
                sanitizer.AllowedTags.Add("html");
                sanitizer.AllowedTags.Add("head");
                sanitizer.AllowedAttributes.Add("http-equiv");
                sanitizer.AllowedAttributes.Add("content");
                sanitizer.AllowedTags.Add("body");
                sanitizer.AllowedTags.Add("meta");
                sanitizer.AllowedAttributes.Add("class");
                sanitizer.AllowDataAttributes = true;

                var sanitizedWebPage = sanitizer.Sanitize(webpage, string.Empty, new AutoSelectedMarkupFormatter());
                if (webpage != sanitizedWebPage)
                {
                    changed = true;
                    webpage = sanitizedWebPage;
                    WriteToLog("Webpage sanitized");
                }
            }

            // NOTE(review): presumably the paper size is in inches and 96 is the assumed DPI; the page
            // margins are not subtracted here - confirm that this is intended
            var maxWidth  = pageSettings.PaperWidth * 96.0;
            var maxHeight = pageSettings.PaperHeight * 96.0;

            var config  = Configuration.Default.WithCss();
            var context = BrowsingContext.New(config);

            var document = inputUri.Encoding != null
                ? context.OpenAsync(m => m.Content(webpage).Header("Content-Type", $"text/html; charset={inputUri.Encoding.WebName}")).Result
                : context.OpenAsync(m => m.Content(webpage)).Result;

            // Loads an image either from an embedded data uri or from its file/web location
            Func<string, Image> loadImage = source =>
                source.StartsWith("data:", StringComparison.InvariantCultureIgnoreCase)
                    ? GetImageFromBase64(source)
                    : GetImage(new Uri(source), localDirectory);

            var unchangedImages = new List<IHtmlImageElement>();

            // ReSharper disable once PossibleInvalidCastExceptionInForeachLoop
            foreach (var htmlImage in document.Images)
            {
                if (string.IsNullOrWhiteSpace(htmlImage.Source))
                {
                    WriteToLog($"HTML image tag '{htmlImage.TagName}' has no image source '{htmlImage.Source}'");
                    continue;
                }

                Image image = null;

                // Tracked per image so that one changed image does not hide the unchanged ones
                var imageChanged = false;

                var extension = Path.GetExtension(htmlImage.Source.Contains("?")
                    ? htmlImage.Source.Split('?')[0]
                    : htmlImage.Source);

                var fileName = GetTempFile(extension);

                try
                {
                    // The local width and height attributes always go before css width and height
                    var width  = htmlImage.DisplayWidth;
                    var height = htmlImage.DisplayHeight;

                    if (rotate)
                    {
                        image = loadImage(htmlImage.Source);

                        if (image == null)
                        {
                            continue;
                        }

                        if (RotateImageByExifOrientationData(image))
                        {
                            WriteToLog($"Image rotated and saved to location '{fileName}'");
                            imageChanged = true;
                        }

                        width  = image.Width;
                        height = image.Height;
                    }

                    if (resize)
                    {
                        if (height == 0 && width == 0)
                        {
                            var style = context.Current.GetComputedStyle(htmlImage);
                            if (style != null)
                            {
                                width  = ParseValue(style.GetPropertyValue("width"));
                                height = ParseValue(style.GetPropertyValue("height"));
                            }
                        }

                        // If we don't know the image size then get it from the image itself
                        if (width <= 0 || height <= 0)
                        {
                            if (image == null)
                            {
                                image = loadImage(htmlImage.Source);
                            }

                            if (image == null)
                            {
                                continue;
                            }

                            width  = image.Width;
                            height = image.Height;
                        }

                        if (width > maxWidth || height > maxHeight)
                        {
                            // If we did not load the image already then load it
                            if (image == null)
                            {
                                image = loadImage(htmlImage.Source);
                            }

                            if (image == null)
                            {
                                continue;
                            }

                            var scaledImage = ScaleImage(image, (int)maxWidth);

                            // ScaleImage presumably returns a new instance; release the original so that
                            // only the image that is still referenced gets disposed in the finally block
                            if (!ReferenceEquals(scaledImage, image))
                            {
                                image.Dispose();
                                image = scaledImage;
                            }

                            WriteToLog($"Image resized to width {image.Width} and height {image.Height} and saved to location '{fileName}'");
                            imageChanged = true;
                        }
                    }

                    // Save once, after all transformations, so that a rotated image that already fits
                    // the page is not lost and an untouched image is not needlessly copied
                    if (imageChanged)
                    {
                        image.Save(fileName);
                        htmlImage.DisplayWidth  = image.Width;
                        htmlImage.DisplayHeight = image.Height;
                        htmlImage.Source        = new Uri(fileName).ToString();
                        changed = true;
                    }
                }
                finally
                {
                    image?.Dispose();
                }

                if (!imageChanged)
                {
                    unchangedImages.Add(htmlImage);
                }
            }

            if (!changed)
            {
                return true;
            }

            // The webpage is going to be written to a new location, so the images that were left
            // untouched need a source that still resolves from there
            foreach (var unchangedImage in unchangedImages)
            {
                // Embedded images travel with the document and need no local copy
                if (unchangedImage.Source.StartsWith("data:", StringComparison.InvariantCultureIgnoreCase))
                {
                    continue;
                }

                var imageSource = new Uri(unchangedImage.Source);
                using (var image = GetImage(imageSource, localDirectory))
                {
                    if (localDirectory != null)
                    {
                        var fileName = Path.Combine(localDirectory, Path.GetFileName(imageSource.ToString()));
                        unchangedImage.Source = new Uri(fileName).ToString();
                    }
                    else
                    {
                        if (image == null)
                        {
                            WriteToLog($"Could not load unchanged image from location '{unchangedImage.Source}'");
                            continue;
                        }

                        var extension = Path.GetExtension(unchangedImage.Source.Contains("?")
                            ? unchangedImage.Source.Split('?')[0]
                            : unchangedImage.Source);
                        var fileName = GetTempFile(extension);

                        WriteToLog($"Unchanged image saved to location '{fileName}'");
                        image.Save(fileName);
                        unchangedImage.Source = new Uri(fileName).ToString();
                    }
                }
            }

            var outputFile = GetTempFile(".htm");

            outputUri = new ConvertUri(outputFile, inputUri.Encoding);

            try
            {
                using (var fileStream = new FileStream(outputFile, FileMode.CreateNew, FileAccess.Write))
                using (var textWriter = inputUri.Encoding != null
                    ? new StreamWriter(fileStream, inputUri.Encoding)
                    : new StreamWriter(fileStream))
                {
                    document.ToHtml(textWriter, new AutoSelectedMarkupFormatter());
                }

                return false;
            }
            catch (Exception exception)
            {
                WriteToLog($"Could not generate new html file '{outputFile}', error: {ExceptionHelpers.GetInnerException(exception)}");

                // Nothing usable has been written, so do not hand out a uri to a broken file
                outputUri = null;
                return true;
            }
        }
Exemplo n.º 7
0
        /// <summary>
        /// Returns the <see cref="Image"/> for the given <paramref name="imageSource"/>
        /// </summary>
        /// <param name="imageSource">A base64 encoded <c>data:image/...</c> uri, a file uri or a http(s) url</param>
        /// <param name="localDirectory">The directory to search when a file image does not exist at
        ///     its own location, or <c>null</c> when there is no such directory</param>
        /// <returns>The loaded <see cref="Image"/> or <c>null</c> when the image could not be loaded.
        ///     The caller has to dispose the returned image</returns>
        private Image GetImage(string imageSource, string localDirectory)
        {
            if (imageSource.StartsWith("data:", StringComparison.InvariantCultureIgnoreCase))
            {
                WriteToLog("Decoding image from base64 string");

                try
                {
                    // Match case-insensitively (like the prefix check above) and across line breaks
                    var base64Data = Regex.Match(
                        imageSource,
                        @"data:image/(?<type>.+?),(?<data>.+)",
                        RegexOptions.IgnoreCase | RegexOptions.Singleline).Groups["data"].Value;
                    var binaryData = Convert.FromBase64String(base64Data);

                    // GDI+ needs the stream for the whole lifetime of the image, so the memory
                    // stream is deliberately not disposed here (it holds no unmanaged resources)
                    var image = Image.FromStream(new MemoryStream(binaryData));
                    WriteToLog("Image decoded");
                    return image;
                }
                catch (Exception exception)
                {
                    WriteToLog($"Error decoding image: {ExceptionHelpers.GetInnerException(exception)}");
                    return null;
                }
            }

            try
            {
                WriteToLog($"Getting image from url '{imageSource}'");

                var imageUri = new Uri(imageSource);

                if (imageUri.IsFile)
                {
                    var fileName = imageUri.LocalPath;

                    // Path.Combine throws on a null directory, so only fall back when there is one
                    if (!File.Exists(fileName) && localDirectory != null)
                    {
                        fileName = Path.Combine(localDirectory, Path.GetFileName(imageUri.LocalPath));
                    }

                    if (File.Exists(fileName))
                    {
                        // Copy the file into memory so that the file handle can be released right
                        // away while the image keeps a stream that stays open
                        using (var fileStream = OpenFileStream(fileName))
                        {
                            var fileBuffer = new MemoryStream();
                            fileStream.CopyTo(fileBuffer);
                            fileBuffer.Position = 0;
                            return Image.FromStream(fileBuffer, true, false);
                        }
                    }
                }

                switch (imageUri.Scheme)
                {
                    case "https":
                    case "http":
                        using (var webStream = OpenDownloadStream(imageUri, true))
                        {
                            if (webStream != null)
                            {
                                // The web stream is disposed when leaving this block, so the image
                                // is created from an in-memory copy that outlives it
                                var webBuffer = new MemoryStream();
                                webStream.CopyTo(webBuffer);
                                webBuffer.Position = 0;
                                return Image.FromStream(webBuffer, true, false);
                            }
                        }
                        break;

                    case "file":
                        WriteToLog("Ignoring local file");
                        break;

                    default:
                        WriteToLog($"Unsupported scheme {imageUri.Scheme} to get image");
                        return null;
                }
            }
            catch (Exception exception)
            {
                WriteToLog($"Getting image failed with exception: {ExceptionHelpers.GetInnerException(exception)}");
            }

            return null;
        }
Exemplo n.º 8
0
        /// <summary>
        /// Validates all images if they are rotated correctly (when <paramref name="rotate"/> is set
        /// to <c>true</c>) and fit on the given <paramref name="pageSettings"/> (when
        /// <paramref name="resize"/> is set to <c>true</c>).
        /// If an image does need to be rotated or does not fit then a local copy is made of
        /// the <paramref name="inputUri"/> file.
        /// </summary>
        /// <param name="inputUri">The uri of the webpage</param>
        /// <param name="resize">When set to <c>true</c> then an image is resized when needed</param>
        /// <param name="rotate">When set to <c>true</c> then the EXIF information of an
        ///     image is read and when needed the image is automatically rotated</param>
        /// <param name="pageSettings"><see cref="PageSettings"/></param>
        /// <param name="outputUri">The outputUri when this method returns <c>true</c> otherwise
        ///     <c>null</c> is returned</param>
        /// <param name="safeUrls">A list with URL's that are safe to load; the locations of all files
        ///     written by this method are added to it. A new list is created when <c>null</c> is passed</param>
        /// <param name="urlBlacklist">A list of URL's that need to be blocked (use * as a wildcard)</param>
        /// <returns>Returns <c>true</c> when the webpage had to be changed and the new version has been
        ///     written to <paramref name="outputUri"/>. Returns <c>false</c> when nothing had to be changed,
        ///     when the webpage could not be parsed or when the new webpage could not be written</returns>
        /// <exception cref="WebException">Raised when the webpage from <paramref name="inputUri"/> could not be downloaded</exception>
        public bool ValidateImages(
            ConvertUri inputUri,
            bool resize,
            bool rotate,
            PageSettings pageSettings,
            out ConvertUri outputUri,
            ref List<string> safeUrls,
            List<string> urlBlacklist)
        {
            outputUri = null;

            if (safeUrls == null)
            {
                safeUrls = new List<string>();
            }

            // Only created when a proxy is configured. It is disposed (together with its handler)
            // by the using statement below; a using statement over null is a no-op
            HttpClient proxyClient = null;

            if (_webProxy != null)
            {
                WriteToLog($"Using web proxy '{_webProxy.Address}' to download images");

                var httpClientHandler = new HttpClientHandler
                {
                    Proxy = _webProxy,
                    // NOTE(review): every server certificate is accepted here, which disables TLS
                    // validation for downloads through the proxy - confirm that this is intended
                    ServerCertificateCustomValidationCallback = (message, certificate, arg1, arg2) =>
                    {
                        WriteToLog($"Accepting certificate '{certificate.Subject}', message '{message}'");
                        return true;
                    }
                };

                proxyClient = new HttpClient(httpClientHandler);
            }

            using (proxyClient)
            using (var graphics = Graphics.FromHwnd(IntPtr.Zero))
            using (var webpage = inputUri.IsFile ? OpenFileStream(inputUri.OriginalString) : OpenDownloadStream(inputUri))
            {
                WriteToLog($"DPI settings for image, x: '{graphics.DpiX}' and y: '{graphics.DpiY}'");
                var maxWidth  = (pageSettings.PaperWidth - pageSettings.MarginLeft - pageSettings.MarginRight) * graphics.DpiX;
                var maxHeight = (pageSettings.PaperHeight - pageSettings.MarginTop - pageSettings.MarginBottom) * graphics.DpiY;

                string localDirectory = null;

                if (inputUri.IsFile)
                {
                    localDirectory = Path.GetDirectoryName(inputUri.OriginalString);
                }

                var htmlChanged = false;

                IConfiguration config;

                if (proxyClient != null)
                {
                    config = Configuration.Default
                             .With(new HttpClientRequester(proxyClient))
                             .WithTemporaryCookies()
                             .WithDefaultLoader()
                             .WithCss();
                }
                else
                {
                    config = Configuration.Default.WithCss();
                }

                var context = BrowsingContext.New(config);

                IDocument document;

                try
                {
                    // ReSharper disable AccessToDisposedClosure
                    document = inputUri.Encoding != null
                        ? context.OpenAsync(m =>
                            m.Content(webpage).Header("Content-Type", $"text/html; charset={inputUri.Encoding.WebName}")
                             .Address(inputUri.ToString())).Result
                        : context.OpenAsync(m => m.Content(webpage).Address(inputUri.ToString())).Result;

                    // ReSharper restore AccessToDisposedClosure
                }
                catch (Exception exception)
                {
                    WriteToLog($"Exception occurred in AngleSharp: {ExceptionHelpers.GetInnerException(exception)}");
                    return false;
                }

                WriteToLog("Validating all images if they need to be rotated and if they fit the page");
                var unchangedImages = new List<IHtmlImageElement>();

                // Everything up to and including the last slash, used to recognise images that
                // live next to (or below) the webpage itself
                var absoluteUri = inputUri.AbsoluteUri.Substring(0, inputUri.AbsoluteUri.LastIndexOf('/') + 1);

                // ReSharper disable once PossibleInvalidCastExceptionInForeachLoop
                foreach (var htmlImage in document.Images)
                {
                    var imageChanged = false;

                    if (string.IsNullOrWhiteSpace(htmlImage.Source))
                    {
                        WriteToLog($"HTML image tag '{htmlImage.TagName}' has no image source '{htmlImage.Source}'");
                        continue;
                    }

                    Image image = null;

                    var source        = htmlImage.Source.Contains("?") ? htmlImage.Source.Split('?')[0] : htmlImage.Source;
                    var isSafeUrl     = safeUrls.Contains(source);
                    var isAbsoluteUri = source.StartsWith(absoluteUri, StringComparison.InvariantCultureIgnoreCase);

                    if (!RegularExpression.IsRegExMatch(urlBlacklist, source, out var matchedPattern) ||
                        isAbsoluteUri || isSafeUrl)
                    {
                        if (isAbsoluteUri)
                        {
                            WriteToLog($"The url '{source}' has been allowed because it start with the absolute uri '{absoluteUri}'");
                        }
                        else if (isSafeUrl)
                        {
                            WriteToLog($"The url '{source}' has been allowed because it is on the safe url list");
                        }
                        else
                        {
                            WriteToLog($"The url '{source}' has been allowed because it did not match anything on the url blacklist");
                        }
                    }
                    else
                    {
                        WriteToLog($"The url '{source}' has been blocked by url blacklist pattern '{matchedPattern}'");
                        continue;
                    }

                    var extension = Path.GetExtension(FileManager.RemoveInvalidFileNameChars(source));
                    var fileName  = GetTempFile(extension);

                    try
                    {
                        // The local width and height attributes always go before css width and height
                        var width  = htmlImage.DisplayWidth;
                        var height = htmlImage.DisplayHeight;

                        if (rotate)
                        {
                            image = GetImage(htmlImage.Source, localDirectory);

                            if (image == null)
                            {
                                continue;
                            }

                            if (RotateImageByExifOrientationData(image))
                            {
                                WriteToLog($"Image rotated and saved to location '{fileName}'");
                                image.Save(fileName);
                                htmlImage.DisplayWidth  = image.Width;
                                htmlImage.DisplayHeight = image.Height;
                                htmlImage.SetStyle(string.Empty);
                                var newSrc = new Uri(fileName).ToString();
                                WriteToLog($"Adding url '{newSrc}' to the safe url list");
                                safeUrls.Add(newSrc);
                                htmlImage.Source = newSrc;
                                htmlChanged      = true;
                                imageChanged     = true;
                            }

                            width  = image.Width;
                            height = image.Height;
                        }

                        if (resize)
                        {
                            if (height == 0 && width == 0)
                            {
                                ICssStyleDeclaration style = null;

                                try
                                {
                                    style = context.Current.GetComputedStyle(htmlImage);
                                }
                                catch (Exception exception)
                                {
                                    WriteToLog($"Could not get computed style from html image, exception: '{exception.Message}'");
                                }

                                if (style != null)
                                {
                                    width  = ParseValue(style.GetPropertyValue("width"));
                                    height = ParseValue(style.GetPropertyValue("height"));
                                }
                            }

                            // If we don't know the image size then get it from the image itself
                            if (width <= 0 || height <= 0)
                            {
                                if (image == null)
                                {
                                    image = GetImage(htmlImage.Source, localDirectory);
                                }

                                if (image == null)
                                {
                                    continue;
                                }

                                width  = image.Width;
                                height = image.Height;
                            }

                            if (width > maxWidth || height > maxHeight)
                            {
                                // If we did not load the image already then load it
                                if (image == null)
                                {
                                    image = GetImage(htmlImage.Source, localDirectory);
                                }

                                if (image == null)
                                {
                                    continue;
                                }

                                // Only the display size in the HTML is adjusted here; the image is
                                // not marked as changed, so it is copied by the unchanged images pass below
                                ScaleImage(image, (int)maxWidth, out var newWidth, out var newHeight);
                                WriteToLog($"Image rescaled to width {newWidth} and height {newHeight}");
                                htmlImage.DisplayWidth  = newWidth;
                                htmlImage.DisplayHeight = newHeight;
                                htmlImage.SetStyle(string.Empty);
                                htmlChanged = true;
                            }
                        }
                    }
                    finally
                    {
                        image?.Dispose();
                    }

                    if (!imageChanged)
                    {
                        unchangedImages.Add(htmlImage);
                    }
                }

                if (!htmlChanged)
                {
                    return false;
                }

                // The webpage is written to a new location, so every image that was not saved
                // above gets its own local copy that the new webpage can point to
                foreach (var unchangedImage in unchangedImages)
                {
                    using (var image = GetImage(unchangedImage.Source, localDirectory))
                    {
                        if (image == null)
                        {
                            WriteToLog($"Could not load unchanged image from location '{unchangedImage.Source}'");
                            continue;
                        }

                        var extension = Path.GetExtension(unchangedImage.Source.Contains("?")
                            ? unchangedImage.Source.Split('?')[0]
                            : unchangedImage.Source);
                        var fileName = GetTempFile(extension);

                        WriteToLog($"Unchanged image saved to location '{fileName}'");
                        image.Save(fileName);
                        var newSrc = new Uri(fileName).ToString();
                        safeUrls.Add(newSrc);
                        unchangedImage.Source = newSrc;
                    }
                }

                var outputFile = GetTempFile(".htm");
                outputUri = new ConvertUri(outputFile, inputUri.Encoding);

                try
                {
                    WriteToLog($"Writing changed webpage to '{outputFile}'");

                    using (var fileStream = new FileStream(outputFile, FileMode.CreateNew, FileAccess.Write))
                    using (var textWriter = inputUri.Encoding != null
                        ? new StreamWriter(fileStream, inputUri.Encoding)
                        : new StreamWriter(fileStream))
                    {
                        document.ToHtml(textWriter, new HtmlMarkupFormatter());
                    }

                    WriteToLog("Changed webpage written");

                    // Only mark the new webpage as safe once it really exists
                    safeUrls.Add(outputUri.ToString());

                    return true;
                }
                catch (Exception exception)
                {
                    WriteToLog($"Could not write new html file '{outputFile}', error: {ExceptionHelpers.GetInnerException(exception)}");

                    // Nothing usable has been written, so do not hand out a uri to a broken file
                    outputUri = null;
                    return false;
                }
            }
        }
Exemplo n.º 9
0
        /// <summary>
        /// Sanitizes the HTML by removing all forbidden elements. When the sanitizer changed
        /// anything, the sanitized document is written to a new temporary <c>.htm</c> file.
        /// </summary>
        /// <param name="inputUri">The uri of the webpage</param>
        /// <param name="mediaLoadTimeout">The media load timeout or <c>null</c> when not set.
        ///     This value is not read by this method; it is kept for interface compatibility</param>
        /// <param name="sanitizer"><see cref="HtmlSanitizer"/>, when <c>null</c> a new instance
        ///     with default settings is used</param>
        /// <param name="outputUri">The uri of the sanitized file when this method returns
        ///     <c>true</c>, otherwise <c>null</c></param>
        /// <param name="safeUrls">A list of safe URL's. The sanitized file and, for local documents,
        ///     the non http(s) image sources are added to it. A <c>null</c> list is replaced
        ///     by a new empty list</param>
        /// <returns>
        /// <c>true</c> when the HTML was changed by the sanitizer and the result was written to
        /// <paramref name="outputUri"/>; <c>false</c> when no sanitization was needed, the document
        /// could not be parsed or the sanitized file could not be written
        /// </returns>
        public bool SanitizeHtml(
            ConvertUri inputUri,
            int? mediaLoadTimeout,
            HtmlSanitizer sanitizer,
            out ConvertUri outputUri,
            ref List<string> safeUrls)
        {
            outputUri = null;

            // The list is passed by reference, so a missing list can be created for the caller
            if (safeUrls == null)
            {
                safeUrls = new List<string>();
            }

            using (var webpage = inputUri.IsFile ? OpenFileStream(inputUri.OriginalString) : OpenDownloadStream(inputUri))
            {
                var htmlChanged = false;
                var config      = Configuration.Default.WithCss();
                var context     = BrowsingContext.New(config);

                IDocument document;

                try
                {
                    // ReSharper disable AccessToDisposedClosure
                    // The call is blocking on purpose because this method is synchronous
                    document = inputUri.Encoding != null
                        ? context.OpenAsync(m => m.Content(webpage).Header("Content-Type", $"text/html; charset={inputUri.Encoding.WebName}").Address(inputUri.ToString())).Result
                        : context.OpenAsync(m => m.Content(webpage).Address(inputUri.ToString())).Result;

                    // ReSharper restore AccessToDisposedClosure
                }
                catch (Exception exception)
                {
                    WriteToLog($"Exception occurred in AngleSharp: {ExceptionHelpers.GetInnerException(exception)}");
                    return(false);
                }

                // The document is disposable, release it as soon as we are done with it
                using (document)
                {
                    var htmlDocument = document as IHtmlDocument;

                    if (htmlDocument == null)
                    {
                        WriteToLog("The loaded document is not an HTML document, skipping sanitization");
                        return(false);
                    }

                    WriteToLog("Sanitizing HTML");

                    if (sanitizer == null)
                    {
                        sanitizer = new HtmlSanitizer();
                    }

                    // Named handlers so that they can be detached again. The sanitizer can be supplied
                    // (and reused) by the caller, leaving the handlers attached would stack them up on
                    // every call and keep the captured locals alive.
                    EventHandler<FilterUrlEventArgs> onFilterUrl = (sender, args) =>
                    {
                        if (args.OriginalUrl != args.SanitizedUrl)
                        {
                            WriteToLog($"URL sanitized from '{args.OriginalUrl}' to '{args.SanitizedUrl}'");
                            htmlChanged = true;
                        }
                    };

                    EventHandler<RemovingAtRuleEventArgs> onRemovingAtRule = (sender, args) =>
                    {
                        WriteToLog($"Removing CSS at-rule '{args.Rule.CssText}' from tag '{args.Tag.TagName}'");
                        htmlChanged = true;
                    };

                    EventHandler<RemovingAttributeEventArgs> onRemovingAttribute = (sender, args) =>
                    {
                        WriteToLog($"Removing attribute '{args.Attribute.Name}' from tag '{args.Tag.TagName}', reason '{args.Reason}'");
                        htmlChanged = true;
                    };

                    EventHandler<RemovingCommentEventArgs> onRemovingComment = (sender, args) =>
                    {
                        WriteToLog($"Removing comment '{args.Comment.TextContent}'");
                        htmlChanged = true;
                    };

                    EventHandler<RemovingCssClassEventArgs> onRemovingCssClass = (sender, args) =>
                    {
                        WriteToLog($"Removing CSS class '{args.CssClass}' from tag '{args.Tag.TagName}', reason '{args.Reason}'");
                        htmlChanged = true;
                    };

                    EventHandler<RemovingStyleEventArgs> onRemovingStyle = (sender, args) =>
                    {
                        WriteToLog($"Removing style '{args.Style.Name}' from tag '{args.Tag.TagName}', reason '{args.Reason}'");
                        htmlChanged = true;
                    };

                    EventHandler<RemovingTagEventArgs> onRemovingTag = (sender, args) =>
                    {
                        WriteToLog($"Removing tag '{args.Tag.TagName}', reason '{args.Reason}'");
                        htmlChanged = true;
                    };

                    sanitizer.FilterUrl         += onFilterUrl;
                    sanitizer.RemovingAtRule    += onRemovingAtRule;
                    sanitizer.RemovingAttribute += onRemovingAttribute;
                    sanitizer.RemovingComment   += onRemovingComment;
                    sanitizer.RemovingCssClass  += onRemovingCssClass;
                    sanitizer.RemovingStyle     += onRemovingStyle;
                    sanitizer.RemovingTag       += onRemovingTag;

                    try
                    {
                        sanitizer.SanitizeDom(htmlDocument);
                    }
                    finally
                    {
                        sanitizer.FilterUrl         -= onFilterUrl;
                        sanitizer.RemovingAtRule    -= onRemovingAtRule;
                        sanitizer.RemovingAttribute -= onRemovingAttribute;
                        sanitizer.RemovingComment   -= onRemovingComment;
                        sanitizer.RemovingCssClass  -= onRemovingCssClass;
                        sanitizer.RemovingStyle     -= onRemovingStyle;
                        sanitizer.RemovingTag       -= onRemovingTag;
                    }

                    if (!htmlChanged)
                    {
                        WriteToLog("HTML did not need any sanitization");
                        return(false);
                    }

                    WriteToLog("HTML sanitized");

                    var sanitizedOutputFile = GetTempFile(".htm");
                    outputUri = new ConvertUri(sanitizedOutputFile, inputUri.Encoding);
                    var url = outputUri.ToString();
                    WriteToLog($"Adding url '{url}' to the safe url list");
                    safeUrls.Add(url);

                    try
                    {
                        if (document.BaseUrl.Scheme.StartsWith("file", StringComparison.Ordinal))
                        {
                            var images = document.DocumentElement.Descendents().OfType<IHtmlImageElement>();

                            foreach (var image in images)
                            {
                                var src = image.Source;

                                // Images without a source have nothing to whitelist or rewrite
                                if (string.IsNullOrWhiteSpace(src))
                                {
                                    continue;
                                }

                                if (src.StartsWith("http://", StringComparison.OrdinalIgnoreCase) ||
                                    src.StartsWith("https://", StringComparison.OrdinalIgnoreCase))
                                {
                                    continue;
                                }

                                WriteToLog($"Updating image source to '{src}' and adding it to the safe url list");
                                safeUrls.Add(src);

                                // Writes the value returned by the Source getter back into the element
                                // NOTE(review): presumably the getter resolves a relative path against
                                // the document base url, so this stores an absolute url - confirm
                                image.Source = src;
                            }
                        }

                        WriteToLog($"Writing sanitized webpage to '{sanitizedOutputFile}'");

                        using (var fileStream = new FileStream(sanitizedOutputFile, FileMode.CreateNew, FileAccess.Write))
                        using (var textWriter = inputUri.Encoding != null
                            ? new StreamWriter(fileStream, inputUri.Encoding)
                            : new StreamWriter(fileStream))
                        {
                            document.ToHtml(textWriter, new HtmlMarkupFormatter());
                        }

                        WriteToLog("Sanitized webpage written");
                        return(true);
                    }
                    catch (Exception exception)
                    {
                        WriteToLog($"Could not write new html file '{sanitizedOutputFile}', error: {ExceptionHelpers.GetInnerException(exception)}");

                        // Do not hand out an uri to a file that was not (completely) written
                        outputUri = null;
                        return(false);
                    }
                }
            }
        }