/// <summary>
///     Opens the file at the given <paramref name="fileName"/> for reading and returns it as a stream
/// </summary>
/// <param name="fileName">The path of the file to open</param>
/// <returns>
///     A readable <see cref="Stream"/>, or <c>null</c> when opening the file failed
///     (the failure is written to the log instead of being thrown)
/// </returns>
private Stream OpenFileStream(string fileName)
{
    Stream opened;

    try
    {
        WriteToLog($"Opening stream to file '{fileName}'");
        opened = File.OpenRead(fileName);
    }
    catch (Exception failure)
    {
        // Best effort: callers are expected to deal with a null stream
        WriteToLog($"Opening stream failed with exception: {ExceptionHelpers.GetInnerException(failure)}");
        opened = null;
    }

    return opened;
}
/// <summary>
///     Downloads from the given <paramref name="sourceUri"/> and returns the content as a stream
/// </summary>
/// <param name="sourceUri">The uri to download from</param>
/// <returns>
///     The opened download <see cref="Stream"/>, or <c>null</c> when the download failed
///     (the failure is written to the log instead of being thrown)
/// </returns>
private Stream DownloadStream(Uri sourceUri)
{
    try
    {
        WriteToLog($"Downloading from uri '{sourceUri}'");

        // The asynchronous read is guarded by the configured timeout and then awaited synchronously
        var pendingRead = WebClient.OpenReadTaskAsync(sourceUri).Timeout(_timeout);
        var downloaded = pendingRead.GetAwaiter().GetResult();

        WriteToLog("Downloaded");
        return downloaded;
    }
    catch (Exception failure)
    {
        var reason = ExceptionHelpers.GetInnerException(failure);
        WriteToLog("Downloading failed with exception: " + reason);
        return null;
    }
}
/// <summary>
///     Downloads from the given <paramref name="sourceUri"/> and returns the content as a string
/// </summary>
/// <param name="sourceUri">The uri to download from</param>
/// <returns>
///     The downloaded text, or <c>null</c> when the download failed
///     (the failure is written to the log instead of being thrown)
/// </returns>
private string DownloadString(Uri sourceUri)
{
    string downloaded;

    try
    {
        WriteToLog($"Downloading from uri '{sourceUri}'");
        downloaded = WebClient.DownloadString(sourceUri);
        WriteToLog("Downloaded");
    }
    catch (Exception failure)
    {
        var reason = ExceptionHelpers.GetInnerException(failure);
        WriteToLog("Downloading failed with exception: " + reason);
        downloaded = null;
    }

    return downloaded;
}
/// <summary>
///     Returns the <see cref="Image"/> for the given <paramref name="imageUri"/>
/// </summary>
/// <remarks>
///     Loopback and file uris are first looked up on disk: the original uri string is tried as a path
///     and, when that does not exist, the uri path is resolved relative to
///     <paramref name="localDirectory"/>. When no file is found, <c>http</c> and <c>https</c> uris
///     are downloaded. The image data is always buffered in memory, so no file handle or network
///     stream stays open after this method returns.
/// </remarks>
/// <param name="imageUri">The location of the image</param>
/// <param name="localDirectory">The directory used to resolve images that are not found at their
/// original location</param>
/// <returns>The loaded <see cref="Image"/>, or <c>null</c> when the image could not be loaded
/// (failures are written to the log instead of being thrown)</returns>
private Image GetImage(Uri imageUri, string localDirectory)
{
    WriteToLog($"Getting image from uri '{imageUri}'");

    try
    {
        if (imageUri.IsLoopback || imageUri.IsFile)
        {
            var fileName = imageUri.OriginalString;

            if (!File.Exists(fileName))
            {
                fileName = Path.Combine(localDirectory, imageUri.AbsolutePath.Trim('/'));
            }

            if (File.Exists(fileName))
            {
                // GDI+ needs the source stream for as long as the image lives. Reading the file into
                // a MemoryStream (which is intentionally not disposed here) satisfies that without
                // keeping the file itself open and locked.
                var fileData = new MemoryStream(File.ReadAllBytes(fileName));
                return Image.FromStream(fileData, true, false);
            }
        }

        switch (imageUri.Scheme)
        {
            case "https":
            case "http":
                using (var webStream = WebClient.OpenRead(imageUri))
                {
                    if (webStream != null)
                    {
                        // Copy the download so the image does not depend on the network stream
                        // that is closed when this using block ends
                        var webData = new MemoryStream();
                        webStream.CopyTo(webData);
                        webData.Position = 0;
                        return Image.FromStream(webData, true, false);
                    }
                }

                break;

            default:
                WriteToLog($"Unsupported scheme {imageUri.Scheme} to get image");
                return null;
        }
    }
    catch (Exception exception)
    {
        WriteToLog("Getting image failed with exception: " + ExceptionHelpers.GetInnerException(exception));
    }

    return null;
}
/// <summary>
///     Opens a download stream to the given <paramref name="sourceUri"/>
/// </summary>
/// <param name="sourceUri">The uri to download from</param>
/// <param name="checkTimeout">When set to <c>true</c> (and a stopwatch is running) the request is
/// skipped when no time is left, otherwise the time that is left is used as the request timeout</param>
/// <returns>The response <see cref="Stream"/>, or <c>null</c> when the request was skipped or failed
/// (failures are written to the log instead of being thrown)</returns>
private Stream OpenDownloadStream(Uri sourceUri, bool checkTimeout = false)
{
    try
    {
        var request = WebRequest.Create(sourceUri);

        // Read the remaining time only once so that the value that is checked, applied and logged
        // is guaranteed to be the same
        var timeLeft = TimeLeft;
        var applyTimeout = _stopwatch != null && checkTimeout;

        if (applyTimeout)
        {
            if (timeLeft == 0)
            {
                WriteToLog($"Image load has timed out, skipping opening stream to url '{sourceUri}'");
                return null;
            }

            request.Timeout = timeLeft;
        }

        if (_webProxy != null)
        {
            request.Proxy = _webProxy;
        }

        if (_useCache)
        {
            // Accept cached responses that are at most one day old
            request.CachePolicy = new HttpRequestCachePolicy(HttpCacheAgeControl.MaxAge, TimeSpan.FromDays(1));
        }

        // Only mention a timeout when one has really been set on the request
        WriteToLog($"Opening stream to url '{sourceUri}'{(applyTimeout ? $" with a timeout of {timeLeft} milliseconds" : string.Empty)}");

        var response = (HttpWebResponse)request.GetResponse();
        WriteToLog($"Opened {(response.IsFromCache ? "cached " : string.Empty)}stream to url '{sourceUri}'");

        return response.GetResponseStream();
    }
    catch (Exception exception)
    {
        WriteToLog($"Opening stream failed with exception: {ExceptionHelpers.GetInnerException(exception)}");
        return null;
    }
}
/// <summary>
///     Decodes the image that is embedded as base64 in the given data uri
/// </summary>
/// <param name="data">A string in the form <c>data:image/{type},{base64 data}</c></param>
/// <returns>The decoded <see cref="Image"/>, or <c>null</c> when the data could not be decoded
/// (the failure is written to the log instead of being thrown)</returns>
private Image GetImageFromBase64(string data)
{
    WriteToLog("Decoding image from base64 string");

    try
    {
        var base64Data = Regex.Match(data, @"data:image/(?<type>.+?),(?<data>.+)").Groups["data"].Value;
        var binaryData = Convert.FromBase64String(base64Data);

        // GDI+ needs the source stream for as long as the image lives, so the stream must not be
        // disposed here. A MemoryStream holds no unmanaged resources and is released together
        // with the image.
        var stream = new MemoryStream(binaryData);
        var image = Image.FromStream(stream);

        WriteToLog("Image decoded");
        return image;
    }
    catch (Exception exception)
    {
        WriteToLog($"Error decoding image: {ExceptionHelpers.GetInnerException(exception)}");
        return null;
    }
}
/// <summary>
///     Returns the <see cref="Image"/> for the given <paramref name="imageSource"/>
/// </summary>
/// <remarks>
///     Supports base64 data uris (<c>data:image/...</c>), local files and <c>http</c>/<c>https</c>
///     uris. Loopback and file uris are first looked up on disk: the original uri string is tried
///     as a path and, when that does not exist, the uri path is resolved relative to
///     <paramref name="localDirectory"/>. The image data is always buffered in memory, so no file
///     handle or network stream stays open after this method returns.
/// </remarks>
/// <param name="imageSource">The value of the <c>src</c> attribute of the image</param>
/// <param name="localDirectory">The directory used to resolve images that are not found at their
/// original location</param>
/// <returns>The loaded <see cref="Image"/>, or <c>null</c> when the image could not be loaded
/// (failures are written to the log instead of being thrown)</returns>
private Image GetImage(string imageSource, string localDirectory)
{
    if (imageSource.StartsWith("data:", StringComparison.InvariantCultureIgnoreCase))
    {
        WriteToLog("Decoding image from base64 string");

        try
        {
            var base64Data = Regex.Match(imageSource, @"data:image/(?<type>.+?),(?<data>.+)").Groups["data"].Value;
            var binaryData = Convert.FromBase64String(base64Data);

            // GDI+ needs the source stream for as long as the image lives, so the stream must
            // not be disposed here
            var stream = new MemoryStream(binaryData);
            var image = Image.FromStream(stream);

            WriteToLog("Image decoded");
            return image;
        }
        catch (Exception exception)
        {
            WriteToLog($"Error decoding image: {ExceptionHelpers.GetInnerException(exception)}");
            return null;
        }
    }

    try
    {
        WriteToLog($"Getting image from uri '{imageSource}'");
        var imageUri = new Uri(imageSource);

        if (imageUri.IsLoopback || imageUri.IsFile)
        {
            var fileName = imageUri.OriginalString;

            if (!File.Exists(fileName))
            {
                fileName = Path.Combine(localDirectory, imageUri.AbsolutePath.Trim('/'));
            }

            if (File.Exists(fileName))
            {
                // Buffer the file in memory so that the file itself is not kept open and locked
                // for the lifetime of the image
                var fileData = new MemoryStream(File.ReadAllBytes(fileName));
                return Image.FromStream(fileData, true, false);
            }
        }

        switch (imageUri.Scheme)
        {
            case "https":
            case "http":
                using (var webStream = WebClient.OpenReadTaskAsync(imageUri).Timeout(_timeout).GetAwaiter().GetResult())
                {
                    if (webStream != null)
                    {
                        // Copy the download so the image does not depend on the network stream
                        // that is closed when this using block ends
                        var webData = new MemoryStream();
                        webStream.CopyTo(webData);
                        webData.Position = 0;
                        return Image.FromStream(webData, true, false);
                    }
                }

                break;

            default:
                WriteToLog($"Unsupported scheme {imageUri.Scheme} to get image");
                return null;
        }
    }
    catch (Exception exception)
    {
        WriteToLog("Getting image failed with exception: " + ExceptionHelpers.GetInnerException(exception));
    }

    return null;
}
/// <summary>
///     Optionally sanitizes the HTML and validates all images: checks if they are rotated correctly
///     (when <paramref name="rotate"/> is set to <c>true</c>) and if they fit on the given
///     <paramref name="pageSettings"/> (when <paramref name="resize"/> is set to <c>true</c>).
///     When anything had to be changed, a local copy of the <paramref name="inputUri"/> page is
///     written to a temporary file, together with local copies of its images.
/// </summary>
/// <param name="inputUri">The uri of the webpage</param>
/// <param name="resize">When set to <c>true</c> then an image is resized when needed</param>
/// <param name="rotate">When set to <c>true</c> then the EXIF information of an
/// image is read and when needed the image is automatically rotated</param>
/// <param name="sanitizeHtml">When set to <c>true</c> then the HTML will get sanitized</param>
/// <param name="pageSettings"><see cref="PageSettings"/></param>
/// <param name="outputUri">The uri of the changed copy of the webpage when this method returns
/// <c>false</c>. <c>null</c> when nothing was changed or the page could not be parsed. Note that it
/// is also set (while <c>true</c> is returned) when writing the changed copy failed</param>
/// <returns>Returns <c>false</c> when a changed copy of the webpage has been written, otherwise
/// <c>true</c> (nothing changed, the page could not be parsed or the copy could not be written)</returns>
/// <exception cref="WebException">Raised when the webpage from <paramref name="inputUri"/> could not be downloaded</exception>
public bool Validate(ConvertUri inputUri, bool resize, bool rotate, bool sanitizeHtml, PageSettings pageSettings, out ConvertUri outputUri)
{
    outputUri = null;

    // Only local pages have a directory that can be used to resolve relative image locations
    string localDirectory = null;

    if (inputUri.IsFile)
    {
        localDirectory = Path.GetDirectoryName(inputUri.OriginalString);
    }

    using (var webpage = inputUri.IsFile ? File.OpenRead(inputUri.OriginalString) : DownloadStream(inputUri))
    {
        // The printable area (paper size minus the margins) multiplied by 96
        // NOTE(review): presumably the page settings are in inches and 96 is the assumed DPI — confirm
        var maxWidth = (pageSettings.PaperWidth - pageSettings.MarginLeft - pageSettings.MarginRight) * 96.0;
        var maxHeight = (pageSettings.PaperHeight - pageSettings.MarginTop - pageSettings.MarginBottom) * 96.0;

        var htmlChanged = false;
        var config = Configuration.Default.WithCss();
        var context = BrowsingContext.New(config);

        IDocument document;

        try
        {
            // ReSharper disable AccessToDisposedClosure
            document = inputUri.Encoding != null
                ? context.OpenAsync(m => m.Content(webpage).Header("Content-Type", $"text/html; charset={inputUri.Encoding.WebName}")) .Result
                : context.OpenAsync(m => m.Content(webpage)).Result;
            // ReSharper restore AccessToDisposedClosure
        }
        catch (Exception exception)
        {
            // When the page cannot be parsed it is left as it is
            WriteToLog($"Exception occured in AngleSharp: {ExceptionHelpers.GetInnerException(exception)}");
            return(true);
        }

        if (sanitizeHtml)
        {
            WriteToLog("Sanitizing HTML");
            new HtmlSanitizer().DoSanitize(document as IHtmlDocument, document.DocumentElement);
            // The page is always treated as changed after sanitizing
            htmlChanged = true;
            WriteToLog("HTML sanitized");
        }

        WriteToLog("Validating all images if they need to be rotated and if they fit the page");

        // Images whose source has not been replaced with a temporary file in the loop below
        var unchangedImages = new List <IHtmlImageElement>();

        // ReSharper disable once PossibleInvalidCastExceptionInForeachLoop
        foreach (var htmlImage in document.Images)
        {
            var imageChanged = false;

            if (string.IsNullOrWhiteSpace(htmlImage.Source))
            {
                WriteToLog($"HTML image tag '{htmlImage.TagName}' has no image source '{htmlImage.Source}'");
                continue;
            }

            Image image = null;

            // Strip the query string before the extension for the temporary file is determined
            var source = htmlImage.Source.Contains("?")
                ? htmlImage.Source.Split('?')[0]
                : htmlImage.Source;

            var extension = Path.GetExtension(FileManager.RemoveInvalidFileNameChars(source));
            // NOTE(review): a temp file name is requested for every image, also when nothing is saved to it
            var fileName = GetTempFile(extension);

            try
            {
                // The local width and height attributes always go before css width and height
                var width = htmlImage.DisplayWidth;
                var height = htmlImage.DisplayHeight;

                if (rotate)
                {
                    image = GetImage(htmlImage.Source, localDirectory);

                    // Images that cannot be loaded are skipped (the finally block still runs)
                    if (image == null)
                    {
                        continue;
                    }

                    if (RotateImageByExifOrientationData(image))
                    {
                        htmlImage.DisplayWidth = image.Width;
                        htmlImage.DisplayHeight = image.Height;
                        WriteToLog($"Image rotated and saved to location '{fileName}'");
                        image.Save(fileName);
                        htmlImage.DisplayWidth = image.Width;
                        htmlImage.DisplayHeight = image.Height;
                        // Clear the inline style and point the tag to the rotated local copy
                        htmlImage.SetStyle(string.Empty);
                        htmlImage.Source = new Uri(fileName).ToString();
                        htmlChanged = true;
                        imageChanged = true;
                    }

                    width = image.Width;
                    height = image.Height;
                }

                if (resize)
                {
                    // No size attributes on the tag, fall back to the computed css size
                    if (height == 0 && width == 0)
                    {
                        var style = context.Current.GetComputedStyle(htmlImage);

                        if (style != null)
                        {
                            width = ParseValue(style.GetPropertyValue("width"));
                            height = ParseValue(style.GetPropertyValue("height"));
                        }
                    }

                    // If we don't know the image size then get it from the image itself
                    if (width <= 0 || height <= 0)
                    {
                        if (image == null)
                        {
                            image = GetImage(htmlImage.Source, localDirectory);
                        }

                        if (image == null)
                        {
                            continue;
                        }

                        width = image.Width;
                        height = image.Height;
                    }

                    if (width > maxWidth || height > maxHeight)
                    {
                        // If we did not load the image already then load it
                        if (image == null)
                        {
                            image = GetImage(htmlImage.Source, localDirectory);
                        }

                        if (image == null)
                        {
                            continue;
                        }

                        // Only the display size on the tag is changed here, the image source is not
                        // replaced (imageChanged stays false so the image is copied locally below)
                        ScaleImage(image, (int)maxWidth, out var newWidth, out var newHeight);
                        WriteToLog($"Image rescaled to width {newWidth} and height {newHeight}");
                        htmlImage.DisplayWidth = newWidth;
                        htmlImage.DisplayHeight = newHeight;
                        htmlImage.SetStyle(string.Empty);
                        htmlChanged = true;
                    }
                }
            }
            finally
            {
                image?.Dispose();
            }

            if (!imageChanged)
            {
                unchangedImages.Add(htmlImage);
            }
        }

        if (!htmlChanged)
        {
            return(true);
        }

        // The changed page is written to a temporary file, so every image that still points to its
        // original location is saved to a temporary file as well
        foreach (var unchangedImage in unchangedImages)
        {
            using (var image = GetImage(unchangedImage.Source, localDirectory))
            {
                if (image == null)
                {
                    WriteToLog($"Could not load unchanged image from location '{unchangedImage.Source}'");
                    continue;
                }

                var extension = Path.GetExtension(unchangedImage.Source.Contains("?") ? unchangedImage.Source.Split('?')[0] : unchangedImage.Source);
                var fileName = GetTempFile(extension);
                WriteToLog($"Unchanged image saved to location '{fileName}'");
                image.Save(fileName);
                unchangedImage.Source = new Uri(fileName).ToString();
            }
        }

        var outputFile = GetTempFile(".htm");
        // Assigned before writing, so it stays set when the write below fails
        outputUri = new ConvertUri(outputFile, inputUri.Encoding);

        try
        {
            // FileMode.CreateNew throws when the output file already exists
            using (var fileStream = new FileStream(outputFile, FileMode.CreateNew, FileAccess.Write))
            {
                if (inputUri.Encoding != null)
                {
                    using (var textWriter = new StreamWriter(fileStream, inputUri.Encoding))
                        document.ToHtml(textWriter, new HtmlMarkupFormatter());
                }
                else
                {
                    using (var textWriter = new StreamWriter(fileStream))
                        document.ToHtml(textWriter, new HtmlMarkupFormatter());
                }
            }

            return(false);
        }
        catch (Exception exception)
        {
            WriteToLog($"Could not generate new html file '{outputFile}', error: {ExceptionHelpers.GetInnerException(exception)}");
            return(true);
        }
    }
}
/// <summary>
///     Validates all images: checks if they are rotated correctly (when <paramref name="rotate"/>
///     is set to <c>true</c>) and if they fit on the given <paramref name="pageSettings"/> (when
///     <paramref name="resize"/> is set to <c>true</c>).
///     When an image had to be rotated or resized, a local copy of the <paramref name="inputUri"/>
///     page is written to a temporary file.
/// </summary>
/// <param name="inputUri">The uri of the webpage</param>
/// <param name="resize">When set to <c>true</c> then an image is resized when needed</param>
/// <param name="rotate">When set to <c>true</c> then the EXIF information of an
/// image is read and when needed the image is automatically rotated</param>
/// <param name="pageSettings"><see cref="PageSettings"/></param>
/// <param name="outputUri">The uri of the changed copy of the webpage when this method returns
/// <c>false</c>. <c>null</c> when nothing was changed. Note that it is also set (while <c>true</c>
/// is returned) when writing the changed copy failed</param>
/// <returns>Returns <c>false</c> when a changed copy of the webpage has been written, otherwise
/// <c>true</c> (nothing changed or the copy could not be written)</returns>
/// <exception cref="WebException">Raised when the webpage from <paramref name="inputUri"/> could not be downloaded</exception>
public bool ValidateImages(ConvertUri inputUri, bool resize, bool rotate, PageSettings pageSettings, out ConvertUri outputUri)
{
    WriteToLog("Validating all images if they need to be rotated and if they fit the page");

    outputUri = null;

    // Only local pages have a directory that can be used to resolve relative image locations
    string localDirectory = null;

    if (inputUri.IsFile)
    {
        localDirectory = Path.GetDirectoryName(inputUri.OriginalString);
    }

    // Local pages are read from disk (with the given encoding when there is one), other pages are downloaded
    var webpage = inputUri.IsFile
        ? inputUri.Encoding != null
            ? File.ReadAllText(inputUri.OriginalString, inputUri.Encoding)
            : File.ReadAllText(inputUri.OriginalString)
        : DownloadString(inputUri);

    // The full paper size multiplied by 96, the page margins are not taken into account here
    // NOTE(review): presumably the page settings are in inches and 96 is the assumed DPI — confirm
    var maxWidth = pageSettings.PaperWidth * 96.0;
    var maxHeight = pageSettings.PaperHeight * 96.0;

    var changed = false;
    var config = Configuration.Default.WithCss();
    var context = BrowsingContext.New(config);

    var document = inputUri.Encoding != null
        ? context.OpenAsync(m => m.Content(webpage).Header("Content-Type", $"text/html; charset={inputUri.Encoding.WebName}")).Result
        : context.OpenAsync(m => m.Content(webpage)).Result;

    // ReSharper disable once PossibleInvalidCastExceptionInForeachLoop
    foreach (var htmlImage in document.Images)
    {
        if (string.IsNullOrWhiteSpace(htmlImage.Source))
        {
            WriteToLog($"HTML image tag '{htmlImage.TagName}' has no image source '{htmlImage.Source}'");
            continue;
        }

        Image image = null;

        try
        {
            // The local width and height attributes always go before css width and height
            var width = htmlImage.DisplayWidth;
            var height = htmlImage.DisplayHeight;

            if (rotate)
            {
                image = GetImage(new Uri(htmlImage.Source), localDirectory);

                // Images that cannot be loaded are skipped (the finally block still runs)
                if (image == null)
                {
                    continue;
                }

                // NOTE(review): only the display size on the tag is updated here, the rotated image
                // itself is not saved in this branch — confirm that this is intended
                if (RotateImageByExifOrientationData(image))
                {
                    htmlImage.DisplayWidth = image.Width;
                    htmlImage.DisplayHeight = image.Height;
                    changed = true;
                }

                width = image.Width;
                height = image.Height;
            }

            if (!resize)
            {
                continue;
            }

            // No size attributes on the tag, fall back to the computed css size
            if (height == 0 && width == 0)
            {
                var style = context.Current.GetComputedStyle(htmlImage);

                if (style != null)
                {
                    width = ParseValue(style.Width);
                    height = ParseValue(style.Height);
                }
            }

            // If we don't know the image size then get it from the image itself
            if (width <= 0 || height <= 0)
            {
                if (image == null)
                {
                    image = GetImage(new Uri(htmlImage.Source), localDirectory);
                }

                if (image == null)
                {
                    continue;
                }

                width = image.Width;
                height = image.Height;
            }

            if (width > maxWidth || height > maxHeight)
            {
                // Strip the query string before the extension for the temporary file is determined
                var extension = Path.GetExtension(htmlImage.Source.Contains("?")
                    ? htmlImage.Source.Split('?')[0]
                    : htmlImage.Source);

                var fileName = GetTempFile(extension);

                // If we did not load the image already then load it
                if (image == null)
                {
                    image = GetImage(new Uri(htmlImage.Source), localDirectory);
                }

                if (image == null)
                {
                    continue;
                }

                // NOTE(review): the result of ScaleImage replaces the loaded image, so only the
                // result is disposed in the finally block — confirm who disposes the original
                image = ScaleImage(image, (int)maxWidth);
                WriteToLog($"Image resized to width {image.Width} and height {image.Height}");
                image.Save(fileName);
                htmlImage.DisplayWidth = image.Width;
                htmlImage.DisplayHeight = image.Height;
                // Point the tag to the resized local copy
                htmlImage.Source = new Uri(fileName).ToString();
                changed = true;
            }
        }
        finally
        {
            image?.Dispose();
        }
    }

    if (!changed)
    {
        return(true);
    }

    var outputFile = GetTempFile(".htm");
    // Assigned before writing, so it stays set when the write below fails
    outputUri = new ConvertUri(outputFile, inputUri.Encoding);

    try
    {
        // FileMode.CreateNew throws when the output file already exists
        using (var fileStream = new FileStream(outputFile, FileMode.CreateNew, FileAccess.Write))
        {
            if (inputUri.Encoding != null)
            {
                using (var textWriter = new StreamWriter(fileStream, inputUri.Encoding))
                    document.ToHtml(textWriter, new AutoSelectedMarkupFormatter());
            }
            else
            {
                using (var textWriter = new StreamWriter(fileStream))
                    document.ToHtml(textWriter, new AutoSelectedMarkupFormatter());
            }
        }

        return(false);
    }
    catch (Exception exception)
    {
        WriteToLog($"Could not generate new html file '{outputFile}', error: {ExceptionHelpers.GetInnerException(exception)}");
        return(true);
    }
}
/// <summary>
///     Writes a copy of the webpage in which extra style and script elements are injected that
///     make the <c>html</c> and <c>body</c> elements fit their content and that set the
///     <c>@page</c> size to the computed size of the page when it is loaded
/// </summary>
/// <param name="inputUri">The uri of the webpage</param>
/// <param name="outputUri">The uri of the changed copy of the webpage when this method returns
/// <c>true</c>. <c>null</c> when the page could not be parsed or changed. Note that it is also set
/// (while <c>false</c> is returned) when writing the changed copy failed</param>
/// <returns>Returns <c>true</c> when the changed copy of the webpage has been written, otherwise
/// <c>false</c> (failures are written to the log)</returns>
public bool FitPageToContent(ConvertUri inputUri, out ConvertUri outputUri)
{
    outputUri = null;

    using (var webpage = inputUri.IsFile ? File.OpenRead(inputUri.OriginalString) : DownloadStream(inputUri))
    {
        var config = Configuration.Default.WithCss();
        var context = BrowsingContext.New(config);

        IDocument document;

        try
        {
            // ReSharper disable AccessToDisposedClosure
            document = inputUri.Encoding != null
                ? context.OpenAsync(m => m.Content(webpage).Header("Content-Type", $"text/html; charset={inputUri.Encoding.WebName}").Address(inputUri.ToString())).Result
                : context.OpenAsync(m => m.Content(webpage).Address(inputUri.ToString())).Result;
            // ReSharper restore AccessToDisposedClosure

            // Style that lets the html and body elements shrink to their content
            var styleElement = new HtmlElement(document as Document, "style")
            {
                InnerHtml = "html, body " + Environment.NewLine +
                            "{" + Environment.NewLine +
                            " width: fit-content;" + Environment.NewLine +
                            " height: fit-content;" + Environment.NewLine +
                            " margin: 0px;" + Environment.NewLine +
                            " padding: 0px;" + Environment.NewLine +
                            "}" + Environment.NewLine
            };

            document.Head.AppendElement(styleElement);

            // Initial @page rule, its content is replaced by the script below once the page is loaded
            var pageStyleElement = new HtmlElement(document as Document, "style")
            {
                Id = "pagestyle",
                InnerHtml = "@page " + Environment.NewLine +
                            "{ " + Environment.NewLine +
                            " size: 595px 842px ; " + Environment.NewLine +
                            " margin: 0px " + Environment.NewLine +
                            "}" + Environment.NewLine
            };

            document.Head.AppendElement(pageStyleElement);

            // Script that, on window load, reads the computed size of the html element and writes
            // an @page rule with that width and the height plus 10 pixels into the 'pagestyle' element
            var pageElement = new HtmlElement(document as Document, "script")
            {
                InnerHtml = "window.onload = function () {" + Environment.NewLine +
                            "" + Environment.NewLine +
                            " var page = document.getElementsByTagName('html')[0];" + Environment.NewLine +
                            " var pageInfo = window.getComputedStyle(page);" + Environment.NewLine +
                            "" + Environment.NewLine +
                            " var height = parseInt(pageInfo.height) + 10 + 'px';" + Environment.NewLine +
                            "" + Environment.NewLine +
                            " var pageCss = '@page { size: ' + pageInfo.width + ' ' + height + '; margin: 0; }'" + Environment.NewLine +
                            " document.getElementById('pagestyle').innerHTML = pageCss;" + Environment.NewLine +
                            "}" + Environment.NewLine
            };

            document.Body.AppendElement(pageElement);
        }
        catch (Exception exception)
        {
            // Covers both parsing the page and injecting the elements
            WriteToLog($"Exception occured in AngleSharp: {ExceptionHelpers.GetInnerException(exception)}");
            return(false);
        }

        var outputFile = GetTempFile(".htm");
        // Assigned before writing, so it stays set when the write below fails
        outputUri = new ConvertUri(outputFile, inputUri.Encoding);

        try
        {
            WriteToLog($"Writing changed webpage to '{outputFile}'");

            // FileMode.CreateNew throws when the output file already exists
            using (var fileStream = new FileStream(outputFile, FileMode.CreateNew, FileAccess.Write))
            {
                if (inputUri.Encoding != null)
                {
                    using (var textWriter = new StreamWriter(fileStream, inputUri.Encoding))
                        document.ToHtml(textWriter, new HtmlMarkupFormatter());
                }
                else
                {
                    using (var textWriter = new StreamWriter(fileStream))
                        document.ToHtml(textWriter, new HtmlMarkupFormatter());
                }
            }

            WriteToLog("Changed webpage written");
            return(true);
        }
        catch (Exception exception)
        {
            WriteToLog($"Could not write new html file '{outputFile}', error: {ExceptionHelpers.GetInnerException(exception)}");
            return(false);
        }
    }
}
/// <summary>
///     Sanitizes the HTML by removing all forbidden elements
/// </summary>
/// <remarks>
///     Everything the sanitizer changes or removes is written to the log. The logging handlers are
///     only attached to the <paramref name="sanitizer"/> for the duration of this call, so a
///     sanitizer instance can safely be reused for multiple calls.
/// </remarks>
/// <param name="inputUri">The uri of the webpage</param>
/// <param name="sanitizer">The <see cref="HtmlSanitizer"/> to use, when <c>null</c> a sanitizer
/// with its default settings is created</param>
/// <param name="outputUri">The uri of the sanitized copy of the webpage when this method returns
/// <c>true</c>. <c>null</c> when the page could not be parsed or nothing had to be sanitized. Note
/// that it is also set (while <c>false</c> is returned) when writing the sanitized copy failed</param>
/// <returns>Returns <c>true</c> when the HTML has been changed by the sanitizer and the changed
/// copy has been written, otherwise <c>false</c></returns>
public bool SanitizeHtml(ConvertUri inputUri, HtmlSanitizer sanitizer, out ConvertUri outputUri)
{
    outputUri = null;

    using (var webpage = inputUri.IsFile ? File.OpenRead(inputUri.OriginalString) : DownloadStream(inputUri))
    {
        var htmlChanged = false;
        var config = Configuration.Default.WithCss();
        var context = BrowsingContext.New(config);

        IDocument document;

        try
        {
            // ReSharper disable AccessToDisposedClosure
            document = inputUri.Encoding != null
                ? context.OpenAsync(m => m.Content(webpage).Header("Content-Type", $"text/html; charset={inputUri.Encoding.WebName}").Address(inputUri.ToString())).Result
                : context.OpenAsync(m => m.Content(webpage).Address(inputUri.ToString())).Result;
            // ReSharper restore AccessToDisposedClosure
        }
        catch (Exception exception)
        {
            WriteToLog($"Exception occured in AngleSharp: {ExceptionHelpers.GetInnerException(exception)}");
            return false;
        }

        WriteToLog("Sanitizing HTML");

        if (sanitizer == null)
        {
            sanitizer = new HtmlSanitizer();
        }

        // The handlers are kept in variables so that they can be detached again. The sanitizer may
        // be supplied (and reused) by the caller; handlers that stay attached would pile up with
        // every call, log everything multiple times and keep this call's state alive.
        EventHandler<FilterUrlEventArgs> onFilterUrl = (sender, args) =>
        {
            if (args.OriginalUrl != args.SanitizedUrl)
            {
                WriteToLog($"URL sanitized from '{args.OriginalUrl}' to '{args.SanitizedUrl}'");
                htmlChanged = true;
            }
        };

        EventHandler<RemovingAtRuleEventArgs> onRemovingAtRule = (sender, args) =>
        {
            WriteToLog($"Removing CSS at-rule '{args.Rule.CssText}' from tag '{args.Tag.TagName}'");
            htmlChanged = true;
        };

        EventHandler<RemovingAttributeEventArgs> onRemovingAttribute = (sender, args) =>
        {
            WriteToLog($"Removing attribute '{args.Attribute.Name}' from tag '{args.Tag.TagName}', reason '{args.Reason}'");
            htmlChanged = true;
        };

        EventHandler<RemovingCommentEventArgs> onRemovingComment = (sender, args) =>
        {
            WriteToLog($"Removing comment '{args.Comment.TextContent}'");
            htmlChanged = true;
        };

        EventHandler<RemovingCssClassEventArgs> onRemovingCssClass = (sender, args) =>
        {
            WriteToLog($"Removing CSS class '{args.CssClass}' from tag '{args.Tag.TagName}', reason '{args.Reason}'");
            htmlChanged = true;
        };

        EventHandler<RemovingStyleEventArgs> onRemovingStyle = (sender, args) =>
        {
            WriteToLog($"Removing style '{args.Style.Name}' from tag '{args.Tag.TagName}', reason '{args.Reason}'");
            htmlChanged = true;
        };

        EventHandler<RemovingTagEventArgs> onRemovingTag = (sender, args) =>
        {
            WriteToLog($"Removing tag '{args.Tag.TagName}', reason '{args.Reason}'");
            htmlChanged = true;
        };

        sanitizer.FilterUrl += onFilterUrl;
        sanitizer.RemovingAtRule += onRemovingAtRule;
        sanitizer.RemovingAttribute += onRemovingAttribute;
        sanitizer.RemovingComment += onRemovingComment;
        sanitizer.RemovingCssClass += onRemovingCssClass;
        sanitizer.RemovingStyle += onRemovingStyle;
        sanitizer.RemovingTag += onRemovingTag;

        try
        {
            sanitizer.SanitizeDom(document as IHtmlDocument);
        }
        finally
        {
            sanitizer.FilterUrl -= onFilterUrl;
            sanitizer.RemovingAtRule -= onRemovingAtRule;
            sanitizer.RemovingAttribute -= onRemovingAttribute;
            sanitizer.RemovingComment -= onRemovingComment;
            sanitizer.RemovingCssClass -= onRemovingCssClass;
            sanitizer.RemovingStyle -= onRemovingStyle;
            sanitizer.RemovingTag -= onRemovingTag;
        }

        WriteToLog("HTML sanitized");

        if (!htmlChanged)
        {
            return false;
        }

        var sanitizedOutputFile = GetTempFile(".htm");
        // Assigned before writing, so it stays set when the write below fails
        outputUri = new ConvertUri(sanitizedOutputFile, inputUri.Encoding);

        try
        {
            WriteToLog($"Writing sanitized webpage to '{sanitizedOutputFile}'");

            // FileMode.CreateNew throws when the output file already exists
            using (var fileStream = new FileStream(sanitizedOutputFile, FileMode.CreateNew, FileAccess.Write))
            {
                if (inputUri.Encoding != null)
                {
                    using (var textWriter = new StreamWriter(fileStream, inputUri.Encoding))
                    {
                        document.ToHtml(textWriter, new HtmlMarkupFormatter());
                    }
                }
                else
                {
                    using (var textWriter = new StreamWriter(fileStream))
                    {
                        document.ToHtml(textWriter, new HtmlMarkupFormatter());
                    }
                }
            }

            WriteToLog("Sanitized webpage written");
            return true;
        }
        catch (Exception exception)
        {
            WriteToLog($"Could not write new html file '{sanitizedOutputFile}', error: {ExceptionHelpers.GetInnerException(exception)}");
            return false;
        }
    }
}
/// <summary>
///     Optionally sanitizes the HTML and validates all images: checks if they are rotated correctly
///     (when <paramref name="rotate"/> is set to <c>true</c>) and if they fit on the given
///     <paramref name="pageSettings"/> (when <paramref name="resize"/> is set to <c>true</c>).
///     When anything had to be changed, a local copy of the <paramref name="inputUri"/> page is
///     written to a temporary file.
/// </summary>
/// <param name="inputUri">The uri of the webpage</param>
/// <param name="sanitize">When set to <c>true</c> then the HTML is sanitized before the images
/// are validated</param>
/// <param name="resize">When set to <c>true</c> then an image is resized when needed</param>
/// <param name="rotate">When set to <c>true</c> then the EXIF information of an
/// image is read and when needed the image is automatically rotated</param>
/// <param name="pageSettings"><see cref="PageSettings"/></param>
/// <param name="outputUri">The uri of the changed copy of the webpage when this method returns
/// <c>false</c>. <c>null</c> when nothing was changed. Note that it is also set (while <c>true</c>
/// is returned) when writing the changed copy failed</param>
/// <returns>Returns <c>false</c> when a changed copy of the webpage has been written, otherwise
/// <c>true</c> (nothing changed or the copy could not be written)</returns>
/// <exception cref="WebException">Raised when the webpage from <paramref name="inputUri"/> could not be downloaded</exception>
public bool Cleanup(ConvertUri inputUri, bool sanitize, bool resize, bool rotate, PageSettings pageSettings, out ConvertUri outputUri)
{
    outputUri = null;

    // Only local pages have a directory that can be used to resolve relative image locations
    string localDirectory = null;

    if (inputUri.IsFile)
    {
        localDirectory = Path.GetDirectoryName(inputUri.OriginalString);
    }

    var webpage = inputUri.IsFile
        ? inputUri.Encoding != null
            ? File.ReadAllText(inputUri.OriginalString, inputUri.Encoding)
            : File.ReadAllText(inputUri.OriginalString)
        : DownloadString(inputUri);

    // Set when the page as a whole has to be written to a new file
    var changed = false;

    if (sanitize)
    {
        var sanitizer = new HtmlSanitizer();
        sanitizer.AllowedSchemes.Add("mailto");
        sanitizer.AllowedTags.Add("html");
        sanitizer.AllowedTags.Add("head");
        sanitizer.AllowedAttributes.Add("http-equiv");
        sanitizer.AllowedAttributes.Add("content");
        sanitizer.AllowedTags.Add("body");
        sanitizer.AllowedTags.Add("meta");
        sanitizer.AllowedAttributes.Add("class");
        sanitizer.AllowDataAttributes = true;

        var sanitizedWebPage = sanitizer.Sanitize(webpage, string.Empty, new AutoSelectedMarkupFormatter());

        if (webpage != sanitizedWebPage)
        {
            changed = true;
            webpage = sanitizedWebPage;
            WriteToLog("Webpage sanitized");
        }
    }

    var maxWidth = pageSettings.PaperWidth * 96.0;
    var maxHeight = pageSettings.PaperHeight * 96.0;

    var config = Configuration.Default.WithCss();
    var context = BrowsingContext.New(config);

    var document = inputUri.Encoding != null
        ? context.OpenAsync(m => m.Content(webpage).Header("Content-Type", $"text/html; charset={inputUri.Encoding.WebName}")).Result
        : context.OpenAsync(m => m.Content(webpage)).Result;

    // Images whose source has not been replaced with a temporary file in the loop below
    var unchangedImages = new List<IHtmlImageElement>();

    // ReSharper disable once PossibleInvalidCastExceptionInForeachLoop
    foreach (var htmlImage in document.Images)
    {
        if (string.IsNullOrWhiteSpace(htmlImage.Source))
        {
            WriteToLog($"HTML image tag '{htmlImage.TagName}' has no image source '{htmlImage.Source}'");
            continue;
        }

        Image image = null;

        // Tracks whether the source of THIS image has been replaced. The page wide 'changed' flag
        // cannot be used for that: once it is set (for example by the sanitizer) every following
        // image would wrongly be treated as changed and keep its original, possibly relative, source.
        var imageChanged = false;

        // Strip the query string before the extension for the temporary file is determined
        var extension = Path.GetExtension(htmlImage.Source.Contains("?") ? htmlImage.Source.Split('?')[0] : htmlImage.Source);
        var fileName = GetTempFile(extension);

        try
        {
            // The local width and height attributes always go before css width and height
            var width = htmlImage.DisplayWidth;
            var height = htmlImage.DisplayHeight;

            if (rotate)
            {
                image = htmlImage.Source.StartsWith("data:", StringComparison.InvariantCultureIgnoreCase)
                    ? GetImageFromBase64(htmlImage.Source)
                    : GetImage(new Uri(htmlImage.Source), localDirectory);

                // Images that cannot be loaded are skipped (the finally block still runs)
                if (image == null)
                {
                    continue;
                }

                if (RotateImageByExifOrientationData(image))
                {
                    htmlImage.DisplayWidth = image.Width;
                    htmlImage.DisplayHeight = image.Height;
                    changed = true;
                }

                width = image.Width;
                height = image.Height;

                if (!resize)
                {
                    WriteToLog($"Image rotated and saved to location '{fileName}'");
                    image.Save(fileName);
                    htmlImage.DisplayWidth = image.Width;
                    htmlImage.DisplayHeight = image.Height;
                    htmlImage.Source = new Uri(fileName).ToString();
                    imageChanged = true;
                }
            }

            if (resize)
            {
                // No size attributes on the tag, fall back to the computed css size
                if (height == 0 && width == 0)
                {
                    var style = context.Current.GetComputedStyle(htmlImage);

                    if (style != null)
                    {
                        width = ParseValue(style.GetPropertyValue("width"));
                        height = ParseValue(style.GetPropertyValue("height"));
                    }
                }

                // If we don't know the image size then get it from the image itself
                if (width <= 0 || height <= 0)
                {
                    if (image == null)
                    {
                        image = htmlImage.Source.StartsWith("data:", StringComparison.InvariantCultureIgnoreCase)
                            ? GetImageFromBase64(htmlImage.Source)
                            : GetImage(new Uri(htmlImage.Source), localDirectory);
                    }

                    if (image == null)
                    {
                        continue;
                    }

                    width = image.Width;
                    height = image.Height;
                }

                if (width > maxWidth || height > maxHeight)
                {
                    // If we did not load the image already then load it
                    if (image == null)
                    {
                        image = htmlImage.Source.StartsWith("data:", StringComparison.InvariantCultureIgnoreCase)
                            ? GetImageFromBase64(htmlImage.Source)
                            : GetImage(new Uri(htmlImage.Source), localDirectory);
                    }

                    if (image == null)
                    {
                        continue;
                    }

                    image = ScaleImage(image, (int)maxWidth);
                    WriteToLog($"Image resized to width {image.Width} and height {image.Height} and saved to location '{fileName}'");
                    image.Save(fileName);
                    htmlImage.DisplayWidth = image.Width;
                    htmlImage.DisplayHeight = image.Height;
                    htmlImage.Source = new Uri(fileName).ToString();
                    changed = true;
                    imageChanged = true;
                }
            }
        }
        finally
        {
            image?.Dispose();
        }

        if (!imageChanged)
        {
            unchangedImages.Add(htmlImage);
        }
    }

    if (!changed)
    {
        return true;
    }

    // The changed page is written to a temporary file, so every image that still points to its
    // original location has to be made reachable from there
    foreach (var unchangedImage in unchangedImages)
    {
        // Embedded images travel with the HTML itself and do not have to be relocated
        if (unchangedImage.Source.StartsWith("data:", StringComparison.InvariantCultureIgnoreCase))
        {
            continue;
        }

        var imageSource = new Uri(unchangedImage.Source);

        if (localDirectory != null)
        {
            // Point to the image in the directory of the original page, there is no need to load it
            var localFileName = Path.Combine(localDirectory, Path.GetFileName(imageSource.ToString()));
            unchangedImage.Source = new Uri(localFileName).ToString();
            continue;
        }

        using (var image = GetImage(imageSource, localDirectory))
        {
            if (image == null)
            {
                WriteToLog($"Could not load unchanged image from location '{unchangedImage.Source}'");
                continue;
            }

            var extension = Path.GetExtension(unchangedImage.Source.Contains("?") ? unchangedImage.Source.Split('?')[0] : unchangedImage.Source);
            var fileName = GetTempFile(extension);
            WriteToLog($"Unchanged image saved to location '{fileName}'");
            image.Save(fileName);
            unchangedImage.Source = new Uri(fileName).ToString();
        }
    }

    var outputFile = GetTempFile(".htm");
    // Assigned before writing, so it stays set when the write below fails
    outputUri = new ConvertUri(outputFile, inputUri.Encoding);

    try
    {
        // FileMode.CreateNew throws when the output file already exists
        using (var fileStream = new FileStream(outputFile, FileMode.CreateNew, FileAccess.Write))
        {
            if (inputUri.Encoding != null)
            {
                using (var textWriter = new StreamWriter(fileStream, inputUri.Encoding))
                {
                    document.ToHtml(textWriter, new AutoSelectedMarkupFormatter());
                }
            }
            else
            {
                using (var textWriter = new StreamWriter(fileStream))
                {
                    document.ToHtml(textWriter, new AutoSelectedMarkupFormatter());
                }
            }
        }

        return false;
    }
    catch (Exception exception)
    {
        WriteToLog($"Could not generate new html file '{outputFile}', error: {ExceptionHelpers.GetInnerException(exception)}");
        return true;
    }
}
/// <summary>
///     Returns the <see cref="Image"/> for the given <paramref name="imageSource"/>
/// </summary>
/// <remarks>
///     Supports base64 data uris (<c>data:image/...</c>), local files and <c>http</c>/<c>https</c>
///     uris. A local file that does not exist at its original location is looked up by its file
///     name in <paramref name="localDirectory"/>. The image data is always buffered in memory, so
///     no file handle or network stream stays open after this method returns.
/// </remarks>
/// <param name="imageSource">The value of the <c>src</c> attribute of the image</param>
/// <param name="localDirectory">The directory used to resolve local images that are not found at
/// their original location</param>
/// <returns>The loaded <see cref="Image"/>, or <c>null</c> when the image could not be loaded
/// (failures are written to the log instead of being thrown)</returns>
private Image GetImage(string imageSource, string localDirectory)
{
    if (imageSource.StartsWith("data:", StringComparison.InvariantCultureIgnoreCase))
    {
        WriteToLog("Decoding image from base64 string");

        try
        {
            var base64Data = Regex.Match(imageSource, @"data:image/(?<type>.+?),(?<data>.+)").Groups["data"].Value;
            var binaryData = Convert.FromBase64String(base64Data);

            // GDI+ needs the source stream for as long as the image lives, so the stream must
            // not be disposed here
            var stream = new MemoryStream(binaryData);
            var image = Image.FromStream(stream);

            WriteToLog("Image decoded");
            return image;
        }
        catch (Exception exception)
        {
            WriteToLog($"Error decoding image: {ExceptionHelpers.GetInnerException(exception)}");
            return null;
        }
    }

    try
    {
        WriteToLog($"Getting image from url '{imageSource}'");
        var imageUri = new Uri(imageSource);

        if (imageUri.IsFile)
        {
            var fileName = imageUri.LocalPath;

            if (!File.Exists(fileName))
            {
                fileName = Path.Combine(localDirectory, Path.GetFileName(imageUri.LocalPath));
            }

            if (File.Exists(fileName))
            {
                using (var fileStream = OpenFileStream(fileName))
                {
                    // OpenFileStream has already logged why the file could not be opened
                    if (fileStream == null)
                    {
                        return null;
                    }

                    // Buffer the file in memory so that the file handle can be released right
                    // away while the image keeps a stream that lives as long as it does
                    var fileData = new MemoryStream();
                    fileStream.CopyTo(fileData);
                    fileData.Position = 0;
                    return Image.FromStream(fileData, true, false);
                }
            }
        }

        switch (imageUri.Scheme)
        {
            case "https":
            case "http":
                using (var webStream = OpenDownloadStream(imageUri, true))
                {
                    if (webStream != null)
                    {
                        // Copy the download so the image does not depend on the network stream
                        // that is closed when this using block ends
                        var webData = new MemoryStream();
                        webStream.CopyTo(webData);
                        webData.Position = 0;
                        return Image.FromStream(webData, true, false);
                    }
                }

                break;

            case "file":
                // Reached when the local file could not be found
                WriteToLog("Ignoring local file");
                break;

            default:
                WriteToLog($"Unsupported scheme {imageUri.Scheme} to get image");
                return null;
        }
    }
    catch (Exception exception)
    {
        WriteToLog($"Getting image failed with exception: {ExceptionHelpers.GetInnerException(exception)}");
    }

    return null;
}
/// <summary>
///     Validates all images if they are rotated correctly (when <paramref name="rotate"/> is set
///     to <c>true</c>) and fit on the given <paramref name="pageSettings"/>.
///     If an image does need to be rotated or does not fit then a local copy is made of
///     the <paramref name="inputUri"/> file.
/// </summary>
/// <param name="inputUri">The uri of the webpage</param>
/// <param name="resize">When set to <c>true</c> then an image is resized when needed</param>
/// <param name="rotate">
///     When set to <c>true</c> then the EXIF information of an
///     image is read and when needed the image is automatically rotated
/// </param>
/// <param name="pageSettings"><see cref="PageSettings"/></param>
/// <param name="outputUri">
///     The uri of the changed local copy of the webpage. It is <c>null</c> when nothing had to be
///     changed or the webpage could not be parsed. It is set as soon as a change is detected, so
///     it only points to a written file when this method returns <c>true</c>
/// </param>
/// <param name="safeUrls">
///     A list with URL's that are safe to load, all locally saved images and the
///     <paramref name="outputUri"/> are added to this list
/// </param>
/// <param name="urlBlacklist">A list of URL's that need to be blocked (use * as a wildcard)</param>
/// <returns>
///     Returns <c>true</c> when one or more images were changed and the changed webpage has been
///     written to <paramref name="outputUri"/>. Returns <c>false</c> when nothing needed to be changed,
///     when the webpage could not be parsed or when the changed webpage could not be written
/// </returns>
/// <exception cref="WebException">Raised when the webpage from <paramref name="inputUri"/> could not be downloaded</exception>
public bool ValidateImages(
    ConvertUri inputUri,
    bool resize,
    bool rotate,
    PageSettings pageSettings,
    out ConvertUri outputUri,
    ref List<string> safeUrls,
    List<string> urlBlacklist)
{
    outputUri = null;

    // Only created when a web proxy is configured; disposed in the finally block below so that
    // the client and its handler are released on every exit path
    HttpClient proxyClient = null;

    try
    {
        using (var graphics = Graphics.FromHwnd(IntPtr.Zero))
        using (var webpage = inputUri.IsFile
            ? OpenFileStream(inputUri.OriginalString)
            : OpenDownloadStream(inputUri))
        {
            WriteToLog($"DPI settings for image, x: '{graphics.DpiX}' and y: '{graphics.DpiY}'");

            var maxWidth = (pageSettings.PaperWidth - pageSettings.MarginLeft - pageSettings.MarginRight) * graphics.DpiX;
            var maxHeight = (pageSettings.PaperHeight - pageSettings.MarginTop - pageSettings.MarginBottom) * graphics.DpiY;

            var localDirectory = inputUri.IsFile
                ? Path.GetDirectoryName(inputUri.OriginalString)
                : null;

            var htmlChanged = false;

            IConfiguration config;

            if (_webProxy != null)
            {
                WriteToLog($"Using web proxy '{_webProxy.Address}' to download images");

                // NOTE(review): this callback accepts every server certificate when a proxy
                // is configured - confirm that this is intended
                var httpClientHandler = new HttpClientHandler
                {
                    Proxy = _webProxy,
                    ServerCertificateCustomValidationCallback = (message, certificate, arg1, arg2) =>
                    {
                        WriteToLog($"Accepting certificate '{certificate.Subject}', message '{message}'");
                        return true;
                    }
                };

                // Disposing the client also disposes the handler
                proxyClient = new HttpClient(httpClientHandler);

                config = Configuration.Default
                    .With(new HttpClientRequester(proxyClient))
                    .WithTemporaryCookies()
                    .WithDefaultLoader()
                    .WithCss();
            }
            else
            {
                config = Configuration.Default.WithCss();
            }

            var context = BrowsingContext.New(config);

            IDocument document;

            try
            {
                // ReSharper disable AccessToDisposedClosure
                document = inputUri.Encoding != null
                    ? context.OpenAsync(m => m.Content(webpage)
                        .Header("Content-Type", $"text/html; charset={inputUri.Encoding.WebName}")
                        .Address(inputUri.ToString())).Result
                    : context.OpenAsync(m => m.Content(webpage).Address(inputUri.ToString())).Result;
                // ReSharper restore AccessToDisposedClosure
            }
            catch (Exception exception)
            {
                WriteToLog($"Exception occurred in AngleSharp: {ExceptionHelpers.GetInnerException(exception)}");
                return false;
            }

            WriteToLog("Validating all images if they need to be rotated and if they fit the page");

            var unchangedImages = new List<IHtmlImageElement>();

            // Everything up to and including the last slash of the input uri
            var absoluteUri = inputUri.AbsoluteUri.Substring(0, inputUri.AbsoluteUri.LastIndexOf('/') + 1);

            // ReSharper disable once PossibleInvalidCastExceptionInForeachLoop
            foreach (var htmlImage in document.Images)
            {
                var imageChanged = false;

                if (string.IsNullOrWhiteSpace(htmlImage.Source))
                {
                    WriteToLog($"HTML image tag '{htmlImage.TagName}' has no image source '{htmlImage.Source}'");
                    continue;
                }

                Image image = null;

                // The blacklist and safe url checks are done on the source without its query string
                var source = htmlImage.Source.Contains("?")
                    ? htmlImage.Source.Split('?')[0]
                    : htmlImage.Source;

                var isSafeUrl = safeUrls.Contains(source);
                var isAbsoluteUri = source.StartsWith(absoluteUri, StringComparison.InvariantCultureIgnoreCase);
                var isBlacklisted = RegularExpression.IsRegExMatch(urlBlacklist, source, out var matchedPattern);

                if (isBlacklisted && !isAbsoluteUri && !isSafeUrl)
                {
                    WriteToLog($"The url '{source}' has been blocked by url blacklist pattern '{matchedPattern}'");
                    continue;
                }

                if (isAbsoluteUri)
                {
                    WriteToLog($"The url '{source}' has been allowed because it start with the absolute uri '{absoluteUri}'");
                }
                else if (isSafeUrl)
                {
                    WriteToLog($"The url '{source}' has been allowed because it is on the safe url list");
                }
                else
                {
                    WriteToLog($"The url '{source}' has been allowed because it did not match anything on the url blacklist");
                }

                var extension = Path.GetExtension(FileManager.RemoveInvalidFileNameChars(source));
                var fileName = GetTempFile(extension);

                try
                {
                    // The local width and height attributes always go before css width and height
                    var width = htmlImage.DisplayWidth;
                    var height = htmlImage.DisplayHeight;

                    if (rotate)
                    {
                        image = GetImage(htmlImage.Source, localDirectory);

                        if (image == null)
                        {
                            continue;
                        }

                        if (RotateImageByExifOrientationData(image))
                        {
                            WriteToLog($"Image rotated and saved to location '{fileName}'");
                            image.Save(fileName);

                            htmlImage.DisplayWidth = image.Width;
                            htmlImage.DisplayHeight = image.Height;
                            htmlImage.SetStyle(string.Empty);

                            var newSrc = new Uri(fileName).ToString();
                            WriteToLog($"Adding url '{newSrc}' to the safe url list");
                            safeUrls.Add(newSrc);
                            htmlImage.Source = newSrc;

                            htmlChanged = true;
                            imageChanged = true;
                        }

                        width = image.Width;
                        height = image.Height;
                    }

                    if (resize)
                    {
                        if (height == 0 && width == 0)
                        {
                            ICssStyleDeclaration style = null;

                            try
                            {
                                style = context.Current.GetComputedStyle(htmlImage);
                            }
                            catch (Exception exception)
                            {
                                WriteToLog($"Could not get computed style from html image, exception: '{exception.Message}'");
                            }

                            if (style != null)
                            {
                                width = ParseValue(style.GetPropertyValue("width"));
                                height = ParseValue(style.GetPropertyValue("height"));
                            }
                        }

                        // If we don't know the image size then get it from the image itself
                        if (width <= 0 || height <= 0)
                        {
                            if (image == null)
                            {
                                image = GetImage(htmlImage.Source, localDirectory);
                            }

                            if (image == null)
                            {
                                continue;
                            }

                            width = image.Width;
                            height = image.Height;
                        }

                        if (width > maxWidth || height > maxHeight)
                        {
                            // If we did not load the image already then load it
                            if (image == null)
                            {
                                image = GetImage(htmlImage.Source, localDirectory);
                            }

                            if (image == null)
                            {
                                continue;
                            }

                            ScaleImage(image, (int)maxWidth, out var newWidth, out var newHeight);
                            WriteToLog($"Image rescaled to width {newWidth} and height {newHeight}");
                            htmlImage.DisplayWidth = newWidth;
                            htmlImage.DisplayHeight = newHeight;
                            htmlImage.SetStyle(string.Empty);
                            htmlChanged = true;
                        }
                    }
                }
                finally
                {
                    image?.Dispose();
                }

                // Only the source of a rotated image is rewritten above; every other image that
                // was processed is saved to a local file further below
                if (!imageChanged)
                {
                    unchangedImages.Add(htmlImage);
                }
            }

            if (!htmlChanged)
            {
                return false;
            }

            foreach (var unchangedImage in unchangedImages)
            {
                using (var image = GetImage(unchangedImage.Source, localDirectory))
                {
                    if (image == null)
                    {
                        WriteToLog($"Could not load unchanged image from location '{unchangedImage.Source}'");
                        continue;
                    }

                    var sourceWithoutQuery = unchangedImage.Source.Contains("?")
                        ? unchangedImage.Source.Split('?')[0]
                        : unchangedImage.Source;

                    // Same clean up as in the first loop, so that characters that are not
                    // allowed in a path can't make Path.GetExtension fail
                    var extension = Path.GetExtension(FileManager.RemoveInvalidFileNameChars(sourceWithoutQuery));
                    var fileName = GetTempFile(extension);

                    WriteToLog($"Unchanged image saved to location '{fileName}'");
                    image.Save(fileName);

                    var newSrc = new Uri(fileName).ToString();
                    safeUrls.Add(newSrc);
                    unchangedImage.Source = newSrc;
                }
            }

            var outputFile = GetTempFile(".htm");
            outputUri = new ConvertUri(outputFile, inputUri.Encoding);
            safeUrls.Add(outputUri.ToString());

            try
            {
                WriteToLog($"Writing changed webpage to '{outputFile}'");

                using (var fileStream = new FileStream(outputFile, FileMode.CreateNew, FileAccess.Write))
                using (var textWriter = inputUri.Encoding != null
                    ? new StreamWriter(fileStream, inputUri.Encoding)
                    : new StreamWriter(fileStream))
                {
                    document.ToHtml(textWriter, new HtmlMarkupFormatter());
                }

                WriteToLog("Changed webpage written");
                return true;
            }
            catch (Exception exception)
            {
                WriteToLog($"Could not write new html file '{outputFile}', error: {ExceptionHelpers.GetInnerException(exception)}");
                return false;
            }
        }
    }
    finally
    {
        proxyClient?.Dispose();
    }
}
/// <summary>
///     Sanitizes the HTML by removing all forbidden elements
/// </summary>
/// <param name="inputUri">The uri of the webpage</param>
/// <param name="mediaLoadTimeout">
///     The media load timeout or <c>null</c> when not set (this value is not used by this method)
/// </param>
/// <param name="sanitizer">
///     The <see cref="HtmlSanitizer"/> to use, when <c>null</c> a new default
///     <see cref="HtmlSanitizer"/> is created
/// </param>
/// <param name="outputUri">
///     The uri of the sanitized local copy of the webpage. It is <c>null</c> when the webpage could
///     not be parsed or did not need any sanitization. It is set as soon as a change is detected,
///     so it only points to a written file when this method returns <c>true</c>
/// </param>
/// <param name="safeUrls">
///     A list of safe URL's, the <paramref name="outputUri"/> and (for local webpages) the
///     non http(s) image sources are added to this list
/// </param>
/// <returns>
///     Returns <c>true</c> when the HTML was changed by the sanitizer and the sanitized webpage has
///     been written to <paramref name="outputUri"/>. Returns <c>false</c> when nothing was changed,
///     when the webpage could not be parsed or when the sanitized webpage could not be written
/// </returns>
public bool SanitizeHtml(
    ConvertUri inputUri,
    int? mediaLoadTimeout,
    HtmlSanitizer sanitizer,
    out ConvertUri outputUri,
    ref List<string> safeUrls)
{
    outputUri = null;

    using (var webpage = inputUri.IsFile
        ? OpenFileStream(inputUri.OriginalString)
        : OpenDownloadStream(inputUri))
    {
        var htmlChanged = false;
        var config = Configuration.Default.WithCss();
        var context = BrowsingContext.New(config);

        IDocument document;

        try
        {
            // ReSharper disable AccessToDisposedClosure
            document = inputUri.Encoding != null
                ? context.OpenAsync(m => m.Content(webpage)
                    .Header("Content-Type", $"text/html; charset={inputUri.Encoding.WebName}")
                    .Address(inputUri.ToString())).Result
                : context.OpenAsync(m => m.Content(webpage).Address(inputUri.ToString())).Result;
            // ReSharper restore AccessToDisposedClosure
        }
        catch (Exception exception)
        {
            WriteToLog($"Exception occurred in AngleSharp: {ExceptionHelpers.GetInnerException(exception)}");
            return false;
        }

        WriteToLog("Sanitizing HTML");

        var activeSanitizer = sanitizer ?? new HtmlSanitizer();

        // The handlers are kept in variables so that they can be detached again. The sanitizer can
        // be supplied (and reused) by the caller; without detaching, every call would add another
        // set of handlers to it
        EventHandler<FilterUrlEventArgs> onFilterUrl = (sender, args) =>
        {
            if (args.OriginalUrl != args.SanitizedUrl)
            {
                WriteToLog($"URL sanitized from '{args.OriginalUrl}' to '{args.SanitizedUrl}'");
                htmlChanged = true;
            }
        };

        EventHandler<RemovingAtRuleEventArgs> onRemovingAtRule = (sender, args) =>
        {
            WriteToLog($"Removing CSS at-rule '{args.Rule.CssText}' from tag '{args.Tag.TagName}'");
            htmlChanged = true;
        };

        EventHandler<RemovingAttributeEventArgs> onRemovingAttribute = (sender, args) =>
        {
            WriteToLog($"Removing attribute '{args.Attribute.Name}' from tag '{args.Tag.TagName}', reason '{args.Reason}'");
            htmlChanged = true;
        };

        EventHandler<RemovingCommentEventArgs> onRemovingComment = (sender, args) =>
        {
            WriteToLog($"Removing comment '{args.Comment.TextContent}'");
            htmlChanged = true;
        };

        EventHandler<RemovingCssClassEventArgs> onRemovingCssClass = (sender, args) =>
        {
            WriteToLog($"Removing CSS class '{args.CssClass}' from tag '{args.Tag.TagName}', reason '{args.Reason}'");
            htmlChanged = true;
        };

        EventHandler<RemovingStyleEventArgs> onRemovingStyle = (sender, args) =>
        {
            WriteToLog($"Removing style '{args.Style.Name}' from tag '{args.Tag.TagName}', reason '{args.Reason}'");
            htmlChanged = true;
        };

        EventHandler<RemovingTagEventArgs> onRemovingTag = (sender, args) =>
        {
            WriteToLog($"Removing tag '{args.Tag.TagName}', reason '{args.Reason}'");
            htmlChanged = true;
        };

        activeSanitizer.FilterUrl += onFilterUrl;
        activeSanitizer.RemovingAtRule += onRemovingAtRule;
        activeSanitizer.RemovingAttribute += onRemovingAttribute;
        activeSanitizer.RemovingComment += onRemovingComment;
        activeSanitizer.RemovingCssClass += onRemovingCssClass;
        activeSanitizer.RemovingStyle += onRemovingStyle;
        activeSanitizer.RemovingTag += onRemovingTag;

        try
        {
            activeSanitizer.SanitizeDom(document as IHtmlDocument);
        }
        finally
        {
            activeSanitizer.FilterUrl -= onFilterUrl;
            activeSanitizer.RemovingAtRule -= onRemovingAtRule;
            activeSanitizer.RemovingAttribute -= onRemovingAttribute;
            activeSanitizer.RemovingComment -= onRemovingComment;
            activeSanitizer.RemovingCssClass -= onRemovingCssClass;
            activeSanitizer.RemovingStyle -= onRemovingStyle;
            activeSanitizer.RemovingTag -= onRemovingTag;
        }

        if (!htmlChanged)
        {
            WriteToLog("HTML did not need any sanitization");
            return false;
        }

        WriteToLog("HTML sanitized");

        var sanitizedOutputFile = GetTempFile(".htm");
        outputUri = new ConvertUri(sanitizedOutputFile, inputUri.Encoding);
        var url = outputUri.ToString();
        WriteToLog($"Adding url '{url}' to the safe url list");
        safeUrls.Add(url);

        try
        {
            if (document.BaseUrl.Scheme.StartsWith("file", StringComparison.Ordinal))
            {
                var images = document.DocumentElement.Descendents()
                    .Where(x => x.NodeType == NodeType.Element)
                    .OfType<IHtmlImageElement>();

                foreach (var image in images)
                {
                    var src = image.Source;

                    // An image without a source has nothing to update; skipping it prevents that
                    // a single empty image tag makes writing the whole sanitized page fail
                    if (string.IsNullOrWhiteSpace(src))
                    {
                        continue;
                    }

                    if (src.StartsWith("http://", StringComparison.InvariantCultureIgnoreCase) ||
                        src.StartsWith("https://", StringComparison.InvariantCultureIgnoreCase))
                    {
                        continue;
                    }

                    // The value read from Source is written back to the element
                    // NOTE(review): presumably this turns a relative src into the resolved
                    // location because the sanitized page is written to another folder - confirm
                    WriteToLog($"Updating image source to '{src}' and adding it to the safe url list");
                    safeUrls.Add(src);
                    image.Source = src;
                }
            }

            WriteToLog($"Writing sanitized webpage to '{sanitizedOutputFile}'");

            using (var fileStream = new FileStream(sanitizedOutputFile, FileMode.CreateNew, FileAccess.Write))
            using (var textWriter = inputUri.Encoding != null
                ? new StreamWriter(fileStream, inputUri.Encoding)
                : new StreamWriter(fileStream))
            {
                document.ToHtml(textWriter, new HtmlMarkupFormatter());
            }

            WriteToLog("Sanitized webpage written");
            return true;
        }
        catch (Exception exception)
        {
            WriteToLog($"Could not write new html file '{sanitizedOutputFile}', error: {ExceptionHelpers.GetInnerException(exception)}");
            return false;
        }
    }
}