/// <summary>
/// Rewrites the <c>href</c> of <c>link</c> elements and the source of <c>script</c> elements in each
/// input to the value returned by <c>DownloadAndReplace</c>, unless the element carries a
/// <c>data-no-mirror</c> attribute.
/// </summary>
/// <param name="inputs">The documents to process.</param>
/// <param name="context">The execution context used to obtain content streams and result documents.</param>
/// <returns>
/// For each input, a new document holding the re-serialized HTML when at least one reference was
/// replaced; otherwise the input document itself.
/// </returns>
public IEnumerable<Common.Documents.IDocument> Execute(IReadOnlyList<Common.Documents.IDocument> inputs, IExecutionContext context)
{
#pragma warning disable RCS1163 // Unused parameter.
    // Handle invalid HTTPS certificates and allow alternate security protocols (see http://stackoverflow.com/a/5670954/807064)
    // NOTE(review): this assigns a process-wide callback that accepts every certificate.
    ServicePointManager.ServerCertificateValidationCallback = (s, cert, chain, ssl) => true;
#pragma warning restore RCS1163 // Unused parameter.

    // Shared across all inputs so a resource that was already handled can be looked up again
    Dictionary<string, string> mirrorCache = new Dictionary<string, string>();

    // Inputs are walked one after another (no AsParallel) so the same resource is not downloaded twice
    HtmlParser parser = new HtmlParser();
    return inputs.Select(context, input =>
    {
        IHtmlDocument html = input.ParseHtml(parser);
        if (html == null)
        {
            return input;
        }

        bool changed = false;

        // <link href="..."> elements
        IEnumerable<IElement> linkElements = html
            .GetElementsByTagName("link")
            .Where(x => x.HasAttribute("href") && !x.HasAttribute("data-no-mirror"));
        foreach (IElement linkElement in linkElements)
        {
            string mirrored = DownloadAndReplace(linkElement.GetAttribute("href"), linkElement, mirrorCache, context);
            if (mirrored != null)
            {
                linkElement.Attributes["href"].Value = mirrored;
                changed = true;
            }
        }

        // <script src="..."> elements
        IEnumerable<IHtmlScriptElement> scriptElements = html.Scripts
            .Where(x => !string.IsNullOrEmpty(x.Source) && !x.HasAttribute("data-no-mirror"));
        foreach (IHtmlScriptElement scriptElement in scriptElements)
        {
            string mirrored = DownloadAndReplace(scriptElement.Source, scriptElement, mirrorCache, context);
            if (mirrored != null)
            {
                scriptElement.Source = mirrored;
                changed = true;
            }
        }

        // Nothing was replaced, so the original document can be passed through untouched
        if (!changed)
        {
            return input;
        }

        // Serialize the modified DOM into a fresh content stream and wrap it in a new document
        Stream contentStream = context.GetContentStream();
        using (StreamWriter writer = contentStream.GetWriter())
        {
            html.ToHtml(writer, ProcessingInstructionFormatter.Instance);
            writer.Flush();
            return context.GetDocument(input, contentStream);
        }
    });
}
/// <summary>
/// Serializes the document with <c>XmlMarkupFormatter.Instance</c> and writes it to a file.
/// </summary>
/// <param name="doc">The document to serialize.</param>
/// <param name="filePath">Path of the file to write; it is opened with <see cref="FileMode.Create"/>.</param>
/// <param name="encoding">The text encoding used for the written file.</param>
public static void WriteToFile(this IHtmlDocument doc, string filePath, Encoding encoding)
{
    // The file stream gets its own using so the handle is released even when the
    // StreamWriter constructor throws (for example on a null encoding).
    using (FileStream stream = File.Open(filePath, FileMode.Create))
    using (var sw = new StreamWriter(stream, encoding))
    {
        doc.ToHtml(sw, XmlMarkupFormatter.Instance);
    }
}
/// <summary>
/// Moves the CSS found for the current HTML document into inline style attributes.
/// </summary>
/// <param name="removeStyleElements">If set to <c>true</c> the style and link nodes that supplied CSS are removed.</param>
/// <param name="ignoreElements">CSS selector for STYLE elements to ignore (e.g. mobile-specific styles etc.)</param>
/// <param name="css">A string containing a style-sheet for inlining.</param>
/// <param name="stripIdAndClassAttributes">True to strip the <c>id</c> and <c>class</c> attributes after styles were applied.</param>
/// <param name="removeComments">True to remove comment nodes, false to leave them intact.</param>
/// <param name="keepMediaQueries">
/// True to prepend a <c>style</c> element holding the collected media queries to the body.
/// This only happens when <paramref name="removeStyleElements"/> is also <c>true</c>.
/// </param>
/// <returns>The serialized HTML with styles moved to inline attributes, together with the collected warnings.</returns>
public InlineResult MoveCssInline(bool removeStyleElements = false, string ignoreElements = null, string css = null, bool stripIdAndClassAttributes = false, bool removeComments = false, bool keepMediaQueries = false)
{
    // Remember the options; the helper methods called below read them from these fields
    _removeStyleElements = removeStyleElements;
    _stripIdAndClassAttributes = stripIdAndClassAttributes;
    _ignoreElements = ignoreElements;
    _keepMediaQueries = keepMediaQueries;
    _css = css;

    // Collect every CSS source we can work with: style nodes first, then link nodes
    var styleNodes = CssSourceNodes();
    var linkNodes = CssLinkNodes();

    var sources = new List<ICssSource>(ConvertToStyleSources(styleNodes));
    sources.AddRange(ConvertToStyleSources(linkNodes));

    var blocks = GetCssBlocks(sources);

    // The CSS text has been captured above, so the originating nodes can go now
    if (_removeStyleElements)
    {
        RemoveStyleElements(styleNodes);
        RemoveStyleElements(linkNodes);
    }

    var joined = Join(blocks);
    var styleBlocks = joined.Styles;
    var mediaQueries = joined.MediaQueries;

    // Selector clean-up -> element matching -> merge per element -> write inline styles
    var usableSelectors = CleanUnsupportedSelectors(styleBlocks);
    var matchedElements = FindElementsWithStyles(usableSelectors);
    var stylesPerElement = MergeStyleClasses(matchedElements);

    StyleClassApplier.ApplyAllStyles(stylesPerElement);

    if (_stripIdAndClassAttributes)
    {
        StripElementAttributes("id", "class");
    }

    if (removeStyleElements && _keepMediaQueries)
    {
        // NOTE(review): the media queries are joined with "," - confirm that this separator is intended
        var mediaStyleElement = _document.CreateElement("style");
        mediaStyleElement.TextContent = String.Join(",", mediaQueries);
        _document.Body.Prepend(mediaStyleElement);
    }

    if (removeComments)
    {
        // Snapshot the comment nodes first because removing nodes changes the tree being walked
        foreach (var commentNode in _document.Descendents<IComment>().ToList())
        {
            commentNode.Remove();
        }
    }

    var markup = _document.ToHtml(new AutoSelectedMarkupFormatter(_document.Doctype));

    return new InlineResult(markup, _warnings);
}
/// <summary>
/// Moves the CSS found for the current HTML document into inline style attributes.
/// </summary>
/// <param name="removeStyleElements">If set to <c>true</c> the style and link nodes that supplied CSS are removed.</param>
/// <param name="ignoreElements">CSS selector for STYLE elements to ignore (e.g. mobile-specific styles etc.)</param>
/// <param name="css">A string containing a style-sheet for inlining; it is added after the document's own sources.</param>
/// <param name="stripIdAndClassAttributes">True to strip the <c>id</c> and <c>class</c> attributes after styles were applied.</param>
/// <param name="removeComments">True to remove comment nodes, false to leave them intact.</param>
/// <param name="precompiledStyles">
/// Optional selector-to-style entries that are added to the selectors extracted from the CSS
/// before elements are matched. Ignored when <c>null</c>.
/// </param>
/// <returns>The serialized HTML with styles moved to inline attributes, together with the collected warnings.</returns>
public InlineResult MoveCssInline(bool removeStyleElements = false, string ignoreElements = null, string css = null, bool stripIdAndClassAttributes = false, bool removeComments = false, SortedList<string, StyleClass> precompiledStyles = null)
{
    // Remember the options; the helper methods called below read them from these fields
    _removeStyleElements = removeStyleElements;
    _stripIdAndClassAttributes = stripIdAndClassAttributes;
    _ignoreElements = ignoreElements;

    // Collect every CSS source: style nodes, link nodes, then the explicitly supplied style-sheet
    var styleNodes = CssSourceNodes();
    var linkNodes = CssLinkNodes();

    var sources = new List<ICssSource>(ConvertToStyleSources(styleNodes));
    sources.AddRange(ConvertToStyleSources(linkNodes));
    sources.AddRange(PreMailer.ConvertToStyleSources(css));

    var blocks = PreMailer.GetCssBlocks(sources);

    // The CSS text has been captured above, so the originating nodes can go now
    if (_removeStyleElements)
    {
        RemoveStyleElements(styleNodes);
        RemoveStyleElements(linkNodes);
    }

    var styleBlocks = PreMailer.Join(blocks);
    var usableSelectors = CleanUnsupportedSelectors(styleBlocks);

    if (precompiledStyles != null)
    {
        // NOTE(review): Add is used, so a selector present in both collections presumably throws - confirm
        foreach (var precompiled in precompiledStyles)
        {
            usableSelectors.Add(precompiled.Key, precompiled.Value);
        }
    }

    // Element matching -> merge per element -> write inline styles
    var matchedElements = FindElementsWithStyles(usableSelectors);
    var stylesPerElement = MergeStyleClasses(matchedElements);

    StyleClassApplier.ApplyAllStyles(stylesPerElement);

    if (_stripIdAndClassAttributes)
    {
        StripElementAttributes("id", "class");
    }

    if (removeComments)
    {
        // Snapshot the comment nodes first because removing nodes changes the tree being walked
        foreach (var commentNode in _document.Descendents<IComment>().ToList())
        {
            commentNode.Remove();
        }
    }

    var markup = _document.ToHtml(new AutoSelectedMarkupFormatter(_document.Doctype));

    return new InlineResult(markup, _warnings);
}
/// <summary>
/// Moves the CSS found for the current HTML document into inline style attributes.
/// </summary>
/// <param name="removeStyleElements">If set to <c>true</c> the style and link nodes that supplied CSS are removed.</param>
/// <param name="ignoreElements">CSS selector for STYLE elements to ignore (e.g. mobile-specific styles etc.)</param>
/// <param name="css">A string containing a style-sheet for inlining.</param>
/// <param name="stripIdAndClassAttributes">True to strip the <c>id</c> and <c>class</c> attributes after styles were applied.</param>
/// <param name="removeComments">True to remove comment nodes, false to leave them intact.</param>
/// <returns>The serialized HTML with styles moved to inline attributes, together with the collected warnings.</returns>
public InlineResult MoveCssInline(bool removeStyleElements = false, string ignoreElements = null, string css = null, bool stripIdAndClassAttributes = false, bool removeComments = false)
{
    // Remember the options; the helper methods called below read them from these fields
    _removeStyleElements = removeStyleElements;
    _stripIdAndClassAttributes = stripIdAndClassAttributes;
    _ignoreElements = ignoreElements;
    _css = css;

    // Collect every CSS source we can work with: style nodes first, then link nodes
    var styleNodes = CssSourceNodes();
    var linkNodes = CssLinkNodes();

    var sources = new List<ICssSource>(ConvertToStyleSources(styleNodes));
    sources.AddRange(ConvertToStyleSources(linkNodes));

    var blocks = GetCssBlocks(sources);

    // The CSS text has been captured above, so the originating nodes can go now
    if (_removeStyleElements)
    {
        RemoveStyleElements(styleNodes);
        RemoveStyleElements(linkNodes);
    }

    // Join -> selector clean-up -> element matching -> merge per element -> write inline styles
    var styleBlocks = Join(blocks);
    var usableSelectors = CleanUnsupportedSelectors(styleBlocks);
    var matchedElements = FindElementsWithStyles(usableSelectors);
    var stylesPerElement = MergeStyleClasses(matchedElements);

    StyleClassApplier.ApplyAllStyles(stylesPerElement);

    if (_stripIdAndClassAttributes)
    {
        StripElementAttributes("id", "class");
    }

    if (removeComments)
    {
        // Snapshot the comment nodes first because removing nodes changes the tree being walked
        foreach (var commentNode in _document.Descendents<IComment>().ToList())
        {
            commentNode.Remove();
        }
    }

    IMarkupFormatter formatter = GetMarkupFormatterForDocType();

    using (var output = new StringWriter())
    {
        _document.ToHtml(output, formatter);

        // The result receives the writer's underlying StringBuilder rather than a copied string
        return new InlineResult(output.GetStringBuilder(), _warnings);
    }
}
/// <inheritdoc />
public IEnumerable<Common.Documents.IDocument> Execute(IReadOnlyList<Common.Documents.IDocument> inputs, IExecutionContext context)
{
    HtmlParser parser = new HtmlParser();
    return inputs.AsParallel().Select(context, input =>
    {
        // Content to insert; a null value means there is nothing to do for this input
        string content = _content.Invoke<string>(input, context);
        if (content == null)
        {
            return input;
        }

        // Inputs that cannot be parsed as HTML are passed through unchanged
        IHtmlDocument html = input.ParseHtml(parser);
        if (html == null)
        {
            return input;
        }

        try
        {
            // Without a query selector there is no insertion target
            if (string.IsNullOrWhiteSpace(_querySelector))
            {
                return input;
            }

            // Either the first match only (which may be null) or every match
            IElement[] targets;
            if (_first)
            {
                targets = new[] { html.QuerySelector(_querySelector) };
            }
            else
            {
                targets = html.QuerySelectorAll(_querySelector).ToArray();
            }

            // No match at all: an empty array, or a single null entry from QuerySelector
            if (targets.Length == 0 || targets[0] == null)
            {
                return input;
            }

            foreach (IElement target in targets)
            {
                target.Insert(_position, content);
            }

            // Serialize the modified DOM into a fresh content stream and wrap it in a new document
            Stream contentStream = context.GetContentStream();
            using (StreamWriter writer = contentStream.GetWriter())
            {
                html.ToHtml(writer, ProcessingInstructionFormatter.Instance);
                writer.Flush();
                return context.GetDocument(input, contentStream);
            }
        }
        catch (Exception ex)
        {
            // Best effort: log the failure and fall back to the unmodified input
            Trace.Warning("Exception while processing HTML for {0}: {1}", input.SourceString(), ex.Message);
            return input;
        }
    });
}
/// <summary>
/// Inserts the configured content at the configured position of every element matched by the
/// query selector (or only the first match) and returns the re-serialized document.
/// </summary>
/// <param name="input">The document to process.</param>
/// <param name="context">The execution context used for content streams, content providers and logging.</param>
/// <returns>
/// A single-element sequence: a clone of <paramref name="input"/> with the new HTML content when
/// something was inserted, otherwise <paramref name="input"/> itself.
/// </returns>
protected override async Task<IEnumerable<Common.IDocument>> ExecuteInputAsync(Common.IDocument input, IExecutionContext context)
{
    // Content to insert; a null value means there is nothing to do for this input
    string content = await _content.GetValueAsync(input, context);
    if (content == null)
    {
        return input.Yield();
    }

    // Inputs that cannot be parsed as HTML are passed through unchanged
    IHtmlDocument html = await input.ParseHtmlAsync(context, HtmlParser);
    if (html == null)
    {
        return input.Yield();
    }

    try
    {
        // Without a query selector there is no insertion target
        if (string.IsNullOrWhiteSpace(_querySelector))
        {
            return input.Yield();
        }

        // Either the first match only (which may be null) or every match
        IElement[] targets;
        if (_first)
        {
            targets = new[] { html.QuerySelector(_querySelector) };
        }
        else
        {
            targets = html.QuerySelectorAll(_querySelector).ToArray();
        }

        // No match at all: an empty array, or a single null entry from QuerySelector
        if (targets.Length == 0 || targets[0] == null)
        {
            return input.Yield();
        }

        foreach (IElement target in targets)
        {
            target.Insert(_position, content);
        }

        // Serialize the modified DOM and clone the input around the new content
        using (Stream contentStream = await context.GetContentStreamAsync())
        using (StreamWriter writer = contentStream.GetWriter())
        {
            html.ToHtml(writer, ProcessingInstructionFormatter.Instance);
            writer.Flush();
            return input.Clone(context.GetContentProvider(contentStream, MediaTypes.Html)).Yield();
        }
    }
    catch (Exception ex)
    {
        // Best effort: log the failure and fall back to the unmodified input
        context.LogWarning("Exception while processing HTML for {0}: {1}", input.ToSafeDisplayString(), ex.Message);
        return input.Yield();
    }
}
/// <summary>
/// Serializes the document to a string using <c>XmlMarkupFormatter.Instance</c>.
/// </summary>
/// <param name="doc">The document to serialize.</param>
/// <returns>The markup produced by the formatter.</returns>
public static string GetHtml(this IHtmlDocument doc)
{
    // Write straight into a string buffer. The previous implementation encoded the text to
    // UTF-8 in a MemoryStream and decoded it again, with a reader and a writer both owning
    // (and closing) the same stream.
    using (var sw = new StringWriter())
    {
        doc.ToHtml(sw, XmlMarkupFormatter.Instance);
        return sw.ToString();
    }
}
/// <summary>
/// Parses <paramref name="html"/> and returns the inner HTML of the first element matching
/// <paramref name="cssSelector"/>.
/// </summary>
/// <param name="cssSelector">The CSS selector to evaluate against the parsed document.</param>
/// <param name="html">The HTML source to parse.</param>
/// <returns>The inner HTML of the first matching element.</returns>
/// <exception cref="CssSelectorException">
/// No element matches the selector. The exception carries the serialized document and the selector.
/// </exception>
public static string SelectByCssSelector(string cssSelector, string html)
{
    HtmlParser htmlParser = new HtmlParser();

    // The parsed document is disposable; release it once the requested string has been extracted
    using (IHtmlDocument document = htmlParser.ParseDocument(html))
    {
        IElement element = document.QuerySelector(cssSelector);

        if (element == null)
        {
            // The serialized markup is captured here, before the document is disposed
            throw new CssSelectorException() { Html = document.ToHtml(), CssSelector = cssSelector };
        }

        return element.InnerHtml;
    }
}
/// <inheritdoc />
protected override async Task<IEnumerable<IDocument>> ExecuteContextAsync(IExecutionContext context)
{
    HtmlParser parser = new HtmlParser();

    // Every pooled engine is initialized with either the embedded script or the configured file
    IJavaScriptEnginePool enginePool = context.GetJavaScriptEnginePool(engine =>
    {
        if (string.IsNullOrWhiteSpace(_highlightJsFile))
        {
            engine.ExecuteResource("highlight-all.js", typeof(HighlightCode));
        }
        else
        {
            engine.ExecuteFile(_highlightJsFile);
        }
    });

    using (enginePool)
    {
        IEnumerable<IDocument> highlighted = await context.Inputs.ParallelSelectAsync(async input =>
        {
            try
            {
                using (Stream stream = input.GetContentStream())
                using (IHtmlDocument html = await parser.ParseAsync(stream))
                {
                    foreach (AngleSharp.Dom.IElement codeElement in html.QuerySelectorAll(_codeQuerySelector))
                    {
                        // Don't highlight anything that potentially is already highlighted
                        if (codeElement.ClassList.Contains("hljs"))
                        {
                            continue;
                        }

                        try
                        {
                            HighlightElement(enginePool, codeElement);
                        }
                        catch (Exception highlightEx)
                        {
                            // A failure on one element is only logged so the remaining elements still get processed.
                            // Unknown languages are raised to warning level only when that was requested.
                            bool warn = highlightEx.Message.Contains("Unknown language: ") && _warnOnMissingLanguage;
                            if (warn)
                            {
                                context.LogWarning($"Exception while highlighting source code: {highlightEx.Message}");
                            }
                            else
                            {
                                context.LogInformation($"Exception while highlighting source code: {highlightEx.Message}");
                            }
                        }
                    }

                    // Serialize the modified DOM and clone the input around the new content
                    using (Stream contentStream = await context.GetContentStreamAsync())
                    using (StreamWriter writer = contentStream.GetWriter())
                    {
                        html.ToHtml(writer, HtmlMarkupFormatter.Instance);
                        writer.Flush();
                        return input.Clone(context.GetContentProvider(contentStream, MediaTypes.Html));
                    }
                }
            }
            catch (Exception ex)
            {
                // Best effort: log the failure and fall back to the unmodified input
                context.LogWarning("Exception while highlighting source code for {0}: {1}", input.ToSafeDisplayString(), ex.Message);
                return input;
            }
        });

        // Materialize the results before disposing the JS engine
        return highlighted.ToList();
    }
}
/// <summary>
/// Mirrors a page into <paramref name="outputLocation"/> and then follows its relative links recursively.
/// </summary>
/// <remarks>
/// For the page at <paramref name="pageUrl"/> this method
/// <list type="number">
/// <item>downloads and parses the page,</item>
/// <item>saves every relatively addressed <c>background</c> attribute target of an HTML element,</item>
/// <item>saves every relatively addressed image,</item>
/// <item>writes the page bytes themselves, and</item>
/// <item>calls itself for every relative link that is not a pure <c>#anchor</c>.</item>
/// </list>
/// Pages already recorded in <c>urls</c> are skipped, which also ends link cycles.
/// HTTP status codes are not inspected; whatever body the server returns is written.
/// </remarks>
/// <param name="pageUrl">Absolute address of the page to mirror.</param>
/// <param name="outputLocation">Root directory below which the URL paths are recreated.</param>
/// <returns>A task that completes when the page and everything reachable from it has been processed.</returns>
private static async System.Threading.Tasks.Task ScrapeAsync(string pageUrl, string outputLocation)
{
    if (urls.Contains(pageUrl))
    {
        return;
    }

    urls.Add(pageUrl);
    Console.WriteLine($"Traversing link: {pageUrl}");

    Url url = new Url(pageUrl);
    IHtmlDocument document;

    // One client serves every request for this page and is released before recursing into links
    using (HttpClient httpClient = new HttpClient())
    {
        // Download the page once; the same bytes are parsed now and written to disk further down
        byte[] pageBytes;
        long? pageLength;
        using (HttpResponseMessage pageResponse = await httpClient.GetAsync(pageUrl))
        {
            pageBytes = await pageResponse.Content.ReadAsByteArrayAsync();
            pageLength = pageResponse.Content.Headers.ContentLength;
        }

        Console.WriteLine("ContentLength: " + pageLength);

        using (MemoryStream pageStream = new MemoryStream(pageBytes, false))
        {
            document = new HtmlParser().ParseDocument(pageStream);
        }

        Console.WriteLine(document.ToHtml());

        // Background images referenced through the legacy "background" attribute
        foreach (var element in document.All)
        {
            Console.WriteLine($"{element.NodeName} {element.GetType()}");

            // Only HTML elements are considered; other element kinds are skipped instead of dereferencing a failed cast
            if (!(element is IHtmlElement) || !element.HasAttribute("background"))
            {
                continue;
            }

            Url backgroundUrl = Url.Create(element.GetAttribute("background"));
            if (backgroundUrl.IsRelative)
            {
                await SaveResourceAsync(httpClient, new Url(url, backgroundUrl.Href), outputLocation, true);
            }
        }

        // Images; the raw attribute value is preferred over the Source property
        foreach (var image in document.All.OfType<IHtmlImageElement>())
        {
            string src = image.HasAttribute("src") ? image.GetAttribute("src") : image.Source;
            if (string.IsNullOrEmpty(src))
            {
                // Nothing to download for an image without a source
                continue;
            }

            Url imageUrl = Url.Create(src);
            if (imageUrl.IsRelative)
            {
                await SaveResourceAsync(httpClient, new Url(url, StripFragment(imageUrl.Href)), outputLocation, false);
            }
        }

        // The page itself
        await WriteContentAsync(Path.Combine(outputLocation, url.Path), pageBytes, pageLength, true);
    }

    // Follow relative links
    foreach (var link in document.All.Where(x => x.IsLink()))
    {
        string href = link.GetAttribute("href");
        if (href == null)
        {
            continue;
        }

        if (href.StartsWith("#", StringComparison.Ordinal))
        {
            Console.WriteLine("Skipping anchor link: " + href);
            continue;
        }

        Url hrefUrl = Url.Create(href);
        if (hrefUrl.IsRelative)
        {
            Url target = new Url(url, StripFragment(hrefUrl.Href));
            await ScrapeAsync(target.Href, outputLocation);
        }
    }
}

/// <summary>
/// Returns <paramref name="href"/> without its fragment, i.e. cut at the first <c>#</c>.
/// </summary>
private static string StripFragment(string href)
{
    // Searching for the '#' itself avoids matching the fragment text earlier in the address
    int hashIndex = href.IndexOf('#');
    return hashIndex < 0 ? href : href.Substring(0, hashIndex);
}

/// <summary>
/// Downloads <paramref name="resourceUrl"/> and writes the body below <paramref name="outputLocation"/> using the URL path.
/// </summary>
private static async System.Threading.Tasks.Task SaveResourceAsync(HttpClient client, Url resourceUrl, string outputLocation, bool logLength)
{
    using (HttpResponseMessage response = await client.GetAsync(resourceUrl))
    {
        Console.WriteLine("Downloading: " + resourceUrl);
        byte[] content = await response.Content.ReadAsByteArrayAsync();
        string target = Path.Combine(outputLocation, resourceUrl.Path);
        await WriteContentAsync(target, content, response.Content.Headers.ContentLength, logLength);
    }
}

/// <summary>
/// Creates the target directory and writes <paramref name="content"/> to <paramref name="path"/>, replacing an existing file.
/// </summary>
private static async System.Threading.Tasks.Task WriteContentAsync(string path, byte[] content, long? declaredLength, bool logLength)
{
    Directory.CreateDirectory(Path.GetDirectoryName(path));
    Console.WriteLine("Writing to --> " + path);

    // File.Create truncates, so a shorter download never leaves stale bytes from an older file behind
    using (Stream fs = File.Create(path))
    {
        if (logLength)
        {
            var l = declaredLength ?? content.Length;
            Console.WriteLine($"***Writing {l} chars {content.Length}");
        }

        await fs.WriteAsync(content, 0, content.Length);
        await fs.FlushAsync();
    }
}
/// <inheritdoc />
public IEnumerable<IDocument> Execute(IReadOnlyList<IDocument> inputs, IExecutionContext context)
{
    HtmlParser parser = new HtmlParser();

    // Every pooled engine is initialized with either the embedded script or the configured file
    using (IJsEnginePool enginePool = context.GetJsEnginePool(x =>
    {
        if (string.IsNullOrWhiteSpace(_highlightJsFile))
        {
            x.ExecuteResource("highlight-all.js", typeof(Highlight));
        }
        else
        {
            x.ExecuteFile(_highlightJsFile);
        }
    }))
    {
        return inputs.AsParallel().Select(context, input =>
        {
            // We materialize the list before exiting the using statement, so safe to access enginePool
            // ReSharper disable once AccessToDisposedClosure
            using (IJsEngine engine = enginePool.GetEngine())
            {
                try
                {
                    using (Stream stream = input.GetStream())
                    using (IHtmlDocument html = parser.Parse(stream))
                    {
                        // Don't highlight anything that potentially is already highlighted
                        IEnumerable<AngleSharp.Dom.IElement> pending = html
                            .QuerySelectorAll(_codeQuerySelector)
                            .Where(candidate => !candidate.ClassList.Contains("hljs"));

                        foreach (AngleSharp.Dom.IElement codeElement in pending)
                        {
                            // Make sure to use TextContent, otherwise you'll get escaped html which highlight.js won't parse
                            engine.SetVariableValue("input", codeElement.TextContent);

                            // Check if they specified a language in their code block
                            string language = codeElement.ClassList.FirstOrDefault(i => i.StartsWith("language"));

                            try
                            {
                                if (language == null)
                                {
                                    // Let highlight.js guess; "(auto)" is what gets logged if anything below throws
                                    language = "(auto)";
                                    engine.Execute("result = hljs.highlightAuto(input)");
                                    string detectedLanguage = engine.Evaluate<string>("result.language");
                                    if (!string.IsNullOrWhiteSpace(detectedLanguage))
                                    {
                                        codeElement.ClassList.Add("language-" + detectedLanguage);
                                    }
                                }
                                else
                                {
                                    engine.SetVariableValue("language", language.Replace("language-", ""));
                                    engine.Execute("result = hljs.highlight(language, input)");
                                }

                                codeElement.ClassList.Add("hljs");
                                codeElement.InnerHtml = engine.Evaluate<string>("result.value");
                            }
                            catch (Exception innerEx)
                            {
                                // A failure on one element is only logged so the remaining elements still get processed.
                                // Unknown languages are raised to warning level only when that was requested.
                                bool warn = innerEx.Message.Contains("Unknown language: ") && _warnOnMissingLanguage;
                                if (warn)
                                {
                                    Trace.Warning("Exception while highlighting source code for {0} using language {1}: {2}", input.SourceString(), language, innerEx.Message);
                                }
                                else
                                {
                                    Trace.Information("Exception while highlighting source code for {0} using language {1}: {2}", input.SourceString(), language, innerEx.Message);
                                }
                            }
                        }

                        return context.GetDocument(input, html.ToHtml());
                    }
                }
                catch (Exception ex)
                {
                    // Best effort: log the failure and fall back to the unmodified input
                    Trace.Warning("Exception while highlighting source code for {0}: {1}", input.SourceString(), ex.Message);
                    return input;
                }
            }
        }).ToList();
    }
}
/// <summary>
/// Rewrites the <c>href</c> of <c>link</c> elements and the source of <c>script</c> elements in each
/// input to the value returned by <c>DownloadAndReplaceAsync</c>, unless the element carries a
/// <c>data-no-mirror</c> attribute.
/// </summary>
/// <param name="context">The execution context providing the inputs, content streams and content providers.</param>
/// <returns>
/// For each input, a clone holding the re-serialized HTML when at least one reference was
/// replaced; otherwise the input document itself.
/// </returns>
protected override async Task<IEnumerable<Common.IDocument>> ExecuteContextAsync(IExecutionContext context)
{
#pragma warning disable RCS1163 // Unused parameter.
    // Handle invalid HTTPS certificates and allow alternate security protocols (see http://stackoverflow.com/a/5670954/807064)
    // NOTE(review): this assigns a process-wide callback that accepts every certificate.
    ServicePointManager.ServerCertificateValidationCallback = (s, cert, chain, ssl) => true;
#pragma warning restore RCS1163 // Unused parameter.

    // Shared across all inputs so a resource that was already handled can be looked up again
    Dictionary<string, string> mirrorCache = new Dictionary<string, string>();

    // Inputs are awaited one after another so the same resource is not downloaded twice
    HtmlParser parser = new HtmlParser();
    return await context.Inputs
        .ToAsyncEnumerable()
        .SelectAwait(async x => await GetDocumentAsync(x))
        .ToListAsync();

    async Task<Common.IDocument> GetDocumentAsync(Common.IDocument input)
    {
        IHtmlDocument html = await input.ParseHtmlAsync(context, parser);
        if (html == null)
        {
            return input;
        }

        bool changed = false;

        // <link href="..."> elements
        IEnumerable<IElement> linkElements = html
            .GetElementsByTagName("link")
            .Where(x => x.HasAttribute("href") && !x.HasAttribute("data-no-mirror"));
        foreach (IElement linkElement in linkElements)
        {
            string mirrored = await DownloadAndReplaceAsync(linkElement.GetAttribute("href"), mirrorCache, context);
            if (mirrored != null)
            {
                linkElement.Attributes["href"].Value = mirrored;
                changed = true;
            }
        }

        // <script src="..."> elements
        IEnumerable<IHtmlScriptElement> scriptElements = html.Scripts
            .Where(x => !string.IsNullOrEmpty(x.Source) && !x.HasAttribute("data-no-mirror"));
        foreach (IHtmlScriptElement scriptElement in scriptElements)
        {
            string mirrored = await DownloadAndReplaceAsync(scriptElement.Source, mirrorCache, context);
            if (mirrored != null)
            {
                scriptElement.Source = mirrored;
                changed = true;
            }
        }

        // Nothing was replaced, so the original document can be passed through untouched
        if (!changed)
        {
            return input;
        }

        // Serialize the modified DOM and clone the input around the new content
        using (Stream contentStream = await context.GetContentStreamAsync())
        using (StreamWriter writer = contentStream.GetWriter())
        {
            html.ToHtml(writer, ProcessingInstructionFormatter.Instance);
            writer.Flush();
            return input.Clone(context.GetContentProvider(contentStream, MediaTypes.Html));
        }
    }
}
/// <inheritdoc />
public IEnumerable<IDocument> Execute(IReadOnlyList<IDocument> inputs, IExecutionContext context)
{
    HtmlParser parser = new HtmlParser();

    // Every pooled engine is initialized with either the embedded script or the configured file
    using (IJavaScriptEnginePool enginePool = context.GetJavaScriptEnginePool(x =>
    {
        if (string.IsNullOrWhiteSpace(_highlightJsFile))
        {
            x.ExecuteResource("highlight-all.js", typeof(Highlight));
        }
        else
        {
            x.ExecuteFile(_highlightJsFile);
        }
    }))
    {
        // ToList forces the parallel query to run while the engine pool is still alive
        return inputs.AsParallel().Select(context, input =>
        {
            try
            {
                using (Stream stream = input.GetStream())
                using (IHtmlDocument html = parser.Parse(stream))
                {
                    foreach (AngleSharp.Dom.IElement codeElement in html.QuerySelectorAll(_codeQuerySelector))
                    {
                        // Don't highlight anything that potentially is already highlighted
                        if (codeElement.ClassList.Contains("hljs"))
                        {
                            continue;
                        }

                        try
                        {
                            HighlightElement(enginePool, codeElement);
                        }
                        catch (Exception highlightEx)
                        {
                            // A failure on one element is only logged so the remaining elements still get processed.
                            // Unknown languages are raised to warning level only when that was requested.
                            bool warn = highlightEx.Message.Contains("Unknown language: ") && _warnOnMissingLanguage;
                            if (warn)
                            {
                                Trace.Warning($"Exception while highlighting source code: {highlightEx.Message}");
                            }
                            else
                            {
                                Trace.Information($"Exception while highlighting source code: {highlightEx.Message}");
                            }
                        }
                    }

                    // Serialize the modified DOM into a fresh content stream and wrap it in a new document
                    Stream contentStream = context.GetContentStream();
                    using (StreamWriter writer = contentStream.GetWriter())
                    {
                        html.ToHtml(writer, HtmlMarkupFormatter.Instance);
                        writer.Flush();
                        return context.GetDocument(input, contentStream);
                    }
                }
            }
            catch (Exception ex)
            {
                // Best effort: log the failure and fall back to the unmodified input
                Trace.Warning("Exception while highlighting source code for {0}: {1}", input.SourceString(), ex.Message);
                return input;
            }
        }).ToList();
    }
}
private static async Task <Common.IDocument> ResolveDocumentXrefsAsync( Common.IDocument input, IExecutionContext context, ConcurrentDictionary <string, ConcurrentBag <string> > failures) { IHtmlDocument htmlDocument = await input.ParseHtmlAsync(context, HtmlParser); if (htmlDocument != null) { // Find and replace "xref:" in links bool modifiedDocument = false; bool errors = false; foreach (IElement element in htmlDocument .GetElementsByTagName("a") .Where(x => x.HasAttribute("href"))) { string href = element.GetAttribute("href"); if (href.StartsWith("xref:") && href.Length > 5) { string xref = href.Substring(5); string queryAndFragment = string.Empty; int queryAndFragmentIndex = xref.IndexOfAny(new[] { '#', '?' }); if (queryAndFragmentIndex > 0) { queryAndFragment = xref.Substring(queryAndFragmentIndex); xref = xref.Substring(0, queryAndFragmentIndex); } if (context.TryGetXrefLink(xref, out string xrefLink, out string error)) { element.Attributes["href"].Value = xrefLink + queryAndFragment; } else { // Continue processing so we can report all the failures in a given document failures.AddOrUpdate( input.Source.FullPath, _ => new ConcurrentBag <string> { error }, (_, list) => { list.Add(error); return(list); }); errors = true; } modifiedDocument = true; } } // Exit if there were errors if (errors) { return(null); } // Return a new document with the replacements if we performed any if (modifiedDocument) { using (Stream contentStream = await context.GetContentStreamAsync()) { using (StreamWriter writer = contentStream.GetWriter()) { htmlDocument.ToHtml(writer, ProcessingInstructionFormatter.Instance); writer.Flush(); return(input.Clone(context.GetContentProvider(contentStream, MediaTypes.Html))); } } } }
/// <summary>
/// Runs <paramref name="processElement"/> over the elements of <paramref name="input"/> that match
/// <paramref name="querySelector"/> and returns the input, or a clone of it carrying the metadata
/// and/or the modified HTML that the callback produced.
/// </summary>
/// <param name="input">The document to process.</param>
/// <param name="context">The execution context used for content streams, content providers and logging.</param>
/// <param name="querySelector">The selector for the elements to process; blank means nothing is processed.</param>
/// <param name="first">True to process only the first matching element, false to process all of them.</param>
/// <param name="processElement">
/// Callback invoked per matched element; it receives the input, the context, the element and a
/// metadata dictionary shared by all elements of this document.
/// </param>
/// <returns>
/// A single-element sequence: the input itself when nothing changed and no metadata was added,
/// otherwise a clone with the new metadata, the new content, or both.
/// </returns>
internal static async Task<IEnumerable<Common.IDocument>> ProcessElementsAsync(
    Common.IDocument input,
    IExecutionContext context,
    string querySelector,
    bool first,
    Action<Common.IDocument, IExecutionContext, IElement, Dictionary<string, object>> processElement)
{
    // Inputs that cannot be parsed as HTML are passed through unchanged
    IHtmlDocument html = await input.ParseHtmlAsync(context, HtmlParser);
    if (html == null)
    {
        return input.Yield();
    }

    try
    {
        // Without a query selector there is nothing to process
        if (string.IsNullOrWhiteSpace(querySelector))
        {
            return input.Yield();
        }

        // Either the first match only (which may be null) or every match
        IElement[] targets;
        if (first)
        {
            targets = new[] { html.QuerySelector(querySelector) };
        }
        else
        {
            targets = html.QuerySelectorAll(querySelector).ToArray();
        }

        // No match at all: an empty array, or a single null entry from QuerySelector
        if (targets.Length == 0 || targets[0] == null)
        {
            return input.Yield();
        }

        // Clone the document so we know if it changed
        INode snapshot = html.Clone(true);
        Dictionary<string, object> metadata = new Dictionary<string, object>();
        foreach (IElement target in targets)
        {
            processElement(input, context, target, metadata);
        }

        bool hasMetadata = metadata.Count != 0;

        if (html.Equals(snapshot))
        {
            // Elements were not edited so return the original document or clone it with new metadata
            if (hasMetadata)
            {
                return input.Clone(metadata).Yield();
            }

            return input.Yield();
        }

        // Elements were edited so get the new content
        using (Stream contentStream = await context.GetContentStreamAsync())
        using (StreamWriter writer = contentStream.GetWriter())
        {
            html.ToHtml(writer, ProcessingInstructionFormatter.Instance);
            writer.Flush();
            IContentProvider contentProvider = context.GetContentProvider(contentStream, MediaTypes.Html);
            if (hasMetadata)
            {
                return input.Clone(metadata, contentProvider).Yield();
            }

            return input.Clone(contentProvider).Yield();
        }
    }
    catch (Exception ex)
    {
        // Best effort: log the failure and fall back to the unmodified input
        context.LogWarning("Exception while processing HTML for {0}: {1}", input.ToSafeDisplayString(), ex.Message);
        return input.Yield();
    }
}