Example #1
0
        /// <summary>
        /// Passes the <c>href</c> of every <c>link</c> element and the source of every <c>script</c> element
        /// (unless the element carries a <c>data-no-mirror</c> attribute) to <c>DownloadAndReplace</c> and
        /// writes any non-null result back into the element.
        /// </summary>
        /// <param name="inputs">The documents to process.</param>
        /// <param name="context">The execution context used to create content streams and result documents.</param>
        /// <returns>
        /// For each input, a new document containing the re-serialized HTML when at least one reference was
        /// replaced; otherwise the input document itself.
        /// </returns>
        public IEnumerable <Common.Documents.IDocument> Execute(IReadOnlyList <Common.Documents.IDocument> inputs, IExecutionContext context)
        {
#pragma warning disable RCS1163 // Unused parameter.
            // Handle invalid HTTPS certificates and allow alternate security protocols (see http://stackoverflow.com/a/5670954/807064)
            // NOTE(review): this assigns a static callback that accepts every certificate - confirm that is intended.
            ServicePointManager.ServerCertificateValidationCallback = (s, cert, chain, ssl) => true;
#pragma warning restore RCS1163 // Unused parameter.

            // One cache and one parser are shared by all inputs; the documents are iterated
            // synchronously so the same resource is not downloaded more than once
            Dictionary <string, string> mirrorCache = new Dictionary <string, string>();
            HtmlParser parser = new HtmlParser();

            return(inputs.Select(context, input =>
            {
                IHtmlDocument html = input.ParseHtml(parser);
                if (html == null)
                {
                    // Nothing could be parsed, pass the input through untouched
                    return input;
                }

                bool changed = false;

                // Link elements that have an href and did not opt out
                IEnumerable <IElement> links = html
                    .GetElementsByTagName("link")
                    .Where(x => x.HasAttribute("href") && !x.HasAttribute("data-no-mirror"));
                foreach (IElement link in links)
                {
                    string mirroredHref = DownloadAndReplace(link.GetAttribute("href"), link, mirrorCache, context);
                    if (mirroredHref == null)
                    {
                        continue;
                    }

                    link.Attributes["href"].Value = mirroredHref;
                    changed = true;
                }

                // Script elements that have a source and did not opt out
                IEnumerable <IHtmlScriptElement> scripts = html.Scripts
                    .Where(x => !string.IsNullOrEmpty(x.Source) && !x.HasAttribute("data-no-mirror"));
                foreach (IHtmlScriptElement script in scripts)
                {
                    string mirroredSource = DownloadAndReplace(script.Source, script, mirrorCache, context);
                    if (mirroredSource == null)
                    {
                        continue;
                    }

                    script.Source = mirroredSource;
                    changed = true;
                }

                if (!changed)
                {
                    return input;
                }

                // Serialize the modified DOM into a fresh content stream and wrap it in a new document
                Stream contentStream = context.GetContentStream();
                using (StreamWriter writer = contentStream.GetWriter())
                {
                    html.ToHtml(writer, ProcessingInstructionFormatter.Instance);
                    writer.Flush();
                    return context.GetDocument(input, contentStream);
                }
            }));
        }
Example #2
0
 /// <summary>
 /// Serializes the document with <c>XmlMarkupFormatter</c> and writes it to a file,
 /// creating the file or overwriting an existing one.
 /// </summary>
 /// <param name="doc">The document to serialize.</param>
 /// <param name="filePath">Path of the file to create or overwrite.</param>
 /// <param name="encoding">Text encoding used for the written file.</param>
 public static void WriteToFile(this IHtmlDocument doc, string filePath, Encoding encoding)
 {
     // The file stream gets its own using so the handle is released even when the
     // StreamWriter constructor throws (for example on a null encoding).
     using (FileStream fs = File.Open(filePath, FileMode.Create))
     using (var sw = new StreamWriter(fs, encoding))
     {
         doc.ToHtml(sw, XmlMarkupFormatter.Instance);
     }
 }
Example #3
0
        /// <summary>
        /// Moves the CSS found for the current HTML document into inline style attributes.
        /// </summary>
        /// <param name="removeStyleElements">When <c>true</c>, the style and link nodes the CSS was read from are removed.</param>
        /// <param name="ignoreElements">CSS selector for STYLE elements to ignore (e.g. mobile-specific styles etc.)</param>
        /// <param name="css">A string containing a style-sheet for inlining.</param>
        /// <param name="stripIdAndClassAttributes">When <c>true</c>, <c>id</c> and <c>class</c> attributes are stripped after inlining.</param>
        /// <param name="removeComments">When <c>true</c>, comment nodes are removed from the document.</param>
        /// <param name="keepMediaQueries">When <c>true</c> (and style elements are being removed), the collected media queries are added back in a new style element.</param>
        /// <returns>The serialized HTML with styles inlined, together with the collected warnings.</returns>
        public InlineResult MoveCssInline(bool removeStyleElements = false, string ignoreElements = null, string css = null, bool stripIdAndClassAttributes = false, bool removeComments = false, bool keepMediaQueries = false)
        {
            // Remember the options; the helper methods below are driven by these fields
            _removeStyleElements       = removeStyleElements;
            _stripIdAndClassAttributes = stripIdAndClassAttributes;
            _ignoreElements            = ignoreElements;
            _keepMediaQueries          = keepMediaQueries;
            _css = css;

            // Collect CSS from the source nodes first, then from the link nodes
            var styleNodes = CssSourceNodes();
            var linkNodes  = CssLinkNodes();

            var sources = new List <ICssSource>(ConvertToStyleSources(styleNodes));
            sources.AddRange(ConvertToStyleSources(linkNodes));

            var blocks = GetCssBlocks(sources);

            // The nodes are removed only after their CSS has been read
            if (_removeStyleElements)
            {
                RemoveStyleElements(styleNodes);
                RemoveStyleElements(linkNodes);
            }

            var joined       = Join(blocks);
            var styleBlocks  = joined.Styles;
            var mediaQueries = joined.MediaQueries;

            var usableSelectors = CleanUnsupportedSelectors(styleBlocks);
            var styledElements  = FindElementsWithStyles(usableSelectors);

            StyleClassApplier.ApplyAllStyles(MergeStyleClasses(styledElements));

            if (_stripIdAndClassAttributes)
            {
                StripElementAttributes("id", "class");
            }

            // Re-add the media queries in a single style element at the start of the body
            if (removeStyleElements && _keepMediaQueries)
            {
                var mediaStyle = _document.CreateElement("style");
                mediaStyle.TextContent = String.Join(",", mediaQueries);
                _document.Body.Prepend(mediaStyle);
            }

            if (removeComments)
            {
                // Snapshot the comment nodes before removing them from the tree
                foreach (var node in _document.Descendents <IComment>().ToList())
                {
                    node.Remove();
                }
            }

            var formatter = new AutoSelectedMarkupFormatter(_document.Doctype);
            var markup    = _document.ToHtml(formatter);

            return(new InlineResult(markup, _warnings));
        }
Example #4
0
        /// <summary>
        /// Moves the CSS found for the current HTML document into inline style attributes.
        /// </summary>
        /// <param name="removeStyleElements">When <c>true</c>, the style and link nodes the CSS was read from are removed.</param>
        /// <param name="ignoreElements">CSS selector for STYLE elements to ignore (e.g. mobile-specific styles etc.)</param>
        /// <param name="css">A string containing a style-sheet for inlining.</param>
        /// <param name="stripIdAndClassAttributes">When <c>true</c>, <c>id</c> and <c>class</c> attributes are stripped after inlining.</param>
        /// <param name="removeComments">When <c>true</c>, comment nodes are removed from the document.</param>
        /// <param name="precompiledStyles">Optional selector/style pairs that are added to the parsed selectors before elements are matched; ignored when <c>null</c>.</param>
        /// <returns>The serialized HTML with styles inlined, together with the collected warnings.</returns>
        public InlineResult MoveCssInline(bool removeStyleElements = false, string ignoreElements = null, string css = null, bool stripIdAndClassAttributes = false, bool removeComments = false, SortedList <string, StyleClass> precompiledStyles = null)
        {
            // Remember the options; the helper methods below are driven by these fields
            _removeStyleElements       = removeStyleElements;
            _stripIdAndClassAttributes = stripIdAndClassAttributes;
            _ignoreElements            = ignoreElements;

            // Collect CSS from the source nodes, the link nodes and finally the supplied style-sheet
            var styleNodes = CssSourceNodes();
            var linkNodes  = CssLinkNodes();

            var sources = new List <ICssSource>(ConvertToStyleSources(styleNodes));
            sources.AddRange(ConvertToStyleSources(linkNodes));
            sources.AddRange(PreMailer.ConvertToStyleSources(css));

            var blocks = PreMailer.GetCssBlocks(sources);

            // The nodes are removed only after their CSS has been read
            if (_removeStyleElements)
            {
                RemoveStyleElements(styleNodes);
                RemoveStyleElements(linkNodes);
            }

            var usableSelectors = CleanUnsupportedSelectors(PreMailer.Join(blocks));

            if (precompiledStyles != null)
            {
                // NOTE(review): Add presumably rejects a key that is already present - confirm callers never pass duplicates
                foreach (var entry in precompiledStyles.ToList())
                {
                    usableSelectors.Add(entry.Key, entry.Value);
                }
            }

            var styledElements = FindElementsWithStyles(usableSelectors);

            StyleClassApplier.ApplyAllStyles(MergeStyleClasses(styledElements));

            if (_stripIdAndClassAttributes)
            {
                StripElementAttributes("id", "class");
            }

            if (removeComments)
            {
                // Snapshot the comment nodes before removing them from the tree
                foreach (var node in _document.Descendents <IComment>().ToList())
                {
                    node.Remove();
                }
            }

            var formatter = new AutoSelectedMarkupFormatter(_document.Doctype);
            var markup    = _document.ToHtml(formatter);

            return(new InlineResult(markup, _warnings));
        }
Example #5
0
        /// <summary>
        /// Moves the CSS found for the current HTML document into inline style attributes.
        /// </summary>
        /// <param name="removeStyleElements">When <c>true</c>, the style and link nodes the CSS was read from are removed.</param>
        /// <param name="ignoreElements">CSS selector for STYLE elements to ignore (e.g. mobile-specific styles etc.)</param>
        /// <param name="css">A string containing a style-sheet for inlining.</param>
        /// <param name="stripIdAndClassAttributes">When <c>true</c>, <c>id</c> and <c>class</c> attributes are stripped after inlining.</param>
        /// <param name="removeComments">When <c>true</c>, comment nodes are removed from the document.</param>
        /// <returns>The serialized HTML with styles inlined, together with the collected warnings.</returns>
        public InlineResult MoveCssInline(bool removeStyleElements = false, string ignoreElements = null, string css = null, bool stripIdAndClassAttributes = false, bool removeComments = false)
        {
            // Remember the options; the helper methods below are driven by these fields
            _removeStyleElements       = removeStyleElements;
            _stripIdAndClassAttributes = stripIdAndClassAttributes;
            _ignoreElements            = ignoreElements;
            _css = css;

            // Collect CSS from the source nodes first, then from the link nodes
            var styleNodes = CssSourceNodes();
            var linkNodes  = CssLinkNodes();

            var sources = new List <ICssSource>(ConvertToStyleSources(styleNodes));
            sources.AddRange(ConvertToStyleSources(linkNodes));

            var blocks = GetCssBlocks(sources);

            // The nodes are removed only after their CSS has been read
            if (_removeStyleElements)
            {
                RemoveStyleElements(styleNodes);
                RemoveStyleElements(linkNodes);
            }

            var usableSelectors = CleanUnsupportedSelectors(Join(blocks));
            var styledElements  = FindElementsWithStyles(usableSelectors);

            StyleClassApplier.ApplyAllStyles(MergeStyleClasses(styledElements));

            if (_stripIdAndClassAttributes)
            {
                StripElementAttributes("id", "class");
            }

            if (removeComments)
            {
                // Snapshot the comment nodes before removing them from the tree
                foreach (var node in _document.Descendents <IComment>().ToList())
                {
                    node.Remove();
                }
            }

            IMarkupFormatter formatter = GetMarkupFormatterForDocType();

            // Serialize into an in-memory writer and hand its StringBuilder to the result
            using (var output = new StringWriter())
            {
                _document.ToHtml(output, formatter);

                return(new InlineResult(output.GetStringBuilder(), _warnings));
            }
        }
Example #6
0
        /// <inheritdoc />
        public IEnumerable <Common.Documents.IDocument> Execute(IReadOnlyList <Common.Documents.IDocument> inputs, IExecutionContext context)
        {
            // A single parser instance is shared by all (parallel) inputs
            HtmlParser parser = new HtmlParser();

            return(inputs.AsParallel().Select(context, input =>
            {
                // No content to insert means there is nothing to do for this input
                string content = _content.Invoke <string>(input, context);
                if (content == null)
                {
                    return input;
                }

                // Inputs that cannot be parsed as HTML are passed through
                IHtmlDocument html = input.ParseHtml(parser);
                if (html == null)
                {
                    return input;
                }

                try
                {
                    if (string.IsNullOrWhiteSpace(_querySelector))
                    {
                        return input;
                    }

                    // Either only the first match (which may be null) or every match
                    IElement[] targets;
                    if (_first)
                    {
                        targets = new[] { html.QuerySelector(_querySelector) };
                    }
                    else
                    {
                        targets = html.QuerySelectorAll(_querySelector).ToArray();
                    }

                    if (targets.Length == 0 || targets[0] == null)
                    {
                        return input;
                    }

                    foreach (IElement target in targets)
                    {
                        target.Insert(_position, content);
                    }

                    // Serialize the modified DOM into a new document
                    Stream contentStream = context.GetContentStream();
                    using (StreamWriter writer = contentStream.GetWriter())
                    {
                        html.ToHtml(writer, ProcessingInstructionFormatter.Instance);
                        writer.Flush();
                        return context.GetDocument(input, contentStream);
                    }
                }
                catch (Exception ex)
                {
                    // Failures are logged and the original input is returned unchanged
                    Trace.Warning("Exception while processing HTML for {0}: {1}", input.SourceString(), ex.Message);
                    return input;
                }
            }));
        }
Example #7
0
        /// <summary>
        /// Inserts the configured content at the configured position relative to the element(s) matched by
        /// the query selector and yields a clone of the input that carries the modified HTML.
        /// </summary>
        /// <param name="input">The document to process.</param>
        /// <param name="context">The current execution context.</param>
        /// <returns>
        /// The modified clone when content was inserted; otherwise the unchanged input (no content, HTML
        /// could not be parsed, no selector, no match, or an exception while processing).
        /// </returns>
        protected override async Task <IEnumerable <Common.IDocument> > ExecuteInputAsync(Common.IDocument input, IExecutionContext context)
        {
            // No content to insert means there is nothing to do for this input
            string content = await _content.GetValueAsync(input, context);
            if (content == null)
            {
                return(input.Yield());
            }

            // Inputs that cannot be parsed as HTML are passed through
            IHtmlDocument html = await input.ParseHtmlAsync(context, HtmlParser);
            if (html == null)
            {
                return(input.Yield());
            }

            try
            {
                if (string.IsNullOrWhiteSpace(_querySelector))
                {
                    return(input.Yield());
                }

                // Either only the first match (which may be null) or every match
                IElement[] targets;
                if (_first)
                {
                    targets = new[] { html.QuerySelector(_querySelector) };
                }
                else
                {
                    targets = html.QuerySelectorAll(_querySelector).ToArray();
                }

                if (targets.Length == 0 || targets[0] == null)
                {
                    return(input.Yield());
                }

                foreach (IElement target in targets)
                {
                    target.Insert(_position, content);
                }

                // Serialize the modified DOM and clone the input with the new content
                using (Stream contentStream = await context.GetContentStreamAsync())
                using (StreamWriter writer = contentStream.GetWriter())
                {
                    html.ToHtml(writer, ProcessingInstructionFormatter.Instance);
                    writer.Flush();
                    return(input.Clone(context.GetContentProvider(contentStream, MediaTypes.Html)).Yield());
                }
            }
            catch (Exception ex)
            {
                // Failures are logged and the original input is returned unchanged
                context.LogWarning("Exception while processing HTML for {0}: {1}", input.ToSafeDisplayString(), ex.Message);
                return(input.Yield());
            }
        }
Example #8
0
 /// <summary>
 /// Serializes the document to a string using <c>XmlMarkupFormatter</c>.
 /// </summary>
 /// <param name="doc">The document to serialize.</param>
 /// <returns>The markup produced by the formatter.</returns>
 public static string GetHtml(this IHtmlDocument doc)
 {
     // Write straight into an in-memory text writer. The previous implementation encoded the
     // markup into a MemoryStream and decoded it again through a StreamReader, which costs an
     // extra pass, can alter the text, and relies on a reader and a writer that both own and
     // close the same stream.
     using (var sw = new StringWriter())
     {
         doc.ToHtml(sw, XmlMarkupFormatter.Instance);
         return(sw.ToString());
     }
 }
Example #9
0
        /// <summary>
        /// Parses <paramref name="html"/> and returns the inner HTML of the first element that matches
        /// <paramref name="cssSelector"/>.
        /// </summary>
        /// <param name="cssSelector">The CSS selector to evaluate.</param>
        /// <param name="html">The HTML source to search.</param>
        /// <returns>The inner HTML of the first matching element.</returns>
        /// <exception cref="CssSelectorException">
        /// No element matches; the exception carries the serialized document and the selector.
        /// </exception>
        public static string SelectByCssSelector(string cssSelector, string html)
        {
            IHtmlDocument parsed = new HtmlParser().ParseDocument(html);
            IElement      match  = parsed.QuerySelector(cssSelector);

            if (match != null)
            {
                return(match.InnerHtml);
            }

            // Nothing matched: report the selector together with the document as it was parsed
            CssSelectorException failure = new CssSelectorException();
            failure.Html        = parsed.ToHtml();
            failure.CssSelector = cssSelector;
            throw failure;
        }
        /// <inheritdoc />
        protected override async Task <IEnumerable <IDocument> > ExecuteContextAsync(IExecutionContext context)
        {
            HtmlParser parser = new HtmlParser();

            // Engines in the pool are initialized with the custom highlight.js file when one is
            // configured, otherwise with the embedded "highlight-all.js" resource
            IJavaScriptEnginePool enginePool = context.GetJavaScriptEnginePool(engine =>
            {
                if (!string.IsNullOrWhiteSpace(_highlightJsFile))
                {
                    engine.ExecuteFile(_highlightJsFile);
                }
                else
                {
                    engine.ExecuteResource("highlight-all.js", typeof(HighlightCode));
                }
            });

            using (enginePool)
            {
                IEnumerable <IDocument> highlighted = await context.Inputs.ParallelSelectAsync(async input =>
                {
                    try
                    {
                        using (Stream source = input.GetContentStream())
                        using (IHtmlDocument html = await parser.ParseAsync(source))
                        {
                            foreach (AngleSharp.Dom.IElement codeElement in html.QuerySelectorAll(_codeQuerySelector))
                            {
                                // Only elements that are not already marked as highlighted are processed
                                if (!codeElement.ClassList.Contains("hljs"))
                                {
                                    try
                                    {
                                        HighlightElement(enginePool, codeElement);
                                    }
                                    catch (Exception innerEx)
                                    {
                                        // A failure on one element is logged and the remaining elements are still processed;
                                        // unknown languages are warnings only when configured that way
                                        bool asWarning = innerEx.Message.Contains("Unknown language: ") && _warnOnMissingLanguage;
                                        if (asWarning)
                                        {
                                            context.LogWarning($"Exception while highlighting source code: {innerEx.Message}");
                                        }
                                        else
                                        {
                                            context.LogInformation($"Exception while highlighting source code: {innerEx.Message}");
                                        }
                                    }
                                }
                            }

                            // Serialize the (possibly modified) DOM and clone the input with the new content
                            using (Stream contentStream = await context.GetContentStreamAsync())
                            using (StreamWriter writer = contentStream.GetWriter())
                            {
                                html.ToHtml(writer, HtmlMarkupFormatter.Instance);
                                writer.Flush();
                                return(input.Clone(context.GetContentProvider(contentStream, MediaTypes.Html)));
                            }
                        }
                    }
                    catch (Exception ex)
                    {
                        // Anything else that goes wrong leaves the input unchanged
                        context.LogWarning("Exception while highlighting source code for {0}: {1}", input.ToSafeDisplayString(), ex.Message);
                        return(input);
                    }
                });

                // Materialize the results before disposing the JS engine
                return(highlighted.ToList());
            }
        }
Example #11
0
        /// <summary>
        /// Downloads <paramref name="pageUrl"/>, saves the page together with its relative
        /// <c>background</c> and image resources below <paramref name="outputLocation"/>, and then
        /// recursively scrapes every relative link found in the page.
        /// </summary>
        /// <param name="pageUrl">Absolute URL of the page to scrape; a URL that was already visited is skipped.</param>
        /// <param name="outputLocation">Root directory below which files are written, using each URL's path.</param>
        /// <returns>A task that completes when the page and all pages reachable from it were processed.</returns>
        private static async System.Threading.Tasks.Task ScrapeAsync(string pageUrl, string outputLocation)
        {
            // Every URL is handled once; this also stops the recursion on link cycles
            if (urls.Contains(pageUrl))
            {
                return;
            }
            urls.Add(pageUrl);

            Console.WriteLine($"Traversing link: {pageUrl}");

            // Cuts the href at its '#' delimiter. Searching for the fragment text instead (as was done
            // before) truncates at the wrong place when the same text also occurs earlier in the href.
            string StripFragment(string href)
            {
                int hashIndex = href.IndexOf('#');
                return hashIndex >= 0 ? href.Substring(0, hashIndex) : href;
            }

            Url           url = new Url(pageUrl);
            IHtmlDocument document;

            // One client serves all requests of this page and is released before recursing
            using (HttpClient httpClient = new HttpClient())
            {
                using (HttpResponseMessage pageResponse = await httpClient.GetAsync(pageUrl))
                using (Stream pageStream = await pageResponse.Content.ReadAsStreamAsync())
                {
                    // Parsing finishes here, so the response can be disposed afterwards
                    HtmlParser parser = new HtmlParser();
                    document = parser.ParseDocument(pageStream);

                    Console.WriteLine("ContentLength: " + pageResponse.Content.Headers.ContentLength);
                }

                string p = Path.Combine(outputLocation, url.Path);

                Console.WriteLine("Writing to --> " + p);
                Directory.CreateDirectory(Path.GetDirectoryName(p));
                Console.WriteLine(document.ToHtml());

                // Resources referenced through a relative "background" attribute
                foreach (var e in document.All.Where(x => x.NodeType == NodeType.Element))
                {
                    Console.WriteLine($"{e.NodeName} {e.GetType()}");

                    // Query the attribute on the element itself; casting to IHtmlElement yields null
                    // (and a NullReferenceException) for elements that are not HTML elements
                    if (!e.HasAttribute("background"))
                    {
                        continue;
                    }

                    Url backgroundUrl = Url.Create(e.GetAttribute("background"));
                    if (!backgroundUrl.IsRelative)
                    {
                        continue;
                    }

                    Url newUrl = new Url(url, backgroundUrl.Href);

                    using (HttpResponseMessage backgroundResponse = await httpClient.GetAsync(newUrl))
                    {
                        Console.WriteLine("Downloading: " + newUrl);

                        byte[] payload = await backgroundResponse.Content.ReadAsByteArrayAsync();

                        string p2 = Path.Combine(outputLocation, newUrl.Path);
                        Directory.CreateDirectory(Path.GetDirectoryName(p2));
                        Console.WriteLine("Writing to --> " + p2);

                        // File.Create truncates; File.OpenWrite would keep the tail of a longer existing file
                        using (Stream fs = File.Create(p2))
                        {
                            var l = backgroundResponse.Content.Headers.ContentLength ?? payload.Length;
                            Console.WriteLine($"***Writing {l} chars {payload.Length}");
                            fs.Write(payload, 0, payload.Length);
                            fs.Flush();
                        }
                    }
                }

                // Images referenced through a relative source
                var imgs = document.All
                           .Where(x => x.NodeType == NodeType.Element)
                           .OfType <IHtmlImageElement>()
                           .ToList();

                // A LINQ query is never null, so the absence of images is detected by count
                if (imgs.Count == 0)
                {
                    Console.WriteLine("No images in: " + pageUrl);
                }

                foreach (var i in imgs)
                {
                    // Prefer the literal attribute value over the Source property
                    string src = i.HasAttribute("src") ? i.GetAttribute("src") : i.Source;
                    if (string.IsNullOrEmpty(src))
                    {
                        // Nothing to download for an image without a source
                        continue;
                    }

                    Url imgUrl = Url.Create(src);
                    if (!imgUrl.IsRelative)
                    {
                        continue;
                    }

                    Url newUrl = new Url(url, StripFragment(imgUrl.Href));

                    using (HttpResponseMessage imageResponse = await httpClient.GetAsync(newUrl))
                    {
                        Console.WriteLine("Downloading: " + newUrl);

                        using (Stream imageStream = await imageResponse.Content.ReadAsStreamAsync())
                        {
                            string p2 = Path.Combine(outputLocation, newUrl.Path);
                            Directory.CreateDirectory(Path.GetDirectoryName(p2));
                            Console.WriteLine("Writing to --> " + p2);

                            using (Stream fs = File.Create(p2))
                            {
                                // The image stream was just obtained and is copied from its start; the
                                // earlier code rewound the already consumed page stream here by mistake
                                await imageStream.CopyToAsync(fs);
                                await fs.FlushAsync();
                            }
                        }
                    }
                }

                // Save the page itself, unmodified.
                // NOTE(review): this fetches the page a second time; the bytes of the first response could be reused.
                using (HttpResponseMessage rawPageResponse = await httpClient.GetAsync(pageUrl))
                {
                    Console.WriteLine("Downloading: " + pageUrl);

                    byte[] payload = await rawPageResponse.Content.ReadAsByteArrayAsync();

                    string p2 = Path.Combine(outputLocation, url.Path);
                    Directory.CreateDirectory(Path.GetDirectoryName(p2));
                    Console.WriteLine("Writing to --> " + p2);

                    using (Stream fs = File.Create(p2))
                    {
                        var l = rawPageResponse.Content.Headers.ContentLength ?? payload.Length;
                        Console.WriteLine($"***Writing {l} chars {payload.Length}");
                        fs.Write(payload, 0, payload.Length);
                        fs.Flush();
                    }
                }
            }

            // Follow every relative link of the page
            foreach (var r in document.All.Where(x => x.IsLink()))
            {
                foreach (var a in r.Attributes)
                {
                    if (!a.Name.Equals("href", StringComparison.InvariantCultureIgnoreCase))
                    {
                        continue;
                    }

                    string href = a.Value;

                    if (href.StartsWith("#", StringComparison.Ordinal))
                    {
                        // Pure in-page anchor: stop looking at this element's attributes
                        Console.WriteLine("Skipping anchor link: " + href);
                        break;
                    }

                    Url hrefUrl = Url.Create(href);
                    if (hrefUrl.IsRelative)
                    {
                        Url newUrl = new Url(url, StripFragment(hrefUrl.Href));
                        await ScrapeAsync(newUrl.Href, outputLocation);
                    }
                }
            }
        }
Example #12
0
        /// <inheritdoc />
        public IEnumerable <IDocument> Execute(IReadOnlyList <IDocument> inputs, IExecutionContext context)
        {
            HtmlParser parser = new HtmlParser();

            // Engines in the pool are initialized with the custom highlight.js file when one is
            // configured, otherwise with the embedded "highlight-all.js" resource
            using (IJsEnginePool enginePool = context.GetJsEnginePool(x =>
            {
                if (!string.IsNullOrWhiteSpace(_highlightJsFile))
                {
                    x.ExecuteFile(_highlightJsFile);
                }
                else
                {
                    x.ExecuteResource("highlight-all.js", typeof(Highlight));
                }
            }))
            {
                return(inputs.AsParallel().Select(context, input =>
                {
                    // We materialize the list before exiting the using statement, so safe to access enginePool
                    // ReSharper disable once AccessToDisposedClosure
                    using (IJsEngine engine = enginePool.GetEngine())
                    {
                        try
                        {
                            using (Stream source = input.GetStream())
                            using (IHtmlDocument html = parser.Parse(source))
                            {
                                foreach (AngleSharp.Dom.IElement codeElement in html.QuerySelectorAll(_codeQuerySelector))
                                {
                                    // Don't highlight anything that potentially is already highlighted
                                    bool alreadyHighlighted = codeElement.ClassList.Contains("hljs");
                                    if (alreadyHighlighted)
                                    {
                                        continue;
                                    }

                                    // Make sure to use TextContent, otherwise you'll get escaped html which highlight.js won't parse
                                    engine.SetVariableValue("input", codeElement.TextContent);

                                    // Check if they specified a language in their code block
                                    string language = codeElement.ClassList.FirstOrDefault(i => i.StartsWith("language"));

                                    try
                                    {
                                        if (language == null)
                                        {
                                            // Recorded before executing so the log below names "(auto)" if detection throws
                                            language = "(auto)";
                                            engine.Execute("result = hljs.highlightAuto(input)");

                                            string detected = engine.Evaluate <string>("result.language");
                                            if (!string.IsNullOrWhiteSpace(detected))
                                            {
                                                codeElement.ClassList.Add("language-" + detected);
                                            }
                                        }
                                        else
                                        {
                                            engine.SetVariableValue("language", language.Replace("language-", ""));
                                            engine.Execute("result = hljs.highlight(language, input)");
                                        }

                                        codeElement.ClassList.Add("hljs");
                                        codeElement.InnerHtml = engine.Evaluate <string>("result.value");
                                    }
                                    catch (Exception innerEx)
                                    {
                                        // A failure on one element is logged and the remaining elements are still processed;
                                        // unknown languages are warnings only when configured that way
                                        bool asWarning = innerEx.Message.Contains("Unknown language: ") && _warnOnMissingLanguage;
                                        if (asWarning)
                                        {
                                            Trace.Warning("Exception while highlighting source code for {0} using language {1}: {2}", input.SourceString(), language, innerEx.Message);
                                        }
                                        else
                                        {
                                            Trace.Information("Exception while highlighting source code for {0} using language {1}: {2}", input.SourceString(), language, innerEx.Message);
                                        }
                                    }
                                }

                                // Wrap the serialized DOM in a new document
                                return context.GetDocument(input, html.ToHtml());
                            }
                        }
                        catch (Exception ex)
                        {
                            // Anything else that goes wrong leaves the input unchanged
                            Trace.Warning("Exception while highlighting source code for {0}: {1}", input.SourceString(), ex.Message);
                            return input;
                        }
                    }
                }).ToList());
            }
        }
        /// <summary>
        /// Passes every input document through the <c>GetDocumentAsync</c> local function, which swaps the
        /// URLs of <c>link</c> and <c>script</c> elements for whatever <c>DownloadAndReplaceAsync</c> returns.
        /// </summary>
        /// <param name="context">The execution context that supplies the inputs and the content streams.</param>
        /// <returns>
        /// One document per input: the input itself when nothing was replaced (or it could not be parsed),
        /// otherwise a clone of the input carrying the rewritten HTML.
        /// </returns>
        protected override async Task <IEnumerable <Common.IDocument> > ExecuteContextAsync(IExecutionContext context)
        {
#pragma warning disable RCS1163 // Unused parameter.
            // Accept every server certificate so invalid HTTPS certificates do not get in the way
            // (see http://stackoverflow.com/a/5670954/807064). This assigns a static callback, so it is
            // not scoped to this call.
            ServicePointManager.ServerCertificateValidationCallback = (s, cert, chain, ssl) => true;
#pragma warning restore RCS1163 // Unused parameter.

            // State shared by all inputs: the replacement cache and a single parser instance
            Dictionary <string, string> mirrorCache = new Dictionary <string, string>();
            HtmlParser parser = new HtmlParser();

            // Inputs are handled one after another so the same resource is not downloaded more than once
            return(await context.Inputs
                   .ToAsyncEnumerable()
                   .SelectAwait(async x => await GetDocumentAsync(x))
                   .ToListAsync());

            // Rewrites the mirrored references of a single document
            async Task <Common.IDocument> GetDocumentAsync(Common.IDocument input)
            {
                IHtmlDocument html = await input.ParseHtmlAsync(context, parser);
                if (html == null)
                {
                    // Nothing to rewrite without a parsed document
                    return(input);
                }

                bool changed = false;

                // <link href="..."> elements that have not opted out via data-no-mirror
                IEnumerable <IElement> links = html
                    .GetElementsByTagName("link")
                    .Where(x => x.HasAttribute("href") && !x.HasAttribute("data-no-mirror"));
                foreach (IElement link in links)
                {
                    string mirrored = await DownloadAndReplaceAsync(link.GetAttribute("href"), mirrorCache, context);
                    if (mirrored != null)
                    {
                        link.Attributes["href"].Value = mirrored;
                        changed = true;
                    }
                }

                // <script src="..."> elements that have not opted out via data-no-mirror
                IEnumerable <IHtmlScriptElement> scripts = html.Scripts
                    .Where(x => !(string.IsNullOrEmpty(x.Source) || x.HasAttribute("data-no-mirror")));
                foreach (IHtmlScriptElement script in scripts)
                {
                    string mirrored = await DownloadAndReplaceAsync(script.Source, mirrorCache, context);
                    if (mirrored != null)
                    {
                        script.Source = mirrored;
                        changed = true;
                    }
                }

                if (!changed)
                {
                    // No replacement was made, so the original document is passed through untouched
                    return(input);
                }

                // Serialize the edited DOM and hand it back as the content of a cloned document
                using (Stream contentStream = await context.GetContentStreamAsync())
                using (StreamWriter writer = contentStream.GetWriter())
                {
                    html.ToHtml(writer, ProcessingInstructionFormatter.Instance);
                    writer.Flush();
                    return(input.Clone(context.GetContentProvider(contentStream, MediaTypes.Html)));
                }
            }
        }
Example #14
0
        /// <inheritdoc />
        public IEnumerable <IDocument> Execute(IReadOnlyList <IDocument> inputs, IExecutionContext context)
        {
            HtmlParser htmlParser = new HtmlParser();

            // The initializer handed to the pool runs the configured highlight.js file when one is set,
            // and the "highlight-all.js" resource otherwise.
            using (IJavaScriptEnginePool enginePool = context.GetJavaScriptEnginePool(engine =>
            {
                if (!string.IsNullOrWhiteSpace(_highlightJsFile))
                {
                    engine.ExecuteFile(_highlightJsFile);
                }
                else
                {
                    engine.ExecuteResource("highlight-all.js", typeof(Highlight));
                }
            }))
            {
                // ToList() materializes the parallel query before the engine pool is disposed
                return(inputs.AsParallel().Select(context, input =>
                {
                    try
                    {
                        using (Stream stream = input.GetStream())
                        using (IHtmlDocument htmlDocument = htmlParser.Parse(stream))
                        {
                            foreach (AngleSharp.Dom.IElement codeElement in htmlDocument.QuerySelectorAll(_codeQuerySelector))
                            {
                                // Only touch elements that do not already carry the "hljs" class
                                if (!codeElement.ClassList.Contains("hljs"))
                                {
                                    try
                                    {
                                        HighlightElement(enginePool, codeElement);
                                    }
                                    catch (Exception innerEx)
                                    {
                                        // A failure on one element is logged and the remaining elements are still processed.
                                        // It is a warning only for an unknown language when that warning is enabled.
                                        bool asWarning = innerEx.Message.Contains("Unknown language: ") && _warnOnMissingLanguage;
                                        string message = $"Exception while highlighting source code: {innerEx.Message}";
                                        if (asWarning)
                                        {
                                            Trace.Warning(message);
                                        }
                                        else
                                        {
                                            Trace.Information(message);
                                        }
                                    }
                                }
                            }

                            // Serialize the (possibly edited) DOM into a new content stream for the result document
                            Stream contentStream = context.GetContentStream();
                            using (StreamWriter writer = contentStream.GetWriter())
                            {
                                htmlDocument.ToHtml(writer, HtmlMarkupFormatter.Instance);
                                writer.Flush();
                                return context.GetDocument(input, contentStream);
                            }
                        }
                    }
                    catch (Exception ex)
                    {
                        // Any other failure leaves the input document unchanged
                        Trace.Warning("Exception while highlighting source code for {0}: {1}", input.SourceString(), ex.Message);
                        return input;
                    }
                }).ToList());
            }
        }
Example #15
0
        /// <summary>
        /// Rewrites every <c>a</c> element whose <c>href</c> starts with <c>xref:</c> to the link returned by
        /// <c>context.TryGetXrefLink</c>, recording any lookup errors in <paramref name="failures"/>.
        /// </summary>
        /// <param name="input">The document whose HTML content is scanned for xref links.</param>
        /// <param name="context">The execution context used to parse the HTML, resolve xrefs and obtain content streams.</param>
        /// <param name="failures">
        /// Collects resolution errors, keyed by <c>input.Source.FullPath</c>; each key maps to the bag of
        /// error messages reported for that document.
        /// </param>
        /// <returns>
        /// <c>null</c> when at least one xref could not be resolved; a clone of the input with the rewritten
        /// HTML when xref links were found and all resolved. The remaining fall-through result is not
        /// visible in this excerpt.
        /// </returns>
        private static async Task <Common.IDocument> ResolveDocumentXrefsAsync(
            Common.IDocument input,
            IExecutionContext context,
            ConcurrentDictionary <string, ConcurrentBag <string> > failures)
        {
            IHtmlDocument htmlDocument = await input.ParseHtmlAsync(context, HtmlParser);

            if (htmlDocument != null)
            {
                // Find and replace "xref:" in links
                bool modifiedDocument = false;
                bool errors           = false;
                foreach (IElement element in htmlDocument
                         .GetElementsByTagName("a")
                         .Where(x => x.HasAttribute("href")))
                {
                    string href = element.GetAttribute("href");
                    // Require something after the 5-character "xref:" prefix
                    if (href.StartsWith("xref:") && href.Length > 5)
                    {
                        string xref                  = href.Substring(5);
                        string queryAndFragment      = string.Empty;
                        // Split off a trailing query string or fragment so only the xref itself is looked up;
                        // it is re-appended to the resolved link below
                        int    queryAndFragmentIndex = xref.IndexOfAny(new[] { '#', '?' });
                        // An index of 0 (xref starting with '#' or '?') is not split, so the whole string is looked up
                        if (queryAndFragmentIndex > 0)
                        {
                            queryAndFragment = xref.Substring(queryAndFragmentIndex);
                            xref             = xref.Substring(0, queryAndFragmentIndex);
                        }
                        if (context.TryGetXrefLink(xref, out string xrefLink, out string error))
                        {
                            element.Attributes["href"].Value = xrefLink + queryAndFragment;
                        }
                        else
                        {
                            // Continue processing so we can report all the failures in a given document
                            failures.AddOrUpdate(
                                input.Source.FullPath,
                                _ => new ConcurrentBag <string> {
                                error
                            },
                                (_, list) =>
                            {
                                list.Add(error);
                                return(list);
                            });
                            errors = true;
                        }
                        // Set for every xref link, resolved or not; failures are caught by the errors check below
                        modifiedDocument = true;
                    }
                }

                // Exit if there were errors
                if (errors)
                {
                    return(null);
                }

                // Return a new document with the replacements if we performed any
                if (modifiedDocument)
                {
                    // Serialize the edited DOM into a fresh content stream and clone the input around it
                    using (Stream contentStream = await context.GetContentStreamAsync())
                    {
                        using (StreamWriter writer = contentStream.GetWriter())
                        {
                            htmlDocument.ToHtml(writer, ProcessingInstructionFormatter.Instance);
                            writer.Flush();
                            return(input.Clone(context.GetContentProvider(contentStream, MediaTypes.Html)));
                        }
                    }
                }
            }
            // NOTE(review): the excerpt stops here; the method's fall-through return and closing brace are not shown.
        /// <summary>
        /// Parses the input as HTML, invokes <paramref name="processElement"/> on the elements matched by
        /// <paramref name="querySelector"/>, and returns either the input or a clone that reflects the changes.
        /// </summary>
        /// <param name="input">The document whose HTML content is processed.</param>
        /// <param name="context">The execution context used for parsing, content streams and logging.</param>
        /// <param name="querySelector">The selector to evaluate; when null or whitespace the input is returned as-is.</param>
        /// <param name="first">When <c>true</c> only the first match is processed, otherwise every match.</param>
        /// <param name="processElement">
        /// Callback invoked once per matched element with the input, the context, the element and a metadata
        /// dictionary shared across all invocations for this document.
        /// </param>
        /// <returns>
        /// The result of <c>Yield()</c> on the input when nothing matched, nothing changed, parsing failed or an
        /// exception was caught; otherwise on a clone carrying the collected metadata and/or the edited HTML.
        /// </returns>
        internal static async Task <IEnumerable <Common.IDocument> > ProcessElementsAsync(
            Common.IDocument input,
            IExecutionContext context,
            string querySelector,
            bool first,
            Action <Common.IDocument, IExecutionContext, IElement, Dictionary <string, object> > processElement)
        {
            // Without parsed HTML there is nothing to query
            IHtmlDocument html = await input.ParseHtmlAsync(context, HtmlParser);
            if (html == null)
            {
                return(input.Yield());
            }

            try
            {
                if (string.IsNullOrWhiteSpace(querySelector))
                {
                    return(input.Yield());
                }

                // Collect either the single first match or all matches
                IElement[] matches;
                if (first)
                {
                    matches = new[] { html.QuerySelector(querySelector) };
                }
                else
                {
                    matches = html.QuerySelectorAll(querySelector).ToArray();
                }

                // QuerySelector yields null when nothing matches, hence the check on the first slot
                if (matches.Length == 0 || matches[0] == null)
                {
                    return(input.Yield());
                }

                // Keep a deep copy taken before processing so edits can be detected afterwards
                INode snapshot = html.Clone(true);
                Dictionary <string, object> metadata = new Dictionary <string, object>();
                foreach (IElement match in matches)
                {
                    processElement(input, context, match, metadata);
                }

                if (html.Equals(snapshot))
                {
                    // Content is unchanged: pass the input through, attaching metadata only if some was produced
                    if (metadata.Count == 0)
                    {
                        return(input.Yield());
                    }
                    return(input.Clone(metadata).Yield());
                }

                // Content changed: serialize the edited DOM and clone the input around the new content
                using (Stream contentStream = await context.GetContentStreamAsync())
                using (StreamWriter writer = contentStream.GetWriter())
                {
                    html.ToHtml(writer, ProcessingInstructionFormatter.Instance);
                    writer.Flush();
                    IContentProvider contentProvider = context.GetContentProvider(contentStream, MediaTypes.Html);
                    if (metadata.Count == 0)
                    {
                        return(input.Clone(contentProvider).Yield());
                    }
                    return(input.Clone(metadata, contentProvider).Yield());
                }
            }
            catch (Exception ex)
            {
                // Best effort: log the failure and fall back to the unmodified input
                context.LogWarning("Exception while processing HTML for {0}: {1}", input.ToSafeDisplayString(), ex.Message);
                return(input.Yield());
            }
        }