Example #1
0
        private void EmitNewLine(StringBuilder builder, ConversionState state, int count = 2)
        {
            if (state.LinePrefix != null)
            {
                if (state.PreventNewLines)
                {
                    this.logger.LogWarning("New lines are being emitted in an unexpected context (e.g. a list in a table cell) - output is likely malformed.");
                }

                builder.AppendLine();

                for (var i = 0; i < count - 1; i++)
                {
                    builder.Append(state.LinePrefix)
                    .AppendLine(" ");
                }

                builder.Append(state.LinePrefix)
                .Append(' ');
            }
            else
            {
                if (state.PreventNewLines)
                {
                    if (builder[^ 1] != ' ')
Example #2
0
 /// <summary>
 /// Creates a new state whose members are copied from <paramref name="previous"/>.
 /// </summary>
 /// <param name="previous">The state to copy the values from.</param>
 private ConversionState(ConversionState previous)
 {
     // Straight member-by-member copy; nothing is transformed on the way.
     RenderingEnabled = previous.RenderingEnabled;
     ListDepth = previous.ListDepth;
     ListItemPrefix = previous.ListItemPrefix;
     EmitMarkDownStyles = previous.EmitMarkDownStyles;
     LinePrefix = previous.LinePrefix;
     NodesToExclude = previous.NodesToExclude;
     PreventNewLines = previous.PreventNewLines;
 }
Example #3
0
        /// <summary>
        /// Starts a list item: emits a single line break, indents nested items by four
        /// spaces for every level beyond the first, then writes the item marker and a space.
        /// </summary>
        /// <param name="builder">The output being built.</param>
        /// <param name="state">Current conversion state; supplies the list depth and item marker.</param>
        private void EmitListItemPrefix(StringBuilder builder, ConversionState state)
        {
            this.EmitNewLine(builder, state, 1);

            if (state.ListDepth > 1)
            {
                // Four spaces per nesting level below the outermost list.
                var indentWidth = 4 * (state.ListDepth - 1);
                builder.Append(' ', indentWidth);
            }

            // Fall back to "-" when the state carries no explicit marker.
            var marker = state.ListItemPrefix ?? "-";
            builder.Append(marker);
            builder.Append(" ");
        }
Example #4
0
 /// <summary>
 /// Processes every node of <paramref name="nodes"/> in order by handing each one to
 /// <c>ProcessNode</c> with the same builder, image collector and state.
 /// </summary>
 /// <param name="pageUri">The URI of the page being converted.</param>
 /// <param name="nodes">The nodes to process.</param>
 /// <param name="builder">The output being built.</param>
 /// <param name="imageCollector">Passed through unchanged to <c>ProcessNode</c>.</param>
 /// <param name="state">The conversion state applied to every node.</param>
 private void ProcessChildNodes(Uri pageUri, HtmlNodeCollection nodes, StringBuilder builder, ImageCollector imageCollector, ConversionState state)
 {
     foreach (var child in nodes)
     {
         this.ProcessNode(pageUri, child, builder, imageCollector, state);
     }
 }
Example #5
0
        /// <summary>
        /// Downloads the page at <paramref name="pageUri"/>, parses it as HTML and converts it
        /// into <paramref name="builder"/>, returning the result as a <c>ConvertedDocument</c>.
        /// </summary>
        /// <param name="pageUri">The page to download and convert.</param>
        /// <param name="builder">Receives the front matter (if any) followed by the converted content.</param>
        /// <param name="imageCollector">Passed through to <c>ProcessNode</c>.</param>
        /// <returns>The converted document, after redundant white space has been removed.</returns>
        private async Task<ConvertedDocument> ConvertAsync(Uri pageUri, StringBuilder builder, ImageCollector imageCollector)
        {
            this.logger.LogInformation("Loading page content for {PageUri}", pageUri);
            var html = await this.httpClient.GetStringAsync(pageUri);

            var document = new HtmlDocument();
            document.LoadHtml(html);

            var frontMatter = this.frontMatterExtractor.Extract(this.options.FrontMatter, document, pageUri);
            if (frontMatter != null)
            {
                builder.Append(frontMatter);
            }

            this.logger.LogDebug("Processing page content");
            this.logger.LogTrace("Building list of explicitly included elements");

            // SelectNodes yields null when an XPath matches nothing, hence the empty fallbacks.
            var root = document.DocumentNode;
            var nodesToProcess = this.includeXPaths
                .SelectMany(xpath => root.SelectNodes(xpath) ?? Enumerable.Empty<HtmlNode>())
                .ToList();
            var nodesToExclude = this.excludeXPaths
                .SelectMany(xpath => root.SelectNodes(xpath) ?? Enumerable.Empty<HtmlNode>())
                .ToHashSet();

            // No include XPath matched anything: convert the whole document instead.
            if (nodesToProcess.Count == 0)
            {
                nodesToProcess.Add(root);
            }

            for (var position = 0; position < nodesToProcess.Count; position++)
            {
                this.ProcessNode(pageUri, nodesToProcess[position], builder, imageCollector, ConversionState.InitialState(nodesToExclude));

                // Separate consecutive included sections with a blank line, but not after the last one.
                var isLast = position == nodesToProcess.Count - 1;
                if (!isLast)
                {
                    builder.AppendLine().AppendLine();
                }
            }

            var markdown = this.RemoveRedundantWhiteSpace(builder.ToString());
            return new ConvertedDocument(pageUri, markdown);
        }
Example #6
0
        /// <summary>
        /// Determines the header texts of a table node.
        /// </summary>
        /// <remarks>
        /// The row at <c>thead/tr</c> is used when present. Otherwise the first <c>tr</c> found
        /// anywhere below <paramref name="node"/> is used as the header and the caller is told
        /// to skip the first row.
        /// </remarks>
        /// <param name="node">The table node to inspect.</param>
        /// <param name="state">The conversion state passed to <c>ExtractText</c> for each header cell.</param>
        /// <returns>
        /// The header cell texts in document order, and whether the first row was consumed as
        /// the header. The header list is empty when the table contains no rows at all.
        /// </returns>
        private (IReadOnlyList<string> headers, bool skipFirstRow) GetTableHeaders(HtmlNode node, ConversionState state)
        {
            var headRow = node.SelectSingleNode("thead/tr");
            var skipFirstRow = false;

            if (headRow == null)
            {
                headRow = node.SelectSingleNode(".//tr");
                skipFirstRow = true;
            }

            if (headRow == null)
            {
                // The table has no rows at all: report no headers instead of dereferencing null.
                return (new List<string>(), skipFirstRow);
            }

            // A single union query keeps td and th cells in document order; querying them
            // separately and concatenating would move every td ahead of every th.
            var headers = (headRow.SelectNodes("td|th") ?? Enumerable.Empty<HtmlNode>())
                .Select(cell => this.ExtractText(cell, state))
                .ToList();

            return (headers, skipFirstRow);
        }
Example #7
0
        /// <summary>
        /// Writes a table node as a pipe-delimited table: a header line, a separator line and
        /// one line per data row.
        /// </summary>
        /// <param name="pageUri">The URI of the page being converted; passed on to <c>ProcessNode</c>.</param>
        /// <param name="node">The table node.</param>
        /// <param name="builder">The output being built.</param>
        /// <param name="imageCollector">Passed on to <c>ProcessNode</c> for each cell.</param>
        /// <param name="state">The conversion state in effect before the table.</param>
        private void EmitTable(Uri pageUri, HtmlNode node, StringBuilder builder, ImageCollector imageCollector, ConversionState state)
        {
            this.EmitNewLine(builder, state);

            // Headers and cells are rendered with the new-line-stripping variant of the state.
            state = state.WithAllNewLinesStripped();
            var (headers, skipFirstRow) = this.GetTableHeaders(node, state);

            foreach (var header in headers)
            {
                builder.Append("|").Append(header);
            }

            builder.AppendLine("|");

            this.EmitRepeated(builder, "|-", headers.Count, "|");

            IEnumerable<HtmlNode> rows = this.GetTableRows(node);

            if (skipFirstRow)
            {
                // The first row was already consumed as the header.
                rows = rows.Skip(1);
            }

            foreach (var row in rows)
            {
                var cells = row.SelectNodes("td");
                if (cells == null)
                {
                    // SelectNodes returns null when the row has no td cells (for example a
                    // row made only of th cells); there is nothing to emit for such a row.
                    continue;
                }

                if (cells.Count != headers.Count)
                {
                    this.logger.LogWarning("Table row has different number of columns to header - output will likely be malformed");
                }

                foreach (var cell in cells)
                {
                    builder.Append("|");
                    this.ProcessNode(pageUri, cell, builder, imageCollector, state);
                }

                builder.AppendLine("|");
            }
        }
Example #8
0
        /// <summary>
        /// Converts a single HTML node, and recursively its children, appending the result to
        /// <paramref name="builder"/>.
        /// </summary>
        /// <remarks>
        /// Text nodes are appended only while rendering is enabled. Document and element nodes
        /// are skipped entirely when their tag is in the excluded tags or the node is in
        /// <c>state.NodesToExclude</c>. Tables, images, links, emphasis and preformatted text
        /// are delegated to dedicated emitters, which take over handling of the node.
        /// </remarks>
        /// <param name="pageUri">The URI of the page being converted; passed on to the emitters.</param>
        /// <param name="node">The node to convert.</param>
        /// <param name="builder">The output being built.</param>
        /// <param name="imageCollector">Passed on to the emitters and to child processing.</param>
        /// <param name="state">The conversion state in effect for this node.</param>
        private void ProcessNode(
            Uri pageUri,
            HtmlNode node,
            StringBuilder builder,
            ImageCollector imageCollector,
            ConversionState state)
        {
            switch (node.NodeType)
            {
            case HtmlNodeType.Text:
                if (state.RenderingEnabled)
                {
                    builder.Append(this.ExtractText(node, state));
                }

                break;

            case HtmlNodeType.Document:
            case HtmlNodeType.Element:
                if (!this.excludeTags.Contains(node.Name) && !state.NodesToExclude.Contains(node))
                {
                    var emitNewLineAfterChildren = false;
                    if (this.IsIncludedTag(node.Name))
                    {
                        state = state.WithRenderingEnabled();
                    }

                    // Set only when the children need a different state from the one used
                    // for the trailing new line of this node (currently: blockquotes).
                    ConversionState? childState = null;
                    if (state.RenderingEnabled)
                    {
                        if (state.EmitMarkDownStyles)
                        {
                            switch (node.Name)
                            {
                            case "table":
                                this.EmitTable(pageUri, node, builder, imageCollector, state);
                                this.EmitNewLine(builder, state);
                                return;

                            case "img":
                                this.EmitImage(pageUri, node, builder, imageCollector);
                                return;

                            case "p":
                                emitNewLineAfterChildren = true;
                                break;

                            case "br":
                                this.EmitNewLine(builder, state, 1);
                                break;

                            case "blockquote":
                                // Parenthesised on purpose: '+' binds tighter than '??', so without
                                // the parentheses a nested quote would keep its parent's prefix
                                // instead of gaining one more '>'.
                                // NOTE(review): assumes WithLinePrefix replaces the prefix rather
                                // than appending to it - confirm against ConversionState.
                                childState = state.WithLinePrefix((state.LinePrefix ?? "") + ">");
                                this.EmitNewLine(builder, state);

                                // When a line prefix is active, EmitNewLine has already written the
                                // parent's prefix on the current line, so only this level's own
                                // marker is added here; later lines use the full child prefix.
                                builder.Append("> ");
                                emitNewLineAfterChildren = true;
                                break;

                            case "ul":
                                state = state.StartUnorderedList();
                                // Only the outermost list is followed by a new line.
                                emitNewLineAfterChildren = state.ListDepth == 1;
                                break;

                            case "ol":
                                state = state.StartOrderedList();
                                // Only the outermost list is followed by a new line.
                                emitNewLineAfterChildren = state.ListDepth == 1;
                                break;

                            case "li":
                                this.EmitListItemPrefix(builder, state);
                                break;

                            case "h1":
                            case "h2":
                            case "h3":
                            case "h4":
                            case "h5":
                            case "h6":
                                this.EmitNewLine(builder, state);
                                emitNewLineAfterChildren = true;
                                // The digit in the tag name gives the number of '#' characters.
                                builder.Append('#', node.Name[1] - '0').Append(' ');
                                break;

                            case "a":
                                this.EmitLink(pageUri, node, builder, imageCollector, state);
                                return;

                            case "i":
                            case "em":
                                this.EmitFormattedText(pageUri, node, builder, "*", imageCollector, state);
                                return;

                            case "b":
                            case "strong":
                                this.EmitFormattedText(pageUri, node, builder, "**", imageCollector, state);
                                return;

                            case "pre":
                                this.EmitPreformattedText(pageUri, node, builder, imageCollector, state);
                                return;
                            }
                        }
                        else
                        {
                            // Markdown styling is off: only line breaks are honoured.
                            switch (node.Name)
                            {
                            case "br":
                                builder.AppendLine();
                                break;
                            }
                        }
                    }

                    this.ProcessChildNodes(pageUri, node.ChildNodes, builder, imageCollector, childState ?? state);

                    if (emitNewLineAfterChildren)
                    {
                        // Uses the node's own state, not childState, so the new line after a
                        // blockquote is written at the enclosing level.
                        this.EmitNewLine(builder, state);
                    }
                }

                break;
            }
        }