private void EmitNewLine(StringBuilder builder, ConversionState state, int count = 2) { if (state.LinePrefix != null) { if (state.PreventNewLines) { this.logger.LogWarning("New lines are being emitted in an unexpected context (e.g. a list in a table cell) - output is likely malformed."); } builder.AppendLine(); for (var i = 0; i < count - 1; i++) { builder.Append(state.LinePrefix) .AppendLine(" "); } builder.Append(state.LinePrefix) .Append(' '); } else { if (state.PreventNewLines) { if (builder[^ 1] != ' ')
/// <summary>
/// Copy constructor: initialises this state with the property values of <paramref name="previous"/>.
/// </summary>
/// <param name="previous">The state whose property values are copied.</param>
private ConversionState(ConversionState previous)
{
    // Rendering switches.
    this.RenderingEnabled = previous.RenderingEnabled;
    this.EmitMarkDownStyles = previous.EmitMarkDownStyles;
    this.PreventNewLines = previous.PreventNewLines;

    // List context.
    this.ListDepth = previous.ListDepth;
    this.ListItemPrefix = previous.ListItemPrefix;

    // Line prefix and excluded nodes.
    this.LinePrefix = previous.LinePrefix;
    this.NodesToExclude = previous.NodesToExclude;
}
/// <summary>
/// Starts a new list item: emits a single new line, indents by four spaces for every
/// list level beyond the first, then writes the item marker followed by a space.
/// </summary>
/// <param name="builder">The output buffer that receives the prefix.</param>
/// <param name="state">Supplies the current list depth and item marker.</param>
private void EmitListItemPrefix(StringBuilder builder, ConversionState state)
{
    this.EmitNewLine(builder, state, 1);

    var depth = state.ListDepth;
    if (depth > 1)
    {
        var indentWidth = 4 * (depth - 1);
        builder.Append(' ', indentWidth);
    }

    // Fall back to "-" when the state carries no explicit marker.
    var marker = state.ListItemPrefix ?? "-";
    builder.Append(marker);
    builder.Append(" ");
}
/// <summary>
/// Runs <c>ProcessNode</c> over every node in <paramref name="nodes"/>, in collection order,
/// passing the same <paramref name="state"/> to each.
/// </summary>
/// <param name="pageUri">The page URI, forwarded unchanged to <c>ProcessNode</c>.</param>
/// <param name="nodes">The nodes to process.</param>
/// <param name="builder">The output buffer, forwarded unchanged.</param>
/// <param name="imageCollector">The image collector, forwarded unchanged.</param>
/// <param name="state">The conversion state handed to each node.</param>
private void ProcessChildNodes(
    Uri pageUri,
    HtmlNodeCollection nodes,
    StringBuilder builder,
    ImageCollector imageCollector,
    ConversionState state)
{
    foreach (HtmlNode child in nodes)
    {
        this.ProcessNode(pageUri, child, builder, imageCollector, state);
    }
}
/// <summary>
/// Downloads the page at <paramref name="pageUri"/>, parses it as HTML, appends any extracted
/// front matter, then processes the selected nodes into <paramref name="builder"/>.
/// </summary>
/// <param name="pageUri">The address of the page to load.</param>
/// <param name="builder">The buffer that accumulates the converted output.</param>
/// <param name="imageCollector">Forwarded to <c>ProcessNode</c> for each processed node.</param>
/// <returns>
/// A <c>ConvertedDocument</c> built from <paramref name="pageUri"/> and the buffer's text
/// after it has been passed through <c>RemoveRedundantWhiteSpace</c>.
/// </returns>
private async Task<ConvertedDocument> ConvertAsync(Uri pageUri, StringBuilder builder, ImageCollector imageCollector)
{
    this.logger.LogInformation("Loading page content for {PageUri}", pageUri);
    var html = await this.httpClient.GetStringAsync(pageUri);

    var document = new HtmlDocument();
    document.LoadHtml(html);

    var frontMatter = this.frontMatterExtractor.Extract(this.options.FrontMatter, document, pageUri);
    if (frontMatter != null)
    {
        builder.Append(frontMatter);
    }

    this.logger.LogDebug("Processing page content");
    this.logger.LogTrace("Building list of explicitly included elements");

    var root = document.DocumentNode;

    // SelectNodes yields null when an XPath matches nothing, hence the empty fallback.
    var included = this.includeXPaths
        .SelectMany(xpath => root.SelectNodes(xpath) ?? Enumerable.Empty<HtmlNode>())
        .ToList();
    var excluded = this.excludeXPaths
        .SelectMany(xpath => root.SelectNodes(xpath) ?? Enumerable.Empty<HtmlNode>())
        .ToHashSet();

    // With no include matches, process the whole document.
    if (included.Count == 0)
    {
        included.Add(root);
    }

    for (var position = 0; position < included.Count; position++)
    {
        // Each node is processed with a fresh initial state.
        this.ProcessNode(pageUri, included[position], builder, imageCollector, ConversionState.InitialState(excluded));

        // Two line breaks between nodes, none after the last.
        if (position + 1 != included.Count)
        {
            builder.AppendLine().AppendLine();
        }
    }

    var markdown = this.RemoveRedundantWhiteSpace(builder.ToString());
    return new ConvertedDocument(pageUri, markdown);
}
/// <summary>
/// Extracts the header texts for the table rooted at <paramref name="node"/>.
/// </summary>
/// <remarks>
/// The header row is <c>thead/tr</c> when present; otherwise the first <c>tr</c> found anywhere
/// below <paramref name="node"/> is used and the caller is told to skip it when emitting rows.
/// </remarks>
/// <param name="node">The table node to inspect.</param>
/// <param name="state">The conversion state passed to <c>ExtractText</c> for each header cell.</param>
/// <returns>
/// The header cell texts in document order, and a flag that is <c>true</c> when the first
/// row was used as the header. A table with no rows yields an empty list and <c>false</c>.
/// </returns>
private (IReadOnlyList<string> headers, bool skipFirstRow) GetTableHeaders(HtmlNode node, ConversionState state)
{
    var headRow = node.SelectSingleNode("thead/tr");
    var skipFirstRow = false;
    if (headRow == null)
    {
        headRow = node.SelectSingleNode(".//tr");
        skipFirstRow = true;
    }

    // A table without any rows has no headers; previously this dereferenced null.
    if (headRow == null)
    {
        return (Array.Empty<string>(), false);
    }

    // The XPath union returns cells in document order, so a header row that mixes
    // th and td keeps its column order (concatenating td then th reordered them).
    IEnumerable<HtmlNode> headerCells = headRow.SelectNodes("th|td") ?? Enumerable.Empty<HtmlNode>();

    return (
        headerCells
            .Select(cell => this.ExtractText(cell, state))
            .ToList(),
        skipFirstRow
    );
}
/// <summary>
/// Writes the table rooted at <paramref name="node"/> as pipe-delimited lines: a header line,
/// a separator produced via <c>EmitRepeated</c> with <c>"|-"</c>, then one line per row.
/// </summary>
/// <param name="pageUri">The page URI, forwarded to <c>ProcessNode</c> for each cell.</param>
/// <param name="node">The table node to convert.</param>
/// <param name="builder">The output buffer.</param>
/// <param name="imageCollector">Forwarded to <c>ProcessNode</c> for each cell.</param>
/// <param name="state">
/// The state in effect before the table; cells are processed with the state returned by
/// <c>WithAllNewLinesStripped</c>.
/// </param>
private void EmitTable(Uri pageUri, HtmlNode node, StringBuilder builder, ImageCollector imageCollector, ConversionState state)
{
    this.EmitNewLine(builder, state);
    state = state.WithAllNewLinesStripped();

    var (headers, skipFirstRow) = this.GetTableHeaders(node, state);
    foreach (var header in headers)
    {
        builder.Append("|").Append(header);
    }

    builder.AppendLine("|");
    this.EmitRepeated(builder, "|-", headers.Count, "|");

    IEnumerable<HtmlNode> rows = this.GetTableRows(node);
    if (skipFirstRow)
    {
        // The first row was already consumed as the header.
        rows = rows.Skip(1);
    }

    foreach (var row in rows)
    {
        var cells = row.SelectNodes("td");

        // SelectNodes returns null when the row has no td cells (for example a row made
        // only of th cells); skip it rather than dereferencing null.
        if (cells == null)
        {
            continue;
        }

        if (cells.Count != headers.Count)
        {
            this.logger.LogWarning("Table row has different number of columns to header - output will likely be malformed");
        }

        foreach (var cell in cells)
        {
            builder.Append("|");
            this.ProcessNode(pageUri, cell, builder, imageCollector, state);
        }

        builder.AppendLine("|");
    }
}
/// <summary>
/// Converts a single HTML node, appending its output to <paramref name="builder"/>.
/// </summary>
/// <remarks>
/// Text nodes are appended via <c>ExtractText</c> when rendering is enabled. Document and
/// element nodes are skipped entirely when their tag is in <c>excludeTags</c> or the node is in
/// <c>state.NodesToExclude</c>. Otherwise tag-specific output is emitted and the children are
/// processed, except for tags whose handler returns early (table, img, a, i/em, b/strong, pre).
/// </remarks>
/// <param name="pageUri">The page URI, forwarded to the tag-specific emitters.</param>
/// <param name="node">The node to convert.</param>
/// <param name="builder">The output buffer.</param>
/// <param name="imageCollector">Forwarded to the tag-specific emitters.</param>
/// <param name="state">The conversion state in effect for this node.</param>
private void ProcessNode(
    Uri pageUri,
    HtmlNode node,
    StringBuilder builder,
    ImageCollector imageCollector,
    ConversionState state)
{
    switch (node.NodeType)
    {
        case HtmlNodeType.Text:
            if (state.RenderingEnabled)
            {
                builder.Append(this.ExtractText(node, state));
            }

            break;

        case HtmlNodeType.Document:
        case HtmlNodeType.Element:
            if (!this.excludeTags.Contains(node.Name) && !state.NodesToExclude.Contains(node))
            {
                var emitNewLineAfterChildren = false;

                if (this.IsIncludedTag(node.Name))
                {
                    state = state.WithRenderingEnabled();
                }

                // When set, children are processed with this state instead of `state`.
                ConversionState? childState = null;

                if (state.RenderingEnabled)
                {
                    if (state.EmitMarkDownStyles)
                    {
                        switch (node.Name)
                        {
                            case "table":
                                this.EmitTable(pageUri, node, builder, imageCollector, state);
                                this.EmitNewLine(builder, state);
                                return;

                            case "img":
                                this.EmitImage(pageUri, node, builder, imageCollector);
                                return;

                            case "p":
                                emitNewLineAfterChildren = true;
                                break;

                            case "br":
                                this.EmitNewLine(builder, state, 1);
                                break;

                            case "blockquote":
                                // Parenthesised on purpose: `a ?? "" + ">"` parses as
                                // `a ?? ("" + ">")`, which left nested quotes at the parent's depth.
                                childState = state.WithLinePrefix((state.LinePrefix ?? "") + ">");
                                this.EmitNewLine(builder, state);

                                // EmitNewLine has already written the parent's prefix (if any),
                                // so only the marker for the new quote level is added here.
                                builder.Append(">").Append(" ");
                                emitNewLineAfterChildren = true;
                                break;

                            case "ul":
                                state = state.StartUnorderedList();
                                emitNewLineAfterChildren = state.ListDepth == 1;
                                break;

                            case "ol":
                                state = state.StartOrderedList();
                                emitNewLineAfterChildren = state.ListDepth == 1;
                                break;

                            case "li":
                                this.EmitListItemPrefix(builder, state);
                                break;

                            case "h1":
                            case "h2":
                            case "h3":
                            case "h4":
                            case "h5":
                            case "h6":
                                this.EmitNewLine(builder, state);
                                emitNewLineAfterChildren = true;

                                // The digit in the tag name gives the number of '#' characters.
                                builder.Append('#', node.Name[1] - '0').Append(' ');
                                break;

                            case "a":
                                this.EmitLink(pageUri, node, builder, imageCollector, state);
                                return;

                            case "i":
                            case "em":
                                this.EmitFormattedText(pageUri, node, builder, "*", imageCollector, state);
                                return;

                            case "b":
                            case "strong":
                                this.EmitFormattedText(pageUri, node, builder, "**", imageCollector, state);
                                return;

                            case "pre":
                                this.EmitPreformattedText(pageUri, node, builder, imageCollector, state);
                                return;
                        }
                    }
                    else
                    {
                        switch (node.Name)
                        {
                            case "br":
                                builder.AppendLine();
                                break;
                        }
                    }
                }

                this.ProcessChildNodes(pageUri, node.ChildNodes, builder, imageCollector, childState ?? state);

                if (emitNewLineAfterChildren)
                {
                    this.EmitNewLine(builder, state);
                }
            }

            break;
    }
}