/// <summary>
/// Converts the single page at <paramref name="url"/> and bundles it with the
/// images gathered by the image collector during that conversion.
/// </summary>
/// <param name="url">Address of the page to convert.</param>
/// <returns>
/// A result containing exactly one converted document plus whatever
/// <c>GetCollectedImagesAsync</c> yields for the collector used in this call.
/// </returns>
public async Task<ConvertionResult> ConvertAsync(Uri url)
{
    var collector = new ImageCollector(this.logger);
    var output = new StringBuilder();

    var converted = await this.ConvertAsync(url, output, collector);
    var images = await collector.GetCollectedImagesAsync(this.httpClient);

    return new ConvertionResult(new[] { converted }, images);
}
/// <summary>
/// Passes every node of <paramref name="nodes"/> to <c>ProcessNode</c>, in
/// collection order, with the same builder, image collector and state.
/// </summary>
/// <param name="pageUri">URI of the page the nodes belong to; forwarded unchanged.</param>
/// <param name="nodes">Nodes to process.</param>
/// <param name="builder">Output buffer forwarded to each call.</param>
/// <param name="imageCollector">Image collector forwarded to each call.</param>
/// <param name="state">Conversion state forwarded to each call.</param>
private void ProcessChildNodes(
    Uri pageUri,
    HtmlNodeCollection nodes,
    StringBuilder builder,
    ImageCollector imageCollector,
    ConversionState state)
{
    foreach (HtmlNode child in nodes)
    {
        this.ProcessNode(pageUri, child, builder, imageCollector, state);
    }
}
/// <summary>
/// Converts each page in <paramref name="urls"/> in sequence, sharing one
/// image collector across all of them.
/// </summary>
/// <param name="urls">Addresses of the pages to convert.</param>
/// <returns>
/// A result containing one converted document per URL (in input order) plus
/// whatever <c>GetCollectedImagesAsync</c> yields for the shared collector.
/// </returns>
public async Task<ConvertionResult> ConvertAsync(IEnumerable<Uri> urls)
{
    // Materialise once so the sequence is not enumerated twice (size + loop).
    IList<Uri> pages = urls as IList<Uri> ?? urls.ToList();

    var collector = new ImageCollector(this.logger);
    var output = new StringBuilder();
    var converted = new List<ConvertedDocument>(pages.Count);

    foreach (var page in pages)
    {
        var document = await this.ConvertAsync(page, output, collector);
        converted.Add(document);

        // The buffer is reused for the next page, so empty it here.
        output.Clear();
    }

    var images = await collector.GetCollectedImagesAsync(this.httpClient);
    return new ConvertionResult(converted, images);
}
/// <summary>
/// Appends a Markdown image reference (<c>![alt](target)</c>) for an image node.
/// </summary>
/// <remarks>
/// When the node has no <c>src</c> attribute a warning is logged and nothing is
/// appended. When the image collector reports that it can collect the source,
/// the target is the result of <c>BuildImagePath</c> applied to the collected
/// image; otherwise the raw <c>src</c> value is used. A missing <c>alt</c>
/// attribute falls back to the text "image".
/// </remarks>
/// <param name="pageUri">URI of the page containing the image.</param>
/// <param name="node">The image node.</param>
/// <param name="builder">Output buffer to append to.</param>
/// <param name="imageCollector">Collector asked whether, and used, to collect the image.</param>
private void EmitImage(Uri pageUri, HtmlNode node, StringBuilder builder, ImageCollector imageCollector)
{
    var source = node.GetAttributeValue("src", null);
    var altText = node.GetAttributeValue("alt", "image");

    if (source == null)
    {
        this.logger.LogWarning("No src attribute for image - it will not be emitted: {NodeHtml}", node.OuterHtml);
        return;
    }

    string target = imageCollector.CanCollect(pageUri, source)
        ? this.BuildImagePath(imageCollector.Collect(pageUri, source))
        : source;

    builder.Append("![");
    builder.Append(altText);
    builder.Append("](");
    builder.Append(target);
    builder.Append(')');
}
/// <summary>
/// Appends a pipe-delimited Markdown table for a table node.
/// </summary>
/// <remarks>
/// Output order: a new line (via <c>EmitNewLine</c>), one header line built from
/// <c>GetTableHeaders</c>, the output of <c>EmitRepeated</c> with "|-" repeated
/// once per header (presumably the separator row - confirm against
/// <c>EmitRepeated</c>), then one line per row returned by <c>GetTableRows</c>.
/// The first row is skipped when <c>GetTableHeaders</c> says so. Header and cell
/// content is produced with the state returned by
/// <c>WithAllNewLinesStripped</c>.
/// Rows without any <c>td</c> child are skipped with a warning; rows whose cell
/// count differs from the header count are still emitted, with a warning.
/// </remarks>
/// <param name="pageUri">URI of the page containing the table.</param>
/// <param name="node">The table node.</param>
/// <param name="builder">Output buffer to append to.</param>
/// <param name="imageCollector">Image collector forwarded when processing cells.</param>
/// <param name="state">Conversion state on entry.</param>
private void EmitTable(Uri pageUri, HtmlNode node, StringBuilder builder, ImageCollector imageCollector, ConversionState state)
{
    this.EmitNewLine(builder, state);
    state = state.WithAllNewLinesStripped();

    var (headers, skipFirstRow) = this.GetTableHeaders(node, state);
    foreach (var header in headers)
    {
        builder.Append("|").Append(header);
    }

    builder.AppendLine("|");
    this.EmitRepeated(builder, "|-", headers.Count, "|");

    IEnumerable<HtmlNode> rows = this.GetTableRows(node);
    if (skipFirstRow)
    {
        rows = rows.Skip(1);
    }

    foreach (var row in rows)
    {
        // SelectNodes returns null (not an empty collection) when nothing
        // matches, e.g. for an empty <tr> or a row made only of <th> cells.
        var cells = row.SelectNodes("td");
        if (cells == null)
        {
            this.logger.LogWarning("Table row has no td cells - it will not be emitted: {NodeHtml}", row.OuterHtml);
            continue;
        }

        if (cells.Count != headers.Count)
        {
            this.logger.LogWarning("Table row has different number of columns to header - output will likely be malformed");
        }

        foreach (var cell in cells)
        {
            builder.Append("|");
            this.ProcessNode(pageUri, cell, builder, imageCollector, state);
        }

        builder.AppendLine("|");
    }
}
/// <summary>
/// Downloads the page at <paramref name="pageUri"/>, parses it as HTML and
/// renders it into <paramref name="builder"/>.
/// </summary>
/// <remarks>
/// Steps, in order: fetch the page text with the HTTP client; load it into an
/// <c>HtmlDocument</c>; append front matter when the extractor returns a
/// non-null value; process every node matched by the include XPaths (or the
/// document node when none match), with the nodes matched by the exclude
/// XPaths supplied through the initial conversion state. Two line breaks are
/// appended between consecutive processed nodes, but not after the last one.
/// This method appends to the builder and does not clear it.
/// </remarks>
/// <param name="pageUri">Address of the page to load.</param>
/// <param name="builder">Output buffer that receives the rendered content.</param>
/// <param name="imageCollector">Collector forwarded to node processing.</param>
/// <returns>
/// A document pairing <paramref name="pageUri"/> with the builder's text after
/// it has been passed through <c>RemoveRedundantWhiteSpace</c>.
/// </returns>
private async Task<ConvertedDocument> ConvertAsync(Uri pageUri, StringBuilder builder, ImageCollector imageCollector)
{
    this.logger.LogInformation("Loading page content for {PageUri}", pageUri);
    var html = await this.httpClient.GetStringAsync(pageUri);

    var page = new HtmlDocument();
    page.LoadHtml(html);

    var frontMatter = this.frontMatterExtractor.Extract(this.options.FrontMatter, page, pageUri);
    if (frontMatter != null)
    {
        builder.Append(frontMatter);
    }

    this.logger.LogDebug("Processing page content");
    this.logger.LogTrace("Building list of explicitly included elements");

    // SelectNodes yields null when an XPath matches nothing, hence the fallback.
    var included = this.includeXPaths
        .SelectMany(xpath => page.DocumentNode.SelectNodes(xpath) ?? Enumerable.Empty<HtmlNode>())
        .ToList();
    var excluded = this.excludeXPaths
        .SelectMany(xpath => page.DocumentNode.SelectNodes(xpath) ?? Enumerable.Empty<HtmlNode>())
        .ToHashSet();

    // Nothing explicitly included: fall back to the whole document.
    if (included.Count == 0)
    {
        included.Add(page.DocumentNode);
    }

    for (var position = 0; position < included.Count; position++)
    {
        this.ProcessNode(pageUri, included[position], builder, imageCollector, ConversionState.InitialState(excluded));

        var isLast = position == included.Count - 1;
        if (!isLast)
        {
            builder.AppendLine().AppendLine();
        }
    }

    var markdown = this.RemoveRedundantWhiteSpace(builder.ToString());
    return new ConvertedDocument(pageUri, markdown);
}
/// <summary>
/// Renders a single HTML node, and (for most elements) its children, into
/// <paramref name="builder"/>.
/// </summary>
/// <remarks>
/// Text nodes are appended through <c>ExtractText</c> when rendering is enabled
/// in <paramref name="state"/>. Document and element nodes are ignored when
/// their tag name is in <c>excludeTags</c> or the node is in
/// <c>state.NodesToExclude</c>. Rendering is switched on for the node and its
/// descendants when <c>IsIncludedTag</c> accepts the tag name.
/// With rendering and Markdown styles enabled, tags handled by a dedicated
/// emitter (table, img, a, i/em, b/strong, pre) return immediately after that
/// emitter runs; the remaining tags emit a prefix and/or adjust state, then
/// fall through to child processing. With Markdown styles disabled only
/// <c>br</c> is special-cased (a line break is appended).
/// Other node types are ignored.
/// </remarks>
/// <param name="pageUri">URI of the page the node belongs to.</param>
/// <param name="node">Node to render.</param>
/// <param name="builder">Output buffer to append to.</param>
/// <param name="imageCollector">Image collector forwarded to emitters and children.</param>
/// <param name="state">Conversion state that applies to this node.</param>
private void ProcessNode(
    Uri pageUri,
    HtmlNode node,
    StringBuilder builder,
    ImageCollector imageCollector,
    ConversionState state)
{
    switch (node.NodeType)
    {
        case HtmlNodeType.Text:
            if (state.RenderingEnabled)
            {
                builder.Append(this.ExtractText(node, state));
            }

            break;

        case HtmlNodeType.Document:
        case HtmlNodeType.Element:
            if (!this.excludeTags.Contains(node.Name) && !state.NodesToExclude.Contains(node))
            {
                var emitNewLineAfterChildren = false;
                if (this.IsIncludedTag(node.Name))
                {
                    state = state.WithRenderingEnabled();
                }

                // Set only when children need a different state from the one
                // used for this node's own trailing new line.
                ConversionState? childState = null;
                if (state.RenderingEnabled)
                {
                    if (state.EmitMarkDownStyles)
                    {
                        switch (node.Name)
                        {
                            case "table":
                                this.EmitTable(pageUri, node, builder, imageCollector, state);
                                this.EmitNewLine(builder, state);
                                return;

                            case "img":
                                this.EmitImage(pageUri, node, builder, imageCollector);
                                return;

                            case "p":
                                emitNewLineAfterChildren = true;
                                break;

                            case "br":
                                this.EmitNewLine(builder, state, 1);
                                break;

                            case "blockquote":
                                // Parentheses are required: '+' binds tighter than '??',
                                // so without them an existing prefix would be reused
                                // unchanged and nested quotes would not gain a '>'.
                                childState = state.WithLinePrefix((state.LinePrefix ?? "") + ">");
                                this.EmitNewLine(builder, state);
                                builder.Append(childState?.LinePrefix).Append(" ");
                                emitNewLineAfterChildren = true;
                                break;

                            case "ul":
                                state = state.StartUnorderedList();
                                // Only the outermost list is followed by a new line.
                                emitNewLineAfterChildren = state.ListDepth == 1;
                                break;

                            case "ol":
                                state = state.StartOrderedList();
                                // Only the outermost list is followed by a new line.
                                emitNewLineAfterChildren = state.ListDepth == 1;
                                break;

                            case "li":
                                this.EmitListItemPrefix(builder, state);
                                break;

                            case "h1":
                            case "h2":
                            case "h3":
                            case "h4":
                            case "h5":
                            case "h6":
                                this.EmitNewLine(builder, state);
                                emitNewLineAfterChildren = true;
                                // The digit in the tag name is the number of '#' characters.
                                builder.Append('#', node.Name[1] - '0').Append(' ');
                                break;

                            case "a":
                                this.EmitLink(pageUri, node, builder, imageCollector, state);
                                return;

                            case "i":
                            case "em":
                                this.EmitFormattedText(pageUri, node, builder, "*", imageCollector, state);
                                return;

                            case "b":
                            case "strong":
                                this.EmitFormattedText(pageUri, node, builder, "**", imageCollector, state);
                                return;

                            case "pre":
                                this.EmitPreformattedText(pageUri, node, builder, imageCollector, state);
                                return;
                        }
                    }
                    else
                    {
                        switch (node.Name)
                        {
                            case "br":
                                builder.AppendLine();
                                break;
                        }
                    }
                }

                this.ProcessChildNodes(pageUri, node.ChildNodes, builder, imageCollector, childState ?? state);
                if (emitNewLineAfterChildren)
                {
                    this.EmitNewLine(builder, state);
                }
            }

            break;
    }
}