Example #1
0
        /// <summary>
        /// Converts the single page at <paramref name="url"/> and bundles it with the images
        /// gathered by the image collector during that conversion.
        /// </summary>
        /// <param name="url">Address of the page to convert.</param>
        /// <returns>
        /// A <see cref="ConvertionResult"/> holding a one-element document array and the value
        /// returned by the collector's <c>GetCollectedImagesAsync</c>.
        /// </returns>
        public async Task<ConvertionResult> ConvertAsync(Uri url)
        {
            var collector = new ImageCollector(this.logger);

            // A fresh builder is used as the scratch buffer for this one page.
            var converted = await this.ConvertAsync(url, new StringBuilder(), collector);
            var images = await collector.GetCollectedImagesAsync(this.httpClient);

            return new ConvertionResult(new[] { converted }, images);
        }
Example #2
0
 /// <summary>
 /// Runs <c>ProcessNode</c> over every node in <paramref name="nodes"/>, in collection order,
 /// passing the same builder, image collector and state to each call.
 /// </summary>
 /// <param name="pageUri">URI of the page the nodes belong to.</param>
 /// <param name="nodes">Nodes to process.</param>
 /// <param name="builder">Output buffer handed to each <c>ProcessNode</c> call.</param>
 /// <param name="imageCollector">Image collector handed to each <c>ProcessNode</c> call.</param>
 /// <param name="state">Conversion state handed unchanged to each <c>ProcessNode</c> call.</param>
 private void ProcessChildNodes(Uri pageUri, HtmlNodeCollection nodes, StringBuilder builder, ImageCollector imageCollector, ConversionState state)
 {
     foreach (var current in nodes)
     {
         this.ProcessNode(
             pageUri,
             current,
             builder,
             imageCollector,
             state);
     }
 }
Example #3
0
        /// <summary>
        /// Converts each page in <paramref name="urls"/> in order, sharing one image collector
        /// across all pages, and returns the converted documents with the collected images.
        /// </summary>
        /// <param name="urls">Addresses of the pages to convert. Enumerated exactly once.</param>
        /// <returns>
        /// A <see cref="ConvertionResult"/> holding one document per URL (in input order) and the
        /// value returned by the collector's <c>GetCollectedImagesAsync</c>.
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="urls"/> is <c>null</c>.</exception>
        public async Task<ConvertionResult> ConvertAsync(IEnumerable<Uri> urls)
        {
            if (urls == null)
            {
                throw new ArgumentNullException(nameof(urls));
            }

            // Materialise once so a lazy sequence is not enumerated twice (for the count and for the loop).
            var urlList = urls as IList<Uri> ?? urls.ToList();
            var builder = new StringBuilder();
            var imageCollector = new ImageCollector(this.logger);
            var documents = new List<ConvertedDocument>(urlList.Count);

            foreach (var url in urlList)
            {
                documents.Add(await this.ConvertAsync(url, builder, imageCollector));

                // The builder is reused as a scratch buffer; reset it before the next page.
                builder.Length = 0;
            }

            return new ConvertionResult(
                documents,
                await imageCollector.GetCollectedImagesAsync(this.httpClient));
        }
Example #4
0
        /// <summary>
        /// Appends a Markdown image reference (<c>![alt](uri)</c>) for <paramref name="node"/> to
        /// <paramref name="builder"/>. Nothing is appended, and a warning is logged, when the node
        /// has no <c>src</c> attribute.
        /// </summary>
        /// <param name="pageUri">URI of the page containing the image; passed to the image collector.</param>
        /// <param name="node">The image node whose <c>src</c> and <c>alt</c> attributes are read.</param>
        /// <param name="builder">Output buffer.</param>
        /// <param name="imageCollector">
        /// Asked whether the image can be collected; if so, the emitted URI is the path built
        /// from the collector's result instead of the raw <c>src</c> value.
        /// </param>
        private void EmitImage(Uri pageUri, HtmlNode node, StringBuilder builder, ImageCollector imageCollector)
        {
            var source = node.GetAttributeValue("src", null);
            var altText = node.GetAttributeValue("alt", "image");

            if (source == null)
            {
                this.logger.LogWarning("No src attribute for image - it will not be emitted: {NodeHtml}", node.OuterHtml);
                return;
            }

            // Fall back to the raw src value when the collector declines the image.
            string target = source;
            if (imageCollector.CanCollect(pageUri, source))
            {
                target = this.BuildImagePath(imageCollector.Collect(pageUri, source));
            }

            builder.Append($"![{altText}]({target})");
        }
Example #5
0
        /// <summary>
        /// Appends a pipe-delimited Markdown table for the table <paramref name="node"/>: a header
        /// line, a separator line, then one line per data row.
        /// </summary>
        /// <param name="pageUri">URI of the page being converted; forwarded when processing cells.</param>
        /// <param name="node">The table node.</param>
        /// <param name="builder">Output buffer.</param>
        /// <param name="imageCollector">Image collector forwarded when processing cells.</param>
        /// <param name="state">
        /// Current conversion state. Header and cell content is processed with the state returned
        /// by <c>WithAllNewLinesStripped()</c>.
        /// </param>
        /// <remarks>
        /// Rows with no <c>td</c> children are skipped with a warning, because
        /// <c>SelectNodes</c> returns <c>null</c> rather than an empty collection when nothing
        /// matches and would otherwise cause a <see cref="NullReferenceException"/>.
        /// </remarks>
        private void EmitTable(Uri pageUri, HtmlNode node, StringBuilder builder, ImageCollector imageCollector, ConversionState state)
        {
            this.EmitNewLine(builder, state);

            state = state.WithAllNewLinesStripped();
            var (headers, skipFirstRow) = this.GetTableHeaders(node, state);

            foreach (var header in headers)
            {
                builder.Append("|").Append(header);
            }

            builder.AppendLine("|");

            // Separator line: one "|-" per header column, closed by "|".
            this.EmitRepeated(builder, "|-", headers.Count, "|");

            IEnumerable<HtmlNode> rows = this.GetTableRows(node);

            if (skipFirstRow)
            {
                rows = rows.Skip(1);
            }

            foreach (var row in rows)
            {
                var cells = row.SelectNodes("td");
                if (cells == null)
                {
                    // SelectNodes yields null when the row has no td children.
                    this.logger.LogWarning("Table row has no td cells - it will not be emitted");
                    continue;
                }

                if (cells.Count != headers.Count)
                {
                    this.logger.LogWarning("Table row has different number of columns to header - output will likely be malformed");
                }

                foreach (var cell in cells)
                {
                    builder.Append("|");
                    this.ProcessNode(pageUri, cell, builder, imageCollector, state);
                }

                builder.AppendLine("|");
            }
        }
Example #6
0
        /// <summary>
        /// Downloads the page at <paramref name="pageUri"/>, appends its front matter (when the
        /// extractor returns one) and converted content to <paramref name="builder"/>, and returns
        /// the builder's text, after <c>RemoveRedundantWhiteSpace</c>, as a <see cref="ConvertedDocument"/>.
        /// </summary>
        /// <param name="pageUri">Address of the page to load and convert.</param>
        /// <param name="builder">Output buffer; this method appends to it but does not clear it.</param>
        /// <param name="imageCollector">Image collector forwarded to node processing.</param>
        /// <returns>The converted document for <paramref name="pageUri"/>.</returns>
        private async Task<ConvertedDocument> ConvertAsync(Uri pageUri, StringBuilder builder, ImageCollector imageCollector)
        {
            this.logger.LogInformation("Loading page content for {PageUri}", pageUri);
            var html = await this.httpClient.GetStringAsync(pageUri);

            var document = new HtmlDocument();
            document.LoadHtml(html);

            var frontMatter = this.frontMatterExtractor.Extract(this.options.FrontMatter, document, pageUri);
            if (frontMatter != null)
            {
                builder.Append(frontMatter);
            }

            this.logger.LogDebug("Processing page content");
            this.logger.LogTrace("Building list of explicitly included elements");

            // SelectNodes returns null when an XPath matches nothing, hence the empty fallback.
            var root = document.DocumentNode;
            var targets = this.includeXPaths
                .SelectMany(xpath => root.SelectNodes(xpath) ?? Enumerable.Empty<HtmlNode>())
                .ToList();
            var excluded = this.excludeXPaths
                .SelectMany(xpath => root.SelectNodes(xpath) ?? Enumerable.Empty<HtmlNode>())
                .ToHashSet();

            // With no explicit include matches, the whole document is processed.
            if (targets.Count == 0)
            {
                targets.Add(root);
            }

            var lastIndex = targets.Count - 1;
            for (var i = 0; i <= lastIndex; i++)
            {
                this.ProcessNode(pageUri, targets[i], builder, imageCollector, ConversionState.InitialState(excluded));

                // Separate consecutive included nodes, but do not trail the final one.
                if (i < lastIndex)
                {
                    builder.AppendLine().AppendLine();
                }
            }

            var markdown = this.RemoveRedundantWhiteSpace(builder.ToString());
            return new ConvertedDocument(pageUri, markdown);
        }
Example #7
0
        /// <summary>
        /// Converts a single HTML node, and recursively its children, appending the output to
        /// <paramref name="builder"/>.
        /// </summary>
        /// <param name="pageUri">URI of the page being converted; forwarded to link, image and child processing.</param>
        /// <param name="node">The node to convert. Only text, document and element nodes produce output.</param>
        /// <param name="builder">Output buffer.</param>
        /// <param name="imageCollector">Image collector forwarded to image, link and child processing.</param>
        /// <param name="state">
        /// Conversion state on entry. It controls whether rendering is enabled, whether Markdown
        /// styles are emitted, the current line prefix and list nesting, and carries the set of
        /// nodes to exclude.
        /// </param>
        /// <remarks>
        /// Elements whose tag is in <c>excludeTags</c>, or that are in <c>state.NodesToExclude</c>,
        /// are skipped together with their children. Tags handled by a dedicated emitter (table,
        /// img, a, i/em, b/strong, pre) return early; their children are not processed here.
        /// </remarks>
        private void ProcessNode(
            Uri pageUri,
            HtmlNode node,
            StringBuilder builder,
            ImageCollector imageCollector,
            ConversionState state)
        {
            switch (node.NodeType)
            {
                case HtmlNodeType.Text:
                    if (state.RenderingEnabled)
                    {
                        builder.Append(this.ExtractText(node, state));
                    }

                    break;

                case HtmlNodeType.Document:
                case HtmlNodeType.Element:
                    if (!this.excludeTags.Contains(node.Name) && !state.NodesToExclude.Contains(node))
                    {
                        var emitNewLineAfterChildren = false;
                        if (this.IsIncludedTag(node.Name))
                        {
                            state = state.WithRenderingEnabled();
                        }

                        // Set only when the children need a different state from the one used
                        // for this node's own trailing new line (currently: blockquote).
                        ConversionState? childState = null;
                        if (state.RenderingEnabled)
                        {
                            if (state.EmitMarkDownStyles)
                            {
                                switch (node.Name)
                                {
                                    case "table":
                                        this.EmitTable(pageUri, node, builder, imageCollector, state);
                                        this.EmitNewLine(builder, state);
                                        return;

                                    case "img":
                                        this.EmitImage(pageUri, node, builder, imageCollector);
                                        return;

                                    case "p":
                                        emitNewLineAfterChildren = true;
                                        break;

                                    case "br":
                                        this.EmitNewLine(builder, state, 1);
                                        break;

                                    case "blockquote":
                                        // Parenthesised on purpose: '+' binds tighter than '??', so
                                        // without the parentheses an existing prefix would be reused
                                        // unchanged and nested quotes would never gain a second '>'.
                                        childState = state.WithLinePrefix((state.LinePrefix ?? "") + ">");
                                        this.EmitNewLine(builder, state);
                                        builder.Append(childState?.LinePrefix).Append(" ");
                                        emitNewLineAfterChildren = true;
                                        break;

                                    case "ul":
                                        state = state.StartUnorderedList();

                                        // Only the outermost list is followed by a new line.
                                        emitNewLineAfterChildren = state.ListDepth == 1;
                                        break;

                                    case "ol":
                                        state = state.StartOrderedList();

                                        // Only the outermost list is followed by a new line.
                                        emitNewLineAfterChildren = state.ListDepth == 1;
                                        break;

                                    case "li":
                                        this.EmitListItemPrefix(builder, state);
                                        break;

                                    case "h1":
                                    case "h2":
                                    case "h3":
                                    case "h4":
                                    case "h5":
                                    case "h6":
                                        this.EmitNewLine(builder, state);
                                        emitNewLineAfterChildren = true;

                                        // The digit in the tag name is the heading level, i.e. the number of '#'.
                                        builder.Append('#', node.Name[1] - '0').Append(' ');
                                        break;

                                    case "a":
                                        this.EmitLink(pageUri, node, builder, imageCollector, state);
                                        return;

                                    case "i":
                                    case "em":
                                        this.EmitFormattedText(pageUri, node, builder, "*", imageCollector, state);
                                        return;

                                    case "b":
                                    case "strong":
                                        this.EmitFormattedText(pageUri, node, builder, "**", imageCollector, state);
                                        return;

                                    case "pre":
                                        this.EmitPreformattedText(pageUri, node, builder, imageCollector, state);
                                        return;
                                }
                            }
                            else
                            {
                                // Without Markdown styles only line breaks are carried over.
                                switch (node.Name)
                                {
                                    case "br":
                                        builder.AppendLine();
                                        break;
                                }
                            }
                        }

                        this.ProcessChildNodes(pageUri, node.ChildNodes, builder, imageCollector, childState ?? state);

                        if (emitNewLineAfterChildren)
                        {
                            this.EmitNewLine(builder, state);
                        }
                    }

                    break;
            }
        }