/// <summary>
        /// Parses the input as HTML and stores an excerpt of it in the input's metadata
        /// under the configured metadata key.
        /// </summary>
        /// <param name="input">The document whose content is parsed as HTML.</param>
        /// <param name="context">The current execution context.</param>
        /// <returns>
        /// A clone of the input carrying the trimmed excerpt, or the unchanged input when no
        /// metadata key is configured, parsing yields no document, or no excerpt is found.
        /// </returns>
        protected override async Task<IEnumerable<Common.IDocument>> ExecuteAsync(Common.IDocument input, IExecutionContext context)
        {
            // Without a metadata key there is nowhere to store the excerpt
            if (string.IsNullOrWhiteSpace(_metadataKey))
            {
                return input.Yield();
            }

            IHtmlDocument parsed = await input.ParseHtmlAsync(context, HtmlParser);
            if (parsed == null)
            {
                return input.Yield();
            }

            // The query excerpt is always evaluated first, then the separator excerpt;
            // when both exist the separator excerpt wins
            string fromQuery = GetQueryExcerpt(parsed);
            string fromSeparator = GetSeparatorExcerpt(parsed);
            string excerpt = fromSeparator ?? fromQuery;
            if (excerpt == null)
            {
                return input.Yield();
            }

            MetadataItems items = new MetadataItems
            {
                { _metadataKey, excerpt.Trim() }
            };
            return input.Clone(items).Yield();
        }
Exemple #2
0
        /// <summary>
        /// Replaces text inside the elements matched by the query selector with anchor markup
        /// built from the configured links, skipping anything already inside an anchor element.
        /// </summary>
        /// <param name="input">The document whose content stream is parsed as HTML.</param>
        /// <param name="context">The execution context used for logging and content streams.</param>
        /// <returns>
        /// A clone of the input with the rewritten HTML when at least one text node changed;
        /// otherwise (including when an exception is logged) the unchanged input.
        /// </returns>
        protected override async Task<IEnumerable<Common.IDocument>> ExecuteAsync(Common.IDocument input, IExecutionContext context)
        {
            try
            {
                // Extra links take precedence over same-keyed configured links; entries with a blank
                // value are dropped. Keys are HTML decoded because the text nodes are decoded too.
                IDictionary<string, string> links = await _links.GetValueAsync(
                    input,
                    context,
                    configured => _extraLinks
                        .Concat(configured.Where(pair => !_extraLinks.ContainsKey(pair.Key)))
                        .Where(pair => !string.IsNullOrWhiteSpace(pair.Value))
                        .ToDictionary(
                            pair => WebUtility.HtmlDecode(pair.Key),
                            pair => $"<a href=\"{pair.Value}\">{pair.Key}</a>"));

                IHtmlDocument htmlDocument;
                using (Stream stream = input.GetStream())
                {
                    htmlDocument = await HtmlParser.ParseAsync(stream);
                }

                // Lazily walk every descendant text node of every matched element,
                // ignoring elements and text nodes that already sit inside a link
                IEnumerable<IText> candidates = htmlDocument
                    .QuerySelectorAll(_querySelector)
                    .Where(element => !element.Ancestors<IHtmlAnchorElement>().Any())
                    .SelectMany(element => element.Descendents().OfType<IText>())
                    .Where(text => !text.Ancestors<IHtmlAnchorElement>().Any());

                // Collect the changes first so the DOM is not modified while it is being enumerated
                List<KeyValuePair<IText, string>> pending = new List<KeyValuePair<IText, string>>();
                foreach (IText candidate in candidates)
                {
                    // ReplaceStrings reports whether the text content changed
                    if (ReplaceStrings(candidate, links, out string linkedText))
                    {
                        pending.Add(new KeyValuePair<IText, string>(candidate, linkedText));
                    }
                }

                // Apply the changes if there were any, otherwise fall through and return the same document
                if (pending.Count > 0)
                {
                    foreach (KeyValuePair<IText, string> change in pending)
                    {
                        IText target = change.Key;
                        INode[] fragment = HtmlParser.ParseFragment(change.Value, target.ParentElement).ToArray();
                        target.Replace(fragment);
                    }

                    using (Stream contentStream = await context.GetContentStreamAsync())
                    using (StreamWriter writer = contentStream.GetWriter())
                    {
                        htmlDocument.ToHtml(writer, ProcessingInstructionFormatter.Instance);
                        writer.Flush();
                        return input.Clone(context.GetContentProvider(contentStream)).Yield();
                    }
                }
            }
            catch (Exception ex)
            {
                // Best effort: log the failure and hand back the original document below
                context.LogWarning("Exception while parsing HTML for {0}: {1}", input.ToSafeDisplayString(), ex.Message);
            }

            return input.Yield();
        }
 /// <summary>
 /// Runs <c>MakeLinkAbsolute</c> for the <c>href</c> and <c>src</c> attributes of every
 /// element in the input that carries at least one of those attributes.
 /// </summary>
 /// <param name="input">The document to process.</param>
 /// <param name="context">The current execution context.</param>
 /// <returns>The task produced by <c>ProcessHtml.ProcessElementsAsync</c>.</returns>
 protected override Task<IEnumerable<Common.IDocument>> ExecuteInputAsync(Common.IDocument input, IExecutionContext context)
 {
     return ProcessHtml.ProcessElementsAsync(
         input,
         context,
         "[href],[src]",
         false,
         (doc, ctx, element, metadata) =>
         {
             // Both attributes are attempted on every match, whichever one selected the element
             MakeLinkAbsolute(element, "href", context);
             MakeLinkAbsolute(element, "src", context);
         });
 }
Exemple #4
0
        /// <summary>
        /// Inserts the configured content at the configured position relative to the element(s)
        /// matched by the query selector and returns the input with the updated HTML.
        /// </summary>
        /// <param name="input">The document whose content is parsed as HTML.</param>
        /// <param name="context">The execution context used for logging and content streams.</param>
        /// <returns>
        /// A clone of the input with the modified HTML when something matched; otherwise the
        /// unchanged input (no content, no parsed document, blank selector, no match, or a logged exception).
        /// </returns>
        protected override async Task<IEnumerable<Common.IDocument>> ExecuteAsync(Common.IDocument input, IExecutionContext context)
        {
            // Nothing to insert without content
            string content = await _content.GetValueAsync(input, context);
            if (content == null)
            {
                return input.Yield();
            }

            IHtmlDocument htmlDocument = await input.ParseHtmlAsync(context, HtmlParser);
            if (htmlDocument == null)
            {
                return input.Yield();
            }

            try
            {
                if (string.IsNullOrWhiteSpace(_querySelector))
                {
                    return input.Yield();
                }

                // Either just the first match (which may be null) or every match
                IElement[] targets;
                if (_first)
                {
                    targets = new[] { htmlDocument.QuerySelector(_querySelector) };
                }
                else
                {
                    targets = htmlDocument.QuerySelectorAll(_querySelector).ToArray();
                }

                if (targets.Length == 0 || targets[0] == null)
                {
                    return input.Yield();
                }

                foreach (IElement target in targets)
                {
                    target.Insert(_position, content);
                }

                // Serialize the modified DOM into new content for the cloned document
                using (Stream contentStream = await context.GetContentStreamAsync())
                using (StreamWriter writer = contentStream.GetWriter())
                {
                    htmlDocument.ToHtml(writer, ProcessingInstructionFormatter.Instance);
                    writer.Flush();
                    return input.Clone(context.GetContentProvider(contentStream)).Yield();
                }
            }
            catch (Exception ex)
            {
                // Best effort: log the failure and hand back the original document
                context.LogWarning("Exception while processing HTML for {0}: {1}", input.ToSafeDisplayString(), ex.Message);
                return input.Yield();
            }
        }
Exemple #5
0
        /// <summary>
        /// Collects the elements matched by the heading query, creates one document per heading
        /// (from the configured level down to level 1), and stores the heading documents in the
        /// input's metadata under the configured metadata key.
        /// </summary>
        /// <param name="input">The document whose content is parsed as HTML.</param>
        /// <param name="context">The execution context used to create content streams and documents.</param>
        /// <returns>
        /// The unchanged input when no metadata key is configured or parsing yields no document;
        /// otherwise a clone whose metadata holds the heading documents (only the headings at the
        /// smallest level found when nesting is enabled, all of them otherwise).
        /// </returns>
        protected override async Task<IEnumerable<Common.IDocument>> ExecuteInputAsync(Common.IDocument input, IExecutionContext context)
        {
            if (string.IsNullOrWhiteSpace(_metadataKey))
            {
                return input.Yield();
            }

            // Parse the HTML content
            IHtmlDocument htmlDocument = await input.ParseHtmlAsync(context, HtmlParser);
            if (htmlDocument == null)
            {
                return input.Yield();
            }

            // Evaluate the query and create the holding nodes, linking each one to the heading before it.
            // The level is parsed from the node name after its first character, so this assumes the
            // query only matches "h<number>"-style elements (int.Parse throws otherwise).
            Heading previousHeading = null;
            List<Heading> headings = htmlDocument
                .QuerySelectorAll(_query)
                .Select(x =>
                {
                    previousHeading = new Heading
                    {
                        Element = x,
                        Previous = previousHeading,
                        Level = int.Parse(x.NodeName.Substring(1))
                    };
                    return previousHeading;
                })
                .ToList();

            // Build the tree from the bottom-up so that a heading's children are already
            // collected by the time its own metadata is created
            for (int level = _level; level >= 1; level--)
            {
                int currentLevel = level;
                foreach (Heading heading in headings.Where(x => x.Level == currentLevel))
                {
                    // The parent is the closest preceding heading with a smaller level (level 1 has none)
                    Heading parent = null;
                    if (currentLevel > 1)
                    {
                        parent = heading.Previous;
                        while (parent != null && parent.Level >= currentLevel)
                        {
                            parent = parent.Previous;
                        }
                    }

                    // Create the document metadata; each entry is only added when its key is configured
                    MetadataItems metadata = new MetadataItems();
                    if (_levelKey != null)
                    {
                        metadata.Add(_levelKey, heading.Level);
                    }
                    if (_idKey != null && heading.Element.HasAttribute("id"))
                    {
                        metadata.Add(_idKey, heading.Element.GetAttribute("id"));
                    }
                    if (_headingKey != null)
                    {
                        metadata.Add(_headingKey, heading.Element.InnerHtml);
                    }
                    if (_childrenKey != null)
                    {
                        metadata.Add(_childrenKey, heading.Children.AsReadOnly());
                    }

                    // The heading's child nodes become the content of its document
                    using (Stream contentStream = await context.GetContentStreamAsync())
                    {
                        using (StreamWriter writer = contentStream.GetWriter())
                        {
                            heading.Element.ChildNodes.ToHtml(writer, ProcessingInstructionFormatter.Instance);
                            writer.Flush();
                            heading.Document = context.CreateDocument(metadata, context.GetContentProvider(contentStream, MediaTypes.Html));
                        }
                    }

                    // Add to parent
                    parent?.Children.Add(heading.Document);
                }
            }

            // Compute the top-most level once instead of once per heading inside the filter below.
            // Min throws on an empty sequence, hence the guard; the value is unused when not nesting.
            int topLevel = _nesting && headings.Count > 0
                ? headings.Min(x => x.Level)
                : 0;

            // NOTE(review): a heading whose level is greater than _level (or less than 1) is never
            // visited by the loop above, so its Document is left unset here - confirm _query can
            // only match levels within 1.._level.
            return input
                .Clone(new MetadataItems
                {
                    {
                        _metadataKey,
                        _nesting
                            ? headings
                                .Where(x => x.Level == topLevel)
                                .Select(x => x.Document)
                                .ToArray()
                            : headings
                                .Select(x => x.Document)
                                .ToArray()
                    }
                })
                .Yield();
        }
        /// <summary>
        /// Produces one output document per element matched by the query selector, each carrying
        /// the metadata gathered by the configured metadata actions and, optionally, the element's
        /// outer or inner HTML as its content.
        /// </summary>
        /// <param name="input">The document whose content is parsed as HTML.</param>
        /// <param name="context">The execution context used for logging and content streams.</param>
        /// <returns>
        /// The per-element documents when something matched; otherwise the unchanged input
        /// (no parsed document, blank selector, no match, or a logged exception).
        /// </returns>
        protected override async Task<IEnumerable<Common.IDocument>> ExecuteInputAsync(Common.IDocument input, IExecutionContext context)
        {
            IHtmlDocument htmlDocument = await input.ParseHtmlAsync(context, HtmlParser);
            if (htmlDocument == null)
            {
                return input.Yield();
            }

            try
            {
                if (string.IsNullOrWhiteSpace(_querySelector))
                {
                    return input.Yield();
                }

                // Either just the first match (which may be null) or every match
                IElement[] matches;
                if (_first)
                {
                    matches = new[] { htmlDocument.QuerySelector(_querySelector) };
                }
                else
                {
                    matches = htmlDocument.QuerySelectorAll(_querySelector).ToArray();
                }

                if (matches.Length == 0 || matches[0] == null)
                {
                    return input.Yield();
                }

                List<Common.IDocument> results = new List<Common.IDocument>();
                foreach (IElement match in matches)
                {
                    // Let every configured action contribute metadata for this element
                    Dictionary<string, object> metadata = new Dictionary<string, object>();
                    foreach (Action<IElement, Dictionary<string, object>> metadataAction in _metadataActions)
                    {
                        metadataAction(match, metadata);
                    }

                    // No content mode configured: keep the original content, only attach metadata
                    if (!_outerHtmlContent.HasValue)
                    {
                        results.Add(input.Clone(metadata));
                        continue;
                    }

                    using (Stream contentStream = await context.GetContentStreamAsync())
                    using (StreamWriter writer = contentStream.GetWriter())
                    {
                        // true writes the element itself, false writes only its child nodes
                        if (_outerHtmlContent.Value)
                        {
                            match.ToHtml(writer, ProcessingInstructionFormatter.Instance);
                        }
                        else
                        {
                            match.ChildNodes.ToHtml(writer, ProcessingInstructionFormatter.Instance);
                        }
                        writer.Flush();

                        // An empty metadata set is passed on as null
                        Dictionary<string, object> items = metadata.Count == 0 ? null : metadata;
                        results.Add(input.Clone(items, context.GetContentProvider(contentStream)));
                    }
                }
                return results;
            }
            catch (Exception ex)
            {
                // Best effort: log the failure and hand back the original document
                context.LogWarning("Exception while processing HTML for {0}: {1}", input.ToSafeDisplayString(), ex.Message);
                return input.Yield();
            }
        }
Exemple #7
0
        /// <summary>
        /// Inserts the configured content at the configured position relative to the element(s)
        /// matched by the query selector, delegating the HTML handling to
        /// <c>ProcessHtml.ProcessElementsAsync</c>.
        /// </summary>
        /// <param name="input">The document to process.</param>
        /// <param name="context">The current execution context.</param>
        /// <returns>
        /// The unchanged input when the content evaluates to null; otherwise whatever
        /// <c>ProcessHtml.ProcessElementsAsync</c> returns.
        /// </returns>
        protected override async Task<IEnumerable<Common.IDocument>> ExecuteInputAsync(Common.IDocument input, IExecutionContext context)
        {
            // Nothing to insert without content
            string content = await _content.GetValueAsync(input, context);
            if (content == null)
            {
                return input.Yield();
            }

            IEnumerable<Common.IDocument> results = await ProcessHtml.ProcessElementsAsync(
                input,
                context,
                _querySelector,
                _first,
                (doc, ctx, element, metadata) => element.Insert(_position, content));
            return results;
        }
        private static async Task <Common.IDocument> ResolveDocumentXrefsAsync(
            Common.IDocument input,
            IExecutionContext context,
            ConcurrentDictionary <string, ConcurrentBag <string> > failures)
        {
            IHtmlDocument htmlDocument = await input.ParseHtmlAsync(context, HtmlParser);

            if (htmlDocument != null)
            {
                // Find and replace "xref:" in links
                bool modifiedDocument = false;
                bool errors           = false;
                foreach (IElement element in htmlDocument
                         .GetElementsByTagName("a")
                         .Where(x => x.HasAttribute("href")))
                {
                    string href = element.GetAttribute("href");
                    if (href.StartsWith("xref:") && href.Length > 5)
                    {
                        string xref                  = href.Substring(5);
                        string queryAndFragment      = string.Empty;
                        int    queryAndFragmentIndex = xref.IndexOfAny(new[] { '#', '?' });
                        if (queryAndFragmentIndex > 0)
                        {
                            queryAndFragment = xref.Substring(queryAndFragmentIndex);
                            xref             = xref.Substring(0, queryAndFragmentIndex);
                        }
                        if (context.TryGetXrefLink(xref, out string xrefLink, out string error))
                        {
                            element.Attributes["href"].Value = xrefLink + queryAndFragment;
                        }
                        else
                        {
                            // Continue processing so we can report all the failures in a given document
                            failures.AddOrUpdate(
                                input.Source.FullPath,
                                _ => new ConcurrentBag <string> {
                                error
                            },
                                (_, list) =>
                            {
                                list.Add(error);
                                return(list);
                            });
                            errors = true;
                        }
                        modifiedDocument = true;
                    }
                }

                // Exit if there were errors
                if (errors)
                {
                    return(null);
                }

                // Return a new document with the replacements if we performed any
                if (modifiedDocument)
                {
                    using (Stream contentStream = await context.GetContentStreamAsync())
                    {
                        using (StreamWriter writer = contentStream.GetWriter())
                        {
                            htmlDocument.ToHtml(writer, ProcessingInstructionFormatter.Instance);
                            writer.Flush();
                            return(input.Clone(context.GetContentProvider(contentStream, MediaTypes.Html)));
                        }
                    }
                }
            }
        /// <summary>
        /// Parses the input as HTML, invokes a callback for the element(s) matched by a query
        /// selector, and returns the input updated with any metadata the callback collected
        /// and, if the DOM no longer equals its pre-processing clone, the re-serialized HTML.
        /// </summary>
        /// <param name="input">The document whose content is parsed as HTML.</param>
        /// <param name="context">The execution context used for logging and content streams.</param>
        /// <param name="querySelector">The selector to evaluate; a blank selector leaves the input untouched.</param>
        /// <param name="first">When true only the first match is processed, otherwise every match.</param>
        /// <param name="processElement">
        /// Callback invoked per matched element with the input, the context, the element, and a
        /// metadata dictionary shared across all matched elements.
        /// </param>
        /// <returns>
        /// A single-document sequence: the unchanged input, or a clone with new metadata and/or content.
        /// </returns>
        internal static async Task<IEnumerable<Common.IDocument>> ProcessElementsAsync(
            Common.IDocument input,
            IExecutionContext context,
            string querySelector,
            bool first,
            Action<Common.IDocument, IExecutionContext, IElement, Dictionary<string, object>> processElement)
        {
            IHtmlDocument htmlDocument = await input.ParseHtmlAsync(context, HtmlParser);
            if (htmlDocument == null)
            {
                return input.Yield();
            }

            try
            {
                if (string.IsNullOrWhiteSpace(querySelector))
                {
                    return input.Yield();
                }

                // Either just the first match (which may be null) or every match
                IElement[] matches;
                if (first)
                {
                    matches = new[] { htmlDocument.QuerySelector(querySelector) };
                }
                else
                {
                    matches = htmlDocument.QuerySelectorAll(querySelector).ToArray();
                }

                if (matches.Length == 0 || matches[0] == null)
                {
                    return input.Yield();
                }

                // Keep a deep clone from before processing so edits can be detected afterwards
                INode snapshot = htmlDocument.Clone(true);
                Dictionary<string, object> metadata = new Dictionary<string, object>();
                foreach (IElement match in matches)
                {
                    processElement(input, context, match, metadata);
                }

                if (htmlDocument.Equals(snapshot))
                {
                    // Not edited: return the original document, or a clone if metadata was collected
                    if (metadata.Count == 0)
                    {
                        return input.Yield();
                    }
                    return input.Clone(metadata).Yield();
                }

                // Edited: serialize the DOM into new content
                using (Stream contentStream = await context.GetContentStreamAsync())
                using (StreamWriter writer = contentStream.GetWriter())
                {
                    htmlDocument.ToHtml(writer, ProcessingInstructionFormatter.Instance);
                    writer.Flush();
                    IContentProvider contentProvider = context.GetContentProvider(contentStream, MediaTypes.Html);
                    if (metadata.Count == 0)
                    {
                        return input.Clone(contentProvider).Yield();
                    }
                    return input.Clone(metadata, contentProvider).Yield();
                }
            }
            catch (Exception ex)
            {
                // Best effort: log the failure and hand back the original document
                context.LogWarning("Exception while processing HTML for {0}: {1}", input.ToSafeDisplayString(), ex.Message);
                return input.Yield();
            }
        }
 /// <summary>
 /// Processes the input by handing the configured selector, first-only flag, and element
 /// callback to <c>ProcessElementsAsync</c>.
 /// </summary>
 /// <param name="input">The document to process.</param>
 /// <param name="context">The current execution context.</param>
 /// <returns>The task produced by <c>ProcessElementsAsync</c>.</returns>
 protected override Task<IEnumerable<Common.IDocument>> ExecuteInputAsync(Common.IDocument input, IExecutionContext context)
 {
     return ProcessElementsAsync(input, context, _querySelector, _first, _processElement);
 }