/// <summary>
/// Extracts an excerpt from the document's HTML and stores it, trimmed, in metadata under <c>_metadataKey</c>.
/// </summary>
/// <remarks>
/// Both the query-based and the separator-based excerpt are computed; the separator-based one wins
/// when it is non-null. The input document is passed through unchanged when no metadata key is
/// configured, when HTML parsing yields <c>null</c>, or when neither lookup produces an excerpt.
/// </remarks>
/// <param name="input">The document whose HTML content is inspected.</param>
/// <param name="context">The current execution context.</param>
/// <returns>A single-document sequence: either the input itself or a clone carrying the excerpt.</returns>
protected override async Task<IEnumerable<Common.IDocument>> ExecuteAsync(Common.IDocument input, IExecutionContext context)
{
    // Without a key there is nowhere to store the excerpt
    if (string.IsNullOrWhiteSpace(_metadataKey))
    {
        return input.Yield();
    }

    IHtmlDocument parsed = await input.ParseHtmlAsync(context, HtmlParser);
    if (parsed == null)
    {
        return input.Yield();
    }

    // Query excerpt is evaluated first, then the separator excerpt
    string fromQuery = GetQueryExcerpt(parsed);
    string fromSeparator = GetSeparatorExcerpt(parsed);

    // The separator excerpt takes precedence over the query excerpt
    string chosen = fromSeparator ?? fromQuery;
    if (chosen == null)
    {
        return input.Yield();
    }

    MetadataItems excerptMetadata = new MetadataItems
    {
        { _metadataKey, chosen.Trim() }
    };
    return input.Clone(excerptMetadata).Yield();
}
/// <summary>
/// Replaces link text found in text nodes under the elements matched by <c>_querySelector</c>
/// with anchor markup, skipping anything that already sits inside an anchor element.
/// </summary>
/// <remarks>
/// The link table is built from <c>_extraLinks</c> plus the configured links whose keys are not
/// already in <c>_extraLinks</c>; entries with a blank value are dropped and keys are HTML decoded.
/// Any exception is logged as a warning and the input document is returned unchanged.
/// </remarks>
/// <param name="input">The document whose content stream is parsed as HTML.</param>
/// <param name="context">The current execution context.</param>
/// <returns>A clone with rewritten content if any text changed, otherwise the input document.</returns>
protected override async Task<IEnumerable<Common.IDocument>> ExecuteAsync(Common.IDocument input, IExecutionContext context)
{
    try
    {
        // Keys are HTML decoded (if they're encoded) because the text nodes they are matched against are decoded
        IDictionary<string, string> anchorsByText = await _links.GetValueAsync(
            input,
            context,
            configured => _extraLinks
                .Concat(configured.Where(candidate => !_extraLinks.ContainsKey(candidate.Key)))
                .Where(entry => !string.IsNullOrWhiteSpace(entry.Value))
                .ToDictionary(
                    entry => WebUtility.HtmlDecode(entry.Key),
                    entry => $"<a href=\"{entry.Value}\">{entry.Key}</a>"));

        List<KeyValuePair<IText, string>> pending = new List<KeyValuePair<IText, string>>();

        IHtmlDocument parsed;
        using (Stream source = input.GetStream())
        {
            parsed = await HtmlParser.ParseAsync(source);
        }

        // Visit every matching element that is not itself nested in an anchor
        foreach (IElement match in parsed
            .QuerySelectorAll(_querySelector)
            .Where(e => !e.Ancestors<IHtmlAnchorElement>().Any()))
        {
            // ...and every descendant text node that is not nested in an anchor either
            IEnumerable<IText> textNodes = match
                .Descendents()
                .OfType<IText>()
                .Where(n => !n.Ancestors<IHtmlAnchorElement>().Any());

            foreach (IText node in textNodes)
            {
                // Only queue the node when its text content actually changed
                if (ReplaceStrings(node, anchorsByText, out string rewritten))
                {
                    pending.Add(new KeyValuePair<IText, string>(node, rewritten));
                }
            }
        }

        // Only re-serialize when something was replaced; otherwise fall through to the original document
        if (pending.Count > 0)
        {
            foreach (KeyValuePair<IText, string> pair in pending)
            {
                IText target = pair.Key;
                target.Replace(HtmlParser.ParseFragment(pair.Value, target.ParentElement).ToArray());
            }

            using (Stream contentStream = await context.GetContentStreamAsync())
            using (StreamWriter writer = contentStream.GetWriter())
            {
                parsed.ToHtml(writer, ProcessingInstructionFormatter.Instance);
                writer.Flush();
                return input.Clone(context.GetContentProvider(contentStream)).Yield();
            }
        }
    }
    catch (Exception ex)
    {
        context.LogWarning("Exception while parsing HTML for {0}: {1}", input.ToSafeDisplayString(), ex.Message);
    }

    return input.Yield();
}
/// <summary>
/// Passes every element selected by <c>[href],[src]</c> to <c>MakeLinkAbsolute</c>,
/// once for its <c>href</c> attribute and once for its <c>src</c> attribute.
/// </summary>
/// <param name="input">The document to process.</param>
/// <param name="context">The current execution context; also handed to <c>MakeLinkAbsolute</c>.</param>
/// <returns>The task produced by <c>ProcessHtml.ProcessElementsAsync</c>.</returns>
protected override Task<IEnumerable<Common.IDocument>> ExecuteInputAsync(Common.IDocument input, IExecutionContext context)
{
    return ProcessHtml.ProcessElementsAsync(
        input,
        context,
        "[href],[src]",
        false,
        (document, elementContext, element, metadata) =>
        {
            // href is handled before src, for each element
            foreach (string attributeName in new[] { "href", "src" })
            {
                MakeLinkAbsolute(element, attributeName, context);
            }
        });
}
/// <summary>
/// Inserts the configured content at <c>_position</c> relative to the elements matched by
/// <c>_querySelector</c> (only the first match when <c>_first</c> is set) and returns the
/// re-serialized document.
/// </summary>
/// <remarks>
/// The input document is returned unchanged when the content is <c>null</c>, HTML parsing yields
/// <c>null</c>, the selector is blank, or nothing matches. Exceptions raised while querying or
/// writing are logged as warnings and also result in the unchanged input.
/// </remarks>
/// <param name="input">The document to modify.</param>
/// <param name="context">The current execution context.</param>
/// <returns>A single-document sequence containing the clone or the original input.</returns>
protected override async Task<IEnumerable<Common.IDocument>> ExecuteAsync(Common.IDocument input, IExecutionContext context)
{
    // Resolve the content to insert
    string html = await _content.GetValueAsync(input, context);
    if (html == null)
    {
        return input.Yield();
    }

    IHtmlDocument parsed = await input.ParseHtmlAsync(context, HtmlParser);
    if (parsed == null)
    {
        return input.Yield();
    }

    try
    {
        if (string.IsNullOrWhiteSpace(_querySelector))
        {
            return input.Yield();
        }

        IElement[] matches;
        if (_first)
        {
            matches = new[] { parsed.QuerySelector(_querySelector) };
        }
        else
        {
            matches = parsed.QuerySelectorAll(_querySelector).ToArray();
        }

        // A single-match query that found nothing shows up as one null entry
        if (matches.Length == 0 || matches[0] == null)
        {
            return input.Yield();
        }

        foreach (IElement match in matches)
        {
            match.Insert(_position, html);
        }

        using (Stream contentStream = await context.GetContentStreamAsync())
        using (StreamWriter writer = contentStream.GetWriter())
        {
            parsed.ToHtml(writer, ProcessingInstructionFormatter.Instance);
            writer.Flush();
            return input.Clone(context.GetContentProvider(contentStream)).Yield();
        }
    }
    catch (Exception ex)
    {
        context.LogWarning("Exception while processing HTML for {0}: {1}", input.ToSafeDisplayString(), ex.Message);
        return input.Yield();
    }
}
/// <summary>
/// Creates one document per element matched by <c>_query</c> and stores the resulting documents
/// in the input document's metadata under <c>_metadataKey</c>.
/// </summary>
/// <remarks>
/// Each heading document's content is the heading element's child nodes serialized as HTML. The
/// level, <c>id</c> attribute, inner HTML and child list are added to its metadata when the
/// corresponding key (<c>_levelKey</c>, <c>_idKey</c>, <c>_headingKey</c>, <c>_childrenKey</c>) is
/// non-null. When <c>_nesting</c> is set, only the headings at the smallest level present are
/// stored under <c>_metadataKey</c>; otherwise all of them are stored.
/// The input is returned unchanged when <c>_metadataKey</c> is blank or HTML parsing yields <c>null</c>.
/// </remarks>
/// <param name="input">The document whose HTML content is scanned for headings.</param>
/// <param name="context">The current execution context, used to create the heading documents.</param>
/// <returns>A single-document sequence containing the input or a clone carrying the heading documents.</returns>
protected override async Task<IEnumerable<Common.IDocument>> ExecuteInputAsync(Common.IDocument input, IExecutionContext context)
{
    if (string.IsNullOrWhiteSpace(_metadataKey))
    {
        return input.Yield();
    }

    // Parse the HTML content
    IHtmlDocument htmlDocument = await input.ParseHtmlAsync(context, HtmlParser);
    if (htmlDocument == null)
    {
        return input.Yield();
    }

    // Evaluate the query and create the holding nodes, chaining each one to the heading before it.
    // The level is parsed from the node name after its first character.
    // NOTE(review): presumably _query only matches h1..h6 style elements; int.Parse throws otherwise - confirm.
    Heading previousHeading = null;
    List<Heading> headings = htmlDocument
        .QuerySelectorAll(_query)
        .Select(x =>
        {
            previousHeading = new Heading
            {
                Element = x,
                Previous = previousHeading,
                Level = int.Parse(x.NodeName.Substring(1))
            };
            return previousHeading;
        })
        .ToList();

    // Build the tree from the bottom-up so a heading's children exist before its own document is created
    for (int level = _level; level >= 1; level--)
    {
        int currentLevel = level;
        foreach (Heading heading in headings.Where(x => x.Level == currentLevel))
        {
            // Get the parent: the closest preceding heading with a smaller level
            Heading parent = null;
            if (currentLevel > 1)
            {
                parent = heading.Previous;
                while (parent != null && parent.Level >= currentLevel)
                {
                    parent = parent.Previous;
                }
            }

            // Create the document
            MetadataItems metadata = new MetadataItems();
            if (_levelKey != null)
            {
                metadata.Add(_levelKey, heading.Level);
            }
            if (_idKey != null && heading.Element.HasAttribute("id"))
            {
                metadata.Add(_idKey, heading.Element.GetAttribute("id"));
            }
            if (_headingKey != null)
            {
                metadata.Add(_headingKey, heading.Element.InnerHtml);
            }
            if (_childrenKey != null)
            {
                metadata.Add(_childrenKey, heading.Children.AsReadOnly());
            }

            using (Stream contentStream = await context.GetContentStreamAsync())
            {
                using (StreamWriter writer = contentStream.GetWriter())
                {
                    heading.Element.ChildNodes.ToHtml(writer, ProcessingInstructionFormatter.Instance);
                    writer.Flush();
                    heading.Document = context.CreateDocument(metadata, context.GetContentProvider(contentStream, MediaTypes.Html));
                }
            }

            // Add to parent
            parent?.Children.Add(heading.Document);
        }
    }

    // Compute the smallest level once instead of once per heading inside the filter.
    // Min() throws on an empty sequence, so it is only called when there is at least one heading;
    // with no headings the filter below never runs and the placeholder value is never compared.
    int topLevel = _nesting && headings.Count > 0
        ? headings.Min(x => x.Level)
        : 0;

    return input
        .Clone(new MetadataItems
        {
            {
                _metadataKey,
                _nesting
                    ? headings
                        .Where(x => x.Level == topLevel)
                        .Select(x => x.Document)
                        .ToArray()
                    : headings
                        .Select(x => x.Document)
                        .ToArray()
            }
        })
        .Yield();
}
/// <summary>
/// Produces one clone of the input document per element matched by <c>_querySelector</c>
/// (only the first match when <c>_first</c> is set).
/// </summary>
/// <remarks>
/// Every action in <c>_metadataActions</c> is run against each element to fill that clone's
/// metadata. When <c>_outerHtmlContent</c> has a value the clone's content is replaced: with the
/// element's own HTML when it is <c>true</c>, with the HTML of its child nodes when it is <c>false</c>.
/// The input is returned unchanged when parsing yields <c>null</c>, the selector is blank, nothing
/// matches, or an exception is thrown (which is logged as a warning).
/// </remarks>
/// <param name="input">The document to query.</param>
/// <param name="context">The current execution context.</param>
/// <returns>The per-element clones, or a single-document sequence containing the input.</returns>
protected override async Task<IEnumerable<Common.IDocument>> ExecuteInputAsync(Common.IDocument input, IExecutionContext context)
{
    IHtmlDocument parsed = await input.ParseHtmlAsync(context, HtmlParser);
    if (parsed == null)
    {
        return input.Yield();
    }

    try
    {
        if (string.IsNullOrWhiteSpace(_querySelector))
        {
            return input.Yield();
        }

        IElement[] matches;
        if (_first)
        {
            matches = new[] { parsed.QuerySelector(_querySelector) };
        }
        else
        {
            matches = parsed.QuerySelectorAll(_querySelector).ToArray();
        }

        // A single-match query that found nothing shows up as one null entry
        if (matches.Length == 0 || matches[0] == null)
        {
            return input.Yield();
        }

        List<Common.IDocument> results = new List<Common.IDocument>();
        foreach (IElement match in matches)
        {
            // Let every registered action contribute metadata for this element
            Dictionary<string, object> collected = new Dictionary<string, object>();
            foreach (Action<IElement, Dictionary<string, object>> contribute in _metadataActions)
            {
                contribute(match, collected);
            }

            // No content override requested: clone with metadata only
            if (!_outerHtmlContent.HasValue)
            {
                results.Add(input.Clone(collected));
                continue;
            }

            using (Stream contentStream = await context.GetContentStreamAsync())
            using (StreamWriter writer = contentStream.GetWriter())
            {
                if (_outerHtmlContent.Value)
                {
                    match.ToHtml(writer, ProcessingInstructionFormatter.Instance);
                }
                else
                {
                    match.ChildNodes.ToHtml(writer, ProcessingInstructionFormatter.Instance);
                }
                writer.Flush();

                // An empty metadata set is passed as null in this branch
                Dictionary<string, object> cloneMetadata = collected.Count == 0 ? null : collected;
                results.Add(input.Clone(cloneMetadata, context.GetContentProvider(contentStream)));
            }
        }

        return results;
    }
    catch (Exception ex)
    {
        context.LogWarning("Exception while processing HTML for {0}: {1}", input.ToSafeDisplayString(), ex.Message);
        return input.Yield();
    }
}
/// <summary>
/// Resolves the configured content and, when it is non-null, inserts it at <c>_position</c>
/// on the elements selected through <c>ProcessHtml.ProcessElementsAsync</c>.
/// </summary>
/// <param name="input">The document to modify.</param>
/// <param name="context">The current execution context.</param>
/// <returns>
/// The result of <c>ProcessHtml.ProcessElementsAsync</c>, or a single-document sequence
/// containing the input when the content is <c>null</c>.
/// </returns>
protected override async Task<IEnumerable<Common.IDocument>> ExecuteInputAsync(Common.IDocument input, IExecutionContext context)
{
    string html = await _content.GetValueAsync(input, context);
    if (html != null)
    {
        return await ProcessHtml.ProcessElementsAsync(
            input,
            context,
            _querySelector,
            _first,
            (document, elementContext, element, metadata) => element.Insert(_position, html));
    }

    // No content to insert
    return input.Yield();
}
private static async Task <Common.IDocument> ResolveDocumentXrefsAsync( Common.IDocument input, IExecutionContext context, ConcurrentDictionary <string, ConcurrentBag <string> > failures) { IHtmlDocument htmlDocument = await input.ParseHtmlAsync(context, HtmlParser); if (htmlDocument != null) { // Find and replace "xref:" in links bool modifiedDocument = false; bool errors = false; foreach (IElement element in htmlDocument .GetElementsByTagName("a") .Where(x => x.HasAttribute("href"))) { string href = element.GetAttribute("href"); if (href.StartsWith("xref:") && href.Length > 5) { string xref = href.Substring(5); string queryAndFragment = string.Empty; int queryAndFragmentIndex = xref.IndexOfAny(new[] { '#', '?' }); if (queryAndFragmentIndex > 0) { queryAndFragment = xref.Substring(queryAndFragmentIndex); xref = xref.Substring(0, queryAndFragmentIndex); } if (context.TryGetXrefLink(xref, out string xrefLink, out string error)) { element.Attributes["href"].Value = xrefLink + queryAndFragment; } else { // Continue processing so we can report all the failures in a given document failures.AddOrUpdate( input.Source.FullPath, _ => new ConcurrentBag <string> { error }, (_, list) => { list.Add(error); return(list); }); errors = true; } modifiedDocument = true; } } // Exit if there were errors if (errors) { return(null); } // Return a new document with the replacements if we performed any if (modifiedDocument) { using (Stream contentStream = await context.GetContentStreamAsync()) { using (StreamWriter writer = contentStream.GetWriter()) { htmlDocument.ToHtml(writer, ProcessingInstructionFormatter.Instance); writer.Flush(); return(input.Clone(context.GetContentProvider(contentStream, MediaTypes.Html))); } } } }
/// <summary>
/// Parses the input as HTML, runs <paramref name="processElement"/> on each element matched by
/// <paramref name="querySelector"/>, and returns a document reflecting any metadata or content changes.
/// </summary>
/// <remarks>
/// A deep clone of the parsed document is taken before processing and compared with the document
/// afterwards to decide whether the content must be re-serialized. The input is returned unchanged
/// when parsing yields <c>null</c>, the selector is blank, nothing matches, or an exception is
/// thrown (which is logged as a warning).
/// </remarks>
/// <param name="input">The document to process.</param>
/// <param name="context">The current execution context.</param>
/// <param name="querySelector">The selector used to find elements.</param>
/// <param name="first">When <c>true</c>, only the first matching element is processed.</param>
/// <param name="processElement">
/// Callback invoked per element with the input document, the context, the element and a shared
/// metadata dictionary it may add to.
/// </param>
/// <returns>A single-document sequence containing the input or a clone of it.</returns>
internal static async Task<IEnumerable<Common.IDocument>> ProcessElementsAsync(
    Common.IDocument input,
    IExecutionContext context,
    string querySelector,
    bool first,
    Action<Common.IDocument, IExecutionContext, IElement, Dictionary<string, object>> processElement)
{
    IHtmlDocument parsed = await input.ParseHtmlAsync(context, HtmlParser);
    if (parsed == null)
    {
        return input.Yield();
    }

    try
    {
        if (string.IsNullOrWhiteSpace(querySelector))
        {
            return input.Yield();
        }

        IElement[] matches;
        if (first)
        {
            matches = new[] { parsed.QuerySelector(querySelector) };
        }
        else
        {
            matches = parsed.QuerySelectorAll(querySelector).ToArray();
        }

        // A single-match query that found nothing shows up as one null entry
        if (matches.Length == 0 || matches[0] == null)
        {
            return input.Yield();
        }

        // Keep a deep copy from before processing so edits can be detected afterwards
        INode snapshot = parsed.Clone(true);
        Dictionary<string, object> collected = new Dictionary<string, object>();
        foreach (IElement match in matches)
        {
            processElement(input, context, match, collected);
        }

        bool hasMetadata = collected.Count != 0;

        if (parsed.Equals(snapshot))
        {
            // Elements were not edited: hand back the original, or clone it just for the new metadata
            if (hasMetadata)
            {
                return input.Clone(collected).Yield();
            }
            return input.Yield();
        }

        // Elements were edited, so serialize the new content
        using (Stream contentStream = await context.GetContentStreamAsync())
        using (StreamWriter writer = contentStream.GetWriter())
        {
            parsed.ToHtml(writer, ProcessingInstructionFormatter.Instance);
            writer.Flush();
            IContentProvider provider = context.GetContentProvider(contentStream, MediaTypes.Html);
            if (hasMetadata)
            {
                return input.Clone(collected, provider).Yield();
            }
            return input.Clone(provider).Yield();
        }
    }
    catch (Exception ex)
    {
        context.LogWarning("Exception while processing HTML for {0}: {1}", input.ToSafeDisplayString(), ex.Message);
        return input.Yield();
    }
}
/// <summary>
/// Forwards the input to <c>ProcessElementsAsync</c> using this instance's
/// <c>_querySelector</c>, <c>_first</c> and <c>_processElement</c> settings.
/// </summary>
/// <param name="input">The document to process.</param>
/// <param name="context">The current execution context.</param>
/// <returns>The task produced by <c>ProcessElementsAsync</c>.</returns>
protected override Task<IEnumerable<Common.IDocument>> ExecuteInputAsync(Common.IDocument input, IExecutionContext context)
{
    return ProcessElementsAsync(
        input,
        context,
        _querySelector,
        _first,
        _processElement);
}