Example #1
0
        // Builds an excerpt from everything that precedes the first separator comment.
        //
        // Walks the comments under the document element looking for one whose trimmed text equals
        // one of the configured separators (case-insensitive). When one is found, the element
        // holding it (or that element's parent, if the holder is a <p>) is pruned and its inner
        // HTML is returned. Returns null when no separators are configured or none is found.
        //
        // WARNING: the pruning removes nodes from htmlDocument itself (nothing is cloned), so only
        // call this after any attempt to locate the excerpt element in the document has been made.
        private string GetSeparatorExcerpt(IHtmlDocument htmlDocument)
        {
            // Nothing to search for when no separators were configured
            if (_separators == null || _separators.Length <= 0)
            {
                return null;
            }

            // Locate the first comment whose text matches a separator
            ITreeWalker commentWalker = htmlDocument.CreateTreeWalker(htmlDocument.DocumentElement, FilterSettings.Comment);
            IComment separator;
            for (separator = (IComment)commentWalker.ToFirst(); separator != null; separator = (IComment)commentWalker.ToNext())
            {
                if (_separators.Contains(separator.NodeValue.Trim(), StringComparer.OrdinalIgnoreCase))
                {
                    break;
                }
            }

            if (separator == null)
            {
                // Ran out of comments without a match
                return null;
            }

            // The excerpt is taken from the element holding the separator; when that element is
            // a paragraph, step up one level and use the paragraph's parent instead
            IElement container = separator.ParentElement;
            if (container.TagName.Equals("p", StringComparison.OrdinalIgnoreCase))
            {
                container = container.ParentElement;
            }

            // Collect the nodes to discard first and detach them afterwards, so the tree is not
            // modified while the walker is still traversing it
            bool hasSelector = !string.IsNullOrEmpty(_querySelector);
            bool reachedSeparator = false;
            List<INode> discarded = new List<INode>();
            ITreeWalker nodeWalker = htmlDocument.CreateTreeWalker(container);
            for (INode current = nodeWalker.ToFirst(); current != null; current = nodeWalker.ToNext())
            {
                if (current == separator)
                {
                    reachedSeparator = true;
                }

                // The separator and every node visited after it are always discarded
                bool discard = reachedSeparator;

                // Before the separator, also discard direct child elements of the container
                // that do not match the query selector (only when a selector is set)
                if (!discard && hasSelector && current.Parent == container)
                {
                    IElement element = current as IElement;
                    discard = element != null && !element.Matches(_querySelector);
                }

                if (discard)
                {
                    discarded.Add(current);
                }
            }

            // Detach in reverse visiting order (last visited first), so each node still has its
            // parent at the moment it is removed
            for (int index = discarded.Count - 1; index >= 0; index--)
            {
                INode doomed = discarded[index];
                doomed.Parent.RemoveChild(doomed);
            }

            return container.InnerHtml;
        }