/// <summary>
        /// Creates a navigator that starts in the same state as an existing navigator.
        /// </summary>
        /// <param name="nav">The navigator whose document, current node, attribute index and name table are taken over. May not be <c>null</c>.</param>
        /// <exception cref="ArgumentNullException"><paramref name="nav"/> is <c>null</c>.</exception>
        private HtmlNodeNavigator(HtmlNodeNavigator nav)
        {
            if (nav == null)
                throw new ArgumentNullException("nav");

            InternalTrace(null);

            // The references are shared with the source navigator, not cloned.
            _nametable = nav._nametable; // REVIEW: should we do this?
            _attindex = nav._attindex;
            _currentnode = nav._currentnode;
            _doc = nav._doc;
        }
        /// <summary>
        /// Checks whether a node matches this rule and, if it does, runs an action on each matching attribute.
        /// A node matches when its name equals <c>TagName</c> ignoring case; the action is then invoked once for
        /// every name in <c>AttributesName</c> that is present on the node.
        /// </summary>
        /// <param name="node">The node to test. Nothing happens when it is <c>null</c>.</param>
        /// <param name="action">Callback receiving the node and each matching attribute. Nothing happens when it is <c>null</c>.</param>
        public void MatchAndAction(HtmlNode node, Action<HtmlNode, HtmlAttribute> action)
        {
            if (node == null || action == null)
                return;

            // Tag names are identifiers, not linguistic text. A culture-sensitive comparison makes the
            // result depend on the current thread culture (under tr-TR, "IMG" and "img" do not compare
            // equal when ignoring case), so compare ordinally instead.
            if (!string.Equals(node.Name, this.TagName, StringComparison.OrdinalIgnoreCase))
                return;

            foreach (string attrName in this.AttributesName)
            {
                HtmlAttribute attr = node.Attributes[attrName];

                if (attr != null)
                    action(node, attr);
            }
        }
        /// <summary>
        /// Creates a navigator over a document, positioned on one of that document's nodes.
        /// </summary>
        /// <param name="doc">The document to navigate.</param>
        /// <param name="currentNode">The node to start on. May not be <c>null</c> and must be owned by <paramref name="doc"/>.</param>
        /// <exception cref="ArgumentNullException"><paramref name="currentNode"/> is <c>null</c>.</exception>
        /// <exception cref="ArgumentException">The owner document of <paramref name="currentNode"/> is not <paramref name="doc"/>.</exception>
        internal HtmlNodeNavigator(HtmlDocument doc, HtmlNode currentNode)
        {
            if (currentNode == null)
                throw new ArgumentNullException("currentNode");

            if (currentNode.OwnerDocument != doc)
                throw new ArgumentException(HtmlDocument.HtmlExceptionRefNotChild);

            InternalTrace(null);

            _doc = doc;

            // Reset() runs first; the requested start node is applied on top of whatever it sets.
            Reset();
            _currentnode = currentNode;
        }
        /// <summary>
        /// Records, replaces or removes the node stored for an id in the <c>Nodesid</c> table.
        /// Does nothing when <c>OptionUseIdAttribute</c> is off, when the table does not exist,
        /// or when <paramref name="id"/> is <c>null</c>.
        /// </summary>
        /// <param name="node">The node to store under the id, or <c>null</c> to remove the entry for that id.</param>
        /// <param name="id">The id; it is lower-cased before being used as the key.</param>
        internal void SetIdForNode(HtmlNode node, string id)
        {
            bool tracking = OptionUseIdAttribute && (Nodesid != null) && (id != null);
            if (!tracking)
            {
                return;
            }

            string key = id.ToLower();
            if (node != null)
            {
                Nodesid[key] = node;
            }
            else
            {
                Nodesid.Remove(key);
            }
        }
Exemple #5
0
 /// <summary>
 /// Copies the given node into this node by forwarding to the two-argument overload
 /// with its <c>deep</c> argument set to <c>true</c>.
 /// </summary>
 /// <param name="node">The node to copy from. May not be <c>null</c>.</param>
 public void CopyFrom(HtmlNode node)
 {
     const bool deep = true;
     this.CopyFrom(node, deep);
 }
Exemple #6
0
        /// <summary>
        /// Closes this node with the given end node and updates the owner document's bookkeeping.
        /// </summary>
        /// <remarks>
        /// Unless the owner document has <c>OptionAutoCloseOnEnd</c> set, every child that is still open is
        /// closed first with a synthetic closing node. If this node is already closed afterwards, nothing
        /// else happens. When <paramref name="endnode"/> is a different node than this one, the inner and
        /// outer extents of this node are recomputed from the end node's position.
        /// </remarks>
        /// <param name="endnode">The node that closes this one; may be this node itself.</param>
        internal void CloseNode(HtmlNode endnode)
        {
            bool mustCloseChildren = !_ownerdocument.OptionAutoCloseOnEnd && _childnodes != null;
            if (mustCloseChildren)
            {
                foreach (HtmlNode child in _childnodes)
                {
                    if (!child.Closed)
                    {
                        // create a fake closer node (it acts as its own end node)
                        HtmlNode fakeCloser = new HtmlNode(NodeType, _ownerdocument, -1);
                        fakeCloser._endnode = fakeCloser;
                        child.CloseNode(fakeCloser);
                    }
                }
            }

            if (Closed)
            {
                return;
            }

            _endnode = endnode;

            // this node is no longer open
            if (_ownerdocument.Openednodes != null)
            {
                _ownerdocument.Openednodes.Remove(_outerstartindex);
            }

            // if this node is the one recorded under its name, drop that record
            HtmlNode lastWithSameName = Utilities.GetDictionaryValueOrNull(_ownerdocument.Lastnodes, Name);
            if (lastWithSameName == this)
            {
                _ownerdocument.Lastnodes.Remove(Name);
                _ownerdocument.UpdateLastParentNode();
            }

            // a self-closing node has no inner section to compute
            if (endnode == this)
            {
                return;
            }

            // the inner section runs from the end of the current outer span up to the start of the end node
            _innerstartindex = _outerstartindex + _outerlength;
            _innerlength = endnode._outerstartindex - _innerstartindex;

            // the outer span now extends to the end of the end node
            _outerlength = (endnode._outerstartindex + endnode._outerlength) - _outerstartindex;
        }
Exemple #7
0
        /// <summary>
        /// Puts <paramref name="newChild"/> in the place of <paramref name="oldChild"/> in this node's child list.
        /// </summary>
        /// <param name="newChild">The node to put in the child list. When <c>null</c>, <paramref name="oldChild"/> is removed instead.</param>
        /// <param name="oldChild">The node being replaced. When <c>null</c>, <paramref name="newChild"/> is appended instead.</param>
        /// <returns>
        /// <paramref name="newChild"/> after a replacement; otherwise whatever the removal or append call returns.
        /// </returns>
        /// <exception cref="ArgumentException"><paramref name="oldChild"/> is not found in this node's child list.</exception>
        public HtmlNode ReplaceChild(HtmlNode newChild, HtmlNode oldChild)
        {
            if (newChild == null)
                return RemoveChild(oldChild);

            if (oldChild == null)
                return AppendChild(newChild);

            int position = (_childnodes == null) ? -1 : _childnodes[oldChild];
            if (position == -1)
                throw new ArgumentException(HtmlDocument.HtmlExceptionRefNotChild);

            // a position other than -1 can only have come from an existing child list
            _childnodes.Replace(position, newChild);

            // unregister the outgoing node's id, then register the incoming one
            _ownerdocument.SetIdForNode(null, oldChild.GetId());
            _ownerdocument.SetIdForNode(newChild, newChild.GetId());
            SetChanged();
            return newChild;
        }
Exemple #8
0
        /// <summary>
        /// Takes a node out of this node's child list.
        /// </summary>
        /// <param name="oldChild">The child to remove. May not be <c>null</c>.</param>
        /// <returns><paramref name="oldChild"/>.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="oldChild"/> is <c>null</c>.</exception>
        /// <exception cref="ArgumentException"><paramref name="oldChild"/> is not found in this node's child list.</exception>
        public HtmlNode RemoveChild(HtmlNode oldChild)
        {
            if (oldChild == null)
                throw new ArgumentNullException("oldChild");

            int position = (_childnodes == null) ? -1 : _childnodes[oldChild];
            if (position == -1)
                throw new ArgumentException(HtmlDocument.HtmlExceptionRefNotChild);

            // a position other than -1 can only have come from an existing child list
            _childnodes.Remove(position);

            // the removed node's id no longer maps to a node
            _ownerdocument.SetIdForNode(null, oldChild.GetId());
            SetChanged();
            return oldChild;
        }
Exemple #9
0
        /// <summary>
        /// Inserts a node into this node's child list, directly in front of a reference child.
        /// </summary>
        /// <param name="newChild">The node to insert. May not be <c>null</c>.</param>
        /// <param name="refChild">The child in front of which the node is placed. When <c>null</c>, the node is appended instead.</param>
        /// <returns>
        /// <paramref name="newChild"/> after an insertion, or unchanged when it is the same node as
        /// <paramref name="refChild"/>; otherwise whatever the append call returns.
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="newChild"/> is <c>null</c>.</exception>
        /// <exception cref="ArgumentException"><paramref name="refChild"/> is not found in this node's child list.</exception>
        public HtmlNode InsertBefore(HtmlNode newChild, HtmlNode refChild)
        {
            if (newChild == null)
                throw new ArgumentNullException("newChild");

            if (refChild == null)
                return AppendChild(newChild);

            // inserting a node before itself is a no-op
            if (newChild == refChild)
                return newChild;

            int position = (_childnodes == null) ? -1 : _childnodes[refChild];
            if (position == -1)
                throw new ArgumentException(HtmlDocument.HtmlExceptionRefNotChild);

            // a position other than -1 can only have come from an existing child list
            _childnodes.Insert(position, newChild);

            _ownerdocument.SetIdForNode(newChild, newChild.GetId());
            SetChanged();
            return newChild;
        }
        /// <summary>
        /// Filters attributes that may contain script: every attribute matched by a rule in
        /// <c>_JavascriptRelAttributes</c> for which <c>ContainsScript</c> returns true is removed from the node.
        /// </summary>
        /// <param name="node">The node whose attributes are filtered.</param>
        private static void FilterScriptRelAttributes(HtmlNode node)
        {
            // Matches are collected first and removed in a second pass.
            List<HtmlAttribute> scripted = new List<HtmlAttribute>();

            foreach (HtmlSanitizeNode rule in _JavascriptRelAttributes)
            {
                rule.MatchAndAction(node, (matchedNode, candidate) =>
                {
                    if (ContainsScript(candidate))
                    {
                        scripted.Add(candidate);
                    }
                });
            }

            foreach (HtmlAttribute doomed in scripted)
            {
                node.Attributes.Remove(doomed);
            }
        }
        /// <summary>
        /// Removes every attribute of the node whose name is contained in <c>_DefaultAttributeBlackList</c>,
        /// then hands the node's <c>style</c> attribute to <c>FilterStyleAttributes</c>.
        /// </summary>
        /// <param name="node">The node whose attributes are filtered.</param>
        private static void FilterAttributes(HtmlNode node)
        {
            // Collect first: the attribute collection is not modified while it is being enumerated.
            List<HtmlAttribute> blacklisted = new List<HtmlAttribute>();

            foreach (HtmlAttribute candidate in node.Attributes)
            {
                if (!_DefaultAttributeBlackList.Contains(candidate.Name))
                {
                    continue;
                }

                blacklisted.Add(candidate);
            }

            foreach (HtmlAttribute doomed in blacklisted)
            {
                node.Attributes.Remove(doomed);
            }

            FilterStyleAttributes(node.Attributes["style"]);
        }
        /// <summary>
        /// Closes the last recorded node that has the current node's name, if it is still open and
        /// no resetter node is found for it.
        /// </summary>
        /// <param name="name">
        /// Name of the tag being fixed. NOTE(review): the lookup is keyed on <c>_currentnode.Name</c>;
        /// callers presumably pass the same name here — confirm.
        /// </param>
        /// <param name="resetters">Names of the tags that act as resetters. Nothing happens when this is <c>null</c>.</param>
        private void FixNestedTag(string name, string[] resetters)
        {
            if (resetters == null)
                return;

            HtmlNode prev = Utilities.GetDictionaryValueOrNull(Lastnodes, _currentnode.Name);

            // if we find a previous unclosed same name node, without a resetter node between, we must close it.
            // Test the node that was just looked up: indexing Lastnodes a second time with a different key
            // (name) could inspect an unrelated node, or fail when that key is not present.
            if (prev == null || prev.Closed)
                return;

            // try to find a resetter node, if found, we do nothing
            if (FindResetterNodes(prev, resetters))
                return;

            // ok we need to close the prev now
            // create a fake closer node
            HtmlNode close = new HtmlNode(prev.NodeType, this, -1);
            close._endnode = close;
            prev.CloseNode(close);
        }
        /// <summary>
        /// Tells whether <c>FindResetterNode</c> yields a node for at least one of the given names.
        /// </summary>
        /// <param name="node">The node passed through to <c>FindResetterNode</c>.</param>
        /// <param name="names">The candidate names, tried in order. May be <c>null</c>.</param>
        /// <returns>true as soon as one name yields a node; false when none does or when <paramref name="names"/> is <c>null</c>.</returns>
        private bool FindResetterNodes(HtmlNode node, string[] names)
        {
            if (names == null)
            {
                return false;
            }

            foreach (string candidate in names)
            {
                if (FindResetterNode(node, candidate) != null)
                {
                    return true;
                }
            }

            return false;
        }
        /// <summary>
        /// Returns the node recorded in <c>Lastnodes</c> under the given name, provided it is still open
        /// and its stream position is not before that of <paramref name="node"/>.
        /// </summary>
        /// <param name="node">The node whose stream position the recorded node is compared against.</param>
        /// <param name="name">The name to look up in <c>Lastnodes</c>.</param>
        /// <returns>The recorded node when all conditions hold; otherwise <c>null</c>.</returns>
        private HtmlNode FindResetterNode(HtmlNode node, string name)
        {
            HtmlNode resetter = Utilities.GetDictionaryValueOrNull(Lastnodes, name);

            bool qualifies = resetter != null
                             && !resetter.Closed
                             && resetter._streamposition >= node._streamposition;

            return qualifies ? resetter : null;
        }
        /// <summary>
        /// Moves <c>_lastparentnode</c> up through its ancestors until it refers to a node that is not closed,
        /// falling back to <c>_documentnode</c> when the chain of parents runs out.
        /// </summary>
        internal void UpdateLastParentNode()
        {
            // The first step dereferences the field without a null check, exactly once.
            if (_lastparentnode.Closed)
            {
                _lastparentnode = _lastparentnode.ParentNode;
            }

            // Keep climbing while there is a parent and it is closed as well.
            while ((_lastparentnode != null) && (_lastparentnode.Closed))
            {
                _lastparentnode = _lastparentnode.ParentNode;
            }

            if (_lastparentnode == null)
            {
                _lastparentnode = _documentnode;
            }
        }
 /// <summary>
 /// Repositions the navigator on the document node of the document being navigated.
 /// </summary>
 public override void MoveToRoot()
 {
     HtmlNode root = _doc.DocumentNode;
     _currentnode = root;
     InternalTrace(null);
 }
 /// <summary>
 /// Puts the navigator on the document node and sets the attribute index to -1.
 /// </summary>
 private void Reset()
 {
     InternalTrace(null);

     HtmlNode root = _doc.DocumentNode;
     _currentnode = root;
     _attindex = -1;
 }
        /// <summary>
        /// Moves this navigator to the position of another navigator.
        /// </summary>
        /// <param name="other">The navigator positioned on the node to move to. It must be an <c>HtmlNodeNavigator</c> over the same document.</param>
        /// <returns>
        /// true when the position was taken over; false when <paramref name="other"/> is not an
        /// <c>HtmlNodeNavigator</c> or navigates a different document, in which case this navigator is unchanged.
        /// </returns>
        public override bool MoveTo(XPathNavigator other)
        {
            HtmlNodeNavigator target = other as HtmlNodeNavigator;
            if (target == null)
            {
                InternalTrace(">false (nav is not an HtmlNodeNavigator)");
                return false;
            }

            InternalTrace("moveto oid=" + target.GetHashCode()
                          + ", n:" + target._currentnode.Name
                          + ", a:" + target._attindex);

            bool sameDocument = target._doc == _doc;
            if (!sameDocument)
            {
                // we don't know how to handle that
                InternalTrace(">false (???)");
                return false;
            }

            _currentnode = target._currentnode;
            _attindex = target._attindex;
            InternalTrace(">true");
            return true;
        }
Exemple #19
0
        /// <summary>
        /// Initializes a node with its type, its owner document and its start index.
        /// </summary>
        /// <remarks>
        /// Comment, document and text nodes get their fixed name and are their own end node from the start.
        /// A node that is still open and has an index other than -1 is registered in the owner document's
        /// <c>Openednodes</c> table when that table exists.
        /// </remarks>
        /// <param name="type">The type of the node.</param>
        /// <param name="ownerdocument">The document that owns the node.</param>
        /// <param name="index">The outer start index of the node, also used as its key in <c>Openednodes</c>; -1 is treated specially.</param>
        public HtmlNode(HtmlNodeType type, HtmlDocument ownerdocument, int index)
        {
            _nodetype = type;
            _ownerdocument = ownerdocument;
            _outerstartindex = index;

            bool isComment = type == HtmlNodeType.Comment;
            bool isText = type == HtmlNodeType.Text;

            if (isComment)
            {
                Name = HtmlNodeTypeNameComment;
                _endnode = this;
            }
            else if (type == HtmlNodeType.Document)
            {
                Name = HtmlNodeTypeNameDocument;
                _endnode = this;
            }
            else if (isText)
            {
                Name = HtmlNodeTypeNameText;
                _endnode = this;
            }

            // we use the index as the key; -1 means the node comes from public
            bool hasSourceIndex = -1 != index;
            if (_ownerdocument.Openednodes != null && !Closed && hasSourceIndex)
            {
                _ownerdocument.Openednodes.Add(index, this);
            }

            // innerhtml and outerhtml must be calculated
            if (!hasSourceIndex && !isComment && !isText)
            {
                SetChanged();
            }
        }
 /// <summary>
 /// Moves the navigator onto the first child of the current node.
 /// </summary>
 /// <returns>true when the current node has child nodes and the move was made; otherwise false.</returns>
 public override bool MoveToFirstChild()
 {
     if (_currentnode.HasChildNodes)
     {
         _currentnode = _currentnode.ChildNodes[0];
         InternalTrace(">true");
         return true;
     }

     InternalTrace(">false");
     return false;
 }
Exemple #21
0
        /// <summary>
        /// Removes the specified child node, optionally keeping its children by moving them into this node.
        /// </summary>
        /// <param name="oldChild">The node being removed. May not be <c>null</c>.</param>
        /// <param name="keepGrandChildren">
        /// true to insert the children of <paramref name="oldChild"/> into this node, in their original order,
        /// starting after the previous sibling of <paramref name="oldChild"/>; false to drop them with it.
        /// </param>
        /// <returns>The node removed.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="oldChild"/> is <c>null</c>.</exception>
        public HtmlNode RemoveChild(HtmlNode oldChild, bool keepGrandChildren)
        {
            if (oldChild == null)
            {
                throw new ArgumentNullException("oldChild");
            }

            if ((oldChild._childnodes != null) && keepGrandChildren)
            {
                // get prev sibling
                HtmlNode prev = oldChild.PreviousSibling;

                // reroute grand children to ourselves
                foreach (HtmlNode grandchild in oldChild._childnodes)
                {
                    InsertAfter(grandchild, prev);

                    // Advance the insertion point. Inserting every grandchild after the same
                    // reference node would leave them in reverse order.
                    prev = grandchild;
                }
            }

            RemoveChild(oldChild);
            SetChanged();
            return oldChild;
        }
 /// <summary>
 /// Moves the navigator onto the node that the document returns for the given id.
 /// </summary>
 /// <param name="id">The id passed to the document's <c>GetElementbyId</c> lookup.</param>
 /// <returns>true when a node was found and the move was made; false when the lookup returned <c>null</c>, in which case the current node is unchanged.</returns>
 public override bool MoveToId(string id)
 {
     InternalTrace("id=" + id);

     HtmlNode match = _doc.GetElementbyId(id);
     bool found = match != null;
     if (found)
     {
         _currentnode = match;
     }

     InternalTrace(found ? ">true" : ">false");
     return found;
 }
Exemple #23
0
 /// <summary>
 /// Writes the attributes of a node to an <c>XmlWriter</c>, using each attribute's <c>XmlName</c> and <c>Value</c>.
 /// </summary>
 /// <param name="writer">The writer receiving the attributes.</param>
 /// <param name="node">The node whose attributes are written. Nothing is written when it has no attributes.</param>
 internal static void WriteAttributes(XmlWriter writer, HtmlNode node)
 {
     if (node.HasAttributes)
     {
         // we use Hashitems to make sure attributes are written only once
         foreach (HtmlAttribute attribute in node.Attributes.Hashitems.Values)
         {
             writer.WriteAttributeString(attribute.XmlName, attribute.Value);
         }
     }
 }
 /// <summary>
 /// Moves the navigator onto the next sibling of the current node.
 /// </summary>
 /// <returns>true when the current node has a next sibling and the move was made; otherwise false, with the position unchanged.</returns>
 public override bool MoveToNext()
 {
     HtmlNode next = _currentnode.NextSibling;
     if (next == null)
     {
         InternalTrace(">false");
         return false;
     }

     // trace shallow clones of the node being left and the node being entered
     InternalTrace("_c=" + _currentnode.CloneNode(false).OuterHtml);
     InternalTrace("_n=" + next.CloneNode(false).OuterHtml);

     _currentnode = next;
     InternalTrace(">true");
     return true;
 }
Exemple #25
0
        /// <summary>
        /// Appends a node to this node's child list and registers its id with the owner document.
        /// </summary>
        /// <param name="newChild">The node to append. May not be <c>null</c>.</param>
        /// <returns><paramref name="newChild"/>.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="newChild"/> is <c>null</c>.</exception>
        public HtmlNode AppendChild(HtmlNode newChild)
        {
            if (newChild == null)
                throw new ArgumentNullException("newChild");

            ChildNodes.Append(newChild);

            _ownerdocument.SetIdForNode(newChild, newChild.GetId());
            SetChanged();

            return newChild;
        }
 /// <summary>
 /// Moves the navigator onto the parent of the current node.
 /// </summary>
 /// <returns>true when the current node has a parent and the move was made; otherwise false.</returns>
 public override bool MoveToParent()
 {
     HtmlNode parent = _currentnode.ParentNode;
     if (parent != null)
     {
         _currentnode = parent;
         InternalTrace(">true");
         return true;
     }

     InternalTrace(">false");
     return false;
 }
Exemple #27
0
        /// <summary>
        /// Copies another node onto this node: this node's attributes are replaced by the other node's
        /// attributes and, for a deep copy, this node's children are replaced by clones of the other node's children.
        /// </summary>
        /// <param name="node">The node to duplicate. May not be <c>null</c>.</param>
        /// <param name="deep">
        /// true to also replace this node's children with recursive clones of the children of
        /// <paramref name="node"/>; false to copy the attributes only and leave this node's children untouched.
        /// </param>
        /// <exception cref="ArgumentNullException"><paramref name="node"/> is <c>null</c>.</exception>
        public void CopyFrom(HtmlNode node, bool deep)
        {
            if (node == null)
            {
                throw new ArgumentNullException("node");
            }

            // Copying a node onto itself must not wipe it: the clearing steps below would
            // otherwise destroy the very attributes and children that are about to be read.
            if (object.ReferenceEquals(node, this))
            {
                return;
            }

            Attributes.RemoveAll();
            if (node.HasAttributes)
            {
                foreach (HtmlAttribute att in node.Attributes)
                {
                    SetAttributeValue(att.Name, att.Value);
                }
            }

            // The subtree is copied for a deep copy only. The previous "!deep" test was inverted:
            // it cloned the children for shallow copies and skipped them for deep ones.
            if (!deep)
            {
                return;
            }

            RemoveAllChildren();
            if (node.HasChildNodes)
            {
                foreach (HtmlNode child in node.ChildNodes)
                {
                    AppendChild(child.CloneNode(true));
                }
            }
        }
 /// <summary>
 /// Moves the navigator onto the previous sibling of the current node.
 /// </summary>
 /// <returns>true when the current node has a previous sibling and the move was made; otherwise false.</returns>
 public override bool MoveToPrevious()
 {
     HtmlNode previous = _currentnode.PreviousSibling;
     if (previous != null)
     {
         _currentnode = previous;
         InternalTrace(">true");
         return true;
     }

     InternalTrace(">false");
     return false;
 }
Exemple #29
0
        /// <summary>
        /// Produces an entitized deep clone of a node. The clone's attributes are entitized when it has any;
        /// its child nodes are entitized when it has any, otherwise its text is entitized if it is a text node.
        /// The node passed in is not modified by this method itself; the work is done on the clone.
        /// </summary>
        /// <param name="node">The node to clone and entitize. May not be <c>null</c>.</param>
        /// <returns>The entitized clone.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="node"/> is <c>null</c>.</exception>
        public static HtmlNode Entitize(HtmlNode node)
        {
            if (node == null)
                throw new ArgumentNullException("node");

            HtmlNode clone = node.CloneNode(true);

            if (clone.HasAttributes)
            {
                Entitize(clone.Attributes);
            }

            if (clone.HasChildNodes)
            {
                Entitize(clone.ChildNodes);
            }
            else if (clone.NodeType == HtmlNodeType.Text)
            {
                HtmlTextNode textNode = (HtmlTextNode)clone;
                textNode.Text = Entitize(textNode.Text, true, true);
            }

            return clone;
        }
        /// <summary>
        /// Reads the whole content of a TextReader, parses it as HTML into this document and,
        /// when syntax checking is enabled, reports every start tag that was left open.
        /// </summary>
        /// <param name="reader">The TextReader that supplies the HTML text. May not be null.</param>
        /// <exception cref="ArgumentNullException"><paramref name="reader"/> is <c>null</c>.</exception>
        public void Load(TextReader reader)
        {
            // all Load methods pass down to this one
            if (reader == null)
            {
                throw new ArgumentNullException("reader");
            }

            _onlyDetectEncoding = false;

            // each bookkeeping table exists only while its option is switched on
            Openednodes = OptionCheckSyntax ? new Dictionary<int, HtmlNode>() : null;
            Nodesid = OptionUseIdAttribute ? new Dictionary<string, HtmlNode>() : null;

            StreamReader streamReader = reader as StreamReader;
            if (streamReader == null)
            {
                _streamencoding = null;
            }
            else
            {
                try
                {
                    // trigger bom read if needed
                    streamReader.Peek();
                }
                // ReSharper disable EmptyGeneralCatchClause
                catch (Exception)
                // ReSharper restore EmptyGeneralCatchClause
                {
                    // void on purpose
                }

                _streamencoding = streamReader.CurrentEncoding;
            }

            _declaredencoding = null;

            Text = reader.ReadToEnd();
            _documentnode = CreateNode(HtmlNodeType.Document, 0);
            Parse();

            if (OptionCheckSyntax && Openednodes != null)
            {
                // whatever is still registered as open after parsing never got its end tag
                foreach (HtmlNode unclosed in Openednodes.Values)
                {
                    if (unclosed._starttag) // the others were already reported
                    {
                        string snippet = string.Empty;
                        if (OptionExtractErrorSourceText)
                        {
                            snippet = unclosed.OuterHtml;
                            if (snippet.Length > OptionExtractErrorSourceTextMaxLength)
                            {
                                snippet = snippet.Substring(0, OptionExtractErrorSourceTextMaxLength);
                            }
                        }

                        AddError(
                            HtmlParseErrorCode.TagNotClosed,
                            unclosed._line, unclosed._lineposition,
                            unclosed._streamposition, snippet,
                            "End tag </" + unclosed.Name + "> was not found");
                    }
                }

                // we don't need this anymore
                Openednodes.Clear();
            }
        }