// Parses the "mshome.html" sample with the DOM parser returned by
// ParserFactory.CreateDom and verifies selected nodes and attributes,
// first through XPath queries and then through positional child access.
public void TestParseHtml()
{
    string htmlPath = Path.GetFullPath(TestDataSample.GetHtmlPath("mshome.html"));
    ParserContext parserContext = new ParserContext(htmlPath);
    IDomParser domParser = (IDomParser)ParserFactory.CreateDom(parserContext);
    ToxyDom dom = domParser.Parse();

    // XPath lookups starting from the root node.
    List<ToxyNode> metaNodes = dom.Root.SelectNodes("//meta");
    Assert.AreEqual(7, metaNodes.Count);

    ToxyNode anchor = dom.Root.SingleSelect("//a");
    Assert.AreEqual(1, anchor.Attributes.Count);
    var hrefAttribute = anchor.Attributes[0];
    Assert.AreEqual("href", hrefAttribute.Name);
    Assert.AreEqual("http://www.microsoft.com/en/us/default.aspx?redir=true", hrefAttribute.Value);

    // Positional lookups: both nodes below share the same parent
    // (first child of the first child of the root).
    ToxyNode sharedParent = dom.Root.ChildrenNodes[0].ChildrenNodes[0];

    ToxyNode title = sharedParent.ChildrenNodes[0];
    Assert.AreEqual("title", title.Name);
    Assert.AreEqual("Microsoft Corporation", title.ChildrenNodes[0].InnerText);

    ToxyNode meta = sharedParent.ChildrenNodes[7];
    Assert.AreEqual("meta", meta.Name);
    Assert.AreEqual(3, meta.Attributes.Count);

    var firstAttribute = meta.Attributes[0];
    Assert.AreEqual("name", firstAttribute.Name);
    Assert.AreEqual("SearchDescription", firstAttribute.Value);

    var thirdAttribute = meta.Attributes[2];
    Assert.AreEqual("scheme", thirdAttribute.Name);
    Assert.AreEqual(string.Empty, thirdAttribute.Value);
}
/// <summary>
/// Recursively mirrors the children of an XML node under a Toxy node.
/// </summary>
/// <param name="tnode">Toxy node that receives the converted children.</param>
/// <param name="ele">XML node whose child nodes are converted.</param>
void AppendChildren(ToxyNode tnode, XmlNode ele)
{
    if (ele.ChildNodes.Count > 0)
    {
        foreach (XmlNode xmlChild in ele.ChildNodes)
        {
            // Attach the converted child first, then descend into its own children.
            ToxyNode converted = ConvertToToxyNode(xmlChild);
            tnode.ChildrenNodes.Add(converted);
            AppendChildren(converted, xmlChild);
        }
    }
}
/// <summary>
/// Converts every child of <paramref name="ele"/> into a Toxy node, adds it to
/// <paramref name="tnode"/>, and repeats the process for each child's subtree.
/// </summary>
/// <param name="tnode">Destination node for the converted children.</param>
/// <param name="ele">Source XML node.</param>
void AppendChildren(ToxyNode tnode, XmlNode ele)
{
    XmlNodeList xmlChildren = ele.ChildNodes;

    // Nothing to copy for a leaf node.
    if (xmlChildren.Count == 0)
    {
        return;
    }

    foreach (XmlNode source in xmlChildren)
    {
        ToxyNode target = ConvertToToxyNode(source);
        tnode.ChildrenNodes.Add(target);
        AppendChildren(target, source);
    }
}
/// <summary>
/// Builds a Toxy node from a single XML node, without its children.
/// </summary>
/// <param name="ele">XML node to convert.</param>
/// <returns>
/// A node carrying the XML node's name. A "#text" node also gets the inner text;
/// any other node gets a copy of the attributes, when the XML node exposes them.
/// </returns>
ToxyNode ConvertToToxyNode(XmlNode ele)
{
    ToxyNode result = new ToxyNode { Name = ele.Name };

    if (ele.Name == "#text")
    {
        // Text nodes only carry their text; attributes are not copied.
        result.Text = ele.InnerText;
    }
    else if (ele.Attributes != null)
    {
        foreach (XmlAttribute xmlAttribute in ele.Attributes)
        {
            result.Attributes.Add(new ToxyAttribute(xmlAttribute.Name, xmlAttribute.Value));
        }
    }

    return result;
}
/// <summary>
/// Loads the XML file at <c>Context.Path</c> and converts it into a Toxy DOM
/// rooted at the document element.
/// </summary>
/// <returns>The DOM whose root corresponds to the XML document element.</returns>
/// <exception cref="FileNotFoundException">The file at <c>Context.Path</c> does not exist.</exception>
public ToxyDom Parse()
{
    string path = Context.Path;
    if (!File.Exists(path))
    {
        throw new FileNotFoundException("File " + path + " is not found");
    }

    XmlDocument xml = new XmlDocument();
    xml.Load(path);

    // Convert the document element itself, then fill in its descendants.
    XmlElement documentElement = xml.DocumentElement;
    ToxyNode root = ConvertToToxyNode(documentElement);
    AppendChildren(root, documentElement);

    return new ToxyDom { Root = root };
}
/// <summary>
/// Creates the Toxy counterpart of one XML node; child nodes are not processed here.
/// </summary>
/// <param name="ele">XML node to translate.</param>
/// <returns>
/// A node named after <paramref name="ele"/>, holding either its inner text
/// (for "#text" nodes) or copies of its attributes (when it has an attribute collection).
/// </returns>
ToxyNode ConvertToToxyNode(XmlNode ele)
{
    string nodeName = ele.Name;

    ToxyNode converted = new ToxyNode();
    converted.Name = nodeName;

    // Text nodes: keep the text and stop, attributes are not inspected.
    if (nodeName == "#text")
    {
        converted.Text = ele.InnerText;
        return converted;
    }

    XmlAttributeCollection xmlAttributes = ele.Attributes;
    if (xmlAttributes == null)
    {
        return converted;
    }

    foreach (XmlAttribute item in xmlAttributes)
    {
        ToxyAttribute copy = new ToxyAttribute(item.Name, item.Value);
        converted.Attributes.Add(copy);
    }

    return converted;
}
/// <summary>
/// Recursively adds the children of a Toxy node as child entries of a tree node.
/// </summary>
/// <param name="node">Tree node that receives one new child per Toxy child.</param>
/// <param name="tnode">Toxy node whose children are displayed.</param>
void AppendTree(TreeNode node, ToxyNode tnode)
{
    var toxyChildren = tnode.ChildrenNodes;
    if (toxyChildren != null && toxyChildren.Count != 0)
    {
        foreach (var toxyChild in toxyChildren)
        {
            // "#text" nodes are labelled with their text, all others with NodeString.
            string label = toxyChild.Name == "#text" ? toxyChild.Text : toxyChild.NodeString;
            TreeNode added = node.Nodes.Add(label);
            AppendTree(added, toxyChild);
        }
    }
}
/// <summary>
/// Parses the HTML document at <c>Context.Path</c> into a Toxy DOM.
/// Note: <c>Context.Path</c> must be an absolute path, not a relative one.
/// </summary>
/// <returns>A DOM whose root corresponds to the HTML document node.</returns>
/// <exception cref="FileNotFoundException">The file at <c>Context.Path</c> does not exist.</exception>
public ToxyDom Parse()
{
    if (!File.Exists(Context.Path))
    {
        throw new FileNotFoundException("File " + Context.Path + " is not found");
    }

    HtmlWeb web = new HtmlWeb();
    HtmlDocument document = web.Load(Context.Path);
    HtmlNode documentNode = document.DocumentNode;
    ToxyNode root = ToxyNode.TransformHtmlNodeToToxyNode(documentNode);

    // Breadth-first walk: each queue entry pairs an HTML node with the Toxy node
    // already created for it, so that its children can be attached on dequeue.
    var pending = new Queue<KeyValuePair<HtmlNode, ToxyNode>>();
    pending.Enqueue(new KeyValuePair<HtmlNode, ToxyNode>(documentNode, root));

    while (pending.Count != 0)
    {
        KeyValuePair<HtmlNode, ToxyNode> current = pending.Dequeue();
        HtmlNode htmlParent = current.Key;
        ToxyNode toxyParent = current.Value;

        foreach (HtmlNode htmlChild in htmlParent.ChildNodes)
        {
            ToxyNode toxyChild = ToxyNode.TransformHtmlNodeToToxyNode(htmlChild);

            // Text nodes additionally carry their inner text.
            if (htmlChild.Name == "#text")
            {
                toxyChild.Text = htmlChild.InnerText;
            }

            toxyParent.ChildrenNodes.Add(toxyChild);
            pending.Enqueue(new KeyValuePair<HtmlNode, ToxyNode>(htmlChild, toxyChild));
        }
    }

    ToxyDom dom = new ToxyDom();
    dom.Root = root;
    return dom;
}