/// <summary>
/// Recursively walks <paramref name="nodes"/> and adds a report item for every
/// <c>img</c> element that lacks an <c>alt</c>, <c>height</c> or <c>width</c> attribute.
/// </summary>
/// <param name="doc">Document being checked; its <c>HRef</c> is passed to every report item.</param>
/// <param name="nodes">Nodes to inspect; the children of each element are visited recursively.</param>
private void CheckImageAlt(CHtmlDocument doc, CHtmlNodeCollection nodes)
{
    foreach (CHtmlNode node in nodes)
    {
        CHtmlElement element = node as CHtmlElement;
        if (element == null)
        {
            // Only elements can be <img> tags or have child nodes worth visiting.
            continue;
        }

        // OrdinalIgnoreCase already ignores case, so no lowercasing is needed first.
        // A culture-sensitive ToLower() could even break the match (e.g. 'I' under tr-TR).
        if ("img".Equals(element.Name, StringComparison.OrdinalIgnoreCase))
        {
            if (element.Attributes.HasAttribute("alt") == false)
            {
                // TODO: write the reporter module.
                AddReportItem(doc.HRef, element.HTML, "[" + 1001 + "] img태그에 Alt가 없습니다.");
            }

            if (element.Attributes.HasAttribute("height") == false)
            {
                AddReportItem(doc.HRef, element.HTML, "[" + 1002 + "] img태그에 Height속성이 없습니다.");
            }

            if (element.Attributes.HasAttribute("width") == false)
            {
                AddReportItem(doc.HRef, element.HTML, "[" + 1003 + "] img태그에 Width속성이 없습니다.");
            }
        }

        if (element.Nodes.Count > 0)
        {
            CheckImageAlt(doc, element.Nodes);
        }
    }
}
/////////////////////////////////////////////////////////////////////////////////
/// <summary>
/// Constructs a new HTML element with the specified tag name. The name is stored
/// trimmed and lower-cased using the invariant culture.
/// </summary>
/// <param name="name">Tag name; must not be null (debug builds also assert that it
/// contains no white-space characters).</param>
/// <exception cref="System.ArgumentNullException"><paramref name="name"/> is null.</exception>
public CHtmlElement(string name)
{
    System.Diagnostics.Debug.Assert(name != null && CHtmlUtil.ExistWhiteSpaceChar(name) == false);

    // Debug.Assert is compiled out of release builds; fail with a descriptive
    // exception instead of a NullReferenceException from name.Trim().
    if (name == null)
    {
        throw new System.ArgumentNullException("name");
    }

    m_nodes = new CHtmlNodeCollection(this);

    // Tag names are compared ordinally against lower-case literals elsewhere, so the
    // lowering must not depend on the current culture (tr-TR maps 'I' to dotless 'ı').
    m_name = name.Trim().ToLowerInvariant();
}
/////////////////////////////////////////////////////////////////////////////////
/// <summary>
/// Appends to <paramref name="result"/> every node of this collection whose
/// <c>NodeName</c> equals the trimmed, lower-cased <paramref name="name"/>.
/// </summary>
/// <param name="result">Collection that receives the matching nodes.</param>
/// <param name="name">Node name to look for.</param>
/// <param name="searchChildren">True to also search the child nodes of every element, recursively.</param>
public void FindByName(CHtmlNodeCollection result, string name, bool searchChildren)
{
    System.Diagnostics.Debug.Assert(result != null);
    System.Diagnostics.Debug.Assert(name != null);

    string wanted = name.Trim().ToLower();
    int total = m_nodeList.Count;

    for (int i = 0; i < total; i++)
    {
        CHtmlNode current = m_nodeList[i];

        if (current.NodeName.Equals(wanted))
        {
            result.Add(current);
        }

        if (!searchChildren)
        {
            continue;
        }

        // Descend only into elements; other node kinds are not searched for children here.
        CHtmlElement child = current as CHtmlElement;
        if (child != null)
        {
            child.Nodes.FindByName(result, wanted, searchChildren);
        }
    }
}
/////////////////////////////////////////////////////////////////////////////////
/// <summary>
/// Replaces the current node list with the result of parsing <paramref name="html"/>.
/// If no charset has been set yet, it is taken from <c>DetectCharset</c>, falling back
/// to <c>Encoding.Unicode</c>. Finally, <c>docBase</c> is set from the <c>href</c> of
/// the last <c>base</c> node found, when that attribute exists.
/// </summary>
/// <param name="html">HTML text to parse.</param>
public virtual void LoadHtml(string html)
{
    System.Diagnostics.Debug.Assert(html != null);

    m_nodeList.Clear();
    m_parser.Parse(html, m_nodeList);

    if (m_charset == null)
    {
        m_charset = DetectCharset(m_nodeList) ?? Encoding.Unicode;
    }

    CHtmlNodeCollection bases = m_nodeList.FindByName("base", true);
    if (bases == null || bases.Count == 0)
    {
        return;
    }

    // Only the last <base> node found is consulted.
    CHtmlElement lastBase = (CHtmlElement)bases[bases.Count - 1];
    CHtmlAttribute href = lastBase.Attributes["href"];
    if (href != null)
    {
        this.docBase = href.Value;
    }
}
///////////////////////////////////////////////////////////////////////////////
/// <summary>
/// Copy constructor: takes the name of <paramref name="obj"/> and adds a clone of
/// each of its child nodes and attributes to this element.
/// </summary>
/// <param name="obj">Element to copy.</param>
public CHtmlElement(CHtmlElement obj)
    : base(obj)
{
    System.Diagnostics.Debug.Assert(obj != null);
    obj.AssertValid();

    m_name = obj.m_name;
    m_nodes = new CHtmlNodeCollection(this);

    // Clone the child nodes, reserving room for all of them up front.
    int childCount = obj.m_nodes.Count;
    m_nodes.Capacity = childCount;
    int childIndex = 0;
    while (childIndex < childCount)
    {
        m_nodes.Add((CHtmlNode)obj.m_nodes[childIndex].Clone());
        childIndex++;
    }

    // Clone the attributes the same way.
    int attributeCount = obj.m_attributes.Count;
    m_attributes.Capacity = attributeCount;
    int attributeIndex = 0;
    while (attributeIndex < attributeCount)
    {
        m_attributes.Add((CHtmlAttribute)obj.m_attributes[attributeIndex].Clone());
        attributeIndex++;
    }
}
/// <summary>
/// Recursively collects into <paramref name="links"/> every <c>a</c>, <c>link</c> or
/// <c>frame</c> element that has an <c>href</c> attribute and every <c>script</c>
/// element that has a <c>src</c> attribute.
/// </summary>
/// <param name="parentNodes">Nodes to scan; a null or empty collection is ignored.</param>
/// <param name="links">Collection that receives the matching element nodes.</param>
private void FindLink(CHtmlNodeCollection parentNodes, ref CHtmlNodeCollection links)
{
    if (parentNodes == null || parentNodes.Count == 0)
    {
        return;
    }

    foreach (CHtmlNode node in parentNodes)
    {
        // 'as' yields null for null entries and for non-element nodes alike, so this
        // single test replaces the former 'is' check and the unreachable null check
        // that followed it.
        CHtmlElement element = node as CHtmlElement;
        if (element == null)
        {
            continue;
        }

        CHtmlAttribute attr = null;
        switch (element.Name.Trim().ToLower())
        {
            case "a":
            case "link":
            case "frame":
                // NOTE(review): HTML <frame> elements carry their URL in "src", not
                // "href", so this case probably never matches real frames. Left as is
                // because consumers of 'links' are not visible here - confirm.
                attr = element.Attributes["href"];
                break;

            case "script":
                attr = element.Attributes["src"];
                break;
        }

        if (attr != null)
        {
            links.Add(node);
        }

        if (element.Nodes.Count > 0)
        {
            FindLink(element.Nodes, ref links);
        }
    }
}
/////////////////////////////////////////////////////////////////////////////////
/// <summary>
/// Looks for a charset declaration in the <c>content</c> attribute of the <c>meta</c>
/// nodes directly under <c>html/head</c> and returns the first one that resolves to
/// a known encoding.
/// </summary>
/// <param name="nodes">Top-level nodes of the parsed document.</param>
/// <returns>The first encoding that could be resolved, or null when none is declared
/// or none of the declared names is recognised.</returns>
private Encoding DetectCharset(CHtmlNodeCollection nodes)
{
    const string charsetKey = "charset";
    CHtmlNodeCollection metaNodes = new CHtmlNodeCollection();

    CHtmlElement node = nodes["html"] as CHtmlElement;
    if (node != null)
    {
        node = node.Nodes["head"] as CHtmlElement;
    }
    if (node != null)
    {
        node.Nodes.FindByNameAttribute(metaNodes, "meta", "content", false);
    }

    for (int nodeIndex = 0, count = metaNodes.Count; nodeIndex < count; ++nodeIndex)
    {
        CHtmlElement metaElement = metaNodes[nodeIndex] as CHtmlElement;
        if (metaElement == null)
        {
            continue;
        }

        CHtmlAttributeCollection attributes = metaElement.Attributes.FindByName("content");
        for (int attributeIndex = 0, attributeCount = attributes.Count; attributeIndex < attributeCount; ++attributeIndex)
        {
            string value = attributes[attributeIndex].Value;

            // Ordinal, case-insensitive: "CHARSET=..." is accepted and the search does
            // not depend on the current culture.
            int index = value.IndexOf(charsetKey, StringComparison.OrdinalIgnoreCase);
            if (index == -1)
            {
                continue;
            }

            // Skip the separators between the keyword and the charset name.
            // (EqualesOfAnyChar is presumed to test membership of the char in the given set.)
            int startIndex = index + charsetKey.Length;
            while (startIndex < value.Length && CHtmlUtil.EqualesOfAnyChar(value[startIndex], " =\"'"))
            {
                ++startIndex;
            }

            // The name ends at a space, a ';' or a quote so that values such as
            // "text/html; charset=utf-8;" do not drag the delimiter into the name.
            int endIndex = startIndex;
            while (endIndex < value.Length && !CHtmlUtil.EqualesOfAnyChar(value[endIndex], " ;\"'"))
            {
                ++endIndex;
            }

            if (endIndex > startIndex)
            {
                try
                {
                    // Return at once: the first valid declaration wins. Previously only
                    // the attribute loop was left, so a later <meta> could override it.
                    return Encoding.GetEncoding(value.Substring(startIndex, endIndex - startIndex));
                }
                catch (ArgumentException)
                {
                    // Unknown or unsupported charset name; keep looking.
                }
            }
        }
    }

    return null;
}
/////////////////////////////////////////////////////////////////////////////////
/// <summary>
/// Collects the nodes of this collection whose <c>NodeName</c> equals
/// <paramref name="name"/> (after trimming and lower-casing it) into
/// <paramref name="result"/>.
/// </summary>
/// <param name="result">Receives the matching nodes.</param>
/// <param name="name">Node name to match.</param>
/// <param name="searchChildren">When true, the child nodes of each element are searched recursively as well.</param>
public void FindByName(CHtmlNodeCollection result, string name, bool searchChildren)
{
    System.Diagnostics.Debug.Assert(result != null);
    System.Diagnostics.Debug.Assert(name != null);

    string target = name.Trim().ToLower();

    int position = 0;
    int nodeCount = m_nodeList.Count;
    while (position < nodeCount)
    {
        CHtmlNode candidate = m_nodeList[position];
        position++;

        if (candidate.NodeName.Equals(target))
        {
            result.Add(candidate);
        }

        // A matching node is still descended into when it is an element.
        CHtmlElement asElement = searchChildren ? candidate as CHtmlElement : null;
        if (asElement != null)
        {
            asElement.Nodes.FindByName(result, target, searchChildren);
        }
    }
}
/////////////////////////////////////////////////////////////////////////////////
/// <summary>
/// Builds a <c>CHtmlElement</c> from the tag-name token at <paramref name="visitor"/>,
/// resolves its attributes, adds it to the current level and advances
/// <paramref name="visitor"/> past the token that ends the tag.
/// </summary>
/// <param name="visitor">Index into <c>m_tokens</c>; must point at a TagName token on
/// entry and is advanced past the tag's closing token on exit.</param>
/// <param name="nodeIDCount">Running node counter; incremented and assigned to the
/// new element as its <c>NodeID</c>.</param>
private void ResolveElement(ref int visitor, ref int nodeIDCount)
{
    System.Diagnostics.Debug.Assert(m_tokens[visitor].Type == Token.TokenType.TagName);

    CHtmlElement element = new CHtmlElement(m_tokens[visitor].Content);
    element.m_close = false;
    ++nodeIDCount;
    element.NodeID = nodeIDCount;
    ++visitor;

    // ResolveAttribute receives visitor by ref; the assert below expects it to be
    // left on the token that ends the tag.
    ResolveAttribute(ref visitor, element.Attributes);
    System.Diagnostics.Debug.Assert(m_tokens[visitor].Type == Token.TokenType.TagEnd || m_tokens[visitor].Type == Token.TokenType.TagCloseEnd);

    if(m_tokens[visitor].Type == Token.TokenType.TagEnd) // tag written as <tag>
    {
        switch(element.Name)
        {
            // Tag names handled as having no content: the element is closed right
            // away and the current level does not descend into it.
            case "area":
            case "bgsound":
            case "base":
            case "br":
            case "basefont":
            case "col":
            case "embed":
            case "frame":
            case "hr":
            case "img":
            case "isindex":
            case "input":
            case "keygen":
            case "link":
            case "meta":
            case "nextid":
            case "option":
            case "param":
            case "sound":
            case "spacer":
            case "wbr":
                element.TerminatedType = CHtmlElement.EndTagType.NonTerminated;
                CloseElement(element);
                if(ElementCreatedEvent != null)
                    ElementCreatedEvent(element);
                m_currentLevel.Add(element);
                break;

            default:
                {
                    // Remember the element previously registered under this name, then
                    // register this one as the latest open element of that name.
                    element.m_previousWithSameNameNode = (CHtmlElement)m_lastOpenNodes[element.Name];
                    m_lastOpenNodes[element.Name] = element;
                    if(ElementCreatedEvent != null)
                        ElementCreatedEvent(element);
                    m_currentLevel.Add(element);
                    // Nodes created from now on become children of this element.
                    m_currentLevel = element.Nodes;
                }
                break;
        }
    }
    else // tag written as <tag/>
    {
        element.TerminatedType = CHtmlElement.EndTagType.Terminated;
        CloseElement(element);
        if(ElementCreatedEvent != null)
            ElementCreatedEvent(element);
        m_currentLevel.Add(element);
    }

    // Step over the TagEnd / TagCloseEnd token.
    ++visitor;
}
/////////////////////////////////////////////////////////////////////////////////
/// <summary>
/// Parses a string containing HTML into a newly created node collection.
/// </summary>
/// <param name="html">The HTML to be parsed.</param>
/// <returns>A new collection that the two-argument <c>Parse</c> overload has filled.</returns>
public CHtmlNodeCollection Parse(string html)
{
    System.Diagnostics.Debug.Assert(html != null);

    CHtmlNodeCollection nodes = new CHtmlNodeCollection();
    this.Parse(html, nodes);
    return nodes;
}
/////////////////////////////////////////////////////////////////////////////////
/// <summary>
/// Parses a string containing HTML, with <paramref name="result"/> as the output
/// collection. The parser's working state is reset afterwards, even when tokenizing
/// or resolving throws.
/// </summary>
/// <param name="html">The HTML to be parsed.</param>
/// <param name="result">Collection used as the root level while resolving tokens.</param>
public void Parse(string html, CHtmlNodeCollection result)
{
    System.Diagnostics.Debug.Assert(html != null);
    System.Diagnostics.Debug.Assert(result != null);

    m_input = html;
    m_result = result;
    m_currentLevel = m_result;

    try
    {
        ParseTokens();
        ResolveTokens();
    }
    finally
    {
        // Always drop the per-parse state: without this, an exception would leave
        // stale entries in m_lastOpenNodes (and references to the input and result)
        // behind for the next Parse call on the same parser instance.
        m_result = null;
        m_input = null;
        m_currentLevel = null;
        m_lastOpenNodes.Clear();
    }
}
/////////////////////////////////////////////////////////////////////////////////
/// <summary>
/// Appends every node of this collection to <paramref name="result"/>, each one
/// immediately followed by the descendants of that node when it is an element.
/// </summary>
/// <param name="result">Collection that receives the nodes.</param>
public void GetDescendent(CHtmlNodeCollection result)
{
    System.Diagnostics.Debug.Assert(result != null);

    int total = m_nodeList.Count;
    int position = 0;
    while (position < total)
    {
        CHtmlNode current = m_nodeList[position];
        position++;

        result.Add(current);

        CHtmlElement asElement = current as CHtmlElement;
        if (asElement != null)
        {
            asElement.Nodes.GetDescendent(result);
        }
    }
}
/////////////////////////////////////////////////////////////////////////////////
/// <summary>
/// Collects the nodes of this collection and all of their descendants into a new
/// collection.
/// </summary>
/// <returns>A new collection filled by the <c>GetDescendent</c> overload that takes
/// a result collection.</returns>
public CHtmlNodeCollection GetDescendent()
{
    CHtmlNodeCollection descendants = new CHtmlNodeCollection(64);
    this.GetDescendent(descendants);
    return descendants;
}
/////////////////////////////////////////////////////////////////////////////////
/// <summary>
/// Appends to <paramref name="result"/> every node named <paramref name="name"/> that
/// has an attribute <paramref name="attributeName"/> whose value equals
/// <paramref name="attributeValue"/>. Both names are trimmed and lower-cased before
/// matching; the attribute value is compared as given.
/// </summary>
/// <param name="result">Collection that receives the matching nodes.</param>
/// <param name="name">Node name to match; a null name matches nothing.</param>
/// <param name="attributeName">Name of the attribute to inspect.</param>
/// <param name="attributeValue">Value the attribute must have.</param>
/// <param name="searchChildren">True to also search the child nodes of every element, recursively.</param>
public void FindByNameAttributeValue(CHtmlNodeCollection result, string name, string attributeName, string attributeValue, bool searchChildren)
{
    System.Diagnostics.Debug.Assert(result != null);
    System.Diagnostics.Debug.Assert(attributeName != null);

    // Normalize the node name the same way attributeName is normalized here; without
    // it, a query such as "META" or " meta " is compared verbatim against NodeName.
    // A null name is tolerated and simply matches no node.
    if (name != null)
    {
        name = name.Trim().ToLower();
    }
    attributeName = attributeName.Trim().ToLower();

    for (int index = 0, count = m_nodeList.Count; index < count; ++index)
    {
        CHtmlNode node = m_nodeList[index];

        if (node.NodeName.Equals(name) && node is IHtmlNodeHasAttribute)
        {
            CHtmlAttribute attribute = ((IHtmlNodeHasAttribute)node).Attributes[attributeName];
            if (attribute != null && attribute.Value == attributeValue)
            {
                result.Add(node);
            }
        }

        if (searchChildren && node is CHtmlElement)
        {
            ((CHtmlElement)node).Nodes.FindByNameAttributeValue(result, name, attributeName, attributeValue, searchChildren);
        }
    }
}
/////////////////////////////////////////////////////////////////////////////////
/// <summary>
/// Appends to <paramref name="result"/> the nodes whose <c>NodeName</c> equals the
/// trimmed, lower-cased <paramref name="name"/>, descending at most
/// <paramref name="depth"/> levels.
/// </summary>
/// <param name="result">Collection that receives the matching nodes.</param>
/// <param name="name">Node name to match.</param>
/// <param name="depth">Number of levels to search: 0 searches nothing, 1 searches only
/// this collection, larger values also search child elements.</param>
public void FindByName(CHtmlNodeCollection result, string name, int depth)
{
    System.Diagnostics.Debug.Assert(result != null);
    System.Diagnostics.Debug.Assert(name != null);
    System.Diagnostics.Debug.Assert(depth >= 0);

    if (depth <= 0)
    {
        return;
    }

    string wanted = name.Trim().ToLower();
    bool descend = depth > 1;
    int total = m_nodeList.Count;

    for (int i = 0; i < total; i++)
    {
        CHtmlNode current = m_nodeList[i];

        if (current.NodeName.Equals(wanted))
        {
            result.Add(current);
        }

        if (!descend)
        {
            continue;
        }

        CHtmlElement child = current as CHtmlElement;
        if (child != null)
        {
            child.Nodes.FindByName(result, wanted, depth - 1);
        }
    }
}
/////////////////////////////////////////////////////////////////////////////////
/// <summary>
/// Consumes an end tag (TagCloseBegin, TagName, TagEnd tokens). When an element is
/// registered in <c>m_lastOpenNodes</c> under the tag name, it is marked explicitly
/// terminated and closed, and the current level moves to its parent's nodes (or to
/// the root result when it has no parent). Otherwise the end tag is ignored.
/// </summary>
/// <param name="visitor">Index into <c>m_tokens</c>; must point at the TagCloseBegin
/// token on entry and is advanced past the TagEnd token on exit.</param>
private void ResolveEndTag(ref int visitor)
{
    System.Diagnostics.Debug.Assert(m_tokens[visitor].Type == Token.TokenType.TagCloseBegin);
    visitor++;

    System.Diagnostics.Debug.Assert(m_tokens[visitor].Type == Token.TokenType.TagName);
    string closingName = m_tokens[visitor].Content;
    CHtmlElement target = (CHtmlElement)m_lastOpenNodes[closingName];

    // "000<b>111<a>222</b>333</a>444" will be transformed into
    // "000<b>111<a>222</a></b>333444"; the trailing "</a>" is ignored.
    // An end tag without a matching open tag is skipped as well.
    if (target != null)
    {
        target.TerminatedType = CHtmlElement.EndTagType.ExplicitlyTerminated;
        CloseElement(target);

        if (target.Parent == null)
        {
            m_currentLevel = m_result;
        }
        else
        {
            m_currentLevel = target.Parent.Nodes;
        }
    }

    visitor++;
    System.Diagnostics.Debug.Assert(m_tokens[visitor].Type == Token.TokenType.TagEnd);
    visitor++;
}
/////////////////////////////////////////////////////////////////////////////////
#region
/////////////////////////////////////////////////////////////////////////////////
/// <summary>
/// Looks for a charset declaration in the <c>content</c> attribute of the <c>meta</c>
/// nodes directly under <c>html/head</c> and returns the first one that resolves to
/// a known encoding.
/// </summary>
/// <param name="nodes">Top-level nodes of the parsed document.</param>
/// <returns>The first encoding that could be resolved, or null when none is declared
/// or none of the declared names is recognised.</returns>
private Encoding DetectCharset(CHtmlNodeCollection nodes)
{
    const string charsetKey = "charset";
    CHtmlNodeCollection metaNodes = new CHtmlNodeCollection();

    CHtmlElement node = nodes["html"] as CHtmlElement;
    if (node != null)
    {
        node = node.Nodes["head"] as CHtmlElement;
    }
    if (node != null)
    {
        node.Nodes.FindByNameAttribute(metaNodes, "meta", "content", false);
    }

    for (int nodeIndex = 0, count = metaNodes.Count; nodeIndex < count; ++nodeIndex)
    {
        CHtmlElement metaElement = metaNodes[nodeIndex] as CHtmlElement;
        if (metaElement == null)
        {
            continue;
        }

        CHtmlAttributeCollection attributes = metaElement.Attributes.FindByName("content");
        for (int attributeIndex = 0, attributeCount = attributes.Count; attributeIndex < attributeCount; ++attributeIndex)
        {
            string value = attributes[attributeIndex].Value;

            // Ordinal, case-insensitive: "CHARSET=..." is accepted and the search does
            // not depend on the current culture.
            int index = value.IndexOf(charsetKey, StringComparison.OrdinalIgnoreCase);
            if (index == -1)
            {
                continue;
            }

            // Skip the separators between the keyword and the charset name.
            // (EqualesOfAnyChar is presumed to test membership of the char in the given set.)
            int startIndex = index + charsetKey.Length;
            while (startIndex < value.Length && CHtmlUtil.EqualesOfAnyChar(value[startIndex], " =\"'"))
            {
                ++startIndex;
            }

            // The name ends at a space, a ';' or a quote so that values such as
            // "text/html; charset=utf-8;" do not drag the delimiter into the name.
            int endIndex = startIndex;
            while (endIndex < value.Length && !CHtmlUtil.EqualesOfAnyChar(value[endIndex], " ;\"'"))
            {
                ++endIndex;
            }

            if (endIndex > startIndex)
            {
                try
                {
                    // Return at once: the first valid declaration wins. Previously only
                    // the attribute loop was left, so a later <meta> could override it.
                    return Encoding.GetEncoding(value.Substring(startIndex, endIndex - startIndex));
                }
                catch (ArgumentException)
                {
                    // Unknown or unsupported charset name; keep looking.
                }
            }
        }
    }

    return null;
}
/////////////////////////////////////////////////////////////////////////////////
/// <summary>
/// For each name in <paramref name="targetElements"/> that has an element registered
/// in <c>m_lastOpenNodes</c>, closes that element and moves the current level to its
/// parent's nodes (or to the root result when it has no parent), unless a still-open
/// bound element with a higher <c>NodeID</c> is registered.
/// </summary>
/// <param name="targetElements">Names of the elements to close.</param>
/// <param name="boundElements">Names of the elements that, when still open and having
/// a higher <c>NodeID</c> than the target, prevent the target from being closed.</param>
private void ResolveEndTagOptionalElement(string[] targetElements, string[] boundElements)
{
    System.Diagnostics.Debug.Assert(targetElements != null);
    System.Diagnostics.Debug.Assert(boundElements != null);

    for (int targetIndex = 0, targetCount = targetElements.Length; targetIndex < targetCount; ++targetIndex)
    {
        // Find the element last registered under this target name.
        CHtmlElement openElement = (CHtmlElement)m_lastOpenNodes[targetElements[targetIndex]];
        if (openElement == null)
        {
            continue;
        }

        // Look for a bound element that is not closed and has a higher NodeID than
        // the target; if there is one, the target is left open.
        bool fixNestedElement = true;
        for (int index = 0, count = boundElements.Length; index < count; ++index)
        {
            CHtmlElement boundElement = (CHtmlElement)m_lastOpenNodes[boundElements[index]];
            if (boundElement != null && boundElement.m_close == false && boundElement.NodeID > openElement.NodeID)
            {
                fixNestedElement = false;
                break;
            }
        }

        if (fixNestedElement)
        {
            // The former unused local copy of openElement.Parent was removed; the
            // parent is read after CloseElement, exactly as before.
            CloseElement(openElement);
            if (openElement.Parent != null)
            {
                m_currentLevel = openElement.Parent.Nodes;
            }
            else
            {
                m_currentLevel = m_result;
            }
        }
    }
}
///////////////////////////////////////////////////////////////////////////////
/// <summary>
/// Copy constructor: copies the name of <paramref name="obj"/> and fills this element
/// with clones of its child nodes and attributes.
/// </summary>
/// <param name="obj">Element to copy.</param>
public CHtmlElement(CHtmlElement obj)
    : base(obj)
{
    System.Diagnostics.Debug.Assert(obj != null);
    obj.AssertValid();

    m_name = obj.m_name;
    m_nodes = new CHtmlNodeCollection(this);

    // Child nodes: size the collection once, then add a clone of each source node.
    int nodeTotal = obj.m_nodes.Count;
    m_nodes.Capacity = nodeTotal;
    for (int n = 0; n < nodeTotal; n++)
    {
        CHtmlNode nodeCopy = (CHtmlNode)obj.m_nodes[n].Clone();
        m_nodes.Add(nodeCopy);
    }

    // Attributes: same procedure.
    int attributeTotal = obj.m_attributes.Count;
    m_attributes.Capacity = attributeTotal;
    for (int a = 0; a < attributeTotal; a++)
    {
        CHtmlAttribute attributeCopy = (CHtmlAttribute)obj.m_attributes[a].Clone();
        m_attributes.Add(attributeCopy);
    }
}
/////////////////////////////////////////////////////////////////////////////////
/// <summary>
/// Searches this collection for all nodes with the specified name and returns them
/// in a new collection. Pass true for <paramref name="searchChildren"/> to search
/// the sub-nodes recursively as well.
/// </summary>
/// <param name="name">The name of the node to find; it is trimmed and lower-cased
/// before matching.</param>
/// <param name="searchChildren">True to search sub-nodes, false to search only this
/// collection.</param>
/// <returns>A new collection holding all the nodes that match.</returns>
public CHtmlNodeCollection FindByName(string name, bool searchChildren)
{
    System.Diagnostics.Debug.Assert(name != null);

    string normalized = name.Trim().ToLower();
    CHtmlNodeCollection matches = new CHtmlNodeCollection(64);
    this.FindByName(matches, normalized, searchChildren);
    return matches;
}