/// <summary>
/// Finds elements by tag name using a simplified XPath-like query.
/// </summary>
/// <remarks>
/// The query is split on '/'. A single slash before a name searches the direct
/// children of the root; a double slash searches elements stored after the root
/// whose hierarchy starts with the root's hierarchy. A name segment that appears
/// before any slash is ignored. Anything from the first '[' of a segment onwards
/// is stripped from the name and is not evaluated.
/// NOTE(review): the root is never advanced between segments, so every segment
/// is matched against the same root and all matches accumulate in one list.
/// </remarks>
/// <param name="XPath">Query such as "/div" or "//span"; null, empty or slash-less queries yield no results.</param>
/// <param name="rootElement">Element to search from; when null the first parsed element is used.</param>
/// <returns>The matching elements; empty (never null) when nothing matches.</returns>
public List<DomElement> Find(string XPath, DomElement rootElement = null)
{
    var elements = new List<DomElement>();

    //nothing to search for, or nothing to search in
    if (string.IsNullOrEmpty(XPath) || XPath.IndexOf('/') < 0 || Elements.Count == 0)
    {
        return elements;
    }

    //search from the given root, or from the first element when none is given
    var root = rootElement ?? Elements[0];

    var lastPath = "";
    var searchPath = "";

    foreach (var path in XPath.Split('/'))
    {
        if (path == "")
        {
            //hierarchy symbol: a second consecutive slash means "anywhere below"
            searchPath = lastPath == "/" ? "//" : "/";
            lastPath = "/";
            continue;
        }
        lastPath = path;

        //the tag name is everything before the first '[' (predicates are not evaluated)
        var bracket = path.IndexOf('[');
        var searchName = (bracket >= 0 ? path.Substring(0, bracket) : path).ToLower();

        switch (searchPath)
        {
            case "/":
                //find elements at the current hierarchy level
                foreach (var child in root.Children())
                {
                    if (child.tagName == searchName)
                    {
                        elements.Add(child);
                    }
                }
                break;

            case "//":
            {
                //find elements at any hierarchy level below the root
                var rootKey = string.Join(">", root.hierarchyIndexes);
                var rootPrefix = rootKey + ">";
                for (var i = root.index + 1; i < Elements.Count; i++)
                {
                    var candidate = Elements[i];
                    if (candidate.tagName != searchName)
                    {
                        continue;
                    }

                    //compare whole index segments so that "1>2" does not match "1>20"
                    var candidateKey = string.Join(">", candidate.hierarchyIndexes);
                    if (rootKey.Length == 0
                        || candidateKey == rootKey
                        || candidateKey.StartsWith(rootPrefix, System.StringComparison.Ordinal))
                    {
                        elements.Add(candidate);
                    }
                }
                break;
            }
        }
    }

    return elements;
}
/// <summary>
/// Parses markup into elements, registering every tag, comment and text run
/// through AddTag in document order.
/// </summary>
/// <remarks>
/// Input that is null or three characters or fewer is ignored. The document type
/// starts as "html" and switches to "xml" when the first tag starts with "?xml".
/// In html mode the content between a script tag and its closing tag is skipped
/// as a single run, and br, img, input, link, meta and hr are treated as
/// self-closing even without a trailing slash.
/// </remarks>
/// <param name="htm">The markup to parse.</param>
public void Parse(string htm)
{
    if (htm == null || htm.Length <= 3) { return; }

    bool isInScript = false;
    bool foundTag = false;
    int xs = -1;              //start of the text run that precedes the next tag
    int parentElement = -1;
    string docType = "html";
    var hierarchy = new List<string>();
    var hierarchyIndexes = new List<int>();
    var tagNameChars = new string[] { "/", "!", "?" };

    for (var x = 0; x < htm.Length; x++)
    {
        if (foundTag == false && xs == 0)
        {
            //the first '<' candidate was not a usable tag: store the whole input as
            //one text node. Return here (rather than break) so the trailing-text
            //step below does not add the very same text a second time.
            var wholeText = new DomElement(this) { tagName = "#text", text = htm };
            AddTag(wholeText, parentElement, true, false, hierarchy, hierarchyIndexes);
            return;
        }
        if (xs == -1 || foundTag == true)
        {
            xs = x;
        }

        var isClosingTag = false;
        var isSelfClosing = false;
        var isComment = false;
        foundTag = false;

        if (isInScript == true)
        {
            //find closing script tag (case-insensitive: tag names are lower-cased
            //on the way in, so <SCRIPT> must be closed by </SCRIPT> as well)
            //TODO: make sure </script> tag isn't in a javascript string, but
            //      instead is the actual closing tag for the script
            x = htm.IndexOf("</script>", x, System.StringComparison.OrdinalIgnoreCase);
        }
        else
        {
            //find next html tag
            x = htm.IndexOf('<', x);
        }
        if (x == -1) { break; }

        //a '<' that is the very last character cannot start a tag
        if (x + 1 >= htm.Length) { break; }
        var marker = htm[x + 1];
        if (!marker.ToString().OnlyAlphabet(tagNameChars)) { continue; }

        var s1 = htm.IndexOf('>', x + 2);
        var s2 = htm.IndexOf('<', x + 2);
        if (s1 < 0) { continue; }

        //check for comment (bounds-checked so a short tag at the end of input is safe)
        if (x + 4 <= htm.Length && htm.Substring(x + 1, 3) == "!--")
        {
            s1 = htm.IndexOf("-->", x + 1, System.StringComparison.Ordinal);
            s1 = s1 < 0 ? htm.Length - 1 : s1 + 2;
            s2 = -1;
            isSelfClosing = true;
            isComment = true;
        }

        //check for broken tag (another '<' shows up before the closing '>')
        if (s2 >= 0 && s2 < s1) { continue; }

        //found end of tag
        foundTag = true;
        var strTag = htm.Substring(x + 1, s1 - (x + 1));

        //check for self-closing tag
        var lastChar = strTag[strTag.Length - 1];
        if (lastChar == '/' || (lastChar == '?' && marker == '?'))
        {
            isSelfClosing = true;
        }

        if (Elements.Count == 0 && strTag.StartsWith("?xml", System.StringComparison.Ordinal))
        {
            docType = "xml";
        }
        documentType = docType;

        var domTag = new DomElement(this);
        domTag.className = new List<string>();

        if (isComment == true)
        {
            domTag.tagName = "!--";
            //strip the leading "!--" and trailing "--"; degenerate comments such
            //as "<!-->" have no body at all
            var commentLength = strTag.Length - 5;
            domTag.text = commentLength > 0 ? strTag.Substring(3, commentLength) : "";
        }
        else
        {
            var s3 = strTag.IndexOf(' ');
            if (s3 < 0)
            {
                //tag has no attributes
                if (isSelfClosing)
                {
                    if (strTag.Length > 1)
                    {
                        //drop only the single trailing '/' or '?'
                        domTag.tagName = strTag.Substring(0, strTag.Length - 1).ToLower();
                    }
                }
                else
                {
                    //tag has no attributes & no forward-slash
                    domTag.tagName = strTag.ToLower();
                }
            }
            else
            {
                //tag has attributes
                domTag.tagName = strTag.Substring(0, s3).ToLower();
                domTag.attribute = GetAttributes(strTag);
                domTag.style = new Dictionary<string, string>();

                //set up class name list (stays empty when there is no class attribute)
                if (domTag.attribute.ContainsKey("class"))
                {
                    domTag.className = new List<string>(domTag.attribute["class"].Split(' '));
                }

                //set up style dictionary; the first declaration of a property wins
                if (domTag.attribute.ContainsKey("style"))
                {
                    foreach (var declaration in domTag.attribute["style"].Split(';'))
                    {
                        var styleKeyVal = declaration.Trim().Split(new char[] { ':' }, 2);
                        if (styleKeyVal.Length != 2) { continue; }

                        var styleKey = styleKeyVal[0].Trim().ToLower();
                        if (domTag.style.ContainsKey(styleKey) == false)
                        {
                            domTag.style.Add(styleKey, styleKeyVal[1].Trim());
                        }
                    }
                }
            }
        }

        //tags without a usable name are skipped, but the scan still moves past them
        if (!string.IsNullOrEmpty(domTag.tagName))
        {
            if (docType == "html")
            {
                //check if tag is script
                if (isInScript == true)
                {
                    isInScript = false;
                }
                else if (domTag.tagName == "script" && isSelfClosing == false)
                {
                    isInScript = true;
                }

                //check if tag is self-closing even if it
                //doesn't include a forward-slash at the end
                switch (domTag.tagName)
                {
                    case "br":
                    case "img":
                    case "input":
                    case "link":
                    case "meta":
                    case "hr":
                        isSelfClosing = true;
                        break;
                }
            }

            if (domTag.tagName[0] == '!')
            {
                //comments & doctype are self-closing tags
                isSelfClosing = true;
            }
            if (marker == '/')
            {
                //found closing tag
                isClosingTag = true;
            }

            //extract text before beginning of tag
            var strText = htm.Substring(xs, x - xs).Trim();
            if (strText != "")
            {
                var textTag = new DomElement(this) { tagName = "#text", text = strText };
                AddTag(textTag, parentElement, true, false, hierarchy, hierarchyIndexes);
            }

            //add tag to array
            parentElement = AddTag(domTag, parentElement, isSelfClosing, isClosingTag, hierarchy, hierarchyIndexes);

            if (isClosingTag == true && parentElement >= 0)
            {
                //go back one parent if this tag is a closing tag
                var closingName = domTag.tagName.Replace("/", "");
                var current = Elements[parentElement];
                if (current.tagName != closingName && current.parent >= 0)
                {
                    //not the same tag as the current parent tag
                    var outer = Elements[current.parent];
                    if (outer.tagName == closingName)
                    {
                        //replace unknown closing tag with missing closing tag
                        domTag.tagName = "/" + outer.tagName;
                    }
                    else
                    {
                        //skip this closing tag because it doesn't have an opening tag
                        x = xs = s1;
                        continue;
                    }
                }

                parentElement = current.parent;
                if (hierarchy.Count > 0)
                {
                    hierarchy.RemoveAt(hierarchy.Count - 1);
                    hierarchyIndexes.RemoveAt(hierarchyIndexes.Count - 1);
                }
            }
        }

        //continue scanning right after the closing '>' of this tag
        x = xs = s1;
    }

    //finally, add last text tag (if possible)
    if (xs < htm.Length - 1)
    {
        var trailing = htm.Substring(xs);
        if (trailing.Trim().Replace("\r", "").Replace("\n", "").Length > 0)
        {
            var lastText = new DomElement(this) { tagName = "#text", text = trailing };
            AddTag(lastText, parentElement, true, false, hierarchy, hierarchyIndexes);
        }
    }
}