Example #1
0
        /// <summary>
        /// Searches the parsed DOM for elements whose tag name matches the named segments
        /// of a simplified XPath-style query.
        /// </summary>
        /// <remarks>
        /// Only two axes are recognised: "/" matches direct children of the root element and
        /// "//" matches any later element whose hierarchy path starts with the root's.
        /// The root never advances between segments, so every named segment is matched
        /// against the same root and all matches are accumulated in one list.
        /// A "[...]" predicate is stripped from the segment name; it is not evaluated.
        /// </remarks>
        /// <param name="XPath">Query such as "//div" or "/html". It must contain a "/".</param>
        /// <param name="rootElement">Element to search from; when null the first element is used.</param>
        /// <returns>
        /// The matching elements in discovery order. The list is empty when the query is
        /// null, empty or has no "/", or when the DOM has no elements.
        /// </returns>
        public List <DomElement> Find(string XPath, DomElement rootElement = null)
        {
            var elements = new List <DomElement>();

            //a null query used to throw on the IndexOf call below; treat it like an empty one
            if (string.IsNullOrEmpty(XPath))
            {
                return(elements);
            }
            if (XPath.IndexOf('/') < 0)
            {
                return(elements);
            }
            if (Elements.Count == 0)
            {
                return(elements);
            }
            var        root = rootElement;
            DomElement elem;

            if (root == null)
            {
                //search from first element
                root = Elements[0];
            }

            //search the DOM to find elements based on the XPath query
            var paths           = XPath.Split('/');
            var lastPath        = "";
            var searchPath      = "";
            var searchName      = "";
            var hierarchy       = "";
            var hierarchyPrefix = "";
            var childhier       = "";

            foreach (var path in paths)
            {
                if (path == "")
                {
                    //hierarchy symbol (an empty segment is produced by each "/" separator)
                    if (lastPath == "/")
                    {
                        //two separators in a row: look anywhere in the hierarchy
                        searchPath = "//";
                    }
                    else
                    {
                        searchPath = "/";
                    }
                }
                else
                {
                    //the tag name is everything before the first "[" (predicates are ignored);
                    //cutting at the bracket also copes with more than one predicate per segment
                    var bracket = path.IndexOf('[');
                    if (bracket >= 0)
                    {
                        searchName = path.Substring(0, bracket).ToLower();
                    }
                    else
                    {
                        searchName = path.ToLower();
                    }

                    //find matching elements
                    switch (searchPath)
                    {
                    case "/":
                        //find elements at current hierarchy level
                        foreach (var child in root.Children())
                        {
                            if (child.tagName == searchName)
                            {
                                elements.Add(child);
                            }
                        }
                        break;

                    case "//":
                        //find elements at any hierarchy level
                        if (root.hierarchyIndexes.Length > 0)
                        {
                            hierarchy = string.Join(">", root.hierarchyIndexes);
                        }
                        else
                        {
                            hierarchy = "";
                        }

                        //compare on whole path components: a bare prefix test would let
                        //root path "0>1" match the unrelated path "0>12"
                        hierarchyPrefix = hierarchy + ">";
                        for (var x = root.index + 1; x < Elements.Count; x++)
                        {
                            elem = Elements[x];
                            if (elem.hierarchyIndexes.Length > 0)
                            {
                                childhier = string.Join(">", elem.hierarchyIndexes);
                            }
                            else
                            {
                                childhier = "";
                            }

                            //an empty root path matches everything, as it did before
                            var underRoot = hierarchy.Length == 0
                                            || childhier == hierarchy
                                            || childhier.StartsWith(hierarchyPrefix, System.StringComparison.Ordinal);
                            if (underRoot && elem.tagName == searchName)
                            {
                                elements.Add(elem);
                            }
                        }
                        break;
                    }
                }

                //remember whether the previous segment was a separator
                lastPath = path;
                if (lastPath == "")
                {
                    lastPath = "/";
                }
            }

            return(elements);
        }
Example #2
0
        /// <summary>
        /// Parses an HTML or XML string and registers every tag, comment and text run it
        /// finds through <c>AddTag</c>.
        /// </summary>
        /// <remarks>
        /// Input that is null or three characters or shorter is ignored.
        /// Text between tags is trimmed and added as a "#text" element; comments are added
        /// as "!--" elements. The document type is "xml" when the first tag starts with
        /// "?xml", otherwise "html". In html documents the content of a non self-closing
        /// script tag is skipped up to the next "&lt;/script&gt;".
        /// </remarks>
        /// <param name="htm">The markup to parse.</param>
        public void Parse(string htm)
        {
            //a null string used to throw on the Length check
            if (htm == null || htm.Length <= 3)
            {
                return;
            }
            bool   isClosingTag = false;
            bool   isSelfClosing = false;
            bool   isInScript = false;
            bool   isComment = false;
            bool   foundTag = false;
            int    s1, s2, s3, xs = -1;
            int    parentElement = -1;
            string str1, schar, strTag, strText, docType = "html";
            var    hierarchy        = new List <string>();
            var    hierarchyIndexes = new List <int>();
            var    domTag           = new DomElement(this);
            var    textTag          = new DomElement(this);
            var    tagNameChars     = new string[] { "/", "!", "?" };

            //x is the scan position; xs marks where the pending (not yet emitted) text starts
            for (var x = 0; x < htm.Length; x++)
            {
                //find HTML tag
                domTag = new DomElement(this);

                if (foundTag == false && xs == 0)
                {
                    //the first "<" did not open a usable tag: treat all of htm as text.
                    //return (rather than break) so the trailing-text step after the loop
                    //does not add the same text a second time
                    textTag = new DomElement(this)
                    {
                        tagName = "#text",
                        text    = htm
                    };
                    AddTag(textTag, parentElement, true, false, hierarchy, hierarchyIndexes);
                    return;
                }
                else if (xs == -1)
                {
                    xs = x;
                }
                else if (foundTag == true)
                {
                    xs = x;
                }

                isClosingTag  = false;
                isSelfClosing = false;
                isComment     = false;
                foundTag      = false;
                if (isInScript == true)
                {
                    //find closing script tag
                    //TODO: make sure </script> tag isn't in a
                    //      javascript string, but instead is the
                    //      actual closing tag for the script
                    //tag names are lower-cased when parsed, so match the closing tag
                    //regardless of case as well
                    x = htm.IndexOf("</script>", x, System.StringComparison.OrdinalIgnoreCase);
                    if (x == -1)
                    {
                        break;
                    }
                    schar = htm.Substring(x, 9);
                }
                else
                {
                    //find next html tag
                    x = htm.IndexOf('<', x);
                    if (x == -1)
                    {
                        break;
                    }
                    if (x + 3 > htm.Length)
                    {
                        //a "<" within the last two characters cannot open a tag and
                        //Substring(x, 3) would be out of range; leave it as trailing text
                        break;
                    }
                    schar = htm.Substring(x, 3);
                }
                if (schar[0] == '<')
                {
                    if (schar[1].ToString().OnlyAlphabet(tagNameChars))
                    {
                        //found HTML tag
                        s1 = htm.IndexOf(">", x + 2);
                        s2 = htm.IndexOf("<", x + 2);
                        if (s1 >= 0)
                        {
                            //check for comment (CompareOrdinal tolerates fewer than three
                            //remaining characters, e.g. when htm ends with "<b>")
                            if (string.CompareOrdinal(htm, x + 1, "!--", 0, 3) == 0)
                            {
                                s1 = htm.IndexOf("-->", x + 1);
                                if (s1 < 0)
                                {
                                    //unterminated comment runs to the end of the input
                                    s1 = htm.Length - 1;
                                }
                                else
                                {
                                    //move s1 onto the ">" of "-->"
                                    s1 += 2;
                                }
                                s2            = -1;
                                isSelfClosing = true;
                                isComment     = true;
                            }

                            //check for broken tag (another "<" appears before the ">")
                            if (s2 < s1 && s2 >= 0)
                            {
                                continue;
                            }

                            //found end of tag; strTag is everything between "<" and ">"
                            foundTag = true;
                            strTag   = htm.Substring(x + 1, s1 - (x + 1));

                            //check for self-closing tag
                            str1 = strTag.Substring(strTag.Length - 1, 1);
                            if (str1 == "/" || (str1 == "?" && schar[1] == '?'))
                            {
                                isSelfClosing = true;
                            }
                            if (Elements.Count == 0)
                            {
                                if (strTag.IndexOf("?xml") == 0)
                                {
                                    docType = "xml";
                                }
                            }
                            documentType = docType;

                            //check for attributes
                            domTag.className = new List <string>();
                            if (isComment == true)
                            {
                                domTag.tagName = "!--";
                                //strip the leading "!--" and trailing "--"; degenerate
                                //comments such as "<!-->" have no body to extract
                                if (strTag.Length >= 5)
                                {
                                    domTag.text = strTag.Substring(3, strTag.Length - 5);
                                }
                                else
                                {
                                    domTag.text = "";
                                }
                            }
                            else
                            {
                                s3 = strTag.IndexOf(" ");
                                if (s3 < 0)
                                {
                                    //tag has no attributes
                                    if (isSelfClosing)
                                    {
                                        if (strTag.Length > 1)
                                        {
                                            //drop only the trailing "/" or "?" marker
                                            //("br/" -> "br"); removing two characters
                                            //truncated the name to "b"
                                            domTag.tagName = strTag.Substring(0, strTag.Length - 1).ToLower();
                                        }
                                    }
                                    else
                                    {
                                        //tag has no attributes & no forward-slash
                                        domTag.tagName = strTag.ToLower();
                                    }
                                }
                                else
                                {
                                    //tag has attributes
                                    domTag.tagName   = strTag.Substring(0, s3).ToLower();
                                    domTag.attribute = GetAttributes(strTag);
                                    domTag.style     = new Dictionary <string, string>();

                                    //set up class name list
                                    if (domTag.attribute.ContainsKey("class"))
                                    {
                                        domTag.className = new List <string>(domTag.attribute["class"].Split(' '));
                                    }
                                    else
                                    {
                                        domTag.className = new List <string>();
                                    }

                                    //set up style dictionary
                                    if (domTag.attribute.ContainsKey("style"))
                                    {
                                        var domStyle = new List <string>(domTag.attribute["style"].Split(';'));
                                        foreach (string keyval in domStyle)
                                        {
                                            //split on the first ":" only so values may contain colons
                                            var styleKeyVal = keyval.Trim().Split(new char[] { ':' }, 2);
                                            if (styleKeyVal.Length == 2)
                                            {
                                                var kv = styleKeyVal[0].Trim().ToLower();
                                                //first declaration of a property wins
                                                if (domTag.style.ContainsKey(kv) == false)
                                                {
                                                    domTag.style.Add(kv, styleKeyVal[1].Trim());
                                                }
                                            }
                                        }
                                    }
                                }
                            }
                            if (domTag.tagName != "")
                            {
                                //check if tag is script
                                if (docType == "html")
                                {
                                    if (isInScript == true)
                                    {
                                        isInScript = false;
                                    }
                                    else if (domTag.tagName == "script" && isSelfClosing == false)
                                    {
                                        isInScript = true;
                                    }

                                    //check if tag is self-closing even if it
                                    //doesn't include a forward-slash at the end
                                    switch (domTag.tagName)
                                    {
                                    case "br":
                                    case "img":
                                    case "input":
                                    case "link":
                                    case "meta":
                                    case "hr":
                                        isSelfClosing = true;
                                        break;
                                    }
                                }

                                if (domTag.tagName.Substring(0, 1) == "!")
                                {
                                    //comments & doctype are self-closing tags
                                    isSelfClosing = true;
                                }

                                if (schar[1] == '/')
                                {
                                    //found closing tag
                                    isClosingTag = true;
                                }

                                //extract text before beginning of tag
                                strText = htm.Substring(xs, x - xs).Trim();
                                if (strText != "")
                                {
                                    textTag = new DomElement(this)
                                    {
                                        tagName = "#text",
                                        text    = strText
                                    };
                                    AddTag(textTag, parentElement, true, false, hierarchy, hierarchyIndexes);
                                }

                                //check if domTag is unusable
                                if (domTag.tagName == "" || domTag.tagName == null)
                                {
                                    foundTag = false;
                                    continue;
                                }

                                //add tag to array; the returned value is used below as the
                                //index of the current parent in Elements
                                //NOTE(review): exact AddTag contract is not visible here — confirm
                                parentElement = AddTag(domTag, parentElement, isSelfClosing, isClosingTag, hierarchy, hierarchyIndexes);
                                if (isClosingTag == true)
                                {
                                    //go back one parent if this tag is a closing tag
                                    if (parentElement >= 0)
                                    {
                                        if (Elements[parentElement].tagName != domTag.tagName.Replace("/", ""))
                                        {
                                            //not the same tag as the current parent tag, add missing closing tag
                                            if (Elements[parentElement].parent >= 0)
                                            {
                                                if (Elements[Elements[parentElement].parent].tagName == domTag.tagName.Replace("/", ""))
                                                {
                                                    //replace unknown closing tag with missing closing tag
                                                    domTag.tagName = "/" + Elements[Elements[parentElement].parent].tagName;
                                                }
                                                else
                                                {
                                                    //skip this closing tag because it doesn't have an opening tag
                                                    x = xs = s1;
                                                    continue;
                                                }
                                            }
                                        }
                                        parentElement = Elements[parentElement].parent;
                                        if (hierarchy.Count > 0)
                                        {
                                            hierarchy.RemoveAt(hierarchy.Count - 1);
                                            hierarchyIndexes.RemoveAt(hierarchyIndexes.Count - 1);
                                        }
                                    }
                                }
                            }

                            //resume scanning after the ">" of this tag (the loop adds one)
                            x = xs = s1;
                        }
                    }
                }
            }
            //finally, add last text tag (if possible)
            if (xs < htm.Length - 1)
            {
                if (htm.Substring(xs).Trim().Replace("\r", "").Replace("\n", "").Length > 0)
                {
                    textTag = new DomElement(this)
                    {
                        tagName = "#text",
                        text    = htm.Substring(xs)
                    };
                    AddTag(textTag, parentElement, true, false, hierarchy, hierarchyIndexes);
                }
            }
        }