예제 #1
0
 /// <summary>
 /// Creates a tag with the given name, raw HTML and position in the document.
 /// The tag starts without attributes, inner tags or a following tag.
 /// </summary>
 /// <param name="strTag">Tag name; stored trimmed and upper-cased. Must not be null.</param>
 /// <param name="strHTML">Raw HTML of the tag; stored trimmed. Must not be null.</param>
 /// <param name="iLineNr">Line number to record for this tag.</param>
 /// <param name="pParentTag">Enclosing tag, or null when there is none.</param>
 /// <param name="pPreviousTag">Tag that precedes this one, or null when there is none.</param>
 public htmlTag(string strTag, string strHTML, int iLineNr, htmlTag pParentTag, htmlTag pPreviousTag)
 {
     // Upper-case with the invariant culture: the culture-sensitive ToUpper() turns
     // 'i' into a dotted capital 'İ' under e.g. tr-TR, so "<title>" would not become
     // "<TITLE>" and the stored name would depend on the thread's culture.
     _tag         = strTag.Trim().ToUpperInvariant();
     _html        = strHTML.Trim();
     _lineNr      = iLineNr;
     _attributes  = new System.Collections.Generic.Dictionary <string, string>();
     _parentTag   = pParentTag;
     _previousTag = pPreviousTag;
     _nextTag     = null;
     _innerTags   = new List <htmlTag>();
 }
예제 #2
0
 /// <summary>
 /// Creates an empty tag: blank name and HTML, line number 0, no parent,
 /// no neighbouring tags, and empty attribute and inner-tag collections.
 /// </summary>
 public htmlTag()
 {
     // Links to other tags: none yet.
     this._parentTag   = null;
     this._previousTag = null;
     this._nextTag     = null;

     // Own content.
     this._tag    = "";
     this._html   = "";
     this._lineNr = 0;

     // Fresh, empty containers.
     this._innerTags  = new List<htmlTag>();
     this._attributes = new Dictionary<string, string>();
 }
예제 #3
0
        /// <summary>
        /// Searches InnerTags depth-first and returns the first tag whose Html
        /// matches the given expression. A tag is tested before its own inner tags.
        /// </summary>
        /// <param name="strHTML">Regular expression, applied case-insensitively to each tag's Html.</param>
        /// <returns>The first matching tag, or null when no tag matches.</returns>
        public htmlTag FirstHtml(string strHTML)
        {
            foreach (htmlTag candidate in this.InnerTags)
            {
                // Either the candidate itself matches, or we look further down its subtree.
                htmlTag hit = Regex.IsMatch(candidate.Html, strHTML, RegexOptions.IgnoreCase)
                    ? candidate
                    : candidate.FirstHtml(strHTML);

                if (hit != null)
                {
                    return hit;
                }
            }

            // Nothing matched anywhere below this tag.
            return null;
        }
예제 #4
0
        /// <summary>
        /// Parses the given HTML code and rebuilds AllTags and InnerTags from it.
        /// Both collections are cleared first. The input is consumed from the front, one
        /// piece at a time; each piece becomes a tag named "&lt;COMMENT&gt;" (comments,
        /// &lt;!...&gt; declarations and &lt;![...[...]]&gt; sections), "&lt;SCRIPT&gt;"
        /// (a whole script element), "&lt;name&gt;" (an opening tag) or "&lt;TEXT&gt;"
        /// (text before the next tag, or trailing text at the end of the input).
        /// An opening tag that does not end in "/&gt;" becomes the parent of what follows;
        /// any closing tag moves back up one level without checking its name.
        /// Input that matches none of these is skipped up to the next '&lt;', or dropped
        /// when no further '&lt;' exists.
        /// </summary>
        /// <param name="strHTML">HTML source to parse.</param>
        /// <exception cref="Exception">More than 10.000.000 tags have been collected.</exception>
        public void Parse(string strHTML)
        {
            // Every pattern is anchored with ^ and used without RegexOptions.Multiline, so a
            // successful match always starts at index 0 of the remaining input and can be
            // removed with Substring instead of running the same pattern a second time.
            const string commentPattern  = @"^\s*(<!--((?!-->).)*-->|<![^<>]+>|<!\[[^\s/<>]+\[((?!\]\]>).)*\]\]>)\s*";
            const string scriptPattern   = @"^\s*(<script(?<a>\s+[^>]*)?>(?<s>((?!</script>).)*)</script>)\s*";
            const string openTagPattern  = @"^\s*<(?<t>[^\s/<>]+)(?<a>\s+[^>]*?)?\s*(?<c>/)?>\s*";
            const string closeTagPattern = @"^\s*</(?<t>[^\s/<>]+)( /)?>\s*";
            const string textPattern     = @"^\s*([^<]+)(?=<)";
            const RegexOptions options   = RegexOptions.IgnoreCase | RegexOptions.Singleline;

            htmlTag lastTag    = null;
            int     currLineNr = 1;
            htmlTag currParent = null;
            this.AllTags.Clear();
            this.InnerTags.Clear();

            do
            {
                Match  mCurrHtml;
                string consumed;   // leading part of strHTML handled in this iteration

                // Process comments
                if ((mCurrHtml = Regex.Match(strHTML, commentPattern, options)).Success)
                {
                    consumed = mCurrHtml.Value;
                    lastTag  = AppendParsedTag("<COMMENT>", consumed, currLineNr, currParent, lastTag);
                }
                // Process scripts (the whole element, body included, becomes one tag)
                else if ((mCurrHtml = Regex.Match(strHTML, scriptPattern, options)).Success)
                {
                    consumed = mCurrHtml.Value;
                    lastTag  = AppendParsedTag("<SCRIPT>", consumed, currLineNr, currParent, lastTag);
                    ReadTagAttributes(lastTag, mCurrHtml.Groups["a"].Value);
                }
                // Process tags
                else if ((mCurrHtml = Regex.Match(strHTML, openTagPattern, options)).Success)
                {
                    consumed = mCurrHtml.Value;
                    lastTag  = AppendParsedTag("<" + mCurrHtml.Groups["t"].Value + ">", consumed, currLineNr, currParent, lastTag);
                    ReadTagAttributes(lastTag, mCurrHtml.Groups["a"].Value);

                    // Treat tag as parent if it does not have /> ending
                    if (mCurrHtml.Groups["c"].Value == "")
                    {
                        currParent = lastTag;
                    }
                }
                // Process closing tags: step up one level, whatever the tag name is
                else if ((mCurrHtml = Regex.Match(strHTML, closeTagPattern, options)).Success)
                {
                    consumed = mCurrHtml.Value;
                    if (currParent != null)
                    {
                        currParent = currParent.ParentTag;
                    }
                }
                // Process text in between tags
                else if ((mCurrHtml = Regex.Match(strHTML, textPattern, options)).Success)
                {
                    consumed = mCurrHtml.Value;
                    lastTag  = AppendParsedTag("<TEXT>", consumed, currLineNr, currParent, lastTag);
                }
                else
                {
                    // Unrecognized input: resume at the next '<' after the first character.
                    int resumeAt = strHTML.Length > 1 ? strHTML.IndexOf('<', 1) : -1;
                    if (resumeAt > 0)
                    {
                        consumed = strHTML.Substring(0, resumeAt);
                    }
                    else
                    {
                        // No later '<' exists, so nothing further can ever match. Consume the
                        // rest here; leaving it in place would repeat this branch forever.
                        // Trailing plain text is kept as a text tag, anything else is dropped.
                        if (strHTML.IndexOf('<') < 0 && strHTML.Trim().Length > 0)
                        {
                            lastTag = AppendParsedTag("<TEXT>", strHTML, currLineNr, currParent, lastTag);
                        }
                        consumed = strHTML;
                    }
                }

                // Skipped input counts too, so later tags keep correct line numbers.
                currLineNr += CountLineBreaks(consumed);
                strHTML     = strHTML.Substring(consumed.Length);

                // Generate an error if number of html tags exceed 10.000.000
                if (this.AllTags.Count > 10000000)
                {
                    throw new Exception("Number of tags in document have exceeded 10.000.000 entries! Surely something must be wrong!");
                }
            } while (!Regex.IsMatch(strHTML, @"^\s*$", RegexOptions.Singleline));
        }

        /// <summary>
        /// Creates a tag and links it in: adds it to the parent's InnerTags (or to this
        /// object's InnerTags when there is no parent), sets it as NextTag of the previous
        /// tag, and appends it to AllTags.
        /// </summary>
        private htmlTag AppendParsedTag(string strTag, string strHTML, int iLineNr, htmlTag pParentTag, htmlTag pPreviousTag)
        {
            htmlTag t = new htmlTag(strTag, strHTML, iLineNr, pParentTag, pPreviousTag);

            if (pParentTag == null)
            {
                this.InnerTags.Add(t);
            }
            else
            {
                t.ParentTag.InnerTags.Add(t);
            }
            if (pPreviousTag != null)
            {
                pPreviousTag.NextTag = t;
            }
            this.AllTags.Add(t);

            return t;
        }

        /// <summary>
        /// Reads name="value" pairs and bare attribute names from the attribute part of a
        /// tag into the tag's Attributes. Bare names get an empty value.
        /// </summary>
        private static void ReadTagAttributes(htmlTag tag, string strAttributes)
        {
            MatchCollection mAttributeCollection = Regex.Matches(strAttributes, "\\s+(?<a>[^\\n/>\"'’=]+)\\s*=\\s*([\"'])(?<v>[^\"']*)[\"']|\\s+(?<a>[^\\n/>\"'’=]+)", RegexOptions.IgnoreCase | RegexOptions.Singleline);
            foreach (Match mAttribute in mAttributeCollection)
            {
                // The indexer adds or overwrites, so a repeated attribute keeps its last value.
                tag.Attributes[mAttribute.Groups["a"].Value.Trim()] = mAttribute.Groups["v"].Value.Trim();
            }
        }

        /// <summary>
        /// Counts the '\n' characters in the given text.
        /// </summary>
        private static int CountLineBreaks(string text)
        {
            int count = 0;
            foreach (char c in text)
            {
                if (c == '\n')
                {
                    count++;
                }
            }
            return count;
        }
예제 #5
0
        /// <summary>
        /// Searches InnerTags depth-first and returns the first tag that satisfies every
        /// given search expression. A tag is tested before its own inner tags.
        /// Null or empty expressions are ignored.
        /// </summary>
        /// <param name="strTag">Expression the tag name must match.</param>
        /// <param name="strAttribute">Expression at least one attribute name must match.</param>
        /// <param name="strValue">
        /// Expression at least one attribute value must match. When strAttribute is also
        /// given, the same attribute must match both name and value.
        /// </param>
        /// <returns>The first matching tag, or null when no tag matches.</returns>
        /// <remarks>All expressions are case-insensitive regular expressions.</remarks>
        public htmlTag FirstTag(string strTag, string strAttribute, string strValue)
        {
            bool filterByName  = !string.IsNullOrEmpty(strAttribute);
            bool filterByValue = !string.IsNullOrEmpty(strValue);

            foreach (htmlTag candidate in this.InnerTags)
            {
                // Tag name criterion (passes automatically when no name was asked for).
                bool nameOk = string.IsNullOrEmpty(strTag) || Regex.IsMatch(candidate.Tag, strTag, RegexOptions.IgnoreCase);

                // Attribute criterion: one attribute has to satisfy every active filter.
                bool attributeOk = true;
                if (filterByName || filterByValue)
                {
                    attributeOk = false;
                    foreach (string key in candidate.Attributes.Keys)
                    {
                        bool pairOk = (!filterByName || Regex.IsMatch(key, strAttribute, RegexOptions.IgnoreCase)) &&
                                      (!filterByValue || Regex.IsMatch(candidate.Attributes[key], strValue, RegexOptions.IgnoreCase));
                        if (pairOk)
                        {
                            attributeOk = true;
                        }
                    }
                }

                if (nameOk && attributeOk)
                {
                    return candidate;
                }

                // Not this one: look through its inner tags before moving to the next sibling.
                htmlTag nested = candidate.FirstTag(strTag, strAttribute, strValue);
                if (nested != null)
                {
                    return nested;
                }
            }

            // Nothing matched anywhere below this tag.
            return null;
        }