/// <summary>
/// Creates a tag node with the given name, source text, line number and neighbours.
/// The node starts with no attributes, no inner tags and no next tag.
/// </summary>
/// <param name="strTag">Tag name; stored trimmed and upper-cased. A null value is treated as empty.</param>
/// <param name="strHTML">Source text of the tag; stored trimmed. A null value is treated as empty.</param>
/// <param name="iLineNr">Line number recorded for this tag.</param>
/// <param name="pParentTag">Parent tag, or null when there is none.</param>
/// <param name="pPreviousTag">Previous tag, or null when there is none.</param>
public htmlTag(string strTag, string strHTML, int iLineNr, htmlTag pParentTag, htmlTag pPreviousTag)
{
    // Upper-case with the invariant culture: a culture-sensitive ToUpper() maps 'i' to a
    // dotted capital I under Turkish cultures, so "<title>" would not become "<TITLE>".
    _tag = (strTag ?? string.Empty).Trim().ToUpperInvariant();
    _html = (strHTML ?? string.Empty).Trim();
    _lineNr = iLineNr;
    _attributes = new System.Collections.Generic.Dictionary<string, string>();
    _parentTag = pParentTag;
    _previousTag = pPreviousTag;
    _nextTag = null;
    _innerTags = new List<htmlTag>();
}
/// <summary>
/// Creates an empty tag: empty name and source text, line number 0,
/// no parent, previous or next tag, and empty attribute and inner-tag collections.
/// </summary>
public htmlTag()
{
    // Links to other tags.
    _parentTag = null;
    _previousTag = null;
    _nextTag = null;

    // Own content.
    _tag = string.Empty;
    _html = string.Empty;
    _lineNr = 0;

    // Empty containers.
    _innerTags = new List<htmlTag>();
    _attributes = new System.Collections.Generic.Dictionary<string, string>();
}
/// <summary>
/// Searches InnerTags depth-first and returns the first tag whose Html matches the expression.
/// Each tag is tested before its own inner tags are searched.
/// The search string is a regular expression and is matched case-insensitively.
/// </summary>
/// <param name="strHTML">Regular expression tested against each tag's Html.</param>
/// <returns>The first matching tag, or null if nothing is found.</returns>
public htmlTag FirstHtml(string strHTML)
{
    foreach (htmlTag child in this.InnerTags)
    {
        // Either the child itself matches, or the answer (if any) lies among its descendants.
        htmlTag hit = Regex.IsMatch(child.Html, strHTML, RegexOptions.IgnoreCase)
            ? child
            : child.FirstHtml(strHTML);

        if (hit != null)
        {
            return hit;
        }
    }

    return null;
}
/// <summary>
/// Parses the given HTML code into tags.
/// Clears AllTags and InnerTags, then consumes the input from the front, one piece at a time:
/// comments/declarations (&lt;COMMENT&gt;), script elements (&lt;SCRIPT&gt;), opening tags,
/// closing tags and text between tags (&lt;TEXT&gt;).
/// Every created tag is appended to AllTags and to the InnerTags of the current parent
/// (or to this object's InnerTags when there is no current parent).
/// An opening tag without a "/&gt;" ending becomes the current parent; any closing tag moves the
/// current parent one level up (the closing tag's name is not compared with the parent's name).
/// Text after the last tag is kept as a &lt;TEXT&gt; tag, and an unparseable remainder is discarded,
/// so the method always terminates.
/// </summary>
/// <param name="strHTML">The HTML source to parse.</param>
/// <exception cref="Exception">Thrown when more than 10.000.000 tags have been collected.</exception>
public void Parse(string strHTML)
{
    // Every pattern is anchored at the start of the remaining input, so a successful match
    // is always the prefix that has to be consumed.
    const RegexOptions options = RegexOptions.IgnoreCase | RegexOptions.Singleline;
    const string commentPattern = @"^\s*(<!--((?!-->).)*-->|<![^<>]+>|<!\[[^\s/<>]+\[((?!\]\]>).)*\]\]>)\s*";
    const string scriptPattern = @"^\s*(<script(?<a>\s+[^>]*)?>(?<s>((?!</script>).)*)</script>)\s*";
    const string openTagPattern = @"^\s*<(?<t>[^\s/<>]+)(?<a>\s+[^>]*?)?\s*(?<c>/)?>\s*";
    const string closeTagPattern = @"^\s*</(?<t>[^\s/<>]+)( /)?>\s*";
    // Text runs up to the next '<' or, for trailing text, up to the end of the input.
    const string textPattern = @"^\s*([^<]+)(?=<|\z)";
    const string skipPattern = "^.+?(?=<)";

    htmlTag lastTag = null;
    int currLineNr = 1;
    htmlTag currParent = null;
    this.AllTags.Clear();
    this.InnerTags.Clear();

    // Stop as soon as only whitespace (or nothing) is left.
    while (!Regex.IsMatch(strHTML, @"^\s*$", RegexOptions.Singleline))
    {
        Match mCurrHtml;

        // Process comments
        if ((mCurrHtml = Regex.Match(strHTML, commentPattern, options)).Success)
        {
            lastTag = this.AppendParsedTag("<COMMENT>", mCurrHtml.Value, currLineNr, currParent, lastTag);
        }
        // Process scripts
        else if ((mCurrHtml = Regex.Match(strHTML, scriptPattern, options)).Success)
        {
            lastTag = this.AppendParsedTag("<SCRIPT>", mCurrHtml.Value, currLineNr, currParent, lastTag);
            ReadTagAttributes(lastTag, mCurrHtml.Groups["a"].Value);
        }
        // Process tags
        else if ((mCurrHtml = Regex.Match(strHTML, openTagPattern, options)).Success)
        {
            lastTag = this.AppendParsedTag("<" + mCurrHtml.Groups["t"].Value + ">", mCurrHtml.Value, currLineNr, currParent, lastTag);
            ReadTagAttributes(lastTag, mCurrHtml.Groups["a"].Value);

            // Treat tag as parent if it does not have /> ending
            if (mCurrHtml.Groups["c"].Value == "")
            {
                currParent = lastTag;
            }
        }
        // Process closing tags
        else if ((mCurrHtml = Regex.Match(strHTML, closeTagPattern, options)).Success)
        {
            if (currParent != null)
            {
                currParent = currParent.ParentTag;
            }
        }
        // Process text in between tags (and after the last tag)
        else if ((mCurrHtml = Regex.Match(strHTML, textPattern, options)).Success)
        {
            lastTag = this.AppendParsedTag("<TEXT>", mCurrHtml.Value, currLineNr, currParent, lastTag);
        }
        else
        {
            // Remove unrecognized stuff up to the next '<' and just keep on going.
            // May of course cause missing elements.
            mCurrHtml = Regex.Match(strHTML, skipPattern, options);
        }

        // When even the skip pattern fails there is no later '<' to resume at; drop the
        // remainder instead of looping forever on input that never shrinks.
        int consumedLength = mCurrHtml.Success ? mCurrHtml.Length : strHTML.Length;
        currLineNr += CountLineBreaks(strHTML, consumedLength);
        strHTML = strHTML.Substring(consumedLength);

        // Generate an error if number of html tags exceed 10.000.000
        if (this.AllTags.Count > 10000000)
        {
            throw new Exception("Number of tags in document have exceeded 10.000.000 entries! Surely something must be wrong!");
        }
    }
}

/// <summary>
/// Creates a tag, adds it to its parent's InnerTags (or to this object's InnerTags when
/// there is no parent), links it as NextTag of the previous tag and appends it to AllTags.
/// </summary>
/// <returns>The newly created tag.</returns>
private htmlTag AppendParsedTag(string strTag, string strHTML, int iLineNr, htmlTag pParentTag, htmlTag pPreviousTag)
{
    htmlTag t = new htmlTag(strTag, strHTML, iLineNr, pParentTag, pPreviousTag);

    if (pParentTag == null)
    {
        this.InnerTags.Add(t);
    }
    else
    {
        t.ParentTag.InnerTags.Add(t);
    }

    if (pPreviousTag != null)
    {
        pPreviousTag.NextTag = t;
    }

    this.AllTags.Add(t);
    return t;
}

/// <summary>
/// Extracts name="value" pairs and bare attribute names from the attribute part of a tag
/// and stores them in the tag's Attributes; a repeated name overwrites the earlier value.
/// </summary>
private static void ReadTagAttributes(htmlTag tag, string strAttributes)
{
    MatchCollection mAttributeCollection = Regex.Matches(strAttributes, "\\s+(?<a>[^\\n/>\"'’=]+)\\s*=\\s*([\"'])(?<v>[^\"']*)[\"']|\\s+(?<a>[^\\n/>\"'’=]+)", RegexOptions.IgnoreCase | RegexOptions.Singleline);

    foreach (Match mAttribute in mAttributeCollection)
    {
        string attributeName = mAttribute.Groups["a"].Value.Trim();
        string attributeValue = mAttribute.Groups["v"].Value.Trim();

        // The indexer adds the key when it is missing and overwrites it otherwise.
        tag.Attributes[attributeName] = attributeValue;
    }
}

/// <summary>
/// Counts the '\n' characters within the first <paramref name="length"/> characters of the text.
/// </summary>
private static int CountLineBreaks(string text, int length)
{
    int count = 0;
    for (int i = 0; i < length; i++)
    {
        if (text[i] == '\n')
        {
            count++;
        }
    }
    return count;
}
/// <summary>
/// Searches InnerTags depth-first and returns the first tag that satisfies every supplied criterion.
/// Each tag is tested before its own inner tags are searched.
/// A null or empty parameter is ignored as a criterion.
/// If Tag is given, the tag name must match it.
/// If only attribute is given, some attribute name must match it.
/// If only value is given, some attribute value must match it.
/// If both attribute and value are given, a single attribute must match both by name and by value.
/// All parameters are case-insensitive regular expressions.
/// </summary>
/// <param name="strTag">Regular expression for the tag name, or null/empty to accept any name.</param>
/// <param name="strAttribute">Regular expression for an attribute name, or null/empty.</param>
/// <param name="strValue">Regular expression for an attribute value, or null/empty.</param>
/// <returns>The first matching tag, or null if nothing is found.</returns>
public htmlTag FirstTag(string strTag, string strAttribute, string strValue)
{
    // Which criteria are active does not change while iterating.
    bool filterByTag = !string.IsNullOrEmpty(strTag);
    bool filterByAttribute = !string.IsNullOrEmpty(strAttribute);
    bool filterByValue = !string.IsNullOrEmpty(strValue);

    foreach (htmlTag candidate in this.InnerTags)
    {
        // Name criterion.
        bool nameMatches = true;
        if (filterByTag && !Regex.IsMatch(candidate.Tag, strTag, RegexOptions.IgnoreCase))
        {
            nameMatches = false;
        }

        // Attribute criterion: passes by default when neither attribute nor value is given.
        bool attributesMatch = true;
        if (filterByAttribute && filterByValue)
        {
            attributesMatch = false;
            foreach (string key in candidate.Attributes.Keys)
            {
                if (Regex.IsMatch(key, strAttribute, RegexOptions.IgnoreCase)
                    && Regex.IsMatch(candidate.Attributes[key], strValue, RegexOptions.IgnoreCase))
                {
                    attributesMatch = true;
                }
            }
        }
        else if (filterByAttribute)
        {
            attributesMatch = false;
            foreach (string key in candidate.Attributes.Keys)
            {
                if (Regex.IsMatch(key, strAttribute, RegexOptions.IgnoreCase))
                {
                    attributesMatch = true;
                }
            }
        }
        else if (filterByValue)
        {
            attributesMatch = false;
            foreach (string attributeValue in candidate.Attributes.Values)
            {
                if (Regex.IsMatch(attributeValue, strValue, RegexOptions.IgnoreCase))
                {
                    attributesMatch = true;
                }
            }
        }

        // The candidate itself wins before any of its descendants.
        if (nameMatches && attributesMatch)
        {
            return candidate;
        }

        // Otherwise look through the candidate's inner tags.
        htmlTag nested = candidate.FirstTag(strTag, strAttribute, strValue);
        if (nested != null)
        {
            return nested;
        }
    }

    return null;
}