/// <summary>
/// Splits <paramref name="html"/> into its top-level element strings and appends each one to
/// <paramref name="eleList"/>.
/// </summary>
/// <remarks>
/// One element is extracted per call; the method then removes that element (and any DOCTYPE
/// declaration) from the markup and recurses on the remainder until only whitespace is left or
/// no further tag name can be found.
/// </remarks>
/// <param name="html">Markup to split.</param>
/// <param name="eleList">List that receives the full markup of every extracted element.</param>
/// <exception cref="Exception">
/// Thrown when an element cannot be extracted or removed; the underlying failure is attached as
/// the inner exception.
/// </exception>
private void GetElementsStringList(string html, ref List<string> eleList)
{
    // Name of the first tag whose name is directly followed by '>' or whitespace.
    string tagName = Regex.Match(html, @"(?<=\<\s{0,5}|\<)([a-z,A-Z]+|h\d{1})(?=\>|\s)", RegexOptions.IgnoreCase).Value;
    if (string.IsNullOrEmpty(tagName))
    {
        return;
    }

    string currentTagBeginReg = @"<\s{0,10}" + tagName + @".*?>"; // opening tag of the current element
    string currentTagEndReg = @"\<\/" + tagName + @"\>";          // closing tag of the current element

    // Cases handled below:
    //   1. <a />      self-closing tag
    //   2. <a></a>    opening tag with a matching closing tag
    //   3. <a>        no closing tag (malformed), or an <!--[if ...]> ... [endif]--> block
    string eleHtml;

    // Look at the first opening tag with this name only. Quoted attribute values are consumed as a
    // unit, and the match may not run past '<' or '>', so a self-closing child such as
    // <div><br /></div> no longer makes the parent look self-closing.
    Match firstTag = Regex.Match(html, @"<\s{0,10}" + tagName + @"(?=[\s/>])(?:""[^""]*""|'[^']*'|[^<>""'])*>");
    if (firstTag.Success && firstTag.Value.EndsWith("/>", StringComparison.Ordinal))
    {
        eleHtml = firstTag.Value;
    }
    else if (!Regex.IsMatch(html, currentTagEndReg))
    {
        const string conditionalBeginReg = @"\s{0,10}\<\!\-\-\[if";
        if (Regex.IsMatch(html, conditionalBeginReg))
        {
            eleHtml = GetElementString(html, conditionalBeginReg, @"\[endif\]\-\-\>", 1);
        }
        else
        {
            eleHtml = Regex.Match(html, currentTagBeginReg, RegexOptions.Singleline).Value;
        }
    }
    else
    {
        eleHtml = GetElementString(html, currentTagBeginReg, currentTagEndReg, 1);
    }

    try
    {
        // Ordinal search mirrors the ordinal semantics of String.Replace used previously.
        int elementIndex = string.IsNullOrEmpty(eleHtml) ? -1 : html.IndexOf(eleHtml, StringComparison.Ordinal);
        if (elementIndex < 0)
        {
            // Nothing can be removed, so recursing would never terminate.
            throw new InvalidOperationException("No element could be extracted for tag '" + tagName + "'.");
        }

        eleList.Add(eleHtml);

        // Remove only this occurrence; identical siblings must still be found by later passes.
        html = html.Remove(elementIndex, eleHtml.Length);
        html = Regex.Replace(html, @"<\!DOCTYPE.*?>", "");

        if (!Regex.IsMatch(html, @"^\s*$"))
        {
            GetElementsStringList(html, ref eleList);
        }
    }
    catch (Exception ex)
    {
        // Same exception type and message as before; the cause is now preserved for diagnosis.
        throw new Exception("SORRY,您的HTML格式不能解析!!!", ex);
    }
}
/// <summary>
/// Gets the first-level (root) elements of an HTML fragment.
/// </summary>
/// <remarks>
/// The markup is split into element strings by <c>GetElementsStringList</c>; each string is then
/// described by an <see cref="HtmlInfo"/> carrying its full markup, the markup it was taken from,
/// its tag name, its inner HTML and the double-quoted attributes of its opening tag.
/// </remarks>
/// <param name="html">Markup to inspect. When null, the instance field <c>_html</c> is used.</param>
/// <returns>One <see cref="HtmlInfo"/> per extracted element, in extraction order.</returns>
public List<HtmlInfo> RootDescendants(string html = null)
{
    if (html == null)
    {
        html = _html;
    }

    List<string> eleList = new List<string>();
    GetElementsStringList(html, ref eleList);

    List<HtmlInfo> reval = new List<HtmlInfo>(eleList.Count);
    foreach (string r in eleList)
    {
        HtmlInfo data = new HtmlInfo();
        data.OldFullHtml = r;
        data.SameLeveHtml = html;

        // Same tag-name pattern as GetElementsStringList, so h1..h6 are recognised and a word
        // inside the element's text can no longer be mistaken for the tag name.
        data.TagName = Regex.Match(r, @"(?<=\<\s{0,5}|\<)([a-z,A-Z]+|h\d{1})(?=\>|\s)", RegexOptions.IgnoreCase).Value;

        // Everything between the first '>' and the last '<'.
        data.InnerHtml = Regex.Match(r, @"(?<=\>).+(?=<)", RegexOptions.Singleline).Value;

        data.Attributes = new Dictionary<string, string>();
        string eleBegin = Regex.Match(r, "<.+?>").Value;

        // Capture name and value separately: values may contain '=' (e.g. query strings) or be
        // empty, and names may contain '-' or ':' (e.g. data-id, xml:lang).
        foreach (Match attr in Regex.Matches(eleBegin, @"(?<name>[a-zA-Z][\w:\-]*)\=""(?<value>.*?)"""))
        {
            string key = attr.Groups["name"].Value;

            // A repeated attribute keeps its first value instead of throwing on the duplicate key.
            if (!data.Attributes.ContainsKey(key))
            {
                data.Attributes.Add(key, attr.Groups["value"].Value);
            }
        }

        reval.Add(data);
    }

    return reval;
}