예제 #1
0
        /// <summary>
        /// Splits <paramref name="html"/> into its consecutive top-level elements and appends the raw
        /// markup of each element to <paramref name="eleList"/>, in the order they are extracted.
        /// </summary>
        /// <remarks>
        /// Elements are peeled off one at a time: the first element is extracted, removed from the
        /// working copy of the markup (together with any <c>&lt;!DOCTYPE ...&gt;</c> declaration), and the
        /// process repeats until only whitespace is left or no further tag name can be found.
        /// Only the first occurrence of an extracted element is removed, so identical sibling
        /// elements are each reported.
        /// </remarks>
        /// <param name="html">Markup to split. The caller's string is not modified.</param>
        /// <param name="eleList">List that receives the markup of every extracted element.</param>
        /// <exception cref="Exception">
        /// Thrown when an extracted element is empty or cannot be located in the remaining markup,
        /// or when any later extraction step fails. The original failure is kept as the inner exception.
        /// </exception>
        private void GetElementsStringList(string html, ref List <string> eleList)
        {
            string eleHtml;

            // The very first extraction runs outside the try block, so failures raised while
            // looking at the first element are not wrapped (same as the original control flow).
            if (!TryGetFirstElementString(html, out eleHtml))
            {
                return;
            }

            try
            {
                while (true)
                {
                    if (string.IsNullOrEmpty(eleHtml))
                    {
                        // Nothing was extracted: no progress is possible.
                        throw new FormatException("No element markup could be extracted for the first tag.");
                    }

                    int elePos = html.IndexOf(eleHtml, StringComparison.Ordinal);
                    if (elePos < 0)
                    {
                        // The extracted text is not a substring of the markup; removing it would be
                        // a no-op and the loop would never terminate.
                        throw new FormatException("The extracted element markup was not found in the source HTML.");
                    }

                    eleList.Add(eleHtml);

                    // Remove only this occurrence so that identical siblings are not lost.
                    html = html.Remove(elePos, eleHtml.Length);
                    html = Regex.Replace(html, @"<\!DOCTYPE.*?>", "");

                    if (Regex.IsMatch(html, @"^\s*$") || !TryGetFirstElementString(html, out eleHtml))
                    {
                        break;
                    }
                }
            }
            catch (Exception ex)
            {
                throw new Exception("SORRY,您的HTML格式不能解析!!!", ex);
            }
        }

        /// <summary>
        /// Finds the first tag in <paramref name="html"/> and extracts the markup of the element it starts.
        /// </summary>
        /// <param name="html">Markup to inspect.</param>
        /// <param name="eleHtml">
        /// Receives the element markup. May be empty when a tag name was found but no element
        /// text could be matched for it.
        /// </param>
        /// <returns><c>false</c> when no tag name is found in <paramref name="html"/>; otherwise <c>true</c>.</returns>
        private bool TryGetFirstElementString(string html, out string eleHtml)
        {
            eleHtml = "";

            Match tagNameMatch = Regex.Match(html, @"(?<=\<\s{0,5}|\<)([a-z,A-Z]+|h\d{1})(?=\>|\s)", RegexOptions.IgnoreCase);
            string tagName = tagNameMatch.Value;
            if (string.IsNullOrEmpty(tagName))
            {
                return false;
            }

            string currentTagBeginReg = @"<\s{0,10}" + tagName + @".*?>"; // regex for the opening tag of the current element
            string currentTagEndReg   = @"\<\/" + tagName + @"\>";        // regex for the closing tag of the current element

            // The look-behind above guarantees a '<' followed only by whitespace right before the name.
            int tagStart = html.LastIndexOf('<', tagNameMatch.Index - 1);

            // Case 1: <a ... />  self-closing tag.
            // The probe is anchored (\G) at the first tag and confined to that single tag
            // (quoted attribute values may contain '<' or '>'), so it cannot run on into a
            // later, unrelated "/>".
            Regex selfClosingReg = new Regex(@"\G<\s{0,10}" + tagName + @"\s(?:[^<>""']|""[^""]*""|'[^']*')*/>");
            Match selfClosing    = selfClosingReg.Match(html, tagStart);

            if (selfClosing.Success)
            {
                eleHtml = selfClosing.Value;
            }
            else if (!Regex.IsMatch(html, currentTagEndReg))
            {
                // No closing tag anywhere in the markup.
                if (Regex.IsMatch(html, @"\s{0,10}\<\!\-\-\[if"))
                {
                    // Case 4: conditional comment, <!--[if ...  ... [endif]-->
                    eleHtml = GetElementString(html, @"\s{0,10}\<\!\-\-\[if", @"\[endif\]\-\-\>", 1);
                }
                else
                {
                    // Case 3: <a> without a closing tag (malformed) - take the opening tag only.
                    eleHtml = Regex.Match(html, currentTagBeginReg, RegexOptions.Singleline).Value;
                }
            }
            else
            {
                // Case 2: <a></a> - presumably GetElementString returns the text from the opening tag
                // through its matching closing tag; confirm against that helper.
                eleHtml = GetElementString(html, currentTagBeginReg, currentTagEndReg, 1);
            }

            return true;
        }
예제 #2
0
        /// <summary>
        /// Gets the first-level (root) elements of an HTML fragment.
        /// </summary>
        /// <param name="html">Markup to split; when <c>null</c>, the instance field <c>_html</c> is used.</param>
        /// <returns>
        /// One <see cref="HtmlInfo"/> per root element, in the order the elements were extracted.
        /// Each entry carries the element's full markup, the markup it was extracted from, its tag
        /// name, its inner HTML and its double-quoted attributes.
        /// </returns>
        public List <HtmlInfo> RootDescendants(string html = null)
        {
            /*
             * Approach (as described by the original author; the extraction itself is delegated
             * to GetElementsStringList):
             * 1. Take the first HTML tag and look for its closing tag; each time the same tag is met
             *    on the way, one more closing tag has to be accounted for.
             * 2. Once the first element has been taken, repeat step 1 for the 2nd ... Nth element.
             */
            if (html == null)
            {
                html = _html;
            }

            List <string>   eleList = new List <string>();
            List <HtmlInfo> reval   = new List <HtmlInfo>();

            GetElementsStringList(html, ref eleList);
            foreach (var r in eleList)
            {
                HtmlInfo data = new HtmlInfo();
                data.OldFullHtml  = r;
                data.SameLeveHtml = html;

                // Same tag-name pattern as GetElementsStringList, so both methods agree on the name
                // (including h1-h6) and words from the text content are never taken for the tag.
                data.TagName   = Regex.Match(r, @"(?<=\<\s{0,5}|\<)([a-z,A-Z]+|h\d{1})(?=\>|\s)", RegexOptions.IgnoreCase).Value;

                // Everything between the first '>' and the last '<' of the element.
                data.InnerHtml = Regex.Match(r, @"(?<=\>).+(?=<)", RegexOptions.Singleline).Value;

                data.Attributes = new Dictionary <string, string>();

                // Singleline lets the opening tag span several lines.
                string eleBegin = Regex.Match(r, "<.+?>", RegexOptions.Singleline).Value;

                // name="value" pairs: the name may contain '-', ':', '.', '_' and digits; the value is
                // captured verbatim, so it may be empty or contain '='.
                MatchCollection attrMatches = Regex.Matches(eleBegin, @"(?<key>[a-zA-Z_:][-\w:.]*)\s*\=\s*""(?<value>[^""]*)""");
                foreach (Match attr in attrMatches)
                {
                    string key = attr.Groups["key"].Value;

                    // A repeated attribute name keeps its first value instead of throwing.
                    if (!data.Attributes.ContainsKey(key))
                    {
                        data.Attributes.Add(key, attr.Groups["value"].Value);
                    }
                }

                reval.Add(data);
            }
            return(reval);
        }