Пример #1
0
        /// <summary>
        /// Decodes HTML character entities of the form <c>&amp;name;</c> contained in a string.
        /// </summary>
        /// <param name="input">The string to decode. Must not be <c>null</c>.</param>
        /// <param name="skipWiteSpace">
        /// When <c>true</c>, ordinary characters for which <c>HtmlConst.IsWhiteSpace</c> returns
        /// <c>true</c> are omitted from the result. Characters produced by decoding an entity are
        /// always appended.
        /// </param>
        /// <returns>
        /// The decoded string. An ampersand that does not start a recognised entity
        /// (no <c>;</c> within <c>_maxEntLen</c> characters, an unknown entity name, or an
        /// ampersand at the very end of the input) is copied through unchanged.
        /// </returns>
        public static string DecodeHTMLString(string input, bool skipWiteSpace)
        {
            StringBuilder sb = new StringBuilder(input.Length);
            int           i  = 0;

            while (i < input.Length)
            {
                char c = input[i];
                if (c != '&')
                {
                    if (!skipWiteSpace || !HtmlConst.IsWhiteSpace(c))
                    {
                        sb.Append(c);
                    }
                    i++;
                    continue;
                }

                // Look for the terminating ';' within the maximum entity length.
                int  nameStart = i + 1;
                bool consumed  = false;
                for (int j = nameStart; j - nameStart < _maxEntLen && j < input.Length; j++)
                {
                    if (input[j] != ';')
                    {
                        continue;
                    }

                    if (j == nameStart)
                    {
                        // "&;" carries no entity name; emit it verbatim.
                        sb.Append("&;");
                        consumed = true;
                    }
                    else
                    {
                        char e = DecodeEntity(input.Substring(nameStart, j - nameStart));
                        if (e != HtmlConst.CHEMPTY)
                        {
                            sb.Append(e);
                            consumed = true;
                        }
                    }

                    if (consumed)
                    {
                        // Resume right after the ';'.
                        i = j + 1;
                    }
                    break;
                }

                if (!consumed)
                {
                    // An explicit flag is used rather than comparing scan indices: when the
                    // ampersand is the last character the scan never runs, and an index
                    // comparison would wrongly treat that as "entity consumed" and drop the '&'.
                    sb.Append('&');
                    i = nameStart;
                }
            }
            return(sb.ToString());
        }
Пример #2
0
 /// <summary>
 /// Builds the markup string for an HTML element with the given tag name.
 /// </summary>
 /// <param name="elementTagName">The tag name to render.</param>
 /// <returns>
 /// A self-closing form (<c>&lt;name /&gt;</c>) when <c>HtmlConst.IsSelfBackTag</c> reports
 /// true for the name; otherwise an empty open/close pair (<c>&lt;name&gt;&lt;/name&gt;</c>).
 /// </returns>
 private static string CreateElementStr(string elementTagName)
 {
     string template = HtmlConst.IsSelfBackTag(elementTagName)
         ? "<{0} />"
         : "<{0}></{0}>";

     return string.Format(template, elementTagName);
 }
Пример #3
0
 /// <summary>
 /// Returns a slice of a string with every character passed through <c>HtmlConst.ToLower</c>.
 /// </summary>
 /// <param name="input">The source string.</param>
 /// <param name="start">Index of the first character of the slice.</param>
 /// <param name="length">Number of characters in the slice.</param>
 /// <returns>The converted slice as a new string.</returns>
 private string GetStringToLower(string input, int start, int length)
 {
     char[] buffer = input.ToCharArray(start, length);
     int    index  = 0;

     while (index < buffer.Length)
     {
         buffer[index] = HtmlConst.ToLower(buffer[index]);
         index++;
     }

     return new string(buffer);
 }
Пример #4
0
        /// <summary>
        /// Advances the reader past consecutive characters for which
        /// <c>HtmlConst.IsWhiteSpaceWithoutlineBreak</c> returns true, stopping at the first
        /// character that does not match or when the reader reports end of input.
        /// </summary>
        internal void EatWhiteSpaceForRegex()
        {
            while (reader.NEof() && HtmlConst.IsWhiteSpaceWithoutlineBreak(reader.CurrChar()))
            {
                reader.IncPos();
            }
        }
Пример #5
0
        /// <summary>
        /// Parses a style string made of <c>key: value;</c> declarations and hands each
        /// pair to <c>Insert</c>.
        /// </summary>
        /// <param name="style">The style text to parse.</param>
        /// <returns>The <c>_dic</c> field (presumably populated by <c>Insert</c>).</returns>
        /// <remarks>
        /// Both key and value are converted with <c>GetStringToLower</c>. A key starts at the
        /// first non-whitespace character and runs up to the colon; a value starts at the first
        /// non-whitespace character after the colon and runs up to the next semicolon or the end
        /// of the string. Declarations with an empty value are not inserted.
        /// </remarks>
        public Dictionary <string, string> ParseStyleString(string style)
        {
            int keyBegin = -1;   // -1 while no key has been started
            int pos      = 0;

            while (pos < style.Length)
            {
                char current = style[pos];

                if (current != ':')
                {
                    // Outside a declaration: the first non-whitespace character starts a key.
                    if (keyBegin == -1 && !HtmlConst.IsWhiteSpace(current))
                    {
                        keyBegin = pos;
                    }
                    pos++;
                    continue;
                }

                if (keyBegin == -1)
                {
                    // A colon with no key in front of it is ignored.
                    pos++;
                    continue;
                }

                string key = this.GetStringToLower(style, keyBegin, pos - keyBegin);

                // Skip whitespace between the colon and the value.
                int valueBegin = pos + 1;
                while (valueBegin < style.Length && HtmlConst.IsWhiteSpace(style[valueBegin]))
                {
                    valueBegin++;
                }

                int semicolon = style.IndexOf(';', valueBegin);
                int valueEnd  = semicolon < 0 ? style.Length : semicolon;

                if (valueEnd > valueBegin)
                {
                    string val = this.GetStringToLower(style, valueBegin, valueEnd - valueBegin);
                    Insert(key, val);
                }

                if (semicolon < 0)
                {
                    // The value ran to the end of the string; nothing left to parse.
                    break;
                }

                keyBegin = -1;
                pos      = semicolon + 1;
            }

            return(this._dic);
        }
Пример #6
0
        /// <summary>
        /// Scans what is presumably a regular-expression literal starting at the reader's
        /// current position. The method either returns with the reader left where the scan
        /// ended, or rewinds <c>reader.pos</c> to one before the position it had on entry.
        /// </summary>
        /// <remarks>
        /// NOTE(review): the meanings of <c>HtmlConst.CHCL</c>, <c>CHTCL</c>, <c>CHLB</c> and
        /// <c>CHNL</c> are not visible here; from usage they look like the closing delimiter,
        /// the escape character and the two line-break characters respectively - confirm
        /// against <c>HtmlConst</c>.
        /// </remarks>
        internal void FormatterRegex()
        {
            char c   = HtmlConst.CHEMPTY;
            // Position to restore if the scan is abandoned (one before the entry position).
            int  pos = reader.pos - 1;

            // The regex must fit on a single line; if a line break is met, rewind instead.
            while (reader.NEof())
            {
                c = reader.CurrChar();
                switch (c)
                {
                case HtmlConst.CHCL:
                    // Stop scanning the body and go inspect what follows.
                    goto LABEL_CHECK;

                case HtmlConst.CHTCL:
                    // Advance two positions at once (presumably skipping an escaped character).
                    reader.IncPosTwice();
                    continue;

                case HtmlConst.CHLB:
                case HtmlConst.CHNL:
                    // Abandon the scan and rewind.
                    goto LABEL_TRICK;

                default:
                    break;
                }
                reader.IncPos();
            }

            // Also reached by falling out of the loop above when the reader hits end of input.
LABEL_CHECK:
            if (reader.NEofWithInc())
            {
                reader.IncPos();
                // Skip whitespace (excluding line breaks) and the letters g, i, m
                // (presumably regex flags); c ends up holding the last character read.
                do
                {
                    c = reader.CurrChar();
                    if (HtmlConst.IsWhiteSpaceWithoutlineBreak(c) || (c == 'g' || c == 'i' || c == 'm'))
                    {
                        reader.IncPos();
                        continue;
                    }
                    break;
                } while (reader.NEof());
            }


            // Decide from the last character read whether to keep the scan result.
            switch (c)
            {
            case '.':
            case ',':
            case '|':
            case '&':
            case '!':
            case '=':
            case '\r':
            case '\n':
            case ')':
                // Keep the reader where it is.
                return;

            case '<':
                // "</" also keeps the scan result, after calling reader.minPos()
                // (presumably stepping the reader back one position - TODO confirm).
                if (reader.NextChar() == '/')
                {
                    reader.minPos();
                    return;
                }
                break;

            default:
                break;
            }

            // Rewind: restore the position saved on entry.
LABEL_TRICK:
            reader.pos = pos;
            return;
        }
Пример #7
0
        /// <summary>
        /// Scans the HTML source and builds a chain of <c>HtmlTag</c> nodes linked through
        /// their <c>_prevNode</c> / <c>_nextNode</c> fields, with the first node stored in
        /// <c>_htmlElement</c>.
        /// </summary>
        /// <exception cref="ArgumentNullException">Thrown when <c>_html</c> is <c>null</c>.</exception>
        public void Parse()
        {
            if (_html == null)
            {
                throw new ArgumentNullException("HTML", "必须设置HTML源");
            }

            reader = new HtmlReader(_html);
            HtmlTag current   = null;
            HtmlTag previous  = null;
            int     textStart = -1;   // -1 while no text run is pending for the current tag

            for (; reader.NEof(); reader.IncPos())
            {
                if (!reader.IsTagStartPos())
                {
                    // Remember where the content following the current tag begins.
                    if (current != null && textStart == -1)
                    {
                        textStart = reader.pos;
                    }
                    continue;
                }

                if (current != null)
                {
                    // A new tag starts: store the collected text on the tag being left.
                    if (reader.GT(textStart))
                    {
                        current.Value = reader.GetString(textStart);
                    }
                    else
                    {
                        current.Value = string.Empty;
                    }
                    textStart = -1;
                    previous  = current;
                }

                // Create the new node and append it after the last kept node.
                current           = new HtmlTag();
                current._prevNode = previous;
                current._idDic    = _idList;
                current._document = this;
                if (previous != null)
                {
                    previous._nextNode = current;
                }
                else
                {
                    _htmlElement = current;
                }

                current.FormatterTag();
                if (HtmlConst.IsNullOrEmpty(current._tagName))
                {
                    current = null;
                    continue;
                }

                if (!current.IsBackTag && current._tagName == HtmlConst.SCRIPT)
                {
                    // For an opening script tag, mark the content start before parsing it.
                    if (textStart == -1 && reader.NEofWithInc())
                    {
                        reader.IncPos();
                        textStart = reader.pos;
                    }
                    ParseScript();
                }

                if (current._tagName[0] == HtmlConst.CHT || _rmTagDic.ContainsKey(current._tagName))
                {
                    // Tag is to be removed: drop it and unlink it from its predecessor.
                    current = null;
                    if (previous != null)
                    {
                        previous._nextNode = null;
                    }
                    else
                    {
                        _htmlElement = null;
                    }

                    textStart = -1;
                }
            }

            // If a tag is still open at the end, optionally repair important tags.
            if (current != null && _autoFixImportantTag)
            {
                current.FixImportantTag();
            }
        }