/// <summary>
/// Decodes the HTML character entities (<c>&amp;name;</c>) contained in a string.
/// </summary>
/// <param name="input">The text to decode. A null reference is returned unchanged.</param>
/// <param name="skipWiteSpace">
/// When true, literal characters for which <c>HtmlConst.IsWhiteSpace</c> returns true are
/// left out of the result. Characters produced by decoding an entity are always kept.
/// </param>
/// <returns>
/// The decoded text. An ampersand that does not start an entity accepted by
/// <c>DecodeEntity</c> is copied through literally, including one at the very end of the input.
/// </returns>
public static string DecodeHTMLString(string input, bool skipWiteSpace)
{
    if (input == null)
    {
        return input;
    }

    StringBuilder sb = new StringBuilder(input.Length);
    int i = 0;
    while (i < input.Length)
    {
        char c = input[i];
        if (c != '&')
        {
            if (!skipWiteSpace || !HtmlConst.IsWhiteSpace(c))
            {
                sb.Append(c);
            }
            i++;
            continue;
        }

        // Step over the '&' and look for the closing ';' within the next _maxEntLen characters.
        i++;
        int j = i;

        // An explicit flag is used instead of comparing i with j: when '&' is the last
        // character the scan below never runs, i equals j, and the ampersand used to be lost.
        bool consumed = false;
        while (j - i < _maxEntLen && j < input.Length)
        {
            if (input[j] == ';')
            {
                if (j == i)
                {
                    // "&;" carries no entity name; copy it through as-is.
                    sb.Append("&;");
                    consumed = true;
                }
                else
                {
                    char e = DecodeEntity(new string(input.ToCharArray(i, j - i)));
                    if (e != HtmlConst.CHEMPTY)
                    {
                        sb.Append(e);
                        consumed = true;
                    }
                }
                break;
            }
            j++;
        }

        if (consumed)
        {
            // Resume right after the ';'.
            i = j + 1;
        }
        else
        {
            // Not an entity: emit the '&' and let the following characters be handled as
            // ordinary text (i already points just past the '&').
            sb.Append('&');
        }
    }
    return sb.ToString();
}
/// <summary>
/// Builds the markup text for an empty element with the given tag name.
/// </summary>
/// <param name="elementTagName">Tag name to place inside the markup.</param>
/// <returns>
/// <c>&lt;name /&gt;</c> when <c>HtmlConst.IsSelfBackTag</c> returns true for the name,
/// otherwise an open/close pair <c>&lt;name&gt;&lt;/name&gt;</c>.
/// </returns>
private static string CreateElementStr(string elementTagName)
{
    // Pick the template first, then format once.
    string template = HtmlConst.IsSelfBackTag(elementTagName)
        ? "<{0} />"
        : "<{0}></{0}>";
    return string.Format(template, elementTagName);
}
/// <summary>
/// Copies a section of a string and converts every character of the copy with
/// <c>HtmlConst.ToLower</c>.
/// </summary>
/// <param name="input">Source string.</param>
/// <param name="start">Index of the first character to copy.</param>
/// <param name="length">Number of characters to copy.</param>
/// <returns>A new string holding the converted characters.</returns>
private string GetStringToLower(string input, int start, int length)
{
    char[] buffer = input.ToCharArray(start, length);
    int index = 0;
    while (index < buffer.Length)
    {
        buffer[index] = HtmlConst.ToLower(buffer[index]);
        index++;
    }
    return new string(buffer);
}
/// <summary>
/// Advances the reader past consecutive characters for which
/// <c>HtmlConst.IsWhiteSpaceWithoutlineBreak</c> returns true. Stops at the first character
/// that does not qualify, or as soon as <c>reader.NEof()</c> returns false.
/// </summary>
internal void EatWhiteSpaceForRegex()
{
    while (reader.NEof() && HtmlConst.IsWhiteSpaceWithoutlineBreak(reader.CurrChar()))
    {
        reader.IncPos();
    }
}
/// <summary>
/// Parses an inline style string of the form <c>name: value; name: value</c> and passes each
/// declaration to <c>Insert</c>.
/// </summary>
/// <param name="style">
/// The style text. A null reference is treated as containing no declarations.
/// </param>
/// <returns>The instance's <c>_dic</c> dictionary.</returns>
/// <remarks>
/// Both the property name and the value are passed through <c>GetStringToLower</c> and have
/// surrounding whitespace removed. A declaration with an empty value is skipped, and a
/// fragment that ends with ';' before any ':' is discarded.
/// NOTE(review): lower-casing the value also changes case-sensitive content such as URLs;
/// kept as-is for compatibility - confirm whether callers rely on it.
/// </remarks>
public Dictionary<string, string> ParseStyleString(string style)
{
    if (style == null)
    {
        return this._dic;
    }

    int i = 0;
    int keyStartIndex = -1;
    while (i < style.Length)
    {
        char c = style[i];
        if (c != ':')
        {
            if (c == ';')
            {
                // The fragment ended without a ':'; drop it so it is not glued onto the
                // next property name.
                keyStartIndex = -1;
            }
            else if (keyStartIndex == -1 && !HtmlConst.IsWhiteSpace(c))
            {
                keyStartIndex = i;
            }
            i++;
            continue;
        }

        if (keyStartIndex == -1)
        {
            // A ':' with no property name in front of it is ignored.
            i++;
            continue;
        }

        // Property name: from its first non-blank character up to the ':', minus trailing
        // whitespace, so that "color : red" is stored under "color".
        int keyEnd = i;
        while (keyEnd > keyStartIndex && HtmlConst.IsWhiteSpace(style[keyEnd - 1]))
        {
            keyEnd--;
        }
        string key = this.GetStringToLower(style, keyStartIndex, keyEnd - keyStartIndex);
        keyStartIndex = -1;

        // Skip the ':' and any whitespace before the value.
        i++;
        while (i < style.Length && HtmlConst.IsWhiteSpace(style[i]))
        {
            i++;
        }

        // The value runs to the next ';' or, failing that, to the end of the text.
        int valEnd = style.IndexOf(';', i);
        if (valEnd < 0)
        {
            valEnd = style.Length;
        }

        int trimmedEnd = valEnd;
        while (trimmedEnd > i && HtmlConst.IsWhiteSpace(style[trimmedEnd - 1]))
        {
            trimmedEnd--;
        }

        if (trimmedEnd > i)
        {
            string val = this.GetStringToLower(style, i, trimmedEnd - i);
            Insert(key, val);
        }

        // Continue after the ';' (or past the end, which terminates the loop).
        i = valEnd + 1;
    }
    return this._dic;
}
/// <summary>
/// Scans forward from the reader's current position over what is presumably a regular
/// expression literal. When the scan does not end in one of the accepted ways, the reader
/// position is reset to the value it had on entry minus one.
/// </summary>
/// <remarks>
/// NOTE(review): the meaning of <c>HtmlConst.CHCL</c>, <c>CHTCL</c>, <c>CHLB</c> and
/// <c>CHNL</c> is not visible here; they look like the closing delimiter, the escape
/// character and the two line-break characters - confirm against <c>HtmlConst</c>.
/// </remarks>
internal void FormatterRegex()
{
    char c = HtmlConst.CHEMPTY;
    // Position to fall back to if the scan is abandoned.
    int pos = reader.pos - 1;

    // A regex must stay on a single line; when a line break is met the scan is abandoned
    // and the position is rewound (LABEL_TRICK).
    while (reader.NEof())
    {
        c = reader.CurrChar();
        switch (c)
        {
        case HtmlConst.CHCL:
            // Candidate end of the literal: go validate what follows it.
            goto LABEL_CHECK;

        case HtmlConst.CHTCL:
            // Advance by two positions in one step (presumably an escape sequence).
            reader.IncPosTwice();
            continue;

        case HtmlConst.CHLB:
        case HtmlConst.CHNL:
            goto LABEL_TRICK;

        default:
            break;
        }
        reader.IncPos();
    }

    // Also reached by falling out of the loop when reader.NEof() returns false.
LABEL_CHECK:
    if (reader.NEofWithInc())
    {
        reader.IncPos();
        // Consume non-line-break whitespace and the letters 'g', 'i', 'm' (presumably regex
        // flags). 'continue' jumps to the NEof() test of the do/while. On exit c holds the
        // last character read.
        do
        {
            c = reader.CurrChar();
            if (HtmlConst.IsWhiteSpaceWithoutlineBreak(c) || (c == 'g' || c == 'i' || c == 'm'))
            {
                reader.IncPos();
                continue;
            }
            break;
        } while (reader.NEof());
    }

    // Decide from the last character read whether to keep the advanced position.
    // If the block above was skipped, c is still whatever the scan loop read last
    // (or CHEMPTY when the loop never ran).
    switch (c)
    {
    case '.':
    case ',':
    case '|':
    case '&':
    case '!':
    case '=':
    case '\r':
    case '\n':
    case ')':
        // Accepted follower: keep the reader where it is.
        return;

    case '<':
        // "</" is also accepted; reader.minPos() is called first (presumably steps back one
        // position - confirm in the reader).
        if (reader.NextChar() == '/')
        {
            reader.minPos();
            return;
        }
        break;

    default:
        break;
    }

    // Rewind: restore the position saved on entry.
LABEL_TRICK:
    reader.pos = pos;
    return;
}
/// <summary>
/// Scans the HTML source held in <c>_html</c> and builds a chain of <c>HtmlTag</c> nodes
/// linked through <c>_prevNode</c> / <c>_nextNode</c>, with the first node stored in
/// <c>_htmlElement</c>.
/// </summary>
/// <exception cref="ArgumentNullException">Thrown when <c>_html</c> is null.</exception>
public void Parse()
{
    if (_html == null)
    {
        throw new ArgumentNullException("HTML", "必须设置HTML源");
    }
    reader = new HtmlReader(_html);
    // Most recently created tag that is still part of the chain (null after a tag was discarded).
    HtmlTag tag = null;
    // Node that the next created tag will be linked after.
    HtmlTag prev = null;
    // Reader position where the text following the current tag began; -1 when no text is pending.
    int startIndex = -1;
    while (reader.NEof())
    {
        if (reader.IsTagStartPos())
        {
            if (tag == null)
            {
                // The previous tag was discarded; continue the chain from the last kept node.
                tag = prev;
            }
            else
            {
                // Close the text run that followed the current tag and store it as its Value.
                // NOTE(review): reader.GT(startIndex) presumably tests that the reader is past
                // startIndex - confirm in HtmlReader.
                if (reader.GT(startIndex))
                {
                    tag.Value = reader.GetString(startIndex);
                    startIndex = -1;
                }
                else
                {
                    tag.Value = string.Empty;
                    startIndex = -1;
                }
            }
            prev = tag;
            tag = new HtmlTag();
            tag._prevNode = prev;
            tag._idDic = _idList;
            tag._document = this;
            if (prev == null)
            {
                // First node of the document.
                _htmlElement = tag;
            }
            else
            {
                prev._nextNode = tag;
            }
            // Let the tag read its own markup (presumably from the shared reader - confirm).
            tag.FormatterTag();
            if (HtmlConst.IsNullOrEmpty(tag._tagName))
            {
                // No tag name was produced: drop the reference. The links set above are left
                // in place and are overwritten when the next tag is created.
                tag = null;
            }
            else
            {
                if (!tag.IsBackTag)
                {
                    if (tag._tagName == HtmlConst.SCRIPT)
                    {
                        // For an opening script tag, mark where its content starts and hand
                        // the body to ParseScript().
                        if (startIndex == -1)
                        {
                            if (reader.NEofWithInc())
                            {
                                reader.IncPos();
                                startIndex = reader.pos;
                            }
                        }
                        ParseScript();
                    }
                }
                // Tags whose name starts with HtmlConst.CHT, or whose name is listed in
                // _rmTagDic, are removed from the chain again.
                if (tag._tagName[0] == HtmlConst.CHT || _rmTagDic.ContainsKey(tag._tagName))
                {
                    tag = null;
                    if (prev == null)
                    {
                        _htmlElement = null;
                    }
                    else
                    {
                        prev._nextNode = null;
                    }
                    startIndex = -1;
                }
            }
        }
        else
        {
            // Not at a tag start: remember where the text after the current tag begins.
            if (tag != null)
            {
                if (startIndex == -1)
                {
                    startIndex = reader.pos;
                }
            }
        }
        reader.IncPos();
    }
    // If a tag is still current at the end of the input, repair some important tags
    // (only when _autoFixImportantTag is set).
    if (tag != null && _autoFixImportantTag)
    {
        tag.FixImportantTag();
    }
}