/// <summary>
/// Parses an HTML string and fills <c>children</c> with the resulting nodes.
/// </summary>
/// <remarks>
/// The input is scanned once with raw pointers. Nodes are first appended to a flat
/// list; when a closing tag is met, the nodes that follow the matching open tag are
/// moved into that tag's own <c>children</c> list. During the scan
/// <c>nodeText.FormatHtml</c> of a tag node doubles as a state flag:
/// <c>null</c> means "still open", <c>string.Empty</c> means "already closed"
/// (it is reset to <c>null</c> before the method returns).
/// NOTE(review): the method assigns the <c>TagName</c> member (not a local) as
/// scratch storage while matching tags, so its value is overwritten by a call.
/// </remarks>
/// <param name="html">HTML text to parse; must not be null (its Length is read immediately).</param>
private void create(string html)
{
    int length = html.Length;
    children = new list<htmlNode>();
    if (length < 2)
    {
        // Too short to contain any tag: keep the input as a single text node.
        children.Add(new htmlNode { nodeText = new htmlText { FormatHtml = html }, Parent = this });
    }
    else
    {
        int nextIndex, nodeCount;
        htmlNode nextNode;
        // A copy that is one character longer is pinned so the extra slot at endChar can be
        // overwritten with whatever sentinel the current scan loop needs ('<', '>' or a quote).
        // Substrings are always taken from the original html, never from this copy.
        fixed (char* htmlFixed = html + "<")
        {
            fixedMap spaceFixedMap = new fixedMap(spaceMap.Map);
            fixedMap spaceSplitFixedMap = new fixedMap(spaceSplitMap);
            fixedMap tagNameFixedMap = new fixedMap(tagNameMap);
            fixedMap tagNameSplitFixedMap = new fixedMap(tagNameSplitMap);
            fixedMap attributeSplitFixedMap = new fixedMap(attributeSplitMap);
            fixedMap attributeNameSplitFixedMap = new fixedMap(attributeNameSplitMap);
            int startIndex, tagNameLength;
            string name, htmlValue;
            // startChar: start of the not-yet-emitted text; currentChar: scan position;
            // endChar: sentinel slot one past the last real character.
            char* startChar = htmlFixed, currentChar = htmlFixed, endChar = htmlFixed + length, scriptChar;
            char splitChar;
            while (currentChar != endChar)
            {
                // Advance to the next '<' (the sentinel guarantees termination).
                for (*endChar = '<'; *currentChar != '<'; ++currentChar) ;
                if (currentChar != endChar)
                {
                    // Only a character below 0x80 may start a tag; the maps are only
                    // consulted for characters below 0x40 or 0x80 (see the masks).
                    if ((*++currentChar & 0xff80) == 0)
                    {
                        if (tagNameFixedMap.Get(*currentChar))
                        {
                            // Emit the pending text before this tag, trimmed on both sides.
                            while ((*startChar & 0xffc0) == 0 && spaceFixedMap.Get(*startChar)) ++startChar;
                            if (startChar != currentChar - 1)
                            {
                                for (scriptChar = currentChar - 2; (*scriptChar & 0xffc0) == 0 && spaceFixedMap.Get(*scriptChar); --scriptChar) ;
                                children.Add(new htmlNode { nodeText = new htmlText { FormatHtml = html.Substring((int)(startChar - htmlFixed), (int)(scriptChar - startChar) + 1) } });
                            }
                            if (*currentChar == '/')
                            {
                                // ---- closing tag ----
                                startChar = currentChar - 1;
                                if (++currentChar != endChar)
                                {
                                    while ((*currentChar & 0xffc0) == 0 && spaceFixedMap.Get(*currentChar)) ++currentChar;
                                    if (currentChar != endChar)
                                    {
                                        // ASCII letter test. Fixed: the original compared with "<= 26",
                                        // which also accepted '{' and '[' as the first letter of a name.
                                        if ((uint)((*currentChar | 0x20) - 'a') < 26)
                                        {
                                            for (*endChar = '>'; (*currentChar & 0xffc0) != 0 || !tagNameSplitFixedMap.Get(*currentChar); ++currentChar) ;
                                            // NOTE(review): the name starts right after "</", so whitespace skipped
                                            // above ("</ div>") becomes part of the name and will not match.
                                            TagName = html.Substring((int)((startChar += 2) - htmlFixed), (int)(currentChar - startChar)).toLower();
                                            // Search backwards for the nearest still-open tag with this name.
                                            for (startIndex = children.Count - 1; startIndex >= 0 && (children[startIndex].nodeText.FormatHtml != null || children[startIndex].TagName != TagName); --startIndex) ;
                                            if (startIndex != -1)
                                            {
                                                // Everything after the matching open tag is resolved first: open tags
                                                // listed in MustRoundTagNames adopt their trailing siblings, and the
                                                // "closed" marker of finished tags is cleared.
                                                for (nextIndex = children.Count - 1; nextIndex != startIndex; --nextIndex)
                                                {
                                                    nextNode = children[nextIndex];
                                                    if (nextNode.nodeText.FormatHtml == null)
                                                    {
                                                        if (web.html.MustRoundTagNames.Contains(nextNode.TagName) && (nodeCount = (children.Count - nextIndex - 1)) != 0)
                                                        {
                                                            nextNode.children = new list<htmlNode>(children.GetSub(nextIndex + 1, nodeCount), true);
                                                            children.RemoveRange(nextIndex + 1, nodeCount);
                                                            foreach (htmlNode value in nextNode.children) value.Parent = nextNode;
                                                        }
                                                    }
                                                    else if (nextNode.nodeText.FormatHtml.Length == 0) nextNode.nodeText.FormatHtml = null;
                                                }
                                                // Move the remaining followers under the matched open tag.
                                                nextNode = children[startIndex];
                                                if ((nodeCount = children.Count - ++startIndex) != 0)
                                                {
                                                    nextNode.children = new list<htmlNode>(children.GetSub(startIndex, nodeCount), true);
                                                    children.RemoveRange(startIndex, nodeCount);
                                                    foreach (htmlNode value in nextNode.children) value.Parent = nextNode;
                                                }
                                                nextNode.nodeText.FormatHtml = string.Empty; // marks the tag as closed
                                            }
                                            while (*currentChar != '>') ++currentChar;
                                            if (currentChar != endChar) ++currentChar;
                                        }
                                        else
                                        {
                                            // "</" not followed by a letter: keep the raw markup as a "/" node.
                                            for (*endChar = '>'; *currentChar != '>'; ++currentChar) ;
                                            if (currentChar != endChar) ++currentChar;
                                            htmlValue = html.Substring((int)(startChar - htmlFixed), (int)(currentChar - startChar));
                                            children.Add(new htmlNode { TagName = "/", nodeText = new htmlText { FormatHtml = htmlValue, FormatText = htmlValue } });
                                        }
                                        startChar = currentChar;
                                    }
                                }
                            }
                            else if (*currentChar != '!')
                            {
                                // ---- opening tag ----
                                startChar = currentChar;
                                children.Add(nextNode = new htmlNode());
                                for (*endChar = '>'; (*currentChar & 0xffc0) != 0 || !tagNameSplitFixedMap.Get(*currentChar); ++currentChar) ;
                                nextNode.TagName = html.Substring((int)(startChar - htmlFixed), (int)(currentChar - startChar)).toLower();
                                if (currentChar == endChar) startChar = endChar;
                                else
                                {
                                    // ---- attributes ----
                                    if (*currentChar != '>')
                                    {
                                        startChar = ++currentChar;
                                        while (currentChar != endChar)
                                        {
                                            while ((*currentChar & 0xffc0) == 0 && attributeSplitFixedMap.Get(*currentChar)) ++currentChar;
                                            if (*currentChar == '>')
                                            {
                                                if (currentChar != endChar)
                                                {
                                                    // "/>" : self-closing, mark as already closed.
                                                    if (*(currentChar - 1) == '/') nextNode.nodeText.FormatHtml = string.Empty;
                                                    startChar = ++currentChar;
                                                }
                                                break;
                                            }
                                            else
                                            {
                                                for (startChar = currentChar++; (*currentChar & 0xffc0) != 0 || !tagNameSplitFixedMap.Get(*currentChar); ++currentChar) ;
                                                // An attribute without a value keeps its own name as the value.
                                                htmlValue = name = checkName(html.Substring((int)(startChar - htmlFixed), (int)(currentChar - startChar)).toLower());
                                                if (currentChar != endChar && ((*currentChar & 0xffc0) != 0 || !attributeNameSplitFixedMap.Get(*currentChar)))
                                                {
                                                    if (*currentChar != '=')
                                                    {
                                                        while ((*currentChar & 0xffc0) == 0 && spaceFixedMap.Get(*currentChar)) ++currentChar;
                                                    }
                                                    if (*currentChar == '=')
                                                    {
                                                        while ((*++currentChar & 0xffc0) == 0 && spaceFixedMap.Get(*currentChar)) ;
                                                        if ((splitChar = *currentChar) != '>')
                                                        {
                                                            if (splitChar == '"' || splitChar == '\'')
                                                            {
                                                                // Quoted value: the sentinel temporarily becomes the quote
                                                                // so an unterminated value stops at the end of input.
                                                                for (startChar = ++currentChar, *endChar = splitChar; *currentChar != splitChar; ++currentChar) ;
                                                                *endChar = '>';
                                                            }
                                                            else
                                                            {
                                                                for (startChar = currentChar++; (*currentChar & 0xffc0) != 0 || !spaceSplitFixedMap.Get(*currentChar); ++currentChar) ;
                                                            }
                                                            htmlValue = html.Substring((int)(startChar - htmlFixed), (int)(currentChar - startChar));
                                                        }
                                                    }
                                                }
                                                if (nextNode.attributes == null) nextNode.attributes = new Dictionary<string, htmlText>();
                                                nextNode.attributes[name] = new htmlText { FormatHtml = htmlValue };
                                                if (currentChar != endChar)
                                                {
                                                    if (*currentChar == '>')
                                                    {
                                                        if (*(currentChar - 1) == '/') nextNode.nodeText.FormatHtml = string.Empty;
                                                        startChar = ++currentChar;
                                                        break;
                                                    }
                                                    startChar = ++currentChar;
                                                }
                                            }
                                        }
                                    }
                                    else startChar = ++currentChar;

                                    // ---- tags whose content is not parsed (NonanalyticTagNames) ----
                                    if (currentChar == endChar) startChar = endChar;
                                    else if (web.html.NonanalyticTagNames.Contains(TagName = nextNode.TagName))
                                    {
                                        // scriptChar will point at the "</name" that ends the raw content,
                                        // or stay at endChar when no such closing tag exists.
                                        scriptChar = endChar;
                                        tagNameLength = TagName.Length + 2;
                                        fixed (char* tagNameFixed = TagName)
                                        {
                                            while ((int)(endChar - currentChar) > tagNameLength)
                                            {
                                                for (currentChar += tagNameLength; *currentChar != '>'; ++currentChar) ;
                                                // The two chars read as one int must be '<' followed by '/'.
                                                if (currentChar != endChar && *(int*)(currentChar - tagNameLength) == (('/' << 16) + '<'))
                                                {
                                                    if (unsafer.String.EqualCase(currentChar - TagName.Length, tagNameFixed, TagName.Length))
                                                    {
                                                        scriptChar = currentChar - tagNameLength;
                                                        if (currentChar != endChar) ++currentChar;
                                                        break;
                                                    }
                                                }
                                            }
                                        }
                                        if (startChar != scriptChar)
                                        {
                                            nextNode.nodeText.FormatHtml = nextNode.nodeText.FormatText = html.Substring((int)(startChar - htmlFixed), (int)(scriptChar - startChar));
                                        }
                                        if (scriptChar == endChar) currentChar = endChar;
                                        startChar = currentChar;
                                    }
                                }
                            }
                            else
                            {
                                // ---- "<!" constructs: comments, CDATA sections, declarations ----
                                startChar = currentChar - 1;
                                if (++currentChar != endChar)
                                {
                                    *endChar = '>';
                                    // length is reused here as "characters left after <!"; the original
                                    // input length is no longer needed once endChar has been computed.
                                    if ((length = (int)(endChar - currentChar)) > 2 && *(int*)currentChar == (('-' << 16) + '-'))
                                    {
                                        // "<!--" : scan for a '>' that is directly preceded by "--".
                                        for (currentChar += 2; *currentChar != '>'; ++currentChar) ;
                                        while (currentChar != endChar && *(int*)(currentChar - 2) != (('-' << 16) + '-'))
                                        {
                                            if ((currentChar += 3) < endChar)
                                            {
                                                while (*currentChar != '>') ++currentChar;
                                            }
                                            else currentChar = endChar;
                                        }
                                    }
                                    // Fixed: OR-ing 0x20 into a code unit lower-cases an ASCII letter. The original
                                    // used '&', whose result can never equal the expected constants, so the
                                    // "[CDATA[" branch was unreachable and CDATA was cut at the first '>'.
                                    else if (length > 9 && (*(int*)currentChar | 0x200000) == ('[' + ('c' << 16)) && (*(int*)(currentChar + 2) | 0x200020) == ('d' + ('a' << 16)) && (*(int*)(currentChar + 4) | 0x200020) == ('t' + ('a' << 16)) && *(currentChar + 6) == '[')
                                    {
                                        // "<![CDATA[" : scan for a '>' that is directly preceded by "]]".
                                        for (currentChar += 9; *currentChar != '>'; ++currentChar) ;
                                        while (currentChar != endChar && *(int*)(currentChar - 2) != ((']' << 16) + ']'))
                                        {
                                            if ((currentChar += 3) < endChar)
                                            {
                                                while (*currentChar != '>') ++currentChar;
                                            }
                                            else currentChar = endChar;
                                        }
                                    }
                                    else
                                    {
                                        while (*currentChar != '>') ++currentChar;
                                    }
                                    if (currentChar != endChar) ++currentChar;
                                }
                                // Fixed: the original added 1 to the length when the construct was unterminated
                                // (currentChar == endChar), which made startIndex + length exceed html.Length
                                // and caused Substring to throw ArgumentOutOfRangeException.
                                htmlValue = html.Substring((int)(startChar - htmlFixed), (int)(currentChar - startChar));
                                children.Add(new htmlNode { TagName = "!", nodeText = new htmlText { FormatHtml = htmlValue, FormatText = htmlValue } });
                                startChar = currentChar;
                            }
                        }
                    }
                    else ++currentChar;
                }
            }
            // Trailing text after the last tag, trimmed on both sides.
            if (startChar != endChar)
            {
                *endChar = '>';
                while ((*startChar & 0xffc0) == 0 && spaceFixedMap.Get(*startChar)) ++startChar;
                if (startChar != endChar)
                {
                    for (scriptChar = endChar - 1; (*scriptChar & 0xffc0) == 0 && spaceFixedMap.Get(*scriptChar); --scriptChar) ;
                    children.Add(new htmlNode { nodeText = new htmlText { FormatHtml = html.Substring((int)(startChar - htmlFixed), (int)(scriptChar - startChar) + 1) } });
                }
            }
        }
        // Final pass over the top level: tags that were never closed but are listed in
        // MustRoundTagNames adopt the nodes that follow them; "closed" markers are cleared.
        for (nextIndex = children.Count - 1; nextIndex != -1; nextIndex--)
        {
            nextNode = children[nextIndex];
            if (nextNode.nodeText.FormatHtml == null)
            {
                if (web.html.MustRoundTagNames.Contains(nextNode.TagName) && (nodeCount = (children.Count - nextIndex - 1)) != 0)
                {
                    nextNode.children = new list<htmlNode>(children.GetSub(nextIndex + 1, nodeCount), true);
                    children.RemoveRange(nextIndex + 1, nodeCount);
                    // Fixed: the original iterated the top-level list here, so the nodes that
                    // were just moved under nextNode never received nextNode as their Parent.
                    foreach (htmlNode value in nextNode.children) value.Parent = nextNode;
                }
            }
            else if (nextNode.nodeText.FormatHtml.Length == 0) nextNode.nodeText.FormatHtml = null;
        }
        foreach (htmlNode value in children) value.Parent = this;
    }
}