Beispiel #1
0
        /// <summary>
        /// Parses an HTML string into the child node list of this node.
        /// </summary>
        /// <remarks>
        /// The text is scanned through a pinned copy that carries one extra trailing character.
        /// That character is rewritten to whatever the current inner loop searches for, so the
        /// inner loops terminate without explicit bounds checks.
        /// While parsing, nodeText.FormatHtml of an element node doubles as a state flag:
        /// null means "not closed yet", string.Empty means "closed" (reset to null before returning).
        /// The TagName member of this instance is overwritten as scratch storage during parsing.
        /// </remarks>
        /// <param name="html">HTML source text; its Length is read immediately, so it must not be null.</param>
        private void create(string html)
        {
            int length = html.Length;
            children = new list<htmlNode>();
            if (length < 2)
            {
                // Inputs of fewer than two characters are stored verbatim as a single text node.
                children.Add(new htmlNode { nodeText = new htmlText { FormatHtml = html }, Parent = this });
            }
            else
            {
                int nextIndex, nodeCount;
                htmlNode nextNode;
                // Pin html plus one sentinel slot; *endChar is the sentinel and is rewritten as needed.
                fixed (char* htmlFixed = html + "<")
                {
                    // NOTE(review): the maps are declared elsewhere; judging by their use here they are
                    // lookup tables that are only consulted for code units below 64 (see the 0xffc0 tests).
                    fixedMap spaceFixedMap = new fixedMap(spaceMap.Map);
                    fixedMap spaceSplitFixedMap = new fixedMap(spaceSplitMap);
                    fixedMap tagNameFixedMap = new fixedMap(tagNameMap);
                    fixedMap tagNameSplitFixedMap = new fixedMap(tagNameSplitMap);
                    fixedMap attributeSplitFixedMap = new fixedMap(attributeSplitMap);
                    fixedMap attributeNameSplitFixedMap = new fixedMap(attributeNameSplitMap);
                    int startIndex, tagNameLength;
                    string name, htmlValue;
                    // startChar: start of the not-yet-emitted text; currentChar: scan cursor; endChar: sentinel slot.
                    char* startChar = htmlFixed, currentChar = htmlFixed, endChar = htmlFixed + length, scriptChar;
                    char splitChar;
                    while (currentChar != endChar)
                    {
                        // Find the next '<' (the sentinel guarantees termination).
                        for (*endChar = '<'; *currentChar != '<'; ++currentChar) ;
                        if (currentChar != endChar)
                        {
                            // Only an ASCII character (< 128) directly after '<' can start markup.
                            if ((*++currentChar & 0xff80) == 0)
                            {
                                if (tagNameFixedMap.Get(*currentChar))
                                {
                                    // Emit the text preceding the tag, trimmed at both ends of characters
                                    // flagged in spaceMap (presumably whitespace).
                                    while ((*startChar & 0xffc0) == 0 && spaceFixedMap.Get(*startChar)) ++startChar;
                                    if (startChar != currentChar - 1)
                                    {
                                        for (scriptChar = currentChar - 2; (*scriptChar & 0xffc0) == 0 && spaceFixedMap.Get(*scriptChar); --scriptChar) ;
                                        children.Add(new htmlNode { nodeText = new htmlText { FormatHtml = html.Substring((int)(startChar - htmlFixed), (int)(scriptChar - startChar) + 1) } });
                                    }
                                    if (*currentChar == '/')
                                    {
                                        #region Closing tag
                                        startChar = currentChar - 1;
                                        if (++currentChar != endChar)
                                        {
                                            while ((*currentChar & 0xffc0) == 0 && spaceFixedMap.Get(*currentChar)) ++currentChar;
                                            if (currentChar != endChar)
                                            {
                                                // OR-ing 0x20 folds ASCII upper case to lower case; accept exactly 'a'..'z'.
                                                if ((uint)((*currentChar | 0x20) - 'a') < 26)
                                                {
                                                    for (*endChar = '>'; (*currentChar & 0xffc0) != 0 || !tagNameSplitFixedMap.Get(*currentChar); ++currentChar) ;
                                                    // startChar pointed at '<'; skip "</" to reach the name.
                                                    TagName = html.Substring((int)((startChar += 2) - htmlFixed), (int)(currentChar - startChar)).toLower();
                                                    // Search backwards for the nearest still-open element with the same name.
                                                    for (startIndex = children.Count - 1; startIndex >= 0 && (children[startIndex].nodeText.FormatHtml != null || children[startIndex].TagName != TagName); --startIndex) ;
                                                    if (startIndex != -1)
                                                    {
                                                        // Between the match and the end: open elements listed in MustRoundTagNames
                                                        // adopt everything after them; "closed" markers are reset to null.
                                                        for (nextIndex = children.Count - 1; nextIndex != startIndex; --nextIndex)
                                                        {
                                                            nextNode = children[nextIndex];
                                                            if (nextNode.nodeText.FormatHtml == null)
                                                            {
                                                                if (web.html.MustRoundTagNames.Contains(nextNode.TagName) && (nodeCount = (children.Count - nextIndex - 1)) != 0)
                                                                {
                                                                    nextNode.children = new list<htmlNode>(children.GetSub(nextIndex + 1, nodeCount), true);
                                                                    children.RemoveRange(nextIndex + 1, nodeCount);
                                                                    foreach (htmlNode value in nextNode.children) value.Parent = nextNode;
                                                                }
                                                            }
                                                            else if (nextNode.nodeText.FormatHtml.Length == 0) nextNode.nodeText.FormatHtml = null;
                                                        }
                                                        // The matched element adopts every node that follows it.
                                                        nextNode = children[startIndex];
                                                        if ((nodeCount = children.Count - ++startIndex) != 0)
                                                        {
                                                            nextNode.children = new list<htmlNode>(children.GetSub(startIndex, nodeCount), true);
                                                            children.RemoveRange(startIndex, nodeCount);
                                                            foreach (htmlNode value in nextNode.children) value.Parent = nextNode;
                                                        }
                                                        nextNode.nodeText.FormatHtml = string.Empty;// marks the element as closed
                                                    }
                                                    while (*currentChar != '>') ++currentChar;
                                                    if (currentChar != endChar) ++currentChar;
                                                }
                                                else
                                                {
                                                    // "</" not followed by a letter: keep the raw markup as a "/" node.
                                                    for (*endChar = '>'; *currentChar != '>'; ++currentChar) ;
                                                    if (currentChar != endChar) ++currentChar;
                                                    htmlValue = html.Substring((int)(startChar - htmlFixed), (int)(currentChar - startChar));
                                                    children.Add(new htmlNode { TagName = "/", nodeText = new htmlText { FormatHtml = htmlValue, FormatText = htmlValue } });
                                                }
                                                startChar = currentChar;
                                            }
                                        }
                                        #endregion
                                    }
                                    else if (*currentChar != '!')
                                    {
                                        #region Opening tag
                                        startChar = currentChar;
                                        children.Add(nextNode = new htmlNode());
                                        for (*endChar = '>'; (*currentChar & 0xffc0) != 0 || !tagNameSplitFixedMap.Get(*currentChar); ++currentChar) ;
                                        nextNode.TagName = html.Substring((int)(startChar - htmlFixed), (int)(currentChar - startChar)).toLower();
                                        if (currentChar == endChar) startChar = endChar;
                                        else
                                        {
                                            #region Attribute parsing
                                            if (*currentChar != '>')
                                            {
                                                startChar = ++currentChar;
                                                while (currentChar != endChar)
                                                {
                                                    while ((*currentChar & 0xffc0) == 0 && attributeSplitFixedMap.Get(*currentChar)) ++currentChar;
                                                    if (*currentChar == '>')
                                                    {
                                                        if (currentChar != endChar)
                                                        {
                                                            // "/>" : self-closing, flag the element as closed.
                                                            if (*(currentChar - 1) == '/') nextNode.nodeText.FormatHtml = string.Empty;
                                                            startChar = ++currentChar;
                                                        }
                                                        break;
                                                    }
                                                    else
                                                    {
                                                        for (startChar = currentChar++; (*currentChar & 0xffc0) != 0 || !tagNameSplitFixedMap.Get(*currentChar); ++currentChar) ;
                                                        // An attribute without a value gets its own name as the value.
                                                        htmlValue = name = checkName(html.Substring((int)(startChar - htmlFixed), (int)(currentChar - startChar)).toLower());
                                                        if (currentChar != endChar && ((*currentChar & 0xffc0) != 0 || !attributeNameSplitFixedMap.Get(*currentChar)))
                                                        {
                                                            if (*currentChar != '=')
                                                            {
                                                                while ((*currentChar & 0xffc0) == 0 && spaceFixedMap.Get(*currentChar)) ++currentChar;
                                                            }
                                                            if (*currentChar == '=')
                                                            {
                                                                while ((*++currentChar & 0xffc0) == 0 && spaceFixedMap.Get(*currentChar)) ;
                                                                if ((splitChar = *currentChar) != '>')
                                                                {
                                                                    if (splitChar == '"' || splitChar == '\'')
                                                                    {
                                                                        // Quoted value: temporarily use the quote as sentinel, then restore '>'.
                                                                        for (startChar = ++currentChar, *endChar = splitChar; *currentChar != splitChar; ++currentChar) ;
                                                                        *endChar = '>';
                                                                    }
                                                                    else
                                                                    {
                                                                        for (startChar = currentChar++; (*currentChar & 0xffc0) != 0 || !spaceSplitFixedMap.Get(*currentChar); ++currentChar) ;
                                                                    }
                                                                    htmlValue = html.Substring((int)(startChar - htmlFixed), (int)(currentChar - startChar));
                                                                }
                                                            }
                                                        }
                                                        if (nextNode.attributes == null) nextNode.attributes = new Dictionary<string, htmlText>();
                                                        nextNode.attributes[name] = new htmlText { FormatHtml = htmlValue };
                                                        if (currentChar != endChar)
                                                        {
                                                            if (*currentChar == '>')
                                                            {
                                                                if (*(currentChar - 1) == '/') nextNode.nodeText.FormatHtml = string.Empty;
                                                                startChar = ++currentChar;
                                                                break;
                                                            }
                                                            startChar = ++currentChar;
                                                        }
                                                    }
                                                }
                                            }
                                            else startChar = ++currentChar;
                                            #endregion

                                            #region Non-parsed tags
                                            // For tags listed in NonanalyticTagNames the content up to the matching
                                            // "</name>" is stored as raw text instead of being parsed.
                                            if (currentChar == endChar) startChar = endChar;
                                            else if (web.html.NonanalyticTagNames.Contains(TagName = nextNode.TagName))
                                            {
                                                scriptChar = endChar;
                                                tagNameLength = TagName.Length + 2;
                                                fixed (char* tagNameFixed = TagName)
                                                {
                                                    while ((int)(endChar - currentChar) > tagNameLength)
                                                    {
                                                        // A closing tag needs at least "</name" before its '>'.
                                                        for (currentChar += tagNameLength; *currentChar != '>'; ++currentChar) ;
                                                        // Two UTF-16 units read as one int: low half '<', high half '/'.
                                                        if (currentChar != endChar && *(int*)(currentChar - tagNameLength) == (('/' << 16) + '<'))
                                                        {
                                                            if (unsafer.String.EqualCase(currentChar - TagName.Length, tagNameFixed, TagName.Length))
                                                            {
                                                                scriptChar = currentChar - tagNameLength;
                                                                if (currentChar != endChar) ++currentChar;
                                                                break;
                                                            }
                                                        }
                                                    }
                                                }
                                                if (startChar != scriptChar)
                                                {
                                                    nextNode.nodeText.FormatHtml = nextNode.nodeText.FormatText = html.Substring((int)(startChar - htmlFixed), (int)(scriptChar - startChar));
                                                }
                                                // No closing tag found: the raw content runs to the end of the input.
                                                if (scriptChar == endChar) currentChar = endChar;
                                                startChar = currentChar;
                                            }
                                            #endregion
                                        }
                                        #endregion
                                    }
                                    else
                                    {
                                        #region Comment / declaration
                                        startChar = currentChar - 1;
                                        if (++currentChar != endChar)
                                        {
                                            *endChar = '>';
                                            if ((length = (int)(endChar - currentChar)) > 2 && *(int*)currentChar == (('-' << 16) + '-'))
                                            {
                                                // "<!--" : advance to the first '>' that is preceded by "--".
                                                for (currentChar += 2; *currentChar != '>'; ++currentChar) ;
                                                while (currentChar != endChar && *(int*)(currentChar - 2) != (('-' << 16) + '-'))
                                                {
                                                    if ((currentChar += 3) < endChar)
                                                    {
                                                        while (*currentChar != '>') ++currentChar;
                                                    }
                                                    else currentChar = endChar;
                                                }
                                            }
                                            // "<![CDATA[" : OR-ing 0x20 into a half folds that letter to lower case,
                                            // making the keyword comparison case-insensitive.
                                            else if (length > 9
                                                && (*(int*)currentChar | 0x200000) == ('[' + ('c' << 16))
                                                && (*(int*)(currentChar + 2) | 0x200020) == ('d' + ('a' << 16))
                                                && (*(int*)(currentChar + 4) | 0x200020) == ('t' + ('a' << 16))
                                                && *(currentChar + 6) == '[')
                                            {
                                                // Advance to the first '>' that is preceded by "]]".
                                                for (currentChar += 9; *currentChar != '>'; ++currentChar) ;
                                                while (currentChar != endChar && *(int*)(currentChar - 2) != ((']' << 16) + ']'))
                                                {
                                                    if ((currentChar += 3) < endChar)
                                                    {
                                                        while (*currentChar != '>') ++currentChar;
                                                    }
                                                    else currentChar = endChar;
                                                }
                                            }
                                            else
                                            {
                                                while (*currentChar != '>') ++currentChar;
                                            }
                                            if (currentChar != endChar) ++currentChar;
                                        }
                                        // currentChar is either just past a real '>' or equal to endChar, so the span
                                        // [startChar, currentChar) always lies inside html.
                                        htmlValue = html.Substring((int)(startChar - htmlFixed), (int)(currentChar - startChar));
                                        children.Add(new htmlNode { TagName = "!", nodeText = new htmlText { FormatHtml = htmlValue, FormatText = htmlValue } });
                                        startChar = currentChar;
                                        #endregion
                                    }
                                }
                            }
                            else ++currentChar;
                        }
                    }
                    // Emit the trimmed text that follows the last tag, if any.
                    if (startChar != endChar)
                    {
                        *endChar = '>';
                        while ((*startChar & 0xffc0) == 0 && spaceFixedMap.Get(*startChar)) ++startChar;
                        if (startChar != endChar)
                        {
                            for (scriptChar = endChar - 1; (*scriptChar & 0xffc0) == 0 && spaceFixedMap.Get(*scriptChar); --scriptChar) ;
                            children.Add(new htmlNode { nodeText = new htmlText { FormatHtml = html.Substring((int)(startChar - htmlFixed), (int)(scriptChar - startChar) + 1) } });
                        }
                    }
                }
                // Final pass over the top level: still-open elements listed in MustRoundTagNames adopt the
                // nodes that follow them, and "closed" markers are reset to null.
                for (nextIndex = children.Count - 1; nextIndex != -1; nextIndex--)
                {
                    nextNode = children[nextIndex];
                    if (nextNode.nodeText.FormatHtml == null)
                    {
                        if (web.html.MustRoundTagNames.Contains(nextNode.TagName)
                            && (nodeCount = (children.Count - nextIndex - 1)) != 0)
                        {
                            nextNode.children = new list<htmlNode>(children.GetSub(nextIndex + 1, nodeCount), true);
                            children.RemoveRange(nextIndex + 1, nodeCount);
                            // The adopted nodes (not the remaining top-level ones) become children of nextNode.
                            foreach (htmlNode value in nextNode.children) value.Parent = nextNode;
                        }
                    }
                    else if (nextNode.nodeText.FormatHtml.Length == 0) nextNode.nodeText.FormatHtml = null;
                }
                foreach (htmlNode value in children) value.Parent = this;
            }
        }