Esempio n. 1
0
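        /// <summary>
        /// Internal: parses the tag that starts at the current position, i.e. immediately after the opening '&lt;'
        /// </summary>
        /// <param name="curPos">Offset into the HTML data of the character following '&lt;'; advanced as the tag is consumed</param>
        /// <returns>HTMLchunk with tag information</returns>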
        internal HTMLchunk ParseTag(ref int curPos)
        {
            /*
             *  WARNING: this code was optimised for performance rather than for readability,
             *  so be extremely careful at changing it -- your changes could easily result in wrongly parsed HTML
             *
             *  This routine takes about 60% of CPU time, in theory it's the best place to gain extra speed,
             *  but I've spent plenty of time doing it, so it won't be easy... and if it is easy then please post
             *  your changes for everyone to enjoy!
             * */

            //var whiteSpaceHere = false;

            //var paramValue = false;
            byte c     = 0;
            byte cPeek = 0;

            // if true it means we have parsed complete tag
            //var gotTag = false;

            //var equalIdx = 0;

            // we reach this function immediately after tag's byte (<) was
            // detected, so we need to save it in order to keep correct HTML copy
            // _hunk.Append((byte)'<'); // (byte)'<'

            /*
             * _chunk.Buffer[0] = 60;
             * _chunk.BufPos = 1;
             * _chunk.HTMLen = 1;
             */

            // initialise peeked char - this will point to the next after < character
            if (curPos < _dataLength)
            {
                cPeek = _html[curPos];
                // in case of comments ! must follow immediately after <
                if (cPeek == (byte)'!')
                {
                    if (curPos + 2 < _dataLength &&
                        _html[curPos + 1] == (byte)'-' && _html[curPos + 2] == (byte)'-')
                    {
                        // we detected start of comments here, instead of parsing the rest here we will
                        // call special function tuned to do the job much more effectively
                        _chunk.Tag      = "!--";
                        _chunk.Type     = HTMLchunkType.Comment;
                        _chunk.Comments = true;
                        // _chunk.Append((byte)'!');
                        // _chunk.Append((byte)'-');
                        // _chunk.Append((byte)'-');
                        curPos            += 3;
                        _chunk             = ParseComments(ref curPos, out bool fullTag);
                        _chunk.ChunkLength = curPos - _chunk.ChunkOffset;
                        if (_p.AutoKeepComments || _p.KeepRawHTML)
                        {
                            if (!_p.AutoExtractBetweenTagsOnly)
                            {
                                _chunk.Html = GetString(_chunk.ChunkOffset, _chunk.ChunkLength);
                            }
                            else
                            {
                                _chunk.Html = GetString(_chunk.ChunkOffset + 4, _chunk.ChunkLength - fullTag ? 7 : 4);
                            }
                        }
                        return(_chunk);
                    }

                    // ok we might have here CDATA element of XML:
                    // ref: http://www.w3schools.com/xml/xml_cdata.asp
                    if (curPos + 7 < _dataLength &&
                        _html[curPos + 1] == (byte)'[' &&
                        _html[curPos + 2] == (byte)'C' &&
                        _html[curPos + 3] == (byte)'D' &&
                        _html[curPos + 4] == (byte)'A' &&
                        _html[curPos + 5] == (byte)'T' &&
                        _html[curPos + 6] == (byte)'A' &&
                        _html[curPos + 7] == (byte)'[')
                    {
                        // we detected the start of a CDATA section here; instead of parsing the rest here we will
                        // call a special function tuned to do the job much more efficiently
                        _chunk.Tag      = "![CDATA[";
                        _chunk.Type     = HTMLchunkType.Comment;
                        _chunk.Comments = true;
                        // _chunk.Append((byte)'!');
                        // _chunk.Append((byte)'-');
                        // _chunk.Append((byte)'-');
                        curPos            += 8;
                        _chunk             = ParseCDATA(ref curPos, out bool fullTag);
                        _chunk.ChunkLength = curPos - _chunk.ChunkOffset;
                        if (_p.AutoKeepComments || _p.KeepRawHTML)
                        {
                            if (!_p.AutoExtractBetweenTagsOnly)
                            {
                                _chunk.Html = GetString(_chunk.ChunkOffset, _chunk.ChunkLength);
                            }
                            else
                            {
                                _chunk.Html = GetString(_chunk.ChunkOffset + 4 + 5, _chunk.ChunkLength - fullTag ? 7 + 5 : 4 + 5);
                            }
                        }
                        return(_chunk);
                    }
                }
            }
            else
            {
                // empty tag but its not closed, so we will call it open...
                _chunk.Type = HTMLchunkType.OpenTag;
                // end of data... before it started
                return(_chunk);
            }

            // tag ID, non-zero if matched by heuristics engine
            var tagId = 0;

            // STAGE 0: lets try some heuristics to see if we can quickly identify most common tags
            // that should be present most of the time, this should save a lot of looping and string creation
            if (EnableHeuristics && curPos < _maxHeuDataLength)
            {
                // check if we have got closure of the tag
                if (cPeek == (byte)'/')
                {
                    _chunk.Closure    = true;
                    _chunk.EndClosure = false;
                    _chunk.Type       = HTMLchunkType.CloseTag;
                    curPos++;
                    cPeek = _html[curPos];
                }
                c = _html[curPos + 1];
                // probability of having a match is very high (or so we expect)
                tagId = _he.MatchTag(cPeek, c);
                if (tagId != 0)
                {
                    if (tagId < 0)
                    {
                        tagId *= -1;
                        // single character tag
                        _chunk.Tag = _he.GetString(tagId);
                        // see if we got fully closed tag
                        if (c == (byte)'>')
                        {
                            curPos += 2;
                            goto ReturnChunk;
                        }
                        cPeek = c;
                        curPos++;
                        // everything else means we need to continue scanning as we may have params and stuff
                        goto AttributeParsing;
                    }
                    else
                    {
                        // ok, we have here a string of 2 or more characters that we need to check further;
                        // often when we have a full 2 char match the next char will be >, and if that's the case
                        // then we have definitely matched our tag
                        var nextChar = _html[curPos + 2];
                        if (nextChar == (byte)'>')
                        {
                            //oChunk.sTag=oHE.GetString(iTagID);
                            _chunk.Tag = _he.GetTwoCharString(cPeek, c);
                            curPos    += 3;
                            goto ReturnChunk;
                        }

                        // ok, check next char for space, if that's the case we still got our tag
                        // but need to skip to attribute parsing
                        if (nextChar == (byte)' ')
                        {
                            //_chunk.Tag = _he.GetString(tagId);
                            _chunk.Tag = _he.GetTwoCharString(cPeek, c);
                            curPos    += 2;
                            cPeek      = nextChar;
                            goto AttributeParsing;
                        }

                        // ok, we are not very lucky, but it is still worth fighting for
                        // now we need to check fully long string against what we have matched, maybe
                        // we got exact match and we can avoid full parsing of the tag
                        var tag = _he.GetStringData(tagId);

                        if (curPos + tag.Length + 5 >= _dataLength)
                        {
                            goto TagParsing;
                        }

                        // in a loop (and this is not an ideal solution, but still)
                        for (int i = 2; i < tag.Length; i++)
                        {
                            // if a single char is not matched, then we fall back to full tag parsing
                            if (tag[i] != _html[curPos + i])
                            {
                                goto TagParsing;
                            }
                        }

                        // ok we matched full long word, but we need to be sure that char
                        // after the word is ' ' or '>' as otherwise we may have matched prefix of even longer word
                        nextChar = _html[curPos + tag.Length];
                        if (nextChar == (byte)'>')
                        {
                            _chunk.Tag = _he.GetString(tagId);
                            curPos    += tag.Length + 1;
                            goto ReturnChunk;
                        }
                        if (nextChar == (byte)' ')
                        {
                            cPeek      = nextChar;
                            _chunk.Tag = _he.GetString(tagId);
                            curPos    += tag.Length;
                            goto AttributeParsing;
                        }
                        // no luck: we need to parse tag fully as our heuristical matching failed miserably :'o(
                    }
                }
            }
TagParsing:

            _text.Clear();
            var charType = 0;

            // STAGE 1: parse tag (anything until > or /> or whitespace leading to start of attribute)
            while (cPeek != 0)
            {
                charType = _tagCharTypes[cPeek];

                //if (cPeek <= 32 && whiteSpace[cPeek] == 1)
                if (charType == (byte)TagCharType.WhiteSpace)
                {
                    curPos++;
                    // speculative loop unroll -- we have a very good chance of seeing non-space char next
                    // so instead of setting up loop we will just read it directly, this should save ticks
                    // on having to prepare while() loop
                    if (curPos < _dataLength)
                    {
                        c = _html[curPos++];
                    }
                    else
                    {
                        c = 0;
                    }
                    charType = _tagCharTypes[c];

                    //if (c == ' ' || c == '\t' || c == 13 || c == 10)
                    //if (c <= 32 && whiteSpace[c] == 1)
                    if (charType == (byte)TagCharType.WhiteSpace)
                    {
                        while (curPos < _dataLength)
                        {
                            c        = _html[curPos++];
                            charType = _tagCharTypes[c];
                            if (charType == (byte)TagCharType.WhiteSpace)
                            //if(c != ' ' && c != '\t' && c != 13 && c != 10)
                            {
                                //cPeek = _html[curPos];
                                continue;
                            }
                            break;
                        }
                        if (curPos >= _dataLength)
                        {
                            c = 0;
                        }
                    }

                    //whiteSpaceHere = true;

                    // now, if we have already got tag it means that we are most likely
                    // going to need to parse tag attributes
                    if (_text._bufPos > 0)
                    {
                        _chunk.Tag = _text.SetToStringASCII();
                        // _chunk.Append((byte)' ');
                        curPos--;
                        if (curPos < _dataLength)
                        {
                            cPeek = _html[curPos];
                        }
                        else
                        {
                            cPeek = 0;
                        }
                        break;
                    }
                }
                else
                {
                    // reuse Peeked char from previous run
                    //c = cPeek; curPos++;
                    if (curPos < _dataLength)
                    {
                        c = _html[curPos++];
                    }
                    else
                    {
                        c = 0;
                    }
                }
                if (curPos < _dataLength)
                {
                    cPeek = _html[curPos];
                }
                else
                {
                    cPeek = 0;
                }
                // most likely we should have lower-cased ASCII char
                if (charType == (byte)TagCharType.LowerCasedASCIIorDigit)
                {
                    _text._buffer[_text._bufPos++] = c;
                    // _chunk.Append(c);
                    continue;
                }

                // tag end - we did not have any params
                if (c == (byte)'>')
                {
                    if (_text._bufPos > 0)
                    {
                        _chunk.Tag = _text.SetToStringASCII();
                    }
                    if (!_chunk.Closure)
                    {
                        _chunk.Type = HTMLchunkType.OpenTag;
                    }
                    return(_chunk);
                }

                // closure of tag sign
                if (c == (byte)'/')
                {
                    _chunk.Closure    = true;
                    _chunk.EndClosure = (_text._bufPos > 0);
                    _chunk.Type       = HTMLchunkType.CloseTag;
                    continue;
                }

                // 03/08/08 XML support: ?xml tags - grrr
                if (c == (byte)'?')
                {
                    _text._buffer[_text._bufPos++] = c;
                    continue;
                }

                // nope, we have got upper cased ASCII char - this seems to be LESS likely than > and /
                //if (c >= 65 && c <= 90)
                if (charType > 32)
                {
                    // charType in this case already contains the lower-cased char
                    _text._buffer[_text._bufPos++] = charType;
                    // _chunk.Append(bCharType);
                    continue;
                }

                // we might have namespace : sign here - all text before would have to be
                // saved as namespace and we will need to continue parsing actual tag
                if (charType == (byte)TagCharType.NameSpaceColon)
                {
                    // ok here we got a choice - we can just continue and treat the whole
                    // thing as a single tag with namespace stuff prefixed, OR
                    // we can separate first part into namespace and keep tag as normal
                    _text._buffer[_text._bufPos++] = (byte)':';
                    continue;
                }
                // ok, we have got some other char - we break out to deal with it in attributes part
                break;
            }

            if (cPeek == 0)
            {
                return(_chunk);
            }

            // if true then equal sign was found
            //var equalsSign = false;

            // STAGE 2: parse attributes (if any available)
            // attribute name can be standalone or with value after =
            // attribute itself can't have entities or anything like this - we expect it to be in ASCII characters
AttributeParsing:

            string attrName;

            if (tagId != 0)
            {
                // first, skip whitespace:
                if (cPeek <= 32 && _tagCharTypes[cPeek] == (byte)TagCharType.WhiteSpace)
                {
                    // most likely next char is not-whitespace
                    curPos++;
                    if (curPos >= _dataLength)
                    {
                        goto ReturnChunk;
                    }
                    cPeek = _html[curPos];
                    if (cPeek <= 32 && _tagCharTypes[cPeek] == (byte)TagCharType.WhiteSpace)
                    {
                        // ok long loop here then
                        while (curPos < _dataLength)
                        {
                            cPeek = _html[curPos++];
                            if (cPeek <= 32 && _tagCharTypes[cPeek] == (byte)TagCharType.WhiteSpace)
                            {
                                continue;
                            }
                            break;
                        }
                        if (cPeek == (byte)'>')
                        {
                            goto ReturnChunk;
                        }
                        curPos--;
                        if (curPos >= _dataLength)
                        {
                            goto ReturnChunk;
                        }
                    }
                    if (curPos >= _dataLength)
                    {
                        goto ReturnChunk;
                    }
                }

                // ok we have got matched tag, it is possible that we might be able to quickly match
                // attribute name known to be used for that tag:
                var attrId = _he.MatchAttr(cPeek, tagId);
                if (attrId > 0)
                {
                    var attr = _he.GetAttrData(attrId);
                    if (curPos + attr.Length + 2 >= _dataLength)
                    {
                        goto ActualAttributeParsing;
                    }
                    // in a loop (and this is not an ideal solution, but still)
                    for (var i = 1; i < attr.Length; i++)
                    {
                        // if a single char is not matched, then we fall back to regular attribute parsing
                        if (attr[i] != _html[curPos + i])
                        {
                            goto ActualAttributeParsing;
                        }
                    }
                    var nextChar = _html[curPos + attr.Length];
                    // ok, we expect next symbol to be =
                    if (nextChar == (byte)'=')
                    {
                        attrName = _he.GetAttr(attrId);
                        curPos  += attr.Length + 1;
                        cPeek    = _html[curPos];
                        goto AttributeValueParsing;
                    }
                }
            }

ActualAttributeParsing:

            _text.Clear();
            // doing exactly the same thing as in tag parsing
            while (cPeek != 0)
            {
                charType = _tagCharTypes[cPeek];
                //if (cPeek <= 32 && whiteSpace[cPeek] == 1)
                if (charType == (byte)TagCharType.WhiteSpace)
                {
                    curPos++;
                    // speculative loop unroll -- we have a very good chance of seeing non-space char next
                    // so instead of setting up loop we will just read it directly, this should save ticks
                    // on having to prepare while() loop
                    if (curPos < _dataLength)
                    {
                        c = _html[curPos++];
                    }
                    else
                    {
                        cPeek = 0;
                        break;
                    }
                    charType = _tagCharTypes[c];
                    //if (c == ' ' || c == '\t' || c == 13 || c == 10)
                    //if (c <= 32 && whiteSpace[c] == 1)
                    if (charType == (byte)TagCharType.WhiteSpace)
                    {
                        while (curPos < _dataLength)
                        {
                            c        = _html[curPos++];
                            charType = _tagCharTypes[c];
                            if (charType == (byte)TagCharType.WhiteSpace)
                            //if(c != ' ' && c != '\t' && c != 13 && c != 10)
                            {
                                //cPeek = _html[curPos];
                                continue;
                            }
                            //if (c == (byte)'>')
                            // goto ReturnChunk;
                            //curPos--;
                            break;
                        }
                        if (curPos >= _dataLength)
                        {
                            c     = 0;
                            cPeek = 0;
                            break;
                        }
                    }

                    //whiteSpaceHere = true;

                    // now, if we have already got attribute name it means that we need to go to parse value (which may not be present)
                    if (_text._bufPos > 0)
                    {
                        // _chunk.Append((byte)' ');
                        curPos--;
                        if (curPos < _dataLength)
                        {
                            cPeek = _html[curPos];
                        }
                        else
                        {
                            cPeek = 0;
                        }
                        // ok, we have got attribute name and now we have got next char there
                        // most likely we have got = here  and then value
                        if (cPeek == (byte)'=')
                        {
                            //equalsSign = true;
                            // move forward one char
                            curPos++;
                            if (curPos < _dataLength)
                            {
                                cPeek = _html[curPos];
                            }
                            else
                            {
                                cPeek = 0;
                            }
                            break;
                        }
                        // or we can have end of tag itself, doh!
                        if (cPeek == (byte)'>')
                        {
                            // move forward one char
                            curPos++;
                            if (_text._bufPos > 0)
                            {
                                _chunk.AddParam(_text.SetToStringASCII(), "", (byte)' ');
                            }
                            if (!_chunk.Closure)
                            {
                                _chunk.Type = HTMLchunkType.OpenTag;
                            }
                            return(_chunk);
                        }
                        // closure
                        if (cPeek == (byte)'/')
                        {
                            _chunk.Closure    = true;
                            _chunk.EndClosure = true;
                            _chunk.Type       = HTMLchunkType.CloseTag;
                            continue;
                        }
                        // ok, we have got new char starting after current attribute name is fully parsed
                        // this means the attribute name is on its own and the char we found is start
                        // of a new attribute
                        _chunk.AddParam(_text.SetToStringASCII(), "", (byte)' ');
                        _text.Clear();
                        goto AttributeParsing;
                    }
                }
                else
                {
                    // reuse Peeked char from previous run
                    //c = cPeek; curPos++;
                    if (curPos < _dataLength)
                    {
                        c = _html[curPos++];
                    }
                    else
                    {
                        c = 0;
                    }
                }
                if (curPos < _dataLength)
                {
                    cPeek = _html[curPos];
                }
                else
                {
                    cPeek = 0;
                }
                // most likely we should have lower-cased ASCII char here
                if (charType == (byte)TagCharType.LowerCasedASCIIorDigit)
                {
                    _text._buffer[_text._bufPos++] = c;
                    // _chunk.Append(cChar);
                    continue;
                }

                // = with attribute value to follow
                if (c == (byte)'=')
                {
                    //equalsSign=true;
                    break;
                }

                // nope, we have got upper cased ASCII char - this seems to be LESS likely than > and /
                //if(c >= 65 && c <= 90)
                if (charType > 32)
                {
                    // charType in this case already contains the lower-cased char
                    _text._buffer[_text._bufPos++] = charType;
                    // _chunk.Append(bCharType);
                    continue;
                }

                // tag end - we did not have any params
                if (c == (byte)'>')
                {
                    if (_text._bufPos > 0)
                    {
                        _chunk.AddParam(_text.SetToStringASCII(), "", (byte)' ');
                    }
                    if (!_chunk.Closure)
                    {
                        _chunk.Type = HTMLchunkType.OpenTag;
                    }
                    return(_chunk);
                }

                // closure of tag sign
                if (c == (byte)'/')
                {
                    _chunk.Closure    = true;
                    _chunk.EndClosure = true;
                    _chunk.Type       = HTMLchunkType.CloseTag;
                    continue;
                }

                // some other char
                _text._buffer[_text._bufPos++] = c;
                // _chunk.Append(cChar);
            }

            if (cPeek == 0)
            {
                if (_text._bufPos > 0)
                {
                    _chunk.AddParam(_text.SetToStringASCII(), "", (byte)' ');
                }
                if (!_chunk.Closure)
                {
                    _chunk.Type = HTMLchunkType.OpenTag;
                }
                return(_chunk);
            }

            attrName = _text.SetToStringASCII();

AttributeValueParsing:

            // ***********************************************************************
            // STAGE 3: parse attribute value
            // ***********************************************************************

            // the value could be just string, or in quotes (single or double)
            // or we can have next attribute name start, in which case we will jump back to attribute parsing

            // for tracking quotes purposes
            var quotes = cPeek;

            int valueStartOffset;

            // skip whitespace if any
            if (cPeek <= 32 && _tagCharTypes[cPeek] == (byte)TagCharType.WhiteSpace)
            {
                curPos++;
                // speculative loop unroll -- we have a very good chance of seeing non-space char next
                // so instead of setting up loop we will just read it directly, this should save ticks
                // on having to prepare while() loop
                if (curPos < _dataLength)
                {
                    cPeek = _html[curPos];
                }
                else
                {
                    valueStartOffset = curPos - 1;
                    goto AttributeValueEnd;
                }

                //if (c == ' ' || c == '\t' || c == 13 || c == 10)
                //if (c <= 32 && whiteSpace[c] == 1)
                if (cPeek <= 32 && _tagCharTypes[cPeek] == (byte)TagCharType.WhiteSpace)
                {
                    while (curPos < _dataLength)
                    {
                        cPeek = _html[curPos++];
                        if (cPeek <= 32 && _tagCharTypes[cPeek] == (byte)TagCharType.WhiteSpace)
                        //if(c != ' ' && c != '\t' && c != 13 && c != 10)
                        {
                            //cPeek = _html[curPos];
                            continue;
                        }
                        curPos--;
                        break;
                    }
                    if (curPos >= _dataLength)
                    {
                        valueStartOffset = curPos - 1;
                        goto AttributeValueEnd;
                    }
                }
                quotes = cPeek;
            }

            // because we deal with VALUE of the attribute it means we can't lower-case it,
            // or skip whitespace (if in quotes), which in practice means that we don't need to copy
            // it to temporary string buffer, we can just remember starting offset and then create string from
            // data in bHTML

            // ok, first char can be one of the quote chars or something else
            if (cPeek != '\"' && cPeek != '\'')
            {
                valueStartOffset = curPos;
                quotes           = (byte)' ';
                // any other char here means we have value up until next whitespace or end of tag
                // this gives us good opportunity to scan fairly quickly without otherwise redundant
                // checks - this should happen fairly rarely, however loop dealing with data between quotes
                // will happen often enough and its best to eliminate as much stuff from it as possible
                //sText.bBuffer[sText.iBufPos++]=cPeek;

                // move to next char
                if (curPos < _dataLength)
                {
                    cPeek = _html[curPos++];
                }
                else
                {
                    goto AttributeValueEnd;
                }

                while (cPeek != 0)
                {
                    // if whitespace then we got our value and need to go back to param
                    if (cPeek <= 32 && _tagCharTypes[cPeek] == (byte)TagCharType.WhiteSpace)
                    {
                        _chunk.AddParam(attrName, GetString(valueStartOffset, curPos - valueStartOffset - 1), (byte)' ');
                        curPos--;
                        goto AttributeParsing;
                    }
                    // end of tag?
                    if (cPeek == (byte)'>')
                    {
                        //curPos--;
                        break;
                    }
                    if (curPos < _dataLength)
                    {
                        cPeek = _html[curPos++];
                    }
                    else
                    {
                        curPos = _dataLength + 1;
                        goto AttributeValueEnd;
                    }
                }

                // ok we are done, add outstanding attribute
                _chunk.AddParam(attrName, GetString(valueStartOffset, curPos - valueStartOffset - 1), (byte)' ');

                goto ReturnChunk;
            }

            // move one step forward
            curPos++;
            valueStartOffset = curPos;
            if (curPos < _dataLength)
            {
                cPeek = _html[curPos++];
            }
            else
            {
                goto AttributeValueEnd;
            }

            // attribute value parsing from between two quotes
            while (cPeek != 0)
            {
                // check whether we have got possible entity (can be anything starting with &)
                if (cPeek == 38)
                {
                    var prevPos    = curPos;
                    var entityChar = _e.CheckForEntity(_html, ref curPos, _dataLength);
                    // restore current symbol
                    if (entityChar == 0)
                    {
                        if (curPos < _dataLength)
                        {
                            cPeek = _html[curPos++];
                        }
                        else
                        {
                            break;
                        }
                        //_text.Buffer[_text.BufPos++] = 38; //(byte)'&';;
                        continue;
                    }
                    else
                    {
                        // okay we have got an entity, our hope of not having to copy stuff into variable
                        // is over, we have to continue in a slower fashion :(
                        // but thankfully this should happen very rarely, so, annoying to code, but
                        // most codepaths will run very fast!
                        var preEntLen = prevPos - valueStartOffset - 1;

                        // 14/05/08 need to clear text - it contains attribute name text
                        _text.Clear();
                        // copy previous data
                        if (preEntLen > 0)
                        {
                            Array.Copy(_html, valueStartOffset, _text._buffer, 0, preEntLen);
                            _text._bufPos = preEntLen;
                        }
                        // we have to skip now to next byte, since
                        // some converted chars might well be control chars like >
                        _chunk.Entities = true;
                        if (c == (byte)'<')
                        {
                            _chunk.LtEntity = true;
                        }
                        // unless is space we will ignore it
                        // note that this won't work if &nbsp; is defined as it should
                        // byte int value of 160, rather than 32.
                        //if (c != ' ')
                        _text.Append(entityChar);
                        if (curPos < _dataLength)
                        {
                            cPeek = _html[curPos++];
                        }
                        else
                        {
                            goto AttributeValueEnd;
                        }

                        // okay, we continue here using in effect new inside loop as we might have more entities here
                        // attribute value parsing from between two quotes
                        while (cPeek != 0)
                        {
                            // check whether we have got possible entity (can be anything starting with &)
                            if (cPeek == 38)
                            {
                                var newEntityChar = _e.CheckForEntity(_html, ref curPos, _dataLength);
                                // restore current symbol
                                if (newEntityChar != 0)
                                {
                                    if (newEntityChar == (byte)'<')
                                    {
                                        _chunk.LtEntity = true;
                                    }
                                    _text.Append(newEntityChar);
                                    if (curPos < _dataLength)
                                    {
                                        cPeek = _html[curPos++];
                                    }
                                    else
                                    {
                                        goto AttributeValueEnd;
                                    }
                                    continue;
                                }
                            }

                            // check if is end of quotes
                            if (cPeek == quotes)
                            {
                                // ok we finished scanning it: add param with value and then go back to param name parsing
                                _chunk.AddParam(attrName, _text.SetToString(), quotes);
                                if (curPos < _dataLength)
                                {
                                    cPeek = _html[curPos];
                                }
                                else
                                {
                                    break;
                                }
                                goto AttributeParsing;
                            }
                            _text._buffer[_text._bufPos++] = cPeek;
                            //_text.Append(cPeek);
                            if (curPos < _dataLength)
                            {
                                cPeek = _html[curPos++];
                            }
                            else
                            {
                                break;
                            }
                        }
                        _chunk.AddParam(attrName, _text.SetToString(), quotes);
                        goto ReturnChunk;
                    }
                }

                // check if is end of quotes
                if (cPeek == quotes)
                {
                    // ok we finished scanning it: add param with value and then go back to param name parsing
                    //_text.Clear();
                    _chunk.AddParam(attrName, GetString(valueStartOffset, curPos - valueStartOffset - 1), quotes);
                    if (curPos < _dataLength)
                    {
                        cPeek = _html[curPos];
                    }
                    else /*curPos++;*/ break {
                        ;
                    }
                    goto AttributeParsing;
                }

                if (curPos < _dataLength)
                {
                    cPeek = _html[curPos++];
                }
                else /*curPos++;*/ break {
                    ;
                }
Esempio n. 2
0
        /// <summary>
        /// Parses the next chunk (a tag or a run of text) starting at the current position and returns it.
        /// </summary>
        /// <remarks>
        /// If the byte at the current position is '&lt;' the work is delegated to GetNextTag().
        /// Otherwise the text is scanned up to (but not including) the next '&lt;' or the end of data.
        /// When CompressWhiteSpaceBeforeTag is set and the text between the current position and the
        /// next tag consists only of whitespace, a Text chunk whose Html is a single space is returned.
        /// When entity decoding is enabled and an entity is recognised, the bytes scanned so far are copied
        /// into the text buffer and the rest of the text is handled by ParseTextWithEntities().
        /// </remarks>
        /// <returns>HTMLchunk or null if end of data reached</returns>
        public HTMLchunk ParseNext()
        {
            if (_curPos >= _dataLength)
            {
                return(null);
            }
            _chunk.Clear();
            _chunk.ChunkOffset = _curPos;
            var c = _html[_curPos++];

            if (c == (byte)'<')
            {
                // tag parsing route - no text bytes have been consumed before this point
                return(GetNextTag());
            }

            // whitespace as the first byte typically happens between a tag end and the next tag start,
            // so it is handled as a special case
            if (CompressWhiteSpaceBeforeTag && c <= 32 && _whiteSpace[c] == 1)
            {
                while (_curPos < _dataLength)
                {
                    c = _html[_curPos++];
                    if (c <= 32 && _whiteSpace[c] == 1)
                    {
                        continue;
                    }
                    // only whitespace was found before the tag: return its compact representation
                    if (c == (byte)'<')
                    {
                        _curPos--;
                        _chunk.Type = HTMLchunkType.Text;
                        _chunk.Html = " ";
                        return(_chunk);
                    }
                    // some other byte: this is normal text, c holds the byte that ended the whitespace run
                    break;
                }
            }

            // At this point c is never '<': it is either the first byte of the chunk or the last byte
            // consumed by the whitespace loop above (which returns on '<').
            if (!_e.CanDecodeEntities && !_e.MiniEntities)
            {
                // no entity handling requested: just find the next tag start or the end of data
                while (_curPos < _dataLength)
                {
                    if (_html[_curPos++] == (byte)'<')
                    {
                        // leave the '<' for the next call
                        _curPos--;
                        break;
                    }
                }
            }
            else
            {
                // TODO: might help skipping data in quads (4 bytes at a time), stopping when at least
                // one & or < is detected in the quad - needs reliable bitmap operations first.

                // The loop starts by examining the byte already held in c, so an entity that is the
                // first byte of the text and one that directly follows skipped whitespace are handled
                // by the same code as entities found later in the text.
                while (true)
                {
                    if (c == (byte)'&')
                    {
                        // offset of the '&' itself (_curPos is already one past it)
                        var entityStart = _curPos - 1;
                        var entityChar  = _e.CheckForEntity(_html, ref _curPos, _dataLength);
                        if (entityChar != 0)
                        {
                            // we have got an entity, so the chunk text can no longer be taken
                            // straight from the source: accumulate it in the text buffer instead
                            _text.Clear();

                            // keep everything scanned before the entity, including any leading
                            // whitespace skipped above; the length is 0 when the entity is the
                            // very first byte of the chunk
                            var prefixLength = entityStart - _chunk.ChunkOffset;
                            if (prefixLength > 0)
                            {
                                Array.Copy(_html, _chunk.ChunkOffset, _text._buffer, 0, prefixLength);
                                _text._bufPos = prefixLength;
                            }

                            _chunk.Entities = true;
                            if (entityChar == (byte)'<')
                            {
                                _chunk.LtEntity = true;
                            }
                            _text.Append(entityChar);
                            return(ParseTextWithEntities());
                        }
                    }

                    if (_curPos >= _dataLength)
                    {
                        break;
                    }

                    c = _html[_curPos++];
                    if (c == (byte)'<')
                    {
                        // leave the '<' for the next call
                        _curPos--;
                        break;
                    }
                }
            }

            _chunk.ChunkLength = _curPos - _chunk.ChunkOffset;
            // defensive check: at least one byte was consumed above, so this is not expected to trigger
            if (_chunk.ChunkLength == 0)
            {
                return(null);
            }
            _chunk.Type = HTMLchunkType.Text;
            // NOTE(review): offsets into _html are used here as character offsets into _originalHtml -
            // presumably both are indexed identically; confirm for multi-byte encodings.
            _chunk.Html = _originalHtml.Substring(_chunk.ChunkOffset, _chunk.ChunkLength);
            return(_chunk);
        }