Пример #1
0
        /// <summary>
        /// Handles META tags that set page encoding
        /// </summary>
        /// <param name="p">HTML parser object that is used for parsing</param>
        /// <param name="chunk">Parsed chunk that should contain tag META</param>
        /// <param name="encodingSet">Your own flag that shows whether encoding was already set or not, if set
        /// once then it should not be changed - this is the logic applied by major browsers</param>
        /// <returns>True if this was META tag setting Encoding, false otherwise</returns>
        public static bool HandleMetaEncoding(HTMLparser p, HTMLchunk chunk, ref bool encodingSet)
        {
            if (chunk.Tag.Length != 4 || chunk.Tag[0] != 'm' || chunk.Tag != "meta")
            {
                return(false);
            }
            // if we do not use hashmode already then we call conversion explicitly
            // this is slow, but METAs are very rare so performance penalty is low
            if (!chunk.HashMode)
            {
                chunk.ConvertParamsToHash();
            }
            var key = chunk.Params["http-equiv"] as string;

            if (key != null)
            {
                // FIXIT: even though this is happening rare I really don't like lower casing stuff
                // that most likely would not need to be - if you feel bored then rewrite this bit
                // to make it faster, it is really easy...
                switch (key.ToLower())
                {
                case "content-type":
                // rare case (appears to work in IE) reported to exist in some pages by Martin B�chtold
                case "content-category":
                    // we might have charset here that may hint at necessity to decode page
                    // check for possible encoding change

                    // once encoding is set it should not be changed, but you can be damn
                    // sure there are web pages out there that do that!!!
                    if (!encodingSet)
                    {
                        var data = chunk.Params["content"] as string;
                        // it is possible we have broken META tag without Content part
                        if (data != null)
                        {
                            if (p.SetEncoding(data))
                            {
                                // we may need to re-encode title
                                if (!encodingSet)
                                {
                                    // here you need to reencode any text that you found so far
                                    // most likely it will be just TITLE, the rest can be ignored anyway
                                    encodingSet = true;
                                }
                            }
                            else
                            {
                                // failed to set encoding - most likely encoding string
                                // was incorrect or your machine lacks codepages or something
                                // else - might be good idea to put warning message here
                            }
                        }
                    }
                    return(true);

                default: break;
                }
            }
            return(false);
        }
Пример #2
0
 /// <summary>
 /// Internally parses tag and returns it from point when open bracket (&lt;) was found
 /// </summary>
 /// <returns>Chunk</returns>
 HTMLchunk GetNextTag()
 {
     //curPos++;
     _chunk = _tp.ParseTag(ref _curPos);
     // for backwards compatibility mark closed tags with params as open
     if (_chunk.ParamsCount > 0 && AutoMarkClosedTagsWithParamsAsOpen && _chunk.Type == HTMLchunkType.CloseTag)
     {
         _chunk.Type = HTMLchunkType.OpenTag;
     }
     //                    012345
     // check for start of script
     if (_chunk.Tag.Length == 6 && _chunk.Tag[0] == 's' && _chunk.Tag == "script")
     {
         if (!_chunk.Closure)
         {
             _chunk.Type = HTMLchunkType.Script;
             _chunk      = _tp.ParseScript(ref _curPos);
             return(_chunk);
         }
     }
     _chunk.ChunkLength = _curPos - _chunk.ChunkOffset;
     if (KeepRawHTML)
     {
         _chunk.Html = Enc.GetString(_html, _chunk.ChunkOffset, _chunk.ChunkLength);
     }
     return(_chunk);
 }
Пример #3
0
 private void Dispose(bool bDisposing)
 {
     if (!Disposed)
     {
         Disposed = true;
         if (_chunk != null)
         {
             _chunk.Dispose();
             _chunk = null;
         }
         if (_text != null)
         {
             _text.Dispose();
             _text = null;
         }
         _html = null;
         if (_e != null)
         {
             _e.Dispose();
             _e = null;
         }
         if (_tp != null)
         {
             _tp.Dispose();
             _tp = null;
         }
     }
 }
Пример #4
0
 /// <summary>
 /// Inits tag parser
 /// </summary>
 /// <param name="chunk"></param>
 /// <param name="text"></param>
 internal void Init(HTMLparser p, HTMLchunk chunk, DynaString text, byte[] html, int dataLength, HTMLentities e, HTMLheuristics he)
 {
     _p          = p;
     _chunk      = chunk;
     _text       = text;
     _html       = html;
     _dataLength = dataLength;
     // we don't want to be too close to end of data when dealing with heuristics
     _maxHeuDataLength = _dataLength - MIN_DATA_SIZE_FOR_HEURISTICS;
     _e  = e;
     _he = he;
 }
Пример #5
0
        /// <summary>
        /// Internal: parses tag that started from current position
        /// </summary>
        /// <returns>HTMLchunk with tag information</returns>
        internal HTMLchunk ParseTag(ref int curPos)
        {
            /*
             *  WARNING: this code was optimised for performance rather than for readability,
             *  so be extremely careful at changing it -- your changes could easily result in wrongly parsed HTML
             *
             *  This routine takes about 60% of CPU time, in theory its the best place to gain extra speed,
             *  but I've spent plenty of time doing it, so it won't be easy... and if it is easy then please post
             *  your changes for everyone to enjoy!
             * */

            //var whiteSpaceHere = false;

            //var paramValue = false;
            byte c     = 0;
            byte cPeek = 0;

            // if true it means we have parsed complete tag
            //var gotTag = false;

            //var equalIdx = 0;

            // we reach this function immediately after tag's byte (<) was
            // detected, so we need to save it in order to keep correct HTML copy
            // _hunk.Append((byte)'<'); // (byte)'<'

            /*
             * _chunk.Buffer[0] = 60;
             * _chunk.BufPos = 1;
             * _chunk.HTMLen = 1;
             */

            // initialise peeked char - this will point to the next after < character
            if (curPos < _dataLength)
            {
                cPeek = _html[curPos];
                // in case of comments ! must follow immediately after <
                if (cPeek == (byte)'!')
                {
                    if (curPos + 2 < _dataLength &&
                        _html[curPos + 1] == (byte)'-' && _html[curPos + 2] == (byte)'-')
                    {
                        // we detected start of comments here, instead of parsing the rest here we will
                        // call special function tuned to do the job much more effectively
                        _chunk.Tag      = "!--";
                        _chunk.Type     = HTMLchunkType.Comment;
                        _chunk.Comments = true;
                        // _chunk.Append((byte)'!');
                        // _chunk.Append((byte)'-');
                        // _chunk.Append((byte)'-');
                        curPos            += 3;
                        _chunk             = ParseComments(ref curPos, out bool fullTag);
                        _chunk.ChunkLength = curPos - _chunk.ChunkOffset;
                        if (_p.AutoKeepComments || _p.KeepRawHTML)
                        {
                            if (!_p.AutoExtractBetweenTagsOnly)
                            {
                                _chunk.Html = GetString(_chunk.ChunkOffset, _chunk.ChunkLength);
                            }
                            else
                            {
                                _chunk.Html = GetString(_chunk.ChunkOffset + 4, _chunk.ChunkLength - fullTag ? 7 : 4);
                            }
                        }
                        return(_chunk);
                    }

                    // ok we might have here CDATA element of XML:
                    // ref: http://www.w3schools.com/xml/xml_cdata.asp
                    if (curPos + 7 < _dataLength &&
                        _html[curPos + 1] == (byte)'[' &&
                        _html[curPos + 2] == (byte)'C' &&
                        _html[curPos + 3] == (byte)'D' &&
                        _html[curPos + 4] == (byte)'A' &&
                        _html[curPos + 5] == (byte)'T' &&
                        _html[curPos + 6] == (byte)'A' &&
                        _html[curPos + 7] == (byte)'[')
                    {
                        // we detected start of comments here, instead of parsing the rest here we will
                        // call special function tuned to do the job much more effectively
                        _chunk.Tag      = "![CDATA[";
                        _chunk.Type     = HTMLchunkType.Comment;
                        _chunk.Comments = true;
                        // _chunk.Append((byte)'!');
                        // _chunk.Append((byte)'-');
                        // _chunk.Append((byte)'-');
                        curPos            += 8;
                        _chunk             = ParseCDATA(ref curPos, out bool fullTag);
                        _chunk.ChunkLength = curPos - _chunk.ChunkOffset;
                        if (_p.AutoKeepComments || _p.KeepRawHTML)
                        {
                            if (!_p.AutoExtractBetweenTagsOnly)
                            {
                                _chunk.Html = GetString(_chunk.ChunkOffset, _chunk.ChunkLength);
                            }
                            else
                            {
                                _chunk.Html = GetString(_chunk.ChunkOffset + 4 + 5, _chunk.ChunkLength - fullTag ? 7 + 5 : 4 + 5);
                            }
                        }
                        return(_chunk);
                    }
                }
            }
            else
            {
                // empty tag but its not closed, so we will call it open...
                _chunk.Type = HTMLchunkType.OpenTag;
                // end of data... before it started
                return(_chunk);
            }

            // tag ID, non-zero if matched by heuristics engine
            var tagId = 0;

            // STAGE 0: lets try some heuristics to see if we can quickly identify most common tags
            // that should be present most of the time, this should save a lot of looping and string creation
            if (EnableHeuristics && curPos < _maxHeuDataLength)
            {
                // check if we have got closure of the tag
                if (cPeek == (byte)'/')
                {
                    _chunk.Closure    = true;
                    _chunk.EndClosure = false;
                    _chunk.Type       = HTMLchunkType.CloseTag;
                    curPos++;
                    cPeek = _html[curPos];
                }
                c = _html[curPos + 1];
                // probability of having a match is very high (or so we expect)
                tagId = _he.MatchTag(cPeek, c);
                if (tagId != 0)
                {
                    if (tagId < 0)
                    {
                        tagId *= -1;
                        // single character tag
                        _chunk.Tag = _he.GetString(tagId);
                        // see if we got fully closed tag
                        if (c == (byte)'>')
                        {
                            curPos += 2;
                            goto ReturnChunk;
                        }
                        cPeek = c;
                        curPos++;
                        // everything else means we need to continue scanning as we may have params and stuff
                        goto AttributeParsing;
                    }
                    else
                    {
                        // ok, we have here 2 or more character string that we need to check further
                        // often when we have full 2 char match the next char will be >, if that's the case
                        // then we definately matched our tag
                        var nextChar = _html[curPos + 2];
                        if (nextChar == (byte)'>')
                        {
                            //oChunk.sTag=oHE.GetString(iTagID);
                            _chunk.Tag = _he.GetTwoCharString(cPeek, c);
                            curPos    += 3;
                            goto ReturnChunk;
                        }

                        // ok, check next char for space, if that's the case we still got our tag
                        // but need to skip to attribute parsing
                        if (nextChar == (byte)' ')
                        {
                            //_chunk.Tag = _he.GetString(tagId);
                            _chunk.Tag = _he.GetTwoCharString(cPeek, c);
                            curPos    += 2;
                            cPeek      = nextChar;
                            goto AttributeParsing;
                        }

                        // ok, we are not very lucky, but it is still worth fighting for
                        // now we need to check fully long string against what we have matched, maybe
                        // we got exact match and we can avoid full parsing of the tag
                        var tag = _he.GetStringData(tagId);

                        if (curPos + tag.Length + 5 >= _dataLength)
                        {
                            goto TagParsing;
                        }

                        // in a loop (and this is not an ideal solution, but still)
                        for (int i = 2; i < tag.Length; i++)
                        {
                            // if a single char is not matched, then we
                            if (tag[i] != _html[curPos + i])
                            {
                                goto TagParsing;
                            }
                        }

                        // ok we matched full long word, but we need to be sure that char
                        // after the word is ' ' or '>' as otherwise we may have matched prefix of even longer word
                        nextChar = _html[curPos + tag.Length];
                        if (nextChar == (byte)'>')
                        {
                            _chunk.Tag = _he.GetString(tagId);
                            curPos    += tag.Length + 1;
                            goto ReturnChunk;
                        }
                        if (nextChar == (byte)' ')
                        {
                            cPeek      = nextChar;
                            _chunk.Tag = _he.GetString(tagId);
                            curPos    += tag.Length;
                            goto AttributeParsing;
                        }
                        // no luck: we need to parse tag fully as our heuristical matching failed miserably :'o(
                    }
                }
            }
TagParsing:

            _text.Clear();
            var charType = 0;

            // STAGE 1: parse tag (anything until > or /> or whitespace leading to start of attribute)
            while (cPeek != 0)
            {
                charType = _tagCharTypes[cPeek];

                //if (cPeek <= 32 && whiteSpace[cPeek] == 1)
                if (charType == (byte)TagCharType.WhiteSpace)
                {
                    curPos++;
                    // speculative loop unroll -- we have a very good chance of seeing non-space char next
                    // so instead of setting up loop we will just read it directly, this should save ticks
                    // on having to prepare while() loop
                    if (curPos < _dataLength)
                    {
                        c = _html[curPos++];
                    }
                    else
                    {
                        c = 0;
                    }
                    charType = _tagCharTypes[c];

                    //if (c == ' ' || c == '\t' || c == 13 || c == 10)
                    //if (c <= 32 && whiteSpace[c] == 1)
                    if (charType == (byte)TagCharType.WhiteSpace)
                    {
                        while (curPos < _dataLength)
                        {
                            c        = _html[curPos++];
                            charType = _tagCharTypes[c];
                            if (charType == (byte)TagCharType.WhiteSpace)
                            //if(c != ' ' && c != '\t' && c != 13 && c != 10)
                            {
                                //cPeek = _html[curPos];
                                continue;
                            }
                            break;
                        }
                        if (curPos >= _dataLength)
                        {
                            c = 0;
                        }
                    }

                    //whiteSpaceHere = true;

                    // now, if we have already got tag it means that we are most likely
                    // going to need to parse tag attributes
                    if (_text._bufPos > 0)
                    {
                        _chunk.Tag = _text.SetToStringASCII();
                        // _chunk.Append((byte)' ');
                        curPos--;
                        if (curPos < _dataLength)
                        {
                            cPeek = _html[curPos];
                        }
                        else
                        {
                            cPeek = 0;
                        }
                        break;
                    }
                }
                else
                {
                    // reuse Peeked char from previous run
                    //c = cPeek; curPos++;
                    if (curPos < _dataLength)
                    {
                        c = _html[curPos++];
                    }
                    else
                    {
                        c = 0;
                    }
                }
                if (curPos < _dataLength)
                {
                    cPeek = _html[curPos];
                }
                else
                {
                    cPeek = 0;
                }
                // most likely we should have lower-cased ASCII char
                if (charType == (byte)TagCharType.LowerCasedASCIIorDigit)
                {
                    _text._buffer[_text._bufPos++] = c;
                    // _chunk.Append(c);
                    continue;
                }

                // tag end - we did not have any params
                if (c == (byte)'>')
                {
                    if (_text._bufPos > 0)
                    {
                        _chunk.Tag = _text.SetToStringASCII();
                    }
                    if (!_chunk.Closure)
                    {
                        _chunk.Type = HTMLchunkType.OpenTag;
                    }
                    return(_chunk);
                }

                // closure of tag sign
                if (c == (byte)'/')
                {
                    _chunk.Closure    = true;
                    _chunk.EndClosure = (_text._bufPos > 0);
                    _chunk.Type       = HTMLchunkType.CloseTag;
                    continue;
                }

                // 03/08/08 XML support: ?xml tags - grrr
                if (c == (byte)'?')
                {
                    _text._buffer[_text._bufPos++] = c;
                    continue;
                }

                // nope, we have got upper cased ASCII char - this seems to be LESS likely than > and /
                //if (c >= 65 && c <= 90)
                if (charType > 32)
                {
                    // bCharType in this case contains already lower-cased char
                    _text._buffer[_text._bufPos++] = charType;
                    // _chunk.Append(bCharType);
                    continue;
                }

                // we might have namespace : sign here - all text before would have to be
                // saved as namespace and we will need to continue parsing actual tag
                if (charType == (byte)TagCharType.NameSpaceColon)
                {
                    // ok here we got a choice - we can just continue and treat the whole
                    // thing as a single tag with namespace stuff prefixed, OR
                    // we can separate first part into namespace and keep tag as normal
                    _text._buffer[_text._bufPos++] = (byte)':';
                    continue;
                }
                // ok, we have got some other char - we break out to deal with it in attributes part
                break;
            }

            if (cPeek == 0)
            {
                return(_chunk);
            }

            // if true then equal sign was found
            //var equalsSign = false;

            // STAGE 2: parse attributes (if any available)
            // attribute name can be standalone or with value after =
            // attribute itself can't have entities or anything like this - we expect it to be in ASCII characters
AttributeParsing:

            string attrName;

            if (tagId != 0)
            {
                // first, skip whitespace:
                if (cPeek <= 32 && _tagCharTypes[cPeek] == (byte)TagCharType.WhiteSpace)
                {
                    // most likely next char is not-whitespace
                    curPos++;
                    if (curPos >= _dataLength)
                    {
                        goto ReturnChunk;
                    }
                    cPeek = _html[curPos];
                    if (cPeek <= 32 && _tagCharTypes[cPeek] == (byte)TagCharType.WhiteSpace)
                    {
                        // ok long loop here then
                        while (curPos < _dataLength)
                        {
                            cPeek = _html[curPos++];
                            if (cPeek <= 32 && _tagCharTypes[cPeek] == (byte)TagCharType.WhiteSpace)
                            {
                                continue;
                            }
                            break;
                        }
                        if (cPeek == (byte)'>')
                        {
                            goto ReturnChunk;
                        }
                        curPos--;
                        if (curPos >= _dataLength)
                        {
                            goto ReturnChunk;
                        }
                    }
                    if (curPos >= _dataLength)
                    {
                        goto ReturnChunk;
                    }
                }

                // ok we have got matched tag, it is possible that we might be able to quickly match
                // attribute name known to be used for that tag:
                var attrId = _he.MatchAttr(cPeek, tagId);
                if (attrId > 0)
                {
                    var attr = _he.GetAttrData(attrId);
                    if (curPos + attr.Length + 2 >= _dataLength)
                    {
                        goto ActualAttributeParsing;
                    }
                    // in a loop (and this is not an ideal solution, but still)
                    for (var i = 1; i < attr.Length; i++)
                    {
                        // if a single char is not matched, then we
                        if (attr[i] != _html[curPos + i])
                        {
                            goto ActualAttributeParsing;
                        }
                    }
                    var nextChar = _html[curPos + attr.Length];
                    // ok, we expect next symbol to be =
                    if (nextChar == (byte)'=')
                    {
                        attrName = _he.GetAttr(attrId);
                        curPos  += attr.Length + 1;
                        cPeek    = _html[curPos];
                        goto AttributeValueParsing;
                    }
                }
            }

ActualAttributeParsing:

            _text.Clear();
            // doing exactly the same thing as in tag parsing
            while (cPeek != 0)
            {
                charType = _tagCharTypes[cPeek];
                //if (cPeek <= 32 && whiteSpace[cPeek] == 1)
                if (charType == (byte)TagCharType.WhiteSpace)
                {
                    curPos++;
                    // speculative loop unroll -- we have a very good chance of seeing non-space char next
                    // so instead of setting up loop we will just read it directly, this should save ticks
                    // on having to prepare while() loop
                    if (curPos < _dataLength)
                    {
                        c = _html[curPos++];
                    }
                    else
                    {
                        cPeek = 0;
                        break;
                    }
                    charType = _tagCharTypes[c];
                    //if (c == ' ' || c == '\t' || c == 13 || c == 10)
                    //if (c <= 32 && whiteSpace[c] == 1)
                    if (charType == (byte)TagCharType.WhiteSpace)
                    {
                        while (curPos < _dataLength)
                        {
                            c        = _html[curPos++];
                            charType = _tagCharTypes[c];
                            if (charType == (byte)TagCharType.WhiteSpace)
                            //if(c != ' ' && c != '\t' && c != 13 && c != 10)
                            {
                                //cPeek = _html[curPos];
                                continue;
                            }
                            //if (c == (byte)'>')
                            // goto ReturnChunk;
                            //curPos--;
                            break;
                        }
                        if (curPos >= _dataLength)
                        {
                            c     = 0;
                            cPeek = 0;
                            break;
                        }
                    }

                    //whiteSpaceHere = true;

                    // now, if we have already got attribute name it means that we need to go to parse value (which may not be present)
                    if (_text._bufPos > 0)
                    {
                        // _chunk.Append((byte)' ');
                        curPos--;
                        if (curPos < _dataLength)
                        {
                            cPeek = _html[curPos];
                        }
                        else
                        {
                            cPeek = 0;
                        }
                        // ok, we have got attribute name and now we have got next char there
                        // most likely we have got = here  and then value
                        if (cPeek == (byte)'=')
                        {
                            //equalsSign = true;
                            // move forward one char
                            curPos++;
                            if (curPos < _dataLength)
                            {
                                cPeek = _html[curPos];
                            }
                            else
                            {
                                cPeek = 0;
                            }
                            break;
                        }
                        // or we can have end of tag itself, doh!
                        if (cPeek == (byte)'>')
                        {
                            // move forward one char
                            curPos++;
                            if (_text._bufPos > 0)
                            {
                                _chunk.AddParam(_text.SetToStringASCII(), "", (byte)' ');
                            }
                            if (!_chunk.Closure)
                            {
                                _chunk.Type = HTMLchunkType.OpenTag;
                            }
                            return(_chunk);
                        }
                        // closure
                        if (cPeek == (byte)'/')
                        {
                            _chunk.Closure    = true;
                            _chunk.EndClosure = true;
                            _chunk.Type       = HTMLchunkType.CloseTag;
                            continue;
                        }
                        // ok, we have got new char starting after current attribute name is fully parsed
                        // this means the attribute name is on its own and the char we found is start
                        // of a new attribute
                        _chunk.AddParam(_text.SetToStringASCII(), "", (byte)' ');
                        _text.Clear();
                        goto AttributeParsing;
                    }
                }
                else
                {
                    // reuse Peeked char from previous run
                    //c = cPeek; curPos++;
                    if (curPos < _dataLength)
                    {
                        c = _html[curPos++];
                    }
                    else
                    {
                        c = 0;
                    }
                }
                if (curPos < _dataLength)
                {
                    cPeek = _html[curPos];
                }
                else
                {
                    cPeek = 0;
                }
                // most likely we should have lower-cased ASCII char here
                if (charType == (byte)TagCharType.LowerCasedASCIIorDigit)
                {
                    _text._buffer[_text._bufPos++] = c;
                    // _chunk.Append(cChar);
                    continue;
                }

                // = with attribute value to follow
                if (c == (byte)'=')
                {
                    //equalsSign=true;
                    break;
                }

                // nope, we have got upper cased ASCII char - this seems to be LESS likely than > and /
                //if(c >= 65 && c <= 90)
                if (charType > 32)
                {
                    // bCharType in this case contains already lower-cased char
                    _text._buffer[_text._bufPos++] = charType;
                    // _chunk.Append(bCharType);
                    continue;
                }

                // tag end - we did not have any params
                if (c == (byte)'>')
                {
                    if (_text._bufPos > 0)
                    {
                        _chunk.AddParam(_text.SetToStringASCII(), "", (byte)' ');
                    }
                    if (!_chunk.Closure)
                    {
                        _chunk.Type = HTMLchunkType.OpenTag;
                    }
                    return(_chunk);
                }

                // closure of tag sign
                if (c == (byte)'/')
                {
                    _chunk.Closure    = true;
                    _chunk.EndClosure = true;
                    _chunk.Type       = HTMLchunkType.CloseTag;
                    continue;
                }

                // some other char
                _text._buffer[_text._bufPos++] = c;
                // _chunk.Append(cChar);
            }

            if (cPeek == 0)
            {
                if (_text._bufPos > 0)
                {
                    _chunk.AddParam(_text.SetToStringASCII(), "", (byte)' ');
                }
                if (!_chunk.Closure)
                {
                    _chunk.Type = HTMLchunkType.OpenTag;
                }
                return(_chunk);
            }

            attrName = _text.SetToStringASCII();

AttributeValueParsing:

            /// ***********************************************************************
            /// STAGE 3: parse attribute value
            /// ***********************************************************************

            // the value could be just string, or in quotes (single or double)
            // or we can have next attribute name start, in which case we will jump back to attribute parsing

            // for tracking quotes purposes
            var quotes = cPeek;

            int valueStartOffset;

            // skip whitespace if any
            if (cPeek <= 32 && _tagCharTypes[cPeek] == (byte)TagCharType.WhiteSpace)
            {
                curPos++;
                // speculative loop unroll -- we have a very good chance of seeing non-space char next
                // so instead of setting up loop we will just read it directly, this should save ticks
                // on having to prepare while() loop
                if (curPos < _dataLength)
                {
                    cPeek = _html[curPos];
                }
                else
                {
                    valueStartOffset = curPos - 1;
                    goto AttributeValueEnd;
                }

                //if (c == ' ' || c == '\t' || c == 13 || c == 10)
                //if (c <= 32 && whiteSpace[c] == 1)
                if (cPeek <= 32 && _tagCharTypes[cPeek] == (byte)TagCharType.WhiteSpace)
                {
                    while (curPos < _dataLength)
                    {
                        cPeek = _html[curPos++];
                        if (cPeek <= 32 && _tagCharTypes[cPeek] == (byte)TagCharType.WhiteSpace)
                        //if(c != ' ' && c != '\t' && c != 13 && c != 10)
                        {
                            //cPeek = _html[curPos];
                            continue;
                        }
                        curPos--;
                        break;
                    }
                    if (curPos >= _dataLength)
                    {
                        valueStartOffset = curPos - 1;
                        goto AttributeValueEnd;
                    }
                }
                quotes = cPeek;
            }

            // because we deal with VALUE of the attribute it means we can't lower-case it,
            // or skip whitespace (if in quotes), which in practice means that we don't need to copy
            // it to temporary string buffer, we can just remember starting offset and then create string from
            // data in bHTML

            // ok, first char can be one of the quote chars or something else
            if (cPeek != '\"' && cPeek != '\'')
            {
                valueStartOffset = curPos;
                quotes           = (byte)' ';
                // any other char here means we have value up until next whitespace or end of tag
                // this gives us good opportunity to scan fairly quickly without otherwise redundant
                // checks - this should happen fairly rarely, however loop dealing with data between quotes
                // will happen often enough and its best to eliminate as much stuff from it as possible
                //sText.bBuffer[sText.iBufPos++]=cPeek;

                // move to next char
                if (curPos < _dataLength)
                {
                    cPeek = _html[curPos++];
                }
                else
                {
                    goto AttributeValueEnd;
                }

                while (cPeek != 0)
                {
                    // if whitespace then we got our value and need to go back to param
                    if (cPeek <= 32 && _tagCharTypes[cPeek] == (byte)TagCharType.WhiteSpace)
                    {
                        _chunk.AddParam(attrName, GetString(valueStartOffset, curPos - valueStartOffset - 1), (byte)' ');
                        curPos--;
                        goto AttributeParsing;
                    }
                    // end of tag?
                    if (cPeek == (byte)'>')
                    {
                        //curPos--;
                        break;
                    }
                    if (curPos < _dataLength)
                    {
                        cPeek = _html[curPos++];
                    }
                    else
                    {
                        curPos = _dataLength + 1;
                        goto AttributeValueEnd;
                    }
                }

                // ok we are done, add outstanding attribute
                _chunk.AddParam(attrName, GetString(valueStartOffset, curPos - valueStartOffset - 1), (byte)' ');

                goto ReturnChunk;
            }

            // move one step forward
            curPos++;
            valueStartOffset = curPos;
            if (curPos < _dataLength)
            {
                cPeek = _html[curPos++];
            }
            else
            {
                goto AttributeValueEnd;
            }

            // attribute value parsing from between two quotes
            while (cPeek != 0)
            {
                // check whether we have got possible entity (can be anything starting with &)
                if (cPeek == 38)
                {
                    var prevPos    = curPos;
                    var entityChar = _e.CheckForEntity(_html, ref curPos, _dataLength);
                    // restore current symbol
                    if (entityChar == 0)
                    {
                        if (curPos < _dataLength)
                        {
                            cPeek = _html[curPos++];
                        }
                        else
                        {
                            break;
                        }
                        //_text.Buffer[_text.BufPos++] = 38; //(byte)'&';;
                        continue;
                    }
                    else
                    {
                        // okay we have got an entity, our hope of not having to copy stuff into variable
                        // is over, we have to continue in a slower fashion :(
                        // but thankfully this should happen very rarely, so, annoying to code, but
                        // most codepaths will run very fast!
                        var preEntLen = prevPos - valueStartOffset - 1;

                        // 14/05/08 need to clear text - it contains attribute name text
                        _text.Clear();
                        // copy previous data
                        if (preEntLen > 0)
                        {
                            Array.Copy(_html, valueStartOffset, _text._buffer, 0, preEntLen);
                            _text._bufPos = preEntLen;
                        }
                        // we have to skip now to next byte, since
                        // some converted chars might well be control chars like >
                        _chunk.Entities = true;
                        if (c == (byte)'<')
                        {
                            _chunk.LtEntity = true;
                        }
                        // unless is space we will ignore it
                        // note that this won't work if &nbsp; is defined as it should
                        // byte int value of 160, rather than 32.
                        //if (c != ' ')
                        _text.Append(entityChar);
                        if (curPos < _dataLength)
                        {
                            cPeek = _html[curPos++];
                        }
                        else
                        {
                            goto AttributeValueEnd;
                        }

                        // okay, we continue here using in effect new inside loop as we might have more entities here
                        // attribute value parsing from between two quotes
                        while (cPeek != 0)
                        {
                            // check whether we have got possible entity (can be anything starting with &)
                            if (cPeek == 38)
                            {
                                var newEntityChar = _e.CheckForEntity(_html, ref curPos, _dataLength);
                                // restore current symbol
                                if (newEntityChar != 0)
                                {
                                    if (newEntityChar == (byte)'<')
                                    {
                                        _chunk.LtEntity = true;
                                    }
                                    _text.Append(newEntityChar);
                                    if (curPos < _dataLength)
                                    {
                                        cPeek = _html[curPos++];
                                    }
                                    else
                                    {
                                        goto AttributeValueEnd;
                                    }
                                    continue;
                                }
                            }

                            // check if is end of quotes
                            if (cPeek == quotes)
                            {
                                // ok we finished scanning it: add param with value and then go back to param name parsing
                                _chunk.AddParam(attrName, _text.SetToString(), quotes);
                                if (curPos < _dataLength)
                                {
                                    cPeek = _html[curPos];
                                }
                                else
                                {
                                    break;
                                }
                                goto AttributeParsing;
                            }
                            _text._buffer[_text._bufPos++] = cPeek;
                            //_text.Append(cPeek);
                            if (curPos < _dataLength)
                            {
                                cPeek = _html[curPos++];
                            }
                            else
                            {
                                break;
                            }
                        }
                        _chunk.AddParam(attrName, _text.SetToString(), quotes);
                        goto ReturnChunk;
                    }
                }

                // check if is end of quotes
                if (cPeek == quotes)
                {
                    // ok we finished scanning it: add param with value and then go back to param name parsing
                    //_text.Clear();
                    _chunk.AddParam(attrName, GetString(valueStartOffset, curPos - valueStartOffset - 1), quotes);
                    if (curPos < _dataLength)
                    {
                        cPeek = _html[curPos];
                    }
                    else /*curPos++;*/ break {
                        ;
                    }
                    goto AttributeParsing;
                }

                if (curPos < _dataLength)
                {
                    cPeek = _html[curPos++];
                }
                else /*curPos++;*/ break {
                    ;
                }
Пример #6
0
 /// <summary>
 /// Sets oHTML variable in a chunk to the raw HTML that was parsed for that chunk.
 /// </summary>
 /// <param name="chunk">Chunk returned by ParseNext function, it must belong to the same HTMLparser that
 /// was initiated with the same HTML data that this chunk belongs to</param>
 public void SetRawHTML(HTMLchunk chunk)
 {
     // note: this really should have been byte array assigned rather than string
     // it would be more correct originality-wise
     chunk.Html = Enc.GetString(_html, chunk.ChunkOffset, chunk.ChunkLength);
 }