Exemplo n.º 1
        /// <summary>
        /// Continues building a text chunk after an entity has been encountered: reads bytes of
        /// bHTML starting at iCurPos, stores them in the sText buffer and finally publishes the
        /// buffer contents on oChunk as a chunk of type Text.
        /// </summary>
        /// <remarks>
        /// NOTE(review): this excerpt appears to have lost its block delimiters, the statements
        /// that followed some of the conditions, and the return statement. Confirm the exact
        /// control flow against the original file before changing anything here.
        /// </remarks>
        /// <returns>An HTMLchunk per the signature; presumably oChunk - TODO confirm, the return is not visible here</returns>
        private HTMLchunk ParseTextWithEntities()
            // okay, now that we got our first entity we will need to continue
            // parsing by copying data into temporary buffer and when finished
            // convert it to string
            while (iCurPos < iDataLength)
                // read the current byte and advance the position
                byte cChar = bHTML[iCurPos++];

                // ok we got tag, but all we had before it was spaces, most likely end of lines
                // so we will return compact representation of that text data
                // NOTE(review): the statements executed when a tag start is found are not visible here
                if (cChar == (byte)'<')

                // check if we got entity again
                if (cChar == (byte)'&')
                    // CheckForEntity receives iCurPos by reference, so it may move the position;
                    // a zero result is treated below as "no entity recognised"
                    char cNewEntityChar = oE.CheckForEntity(bHTML, ref iCurPos, iDataLength);

                    // restore current symbol
                    if (cNewEntityChar != 0)
                        // remember that the decoded entity was a less-than sign
                        if (cNewEntityChar == (byte)'<')
                            oChunk.bLtEntity = true;


                // copy the raw byte into the temporary text buffer
                sText.bBuffer[sText.iBufPos++] = cChar;

            // chunk length is measured from the chunk start offset to the current position
            oChunk.iChunkLength = iCurPos - oChunk.iChunkOffset;

            // publish the accumulated buffer as the text of the chunk
            oChunk.oType = HTMLchunkType.Text;
            oChunk.oHTML = sText.SetToString();

Exemplo n.º 2
        /// <summary>
        /// Internal: parses the tag that starts at the current position and fills oChunk with
        /// the tag name, its open/close type and any attributes found.
        /// </summary>
        /// <remarks>
        /// The method works in stages, as marked by the comments in the body:
        /// stage 0 tries to recognise common tags via the heuristics engine (oHE),
        /// stage 1 reads the tag name into the sText buffer, stage 2 reads attribute names and
        /// stage 3 reads attribute values (unquoted, quoted, and quoted with entities).
        /// NOTE(review): this excerpt appears to have lost its block delimiters, else/break/continue
        /// lines and the declarations of the goto labels it jumps to (ReturnChunk, TagParsing,
        /// AttributeParsing, ActualAttributeParsing, AttributeValueParsing, AttributeValueEnd).
        /// Confirm the exact control flow against the original file before changing anything here.
        /// </remarks>
        /// <param name="iCurPos">Index into bHTML of the byte after the opening less-than sign; advanced as bytes are consumed</param>
        /// <returns>HTMLchunk with tag information</returns>
        internal HTMLchunk ParseTag(ref int iCurPos)
            // cChar holds the byte being processed, cPeek the byte that follows it;
            // zero is used for both as the "no more data" marker
            byte cChar = 0;
            byte cPeek = 0;

            // initialise peeked char - this will point to the next after < character
            if (iCurPos < iDataLength)
                cPeek = bHTML[iCurPos];

                // in case of comments ! must follow immediately after <
                if (cPeek == (byte)'!')
                    // two more bytes must be available so that both dashes can be tested
                    if (iCurPos + 2 < iDataLength &&
                        bHTML[iCurPos + 1] == (byte)'-' && bHTML[iCurPos + 2] == (byte)'-')
                        // we detected start of comments here, instead of parsing the rest here we will
                        // call special function tuned to do the job much more effectively
                        oChunk.sTag      = "!--";
                        oChunk.oType     = HTMLchunkType.Comment;
                        oChunk.bComments = true;

                        // step over the three bytes "!--" and hand over to the comment parser;
                        // note that oChunk is reassigned from the result of ParseComments
                        iCurPos += 3;
                        bool bFullTag;
                        oChunk = ParseComments(ref iCurPos, out bFullTag);

                        oChunk.iChunkLength = iCurPos - oChunk.iChunkOffset;

                        if (oP.bAutoKeepComments || oP.bKeepRawHTML)
                            // NOTE(review): the two assignments below are presumably the two arms of an
                            // if/else whose else line is not visible. The second one trims 4 bytes from
                            // the start and 7 (or 4 when bFullTag is false) from the length - presumably
                            // the comment opener and closer; confirm against the original file.
                            if (!oP.bAutoExtractBetweenTagsOnly)
                                oChunk.oHTML = GetString(oChunk.iChunkOffset, oChunk.iChunkLength);
                                oChunk.oHTML = GetString(oChunk.iChunkOffset + 4, oChunk.iChunkLength - (bFullTag ? 7 : 4));

                // empty tag but its not closed, so we will call it open...
                oChunk.oType = HTMLchunkType.OpenTag;
                // end of data... before it started

            // tag ID, non-zero if matched by heuristics engine
            int iTagID = 0;

            // STAGE 0: lets try some heuristics to see if we can quickly identify most common tags
            // that should be present most of the time, this should save a lot of looping and string creation
            if (bEnableHeuristics && iCurPos < iMaxHeuDataLength)
                // check if we have got closure of the tag
                if (cPeek == (byte)'/')
                    oChunk.bClosure    = true;
                    oChunk.bEndClosure = false;
                    oChunk.oType       = HTMLchunkType.CloseTag;
                    // re-read the peeked byte
                    // NOTE(review): an advance of iCurPos past the slash would be expected before
                    // this line but is not visible in this excerpt - confirm
                    cPeek = bHTML[iCurPos];

                // second byte of the tag name candidate
                cChar = bHTML[iCurPos + 1];

                // probability of having a match is very high (or so we expect)
                iTagID = oHE.MatchTag(cPeek, cChar);

                if (iTagID != 0)
                    // a negative ID is flipped to positive and handled as a single character tag
                    if (iTagID < 0)
                        iTagID *= -1;
                        // single character tag
                        oChunk.sTag = oHE.GetString(iTagID);

                        // see if we got fully closed tag
                        if (cChar == (byte)'>')
                            iCurPos += 2;
                            goto ReturnChunk;

                        cPeek = cChar;

                        // everything else means we need to continue scanning as we may have params and stuff
                        goto AttributeParsing;
                        // ok, we have here 2 or more character string that we need to check further
                        // often when we have full 2 char match the next char will be >, if that is the case
                        // then we definitely matched our tag
                        byte cNextChar = bHTML[iCurPos + 2];

                        if (cNextChar == (byte)'>')
                            oChunk.sTag = oHE.GetTwoCharString(cPeek, cChar);
                            iCurPos    += 3;

                            goto ReturnChunk;

                        // ok, check next char for space, if that is the case we still got our tag
                        // but need to skip to attribute parsing
                        if (cNextChar == (byte)' ')
                            oChunk.sTag = oHE.GetTwoCharString(cPeek, cChar);
                            iCurPos    += 2;

                            cPeek = cNextChar;

                            goto AttributeParsing;

                        // ok, we are not very lucky, but it is still worth fighting for
                        // now we need to check fully long string against what we have matched, maybe
                        // we got exact match and we can avoid full parsing of the tag
                        byte[] bTag = oHE.GetStringData(iTagID);

                        // not enough data left for the indexed comparison below: fall back to full parsing
                        if (iCurPos + bTag.Length + 5 >= iDataLength)
                            goto TagParsing;

                        // in a loop (and this is not an ideal solution, but still)
                        // the comparison starts at index 2 because the first two bytes were already matched
                        for (int i = 2; i < bTag.Length; i++)
                            // if a single char is not matched, then we give up on the heuristic match
                            if (bTag[i] != bHTML[iCurPos + i])
                                goto TagParsing;

                        // ok we matched full long word, but we need to be sure that char
                        // after the word is ' ' or '>' as otherwise we may have matched prefix of even longer
                        // word
                        cNextChar = bHTML[iCurPos + bTag.Length];

                        if (cNextChar == (byte)'>')
                            oChunk.sTag = oHE.GetString(iTagID);
                            iCurPos    += bTag.Length + 1;

                            goto ReturnChunk;

                        if (cNextChar == (byte)' ')
                            cPeek       = cNextChar;
                            oChunk.sTag = oHE.GetString(iTagID);
                            iCurPos    += bTag.Length;

                            goto AttributeParsing;

                        // no luck: heuristic matching failed, so the tag has to be parsed in full



            // classification of the current byte as looked up in bTagCharTypes
            byte bCharType = 0;

            // STAGE 1: parse tag (anything until > or /> or whitespace leading to start of attribute)
            while (cPeek != 0)
                bCharType = bTagCharTypes[cPeek];

                //if(cPeek<=32 && bWhiteSpace[cPeek]==1)
                if (bCharType == (byte)TagCharType.WhiteSpace)

                    // speculative loop unroll -- we have a very good chance of seeing non-space char next
                    // so instead of setting up loop we will just read it directly, this should save ticks
                    // on having to prepare while() loop
                    // NOTE(review): the second assignment is presumably the else arm (end of data) - confirm
                    if (iCurPos < iDataLength)
                        cChar = bHTML[iCurPos++];
                        cChar = 0;

                    bCharType = bTagCharTypes[cChar];

                    //if(cChar==' ' || cChar=='\t' || cChar==13 || cChar==10)
                    //if(cChar<=32 && bWhiteSpace[cChar]==1)
                    if (bCharType == (byte)TagCharType.WhiteSpace)
                        // longer run of whitespace: keep reading bytes
                        // NOTE(review): the statement governed by the inner whitespace test is not visible here
                        while (iCurPos < iDataLength)
                            cChar = bHTML[iCurPos++];

                            bCharType = bTagCharTypes[cChar];
                            if (bCharType == (byte)TagCharType.WhiteSpace)
                            //if(cChar!=' ' && cChar!='\t' && cChar!=13 && cChar!=10)


                        // ran out of data while skipping whitespace
                        if (iCurPos >= iDataLength)
                            cChar = 0;


                    // now, if we have already got tag it means that we are most likely
                    // going to need to parse tag attributes
                    if (sText.iBufPos > 0)
                        // the buffered bytes form the tag name
                        oChunk.sTag = sText.SetToStringASCII();

                        // oChunk.Append((byte)' ');


                        // NOTE(review): the second assignment is presumably the else arm (end of data) - confirm
                        if (iCurPos < iDataLength)
                            cPeek = bHTML[iCurPos];
                            cPeek = 0;

                    // reuse Peeked char from previous run
                    //cChar=cPeek; iCurPos++;
                    // NOTE(review): the second assignment is presumably the else arm (end of data) - confirm
                    if (iCurPos < iDataLength)
                        cChar = bHTML[iCurPos++];
                        cChar = 0;

                // refresh the peeked byte; NOTE(review): second assignment presumably the else arm - confirm
                if (iCurPos < iDataLength)
                    cPeek = bHTML[iCurPos];
                    cPeek = 0;

                // most likely we should have lower-cased ASCII char
                if (bCharType == (byte)TagCharType.LowerCasedASCIIorDigit)
                    sText.bBuffer[sText.iBufPos++] = cChar;
                    // oChunk.Append(cChar);

                // tag end - we did not have any params
                if (cChar == (byte)'>')
                    if (sText.iBufPos > 0)
                        oChunk.sTag = sText.SetToStringASCII();

                    // a tag not already flagged as a closure is reported as an open tag
                    if (!oChunk.bClosure)
                        oChunk.oType = HTMLchunkType.OpenTag;


                // closure of tag sign
                if (cChar == (byte)'/')
                    oChunk.bClosure    = true;
                    // bEndClosure is set only when some name bytes were buffered before the slash
                    oChunk.bEndClosure = (sText.iBufPos > 0);
                    oChunk.oType       = HTMLchunkType.CloseTag;

                // nope, we have got upper cased ASCII char - this seems to be LESS likely than > and /
                //if(cChar>=65 && cChar<=90)
                if (bCharType > 32)
                    // bCharType in this case contains already lower-cased char
                    sText.bBuffer[sText.iBufPos++] = bCharType;
                    // oChunk.Append(bCharType);

                // ok, we have got some other char - we break out to deal with it in attributes part

            // NOTE(review): the statement governed by this end-of-data test is not visible here
            if (cPeek == 0)

            // if true then equal sign was found
            //bool bEqualsSign=false;

            // STAGE 2: parse attributes (if any available)
            // attribute name can be standalone or with value after =
            // attribute itself can't have entities or anything like this - we expect it to be in ASCII characters


            string sAttrName;

            // heuristic attribute matching is only attempted when the tag itself was matched heuristically
            if (iTagID != 0)
                // first, skip whitespace:
                if (cPeek <= 32 && bTagCharTypes[cPeek] == (byte)TagCharType.WhiteSpace)
                    // most likely next char is not-whitespace

                    if (iCurPos >= iDataLength)
                        goto ReturnChunk;

                    cPeek = bHTML[iCurPos];

                    if (cPeek <= 32 && bTagCharTypes[cPeek] == (byte)TagCharType.WhiteSpace)
                        // ok long loop here then
                        // NOTE(review): the statement governed by the inner whitespace test is not visible here
                        while (iCurPos < iDataLength)
                            cPeek = bHTML[iCurPos++];

                            if (cPeek <= 32 && bTagCharTypes[cPeek] == (byte)TagCharType.WhiteSpace)


                        // tag ended right after the whitespace: nothing more to parse
                        if (cPeek == (byte)'>')
                            goto ReturnChunk;


                        if (iCurPos >= iDataLength)
                            goto ReturnChunk;

                    if (iCurPos >= iDataLength)
                        goto ReturnChunk;

                // ok we have got matched tag, it is possible that we might be able to quickly match
                // attribute name known to be used for that tag:
                int iAttrID = oHE.MatchAttr(cPeek, iTagID);

                if (iAttrID > 0)
                    byte[] bAttr = oHE.GetAttrData(iAttrID);

                    // not enough data left for the indexed comparison below: use the regular attribute parser
                    if (iCurPos + bAttr.Length + 2 >= iDataLength)
                        goto ActualAttributeParsing;

                    // in a loop (and this is not an ideal solution, but still)
                    // the comparison starts at index 1 because the first byte was used for the match
                    for (int i = 1; i < bAttr.Length; i++)
                        // if a single char is not matched, then we give up on the heuristic match
                        if (bAttr[i] != bHTML[iCurPos + i])
                            goto ActualAttributeParsing;

                    byte cNextChar = bHTML[iCurPos + bAttr.Length];

                    // ok, we expect next symbol to be =
                    if (cNextChar == (byte)'=')
                        sAttrName = oHE.GetAttr(iAttrID);
                        // step over the attribute name and the equals sign
                        iCurPos  += bAttr.Length + 1;
                        cPeek     = bHTML[iCurPos];

                        goto AttributeValueParsing;



            // doing exactly the same thing as in tag parsing
            while (cPeek != 0)
                bCharType = bTagCharTypes[cPeek];

                //if(cPeek<=32 && bWhiteSpace[cPeek]==1)
                if (bCharType == (byte)TagCharType.WhiteSpace)

                    // speculative loop unroll -- we have a very good chance of seeing non-space char next
                    // so instead of setting up loop we will just read it directly, this should save ticks
                    // on having to prepare while() loop
                    // NOTE(review): the cPeek reset is presumably the else arm (end of data) - confirm
                    if (iCurPos < iDataLength)
                        cChar = bHTML[iCurPos++];
                        cPeek = 0;

                    bCharType = bTagCharTypes[cChar];

                    //if(cChar==' ' || cChar=='\t' || cChar==13 || cChar==10)
                    //if(cChar<=32 && bWhiteSpace[cChar]==1)
                    if (bCharType == (byte)TagCharType.WhiteSpace)
                        // longer run of whitespace: keep reading bytes
                        // NOTE(review): the statement governed by the inner whitespace test is not visible here
                        while (iCurPos < iDataLength)
                            cChar = bHTML[iCurPos++];

                            bCharType = bTagCharTypes[cChar];
                            if (bCharType == (byte)TagCharType.WhiteSpace)
                            //if(cChar!=' ' && cChar!='\t' && cChar!=13 && cChar!=10)

                            //	goto ReturnChunk;


                        // ran out of data while skipping whitespace: clear both markers
                        if (iCurPos >= iDataLength)
                            cChar = 0;
                            cPeek = 0;


                    // now, if we have already got attribute name it means that we need to go to parse value (which may not be present)
                    if (sText.iBufPos > 0)
                        // oChunk.Append((byte)' ');


                        // NOTE(review): the second assignment is presumably the else arm (end of data) - confirm
                        if (iCurPos < iDataLength)
                            cPeek = bHTML[iCurPos];
                            cPeek = 0;

                        // ok, we have got attribute name and now we have got next char there

                        // most likely we have got = here  and then value
                        if (cPeek == (byte)'=')

                            // move forward one char
                            // NOTE(review): the advance itself is not visible in this excerpt

                            if (iCurPos < iDataLength)
                                cPeek = bHTML[iCurPos];
                                cPeek = 0;


                        // or we can have end of tag itself, doh!
                        if (cPeek == (byte)'>')
                            // move forward one char

                            // a name with no value is stored with an empty value and a space as the quote byte
                            if (sText.iBufPos > 0)
                                oChunk.AddParam(sText.SetToStringASCII(), "", (byte)' ');

                            if (!oChunk.bClosure)
                                oChunk.oType = HTMLchunkType.OpenTag;


                        // closure
                        if (cPeek == (byte)'/')
                            oChunk.bClosure    = true;
                            oChunk.bEndClosure = true;
                            oChunk.oType       = HTMLchunkType.CloseTag;

                        // ok, we have got new char starting after current attribute name is fully parsed
                        // this means the attribute name is on its own and the char we found is start
                        // of a new attribute
                        oChunk.AddParam(sText.SetToStringASCII(), "", (byte)' ');
                        goto AttributeParsing;
                    // reuse Peeked char from previous run
                    //cChar=cPeek; iCurPos++;
                    // NOTE(review): the second assignment is presumably the else arm (end of data) - confirm
                    if (iCurPos < iDataLength)
                        cChar = bHTML[iCurPos++];
                        cChar = 0;

                // refresh the peeked byte; NOTE(review): second assignment presumably the else arm - confirm
                if (iCurPos < iDataLength)
                    cPeek = bHTML[iCurPos];
                    cPeek = 0;

                // most likely we should have lower-cased ASCII char here
                if (bCharType == (byte)TagCharType.LowerCasedASCIIorDigit)
                    sText.bBuffer[sText.iBufPos++] = cChar;
                    // oChunk.Append(cChar);

                // = with attribute value to follow
                // NOTE(review): the statement governed by this test is not visible here
                if (cChar == (byte)'=')

                // nope, we have got upper cased ASCII char - this seems to be LESS likely than > and /
                //if(cChar>=65 && cChar<=90)
                if (bCharType > 32)
                    // bCharType in this case contains already lower-cased char
                    sText.bBuffer[sText.iBufPos++] = bCharType;
                    // oChunk.Append(bCharType);

                // tag end - we did not have any params
                if (cChar == (byte)'>')
                    if (sText.iBufPos > 0)
                        oChunk.AddParam(sText.SetToStringASCII(), "", (byte)' ');

                    if (!oChunk.bClosure)
                        oChunk.oType = HTMLchunkType.OpenTag;


                // closure of tag sign
                if (cChar == (byte)'/')
                    oChunk.bClosure    = true;
                    oChunk.bEndClosure = true;
                    oChunk.oType       = HTMLchunkType.CloseTag;

                // some other char
                sText.bBuffer[sText.iBufPos++] = cChar;
                // oChunk.Append(cChar);

            // end of data reached while reading an attribute name: flush what was buffered
            if (cPeek == 0)
                if (sText.iBufPos > 0)
                    oChunk.AddParam(sText.SetToStringASCII(), "", (byte)' ');

                if (!oChunk.bClosure)
                    oChunk.oType = HTMLchunkType.OpenTag;


            // the buffered bytes form the attribute name whose value is parsed next
            sAttrName = sText.SetToStringASCII();


            /// ***********************************************************************
            /// STAGE 3: parse attribute value
            /// ***********************************************************************

            // the value could be just string, or in quotes (single or double)
            // or we can have next attribute name start, in which case we will jump back to attribute parsing

            // for tracking quotes purposes
            byte cQuotes = cPeek;

            // offset in bHTML at which the raw attribute value starts
            int iValueStartOffset;

            // skip whitespace if any
            if (cPeek <= 32 && bTagCharTypes[cPeek] == (byte)TagCharType.WhiteSpace)

                // speculative loop unroll -- we have a very good chance of seeing non-space char next
                // so instead of setting up loop we will just read it directly, this should save ticks
                // on having to prepare while() loop
                // NOTE(review): the last two statements are presumably the else arm (end of data) - confirm
                if (iCurPos < iDataLength)
                    cPeek = bHTML[iCurPos];
                    iValueStartOffset = iCurPos - 1;
                    goto AttributeValueEnd;

                //if(cChar==' ' || cChar=='\t' || cChar==13 || cChar==10)
                //if(cChar<=32 && bWhiteSpace[cChar]==1)
                if (cPeek <= 32 && bTagCharTypes[cPeek] == (byte)TagCharType.WhiteSpace)
                    // NOTE(review): the statement governed by the inner whitespace test is not visible here
                    while (iCurPos < iDataLength)
                        cPeek = bHTML[iCurPos++];

                        if (cPeek <= 32 && bTagCharTypes[cPeek] == (byte)TagCharType.WhiteSpace)
                        //if(cChar!=' ' && cChar!='\t' && cChar!=13 && cChar!=10)


                    // ran out of data while skipping whitespace before the value
                    if (iCurPos >= iDataLength)
                        iValueStartOffset = iCurPos - 1;
                        goto AttributeValueEnd;

                // first non-whitespace byte is the quote candidate
                cQuotes = cPeek;

            // because we deal with VALUE of the attribute it means we can't lower-case it,
            // or skip whitespace (if in quotes), which in practice means that we don't need to copy
            // it to temporary string buffer, we can just remember starting offset and then create string from
            // data in bHTML

            // ok, first char can be one of the quote chars or something else
            if (cPeek != '\"' && cPeek != '\'')
                iValueStartOffset = iCurPos;

                // unquoted value: a space is recorded as the quote byte
                cQuotes = (byte)' ';
                // any other char here means we have value up until next whitespace or end of tag
                // this gives us good opportunity to scan fairly quickly without otherwise redundant
                // checks - this should happen fairly rarely, however loop dealing with data between quotes
                // will happen often enough and its best to eliminate as much stuff from it as possible

                // move to next char
                // NOTE(review): the goto is presumably the else arm (end of data) - confirm
                if (iCurPos < iDataLength)
                    cPeek = bHTML[iCurPos++];
                    goto AttributeValueEnd;

                while (cPeek != 0)
                    // if whitespace then we got our value and need to go back to param
                    if (cPeek <= 32 && bTagCharTypes[cPeek] == (byte)TagCharType.WhiteSpace)
                        // iCurPos is one past the terminating byte, hence the extra -1 in the length
                        oChunk.AddParam(sAttrName, GetString(iValueStartOffset, iCurPos - iValueStartOffset - 1), (byte)' ');
                        goto AttributeParsing;

                    // end of tag?
                    // NOTE(review): the statement governed by this test is not visible here
                    if (cPeek == (byte)'>')

                    // NOTE(review): the last two statements are presumably the else arm (end of data);
                    // setting iCurPos one past iDataLength presumably keeps the "- 1" length formula
                    // used later from dropping the final byte - confirm
                    if (iCurPos < iDataLength)
                        cPeek = bHTML[iCurPos++];
                        iCurPos = iDataLength + 1;
                        goto AttributeValueEnd;

                // ok we are done, add outstanding attribute
                oChunk.AddParam(sAttrName, GetString(iValueStartOffset, iCurPos - iValueStartOffset - 1), (byte)' ');

                goto ReturnChunk;

            // move one step forward

            iValueStartOffset = iCurPos;

            // NOTE(review): the goto is presumably the else arm (end of data) - confirm
            if (iCurPos < iDataLength)
                cPeek = bHTML[iCurPos++];
                goto AttributeValueEnd;

            // attribute value parsing from between two quotes
            while (cPeek != 0)
                // check whether we have got possible entity (can be anything starting with &)
                // 38 is the ASCII code of the ampersand
                if (cPeek == 38)
                    // remember where the candidate entity started
                    int iPrevPos = iCurPos;

                    char cEntityChar = oE.CheckForEntity(bHTML, ref iCurPos, iDataLength);

                    // restore current symbol
                    if (cEntityChar == 0)
                        if (iCurPos < iDataLength)
                            cPeek = bHTML[iCurPos++];

                        //sText.bBuffer[sText.iBufPos++]=38; //(byte)'&';;
                        // okay we have got an entity, our hope of not having to copy stuff into variable
                        // is over, we have to continue in a slower fashion :(
                        // but thankfully this should happen very rarely, so, annoying to code, but
                        // most codepaths will run very fast!
                        // number of raw value bytes that precede the entity
                        int iPreEntLen = iPrevPos - iValueStartOffset - 1;

                        // 14/05/08 need to clear text - it contains attribute name text

                        // copy previous data
                        if (iPreEntLen > 0)
                            Array.Copy(bHTML, iValueStartOffset, sText.bBuffer, 0, iPreEntLen);
                            sText.iBufPos = iPreEntLen;

                        // we have to skip now to next byte, since
                        // some converted chars might well be control chars like >
                        oChunk.bEntities = true;

                        // NOTE(review): this tests cChar, whereas the decoded entity is in cEntityChar and
                        // the inner loop below tests its own entity result (cNewEntityChar); confirm
                        // which variable is intended here
                        if (cChar == (byte)'<')
                            oChunk.bLtEntity = true;

                        // unless is space we will ignore it
                        // note that this won't work if &nbsp; is defined as it should
                        // byte int value of 160, rather than 32.
                        //if(cChar!=' ')

                        // NOTE(review): the goto is presumably the else arm (end of data) - confirm
                        if (iCurPos < iDataLength)
                            cPeek = bHTML[iCurPos++];
                            goto AttributeValueEnd;

                        // okay, we continue here using in effect new inside loop as we might have more entities here
                        // attribute value parsing from between two quotes
                        while (cPeek != 0)
                            // check whether we have got possible entity (can be anything starting with &)
                            if (cPeek == 38)
                                char cNewEntityChar = oE.CheckForEntity(bHTML, ref iCurPos, iDataLength);

                                // restore current symbol
                                if (cNewEntityChar != 0)
                                    if (cNewEntityChar == (byte)'<')
                                        oChunk.bLtEntity = true;


                                    // NOTE(review): the goto is presumably the else arm (end of data) - confirm
                                    if (iCurPos < iDataLength)
                                        cPeek = bHTML[iCurPos++];
                                        goto AttributeValueEnd;


                            // check if is end of quotes
                            if (cPeek == cQuotes)
                                // ok we finished scanning it: add param with value and then go back to param name parsing
                                oChunk.AddParam(sAttrName, sText.SetToString(), cQuotes);

                                if (iCurPos < iDataLength)
                                    cPeek = bHTML[iCurPos];

                                goto AttributeParsing;

                            // ordinary byte inside the quoted value: copy it to the buffer
                            sText.bBuffer[sText.iBufPos++] = cPeek;

                            if (iCurPos < iDataLength)
                                cPeek = bHTML[iCurPos++];

                        // data ended inside the quoted value: store what was collected
                        oChunk.AddParam(sAttrName, sText.SetToString(), cQuotes);
                        goto ReturnChunk;

                // check if is end of quotes
                if (cPeek == cQuotes)
                    // ok we finished scanning it: add param with value and then go back to param name parsing

                    // iCurPos is one past the closing quote, hence the extra -1 in the length
                    oChunk.AddParam(sAttrName, GetString(iValueStartOffset, iCurPos - iValueStartOffset - 1), cQuotes);

                    if (iCurPos < iDataLength)
                        cPeek = bHTML[iCurPos];

                    goto AttributeParsing;

                if (iCurPos < iDataLength)
                    cPeek = bHTML[iCurPos++];


            // ok we are done, add outstanding attribute
            int iLen = iCurPos - iValueStartOffset - 1;

            // NOTE(review): the two AddParam calls are presumably the two arms of an if/else
            // (value present / empty value) whose else line is not visible - confirm
            if (iLen > 0)
                oChunk.AddParam(sAttrName, GetString(iValueStartOffset, iLen), cQuotes);
                oChunk.AddParam(sAttrName, "", cQuotes);


            // NOTE(review): the two assignments are presumably the two arms of an if/else - confirm
            if (oChunk.bClosure)
                oChunk.oType = HTMLchunkType.CloseTag;
                oChunk.oType = HTMLchunkType.OpenTag;
