예제 #1
0
        /// <summary>
        /// Parses the tag located at the current position (reached right after '&lt;' was found)
        /// and returns the chunk describing it.
        /// </summary>
        /// <returns>
        /// The parsed chunk. For a non-closing "script" tag this is whatever the script parser
        /// returns; otherwise it is the tag chunk with its length (and, if enabled, raw HTML) filled in.
        /// </returns>
        HTMLchunk GetNextTag()
        {
            oChunk = oTP.ParseTag(ref iCurPos);

            // backwards compatibility: a closing tag that carries params is reported as an open tag
            if (oChunk.iParams > 0 && bAutoMarkClosedTagsWithParamsAsOpen)
            {
                if (oChunk.oType == HTMLchunkType.CloseTag)
                {
                    oChunk.oType = HTMLchunkType.OpenTag;
                }
            }

            // cheap length and first-char tests run before the full string comparison
            bool bIsScriptTag = oChunk.sTag.Length == 6
                                && oChunk.sTag[0] == 's'
                                && oChunk.sTag == "script";

            // start of a script: hand over to the dedicated script parser and return its chunk as-is
            if (bIsScriptTag && !oChunk.bClosure)
            {
                oChunk.oType = HTMLchunkType.Script;
                oChunk = oTP.ParseScript(ref iCurPos);
                return oChunk;
            }

            // the chunk spans from its recorded offset up to the position the tag parser stopped at
            oChunk.iChunkLength = iCurPos - oChunk.iChunkOffset;

            if (!bKeepRawHTML)
            {
                return oChunk;
            }

            // keep a decoded copy of the bytes that make up this chunk
            oChunk.oHTML = oEnc.GetString(bHTML, oChunk.iChunkOffset, oChunk.iChunkLength);
            return oChunk;
        }
예제 #2
0
        /// <summary>
        /// Releases the objects held by this instance. Only the first call does any work;
        /// subsequent calls return immediately.
        /// </summary>
        /// <param name="bDisposing">
        /// True when called from an explicit dispose request: the owned objects are disposed.
        /// False is, per the standard dispose pattern, the finalizer path: other managed objects
        /// may already have been finalized, so they are not touched and only the references are dropped.
        /// </param>
        private void Dispose(bool bDisposing)
        {
            if (bDisposed)
            {
                return;
            }

            // mark first so that re-entrant or repeated calls become no-ops
            bDisposed = true;

            if (bDisposing)
            {
                // safe to call into owned managed objects only on the explicit dispose path
                if (oChunk != null)
                {
                    oChunk.Dispose();
                }

                if (sText != null)
                {
                    sText.Dispose();
                }

                if (oE != null)
                {
                    oE.Dispose();
                }

                if (oTP != null)
                {
                    oTP.Dispose();
                }
            }

            // drop references in every case so the objects become collectable
            oChunk = null;
            sText = null;
            bHTML = null;
            oE = null;
            oTP = null;
        }
예제 #3
0
        /// <summary>
        /// Internal: parses tag that started from current position
        /// </summary>
        /// <returns>HTMLchunk with tag information</returns>
        internal HTMLchunk ParseTag(ref int iCurPos)
        {
            /*
             *  WARNING: this code was optimised for performance rather than for readability,
             *  so be extremely careful at changing it -- your changes could easily result in wrongly parsed HTML
             *
             *  This routine takes about 60% of CPU time, in theory its the best place to gain extra speed,
             *  but I've spent plenty of time doing it, so it won't be easy... and if it is easy then please post
             *  your changes for everyone to enjoy!
             *
             *
             * */

            //bool bWhiteSpaceHere=false;

            //bool bParamValue=false;
            byte cChar=0;
            byte cPeek=0;

            // if true it means we have parsed complete tag
            //bool bGotTag=false;

            //int iEqualIdx=0;

            // we reach this function immediately after tag's byte (<) was
            // detected, so we need to save it in order to keep correct HTML copy
            // oChunk.Append((byte)'<'); // (byte)'<'

            /*
            oChunk.bBuffer[0]=60;
            oChunk.iBufPos=1;
            oChunk.iHTMLen=1;
            */

            // initialise peeked char - this will point to the next after < character
            if(iCurPos<iDataLength)
            {
                cPeek=bHTML[iCurPos];

                // in case of comments ! must follow immediately after <
                if(cPeek==(byte)'!')
                {
                    if(iCurPos+2<iDataLength &&
                        bHTML[iCurPos+1]==(byte)'-' && bHTML[iCurPos+2]==(byte)'-')
                    {
                        // we detected start of comments here, instead of parsing the rest here we will
                        // call special function tuned to do the job much more effectively
                        oChunk.sTag="!--";
                        oChunk.oType=HTMLchunkType.Comment;
                        oChunk.bComments=true;
                        // oChunk.Append((byte)'!');
                        // oChunk.Append((byte)'-');
                        // oChunk.Append((byte)'-');
                        iCurPos+=3;
                        bool bFullTag;
                        oChunk=ParseComments(ref iCurPos,out bFullTag);

                        oChunk.iChunkLength=iCurPos-oChunk.iChunkOffset;

                        if(oP.bAutoKeepComments || oP.bKeepRawHTML)
                        {
                            if(!oP.bAutoExtractBetweenTagsOnly)
                                oChunk.oHTML=GetString(oChunk.iChunkOffset,oChunk.iChunkLength);
                            else
                            {
                                oChunk.oHTML=GetString(oChunk.iChunkOffset+4,oChunk.iChunkLength-(bFullTag ? 7 : 4));
                            }

                        }

                        return oChunk;
                    }

                    // ok we might have here CDATA element of XML:
                    // ref: http://www.w3schools.com/xml/xml_cdata.asp
                    if(iCurPos+7<iDataLength &&
                        bHTML[iCurPos+1]==(byte)'[' &&
                        bHTML[iCurPos+2]==(byte)'C' &&
                        bHTML[iCurPos+3]==(byte)'D' &&
                        bHTML[iCurPos+4]==(byte)'A' &&
                        bHTML[iCurPos+5]==(byte)'T' &&
                        bHTML[iCurPos+6]==(byte)'A' &&
                        bHTML[iCurPos+7]==(byte)'['
                        )
                    {
                        // we detected start of comments here, instead of parsing the rest here we will
                        // call special function tuned to do the job much more effectively
                        oChunk.sTag="![CDATA[";
                        oChunk.oType=HTMLchunkType.Comment;
                        oChunk.bComments=true;
                        // oChunk.Append((byte)'!');
                        // oChunk.Append((byte)'-');
                        // oChunk.Append((byte)'-');
                        iCurPos+=8;
                        bool bFullTag;
                        oChunk=ParseCDATA(ref iCurPos,out bFullTag);

                        oChunk.iChunkLength=iCurPos-oChunk.iChunkOffset;

                        if(oP.bAutoKeepComments || oP.bKeepRawHTML)
                        {
                            if(!oP.bAutoExtractBetweenTagsOnly)
                                oChunk.oHTML=GetString(oChunk.iChunkOffset,oChunk.iChunkLength);
                            else
                            {
                                oChunk.oHTML=GetString(oChunk.iChunkOffset+4+5,
                                    oChunk.iChunkLength-(bFullTag ? 7+5 : 4+5));
                            }

                        }

                        return oChunk;
                    }

                }

            }
            else
            {
                // empty tag but its not closed, so we will call it open...
                oChunk.oType=HTMLchunkType.OpenTag;
                // end of UltimaData... before it started
                return oChunk;
            }

            // tag ID, non-zero if matched by heuristics engine
            int iTagID=0;

            // STAGE 0: lets try some heuristics to see if we can quickly identify most common tags
            // that should be present most of the time, this should save a lot of looping and string creation
            if(bEnableHeuristics && iCurPos<iMaxHeuDataLength)
            {
                // check if we have got closure of the tag
                if(cPeek==(byte)'/')
                {
                    oChunk.bClosure=true;
                    oChunk.bEndClosure=false;
                    oChunk.oType=HTMLchunkType.CloseTag;
                    iCurPos++;
                    cPeek=bHTML[iCurPos];
                }

                cChar=bHTML[iCurPos+1];

                // probability of having a match is very high (or so we expect)
                iTagID=oHE.MatchTag(cPeek,cChar);

                if(iTagID!=0)
                {
                    if(iTagID<0)
                    {
                        iTagID*=-1;
                        // single character tag
                        oChunk.sTag=oHE.GetString(iTagID);

                        // see if we got fully closed tag
                        if(cChar==(byte)'>')
                        {
                            iCurPos+=2;
                            goto ReturnChunk;
                        }

                        cPeek=cChar;
                        iCurPos++;

                        // everything else means we need to continue scanning as we may have params and stuff
                        goto AttributeParsing;
                    }
                    else
                    {
                        // ok, we have here 2 or more character string that we need to check further
                        // often when we have full 2 char match the next char will be >, if that's the case
                        // then we definately matched our tag
                        byte cNextChar=bHTML[iCurPos+2];

                        if(cNextChar==(byte)'>')
                        {
                            //oChunk.sTag=oHE.GetString(iTagID);
                            oChunk.sTag=oHE.GetTwoCharString(cPeek,cChar);
                            iCurPos+=3;

                            goto ReturnChunk;
                        }

                        // ok, check next char for space, if that's the case we still got our tag
                        // but need to skip to attribute parsing
                        if(cNextChar==(byte)' ')
                        {
                            //oChunk.sTag=oHE.GetString(iTagID);
                            oChunk.sTag=oHE.GetTwoCharString(cPeek,cChar);
                            iCurPos+=2;

                            cPeek=cNextChar;

                            goto AttributeParsing;
                        }

                        // ok, we are not very lucky, but it is still worth fighting for
                        // now we need to check fully long string against what we have matched, maybe
                        // we got exact match and we can avoid full parsing of the tag
                        byte[] bTag=oHE.GetStringData(iTagID);

                        if(iCurPos+bTag.Length+5>=iDataLength)
                            goto TagParsing;

                        // in a loop (and this is not an ideal solution, but still)
                        for(int i=2; i<bTag.Length; i++)
                        {
                            // if a single char is not matched, then we
                            if(bTag[i]!=bHTML[iCurPos+i])
                            {
                                goto TagParsing;
                            }
                        }

                        // ok we matched full long word, but we need to be sure that char
                        // after the word is ' ' or '>' as otherwise we may have matched prefix of even longer
                        // word
                        cNextChar=bHTML[iCurPos+bTag.Length];

                        if(cNextChar==(byte)'>')
                        {
                            oChunk.sTag=oHE.GetString(iTagID);
                            iCurPos+=bTag.Length+1;

                            goto ReturnChunk;
                        }

                        if(cNextChar==(byte)' ')
                        {
                            cPeek=cNextChar;
                            oChunk.sTag=oHE.GetString(iTagID);
                            iCurPos+=bTag.Length;

                            goto AttributeParsing;
                        }

                        // no luck: we need to parse tag fully as our heuristical matching failed miserably :'o(
                    }

                }
            }

            TagParsing:

            sText.Clear();

            byte bCharType=0;

            // STAGE 1: parse tag (anything until > or /> or whitespace leading to start of attribute)
            while(cPeek!=0)
            {
                bCharType=bTagCharTypes[cPeek];

                //if(cPeek<=32 && bWhiteSpace[cPeek]==1)
                if(bCharType==(byte)TagCharType.WhiteSpace)
                {
                    iCurPos++;

                    // speculative loop unroll -- we have a very good chance of seeing non-space char next
                    // so instead of setting up loop we will just read it directly, this should save ticks
                    // on having to prepare while() loop
                    if(iCurPos<iDataLength)
                        cChar=bHTML[iCurPos++];
                    else
                        cChar=0;

                    bCharType=bTagCharTypes[cChar];

                    //if(cChar==' ' || cChar=='\t' || cChar==13 || cChar==10)
                    //if(cChar<=32 && bWhiteSpace[cChar]==1)
                    if(bCharType==(byte)TagCharType.WhiteSpace)
                    {

                        while(iCurPos<iDataLength)
                        {
                            cChar=bHTML[iCurPos++];

                            bCharType=bTagCharTypes[cChar];
                            if(bCharType==(byte)TagCharType.WhiteSpace)
                            //if(cChar!=' ' && cChar!='\t' && cChar!=13 && cChar!=10)
                            {
                                //cPeek=bHTML[iCurPos];
                                continue;
                            }

                            break;
                        }

                        if(iCurPos>=iDataLength)
                            cChar=0;
                    }

                    //bWhiteSpaceHere=true;

                    // now, if we have already got tag it means that we are most likely
                    // going to need to parse tag attributes
                    if(sText.iBufPos>0)
                    {
                        oChunk.sTag=sText.SetToStringASCII();

                        // oChunk.Append((byte)' ');

                        iCurPos--;

                        if(iCurPos<iDataLength)
                            cPeek=bHTML[iCurPos];
                        else
                            cPeek=0;

                        break;
                    }

                }
                else
                {
                    // reuse Peeked char from previous run
                    //cChar=cPeek; iCurPos++;
                    if(iCurPos<iDataLength)
                        cChar=bHTML[iCurPos++];
                    else
                        cChar=0;
                }

                if(iCurPos<iDataLength)
                    cPeek=bHTML[iCurPos];
                else
                    cPeek=0;

                // most likely we should have lower-cased ASCII char
                if(bCharType==(byte)TagCharType.LowerCasedASCIIorDigit)
                {
                    sText.bBuffer[sText.iBufPos++]=cChar;
                    // oChunk.Append(cChar);
                    continue;
                }

                // tag end - we did not have any params
                if(cChar==(byte)'>')
                {
                    if(sText.iBufPos>0)
                        oChunk.sTag=sText.SetToStringASCII();

                    if(!oChunk.bClosure)
                        oChunk.oType=HTMLchunkType.OpenTag;

                    return oChunk;
                }

                // closure of tag sign
                if(cChar==(byte)'/')
                {
                    oChunk.bClosure=true;
                    oChunk.bEndClosure=(sText.iBufPos>0);
                    oChunk.oType=HTMLchunkType.CloseTag;
                    continue;
                }

                // 03/08/08 XML support: ?xml tags - grrr
                if(cChar==(byte)'?')
                {
                    sText.bBuffer[sText.iBufPos++]=cChar;
                    continue;
                }

                // nope, we have got upper cased ASCII char	- this seems to be LESS likely than > and /
                //if(cChar>=65 && cChar<=90)
                if(bCharType>32)
                {
                    // bCharType in this case contains already lower-cased char
                    sText.bBuffer[sText.iBufPos++]=bCharType;
                    // oChunk.Append(bCharType);
                    continue;
                }

                // we might have namespace : sign here - all text before would have to be
                // saved as namespace and we will need to continue parsing actual tag
                if(bCharType==(byte)TagCharType.NameSpaceColon)
                {
                    // ok here we got a choice - we can just continue and treat the whole
                    // thing as a single tag with namespace stuff prefixed, OR
                    // we can separate first part into namespace and keep tag as normal
                    sText.bBuffer[sText.iBufPos++]=(byte)':';
                    continue;
                }

                // ok, we have got some other char - we break out to deal with it in attributes part
                break;

            }

            if(cPeek==0)
            {
                return oChunk;
            }

            // if true then equal sign was found
            //bool bEqualsSign=false;

            // STAGE 2: parse attributes (if any available)
            // attribute name can be standalone or with value after =
            // attribute itself can't have entities or anything like this - we expect it to be in ASCII characters

            AttributeParsing:

            string sAttrName;

            if(iTagID!=0)
            {

                // first, skip whitespace:
                if(cPeek<=32 && bTagCharTypes[cPeek]==(byte)TagCharType.WhiteSpace)
                {
                    // most likely next char is not-whitespace
                    iCurPos++;

                    if(iCurPos>=iDataLength)
                        goto ReturnChunk;

                    cPeek=bHTML[iCurPos];

                    if(cPeek<=32 && bTagCharTypes[cPeek]==(byte)TagCharType.WhiteSpace)
                    {
                        // ok long loop here then
                        while(iCurPos<iDataLength)
                        {
                            cPeek=bHTML[iCurPos++];

                            if(cPeek<=32 && bTagCharTypes[cPeek]==(byte)TagCharType.WhiteSpace)
                                continue;

                            break;
                        }

                        if(cPeek==(byte)'>')
                            goto ReturnChunk;

                        iCurPos--;

                        if(iCurPos>=iDataLength)
                            goto ReturnChunk;
                    }

                    if(iCurPos>=iDataLength)
                        goto ReturnChunk;

                }

                // ok we have got matched tag, it is possible that we might be able to quickly match
                // attribute name known to be used for that tag:
                int iAttrID=oHE.MatchAttr(cPeek,iTagID);

                if(iAttrID>0)
                {
                    byte[] bAttr=oHE.GetAttrData(iAttrID);

                    if(iCurPos+bAttr.Length+2>=iDataLength)
                        goto ActualAttributeParsing;

                    // in a loop (and this is not an ideal solution, but still)
                    for(int i=1; i<bAttr.Length; i++)
                    {
                        // if a single char is not matched, then we
                        if(bAttr[i]!=bHTML[iCurPos+i])
                        {
                            goto ActualAttributeParsing;
                        }
                    }

                    byte cNextChar=bHTML[iCurPos+bAttr.Length];

                    // ok, we expect next symbol to be =
                    if(cNextChar==(byte)'=')
                    {
                        sAttrName=oHE.GetAttr(iAttrID);
                        iCurPos+=bAttr.Length+1;
                        cPeek=bHTML[iCurPos];

                        goto AttributeValueParsing;
                    }

                }

            }

            ActualAttributeParsing:

            sText.Clear();

            // doing exactly the same thing as in tag parsing
            while(cPeek!=0)
            {
                bCharType=bTagCharTypes[cPeek];

                //if(cPeek<=32 && bWhiteSpace[cPeek]==1)
                if(bCharType==(byte)TagCharType.WhiteSpace)
                {
                    iCurPos++;

                    // speculative loop unroll -- we have a very good chance of seeing non-space char next
                    // so instead of setting up loop we will just read it directly, this should save ticks
                    // on having to prepare while() loop
                    if(iCurPos<iDataLength)
                        cChar=bHTML[iCurPos++];
                    else
                    {
                        cPeek=0;
                        break;
                    }

                    bCharType=bTagCharTypes[cChar];

                    //if(cChar==' ' || cChar=='\t' || cChar==13 || cChar==10)
                    //if(cChar<=32 && bWhiteSpace[cChar]==1)
                    if(bCharType==(byte)TagCharType.WhiteSpace)
                    {

                        while(iCurPos<iDataLength)
                        {
                            cChar=bHTML[iCurPos++];

                            bCharType=bTagCharTypes[cChar];
                            if(bCharType==(byte)TagCharType.WhiteSpace)
                            //if(cChar!=' ' && cChar!='\t' && cChar!=13 && cChar!=10)
                            {
                                //cPeek=bHTML[iCurPos];
                                continue;
                            }

                            //if(cChar==(byte)'>')
                            //	goto ReturnChunk;

                            //iCurPos--;
                            break;
                        }

                        if(iCurPos>=iDataLength)
                        {
                            cChar=0;
                            cPeek=0;
                            break;
                        }
                    }

                    //bWhiteSpaceHere=true;

                    // now, if we have already got attribute name it means that we need to go to parse value (which may not be present)
                    if(sText.iBufPos>0)
                    {
                        // oChunk.Append((byte)' ');

                        iCurPos--;

                        if(iCurPos<iDataLength)
                            cPeek=bHTML[iCurPos];
                        else
                            cPeek=0;

                        // ok, we have got attribute name and now we have got next char there

                        // most likely we have got = here  and then value
                        if(cPeek==(byte)'=')
                        {
                            //bEqualsSign=true;

                            // move forward one char
                            iCurPos++;

                            if(iCurPos<iDataLength)
                                cPeek=bHTML[iCurPos];
                            else
                                cPeek=0;

                            break;
                        }

                        // or we can have end of tag itself, doh!
                        if(cPeek==(byte)'>')
                        {
                            // move forward one char
                            iCurPos++;

                            if(sText.iBufPos>0)
                                oChunk.AddParam(sText.SetToStringASCII(),"",(byte)' ');

                            if(!oChunk.bClosure)
                                oChunk.oType=HTMLchunkType.OpenTag;

                            return oChunk;
                        }

                        // closure
                        if(cPeek==(byte)'/')
                        {
                            oChunk.bClosure=true;
                            oChunk.bEndClosure=true;
                            oChunk.oType=HTMLchunkType.CloseTag;
                            continue;
                        }

                        // ok, we have got new char starting after current attribute name is fully parsed
                        // this means the attribute name is on its own and the char we found is start
                        // of a new attribute
                        oChunk.AddParam(sText.SetToStringASCII(),"",(byte)' ');
                        sText.Clear();
                        goto AttributeParsing;
                    }

                }
                else
                {
                    // reuse Peeked char from previous run
                    //cChar=cPeek; iCurPos++;
                    if(iCurPos<iDataLength)
                        cChar=bHTML[iCurPos++];
                    else
                        cChar=0;
                }

                if(iCurPos<iDataLength)
                    cPeek=bHTML[iCurPos];
                else
                    cPeek=0;

                // most likely we should have lower-cased ASCII char here
                if(bCharType==(byte)TagCharType.LowerCasedASCIIorDigit)
                {
                    sText.bBuffer[sText.iBufPos++]=cChar;
                    // oChunk.Append(cChar);
                    continue;
                }

                // = with attribute value to follow
                if(cChar==(byte)'=')
                {
                    //bEqualsSign=true;
                    break;
                }

                // nope, we have got upper cased ASCII char	- this seems to be LESS likely than > and /
                //if(cChar>=65 && cChar<=90)
                if(bCharType>32)
                {
                    // bCharType in this case contains already lower-cased char
                    sText.bBuffer[sText.iBufPos++]=bCharType;
                    // oChunk.Append(bCharType);
                    continue;
                }

                // tag end - we did not have any params
                if(cChar==(byte)'>')
                {
                    if(sText.iBufPos>0)
                        oChunk.AddParam(sText.SetToStringASCII(),"",(byte)' ');

                    if(!oChunk.bClosure)
                        oChunk.oType=HTMLchunkType.OpenTag;

                    return oChunk;
                }

                // closure of tag sign
                if(cChar==(byte)'/')
                {
                    oChunk.bClosure=true;
                    oChunk.bEndClosure=true;
                    oChunk.oType=HTMLchunkType.CloseTag;
                    continue;
                }

                // some other char
                sText.bBuffer[sText.iBufPos++]=cChar;
                // oChunk.Append(cChar);
            }

            if(cPeek==0)
            {
                if(sText.iBufPos>0)
                    oChunk.AddParam(sText.SetToStringASCII(),"",(byte)' ');

                if(!oChunk.bClosure)
                    oChunk.oType=HTMLchunkType.OpenTag;

                return oChunk;
            }

            sAttrName=sText.SetToStringASCII();

            AttributeValueParsing:

            /// ***********************************************************************
            /// STAGE 3: parse attribute value
            /// ***********************************************************************

            // the value could be just string, or in quotes (single or double)
            // or we can have next attribute name start, in which case we will jump back to attribute parsing

            // for tracking quotes purposes
            byte cQuotes=cPeek;

            int iValueStartOffset;

            // skip whitespace if any
            if(cPeek<=32 && bTagCharTypes[cPeek]==(byte)TagCharType.WhiteSpace)
            {
                iCurPos++;

                // speculative loop unroll -- we have a very good chance of seeing non-space char next
                // so instead of setting up loop we will just read it directly, this should save ticks
                // on having to prepare while() loop
                if(iCurPos<iDataLength)
                    cPeek=bHTML[iCurPos];
                else
                {
                    iValueStartOffset=iCurPos-1;
                    goto AttributeValueEnd;
                }

                //if(cChar==' ' || cChar=='\t' || cChar==13 || cChar==10)
                //if(cChar<=32 && bWhiteSpace[cChar]==1)
                if(cPeek<=32 && bTagCharTypes[cPeek]==(byte)TagCharType.WhiteSpace)
                {

                    while(iCurPos<iDataLength)
                    {
                        cPeek=bHTML[iCurPos++];

                        if(cPeek<=32 && bTagCharTypes[cPeek]==(byte)TagCharType.WhiteSpace)
                        //if(cChar!=' ' && cChar!='\t' && cChar!=13 && cChar!=10)
                        {
                            //cPeek=bHTML[iCurPos];
                            continue;
                        }

                        iCurPos--;
                        break;
                    }

                    if(iCurPos>=iDataLength)
                    {
                        iValueStartOffset=iCurPos-1;
                        goto AttributeValueEnd;
                    }
                }

                cQuotes=cPeek;
            }

            // because we deal with VALUE of the attribute it means we can't lower-case it,
            // or skip whitespace (if in quotes), which in practice means that we don't need to copy
            // it to temporary string buffer, we can just remember starting offset and then create string from
            // data in bHTML

            // ok, first char can be one of the quote chars or something else
            if(cPeek!='\"' && cPeek!='\'')
            {
                iValueStartOffset=iCurPos;

                cQuotes=(byte)' ';
                // any other char here means we have value up until next whitespace or end of tag
                // this gives us good opportunity to scan fairly quickly without otherwise redundant
                // checks - this should happen fairly rarely, however loop dealing with data between quotes
                // will happen often enough and its best to eliminate as much stuff from it as possible
                //sText.bBuffer[sText.iBufPos++]=cPeek;

                // move to next char
                if(iCurPos<iDataLength)
                    cPeek=bHTML[iCurPos++];
                else
                {
                    goto AttributeValueEnd;
                }

                while(cPeek!=0)
                {
                    // if whitespace then we got our value and need to go back to param
                    if(cPeek<=32 && bTagCharTypes[cPeek]==(byte)TagCharType.WhiteSpace)
                    {
                        oChunk.AddParam(sAttrName,GetString(iValueStartOffset,iCurPos-iValueStartOffset-1),(byte)' ');
                        iCurPos--;
                        goto AttributeParsing;
                    }

                    // end of tag?
                    if(cPeek==(byte)'>')
                    {
                        //iCurPos--;
                        break;
                    }

                    if(iCurPos<iDataLength)
                        cPeek=bHTML[iCurPos++];
                    else
                    {
                        iCurPos=iDataLength+1;
                        goto AttributeValueEnd;
                    }
                }

                // ok we are done, add outstanding attribute
                oChunk.AddParam(sAttrName,GetString(iValueStartOffset,iCurPos-iValueStartOffset-1),(byte)' ');

                goto ReturnChunk;
            }

            // move one step forward
            iCurPos++;

            iValueStartOffset=iCurPos;

            if(iCurPos<iDataLength)
                cPeek=bHTML[iCurPos++];
            else
            {

                goto AttributeValueEnd;
            }

            // attribute value parsing from between two quotes
            while(cPeek!=0)
            {
                // check whether we have got possible entity (can be anything starting with &)
                if(cPeek==38)
                {
                    int iPrevPos=iCurPos;

                    char cEntityChar=oE.CheckForEntity(bHTML,ref iCurPos,iDataLength);

                    // restore current symbol
                    if(cEntityChar==0)
                    {
                        if(iCurPos<iDataLength)
                            cPeek=bHTML[iCurPos++];
                        else
                            break;

                        //sText.bBuffer[sText.iBufPos++]=38; //(byte)'&';;
                        continue;
                    }
                    else
                    {
                        // okay we have got an entity, our hope of not having to copy stuff into variable
                        // is over, we have to continue in a slower fashion :(
                        // but thankfully this should happen very rarely, so, annoying to code, but
                        // most codepaths will run very fast!
                        int iPreEntLen=iPrevPos-iValueStartOffset-1;

                        // 14/05/08 need to clear text - it contains attribute name text
                        sText.Clear();

                        // copy previous data
                        if(iPreEntLen>0)
                        {
                            Array.Copy(bHTML,iValueStartOffset,sText.bBuffer,0,iPreEntLen);
                            sText.iBufPos=iPreEntLen;
                        }

                        // we have to skip now to next byte, since
                        // some converted chars might well be control chars like >
                        oChunk.bEntities=true;

                        if(cChar==(byte)'<')
                            oChunk.bLtEntity=true;

                        // unless is space we will ignore it
                        // note that this won't work if &nbsp; is defined as it should
                        // byte int value of 160, rather than 32.
                        //if(cChar!=' ')
                        sText.Append(cEntityChar);

                        if(iCurPos<iDataLength)
                            cPeek=bHTML[iCurPos++];
                        else
                        {

                            goto AttributeValueEnd;
                        }

                        // okay, we continue here using in effect new inside loop as we might have more entities here
                        // attribute value parsing from between two quotes
                        while(cPeek!=0)
                        {
                            // check whether we have got possible entity (can be anything starting with &)
                            if(cPeek==38)
                            {
                                char cNewEntityChar=oE.CheckForEntity(bHTML,ref iCurPos,iDataLength);

                                // restore current symbol
                                if(cNewEntityChar!=0)
                                {
                                    if(cNewEntityChar==(byte)'<')
                                        oChunk.bLtEntity=true;

                                    sText.Append(cNewEntityChar);

                                    if(iCurPos<iDataLength)
                                        cPeek=bHTML[iCurPos++];
                                    else
                                        goto AttributeValueEnd;

                                    continue;
                                }
                            }

                            // check if is end of quotes
                            if(cPeek==cQuotes)
                            {
                                // ok we finished scanning it: add param with value and then go back to param name parsing
                                oChunk.AddParam(sAttrName,sText.SetToString(),cQuotes);

                                if(iCurPos<iDataLength)
                                    cPeek=bHTML[iCurPos];
                                else
                                    break;

                                goto AttributeParsing;
                            }

                            sText.bBuffer[sText.iBufPos++]=cPeek;
                            //sText.Append(cPeek);

                            if(iCurPos<iDataLength)
                                cPeek=bHTML[iCurPos++];
                            else
                                break;
                        }

                        oChunk.AddParam(sAttrName,sText.SetToString(),cQuotes);
                        goto ReturnChunk;
                    }
                }

                // check if is end of quotes
                if(cPeek==cQuotes)
                {
                    // ok we finished scanning it: add param with value and then go back to param name parsing
                    //sText.Clear();

                    oChunk.AddParam(sAttrName,GetString(iValueStartOffset,iCurPos-iValueStartOffset-1),cQuotes);

                    if(iCurPos<iDataLength)
                        cPeek=bHTML[iCurPos];
                    else
                    {
                        //iCurPos++;
                        break;
                    }

                    goto AttributeParsing;
                }

                if(iCurPos<iDataLength)
                    cPeek=bHTML[iCurPos++];
                else
                {
                    //iCurPos++;
                    break;
                }
            }

            AttributeValueEnd:

            // ok we are done, add outstanding attribute
            int iLen=iCurPos-iValueStartOffset-1;
            if(iLen>0)
                oChunk.AddParam(sAttrName,GetString(iValueStartOffset,iLen),cQuotes);
            else
                oChunk.AddParam(sAttrName,"",cQuotes);

            ReturnChunk:

            if(oChunk.bClosure)
            {
                oChunk.oType=HTMLchunkType.CloseTag;
            }
            else
                oChunk.oType=HTMLchunkType.OpenTag;

            return oChunk;
        }
예제 #4
0
        /// <summary>
        /// Initialises the tag parser: stores the supplied objects and input buffer in this instance's
        /// fields and derives the position limit used for heuristics.
        /// </summary>
        /// <param name="p_oP">Parser object, kept in oP</param>
        /// <param name="p_oChunk">Chunk object, kept in oChunk</param>
        /// <param name="p_sText">Text buffer, kept in sText</param>
        /// <param name="p_bHTML">Byte array with the HTML data, kept in bHTML</param>
        /// <param name="p_iDataLength">Length of the data, kept in iDataLength and used to compute iMaxHeuDataLength</param>
        /// <param name="p_oE">Entities object, kept in oE</param>
        /// <param name="p_oHE">Heuristics object, kept in oHE</param>
        internal void Init(HTMLparser p_oP,HTMLchunk p_oChunk,DynaString p_sText,byte[] p_bHTML,int p_iDataLength,HTMLentities p_oE,HTMLheuristics p_oHE)
        {
            // helper objects
            oP = p_oP;
            oE = p_oE;
            oHE = p_oHE;

            // chunk and text buffer
            oChunk = p_oChunk;
            sText = p_sText;

            // input data
            bHTML = p_bHTML;
            iDataLength = p_iDataLength;

            // heuristics should not be applied too close to the end of the data,
            // so their limit sits MIN_DATA_SIZE_FOR_HEURISTICS before the data length
            iMaxHeuDataLength = iDataLength - MIN_DATA_SIZE_FOR_HEURISTICS;
        }
예제 #5
0
        /// <summary>
        /// Clears the references held by this object. Only the first call has any effect;
        /// the referenced objects themselves are not disposed here.
        /// </summary>
        /// <param name="bDisposing">Not used by this method</param>
        private void Dispose(bool bDisposing)
        {
            // already disposed: nothing left to release
            if (bDisposed)
            {
                return;
            }

            bDisposed = true;

            // let go of everything we were pointing at
            oP = null;
            oE = null;
            sText = null;
            oChunk = null;
            bHTML = null;
        }
예제 #6
0
 /// <summary>
 /// Decodes the bytes that the given chunk covers in the parsed data and stores the resulting
 /// string in the chunk's oHTML variable.
 /// </summary>
 /// <param name="oChunk">Chunk returned by ParseNext function, it must belong to the same HTMLparser that
 /// was initiated with the same HTML data that this chunk belongs to</param>
 public void SetRawHTML(HTMLchunk oChunk)
 {
     // note: assigning the raw byte array instead of a decoded string would stay
     // closer to the original data
     int iOffset = oChunk.iChunkOffset;
     int iLength = oChunk.iChunkLength;

     oChunk.oHTML = oEnc.GetString(bHTML, iOffset, iLength);
 }
예제 #7
0
        /// <summary>
        /// Handles META tags that set page encoding
        /// </summary>
        /// <param name="oP">HTML parser object that is used for parsing; its SetEncoding method is called
        /// with the value of the CONTENT attribute</param>
        /// <param name="oChunk">Parsed chunk that should contain tag META; its params are converted to hash
        /// mode here if they are not in that mode already</param>
        /// <param name="bEncodingSet">Your own flag that shows whether encoding was already set or not, if set
        /// once then it should not be changed - this is the logic applied by major browsers. It is set to
        /// true here when SetEncoding reports success.</param>
        /// <returns>True if this was a META tag whose HTTP-EQUIV is "content-type" or "content-category"
        /// (even when the encoding itself was not changed), false otherwise</returns>
        public static bool HandleMetaEncoding(HTMLparser oP, HTMLchunk oChunk, ref bool bEncodingSet)
        {
            // cheap length and first-char checks come first so most tags are rejected
            // without a full string comparison
            if (oChunk.sTag.Length != 4 || oChunk.sTag[0] != 'm' || oChunk.sTag != "meta")
            {
                return(false);
            }

            // if we do not use hashmode already then we call conversion explicitly
            // this is slow, but METAs are very rare so performance penalty is low
            if (!oChunk.bHashMode)
            {
                oChunk.ConvertParamsToHash();
            }

            string sKey = oChunk.oParams["http-equiv"] as string;

            if (sKey == null)
            {
                return(false);
            }

            // ordinal case-insensitive comparison: no lower-cased copy of the value is allocated
            // and the result does not depend on the current culture.
            // "content-category" is a rare case (appears to work in IE) reported to exist
            // in some pages by Martin Bächtold
            bool bContentMeta =
                string.Equals(sKey, "content-type", System.StringComparison.OrdinalIgnoreCase) ||
                string.Equals(sKey, "content-category", System.StringComparison.OrdinalIgnoreCase);

            if (!bContentMeta)
            {
                return(false);
            }

            // we might have charset here that may hint at necessity to decode page.
            // once encoding is set it should not be changed, but you can be damn
            // sure there are web pages out there that do that!!!
            if (!bEncodingSet)
            {
                string sData = oChunk.oParams["content"] as string;

                // it is possible we have broken META tag without Content part;
                // when SetEncoding fails (most likely encoding string was incorrect or the machine
                // lacks codepages) the flag is left untouched - might be good idea to put
                // warning message for that case
                if (sData != null && oP.SetEncoding(sData))
                {
                    // here you need to reencode any text that you found so far
                    // most likely it will be just TITLE, the rest can be ignored anyway
                    bEncodingSet = true;
                }
            }

            return(true);
        }
예제 #8
0
        /// <summary>
        /// Handles META tags that set page encoding
        /// </summary>
        /// <param name="oP">HTML parser object that is used for parsing; its SetEncoding method is called
        /// with the value of the CONTENT attribute</param>
        /// <param name="oChunk">Parsed chunk that should contain tag META; its params are converted to hash
        /// mode here if they are not in that mode already</param>
        /// <param name="bEncodingSet">Your own flag that shows whether encoding was already set or not, if set
        /// once then it should not be changed - this is the logic applied by major browsers. It is set to
        /// true here when SetEncoding reports success.</param>
        /// <returns>True if this was a META tag whose HTTP-EQUIV is "content-type" or "content-category"
        /// (even when the encoding itself was not changed), false otherwise</returns>
        public static bool HandleMetaEncoding(HTMLparser oP,HTMLchunk oChunk,ref bool bEncodingSet)
        {
            // length and first char are checked before the full comparison to reject most tags cheaply
            if(oChunk.sTag.Length!=4 || oChunk.sTag[0]!='m' || oChunk.sTag!="meta")
                return false;

            // if we do not use hashmode already then we call conversion explicitly
            // this is slow, but METAs are very rare so performance penalty is low
            if(!oChunk.bHashMode)
                oChunk.ConvertParamsToHash();

            string sKey=oChunk.oParams["http-equiv"] as string;

            if(sKey==null)
                return false;

            // compare ignoring case without lower-casing: avoids allocating a temporary string
            // and keeps the match independent of the current culture
            bool bIsContentType=string.Equals(sKey,"content-type",System.StringComparison.OrdinalIgnoreCase);

            // rare case (appears to work in IE) reported to exist in some pages by Martin Bächtold
            bool bIsContentCategory=string.Equals(sKey,"content-category",System.StringComparison.OrdinalIgnoreCase);

            if(!bIsContentType && !bIsContentCategory)
                return false;

            // we might have charset here that may hint at necessity to decode page
            // check for possible encoding change

            // once encoding is set it should not be changed, but you can be damn
            // sure there are web pages out there that do that!!!
            if(!bEncodingSet)
            {
                string sData=oChunk.oParams["content"] as string;

                // it is possible we have broken META tag without Content part
                if(sData!=null)
                {
                    // a failed SetEncoding (most likely the encoding string was incorrect or the
                    // machine lacks codepages) leaves the flag as it was - might be good idea to
                    // put warning message for that case
                    if(oP.SetEncoding(sData))
                    {
                        // here you need to reencode any text that you found so far
                        // most likely it will be just TITLE, the rest can be ignored anyway
                        bEncodingSet=true;
                    }
                }
            }

            return true;
        }
예제 #9
0
        /// <summary>
        /// Parses the tag at the current position (the point where '&lt;' was found) with the tag parser
        /// and returns the resulting chunk. A non-closing "script" tag is handed over to the script parser.
        /// </summary>
        /// <returns>Chunk</returns>
        HTMLchunk GetNextTag()
        {
            oChunk = oTP.ParseTag(ref iCurPos);

            // for backwards compatibility mark closed tags with params as open
            bool bClosedTagWithParams =
                oChunk.iParams > 0 &&
                bAutoMarkClosedTagsWithParamsAsOpen &&
                oChunk.oType == HTMLchunkType.CloseTag;

            if (bClosedTagWithParams)
            {
                oChunk.oType = HTMLchunkType.OpenTag;
            }

            // start of script: length and first char are tested before the full string comparison
            bool bScriptStart =
                oChunk.sTag.Length == 6 &&
                oChunk.sTag[0] == 's' &&
                oChunk.sTag == "script" &&
                !oChunk.bClosure;

            if (bScriptStart)
            {
                oChunk.oType = HTMLchunkType.Script;
                oChunk = oTP.ParseScript(ref iCurPos);

                return oChunk;
            }

            // chunk length is the distance from the chunk's offset to the current position
            oChunk.iChunkLength = iCurPos - oChunk.iChunkOffset;

            // keep decoded raw HTML of the chunk only if asked to
            if (bKeepRawHTML)
            {
                oChunk.oHTML = oEnc.GetString(bHTML, oChunk.iChunkOffset, oChunk.iChunkLength);
            }

            return oChunk;
        }
예제 #10
0
        /// <summary>
        /// Calls Dispose on the oChunk, sText, oE and oTP objects held by this instance (those that are
        /// not null) and clears the references to them and to the HTML data. Only the first call does any work.
        /// </summary>
        /// <param name="bDisposing">Not used by this method</param>
        private void Dispose(bool bDisposing)
        {
            // second and later calls are no-ops
            if (bDisposed)
            {
                return;
            }

            bDisposed = true;

            if (oChunk != null)
            {
                oChunk.Dispose();
                oChunk = null;
            }

            if (sText != null)
            {
                sText.Dispose();
                sText = null;
            }

            // the byte array has nothing to dispose, the reference is simply dropped
            bHTML = null;

            if (oE != null)
            {
                oE.Dispose();
                oE = null;
            }

            if (oTP != null)
            {
                oTP.Dispose();
                oTP = null;
            }
        }
예제 #11
0
 /// <summary>
 /// Fills the oHTML variable of a chunk with the raw HTML that was parsed for it, decoded
 /// from the parser's data using the chunk's offset and length.
 /// </summary>
 /// <param name="oChunk">Chunk returned by ParseNext function, it must belong to the same HTMLparser that
 /// was initiated with the same HTML data that this chunk belongs to</param>
 public void SetRawHTML(HTMLchunk oChunk)
 {
     // note: handing out the original bytes rather than a decoded string
     // would be more faithful to the source data
     string sRawHTML=oEnc.GetString(bHTML,oChunk.iChunkOffset,oChunk.iChunkLength);

     oChunk.oHTML=sRawHTML;
 }