Example #1
0
        /// <summary>
        /// Asks the tag parser to parse a tag starting from the point where the left angle bracket
        /// was found and returns the resulting chunk. A non-closing "script" tag is handed over to
        /// the tag parser's script routine instead, and that chunk is returned as-is.
        /// </summary>
        /// <returns>Chunk produced by the tag parser (or by its script parsing routine)</returns>
        internal HTMLchunk GetNextTag()
        {
            oChunk = oTP.ParseTag(ref iCurPos);

            // backwards compatibility: a closed tag that carries params is reported as an open tag
            bool bCloseTagWithParams = oChunk.iParams > 0
                                       && bAutoMarkClosedTagsWithParamsAsOpen
                                       && oChunk.oType == HTMLchunkType.CloseTag;

            if (bCloseTagWithParams)
            {
                oChunk.oType = HTMLchunkType.OpenTag;
            }

            // start of script? cheap length and first-char checks run before the full string compare;
            // only a non-closing script tag switches us over to script parsing
            if (oChunk.sTag.Length == 6 && oChunk.sTag[0] == 's' && oChunk.sTag == "script" && !oChunk.bClosure)
            {
                oChunk.oType = HTMLchunkType.Script;
                oChunk       = oTP.ParseScript(ref iCurPos);

                // the script chunk is returned without the length / raw HTML handling below
                return oChunk;
            }

            // chunk spans from its starting offset up to the position the parser stopped at
            oChunk.iChunkLength = iCurPos - oChunk.iChunkOffset;

            if (bKeepRawHTML)
            {
                // decode the raw bytes of this chunk using the current encoding
                oChunk.oHTML = oEnc.GetString(bHTML, oChunk.iChunkOffset, oChunk.iChunkLength);
            }

            return oChunk;
        }
Example #2
0
        /// <summary>
        /// Disposes the helper objects held by this instance and drops all references to them.
        /// Only the first call does any work; later calls return immediately.
        /// </summary>
        /// <param name="bDisposing">Not consulted by this implementation</param>
        private void Dispose(bool bDisposing)
        {
            // nothing left to do once we have been through here
            if (bDisposed)
            {
                return;
            }

            bDisposed = true;

            if (oChunk != null)
            {
                oChunk.Dispose();
                oChunk = null;
            }

            if (sText != null)
            {
                sText.Dispose();
                sText = null;
            }

            // raw HTML buffer is only a reference - just let go of it
            bHTML = null;

            if (oE != null)
            {
                oE.Dispose();
                oE = null;
            }

            if (oTP != null)
            {
                oTP.Dispose();
                oTP = null;
            }
        }
Example #3
0
        /// <summary>
        /// Drops the references held by this instance; nothing is disposed here, the fields are
        /// only set to null. Only the first call does any work.
        /// </summary>
        /// <param name="bDisposing">Not consulted by this implementation</param>
        private void Dispose(bool bDisposing)
        {
            // already released
            if (bDisposed)
            {
                return;
            }

            bDisposed = true;

            // let go of everything we were handed
            bHTML  = null;
            oChunk = null;
            sText  = null;
            oE     = null;
            oP     = null;
        }
Example #4
0
        /// <summary>
        /// Inits tag parser: stores the supplied objects and data length in the corresponding
        /// fields and derives the data length limit used for heuristics
        /// </summary>
        /// <param name="p_oP">Parser object, stored in oP</param>
        /// <param name="p_oChunk">Chunk object, stored in oChunk</param>
        /// <param name="p_sText">Text buffer, stored in sText</param>
        /// <param name="p_bHTML">HTML data bytes, stored in bHTML</param>
        /// <param name="p_iDataLength">Length of data, stored in iDataLength</param>
        /// <param name="p_oE">Entities object, stored in oE</param>
        /// <param name="p_oHE">Heuristics object, stored in oHE</param>
        internal void Init(HtmlParser p_oP, HTMLchunk p_oChunk, DynaString p_sText, byte[] p_bHTML, int p_iDataLength, HTMLentities p_oE, HTMLheuristics p_oHE)
        {
            // collaborating objects
            this.oP     = p_oP;
            this.oChunk = p_oChunk;
            this.sText  = p_sText;

            // data to be parsed
            this.bHTML       = p_bHTML;
            this.iDataLength = p_iDataLength;

            // heuristics must stay clear of the end of data, so their limit
            // is the data length reduced by MIN_DATA_SIZE_FOR_HEURISTICS
            this.iMaxHeuDataLength = this.iDataLength - MIN_DATA_SIZE_FOR_HEURISTICS;

            // entity and heuristics helpers
            this.oE  = p_oE;
            this.oHE = p_oHE;
        }
Example #5
0
        /// <summary>
        /// Internal: parses tag that started from current position
        /// </summary>
        /// <returns>HTMLchunk with tag information</returns>
        internal HTMLchunk ParseTag(ref int iCurPos)
        {
            byte cChar = 0;
            byte cPeek = 0;



            // initialise peeked char - this will point to the next after < character
            if (iCurPos < iDataLength)
            {
                cPeek = bHTML[iCurPos];

                // in case of comments ! must follow immediately after <
                if (cPeek == (byte)'!')
                {
                    if (iCurPos + 2 < iDataLength &&
                        bHTML[iCurPos + 1] == (byte)'-' && bHTML[iCurPos + 2] == (byte)'-')
                    {
                        // we detected start of comments here, instead of parsing the rest here we will
                        // call special function tuned to do the job much more effectively
                        oChunk.sTag      = "!--";
                        oChunk.oType     = HTMLchunkType.Comment;
                        oChunk.bComments = true;

                        iCurPos += 3;
                        bool bFullTag;
                        oChunk = ParseComments(ref iCurPos, out bFullTag);

                        oChunk.iChunkLength = iCurPos - oChunk.iChunkOffset;

                        if (oP.bAutoKeepComments || oP.bKeepRawHTML)
                        {
                            if (!oP.bAutoExtractBetweenTagsOnly)
                            {
                                oChunk.oHTML = GetString(oChunk.iChunkOffset, oChunk.iChunkLength);
                            }
                            else
                            {
                                oChunk.oHTML = GetString(oChunk.iChunkOffset + 4, oChunk.iChunkLength - (bFullTag ? 7 : 4));
                            }
                        }

                        return(oChunk);
                    }
                }
            }
            else
            {
                // empty tag but its not closed, so we will call it open...
                oChunk.oType = HTMLchunkType.OpenTag;
                // end of data... before it started
                return(oChunk);
            }

            // tag ID, non-zero if matched by heuristics engine
            int iTagID = 0;

            // STAGE 0: lets try some heuristics to see if we can quickly identify most common tags
            // that should be present most of the time, this should save a lot of looping and string creation
            if (bEnableHeuristics && iCurPos < iMaxHeuDataLength)
            {
                // check if we have got closure of the tag
                if (cPeek == (byte)'/')
                {
                    oChunk.bClosure    = true;
                    oChunk.bEndClosure = false;
                    oChunk.oType       = HTMLchunkType.CloseTag;
                    iCurPos++;
                    cPeek = bHTML[iCurPos];
                }

                cChar = bHTML[iCurPos + 1];

                // probability of having a match is very high (or so we expect)
                iTagID = oHE.MatchTag(cPeek, cChar);

                if (iTagID != 0)
                {
                    if (iTagID < 0)
                    {
                        iTagID *= -1;
                        // single character tag
                        oChunk.sTag = oHE.GetString(iTagID);

                        // see if we got fully closed tag
                        if (cChar == (byte)'>')
                        {
                            iCurPos += 2;
                            goto ReturnChunk;
                        }

                        cPeek = cChar;
                        iCurPos++;

                        // everything else means we need to continue scanning as we may have params and stuff
                        goto AttributeParsing;
                    }
                    else
                    {
                        // ok, we have here 2 or more character string that we need to check further
                        // often when we have full 2 char match the next char will be >, if that's the case
                        // then we definately matched our tag
                        byte cNextChar = bHTML[iCurPos + 2];

                        if (cNextChar == (byte)'>')
                        {
                            //oChunk.sTag=oHE.GetString(iTagID);
                            oChunk.sTag = oHE.GetTwoCharString(cPeek, cChar);
                            iCurPos    += 3;

                            goto ReturnChunk;
                        }

                        // ok, check next char for space, if that's the case we still got our tag
                        // but need to skip to attribute parsing
                        if (cNextChar == (byte)' ')
                        {
                            //oChunk.sTag=oHE.GetString(iTagID);
                            oChunk.sTag = oHE.GetTwoCharString(cPeek, cChar);
                            iCurPos    += 2;

                            cPeek = cNextChar;


                            goto AttributeParsing;
                        }

                        // ok, we are not very lucky, but it is still worth fighting for
                        // now we need to check fully long string against what we have matched, maybe
                        // we got exact match and we can avoid full parsing of the tag
                        byte[] bTag = oHE.GetStringData(iTagID);

                        if (iCurPos + bTag.Length + 5 >= iDataLength)
                        {
                            goto TagParsing;
                        }

                        // in a loop (and this is not an ideal solution, but still)
                        for (int i = 2; i < bTag.Length; i++)
                        {
                            // if a single char is not matched, then we
                            if (bTag[i] != bHTML[iCurPos + i])
                            {
                                goto TagParsing;
                            }
                        }

                        // ok we matched full long word, but we need to be sure that char
                        // after the word is ' ' or '>' as otherwise we may have matched prefix of even longer
                        // word
                        cNextChar = bHTML[iCurPos + bTag.Length];

                        if (cNextChar == (byte)'>')
                        {
                            oChunk.sTag = oHE.GetString(iTagID);
                            iCurPos    += bTag.Length + 1;

                            goto ReturnChunk;
                        }

                        if (cNextChar == (byte)' ')
                        {
                            cPeek       = cNextChar;
                            oChunk.sTag = oHE.GetString(iTagID);
                            iCurPos    += bTag.Length;

                            goto AttributeParsing;
                        }

                        // no luck: we need to parse tag fully as our heuristical matching failed miserably :'o(
                    }
                }
            }

TagParsing:

            sText.Clear();

            byte bCharType = 0;

            // STAGE 1: parse tag (anything until > or /> or whitespace leading to start of attribute)
            while (cPeek != 0)
            {
                bCharType = bTagCharTypes[cPeek];

                //if(cPeek<=32 && bWhiteSpace[cPeek]==1)
                if (bCharType == (byte)TagCharType.WhiteSpace)
                {
                    iCurPos++;

                    // speculative loop unroll -- we have a very good chance of seeing non-space char next
                    // so instead of setting up loop we will just read it directly, this should save ticks
                    // on having to prepare while() loop
                    if (iCurPos < iDataLength)
                    {
                        cChar = bHTML[iCurPos++];
                    }
                    else
                    {
                        cChar = 0;
                    }

                    bCharType = bTagCharTypes[cChar];

                    //if(cChar==' ' || cChar=='\t' || cChar==13 || cChar==10)
                    //if(cChar<=32 && bWhiteSpace[cChar]==1)
                    if (bCharType == (byte)TagCharType.WhiteSpace)
                    {
                        while (iCurPos < iDataLength)
                        {
                            cChar = bHTML[iCurPos++];

                            bCharType = bTagCharTypes[cChar];
                            if (bCharType == (byte)TagCharType.WhiteSpace)
                            //if(cChar!=' ' && cChar!='\t' && cChar!=13 && cChar!=10)
                            {
                                //cPeek=bHTML[iCurPos];
                                continue;
                            }

                            break;
                        }

                        if (iCurPos >= iDataLength)
                        {
                            cChar = 0;
                        }
                    }

                    //bWhiteSpaceHere=true;

                    // now, if we have already got tag it means that we are most likely
                    // going to need to parse tag attributes
                    if (sText.iBufPos > 0)
                    {
                        oChunk.sTag = sText.SetToStringASCII();

                        // oChunk.Append((byte)' ');

                        iCurPos--;

                        if (iCurPos < iDataLength)
                        {
                            cPeek = bHTML[iCurPos];
                        }
                        else
                        {
                            cPeek = 0;
                        }

                        break;
                    }
                }
                else
                {
                    // reuse Peeked char from previous run
                    //cChar=cPeek; iCurPos++;
                    if (iCurPos < iDataLength)
                    {
                        cChar = bHTML[iCurPos++];
                    }
                    else
                    {
                        cChar = 0;
                    }
                }

                if (iCurPos < iDataLength)
                {
                    cPeek = bHTML[iCurPos];
                }
                else
                {
                    cPeek = 0;
                }

                // most likely we should have lower-cased ASCII char
                if (bCharType == (byte)TagCharType.LowerCasedASCIIorDigit)
                {
                    sText.bBuffer[sText.iBufPos++] = cChar;
                    // oChunk.Append(cChar);
                    continue;
                }

                // tag end - we did not have any params
                if (cChar == (byte)'>')
                {
                    if (sText.iBufPos > 0)
                    {
                        oChunk.sTag = sText.SetToStringASCII();
                    }

                    if (!oChunk.bClosure)
                    {
                        oChunk.oType = HTMLchunkType.OpenTag;
                    }

                    return(oChunk);
                }

                // closure of tag sign
                if (cChar == (byte)'/')
                {
                    oChunk.bClosure    = true;
                    oChunk.bEndClosure = (sText.iBufPos > 0);
                    oChunk.oType       = HTMLchunkType.CloseTag;
                    continue;
                }

                // nope, we have got upper cased ASCII char	- this seems to be LESS likely than > and /
                //if(cChar>=65 && cChar<=90)
                if (bCharType > 32)
                {
                    // bCharType in this case contains already lower-cased char
                    sText.bBuffer[sText.iBufPos++] = bCharType;
                    // oChunk.Append(bCharType);
                    continue;
                }

                // ok, we have got some other char - we break out to deal with it in attributes part
                break;
            }

            if (cPeek == 0)
            {
                return(oChunk);
            }

            // if true then equal sign was found
            //bool bEqualsSign=false;

            // STAGE 2: parse attributes (if any available)
            // attribute name can be standalone or with value after =
            // attribute itself can't have entities or anything like this - we expect it to be in ASCII characters

AttributeParsing:

            string sAttrName;

            if (iTagID != 0)
            {
                // first, skip whitespace:
                if (cPeek <= 32 && bTagCharTypes[cPeek] == (byte)TagCharType.WhiteSpace)
                {
                    // most likely next char is not-whitespace
                    iCurPos++;

                    if (iCurPos >= iDataLength)
                    {
                        goto ReturnChunk;
                    }

                    cPeek = bHTML[iCurPos];

                    if (cPeek <= 32 && bTagCharTypes[cPeek] == (byte)TagCharType.WhiteSpace)
                    {
                        // ok long loop here then
                        while (iCurPos < iDataLength)
                        {
                            cPeek = bHTML[iCurPos++];

                            if (cPeek <= 32 && bTagCharTypes[cPeek] == (byte)TagCharType.WhiteSpace)
                            {
                                continue;
                            }

                            break;
                        }

                        if (cPeek == (byte)'>')
                        {
                            goto ReturnChunk;
                        }

                        iCurPos--;

                        if (iCurPos >= iDataLength)
                        {
                            goto ReturnChunk;
                        }
                    }

                    if (iCurPos >= iDataLength)
                    {
                        goto ReturnChunk;
                    }
                }

                // ok we have got matched tag, it is possible that we might be able to quickly match
                // attribute name known to be used for that tag:
                int iAttrID = oHE.MatchAttr(cPeek, iTagID);

                if (iAttrID > 0)
                {
                    byte[] bAttr = oHE.GetAttrData(iAttrID);

                    if (iCurPos + bAttr.Length + 2 >= iDataLength)
                    {
                        goto ActualAttributeParsing;
                    }

                    // in a loop (and this is not an ideal solution, but still)
                    for (int i = 1; i < bAttr.Length; i++)
                    {
                        // if a single char is not matched, then we
                        if (bAttr[i] != bHTML[iCurPos + i])
                        {
                            goto ActualAttributeParsing;
                        }
                    }

                    byte cNextChar = bHTML[iCurPos + bAttr.Length];

                    // ok, we expect next symbol to be =
                    if (cNextChar == (byte)'=')
                    {
                        sAttrName = oHE.GetAttr(iAttrID);
                        iCurPos  += bAttr.Length + 1;
                        cPeek     = bHTML[iCurPos];

                        goto AttributeValueParsing;
                    }
                }
            }

ActualAttributeParsing:

            sText.Clear();

            // doing exactly the same thing as in tag parsing
            while (cPeek != 0)
            {
                bCharType = bTagCharTypes[cPeek];

                //if(cPeek<=32 && bWhiteSpace[cPeek]==1)
                if (bCharType == (byte)TagCharType.WhiteSpace)
                {
                    iCurPos++;

                    // speculative loop unroll -- we have a very good chance of seeing non-space char next
                    // so instead of setting up loop we will just read it directly, this should save ticks
                    // on having to prepare while() loop
                    if (iCurPos < iDataLength)
                    {
                        cChar = bHTML[iCurPos++];
                    }
                    else
                    {
                        cPeek = 0;
                        break;
                    }

                    bCharType = bTagCharTypes[cChar];

                    //if(cChar==' ' || cChar=='\t' || cChar==13 || cChar==10)
                    //if(cChar<=32 && bWhiteSpace[cChar]==1)
                    if (bCharType == (byte)TagCharType.WhiteSpace)
                    {
                        while (iCurPos < iDataLength)
                        {
                            cChar = bHTML[iCurPos++];

                            bCharType = bTagCharTypes[cChar];
                            if (bCharType == (byte)TagCharType.WhiteSpace)
                            //if(cChar!=' ' && cChar!='\t' && cChar!=13 && cChar!=10)
                            {
                                //cPeek=bHTML[iCurPos];
                                continue;
                            }

                            //if(cChar==(byte)'>')
                            //	goto ReturnChunk;

                            //iCurPos--;
                            break;
                        }

                        if (iCurPos >= iDataLength)
                        {
                            cChar = 0;
                            cPeek = 0;
                            break;
                        }
                    }

                    //bWhiteSpaceHere=true;

                    // now, if we have already got attribute name it means that we need to go to parse value (which may not be present)
                    if (sText.iBufPos > 0)
                    {
                        // oChunk.Append((byte)' ');

                        iCurPos--;

                        if (iCurPos < iDataLength)
                        {
                            cPeek = bHTML[iCurPos];
                        }
                        else
                        {
                            cPeek = 0;
                        }

                        // ok, we have got attribute name and now we have got next char there

                        // most likely we have got = here  and then value
                        if (cPeek == (byte)'=')
                        {
                            //bEqualsSign=true;

                            // move forward one char
                            iCurPos++;

                            if (iCurPos < iDataLength)
                            {
                                cPeek = bHTML[iCurPos];
                            }
                            else
                            {
                                cPeek = 0;
                            }

                            break;
                        }

                        // or we can have end of tag itself, doh!
                        if (cPeek == (byte)'>')
                        {
                            // move forward one char
                            iCurPos++;

                            if (sText.iBufPos > 0)
                            {
                                oChunk.AddParam(sText.SetToStringASCII(), "", (byte)' ');
                            }

                            if (!oChunk.bClosure)
                            {
                                oChunk.oType = HTMLchunkType.OpenTag;
                            }

                            return(oChunk);
                        }

                        // closure
                        if (cPeek == (byte)'/')
                        {
                            oChunk.bClosure    = true;
                            oChunk.bEndClosure = true;
                            oChunk.oType       = HTMLchunkType.CloseTag;
                            continue;
                        }

                        // ok, we have got new char starting after current attribute name is fully parsed
                        // this means the attribute name is on its own and the char we found is start
                        // of a new attribute
                        oChunk.AddParam(sText.SetToStringASCII(), "", (byte)' ');
                        sText.Clear();
                        goto AttributeParsing;
                    }
                }
                else
                {
                    // reuse Peeked char from previous run
                    //cChar=cPeek; iCurPos++;
                    if (iCurPos < iDataLength)
                    {
                        cChar = bHTML[iCurPos++];
                    }
                    else
                    {
                        cChar = 0;
                    }
                }

                if (iCurPos < iDataLength)
                {
                    cPeek = bHTML[iCurPos];
                }
                else
                {
                    cPeek = 0;
                }

                // most likely we should have lower-cased ASCII char here
                if (bCharType == (byte)TagCharType.LowerCasedASCIIorDigit)
                {
                    sText.bBuffer[sText.iBufPos++] = cChar;
                    // oChunk.Append(cChar);
                    continue;
                }

                // = with attribute value to follow
                if (cChar == (byte)'=')
                {
                    //bEqualsSign=true;
                    break;
                }

                // nope, we have got upper cased ASCII char	- this seems to be LESS likely than > and /
                //if(cChar>=65 && cChar<=90)
                if (bCharType > 32)
                {
                    // bCharType in this case contains already lower-cased char
                    sText.bBuffer[sText.iBufPos++] = bCharType;
                    // oChunk.Append(bCharType);
                    continue;
                }

                // tag end - we did not have any params
                if (cChar == (byte)'>')
                {
                    if (sText.iBufPos > 0)
                    {
                        oChunk.AddParam(sText.SetToStringASCII(), "", (byte)' ');
                    }

                    if (!oChunk.bClosure)
                    {
                        oChunk.oType = HTMLchunkType.OpenTag;
                    }

                    return(oChunk);
                }

                // closure of tag sign
                if (cChar == (byte)'/')
                {
                    oChunk.bClosure    = true;
                    oChunk.bEndClosure = true;
                    oChunk.oType       = HTMLchunkType.CloseTag;
                    continue;
                }

                // some other char
                sText.bBuffer[sText.iBufPos++] = cChar;
                // oChunk.Append(cChar);
            }

            if (cPeek == 0)
            {
                if (sText.iBufPos > 0)
                {
                    oChunk.AddParam(sText.SetToStringASCII(), "", (byte)' ');
                }

                if (!oChunk.bClosure)
                {
                    oChunk.oType = HTMLchunkType.OpenTag;
                }

                return(oChunk);
            }

            sAttrName = sText.SetToStringASCII();

AttributeValueParsing:

            /// ***********************************************************************
            /// STAGE 3: parse attribute value
            /// ***********************************************************************

            // the value could be just string, or in quotes (single or double)
            // or we can have next attribute name start, in which case we will jump back to attribute parsing

            // for tracking quotes purposes
            byte cQuotes = cPeek;

            int iValueStartOffset;

            // skip whitespace if any
            if (cPeek <= 32 && bTagCharTypes[cPeek] == (byte)TagCharType.WhiteSpace)
            {
                iCurPos++;

                // speculative loop unroll -- we have a very good chance of seeing non-space char next
                // so instead of setting up loop we will just read it directly, this should save ticks
                // on having to prepare while() loop
                if (iCurPos < iDataLength)
                {
                    cPeek = bHTML[iCurPos];
                }
                else
                {
                    iValueStartOffset = iCurPos - 1;
                    goto AttributeValueEnd;
                }

                //if(cChar==' ' || cChar=='\t' || cChar==13 || cChar==10)
                //if(cChar<=32 && bWhiteSpace[cChar]==1)
                if (cPeek <= 32 && bTagCharTypes[cPeek] == (byte)TagCharType.WhiteSpace)
                {
                    while (iCurPos < iDataLength)
                    {
                        cPeek = bHTML[iCurPos++];

                        if (cPeek <= 32 && bTagCharTypes[cPeek] == (byte)TagCharType.WhiteSpace)
                        //if(cChar!=' ' && cChar!='\t' && cChar!=13 && cChar!=10)
                        {
                            //cPeek=bHTML[iCurPos];
                            continue;
                        }

                        iCurPos--;
                        break;
                    }

                    if (iCurPos >= iDataLength)
                    {
                        iValueStartOffset = iCurPos - 1;
                        goto AttributeValueEnd;
                    }
                }

                cQuotes = cPeek;
            }



            // because we deal with VALUE of the attribute it means we can't lower-case it,
            // or skip whitespace (if in quotes), which in practice means that we don't need to copy
            // it to temporary string buffer, we can just remember starting offset and then create string from
            // data in bHTML

            // ok, first char can be one of the quote chars or something else
            if (cPeek != '\"' && cPeek != '\'')
            {
                iValueStartOffset = iCurPos;

                cQuotes = (byte)' ';
                // any other char here means we have value up until next whitespace or end of tag
                // this gives us good opportunity to scan fairly quickly without otherwise redundant
                // checks - this should happen fairly rarely, however loop dealing with data between quotes
                // will happen often enough and its best to eliminate as much stuff from it as possible
                //sText.bBuffer[sText.iBufPos++]=cPeek;

                // move to next char
                if (iCurPos < iDataLength)
                {
                    cPeek = bHTML[iCurPos++];
                }
                else
                {
                    goto AttributeValueEnd;
                }

                while (cPeek != 0)
                {
                    // if whitespace then we got our value and need to go back to param
                    if (cPeek <= 32 && bTagCharTypes[cPeek] == (byte)TagCharType.WhiteSpace)
                    {
                        oChunk.AddParam(sAttrName, GetString(iValueStartOffset, iCurPos - iValueStartOffset - 1), (byte)' ');
                        iCurPos--;
                        goto AttributeParsing;
                    }

                    // end of tag?
                    if (cPeek == (byte)'>')
                    {
                        //iCurPos--;
                        break;
                    }

                    if (iCurPos < iDataLength)
                    {
                        cPeek = bHTML[iCurPos++];
                    }
                    else
                    {
                        iCurPos = iDataLength + 1;
                        goto AttributeValueEnd;
                    }
                }

                // ok we are done, add outstanding attribute
                oChunk.AddParam(sAttrName, GetString(iValueStartOffset, iCurPos - iValueStartOffset - 1), (byte)' ');

                goto ReturnChunk;
            }

            // move one step forward
            iCurPos++;

            iValueStartOffset = iCurPos;

            if (iCurPos < iDataLength)
            {
                cPeek = bHTML[iCurPos++];
            }
            else
            {
                goto AttributeValueEnd;
            }

            // attribute value parsing from between two quotes
            while (cPeek != 0)
            {
                // check whether we have got possible entity (can be anything starting with &)
                if (cPeek == 38)
                {
                    int iPrevPos = iCurPos;

                    char cEntityChar = oE.CheckForEntity(bHTML, ref iCurPos, iDataLength);

                    // restore current symbol
                    if (cEntityChar == 0)
                    {
                        if (iCurPos < iDataLength)
                        {
                            cPeek = bHTML[iCurPos++];
                        }
                        else
                        {
                            break;
                        }

                        //sText.bBuffer[sText.iBufPos++]=38; //(byte)'&';;
                        continue;
                    }
                    else
                    {
                        // okay we have got an entity, our hope of not having to copy stuff into variable
                        // is over, we have to continue in a slower fashion :(
                        // but thankfully this should happen very rarely, so, annoying to code, but
                        // most codepaths will run very fast!
                        int iPreEntLen = iPrevPos - iValueStartOffset - 1;

                        // 14/05/08 need to clear text - it contains attribute name text
                        sText.Clear();

                        // copy previous data
                        if (iPreEntLen > 0)
                        {
                            Array.Copy(bHTML, iValueStartOffset, sText.bBuffer, 0, iPreEntLen);
                            sText.iBufPos = iPreEntLen;
                        }

                        // we have to skip now to next byte, since
                        // some converted chars might well be control chars like >
                        oChunk.bEntities = true;

                        if (cChar == (byte)'<')
                        {
                            oChunk.bLtEntity = true;
                        }

                        // unless is space we will ignore it
                        // note that this won't work if &nbsp; is defined as it should
                        // byte int value of 160, rather than 32.
                        //if(cChar!=' ')
                        sText.Append(cEntityChar);

                        if (iCurPos < iDataLength)
                        {
                            cPeek = bHTML[iCurPos++];
                        }
                        else
                        {
                            goto AttributeValueEnd;
                        }

                        // okay, we continue here using in effect new inside loop as we might have more entities here
                        // attribute value parsing from between two quotes
                        while (cPeek != 0)
                        {
                            // check whether we have got possible entity (can be anything starting with &)
                            if (cPeek == 38)
                            {
                                char cNewEntityChar = oE.CheckForEntity(bHTML, ref iCurPos, iDataLength);

                                // restore current symbol
                                if (cNewEntityChar != 0)
                                {
                                    if (cNewEntityChar == (byte)'<')
                                    {
                                        oChunk.bLtEntity = true;
                                    }

                                    sText.Append(cNewEntityChar);

                                    if (iCurPos < iDataLength)
                                    {
                                        cPeek = bHTML[iCurPos++];
                                    }
                                    else
                                    {
                                        goto AttributeValueEnd;
                                    }

                                    continue;
                                }
                            }

                            // check if is end of quotes
                            if (cPeek == cQuotes)
                            {
                                // ok we finished scanning it: add param with value and then go back to param name parsing
                                oChunk.AddParam(sAttrName, sText.SetToString(), cQuotes);

                                if (iCurPos < iDataLength)
                                {
                                    cPeek = bHTML[iCurPos];
                                }
                                else
                                {
                                    break;
                                }

                                goto AttributeParsing;
                            }

                            sText.bBuffer[sText.iBufPos++] = cPeek;
                            //sText.Append(cPeek);

                            if (iCurPos < iDataLength)
                            {
                                cPeek = bHTML[iCurPos++];
                            }
                            else
                            {
                                break;
                            }
                        }

                        oChunk.AddParam(sAttrName, sText.SetToString(), cQuotes);
                        goto ReturnChunk;
                    }
                }

                // check if is end of quotes
                if (cPeek == cQuotes)
                {
                    // ok we finished scanning it: add param with value and then go back to param name parsing
                    //sText.Clear();

                    oChunk.AddParam(sAttrName, GetString(iValueStartOffset, iCurPos - iValueStartOffset - 1), cQuotes);


                    if (iCurPos < iDataLength)
                    {
                        cPeek = bHTML[iCurPos];
                    }
                    else
                    {
                        //iCurPos++;
                        break;
                    }

                    goto AttributeParsing;
                }

                if (iCurPos < iDataLength)
                {
                    cPeek = bHTML[iCurPos++];
                }
                else
                {
                    //iCurPos++;
                    break;
                }
            }

AttributeValueEnd:



            // ok we are done, add outstanding attribute
            int iLen = iCurPos - iValueStartOffset - 1;

            if (iLen > 0)
            {
                oChunk.AddParam(sAttrName, GetString(iValueStartOffset, iLen), cQuotes);
            }
            else
            {
                oChunk.AddParam(sAttrName, "", cQuotes);
            }

ReturnChunk:

            if (oChunk.bClosure)
            {
                oChunk.oType = HTMLchunkType.CloseTag;
            }
            else
            {
                oChunk.oType = HTMLchunkType.OpenTag;
            }

            return(oChunk);
        }
Example #6
0
 /// <summary>
 /// Fills the oHTML field of the given chunk with the text obtained by decoding, with the
 /// parser's current encoding (oEnc), the slice of bHTML that the chunk's offset and length describe.
 /// </summary>
 /// <param name="oChunk">Chunk returned by ParseNext function; it must come from the same HTMLparser,
 /// initiated with the same HTML data, because its iChunkOffset and iChunkLength index into bHTML</param>
 public void SetRawHTML(HTMLchunk oChunk)
 {
     // the chunk records where in the byte buffer it starts and how many bytes it spans
     int iRawOffset = oChunk.iChunkOffset;
     int iRawLength = oChunk.iChunkLength;

     oChunk.oHTML = oEnc.GetString(bHTML, iRawOffset, iRawLength);
 }
Example #7
0
        /// <summary>
        /// Runs the HTML parser over <paramref name="content"/>, dispatching every open tag, close tag
        /// and text chunk to the matching handler, and collects the produced text lines.
        /// </summary>
        /// <param name="content">HTML markup to parse</param>
        /// <param name="lines">List that is cleared first and then used as the result list
        /// (it is also stored in the result field)</param>
        /// <returns>Number of characters produced by UTF-8 decoding the bytes the parser has consumed,
        /// i.e. the first parser.iCurPos bytes of parser.bHTML</returns>
        public int Parse(string content, ArrayList lines)
        {
            lines.Clear();
            result = lines;
            HtmlParser parser = new HtmlParser();

            parser.SetChunkHashMode(true);

            parser.Init(content);
            parser.SetEncoding(System.Text.Encoding.UTF8);

            HTMLchunk chunk = null;

            Start();

            // keep going until the finished flag is raised or the parser has no more chunks
            while (!finished && (chunk = parser.ParseNext()) != null)
            {
                switch (chunk.oType)
                {
                case HTMLchunkType.OpenTag:
                    HandleTag(chunk);
                    break;

                case HTMLchunkType.CloseTag:
                    HandleClosure(chunk.sTag);
                    break;

                case HTMLchunkType.Text:
                    // remember where this text starts before it is split into words
                    offset = chunk.iChunkOffset;
                    string text = ClearExtraBlanks(chunk.oHTML);
                    text = HTMLentities.DecodeEntities(text);
                    HandleWords(text);
                    break;
                }
            }

            NewLine(true);
            if (!finished)
            {
                TextLine textLine = new TextLine();

                // Raffaele Russo - 19/04/2011: the height argument was changed from
                // "context.Font.Height" to "context.Font.GetHeight(96)"
                textLine.SetSegments(new ArrayList(line), context.Font.GetHeight(96));

                textLine.Alignment = context.Alignment;

                // the trailing line is only kept when it still fits into the available height
                if (height + textLine.Height <= bounds.Height)
                {
                    result.Add(textLine);
                }
                previousLine = null;
            }

            // The parser can leave its cursor beyond the end of the buffer when the input stops in the
            // middle of a tag, so the byte count is clamped to the buffer size to stay within bounds.
            int consumedBytes = Math.Min(parser.iCurPos, parser.bHTML.Length);

            // Count the decoded characters directly on the parser's buffer; this yields the same number
            // as decoding a copy into a string and taking its Length, without the temporary allocations.
            return System.Text.Encoding.UTF8.GetCharCount(parser.bHTML, 0, consumedBytes);
        }
Example #8
0
        /// <summary>
        /// Reacts to an opening tag. For "b", "i", "font" and "div" carrying an align attribute the
        /// current context is pushed onto the stack and replaced by a new one derived from it;
        /// "br" and "p" start a new line. Every other tag (and a "div" without align) is ignored.
        /// </summary>
        /// <param name="tag">Chunk of the opening tag; its sTag and oParams members are read</param>
        private void HandleTag(HTMLchunk tag)
        {
            switch (tag.sTag)
            {
            case "b":
            {
                // same family and size as the current font, with the bold flag added
                Font    boldFont    = new Font(context.Font.FontFamily, context.Font.Size, context.Font.Style | FontStyle.Bold);
                Context boldContext = new Context("b", boldFont, context.Color, context.Alignment);
                stack.Push(context);
                context = boldContext;
                break;
            }

            case "i":
            {
                // same family and size as the current font, with the italic flag added
                Font    italicFont    = new Font(context.Font.FontFamily, context.Font.Size, context.Font.Style | FontStyle.Italic);
                Context italicContext = new Context("i", italicFont, context.Color, context.Alignment);
                stack.Push(context);
                context = italicContext;
                break;
            }

            case "div":
            {
                object alignParam = tag.oParams["align"];

                // a div without an align attribute changes nothing and pushes no context
                if (alignParam == null)
                {
                    break;
                }

                NewLine(false);

                string  alignValue = alignParam.ToString();
                Context divContext;

                if (alignValue == "center")
                {
                    divContext = new Context("div", context.Font, context.Color, TextField.TextAlignmentType.Center);
                }
                else if (alignValue == "right")
                {
                    divContext = new Context("div", context.Font, context.Color, TextField.TextAlignmentType.Right);
                }
                else if (alignValue == "justify")
                {
                    divContext = new Context("div", context.Font, context.Color, TextField.TextAlignmentType.Justified);
                }
                else
                {
                    // any other value falls back to left alignment
                    divContext = new Context("div", context.Font, context.Color, TextField.TextAlignmentType.Left);
                }

                stack.Push(context);
                context = divContext;
                break;
            }

            case "font":
            {
                // start from the current context's values; each attribute present on the tag overrides one
                float     size   = context.Font.Size;
                string    family = context.Font.FontFamily.Name;
                FontStyle style  = context.Font.Style;
                Color     colour = context.Color;

                object sizeParam = tag.oParams["size"];
                if (sizeParam != null)
                {
                    try
                    {
                        size = ResolveFontSize(sizeParam.ToString().Trim());
                    }
                    catch (Exception) {} // on failure the current size is kept
                }

                object faceParam = tag.oParams["face"];
                if (faceParam != null)
                {
                    try
                    {
                        family = faceParam.ToString();
                    }
                    catch (Exception) {} // on failure the current family is kept
                }

                object colorParam = tag.oParams["color"];
                if (colorParam != null)
                {
                    try
                    {
                        // drop leading '#' characters and prepend "ff" before resolving the colour
                        colour = ResolveColor("ff" + colorParam.ToString().TrimStart('#'));
                    }
                    catch (Exception) {} // on failure the current colour is kept
                }

                Font    customFont  = new Font(family, size, style);
                Context fontContext = new Context("font", customFont, colour, context.Alignment);
                stack.Push(context);
                context = fontContext;
                break;
            }

            case "br":
            case "p":
                NewLine(true);
                break;
            }
        }