Ejemplo n.º 1
0
        /// <summary>
        /// Parses the tag that starts at the current position, i.e. right after the open bracket (&lt;) was found.
        /// </summary>
        /// <returns>Chunk for the parsed tag; for an opening script tag, the chunk returned by the script parser</returns>
        HTMLchunk GetNextTag()
        {
            oChunk = oTP.ParseTag(ref iCurPos);

            // backwards compatibility: a closed tag that carries params is reported as an open tag
            bool bCloseTagWithParams = oChunk.iParams > 0
                                       && bAutoMarkClosedTagsWithParamsAsOpen
                                       && oChunk.oType == HTMLchunkType.CloseTag;

            if (bCloseTagWithParams)
            {
                oChunk.oType = HTMLchunkType.OpenTag;
            }

            // length and first character are tested before the full string compare as cheap rejections
            bool bIsScriptTag = oChunk.sTag.Length == 6
                                && oChunk.sTag[0] == 's'
                                && oChunk.sTag == "script";

            // an opening script tag is handed over to the script parser, whose chunk is returned as is
            if (bIsScriptTag && !oChunk.bClosure)
            {
                oChunk.oType = HTMLchunkType.Script;
                oChunk = oTP.ParseScript(ref iCurPos);

                return oChunk;
            }

            // the chunk spans everything from its offset up to the current position
            oChunk.iChunkLength = iCurPos - oChunk.iChunkOffset;

            if (bKeepRawHTML)
            {
                oChunk.oHTML = oEnc.GetString(bHTML, oChunk.iChunkOffset, oChunk.iChunkLength);
            }

            return oChunk;
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Disposes the helper objects held by this instance and drops the references to them.
        /// Only the first call does any work; later calls return immediately.
        /// </summary>
        /// <param name="bDisposing">Not consulted by this implementation</param>
        private void Dispose(bool bDisposing)
        {
            if (bDisposed)
            {
                return;
            }

            // flag is raised before the clean-up starts
            bDisposed = true;

            if (oChunk != null)
            {
                oChunk.Dispose();
                oChunk = null;
            }

            if (sText != null)
            {
                sText.Dispose();
                sText = null;
            }

            // the HTML buffer is only released, there is nothing to dispose on it
            bHTML = null;

            if (oE != null)
            {
                oE.Dispose();
                oE = null;
            }

            if (oTP != null)
            {
                oTP.Dispose();
                oTP = null;
            }
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Creates an open-tag record from a parsed chunk: the tag name, both closure flags and
        /// a copy of the chunk's parameter table.
        /// </summary>
        /// <param name="chunk">Parsed chunk to take the values from</param>
        public OpenTag(HTMLchunk chunk)
        {
            // entries are copied one by one into a new table, so this tag does not share
            // the Hashtable instance held by the chunk
            Hashtable copiedParams = new Hashtable();

            foreach (DictionaryEntry param in chunk.oParams)
            {
                copiedParams.Add(param.Key, param.Value);
            }

            sTag = chunk.sTag;
            bClosure = chunk.bClosure;
            bEndClosure = chunk.bEndClosure;
            oParams = copiedParams;
        }
Ejemplo n.º 4
0
        /// <summary>
        /// Removes the most recently opened tag whose name equals the chunk's tag name and
        /// recalculates the style afterwards. Nothing happens when no open tag matches.
        /// </summary>
        /// <param name="chunk">Chunk whose sTag names the tag to close</param>
        public void CloseOneTag(HTMLchunk chunk)
        {
            // scan from the end of the list so the latest matching tag is the one removed
            int index = m_OpenTags.Count;

            while (--index >= 0)
            {
                if (m_OpenTags[index].sTag != chunk.sTag)
                {
                    continue;
                }

                // only a single tag is closed per call
                m_OpenTags.RemoveAt(index);
                RecalculateStyle();
                return;
            }
        }
Ejemplo n.º 5
0
 /// <summary>
 /// Handles one tag chunk: a plain closing tag closes the latest matching open tag, any other tag
 /// is recorded as open and parsed, and a self-closing tag is closed again right away.
 /// </summary>
 /// <param name="chunk">Parsed tag chunk</param>
 /// <param name="atom">Element that receives the current style; may be null</param>
 public void ParseTag(HTMLchunk chunk, AElement atom)
 {
     // closure flag without the end-closure flag means a plain closing tag
     bool isPlainClosingTag = chunk.bClosure && !chunk.bEndClosure;

     if (isPlainClosingTag)
     {
         CloseOneTag(chunk);
         return;
     }

     // register the tag as open, then parse it (per the original note this updates the current style)
     OpenTag opened = new OpenTag(chunk);
     m_OpenTags.Add(opened);
     ParseTag(opened, atom);

     // hand the current style to the atom when one was supplied
     if (atom != null)
     {
         atom.Style = Style;
     }

     // a self-closing tag (<br/>) must not stay in the list of open tags
     if (chunk.bEndClosure)
     {
         CloseOneTag(chunk);
     }
 }
Ejemplo n.º 6
0
        /// <summary>
        /// Drops the references held by this instance. Nothing is disposed here, the fields are only
        /// set to null. Only the first call does any work.
        /// </summary>
        /// <param name="bDisposing">Not consulted by this implementation</param>
        private void Dispose(bool bDisposing)
        {
            if (bDisposed)
            {
                return;
            }

            bDisposed = true;

            // data and working buffers
            bHTML = null;
            oChunk = null;
            sText = null;

            // helper objects
            oE = null;
            oP = null;
        }
Ejemplo n.º 7
0
 /// <summary>
 /// Fills the oHTML variable of a chunk with the raw HTML that was parsed for that chunk, decoded
 /// from this parser's byte buffer at the chunk's offset and length.
 /// </summary>
 /// <param name="oChunk">Chunk returned by ParseNext function; it must belong to the same HTMLparser,
 /// initiated with the same HTML data, that produced it</param>
 public void SetRawHTML(HTMLchunk oChunk)
 {
     // note: assigning the raw bytes instead of a decoded string would stay closer to the original data
     int iStart = oChunk.iChunkOffset;
     int iCount = oChunk.iChunkLength;

     oChunk.oHTML = oEnc.GetString(bHTML, iStart, iCount);
 }
Ejemplo n.º 8
0
        /// <summary>
        /// Handles META tags that set page encoding
        /// </summary>
        /// <param name="oP">HTML parser object that is used for parsing</param>
        /// <param name="oChunk">Parsed chunk that should contain tag META</param>
        /// <param name="bEncodingSet">Your own flag that shows whether encoding was already set or not, if set
        /// once then it should not be changed - this is the logic applied by major browsers. It is switched
        /// to true here when the parser accepts the encoding found in the tag</param>
        /// <returns>True if this was a META tag whose http-equiv is content-type or content-category
        /// (even when the encoding was left unchanged), false otherwise</returns>
        public static bool HandleMetaEncoding(HTMLparser oP, HTMLchunk oChunk, ref bool bEncodingSet)
        {
            // length and first character are cheap rejections before the full string compare
            if (oChunk.sTag.Length != 4 || oChunk.sTag[0] != 'm' || oChunk.sTag != "meta")
            {
                return(false);
            }

            // if we do not use hashmode already then we call conversion explicitly
            // this is slow, but METAs are very rare so performance penalty is low
            if (!oChunk.bHashMode)
            {
                oChunk.ConvertParamsToHash();
            }

            string sKey = oChunk.oParams["http-equiv"] as string;

            if (sKey == null)
            {
                return(false);
            }

            // ordinal ignore-case comparison: no lower-cased copy of the value is created and the
            // result does not depend on the current culture.
            // "content-category" is a rare variant (appears to work in IE) reported to exist in some pages
            bool bEncodingMeta =
                string.Equals(sKey, "content-type", System.StringComparison.OrdinalIgnoreCase) ||
                string.Equals(sKey, "content-category", System.StringComparison.OrdinalIgnoreCase);

            if (!bEncodingMeta)
            {
                return(false);
            }

            // once encoding is set it should not be changed, even though there are
            // web pages out there that try to do that
            if (!bEncodingSet)
            {
                // it is possible we have broken META tag without Content part
                string sData = oChunk.oParams["content"] as string;

                if (sData != null)
                {
                    if (oP.SetEncoding(sData))
                    {
                        // here you need to reencode any text that you found so far
                        // most likely it will be just TITLE, the rest can be ignored anyway
                        bEncodingSet = true;
                    }
                    else
                    {
                        // failed to set encoding - most likely encoding string
                        // was incorrect or your machine lacks codepages or something
                        // else - might be good idea to put warning message here
                    }
                }
            }

            return(true);
        }
Ejemplo n.º 9
0
        /// <summary>
        /// Disposes the chunk, text buffer, entities object and tag parser held by this instance and
        /// clears the references, including the one to the HTML buffer. Calls after the first one are ignored.
        /// </summary>
        /// <param name="bDisposing">Not consulted by this implementation</param>
        private void Dispose(bool bDisposing)
        {
            if (bDisposed)
            {
                return;
            }

            // mark as disposed up front
            bDisposed = true;

            if (oChunk != null)
            {
                oChunk.Dispose();
                oChunk = null;
            }

            if (sText != null)
            {
                sText.Dispose();
                sText = null;
            }

            // plain reference, nothing to dispose on it
            bHTML = null;

            if (oE != null)
            {
                oE.Dispose();
                oE = null;
            }

            if (oTP != null)
            {
                oTP.Dispose();
                oTP = null;
            }
        }
Ejemplo n.º 10
0
        /// <summary>
        /// Handles an anchor chunk. The style is recalculated in every case; an opening anchor is
        /// additionally recorded in the list of open tags and parsed.
        /// </summary>
        /// <param name="chunk">Parsed anchor chunk</param>
        /// <param name="atom">Element passed on to the tag parsing</param>
        public void InterpretHREF(HTMLchunk chunk, AElement atom)
        {
            // note: solo (self-closed) anchor elements are meaningless and get no handling of their own
            bool isClosingAnchor = chunk.bClosure;

            // opening and closing a hyperlink both start with a style recalculation
            RecalculateStyle();

            if (isClosingAnchor)
            {
                return;
            }

            // opening a hyperlink: remember it as an open tag and parse it
            OpenTag anchor = new OpenTag(chunk);
            m_OpenTags.Add(anchor);
            ParseTag(anchor, atom);
        }
Ejemplo n.º 11
0
        /// <summary>
        /// Handles an anchor chunk: a closing anchor triggers a style recalculation, an opening one
        /// gets fresh HREF attributes on the current style and is then parsed.
        /// </summary>
        /// <param name="chunk">Parsed anchor chunk</param>
        public void InterpretHREF(HTMLchunk chunk)
        {
            // note: solo (self-closed) anchor elements are meaningless and get no handling of their own

            if (chunk.bClosure)
            {
                // closing a hyperlink. NOTE: recalculating the styles will NOT restore the previous link.
                // Is this worth fixing?
                RecalculateStyle();
                return;
            }

            // hyperlink with attributes: start from empty HREF attributes, then parse the tag
            Style.HREF = new HREFAttributes();
            ParseTag(new OpenTag(chunk));
        }
Ejemplo n.º 12
0
        /// <summary>
        /// Handles one tag chunk: a plain closing tag closes the latest matching open tag, any other
        /// tag (opening or self-closing) is added to the list of open tags and parsed.
        /// </summary>
        /// <param name="chunk">Parsed tag chunk</param>
        public void OpenTag(HTMLchunk chunk)
        {
            // closure flag without the end-closure flag means a plain closing tag
            if (chunk.bClosure && !chunk.bEndClosure)
            {
                CloseOneTag(chunk);
                return;
            }

            // the tag object is created only on this path: constructing it copies the chunk's
            // parameter table, which would be wasted work for a closing tag
            OpenTag tag = new OpenTag(chunk);
            m_OpenTags.Add(tag);
            ParseTag(tag);
        }
Ejemplo n.º 13
0
        /// <summary>
        /// Handles META tags that set page encoding
        /// </summary>
        /// <param name="oP">HTML parser object that is used for parsing</param>
        /// <param name="oChunk">Parsed chunk that should contain tag META</param>
        /// <param name="bEncodingSet">Your own flag that shows whether encoding was already set or not, if set
        /// once then it should not be changed - this is the logic applied by major browsers. It is switched
        /// to true here when the parser accepts the encoding found in the tag</param>
        /// <returns>True if this was a META tag whose http-equiv is content-type or content-category
        /// (even when the encoding was left unchanged), false otherwise</returns>
        public static bool HandleMetaEncoding(HTMLparser oP, HTMLchunk oChunk, ref bool bEncodingSet)
        {
            // length and first character are cheap rejections before the full string compare
            if (oChunk.sTag.Length != 4 || oChunk.sTag[0] != 'm' || oChunk.sTag != "meta")
            {
                return false;
            }

            // if we do not use hashmode already then we call conversion explicitly
            // this is slow, but METAs are very rare so performance penalty is low
            if (!oChunk.bHashMode)
            {
                oChunk.ConvertParamsToHash();
            }

            string sKey = oChunk.oParams["http-equiv"] as string;

            if (sKey == null)
            {
                return false;
            }

            // compared with ordinal ignore-case rules, so no lower-cased copy of the value is
            // allocated and the outcome is independent of the current culture.
            // "content-category" is a rare variant (appears to work in IE) reported to exist in some pages
            bool bIsContentType = string.Equals(sKey, "content-type", System.StringComparison.OrdinalIgnoreCase);
            bool bIsContentCategory = string.Equals(sKey, "content-category", System.StringComparison.OrdinalIgnoreCase);

            if (!bIsContentType && !bIsContentCategory)
            {
                return false;
            }

            // once encoding is set it should not be changed, even though there are
            // web pages out there that try to do that
            if (!bEncodingSet)
            {
                // it is possible we have broken META tag without Content part
                string sData = oChunk.oParams["content"] as string;

                if (sData != null)
                {
                    if (oP.SetEncoding(sData))
                    {
                        // here you need to reencode any text that you found so far
                        // most likely it will be just TITLE, the rest can be ignored anyway
                        bEncodingSet = true;
                    }
                    else
                    {
                        // failed to set encoding - most likely encoding string
                        // was incorrect or your machine lacks codepages or something
                        // else - might be good idea to put warning message here
                    }
                }
            }

            return true;
        }
Ejemplo n.º 14
0
        /// <summary>
        /// Internally parses the tag found at the current position, which is the point right after
        /// the '&lt;' character was detected.
        /// </summary>
        /// <returns>Chunk for the parsed tag; for an opening script tag, the chunk returned by the script parser</returns>
        HTMLchunk GetNextTag()
        {
            oChunk = oTP.ParseTag(ref iCurPos);

            // for backwards compatibility closed tags with params are marked as open
            bool bMarkAsOpen = oChunk.iParams > 0
                               && bAutoMarkClosedTagsWithParamsAsOpen
                               && oChunk.oType == HTMLchunkType.CloseTag;

            if (bMarkAsOpen)
            {
                oChunk.oType = HTMLchunkType.OpenTag;
            }

            // start of script: length and first character are checked before the full compare
            bool bScriptTag = oChunk.sTag.Length == 6
                              && oChunk.sTag[0] == 's'
                              && oChunk.sTag == "script";

            if (bScriptTag && !oChunk.bClosure)
            {
                // the script parser takes over and its chunk is returned unchanged
                oChunk.oType = HTMLchunkType.Script;
                oChunk = oTP.ParseScript(ref iCurPos);

                return oChunk;
            }

            // everything between the chunk's offset and the current position belongs to this chunk
            oChunk.iChunkLength = iCurPos - oChunk.iChunkOffset;

            if (bKeepRawHTML)
            {
                oChunk.oHTML = oEnc.GetString(bHTML, oChunk.iChunkOffset, oChunk.iChunkLength);
            }

            return oChunk;
        }
Ejemplo n.º 15
0
        /// <summary>
        /// Inits tag parser by storing the objects and data it works with
        /// </summary>
        /// <param name="p_oP">HTML parser object, kept in oP</param>
        /// <param name="p_oChunk">Chunk object, kept in oChunk</param>
        /// <param name="p_sText">Text buffer, kept in sText</param>
        /// <param name="p_bHTML">HTML data as bytes, kept in bHTML</param>
        /// <param name="p_iDataLength">Length of the HTML data, kept in iDataLength</param>
        /// <param name="p_oE">Entities object, kept in oE</param>
        /// <param name="p_oHE">Heuristics object, kept in oHE</param>
        internal void Init(HTMLparser p_oP, HTMLchunk p_oChunk, DynaString p_sText, byte[] p_bHTML, int p_iDataLength, HTMLentities p_oE, HTMLheuristics p_oHE)
        {
            // helper objects
            oP = p_oP;
            oE = p_oE;
            oHE = p_oHE;

            // working chunk and text buffer
            oChunk = p_oChunk;
            sText = p_sText;

            // HTML data and its length
            bHTML = p_bHTML;
            iDataLength = p_iDataLength;

            // heuristics must not run too close to the end of data, so their limit
            // stops MIN_DATA_SIZE_FOR_HEURISTICS short of the data length
            iMaxHeuDataLength = p_iDataLength - MIN_DATA_SIZE_FOR_HEURISTICS;
        }
Ejemplo n.º 16
0
        /// <summary>
        /// Internal: parses tag that started from current position
        /// </summary>
        /// <returns>HTMLchunk with tag information</returns>
        internal HTMLchunk ParseTag(ref int iCurPos)
        {
            /*
             *  WARNING: this code was optimised for performance rather than for readability,
             *  so be extremely careful at changing it -- your changes could easily result in wrongly parsed HTML
             *
             *  This routine takes about 60% of CPU time, in theory its the best place to gain extra speed,
             *  but I've spent plenty of time doing it, so it won't be easy... and if it is easy then please post
             *  your changes for everyone to enjoy!
             *
             *
             * */

            //bool bWhiteSpaceHere=false;

            //bool bParamValue=false;
            byte cChar = 0;
            byte cPeek = 0;

            // if true it means we have parsed complete tag
            //bool bGotTag=false;

            //int iEqualIdx=0;

            // we reach this function immediately after tag's byte (<) was
            // detected, so we need to save it in order to keep correct HTML copy
            // oChunk.Append((byte)'<'); // (byte)'<'

            /*
            oChunk.bBuffer[0]=60;
            oChunk.iBufPos=1;
            oChunk.iHTMLen=1;
            */

            // initialise peeked char - this will point to the next after < character
            if (iCurPos < iDataLength)
            {
                cPeek = bHTML[iCurPos];

                // in case of comments ! must follow immediately after <
                if (cPeek == (byte)'!')
                {
                    if (iCurPos + 2 < iDataLength &&
                        bHTML[iCurPos + 1] == (byte)'-' && bHTML[iCurPos + 2] == (byte)'-')
                    {
                        // we detected start of comments here, instead of parsing the rest here we will
                        // call special function tuned to do the job much more effectively
                        oChunk.sTag = "!--";
                        oChunk.oType = HTMLchunkType.Comment;
                        oChunk.bComments = true;
                        // oChunk.Append((byte)'!');
                        // oChunk.Append((byte)'-');
                        // oChunk.Append((byte)'-');
                        iCurPos += 3;
                        bool bFullTag;
                        oChunk = ParseComments(ref iCurPos, out bFullTag);

                        oChunk.iChunkLength = iCurPos - oChunk.iChunkOffset;

                        if (oP.bAutoKeepComments || oP.bKeepRawHTML)
                        {
                            if (!oP.bAutoExtractBetweenTagsOnly)
                                oChunk.oHTML = GetString(oChunk.iChunkOffset, oChunk.iChunkLength);
                            else
                            {
                                oChunk.oHTML = GetString(oChunk.iChunkOffset + 4, oChunk.iChunkLength - (bFullTag ? 7 : 4));
                            }

                        }

                        return oChunk;
                    }

                    // ok we might have here CDATA element of XML:
                    // ref: http://www.w3schools.com/xml/xml_cdata.asp
                    if (iCurPos + 7 < iDataLength &&
                        bHTML[iCurPos + 1] == (byte)'[' &&
                        bHTML[iCurPos + 2] == (byte)'C' &&
                        bHTML[iCurPos + 3] == (byte)'D' &&
                        bHTML[iCurPos + 4] == (byte)'A' &&
                        bHTML[iCurPos + 5] == (byte)'T' &&
                        bHTML[iCurPos + 6] == (byte)'A' &&
                        bHTML[iCurPos + 7] == (byte)'['
                        )
                    {
                        // we detected start of comments here, instead of parsing the rest here we will
                        // call special function tuned to do the job much more effectively
                        oChunk.sTag = "![CDATA[";
                        oChunk.oType = HTMLchunkType.Comment;
                        oChunk.bComments = true;
                        // oChunk.Append((byte)'!');
                        // oChunk.Append((byte)'-');
                        // oChunk.Append((byte)'-');
                        iCurPos += 8;
                        bool bFullTag;
                        oChunk = ParseCDATA(ref iCurPos, out bFullTag);

                        oChunk.iChunkLength = iCurPos - oChunk.iChunkOffset;

                        if (oP.bAutoKeepComments || oP.bKeepRawHTML)
                        {
                            if (!oP.bAutoExtractBetweenTagsOnly)
                                oChunk.oHTML = GetString(oChunk.iChunkOffset, oChunk.iChunkLength);
                            else
                            {
                                oChunk.oHTML = GetString(oChunk.iChunkOffset + 4 + 5,
                                    oChunk.iChunkLength - (bFullTag ? 7 + 5 : 4 + 5));
                            }

                        }

                        return oChunk;
                    }

                }

            }
            else
            {
                // empty tag but its not closed, so we will call it open...
                oChunk.oType = HTMLchunkType.OpenTag;
                // end of data... before it started
                return oChunk;
            }

            // tag ID, non-zero if matched by heuristics engine
            int iTagID = 0;

            // STAGE 0: lets try some heuristics to see if we can quickly identify most common tags
            // that should be present most of the time, this should save a lot of looping and string creation
            if (bEnableHeuristics && iCurPos < iMaxHeuDataLength)
            {
                // check if we have got closure of the tag
                if (cPeek == (byte)'/')
                {
                    oChunk.bClosure = true;
                    oChunk.bEndClosure = false;
                    oChunk.oType = HTMLchunkType.CloseTag;
                    iCurPos++;
                    cPeek = bHTML[iCurPos];
                }

                cChar = bHTML[iCurPos + 1];

                // probability of having a match is very high (or so we expect)
                iTagID = oHE.MatchTag(cPeek, cChar);

                if (iTagID != 0)
                {
                    if (iTagID < 0)
                    {
                        iTagID *= -1;
                        // single character tag
                        oChunk.sTag = oHE.GetString(iTagID);

                        // see if we got fully closed tag
                        if (cChar == (byte)'>')
                        {
                            iCurPos += 2;
                            goto ReturnChunk;
                        }

                        cPeek = cChar;
                        iCurPos++;

                        // everything else means we need to continue scanning as we may have params and stuff
                        goto AttributeParsing;
                    }
                    else
                    {
                        // ok, we have here 2 or more character string that we need to check further
                        // often when we have full 2 char match the next char will be >, if that's the case
                        // then we definately matched our tag
                        byte cNextChar = bHTML[iCurPos + 2];

                        if (cNextChar == (byte)'>')
                        {
                            //oChunk.sTag=oHE.GetString(iTagID);
                            oChunk.sTag = oHE.GetTwoCharString(cPeek, cChar);
                            iCurPos += 3;

                            goto ReturnChunk;
                        }

                        // ok, check next char for space, if that's the case we still got our tag
                        // but need to skip to attribute parsing
                        if (cNextChar == (byte)' ')
                        {
                            //oChunk.sTag=oHE.GetString(iTagID);
                            oChunk.sTag = oHE.GetTwoCharString(cPeek, cChar);
                            iCurPos += 2;

                            cPeek = cNextChar;

                            goto AttributeParsing;
                        }

                        // ok, we are not very lucky, but it is still worth fighting for
                        // now we need to check fully long string against what we have matched, maybe
                        // we got exact match and we can avoid full parsing of the tag
                        byte[] bTag = oHE.GetStringData(iTagID);

                        if (iCurPos + bTag.Length + 5 >= iDataLength)
                            goto TagParsing;

                        // in a loop (and this is not an ideal solution, but still)
                        for (int i = 2; i < bTag.Length; i++)
                        {
                            // if a single char is not matched, then we
                            if (bTag[i] != bHTML[iCurPos + i])
                            {
                                goto TagParsing;
                            }
                        }

                        // ok we matched full long word, but we need to be sure that char
                        // after the word is ' ' or '>' as otherwise we may have matched prefix of even longer
                        // word
                        cNextChar = bHTML[iCurPos + bTag.Length];

                        if (cNextChar == (byte)'>')
                        {
                            oChunk.sTag = oHE.GetString(iTagID);
                            iCurPos += bTag.Length + 1;

                            goto ReturnChunk;
                        }

                        if (cNextChar == (byte)' ')
                        {
                            cPeek = cNextChar;
                            oChunk.sTag = oHE.GetString(iTagID);
                            iCurPos += bTag.Length;

                            goto AttributeParsing;
                        }

                        // no luck: we need to parse tag fully as our heuristical matching failed miserably :'o(
                    }

                }
            }

            TagParsing:

            sText.Clear();

            byte bCharType = 0;

            // STAGE 1: parse tag (anything until > or /> or whitespace leading to start of attribute)
            while (cPeek != 0)
            {
                bCharType = bTagCharTypes[cPeek];

                //if(cPeek<=32 && bWhiteSpace[cPeek]==1)
                if (bCharType == (byte)TagCharType.WhiteSpace)
                {
                    iCurPos++;

                    // speculative loop unroll -- we have a very good chance of seeing non-space char next
                    // so instead of setting up loop we will just read it directly, this should save ticks
                    // on having to prepare while() loop
                    if (iCurPos < iDataLength)
                        cChar = bHTML[iCurPos++];
                    else
                        cChar = 0;

                    bCharType = bTagCharTypes[cChar];

                    //if(cChar==' ' || cChar=='\t' || cChar==13 || cChar==10)
                    //if(cChar<=32 && bWhiteSpace[cChar]==1)
                    if (bCharType == (byte)TagCharType.WhiteSpace)
                    {

                        while (iCurPos < iDataLength)
                        {
                            cChar = bHTML[iCurPos++];

                            bCharType = bTagCharTypes[cChar];
                            if (bCharType == (byte)TagCharType.WhiteSpace)
                            //if(cChar!=' ' && cChar!='\t' && cChar!=13 && cChar!=10)
                            {
                                //cPeek=bHTML[iCurPos];
                                continue;
                            }

                            break;
                        }

                        if (iCurPos >= iDataLength)
                            cChar = 0;
                    }

                    //bWhiteSpaceHere=true;

                    // now, if we have already got tag it means that we are most likely
                    // going to need to parse tag attributes
                    if (sText.iBufPos > 0)
                    {
                        oChunk.sTag = sText.SetToStringASCII();

                        // oChunk.Append((byte)' ');

                        iCurPos--;

                        if (iCurPos < iDataLength)
                            cPeek = bHTML[iCurPos];
                        else
                            cPeek = 0;

                        break;
                    }

                }
                else
                {
                    // reuse Peeked char from previous run
                    //cChar=cPeek; iCurPos++;
                    if (iCurPos < iDataLength)
                        cChar = bHTML[iCurPos++];
                    else
                        cChar = 0;
                }

                if (iCurPos < iDataLength)
                    cPeek = bHTML[iCurPos];
                else
                    cPeek = 0;

                // most likely we should have lower-cased ASCII char
                if (bCharType == (byte)TagCharType.LowerCasedASCIIorDigit)
                {
                    sText.bBuffer[sText.iBufPos++] = cChar;
                    // oChunk.Append(cChar);
                    continue;
                }

                // tag end - we did not have any params
                if (cChar == (byte)'>')
                {
                    if (sText.iBufPos > 0)
                        oChunk.sTag = sText.SetToStringASCII();

                    if (!oChunk.bClosure)
                        oChunk.oType = HTMLchunkType.OpenTag;

                    return oChunk;
                }

                // closure of tag sign
                if (cChar == (byte)'/')
                {
                    oChunk.bClosure = true;
                    oChunk.bEndClosure = (sText.iBufPos > 0);
                    oChunk.oType = HTMLchunkType.CloseTag;
                    continue;
                }

                // 03/08/08 XML support: ?xml tags - grrr
                if (cChar == (byte)'?')
                {
                    sText.bBuffer[sText.iBufPos++] = cChar;
                    continue;
                }

                // nope, we have got upper cased ASCII char - this seems to be LESS likely than > and /
                //if(cChar>=65 && cChar<=90)
                if (bCharType > 32)
                {
                    // bCharType in this case contains already lower-cased char
                    sText.bBuffer[sText.iBufPos++] = bCharType;
                    // oChunk.Append(bCharType);
                    continue;
                }

                // we might have namespace : sign here - all text before would have to be
                // saved as namespace and we will need to continue parsing actual tag
                if (bCharType == (byte)TagCharType.NameSpaceColon)
                {
                    // ok here we got a choice - we can just continue and treat the whole
                    // thing as a single tag with namespace stuff prefixed, OR
                    // we can separate first part into namespace and keep tag as normal
                    sText.bBuffer[sText.iBufPos++] = (byte)':';
                    continue;
                }

                // ok, we have got some other char - we break out to deal with it in attributes part
                break;

            }

            if (cPeek == 0)
            {
                return oChunk;
            }

            // if true then equal sign was found
            //bool bEqualsSign=false;

            // STAGE 2: parse attributes (if any available)
            // attribute name can be standalone or with value after =
            // attribute itself can't have entities or anything like this - we expect it to be in ASCII characters

            AttributeParsing:

            string sAttrName;

            if (iTagID != 0)
            {

                // first, skip whitespace:
                if (cPeek <= 32 && bTagCharTypes[cPeek] == (byte)TagCharType.WhiteSpace)
                {
                    // most likely next char is not-whitespace
                    iCurPos++;

                    if (iCurPos >= iDataLength)
                        goto ReturnChunk;

                    cPeek = bHTML[iCurPos];

                    if (cPeek <= 32 && bTagCharTypes[cPeek] == (byte)TagCharType.WhiteSpace)
                    {
                        // ok long loop here then
                        while (iCurPos < iDataLength)
                        {
                            cPeek = bHTML[iCurPos++];

                            if (cPeek <= 32 && bTagCharTypes[cPeek] == (byte)TagCharType.WhiteSpace)
                                continue;

                            break;
                        }

                        if (cPeek == (byte)'>')
                            goto ReturnChunk;

                        iCurPos--;

                        if (iCurPos >= iDataLength)
                            goto ReturnChunk;
                    }

                    if (iCurPos >= iDataLength)
                        goto ReturnChunk;

                }

                // ok we have got matched tag, it is possible that we might be able to quickly match
                // attribute name known to be used for that tag:
                int iAttrID = oHE.MatchAttr(cPeek, iTagID);

                if (iAttrID > 0)
                {
                    byte[] bAttr = oHE.GetAttrData(iAttrID);

                    if (iCurPos + bAttr.Length + 2 >= iDataLength)
                        goto ActualAttributeParsing;

                    // in a loop (and this is not an ideal solution, but still)
                    for (int i = 1; i < bAttr.Length; i++)
                    {
                        // if a single char is not matched, then we
                        if (bAttr[i] != bHTML[iCurPos + i])
                        {
                            goto ActualAttributeParsing;
                        }
                    }

                    byte cNextChar = bHTML[iCurPos + bAttr.Length];

                    // ok, we expect next symbol to be =
                    if (cNextChar == (byte)'=')
                    {
                        sAttrName = oHE.GetAttr(iAttrID);
                        iCurPos += bAttr.Length + 1;
                        cPeek = bHTML[iCurPos];

                        goto AttributeValueParsing;
                    }

                }

            }

            ActualAttributeParsing:

            sText.Clear();

            // doing exactly the same thing as in tag parsing
            while (cPeek != 0)
            {
                bCharType = bTagCharTypes[cPeek];

                //if(cPeek<=32 && bWhiteSpace[cPeek]==1)
                if (bCharType == (byte)TagCharType.WhiteSpace)
                {
                    iCurPos++;

                    // speculative loop unroll -- we have a very good chance of seeing non-space char next
                    // so instead of setting up loop we will just read it directly, this should save ticks
                    // on having to prepare while() loop
                    if (iCurPos < iDataLength)
                        cChar = bHTML[iCurPos++];
                    else
                    {
                        cPeek = 0;
                        break;
                    }

                    bCharType = bTagCharTypes[cChar];

                    //if(cChar==' ' || cChar=='\t' || cChar==13 || cChar==10)
                    //if(cChar<=32 && bWhiteSpace[cChar]==1)
                    if (bCharType == (byte)TagCharType.WhiteSpace)
                    {

                        while (iCurPos < iDataLength)
                        {
                            cChar = bHTML[iCurPos++];

                            bCharType = bTagCharTypes[cChar];
                            if (bCharType == (byte)TagCharType.WhiteSpace)
                            //if(cChar!=' ' && cChar!='\t' && cChar!=13 && cChar!=10)
                            {
                                //cPeek=bHTML[iCurPos];
                                continue;
                            }

                            //if(cChar==(byte)'>')
                            // goto ReturnChunk;

                            //iCurPos--;
                            break;
                        }

                        if (iCurPos >= iDataLength)
                        {
                            cChar = 0;
                            cPeek = 0;
                            break;
                        }
                    }

                    //bWhiteSpaceHere=true;

                    // now, if we have already got attribute name it means that we need to go to parse value (which may not be present)
                    if (sText.iBufPos > 0)
                    {
                        // oChunk.Append((byte)' ');

                        iCurPos--;

                        if (iCurPos < iDataLength)
                            cPeek = bHTML[iCurPos];
                        else
                            cPeek = 0;

                        // ok, we have got attribute name and now we have got next char there

                        // most likely we have got = here  and then value
                        if (cPeek == (byte)'=')
                        {
                            //bEqualsSign=true;

                            // move forward one char
                            iCurPos++;

                            if (iCurPos < iDataLength)
                                cPeek = bHTML[iCurPos];
                            else
                                cPeek = 0;

                            break;
                        }

                        // or we can have end of tag itself, doh!
                        if (cPeek == (byte)'>')
                        {
                            // move forward one char
                            iCurPos++;

                            if (sText.iBufPos > 0)
                                oChunk.AddParam(sText.SetToStringASCII(), "", (byte)' ');

                            if (!oChunk.bClosure)
                                oChunk.oType = HTMLchunkType.OpenTag;

                            return oChunk;
                        }

                        // closure
                        if (cPeek == (byte)'/')
                        {
                            oChunk.bClosure = true;
                            oChunk.bEndClosure = true;
                            oChunk.oType = HTMLchunkType.CloseTag;
                            continue;
                        }

                        // ok, we have got new char starting after current attribute name is fully parsed
                        // this means the attribute name is on its own and the char we found is start
                        // of a new attribute
                        oChunk.AddParam(sText.SetToStringASCII(), "", (byte)' ');
                        sText.Clear();
                        goto AttributeParsing;
                    }

                }
                else
                {
                    // reuse Peeked char from previous run
                    //cChar=cPeek; iCurPos++;
                    if (iCurPos < iDataLength)
                        cChar = bHTML[iCurPos++];
                    else
                        cChar = 0;
                }

                if (iCurPos < iDataLength)
                    cPeek = bHTML[iCurPos];
                else
                    cPeek = 0;

                // most likely we should have lower-cased ASCII char here
                if (bCharType == (byte)TagCharType.LowerCasedASCIIorDigit)
                {
                    sText.bBuffer[sText.iBufPos++] = cChar;
                    // oChunk.Append(cChar);
                    continue;
                }

                // = with attribute value to follow
                if (cChar == (byte)'=')
                {
                    //bEqualsSign=true;
                    break;
                }

                // nope, we have got upper cased ASCII char - this seems to be LESS likely than > and /
                //if(cChar>=65 && cChar<=90)
                if (bCharType > 32)
                {
                    // bCharType in this case contains already lower-cased char
                    sText.bBuffer[sText.iBufPos++] = bCharType;
                    // oChunk.Append(bCharType);
                    continue;
                }

                // tag end - we did not have any params
                if (cChar == (byte)'>')
                {
                    if (sText.iBufPos > 0)
                        oChunk.AddParam(sText.SetToStringASCII(), "", (byte)' ');

                    if (!oChunk.bClosure)
                        oChunk.oType = HTMLchunkType.OpenTag;

                    return oChunk;
                }

                // closure of tag sign
                if (cChar == (byte)'/')
                {
                    oChunk.bClosure = true;
                    oChunk.bEndClosure = true;
                    oChunk.oType = HTMLchunkType.CloseTag;
                    continue;
                }

                // some other char
                sText.bBuffer[sText.iBufPos++] = cChar;
                // oChunk.Append(cChar);
            }

            if (cPeek == 0)
            {
                if (sText.iBufPos > 0)
                    oChunk.AddParam(sText.SetToStringASCII(), "", (byte)' ');

                if (!oChunk.bClosure)
                    oChunk.oType = HTMLchunkType.OpenTag;

                return oChunk;
            }

            sAttrName = sText.SetToStringASCII();

            AttributeValueParsing:

            /// ***********************************************************************
            /// STAGE 3: parse attribute value
            /// ***********************************************************************

            // the value could be just string, or in quotes (single or double)
            // or we can have next attribute name start, in which case we will jump back to attribute parsing

            // for tracking quotes purposes
            byte cQuotes = cPeek;

            int iValueStartOffset;

            // skip whitespace if any
            if (cPeek <= 32 && bTagCharTypes[cPeek] == (byte)TagCharType.WhiteSpace)
            {
                iCurPos++;

                // speculative loop unroll -- we have a very good chance of seeing non-space char next
                // so instead of setting up loop we will just read it directly, this should save ticks
                // on having to prepare while() loop
                if (iCurPos < iDataLength)
                    cPeek = bHTML[iCurPos];
                else
                {
                    iValueStartOffset = iCurPos - 1;
                    goto AttributeValueEnd;
                }

                //if(cChar==' ' || cChar=='\t' || cChar==13 || cChar==10)
                //if(cChar<=32 && bWhiteSpace[cChar]==1)
                if (cPeek <= 32 && bTagCharTypes[cPeek] == (byte)TagCharType.WhiteSpace)
                {

                    while (iCurPos < iDataLength)
                    {
                        cPeek = bHTML[iCurPos++];

                        if (cPeek <= 32 && bTagCharTypes[cPeek] == (byte)TagCharType.WhiteSpace)
                        //if(cChar!=' ' && cChar!='\t' && cChar!=13 && cChar!=10)
                        {
                            //cPeek=bHTML[iCurPos];
                            continue;
                        }

                        iCurPos--;
                        break;
                    }

                    if (iCurPos >= iDataLength)
                    {
                        iValueStartOffset = iCurPos - 1;
                        goto AttributeValueEnd;
                    }
                }

                cQuotes = cPeek;
            }

            // because we deal with VALUE of the attribute it means we can't lower-case it,
            // or skip whitespace (if in quotes), which in practice means that we don't need to copy
            // it to temporary string buffer, we can just remember starting offset and then create string from
            // data in bHTML

            // ok, first char can be one of the quote chars or something else
            if (cPeek != '\"' && cPeek != '\'')
            {
                iValueStartOffset = iCurPos;

                cQuotes = (byte)' ';
                // any other char here means we have value up until next whitespace or end of tag
                // this gives us good opportunity to scan fairly quickly without otherwise redundant
                // checks - this should happen fairly rarely, however loop dealing with data between quotes
                // will happen often enough and its best to eliminate as much stuff from it as possible
                //sText.bBuffer[sText.iBufPos++]=cPeek;

                // move to next char
                if (iCurPos < iDataLength)
                    cPeek = bHTML[iCurPos++];
                else
                {
                    goto AttributeValueEnd;
                }

                while (cPeek != 0)
                {
                    // if whitespace then we got our value and need to go back to param
                    if (cPeek <= 32 && bTagCharTypes[cPeek] == (byte)TagCharType.WhiteSpace)
                    {
                        oChunk.AddParam(sAttrName, GetString(iValueStartOffset, iCurPos - iValueStartOffset - 1), (byte)' ');
                        iCurPos--;
                        goto AttributeParsing;
                    }

                    // end of tag?
                    if (cPeek == (byte)'>')
                    {
                        //iCurPos--;
                        break;
                    }

                    if (iCurPos < iDataLength)
                        cPeek = bHTML[iCurPos++];
                    else
                    {
                        iCurPos = iDataLength + 1;
                        goto AttributeValueEnd;
                    }
                }

                // ok we are done, add outstanding attribute
                oChunk.AddParam(sAttrName, GetString(iValueStartOffset, iCurPos - iValueStartOffset - 1), (byte)' ');

                goto ReturnChunk;
            }

            // move one step forward
            iCurPos++;

            iValueStartOffset = iCurPos;

            if (iCurPos < iDataLength)
                cPeek = bHTML[iCurPos++];
            else
            {

                goto AttributeValueEnd;
            }

            // attribute value parsing from between two quotes
            while (cPeek != 0)
            {
                // check whether we have got possible entity (can be anything starting with &)
                if (cPeek == 38)
                {
                    int iPrevPos = iCurPos;

                    char cEntityChar = oE.CheckForEntity(bHTML, ref iCurPos, iDataLength);

                    // restore current symbol
                    if (cEntityChar == 0)
                    {
                        if (iCurPos < iDataLength)
                            cPeek = bHTML[iCurPos++];
                        else
                            break;

                        //sText.bBuffer[sText.iBufPos++]=38; //(byte)'&';;
                        continue;
                    }
                    else
                    {
                        // okay we have got an entity, our hope of not having to copy stuff into variable
                        // is over, we have to continue in a slower fashion :(
                        // but thankfully this should happen very rarely, so, annoying to code, but
                        // most codepaths will run very fast!
                        int iPreEntLen = iPrevPos - iValueStartOffset - 1;

                        // 14/05/08 need to clear text - it contains attribute name text
                        sText.Clear();

                        // copy previous data
                        if (iPreEntLen > 0)
                        {
                            Array.Copy(bHTML, iValueStartOffset, sText.bBuffer, 0, iPreEntLen);
                            sText.iBufPos = iPreEntLen;
                        }

                        // we have to skip now to next byte, since
                        // some converted chars might well be control chars like >
                        oChunk.bEntities = true;

                        if (cChar == (byte)'<')
                            oChunk.bLtEntity = true;

                        // unless is space we will ignore it
                        // note that this won't work if &nbsp; is defined as it should
                        // byte int value of 160, rather than 32.
                        //if(cChar!=' ')
                        sText.Append(cEntityChar);

                        if (iCurPos < iDataLength)
                            cPeek = bHTML[iCurPos++];
                        else
                        {

                            goto AttributeValueEnd;
                        }

                        // okay, we continue here using in effect new inside loop as we might have more entities here
                        // attribute value parsing from between two quotes
                        while (cPeek != 0)
                        {
                            // check whether we have got possible entity (can be anything starting with &)
                            if (cPeek == 38)
                            {
                                char cNewEntityChar = oE.CheckForEntity(bHTML, ref iCurPos, iDataLength);

                                // restore current symbol
                                if (cNewEntityChar != 0)
                                {
                                    if (cNewEntityChar == (byte)'<')
                                        oChunk.bLtEntity = true;

                                    sText.Append(cNewEntityChar);

                                    if (iCurPos < iDataLength)
                                        cPeek = bHTML[iCurPos++];
                                    else
                                        goto AttributeValueEnd;

                                    continue;
                                }
                            }

                            // check if is end of quotes
                            if (cPeek == cQuotes)
                            {
                                // ok we finished scanning it: add param with value and then go back to param name parsing
                                oChunk.AddParam(sAttrName, sText.SetToString(), cQuotes);

                                if (iCurPos < iDataLength)
                                    cPeek = bHTML[iCurPos];
                                else
                                    break;

                                goto AttributeParsing;
                            }

                            sText.bBuffer[sText.iBufPos++] = cPeek;
                            //sText.Append(cPeek);

                            if (iCurPos < iDataLength)
                                cPeek = bHTML[iCurPos++];
                            else
                                break;
                        }

                        oChunk.AddParam(sAttrName, sText.SetToString(), cQuotes);
                        goto ReturnChunk;
                    }
                }

                // check if is end of quotes
                if (cPeek == cQuotes)
                {
                    // ok we finished scanning it: add param with value and then go back to param name parsing
                    //sText.Clear();

                    oChunk.AddParam(sAttrName, GetString(iValueStartOffset, iCurPos - iValueStartOffset - 1), cQuotes);

                    if (iCurPos < iDataLength)
                        cPeek = bHTML[iCurPos];
                    else
                    {
                        //iCurPos++;
                        break;
                    }

                    goto AttributeParsing;
                }

                if (iCurPos < iDataLength)
                    cPeek = bHTML[iCurPos++];
                else
                {
                    //iCurPos++;
                    break;
                }
            }

            AttributeValueEnd:

            // ok we are done, add outstanding attribute
            int iLen = iCurPos - iValueStartOffset - 1;
            if (iLen > 0)
                oChunk.AddParam(sAttrName, GetString(iValueStartOffset, iLen), cQuotes);
            else
                oChunk.AddParam(sAttrName, "", cQuotes);

            ReturnChunk:

            if (oChunk.bClosure)
            {
                oChunk.oType = HTMLchunkType.CloseTag;
            }
            else
                oChunk.oType = HTMLchunkType.OpenTag;

            return oChunk;
        }
Ejemplo n.º 17
0
 /// <summary>
 /// Populates the <c>oHTML</c> member of the given chunk with the raw HTML it was parsed from:
 /// the <c>iChunkLength</c> bytes of <c>bHTML</c> starting at <c>iChunkOffset</c>, converted
 /// to a string via <c>oEnc.GetString</c>.
 /// </summary>
 /// <param name="oChunk">Chunk to update. Its offset and length are applied to this parser's
 /// <c>bHTML</c> buffer, so it should be a chunk returned by ParseNext of the same HTMLparser,
 /// initialised with the same HTML data</param>
 public void SetRawHTML(HTMLchunk oChunk)
 {
     // where in the parser's byte buffer this chunk lives
     var iRawStart = oChunk.iChunkOffset;
     var iRawLength = oChunk.iChunkLength;

     // note: keeping the raw bytes rather than a decoded string would arguably be
     // more faithful to the original data
     var sRawHTML = oEnc.GetString(bHTML, iRawStart, iRawLength);

     oChunk.oHTML = sRawHTML;
 }