예제 #1
0
        /// <summary>
        /// Called after an ampersand has been found: peeks forward from <paramref name="iCurPos"/>
        /// to check whether a character entity follows - a named one, a decimal one (<c>#65;</c>)
        /// or a hexadecimal one (<c>#x41;</c> / <c>#X41;</c>).
        /// </summary>
        /// <remarks>
        /// On success (non-zero result) the position is left at the first byte after the entity.
        /// When nothing is recognised the position is moved back by the number of bytes read here,
        /// i.e. to where it was on entry. Nothing is attempted when both <c>bDecodeEntities</c> and
        /// <c>bMiniEntities</c> are off, and numeric entities are ignored when <c>bMiniEntities</c> is on.
        /// </remarks>
        /// <param name="bHTML">Buffer being scanned</param>
        /// <param name="iCurPos">Position of the first byte to examine (expected to be the byte
        /// right after the ampersand); updated as described in the remarks</param>
        /// <param name="iDataLength">Scanning stops once the position reaches this value</param>
        /// <returns>Char (not byte) that corresponds to the entity or 0 if it was not entity</returns>
        internal char CheckForEntity(byte[] bHTML, ref int iCurPos, int iDataLength)
        {
            if (!bDecodeEntities && !bMiniEntities)
            {
                return((char)0);
            }

            // number of bytes consumed so far - used to restore the position on failure
            int  iChars = 0;
            byte cChar;

            // if true it means we are getting hex or decimal value of the byte
            bool bCharCode    = false;
            bool bCharCodeHex = false;

            // length of the entity body (name or digits) and the index at which it starts
            int iEntLen = 0;
            int iFrom   = iCurPos;

            string sEntity;

            try
            {
                while (iCurPos < iDataLength)
                {
                    cChar = bHTML[iCurPos++];

                    if (++iChars <= 2)
                    {
                        // the first byte for numbers should be #
                        if (iChars == 1)
                        {
                            if (cChar == '#')
                            {
                                iFrom++;
                                bCharCode = true;
                                continue;
                            }
                        }
                        else
                        {
                            // hex marker right after '#'; both cases are accepted
                            if (bCharCode && (cChar == 'x' || cChar == 'X'))
                            {
                                iFrom++;

                                // there is no "continue" here, so this byte reaches the
                                // iEntLen++ at the bottom of the loop - compensate for it
                                iEntLen--;
                                bCharCodeHex = true;
                            }
                        }
                    }

                    // Entity ends on:
                    // 1) ; - proper end of entity
                    // 2) number 10-based entity but current byte is not a number

                    // TODO: browsers appear to be lax about ; requirement for end of entity
                    // we should really do the same and treat whitespace as termination of entity
                    if (cChar == ';' || (bCharCode && !bCharCodeHex && !(cChar >= '0' && cChar <= '9')))
                    {
                        // lets try speculative quick lookup using just first 2 chars
                        // this should be successful in almost all cases thus removing need for
                        // expensive creation of a string
                        if (!bCharCode && iEntLen > 1)
                        {
                            object oChar = oEntities.GetLikelyPresentValue(bHTML[iFrom], bHTML[iFrom + 1]);

                            if (oChar != null)
                            {
                                return((char)((int)oChar));
                            }
                        }

                        // check if its int - this way we can avoid having to create string
                        if (bCharCode && iEntLen > 0 && !bCharCodeHex)
                        {
                            // if mini entities mode is set then we will ignore all numerics
                            if (bMiniEntities)
                            {
                                break;
                            }

                            // we have to backdown one char in case when entity did not end with ;
                            // otherwise we will lose next char in the stream, this correction suggested by Kurt Carlson!
                            if (cChar != ';')
                            {
                                iCurPos--;
                            }

                            return((char)ParseUInt(bHTML, iFrom, iEntLen));
                        }

                        sEntity = Encoding.Default.GetString(bHTML, iFrom, iEntLen);

                        if (bCharCode)
                        {
                            // NOTE: this may fail due to wrong data format,
                            // in which case we will return 0, and entity will be
                            // ignored
                            if (iEntLen > 0)
                            {
                                // if mini entities mode is set then we will ignore all numerics
                                if (bMiniEntities)
                                {
                                    break;
                                }

                                int iChar;

                                if (!bCharCodeHex)
                                {
#if DOTNET20
                                    // we want to avoid exceptions if possible as they are slow
                                    if (!int.TryParse(sEntity, out iChar))
                                    {
                                        if (iChars > 0)
                                        {
                                            if ((iCurPos - iChars) >= 0)
                                            {
                                                iCurPos -= iChars;
                                            }
                                        }

                                        return((char)(0));
                                    }
#else
                                    iChar = int.Parse(sEntity);
#endif
                                }
                                else
                                {
#if DOTNET20
                                    // we want to avoid exceptions if possible as they are very slow
                                    if (!int.TryParse(sEntity, System.Globalization.NumberStyles.HexNumber, null, out iChar))
                                    {
                                        if (iChars > 0)
                                        {
                                            if ((iCurPos - iChars) >= 0)
                                            {
                                                iCurPos -= iChars;
                                            }
                                        }

                                        return((char)(0));
                                    }
#else
                                    iChar = int.Parse(sEntity, NumberStyles.HexNumber);
#endif
                                }

                                return((char)iChar);
                            }
                        }

                        if (iEntLen >= iMinEntityLen && iEntLen <= iMaxEntityLen)
                        {
                            object oChar = oEntities.GetLikelyPresentValue(sEntity);

                            if (oChar != null)
                            {
                                return((char)((int)oChar));
                            }
                        }

                        // The entity has ended and nothing above recognised it, so this is not
                        // an entity. Stop here: scanning on would treat the terminator itself as
                        // part of the entity body (e.g. "#a;" would later hand the non-digit
                        // byte 'a' to ParseUInt and return whatever that produces).
                        break;
                    }

                    // as soon as entity length exceed max length of entity known to us
                    // we break up the loop and return nothing found
                    if (iEntLen > iMaxEntityLen)
                    {
                        break;
                    }

                    iEntLen++;
                }
            }
            catch
            {
                // deliberately best effort: a failure while parsing (for example int.Parse
                // throwing on malformed digits in non-DOTNET20 builds) means "not an entity"
            }

            // if we have not found squat, then we will need to put point back
            // to where it was before this function was called
            if (iChars > 0)
            {
                if ((iCurPos - iChars) >= 0)
                {
                    iCurPos -= iChars;
                }
            }

            return((char)(0));
        }
예제 #2
0
        /// <summary>
        /// This function will decode any entities found in a string - not fast!
        /// Named entities are resolved through <c>oAllEntities</c>; numeric references are
        /// accepted in decimal (<c>&amp;#65;</c>) and hexadecimal (<c>&amp;#x41;</c> / <c>&amp;#X41;</c>) form.
        /// </summary>
        /// <remarks>
        /// Anything that starts with an ampersand but cannot be decoded is copied to the output
        /// unchanged. A numeric reference does not need the closing semicolon: a decimal one ends
        /// at the first non-digit (which is kept in the output) and either kind may end at the end
        /// of the string. Code points above U+FFFF are emitted as a surrogate pair; values above
        /// U+10FFFF are treated as undecodable.
        /// If an unexpected exception occurs the original string is returned as is.
        /// </remarks>
        /// <param name="sData">Text to decode; null or empty input is returned unchanged</param>
        /// <returns>Possibly decoded string</returns>
        internal static string DecodeEntities(string sData)
        {
            // nothing to decode: null/empty input or no ampersand at all
            if (sData == null || sData.Length == 0 || sData.IndexOf('&') < 0)
            {
                return(sData);
            }

            char cChar;

            StringBuilder oSB = new StringBuilder(sData.Length);

            try
            {
                for (int i = 0; i < sData.Length; i++)
                {
                    cChar = sData[i];

                    // ordinary char, or an ampersand with nothing after it
                    if (cChar != '&' || (i + 1 >= sData.Length))
                    {
                        oSB.Append(cChar);
                        continue;
                    }

                    // if true it means we are getting hex or decimal value of the byte
                    bool bCharCode    = false;
                    bool bCharCodeHex = false;

                    // length of the entity body (name or digits) and number of chars examined
                    int iEntLen = 0;
                    int iChars  = 0;

                    // index at which the entity body starts (moved past "#" and "x")
                    int iFrom = i + 1;

                    int j = i + 1;

                    for (; j < sData.Length; j++)
                    {
                        cChar = sData[j];

                        if (++iChars <= 2)
                        {
                            // the first char for numbers should be #
                            if (iChars == 1)
                            {
                                if (cChar == '#')
                                {
                                    iFrom++;
                                    bCharCode = true;
                                    continue;
                                }
                            }
                            else if (bCharCode && (cChar == 'x' || cChar == 'X') && !bCharCodeHex)
                            {
                                // hex marker right after '#'; both cases are accepted
                                iFrom++;
                                bCharCodeHex = true;
                                continue;
                            }
                        }

                        bool bLastChar = j + 1 >= sData.Length;
                        bool bDigit    = cChar >= '0' && cChar <= '9';

                        // Entity ends on:
                        // 1) ; - proper end of entity
                        // 2) number 10-based entity but current char is not a number
                        // 3) numeric entity that runs into the end of the string
                        if (cChar == ';' || (bCharCode && !bCharCodeHex && !bDigit) || (bCharCode && bLastChar))
                        {
                            // At the end of the string the current char belongs to the number
                            // only if it can be a digit; a non-digit that merely ends a decimal
                            // reference must not be swallowed into it.
                            bool bLastCharInNumber = bLastChar && cChar != ';' && (bCharCodeHex || bDigit);

                            if (bLastCharInNumber)
                            {
                                iEntLen++;
                            }

                            if (bCharCode)
                            {
                                if (iEntLen > 0 && TryAppendCharCode(oSB, sData.Substring(iFrom, iEntLen), bCharCodeHex))
                                {
                                    // move back once when we got number done without ; at the end
                                    // of it - Firefox and IE do it this way; the char that ended
                                    // the number is then handled by the outer loop
                                    if (cChar != ';' && !bLastCharInNumber)
                                    {
                                        j--;
                                    }

                                    break;
                                }
                            }
                            else
                            {
                                // lets try speculative quick lookup using just first 2 chars
                                // this should be successful in almost all cases thus removing need for
                                // expensive creation of a string; chars that do not fit into a byte
                                // are not offered to it because the cast would silently turn them
                                // into a different (possibly matching) value
                                if (iEntLen > 1 && i + 2 < sData.Length
                                    && sData[i + 1] <= byte.MaxValue && sData[i + 2] <= byte.MaxValue)
                                {
                                    object oQuick = oAllEntities.GetLikelyPresentValue((byte)sData[i + 1], (byte)sData[i + 2]);

                                    if (oQuick != null)
                                    {
                                        oSB.Append((char)((int)oQuick));
                                        break;
                                    }
                                }

                                if (iEntLen >= iAllMinEntityLen && iEntLen <= iAllMaxEntityLen)
                                {
                                    object oChar = oAllEntities.GetLikelyPresentValue(sData.Substring(iFrom, iEntLen));

                                    if (oChar != null)
                                    {
                                        oSB.Append((char)((int)oChar));
                                        break;
                                    }
                                }
                            }

                            // the entity has ended but could not be decoded - probably broken
                            // or maybe not entity at all: keep the ampersand and let the outer
                            // loop copy the rest as ordinary text
                            oSB.Append('&');
                            j = i;
                            break;
                        }

                        // as soon as entity length exceed max length of entity known to us
                        // (or the string ends inside a name) we give up on this ampersand
                        if (iEntLen > iAllMaxEntityLen || bLastChar)
                        {
                            // append char that triggered entity thingy in the first place
                            oSB.Append('&');
                            j = i;
                            break;
                        }

                        iEntLen++;
                    }

                    // The inner loop only runs off the end without a break when the string
                    // stops right after "#" or "#x"; keep those chars instead of dropping them.
                    if (j >= sData.Length)
                    {
                        oSB.Append('&');
                        j = i;
                    }

                    i = j;
                }
            }
            catch (Exception oEx)
            {
                Console.WriteLine("Entity parsing exception: " + oEx.ToString());

                return(sData);
            }

            return(oSB.ToString());
        }

        /// <summary>
        /// Parses the digits of a numeric character reference and, when they denote a usable
        /// code point, appends the corresponding character(s) to <paramref name="oSB"/>.
        /// </summary>
        /// <param name="oSB">Builder that receives the decoded character(s)</param>
        /// <param name="sDigits">Digits only, without the leading "#"/"x" and without ";"</param>
        /// <param name="bHex">True when the digits are hexadecimal, false for decimal</param>
        /// <returns>True if something was appended; false if the digits are malformed (signs and
        /// whitespace are not accepted), overflow, or denote a value above U+10FFFF</returns>
        private static bool TryAppendCharCode(StringBuilder oSB, string sDigits, bool bHex)
        {
            uint uCode;

            // strict styles: plain digits only
            System.Globalization.NumberStyles eStyle = bHex
                ? System.Globalization.NumberStyles.AllowHexSpecifier
                : System.Globalization.NumberStyles.None;

#if DOTNET20
            // we want to avoid exceptions if possible as they are slow
            if (!uint.TryParse(sDigits, eStyle, System.Globalization.CultureInfo.InvariantCulture, out uCode))
            {
                return(false);
            }
#else
            try
            {
                uCode = uint.Parse(sDigits, eStyle, System.Globalization.CultureInfo.InvariantCulture);
            }
            catch
            {
                // some numbers might not be parsed correctly so we will ignore them
                return(false);
            }
#endif

            if (uCode <= 0xFFFF)
            {
                oSB.Append((char)uCode);
                return(true);
            }

            if (uCode <= 0x10FFFF)
            {
                // does not fit into a single char: emit it as a UTF-16 surrogate pair
                uint uOffset = uCode - 0x10000;

                oSB.Append((char)(0xD800 + (uOffset >> 10)));
                oSB.Append((char)(0xDC00 + (uOffset & 0x3FF)));
                return(true);
            }

            return(false);
        }