示例#1
0
        /// <summary>
        /// Handles META tags that set page encoding.
        /// </summary>
        /// <remarks>
        /// Only <c>http-equiv</c> values of <c>content-type</c> and <c>content-category</c> are
        /// recognised; the comparison ignores case. When such a tag is found and
        /// <paramref name="encodingSet"/> is still false, the tag's <c>content</c> value is handed to
        /// <c>p.SetEncoding</c> and the flag is switched to true if that call succeeds.
        /// </remarks>
        /// <param name="p">HTML parser object that is used for parsing; its <c>SetEncoding</c> method is
        /// called with the META tag's <c>content</c> value</param>
        /// <param name="chunk">Parsed chunk that should contain tag META; its parameters are converted to
        /// hash form here if the chunk is not already in hash mode</param>
        /// <param name="encodingSet">Your own flag that shows whether encoding was already set or not, if set
        /// once then it should not be changed - this is the logic applied by major browsers</param>
        /// <returns>True if this was a META tag with one of the recognised <c>http-equiv</c> values (even
        /// when the encoding itself was left unchanged), false otherwise</returns>
        public static bool HandleMetaEncoding(HTMLparser p, HTMLchunk chunk, ref bool encodingSet)
        {
            // Only the exact lower-case name "meta" is handled. A single inequality test is enough
            // and (for a string Tag) also copes with a missing tag name instead of dereferencing it.
            if (chunk.Tag != "meta")
            {
                return false;
            }

            // if we do not use hashmode already then we call conversion explicitly
            // this is slow, but METAs are very rare so performance penalty is low
            if (!chunk.HashMode)
            {
                chunk.ConvertParamsToHash();
            }

            var key = chunk.Params["http-equiv"] as string;
            if (key == null)
            {
                return false;
            }

            // Ordinal case-insensitive comparison: no lower-cased copy of the key is allocated and
            // the result does not depend on the current culture.
            // "content-category" is a rare case (appears to work in IE) reported to exist in some
            // pages by Martin Bächtold.
            bool isEncodingHint =
                string.Equals(key, "content-type", System.StringComparison.OrdinalIgnoreCase) ||
                string.Equals(key, "content-category", System.StringComparison.OrdinalIgnoreCase);
            if (!isEncodingHint)
            {
                return false;
            }

            // we might have charset here that may hint at necessity to decode page
            // once encoding is set it should not be changed, but you can be sure
            // there are web pages out there that try to do that
            if (!encodingSet)
            {
                var data = chunk.Params["content"] as string;

                // it is possible we have broken META tag without Content part
                if (data != null && p.SetEncoding(data))
                {
                    // here you need to reencode any text that you found so far
                    // most likely it will be just TITLE, the rest can be ignored anyway
                    encodingSet = true;
                }
                // otherwise: failed to set encoding - most likely encoding string was incorrect
                // or your machine lacks codepages or something else - might be a good idea to
                // put a warning message here
            }

            return true;
        }
示例#2
0
 /// <summary>
 /// Initialises the tag parser by storing the supplied collaborators and input buffer in its fields.
 /// </summary>
 /// <param name="p">Parser instance kept in <c>_p</c></param>
 /// <param name="chunk">Chunk object kept in <c>_chunk</c></param>
 /// <param name="text">Text buffer kept in <c>_text</c></param>
 /// <param name="html">Byte buffer kept in <c>_html</c> (presumably the raw HTML being parsed - confirm with caller)</param>
 /// <param name="dataLength">Length value kept in <c>_dataLength</c>; also used to derive the heuristics limit</param>
 /// <param name="e">Entities helper kept in <c>_e</c></param>
 /// <param name="he">Heuristics helper kept in <c>_he</c></param>
 internal void Init(HTMLparser p, HTMLchunk chunk, DynaString text, byte[] html, int dataLength, HTMLentities e, HTMLheuristics he)
 {
     // collaborating objects
     this._p     = p;
     this._chunk = chunk;
     this._text  = text;
     this._e     = e;
     this._he    = he;

     // input buffer and its length
     this._html       = html;
     this._dataLength = dataLength;

     // heuristics must stop short of the end of data, so keep a safety margin
     this._maxHeuDataLength = dataLength - MIN_DATA_SIZE_FOR_HEURISTICS;
 }