/// <summary>
/// Handles META tags that set page encoding.
/// </summary>
/// <param name="p">HTML parser object that is used for parsing; its SetEncoding method is
/// called with the value of the tag's "content" attribute</param>
/// <param name="chunk">Parsed chunk that should contain tag META</param>
/// <param name="encodingSet">Your own flag that shows whether encoding was already set or not, if set
/// once then it should not be changed - this is the logic applied by major browsers. It is switched
/// to true here when the parser accepts the encoding found in the tag</param>
/// <returns>True if this was a META tag whose http-equiv is "content-type" or "content-category"
/// (even when the encoding itself was not changed), false otherwise</returns>
public static bool HandleMetaEncoding(HTMLparser p, HTMLchunk chunk, ref bool encodingSet)
{
    // cheap length and first-character checks come first so that most non-META tags
    // are rejected before the full string comparison runs
    if (chunk.Tag.Length != 4 || chunk.Tag[0] != 'm' || chunk.Tag != "meta")
    {
        return false;
    }

    // if we do not use hashmode already then we call conversion explicitly
    // this is slow, but METAs are very rare so performance penalty is low
    if (!chunk.HashMode)
    {
        chunk.ConvertParamsToHash();
    }

    var key = chunk.Params["http-equiv"] as string;
    if (key == null)
    {
        return false;
    }

    // ordinal case-insensitive comparison: no temporary lower-cased string is allocated
    // and the result does not depend on the current culture.
    // "content-category" is a rare case (appears to work in IE) reported to exist
    // in some pages by Martin Bächtold
    bool isEncodingMeta =
        string.Equals(key, "content-type", System.StringComparison.OrdinalIgnoreCase) ||
        string.Equals(key, "content-category", System.StringComparison.OrdinalIgnoreCase);

    if (!isEncodingMeta)
    {
        return false;
    }

    // we might have charset here that may hint at necessity to decode page.
    // once encoding is set it should not be changed, even though there are
    // web pages out there that try to do exactly that
    if (!encodingSet)
    {
        // it is possible we have broken META tag without Content part
        var data = chunk.Params["content"] as string;

        if (data != null && p.SetEncoding(data))
        {
            // here you need to reencode any text that you found so far
            // most likely it will be just TITLE, the rest can be ignored anyway
            encodingSet = true;
        }

        // when SetEncoding fails the encoding string was most likely incorrect or the
        // machine lacks the codepage - might be good idea to put warning message here
    }

    return true;
}
/// <summary>
/// Initialises the tag parser by storing the collaborators and the input buffer
/// it will work with.
/// </summary>
/// <param name="p">Owning HTML parser</param>
/// <param name="chunk">Chunk object kept for later use by this tag parser</param>
/// <param name="text">Dynamic string buffer kept for later use by this tag parser</param>
/// <param name="html">Byte array holding the HTML data</param>
/// <param name="dataLength">Length of the data (presumably the number of valid bytes in
/// <paramref name="html"/> - confirm against caller)</param>
/// <param name="e">HTML entities helper</param>
/// <param name="he">HTML heuristics helper</param>
internal void Init(HTMLparser p, HTMLchunk chunk, DynaString text, byte[] html, int dataLength, HTMLentities e, HTMLheuristics he)
{
    // collaborating objects
    _p = p;
    _chunk = chunk;
    _text = text;
    _e = e;
    _he = he;

    // raw input and its length
    _html = html;
    _dataLength = dataLength;

    // heuristics must stop short of the end of data, so their upper bound is
    // the data length reduced by the minimum size heuristics need
    _maxHeuDataLength = _dataLength - MIN_DATA_SIZE_FOR_HEURISTICS;
}