/// <summary>
/// Returns the decoded body of the response when its main content type is
/// text/plain or text/html; any other (or missing) content type yields "".
/// </summary>
/// <param name="resp">The response whose body should be decoded.</param>
/// <returns>The decoded body text, or the empty string.</returns>
private static string GetBodyText(HttpWebResponse resp)
{
    // Read the property once; the original re-read it for the Length check.
    string contentType = resp.ContentType;
    if (contentType != null && contentType.Length > 0)
    {
        IDictionary contentTypeData = MimeHelper.ParseContentType(contentType, true);

        // By convention the parser stores the main type under the empty key.
        string mainType = (string)contentTypeData[""];
        switch (mainType)
        {
            // Both textual types are decoded identically — the original had
            // two duplicated case bodies; merged case labels are equivalent.
            case "text/plain":
            case "text/html":
                return DecodeBody(resp);
        }
    }
    return "";
}
/// <summary>
/// Gets the character set associatedated with the WebResponse.
/// </summary>
/// <param name="response">The WebResponse to inspect for a character set.</param>
/// <param name="characterSet">When this method returns, contains the character set associated with the
/// WebResponse if the character set is explicitly specified; otherwise, null. This parameter is passed
/// uninitialized.</param>
/// <returns>true if the WebResponse explicitly specifies a character set; otherwise, false.</returns>
private bool TryGetCharacterSet(HttpWebResponse response, out string characterSet)
{
    // Deliberately NOT using HttpWebResponse.CharacterSet: per the MSDN
    // community note at
    // http://msdn.microsoft.com/en-us/library/system.net.httpwebresponse.characterset(v=VS.80).aspx
    // that property silently falls back to ISO-8859-1 whenever the server
    // omits the charset from the Content-Type header, so callers can't tell
    // "explicitly declared" apart from "guessed default". Many real-world
    // servers omit it, and the correct encoding must then be sniffed from the
    // markup (e.g. a meta tag) instead. Parsing the Content-Type header
    // ourselves lets this method report honestly whether a charset was given.
    characterSet = null;

    string contentType = response.ContentType;
    if (String.IsNullOrEmpty(contentType))
        return false;

    IDictionary values = MimeHelper.ParseContentType(contentType, true);
    const string charset = "charset";
    if (!values.Contains(charset))
        return false;

    string parsed = values[charset] as string;
    if (String.IsNullOrEmpty(parsed))
        return false;

    // Sanity check: our manual parse should agree with the framework's value
    // whenever the charset really was declared.
    Debug.Assert(parsed.Equals(response.CharacterSet, StringComparison.OrdinalIgnoreCase), "CharacterSet was parsed incorrectly!");

    characterSet = parsed;
    return true;
}
/// <summary>
/// Produces a plain-text rendering of the response body: text/plain bodies
/// are decoded as-is, text/html bodies are thinned, converted to plain text,
/// and whitespace-compressed; anything else yields "".
/// </summary>
/// <param name="resp">The response whose body should be rendered as text.</param>
/// <returns>The body as plain text, or the empty string.</returns>
// NOTE(review): a method with this exact signature but simpler text/html
// handling appears earlier in this chunk — presumably a different enclosing
// class; verify they are not siblings in the same type.
private static string GetBodyText(HttpWebResponse resp)
{
    string contentType = resp.ContentType;
    if (contentType == null || contentType.Length == 0)
        return "";

    IDictionary contentTypeData = MimeHelper.ParseContentType(contentType, true);
    string mainType = (string)contentTypeData[""];

    if (mainType == "text/plain")
        return DecodeBody(resp);

    if (mainType == "text/html")
    {
        // Three-stage pipeline: thin the HTML, flatten it to plain text,
        // then collapse excess runs of whitespace.
        string thinned = LightWeightHTMLThinner2.Thin(DecodeBody(resp), true);
        string plainText = HTMLDocumentHelper.HTMLToPlainText(thinned);
        return StringHelper.CompressExcessWhitespace(plainText);
    }

    return "";
}