/// <summary>
/// Decode text that is encoded with the <see cref="EncodedWord"/> encoding.<br/>
///<br/>
/// This method will decode any encoded-word found in the string.<br/>
/// All parts which are not encoded will not be touched.<br/>
/// <br/>
/// From <a href="http://tools.ietf.org/html/rfc2047">RFC 2047</a>:<br/>
/// <code>
/// Generally, an "encoded-word" is a sequence of printable ASCII
/// characters that begins with "=?", ends with "?=", and has two "?"s in
/// between. It specifies a character set and an encoding method, and
/// also includes the original text encoded as graphic ASCII characters,
/// according to the rules for that encoding method.
/// </code>
/// Example:<br/>
/// <c>=?ISO-8859-1?q?this=20is=20some=20text?= other text here</c>
/// </summary>
/// <remarks>
/// See <a href="http://tools.ietf.org/html/rfc2047#section-2">RFC 2047 section 2</a> "Syntax of encoded-words" for more details.<br/>
/// Every encoded-word is replaced exactly once, at its own position in the input.
/// Decoded output is never scanned again, so decoded text that happens to look
/// like an encoded-word is returned verbatim.
/// </remarks>
/// <param name="encodedWords">Source text. May be content which is not encoded.</param>
/// <returns>Decoded text</returns>
/// <exception cref="ArgumentNullException">If <paramref name="encodedWords"/> is <see langword="null"/></exception>
/// <exception cref="ArgumentException">If an encoded-word uses an encoding other than "B" or "Q"</exception>
public static string Decode(string encodedWords)
{
    if (encodedWords == null)
    {
        throw new ArgumentNullException("encodedWords");
    }

    // Pattern for one encoded-word: "=?" charset "?" encoding "?" content "?="
    // \w Matches any word character including underscore. Equivalent to "[A-Za-z0-9_]".
    // \S Matches any nonwhite space character. Equivalent to "[^ \f\n\r\t\v]".
    // +? non-greedy equivalent to +
    // (?<NAME>REGEX) is a named group with name NAME and regular expression REGEX
    const string strRegEx = @"\=\?(?<Charset>\S+?)\?(?<Encoding>\w)\?(?<Content>.+?)\?\=";

    // Replace each match in place in a single pass. Calling string.Replace on the
    // accumulating result would also rewrite already-decoded text that happens to
    // contain the literal text of a later encoded-word.
    return Regex.Replace(encodedWords, strRegEx, match =>
    {
        string encodedText = match.Groups["Content"].Value;
        string encoding = match.Groups["Encoding"].Value;
        string charset = match.Groups["Charset"].Value;

        // Get the encoding which corresponds to the character set
        Encoding charsetEncoding = HeaderFieldParser.ParseCharsetToEncoding(charset);

        // Encoding may also be written in lowercase
        switch (encoding.ToUpperInvariant())
        {
            // RFC:
            // The "B" encoding is identical to the "BASE64"
            // encoding defined by RFC 2045.
            // http://tools.ietf.org/html/rfc2045#section-6.8
            case "B":
                return Base64.Decode(encodedText, charsetEncoding);

            // RFC:
            // The "Q" encoding is similar to the "Quoted-Printable" content-
            // transfer-encoding defined in RFC 2045.
            // There are more details to this. Please check
            // http://tools.ietf.org/html/rfc2047#section-4.2
            case "Q":
                return QuotedPrintable.DecodeEncodedWord(encodedText, charsetEncoding);

            default:
                throw new ArgumentException("The encoding " + encoding + " was not recognized");
        }
    });
}
/// <summary>
/// Decode text that is encoded with the <see cref="EncodedWord"/> encoding.<br/>
///<br/>
/// This method will decode any encoded-word found in the string.<br/>
/// All parts which are not encoded will not be touched.<br/>
/// <br/>
/// From <a href="http://tools.ietf.org/html/rfc2047">RFC 2047</a>:<br/>
/// <code>
/// Generally, an "encoded-word" is a sequence of printable ASCII
/// characters that begins with "=?", ends with "?=", and has two "?"s in
/// between. It specifies a character set and an encoding method, and
/// also includes the original text encoded as graphic ASCII characters,
/// according to the rules for that encoding method.
/// </code>
/// Example:<br/>
/// <c>=?ISO-8859-1?q?this=20is=20some=20text?= other text here</c>
/// </summary>
/// <remarks>
/// See <a href="http://tools.ietf.org/html/rfc2047#section-2">RFC 2047 section 2</a> "Syntax of encoded-words" for more details.<br/>
/// Whitespace that separates two encoded-words is removed before decoding.
/// Every encoded-word is then replaced exactly once, at its own position.
/// Decoded output is never scanned again, so decoded text that happens to look
/// like an encoded-word is returned verbatim.
/// </remarks>
/// <param name="encodedWords">Source text. May be content which is not encoded.</param>
/// <returns>Decoded text</returns>
/// <exception cref="ArgumentNullException">If <paramref name="encodedWords"/> is <see langword="null"/></exception>
/// <exception cref="ArgumentException">If an encoded-word uses an encoding other than "B" or "Q"</exception>
public static string Decode(string encodedWords)
{
    if (encodedWords == null)
    {
        throw new ArgumentNullException("encodedWords");
    }

    // Notice that RFC2231 redefines the BNF to
    // encoded-word := "=?" charset ["*" language] "?" encoded-text "?="
    // but no usage of this BNF has been spotted yet. It is here to
    // ease debugging if such a case is discovered.

    // This is the regex that should fit the BNF
    // RFC says that NO WHITESPACE is allowed in this encoding, but there are examples
    // where whitespace is there, and therefore this regex allows for such.
    // \w Matches any word character including underscore. Equivalent to "[A-Za-z0-9_]".
    // \S Matches any nonwhite space character. Equivalent to "[^ \f\n\r\t\v]".
    // +? non-greedy equivalent to +
    // (?<NAME>REGEX) is a named group with name NAME and regular expression REGEX
    const string encodedWordRegex = @"\=\?(?<Charset>\S+?)\?(?<Encoding>\w)\?(?<Content>.+?)\?\=";

    // Any amount of linear-space-white between 'encoded-word's,
    // even if it includes a CRLF followed by one or more SPACEs,
    // is ignored for the purposes of display.
    // http://tools.ietf.org/html/rfc2047#page-12
    // Define a regular expression that captures two encoded words with some whitespace between them
    const string replaceRegex = @"(?<first>" + encodedWordRegex + @")\s+(?<second>" + encodedWordRegex + ")";

    // Then, find an occurrence of such an expression, but remove the whitespace in between when found
    // Needs to be done twice for encodings such as "=?UTF-8?Q?a?= =?UTF-8?Q?b?= =?UTF-8?Q?c?="
    // to be replaced correctly
    encodedWords = Regex.Replace(encodedWords, replaceRegex, "${first}${second}");
    encodedWords = Regex.Replace(encodedWords, replaceRegex, "${first}${second}");

    // Replace each match in place in a single pass. Calling string.Replace on the
    // accumulating result would also rewrite already-decoded text that happens to
    // contain the literal text of a later encoded-word.
    return Regex.Replace(encodedWords, encodedWordRegex, match =>
    {
        string encodedText = match.Groups["Content"].Value;
        string encoding = match.Groups["Encoding"].Value;
        string charset = match.Groups["Charset"].Value;

        // Get the encoding which corresponds to the character set
        Encoding charsetEncoding = EncodingFinder.FindEncoding(charset);

        // Encoding may also be written in lowercase
        switch (encoding.ToUpperInvariant())
        {
            // RFC:
            // The "B" encoding is identical to the "BASE64"
            // encoding defined by RFC 2045.
            // http://tools.ietf.org/html/rfc2045#section-6.8
            case "B":
                return Base64.Decode(encodedText, charsetEncoding);

            // RFC:
            // The "Q" encoding is similar to the "Quoted-Printable" content-
            // transfer-encoding defined in RFC 2045.
            // There are more details to this. Please check
            // http://tools.ietf.org/html/rfc2047#section-4.2
            case "Q":
                return QuotedPrintable.DecodeEncodedWord(encodedText, charsetEncoding);

            default:
                throw new ArgumentException("The encoding " + encoding + " was not recognized");
        }
    });
}