/// <summary>
/// Verifies that the decoder does not run a second decoding pass over its own output.
/// </summary>
public void DoNotDoubleDecode()
{
    Encoding latin1 = Encoding.GetEncoding("iso-8859-1");

    // "=3D" is an encoded equal sign and the trailing "3D" is literal text.
    // The result "=3D" must be left alone, even though it looks like an encoded equal sign.
    const string encoded = "=3D3D";
    const string expected = "=3D";

    string decoded = QuotedPrintable.DecodeEncodedWord(encoded, latin1);

    Assert.AreEqual(expected, decoded);
}
/// <summary>
/// Verifies that a sentence containing soft line breaks ("=" followed by CRLF)
/// is joined back into a single line.
/// </summary>
public void RFCExample()
{
    Encoding latin1 = Encoding.GetEncoding("iso-8859-1");

    const string encoded = "Now is the time =\r\nfor all folk to come=\r\n to the aid of their country.";
    const string expected = "Now is the time for all folk to come to the aid of their country.";

    string decoded = QuotedPrintable.DecodeEncodedWord(encoded, latin1);

    Assert.AreEqual(expected, decoded);
}
/// <summary>
/// Verifies that literal spaces and tabs pass through the decoder unchanged.
/// </summary>
public void LiteralSpaceAndTabSupported()
{
    Encoding latin1 = Encoding.GetEncoding("iso-8859-1");

    // The text contains nothing that is encoded, so the output must equal the input.
    const string encoded = "Test for space\tand\ttabs";
    const string expected = encoded;

    string decoded = QuotedPrintable.DecodeEncodedWord(encoded, latin1);

    Assert.AreEqual(expected, decoded);
}
/// <summary>
/// Verifies that printable literal characters are passed through unchanged,
/// with the single exception of "_" which is decoded to a space.
/// </summary>
public void DoNotTouchLiterals()
{
    // The characters "=" and "?" are deliberately absent from the input.
    // The uppercase run is the full alphabet A-Z (the earlier "KLMNIOQ" typo left 'P' untested).
    const string input = "!\"#$%&'()*+,-./0123456789:;<>@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~";

    // Only change is that _ delimits SPACE
    const string expectedOutput = "!\"#$%&'()*+,-./0123456789:;<>@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^ `abcdefghijklmnopqrstuvwxyz{|}~";

    string output = QuotedPrintable.DecodeEncodedWord(input, Encoding.GetEncoding("iso-8859-1"));

    Assert.AreEqual(expectedOutput, output);
}
/// <summary>
/// Verifies that two consecutive encoded equal signs are both decoded.
/// </summary>
public void CanDecodeEqualSignTwoTimes()
{
    Encoding latin1 = Encoding.GetEncoding("iso-8859-1");

    const string encoded = "=3D=3D";
    const string expected = "==";

    string decoded = QuotedPrintable.DecodeEncodedWord(encoded, latin1);

    Assert.AreEqual(expected, decoded);
}
/// <summary>
/// Verifies that an underscore is decoded to a space.
/// </summary>
public void CanDecodeUnderscoreToSpace()
{
    Encoding latin1 = Encoding.GetEncoding("iso-8859-1");

    // In an encoded word, "_" stands for a space
    const string encoded = "_";
    const string expected = " ";

    string decoded = QuotedPrintable.DecodeEncodedWord(encoded, latin1);

    Assert.AreEqual(expected, decoded);
}
/// <summary>
/// Verifies that encoded bytes are interpreted using the supplied character set,
/// here code page 1252.
/// </summary>
public void CanHandleWindows1252Encoding()
{
    Encoding windows1252 = Encoding.GetEncoding(1252);

    // Bytes 0xC5, 0xF7 and 0x96 interpreted as Windows-1252
    // http://en.wikipedia.org/wiki/Windows-1252
    const string encoded = "=C5=F7=96";
    const string expected = "Å÷–";

    string decoded = QuotedPrintable.DecodeEncodedWord(encoded, windows1252);

    Assert.AreEqual(expected, decoded);
}
/// <summary>
/// Verifies decoding of a sentence mixing encoded non-ASCII characters,
/// an underscore-encoded space and plain literals.
/// </summary>
public void CanDecodeSpanishSentence()
{
    Encoding latin1 = Encoding.GetEncoding("iso-8859-1");

    // The space between the words is written as "_" in the input
    const string encoded = "=A1Hola,_se=F1or!";
    const string expected = "¡Hola, señor!";

    string decoded = QuotedPrintable.DecodeEncodedWord(encoded, latin1);

    Assert.AreEqual(expected, decoded);
}
/// <summary>
/// Verifies that a soft line break on its own decodes to an empty string.
/// </summary>
public void CanDecodeSoftLineBreak()
{
    Encoding latin1 = Encoding.GetEncoding("iso-8859-1");

    // An "=" directly followed by CRLF is a soft line break and must produce no output
    const string encoded = "=\r\n";
    const string expected = "";

    string decoded = QuotedPrintable.DecodeEncodedWord(encoded, latin1);

    Assert.AreEqual(expected, decoded);
}
/// <summary>
/// Verifies that lowercase hex digits after "=" are accepted and decoded
/// exactly as their uppercase counterparts would be.
/// </summary>
public void ImplementationShouldBeRobustSmallHexDigitsAfterEqual()
{
    Encoding latin1 = Encoding.GetEncoding("iso-8859-1");

    const string encoded = "=3d=a1";
    const string expected = "=¡";

    // Lowercase hex must not make the decoder throw ...
    Assert.DoesNotThrow(delegate { QuotedPrintable.DecodeEncodedWord(encoded, latin1); });

    // ... and the result must match what uppercase hex would give
    string decoded = QuotedPrintable.DecodeEncodedWord(encoded, latin1);
    Assert.AreEqual(expected, decoded);
}
/// <summary>
/// Verifies that carriage returns and newlines which do not form a proper
/// "\r\n" pair are removed without any exception being thrown.
/// </summary>
public void ImplementationShouldBeRobustControlCharactersCarriageReturnNewlineNotPair()
{
    Encoding latin1 = Encoding.GetEncoding("iso-8859-1");

    // The leading pair is in the wrong order ("\n\r"), and the remaining
    // "\n" and "\r" stand alone - none of them are allowed
    const string encoded = "\n\runit\ned\r";

    // Every illegal control character is expected to be dropped
    const string expected = "united";

    // Illegal input must not cause an exception
    Assert.DoesNotThrow(delegate { QuotedPrintable.DecodeEncodedWord(encoded, latin1); });

    // The remaining text must come through intact
    string decoded = QuotedPrintable.DecodeEncodedWord(encoded, latin1);
    Assert.AreEqual(expected, decoded);
}
/// <summary>
/// Verifies decoding of a longer sentence that combines an encoded equal sign,
/// an encoded space and a soft line break.
/// </summary>
public void CanDecodeLongSentence()
{
    Encoding latin1 = Encoding.GetEncoding("iso-8859-1");

    // "=3D" is an equal sign, "=20" is a space and "=\r\n" is a soft line break
    const string encoded = "If you believe that truth=3Dbeauty, then surely=20=\r\nmathematics is the most beautiful branch of philosophy.";
    const string expected = "If you believe that truth=beauty, then surely mathematics is the most beautiful branch of philosophy.";

    // Decoding must not throw
    Assert.DoesNotThrow(delegate { QuotedPrintable.DecodeEncodedWord(encoded, latin1); });

    // Decoding must yield the plain sentence
    string decoded = QuotedPrintable.DecodeEncodedWord(encoded, latin1);
    Assert.AreEqual(expected, decoded);
}
/// <summary>
/// Verifies that an "=" followed by characters that are not hex digits
/// is left untouched and does not cause an exception.
/// </summary>
public void ImplementationShouldBeRobustNoHexAfterEqual()
{
    Encoding latin1 = Encoding.GetEncoding("iso-8859-1");

    // Illegal input: the RFC requires a hex string after the equal sign.
    // The RFC also asks parsers to be robust and, in this situation,
    // to leave the input as it is.
    const string encoded = "=PK";
    const string expected = encoded;

    // Illegal input must not cause an exception
    Assert.DoesNotThrow(delegate { QuotedPrintable.DecodeEncodedWord(encoded, latin1); });

    // The output must be identical to the input
    string decoded = QuotedPrintable.DecodeEncodedWord(encoded, latin1);
    Assert.AreEqual(expected, decoded);
}
/// <summary>
/// Verifies that a lone "=" with nothing after it is returned unchanged
/// and does not cause an exception.
/// </summary>
public void ImplementationShouldBeRobustNothingAfterEqual()
{
    Encoding latin1 = Encoding.GetEncoding("iso-8859-1");

    // Illegal input: the RFC requires something to follow the equal sign.
    // The RFC also asks parsers to be robust, treating such input as not encoded at all.
    const string encoded = "=";
    const string expected = "=";

    // Illegal input must not cause an exception
    Assert.DoesNotThrow(delegate { QuotedPrintable.DecodeEncodedWord(encoded, latin1); });

    // The equal sign must come through as-is
    string decoded = QuotedPrintable.DecodeEncodedWord(encoded, latin1);
    Assert.AreEqual(expected, decoded);
}
/// <summary>
/// Verifies that illegal control characters are removed from the output while
/// a "\r\n" pair and a horizontal tab are kept, without any exception being thrown.
/// </summary>
public void ImplementationShouldBeRobustControlCharacters()
{
    Encoding latin1 = Encoding.GetEncoding("iso-8859-1");

    char[] controlCharacters = new char[]
    {
        '\a',     // Bell / alert
        '\b',     // Backspace
        '\f',     // Form feed
        '\0',     // Null character
        '\r',     // Carriage return - allowed if used with \r\n
        '\n',     // Newline - allowed if used with \r\n
        '\t',     // Horizontal tab - allowed
        '\u007F'  // Delete
    };
    string encoded = new string(controlCharacters);

    // Only the "\r\n" pair and the tab are expected to survive
    const string expected = "\r\n\t";

    // Illegal input must not cause an exception
    Assert.DoesNotThrow(delegate { QuotedPrintable.DecodeEncodedWord(encoded, latin1); });

    // All other control characters must have been removed
    string decoded = QuotedPrintable.DecodeEncodedWord(encoded, latin1);
    Assert.AreEqual(expected, decoded);
}
/// <summary>
/// Decode text that is encoded with the <see cref="EncodedWord"/> encoding.<br/>
/// <br/>
/// This method will decode any encoded-word found in the string.<br/>
/// All parts which are not encoded will not be touched.<br/>
/// <br/>
/// From <a href="http://tools.ietf.org/html/rfc2047">RFC 2047</a>:<br/>
/// <code>
/// Generally, an "encoded-word" is a sequence of printable ASCII
/// characters that begins with "=?", ends with "?=", and has two "?"s in
/// between. It specifies a character set and an encoding method, and
/// also includes the original text encoded as graphic ASCII characters,
/// according to the rules for that encoding method.
/// </code>
/// Example:<br/>
/// <c>=?ISO-8859-1?q?this=20is=20some=20text?= other text here</c>
/// </summary>
/// <remarks>See <a href="http://tools.ietf.org/html/rfc2047#section-2">RFC 2047 section 2</a> "Syntax of encoded-words" for more details</remarks>
/// <param name="encodedWords">Source text. May be content which is not encoded.</param>
/// <returns>Decoded text</returns>
/// <exception cref="ArgumentNullException">If <paramref name="encodedWords"/> is <see langword="null"/></exception>
/// <exception cref="ArgumentException">If an encoded-word specifies an encoding other than "B" or "Q"</exception>
public static string Decode(string encodedWords)
{
    if (encodedWords == null)
    {
        throw new ArgumentNullException("encodedWords");
    }

    // Notice that RFC2231 redefines the BNF to
    // encoded-word := "=?" charset ["*" language] "?" encoded-text "?="
    // but no usage of this BNF has been spotted yet. It is mentioned here to
    // ease debugging if such a case is discovered.

    // This is the regex that should fit the BNF.
    // The RFC says that NO WHITESPACE is allowed in this encoding, but there are examples
    // where whitespace is there, and therefore this regex allows for such.
    //
    // \w             Matches any word character including underscore.
    // \S             Matches any non-whitespace character.
    // +?             Non-greedy equivalent to +
    // (?<NAME>REGEX) Named group with name NAME and regular expression REGEX
    const string encodedWordRegex = @"\=\?(?<Charset>\S+?)\?(?<Encoding>\w)\?(?<Content>.+?)\?\=";

    // Any amount of linear-space-white between 'encoded-word's,
    // even if it includes a CRLF followed by one or more SPACEs,
    // is ignored for the purposes of display.
    // http://tools.ietf.org/html/rfc2047#page-12
    //
    // This expression captures two encoded words with some whitespace between them
    const string replaceRegex = @"(?<first>" + encodedWordRegex + @")\s+(?<second>" + encodedWordRegex + ")";

    // Remove the whitespace between adjacent encoded words.
    // One pass consumes both words of every pair it matches, so the whitespace following
    // the second word of a pair is only seen by a second pass. This is needed for input
    // such as "=?UTF-8?Q?a?= =?UTF-8?Q?b?= =?UTF-8?Q?c?=".
    encodedWords = Regex.Replace(encodedWords, replaceRegex, "${first}${second}");
    encodedWords = Regex.Replace(encodedWords, replaceRegex, "${first}${second}");

    // Substitute every encoded word, at its own position, with its decoded text.
    // Using a match evaluator (instead of String.Replace on the partially decoded
    // result) guarantees that text which has already been decoded is never
    // matched against a later encoded word and decoded a second time.
    return Regex.Replace(encodedWords, encodedWordRegex, delegate(Match match)
    {
        string encodedText = match.Groups["Content"].Value;
        string encoding = match.Groups["Encoding"].Value;
        string charset = match.Groups["Charset"].Value;

        // Get the encoding which corresponds to the character set
        System.Text.Encoding charsetEncoding = EncodingFinder.FindEncoding(charset);

        // Encoding may also be written in lowercase
        switch (encoding.ToUpperInvariant())
        {
            // RFC:
            // The "B" encoding is identical to the "BASE64"
            // encoding defined by RFC 2045.
            // http://tools.ietf.org/html/rfc2045#section-6.8
            case "B":
                return Base64.Decode(encodedText, charsetEncoding);

            // RFC:
            // The "Q" encoding is similar to the "Quoted-Printable" content-
            // transfer-encoding defined in RFC 2045.
            // There are more details to this. Please check
            // http://tools.ietf.org/html/rfc2047#section-4.2
            case "Q":
                return QuotedPrintable.DecodeEncodedWord(encodedText, charsetEncoding);

            default:
                throw new ArgumentException("The encoding " + encoding + " was not recognized");
        }
    });
}
/// <summary>
/// Decodes every encoded-word found in the given text.
/// Text that is not part of an encoded-word is left as it is.
/// </summary>
/// <param name="encodedWords">Source text. May contain content which is not encoded.</param>
/// <returns>The text with each encoded-word replaced by its decoded value</returns>
/// <exception cref="ArgumentNullException">If <paramref name="encodedWords"/> is <see langword="null"/></exception>
/// <exception cref="ArgumentException">If an encoded-word specifies an encoding other than "B" or "Q"</exception>
public static String Decode(String encodedWords)
{
    if (encodedWords == null)
    {
        throw new ArgumentNullException("encodedWords");
    }

    // Notice that RFC2231 redefines the BNF to
    // encoded-word := "=?" charset ["*" language] "?" encoded-text "?="
    // but no usage of this BNF has been spotted yet.

    string result = encodedWords;

    foreach (EncodedTextInfo word in ParseEncodedText(encodedWords))
    {
        string wholeWord = word.FullText;
        string payload = word.Content;
        string transferEncoding = word.Encoding;
        string charsetName = word.Charset;

        // Look up the text encoding which corresponds to the character set
        Encoding charsetEncoding = Utility.ParseCharsetToEncoding(charsetName);

        // The encoding letter may also be written in lowercase
        string encodingKey = transferEncoding.ToUpperInvariant();

        string plainText;
        if (encodingKey == "B")
        {
            // RFC: The "B" encoding is identical to the "BASE64"
            // encoding defined by RFC 2045.
            // http://tools.ietf.org/html/rfc2045#section-6.8
            plainText = Base64.Decode(payload, charsetEncoding);
        }
        else if (encodingKey == "Q")
        {
            // RFC: The "Q" encoding is similar to the "Quoted-Printable"
            // content-transfer-encoding defined in RFC 2045.
            // For the details, see http://tools.ietf.org/html/rfc2047#section-4.2
            plainText = QuotedPrintable.DecodeEncodedWord(payload, charsetEncoding);
        }
        else
        {
            throw new ArgumentException("The encoding " + transferEncoding + " was not recognized");
        }

        // Swap the encoded word for its decoded value.
        // NOTE(review): String.Replace substitutes every occurrence in the partially decoded
        // result, so already decoded text that equals a later encoded word would be decoded
        // again - confirm whether ParseEncodedText can supply positions to avoid this.
        result = result.Replace(wholeWord, plainText);
    }

    return result;
}