/// <summary>Reads bytes from <paramref name="stream"/> until one UTF-8
/// encoded code point has been decoded or the bytes are found to be
/// invalid.</summary>
/// <param name="stream">Byte source; its most recently read byte can be
/// ungotten so that it is processed again by a later read.</param>
/// <param name="bytesRead">Array whose first element receives the number
/// of bytes consumed by this call (ungotten bytes are not counted). It is
/// left untouched when -1 is returned.</param>
/// <returns>The decoded code point; 0xFFFD if the bytes read do not form
/// a valid UTF-8 sequence; or -1 if the end of the stream was reached
/// before any byte of a sequence was read.</returns>
/// <exception cref="ArgumentNullException"><paramref name="stream"/> is
/// null.</exception>
private static int ReadUtf8Char(
    TransformWithUnget stream,
    int[] bytesRead) {
    if (stream == null) {
        throw new ArgumentNullException("stream");
    }
    var cp = 0;
    var bytesSeen = 0;
    var bytesNeeded = 0;
    // Allowed range of the next continuation byte. The first continuation
    // byte after the lead bytes E0, ED, F0 and F4 has a narrower range.
    var lower = 0x80;
    var upper = 0xbf;
    var read = 0;
    while (true) {
        int b = stream.ReadByte();
        ++read;
        if (b < 0) {
            if (bytesNeeded != 0) {
                // The stream ended in the middle of a sequence: unget the
                // end-of-stream marker, don't count it, and report the
                // truncated sequence as a replacement character.
                stream.Unget();
                --read;
                bytesRead[0] = read;
                return 0xfffd;
            }
            return -1;
        }
        if (bytesNeeded == 0) {
            if ((b & 0x7f) == b) {
                // Single-byte (ASCII) code point
                bytesRead[0] = read;
                return b;
            }
            if (b >= 0xc2 && b <= 0xdf) {
                bytesNeeded = 1;
                cp = (b - 0xc0) << 6;
            } else if (b >= 0xe0 && b <= 0xef) {
                lower = (b == 0xe0) ? 0xa0 : 0x80;
                upper = (b == 0xed) ? 0x9f : 0xbf;
                bytesNeeded = 2;
                cp = (b - 0xe0) << 12;
            } else if (b >= 0xf0 && b <= 0xf4) {
                lower = (b == 0xf0) ? 0x90 : 0x80;
                upper = (b == 0xf4) ? 0x8f : 0xbf;
                bytesNeeded = 3;
                cp = (b - 0xf0) << 18;
            } else {
                // Not a valid lead byte
                bytesRead[0] = read;
                return 0xfffd;
            }
            continue;
        }
        if (b < lower || b > upper) {
            // Invalid continuation byte: unget it so that it is processed
            // again as the start of the next character, and report only
            // the bytes actually consumed. (Previously bytesRead[0] was
            // left holding a stale value from an earlier call.)
            stream.Unget();
            --read;
            bytesRead[0] = read;
            return 0xfffd;
        }
        // Only the first continuation byte has a restricted range
        lower = 0x80;
        upper = 0xbf;
        ++bytesSeen;
        cp += (b - 0x80) << (6 * (bytesNeeded - bytesSeen));
        if (bytesSeen != bytesNeeded) {
            continue;
        }
        bytesRead[0] = read;
        return cp;
    }
}
/// <summary>Reads header fields from <paramref name="stream"/> up to and
/// including the empty line (CRLF) that ends the header section, and
/// appends each field's name and value to
/// <paramref name="headerList"/>.</summary>
/// <param name="stream">Source of the message's bytes.</param>
/// <param name="headerList">Receives two entries per header field, in
/// order: the field name (with ASCII upper-case letters converted to
/// lower case), then the field value with folding line breaks removed and
/// leading spaces dropped.</param>
/// <param name="start">If true, a line beginning with "From " (exact
/// case) that appears before the first header field is skipped, following
/// the mbox convention.</param>
/// <exception cref="MessageDataException">The data ended before the end
/// of the headers, or a header field name is empty, malformed or too
/// long.</exception>
private static void ReadHeaders(
    IByteReader stream,
    ICollection<string> headerList,
    bool start) {
    const string mboxPrefix = "From";
    var lineCount = 0;
    var bytesRead = new int[1];
    var sb = new StringBuilder();
    var ungetStream = new TransformWithUnget(stream);
    while (true) {
        sb.Remove(0, sb.Length);
        var first = true;
        var endOfHeaders = false;
        var wsp = false;
        // Number of leading field-name bytes that match "From" exactly,
        // or -1 once a byte differs. Needed because the name is
        // lowercased as it is stored, so the stored text can't be used
        // for a case-sensitive comparison.
        var mboxMatched = 0;
        lineCount = 0;
        // Read the header field name, up to the colon
        while (true) {
            int c = ungetStream.ReadByte();
            if (c == -1) {
                throw new MessageDataException(
                    "Premature end before all headers were read");
            }
            ++lineCount;
            if (first && c == '\r') {
                if (ungetStream.ReadByte() == '\n') {
                    // Empty line: end of the header section
                    endOfHeaders = true;
                    break;
                }
                throw new MessageDataException("CR not followed by LF");
            }
            // Visible ASCII characters other than the colon (58)
            if ((c >= 0x21 && c <= 57) || (c >= 59 && c <= 0x7e)) {
                if (wsp) {
                    throw new MessageDataException(
                        "Whitespace within header field name");
                }
                first = false;
                if (mboxMatched >= 0 && mboxMatched < mboxPrefix.Length &&
                    c == mboxPrefix[mboxMatched]) {
                    ++mboxMatched;
                } else {
                    mboxMatched = -1;
                }
                if (c >= 'A' && c <= 'Z') {
                    c += 0x20;
                }
                sb.Append((char)c);
            } else if (!first && c == ':') {
                if (lineCount >= 999) {
                    // 998 characters includes the colon
                    throw new MessageDataException(
                        "Header field name too long");
                }
                break;
            } else if (c == 0x20 || c == 0x09) {
                if (start && c == 0x20 &&
                    mboxMatched == mboxPrefix.Length) {
                    // Mbox convention, skip the entire line
                    sb.Remove(0, sb.Length);
                    while (true) {
                        c = ungetStream.ReadByte();
                        if (c == -1) {
                            throw new MessageDataException(
                                "Premature end before all headers were read");
                        }
                        if (c == '\r') {
                            if (ungetStream.ReadByte() == '\n') {
                                // End of line was reached
                                break;
                            }
                            ungetStream.Unget();
                        }
                    }
                    // Start over with a fresh line
                    start = false;
                    wsp = false;
                    first = true;
                    mboxMatched = 0;
                    lineCount = 0;
                } else {
                    wsp = true;
                    first = false;
                }
            } else {
                throw new MessageDataException("Malformed header field name");
            }
        }
        if (endOfHeaders) {
            break;
        }
        if (sb.Length == 0) {
            throw new MessageDataException("Empty header field name");
        }
        // The mbox "From " line can only precede the first header field
        start = false;
        // Set the header field name to the
        // string builder's current value
        string fieldName = sb.ToString();
        // Clear the string builder to read the
        // header field's value
        sb.Remove(0, sb.Length);
        // Read the header field value using UTF-8 characters rather than
        // bytes (DEVIATION: RFC 6532 allows UTF-8 in header field values,
        // but not everywhere in these values, as is done here for
        // convenience)
        while (true) {
            int c = ReadUtf8Char(ungetStream, bytesRead);
            if (c == -1) {
                throw new MessageDataException(
                    "Premature end before all headers were read");
            }
            if (c == '\r') {
                // We're only looking for the single-byte LF, so
                // there's no need to use ReadUtf8Char
                c = ungetStream.ReadByte();
                if (c == '\n') {
                    lineCount = 0;
                    // Parse obsolete folding whitespace (obs-fws) under
                    // RFC5322 (parsed according to errata), same as LWSP
                    // in RFC5234
                    var fwsFirst = true;
                    var haveFWS = false;
                    while (true) {
                        // Skip the CRLF pair, if any (except if iterating
                        // for the first time, since CRLF was already
                        // parsed). Use ReadByte here since we're just
                        // looking for the single-byte characters CR and LF
                        if (!fwsFirst) {
                            c = ungetStream.ReadByte();
                            if (c == '\r') {
                                c = ungetStream.ReadByte();
                                if (c == '\n') {
                                    // CRLF was read
                                    lineCount = 0;
                                } else {
                                    // A CR here must be part of a CRLF
                                    throw new MessageDataException(
                                        "CR not followed by LF");
                                }
                            } else {
                                // anything else, unget
                                ungetStream.Unget();
                            }
                        }
                        fwsFirst = false;
                        // Use ReadByte here since we're just looking for
                        // the single-byte characters space and tab
                        int c2 = ungetStream.ReadByte();
                        if (c2 == 0x20 || c2 == 0x09) {
                            ++lineCount;
                            // Don't write SPACE as the first character
                            // of the value
                            if (c2 != 0x20 || sb.Length != 0) {
                                sb.Append((char)c2);
                            }
                            haveFWS = true;
                        } else {
                            ungetStream.Unget();
                            // this isn't space or tab; if this is the
                            // start of the line, this is no longer FWS
                            if (lineCount == 0) {
                                haveFWS = false;
                            }
                            break;
                        }
                    }
                    if (haveFWS) {
                        // We have folding whitespace, line
                        // count found as above
                        continue;
                    }
                    // This ends the header field
                    break;
                }
                if (c < 0) {
                    throw new MessageDataException(
                        "Premature end before all headers were read");
                }
                // Bare CR: keep it in the value. The byte after it was
                // ungotten and will be decoded by the next iteration, so
                // it must not also be appended here (doing so duplicated
                // that byte in the value).
                sb.Append('\r');
                ungetStream.Unget();
                ++lineCount;
                continue;
            }
            lineCount += bytesRead[0];
            // NOTE: Header field line limit not enforced here, only
            // in the header field name; it's impossible to generate
            // a conforming message if the name is too long
            // NOTE: Some emails still have 8-bit bytes in an unencoded
            // subject line or other unstructured header field; however,
            // since RFC6532, we can just assume the UTF-8 encoding in
            // these cases; in case the bytes are not valid UTF-8, a
            // replacement character will be output
            if (c != 0x20 || sb.Length != 0) {
                if (c <= 0xffff) {
                    sb.Append((char)c);
                } else if (c <= 0x10ffff) {
                    // Encode as a UTF-16 surrogate pair
                    sb.Append((char)((((c - 0x10000) >> 10) & 0x3ff) + 0xd800));
                    sb.Append((char)(((c - 0x10000) & 0x3ff) + 0xdc00));
                }
            }
        }
        string fieldValue = sb.ToString();
        headerList.Add(fieldName);
        headerList.Add(fieldValue);
    }
}