/// <summary>
/// Parses the given <paramref name="rtfText"/> and collects the text that is extracted from it.
/// <c>HtmlContent</c> is reset to <c>null</c> at the start and is only set when the
/// <c>Consts.FromHtml</c> control word was seen and parsing ran to the end of the input
/// </summary>
/// <remarks>
/// Parsing stops as soon as the <c>Consts.Info</c> control word has been processed; in that
/// case <c>HtmlContent</c> stays <c>null</c>
/// </remarks>
/// <param name="rtfText">The RTF text to parse</param>
public void ParseRtfText(string rtfText)
{
    HtmlContent = null;
    var stringBuilder = new StringBuilder();
    var htmlExtraction = false;
    var rtfContaintsEmbeddedHtml = false;

    // Holds the first HEX value of a double byte character until the second one arrives
    var hexBuffer = string.Empty;

    using (var stringReader = new StringReader(rtfText))
    using (var reader = new Reader(stringReader))
    {
        while (reader.ReadToken() != null)
        {
            // The previous token was a HEX escape, the current one is not, and a HEX value is still
            // pending for a multi byte encoding: flush the pending value before handling this token
            if (reader.LastToken?.Key == "'" && reader.Keyword != "'" && hexBuffer != string.Empty && !RuntimeEncoding.IsSingleByte)
            {
                var value = HexValueToChar(hexBuffer);
                if (value != null)
                {
                    stringBuilder.Append(value);
                }
                else
                {
                    // Double byte charset was detected for the last token but only one byte was used so far.
                    // This token should carry the second byte but it doesn't.
                    // Workaround: To display it anyway, we treat it as a single byte char.
                    var buff = new[] { byte.Parse(hexBuffer, NumberStyles.HexNumber) };
                    stringBuilder.Append(RuntimeEncoding.GetString(buff));
                }

                hexBuffer = string.Empty;

                // A text token is appended right away and needs no further processing
                if (reader.TokenType == RtfTokenType.Text)
                {
                    stringBuilder.Append(reader.Keyword);
                    continue;
                }
            }

            switch (reader.Keyword)
            {
                case Consts.Ansi:
                    break;

                case Consts.Ansicpg:
                    // Read default encoding
                    _defaultEncoding = Encoding.GetEncoding(reader.Parameter);
                    break;

                case Consts.Info:
                    // Read document information
                    // NOTE(review): returning here ends parsing with HtmlContent still null - confirm this is intended
                    ReadDocumentInfo(reader);
                    return;

                case Consts.FromHtml:
                    rtfContaintsEmbeddedHtml = true;
                    htmlExtraction = true;
                    break;

                case Consts.Generator:
                    // Read document generator
                    Generator = ReadInnerText(reader, true);
                    break;

                case Consts.FormatConverter:
                    FormatConverter = ReadInnerText(reader, true);
                    break;

                case Consts.Fonttbl:
                    // Read font table
                    ReadFontTable(reader);
                    break;

                case Consts.F:
                    if (reader.TokenType == RtfTokenType.Text)
                    {
                        goto default;
                    }

                    // Best effort: fall back to the default encoding when the font lookup fails
                    try
                    {
                        _fontChartset = FontTable[reader.Parameter].Encoding;
                    }
                    catch
                    {
                        _fontChartset = Encoding.Default;
                    }

                    break;

                case Consts.Af:
                    // NOTE(review): unlike Consts.F this lookup has no fallback, so a failing lookup aborts parsing - confirm
                    _associateFontChartset = FontTable[reader.Parameter].Encoding;
                    break;

                case Consts.HtmlRtf:
                    // Without a parameter the flag is switched on, with parameter 0 it is switched off;
                    // any other parameter leaves the flag untouched
                    if (!reader.HasParam)
                    {
                        htmlExtraction = true;
                    }
                    else if (reader.Parameter == 0)
                    {
                        htmlExtraction = false;
                    }

                    break;

                case Consts.MHtmlTag:
                    if (reader.HasParam && reader.Parameter == 0)
                    {
                        htmlExtraction = false;
                    }
                    else
                    {
                        if (hexBuffer != string.Empty)
                        {
                            // Flush the pending HEX value as a single byte
                            var buff = new[] { byte.Parse(hexBuffer, NumberStyles.HexNumber) };
                            hexBuffer = string.Empty;
                            stringBuilder.Append(RuntimeEncoding.GetString(buff));
                            htmlExtraction = true;
                        }
                        else
                        {
                            htmlExtraction = false;
                        }
                    }

                    break;

                case Consts.HtmlTag:
                {
                    // Skip a single space that directly follows the control word
                    if (reader.InnerReader.Peek() == ' ')
                    {
                        reader.InnerReader.Read();
                    }

                    var text = ReadInnerText(reader, null, true, false, true);
                    if (!string.IsNullOrEmpty(text))
                    {
                        stringBuilder.Append(text);
                    }

                    break;
                }

                case Consts.HtmlBase:
                {
                    var text = ReadInnerText(reader, null, true, false, true);
                    if (!string.IsNullOrEmpty(text))
                    {
                        stringBuilder.Append(text);
                    }

                    break;
                }

                case Consts.Background:
                case Consts.Fillcolor:
                case Consts.Field:
                    // The inner text is read and discarded
                    ReadInnerText(reader, null, false, true, false);
                    break;

                //case Consts.Par:
                //case Consts.Line:
                //    stringBuilder.Append(Environment.NewLine);
                //    break;

                case Consts.Tab:
                    stringBuilder.Append("\t");
                    break;

                case Consts.Lquote:
                    stringBuilder.Append("‘");
                    break;

                case Consts.Rquote:
                    stringBuilder.Append("’");
                    break;

                case Consts.LdblQuote:
                    stringBuilder.Append("“");
                    break;

                case Consts.RdblQuote:
                    stringBuilder.Append("”");
                    break;

                case Consts.Bullet:
                    stringBuilder.Append("•");
                    break;

                case Consts.Endash:
                    stringBuilder.Append("–");
                    break;

                case Consts.Emdash:
                    stringBuilder.Append("—");
                    break;

                case Consts.Tilde:
                    stringBuilder.Append(" ");
                    break;

                case Consts.Underscore:
                    stringBuilder.Append("­");
                    break;

                case Consts.Pntext:
                    reader.ReadToEndOfGroup();
                    break;

                case Consts.U:

                    //if (reader.TokenType == RtfTokenType.Control && !htmlExtraction)
                    //{
                    //    stringBuilder.Append(HttpUtility.UrlDecode("*", _defaultEncoding));
                    //    continue;
                    //}

                    // NOTE(review): Parameter is used as an integer everywhere else in this method, so this
                    // guard looks unreachable - confirm before removing it
                    if (reader.Parameter.ToString().StartsWith("c", StringComparison.InvariantCultureIgnoreCase))
                    {
                        throw new Exception("\\uc parameter not yet supported, please contact the developer on GitHub");
                    }

                    // Test the sign numerically; formatting the number and looking for a leading "-" depends on
                    // the negative sign of the current culture, which is not always the ASCII hyphen
                    if (reader.Parameter < 0)
                    {
                        // The Unicode standard permanently reserves these code point values for
                        // UTF-16 encoding of the high and low surrogates
                        // U+D800 to U+DFFF
                        // 55296  -  57343

                        // Map the negative parameter back to its unsigned 16 bit value
                        var value = 65536 + reader.Parameter;

                        if (value >= 0xD800 && value <= 0xDFFF)
                        {
                            if (!reader.ParsingHighLowSurrogate)
                            {
                                // First half of a surrogate pair, remember it until the second half arrives
                                reader.ParsingHighLowSurrogate = true;
                                reader.HighSurrogateValue = value;
                            }
                            else
                            {
                                // Second half of the pair, combine both halves into a single code point
                                var combined = ((reader.HighSurrogateValue - 0xD800) << 10) + (value - 0xDC00) + 0x10000;
                                stringBuilder.Append($"&#{combined};");
                                reader.ParsingHighLowSurrogate = false;
                                reader.HighSurrogateValue = null;
                            }
                        }
                        else
                        {
                            reader.ParsingHighLowSurrogate = false;
                            stringBuilder.Append($"&#{value};");
                        }
                    }
                    else
                    {
                        stringBuilder.Append($"&#{reader.Parameter};");
                    }

                    break;

                case Consts.Apostrophe:

                    if (reader.TokenType != RtfTokenType.Control || htmlExtraction)
                    {
                        continue;
                    }

                    // Convert HEX value directly when we have a single byte charset
                    if (RuntimeEncoding.IsSingleByte)
                    {
                        if (string.IsNullOrEmpty(hexBuffer))
                        {
                            hexBuffer = reader.CurrentToken.Hex;
                        }

                        var buff = new[] { byte.Parse(hexBuffer, NumberStyles.HexNumber) };
                        hexBuffer = string.Empty;
                        stringBuilder.Append(RuntimeEncoding.GetString(buff));
                    }
                    else
                    {
                        // If we have a double byte charset like Chinese then store the value and wait for the next HEX value
                        if (hexBuffer == string.Empty)
                        {
                            hexBuffer = reader.CurrentToken.Hex;
                        }
                        else
                        {
                            // Append the second HEX value and convert it
                            var buff = new[]
                            {
                                byte.Parse(hexBuffer, NumberStyles.HexNumber),
                                byte.Parse(reader.CurrentToken.Hex, NumberStyles.HexNumber)
                            };

                            stringBuilder.Append(RuntimeEncoding.GetString(buff));

                            // Empty the HEX buffer
                            hexBuffer = string.Empty;
                        }
                    }

                    break;

                default:

                    switch (reader.TokenType)
                    {
                        //case RtfTokenType.GroupEnd:
                        //    htmlExtraction = false;
                        //    break;

                        case RtfTokenType.Text:
                            // Plain text is only kept while the htmlExtraction flag is off
                            if (!htmlExtraction)
                            {
                                stringBuilder.Append(reader.Keyword);
                            }

                            break;
                    }

                    break;
            }
        }
    }

    if (rtfContaintsEmbeddedHtml)
    {
        HtmlContent = stringBuilder.ToString();
    }
}
/// <summary>
/// Reads the font table from the given <paramref name="reader"/>. <c>FontTable</c> is cleared
/// first and then gets one entry for every font group that has an index and a name
/// </summary>
/// <param name="reader">The reader from which the font table tokens are read</param>
private void ReadFontTable(Reader reader)
{
    FontTable.Clear();

    while (reader.ReadToken() != null)
    {
        // A group end at this level ends the font table
        if (reader.TokenType == RtfTokenType.GroupEnd)
        {
            break;
        }

        // Only a group start begins a font definition, everything else is skipped
        if (reader.TokenType != RtfTokenType.GroupStart)
        {
            continue;
        }

        var index = -1;
        string name = null;
        var charset = 1;
        var nilFlag = false;

        while (reader.ReadToken() != null)
        {
            if (reader.TokenType == RtfTokenType.GroupEnd)
            {
                break;
            }

            if (reader.TokenType == RtfTokenType.GroupStart)
            {
                // If we meet nested levels, then ignore
                reader.ReadToken();
                reader.ReadToEndOfGroup();
                reader.ReadToken();
            }
            else
            {
                switch (reader.Keyword)
                {
                    case "f" when reader.HasParam:
                        index = reader.Parameter;
                        break;

                    case "fnil":
                        // Start with the system default font name; a later text token replaces it
                        name = SystemFonts.DefaultFont.Name;
                        nilFlag = true;
                        break;

                    case Consts.Fcharset:
                        charset = reader.Parameter;
                        break;

                    default:
                        if (reader.CurrentToken.IsTextToken)
                        {
                            name = ReadInnerText(reader, reader.CurrentToken, false, false, false);

                            if (name != null)
                            {
                                name = name.Trim();

                                // The semicolon is a plain delimiter, so compare ordinally
                                // instead of with the rules of the current culture
                                if (name.EndsWith(";", StringComparison.Ordinal))
                                {
                                    name = name.Substring(0, name.Length - 1);
                                }
                            }
                        }

                        break;
                }
            }
        }

        // A font without an index or a name cannot be added to the table
        if (index < 0 || name == null)
        {
            continue;
        }

        if (name.EndsWith(";", StringComparison.Ordinal))
        {
            name = name.Substring(0, name.Length - 1);
        }

        name = name.Trim();

        if (string.IsNullOrEmpty(name))
        {
            name = SystemFonts.DefaultFont.Name;
        }

        var font = new Font(index, name) { Charset = charset, NilFlag = nilFlag };
        FontTable.Add(font);
    }
}