// common part

/// <summary>
/// Appends <paramref name="chars"/> to <paramref name="builder"/>, collapsing runs of ordinary
/// whitespace into a single space.
/// </summary>
/// <remarks>
/// Ordinary whitespace is never written immediately: it only moves the state from
/// <c>NotWhiteSpace</c> to <c>WhiteSpace</c>, and the pending space is written just before the next
/// emitted character. Whitespace met in any other state (for example <c>StartLine</c>) leaves the
/// state untouched and is dropped. Characters for which <c>IsHardSpace</c> returns true are always
/// written, as a plain <c>' '</c>.
/// </remarks>
/// <param name="builder">Destination buffer.</param>
/// <param name="state">Current whitespace state; updated in place as characters are consumed.</param>
/// <param name="chars">Characters to process, in order.</param>
public static void Process(StringBuilder builder, ref ToPlainTextState state, params char[] chars)
{
    for (int index = 0; index < chars.Length; index++)
    {
        char current = chars[index];
        bool isWhiteSpace = char.IsWhiteSpace(current);

        if (isWhiteSpace && !IsHardSpace(current))
        {
            // Collapsible whitespace: remember it, but write nothing yet.
            if (state == ToPlainTextState.NotWhiteSpace)
            {
                state = ToPlainTextState.WhiteSpace;
            }

            continue;
        }

        // Something visible follows: flush the single pending separator first.
        if (state == ToPlainTextState.WhiteSpace)
        {
            builder.Append(' ');
        }

        // A hard space is written as an ordinary space; anything else is copied verbatim.
        builder.Append(isWhiteSpace ? ' ' : current);
        state = ToPlainTextState.NotWhiteSpace;
    }
}
/// <summary>
/// Parses <paramref name="htmlDoc"/> as HTML and returns its plain-text rendering.
/// </summary>
/// <param name="htmlDoc">HTML markup to convert.</param>
/// <returns>The text accumulated by <c>Plain</c> while walking the parsed document.</returns>
public static string HtmlToPlainText(string htmlDoc)
{
    var document = new HtmlDocument();
    document.LoadHtml(htmlDoc);

    var output = new StringBuilder();
    var state = ToPlainTextState.StartLine;

    // The traversal takes a sequence of nodes, so wrap the document root in a one-element array.
    var roots = new[] { document.DocumentNode };
    Plain(output, ref state, roots);

    return output.ToString();
}
/// <summary>
/// Recursively renders LINQ to XML nodes as plain text into <paramref name="builder"/>.
/// </summary>
/// <remarks>
/// A <c>br</c> element emits a line break. Elements whose tag is in <c>NonVisibleTags</c> are
/// skipped together with their content. Elements whose tag is in <c>InlineTags</c> are rendered in
/// place. Every other element is treated as a block: a line break is emitted before and after its
/// content unless the output is already at the start of a line. Text nodes are passed to
/// <c>Process</c>; all other node kinds are ignored.
/// </remarks>
/// <param name="builder">Destination buffer.</param>
/// <param name="state">Whitespace/line state shared by the whole traversal; updated in place.</param>
/// <param name="nodes">Nodes to render, in enumeration order.</param>
private static void Plain(StringBuilder builder, ref ToPlainTextState state, IEnumerable<XNode> nodes)
{
    foreach (var node in nodes)
    {
        var element = node as XElement;
        if (element == null)
        {
            var text = node as XText;
            if (text != null)
            {
                Process(builder, ref state, text.Value.ToCharArray());
            }

            continue;
        }

        // Tag names are identifiers, not linguistic text. A culture-sensitive ToLower() maps
        // "I" to a dotless "ı" under Turkish/Azeri cultures, so e.g. "DIV" would never match
        // a lower-case tag list. Lower-case with the invariant culture instead.
        var tag = element.Name.LocalName.ToLowerInvariant();

        if (tag == "br")
        {
            builder.AppendLine();
            state = ToPlainTextState.StartLine;
        }
        else if (NonVisibleTags.Contains(tag))
        {
            // Non-visible element: contributes nothing to the output.
        }
        else if (InlineTags.Contains(tag))
        {
            Plain(builder, ref state, element.Nodes());
        }
        else
        {
            // Block element: make sure it starts on a fresh line...
            if (state != ToPlainTextState.StartLine)
            {
                builder.AppendLine();
                state = ToPlainTextState.StartLine;
            }

            Plain(builder, ref state, element.Nodes());

            // ...and that whatever follows it starts on a fresh line too.
            if (state != ToPlainTextState.StartLine)
            {
                builder.AppendLine();
                state = ToPlainTextState.StartLine;
            }
        }
    }
}
/// <summary>
/// Recursively renders Html Agility Pack nodes as plain text into <paramref name="builder"/>.
/// </summary>
/// <remarks>
/// Text nodes are de-entitized and passed to <c>Process</c>. For every other node the lower-cased
/// node name decides the handling: <c>br</c> emits a line break; names in <c>NonVisibleTags</c> are
/// skipped together with their children; names in <c>InlineTags</c> are rendered in place; anything
/// else is treated as a block, with a line break before and after its content unless the output is
/// already at the start of a line.
/// </remarks>
/// <param name="builder">Destination buffer.</param>
/// <param name="state">Whitespace/line state shared by the whole traversal; updated in place.</param>
/// <param name="nodes">Nodes to render, in enumeration order.</param>
private static void Plain(StringBuilder builder, ref ToPlainTextState state, IEnumerable<HtmlAgilityPack.HtmlNode> nodes)
{
    foreach (var node in nodes)
    {
        var text = node as HtmlAgilityPack.HtmlTextNode;
        if (text != null)
        {
            // Resolve entities such as &amp; or &nbsp; before whitespace handling.
            Process(builder, ref state, HtmlAgilityPack.HtmlEntity.DeEntitize(text.Text).ToCharArray());
            continue;
        }

        // Tag names are identifiers, not linguistic text. A culture-sensitive ToLower() maps
        // "I" to a dotless "ı" under Turkish/Azeri cultures, so e.g. "DIV" would never match
        // a lower-case tag list. Lower-case with the invariant culture instead.
        var tag = node.Name.ToLowerInvariant();

        if (tag == "br")
        {
            builder.AppendLine();
            state = ToPlainTextState.StartLine;
        }
        else if (NonVisibleTags.Contains(tag))
        {
            // Non-visible element: contributes nothing to the output.
        }
        else if (InlineTags.Contains(tag))
        {
            Plain(builder, ref state, node.ChildNodes);
        }
        else
        {
            // Block element: make sure it starts on a fresh line...
            if (state != ToPlainTextState.StartLine)
            {
                builder.AppendLine();
                state = ToPlainTextState.StartLine;
            }

            Plain(builder, ref state, node.ChildNodes);

            // ...and that whatever follows it starts on a fresh line too.
            if (state != ToPlainTextState.StartLine)
            {
                builder.AppendLine();
                state = ToPlainTextState.StartLine;
            }
        }
    }
}
/// <summary>
/// Recursively renders HTML nodes as plain text into <paramref name="builder"/>, with the
/// character-level whitespace handling written inline.
/// </summary>
/// <remarks>
/// Text nodes are de-entitized and copied character by character: runs of ordinary whitespace
/// collapse into a single space that is written just before the next emitted character, and
/// whitespace seen in a state other than <c>NotWhiteSpace</c> is dropped. The characters U+00A0,
/// U+2007 and U+202F are always written, as a plain <c>' '</c>. For every other node the
/// lower-cased node name decides the handling: <c>br</c> emits a line break; names in
/// <c>NonVisibleTags</c> are skipped together with their children; names in <c>InlineTags</c> are
/// rendered in place; anything else is treated as a block, with a line break before and after its
/// content unless the output is already at the start of a line.
/// </remarks>
/// <param name="builder">Destination buffer.</param>
/// <param name="state">Whitespace/line state shared by the whole traversal; updated in place.</param>
/// <param name="nodes">Nodes to render, in enumeration order.</param>
private static void Plain(StringBuilder builder, ref ToPlainTextState state, IEnumerable<HtmlNode> nodes)
{
    foreach (HtmlNode node in nodes)
    {
        HtmlTextNode text = node as HtmlTextNode;
        if (text != null)
        {
            // Resolve entities such as &amp; or &nbsp; before whitespace handling.
            char[] chars = HtmlEntity.DeEntitize(text.Text).ToCharArray();
            foreach (char ch in chars)
            {
                bool isWhiteSpace = char.IsWhiteSpace(ch);

                // U+00A0 NO-BREAK SPACE, U+2007 FIGURE SPACE, U+202F NARROW NO-BREAK SPACE:
                // these must not be collapsed.
                bool isHardSpace = ch == 0xA0 || ch == 0x2007 || ch == 0x202F;

                if (isWhiteSpace && !isHardSpace)
                {
                    // Collapsible whitespace: remember it, but write nothing yet.
                    if (state == ToPlainTextState.NotWhiteSpace)
                    {
                        state = ToPlainTextState.WhiteSpace;
                    }

                    continue;
                }

                // Something visible follows: flush the single pending separator first.
                if (state == ToPlainTextState.WhiteSpace)
                {
                    builder.Append(' ');
                }

                // A hard space is written as an ordinary space; anything else is copied verbatim.
                builder.Append(isWhiteSpace ? ' ' : ch);
                state = ToPlainTextState.NotWhiteSpace;
            }

            continue;
        }

        // Tag names are identifiers, not linguistic text. A culture-sensitive ToLower() maps
        // "I" to a dotless "ı" under Turkish/Azeri cultures, so e.g. "DIV" would never match
        // a lower-case tag list. Lower-case with the invariant culture instead.
        string tag = node.Name.ToLowerInvariant();

        if (tag == "br")
        {
            builder.AppendLine();
            state = ToPlainTextState.StartLine;
        }
        else if (NonVisibleTags.Contains(tag))
        {
            // Non-visible element: contributes nothing to the output.
        }
        else if (InlineTags.Contains(tag))
        {
            Plain(builder, ref state, node.ChildNodes);
        }
        else
        {
            // Block element: make sure it starts on a fresh line...
            if (state != ToPlainTextState.StartLine)
            {
                builder.AppendLine();
                state = ToPlainTextState.StartLine;
            }

            Plain(builder, ref state, node.ChildNodes);

            // ...and that whatever follows it starts on a fresh line too.
            if (state != ToPlainTextState.StartLine)
            {
                builder.AppendLine();
                state = ToPlainTextState.StartLine;
            }
        }
    }
}