/// <summary>
/// Text followed by a closing <c>&lt;/p&gt;</c> that has no opening tag must be
/// reproduced character-for-character by <c>Build</c>.
/// </summary>
public void ParseBrokenHtmlNoStartTag()
{
    // Arrange
    const string brokenHtml = "Some text with broken html</p>";
    var rebuilt = new StringBuilder();
    var table = new HtmlTagTable(brokenHtml);

    // Act
    Build(rebuilt, brokenHtml, table);

    // Assert
    var actual = rebuilt.ToString();
    Assert.Equal(brokenHtml, actual);
}
/// <summary>
/// An input consisting solely of a closing <c>&lt;/body&gt;</c> tag must be
/// reproduced character-for-character by <c>Build</c>.
/// </summary>
public void ParseBrokenHtmlEndTagOnly()
{
    // Arrange
    const string loneEndTag = "</body>";
    var rebuilt = new StringBuilder();
    var table = new HtmlTagTable(loneEndTag);

    // Act
    Build(rebuilt, loneEndTag, table);

    // Assert
    var actual = rebuilt.ToString();
    Assert.Equal(loneEndTag, actual);
}
/// <summary>
/// Loads the HTML identified by <paramref name="file"/> via <c>GetHtml</c>, rebuilds it
/// with <c>Build</c>, writes both versions to the test output and asserts they are equal.
/// </summary>
/// <param name="file">Identifier passed to <c>GetHtml</c> to obtain the HTML under test.</param>
public void ParsedHtmlSameAsOriginal(string file)
{
    // Arrange
    var original = GetHtml(file);
    var buffer = new StringBuilder();
    var table = new HtmlTagTable(original);

    // Act
    Build(buffer, original, table);
    var rebuilt = buffer.ToString();

    // Emit both versions so a mismatch can be inspected in the test log.
    output.WriteLine(original);
    output.WriteLine(rebuilt);

    // Assert
    Assert.Equal(original, rebuilt);
}
/// <summary>
/// Appends to <paramref name="builder"/> the markup reconstructed from
/// <paramref name="html"/>, walking the nodes produced by <c>HtmlHelper</c>.
/// </summary>
/// <param name="builder">Receives the reconstructed markup.</param>
/// <param name="html">Markup to parse; nothing is appended when it is null or empty.</param>
/// <param name="tagTable">Tag table consulted (by node name) for <c>HasEndTag</c> / <c>IsEndGhostTag</c>.</param>
private static void Build(StringBuilder builder, string html, HtmlTagTable tagTable)
{
    if (string.IsNullOrEmpty(html))
    {
        return;
    }

    var parser = new HtmlHelper(html, tagTable);

    // Any() stops at the first error instead of counting all of them.
    if (parser.ParseErrors.Any())
    {
        // Fall back on regex parsing. The capture group makes Regex.Split return the
        // tag-like pieces as well as the text between them, and both kinds of chunk are
        // appended unchanged, so no per-chunk tag test is needed.
        foreach (var chunk in Regex.Split(html, "(<.+?>)"))
        {
            builder.Append(chunk);
        }

        return;
    }

    foreach (var node in parser.Descendants())
    {
        if (node.NodeType == HtmlNodeType.Element)
        {
            // Look the tag up once; both checks below use the same entry.
            var tagInfo = tagTable.Table[node.Name];

            if (!tagInfo.HasEndTag)
            {
                builder.Append(parser.GetRawStartTag(node));
            }
            else if (tagInfo.IsEndGhostTag)
            {
                builder.Append(parser.GetRawEndTag(node));
            }
            else
            {
                builder.Append(parser.GetRawStartTag(node));

                // The element's content is rebuilt by a recursive call on its inner markup.
                Build(builder, node.InnerHtml, tagTable);

                if (node.Closed)
                {
                    builder.Append(parser.GetRawEndTag(node));
                }

                // Children were already emitted by the recursive call above; removing them
                // presumably keeps this enumeration from visiting them a second time.
                node.RemoveAllChildren();
            }
        }
        else if (node.NodeType == HtmlNodeType.Text)
        {
            builder.Append(node.InnerText);
        }
        else
        {
            builder.Append(node.InnerHtml);
        }
    }
}
/// <summary>
/// Converts <paramref name="input"/> into a list of markup items: elements become
/// placeholder tags or tag pairs, text becomes text items (or special placeholders).
/// </summary>
/// <param name="input">Raw text to convert; it is entitized before being parsed.</param>
/// <param name="tagTable">Tag table consulted (by original node name) for <c>HasEndTag</c> / <c>IsEndGhostTag</c>.</param>
/// <param name="entitizer">Used to entitize the input and de-entitize node content.</param>
/// <param name="conversionItem">Supplies <c>Search.Text</c>, which is passed to the entitizer.</param>
/// <returns>
/// The markup items in node order, or the result of <c>ParseTagsFallback</c> when the
/// parser reports errors.
/// </returns>
private List <IAbstractMarkupData> CreateMarkupData(string input, HtmlTagTable tagTable, HtmlEntitizer entitizer, ConversionItem conversionItem)
{
    var markupList = new List <IAbstractMarkupData>();
    var parser = new HtmlHelper(entitizer.Entitize(input, conversionItem.Search.Text), tagTable);

    if (parser.ParseErrors.Any())
    {
        return(ParseTagsFallback(input, markupList));
    }

    // The special-placeholder test depends only on `input`, so its result is computed
    // lazily on the first text node and reused for every later one.
    bool? inputHasSpecialPlaceholders = null;

    foreach (var node in parser.Descendants())
    {
        if (node.NodeType == HtmlNodeType.Element)
        {
            // Look the tag up once; both checks below use the same entry.
            var tagInfo = tagTable.Table[node.OriginalName];

            if (!tagInfo.HasEndTag)
            {
                var ph = parser.GetRawStartTag(node);
                markupList.Add(CreatePlaceHolderTag(ph));
            }
            else if (tagInfo.IsEndGhostTag)
            {
                var eTag = parser.GetRawEndTag(node);
                markupList.Add(CreatePlaceHolderTag(eTag));
            }
            else
            {
                var stTag = parser.GetRawStartTag(node);

                if (node.Closed)
                {
                    var eTag = parser.GetRawEndTag(node);
                    var startTag = CreateStartTag(stTag);
                    var endTag = CreateEndTag(eTag);
                    var tagPair = CreateTagPair(startTag, endTag);

                    if (!ContainsTags(node.InnerHtml))
                    {
                        if (!string.IsNullOrEmpty(node.InnerHtml))
                        {
                            var itext = CreateIText(entitizer.DeEntitize(node.InnerHtml));
                            tagPair.Add(itext);
                        }

                        // Experimental:
                        // Creation of new formatting
                        CreateNewFormatting(node, tagPair);
                    }
                    else
                    {
                        // Nested tags: convert the inner markup recursively and place the
                        // resulting items inside the tag pair.
                        var nested = CreateMarkupData(node.InnerHtml, tagTable, entitizer, conversionItem);
                        foreach (var item in nested)
                        {
                            tagPair.Add(item);
                        }
                    }

                    markupList.Add(tagPair);
                }
                else
                {
                    // Unclosed element: the start tag becomes a placeholder and the
                    // converted inner markup follows it at the same level.
                    markupList.Add(CreatePlaceHolderTag(stTag));
                    markupList.AddRange(CreateMarkupData(node.InnerHtml, tagTable, entitizer, conversionItem));
                }

                // The inner markup was handled above; removing the children presumably
                // keeps this enumeration from visiting them a second time.
                node.RemoveAllChildren();
            }
        }
        else if (node.NodeType == HtmlNodeType.Text)
        {
            // Check for data like {M:>0},{M:<0},{M:≥0}, which is parsed as text but is
            // intended to be converted into placeholders.
            if (!inputHasSpecialPlaceholders.HasValue)
            {
                inputHasSpecialPlaceholders = Regex.IsMatch(input, "(?:>|≤|<|≥)\\d+");
            }

            if (inputHasSpecialPlaceholders.Value)
            {
                var originalTextSplited = entitizer.GetOriginalTextSplited(entitizer.DeEntitize(node.InnerText), conversionItem.Search.Text);
                ReplaceSpecialPlaceholders(originalTextSplited, markupList);
            }
            else
            {
                markupList.Add(CreateIText(entitizer.DeEntitize(node.InnerText)));
            }
        }
        else
        {
            markupList.Add(CreateIText(entitizer.DeEntitize(node.InnerHtml)));
        }
    }

    return(markupList);
}