Esempio n. 1
0
        public void ParseBrokenHtmlNoStartTag()
        {
            // Input carries a closing </p> that has no matching opening tag.
            const string html = "Some text with broken html</p>";

            var builder  = new StringBuilder();
            var tagTable = new HtmlTagTable(html);

            Build(builder, html, tagTable);

            // The rebuilt text must be identical to the input.
            var rebuilt = builder.ToString();
            Assert.Equal(html, rebuilt);
        }
Esempio n. 2
0
        public void ParseBrokenHtmlEndTagOnly()
        {
            // Input consists of a single end tag and nothing else.
            const string html = "</body>";

            var builder  = new StringBuilder();
            var tagTable = new HtmlTagTable(html);

            Build(builder, html, tagTable);

            // The rebuilt text must be identical to the input.
            var rebuilt = builder.ToString();
            Assert.Equal(html, rebuilt);
        }
Esempio n. 3
0
        public void ParsedHtmlSameAsOriginal(string file)
        {
            // Load the HTML for the given file through the GetHtml helper.
            var html = GetHtml(file);

            var builder  = new StringBuilder();
            var tagTable = new HtmlTagTable(html);

            Build(builder, html, tagTable);

            var rebuilt = builder.ToString();

            // Write both versions to the test output before asserting.
            output.WriteLine(html);
            output.WriteLine(rebuilt);

            Assert.Equal(html, rebuilt);
        }
Esempio n. 4
0
        /// <summary>
        /// Appends the text of <paramref name="html"/> to <paramref name="builder"/> by walking
        /// the nodes produced by <c>HtmlHelper</c> and emitting the raw start tag, inner content
        /// and raw end tag of each node.
        /// </summary>
        /// <param name="builder">Receives the reconstructed markup.</param>
        /// <param name="html">HTML fragment to process; null or empty input appends nothing.</param>
        /// <param name="tagTable">Tag table used to look up <c>HasEndTag</c> / <c>IsEndGhostTag</c> per element name.</param>
        private static void Build(StringBuilder builder, string html, HtmlTagTable tagTable)
        {
            if (string.IsNullOrEmpty(html))
            {
                return;
            }

            var parser = new HtmlHelper(html, tagTable);

            // Any() stops at the first error instead of counting the whole sequence.
            if (parser.ParseErrors.Any())
            {
                // Fall back on regex parsing. The capturing group makes Regex.Split return
                // the tag chunks as well as the text between them; both kinds are appended
                // verbatim, so no per-chunk tag test is needed.
                foreach (var chunk in Regex.Split(html, "(<.+?>)"))
                {
                    builder.Append(chunk);
                }

                return;
            }

            foreach (var node in parser.Descendants())
            {
                if (node.NodeType == HtmlNodeType.Element)
                {
                    // NOTE(review): this lookup uses node.Name while CreateMarkupData uses
                    // node.OriginalName for the same table - confirm which key is intended.
                    var tagInfo = tagTable.Table[node.Name];

                    if (!tagInfo.HasEndTag)
                    {
                        // Element without an end tag: only the start tag is emitted.
                        builder.Append(parser.GetRawStartTag(node));
                    }
                    else if (tagInfo.IsEndGhostTag)
                    {
                        // Ghost end tag: only the end tag is emitted.
                        builder.Append(parser.GetRawEndTag(node));
                    }
                    else
                    {
                        builder.Append(parser.GetRawStartTag(node));

                        // The inner markup is rebuilt recursively from its own HTML text.
                        Build(builder, node.InnerHtml, tagTable);

                        if (node.Closed)
                        {
                            builder.Append(parser.GetRawEndTag(node));
                        }

                        // Children were already handled by the recursive call above; they are
                        // removed here, presumably so this enumeration does not emit them again.
                        node.RemoveAllChildren();
                    }
                }
                else if (node.NodeType == HtmlNodeType.Text)
                {
                    builder.Append(node.InnerText);
                }
                else
                {
                    builder.Append(node.InnerHtml);
                }
            }
        }
Esempio n. 5
0
        /// <summary>
        /// Parses <paramref name="input"/> with <c>HtmlHelper</c> and converts the resulting
        /// nodes into markup items: placeholder tags, tag pairs (filled recursively) and text.
        /// When the parser reports errors, the result of <c>ParseTagsFallback</c> is returned instead.
        /// </summary>
        /// <param name="input">HTML fragment to convert.</param>
        /// <param name="tagTable">Tag table used to look up <c>HasEndTag</c> / <c>IsEndGhostTag</c> per element.</param>
        /// <param name="entitizer">Used to entitize the input before parsing and to de-entitize text afterwards.</param>
        /// <param name="conversionItem">Supplies <c>Search.Text</c>, which is passed to the entitizer.</param>
        /// <returns>The list of markup items created for the fragment.</returns>
        private List<IAbstractMarkupData> CreateMarkupData(string input, HtmlTagTable tagTable, HtmlEntitizer entitizer, ConversionItem conversionItem)
        {
            var result    = new List<IAbstractMarkupData>();
            var entitized = entitizer.Entitize(input, conversionItem.Search.Text);
            var parser    = new HtmlHelper(entitized, tagTable);

            if (parser.ParseErrors.Any())
            {
                return ParseTagsFallback(input, result);
            }

            foreach (var node in parser.Descendants())
            {
                if (node.NodeType == HtmlNodeType.Text)
                {
                    // Check for data like {M:>0}, {M:<0}, {M:≥0}: it is parsed as text, but the
                    // intention is to convert it into placeholders.
                    if (!Regex.IsMatch(input, "(?:>|≤|<|≥)\\d+"))
                    {
                        result.Add(CreateIText(entitizer.DeEntitize(node.InnerText)));
                        continue;
                    }

                    var plainText  = entitizer.DeEntitize(node.InnerText);
                    var textPieces = entitizer.GetOriginalTextSplited(plainText, conversionItem.Search.Text);

                    ReplaceSpecialPlaceholders(textPieces, result);
                    continue;
                }

                if (node.NodeType != HtmlNodeType.Element)
                {
                    // Any other node type is carried over as plain text.
                    result.Add(CreateIText(entitizer.DeEntitize(node.InnerHtml)));
                    continue;
                }

                var tagInfo = tagTable.Table[node.OriginalName];

                if (!tagInfo.HasEndTag)
                {
                    // Element without an end tag: its start tag becomes a placeholder.
                    var rawTag = parser.GetRawStartTag(node);
                    result.Add(CreatePlaceHolderTag(rawTag));
                    continue;
                }

                if (tagInfo.IsEndGhostTag)
                {
                    // Ghost end tag: the end tag becomes a placeholder.
                    var rawTag = parser.GetRawEndTag(node);
                    result.Add(CreatePlaceHolderTag(rawTag));
                    continue;
                }

                var rawStart = parser.GetRawStartTag(node);

                if (!node.Closed)
                {
                    // Unclosed element: the start tag becomes a placeholder and the inner
                    // content is appended directly after it in the same list.
                    result.Add(CreatePlaceHolderTag(rawStart));

                    foreach (var inner in CreateMarkupData(node.InnerHtml, tagTable, entitizer, conversionItem))
                    {
                        result.Add(inner);
                    }
                }
                else
                {
                    var rawEnd = parser.GetRawEndTag(node);
                    var pair   = CreateTagPair(CreateStartTag(rawStart), CreateEndTag(rawEnd));

                    var innerHtml = node.InnerHtml;

                    if (ContainsTags(innerHtml))
                    {
                        // Nested tags: convert the inner markup recursively into the pair.
                        foreach (var inner in CreateMarkupData(node.InnerHtml, tagTable, entitizer, conversionItem))
                        {
                            pair.Add(inner);
                        }
                    }
                    else
                    {
                        if (!string.IsNullOrEmpty(innerHtml))
                        {
                            pair.Add(CreateIText(entitizer.DeEntitize(innerHtml)));
                        }

                        // Experimental:
                        // Creation of new formatting
                        CreateNewFormatting(node, pair);
                    }

                    result.Add(pair);
                }

                // Children were already converted from node.InnerHtml above; they are removed
                // here, presumably so this enumeration does not visit them again.
                node.RemoveAllChildren();
            }

            return result;
        }