Example #1
0
        /// <summary>
        /// Parses an opening tag like '&lt;div>'. Starts with the input stream
        /// pointing to the opening &lt; character.
        /// </summary>
        /// <param name="currentElement">
        /// The element that becomes the parent of the newly parsed element. When null
        /// the tag is still consumed from the stream but no element is created.
        /// </param>
        /// <returns>
        /// The newly created child element. Returns <paramref name="currentElement"/>
        /// unchanged when the tag is a markup declaration ('!') or an end tag ('/'),
        /// or when <paramref name="currentElement"/> is null. Returns null when the
        /// input is exhausted or when <c>Begin</c> returns false for the new element.
        /// </returns>
        private Element ProcessTagOpenState(Element currentElement)
        {
            if (!_characterStream.MoveNext())
            {
                // A '<' at the very end of the input is malformed markup
                _document.ConformanceLevel *= 0.8f;
                return null;
            }

            // '<!' and '</' are not opening tags, hand over to the matching state
            switch (_characterStream.Current)
            {
            case '!':
                _characterStream.State = HtmlStates.MarkupDeclarationOpen;
                return currentElement;

            case '/':
                _characterStream.State = HtmlStates.EndTagOpen;
                return currentElement;
            }

            // Remembered so that the fallback path below can rewind and re-read the name
            var startPosition = _characterStream.CurrentPosition;

            string tagName     = null;
            var    selfClosing = false;

            using (var nameBuffer = _stringBuilderFactory.Create(_maximumNameLength))
            {
                char? terminator;
                _characterStream.State = HtmlStates.TagName;
                if (_stringParser.TakeUntil(nameBuffer, _maximumNameLength, c => char.IsWhiteSpace(c) || c == '>' || c == '/', out terminator))
                {
                    // Tag names are compared against lower-case literals, so normalize
                    // the case in a culture independent way
                    tagName = nameBuffer.ToString().ToLowerInvariant();
                    if (!terminator.HasValue || terminator == '/')
                    {
                        _characterStream.State = HtmlStates.SelfClosingStartTag;
                        if (_stringParser.Peek() == '>')
                        {
                            _stringParser.TakeOne();
                        }
                        else
                        {
                            // The '/' was not followed by the closing '>'
                            _document.ConformanceLevel *= 0.9f;
                        }
                    }
                    else if (terminator == '>')
                    {
                        _characterStream.State = _voidElements.Contains(tagName)
                            ? HtmlStates.SelfClosingStartTag
                            : HtmlStates.Data;
                    }
                    else
                    {
                        // Terminated by whitespace, attributes may follow
                        _characterStream.State = HtmlStates.AttributeName;
                        selfClosing            = _voidElements.Contains(tagName);
                    }
                }
                else
                {
                    // No terminator was found. Try to recover by checking whether the
                    // text starts with the name of any known element.
                    _document.ConformanceLevel *= 0.9f;
                    nameBuffer.Clear();
                    _stringParser.Take(nameBuffer, _maximumNameLength);
                    var buffer = nameBuffer.ToString().ToLowerInvariant();
                    _characterStream.State = HtmlStates.Data;
                    foreach (var name in _allElements)
                    {
                        if (buffer.StartsWith(name, System.StringComparison.Ordinal))
                        {
                            // Rewind and consume only the characters of the element name
                            nameBuffer.Clear();
                            _characterStream.Reset(startPosition);
                            _stringParser.Take(nameBuffer, name.Length);

                            // The match above was made on lower-cased text, so the name
                            // must be lower-cased here too or the lookups below will miss
                            tagName = nameBuffer.ToString().ToLowerInvariant();
                            _characterStream.State = HtmlStates.AttributeName;
                            selfClosing            = _voidElements.Contains(tagName);
                            break;
                        }
                    }
                    if (tagName == null)
                    {
                        _document.ConformanceLevel *= 0.6f;
                    }
                }
            }

            // Attributes are consumed from the stream even when no element is created
            var attributes = _characterStream.State == HtmlStates.AttributeName ? ParseAttributes(selfClosing) : null;

            if (currentElement != null)
            {
                var parentElement = currentElement;

                switch (tagName)
                {
                case "html":
                case "body":
                case "form":
                case "header":
                case "footer":
                    // These elements are not parsed and contain no details. They are included in the output
                    // only as containers for their children
                    currentElement = new UnsupportedElement
                    {
                        Attributes     = attributes,
                        SuppressOutput = false
                    };
                    break;

                case "p":
                case "li":
                    // These elements are treated as paragraphs in other markup formats, for
                    // example in markdown they will have a blank line above them to create a
                    // paragraph break.
                    currentElement = new ParagraphElement
                    {
                        Attributes = attributes
                    };
                    break;

                case "blockquote":
                    // Block quotes are paragraphs with a default indented style
                    currentElement = new ParagraphElement
                    {
                        Attributes = attributes,
                        Styles     = new Dictionary<string, string>
                        {
                            { "margin-top", "10px" },
                            { "margin-bottom", "10px" },
                            { "margin-left", "50px" },
                            { "padding-left", "15px" },
                            { "border-left", "3px solid #ccc" }
                        }
                    };
                    break;

                case "div":
                    // Divs are tricky because some people use them to group elements with similar
                    // style and other people use them instead of paragraphs. Since divs are by
                    // default block elements it makes more sense in most cases to treat them like
                    // paragraphs unless they have paragraphs or other divs within them.
                    currentElement = new ContainerElement
                    {
                        ContainerType = ContainerTypes.Division,
                        Attributes    = attributes
                    };
                    break;

                case "span":
                    // These elements are treated as inline text. For example in markdown
                    // these are rendered without an extra blank line and are therefore rendered
                    // as part of the prior paragraph
                    currentElement = new SpanElement
                    {
                        Attributes = attributes
                    };
                    break;

                case "a":
                    // Anchor tags are a special case, they are only links when they have an href
                    if (attributes != null && attributes.ContainsKey("href"))
                    {
                        currentElement = new AnchorElement
                        {
                            LinkAddress = attributes["href"]
                        };
                    }
                    else
                    {
                        currentElement = new UnsupportedElement
                        {
                            Attributes = attributes
                        };
                    }
                    break;

                case "iframe":
                case "img":
                    // Image tags are a special case, they are only images when they have a src
                    if (attributes != null && attributes.ContainsKey("src"))
                    {
                        var alt = attributes.ContainsKey("alt") ? attributes["alt"] : null;
                        currentElement = new ImageElement
                        {
                            LinkAddress = attributes["src"],
                            AltText     = alt
                        };
                    }
                    else
                    {
                        currentElement = new UnsupportedElement
                        {
                            Attributes = attributes
                        };
                    }
                    break;

                case "h1":
                    currentElement = new HeadingElement { Level = 1 };
                    break;

                case "h2":
                    currentElement = new HeadingElement { Level = 2 };
                    break;

                case "h3":
                    currentElement = new HeadingElement { Level = 3 };
                    break;

                case "h4":
                    currentElement = new HeadingElement { Level = 4 };
                    break;

                case "h5":
                    currentElement = new HeadingElement { Level = 5 };
                    break;

                case "h6":
                    currentElement = new HeadingElement { Level = 6 };
                    break;

                case "strong":
                case "b":
                    // Bold is represented as an inline style
                    currentElement = new FormattedElement
                    {
                        ElementType = ElementTypes.InlineText,
                        Styles      = new Dictionary<string, string>
                        {
                            { "font-weight", "bold" }
                        }
                    };
                    break;

                case "cite":
                case "q":
                case "i":
                case "em":
                    // Italic is represented as an inline style
                    currentElement = new FormattedElement
                    {
                        ElementType = ElementTypes.InlineText,
                        Styles      = new Dictionary<string, string>
                        {
                            { "font-style", "italic" }
                        }
                    };
                    break;

                case "u":
                    // Underline is represented as an inline style
                    currentElement = new FormattedElement
                    {
                        ElementType = ElementTypes.InlineText,
                        Styles      = new Dictionary<string, string>
                        {
                            { "text-decoration", "underline" }
                        }
                    };
                    break;

                case "small":
                    // Small is represented as an inline style
                    currentElement = new FormattedElement
                    {
                        ElementType = ElementTypes.InlineText,
                        Styles      = new Dictionary<string, string>
                        {
                            { "font-size", "smaller" }
                        }
                    };
                    break;

                case "sup":
                    // Superscript is represented as an inline style
                    currentElement = new FormattedElement
                    {
                        ElementType = ElementTypes.InlineText,
                        Styles      = new Dictionary<string, string>
                        {
                            { "vertical-align", "super" },
                            { "font-size", "smaller" }
                        }
                    };
                    break;

                case "sub":
                    // Subscript is represented as an inline style
                    currentElement = new FormattedElement
                    {
                        ElementType = ElementTypes.InlineText,
                        Styles      = new Dictionary<string, string>
                        {
                            { "vertical-align", "sub" },
                            { "font-size", "smaller" }
                        }
                    };
                    break;

                case "br":
                    currentElement = new BreakElement
                    {
                        BreakType = BreakTypes.LineBreak
                    };
                    break;

                case "hr":
                    currentElement = new BreakElement
                    {
                        BreakType = BreakTypes.HorizontalRule
                    };
                    break;

                case "ul":
                    currentElement = new ContainerElement
                    {
                        ContainerType = ContainerTypes.BulletList,
                        Attributes    = attributes
                    };
                    break;

                case "ol":
                    currentElement = new ContainerElement
                    {
                        ContainerType = ContainerTypes.NumberedList,
                        Attributes    = attributes
                    };
                    break;

                case "table":
                    currentElement = new ContainerElement
                    {
                        ContainerType = ContainerTypes.Table,
                        Attributes    = attributes
                    };
                    break;

                case "tr":
                    currentElement = new ContainerElement
                    {
                        ContainerType = ContainerTypes.TableDataRow,
                        Attributes    = attributes
                    };
                    break;

                case "th":
                    // NOTE(review): a th is a header cell but is mapped to the header row
                    // container type here - confirm this is what the writers expect.
                    currentElement = new ContainerElement
                    {
                        ContainerType = ContainerTypes.TableHeaderRow,
                        Attributes    = attributes
                    };
                    break;

                case "td":
                    currentElement = new ContainerElement
                    {
                        ContainerType = ContainerTypes.TableDataCell,
                        Attributes    = attributes
                    };
                    break;

                default:
                    // All other elements will be excluded from the output document, but will
                    // be parsed just so that we know where they end and the next valid element
                    // begins. A tag whose name could not be determined (null) also ends up here.
                    currentElement = new UnsupportedElement
                    {
                        Attributes = attributes
                    };
                    break;
                }

                // Move the class and style attributes into the dedicated style properties
                // of elements that support styling
                var styleElement = currentElement as IStyleElement;
                if (styleElement != null && attributes != null)
                {
                    if (attributes.ContainsKey("class"))
                    {
                        styleElement.ClassNames = attributes["class"];
                        attributes.Remove("class");
                    }
                    if (attributes.ContainsKey("style"))
                    {
                        if (styleElement.Styles == null)
                        {
                            styleElement.Styles = new Dictionary<string, string>();
                        }
                        var styles = attributes["style"].Split(';').Select(s => s.Trim()).Where(s => s.Length > 0);
                        foreach (var style in styles)
                        {
                            // A declaration needs a non-empty name before the colon and
                            // at least one character after it
                            var colonPos = style.IndexOf(':');
                            if (colonPos > 0 && colonPos < style.Length - 1)
                            {
                                var name  = style.Substring(0, colonPos).Trim().ToLowerInvariant();
                                var value = style.Substring(colonPos + 1).Trim().ToLowerInvariant();

                                // Entries that already exist (element defaults or an earlier
                                // declaration of the same name) are kept
                                if (!styleElement.Styles.ContainsKey(name))
                                {
                                    styleElement.Styles[name] = value;
                                }
                            }
                            else
                            {
                                _document.ConformanceLevel *= 0.9f;
                            }
                        }
                        attributes.Remove("style");
                    }
                }

                // Link the new element into the document tree
                currentElement.Name   = tagName;
                currentElement.Parent = parentElement;
                if (parentElement.SuppressOutput)
                {
                    // Children of suppressed elements are suppressed as well
                    currentElement.SuppressOutput = true;
                }

                if (parentElement.Children == null)
                {
                    parentElement.Children = new List<IDocumentElement>();
                }
                parentElement.Children.Add(currentElement);

                if (!Begin(currentElement))
                {
                    return null;
                }
            }

            return currentElement;
        }
        /// <summary>
        /// Tries to interpret a line of markdown as part of a pipe delimited table. Depending
        /// on the current parser state the line is handled as a heading row, as the row of
        /// dashes and colons that defines the column alignment, or as a data row.
        /// </summary>
        /// <param name="trimmedLine">
        /// The line of text to examine. Presumably already trimmed of surrounding
        /// whitespace by the caller - the name suggests so but this is not visible here.
        /// </param>
        /// <returns>True if the line was consumed as part of a table, false otherwise</returns>
        private bool CheckTable(string trimmedLine)
        {
            // Splits a line into trimmed cells. The empty cells produced by a leading
            // and/or trailing pipe character are discarded.
            Func<string, List<string>> splitColumns = line =>
            {
                var columns = line.Split('|').Select(c => c.Trim()).ToList();
                if (line.StartsWith("|", StringComparison.Ordinal))
                {
                    columns.RemoveAt(0);
                }
                if (line.EndsWith("|", StringComparison.Ordinal))
                {
                    columns.RemoveAt(columns.Count - 1);
                }
                return columns;
            };

            // The length check keeps an empty line from throwing on the index access
            var startsWithPipe = trimmedLine.Length > 0 && trimmedLine[0] == '|';

            // After a paragraph break any line containing a pipe starts a table. Directly
            // after a paragraph or heading the line must begin with a pipe.
            if ((_characterStream.State == MarkdownStates.ParagraphBreak && trimmedLine.Contains('|')) ||
                ((_characterStream.State == MarkdownStates.Paragraph || _characterStream.State == MarkdownStates.Heading) && startsWithPipe))
            {
                PushElement(new ContainerElement
                {
                    Name          = "table",
                    ContainerType = ContainerTypes.Table,
                    ChildLayout   = new List<string>()
                });
                PushElement(new ContainerElement
                {
                    Name          = "thead",
                    ContainerType = ContainerTypes.TableHeader
                });
                _characterStream.State = MarkdownStates.TableHeadings;
            }

            if (_characterStream.State == MarkdownStates.TableHeadings)
            {
                if (trimmedLine.All(c => char.IsWhiteSpace(c) || "|-:".Contains(c)))
                {
                    // This is the row that separates the headings from the data. The
                    // position of the colons defines the alignment of each column.
                    var columnFormats = splitColumns(trimmedLine);

                    for (var i = 0; i < columnFormats.Count; i++)
                    {
                        var format = columnFormats[i];
                        columnFormats[i] = string.Empty;
                        if (format.Length > 1)
                        {
                            if (format[format.Length - 1] == ':')
                            {
                                columnFormats[i] = format[0] == ':' ? "center" : "right";
                            }
                            else if (format[0] == ':')
                            {
                                columnFormats[i] = "left";
                            }
                        }
                    }

                    // Store the alignments on the table so that data rows can apply them
                    var tableElement = FindPriorElement<ContainerElement>(c => c.ContainerType == ContainerTypes.Table);
                    if (tableElement != null)
                    {
                        tableElement.ChildLayout = columnFormats;
                    }

                    PopElement(); // Pop the TableHeader

                    PushElement(new ContainerElement
                    {
                        Name          = "tbody",
                        ContainerType = ContainerTypes.TableBody
                    });

                    _characterStream.State = MarkdownStates.TableRow;
                }
                else
                {
                    // A row of column headings
                    PushElement(new ContainerElement
                    {
                        Name          = "tr",
                        ContainerType = ContainerTypes.TableHeaderRow
                    });
                    foreach (var column in splitColumns(trimmedLine))
                    {
                        PushElement(new ContainerElement
                        {
                            Name          = "th",
                            ContainerType = ContainerTypes.TableDataCell
                        });
                        ParseText(column);
                        PopElement(); // Pop the th off the stack, adding it to the tr
                    }
                    PopElement();     // Pop the tr, adding it to the table
                }
                return true;
            }

            if (_characterStream.State == MarkdownStates.TableRow)
            {
                // Column alignments are only available if the table element can be found
                // and its layout is the list of formats stored by the headings code above
                var tableElement  = FindPriorElement<ContainerElement>(c => c.ContainerType == ContainerTypes.Table);
                var columnFormats = tableElement == null ? null : tableElement.ChildLayout as List<string>;

                PushElement(new ContainerElement
                {
                    Name          = "tr",
                    ContainerType = ContainerTypes.TableDataRow
                });
                var columns = splitColumns(trimmedLine);
                for (var i = 0; i < columns.Count; i++)
                {
                    var td = new ContainerElement
                    {
                        Name          = "td",
                        ContainerType = ContainerTypes.TableDataCell
                    };

                    // Rows can have more cells than there are column formats
                    if (columnFormats != null && columnFormats.Count > i && !string.IsNullOrEmpty(columnFormats[i]))
                    {
                        td.Attributes = new Dictionary<string, string>
                        {
                            { "align", columnFormats[i] }
                        };
                    }
                    PushElement(td);
                    ParseText(columns[i]);
                    PopElement(); // Pop the td off the stack, adding it to the tr
                }
                PopElement();     // Pop the tr, adding it to the table
                return true;
            }

            return false;
        }