示例#1
0
        /// <summary>
        /// Parses an opening tag like '&lt;div>'. Starts with the input stream
        /// pointing to the opening &lt; character.
        /// </summary>
        /// <param name="currentElement">
        /// The element that is currently open. When not null, the element created for
        /// the parsed tag is added to its children. When null, the tag is consumed
        /// from the stream but no element is created.
        /// </param>
        /// <returns>
        /// The newly created element, which becomes the current element. Returns
        /// <paramref name="currentElement"/> unchanged when the tag turns out to be a
        /// markup declaration ('&lt;!') or an end tag ('&lt;/'), or when it was null.
        /// Returns null when the input stream is exhausted or when <c>Begin</c>
        /// rejects the new element.
        /// </returns>
        private Element ProcessTagOpenState(Element currentElement)
        {
            if (!_characterStream.MoveNext())
            {
                // Input ended directly after the '<'
                _document.ConformanceLevel *= 0.8f;
                return null;
            }

            switch (_characterStream.Current)
            {
            case '!':
                _characterStream.State = HtmlStates.MarkupDeclarationOpen;
                return currentElement;

            case '/':
                _characterStream.State = HtmlStates.EndTagOpen;
                return currentElement;
            }

            // Remembered so that the recovery path below can rewind and re-read the name
            var startPosition = _characterStream.CurrentPosition;

            string tagName     = null;
            var    selfClosing = false;

            using (var nameBuffer = _stringBuilderFactory.Create(_maximumNameLength))
            {
                char? terminator;
                _characterStream.State = HtmlStates.TagName;
                if (_stringParser.TakeUntil(nameBuffer, _maximumNameLength, c => char.IsWhiteSpace(c) || c == '>' || c == '/', out terminator))
                {
                    // Tag names are matched case-insensitively. The invariant culture is used
                    // so that the result does not depend on the machine's locale (for example
                    // the Turkish dotless i).
                    tagName = nameBuffer.ToString().ToLowerInvariant();
                    if (!terminator.HasValue || terminator == '/')
                    {
                        _characterStream.State = HtmlStates.SelfClosingStartTag;
                        if (_stringParser.Peek() == '>')
                        {
                            _stringParser.TakeOne();
                        }
                        else
                        {
                            _document.ConformanceLevel *= 0.9f;
                        }
                    }
                    else if (terminator == '>')
                    {
                        _characterStream.State = _voidElements.Contains(tagName)
                            ? HtmlStates.SelfClosingStartTag
                            : HtmlStates.Data;
                    }
                    else
                    {
                        // Terminated by whitespace, so attributes may follow
                        _characterStream.State = HtmlStates.AttributeName;
                        selfClosing            = _voidElements.Contains(tagName);
                    }
                }
                else
                {
                    // No terminator was found within the maximum name length. Try to recover
                    // by checking whether the text begins with a known element name.
                    _document.ConformanceLevel *= 0.9f;
                    nameBuffer.Clear();
                    _stringParser.Take(nameBuffer, _maximumNameLength);
                    var buffer = nameBuffer.ToString().ToLowerInvariant();
                    _characterStream.State = HtmlStates.Data;
                    foreach (var name in _allElements)
                    {
                        if (buffer.StartsWith(name, StringComparison.Ordinal))
                        {
                            // Rewind and consume only the characters that make up the name
                            nameBuffer.Clear();
                            _characterStream.Reset(startPosition);
                            _stringParser.Take(nameBuffer, name.Length);

                            // The characters re-read from the stream are in their original
                            // case, so normalize them like the regular path does. Otherwise
                            // the lookups below would miss for input such as "<DIVclass=...>".
                            tagName = nameBuffer.ToString().ToLowerInvariant();
                            _characterStream.State = HtmlStates.AttributeName;
                            selfClosing            = _voidElements.Contains(tagName);
                            break;
                        }
                    }
                    if (tagName == null)
                    {
                        _document.ConformanceLevel *= 0.6f;
                    }
                }
            }

            var attributes = _characterStream.State == HtmlStates.AttributeName ? ParseAttributes(selfClosing) : null;

            if (currentElement != null)
            {
                var parentElement = currentElement;

                switch (tagName)
                {
                case "html":
                case "body":
                case "form":
                case "header":
                case "footer":
                    // These elements are not parsed and contain no details. They are included in the output
                    // only as containers for their children
                    currentElement = new UnsupportedElement
                    {
                        Attributes     = attributes,
                        SuppressOutput = false
                    };
                    break;

                case "p":
                case "li":
                    // These elements are treated as paragraphs in other markup formats, for
                    // example in markdown they will have a blank line above them to create a
                    // paragraph break.
                    currentElement = new ParagraphElement
                    {
                        Attributes = attributes
                    };
                    break;

                case "blockquote":
                    // Block quotes are paragraphs with default indentation and a left border
                    currentElement = new ParagraphElement
                    {
                        Attributes = attributes,
                        Styles     = new Dictionary<string, string>
                        {
                            { "margin-top", "10px" },
                            { "margin-bottom", "10px" },
                            { "margin-left", "50px" },
                            { "padding-left", "15px" },
                            { "border-left", "3px solid #ccc" }
                        }
                    };
                    break;

                case "div":
                    // Divs are tricky because some people use them to group elements with similar
                    // style and other people use them instead of paragraphs. Since divs are by
                    // default block elements it makes more sense in most cases to treat them like
                    // paragraphs unless they have paragraphs or other divs within them.
                    currentElement = new ContainerElement
                    {
                        ContainerType = ContainerTypes.Division,
                        Attributes    = attributes
                    };
                    break;

                case "span":
                    // These elements are treated as inline text. For example in markdown
                    // these are rendered without an extra blank line and are therefore rendered
                    // as part of the prior paragraph
                    currentElement = new SpanElement
                    {
                        Attributes = attributes
                    };
                    break;

                case "a":
                    // Anchor tags are a special case, they are only links when they have an href
                    if (attributes != null && attributes.ContainsKey("href"))
                    {
                        currentElement = new AnchorElement
                        {
                            LinkAddress = attributes["href"]
                        };
                    }
                    else
                    {
                        currentElement = new UnsupportedElement
                        {
                            Attributes = attributes
                        };
                    }
                    break;

                case "iframe":
                case "img":
                    // Image tags are a special case, they are only images when they have a src
                    if (attributes != null && attributes.ContainsKey("src"))
                    {
                        var alt = attributes.ContainsKey("alt") ? attributes["alt"] : null;
                        currentElement = new ImageElement
                        {
                            LinkAddress = attributes["src"],
                            AltText     = alt
                        };
                    }
                    else
                    {
                        currentElement = new UnsupportedElement
                        {
                            Attributes = attributes
                        };
                    }
                    break;

                case "h1":
                    currentElement = new HeadingElement { Level = 1 };
                    break;

                case "h2":
                    currentElement = new HeadingElement { Level = 2 };
                    break;

                case "h3":
                    currentElement = new HeadingElement { Level = 3 };
                    break;

                case "h4":
                    currentElement = new HeadingElement { Level = 4 };
                    break;

                case "h5":
                    currentElement = new HeadingElement { Level = 5 };
                    break;

                case "h6":
                    currentElement = new HeadingElement { Level = 6 };
                    break;

                case "strong":
                case "b":
                    // Bold is represented as an inline style
                    currentElement = new FormattedElement
                    {
                        ElementType = ElementTypes.InlineText,
                        Styles      = new Dictionary<string, string>
                        {
                            { "font-weight", "bold" }
                        }
                    };
                    break;

                case "cite":
                case "q":
                case "i":
                case "em":
                    // Italic is represented as an inline style
                    currentElement = new FormattedElement
                    {
                        ElementType = ElementTypes.InlineText,
                        Styles      = new Dictionary<string, string>
                        {
                            { "font-style", "italic" }
                        }
                    };
                    break;

                case "u":
                    // Underline is represented as an inline style
                    currentElement = new FormattedElement
                    {
                        ElementType = ElementTypes.InlineText,
                        Styles      = new Dictionary<string, string>
                        {
                            { "text-decoration", "underline" }
                        }
                    };
                    break;

                case "small":
                    // Small is represented as an inline style
                    currentElement = new FormattedElement
                    {
                        ElementType = ElementTypes.InlineText,
                        Styles      = new Dictionary<string, string>
                        {
                            { "font-size", "smaller" }
                        }
                    };
                    break;

                case "sup":
                    // Superscript is represented as an inline style
                    currentElement = new FormattedElement
                    {
                        ElementType = ElementTypes.InlineText,
                        Styles      = new Dictionary<string, string>
                        {
                            { "vertical-align", "super" },
                            { "font-size", "smaller" }
                        }
                    };
                    break;

                case "sub":
                    // Subscript is represented as an inline style
                    currentElement = new FormattedElement
                    {
                        ElementType = ElementTypes.InlineText,
                        Styles      = new Dictionary<string, string>
                        {
                            { "vertical-align", "sub" },
                            { "font-size", "smaller" }
                        }
                    };
                    break;

                case "br":
                    currentElement = new BreakElement
                    {
                        BreakType = BreakTypes.LineBreak
                    };
                    break;

                case "hr":
                    currentElement = new BreakElement
                    {
                        BreakType = BreakTypes.HorizontalRule
                    };
                    break;

                case "ul":
                    currentElement = new ContainerElement
                    {
                        ContainerType = ContainerTypes.BulletList,
                        Attributes    = attributes
                    };
                    break;

                case "ol":
                    currentElement = new ContainerElement
                    {
                        ContainerType = ContainerTypes.NumberedList,
                        Attributes    = attributes
                    };
                    break;

                case "table":
                    currentElement = new ContainerElement
                    {
                        ContainerType = ContainerTypes.Table,
                        Attributes    = attributes
                    };
                    break;

                case "tr":
                    currentElement = new ContainerElement
                    {
                        ContainerType = ContainerTypes.TableDataRow,
                        Attributes    = attributes
                    };
                    break;

                case "th":
                    currentElement = new ContainerElement
                    {
                        ContainerType = ContainerTypes.TableHeaderRow,
                        Attributes    = attributes
                    };
                    break;

                case "td":
                    currentElement = new ContainerElement
                    {
                        ContainerType = ContainerTypes.TableDataCell,
                        Attributes    = attributes
                    };
                    break;

                default:
                    // All other elements will be excluded from the output document, but will
                    // be parsed just so that we know where they end and the next valid element
                    // begins. This also covers the case where no tag name could be recovered.
                    currentElement = new UnsupportedElement
                    {
                        Attributes = attributes
                    };
                    break;
                }

                // Move the class and style attributes onto elements that support styling
                var styleElement = currentElement as IStyleElement;
                if (styleElement != null && attributes != null)
                {
                    if (attributes.ContainsKey("class"))
                    {
                        styleElement.ClassNames = attributes["class"];
                        attributes.Remove("class");
                    }
                    if (attributes.ContainsKey("style"))
                    {
                        if (styleElement.Styles == null)
                        {
                            styleElement.Styles = new Dictionary<string, string>();
                        }
                        var styles = attributes["style"].Split(';').Select(s => s.Trim()).Where(s => s.Length > 0);
                        foreach (var style in styles)
                        {
                            var colonPos = style.IndexOf(':');
                            if (colonPos > 0 && colonPos < style.Length - 1)
                            {
                                var name  = style.Substring(0, colonPos).Trim().ToLowerInvariant();
                                var value = style.Substring(colonPos + 1).Trim().ToLowerInvariant();

                                // Styles implied by the element type take precedence over
                                // inline declarations, and the first inline declaration wins
                                if (!styleElement.Styles.ContainsKey(name))
                                {
                                    styleElement.Styles[name] = value;
                                }
                            }
                            else
                            {
                                // A declaration without both a name and a value is malformed
                                _document.ConformanceLevel *= 0.9f;
                            }
                        }
                        attributes.Remove("style");
                    }
                }

                currentElement.Name   = tagName;
                currentElement.Parent = parentElement;
                if (parentElement.SuppressOutput)
                {
                    // Output suppression is inherited from the parent
                    currentElement.SuppressOutput = true;
                }

                if (parentElement.Children == null)
                {
                    parentElement.Children = new List<IDocumentElement>();
                }
                parentElement.Children.Add(currentElement);

                if (!Begin(currentElement))
                {
                    return null;
                }
            }

            return currentElement;
        }
        /// <summary>
        /// Parses a run of markdown-style inline text and pushes the resulting elements.
        /// Recognizes bold (** or __), italic (* or _), code (`), links in the forms
        /// [title](url), [title][reference], [title] and &lt;url>, image links prefixed
        /// with '!', and reference definitions of the form [title]: url.
        /// </summary>
        /// <param name="text">The text to parse. Null or empty text produces no elements.</param>
        private void ParseText(string text)
        {
            if (string.IsNullOrEmpty(text))
            {
                return;
            }

            var isBold   = false;
            var isItalic = false;
            var isCode   = false;

            using (var buffer = _stringBuilderFactory.Create())
            {
                // Emits any accumulated plain text as a raw text element and empties the buffer
                Action<IStringBuilder> flush = b =>
                {
                    if (b.Length > 0)
                    {
                        PushElement(new RawTextElement
                        {
                            Text = b.ToString()
                        });
                        PopElement();
                        b.Clear();
                    }
                };

                for (var i = 0; i < text.Length; i++)
                {
                    var prior2  = i > 1 ? text[i - 2] : default(char);
                    var prior1  = i > 0 ? text[i - 1] : default(char);
                    var current = text[i];

                    // NOTE(review): this bound excludes the second to last character, so a text
                    // that ends in "![" keeps its literal '!'. Left as is; confirm this is intended.
                    var next = i < text.Length - 2 ? text[i + 1] : default(char);

                    if ((prior1 == '*' && current == '*') || (prior1 == '_' && current == '_'))
                    {
                        // Double asterisk or double underscore turns bold on/off
                        flush(buffer);
                        if (isBold)
                        {
                            PopElement();
                            isBold = false;
                        }
                        else
                        {
                            PushElement(new FormattedElement
                            {
                                Name        = new String(current, 2),
                                ElementType = ElementTypes.InlineText,
                                Styles      = new Dictionary<string, string>
                                {
                                    { "font-weight", "bold" }
                                }
                            });
                            isBold = true;
                        }
                    }
                    else if ((prior1 == '*' || prior1 == '_') && (prior1 != prior2))
                    {
                        // Single asterisk or single underscore turns italic on/off. This is
                        // decided one character late, so the current character is plain text.
                        flush(buffer);
                        if (isItalic)
                        {
                            PopElement();
                            isItalic = false;
                        }
                        else
                        {
                            PushElement(new FormattedElement
                            {
                                Name        = new String(prior1, 1),
                                ElementType = ElementTypes.InlineText,
                                Styles      = new Dictionary<string, string>
                                {
                                    { "font-style", "italic" }
                                }
                            });
                            isItalic = true;
                        }
                        buffer.Append(current);
                    }
                    else if (current == '*' || current == '_')
                    {
                        // When we see the first asterisk or underscore, we don't know yet if this
                        // is going to be bold or italic, so it is handled on the next character.
                        // A marker at the very end of the text is closed after the loop.
                    }
                    else if (current == '`')
                    {
                        // Backticks turn on/off code formatting
                        flush(buffer);
                        if (isCode)
                        {
                            PopElement();
                            isCode = false;
                        }
                        else
                        {
                            PushElement(new FormattedElement
                            {
                                Name        = new String(current, 2),
                                ElementType = ElementTypes.InlineText,
                                ClassNames  = "code"
                            });
                            isCode = true;
                        }
                    }
                    else if (current == '!')
                    {
                        // Can be an image link if it is followed by [], in which case the
                        // exclamation mark is not part of the text
                        if (next != '[')
                        {
                            buffer.Append(current);
                        }
                    }
                    else if (current == '[')
                    {
                        // Open square bracket is the start of a hyperlink

                        if (text.Length < i + 3)
                        {
                            // Too short to hold "[x]", so treat as plain text
                            buffer.Append(current);
                            continue;
                        }

                        var firstCloseIndex = text.IndexOf(']', i + 1);
                        if (firstCloseIndex < 0)
                        {
                            buffer.Append(current);
                            continue;
                        }

                        AnchorElement anchor      = null;
                        var           title       = text.Substring(i + 1, firstCloseIndex - i - 1);
                        var           nextChar    = text.Length > firstCloseIndex + 3 ? text[firstCloseIndex + 1] : default(char);
                        var           isImageLink = prior1 == '!';

                        if (nextChar == '(')
                        {
                            // Url is inline with the link. A missing close bracket means the
                            // url runs to the end of the text.
                            var secondCloseIndex = text.IndexOf(')', firstCloseIndex + 2);
                            if (secondCloseIndex == -1)
                            {
                                secondCloseIndex = text.Length;
                            }
                            anchor = new AnchorElement
                            {
                                Name        = "()",
                                LinkAddress = text.Substring(firstCloseIndex + 2, secondCloseIndex - firstCloseIndex - 2)
                            };
                            i = secondCloseIndex;
                        }
                        else if (nextChar == '[')
                        {
                            // Url is provided elsewhere in the document as a reference
                            var secondCloseIndex = text.IndexOf(']', firstCloseIndex + 2);
                            if (secondCloseIndex == -1)
                            {
                                secondCloseIndex = text.Length;
                            }
                            anchor = new AnchorElement
                            {
                                Name        = "[]",
                                LinkAddress = text.Substring(firstCloseIndex + 2, secondCloseIndex - firstCloseIndex - 2)
                            };
                            _anchorsToFixup.Add(anchor);
                            i = secondCloseIndex;
                        }
                        else if (nextChar == ':')
                        {
                            // Reference definition. Skip whitespace after the colon, then take
                            // everything up to the next whitespace as the url. Both scans are
                            // bounded so that a definition followed only by whitespace does not
                            // read past the end of the text.
                            var urlStartIndex = firstCloseIndex + 2;
                            while (urlStartIndex < text.Length && char.IsWhiteSpace(text[urlStartIndex]))
                            {
                                urlStartIndex++;
                            }

                            var urlEndIndex = urlStartIndex;
                            while (urlEndIndex < text.Length && !char.IsWhiteSpace(text[urlEndIndex]))
                            {
                                urlEndIndex++;
                            }

                            // A definition without a url is malformed and is not recorded
                            if (urlEndIndex > urlStartIndex)
                            {
                                _references[title] = text.Substring(urlStartIndex, urlEndIndex - urlStartIndex);
                            }
                            i = urlEndIndex - 1;
                        }
                        else
                        {
                            // Self-referencing, the title is also the name of the reference
                            anchor = new AnchorElement
                            {
                                Name        = "[]",
                                LinkAddress = title
                            };
                            _anchorsToFixup.Add(anchor);
                            i = firstCloseIndex;
                        }

                        if (anchor != null)
                        {
                            flush(buffer);
                            if (isImageLink)
                            {
                                anchor.LinkType = LinkTypes.Image;
                                anchor.AltText  = title;
                                PushElement(anchor);
                                PopElement(); // Pop the anchor to add it to its parent
                            }
                            else
                            {
                                PushElement(anchor);
                                PushElement(new RawTextElement
                                {
                                    Text = title
                                });
                                PopElement(); // Pop the raw text
                                PopElement(); // Pop the anchor
                            }
                        }
                    }
                    else if (current == '<')
                    {
                        // Angle brackets are the start of a hyperlink with no title
                        var closeIndex = text.IndexOf('>', i + 1);
                        if (closeIndex < 0)
                        {
                            buffer.Append(current);
                            continue;
                        }

                        var url = text.Substring(i + 1, closeIndex - i - 1);

                        flush(buffer);
                        PushElement(new AnchorElement
                        {
                            Name        = "<>",
                            LinkAddress = url
                        });
                        PushElement(new RawTextElement
                        {
                            Text = url
                        });
                        PopElement(); // Pop the raw text
                        PopElement(); // Pop the anchor

                        i = closeIndex;
                    }
                    else
                    {
                        buffer.Append(current);
                    }
                }

                flush(buffer);

                // Close any formatting that was opened but never closed so that every
                // pushed element is popped again before returning
                if (isCode)
                {
                    PopElement();
                }

                if (isItalic)
                {
                    PopElement();
                }

                if (isBold)
                {
                    PopElement();
                }
            }
        }