/// <summary>
/// Parses an opening tag like '&lt;div&gt;'. Starts with the input stream
/// pointing to the opening &lt; character.
/// </summary>
/// <param name="currentElement">
/// The element that becomes the parent of the newly parsed element. When this
/// is null the tag is still consumed from the stream but no element is created.
/// </param>
/// <returns>
/// The newly created element; <paramref name="currentElement"/> unchanged when
/// the tag turns out to be a markup declaration ('!') or an end tag ('/');
/// or null when the stream is exhausted or <c>Begin</c> rejects the new element.
/// </returns>
private Element ProcessTagOpenState(Element currentElement)
{
    if (!_characterStream.MoveNext())
    {
        // Input ended immediately after the '<'
        _document.ConformanceLevel *= 0.8f;
        return null;
    }

    switch (_characterStream.Current)
    {
        case '!':
            _characterStream.State = HtmlStates.MarkupDeclarationOpen;
            return currentElement;

        case '/':
            _characterStream.State = HtmlStates.EndTagOpen;
            return currentElement;
    }

    var startPosition = _characterStream.CurrentPosition;
    string tagName = null;
    var selfClosing = false;

    using (var nameBuffer = _stringBuilderFactory.Create(_maximumNameLength))
    {
        char? terminator;
        _characterStream.State = HtmlStates.TagName;

        if (_stringParser.TakeUntil(
            nameBuffer,
            _maximumNameLength,
            c => char.IsWhiteSpace(c) || c == '>' || c == '/',
            out terminator))
        {
            // Tag names are matched case-insensitively. The invariant culture is
            // used so that the result does not depend on the current thread culture
            // (e.g. "IMG" must always become "img").
            tagName = nameBuffer.ToString().ToLowerInvariant();

            if (!terminator.HasValue || terminator == '/')
            {
                _characterStream.State = HtmlStates.SelfClosingStartTag;
                if (_stringParser.Peek() == '>')
                {
                    _stringParser.TakeOne();
                }
                else
                {
                    // A '/' (or end of input) that is not followed by '>'
                    _document.ConformanceLevel *= 0.9f;
                }
            }
            else if (terminator == '>')
            {
                _characterStream.State = _voidElements.Contains(tagName)
                    ? HtmlStates.SelfClosingStartTag
                    : HtmlStates.Data;
            }
            else
            {
                // Terminated by whitespace, attributes may follow
                _characterStream.State = HtmlStates.AttributeName;
                selfClosing = _voidElements.Contains(tagName);
            }
        }
        else
        {
            // No terminator was found within the maximum name length. Try to
            // recover by checking whether the text begins with a known element name.
            _document.ConformanceLevel *= 0.9f;

            nameBuffer.Clear();
            _stringParser.Take(nameBuffer, _maximumNameLength);
            var buffer = nameBuffer.ToString().ToLowerInvariant();

            _characterStream.State = HtmlStates.Data;

            foreach (var name in _allElements)
            {
                if (buffer.StartsWith(name, System.StringComparison.Ordinal))
                {
                    nameBuffer.Clear();
                    _characterStream.Reset(startPosition);
                    _stringParser.Take(nameBuffer, name.Length);

                    // The text is re-read from the stream in its original casing, so
                    // it must be lower-cased again to match the element names below.
                    tagName = nameBuffer.ToString().ToLowerInvariant();

                    _characterStream.State = HtmlStates.AttributeName;
                    selfClosing = _voidElements.Contains(tagName);
                    break;
                }
            }

            if (tagName == null)
            {
                _document.ConformanceLevel *= 0.6f;
            }
        }
    }

    var attributes = _characterStream.State == HtmlStates.AttributeName
        ? ParseAttributes(selfClosing)
        : null;

    if (currentElement == null)
    {
        return null;
    }

    var parentElement = currentElement;

    // Note that tagName is null when the name could not be recognized, in which
    // case the default branch applies.
    switch (tagName)
    {
        case "html":
        case "body":
        case "form":
        case "header":
        case "footer":
            // These elements are not parsed and contain no details. They are included
            // in the output only as containers for their children
            currentElement = new UnsupportedElement { Attributes = attributes, SuppressOutput = false };
            break;

        case "p":
        case "li":
            // These elements are treated as paragraphs in other markup formats, for
            // example in markdown they will have a blank line above them to create a
            // paragraph break.
            currentElement = new ParagraphElement { Attributes = attributes };
            break;

        case "blockquote":
            currentElement = new ParagraphElement
            {
                Attributes = attributes,
                Styles = new Dictionary<string, string>
                {
                    { "margin-top", "10px" },
                    { "margin-bottom", "10px" },
                    { "margin-left", "50px" },
                    { "padding-left", "15px" },
                    { "border-left", "3px solid #ccc" }
                }
            };
            break;

        case "div":
            // Divs are tricky because some people use them to group elements with
            // similar style and other people use them instead of paragraphs. Since
            // divs are by default block elements it makes more sense in most cases
            // to treat them like paragraphs unless they have paragraphs or other
            // divs within them.
            currentElement = new ContainerElement
            {
                ContainerType = ContainerTypes.Division,
                Attributes = attributes
            };
            break;

        case "span":
            // These elements are treated as inline text. For example in markdown
            // these are rendered without an extra blank line and are therefore
            // rendered as part of the prior paragraph
            currentElement = new SpanElement { Attributes = attributes };
            break;

        case "a":
        {
            // Anchor tags are a special case, they are only useful with an href
            string href;
            if (attributes != null && attributes.TryGetValue("href", out href))
            {
                currentElement = new AnchorElement { LinkAddress = href };
            }
            else
            {
                currentElement = new UnsupportedElement { Attributes = attributes };
            }
            break;
        }

        case "iframe":
        case "img":
        {
            // Image tags are a special case, they are only useful with a src
            string src;
            if (attributes != null && attributes.TryGetValue("src", out src))
            {
                string alt;
                attributes.TryGetValue("alt", out alt); // alt is left null when absent
                currentElement = new ImageElement { LinkAddress = src, AltText = alt };
            }
            else
            {
                currentElement = new UnsupportedElement { Attributes = attributes };
            }
            break;
        }

        case "h1":
        case "h2":
        case "h3":
        case "h4":
        case "h5":
        case "h6":
            // The heading level is the digit that follows the 'h'
            currentElement = new HeadingElement { Level = tagName[1] - '0' };
            break;

        case "strong":
        case "b":
            // Bold is represented as an inline style
            currentElement = new FormattedElement
            {
                ElementType = ElementTypes.InlineText,
                Styles = new Dictionary<string, string> { { "font-weight", "bold" } }
            };
            break;

        case "cite":
        case "q":
        case "i":
        case "em":
            // Italic is represented as an inline style
            currentElement = new FormattedElement
            {
                ElementType = ElementTypes.InlineText,
                Styles = new Dictionary<string, string> { { "font-style", "italic" } }
            };
            break;

        case "u":
            // Underline is represented as an inline style
            currentElement = new FormattedElement
            {
                ElementType = ElementTypes.InlineText,
                Styles = new Dictionary<string, string> { { "text-decoration", "underline" } }
            };
            break;

        case "small":
            // Small is represented as an inline style
            currentElement = new FormattedElement
            {
                ElementType = ElementTypes.InlineText,
                Styles = new Dictionary<string, string> { { "font-size", "smaller" } }
            };
            break;

        case "sup":
            // Superscript is represented as an inline style
            currentElement = new FormattedElement
            {
                ElementType = ElementTypes.InlineText,
                Styles = new Dictionary<string, string>
                {
                    { "vertical-align", "super" },
                    { "font-size", "smaller" }
                }
            };
            break;

        case "sub":
            // Subscript is represented as an inline style
            currentElement = new FormattedElement
            {
                ElementType = ElementTypes.InlineText,
                Styles = new Dictionary<string, string>
                {
                    { "vertical-align", "sub" },
                    { "font-size", "smaller" }
                }
            };
            break;

        case "br":
            currentElement = new BreakElement { BreakType = BreakTypes.LineBreak };
            break;

        case "hr":
            currentElement = new BreakElement { BreakType = BreakTypes.HorizontalRule };
            break;

        case "ul":
            currentElement = new ContainerElement
            {
                ContainerType = ContainerTypes.BulletList,
                Attributes = attributes
            };
            break;

        case "ol":
            currentElement = new ContainerElement
            {
                ContainerType = ContainerTypes.NumberedList,
                Attributes = attributes
            };
            break;

        case "table":
            currentElement = new ContainerElement
            {
                ContainerType = ContainerTypes.Table,
                Attributes = attributes
            };
            break;

        case "tr":
            currentElement = new ContainerElement
            {
                ContainerType = ContainerTypes.TableDataRow,
                Attributes = attributes
            };
            break;

        case "th":
            // NOTE(review): a th is a header cell, but it is mapped to the header
            // row container type here - confirm that this is intentional.
            currentElement = new ContainerElement
            {
                ContainerType = ContainerTypes.TableHeaderRow,
                Attributes = attributes
            };
            break;

        case "td":
            currentElement = new ContainerElement
            {
                ContainerType = ContainerTypes.TableDataCell,
                Attributes = attributes
            };
            break;

        default:
            // All other elements will be excluded from the output document, but will
            // be parsed just so that we know where they end and the next valid
            // element begins.
            currentElement = new UnsupportedElement { Attributes = attributes };
            break;
    }

    // Elements that support styling take ownership of the class and style
    // attributes, which are removed from the attribute collection.
    var styleElement = currentElement as IStyleElement;
    if (styleElement != null && attributes != null)
    {
        string classNames;
        if (attributes.TryGetValue("class", out classNames))
        {
            styleElement.ClassNames = classNames;
            attributes.Remove("class");
        }

        string inlineStyles;
        if (attributes.TryGetValue("style", out inlineStyles))
        {
            if (styleElement.Styles == null)
            {
                styleElement.Styles = new Dictionary<string, string>();
            }

            var declarations = (inlineStyles ?? string.Empty)
                .Split(';')
                .Select(s => s.Trim())
                .Where(s => s.Length > 0);

            foreach (var declaration in declarations)
            {
                var colonPos = declaration.IndexOf(':');
                if (colonPos > 0 && colonPos < declaration.Length - 1)
                {
                    var name = declaration.Substring(0, colonPos).Trim().ToLowerInvariant();
                    var value = declaration.Substring(colonPos + 1).Trim().ToLowerInvariant();

                    // Styles that are already present (the element defaults set
                    // above, or an earlier duplicate) are not overwritten
                    if (!styleElement.Styles.ContainsKey(name))
                    {
                        styleElement.Styles[name] = value;
                    }
                }
                else
                {
                    // Declaration with a missing name or a missing value
                    _document.ConformanceLevel *= 0.9f;
                }
            }

            attributes.Remove("style");
        }
    }

    currentElement.Name = tagName;
    currentElement.Parent = parentElement;

    if (parentElement.SuppressOutput)
    {
        currentElement.SuppressOutput = true;
    }

    if (parentElement.Children == null)
    {
        parentElement.Children = new List<IDocumentElement>();
    }
    parentElement.Children.Add(currentElement);

    if (!Begin(currentElement))
    {
        return null;
    }

    return currentElement;
}
/// <summary>
/// Examines a line of input together with the current parser state to decide
/// whether the line is part of a pipe-delimited table. When it is, the matching
/// table, header, body, row and cell elements are pushed and the parser state
/// is advanced.
/// </summary>
/// <param name="trimmedLine">
/// The line to examine. Judging by the name the caller has already trimmed it;
/// no trimming of the whole line is performed here.
/// </param>
/// <returns>
/// true when the line was handled as a table heading row, column format row or
/// data row; false when the line is not part of a table.
/// </returns>
private bool CheckTable(string trimmedLine)
{
    if (trimmedLine == null)
    {
        return false;
    }

    // Splits a line into trimmed cell texts, discarding the empty entries that
    // are produced by a leading and/or trailing pipe character.
    Func<string, List<string>> splitColumns = line =>
    {
        var cells = line.Split('|').Select(c => c.Trim()).ToList();
        if (line.Length > 0 && line[0] == '|')
        {
            cells.RemoveAt(0);
        }
        if (line.Length > 0 && line[line.Length - 1] == '|')
        {
            cells.RemoveAt(cells.Count - 1);
        }
        return cells;
    };

    var state = _characterStream.State;
    var containsPipe = trimmedLine.IndexOf('|') >= 0;

    // Guard against an empty line before looking at the first character
    var startsWithPipe = trimmedLine.Length > 0 && trimmedLine[0] == '|';

    // After a paragraph break any line containing a pipe starts a table. Within
    // a paragraph or heading the line must begin with a pipe to start a table.
    var startsTable =
        (state == MarkdownStates.ParagraphBreak && containsPipe) ||
        ((state == MarkdownStates.Paragraph || state == MarkdownStates.Heading) && startsWithPipe);

    if (startsTable)
    {
        PushElement(new ContainerElement
        {
            Name = "table",
            ContainerType = ContainerTypes.Table,
            ChildLayout = new List<string>()
        });
        PushElement(new ContainerElement
        {
            Name = "thead",
            ContainerType = ContainerTypes.TableHeader,
        });
        _characterStream.State = MarkdownStates.TableHeadings;
    }

    // The state is re-read because it may have just been changed above, the
    // line that starts the table is also its first heading row.
    if (_characterStream.State == MarkdownStates.TableHeadings)
    {
        var isFormatRow = trimmedLine.All(c => char.IsWhiteSpace(c) || "|-:".Contains(c));
        if (isFormatRow)
        {
            // A row like |:---|:---:|---:| defines column alignment and
            // separates the headings from the body of the table
            var columnFormats = splitColumns(trimmedLine);
            for (var i = 0; i < columnFormats.Count; i++)
            {
                var format = columnFormats[i];
                columnFormats[i] = string.Empty;

                if (format.Length > 1)
                {
                    var colonAtStart = format[0] == ':';
                    var colonAtEnd = format[format.Length - 1] == ':';

                    if (colonAtEnd)
                    {
                        columnFormats[i] = colonAtStart ? "center" : "right";
                    }
                    else if (colonAtStart)
                    {
                        columnFormats[i] = "left";
                    }
                }
            }

            var tableElement = FindPriorElement<ContainerElement>(c => c.ContainerType == ContainerTypes.Table);
            if (tableElement != null)
            {
                tableElement.ChildLayout = columnFormats;
            }

            PopElement(); // Pop the TableHeader
            PushElement(new ContainerElement
            {
                Name = "tbody",
                ContainerType = ContainerTypes.TableBody,
            });
            _characterStream.State = MarkdownStates.TableRow;
        }
        else
        {
            PushElement(new ContainerElement
            {
                Name = "tr",
                ContainerType = ContainerTypes.TableHeaderRow
            });

            foreach (var column in splitColumns(trimmedLine))
            {
                PushElement(new ContainerElement
                {
                    Name = "th",
                    ContainerType = ContainerTypes.TableDataCell
                });
                ParseText(column);
                PopElement(); // Pop the th off the stack, adding it to the tr
            }

            PopElement(); // Pop the tr, adding it to the table
        }

        return true;
    }

    if (_characterStream.State == MarkdownStates.TableRow)
    {
        var tableElement = FindPriorElement<ContainerElement>(c => c.ContainerType == ContainerTypes.Table);
        var columnFormats = tableElement == null ? null : (List<string>)tableElement.ChildLayout;

        PushElement(new ContainerElement
        {
            Name = "tr",
            ContainerType = ContainerTypes.TableDataRow
        });

        var columns = splitColumns(trimmedLine);
        for (var i = 0; i < columns.Count; i++)
        {
            var td = new ContainerElement
            {
                Name = "td",
                ContainerType = ContainerTypes.TableDataCell
            };

            // Only columns with an explicit alignment get an align attribute
            var hasFormat = columnFormats != null
                && i < columnFormats.Count
                && !string.IsNullOrEmpty(columnFormats[i]);
            if (hasFormat)
            {
                td.Attributes = new Dictionary<string, string> { { "align", columnFormats[i] } };
            }

            PushElement(td);
            ParseText(columns[i]);
            PopElement(); // Pop the td off the stack, adding it to the tr
        }

        PopElement(); // Pop the tr, adding it to the table
        return true;
    }

    return false;
}