/// <summary>
/// Parses an opening tag like '&lt;div&gt;'. Starts with the input stream
/// pointing to the opening &lt; character.
/// </summary>
/// <param name="currentElement">
/// The element that will become the parent of the newly parsed element. When this
/// is null the tag is still consumed from the stream but no element is created.
/// </param>
/// <returns>
/// The newly created child element; or <paramref name="currentElement"/> unchanged when
/// the tag turns out to be a markup declaration ('!') or an end tag ('/'), or when
/// <paramref name="currentElement"/> is null; or null when the stream ends right after
/// the '&lt;' or when <c>Begin</c> rejects the new element.
/// </returns>
private Element ProcessTagOpenState(Element currentElement)
{
    if (!_characterStream.MoveNext())
    {
        // Input ended immediately after the '<'
        _document.ConformanceLevel *= 0.8f;
        return null;
    }

    // '<!' and '</' are not start tags, hand them over to the matching state
    switch (_characterStream.Current)
    {
        case '!':
            _characterStream.State = HtmlStates.MarkupDeclarationOpen;
            return currentElement;
        case '/':
            _characterStream.State = HtmlStates.EndTagOpen;
            return currentElement;
    }

    var startPosition = _characterStream.CurrentPosition;
    string tagName = null;
    var selfClosing = false;

    using (var nameBuffer = _stringBuilderFactory.Create(_maximumNameLength))
    {
        char? terminator;
        _characterStream.State = HtmlStates.TagName;

        if (_stringParser.TakeUntil(
            nameBuffer,
            _maximumNameLength,
            c => char.IsWhiteSpace(c) || c == '>' || c == '/',
            out terminator))
        {
            // Tag names are matched case-insensitively. The invariant culture is used so
            // that names such as "IMG" lower-case the same way on every machine.
            tagName = nameBuffer.ToString().ToLowerInvariant();

            if (!terminator.HasValue || terminator == '/')
            {
                _characterStream.State = HtmlStates.SelfClosingStartTag;
                if (_stringParser.Peek() == '>')
                {
                    _stringParser.TakeOne();
                }
                else
                {
                    // A '/' (or end of input) that is not followed by '>' is malformed
                    _document.ConformanceLevel *= 0.9f;
                }
            }
            else if (terminator == '>')
            {
                _characterStream.State = _voidElements.Contains(tagName)
                    ? HtmlStates.SelfClosingStartTag
                    : HtmlStates.Data;
            }
            else
            {
                // Whitespace after the name means attributes may follow
                _characterStream.State = HtmlStates.AttributeName;
                selfClosing = _voidElements.Contains(tagName);
            }
        }
        else
        {
            // No terminator was found within the maximum name length. Try to recover by
            // checking whether the text starts with the name of a known element.
            _document.ConformanceLevel *= 0.9f;

            nameBuffer.Clear();
            _stringParser.Take(nameBuffer, _maximumNameLength);
            var candidate = nameBuffer.ToString().ToLowerInvariant();

            _characterStream.State = HtmlStates.Data;

            foreach (var name in _allElements)
            {
                if (candidate.StartsWith(name, System.StringComparison.Ordinal))
                {
                    // Rewind and consume only the characters that make up the element name
                    nameBuffer.Clear();
                    _characterStream.Reset(startPosition);
                    _stringParser.Take(nameBuffer, name.Length);

                    // Lower-case the consumed text so that it matches the void element
                    // lookup and the element switch below, both of which use lower case.
                    tagName = nameBuffer.ToString().ToLowerInvariant();

                    _characterStream.State = HtmlStates.AttributeName;
                    selfClosing = _voidElements.Contains(tagName);
                    break;
                }
            }

            if (tagName == null)
            {
                _document.ConformanceLevel *= 0.6f;
            }
        }
    }

    var attributes = _characterStream.State == HtmlStates.AttributeName
        ? ParseAttributes(selfClosing)
        : null;

    if (currentElement != null)
    {
        var parentElement = currentElement;

        switch (tagName)
        {
            case "html":
            case "body":
            case "form":
            case "header":
            case "footer":
                // These elements are not parsed and contain no details. They are included in the output
                // only as containers for their children
                currentElement = new UnsupportedElement { Attributes = attributes, SuppressOutput = false };
                break;

            case "p":
            case "li":
                // These elements are treated as paragraphs in other markup formats, for
                // example in markdown they will have a blank line above them to create a
                // paragraph break.
                currentElement = new ParagraphElement { Attributes = attributes };
                break;

            case "blockquote":
                currentElement = new ParagraphElement
                {
                    Attributes = attributes,
                    Styles = new Dictionary<string, string>
                    {
                        { "margin-top", "10px" },
                        { "margin-bottom", "10px" },
                        { "margin-left", "50px" },
                        { "padding-left", "15px" },
                        { "border-left", "3px solid #ccc" }
                    }
                };
                break;

            case "div":
                // Divs are tricky because some people use them to group elements with similar
                // style and other people use them instead of paragraphs. Since divs are by
                // default block elements it makes more sense in most cases to treat them like
                // paragraphs unless they have paragraphs or other divs within them.
                currentElement = new ContainerElement { ContainerType = ContainerTypes.Division, Attributes = attributes };
                break;

            case "span":
                // These elements are treated as inline text. For example in markdown
                // these are rendered without an extra blank line and are therefore rendered
                // as part of the prior paragraph
                currentElement = new SpanElement { Attributes = attributes };
                break;

            case "a":
                // Anchor tags are a special case, without an href they carry no link
                if (attributes != null && attributes.ContainsKey("href"))
                {
                    currentElement = new AnchorElement { LinkAddress = attributes["href"] };
                }
                else
                {
                    currentElement = new UnsupportedElement { Attributes = attributes };
                }
                break;

            case "iframe":
            case "img":
                // Image tags are a special case, without a src there is nothing to show
                if (attributes != null && attributes.ContainsKey("src"))
                {
                    var alt = attributes.ContainsKey("alt") ? attributes["alt"] : null;
                    currentElement = new ImageElement { LinkAddress = attributes["src"], AltText = alt };
                }
                else
                {
                    currentElement = new UnsupportedElement { Attributes = attributes };
                }
                break;

            case "h1":
                currentElement = new HeadingElement { Level = 1 };
                break;
            case "h2":
                currentElement = new HeadingElement { Level = 2 };
                break;
            case "h3":
                currentElement = new HeadingElement { Level = 3 };
                break;
            case "h4":
                currentElement = new HeadingElement { Level = 4 };
                break;
            case "h5":
                currentElement = new HeadingElement { Level = 5 };
                break;
            case "h6":
                currentElement = new HeadingElement { Level = 6 };
                break;

            case "strong":
            case "b":
                // Bold is represented as an inline style
                currentElement = new FormattedElement
                {
                    ElementType = ElementTypes.InlineText,
                    Styles = new Dictionary<string, string> { { "font-weight", "bold" } }
                };
                break;

            case "cite":
            case "q":
            case "i":
            case "em":
                // Italic is represented as an inline style
                currentElement = new FormattedElement
                {
                    ElementType = ElementTypes.InlineText,
                    Styles = new Dictionary<string, string> { { "font-style", "italic" } }
                };
                break;

            case "u":
                // Underline is represented as an inline style
                currentElement = new FormattedElement
                {
                    ElementType = ElementTypes.InlineText,
                    Styles = new Dictionary<string, string> { { "text-decoration", "underline" } }
                };
                break;

            case "small":
                // Small is represented as an inline style
                currentElement = new FormattedElement
                {
                    ElementType = ElementTypes.InlineText,
                    Styles = new Dictionary<string, string> { { "font-size", "smaller" } }
                };
                break;

            case "sup":
                // Superscript is represented as an inline style
                currentElement = new FormattedElement
                {
                    ElementType = ElementTypes.InlineText,
                    Styles = new Dictionary<string, string>
                    {
                        { "vertical-align", "super" },
                        { "font-size", "smaller" }
                    }
                };
                break;

            case "sub":
                // Subscript is represented as an inline style
                currentElement = new FormattedElement
                {
                    ElementType = ElementTypes.InlineText,
                    Styles = new Dictionary<string, string>
                    {
                        { "vertical-align", "sub" },
                        { "font-size", "smaller" }
                    }
                };
                break;

            case "br":
                currentElement = new BreakElement { BreakType = BreakTypes.LineBreak };
                break;
            case "hr":
                currentElement = new BreakElement { BreakType = BreakTypes.HorizontalRule };
                break;

            case "ul":
                currentElement = new ContainerElement { ContainerType = ContainerTypes.BulletList, Attributes = attributes };
                break;
            case "ol":
                currentElement = new ContainerElement { ContainerType = ContainerTypes.NumberedList, Attributes = attributes };
                break;
            case "table":
                currentElement = new ContainerElement { ContainerType = ContainerTypes.Table, Attributes = attributes };
                break;
            case "tr":
                currentElement = new ContainerElement { ContainerType = ContainerTypes.TableDataRow, Attributes = attributes };
                break;
            case "th":
                // NOTE(review): 'th' is mapped to TableHeaderRow although it is a cell in
                // HTML - confirm this is what the output writers expect.
                currentElement = new ContainerElement { ContainerType = ContainerTypes.TableHeaderRow, Attributes = attributes };
                break;
            case "td":
                currentElement = new ContainerElement { ContainerType = ContainerTypes.TableDataCell, Attributes = attributes };
                break;

            default:
                // All other elements will be excluded from the output document, but will
                // be parsed just so that we know where they end and the next valid element
                // begins. Unrecognized tags (null name) also end up here.
                currentElement = new UnsupportedElement { Attributes = attributes };
                break;
        }

        // Move the class and style attributes onto elements that support styling
        var styleElement = currentElement as IStyleElement;
        if (styleElement != null && attributes != null)
        {
            if (attributes.ContainsKey("class"))
            {
                styleElement.ClassNames = attributes["class"];
                attributes.Remove("class");
            }

            if (attributes.ContainsKey("style"))
            {
                if (styleElement.Styles == null)
                {
                    styleElement.Styles = new Dictionary<string, string>();
                }

                var declarations = attributes["style"]
                    .Split(';')
                    .Select(s => s.Trim())
                    .Where(s => s.Length > 0);

                foreach (var declaration in declarations)
                {
                    var colonPos = declaration.IndexOf(':');
                    if (colonPos > 0 && colonPos < declaration.Length - 1)
                    {
                        var styleName = declaration.Substring(0, colonPos).Trim().ToLowerInvariant();
                        var styleValue = declaration.Substring(colonPos + 1).Trim().ToLowerInvariant();

                        // Styles implied by the tag itself win over the inline style attribute
                        if (!styleElement.Styles.ContainsKey(styleName))
                        {
                            styleElement.Styles[styleName] = styleValue;
                        }
                    }
                    else
                    {
                        // A declaration with no name or no value is malformed
                        _document.ConformanceLevel *= 0.9f;
                    }
                }

                attributes.Remove("style");
            }
        }

        currentElement.Name = tagName;
        currentElement.Parent = parentElement;

        // Children of a suppressed element are suppressed as well
        if (parentElement.SuppressOutput)
        {
            currentElement.SuppressOutput = true;
        }

        if (parentElement.Children == null)
        {
            parentElement.Children = new List<IDocumentElement>();
        }
        parentElement.Children.Add(currentElement);

        if (!Begin(currentElement))
        {
            return null;
        }
    }

    return currentElement;
}
/// <summary>
/// Parses markdown-style inline formatting within a run of text and emits the result
/// through <c>PushElement</c>/<c>PopElement</c>. Recognizes bold (<c>**</c> or <c>__</c>),
/// italic (single <c>*</c> or <c>_</c>), code (backtick), links in the forms
/// <c>[title](url)</c>, <c>[title][reference]</c>, <c>[title]</c> and <c>&lt;url&gt;</c>,
/// image links (a link prefixed with <c>!</c>) and reference definitions
/// (<c>[title]: url</c>), which are stored in <c>_references</c>.
/// </summary>
/// <param name="text">The text to parse. Null or empty text produces no output.</param>
private void ParseText(string text)
{
    if (string.IsNullOrEmpty(text))
    {
        return;
    }

    var isBold = false;
    var isItalic = false;
    var isCode = false;

    // Set when a '!' was held back because the next character is '[' and the pair
    // might form an image link. Consumed by the '[' handling on the next iteration.
    var pendingBang = false;

    using (var buffer = _stringBuilderFactory.Create())
    {
        // Emits any accumulated plain text as a raw text element and empties the buffer
        Action<IStringBuilder> flush = b =>
        {
            if (b.Length > 0)
            {
                PushElement(new RawTextElement { Text = b.ToString() });
                PopElement();
                b.Clear();
            }
        };

        for (var i = 0; i < text.Length; i++)
        {
            var prior2 = i > 1 ? text[i - 2] : default(char);
            var prior1 = i > 0 ? text[i - 1] : default(char);
            var current = text[i];
            var next = i < text.Length - 1 ? text[i + 1] : default(char);

            if ((prior1 == '*' && current == '*') || (prior1 == '_' && current == '_'))
            {
                // Double asterisk or double underscore turns bold on/off
                flush(buffer);
                if (isBold)
                {
                    PopElement();
                    isBold = false;
                }
                else
                {
                    PushElement(new FormattedElement
                    {
                        Name = new string(current, 2),
                        ElementType = ElementTypes.InlineText,
                        Styles = new Dictionary<string, string> { { "font-weight", "bold" } }
                    });
                    isBold = true;
                }
            }
            else if ((prior1 == '*' || prior1 == '_') && (prior1 != prior2))
            {
                // Single asterisk or single underscore turns italic on/off. The decision is
                // made one character late, so the current character is ordinary text.
                flush(buffer);
                if (isItalic)
                {
                    PopElement();
                    isItalic = false;
                }
                else
                {
                    PushElement(new FormattedElement
                    {
                        Name = new string(prior1, 1),
                        ElementType = ElementTypes.InlineText,
                        Styles = new Dictionary<string, string> { { "font-style", "italic" } }
                    });
                    isItalic = true;
                }
                buffer.Append(current);
            }
            else if (current == '*' || current == '_')
            {
                // When we see the first asterisk or underscore, we don't know yet if this
                // is going to be bold or italic, so it is handled on the next character.
            }
            else if (current == '`')
            {
                // Backticks turn on/off code formatting
                flush(buffer);
                if (isCode)
                {
                    PopElement();
                    isCode = false;
                }
                else
                {
                    // NOTE(review): the name is two backticks although one was matched -
                    // kept as is in case consumers rely on it, confirm the intent.
                    PushElement(new FormattedElement
                    {
                        Name = new string(current, 2),
                        ElementType = ElementTypes.InlineText,
                        ClassNames = "code"
                    });
                    isCode = true;
                }
            }
            else if (current == '!')
            {
                // Can be an image link if it is followed by '['. Hold the '!' back until
                // the bracket has been examined.
                if (next == '[')
                {
                    pendingBang = true;
                }
                else
                {
                    buffer.Append(current);
                }
            }
            else if (current == '[')
            {
                // Open square bracket is the start of a hyperlink
                var restoreBang = pendingBang;
                pendingBang = false;

                var firstCloseIndex = text.Length < i + 3 ? -1 : text.IndexOf(']', i + 1);
                if (firstCloseIndex < 0)
                {
                    // Not a link after all: output the bracket literally, together with
                    // the '!' that was held back for it.
                    if (restoreBang)
                    {
                        buffer.Append('!');
                    }
                    buffer.Append(current);
                    continue;
                }

                AnchorElement anchor = null;
                var title = text.Substring(i + 1, firstCloseIndex - i - 1);
                var nextChar = text.Length > firstCloseIndex + 3 ? text[firstCloseIndex + 1] : default(char);
                var isImageLink = prior1 == '!';

                if (nextChar == '(')
                {
                    // Url is inline with the link
                    var secondCloseIndex = text.IndexOf(')', firstCloseIndex + 2);
                    if (secondCloseIndex == -1)
                    {
                        secondCloseIndex = text.Length;
                    }

                    anchor = new AnchorElement
                    {
                        Name = "()",
                        LinkAddress = text.Substring(firstCloseIndex + 2, secondCloseIndex - firstCloseIndex - 2)
                    };
                    i = secondCloseIndex;
                }
                else if (nextChar == '[')
                {
                    // Url is provided elsewhere in the document as a reference
                    var secondCloseIndex = text.IndexOf(']', firstCloseIndex + 2);
                    if (secondCloseIndex == -1)
                    {
                        secondCloseIndex = text.Length;
                    }

                    anchor = new AnchorElement
                    {
                        Name = "[]",
                        LinkAddress = text.Substring(firstCloseIndex + 2, secondCloseIndex - firstCloseIndex - 2)
                    };
                    _anchorsToFixup.Add(anchor);
                    i = secondCloseIndex;
                }
                else if (nextChar == ':')
                {
                    // Reference definition: skip whitespace, then take everything up to
                    // the next whitespace as the url. Both scans are bounds checked so
                    // that a definition with nothing after the colon cannot overrun.
                    var urlStartIndex = firstCloseIndex + 2;
                    while (urlStartIndex < text.Length && char.IsWhiteSpace(text[urlStartIndex]))
                    {
                        urlStartIndex++;
                    }

                    var urlEndIndex = urlStartIndex;
                    while (urlEndIndex < text.Length && !char.IsWhiteSpace(text[urlEndIndex]))
                    {
                        urlEndIndex++;
                    }

                    if (urlEndIndex > urlStartIndex)
                    {
                        _references[title] = text.Substring(urlStartIndex, urlEndIndex - urlStartIndex);
                    }
                    i = urlEndIndex - 1;
                }
                else
                {
                    // Self-referencing
                    anchor = new AnchorElement { Name = "[]", LinkAddress = title };
                    _anchorsToFixup.Add(anchor);
                    i = firstCloseIndex;
                }

                if (anchor != null)
                {
                    flush(buffer);
                    if (isImageLink)
                    {
                        anchor.LinkType = LinkTypes.Image;
                        anchor.AltText = title;
                        PushElement(anchor);
                        PopElement(); // Pop the anchor to add it to its parent
                    }
                    else
                    {
                        PushElement(anchor);
                        PushElement(new RawTextElement { Text = title });
                        PopElement(); // Pop the raw text
                        PopElement(); // Pop the anchor
                    }
                }
            }
            else if (current == '<')
            {
                // Angle brackets are the start of a hyperlink with no title
                var closeIndex = text.IndexOf('>', i + 1);
                if (closeIndex < 0)
                {
                    buffer.Append(current);
                    continue;
                }

                var url = text.Substring(i + 1, closeIndex - i - 1);
                flush(buffer);
                PushElement(new AnchorElement { Name = "<>", LinkAddress = url });
                PushElement(new RawTextElement { Text = url });
                PopElement(); // Pop the raw text
                PopElement(); // Pop the anchor
                i = closeIndex;
            }
            else
            {
                buffer.Append(current);
            }
        }

        flush(buffer);

        // Close any formatting that was opened but never closed so that every
        // PushElement above is balanced by a PopElement.
        if (isCode)
        {
            PopElement();
        }
        if (isItalic)
        {
            PopElement();
        }
        if (isBold)
        {
            PopElement();
        }
    }
}