/// <summary>
/// Produces the formatted HTML text for the content covered by a markup range.
/// </summary>
/// <param name="markupServices">Markup services used by <c>PrintHtml</c> to create the walking range.</param>
/// <param name="bounds">The range whose content is printed.</param>
/// <returns>Everything <c>PrintHtml</c> wrote, as a single string.</returns>
public static String ToFormattedHtml(MshtmlMarkupServices markupServices, MarkupRange bounds)
{
    // The writer appends into this buffer; the buffer's content is the result.
    StringBuilder buffer = new StringBuilder();
    HtmlWriter htmlWriter = new HtmlWriter(buffer);

    PrintHtml(htmlWriter, markupServices, bounds);

    return buffer.ToString();
}
/// <summary>
/// Walks the document one markup position at a time, starting at <c>bounds.Start</c>,
/// and prints every context encountered until the walk reaches <c>bounds.End</c>
/// or there is no further context.
/// </summary>
/// <param name="writer">Receives the generated markup.</param>
/// <param name="MarkupServices">Used to create the range that is moved through the document.</param>
/// <param name="bounds">Delimits the portion of the document that is printed.</param>
private static void PrintHtml(HtmlWriter writer, MshtmlMarkupServices MarkupServices, MarkupRange bounds)
{
    // A zero-width range parked at the start of the bounds; it is stretched by exactly
    // one step to the right on every iteration.
    MarkupRange cursor = MarkupServices.CreateMarkupRange();
    cursor.Start.MoveToPointer(bounds.Start);
    cursor.End.MoveToPointer(bounds.Start);

    // A single context object is reused for every step of the walk.
    MarkupContext step = new MarkupContext();

    // Initializer: take the first step. Iterator: take the next step (after the body has
    // collapsed the range onto its end).
    for (cursor.End.Right(true, step);
         step.Context != _MARKUP_CONTEXT_TYPE.CONTEXT_TYPE_None && cursor.Start.IsLeftOf(bounds.End);
         cursor.End.Right(true, step))
    {
        string text = null;

        switch (step.Context)
        {
            case _MARKUP_CONTEXT_TYPE.CONTEXT_TYPE_Text:
                // For a text step, the text is whatever lies between the range's start and end.
                // HtmlText sometimes includes the outer tags of the text node, so they are stripped.
                // FIXME: if the Right/Left operations returned the available text value,
                // this wouldn't be necessary.
                string rawText = cursor.HtmlText;
                text = rawText != null ? StripSurroundingTags(rawText) : null;
                break;

            case _MARKUP_CONTEXT_TYPE.CONTEXT_TYPE_ExitScope:
                string exitHtml = cursor.HtmlText;
                bool elementHasNoInnerHtml = step.Element.innerHTML == null;
                if (elementHasNoInnerHtml && exitHtml != null && exitHtml.IndexOf(" ") >= 0)
                {
                    // HACK (carried over from the original comment): under these conditions the
                    // document held an invisible NBSP char that walking with MarkupServices does
                    // not expose. The element's text is forced to that char so whitespace that
                    // was visible in the editor stays visible in the final document.
                    text = " ";
                }
                break;
        }

        // Emit whatever this step represents.
        printContext(writer, step, text, cursor);

        // Collapse the range onto its end so the next step again spans a single position.
        cursor.Start.MoveToPointer(cursor.End);
    }
}
/// <summary>
/// Writes the markup that corresponds to one step of the document walk.
/// </summary>
/// <param name="writer">Receives the generated markup.</param>
/// <param name="context">Describes the step: its kind and, for element steps, the element.</param>
/// <param name="text">Text to emit for this step, or null when there is none.</param>
/// <param name="range">The walking range; its end is moved past the element when a
/// whitespace-preserving element is written in one piece.</param>
private static void printContext(HtmlWriter writer, MarkupContext context, string text, MarkupRange range)
{
    _MARKUP_CONTEXT_TYPE kind = context.Context;

    if (kind == _MARKUP_CONTEXT_TYPE.CONTEXT_TYPE_Text)
    {
        if (text != null)
        {
            writer.WriteString(trimHtmlText(text));
        }
        return;
    }

    if (kind == _MARKUP_CONTEXT_TYPE.CONTEXT_TYPE_EnterScope)
    {
        IHTMLElement entered = context.Element;
        printElementStart(writer, entered);

        if (!HtmlLinebreakStripper.IsPreserveWhitespaceTag(entered.tagName))
        {
            if (text != null)
            {
                writer.WriteString(trimHtmlText(text));
            }
            return;
        }

        // <pre> was losing whitespace using the normal markup pointer traversal method, so
        // the whole element is written here and the walk is moved to just after its end.
        writer.WriteString(BalanceHtml(entered.innerHTML));
        printElementEnd(writer, entered);
        range.End.MoveAdjacentToElement(entered, _ELEMENT_ADJACENCY.ELEM_ADJ_AfterEnd);
        return;
    }

    if (kind == _MARKUP_CONTEXT_TYPE.CONTEXT_TYPE_ExitScope)
    {
        if (text != null)
        {
            writer.WriteString(trimHtmlText(text));
        }
        printElementEnd(writer, context.Element);
        return;
    }

    if (kind == _MARKUP_CONTEXT_TYPE.CONTEXT_TYPE_NoScope)
    {
        IHTMLElement scopeless = context.Element;
        bool writeRaw = scopeless is IHTMLCommentElement || scopeless is IHTMLUnknownElement;

        if (writeRaw)
        {
            //bugfix: 1777 - comments should just be inserted raw.
            string rawHtml = scopeless.outerHTML;

            // bugfix: 534222 - embed tag markup generation issues
            if (rawHtml != null && rawHtml.ToUpperInvariant() != "</EMBED>")
            {
                writer.WriteString(rawHtml);
            }
            return;
        }

        printElementStart(writer, scopeless);
        if (text == null)
        {
            // Avoid MSHTML bug: in some cases (like title or script elements), MSHTML
            // improperly reports a tag as being NoScope even though it has a start tag, an
            // end tag and text in between. When such inner content is found it is written out.
            string inner = scopeless.innerHTML;
            if (inner != null)
            {
                writer.WriteString(inner);
            }
        }
        printElementEnd(writer, scopeless);
        return;
    }

    // CONTEXT_TYPE_None and any other value: nothing is written.
}
/// <summary>
/// Writes the start tag for an element together with every attribute that is reported
/// as specified on it.
/// </summary>
/// <remarks>
/// Nothing is written when the element has no tag name. An attribute whose value cannot be
/// obtained by any means is written with its own name as the value (minimized attributes
/// are not allowed in XHTML), except for <c>id</c>, which is omitted in that case.
/// </remarks>
/// <param name="writer">Receives the start tag and attributes.</param>
/// <param name="element">The element whose start tag is written.</param>
private static void printElementStart(HtmlWriter writer, IHTMLElement element)
{
    string tagName = element.tagName;

    // If there is no tag name, this is most likely an artificial tag reported by mshtml,
    // and not really present in the markup
    // (e.g HTMLTableCaptionClass)
    if (string.IsNullOrEmpty(tagName))
    {
        return;
    }

    //XHTML tags are all lowercase
    tagName = tagName.ToLower(CultureInfo.InvariantCulture);

    //this is a standard HTML tag, so just write it out.
    writer.WriteStartElement(tagName);

    // Guard the cast: an element that does not expose IHTMLDOMNode simply has no
    // attributes to enumerate, instead of causing a NullReferenceException.
    IHTMLDOMNode node = element as IHTMLDOMNode;
    IHTMLAttributeCollection attrs = node != null ? node.attributes as IHTMLAttributeCollection : null;
    if (attrs == null)
    {
        return;
    }

    foreach (IHTMLDOMAttribute attr in attrs)
    {
        if (!attr.specified)
        {
            continue;
        }

        // An attribute without a usable name cannot be written; skip it before the name
        // is dereferenced.
        string attrName = attr.nodeName as string;
        if (attrName == null)
        {
            continue;
        }

        string attrNameLower = attrName.ToLower(CultureInfo.InvariantCulture);

        //get the raw attribute value (so that IE doesn't try to expand out paths in the value).
        string attrValue = element.getAttribute(attrName, 2) as string;
        if (attrValue == null)
        {
            //IE won't return some attributes (like class) using IHTMLElement.getAttribute(),
            //so if the value is null, try to get the value directly from the DOM Attribute.
            //Note: we can't use the DOM value by default, because IE will rewrite the value
            //to contain a fully-qualified path on some attributes (like src and href).
            attrValue = getFallbackAttributeValue(element, attr, attrNameLower);

            Debug.WriteLineIf(attrValue != null && attrName != "id", String.Format(CultureInfo.InvariantCulture, "{0}.{1} attribute value not retreived", tagName, attrName), element.outerHTML);
        }

        // Minimized attributes are not allowed, according
        // to section 4.5 of XHTML 1.0 specification.
        // TODO: Deal with simple values that are not strings
        if (attrValue == null && attrNameLower != "id")
        {
            attrValue = attrName;
        }

        if (attrValue != null)
        {
            //write out this attribute.
            writer.WriteAttributeString(attrName, attrValue);
        }
    }
}

/// <summary>
/// Obtains an attribute value from the DOM attribute node (or element-specific properties)
/// for attributes that IHTMLElement.getAttribute() did not return as a string.
/// </summary>
/// <param name="element">The element that owns the attribute.</param>
/// <param name="attr">The DOM attribute node.</param>
/// <param name="attrNameLower">The attribute name, lower-cased.</param>
/// <returns>The attribute value as a string, or null when none could be determined.</returns>
private static string getFallbackAttributeValue(IHTMLElement element, IHTMLDOMAttribute attr, string attrNameLower)
{
    // Read the node value once and reuse it for every check below.
    object nodeValue = attr.nodeValue;

    string stringValue = nodeValue as string;
    if (stringValue != null)
    {
        return stringValue;
    }

    if ((attrNameLower == "hspace" || attrNameLower == "vspace") && nodeValue is int)
    {
        return ((int)nodeValue).ToString(CultureInfo.InvariantCulture);
    }

    if (attrNameLower == "style")
    {
        //Avoid bug: Images that are resized with the editor insert a STYLE attribute.
        //IE won't return the style attribute using the standard API, so we have to grab
        //it from the style object
        return element.style.cssText;
    }

    if (attrNameLower == "colspan" || attrNameLower == "rowspan")
    {
        // Only table cells expose colSpan/rowSpan; on any other element there is nothing
        // to read, so report "no value" rather than dereferencing a failed cast.
        IHTMLTableCell cell = element as IHTMLTableCell;
        if (cell == null)
        {
            return null;
        }

        int span = attrNameLower == "colspan" ? cell.colSpan : cell.rowSpan;
        return span.ToString(CultureInfo.InvariantCulture);
    }

    if (attrNameLower == "align" && nodeValue is int)
    {
        return alignValueToString((int)nodeValue);
    }

    return null;
}

/// <summary>
/// Maps the integer that MSHTML reports for an <c>align</c> attribute to its keyword.
/// </summary>
/// <param name="alignValue">The integer node value of the align attribute.</param>
/// <returns>The keyword for values 1 through 10; null for any other value.</returns>
private static string alignValueToString(int alignValue)
{
    // This is not documented anywhere. Just discovered the values empirically on IE7 (Vista).
    switch (alignValue)
    {
        case 1: return "left";
        case 2: return "center";
        case 3: return "right";
        case 4: return "texttop";
        case 5: return "absmiddle";
        case 6: return "baseline";
        case 7: return "absbottom";
        case 8: return "bottom";
        case 9: return "middle";
        case 10: return "top";
        default: return null;
    }
}
/// <summary>
/// Closes the element that is currently open on the writer.
/// Whether a full end tag is requested is decided by <c>ElementFilters.RequiresEndTag</c>.
/// </summary>
/// <param name="writer">The writer on which the element is closed.</param>
/// <param name="element">The element being closed.</param>
private static void printElementEnd(HtmlWriter writer, IHTMLElement element)
{
    // An element without a tag name never had a start tag written, so it gets no end tag.
    bool hasTagName = !string.IsNullOrEmpty(element.tagName);
    if (hasTagName)
    {
        bool needsEndTag = ElementFilters.RequiresEndTag(element);
        writer.WriteEndElement(needsEndTag);
    }
}