/// <summary>
/// Parses the supplied XML text, echoes the original text together with the first
/// child's InnerText and InnerXml to the console for diagnostics, and returns what
/// InternalUtilities.GetXmlNodeInnerContents reports for that first child.
/// </summary>
/// <param name="xmlText">XML text to load; its first child node must be an element.</param>
/// <returns>The inner contents of the first child, as computed by GetXmlNodeInnerContents.</returns>
protected string GetXmlContents(string xmlText)
{
    XmlDocumentWithLocation document = new XmlDocumentWithLocation(loadAsReadOnly);
    document.LoadXml(xmlText);

    // The first child of the document is treated as the element under inspection.
    XmlElementWithLocation firstElement = (XmlElementWithLocation)document.FirstChild;

    // Dump both raw views of the element so they can be compared with the result.
    Console.WriteLine("originalxml = " + xmlText);
    Console.WriteLine("innerText   = " + firstElement.InnerText);
    Console.WriteLine("innerXml    = " + firstElement.InnerXml);
    Console.WriteLine("-----------");

    return InternalUtilities.GetXmlNodeInnerContents(firstElement);
}
/// <summary>
/// Stores a value as the contents of the given XML node. A value that parses as
/// XML markup is stored verbatim as XML; anything else is stored as text so that
/// it is escaped as necessary.
/// </summary>
/// <param name="node">The element whose contents are replaced.</param>
/// <param name="s">The value to store. Can be empty string, but not null.</param>
internal static void SetXmlNodeInnerContents(XmlElementWithLocation node, string s)
{
    ErrorUtilities.VerifyThrow(s != null, "Need value to set.");

    bool storedAsXml = false;

    // Only values containing a '<' can possibly be XML markup; skip the
    // parse attempt entirely for everything else.
    if (s.IndexOf('<') != -1)
    {
        try
        {
            // Try to keep the markup exactly as written.
            node.InnerXml = s;
            storedAsXml = true;
        }
        catch (XmlException)
        {
            // The value only looked like XML but is not well-formed.
            // Deliberately ignore the failure and store it as text below.
        }
    }

    if (!storedAsXml)
    {
        // Not valid XML markup: assign as text so it gets escaped properly.
        node.InnerText = s;
    }
}
/// <summary>
/// Returns the contents of the given XML node, choosing between the raw inner XML
/// and the unescaped inner text depending on whether the contents look like real markup.
/// </summary>
/// <param name="node">The element whose contents are read.</param>
/// <returns>Inner XML/text of specified node.</returns>
internal static string GetXmlNodeInnerContents(XmlElementWithLocation node)
{
    // Background: InnerXml versus InnerText
    // -------------------------------------
    // XmlNode.InnerXml returns the characters between the element's opening and closing
    // tags without unescaping anything. Sequences such as "<![CDATA[...]]>" or "&amp;"
    // come back exactly as written. The only change it makes is to normalize contained
    // XML: whitespace between attributes and the quote characters around attribute
    // values, plus whitespace between elements when PreserveWhitespace is false.
    //
    // XmlNode.InnerText drops any contained XML and unescapes what is left, so
    // sequences such as "&amp;" or "<![CDATA[...]]>" are translated into the
    // characters they represent.
    //
    // Examples:
    //
    //      Original XML                       InnerText     InnerXml
    //      ================================   ===========   ===========================
    //      <a><![CDATA[whatever]]></a>        whatever      <![CDATA[whatever]]>
    //      <a>123<MyNode/>456</a>             123456        123<MyNode />456
    //      <a>123456</a>                      123456        123456
    //      <a>123<MyNode b='&lt;'/>456</a>    123456        123<MyNode b="&lt;" />456
    //      <a>123&amp;456</a>                 123&456       123&amp;456
    //
    // Which one should be used for a property value?
    // -----------------------------------------------
    // 1.)  The value is itself XML, e.g. it will be written to an XML file or passed
    //      to XmlDocument.LoadXml. InnerXml is the right choice, because unescaping
    //      would produce something a later XML parser could not read.
    //
    // 2.)  The value is just a string, possibly large and full of whitespace, line
    //      breaks and special characters. Whatever consumes it knows nothing about
    //      XML escapes such as "&amp;" and "&lt;", so InnerText is the right choice:
    //      the escapes become the real characters before the value reaches the task
    //      (or wherever else the property is used). Characters like <, > and & must
    //      be escaped in the project file, or the whole value wrapped in a CDATA
    //      section; in the latter case the consumer must not see the CDATA wrapper
    //      itself, and InnerText takes care of that as well.
    //
    // 2b.) A variation of (2): the value is a plain string, but part of it has been
    //      commented out, e.g. a semicolon-separated list (DDB #56841):
    //
    //          <BuildDirectories>
    //              <!--
    //              env\TestTools\tshell\pkg;
    //              -->
    //              ndp\fx\src\VSIP\FrameWork;
    //              ndp\fx\src\xmlTools;
    //              ddsuites\src\vs\xmlTools;
    //          </BuildDirectories>
    //
    //      This must be treated as text so the comment is not returned. A comment is
    //      only returned when there is other XML alongside it; the mere presence of
    //      an XML comment does not make the value XML.
    //
    // Heuristic: if InnerXml contains a "<" that is not merely part of a comment, the
    // value is assumed to be XML and InnerXml is returned. Otherwise, or when the
    // contents start with a CDATA section, InnerText is returned.

    // InnerXml is much more expensive than InnerText, so the trivial shapes
    // (no children, or a single plain text/CDATA child) are handled without it.
    if (!node.HasChildNodes)
    {
        return string.Empty;
    }

    if (node.ChildNodes.Count == 1)
    {
        XmlNodeType onlyChildType = node.FirstChild.NodeType;
        if (onlyChildType == XmlNodeType.Text || onlyChildType == XmlNodeType.CDATA)
        {
            return node.InnerText;
        }
    }

    string rawContents = node.InnerXml;
    int firstAngleBracket = rawContents.IndexOf('<');

    // The contents are treated as text when, checked in this order:
    //   - there is no markup at all (no '<'), so the text just needs unescaping;
    //   - the only markup is comments, which InnerText strips out
    //     (this is how part of a list in a property value can be commented out);
    //   - the contents start with a CDATA section, which InnerText unwraps.
    bool treatAsText =
        firstAngleBracket == -1
        || ContainsNoTagsOtherThanComments(rawContents, firstAngleBracket)
        || rawContents.StartsWith("<![CDATA[", StringComparison.Ordinal);

    // Otherwise it looks like genuine XML: return the inner XML so that tags,
    // comments and any XML escaping are preserved.
    return treatAsText ? node.InnerText : rawContents;
}