/// <summary>
/// Converts HTML text into an <see cref="XmlDocument"/>. The content is pre-processed
/// (filler for empty elements, whitespace markers, CDATA and SVG set aside), written to a
/// temporary file, run through tidy configured for XML output, post-processed to undo the
/// pre-processing, and finally loaded with whitespace preserved.
/// </summary>
/// <param name="content">The HTML text to convert.</param>
/// <param name="includeXmlDeclaration">Value assigned to tidy's AddXmlDeclaration option.</param>
/// <exception cref="ApplicationException">Thrown when tidy's CleanAndRepair returns a non-empty error string.</exception>
/// <exception cref="Exception">
/// Thrown when post-processing or loading the tidied output fails. The message is the original
/// message followed by the tidied HTML; the original exception is available as InnerException.
/// </exception>
/// <returns>The loaded document, after RemoveAllContentTypesMetas has been applied to it.</returns>
public static XmlDocument GetXmlDomFromHtml(string content, bool includeXmlDeclaration = false)
{
	var dom = new XmlDocument();
	content = AddFillerToKeepTidyFromRemovingEmptyElements(content);

	//in BL-2250, we found that in previous versions, this method would turn, for example, "<u> </u>" into REMOVEWHITESPACE.
	//That is fixed now, but this is needed to clean up existing books.
	content = content.Replace(@"REMOVEWHITESPACE", "");

	// tidy likes to insert newlines before <b>, <u>, <i>, and these other elements and convert any existing whitespace
	// there to a space.  (span was found by pursuing BL-7558)
	content = new Regex(@"<([ubi]|em|strong|sup|sub|span[^>]*)>").Replace(content, "REMOVEWHITESPACE<$1>");

	// fix for <br></br> tag doubling
	content = content.Replace("<br></br>", "<br />");

	// fix for > and similar in <style> element protected by CDATA.
	// At present we only need to account for this occurring once.
	// See Browser.SaveCustomizedCssRules.
	var startOfCdata = content.IndexOf(Browser.CdataPrefix, StringComparison.InvariantCulture);
	const string restoreCdataHere = "/****RestoreCDATAHere*****/";
	var endOfCdata = content.IndexOf(Browser.CdataSuffix, StringComparison.InvariantCulture);
	var savedCdata = "";
	if (startOfCdata >= 0 && endOfCdata >= startOfCdata)
	{
		// Cut the whole CDATA section (prefix through suffix) out of the content and leave
		// a placeholder that is swapped back after tidy has run.
		endOfCdata += Browser.CdataSuffix.Length;
		savedCdata = content.Substring(startOfCdata, endOfCdata - startOfCdata);
		content = content.Substring(0, startOfCdata) + restoreCdataHere + content.Substring(endOfCdata, content.Length - endOfCdata);
	}

	var removedSvgs = new List<string>();
	content = RemoveSvgs(content, removedSvgs);

	// Not a 'using': disposing the temp file has been seen to fail (BL-46), and we want to be
	// able to catch and ignore that failure. The dispose lives in a 'finally' so the temp file
	// is also cleaned up when tidy or the XML load throws.
	var temp = new TempFile();
	try
	{
		RobustFile.WriteAllText(temp.Path, content, Encoding.UTF8);
		using (var tidy = RobustIO.DocumentFromFile(temp.Path))
		{
			tidy.ShowWarnings = false;
			tidy.Quiet = true;
			tidy.WrapAt = 0; // prevents textarea wrapping.
			tidy.AddTidyMetaElement = false;
			tidy.OutputXml = true;
			tidy.CharacterEncoding = EncodingType.Utf8;
			tidy.InputCharacterEncoding = EncodingType.Utf8;
			tidy.OutputCharacterEncoding = EncodingType.Utf8;
			tidy.DocType = DocTypeMode.Omit; //when it supports html5, then we will let it out it
			//maybe try this? tidy.Markup = true;

			tidy.AddXmlDeclaration = includeXmlDeclaration;

			//NB: this does not prevent tidy from deleting <span data-libray='somethingImportant'></span>
			tidy.MergeSpans = AutoBool.No;
			tidy.DropEmptyParagraphs = false;
			tidy.MergeDivs = AutoBool.No;

			var errors = tidy.CleanAndRepair();
			if (!string.IsNullOrEmpty(errors))
			{
				throw new ApplicationException(errors);
			}

			var newContents = tidy.Save();
			try
			{
				newContents = RestoreSvgs(newContents, removedSvgs);
				newContents = RemoveFillerInEmptyElements(newContents);

				// NOTE(review): the two literals below are reproduced exactly as they appear in
				// this copy of the source. They look as though they may once have been an entity
				// pair that was decoded somewhere along the way - confirm against version control.
				//REVIEW: 1) are there others? and such are fine. 2) should we convert back on save?
				newContents = newContents.Replace(" ", " ");

				// The regex here is mainly for the \s* as a convenient way to remove whatever whitespace TIDY
				// has inserted. It's a fringe benefit that we can use the[biu]|... to deal with all these elements in one replace.
				newContents = Regex.Replace(newContents, @"REMOVEWHITESPACE\s*<([biu]|em|strong|sup|sub|span[^>]*)>", "<$1>");

				//In BL2250, we still had REMOVEWHITESPACE sticking around sometimes. The way we reproduced it was
				//with <u> </u>. That is, we started with
				//"REMOVEWHITESPACE <u> </u>", then libtidy (properly) removed the <u></u>, leaving us with only
				//"REMOVEWHITESPACE".
				newContents = Regex.Replace(newContents, @"REMOVEWHITESPACE", "");

				// remove blank lines at the end of style blocks
				newContents = Regex.Replace(newContents, @"\s+<\/style>", "</style>");

				// remove <br> elements immediately preceding </p> close tag (BL-2557)
				// These are apparently inserted by ckeditor as far as we can tell.  They don't show up on
				// fields that have never had a ckeditor activated, and always show up on fields that have
				// received focus and activated an inline ckeditor.  The ideal ckeditor use case appears
				// to be for data entry as part of a web page that get stored separately, with the data
				// obtained something like the following in javascript:
				//        ckedit.on('blur', function(evt) {
				//            var editor = evt['editor'];
				//            var data = editor.getData();
				//            <at this point, the data looks okay, with any <br> element before the </p> tag.>
				//            <store the data somewhere: the following lines have no effect, and may be silly.>
				//            var div = mapCkeditDiv[editor.id];
				//            div.innerHTML = data;
				//        });
				// Examining the initial value of div.innerHTML shows the unwanted <br> element, but it is
				// not in the data returned by editor.getData().  Since assigning to div.innerHTML doesn't
				// affect what gets written to the file, this hack was implemented instead.
				newContents = Regex.Replace(newContents, @"(<br></br>|<br ?/>)[\r\n]*</p>", "</p>");

				newContents = newContents.Replace(restoreCdataHere, savedCdata);

				// Don't let spaces between <strong>, <em>, or <u> elements be removed. (BL-2484)
				dom.PreserveWhitespace = true;
				dom.LoadXml(newContents);
			}
			catch (Exception e)
			{
				// Put the tidied HTML in the message so a failure report shows what could not be
				// processed, and keep the original exception (type and stack trace) as the inner exception.
				throw new Exception(
					string.Format("{0}{2}{2}{1}", e.Message, newContents, Environment.NewLine),
					e);
			}
		}
	}
	finally
	{
		try
		{
			//It's a mystery but http://jira.palaso.org/issues/browse/BL-46 was reported by several people on Win XP, even though a look at html tidy dispose indicates that it does dispose (and thus close) the stream.
			// Therefore, the dispose is an explicit call so that we can catch the error and ignore it, leaving an extra file in Temp.
			temp.Dispose();
			//enhance... could make a version of this which collects up any failed deletes and re-attempts them with each call to this
		}
		catch (Exception)
		{
			//swallow
			Debug.Fail("Repro of http://jira.palaso.org/issues/browse/BL-46 ");
		}
	}

	//this is a hack... each time we write the content, we add a new <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
	//so for now, we remove it when we read it in. It'll get added again when we write it out
	RemoveAllContentTypesMetas(dom);

	return dom;
}