/// <summary>
/// Strips content that should not appear in a published book from <paramref name="dom"/> (in place):
/// left-over bubbles, page labels/descriptions, user-hidden editables, editing-only attributes,
/// placeholder images, ckeditor residue and, finally, elements that a real browser reports as not displayed.
/// As a side effect, records the fonts used by the remaining divs (via StoreFontUsed) and removes the
/// temporary ids added for the visibility check.
/// </summary>
/// <param name="dom">The document to clean. Must be non-null and have a Body.</param>
/// <param name="book">Supplies the page elements (when <paramref name="epubMaker"/> is null) and the
/// IsTemplateBook flag, which suppresses several removals.</param>
/// <param name="removeInactiveLanguages">When true, the visibility pass uses kSelectThingsThatCanBeHidden;
/// otherwise it only considers img elements.</param>
/// <param name="epubMaker">When non-null, <paramref name="dom"/> is treated as an already-prepared single-page
/// dom and epub-specific cleanup is applied.</param>
/// <param name="warningMessages">Passed through to RemoveEnterpriseFeaturesIfNeeded.</param>
/// <param name="keepPageLabels">When true, divs with class "pageLabel" are retained.</param>
/// <remarks>
/// Returns early (leaving the visibility pass undone) if there are no pages or if this instance is no
/// longer _latestInstance.
/// </remarks>
private void RemoveUnwantedContentInternal(HtmlDom dom, Book.Book book, bool removeInactiveLanguages, EpubMaker epubMaker, ISet<string> warningMessages, bool keepPageLabels = false)
{
    // The ControlForInvoke can be null for tests.  If it's not null, we better not need an Invoke!
    Debug.Assert(ControlForInvoke == null || !ControlForInvoke.InvokeRequired); // should be called on UI thread.
    Debug.Assert(dom != null && dom.Body != null);

    // Collect all the page divs.
    var pageElts = new List<XmlElement>();
    if (epubMaker != null)
    {
        pageElts.Add((XmlElement)dom.Body.FirstChild); // already have a single-page dom prepared for export
    }
    else
    {
        foreach (XmlElement page in book.GetPageElements())
        {
            pageElts.Add(page);
        }
    }

    RemoveEnterpriseFeaturesIfNeeded(book, pageElts, warningMessages);

    // NOTE on the ToList() snapshots below: removing nodes while enumerating the node list returned by an
    // XPath query can make the enumeration skip elements (this is the problem described for BL-5234 further
    // down). Wherever a loop removes nodes, we therefore enumerate a snapshot of the query result instead.

    // Remove any left-over bubbles
    foreach (var elt in dom.RawDom.SafeSelectNodes("//label").Cast<XmlElement>().ToList())
    {
        if (HasClass(elt, "bubble"))
        {
            elt.ParentNode.RemoveChild(elt);
        }
    }

    // Remove page labels and descriptions.  Also remove pages (or other div elements) that users have
    // marked invisible.  (The last mimics the effect of bookLayout/languageDisplay.less for editing
    // or PDF published books.)
    // The rules are combined into a single decision so that a div matching more than one of them is
    // removed exactly once (after the first removal its ParentNode would be null).
    foreach (var elt in dom.RawDom.SafeSelectNodes("//div").Cast<XmlElement>().ToList())
    {
        var removeAsPageInfo = !book.IsTemplateBook
            && ((!keepPageLabels && HasClass(elt, "pageLabel")) || HasClass(elt, "pageDescription"));

        // REVIEW: is this (the user-off check) needed now with the new strategy?
        if (removeAsPageInfo
            || (HasClass(elt, "bloom-editable") && HasClass(elt, "bloom-visibility-user-off")))
        {
            elt.ParentNode.RemoveChild(elt);
        }
    }

    // Our recordingmd5 attribute is not allowed by epub
    foreach (XmlElement elt in HtmlDom.SelectAudioSentenceElementsWithRecordingMd5(dom.RawDom.DocumentElement))
    {
        elt.RemoveAttribute("recordingmd5");
    }

    // Users should not be able to edit content of published books
    foreach (XmlElement elt in dom.RawDom.SafeSelectNodes("//div[@contenteditable]"))
    {
        elt.RemoveAttribute("contenteditable");
    }

    foreach (var div in dom.Body.SelectNodes("//div[@role='textbox']").Cast<XmlElement>())
    {
        div.RemoveAttribute("role"); // this isn't an editable textbox in an ebook
        div.RemoveAttribute("aria-label"); // don't want this without a role
        div.RemoveAttribute("spellcheck"); // too late for spell checking in an ebook
        // NOTE(review): the attribute removed by the loop above is spelled "contenteditable";
        // "content-editable" is kept exactly as it was -- confirm whether it is ever present.
        div.RemoveAttribute("content-editable"); // too late for editing in an ebook
    }

    // Clean up img elements (BL-6035/BL-6036 and BL-7218)
    foreach (var img in dom.Body.SelectNodes("//img").Cast<XmlElement>().ToList())
    {
        // Ensuring a proper alt attribute is handled elsewhere
        var src = img.GetOptionalStringAttribute("src", null);
        if (String.IsNullOrEmpty(src) || src == "placeHolder.png")
        {
            // If this is a template book, then the whole point of the book is to not have content.  So then we want to preserve the placeholders so
            // that people looking at the book on Bloom Library can see how the template pages are constructed.
            if (!book.IsTemplateBook)
            {
                // If the image file doesn't exist, we want to find out about it.  But if there is no
                // image file, epubcheck complains and it doesn't do any good anyway.
                img.ParentNode.RemoveChild(img);
            }
        }
        else
        {
            // The parent of an img is expected to be an element; guard anyway rather than
            // dereferencing the result of an 'as' cast unchecked.
            var parent = img.ParentNode as XmlElement;
            if (parent != null)
            {
                parent.RemoveAttribute("title"); // We don't want this in published books.
            }
            img.RemoveAttribute("title"); // We don't want this in published books.  (probably doesn't exist)
            img.RemoveAttribute("type"); // This is invalid, but has appeared for svg branding images.
        }
    }

    if (epubMaker != null)
    {
        // epub-check doesn't like these attributes (BL-6036).  I suppose BloomReader might find them useful.
        foreach (var div in dom.Body.SelectNodes("//div[contains(@class, 'split-pane-component-inner')]").Cast<XmlElement>())
        {
            div.RemoveAttribute("min-height");
            div.RemoveAttribute("min-width");
        }
    }

    // These elements are inserted and supposedly removed by the ckeditor javascript code.
    // But at least one book created by our test team still has one output to an epub.  If it
    // exists, it probably has a style attribute (position:fixed) that epubcheck won't like.
    // (fixed position way off the screen to hide it)
    foreach (var div in dom.Body.SelectNodes("//*[@data-cke-hidden-sel]").Cast<XmlElement>().ToList())
    {
        div.ParentNode.RemoveChild(div);
    }

    // Finally we try to remove elements (except image descriptions) that aren't visible.
    // To accurately determine visibility, we point a real browser at the document.
    // We've had some problems with this, which we now think are fixed; if it doesn't work, for
    // BloomReader we just allow the document to be a little bigger than it needs to be.
    // BloomReader will obey rules like display:none.
    // For epubs, we don't; display:none is not reliably obeyed, so the reader could see
    // unexpected things.
    HtmlDom displayDom = null;
    foreach (XmlElement page in pageElts)
    {
        EnsureAllThingsThatCanBeHiddenHaveIds(page);
        if (displayDom == null)
        {
            displayDom = book.GetHtmlDomWithJustOnePage(page);
        }
        else
        {
            var pageNode = displayDom.RawDom.ImportNode(page, true);
            displayDom.Body.AppendChild(pageNode);
        }
    }

    if (displayDom == null)
    {
        return; // no pages, so nothing to check for visibility
    }

    if (epubMaker != null)
    {
        epubMaker.AddEpubVisibilityStylesheetAndClass(displayDom);
    }

    if (this != _latestInstance)
    {
        return;
    }

    if (!_browser.NavigateAndWaitTillDone(displayDom, 10000, "publish", () => this != _latestInstance, false))
    {
        // We started having problems with timeouts here (BL-7892).
        // We may as well carry on. We only need the browser to have navigated so calls to IsDisplayed(elt)
        // below will give accurate answers. Even if the browser hasn't gotten that far yet (e.g., in
        // a long document), it may stay ahead of us. We'll report a failure (currently only for epubs, see above)
        // if we actually can't find the element we need in IsDisplayed().
        Debug.WriteLine("Failed to navigate fully to RemoveUnwantedContentInternal DOM");
        Logger.WriteEvent("Failed to navigate fully to RemoveUnwantedContentInternal DOM");
    }

    if (this != _latestInstance)
    {
        return;
    }

    var toBeDeleted = new List<XmlElement>();
    // Deleting the elements in place during the foreach messes up the list and some things that should be deleted aren't
    // (See BL-5234).  So we gather up the elements to be deleted and delete them afterwards.
    foreach (XmlElement page in pageElts)
    {
        // BL-9501 Don't remove pages from template books, which are often empty but we still want to show their components
        if (!book.IsTemplateBook)
        {
            // As the constant's name here suggests, in theory, we could include divs
            // that don't have .bloom-editable, and all their children.
            // But I'm not smart enough to write that selector and for bloomds, all we're doing here is saving space,
            // so those other divs we are missing doesn't seem to matter as far as I can think.
            var kSelectThingsThatCanBeHiddenButAreNotText = ".//img";
            var selector = removeInactiveLanguages ? kSelectThingsThatCanBeHidden : kSelectThingsThatCanBeHiddenButAreNotText;
            foreach (XmlElement elt in page.SafeSelectNodes(selector))
            {
                // Even when they are not displayed we want to keep image descriptions if they aren't empty.
                // This is necessary for retaining any associated audio files to play.
                // (If they are empty, they won't have any audio and may trigger embedding an unneeded font.)
                // See https://issues.bloomlibrary.org/youtrack/issue/BL-7237.
                // As noted above, if the displayDom is not sufficiently loaded for a definitive
                // answer to IsDisplayed, we will throw when making epubs but not for bloom reader.
                if (!IsDisplayed(elt, epubMaker != null) && !IsNonEmptyImageDescription(elt))
                {
                    toBeDeleted.Add(elt);
                }
            }

            foreach (var elt in toBeDeleted)
            {
                elt.ParentNode.RemoveChild(elt);
            }
        }

        // We need the font information for wanted text elements as well.  This is a side-effect but related to
        // unwanted elements in that we don't need fonts that are used only by unwanted elements.  Note that
        // elements don't need to be actually visible to provide computed style information such as font-family.
        foreach (XmlElement elt in page.SafeSelectNodes(".//div"))
        {
            StoreFontUsed(elt);
        }

        RemoveTempIds(page); // don't need temporary IDs any more.
        toBeDeleted.Clear(); // start the next page with an empty deletion list
    }
}