예제 #1
0
        /// <summary>
        /// Strips from <paramref name="dom"/> the content that should not appear in a published book:
        /// left-over bubble labels, page labels and descriptions, user-hidden editable divs, editing-only
        /// attributes, placeholder images, ckeditor residue and, with the help of a real browser, elements
        /// that are not displayed.  As a side effect, StoreFontUsed is called for every div left in the pages.
        /// </summary>
        /// <param name="dom">The document to clean up; it and its Body must not be null.</param>
        /// <param name="book">Supplies the page elements (when not making an epub) and IsTemplateBook.</param>
        /// <param name="removeInactiveLanguages">
        /// If true, kSelectThingsThatCanBeHidden chooses the candidates for the visibility check;
        /// otherwise only img elements are candidates.
        /// </param>
        /// <param name="epubMaker">
        /// Non-null when making an epub.  In that case the first child of the body is taken to be the single
        /// page, epub-only cleanup is applied, and IsDisplayed is called with its second argument true.
        /// </param>
        /// <param name="warningMessages">Passed on to RemoveEnterpriseFeaturesIfNeeded.</param>
        /// <param name="keepPageLabels">If true, divs with the pageLabel class are not removed.</param>
        private void RemoveUnwantedContentInternal(HtmlDom dom, Book.Book book, bool removeInactiveLanguages, EpubMaker epubMaker, ISet<string> warningMessages, bool keepPageLabels = false)
        {
            // The ControlForInvoke can be null for tests.  If it's not null, we better not need an Invoke!
            Debug.Assert(ControlForInvoke == null || !ControlForInvoke.InvokeRequired); // should be called on UI thread.
            Debug.Assert(dom != null && dom.Body != null);

            // Collect all the page divs.
            var pageElts = new List<XmlElement>();

            if (epubMaker != null)
            {
                pageElts.Add((XmlElement)dom.Body.FirstChild); // already have a single-page dom prepared for export
            }
            else
            {
                foreach (XmlElement page in book.GetPageElements())
                {
                    pageElts.Add(page);
                }
            }

            RemoveEnterpriseFeaturesIfNeeded(book, pageElts, warningMessages);

            // NOTE: every loop below that removes nodes first copies the query result into a list.
            // Deleting elements while enumerating the live node list messes up the enumeration and
            // some things that should be deleted aren't (see BL-5234).

            // Remove any left-over bubbles
            foreach (var label in dom.RawDom.SafeSelectNodes("//label").Cast<XmlElement>().ToList())
            {
                if (HasClass(label, "bubble"))
                {
                    label.ParentNode.RemoveChild(label);
                }
            }

            // Remove page labels and descriptions.  Also remove pages (or other div elements) that users have
            // marked invisible.  (The last mimics the effect of bookLayout/languageDisplay.less for editing
            // or PDF published books.)
            foreach (var elt in dom.RawDom.SafeSelectNodes("//div").Cast<XmlElement>().ToList())
            {
                // Page labels and descriptions are kept in template books.
                var isUnwantedPageInfo = !book.IsTemplateBook
                    && ((!keepPageLabels && HasClass(elt, "pageLabel")) || HasClass(elt, "pageDescription"));

                // REVIEW: is this needed now with the new strategy?
                var isHiddenByUser = HasClass(elt, "bloom-editable") && HasClass(elt, "bloom-visibility-user-off");

                // Decide once and remove once: a div matching several conditions must not be removed twice,
                // because its ParentNode is null after the first removal.
                if (isUnwantedPageInfo || isHiddenByUser)
                {
                    elt.ParentNode.RemoveChild(elt);
                }
            }

            // Our recordingmd5 attribute is not allowed by epub
            foreach (XmlElement elt in HtmlDom.SelectAudioSentenceElementsWithRecordingMd5(dom.RawDom.DocumentElement))
            {
                elt.RemoveAttribute("recordingmd5");
            }

            // Users should not be able to edit content of published books
            foreach (XmlElement elt in dom.RawDom.SafeSelectNodes("//div[@contenteditable]"))
            {
                elt.RemoveAttribute("contenteditable");
            }

            foreach (var div in dom.Body.SelectNodes("//div[@role='textbox']").Cast<XmlElement>())
            {
                div.RemoveAttribute("role");             // this isn't an editable textbox in an ebook
                div.RemoveAttribute("aria-label");       // don't want this without a role
                div.RemoveAttribute("spellcheck");       // too late for spell checking in an ebook
                // NOTE(review): "content-editable" looks like a misspelling of "contenteditable"; the
                // correctly spelled attribute has already been removed from every div by the loop above.
                div.RemoveAttribute("content-editable"); // too late for editing in an ebook
            }

            // Clean up img elements (BL-6035/BL-6036 and BL-7218)
            foreach (var img in dom.Body.SelectNodes("//img").Cast<XmlElement>().ToList())
            {
                // Ensuring a proper alt attribute is handled elsewhere
                var src = img.GetOptionalStringAttribute("src", null);
                if (String.IsNullOrEmpty(src) || src == "placeHolder.png")
                {
                    // If this is a template book, then the whole point of the book is to not have content. So then we want to preserve the placeholders so
                    // that people looking at the book on Bloom Library can see how the template pages are constructed.
                    if (!book.IsTemplateBook)
                    {
                        // If the image file doesn't exist, we want to find out about it.  But if there is no
                        // image file, epubcheck complains and it doesn't do any good anyway.
                        img.ParentNode.RemoveChild(img);
                    }
                }
                else
                {
                    var parent = img.ParentNode as XmlElement;
                    if (parent != null)
                    {
                        parent.RemoveAttribute("title"); // We don't want this in published books.
                    }
                    img.RemoveAttribute("title");        // We don't want this in published books.  (probably doesn't exist)
                    img.RemoveAttribute("type");         // This is invalid, but has appeared for svg branding images.
                }
            }

            if (epubMaker != null)
            {
                // epub-check doesn't like these attributes (BL-6036).  I suppose BloomReader might find them useful.
                foreach (var div in dom.Body.SelectNodes("//div[contains(@class, 'split-pane-component-inner')]").Cast<XmlElement>())
                {
                    div.RemoveAttribute("min-height");
                    div.RemoveAttribute("min-width");
                }
            }

            // These elements are inserted and supposedly removed by the ckeditor javascript code.
            // But at least one book created by our test team still has one output to an epub.  If it
            // exists, it probably has a style attribute (position:fixed) that epubcheck won't like.
            // (fixed position way off the screen to hide it)
            foreach (var hiddenSel in dom.Body.SelectNodes("//*[@data-cke-hidden-sel]").Cast<XmlElement>().ToList())
            {
                hiddenSel.ParentNode.RemoveChild(hiddenSel);
            }

            // Finally we try to remove elements (except image descriptions) that aren't visible.
            // To accurately determine visibility, we point a real browser at the document.
            // We've had some problems with this, which we now think are fixed; if it doesn't work, for
            // BloomReader we just allow the document to be a little bigger than it needs to be.
            // BloomReader will obey rules like display:none.
            // For epubs, we don't; display:none is not reliably obeyed, so the reader could see
            // unexpected things.

            // Build one document containing every page, for the browser to render.
            HtmlDom displayDom = null;

            foreach (XmlElement page in pageElts)
            {
                EnsureAllThingsThatCanBeHiddenHaveIds(page);
                if (displayDom == null)
                {
                    displayDom = book.GetHtmlDomWithJustOnePage(page);
                }
                else
                {
                    var pageNode = displayDom.RawDom.ImportNode(page, true);
                    displayDom.Body.AppendChild(pageNode);
                }
            }
            if (displayDom == null)
            {
                return; // no pages, so nothing to check for visibility
            }
            if (epubMaker != null)
            {
                epubMaker.AddEpubVisibilityStylesheetAndClass(displayDom);
            }
            // Stop if this instance is no longer the latest one.
            if (this != _latestInstance)
            {
                return;
            }
            if (!_browser.NavigateAndWaitTillDone(displayDom, 10000, "publish", () => this != _latestInstance,
                                                  false))
            {
                // We started having problems with timeouts here (BL-7892).
                // We may as well carry on. We only need the browser to have navigated so calls to IsDisplayed(elt)
                // below will give accurate answers. Even if the browser hasn't gotten that far yet (e.g., in
                // a long document), it may stay ahead of us. We'll report a failure (currently only for epubs, see above)
                // if we actually can't find the element we need in IsDisplayed().
                Debug.WriteLine("Failed to navigate fully to RemoveUnwantedContentInternal DOM");
                Logger.WriteEvent("Failed to navigate fully to RemoveUnwantedContentInternal DOM");
            }
            // Check again: the navigation above may have taken a while.
            if (this != _latestInstance)
            {
                return;
            }

            foreach (XmlElement page in pageElts)
            {
                // BL-9501 Don't remove pages from template books, which are often empty but we still want to show their components
                if (!book.IsTemplateBook)
                {
                    // As the constant's name here suggests, in theory, we could include divs
                    // that don't have .bloom-editable, and all their children.
                    // But I'm not smart enough to write that selector and for bloomds, all we're doing here is saving space,
                    // so those other divs we are missing doesn't seem to matter as far as I can think.
                    var kSelectThingsThatCanBeHiddenButAreNotText = ".//img";
                    var selector = removeInactiveLanguages
                        ? kSelectThingsThatCanBeHidden
                        : kSelectThingsThatCanBeHiddenButAreNotText;

                    // Deleting the elements in place during the foreach messes up the list and some things that should be deleted aren't
                    // (See BL-5234). So we gather up the elements to be deleted and delete them afterwards.
                    var toBeDeleted = new List<XmlElement>();
                    foreach (XmlElement elt in page.SafeSelectNodes(selector))
                    {
                        // Even when they are not displayed we want to keep image descriptions if they aren't empty.
                        // This is necessary for retaining any associated audio files to play.
                        // (If they are empty, they won't have any audio and may trigger embedding an unneeded font.)
                        // See https://issues.bloomlibrary.org/youtrack/issue/BL-7237.
                        // As noted above, if the displayDom is not sufficiently loaded for a definitive
                        // answer to IsDisplayed, we will throw when making epubs but not for bloom reader.
                        if (!IsDisplayed(elt, epubMaker != null) && !IsNonEmptyImageDescription(elt))
                        {
                            toBeDeleted.Add(elt);
                        }
                    }

                    foreach (var elt in toBeDeleted)
                    {
                        elt.ParentNode.RemoveChild(elt);
                    }
                }

                // We need the font information for wanted text elements as well.  This is a side-effect but related to
                // unwanted elements in that we don't need fonts that are used only by unwanted elements.  Note that
                // elements don't need to be actually visible to provide computed style information such as font-family.
                foreach (XmlElement elt in page.SafeSelectNodes(".//div"))
                {
                    StoreFontUsed(elt);
                }
                RemoveTempIds(page); // don't need temporary IDs any more.
            }
        }