Пример #1
0
        /// <summary>
        /// Rearrange the DOM of the box to have block box with boxes before the inner block box and after.
        /// </summary>
        /// <param name="box">the box that has the problem</param>
        private static void CorrectBlockInsideInlineImp(CssBox box)
        {
            if (box.Boxes.Count > 1)
            {
                var leftBlock = CssBox.CreateBlock(box);

                while (ContainsInlinesOnlyDeep(box.Boxes[0]))
                {
                    box.Boxes[0].ParentBox = leftBlock;
                }
                leftBlock.SetBeforeBox(box.Boxes[0]);

                var splitBox = box.Boxes[1];
                splitBox.ParentBox = null;

                CorrectBlockSplitBadBox(box, splitBox, leftBlock);

                if (box.Boxes.Count > 2)
                {
                    var rightBox = CssBox.CreateBox(box, null, box.Boxes[2]);
                    while (box.Boxes.Count > 3)
                    {
                        box.Boxes[3].ParentBox = rightBox;
                    }
                }
                box.Display = CssConstants.Block;
            }
            else
            {
                box.Boxes[0].Display = CssConstants.Block;
            }
        }
Пример #2
0
        /// <summary>
        /// Split bad box that has inline and block boxes into two parts, the left - before the block box
        /// and right - after the block box.
        /// </summary>
        /// <param name="parentBox">the parent box that has the problem</param>
        /// <param name="badBox">the box to split into different boxes</param>
        /// <param name="leftBlock">the left block box that is created for the split</param>
        private static void CorrectBlockSplitBadBox(CssBox parentBox, CssBox badBox, CssBox leftBlock)
        {
            var leftbox = CssBox.CreateBox(leftBlock, badBox.HtmlTag);

            leftbox.InheritStyle(badBox, true);

            bool hadLeft = false;

            while (badBox.Boxes[0].IsInline && ContainsInlinesOnlyDeep(badBox.Boxes[0]))
            {
                hadLeft = true;
                badBox.Boxes[0].ParentBox = leftbox;
            }

            var splitBox = badBox.Boxes[0];

            if (!ContainsInlinesOnlyDeep(splitBox))
            {
                CorrectBlockSplitBadBox(parentBox, splitBox, leftBlock);
                splitBox.ParentBox = null;
            }
            else
            {
                splitBox.ParentBox = parentBox;
            }

            if (badBox.Boxes.Count > 0)
            {
                CssBox rightBox;
                if (splitBox.ParentBox != null || parentBox.Boxes.Count < 2)
                {
                    rightBox = CssBox.CreateBox(parentBox, badBox.HtmlTag);
                    rightBox.InheritStyle(badBox, true);

                    if (parentBox.Boxes.Count > 2)
                    {
                        rightBox.SetBeforeBox(parentBox.Boxes[1]);
                    }

                    splitBox.SetBeforeBox(rightBox);
                }
                else
                {
                    rightBox = parentBox.Boxes[2];
                }

                while (badBox.Boxes.Count > 0)
                {
                    badBox.Boxes[0].ParentBox = rightBox;
                }
            }
            else if (splitBox.ParentBox != null && parentBox.Boxes.Count > 1)
            {
                splitBox.SetBeforeBox(parentBox.Boxes[1]);
                if (splitBox.HtmlTag != null && splitBox.HtmlTag.Name == "br" && (hadLeft || leftBlock.Boxes.Count > 1))
                {
                    splitBox.Display = CssConstants.Inline;
                }
            }
        }
Пример #3
0
        /// <summary>
        /// Add html text anon box to the current box, this box will have the rendered text<br/>
        /// Adding box also for text that contains only whitespaces because we don't know yet if
        /// the box is preformatted. At later stage they will be removed if not relevant.
        /// </summary>
        /// <param name="source">the html source to parse</param>
        /// <param name="startIdx">the start of the html part</param>
        /// <param name="tagIdx">the index of the next html tag</param>
        /// <param name="curBox">the current box in html tree parsing</param>
        private static void AddTextBox(string source, int startIdx, int tagIdx, ref CssBox curBox)
        {
            var text = tagIdx > startIdx ? new SubString(source, startIdx, tagIdx - startIdx) : null;

            if (text != null)
            {
                var abox = CssBox.CreateBox(curBox);
                abox.Text = text;
            }
        }
Пример #4
0
        /// <summary>
        /// Rearrange the DOM of the box to have block box with boxes before the inner block box and after.
        /// </summary>
        /// <param name="box">the box that has the problem</param>
        private static CssBox CorrectBlockInsideInlineImp(CssBox box)
        {
            if (box.Display == CssConstants.Inline)
            {
                box.Display = CssConstants.Block;
            }

            if (box.Boxes.Count > 1 || box.Boxes[0].Boxes.Count > 1)
            {
                var leftBlock = CssBox.CreateBlock(box);

                while (ContainsInlinesOnlyDeep(box.Boxes[0]))
                {
                    box.Boxes[0].ParentBox = leftBlock;
                }
                leftBlock.SetBeforeBox(box.Boxes[0]);

                var splitBox = box.Boxes[1];
                splitBox.ParentBox = null;

                CorrectBlockSplitBadBox(box, splitBox, leftBlock);

                // remove block that did not get any inner elements
                if (leftBlock.Boxes.Count < 1)
                {
                    leftBlock.ParentBox = null;
                }

                int minBoxes = leftBlock.ParentBox != null ? 2 : 1;
                if (box.Boxes.Count > minBoxes)
                {
                    // create temp box to handle the tail elements and then get them back so no deep hierarchy is created
                    var tempRightBox = CssBox.CreateBox(box, null, box.Boxes[minBoxes]);
                    while (box.Boxes.Count > minBoxes + 1)
                    {
                        box.Boxes[minBoxes + 1].ParentBox = tempRightBox;
                    }

                    return(tempRightBox);
                }
            }
            else if (box.Boxes[0].Display == CssConstants.Inline)
            {
                box.Boxes[0].Display = CssConstants.Block;
            }

            return(null);
        }
Пример #5
0
        /// <summary>
        /// Parses the source html to css boxes tree structure.
        /// </summary>
        /// <param name="source">the html source to parse</param>
        public static CssBox ParseDocument(string source)
        {
            var root   = CssBox.CreateBlock();
            var curBox = root;

            int endIdx   = 0;
            int startIdx = 0;

            while (startIdx >= 0)
            {
                var tagIdx = source.IndexOf('<', startIdx);
                if (tagIdx >= 0 && tagIdx < source.Length)
                {
                    // add the html text as anon css box to the structure
                    AddTextBox(source, startIdx, tagIdx, ref curBox);

                    if (source[tagIdx + 1] == '!')
                    {
                        // skip the html crap elements (<!-- bla -->) (<!crap bla>)
                        startIdx = source.IndexOf(">", tagIdx + 2);
                        endIdx   = startIdx > 0 ? startIdx + 1 : tagIdx + 2;
                    }
                    else
                    {
                        // parse element tag to css box structure
                        endIdx = ParseHtmlTag(source, tagIdx, ref curBox) + 1;
                    }
                }
                startIdx = tagIdx > -1 && endIdx > 0 ? endIdx : -1;
            }

            // handle pices of html without proper structure
            if (endIdx < source.Length)
            {
                // there is text after the end of last element
                var endText = new SubString(source, endIdx, source.Length - endIdx);
                if (!endText.IsEmptyOrWhitespace())
                {
                    var abox = CssBox.CreateBox(root);
                    abox.Text = endText;
                }
            }

            return(root);
        }
Пример #6
0
        /// <summary>
        /// Parse the html part, the part from prev parsing index to the beginning of the next html tag.<br/>
        /// </summary>
        /// <param name="source">the html source to parse</param>
        /// <param name="tagIdx">the index of the next html tag</param>
        /// <param name="curBox">the current box in html tree parsing</param>
        /// <returns>the end of the parsed part, the new start index</returns>
        private static int ParseHtmlTag(string source, int tagIdx, ref CssBox curBox)
        {
            var endIdx = source.IndexOf('>', tagIdx + 1);

            if (endIdx > 0)
            {
                string tagName;
                Dictionary <string, string> tagAttributes;
                var length = endIdx - tagIdx + 1 - (source[endIdx - 1] == '/' ? 1 : 0);
                if (ParseHtmlTag(source, tagIdx, length, out tagName, out tagAttributes))
                {
                    if (!HtmlUtils.IsSingleTag(tagName) && curBox.ParentBox != null)
                    {
                        // need to find the parent tag to go one level up
                        curBox = DomUtils.FindParent(curBox.ParentBox, tagName, curBox);
                    }
                }
                else if (!string.IsNullOrEmpty(tagName))
                {
                    //new SubString(source, lastEnd + 1, tagmatch.Index - lastEnd - 1)
                    var isSingle = HtmlUtils.IsSingleTag(tagName) || source[endIdx - 1] == '/';
                    var tag      = new HtmlTag(tagName, isSingle, tagAttributes);

                    if (isSingle)
                    {
                        // the current box is not changed
                        CssBox.CreateBox(tag, curBox);
                    }
                    else
                    {
                        // go one level down, make the new box the current box
                        curBox = CssBox.CreateBox(tag, curBox);
                    }
                }
                else
                {
                    endIdx = tagIdx + 1;
                }
            }

            return(endIdx);
        }
Пример #7
0
        /// <summary>
        /// Parses the source html to css boxes tree structure.
        /// </summary>
        /// <param name="source">the html source to parse</param>
        public static CssBox ParseDocument(string source)
        {
            var root   = CssBox.CreateBlock();
            var curBox = root;

            int endIdx   = 0;
            int startIdx = 0;

            while (startIdx >= 0)
            {
                var tagIdx = source.IndexOf('<', startIdx);
                if (tagIdx >= 0 && tagIdx < source.Length)
                {
                    // add the html text as anon css box to the structure
                    AddTextBox(source, startIdx, tagIdx, ref curBox);

                    if (source[tagIdx + 1] == '!')
                    {
                        if (source[tagIdx + 2] == '-')
                        {
                            // skip the html comment elements (<!-- bla -->)
                            startIdx = source.IndexOf("-->", tagIdx + 2);
                            endIdx   = startIdx > 0 ? startIdx + 3 : tagIdx + 2;
                        }
                        else
                        {
                            // skip the html crap elements (<!crap bla>)
                            startIdx = source.IndexOf(">", tagIdx + 2);
                            endIdx   = startIdx > 0 ? startIdx + 1 : tagIdx + 2;
                        }
                    }
                    else
                    {
                        // parse element tag to css box structure
                        endIdx = ParseHtmlTag(source, tagIdx, ref curBox) + 1;

                        if (curBox.HtmlTag != null &&
                            curBox.HtmlTag.Name.Equals(HtmlConstants.Style, StringComparison.OrdinalIgnoreCase))
                        {
                            var endIdxS = endIdx;
                            endIdx = source.IndexOf("</style>", endIdx, StringComparison.OrdinalIgnoreCase);
                            if (endIdx > -1)
                            {
                                AddTextBox(source, endIdxS, endIdx, ref curBox);
                            }
                        }
                    }
                }

                startIdx = tagIdx > -1 && endIdx > 0 ? endIdx : -1;
            }

            // handle pieces of html without proper structure
            if (endIdx > -1 && endIdx < source.Length)
            {
                // there is text after the end of last element
                var endText = new SubString(source, endIdx, source.Length - endIdx);
                if (!endText.IsEmptyOrWhitespace())
                {
                    var abox = CssBox.CreateBox(root);
                    abox.Text = endText;
                }
            }

            return(root);
        }
Пример #8
0
        /// <summary>
        /// Split bad box that has inline and block boxes into two parts, the left - before the block box
        /// and right - after the block box.
        /// </summary>
        /// <param name="parentBox">the parent box that has the problem</param>
        /// <param name="badBox">the box to split into different boxes</param>
        /// <param name="leftBlock">the left block box that is created for the split</param>
        private static void CorrectBlockSplitBadBox(CssBox parentBox, CssBox badBox, CssBox leftBlock)
        {
            CssBox leftbox = null;

            while (badBox.Boxes[0].IsInline && ContainsInlinesOnlyDeep(badBox.Boxes[0]))
            {
                if (leftbox == null)
                {
                    // if there is no elements in the left box there is no reason to keep it
                    leftbox = CssBox.CreateBox(leftBlock, badBox.HtmlTag);
                    leftbox.InheritStyle(badBox, true);
                }
                badBox.Boxes[0].ParentBox = leftbox;
            }

            var splitBox = badBox.Boxes[0];

            if (!ContainsInlinesOnlyDeep(splitBox))
            {
                CorrectBlockSplitBadBox(parentBox, splitBox, leftBlock);
                splitBox.ParentBox = null;
            }
            else
            {
                splitBox.ParentBox = parentBox;
            }

            if (badBox.Boxes.Count > 0)
            {
                CssBox rightBox;
                if (splitBox.ParentBox != null || parentBox.Boxes.Count < 3)
                {
                    rightBox = CssBox.CreateBox(parentBox, badBox.HtmlTag);
                    rightBox.InheritStyle(badBox, true);

                    if (parentBox.Boxes.Count > 2)
                    {
                        rightBox.SetBeforeBox(parentBox.Boxes[1]);
                    }

                    if (splitBox.ParentBox != null)
                    {
                        splitBox.SetBeforeBox(rightBox);
                    }
                }
                else
                {
                    rightBox = parentBox.Boxes[2];
                }

                rightBox.SetAllBoxes(badBox);
            }
            else if (splitBox.ParentBox != null && parentBox.Boxes.Count > 1)
            {
                splitBox.SetBeforeBox(parentBox.Boxes[1]);
                if (splitBox.HtmlTag != null && splitBox.HtmlTag.Name == "br" && (leftbox != null || leftBlock.Boxes.Count > 1))
                {
                    splitBox.Display = CssConstants.Inline;
                }
            }
        }
Пример #9
0
        /// <summary>
        /// Split bad box that has inline and block boxes into two parts, the left - before the block box
        /// and right - after the block box.
        /// </summary>
        /// <param name="parentBox">the parent box that has the problem</param>
        /// <param name="badBox">the box to split into different boxes</param>
        /// <param name="leftBlock">the left block box that is created for the split</param>
        private static void CorrectBlockSplitBadBox(CssBox parentBox, CssBox badBox, CssBox leftBlock)
        {
            CssBox leftbox = null;

            //This checks if the first child of badbox isInline and if it has only inline elements
            while (badBox.Boxes[0].IsInline && ContainsInlinesOnlyDeep(badBox.Boxes[0]))
            {
                if (leftbox == null)
                {
                    // if there is no elements in the left box there is no reason to keep it
                    leftbox = CssBox.CreateBox(leftBlock, badBox.HtmlTag);
                    leftbox.InheritStyle(badBox, true);
                }
                //puts the HTML of leftbox (which is a new box?) into the parent ofbadbox (it's previous parent)
                badBox.Boxes[0].ParentBox = leftbox;
            }

            var splitBox = badBox.Boxes[0];

            if (!ContainsInlinesOnlyDeep(splitBox))
            {
                CorrectBlockSplitBadBox(parentBox, splitBox, leftBlock);
                splitBox.ParentBox = null;
            }
            else
            {
                splitBox.ParentBox = parentBox;
            }

            if (badBox.Boxes.Count > 0)
            {
                CssBox rightBox;
                if (splitBox.ParentBox != null || parentBox.Boxes.Count < 3)
                {
                    rightBox = CssBox.CreateBox(parentBox, badBox.HtmlTag);
                    rightBox.InheritStyle(badBox, true);

                    if (parentBox.Boxes.Count > 2)
                    {
                        rightBox.SetBeforeBox(parentBox.Boxes[1]);
                    }

                    if (splitBox.ParentBox != null)
                    {
                        splitBox.SetBeforeBox(rightBox);
                    }
                }
                else
                {
                    rightBox = parentBox.Boxes[2];
                }

                rightBox.SetAllBoxes(badBox);
            }
            else if (splitBox.ParentBox != null && parentBox.Boxes.Count > 1)
            {
                splitBox.SetBeforeBox(parentBox.Boxes[1]);
                if (splitBox.HtmlTag != null && splitBox.HtmlTag.Name == "br" && (leftbox != null || leftBlock.Boxes.Count > 1))
                {
                    splitBox.Display = CssConstants.Inline;
                }
            }
        }
Пример #10
0
        /// <summary>
        /// Parses the document
        /// </summary>
        public static CssBox ParseDocument(string document)
        {
            document = RemoveHtmlComments(document);

            int    lastEnd = -1;
            CssBox root    = null;
            CssBox curBox  = null;
            var    tags    = RegexParserUtils.Match(RegexParserUtils.HtmlTag, document);

            foreach (Match tagmatch in tags)
            {
                string text = tagmatch.Index > 0 ? document.Substring(lastEnd + 1, tagmatch.Index - lastEnd - 1) : String.Empty;

                var emptyText = String.IsNullOrEmpty(text.Trim());
                if (!emptyText)
                {
                    if (curBox == null)
                    {
                        root = curBox = CssBox.CreateBlock();
                    }

                    var abox = CssBox.CreateBox(curBox);
                    abox.Text = text;
                }

                var tag = ParseHtmlTag(tagmatch.Value);

                if (tag.IsClosing)
                {
                    // handle tags that have no content but whitespace
                    if (emptyText && curBox != null && curBox.Boxes.Count == 0 && !string.IsNullOrEmpty(text))
                    {
                        var abox = CssBox.CreateBox(curBox);
                        abox.Text = " ";
                    }

                    // need to find the parent tag to go one level up
                    curBox = DomUtils.FindParent(root, tag.Name, curBox);
                }
                else if (tag.IsSingle)
                {
                    // the current box is not changed
                    new CssBox(curBox, tag);
                }
                else
                {
                    // go one level down, make the new box the current box
                    curBox = new CssBox(curBox, tag);
                }

                if (root == null && curBox != null)
                {
                    root         = curBox;
                    root.Display = CssConstants.Block;
                }

                lastEnd = tagmatch.Index + tagmatch.Length - 1;
            }

            if (root == null)
            {
                root = CssBox.CreateBlock();
                var abox = CssBox.CreateBox(root);
                abox.Text = document;
            }
            else if (lastEnd < document.Length)
            {
                var endText = document.Substring(lastEnd + 1);
                if (!string.IsNullOrEmpty(endText.Trim()))
                {
                    var abox = CssBox.CreateBox(root);
                    abox.Text = endText;
                }
            }

            return(root);
        }