/// <summary>
/// Restructures the children of <paramref name="box"/> so that the content before an inner block box,
/// the block box itself, and the content after it end up as separate sibling boxes.
/// </summary>
/// <param name="box">the box whose children mix inline and block boxes</param>
private static void CorrectBlockInsideInlineImp(CssBox box)
{
    // With at most one child there is nothing to split: the single child simply becomes a block.
    if (box.Boxes.Count <= 1)
    {
        box.Boxes[0].Display = CssConstants.Block;
        return;
    }

    // Gather the leading children that contain only inlines into a new block.
    // NOTE: the loop relies on the ParentBox assignment removing the child from box.Boxes.
    var leadingBlock = CssBox.CreateBlock(box);
    for (var first = box.Boxes[0]; ContainsInlinesOnlyDeep(first); first = box.Boxes[0])
    {
        first.ParentBox = leadingBlock;
    }
    leadingBlock.SetBeforeBox(box.Boxes[0]);

    // Detach the offending child and let the split routine distribute its content.
    var offender = box.Boxes[1];
    offender.ParentBox = null;
    CorrectBlockSplitBadBox(box, offender, leadingBlock);

    // Everything from the fourth child onwards is moved under one box created in front of the third child.
    if (box.Boxes.Count > 2)
    {
        var tailBox = CssBox.CreateBox(box, null, box.Boxes[2]);
        while (box.Boxes.Count > 3)
        {
            box.Boxes[3].ParentBox = tailBox;
        }
    }

    box.Display = CssConstants.Block;
}
/// <summary>
/// Split bad box that has inline and block boxes into two parts, the left - before the block box
/// and right - after the block box.
/// </summary>
/// <remarks>
/// The method relies on assigning <c>ParentBox</c> to move a box out of its previous parent's
/// <c>Boxes</c> list; the loops below only terminate if it does.
/// </remarks>
/// <param name="parentBox">the parent box that has the problem</param>
/// <param name="badBox">the box to split into different boxes</param>
/// <param name="leftBlock">the left block box that is created for the split</param>
private static void CorrectBlockSplitBadBox(CssBox parentBox, CssBox badBox, CssBox leftBlock)
{
    // Clone of the bad box (same tag and style) that receives the leading inline-only children.
    var leftbox = CssBox.CreateBox(leftBlock, badBox.HtmlTag);
    leftbox.InheritStyle(badBox, true);

    bool hadLeft = false;
    while (badBox.Boxes[0].IsInline && ContainsInlinesOnlyDeep(badBox.Boxes[0]))
    {
        hadLeft = true;
        badBox.Boxes[0].ParentBox = leftbox;
    }

    // The first remaining child is the one that stopped the loop above.
    var splitBox = badBox.Boxes[0];
    if (!ContainsInlinesOnlyDeep(splitBox))
    {
        // It still mixes content: split it recursively, then detach what is left of it.
        CorrectBlockSplitBadBox(parentBox, splitBox, leftBlock);
        splitBox.ParentBox = null;
    }
    else
    {
        splitBox.ParentBox = parentBox;
    }

    if (badBox.Boxes.Count > 0)
    {
        CssBox rightBox;

        // Fix: the existing right box is read from index 2, so it may only be reused when the parent
        // has at least three children (the previous "< 2" test allowed an out-of-range access).
        if (splitBox.ParentBox != null || parentBox.Boxes.Count < 3)
        {
            rightBox = CssBox.CreateBox(parentBox, badBox.HtmlTag);
            rightBox.InheritStyle(badBox, true);

            if (parentBox.Boxes.Count > 2)
            {
                rightBox.SetBeforeBox(parentBox.Boxes[1]);
            }

            // Fix: a detached split box (ParentBox == null) has no sibling list to be positioned in.
            if (splitBox.ParentBox != null)
            {
                splitBox.SetBeforeBox(rightBox);
            }
        }
        else
        {
            rightBox = parentBox.Boxes[2];
        }

        // Move every remaining child of the bad box to the right box.
        while (badBox.Boxes.Count > 0)
        {
            badBox.Boxes[0].ParentBox = rightBox;
        }
    }
    else if (splitBox.ParentBox != null && parentBox.Boxes.Count > 1)
    {
        splitBox.SetBeforeBox(parentBox.Boxes[1]);

        // A "br" that follows moved content is switched to inline display.
        if (splitBox.HtmlTag != null && splitBox.HtmlTag.Name == "br" && (hadLeft || leftBlock.Boxes.Count > 1))
        {
            splitBox.Display = CssConstants.Inline;
        }
    }
}
/// <summary>
/// Creates an anonymous text box under the current box for the source range
/// [<paramref name="startIdx"/>, <paramref name="tagIdx"/>).<br/>
/// Whitespace-only text also gets a box, because at this point it is not yet known whether the box
/// is preformatted; irrelevant boxes are expected to be removed at a later stage.
/// </summary>
/// <param name="source">the html source to parse</param>
/// <param name="startIdx">the start of the html part</param>
/// <param name="tagIdx">the index of the next html tag</param>
/// <param name="curBox">the current box in html tree parsing (not reassigned here)</param>
private static void AddTextBox(string source, int startIdx, int tagIdx, ref CssBox curBox)
{
    // Nothing lies between the start index and the tag: no box is created.
    if (tagIdx <= startIdx)
    {
        return;
    }

    var segment = new SubString(source, startIdx, tagIdx - startIdx);
    var textBox = CssBox.CreateBox(curBox);
    textBox.Text = segment;
}
/// <summary>
/// Restructures the children of <paramref name="box"/> so that the content before an inner block box,
/// the block box itself, and the content after it end up as separate sibling boxes.
/// </summary>
/// <param name="box">the box that has the problem</param>
/// <returns>
/// the temporary box created to hold the trailing children, or null when no such box was created
/// </returns>
private static CssBox CorrectBlockInsideInlineImp(CssBox box)
{
    if (box.Display == CssConstants.Inline)
    {
        box.Display = CssConstants.Block;
    }

    // Nothing to split: at most make the single inline child a block.
    bool needsSplit = box.Boxes.Count > 1 || box.Boxes[0].Boxes.Count > 1;
    if (!needsSplit)
    {
        if (box.Boxes[0].Display == CssConstants.Inline)
        {
            box.Boxes[0].Display = CssConstants.Block;
        }
        return null;
    }

    // Gather the leading children that contain only inlines into a new block.
    // NOTE: the loop relies on the ParentBox assignment removing the child from box.Boxes.
    var leadingBlock = CssBox.CreateBlock(box);
    for (var first = box.Boxes[0]; ContainsInlinesOnlyDeep(first); first = box.Boxes[0])
    {
        first.ParentBox = leadingBlock;
    }
    leadingBlock.SetBeforeBox(box.Boxes[0]);

    // Detach the offending child and let the split routine distribute its content.
    var offender = box.Boxes[1];
    offender.ParentBox = null;
    CorrectBlockSplitBadBox(box, offender, leadingBlock);

    // Drop the leading block again if it did not receive any inner elements.
    if (leadingBlock.Boxes.Count < 1)
    {
        leadingBlock.ParentBox = null;
    }

    // Index of the first trailing child: 2 while the leading block is still attached, otherwise 1.
    int tailStart;
    if (leadingBlock.ParentBox != null)
    {
        tailStart = 2;
    }
    else
    {
        tailStart = 1;
    }

    if (box.Boxes.Count <= tailStart)
    {
        return null;
    }

    // Temp box that collects the tail elements; it is returned to the caller, which (per the original
    // comment) takes the elements back so that no deep hierarchy is created.
    var tailHolder = CssBox.CreateBox(box, null, box.Boxes[tailStart]);
    while (box.Boxes.Count > tailStart + 1)
    {
        box.Boxes[tailStart + 1].ParentBox = tailHolder;
    }
    return tailHolder;
}
/// <summary>
/// Parses the source html to css boxes tree structure.
/// </summary>
/// <remarks>
/// Text between tags is added as anonymous text boxes, <c>&lt;!-- ... --&gt;</c> comments and other
/// <c>&lt;!...&gt;</c> declarations are skipped, and every other tag is handed to <c>ParseHtmlTag</c>.
/// A trailing <c>&lt;</c> or a tag without a closing <c>&gt;</c> is kept as plain text on the root box.
/// </remarks>
/// <param name="source">the html source to parse</param>
/// <returns>the root block box of the parsed tree</returns>
public static CssBox ParseDocument(string source)
{
    var root = CssBox.CreateBlock();
    var curBox = root;

    int endIdx = 0;
    int startIdx = 0;
    while (startIdx >= 0)
    {
        var tagIdx = source.IndexOf('<', startIdx);
        if (tagIdx < 0)
        {
            // no more tags; whatever follows endIdx is handled as trailing text below
            break;
        }

        // add the html text as anon css box to the structure
        AddTextBox(source, startIdx, tagIdx, ref curBox);

        if (tagIdx + 1 >= source.Length)
        {
            // Fix: '<' is the last character - there is nothing to inspect after it, keep it as text
            endIdx = tagIdx;
            break;
        }

        if (source[tagIdx + 1] == '!')
        {
            // Fix: a comment (<!-- bla -->) ends at "-->", not at the first '>', so a '>' inside it
            // no longer leaks the rest of the comment into the document; other declarations
            // (<!doctype ...>) still end at the first '>'
            bool isComment = tagIdx + 2 < source.Length && source[tagIdx + 2] == '-';
            string terminator = isComment ? "-->" : ">";
            int closeIdx = source.IndexOf(terminator, tagIdx + 2, System.StringComparison.Ordinal);

            // when the terminator is missing only the "<!" prefix is skipped
            endIdx = closeIdx >= 0 ? closeIdx + terminator.Length : tagIdx + 2;
        }
        else
        {
            // parse element tag to css box structure
            int tagEnd = ParseHtmlTag(source, tagIdx, ref curBox);
            if (tagEnd < 0)
            {
                // Fix: the tag is never closed; previously endIdx fell back to 0 and the whole source
                // was appended again as trailing text. Keep only the unparsed rest as text.
                endIdx = tagIdx;
                break;
            }
            endIdx = tagEnd + 1;
        }

        startIdx = endIdx;
    }

    // handle pieces of html without proper structure
    if (endIdx < source.Length)
    {
        // there is text after the end of last element
        var endText = new SubString(source, endIdx, source.Length - endIdx);
        if (!endText.IsEmptyOrWhitespace())
        {
            var abox = CssBox.CreateBox(root);
            abox.Text = endText;
        }
    }

    return root;
}
/// <summary>
/// Parses the html tag that starts at <paramref name="tagIdx"/> and updates the current box:
/// one outcome of the inner parse moves the current box up to a matching parent, a non-single
/// tag creates a child box and makes it current, and a single tag creates a child box only.
/// </summary>
/// <param name="source">the html source to parse</param>
/// <param name="tagIdx">the index of the html tag start</param>
/// <param name="curBox">the current box in html tree parsing</param>
/// <returns>
/// the index of the closing '&gt;' of the tag; <paramref name="tagIdx"/> + 1 when no tag name could be
/// read; or the failed search result (-1) when no '&gt;' follows the tag start
/// </returns>
private static int ParseHtmlTag(string source, int tagIdx, ref CssBox curBox)
{
    var endIdx = source.IndexOf('>', tagIdx + 1);
    if (endIdx <= 0)
    {
        return endIdx;
    }

    // A '/' right before '>' marks a self-closed tag and is excluded from the parsed length.
    bool selfClosed = source[endIdx - 1] == '/';
    int length = endIdx - tagIdx + 1;
    if (selfClosed)
    {
        length -= 1;
    }

    string tagName;
    Dictionary<string, string> tagAttributes;

    // NOTE(review): the overload is not visible here; a true result is handled as a closing tag.
    bool isClosing = ParseHtmlTag(source, tagIdx, length, out tagName, out tagAttributes);
    if (isClosing)
    {
        if (!HtmlUtils.IsSingleTag(tagName) && curBox.ParentBox != null)
        {
            // need to find the parent tag to go one level up
            curBox = DomUtils.FindParent(curBox.ParentBox, tagName, curBox);
        }
        return endIdx;
    }

    if (string.IsNullOrEmpty(tagName))
    {
        // no tag name was read: report the position right after the '<'
        return tagIdx + 1;
    }

    var isSingle = HtmlUtils.IsSingleTag(tagName) || selfClosed;
    var createdBox = CssBox.CreateBox(new HtmlTag(tagName, isSingle, tagAttributes), curBox);
    if (!isSingle)
    {
        // go one level down, make the new box the current box
        curBox = createdBox;
    }

    return endIdx;
}
/// <summary>
/// Parses the source html to css boxes tree structure.
/// </summary>
/// <remarks>
/// Text between tags is added as anonymous text boxes, <c>&lt;!-- ... --&gt;</c> comments and other
/// <c>&lt;!...&gt;</c> declarations are skipped, the content of a style element is added verbatim as
/// one text box, and every other tag is handed to <c>ParseHtmlTag</c>.
/// A trailing <c>&lt;</c> or a tag without a closing <c>&gt;</c> is kept as plain text on the root box.
/// </remarks>
/// <param name="source">the html source to parse</param>
/// <returns>the root block box of the parsed tree</returns>
public static CssBox ParseDocument(string source)
{
    var root = CssBox.CreateBlock();
    var curBox = root;

    int endIdx = 0;
    int startIdx = 0;
    while (startIdx >= 0)
    {
        var tagIdx = source.IndexOf('<', startIdx);
        if (tagIdx < 0)
        {
            // no more tags; whatever follows endIdx is handled as trailing text below
            break;
        }

        // add the html text as anon css box to the structure
        AddTextBox(source, startIdx, tagIdx, ref curBox);

        if (tagIdx + 1 >= source.Length)
        {
            // Fix: '<' is the last character - there is nothing to inspect after it, keep it as text
            endIdx = tagIdx;
            break;
        }

        if (source[tagIdx + 1] == '!')
        {
            // Fix: bounds-check before looking at the character after "<!" (source may end there)
            bool isComment = tagIdx + 2 < source.Length && source[tagIdx + 2] == '-';

            // comments (<!-- bla -->) end at "-->", other declarations (<!crap bla>) at the first '>'
            string terminator = isComment ? "-->" : ">";
            int closeIdx = source.IndexOf(terminator, tagIdx + 2, StringComparison.Ordinal);

            // when the terminator is missing only the "<!" prefix is skipped
            endIdx = closeIdx >= 0 ? closeIdx + terminator.Length : tagIdx + 2;
        }
        else
        {
            // parse element tag to css box structure
            int tagEnd = ParseHtmlTag(source, tagIdx, ref curBox);
            if (tagEnd < 0)
            {
                // Fix: the tag is never closed; previously endIdx fell back to 0 and the whole source
                // was appended again as trailing text. Keep only the unparsed rest as text.
                endIdx = tagIdx;
                break;
            }
            endIdx = tagEnd + 1;

            if (curBox.HtmlTag != null && curBox.HtmlTag.Name.Equals(HtmlConstants.Style, StringComparison.OrdinalIgnoreCase))
            {
                // the current box is a style element: take everything up to its closing tag as raw text
                var styleStart = endIdx;
                endIdx = source.IndexOf("</style>", styleStart, StringComparison.OrdinalIgnoreCase);
                if (endIdx > -1)
                {
                    AddTextBox(source, styleStart, endIdx, ref curBox);
                }
                // endIdx == -1 (no closing style tag) stops parsing and drops the rest, as before
            }
        }

        startIdx = endIdx > 0 ? endIdx : -1;
    }

    // handle pieces of html without proper structure
    if (endIdx > -1 && endIdx < source.Length)
    {
        // there is text after the end of last element
        var endText = new SubString(source, endIdx, source.Length - endIdx);
        if (!endText.IsEmptyOrWhitespace())
        {
            var abox = CssBox.CreateBox(root);
            abox.Text = endText;
        }
    }

    return root;
}
/// <summary>
/// Splits a box that mixes inline and block content into a left part (what precedes the block box)
/// and a right part (what follows it).
/// </summary>
/// <remarks>
/// The method relies on assigning <c>ParentBox</c> to move a box out of its previous parent's
/// <c>Boxes</c> list; the leading loop only terminates if it does.
/// </remarks>
/// <param name="parentBox">the parent box that has the problem</param>
/// <param name="badBox">the box to split into different boxes</param>
/// <param name="leftBlock">the left block box that is created for the split</param>
private static void CorrectBlockSplitBadBox(CssBox parentBox, CssBox badBox, CssBox leftBlock)
{
    // Clone of the bad box that receives the leading inline-only children; it is created lazily so
    // that no empty clone is left behind when there are no such children.
    CssBox headClone = null;
    for (var child = badBox.Boxes[0]; child.IsInline && ContainsInlinesOnlyDeep(child); child = badBox.Boxes[0])
    {
        if (headClone == null)
        {
            headClone = CssBox.CreateBox(leftBlock, badBox.HtmlTag);
            headClone.InheritStyle(badBox, true);
        }
        child.ParentBox = headClone;
    }

    // The first remaining child is the one that stopped the loop above.
    var pivot = badBox.Boxes[0];
    if (ContainsInlinesOnlyDeep(pivot))
    {
        pivot.ParentBox = parentBox;
    }
    else
    {
        // It still mixes content: split it recursively, then detach what is left of it.
        CorrectBlockSplitBadBox(parentBox, pivot, leftBlock);
        pivot.ParentBox = null;
    }

    if (badBox.Boxes.Count == 0)
    {
        // Nothing remains for a right part; only position the pivot inside the parent.
        if (pivot.ParentBox != null && parentBox.Boxes.Count > 1)
        {
            pivot.SetBeforeBox(parentBox.Boxes[1]);

            // A "br" that follows moved content is switched to inline display.
            var pivotTag = pivot.HtmlTag;
            if (pivotTag != null && pivotTag.Name == "br" && (headClone != null || leftBlock.Boxes.Count > 1))
            {
                pivot.Display = CssConstants.Inline;
            }
        }
        return;
    }

    // The remaining children go to a right box: the parent's third child is reused when the pivot
    // is detached and that child exists, otherwise a new clone of the bad box is created.
    CssBox tailBox;
    if (pivot.ParentBox == null && parentBox.Boxes.Count >= 3)
    {
        tailBox = parentBox.Boxes[2];
    }
    else
    {
        tailBox = CssBox.CreateBox(parentBox, badBox.HtmlTag);
        tailBox.InheritStyle(badBox, true);

        if (parentBox.Boxes.Count > 2)
        {
            tailBox.SetBeforeBox(parentBox.Boxes[1]);
        }

        if (pivot.ParentBox != null)
        {
            pivot.SetBeforeBox(tailBox);
        }
    }

    tailBox.SetAllBoxes(badBox);
}
/// <summary>
/// Splits a box that mixes inline and block content into a left part (what precedes the block box)
/// and a right part (what follows it).
/// </summary>
/// <remarks>
/// The method relies on assigning <c>ParentBox</c> to move a box out of its previous parent's
/// <c>Boxes</c> list; the leading loop only terminates if it does.
/// </remarks>
/// <param name="parentBox">the parent box that has the problem</param>
/// <param name="badBox">the box to split into different boxes</param>
/// <param name="leftBlock">the left block box that is created for the split</param>
private static void CorrectBlockSplitBadBox(CssBox parentBox, CssBox badBox, CssBox leftBlock)
{
    CssBox inlinePart = null;

    // While the first child of the bad box is inline and holds only inline content, move it away.
    var candidate = badBox.Boxes[0];
    while (candidate.IsInline && ContainsInlinesOnlyDeep(candidate))
    {
        if (inlinePart == null)
        {
            // created on first use: with no elements to put in the left box there is no reason to keep it
            inlinePart = CssBox.CreateBox(leftBlock, badBox.HtmlTag);
            inlinePart.InheritStyle(badBox, true);
        }

        // re-parent the child from the bad box to the new left-side box
        candidate.ParentBox = inlinePart;
        candidate = badBox.Boxes[0];
    }

    // The child that ended the loop is the split point.
    var splitPoint = candidate;
    bool splitIsInlineOnly = ContainsInlinesOnlyDeep(splitPoint);
    if (splitIsInlineOnly)
    {
        splitPoint.ParentBox = parentBox;
    }
    else
    {
        // the split point itself mixes content: split it first, then detach it
        CorrectBlockSplitBadBox(parentBox, splitPoint, leftBlock);
        splitPoint.ParentBox = null;
    }

    bool hasRemainder = badBox.Boxes.Count > 0;
    if (!hasRemainder)
    {
        if (splitPoint.ParentBox != null && parentBox.Boxes.Count > 1)
        {
            splitPoint.SetBeforeBox(parentBox.Boxes[1]);

            // a "br" that follows moved content is switched to inline display
            var tag = splitPoint.HtmlTag;
            bool isLineBreak = tag != null && tag.Name == "br";
            if (isLineBreak && (inlinePart != null || leftBlock.Boxes.Count > 1))
            {
                splitPoint.Display = CssConstants.Inline;
            }
        }
        return;
    }

    // Pick the box that takes over the remaining children of the bad box.
    CssBox remainderBox;
    if (splitPoint.ParentBox != null || parentBox.Boxes.Count < 3)
    {
        remainderBox = CssBox.CreateBox(parentBox, badBox.HtmlTag);
        remainderBox.InheritStyle(badBox, true);

        if (parentBox.Boxes.Count > 2)
        {
            remainderBox.SetBeforeBox(parentBox.Boxes[1]);
        }

        if (splitPoint.ParentBox != null)
        {
            splitPoint.SetBeforeBox(remainderBox);
        }
    }
    else
    {
        // the split point is detached and the parent already has a third child: reuse that child
        remainderBox = parentBox.Boxes[2];
    }

    remainderBox.SetAllBoxes(badBox);
}
/// <summary>
/// Parses the html document into a css boxes tree structure.
/// </summary>
/// <remarks>
/// Html comments are removed first. The document is then walked tag by tag (using the html tag regex):
/// text between tags becomes anonymous text boxes, closing tags move the current box up, single tags
/// add a child box, and other tags add a child box and make it the current box.
/// </remarks>
/// <param name="document">the html source to parse</param>
/// <returns>the root box of the parsed tree; a block box holding the whole text if no root was created</returns>
public static CssBox ParseDocument(string document)
{
    document = RemoveHtmlComments(document);

    int lastEnd = -1;
    CssBox root = null;
    CssBox curBox = null;
    var tags = RegexParserUtils.Match(RegexParserUtils.HtmlTag, document);
    foreach (Match tagmatch in tags)
    {
        // text between the end of the previous tag and the start of this one
        string text = tagmatch.Index > 0
            ? document.Substring(lastEnd + 1, tagmatch.Index - lastEnd - 1)
            : String.Empty;

        var emptyText = String.IsNullOrEmpty(text.Trim());
        if (!emptyText)
        {
            if (curBox == null)
            {
                root = curBox = CssBox.CreateBlock();
            }

            var abox = CssBox.CreateBox(curBox);
            abox.Text = text;
        }

        var tag = ParseHtmlTag(tagmatch.Value);
        if (tag.IsClosing)
        {
            // handle tags that have no content but whitespace
            if (emptyText && curBox != null && curBox.Boxes.Count == 0 && !string.IsNullOrEmpty(text))
            {
                var abox = CssBox.CreateBox(curBox);
                abox.Text = " ";
            }

            // need to find the parent tag to go one level up
            curBox = DomUtils.FindParent(root, tag.Name, curBox);
        }
        else if (tag.IsSingle)
        {
            // Fix: a single tag met before any container used to be created with a null parent and was
            // lost (a document made only of single tags was then rendered as raw markup text).
            // Create the root block first, the same way the text branch above does.
            if (curBox == null)
            {
                root = curBox = CssBox.CreateBlock();
            }

            // the current box is not changed
            new CssBox(curBox, tag);
        }
        else
        {
            // go one level down, make the new box the current box
            curBox = new CssBox(curBox, tag);
        }

        // the first box created from a tag becomes the root and is forced to block display
        if (root == null && curBox != null)
        {
            root = curBox;
            root.Display = CssConstants.Block;
        }

        lastEnd = tagmatch.Index + tagmatch.Length - 1;
    }

    if (root == null)
    {
        // no box was created at all: show the whole document as text
        root = CssBox.CreateBlock();
        var abox = CssBox.CreateBox(root);
        abox.Text = document;
    }
    else if (lastEnd + 1 < document.Length)
    {
        // there is text after the end of the last tag
        var endText = document.Substring(lastEnd + 1);
        if (!string.IsNullOrEmpty(endText.Trim()))
        {
            var abox = CssBox.CreateBox(root);
            abox.Text = endText;
        }
    }

    return root;
}