/// <summary>
/// Rearranges the children of the given box so the inner block box sits between a block that holds
/// the boxes that came before it and a box that holds the boxes that came after it.
/// </summary>
/// <param name="box">the box that has the problem</param>
private static void CorrectBlockInsideInlineImp(CssBox box)
{
    // Trivial case: at most one child, so simply make the first child a block.
    // (Indexing assumes the box has at least one child.)
    if (box.Boxes.Count <= 1)
    {
        box.Boxes[0].Display = CssConstants.Block;
        return;
    }

    var leadingBlock = CssBox.CreateBlock(box);

    // Move every leading child that holds only inlines into the leading block.
    // The loop relies on re-parenting taking the child out of box.Boxes, so index 0
    // always refers to the next candidate.
    while (ContainsInlinesOnlyDeep(box.Boxes[0]))
    {
        box.Boxes[0].ParentBox = leadingBlock;
    }
    leadingBlock.SetBeforeBox(box.Boxes[0]);

    // Detach the offending box and let the split helper redistribute its content.
    var badBox = box.Boxes[1];
    badBox.ParentBox = null;
    CorrectBlockSplitBadBox(box, badBox, leadingBlock);

    // Anything left after the first two children is gathered under one trailing box.
    if (box.Boxes.Count > 2)
    {
        var trailingBox = CssBox.CreateBox(box, null, box.Boxes[2]);
        while (box.Boxes.Count > 3)
        {
            box.Boxes[3].ParentBox = trailingBox;
        }
    }

    box.Display = CssConstants.Block;
}
/// <summary>
/// Ensures block boxes only have block siblings and inline boxes only have inline siblings.<br/>
/// Every run of consecutive inline children found next to block children is wrapped in a new
/// anonymous block, then the same correction is applied recursively to the children.
/// </summary>
/// <param name="box">the current box to correct its sub-tree</param>
private static void CorrectInlineBoxesParent(CssBox box)
{
    if (ContainsVariantBoxes(box))
    {
        int index = 0;
        while (index < box.Boxes.Count)
        {
            if (box.Boxes[index].IsInline)
            {
                // The wrapper is created relative to the first inline of the run; after stepping
                // the index forward, the inlines that follow are pulled into the wrapper one by one.
                var wrapper = CssBox.CreateBlock(box, null, box.Boxes[index]);
                index++;

                // Re-parenting removes the child from box.Boxes, so the index stays put.
                while (index < box.Boxes.Count && box.Boxes[index].IsInline)
                {
                    box.Boxes[index].ParentBox = wrapper;
                }
            }

            index++;
        }
    }

    // Nothing to recurse into when the box holds inlines only.
    if (DomUtils.ContainsInlinesOnly(box))
    {
        return;
    }

    foreach (var child in box.Boxes)
    {
        CorrectInlineBoxesParent(child);
    }
}
/// <summary>
/// Makes sure a block box has only block siblings and an inline box has only inline siblings.<br/>
/// When a box mixes inline and block children, each run of consecutive inline children is moved
/// under a newly created block so the run forms a single block of its own.<br/>
/// The correction is then repeated for every child unless the box contains inlines only.
/// </summary>
/// <param name="box">the current box to correct its sub-tree</param>
private static void CorrectInlineBoxesParent(CssBox box)
{
    // Only boxes that mix inline and block children need wrapping.
    if (ContainsVariantBoxes(box))
    {
        for (int pos = 0; pos < box.Boxes.Count; pos++)
        {
            if (!box.Boxes[pos].IsInline)
            {
                continue;
            }

            // Create the new block in the parent at the position of the current inline child.
            var runBlock = CssBox.CreateBlock(box, null, box.Boxes[pos]);
            pos++;

            // Pull the run of inline children into the new block; each move shrinks box.Boxes,
            // so the same position is examined again on the next pass.
            while (pos < box.Boxes.Count && box.Boxes[pos].IsInline)
            {
                box.Boxes[pos].ParentBox = runBlock;
            }
        }
    }

    bool hasBlockContent = !DomUtils.ContainsInlinesOnly(box);
    if (hasBlockContent)
    {
        foreach (var nested in box.Boxes)
        {
            CorrectInlineBoxesParent(nested);
        }
    }
}
/// <summary>
/// Rearranges the children of the given box so the inner block box sits between a block that holds
/// the boxes that came before it and a box that holds the boxes that came after it.
/// </summary>
/// <param name="box">the box that has the problem</param>
/// <returns>
/// the temporary box that received the tail elements when there were any; otherwise null
/// </returns>
private static CssBox CorrectBlockInsideInlineImp(CssBox box)
{
    if (box.Display == CssConstants.Inline)
    {
        box.Display = CssConstants.Block;
    }

    // Trivial case: neither the box nor its first child has more than one child,
    // so the only fix needed is promoting an inline first child to a block.
    if (box.Boxes.Count <= 1 && box.Boxes[0].Boxes.Count <= 1)
    {
        if (box.Boxes[0].Display == CssConstants.Inline)
        {
            box.Boxes[0].Display = CssConstants.Block;
        }
        return null;
    }

    var leadingBlock = CssBox.CreateBlock(box);

    // Move every leading child that holds only inlines into the leading block.
    // The loop relies on re-parenting taking the child out of box.Boxes.
    while (ContainsInlinesOnlyDeep(box.Boxes[0]))
    {
        box.Boxes[0].ParentBox = leadingBlock;
    }
    leadingBlock.SetBeforeBox(box.Boxes[0]);

    // Detach the offending box and let the split helper redistribute its content.
    var badBox = box.Boxes[1];
    badBox.ParentBox = null;
    CorrectBlockSplitBadBox(box, badBox, leadingBlock);

    // Remove the leading block if it did not get any inner elements.
    if (leadingBlock.Boxes.Count < 1)
    {
        leadingBlock.ParentBox = null;
    }

    // Number of children that are expected to remain in front of the tail elements.
    int headCount;
    if (leadingBlock.ParentBox != null)
    {
        headCount = 2;
    }
    else
    {
        headCount = 1;
    }

    if (box.Boxes.Count <= headCount)
    {
        return null;
    }

    // Create a temp box for the tail elements so the caller can get them back
    // without a deep hierarchy being created.
    var tailBox = CssBox.CreateBox(box, null, box.Boxes[headCount]);
    while (box.Boxes.Count > headCount + 1)
    {
        box.Boxes[headCount + 1].ParentBox = tailBox;
    }
    return tailBox;
}
/// <summary>
/// Parses the source html to css boxes tree structure.
/// </summary>
/// <remarks>
/// Text between tags is added through <c>AddTextBox</c>, elements starting with "&lt;!" are skipped
/// and every other tag is handed to <c>ParseHtmlTag</c>. A "&lt;" that is the very last character of
/// the source cannot start a tag, so scanning stops there and the character is treated as trailing text.
/// </remarks>
/// <param name="source">the html source to parse</param>
/// <returns>the root block box of the parsed tree</returns>
public static CssBox ParseDocument(string source)
{
    var root = CssBox.CreateBlock();
    var curBox = root;

    int endIdx = 0;
    int startIdx = 0;
    while (startIdx >= 0)
    {
        var tagIdx = source.IndexOf('<', startIdx);
        if (tagIdx >= 0)
        {
            // add the html text as anon css box to the structure
            AddTextBox(source, startIdx, tagIdx, ref curBox);

            // a '<' at the very end has no following character to inspect; indexing past it
            // would throw, so stop here and let the trailing-text handling below pick it up
            if (tagIdx + 1 >= source.Length)
            {
                endIdx = tagIdx;
                break;
            }

            if (source[tagIdx + 1] == '!')
            {
                // skip the html non-content elements (<!-- bla -->) (<!doctype bla>);
                // the char overload keeps the search ordinal
                startIdx = source.IndexOf('>', tagIdx + 2);
                endIdx = startIdx > 0 ? startIdx + 1 : tagIdx + 2;
            }
            else
            {
                // parse element tag to css box structure
                endIdx = ParseHtmlTag(source, tagIdx, ref curBox) + 1;
            }
        }

        // continue after the handled element, or stop when no tag was found / nothing was consumed
        startIdx = tagIdx > -1 && endIdx > 0 ? endIdx : -1;
    }

    // handle pieces of html without proper structure
    if (endIdx < source.Length)
    {
        // there is text after the end of last element
        var endText = new SubString(source, endIdx, source.Length - endIdx);
        if (!endText.IsEmptyOrWhitespace())
        {
            var abox = CssBox.CreateBox(root);
            abox.Text = endText;
        }
    }

    return root;
}
/// <summary>
/// Walks the sub-tree and, for every image box whose display is block, wraps it in a newly created
/// block and switches the image itself to inline display.
/// </summary>
/// <param name="box">the current box to correct its sub-tree</param>
private static void CorrectImgBoxes(CssBox box)
{
    // Iterate from the end so wrapper blocks created along the way do not disturb
    // the positions that are still to be visited.
    int index = box.Boxes.Count;
    while (--index >= 0)
    {
        var child = box.Boxes[index];
        bool isBlockImage = child is CssBoxImage && child.Display == CssConstants.Block;

        if (!isBlockImage)
        {
            // not a block image: descend into its children
            CorrectImgBoxes(child);
            continue;
        }

        // Create the wrapper in the image's parent (relative to the image), then move the image
        // into it and make the image inline.
        var wrapper = CssBox.CreateBlock(child.ParentBox, null, child);
        child.ParentBox = wrapper;
        child.Display = CssConstants.Inline;
    }
}
/// <summary>
/// Corrects the DOM tree recursively by replacing "br" html boxes with anonymous blocks.<br/>
/// A replacement block gets an explicit height only when the "br" follows a block (so it produces an
/// empty line); otherwise no height is set and it only acts as a move to a new line.
/// </summary>
/// <param name="box">the current box to correct its sub-tree</param>
/// <param name="followingBlock">tracks whether the position being examined follows a block box;
/// updated while walking the tree and used to decide if a "br" creates an empty line or just a new line</param>
private static void CorrectLineBreaksBlocks(CssBox box, ref bool followingBlock)
{
    if (!followingBlock)
    {
        followingBlock = box.IsBlock;
    }

    // Correct the children first, carrying the flag from one child to the next.
    foreach (CssBox child in box.Boxes)
    {
        CorrectLineBreaksBlocks(child, ref followingBlock);

        bool childHasNoWords = child.Words.Count == 0;
        followingBlock = childHasNoWords && (followingBlock || child.IsBlock);
    }

    // Replace the "br" children one at a time; each pass rescans from the start and
    // only accepts a "br" located after the previously handled position.
    int lastHandledIndex = -1;
    while (true)
    {
        CssBox lineBreak = null;
        for (int idx = 0; idx < box.Boxes.Count; idx++)
        {
            var current = box.Boxes[idx];
            if (idx > lastHandledIndex && current.IsBrElement)
            {
                lineBreak = current;
                lastHandledIndex = idx;
                break;
            }

            if (current.Words.Count > 0)
            {
                followingBlock = false;
            }
            else if (current.IsBlock)
            {
                followingBlock = true;
            }
        }

        if (lineBreak == null)
        {
            break;
        }

        CssBox replacement = CssBox.CreateBlock(box, new HtmlTag("br"), lineBreak);
        if (followingBlock)
        {
            // atodo: check the height to min-height when it is supported
            replacement.Height = ".95em";
        }

        // detach the original "br" box now that the anonymous block stands in for it
        lineBreak.ParentBox = null;
    }
}
/// <summary>
/// Corrects the DOM tree recursively by replacing "br" html boxes with anonymous blocks.<br/>
/// A replacement block gets an explicit height when the "br" has no preceding box or the preceding
/// box is not inline (so it produces an empty line); otherwise no height is set and it only acts
/// as a move to a new line.
/// </summary>
/// <param name="box">the current box to correct its sub-tree</param>
private static void CorrectLineBreaksBlocks(CssBox box)
{
    // Replace the "br" children one at a time; each pass rescans from the start and
    // only accepts a "br" located after the previously handled position.
    int lastHandledIndex = -1;
    while (true)
    {
        CssBox lineBreak = null;
        CssBox previous = null;

        for (int idx = 0; idx < box.Boxes.Count; idx++)
        {
            var current = box.Boxes[idx];
            bool isNewBr = idx > lastHandledIndex
                           && current.HtmlTag != null
                           && current.HtmlTag.Name == "br";
            if (isNewBr)
            {
                lineBreak = current;
                lastHandledIndex = idx;
                break;
            }

            // remember the last box seen before the "br" that gets picked
            previous = current;
        }

        if (lineBreak == null)
        {
            break;
        }

        var replacement = CssBox.CreateBlock(box, new HtmlTag("br"), lineBreak);
        if (previous == null || previous.Display != CssConstants.Inline)
        {
            // atodo: check the height to min-height when it is supported
            replacement.Height = ".9em";
        }

        // detach the original "br" box now that the anonymous block stands in for it
        lineBreak.ParentBox = null;
    }

    // Apply the same correction to every child.
    foreach (var child in box.Boxes)
    {
        CorrectLineBreaksBlocks(child);
    }
}
/// <summary>
/// Parses the source html to css boxes tree structure.
/// </summary>
/// <remarks>
/// Text between tags is added through <c>AddTextBox</c>, html comments and other "&lt;!" elements are
/// skipped, the raw content of a style element is added as text, and every other tag is handed to
/// <c>ParseHtmlTag</c>. A "&lt;" that is the very last character of the source cannot start a tag, so
/// scanning stops there and the character is treated as trailing text.
/// </remarks>
/// <param name="source">the html source to parse</param>
/// <returns>the root block box of the parsed tree</returns>
public static CssBox ParseDocument(string source)
{
    var root = CssBox.CreateBlock();
    var curBox = root;

    int endIdx = 0;
    int startIdx = 0;
    while (startIdx >= 0)
    {
        var tagIdx = source.IndexOf('<', startIdx);
        if (tagIdx >= 0)
        {
            // add the html text as anon css box to the structure
            AddTextBox(source, startIdx, tagIdx, ref curBox);

            // a '<' at the very end has no following character to inspect; indexing past it
            // would throw, so stop here and let the trailing-text handling below pick it up
            if (tagIdx + 1 >= source.Length)
            {
                endIdx = tagIdx;
                break;
            }

            if (source[tagIdx + 1] == '!')
            {
                // the length check keeps a source ending in "<!" from indexing past the end
                if (tagIdx + 2 < source.Length && source[tagIdx + 2] == '-')
                {
                    // skip the html comment elements (<!-- bla -->)
                    startIdx = source.IndexOf("-->", tagIdx + 2, StringComparison.Ordinal);
                    endIdx = startIdx > 0 ? startIdx + 3 : tagIdx + 2;
                }
                else
                {
                    // skip the other html non-content elements (<!doctype bla>)
                    startIdx = source.IndexOf('>', tagIdx + 2);
                    endIdx = startIdx > 0 ? startIdx + 1 : tagIdx + 2;
                }
            }
            else
            {
                // parse element tag to css box structure
                endIdx = ParseHtmlTag(source, tagIdx, ref curBox) + 1;

                if (curBox.HtmlTag != null && curBox.HtmlTag.Name.Equals(HtmlConstants.Style, StringComparison.OrdinalIgnoreCase))
                {
                    // take everything up to the closing style tag as raw text of the style box
                    var endIdxS = endIdx;
                    endIdx = source.IndexOf("</style>", endIdx, StringComparison.OrdinalIgnoreCase);
                    if (endIdx > -1)
                    {
                        AddTextBox(source, endIdxS, endIdx, ref curBox);
                    }
                }
            }
        }

        // continue after the handled element, or stop when no tag was found / nothing was consumed
        startIdx = tagIdx > -1 && endIdx > 0 ? endIdx : -1;
    }

    // handle pieces of html without proper structure
    if (endIdx > -1 && endIdx < source.Length)
    {
        // there is text after the end of last element
        var endText = new SubString(source, endIdx, source.Length - endIdx);
        if (!endText.IsEmptyOrWhitespace())
        {
            var abox = CssBox.CreateBox(root);
            abox.Text = endText;
        }
    }

    return root;
}
/// <summary>
/// Parses the document into a css boxes tree: html comments are removed first, then every tag matched
/// by the html tag pattern is processed in order, with the text found between tags added as text boxes.
/// </summary>
/// <param name="document">the html document to parse</param>
/// <returns>the root box of the parsed tree</returns>
public static CssBox ParseDocument(string document)
{
    document = RemoveHtmlComments(document);

    CssBox root = null;
    CssBox curBox = null;

    // index of the last character of the previously handled tag
    int lastEnd = -1;

    foreach (Match tagmatch in RegexParserUtils.Match(RegexParserUtils.HtmlTag, document))
    {
        // text located between the previous tag and this one
        string text = String.Empty;
        if (tagmatch.Index > 0)
        {
            text = document.Substring(lastEnd + 1, tagmatch.Index - lastEnd - 1);
        }

        bool textIsBlank = String.IsNullOrEmpty(text.Trim());
        if (!textIsBlank)
        {
            if (curBox == null)
            {
                root = curBox = CssBox.CreateBlock();
            }

            var textBox = CssBox.CreateBox(curBox);
            textBox.Text = text;
        }

        var tag = ParseHtmlTag(tagmatch.Value);
        if (tag.IsClosing)
        {
            // handle tags that have no content but whitespace
            bool onlyWhitespaceContent = textIsBlank
                                         && curBox != null
                                         && curBox.Boxes.Count == 0
                                         && !string.IsNullOrEmpty(text);
            if (onlyWhitespaceContent)
            {
                var spaceBox = CssBox.CreateBox(curBox);
                spaceBox.Text = " ";
            }

            // need to find the parent tag to go one level up
            curBox = DomUtils.FindParent(root, tag.Name, curBox);
        }
        else if (tag.IsSingle)
        {
            // the current box is not changed
            new CssBox(curBox, tag);
        }
        else
        {
            // go one level down, make the new box the current box
            curBox = new CssBox(curBox, tag);
        }

        // the first box created becomes the root and is displayed as a block
        if (root == null && curBox != null)
        {
            root = curBox;
            root.Display = CssConstants.Block;
        }

        lastEnd = tagmatch.Index + tagmatch.Length - 1;
    }

    if (root == null)
    {
        // no box was created at all: the whole document becomes a single text box
        root = CssBox.CreateBlock();
        var wholeTextBox = CssBox.CreateBox(root);
        wholeTextBox.Text = document;
        return root;
    }

    if (lastEnd < document.Length)
    {
        // text located after the last tag
        var endText = document.Substring(lastEnd + 1);
        if (!string.IsNullOrEmpty(endText.Trim()))
        {
            var trailingBox = CssBox.CreateBox(root);
            trailingBox.Text = endText;
        }
    }

    return root;
}