/// <summary>
/// Parses the html tag that starts at <paramref name="tagIdx"/> and updates the box tree:
/// a closing tag moves <paramref name="curBox"/> up to the matching parent, an opening tag
/// creates a child box and descends into it, a single tag creates a child box only.
/// </summary>
/// <param name="source">the html source to parse</param>
/// <param name="tagIdx">the index in <paramref name="source"/> where the tag starts</param>
/// <param name="curBox">the current box in html tree parsing; replaced when the tree level changes</param>
/// <returns>
/// the index of the '&gt;' that ends the tag; <c>tagIdx + 1</c> when no tag name could be parsed;
/// or the non-positive result of the '&gt;' search (-1 when none exists) when no end was found
/// </returns>
private static int ParseHtmlTag(string source, int tagIdx, ref CssBox curBox)
{
    int closeIdx = source.IndexOf('>', tagIdx + 1);
    if (closeIdx <= 0)
    {
        // no terminating '>' found - hand the search result back untouched
        return closeIdx;
    }

    // a '/' right before '>' marks a self-closed tag and is excluded from the parsed span
    bool selfClosed = source[closeIdx - 1] == '/';
    int tagLength = closeIdx - tagIdx + 1 - (selfClosed ? 1 : 0);

    string tagName;
    Dictionary<string, string> tagAttributes;

    // NOTE(review): the five-argument overload is not visible here; judging by how its result is
    // used, 'true' means a closing tag - confirm against that overload.
    bool isClosingTag = ParseHtmlTag(source, tagIdx, tagLength, out tagName, out tagAttributes);

    if (isClosingTag)
    {
        if (!HtmlUtils.IsSingleTag(tagName) && curBox.ParentBox != null)
        {
            // go one level up: look for the parent box that matches the closed tag
            curBox = DomUtils.FindParent(curBox.ParentBox, tagName, curBox);
        }

        return closeIdx;
    }

    if (string.IsNullOrEmpty(tagName))
    {
        // nothing usable was parsed - resume right after the tag start
        return tagIdx + 1;
    }

    bool isSingle = HtmlUtils.IsSingleTag(tagName) || selfClosed;
    var tag = new HtmlTag(tagName, isSingle, tagAttributes);
    var createdBox = CssBox.CreateBox(tag, curBox);
    if (!isSingle)
    {
        // go one level down, the new box becomes the current box
        curBox = createdBox;
    }

    return closeIdx;
}
/// <summary>
/// Parses the document into a tree of <see cref="CssBox"/> objects.<br/>
/// Html comments are removed first, then every tag match is processed in order: text found
/// between tags becomes an anonymous text box, closing tags move the current box up, opening
/// tags create a box and descend into it, single tags create a box without descending.
/// </summary>
/// <param name="document">the html source to parse</param>
/// <returns>
/// the root box of the parsed tree; when no root was established, a new block box that holds
/// the whole (comment-free) document as text
/// </returns>
public static CssBox ParseDocument(string document)
{
    document = RemoveHtmlComments(document);

    CssBox root = null;
    CssBox curBox = null;

    // index of the last character of the previously handled tag (-1 before the first tag)
    int lastEnd = -1;

    foreach (Match match in RegexParserUtils.Match(RegexParserUtils.HtmlTag, document))
    {
        // text between the previous tag and this one
        string text = String.Empty;
        if (match.Index > 0)
        {
            text = document.Substring(lastEnd + 1, match.Index - lastEnd - 1);
        }

        bool hasContent = !String.IsNullOrEmpty(text.Trim());
        if (hasContent)
        {
            if (curBox == null)
            {
                // text before any element: create a block that becomes root and current box
                curBox = CssBox.CreateBlock();
                root = curBox;
            }

            CssBox.CreateBox(curBox).Text = text;
        }

        var tag = ParseHtmlTag(match.Value);
        if (tag.IsClosing)
        {
            // handle tags that have no content but whitespace: keep a single space
            if (!hasContent && curBox != null && curBox.Boxes.Count == 0 && !string.IsNullOrEmpty(text))
            {
                CssBox.CreateBox(curBox).Text = " ";
            }

            // go one level up: find the parent box that matches the closed tag
            curBox = DomUtils.FindParent(root, tag.Name, curBox);
        }
        else
        {
            bool single = tag.IsSingle;
            var box = new CssBox(curBox, tag);
            if (!single)
            {
                // go one level down, the new box becomes the current box
                curBox = box;
            }
        }

        if (root == null && curBox != null)
        {
            // the first box that becomes current is the root and is displayed as a block
            root = curBox;
            root.Display = CssConstants.Block;
        }

        lastEnd = match.Index + match.Length - 1;
    }

    if (root == null)
    {
        // no root was established: wrap the whole document as text in a new block
        root = CssBox.CreateBlock();
        CssBox.CreateBox(root).Text = document;
        return root;
    }

    if (lastEnd < document.Length)
    {
        // text after the last tag is attached to the root box
        string trailingText = document.Substring(lastEnd + 1);
        if (!string.IsNullOrEmpty(trailingText.Trim()))
        {
            CssBox.CreateBox(root).Text = trailingText;
        }
    }

    return root;
}