/// <summary>
/// Builds the list of lines for <paramref name="node"/>.
/// Every collected line has its <c>NormalisedValue</c> set from
/// <c>NormaliseText</c> applied to its <c>Text</c>; wherever two neighbouring
/// lines both have a null <c>Text</c>, the later of the two is removed.
/// </summary>
/// <param name="node">Node whose content is collected into lines.</param>
/// <param name="options">Not read by this method.</param>
/// <returns>The <c>Lines</c> list of the collection that was filled.</returns>
public IList<HtmlLine> GetLines(HtmlNode node, ConvertOptions options)
{
    HtmlLineCollection collected = new HtmlLineCollection();
    GetLines(node, collected);
    IList<HtmlLine> result = collected.Lines;

    // Collapse consecutive empty lines. Walking from the end towards the
    // start means a removal at (index + 1) never disturbs an index that is
    // still to be visited.
    HtmlLine following = null;
    int index = result.Count;
    while (--index >= 0)
    {
        HtmlLine current = result[index];
        current.NormalisedValue = NormaliseText(current.Text);

        bool bothEmpty = following != null
            && current.Text == null
            && following.Text == null;
        if (bothEmpty)
        {
            // drop the later line of the pair, keep the earlier one
            result.RemoveAt(index + 1);
        }

        following = current;
    }

    return result;
}
/// <summary>
/// Calls <c>GetLines</c> for each child of <paramref name="node"/>, in
/// <c>ChildNodes</c> order. Does nothing when the node reports no children.
/// </summary>
/// <param name="node">Node whose children are processed.</param>
/// <param name="lines">Collection passed through to each child call.</param>
private void RecurseToLines(HtmlNode node, HtmlLineCollection lines)
{
    if (!node.HasChildNodes)
    {
        return;
    }

    foreach (HtmlNode child in node.ChildNodes)
    {
        GetLines(child, lines);
    }
}
/// <summary>
/// Adds the content of <paramref name="node"/> (and, for container elements,
/// of its children) to <paramref name="lines"/>. The <c>lastWritten</c> field
/// is updated with the most recent text or line break emitted so that a
/// separating space can be inserted before other elements when needed.
/// </summary>
/// <param name="node">Node to process.</param>
/// <param name="lines">Collection that receives text and image lines.</param>
private void GetLines(HtmlNode node, HtmlLineCollection lines)
{
    // nodes carrying a non-empty "filtered-out" attribute are skipped
    if (!string.IsNullOrEmpty(node.GetAttributeValue("filtered-out", null)))
    {
        return;
    }

    // comments never produce output
    if (node.NodeType == HtmlNodeType.Comment)
    {
        return;
    }

    if (node.NodeType == HtmlNodeType.Text)
    {
        string raw = ((HtmlTextNode)node).Text;

        // a special closing node that was emitted as text is not content
        if (HtmlNode.IsOverlappedClosingElement(raw))
        {
            return;
        }

        string decoded = SystemPlus.Net.HtmlTools.HtmlDecode(raw);

        // whitespace-only text is not meaningful
        if (string.IsNullOrWhiteSpace(decoded))
        {
            return;
        }

        lastWritten = decoded;
        if (lines.CurrentLine == null)
        {
            // the new line is located by the parent's XPath, not the text node's
            lines.AddTextLine(node.Name, node.ParentNode.XPath);
        }
        lines.AddText(lastWritten);
        return;
    }

    // everything else is dispatched on the tag name
    string tag = node.Name;
    switch (tag)
    {
        case "style":
        case "script":
        case "noscript":
            // ignored, children included
            break;

        case "img":
        {
            string source = node.GetAttributeDecoded("src", null);
            string alternate = node.GetAttributeDecoded("alt", null);
            if (source != null)
            {
                lastWritten = "\r\n";
                lines.AddImageLine(tag, node.XPath, source, alternate);
            }
            break;
        }

        case "br":
        case "hr":
            lastWritten = "\r\n";
            lines.AddTextLine(tag, node.XPath);
            break;

        case "div":
        case "p":
        case "ul":
        case "ol":
        case "li":
        case "h1":
        case "h2":
        case "h3":
        case "h4":
        case "h5":
        case "h6":
        {
            // open a new line, then collect the children
            lastWritten = "\r\n";
            lines.AddTextLine(tag, node.XPath);
            RecurseToLines(node, lines);

            // list containers and items also start a new line afterwards
            bool isListTag = tag == "ul" || tag == "ol" || tag == "li";
            if (isListTag)
            {
                lastWritten = "\r\n";
                lines.AddTextLine(tag, node.XPath);
            }
            break;
        }

        default:
        {
            // any other element: write a single separating space first if the
            // last thing written did not already end in whitespace
            bool needsSeparator = lastWritten != null
                && !char.IsWhiteSpace(lastWritten.Last());
            if (needsSeparator)
            {
                lastWritten = " ";
                if (lines.CurrentLine == null)
                {
                    lines.AddTextLine(tag, node.XPath);
                }
                lines.AddText(lastWritten);
            }
            RecurseToLines(node, lines);
            break;
        }
    }
}