Example #1
0
        /// <summary>
        /// Converts a HtmlNode into a list of lines, assigns every line its
        /// normalised value and collapses each run of consecutive lines whose
        /// Text is null down to the first line of that run.
        /// </summary>
        /// <param name="node">Node whose content is converted into lines.</param>
        /// <param name="options">Conversion options; not read by this method.</param>
        /// <returns>The list obtained from the Lines property of the collection filled from <paramref name="node"/>.</returns>
        public IList <HtmlLine> GetLines(HtmlNode node, ConvertOptions options)
        {
            HtmlLineCollection collected = new HtmlLineCollection();
            GetLines(node, collected);

            IList <HtmlLine> result = collected.Lines;

            // Walk from the end towards the start: removing the element that
            // follows the current one never shifts an index still to be visited.
            HtmlLine following = null;
            int index = result.Count;

            while (--index >= 0)
            {
                HtmlLine current = result[index];
                current.NormalisedValue = NormaliseText(current.Text);

                // two neighbouring lines without text: drop the later one
                bool bothWithoutText = following != null && current.Text == null && following.Text == null;

                if (bothWithoutText)
                {
                    result.RemoveAt(index + 1);
                }

                following = current;
            }

            return result;
        }
Example #2
0
 /// <summary>
 /// Calls GetLines for each child of the node, in the order ChildNodes
 /// enumerates them. Does nothing when the node reports no child nodes.
 /// </summary>
 /// <param name="node">Node whose children are processed.</param>
 /// <param name="lines">Collection that receives the output.</param>
 private void RecurseToLines(HtmlNode node, HtmlLineCollection lines)
 {
     // leaf node: nothing to descend into
     if (!node.HasChildNodes)
     {
         return;
     }

     foreach (HtmlNode child in node.ChildNodes)
     {
         GetLines(child, lines);
     }
 }
Example #3
0
        /// <summary>
        /// Appends the content of a node, and recursively of its children, to
        /// the line collection. Nodes with a non-empty "filtered-out" attribute,
        /// comments and style/script/noscript elements produce no output.
        /// The lastWritten member (declared outside this method) records the
        /// most recent output so that a separating space is added only when needed.
        /// </summary>
        /// <param name="node">Node to convert.</param>
        /// <param name="lines">Collection that receives text and image lines.</param>
        private void GetLines(HtmlNode node, HtmlLineCollection lines)
        {
            // nodes flagged with a non-empty "filtered-out" attribute are skipped entirely
            bool isFilteredOut = !string.IsNullOrEmpty(node.GetAttributeValue("filtered-out", null));

            if (isFilteredOut)
            {
                return;
            }

            // comments never contribute output
            if (node.NodeType == HtmlNodeType.Comment)
            {
                return;
            }

            if (node.NodeType == HtmlNodeType.Text)
            {
                string raw = ((HtmlTextNode)node).Text;

                // a closing element that was emitted as text is not content
                if (HtmlNode.IsOverlappedClosingElement(raw))
                {
                    return;
                }

                string decoded = SystemPlus.Net.HtmlTools.HtmlDecode(raw);

                // whitespace-only text carries no meaning
                if (string.IsNullOrWhiteSpace(decoded))
                {
                    return;
                }

                lastWritten = decoded;

                // open a line first when none is current
                if (lines.CurrentLine == null)
                {
                    lines.AddTextLine(node.Name, node.ParentNode.XPath);
                }

                lines.AddText(lastWritten);
                return;
            }

            // every remaining node type is dispatched on its name
            switch (node.Name)
            {
                case "style":
                case "script":
                case "noscript":
                    // content of these elements is ignored
                    break;

                case "img":
                {
                    string source = node.GetAttributeDecoded("src", null);
                    string alternative = node.GetAttributeDecoded("alt", null);

                    // an image without a src attribute is dropped
                    if (source != null)
                    {
                        lastWritten = "\r\n";
                        lines.AddImageLine(node.Name, node.XPath, source, alternative);
                    }

                    break;
                }

                case "br":
                case "hr":
                    // explicit break: start a new line
                    lastWritten = "\r\n";
                    lines.AddTextLine(node.Name, node.XPath);
                    break;

                case "div":
                case "p":
                case "ul":
                case "ol":
                case "li":
                case "h1":
                case "h2":
                case "h3":
                case "h4":
                case "h5":
                case "h6":
                    // block element: new line before its content
                    lastWritten = "\r\n";
                    lines.AddTextLine(node.Name, node.XPath);

                    RecurseToLines(node, lines);

                    // list elements additionally get a new line after their content
                    if (node.Name == "ul" || node.Name == "ol" || node.Name == "li")
                    {
                        lastWritten = "\r\n";
                        lines.AddTextLine(node.Name, node.XPath);
                    }

                    break;

                default:
                    // any other element: separate its content from preceding
                    // output with one space unless that output already ends in whitespace
                    if (lastWritten != null && !char.IsWhiteSpace(lastWritten.Last()))
                    {
                        lastWritten = " ";

                        if (lines.CurrentLine == null)
                        {
                            lines.AddTextLine(node.Name, node.XPath);
                        }

                        lines.AddText(lastWritten);
                    }

                    RecurseToLines(node, lines);
                    break;
            }
        }