예제 #1
0
파일: HtmlToText.cs 프로젝트: jvbsl/LiCo
 /// <summary>
 /// Converts every direct child of <paramref name="node"/> to text, in document order,
 /// by handing each child to <c>ConvertTo</c>.
 /// </summary>
 /// <param name="node">The node whose children are converted; the node itself is not written.</param>
 /// <param name="outText">The writer that receives the generated text.</param>
 /// <param name="textInfo">Whitespace/list state that is passed unchanged to every child.</param>
 internal static void ConvertContentTo(HtmlNode node, TextWriter outText, PreceedingDomTextInfo textInfo)
 {
     var children = node.ChildNodes;

     foreach (var child in children)
     {
         // All siblings share the same textInfo instance so state carries from one to the next.
         ConvertTo(child, outText, textInfo);
     }
 }
예제 #2
0
파일: HtmlToText.cs 프로젝트: jvbsl/LiCo
        /// <summary>
        /// Writes a plain-text rendering of <paramref name="node"/> (and, for elements and
        /// documents, of its descendants) to <paramref name="outText"/>.
        /// </summary>
        /// <remarks>
        /// Behaviour by node type:
        /// comments are dropped; a document node just has its children converted; text nodes are
        /// whitespace-normalised and de-entitized before being written; element nodes may emit
        /// text before and/or after their children depending on the tag name (line breaks for
        /// block-like tags, list markers for <c>li</c>, the link target for <c>a</c>, alt/src for
        /// <c>img</c>), and <c>nav</c> is skipped together with its content.
        /// </remarks>
        /// <param name="node">The node to convert.</param>
        /// <param name="outText">The writer that receives the generated text.</param>
        /// <param name="textInfo">
        /// Mutable state about the text written so far (whether leading whitespace should be kept,
        /// whether the last written character was a space, the current ordered-list index, and
        /// whether any text has been written yet). This method reads and updates it.
        /// </param>
        internal static void ConvertTo(HtmlNode node, TextWriter outText, PreceedingDomTextInfo textInfo)
        {
            string html;

            switch (node.NodeType)
            {
            case HtmlNodeType.Comment:
                // don't output comments
                break;

            case HtmlNodeType.Document:
                // The document node has no text of its own; only its children are converted.
                ConvertContentTo(node, outText, textInfo);
                break;

            case HtmlNodeType.Text:
                // script and style must not be output
                // NOTE(review): assumes ParentNode is non-null for every text node reaching here - confirm for detached nodes.
                string parentName = node.ParentNode.Name;
                if ((parentName == "script") || (parentName == "style"))
                {
                    break;
                }

                // get text
                html = ((HtmlTextNode)node).Text;
                // is it in fact a special closing node output as text?
                if (HtmlNode.IsOverlappedClosingElement(html))
                {
                    break;
                }

                // nothing to write for an empty text node
                if (html.Length == 0)
                {
                    break;
                }

                // Drop leading whitespace when it must not be written (e.g. right after a line break
                // or at the start of a block) or when the previous text already ended in a space.
                if (!textInfo.WritePrecedingWhiteSpace || textInfo.LastCharWasSpace)
                {
                    html = html.TrimStart();
                    if (html.Length == 0)
                    {
                        // whitespace-only text: skip it without touching the state
                        break;
                    }

                    // Real text is about to be written: record that, and allow leading whitespace from now on.
                    textInfo.IsFirstTextOfDocWritten.Value = textInfo.WritePrecedingWhiteSpace = true;
                }

                // Write the text without trailing whitespace, collapsing runs of two or more
                // whitespace characters into a single space, then decode HTML entities.
                outText.Write(HtmlEntity.DeEntitize(Regex.Replace(html.TrimEnd(), @"\s{2,}", " ")));
                // If the (untrimmed) text ended in whitespace, emit exactly one space and remember it
                // so the next text node does not add another. html is non-empty here, so the index is valid.
                if (textInfo.LastCharWasSpace = char.IsWhiteSpace(html[html.Length - 1]))
                {
                    outText.Write(' ');
                }

                break;

            case HtmlNodeType.Element:
                // Text to write after the element's children, if any.
                string endElementString = null;
                // Inline elements pass the current textInfo on to their children; non-inline ones get a fresh one.
                bool   isInline;
                // When true the element's children are not converted.
                bool   skip      = false;
                // ListIndex handed to the children's textInfo: 1 for "ol", otherwise 0.
                int    listIndex = 0;
                switch (node.Name)
                {
                case "nav":
                    // navigation blocks are dropped entirely
                    skip     = true;
                    isInline = false;
                    break;

                case "body":
                case "section":
                case "article":
                case "aside":
                case "h1":
                case "h2":
                case "header":
                case "footer":
                case "address":
                case "main":
                case "div":
                case "p":         // stylistic - adjust as you tend to use
                    // Start on a new line, but only once some text has been written,
                    // so the output does not begin with a blank line.
                    if (textInfo.IsFirstTextOfDocWritten)
                    {
                        outText.Write("\r\n");
                    }

                    endElementString = "\r\n";
                    isInline         = false;
                    break;

                case "br":
                    outText.Write("\r\n");
                    skip = true;
                    // the text following a line break must not start with whitespace
                    textInfo.WritePrecedingWhiteSpace = false;
                    isInline = true;
                    break;

                case "a":
                    if (node.Attributes.Contains("href"))
                    {
                        string href = node.Attributes["href"].Value.Trim();
                        // Append the target as <href> after the link text, unless the link text
                        // already contains it (case-insensitive comparison).
                        if (node.InnerText.IndexOf(href, StringComparison.InvariantCultureIgnoreCase) == -1)
                        {
                            endElementString = "<" + href + ">";
                        }
                    }

                    isInline = true;
                    break;

                case "li":
                    // A positive ListIndex means a numbered list: write the number and advance it.
                    if (textInfo.ListIndex > 0)
                    {
                        outText.Write("\r\n{0}.\t", textInfo.ListIndex++);
                    }
                    else
                    {
                        outText.Write(
                            "\r\n*\t");         //using '*' as bullet char, with tab after, but whatever you want eg "\t->", if utf-8 0x2022
                    }

                    isInline = false;
                    break;

                case "ol":
                    // ordered list: children start numbering at 1; the rest is shared with "ul"
                    listIndex = 1;
                    goto case "ul";

                case "ul"
                    :         //not handling nested lists any differently at this stage - that is getting close to rendering problems
                    endElementString = "\r\n";
                    isInline         = false;
                    break;

                case "img":         //inline-block in reality
                    // With both attributes present the output is "[alt<src>]": the closing
                    // bracket is deferred to endElementString and so comes after the src.
                    if (node.Attributes.Contains("alt"))
                    {
                        outText.Write('[' + node.Attributes["alt"].Value);
                        endElementString = "]";
                    }

                    if (node.Attributes.Contains("src"))
                    {
                        outText.Write('<' + node.Attributes["src"].Value + '>');
                    }

                    isInline = true;
                    break;

                default:
                    // any other tag is treated as inline and contributes only its children's text
                    isInline = true;
                    break;
                }

                // Convert the children. Inline elements reuse the caller's textInfo; non-inline
                // elements use a new PreceedingDomTextInfo built from the current
                // IsFirstTextOfDocWritten, with ListIndex set from listIndex.
                if (!skip && node.HasChildNodes)
                {
                    ConvertContentTo(node, outText,
                                     isInline
                                ? textInfo
                                : new PreceedingDomTextInfo(textInfo.IsFirstTextOfDocWritten)
                    {
                        ListIndex = listIndex
                    });
                }

                // Written even when the children were skipped or absent.
                if (endElementString != null)
                {
                    outText.Write(endElementString);
                }

                break;
            }
        }