예제 #1
0
 internal static void ConvertContentTo(HtmlNode node, TextWriter outText, PreceedingDomTextInfo textInfo)
 {
     foreach (HtmlNode subnode in node.ChildNodes)
     {
         ConvertTo(subnode, outText, textInfo);
     }
 }
예제 #2
0
 internal static void ConvertContentTo(HtmlNode node, TextWriter outText, PreceedingDomTextInfo textInfo)
 {
     foreach (HtmlNode subnode in node.ChildNodes)
     {
         ConvertTo(subnode, outText, textInfo);
     }
 }
예제 #3
0
        internal static void ConvertTo(HtmlNode node, TextWriter outText, PreceedingDomTextInfo textInfo)
        {
            string html;

            switch (node.NodeType)
            {
            case HtmlNodeType.Comment:
                // don't output comments
                break;

            case HtmlNodeType.Document:
                ConvertContentTo(node, outText, textInfo);
                break;

            case HtmlNodeType.Text:
                // script and style must not be output
                string parentName = node.ParentNode.Name;
                if ((parentName == "script") || (parentName == "style"))
                {
                    break;
                }
                // get text
                html = ((HtmlTextNode)node).Text;
                // is it in fact a special closing node output as text?
                if (HtmlNode.IsOverlappedClosingElement(html))
                {
                    break;
                }
                // check the text is meaningful and not a bunch of whitespaces
                if (html.Length == 0)
                {
                    break;
                }
                if (!textInfo.WritePrecedingWhiteSpace || textInfo.LastCharWasSpace)
                {
                    html = html.TrimStart();
                    if (html.Length == 0)
                    {
                        break;
                    }
                    textInfo.IsFirstTextOfDocWritten.Value = textInfo.WritePrecedingWhiteSpace = true;
                }
                outText.Write(HtmlEntity.DeEntitize(Regex.Replace(html.TrimEnd(), @"\s{2,}", " ")));
                if (textInfo.LastCharWasSpace = char.IsWhiteSpace(html[html.Length - 1]))
                {
                    outText.Write(' ');
                }
                break;

            case HtmlNodeType.Element:
                string endElementString = null;
                bool   isInline;
                bool   skip      = false;
                int    listIndex = 0;
                switch (node.Name)
                {
                case "nav":
                    skip     = true;
                    isInline = false;
                    break;

                case "body":
                case "section":
                case "article":
                case "aside":
                case "h1":
                case "h2":
                case "header":
                case "footer":
                case "address":
                case "main":
                case "div":
                case "p":
                case "table":        // stylistic - adjust as you tend to use
                    if (textInfo.IsFirstTextOfDocWritten)
                    {
                        outText.Write("\r\n");
                    }
                    endElementString = "\r\n";
                    isInline         = false;
                    break;

                case "br":
                    outText.Write("\r\n");
                    skip = true;
                    textInfo.WritePrecedingWhiteSpace = false;
                    isInline = true;
                    break;

                case "a":
                    if (node.Attributes.Contains("href"))
                    {
                        string href = node.Attributes["href"].Value.Trim();
                        if (node.InnerText.IndexOf(href, StringComparison.InvariantCultureIgnoreCase) == -1)
                        {
                            endElementString = "<" + href + ">";
                        }
                    }
                    isInline = true;
                    break;

                case "li":
                    if (textInfo.ListIndex > 0)
                    {
                        outText.Write("\r\n{0}.\t", textInfo.ListIndex++);
                    }
                    else
                    {
                        outText.Write("\r\n*\t");         //using '*' as bullet char, with tab after, but whatever you want eg "\t->", if utf-8 0x2022
                    }
                    isInline = false;
                    break;

                case "ol":
                    listIndex = 1;
                    goto case "ul";

                case "ul":         //not handling nested lists any differently at this stage - that is getting close to rendering problems
                    endElementString = "\r\n";
                    isInline         = false;
                    break;

                case "img":         //inline-block in reality
                    if (node.Attributes.Contains("alt"))
                    {
                        outText.Write('[' + node.Attributes["alt"].Value);
                        endElementString = "]";
                    }
                    if (node.Attributes.Contains("src"))
                    {
                        outText.Write('<' + node.Attributes["src"].Value + '>');
                    }
                    isInline = true;
                    break;

                default:
                    isInline = true;
                    break;
                }
                if (!skip && node.HasChildNodes)
                {
                    ConvertContentTo(node, outText, isInline ? textInfo : new PreceedingDomTextInfo(textInfo.IsFirstTextOfDocWritten)
                    {
                        ListIndex = listIndex
                    });
                }
                if (endElementString != null)
                {
                    outText.Write(endElementString);
                }
                break;
            }
        }
예제 #4
0
 internal static void ConvertTo(HtmlNode node, TextWriter outText, PreceedingDomTextInfo textInfo)
 {
     string html;
     switch (node.NodeType)
     {
         case HtmlNodeType.Comment:
             // don't output comments
             break;
         case HtmlNodeType.Document:
             ConvertContentTo(node, outText, textInfo);
             break;
         case HtmlNodeType.Text:
             // script and style must not be output
             string parentName = node.ParentNode.Name;
             if ((parentName == "script") || (parentName == "style"))
             {
                 break;
             }
             // get text
             html = ((HtmlTextNode)node).Text;
             // is it in fact a special closing node output as text?
             if (HtmlNode.IsOverlappedClosingElement(html))
             {
                 break;
             }
             // check the text is meaningful and not a bunch of whitespaces
             if (html.Length == 0)
             {
                 break;
             }
             if (!textInfo.WritePrecedingWhiteSpace || textInfo.LastCharWasSpace)
             {
                 html= html.TrimStart();
                 if (html.Length == 0) { break; }
                 textInfo.IsFirstTextOfDocWritten.Value = textInfo.WritePrecedingWhiteSpace = true;
             }
             outText.Write(HtmlEntity.DeEntitize(Regex.Replace(html.TrimEnd(), @"\s{2,}", " ")));
             if (textInfo.LastCharWasSpace = char.IsWhiteSpace(html[html.Length - 1]))
             {
                 outText.Write(' ');
             }
                 break;
         case HtmlNodeType.Element:
             string endElementString = null;
             bool isInline;
             bool skip = false;
             int listIndex = 0;
             switch (node.Name)
             {
                 case "nav":
                     skip = true;
                     isInline = false;
                     break;
                 case "body":
                 case "section":
                 case "article":
                 case "aside":
                 case "h1":
                 case "h2":
                 case "header":
                 case "footer":
                 case "address":
                 case "main":
                 case "div":
                 case "p":
                 case "table":// stylistic - adjust as you tend to use
                     if (textInfo.IsFirstTextOfDocWritten)
                     {
                         outText.Write("\r\n");
                     }
                     endElementString = "\r\n";
                     isInline = false;
                     break;
                 case "br":
                     outText.Write("\r\n");
                     skip = true;
                     textInfo.WritePrecedingWhiteSpace = false;
                     isInline = true;
                     break;
                 case "a":
                     if (node.Attributes.Contains("href"))
                     {
                         string href = node.Attributes["href"].Value.Trim();
                         if (node.InnerText.IndexOf(href, StringComparison.InvariantCultureIgnoreCase)==-1)
                         {
                             endElementString =  "<" + href + ">";
                         }
                     }
                     isInline = true;
                     break;
                 case "li":
                     if(textInfo.ListIndex>0)
                     {
                         outText.Write("\r\n{0}.\t", textInfo.ListIndex++);
                     }
                     else
                     {
                         outText.Write("\r\n*\t"); //using '*' as bullet char, with tab after, but whatever you want eg "\t->", if utf-8 0x2022
                     }
                     isInline = false;
                     break;
                 case "ol":
                     listIndex = 1;
                     goto case "ul";
                 case "ul": //not handling nested lists any differently at this stage - that is getting close to rendering problems
                     endElementString = "\r\n";
                     isInline = false;
                     break;
                 case "img": //inline-block in reality
                     if (node.Attributes.Contains("alt"))
                     {
                         outText.Write('[' + node.Attributes["alt"].Value);
                         endElementString = "]";
                     }
                     if (node.Attributes.Contains("src"))
                     {
                         outText.Write('<' + node.Attributes["src"].Value + '>');
                     }
                     isInline = true;
                     break;
                 default:
                     isInline = true;
                     break;
             }
             if (!skip && node.HasChildNodes)
             {
                 ConvertContentTo(node, outText, isInline ? textInfo : new PreceedingDomTextInfo(textInfo.IsFirstTextOfDocWritten){ ListIndex = listIndex });
             }
             if (endElementString != null)
             {
                 outText.Write(endElementString);
             }
             break;
     }
 }