Beispiel #1
0
    /// <summary>
    /// Analyzes a paragraph element and appends its text and table content
    /// to <paramref name="root"/> as a flat list of child nodes.
    /// </summary>
    /// <remarks>
    /// The nesting of the second level of the original HTML cannot be relied
    /// upon to be correct, so that level is deliberately not mirrored as a
    /// hierarchy: nested paragraph divs are processed recursively, but every
    /// node they produce is added directly to the same <paramref name="root"/>.
    /// </remarks>
    /// <param name="paragraph">The HTML element whose child nodes are scanned.</param>
    /// <param name="root">The node whose <c>Children</c> collection receives the generated nodes.</param>
    /// <param name="subTitle">
    /// Optional title text. It is prepended to the first text child found in a
    /// content div (or emitted on its own when that child has no text) and is
    /// then cleared. A <c>null</c> value is treated as an empty string.
    /// </param>
    void AnlayzeParagraph(HtmlNode paragraph, MyHtmlNode root, String subTitle = "")
    {
        // Treat null like "no subtitle" so that no node with a null Content is produced.
        subTitle = subTitle ?? String.Empty;

        // A paragraph that consists of a single text node is represented by
        // its subtitle alone (when it has one).
        if (paragraph.ChildNodes.Count == 1
            && paragraph.ChildNodes[0].Name == "#text"
            && subTitle != String.Empty)
        {
            var titleNode = new MyHtmlNode();
            titleNode.Content = subTitle;
            root.Children.Add(titleNode);
            return;
        }

        foreach (var node in paragraph.ChildNodes)
        {
            if (node.Name == "table")
            {
                AppendTableNode(node, root);
                continue;
            }

            // Only div elements carrying a "type" attribute are of interest.
            if (node.Name != "div" || !node.Attributes.Contains("type"))
            {
                continue;
            }

            var type = node.Attributes["type"].Value;
            if (type == "content")
            {
                // The subtitle is consumed by the first text child, so keep
                // whatever is left for the remaining siblings.
                subTitle = AppendContentChildren(node, root, subTitle);
            }
            else if (type == "paragraph")
            {
                var title = String.Empty;
                if (node.Attributes.Contains("title"))
                {
                    title = Normalizer.Normalize(node.Attributes["title"].Value);
                }
                AnlayzeParagraph(node, root, title);
            }
        }
    }

    /// <summary>
    /// Appends one node per child of a content div to <paramref name="root"/>:
    /// tables become table nodes, "hidden" elements are skipped, and every
    /// other child contributes its normalized inner text.
    /// </summary>
    /// <param name="contentDiv">The div whose <c>type</c> attribute is "content".</param>
    /// <param name="root">The node whose <c>Children</c> collection receives the generated nodes.</param>
    /// <param name="subTitle">The pending subtitle to attach to the first text child.</param>
    /// <returns>The subtitle that is still pending after the children were processed.</returns>
    private String AppendContentChildren(HtmlNode contentDiv, MyHtmlNode root, String subTitle)
    {
        foreach (var child in contentDiv.ChildNodes)
        {
            if (child.Name == "table")
            {
                // Tables do not consume the pending subtitle.
                AppendTableNode(child, root);
                continue;
            }
            if (child.Name == "hidden")
            {
                continue;
            }

            var content = Normalizer.Normalize(child.InnerText);
            if (!String.IsNullOrEmpty(content))
            {
                var contentNode = new MyHtmlNode();
                contentNode.Content = subTitle + content;
                root.Children.Add(contentNode);
            }
            else if (subTitle != String.Empty)
            {
                // No text of its own: emit the subtitle by itself.
                var contentNode = new MyHtmlNode();
                contentNode.Content = subTitle;
                root.Children.Add(contentNode);
            }

            // The subtitle is used at most once.
            subTitle = String.Empty;
        }
        return subTitle;
    }

    /// <summary>
    /// Allocates the next table id, stores the result of
    /// <c>HTMLTable.GetTable</c> for <paramref name="table"/> in <c>TableList</c>
    /// under that id, and appends a placeholder node (empty content, carrying
    /// the id) to <paramref name="root"/>.
    /// </summary>
    /// <param name="table">The table element to register.</param>
    /// <param name="root">The node whose <c>Children</c> collection receives the placeholder.</param>
    private void AppendTableNode(HtmlNode table, MyHtmlNode root)
    {
        var tableNode = new MyHtmlNode();
        tableNode.Content = String.Empty;
        TableId++;
        tableNode.TableId = TableId;
        var tableContent = HTMLTable.GetTable(table, TableId);
        TableList.Add(TableId, tableContent);
        root.Children.Add(tableNode);
    }