/// <summary>
/// Analyzes a paragraph node and appends its text and tables to
/// <paramref name="root"/> as a flat list of children.
/// </summary>
/// <param name="paragraph">The paragraph node whose child nodes are scanned.</param>
/// <param name="root">The node that receives every generated child node.</param>
/// <param name="subTitle">
/// Optional title text. It is prefixed to the first content item that is processed,
/// or emitted on its own when the paragraph consists of a single text node.
/// </param>
void AnlayzeParagraph(HtmlNode paragraph, MyHtmlNode root, String subTitle = "")
{
    // The nesting of the second level in the original HTML cannot be trusted,
    // so the second level is deliberately kept flat (no sub-layers).
    var topLevel = paragraph.ChildNodes;

    // A paragraph holding nothing but one text node contributes only its title.
    if (topLevel.Count == 1 && topLevel[0].Name == "#text" && subTitle != String.Empty)
    {
        root.Children.Add(new MyHtmlNode { Content = subTitle });
        return;
    }

    foreach (var node in topLevel)
    {
        switch (node.Name)
        {
            case "table":
                AppendTableNode(node, root);
                break;

            case "div":
                if (!node.Attributes.Contains("type"))
                {
                    break;
                }

                var divType = node.Attributes["type"].Value;
                if (divType == "content")
                {
                    // The title is consumed by the content items, so carry the
                    // updated value over to the remaining sibling nodes.
                    subTitle = AppendContentItems(node, root, subTitle);
                }
                else if (divType == "paragraph")
                {
                    var nestedTitle = String.Empty;
                    if (node.Attributes.Contains("title"))
                    {
                        nestedTitle = Normalizer.Normalize(node.Attributes["title"].Value);
                    }

                    // Nested paragraphs are flattened into the same root.
                    AnlayzeParagraph(node, root, nestedTitle);
                }

                break;
        }
    }
}

/// <summary>
/// Walks the children of a content div and appends one node per table or
/// non-empty text item to <paramref name="root"/>.
/// </summary>
/// <param name="contentDiv">The div whose "type" attribute is "content".</param>
/// <param name="root">The node that receives the generated child nodes.</param>
/// <param name="subTitle">The pending title text to attach to the first text item.</param>
/// <returns>The title text that is still pending after the div has been processed.</returns>
private String AppendContentItems(HtmlNode contentDiv, MyHtmlNode root, String subTitle)
{
    foreach (var item in contentDiv.ChildNodes)
    {
        if (item.Name == "table")
        {
            // Tables do not consume the pending title.
            AppendTableNode(item, root);
            continue;
        }

        if (item.Name == "hidden")
        {
            continue;
        }

        var text = Normalizer.Normalize(item.InnerText);
        if (String.IsNullOrEmpty(text))
        {
            // No text of its own: emit the pending title alone, if there is one.
            if (subTitle != String.Empty)
            {
                root.Children.Add(new MyHtmlNode { Content = subTitle });
            }
        }
        else
        {
            root.Children.Add(new MyHtmlNode { Content = subTitle + text });
        }

        // The title is attached at most once.
        subTitle = String.Empty;
    }

    return subTitle;
}

/// <summary>
/// Assigns the next table id to <paramref name="table"/>, stores the result of
/// <c>HTMLTable.GetTable</c> in <c>TableList</c> under that id, and appends an
/// empty-content placeholder node carrying the id to <paramref name="root"/>.
/// </summary>
/// <param name="table">The table node to register.</param>
/// <param name="root">The node that receives the placeholder.</param>
private void AppendTableNode(HtmlNode table, MyHtmlNode root)
{
    var placeholder = new MyHtmlNode();
    placeholder.Content = String.Empty;

    TableId++;
    placeholder.TableId = TableId;

    TableList.Add(TableId, HTMLTable.GetTable(table, TableId));
    root.Children.Add(placeholder);
}