/// <summary>
/// Loads an announcement HTML file and converts it into a two-level node tree:
/// the root carries the title of the <c>div[@type='pdf']</c> element, each direct
/// <c>div</c> child becomes a paragraph node, and the flattened content of that
/// paragraph becomes its children.
/// </summary>
/// <param name="htmlfile">Path of the HTML file to load.</param>
/// <param name="TextFileName">
/// Path of a companion text file. When the file exists, the tree is passed to
/// <c>AdjustItemList</c> and <c>AdjustTwoLine</c> together with this path.
/// </param>
/// <returns>
/// The populated root node. An empty root is returned when the document has no
/// <c>div[@type='pdf']</c> element; a root without children is returned when that
/// element contains no <c>div</c> children.
/// </returns>
public MyRootHtmlNode Anlayze(string htmlfile, string TextFileName)
{
    // Reset the per-document counters and lookup tables.
    TableId = 0;
    DetailItemId = 0;
    TableList = new Dictionary<int, List<String>>();
    DetailItemList = new Dictionary<int, List<String>>();

    // The outermost element is normally a DIV such as
    // <div title="关于重大合同中标的公告" type="pdf">
    var doc = new HtmlDocument();
    doc.Load(htmlfile);
    var node = doc.DocumentNode.SelectNodes("//div[@type='pdf']");
    var root = new MyRootHtmlNode();
    if (node == null || node.Count == 0)
    {
        return root;
    }

    // A pdf div without a title attribute used to raise a null reference here.
    var pdfDiv = node[0];
    root.Content = pdfDiv.Attributes.Contains("title")
        ? pdfDiv.Attributes["title"].Value
        : String.Empty;

    // Second level: every div child is treated as one paragraph.
    foreach (var SecondLayerNode in pdfDiv.ChildNodes)
    {
        // Skip #text nodes and anything else that is not a div.
        if (SecondLayerNode.Name != "div")
        {
            continue;
        }

        var title = SecondLayerNode.Attributes.Contains("title")
            ? SecondLayerNode.Attributes["title"].Value
            : SecondLayerNode.InnerText;

        var secondNode = new MyHtmlNode();
        secondNode.Content = title;
        AnlayzeParagraph(SecondLayerNode, secondNode);
        FindContentWithList(secondNode.Children);

        // Link neighbouring children in both directions.
        for (int i = 0; i < secondNode.Children.Count - 1; i++)
        {
            secondNode.Children[i].NextBrother = secondNode.Children[i + 1];
            secondNode.Children[i + 1].PreviewBrother = secondNode.Children[i];
        }
        root.Children.Add(secondNode);
    }

    // Correct special characters in paragraph titles and their contents.
    foreach (var x1 in root.Children)
    {
        x1.Content = CorrectHTML(x1.Content);
        foreach (var x2 in x1.Children)
        {
            x2.Content = CorrectHTML(x2.Content);
        }
    }

    // Nothing below applies to a document without paragraphs; previously
    // root.Children.Last() threw InvalidOperationException in this case.
    if (root.Children.Count == 0)
    {
        root.TableList = TableList;
        root.DetailItemList = DetailItemList;
        return root;
    }

    // Inspect the last sentence of the last paragraph and split a trailing date
    // into a node of its own (sample document: major contract 1232951).
    var LastParagrah = root.Children[root.Children.Count - 1];
    if (LastParagrah.Children.Count > 0)
    {
        var lastChildren = LastParagrah.Children;
        var LastSentence = lastChildren[lastChildren.Count - 1].Content;
        var sentence = DateUtility.ConvertUpperToLower(LastSentence);
        var dateList = DateUtility.GetDate(sentence);
        if (dateList.Count > 0)
        {
            var strDate = dateList.Last();
            if (!String.IsNullOrEmpty(strDate))
            {
                var strBefore = Utility.GetStringBefore(sentence, strDate);
                if (!String.IsNullOrEmpty(strBefore))
                {
                    // The cut position is taken from the unconverted sentence:
                    // 4 characters (presumably the year digits) before the last "年".
                    // Work it out before touching the list so that a sentence
                    // without a usable "年" is left intact instead of throwing
                    // after its node has already been removed.
                    var cutLength = LastSentence.LastIndexOf("年", StringComparison.Ordinal) - 4;
                    if (cutLength >= 0)
                    {
                        // Replace the tail node by "text before the date" + "date".
                        lastChildren.RemoveAt(lastChildren.Count - 1);
                        strBefore = LastSentence.Substring(0, cutLength);
                        lastChildren.Add(new MyHtmlNode() { Content = strBefore });
                        lastChildren.Add(new MyHtmlNode() { Content = strDate });

                        // The sibling links built earlier still referenced the
                        // removed node and did not cover the two new ones.
                        for (int i = 0; i < lastChildren.Count - 1; i++)
                        {
                            lastChildren[i].NextBrother = lastChildren[i + 1];
                            lastChildren[i + 1].PreviewBrother = lastChildren[i];
                        }
                    }
                }
            }
        }
    }

    // Adjust the tree against the text file when one is available.
    if (File.Exists(TextFileName))
    {
        // Original author's note: apart from major contracts this is not really needed.
        AdjustItemList(root, TextFileName);
        AdjustTwoLine(root, TextFileName);
    }

    // Link the paragraphs in both directions (done after the adjustments above).
    for (int i = 0; i < root.Children.Count - 1; i++)
    {
        root.Children[i].NextBrother = root.Children[i + 1];
        root.Children[i + 1].PreviewBrother = root.Children[i];
    }

    // Position ids: paragraphs are numbered 1..n, their children (i + 1) * 100 + j + 1.
    for (int i = 0; i < root.Children.Count; i++)
    {
        root.Children[i].PositionId = i + 1;
        for (int j = 0; j < root.Children[i].Children.Count; j++)
        {
            root.Children[i].Children[j].PositionId = (i + 1) * 100 + j + 1;
        }
    }

    root.TableList = TableList;
    root.DetailItemList = DetailItemList;
    return root;
}
/// <summary>
/// Loads an announcement HTML file and converts it into a two-level node tree:
/// the root carries the title of the <c>div[@type='pdf']</c> element, each direct
/// <c>div</c> child becomes a paragraph node, and the flattened content of that
/// paragraph becomes its children.
/// </summary>
/// <param name="htmlfile">
/// Path of the HTML file to load. The companion text file path is derived from it
/// by replacing every occurrence of "html" with "txt".
/// </param>
/// <returns>
/// The populated root node. An empty root is returned when the document has no
/// <c>div[@type='pdf']</c> element; a root without children is returned when that
/// element contains no <c>div</c> children.
/// </returns>
public static MyRootHtmlNode Anlayze(string htmlfile)
{
    // Reset the per-document counters and lookup tables.
    TableId = 0;
    DetailItemId = 0;
    TableList = new Dictionary<int, List<String>>();
    DetailItemList = new Dictionary<int, List<String>>();

    // The outermost element is normally a DIV such as
    // <div title="关于重大合同中标的公告" type="pdf">
    var doc = new HtmlDocument();
    doc.Load(htmlfile);
    var node = doc.DocumentNode.SelectNodes("//div[@type='pdf']");
    var root = new MyRootHtmlNode();

    // SelectNodes yields null when nothing matches; indexing it used to crash.
    if (node == null || node.Count == 0)
    {
        return root;
    }

    // A pdf div without a title attribute used to raise a null reference here.
    var pdfDiv = node[0];
    root.Content = pdfDiv.Attributes.Contains("title")
        ? pdfDiv.Attributes["title"].Value
        : String.Empty;

    // Second level: every div child is treated as one paragraph.
    foreach (var SecondLayerNode in pdfDiv.ChildNodes)
    {
        // Skip #text nodes and anything else that is not a div.
        if (SecondLayerNode.Name != "div")
        {
            continue;
        }

        var title = SecondLayerNode.Attributes.Contains("title")
            ? SecondLayerNode.Attributes["title"].Value
            : SecondLayerNode.InnerText;

        var secondNode = new MyHtmlNode();
        secondNode.Content = title;
        AnlayzeParagraph(SecondLayerNode, secondNode);
        FindContentWithList(secondNode.Children);

        // Link neighbouring children in both directions.
        for (int i = 0; i < secondNode.Children.Count - 1; i++)
        {
            secondNode.Children[i].NextBrother = secondNode.Children[i + 1];
            secondNode.Children[i + 1].PreviewBrother = secondNode.Children[i];
        }
        root.Children.Add(secondNode);
    }

    // Nothing below applies to a document without paragraphs; previously
    // root.Children.Last() threw InvalidOperationException in this case.
    if (root.Children.Count == 0)
    {
        root.TableList = TableList;
        root.DetailItemList = DetailItemList;
        return root;
    }

    // Inspect the last sentence of the last paragraph and split a trailing date
    // into a node of its own (sample document: major contract 1232951).
    var LastParagrah = root.Children[root.Children.Count - 1];
    if (LastParagrah.Children.Count > 0)
    {
        var lastChildren = LastParagrah.Children;
        var LastSentence = lastChildren[lastChildren.Count - 1].Content;
        var sentence = Utility.ConvertUpperDateToLittle(LastSentence);
        var strDate = RegularTool.GetDate(sentence);
        if (!String.IsNullOrEmpty(strDate))
        {
            var strBefore = Utility.GetStringBefore(sentence, strDate);
            if (!String.IsNullOrEmpty(strBefore))
            {
                // The cut position is taken from the unconverted sentence:
                // 4 characters (presumably the year digits) before the last "年".
                // Work it out before touching the list so that a sentence
                // without a usable "年" is left intact instead of throwing
                // after its node has already been removed.
                var cutLength = LastSentence.LastIndexOf("年", StringComparison.Ordinal) - 4;
                if (cutLength >= 0)
                {
                    // Replace the tail node by "text before the date" + "date".
                    lastChildren.RemoveAt(lastChildren.Count - 1);
                    strBefore = LastSentence.Substring(0, cutLength);
                    lastChildren.Add(new MyHtmlNode() { Content = strBefore });
                    lastChildren.Add(new MyHtmlNode() { Content = strDate });

                    // The sibling links built earlier still referenced the
                    // removed node and did not cover the two new ones.
                    for (int i = 0; i < lastChildren.Count - 1; i++)
                    {
                        lastChildren[i].NextBrother = lastChildren[i + 1];
                        lastChildren[i + 1].PreviewBrother = lastChildren[i];
                    }
                }
            }
        }
    }

    // Link the paragraphs in both directions.
    for (int i = 0; i < root.Children.Count - 1; i++)
    {
        root.Children[i].NextBrother = root.Children[i + 1];
        root.Children[i + 1].PreviewBrother = root.Children[i];
    }

    root.TableList = TableList;
    root.DetailItemList = DetailItemList;

    // NOTE(review): Replace swaps every "html" in the path, not only the extension.
    // That may be intentional for a .../html/x.html -> .../txt/x.txt layout, so it
    // is kept unchanged — confirm against the data directory structure.
    var txtfilename = htmlfile.Replace("html", "txt");
    if (File.Exists(txtfilename))
    {
        Adjust(root, txtfilename);
    }
    return root;
}
/// <summary>
/// Flattens the content of one paragraph element into child nodes of
/// <paramref name="root"/>: text content becomes text nodes, tables become
/// table nodes, and nested paragraph divs are processed recursively into the
/// same flat list.
/// </summary>
/// <param name="paragraph">HTML element whose child nodes are scanned.</param>
/// <param name="root">Node that receives every generated child node.</param>
/// <param name="subTitle">
/// Title text that is prefixed to the next text content emitted from a content
/// div (or emitted on its own when that content is empty); it is cleared after
/// each processed content child.
/// </param>
void AnlayzeParagraph(HtmlNode paragraph, MyHtmlNode root, String subTitle = "")
{
    // The nesting of the source HTML below the second level cannot be trusted,
    // so everything under a second-level paragraph is kept in one flat list.

    // A paragraph consisting of a single text node contributes only its title.
    var holdsOnlyText = paragraph.ChildNodes.Count == 1
                        && paragraph.ChildNodes[0].Name == "#text";
    if (holdsOnlyText && subTitle != String.Empty)
    {
        root.Children.Add(new MyHtmlNode { Content = subTitle });
        return;
    }

    foreach (var element in paragraph.ChildNodes)
    {
        if (element.Name == "table")
        {
            AppendTableNode(element, root);
            continue;
        }

        // Only divs that declare a type are of interest from here on.
        if (element.Name != "div" || !element.Attributes.Contains("type"))
        {
            continue;
        }

        switch (element.Attributes["type"].Value)
        {
            case "content":
            {
                foreach (var inner in element.ChildNodes)
                {
                    if (inner.Name == "table")
                    {
                        AppendTableNode(inner, root);
                        continue;
                    }
                    if (inner.Name == "hidden")
                    {
                        continue;
                    }

                    var text = Normalizer.Normalize(inner.InnerText);
                    if (!String.IsNullOrEmpty(text))
                    {
                        root.Children.Add(new MyHtmlNode { Content = subTitle + text });
                    }
                    else if (subTitle != String.Empty)
                    {
                        // No text of its own: emit the pending title by itself.
                        root.Children.Add(new MyHtmlNode { Content = subTitle });
                    }

                    // The title is consumed by the first processed content child.
                    subTitle = String.Empty;
                }
                break;
            }
            case "paragraph":
            {
                var heading = String.Empty;
                if (element.Attributes.Contains("title"))
                {
                    heading = Normalizer.Normalize(element.Attributes["title"].Value);
                }
                AnlayzeParagraph(element, root, heading);
                break;
            }
        }
    }
}

/// <summary>
/// Assigns the next table id to <paramref name="tableElement"/>, stores the result of
/// <c>HTMLTable.GetTable</c> in <c>TableList</c> under that id, and appends an
/// empty-content node carrying the id to <paramref name="parent"/>.
/// </summary>
private void AppendTableNode(HtmlNode tableElement, MyHtmlNode parent)
{
    var tableNode = new MyHtmlNode { Content = String.Empty, TableId = ++TableId };
    var tableContent = HTMLTable.GetTable(tableElement, TableId);
    TableList.Add(TableId, tableContent);
    parent.Children.Add(tableNode);
}
/// <summary>
/// Flattens the content of one paragraph element into child nodes of
/// <paramref name="root"/>. Text of content divs becomes text nodes, nested
/// paragraph divs are processed recursively into the same flat list, and table
/// elements advance <c>TableId</c> and are handed to <c>GetTable</c>.
/// </summary>
/// <param name="paragraph">HTML element whose child nodes are scanned.</param>
/// <param name="root">Node that receives every generated text node.</param>
/// <param name="subTitle">
/// Title text that is prefixed to the next text content emitted (or emitted on
/// its own when that content is empty); it is cleared after each non-table
/// content div.
/// </param>
static void AnlayzeParagraph(HtmlNode paragraph, MyHtmlNode root, String subTitle = "")
{
    // The nesting of the source HTML below the second level cannot be trusted,
    // so everything under a second-level paragraph is kept in one flat list.
    //
    // NOTE(review): table elements never add a node to root.Children here. The
    // previous code built a MyHtmlNode with the table id and then discarded it;
    // that dead allocation has been removed. Confirm whether a placeholder node
    // should be attached for tables.
    foreach (var node in paragraph.ChildNodes)
    {
        if (node.Name == "table")
        {
            TableId++;
            GetTable(node);
            continue;
        }

        // Only divs that declare a type are of interest from here on.
        if (node.Name != "div" || !node.Attributes.Contains("type"))
        {
            continue;
        }

        var type = node.Attributes["type"].Value;
        if (type == "content")
        {
            // A content div holding exactly three child nodes with a table in
            // the middle is treated as a table wrapper.
            if (node.ChildNodes.Count == 3 && node.ChildNodes[1].Name == "table")
            {
                TableId++;
                GetTable(node.ChildNodes[1]);
            }
            else
            {
                var content = Normalizer.Normalize(node.InnerText);
                if (!String.IsNullOrEmpty(content))
                {
                    root.Children.Add(new MyHtmlNode { Content = subTitle + content });
                }
                else if (subTitle != "")
                {
                    // No text of its own: emit the pending title by itself.
                    root.Children.Add(new MyHtmlNode { Content = subTitle });
                }

                // The title is consumed by the first non-table content div.
                subTitle = "";
            }
        }
        else if (type == "paragraph")
        {
            var title = "";
            if (node.Attributes.Contains("title"))
            {
                title = Normalizer.Normalize(node.Attributes["title"].Value);
            }
            AnlayzeParagraph(node, root, title);
        }
    }
}