// Locate dates.
// Walks every sentence of every paragraph under `root` and records one entry per
// "<digits>年<digits>月<digits>日" date string that DateUtility.GetDate finds in it.
// Each entry carries the sentence's PositionId, the type tag "日期", and the value
// returned by DateUtility.GetWorkDay for the parsed year/month/day.
public static List<LocAndValue<DateTime>> LocateDate(HTMLEngine.MyRootHtmlNode root)
{
    var located = new List<LocAndValue<DateTime>>();
    foreach (var paragraph in root.Children)
    {
        foreach (var sentence in paragraph.Children)
        {
            // Normalize the sentence text and drop spaces before searching for dates.
            string text = DateUtility.ConvertUpperToLower(sentence.Content).Replace(" ", String.Empty);
            foreach (var dateText in DateUtility.GetDate(text))
            {
                var numbers = RegularTool.GetNumberList(dateText);
                string yearText = numbers[0];
                string monthText = numbers[1];
                string dayText = numbers[2];

                // Skip date strings whose components do not all parse as integers.
                if (!(int.TryParse(yearText, out int year)
                      && int.TryParse(monthText, out int month)
                      && int.TryParse(dayText, out int day)))
                {
                    continue;
                }

                located.Add(new LocAndValue<DateTime>()
                {
                    Loc = sentence.PositionId,
                    Type = "日期",
                    Value = DateUtility.GetWorkDay(year, month, day)
                });
            }
        }
    }
    return located;
}
// Strings that express a date range.
// After passing `str` through DateUtility.ConvertUpperToLower, returns every substring of
// the form "<full date>至<end>", where <end> is a full date, a month+day, or a day only.
// Results are grouped by end-pattern (full date first, then month+day, then day only)
// and, within each group, appear in match order.
public static List<string> GetRangeDate(string str)
{
    str = DateUtility.ConvertUpperToLower(str);

    const string fullDate = @"\d+年\d+月\d+日";
    string[] startPatterns = { fullDate };
    string[] connectors = { "至" };
    string[] endPatterns = { fullDate, @"\d+月\d+日", @"\d+日" };

    var ranges = new List<string>();
    foreach (var startPattern in startPatterns)
    {
        foreach (var connector in connectors)
        {
            foreach (var endPattern in endPatterns)
            {
                foreach (Match match in Regex.Matches(str, startPattern + connector + endPattern))
                {
                    if (string.IsNullOrEmpty(match.Value))
                    {
                        continue;
                    }
                    ranges.Add(match.Value);
                }
            }
        }
    }
    return ranges;
}
// Returns every "<digits>年<digits>月<digits>日" substring of `str`, in match order.
public static List<string> GetDate(string str)
{
    // Convert Chinese numerals to Arabic digits first so the \d-based pattern can match.
    str = DateUtility.ConvertUpperToLower(str);

    MatchCollection matches = Regex.Matches(str, @"\d+年\d+月\d+日");
    var dates = new List<string>();
    for (int i = 0; i < matches.Count; i++)
    {
        string value = matches[i].Value;
        if (string.IsNullOrEmpty(value))
        {
            continue;
        }
        dates.Add(value);
    }
    return dates;
}
// Locate date ranges.
// Walks every sentence of every paragraph under `root` and records one entry per range
// string returned by DateUtility.GetRangeDate. A range string yields 6, 5 or 4 numbers:
//   6 -> start y/m/d and end y/m/d
//   5 -> start y/m/d and end m/d (end reuses the start year)
//   4 -> start y/m/d and end d   (end reuses the start year and month)
// Any other count is ignored. A side whose numbers fail to parse keeps the default
// DateTime value, and the entry is still recorded.
public static List<LocAndValue<(DateTime StartDate, DateTime EndDate)>> LocateDateRange(HTMLEngine.MyRootHtmlNode root)
{
    // Parses the three components and maps them through DateUtility.GetWorkDay;
    // yields the default DateTime when any component is not an integer.
    DateTime ToWorkDay(string yearText, string monthText, string dayText)
    {
        if (int.TryParse(yearText, out int y)
            && int.TryParse(monthText, out int m)
            && int.TryParse(dayText, out int d))
        {
            return DateUtility.GetWorkDay(y, m, d);
        }
        return new DateTime();
    }

    var located = new List<LocAndValue<(DateTime StartDate, DateTime EndDate)>>();
    foreach (var paragraph in root.Children)
    {
        foreach (var sentence in paragraph.Children)
        {
            // Normalize the sentence text and drop spaces before searching for ranges.
            string text = DateUtility.ConvertUpperToLower(sentence.Content).Replace(" ", String.Empty);
            foreach (var rangeText in DateUtility.GetRangeDate(text))
            {
                var numbers = RegularTool.GetNumberList(rangeText);
                int count = numbers.Count;
                if (count < 4 || count > 6)
                {
                    continue;
                }

                string startYear = numbers[0];
                string startMonth = numbers[1];
                string startDay = numbers[2];

                // The end date inherits whichever leading components the text omitted.
                string endYear = startYear;
                string endMonth = startMonth;
                string endDay;
                if (count == 6)
                {
                    endYear = numbers[3];
                    endMonth = numbers[4];
                    endDay = numbers[5];
                }
                else if (count == 5)
                {
                    endMonth = numbers[3];
                    endDay = numbers[4];
                }
                else
                {
                    endDay = numbers[3];
                }

                DateTime start = ToWorkDay(startYear, startMonth, startDay);
                DateTime end = ToWorkDay(endYear, endMonth, endDay);

                located.Add(new LocAndValue<(DateTime StartDate, DateTime EndDate)>()
                {
                    Loc = sentence.PositionId,
                    Type = "日期范围",
                    Value = (start, end)
                });
            }
        }
    }
    return located;
}
/// <summary>
/// Analyzes an HTML file and builds a two-level node tree: the root, its paragraph
/// nodes (one per child div of the first div with type='pdf'), and each paragraph's
/// child nodes as produced by AnlayzeParagraph.
/// </summary>
/// <param name="htmlfile">Path of the HTML file to load.</param>
/// <param name="TextFileName">
/// Path of a companion text file. When the file exists, AdjustItemList and
/// AdjustTwoLine are applied to the tree.
/// </param>
/// <returns>
/// The populated root node. When the document has no div with type='pdf', a freshly
/// constructed root is returned without further processing.
/// </returns>
public MyRootHtmlNode Anlayze(string htmlfile, string TextFileName)
{
    TableId = 0;
    DetailItemId = 0;
    TableList = new Dictionary<int, List<String>>();
    DetailItemList = new Dictionary<int, List<String>>();

    // Usually the first element is a DIV such as <div title="(announcement title)" type="pdf">.
    var doc = new HtmlDocument();
    doc.Load(htmlfile);
    var node = doc.DocumentNode.SelectNodes("//div[@type='pdf']");
    var root = new MyRootHtmlNode();
    if (node == null)
    {
        return root;
    }
    // NOTE(review): this fails if the pdf div carries no title attribute;
    // presumably it is always present - confirm against the input files.
    root.Content = node[0].Attributes["title"].Value;

    // The second layer consists of paragraphs.
    foreach (var SecondLayerNode in node[0].ChildNodes)
    {
        // Skip non-div nodes such as #text.
        if (SecondLayerNode.Name == "div")
        {
            var title = String.Empty;
            if (SecondLayerNode.Attributes.Contains("title"))
            {
                title = SecondLayerNode.Attributes["title"].Value;
            }
            else
            {
                title = SecondLayerNode.InnerText;
            }
            var secondNode = new MyHtmlNode();
            secondNode.Content = title;
            AnlayzeParagraph(SecondLayerNode, secondNode);
            FindContentWithList(secondNode.Children);

            // Link each child to its neighbours.
            for (int i = 0; i < secondNode.Children.Count - 1; i++)
            {
                secondNode.Children[i].NextBrother = secondNode.Children[i + 1];
                secondNode.Children[i + 1].PreviewBrother = secondNode.Children[i];
            }
            root.Children.Add(secondNode);
        }
    }

    // Correct special characters.
    foreach (var x1 in root.Children)
    {
        x1.Content = CorrectHTML(x1.Content);
        foreach (var x2 in x1.Children)
        {
            x2.Content = CorrectHTML(x2.Content);
        }
    }

    // Inspect the last paragraph: when its final sentence ends with a date preceded by
    // other text, split it into a "text" node and a "date" node.
    // Guarded because a pdf div without child div paragraphs leaves root.Children empty.
    if (root.Children.Count > 0)
    {
        var LastParagrah = root.Children[root.Children.Count - 1];
        if (LastParagrah.Children.Count > 0)
        {
            var LastSentence = LastParagrah.Children[LastParagrah.Children.Count - 1].Content;
            var sentence = DateUtility.ConvertUpperToLower(LastSentence);
            var dateList = DateUtility.GetDate(sentence);
            if (dateList.Count > 0)
            {
                var strDate = dateList[dateList.Count - 1];
                if (!String.IsNullOrEmpty(strDate))
                {
                    var strBefore = Utility.GetStringBefore(sentence, strDate);
                    if (!String.IsNullOrEmpty(strBefore))
                    {
                        // The cut position is taken from the ORIGINAL sentence: four characters
                        // before the last '年' (presumably a four-character year precedes it).
                        // Compute it before mutating the list so a sentence with fewer than four
                        // characters before '年' is left untouched instead of throwing.
                        int cutIndex = LastSentence.LastIndexOf("年", StringComparison.Ordinal) - 4;
                        if (cutIndex >= 0)
                        {
                            var siblings = LastParagrah.Children;
                            // Remove the trailing sentence and replace it with the two parts.
                            siblings.RemoveAt(siblings.Count - 1);
                            strBefore = LastSentence.Substring(0, cutIndex);
                            siblings.Add(new MyHtmlNode() { Content = strBefore });
                            siblings.Add(new MyHtmlNode() { Content = strDate });

                            // Re-link neighbours so no node still points at the removed
                            // sentence and the two new nodes join the chain.
                            for (int i = 0; i < siblings.Count - 1; i++)
                            {
                                siblings[i].NextBrother = siblings[i + 1];
                                siblings[i + 1].PreviewBrother = siblings[i];
                            }
                        }
                    }
                }
            }
        }
    }

    // Adjust according to the text file content.
    if (File.Exists(TextFileName))
    {
        // Per the original author's note: not really needed outside major-contract documents.
        AdjustItemList(root, TextFileName);
        AdjustTwoLine(root, TextFileName);
    }

    // Link paragraphs to their neighbours.
    for (int i = 0; i < root.Children.Count - 1; i++)
    {
        root.Children[i].NextBrother = root.Children[i + 1];
        root.Children[i + 1].PreviewBrother = root.Children[i];
    }

    // Position ids: paragraphs are numbered from 1; each child is (paragraph id * 100) + its
    // 1-based index within the paragraph.
    for (int i = 0; i < root.Children.Count; i++)
    {
        root.Children[i].PositionId = i + 1;
        for (int j = 0; j < root.Children[i].Children.Count; j++)
        {
            root.Children[i].Children[j].PositionId = (i + 1) * 100 + j + 1;
        }
    }

    root.TableList = TableList;
    root.DetailItemList = DetailItemList;
    return root;
}