Пример #1
0
    /// <summary>
    /// Collects the dates found in every sentence node of every paragraph node under
    /// <paramref name="root"/>, together with the position id of the sentence they came from.
    /// </summary>
    /// <param name="root">Root node; its Children are iterated as paragraphs and their Children as sentences.</param>
    /// <returns>
    /// One entry per date string whose year, month and day parts all parse as integers.
    /// Loc is the sentence's PositionId, Type is "日期" and Value is whatever
    /// DateUtility.GetWorkDay returns for the parsed year/month/day.
    /// </returns>
    public static List <LocAndValue <DateTime> > LocateDate(HTMLEngine.MyRootHtmlNode root)
    {
        var list = new List <LocAndValue <DateTime> >();

        foreach (var paragrah in root.Children)
        {
            foreach (var sentence in paragrah.Children)
            {
                // Normalise the sentence text and strip spaces before searching for dates.
                var text = DateUtility.ConvertUpperToLower(sentence.Content).Replace(" ", String.Empty);

                foreach (var strDate in DateUtility.GetDate(text))
                {
                    var numbers = RegularTool.GetNumberList(strDate);

                    // A date needs year, month and day; skip anything shorter instead of
                    // indexing past the end of the list.
                    if (numbers.Count < 3)
                    {
                        continue;
                    }

                    int year;
                    int month;
                    int day;
                    bool parsed = int.TryParse(numbers[0], out year)
                                  && int.TryParse(numbers[1], out month)
                                  && int.TryParse(numbers[2], out day);
                    if (!parsed)
                    {
                        continue;
                    }

                    list.Add(new LocAndValue <DateTime>()
                    {
                        Loc   = sentence.PositionId,
                        Type  = "日期",
                        Value = DateUtility.GetWorkDay(year, month, day)
                    });
                }
            }
        }
        return(list);
    }
Пример #2
0
    /// <summary>
    /// Extracts the substrings of <paramref name="str"/> that express a date range: a full
    /// year/month/day date, the connector "至", and an end date written as year/month/day,
    /// month/day, or day only.
    /// </summary>
    /// <param name="str">Text to search; it is passed through DateUtility.ConvertUpperToLower first.</param>
    /// <returns>
    /// The matched range strings. Results are grouped by pattern combination in the order the
    /// patterns are declared, and within one combination in match order.
    /// </returns>
    public static List <string> GetRangeDate(string str)
    {
        string normalized = DateUtility.ConvertUpperToLower(str);

        string[] startPatterns = { @"\d+年\d+月\d+日" };
        string[] connectors    = { "至" };
        string[] endPatterns   = { @"\d+年\d+月\d+日", @"\d+月\d+日", @"\d+日" };

        var ranges = new List <string>();

        foreach (string head in startPatterns)
        {
            foreach (string connector in connectors)
            {
                foreach (string tail in endPatterns)
                {
                    // Pattern is "<start date><connector><end date>".
                    var rangeRegex = new Regex(head + connector + tail);
                    foreach (Match hit in rangeRegex.Matches(normalized))
                    {
                        if (string.IsNullOrEmpty(hit.Value))
                        {
                            continue;
                        }
                        ranges.Add(hit.Value);
                    }
                }
            }
        }

        return ranges;
    }
Пример #3
0
    /// <summary>
    /// Returns every substring of <paramref name="str"/> that matches the
    /// "digits年digits月digits日" date pattern, in match order.
    /// </summary>
    /// <param name="str">Text to search; it is passed through DateUtility.ConvertUpperToLower first.</param>
    /// <returns>The matched date strings; empty when nothing matches.</returns>
    public static List <string> GetDate(string str)
    {
        // Original author's note: converts Chinese numerals to Arabic numerals.
        string normalized = DateUtility.ConvertUpperToLower(str);

        var dateRegex = new Regex(@"\d+年\d+月\d+日");
        var dates     = new List <string>();

        foreach (Match hit in dateRegex.Matches(normalized))
        {
            if (string.IsNullOrEmpty(hit.Value))
            {
                continue;
            }
            dates.Add(hit.Value);
        }
        return dates;
    }
Пример #4
0
    /// <summary>
    /// Collects the date ranges found in every sentence node of every paragraph node under
    /// <paramref name="root"/>, together with the position id of the sentence they came from.
    /// </summary>
    /// <remarks>
    /// A range string yields 6, 5 or 4 numbers depending on how much of the end date is written:
    /// 6 = full end date, 5 = end month and day (year taken from the start date),
    /// 4 = end day only (year and month taken from the start date).
    /// Any other count is ignored. A side whose parts do not parse as integers is left at
    /// the default DateTime value and the entry is still added.
    /// </remarks>
    /// <param name="root">Root node; its Children are iterated as paragraphs and their Children as sentences.</param>
    /// <returns>
    /// One entry per recognised range. Loc is the sentence's PositionId, Type is "日期范围"
    /// and Value is the (start, end) pair.
    /// </returns>
    public static List <LocAndValue <(DateTime StartDate, DateTime EndDate)> > LocateDateRange(HTMLEngine.MyRootHtmlNode root)
    {
        var list = new List <LocAndValue <(DateTime StartDate, DateTime EndDate)> >();

        foreach (var paragrah in root.Children)
        {
            foreach (var sentence in paragrah.Children)
            {
                // Normalise the sentence text and strip spaces before searching for ranges.
                var text = DateUtility.ConvertUpperToLower(sentence.Content).Replace(" ", String.Empty);

                foreach (var strDate in DateUtility.GetRangeDate(text))
                {
                    var numbers = RegularTool.GetNumberList(strDate);

                    // Work out which numbers make up the end date; missing parts are
                    // inherited from the start date.
                    String endYear;
                    String endMonth;
                    String endDay;
                    switch (numbers.Count)
                    {
                        case 6:
                            endYear  = numbers[3];
                            endMonth = numbers[4];
                            endDay   = numbers[5];
                            break;
                        case 5:
                            endYear  = numbers[0];
                            endMonth = numbers[3];
                            endDay   = numbers[4];
                            break;
                        case 4:
                            endYear  = numbers[0];
                            endMonth = numbers[1];
                            endDay   = numbers[3];
                            break;
                        default:
                            // Not a shape this method understands.
                            continue;
                    }

                    DateTime start = ParseWorkDayOrDefault(numbers[0], numbers[1], numbers[2]);
                    DateTime end   = ParseWorkDayOrDefault(endYear, endMonth, endDay);

                    list.Add(new LocAndValue <(DateTime StartDate, DateTime EndDate)>()
                    {
                        Loc   = sentence.PositionId,
                        Type  = "日期范围",
                        Value = (start, end)
                    });
                }
            }
        }
        return(list);
    }

    /// <summary>
    /// Parses the three textual date parts and passes them to DateUtility.GetWorkDay;
    /// returns the default DateTime value when any part is not an integer.
    /// </summary>
    private static DateTime ParseWorkDayOrDefault(String yearText, String monthText, String dayText)
    {
        if (int.TryParse(yearText, out int year)
            && int.TryParse(monthText, out int month)
            && int.TryParse(dayText, out int day))
        {
            return DateUtility.GetWorkDay(year, month, day);
        }
        return new DateTime();
    }
Пример #5
0
    /// <summary>
    /// Loads an HTML file and builds a two-level node tree from the first div whose
    /// type attribute is 'pdf': each child div becomes a second-level node whose own
    /// children are filled in by AnlayzeParagraph.
    /// </summary>
    /// <remarks>
    /// After the tree is built the method corrects node contents with CorrectHTML, splits a
    /// trailing date off the last sentence of the last paragraph, optionally adjusts the tree
    /// from the companion text file, links sibling nodes, and assigns position ids
    /// (paragraph i gets i + 1; its j-th child gets (i + 1) * 100 + j + 1).
    /// TableId, DetailItemId, TableList and DetailItemList are reset at the start of each call.
    /// </remarks>
    /// <param name="htmlfile">Path of the HTML file to load.</param>
    /// <param name="TextFileName">
    /// Path of a companion text file. When it exists, AdjustItemList and AdjustTwoLine are
    /// applied to the tree.
    /// </param>
    /// <returns>
    /// The populated root node, or a freshly constructed root when the document contains no
    /// div with type='pdf'.
    /// </returns>
    public MyRootHtmlNode Anlayze(string htmlfile, string TextFileName)
    {
        // Reset per-document state.
        TableId        = 0;
        DetailItemId   = 0;
        TableList      = new Dictionary <int, List <String> >();
        DetailItemList = new Dictionary <int, List <String> >();

        // Generally the first element is a DIV, e.g. <div title="关于重大合同中标的公告" type="pdf">
        var doc = new HtmlDocument();

        doc.Load(htmlfile);
        var node = doc.DocumentNode.SelectNodes("//div[@type='pdf']");
        var root = new MyRootHtmlNode();

        if (node == null)
        {
            return(root);
        }

        // Tolerate a pdf div without a title attribute, the same way the second layer does.
        var pdfDiv = node[0];
        root.Content = pdfDiv.Attributes.Contains("title")
            ? pdfDiv.Attributes["title"].Value
            : String.Empty;

        // Everything on the second layer is treated as a paragraph.
        foreach (var SecondLayerNode in pdfDiv.ChildNodes)
        {
            // Skip #text (and any other non-div) nodes.
            if (SecondLayerNode.Name != "div")
            {
                continue;
            }

            // Prefer the title attribute; fall back to the node's inner text.
            var title = SecondLayerNode.Attributes.Contains("title")
                ? SecondLayerNode.Attributes["title"].Value
                : SecondLayerNode.InnerText;

            var secondNode = new MyHtmlNode();
            secondNode.Content = title;
            AnlayzeParagraph(SecondLayerNode, secondNode);
            FindContentWithList(secondNode.Children);

            // Link each sentence to its neighbours.
            for (int i = 0; i < secondNode.Children.Count - 1; i++)
            {
                secondNode.Children[i].NextBrother = secondNode.Children[i + 1];
            }
            for (int i = 1; i < secondNode.Children.Count; i++)
            {
                secondNode.Children[i].PreviewBrother = secondNode.Children[i - 1];
            }
            root.Children.Add(secondNode);
        }

        // Correct special characters in paragraph and sentence contents.
        foreach (var x1 in root.Children)
        {
            x1.Content = CorrectHTML(x1.Content);
            foreach (var x2 in x1.Children)
            {
                x2.Content = CorrectHTML(x2.Content);
            }
        }

        // Inspect the last paragraph. Skipped when there are no paragraphs at all,
        // because Last() throws on an empty sequence.
        if (root.Children.Count > 0)
        {
            var LastParagrah = root.Children.Last();

            if (LastParagrah.Children.Count > 0)
            {
                // Original author's reference case: major contract 1232951.
                var LastSentence = LastParagrah.Children.Last().Content;
                var sentence     = DateUtility.ConvertUpperToLower(LastSentence);
                var dateList     = DateUtility.GetDate(sentence);
                if (dateList.Count > 0)
                {
                    var strDate = dateList.Last();
                    if (!String.IsNullOrEmpty(strDate))
                    {
                        var strBefore = Utility.GetStringBefore(sentence, strDate);
                        if (!String.IsNullOrEmpty(strBefore))
                        {
                            // Take the prefix from the unconverted sentence: everything up to
                            // four characters before the last "年" (assumes a four-character
                            // year). When fewer than four characters precede it, keep the
                            // prefix already computed from the converted sentence instead of
                            // calling Substring with a negative length.
                            int cut = LastSentence.LastIndexOf("年", StringComparison.Ordinal) - 4;
                            if (cut >= 0)
                            {
                                strBefore = LastSentence.Substring(0, cut);
                            }

                            // Replace the last sentence with two nodes: leading text and date.
                            LastParagrah.Children.RemoveAt(LastParagrah.Children.Count - 1);
                            LastParagrah.Children.Add(new MyHtmlNode()
                            {
                                Content = strBefore
                            });
                            LastParagrah.Children.Add(new MyHtmlNode()
                            {
                                Content = strDate
                            });
                        }
                    }
                }
            }
        }

        // Adjust the tree according to the companion text file, when there is one.
        if (File.Exists(TextFileName))
        {
            // Original author's note: only really needed for major-contract documents.
            AdjustItemList(root, TextFileName);
            AdjustTwoLine(root, TextFileName);
        }

        // Link each paragraph to its neighbours.
        for (int i = 0; i < root.Children.Count - 1; i++)
        {
            root.Children[i].NextBrother = root.Children[i + 1];
        }
        for (int i = 1; i < root.Children.Count; i++)
        {
            root.Children[i].PreviewBrother = root.Children[i - 1];
        }

        // Assign 1-based position ids: paragraph = i + 1, sentence = paragraph * 100 + (j + 1).
        for (int i = 0; i < root.Children.Count; i++)
        {
            root.Children[i].PositionId = i + 1;
            for (int j = 0; j < root.Children[i].Children.Count; j++)
            {
                root.Children[i].Children[j].PositionId = (i + 1) * 100 + j + 1;
            }
        }
        root.TableList      = TableList;
        root.DetailItemList = DetailItemList;
        return(root);
    }