Esempio n. 1
0
    /// <summary>
    /// Loads an HTML file and converts the first <c>div[@type='pdf']</c> element into a
    /// two-level tree: one node per direct child <c>div</c>, each holding the flattened
    /// content produced by <c>AnlayzeParagraph</c>.
    /// </summary>
    /// <param name="htmlfile">Path of the HTML file to load.</param>
    /// <param name="TextFileName">
    /// Path of a companion text file. When the file exists, <c>AdjustItemList</c> and
    /// <c>AdjustTwoLine</c> are run against the tree with this path.
    /// </param>
    /// <returns>
    /// The populated root node. When the document has no <c>div[@type='pdf']</c> element an
    /// empty root is returned (its table and detail-item lists are not assigned).
    /// </returns>
    public MyRootHtmlNode Anlayze(string htmlfile, string TextFileName)
    {
        // Reset the shared counters/collections that the paragraph analysis fills in.
        TableId        = 0;
        DetailItemId   = 0;
        TableList      = new Dictionary <int, List <String> >();
        DetailItemList = new Dictionary <int, List <String> >();

        // Usually the first element is a DIV such as <div title="..." type="pdf">.
        var doc = new HtmlDocument();

        doc.Load(htmlfile);
        var node = doc.DocumentNode.SelectNodes("//div[@type='pdf']");
        var root = new MyRootHtmlNode();

        if (node == null || node.Count == 0)
        {
            return(root);
        }

        var pdfDiv = node[0];
        // Attributes["title"] is null when the attribute is absent, so test for it first.
        root.Content = pdfDiv.Attributes.Contains("title")
            ? pdfDiv.Attributes["title"].Value
            : String.Empty;

        // The second layer is expected to consist of paragraph DIVs; other nodes (#text) are skipped.
        foreach (var SecondLayerNode in pdfDiv.ChildNodes)
        {
            if (SecondLayerNode.Name != "div")
            {
                continue;
            }

            var title = SecondLayerNode.Attributes.Contains("title")
                ? SecondLayerNode.Attributes["title"].Value
                : SecondLayerNode.InnerText;

            var secondNode = new MyHtmlNode();
            secondNode.Content = title;
            AnlayzeParagraph(SecondLayerNode, secondNode);
            FindContentWithList(secondNode.Children);

            // Chain the children of this paragraph as siblings in both directions.
            for (int i = 0; i < secondNode.Children.Count - 1; i++)
            {
                secondNode.Children[i].NextBrother = secondNode.Children[i + 1];
            }
            for (int i = 1; i < secondNode.Children.Count; i++)
            {
                secondNode.Children[i].PreviewBrother = secondNode.Children[i - 1];
            }
            root.Children.Add(secondNode);
        }

        // Correct special characters on both levels.
        foreach (var x1 in root.Children)
        {
            x1.Content = CorrectHTML(x1.Content);
            foreach (var x2 in x1.Children)
            {
                x2.Content = CorrectHTML(x2.Content);
            }
        }

        // Inspect the last paragraph: when its final sentence ends with a date, split it into
        // "text before the date" and "date". Guarded so a document without any paragraph DIV
        // no longer throws.
        if (root.Children.Count > 0)
        {
            var LastParagrah = root.Children[root.Children.Count - 1];
            if (LastParagrah.Children.Count > 0)
            {
                // Example case noted by the original author: major contract 1232951.
                var lastIndex    = LastParagrah.Children.Count - 1;
                var LastSentence = LastParagrah.Children[lastIndex].Content;
                var sentence     = DateUtility.ConvertUpperToLower(LastSentence);
                var dateList     = DateUtility.GetDate(sentence);
                if (dateList.Count > 0)
                {
                    var strDate = dateList.Last();
                    if (!String.IsNullOrEmpty(strDate) &&
                        !String.IsNullOrEmpty(Utility.GetStringBefore(sentence, strDate)))
                    {
                        // The cut position is taken from the ORIGINAL sentence: four characters
                        // (presumably the year) before the last '年'. Compute and validate it
                        // before touching the tree so a sentence without a usable '年' is left
                        // unchanged instead of raising ArgumentOutOfRangeException after the
                        // node has already been removed.
                        var yearIndex = LastSentence.LastIndexOf('年');
                        if (yearIndex >= 4)
                        {
                            var beforeNode = new MyHtmlNode()
                            {
                                Content = LastSentence.Substring(0, yearIndex - 4)
                            };
                            var dateNode = new MyHtmlNode()
                            {
                                Content = strDate
                            };

                            // Replace the tail node with the two new nodes.
                            LastParagrah.Children.RemoveAt(lastIndex);
                            LastParagrah.Children.Add(beforeNode);
                            LastParagrah.Children.Add(dateNode);

                            // Re-link siblings so nothing keeps pointing at the removed node.
                            if (lastIndex > 0)
                            {
                                var previous = LastParagrah.Children[lastIndex - 1];
                                previous.NextBrother      = beforeNode;
                                beforeNode.PreviewBrother = previous;
                            }
                            beforeNode.NextBrother  = dateNode;
                            dateNode.PreviewBrother = beforeNode;
                        }
                    }
                }
            }
        }

        // Adjust the tree according to the text file, when one exists.
        if (File.Exists(TextFileName))
        {
            // Original author's note: not really needed for anything other than major contracts.
            AdjustItemList(root, TextFileName);
            AdjustTwoLine(root, TextFileName);
        }

        // Chain the top-level paragraphs as siblings in both directions.
        for (int i = 0; i < root.Children.Count - 1; i++)
        {
            root.Children[i].NextBrother = root.Children[i + 1];
        }
        for (int i = 1; i < root.Children.Count; i++)
        {
            root.Children[i].PreviewBrother = root.Children[i - 1];
        }

        // Position ids: paragraphs are 1-based; children use (paragraph * 100 + 1-based child index).
        for (int i = 0; i < root.Children.Count; i++)
        {
            root.Children[i].PositionId = i + 1;
            for (int j = 0; j < root.Children[i].Children.Count; j++)
            {
                root.Children[i].Children[j].PositionId = (i + 1) * 100 + j + 1;
            }
        }
        root.TableList      = TableList;
        root.DetailItemList = DetailItemList;
        return(root);
    }
Esempio n. 2
0
    /// <summary>
    /// Loads an HTML file and converts the first <c>div[@type='pdf']</c> element into a
    /// two-level tree: one node per direct child <c>div</c>, each holding the flattened
    /// content produced by <c>AnlayzeParagraph</c>. When a companion text file exists
    /// (path derived by replacing "html" with "txt" in <paramref name="htmlfile"/>),
    /// <c>Adjust</c> is run against the tree.
    /// </summary>
    /// <param name="htmlfile">Path of the HTML file to load.</param>
    /// <returns>
    /// The populated root node. When the document has no <c>div[@type='pdf']</c> element an
    /// empty root is returned.
    /// </returns>
    public static MyRootHtmlNode Anlayze(string htmlfile)
    {
        // Reset the shared counters/collections that the paragraph analysis fills in.
        TableId        = 0;
        DetailItemId   = 0;
        TableList      = new Dictionary <int, List <String> >();
        DetailItemList = new Dictionary <int, List <String> >();

        // Usually the first element is a DIV such as <div title="..." type="pdf">.
        var doc = new HtmlDocument();

        doc.Load(htmlfile);
        var node = doc.DocumentNode.SelectNodes("//div[@type='pdf']");
        var root = new MyRootHtmlNode();

        // SelectNodes yields null when nothing matches; return the empty root instead of
        // failing with a NullReferenceException.
        if (node == null || node.Count == 0)
        {
            return(root);
        }

        var pdfDiv = node[0];
        // Attributes["title"] is null when the attribute is absent, so test for it first.
        root.Content = pdfDiv.Attributes.Contains("title")
            ? pdfDiv.Attributes["title"].Value
            : "";

        // The second layer is expected to consist of paragraph DIVs; other nodes (#text) are skipped.
        foreach (var SecondLayerNode in pdfDiv.ChildNodes)
        {
            if (SecondLayerNode.Name != "div")
            {
                continue;
            }

            var title = SecondLayerNode.Attributes.Contains("title")
                ? SecondLayerNode.Attributes["title"].Value
                : SecondLayerNode.InnerText;

            var secondNode = new MyHtmlNode();
            secondNode.Content = title;
            AnlayzeParagraph(SecondLayerNode, secondNode);
            FindContentWithList(secondNode.Children);

            // Chain the children of this paragraph as siblings in both directions.
            for (int i = 0; i < secondNode.Children.Count - 1; i++)
            {
                secondNode.Children[i].NextBrother = secondNode.Children[i + 1];
            }
            for (int i = 1; i < secondNode.Children.Count; i++)
            {
                secondNode.Children[i].PreviewBrother = secondNode.Children[i - 1];
            }
            root.Children.Add(secondNode);
        }

        // Inspect the last paragraph: when its final sentence ends with a date, split it into
        // "text before the date" and "date". Guarded so a document without any paragraph DIV
        // no longer throws.
        if (root.Children.Count > 0)
        {
            var LastParagrah = root.Children[root.Children.Count - 1];
            if (LastParagrah.Children.Count > 0)
            {
                // Example case noted by the original author: major contract 1232951.
                var lastIndex    = LastParagrah.Children.Count - 1;
                var LastSentence = LastParagrah.Children[lastIndex].Content;
                var sentence     = Utility.ConvertUpperDateToLittle(LastSentence);
                var strDate      = RegularTool.GetDate(sentence);
                if (!String.IsNullOrEmpty(strDate) &&
                    !String.IsNullOrEmpty(Utility.GetStringBefore(sentence, strDate)))
                {
                    // The cut position is taken from the ORIGINAL sentence: four characters
                    // (presumably the year) before the last '年'. Compute and validate it
                    // before touching the tree so a sentence without a usable '年' is left
                    // unchanged instead of raising ArgumentOutOfRangeException after the
                    // node has already been removed.
                    var yearIndex = LastSentence.LastIndexOf('年');
                    if (yearIndex >= 4)
                    {
                        var beforeNode = new MyHtmlNode()
                        {
                            Content = LastSentence.Substring(0, yearIndex - 4)
                        };
                        var dateNode = new MyHtmlNode()
                        {
                            Content = strDate
                        };

                        // Replace the tail node with the two new nodes.
                        LastParagrah.Children.RemoveAt(lastIndex);
                        LastParagrah.Children.Add(beforeNode);
                        LastParagrah.Children.Add(dateNode);

                        // Re-link siblings so nothing keeps pointing at the removed node.
                        if (lastIndex > 0)
                        {
                            var previous = LastParagrah.Children[lastIndex - 1];
                            previous.NextBrother      = beforeNode;
                            beforeNode.PreviewBrother = previous;
                        }
                        beforeNode.NextBrother  = dateNode;
                        dateNode.PreviewBrother = beforeNode;
                    }
                }
            }
        }

        // Chain the top-level paragraphs as siblings in both directions.
        for (int i = 0; i < root.Children.Count - 1; i++)
        {
            root.Children[i].NextBrother = root.Children[i + 1];
        }
        for (int i = 1; i < root.Children.Count; i++)
        {
            root.Children[i].PreviewBrother = root.Children[i - 1];
        }
        root.TableList      = TableList;
        root.DetailItemList = DetailItemList;

        // NOTE(review): Replace rewrites EVERY "html" occurrence in the path (directory names
        // as well as the extension). Kept as-is because a sibling "txt" directory layout may
        // rely on it - confirm against the data layout.
        var txtfilename = htmlfile.Replace("html", "txt");

        if (File.Exists(txtfilename))
        {
            Adjust(root, txtfilename);
        }
        return(root);
    }
Esempio n. 3
0
    /// <summary>
    /// Flattens one paragraph element into child nodes of <paramref name="root"/>.
    /// The nesting of the source HTML below the second level is not trusted, so nested
    /// paragraphs are processed recursively but all results are appended to the same
    /// <paramref name="root"/>.
    /// </summary>
    /// <param name="paragraph">HTML element whose child nodes are scanned.</param>
    /// <param name="root">Node that receives the generated text and table children.</param>
    /// <param name="subTitle">
    /// Title of the enclosing paragraph. It is prepended to the first text child emitted
    /// from a content div (or emitted alone when that child has no text) and then cleared.
    /// </param>
    void AnlayzeParagraph(HtmlNode paragraph, MyHtmlNode root, String subTitle = "")
    {
        // A paragraph consisting of a single text node contributes only its sub-title, if any.
        var isLoneText = paragraph.ChildNodes.Count == 1 &&
                         paragraph.ChildNodes[0].Name == "#text";
        if (isLoneText && subTitle != String.Empty)
        {
            root.Children.Add(new MyHtmlNode()
            {
                Content = subTitle
            });
            return;
        }

        foreach (var element in paragraph.ChildNodes)
        {
            // Tables directly under the paragraph become table placeholder nodes.
            if (element.Name == "table")
            {
                AppendTableNode(element, root);
                continue;
            }

            // Everything else is ignored unless it is a div carrying a type attribute.
            if (element.Name != "div" || !element.Attributes.Contains("type"))
            {
                continue;
            }

            switch (element.Attributes["type"].Value)
            {
                case "content":
                    foreach (var part in element.ChildNodes)
                    {
                        if (part.Name == "table")
                        {
                            AppendTableNode(part, root);
                            continue;
                        }
                        if (part.Name == "hidden")
                        {
                            continue;
                        }

                        var text = Normalizer.Normalize(part.InnerText);
                        if (!String.IsNullOrEmpty(text))
                        {
                            root.Children.Add(new MyHtmlNode()
                            {
                                Content = subTitle + text
                            });
                        }
                        else if (subTitle != String.Empty)
                        {
                            // No text of its own: emit the pending sub-title by itself.
                            root.Children.Add(new MyHtmlNode()
                            {
                                Content = subTitle
                            });
                        }
                        // The sub-title is consumed by the first non-table, non-hidden child.
                        subTitle = String.Empty;
                    }
                    break;

                case "paragraph":
                    var nestedTitle = String.Empty;
                    if (element.Attributes.Contains("title"))
                    {
                        nestedTitle = Normalizer.Normalize(element.Attributes["title"].Value);
                    }
                    AnlayzeParagraph(element, root, nestedTitle);
                    break;
            }
        }
    }

    /// <summary>
    /// Allocates the next table id, stores the table contents returned by
    /// <c>HTMLTable.GetTable</c> in <c>TableList</c> under that id, and appends an
    /// empty-content placeholder node carrying the id to <paramref name="root"/>.
    /// </summary>
    /// <param name="table">The HTML table element.</param>
    /// <param name="root">Node that receives the placeholder child.</param>
    void AppendTableNode(HtmlNode table, MyHtmlNode root)
    {
        var placeholder = new MyHtmlNode();
        placeholder.Content = String.Empty;
        TableId++;
        placeholder.TableId = TableId;
        TableList.Add(TableId, HTMLTable.GetTable(table, TableId));
        root.Children.Add(placeholder);
    }
Esempio n. 4
0
 /// <summary>
 /// Flattens one paragraph element into child nodes of <paramref name="root"/>.
 /// The nesting of the source HTML below the second level is not trusted, so nested
 /// paragraphs are processed recursively but all text is appended to the same
 /// <paramref name="root"/>.
 /// </summary>
 /// <param name="paragraph">HTML element whose child nodes are scanned.</param>
 /// <param name="root">Node that receives the generated text children.</param>
 /// <param name="subTitle">
 /// Title of the enclosing paragraph. It is prepended to the first text content div
 /// (or emitted alone when that div has no text) and then cleared.
 /// </param>
 /// <remarks>
 /// NOTE(review): tables only advance <c>TableId</c> and are passed to <c>GetTable</c>;
 /// no placeholder node is attached to <paramref name="root"/>. The previous code built a
 /// node with the table id but never used it, so that dead allocation was removed -
 /// confirm whether a placeholder child was intended.
 /// </remarks>
 static void AnlayzeParagraph(HtmlNode paragraph, MyHtmlNode root, String subTitle = "")
 {
     foreach (var node in paragraph.ChildNodes)
     {
         // A table directly under the paragraph.
         if (node.Name == "table")
         {
             TableId++;
             GetTable(node);
             continue;
         }

         // Everything else is ignored unless it is a div carrying a type attribute.
         if (node.Name != "div" || !node.Attributes.Contains("type"))
         {
             continue;
         }

         var type = node.Attributes["type"].Value;
         if (type == "content")
         {
             // A content div with exactly three children whose middle one is a table is
             // treated as a table wrapper (presumably text / table / text - verify).
             if (node.ChildNodes.Count == 3 && node.ChildNodes[1].Name == "table")
             {
                 TableId++;
                 GetTable(node.ChildNodes[1]);
             }
             else
             {
                 var content = Normalizer.Normalize(node.InnerText);
                 if (!String.IsNullOrEmpty(content))
                 {
                     var s = new MyHtmlNode();
                     s.Content = subTitle + content;
                     root.Children.Add(s);
                 }
                 else if (subTitle != "")
                 {
                     // No text of its own: emit the pending sub-title by itself.
                     var s = new MyHtmlNode();
                     s.Content = subTitle;
                     root.Children.Add(s);
                 }
                 // The sub-title is consumed by the first non-table content div.
                 subTitle = "";
             }
         }
         else if (type == "paragraph")
         {
             var title = "";
             if (node.Attributes.Contains("title"))
             {
                 title = Normalizer.Normalize(node.Attributes["title"].Value);
             }
             AnlayzeParagraph(node, root, title);
         }
     }
 }