Exemple #1
0
        /// <summary>
        /// Appends one record to the local <c>myTable</c>, copying its column values
        /// from <paramref name="conventionRow"/>.
        /// </summary>
        /// <param name="conventionRow">Record whose properties supply the column values of the new row.</param>
        public void writeRow_local(ConventionRow conventionRow)
        {
            // NewRow() produces a row with myTable's schema; it is not part of the table until Rows.Add.
            DataRow newRow = myTable.NewRow();

            newRow["Guid"] = conventionRow.Guid;
            newRow["Depth"] = conventionRow.Depth;
            newRow["ParentNodeGuid"] = conventionRow.ParentNodeGuid;
            newRow["Category"] = conventionRow.Category;

            newRow["TitleCn"] = conventionRow.TitleCn;
            newRow["TitleEn"] = conventionRow.TitleEn;
            newRow["TagCn"] = conventionRow.TagCn;

            newRow["Display"] = conventionRow.Display;
            newRow["SequenceNumber"] = conventionRow.SequenceNumber;

            newRow["IDFolder"] = conventionRow.IDFolder;
            newRow["TitleCnFolder"] = conventionRow.TitleCnFolder;
            newRow["TitleEnFolder"] = conventionRow.TitleEnFolder;

            newRow["Purposes"] = conventionRow.Purposes;
            newRow["ShortTitleCn"] = conventionRow.ShortTitleCn;
            newRow["ShortTitleEn"] = conventionRow.ShortTitleEn;
            newRow["ConventionTypeKey"] = conventionRow.ConventionTypeKey;

            // The TagEn, QueryGuid, Note, LastEditDate and CreationDate columns
            // are not assigned by this method.
            myTable.Rows.Add(newRow);
        }
Exemple #2
0
 /// <summary>
 /// Creates a child record under the given parent record.
 /// </summary>
 /// <param name="parentConventionRow">Parent record; its Guid, Depth, IDFolder and TitleCnFolder are read.</param>
 /// <param name="titleCn">Title text; stored in the Chinese, English and both short title fields.</param>
 /// <param name="sequenceNumber">Value stored as the record's sequence number.</param>
 /// <param name="category">Category of the record; stored as its integer value.</param>
 /// <param name="tagcn">Optional value stored as the Chinese tag (defaults to null).</param>
 public ConventionRow(ConventionRow parentConventionRow, string titleCn,
                      int sequenceNumber, ConventionOptions.CATEGORY category, string tagcn = null)
 {
     // All four title fields start out with the same text.
     _ShortTitleEn = titleCn;
     _ShortTitleCn = titleCn;
     _TitleEn = titleCn;
     _TitleCn = titleCn;

     _Category = (int)category;
     _Guid = Guid.NewGuid();

     // NOTE(review): ConventionRow_Init is defined elsewhere; it is called at this exact
     // point (after title/category/guid, before the parent-derived fields) as in the original.
     ConventionRow_Init();

     _ParentNodeGuid = parentConventionRow.Guid;
     _SequenceNumber = sequenceNumber;
     _Depth = parentConventionRow.Depth + 1;

     // Folder paths extend the parent's path: "#" joins ids, "#@`" joins titles.
     _IDFolder = parentConventionRow.IDFolder + "#" + this.Guid;

     // Both title folders are derived from the parent's TitleCnFolder.
     string titleFolder = parentConventionRow.TitleCnFolder + "#@`" + this.TitleCn;
     _TitleEnFolder = titleFolder;
     _TitleCnFolder = titleFolder;

     _TagCn = tagcn;
 }
Exemple #3
0
        /// <summary>
        /// Click handler for the add button. Builds a <c>ConventionRow</c> from the form's
        /// text boxes, writes it through <c>SQLUtils</c>, and on success copies this record's
        /// guid / depth / folder paths into the parent fields of <c>frm_WordRead</c>.
        /// Any failure is shown in a message box and in the status label.
        /// </summary>
        /// <param name="sender">Event source (not used).</param>
        /// <param name="e">Event data (not used).</param>
        private void btnAdd_Click(object sender, EventArgs e)
        {
            ConventionOptions.CATEGORY isCategory = cbkIsCategory.Checked
                ? ConventionOptions.CATEGORY.IS_CATEGORY
                : ConventionOptions.CATEGORY.IS_CONTENT;

            try
            {
                // An empty guid box means "generate one" and echo it back to the form.
                // Parsing happens inside the try so malformed guid text is reported like
                // every other input error instead of escaping as an unhandled exception.
                Guid guid;
                if (tbGuid.Text == string.Empty)
                {
                    guid        = Guid.NewGuid();
                    tbGuid.Text = guid.ToString();
                }
                else
                {
                    guid = new Guid(this.tbGuid.Text);
                }

                // Title path of the new record. Computed once so the value stored in the row
                // and the value handed to frm_WordRead use the same "#@`" separator.
                string titleCnFolder = this.tbParentTitleCNFolder.Text + "#@`" + this.tbTitle.Text;

                ConventionRow tempRow = new ConventionRow(
                    guid,
                    new Guid(this.tbParentGuid.Text),
                    int.Parse(this.tbParentDepth.Text) + 1,
                    this.tbTitle.Text,
                    int.Parse(this.tbSNum.Text),
                    isCategory,
                    this.tbParentIDfloder.Text + "#" + guid,
                    titleCnFolder,
                    this.tbTag.Text);

                SQLUtils sqlUtils = SQLUtils.getInstance();
                sqlUtils.writeRow_local(tempRow);
                sqlUtils.updateTable();
                this.toolStripStatusLabel1.Text = "添加成功";

                // Copy this record's values into frm_WordRead's parent fields.
                this.frm_WordRead.tbParentDepth.Text         = this.tbParentDepth.Text;
                this.frm_WordRead.tbParentGuid.Text          = this.tbGuid.Text;
                this.frm_WordRead.tbParentIDfolder.Text      = this.tbParentIDfloder.Text + "#" + this.tbGuid.Text;
                this.frm_WordRead.tbParentTitleCnFolder.Text = titleCnFolder;
            }
            catch (Exception err)
            {
                MessageBox.Show(err.Message);
                this.toolStripStatusLabel1.Text = "添加失败" + err.Message;
            }
        }
Exemple #4
0
        /// <summary>
        ///  附录需要在word里按目录要求,手动改为一级或者二级标题的格式
        /// </summary>
        /// <param name="rootConvention"></param>
        public ReturnInfo ReadHtml(ConventionRow rootConvention)
        {
            HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
            doc.Load(htmlPath);
            HtmlNode                 htmlRootNode = doc.DocumentNode;
            HtmlNodeCollection       title1Nodes_init;
            HtmlNodeCollection       title2Nodes_init;
            List <string>            str_contentList   = new List <string>();
            List <string>            str_titleList     = new List <string>();
            List <string>            str_title1List    = new List <string>();
            List <string>            str_title2List    = new List <string>();
            HtmlNodeCollection       contentNodes      = new HtmlNodeCollection(htmlRootNode.Clone());
            Dictionary <int, string> dic_title1Content = new Dictionary <int, string>();
            HtmlNodeCollection       titleNodes        = new HtmlNodeCollection(htmlRootNode.Clone());
            HtmlNodeCollection       title1Nodes       = new HtmlNodeCollection(htmlRootNode.Clone());
            HtmlNodeCollection       title2Nodes       = new HtmlNodeCollection(htmlRootNode.Clone());
            HtmlNodeCollection       ftNoteRefnodes    = new HtmlNodeCollection(htmlRootNode.Clone());
            string htmlTxt = htmlRootNode.InnerHtml;

            //正文识别标题

            #region (废弃选项:一级标题粗体识别)
            //if (method == ReadMethod.TITLE1_BOLD)
            //{
            //    //一级标题
            //    title1Nodes_init = htmlRootNode.SelectNodes(title1_select);
            //    //二级标题可能所在span
            //    title2Nodes_init = htmlRootNode.SelectNodes(title2_select);
            //    #region 找出一级标题,HtmlNode保存在title1Nodes,文本存储在 str_title1List

            //    if (title1Nodes_init != null)
            //    {
            //        for (int i = 0; i < title1Nodes_init.Count; i++)
            //        {
            //            if ((title1Nodes_init[i].ParentNode.Name == "p" && title1Nodes_init[i].ParentNode.ParentNode.Name == "div" && title1Nodes_init[i].HasChildNodes)
            //                || (title1Nodes_init[i].Name == "h1" && title1Nodes_init[i].ParentNode.Name == "div")
            //                || (title1Nodes_init[i].Name == "h2" && title1Nodes_init[i].ParentNode.Name == "div")
            //                || (title1Nodes_init[i].ParentNode.Name == "a" && title1Nodes_init[i].ParentNode.ParentNode.Name == "p")
            //                )
            //            {
            //                foreach (var child in title1Nodes_init[i].DescendantsAndSelf())
            //                {
            //                    if (child.Name == "span" && child.HasAttributes)
            //                    {
            //                        foreach (var atbt in child.Attributes)
            //                        {
            //                            if (atbt.Name == "style")//&& atbt.Value== "font-size:15.0pt;font-family:黑体")
            //                            {
            //                                if ((title1Nodes_init[i].ParentNode.InnerText.Contains("第") && title1Nodes_init[i].ParentNode.InnerText.Contains("章"))
            //                                    )
            //                                {
            //                                    if (title1Nodes_init[i].ParentNode.ParentNode.Name == "p")
            //                                    {
            //                                        title1Nodes.Add(title1Nodes_init[i].ParentNode.ParentNode);
            //                                        str_title1List.Add(title1Nodes_init[i].ParentNode.ParentNode.InnerText.Replace("&nbsp;", " ").Replace("\r\n", ""));
            //                                    }
            //                                    else if (title1Nodes_init[i].ParentNode.Name == "p")
            //                                    {
            //                                        title1Nodes.Add(title1Nodes_init[i].ParentNode);
            //                                        str_title1List.Add(title1Nodes_init[i].ParentNode.InnerText.Trim().Replace("&nbsp;", " ").Replace("\r\n", ""));
            //                                    }
            //                                    else if (title1Nodes_init[i].Name == "h" || title1Nodes_init[i].Name == "h1" || title1Nodes_init[i].Name == "h2")
            //                                    {
            //                                        title1Nodes.Add(title1Nodes_init[i]);
            //                                        str_title1List.Add(title1Nodes_init[i].InnerText.Trim().Replace("&nbsp;", " ").Replace("\r\n", ""));
            //                                    }
            //                                }
            //                                break;
            //                            }
            //                        }
            //                        break;
            //                    }
            //                }
            //            }
            //        }
            //    }


            //#region 找出二级小节标题,HtmlNode保存在title2Nodes ,文本存储在str_title2List
            ////span所在的几种情形:div->p->a->span   div->p->span  div->h1->span

            //if (title2Nodes_init != null)
            //{
            //    for (int i = 0; i < title2Nodes_init.Count; i++)
            //    {
            //        //标题span存在的情形1
            //        if (title2Nodes_init[i].ParentNode.Name == "a" && title2Nodes_init[i].ParentNode.ParentNode.Name == "p")
            //        {
            //            //避免添加重复的部分
            //            if ((i == 0) || (i > 0 && title2Nodes_init[i].ParentNode.ParentNode.Line != title2Nodes_init[i - 1].ParentNode.ParentNode.Line))
            //            {
            //                title2Nodes.Add(title2Nodes_init[i].ParentNode.ParentNode);
            //                str_title2List.Add(title2Nodes_init[i].ParentNode.ParentNode.InnerText.Trim().Replace("&nbsp;", " ").Replace("\r\n", ""));
            //            }
            //        }
            //        //标题span存在的情形2、3
            //        else if ((title2Nodes_init[i].ParentNode.Name == "p" && title2Nodes_init[i].ParentNode.ParentNode.Name == "div")
            //            || (title2Nodes_init[i].ParentNode.Name == "h1" && title2Nodes_init[i].ParentNode.ParentNode.Name == "div"))
            //        {
            //            //避免添加重复的部分
            //            if ((i == 0) || (i > 0 && title2Nodes_init[i].ParentNode.Line != title2Nodes_init[i - 1].ParentNode.Line))
            //            {
            //                title2Nodes.Add(title2Nodes_init[i].ParentNode);
            //                str_title2List.Add(title2Nodes_init[i].ParentNode.InnerText.Trim().Replace("&nbsp;", " ").Replace("\r\n", ""));
            //            }
            //        }
            //    }
            //    for (int i = 0; i < title2Nodes.Count; i++)
            //    {
            //        if ((i > 0 && title2Nodes[i].Line == title2Nodes[i - 1].Line))
            //        {
            //            str_title2List.RemoveAt(i);
            //            title2Nodes.RemoveAt(i);
            //        }
            //    }
            //}
            //#endregion
            //
            //}
            #endregion

            #region  项1:pdf转为图片的word文件后,通过p节点class属性提取标题
            if (method == ReadMethod.TITLE_CLASS)
            {
                //HtmlNodeCollection title1Nodes_tmp = new HtmlNodeCollection(htmlRootNode.Clone());
                title1Nodes_init = htmlRootNode.SelectNodes(@"//p[@class=1]");
                title2Nodes_init = htmlRootNode.SelectNodes(@"//p[@class=2]");
                for (int i = 0; i < title1Nodes_init.Count; i++)
                {
                    if (title1Nodes_init[i].InnerText.Replace("&nbsp;", "").Trim() != string.Empty)
                    {
                        str_title1List.Add(title1Nodes_init[i].InnerText.Trim().Replace("&nbsp;", " ").Replace("\r\n", ""));
                        title1Nodes.Add(title1Nodes_init[i]);
                    }
                }
                for (int i = 0; i < title2Nodes_init.Count; i++)
                {
                    if (title2Nodes_init[i].InnerText.Replace("&nbsp;", "").Trim() != string.Empty)
                    {
                        str_title2List.Add(title2Nodes_init[i].InnerText.Trim().Replace("&nbsp;", " ").Replace("\r\n", ""));
                        title2Nodes.Add(title2Nodes_init[i]);
                    }
                }
            }
            #endregion

            #region  项2:标题中Span 标签 Style属性识别
            else if (method == ReadMethod.TITLE_SPANSTYLE)
            {
                HtmlNodeCollection title1Nodes_tmp = new HtmlNodeCollection(htmlRootNode.Clone());
                #region 提取一级标题节点,生成一级目录的节点集合title1Nodes,和字符串集合str_title1List
                title1Nodes_init = htmlRootNode.SelectNodes(@"//p");
                if (title1Nodes_init != null)
                {
                    for (int i = 0; i < title1Nodes_init.Count; i++)
                    {
                        string str_style = title1Nodes_init[i].InnerHtml.Replace("\r\n", "");
                        bool   condition = str_style.Contains(title1_select);
                        //bool condition = str_style.Contains(title1_select)
                        //    && (title1Nodes_init[i].InnerText.Substring(0, 1) == "第")
                        //|| title1Nodes_init[i].InnerText.Substring(0, 1) == "附";
                        if (RecogOptions.title1_has_zitizihao)
                        {
                            string str_style_zihao = title1_select.Substring(0, title1_select.IndexOf(';'));
                            string str_style_ziti  = title1_select.Substring(title1_select.IndexOf(';') + 1);
                            condition = str_style.Contains(str_style_zihao) && str_style.Contains(str_style_ziti);
                        }
                        if (condition)
                        {
                            foreach (var match in title1Nodes_init[i].DescendantsAndSelf())
                            {
                                if (RecogOptions.title1_child == 0 && match.Name == "p")
                                {
                                    title1Nodes_tmp.Add(title1Nodes_init[i]);
                                    break;
                                }
                                if (RecogOptions.title1_child == 1 && match.Name == "b")
                                {
                                    title1Nodes_tmp.Add(title1Nodes_init[i]);
                                    break;
                                }
                                if (RecogOptions.title1_child == 2 && match.Name == "a")
                                {
                                    title1Nodes_tmp.Add(title1Nodes_init[i]);
                                    break;
                                }
                            }
                        }
                    }
                    for (int i = 0; i < title1Nodes_tmp.Count; i++)
                    {
                        if (title1Nodes_tmp[i].InnerText.Replace("&nbsp;", "").Trim() != string.Empty &&
                            (i == 0 || (i > 0 && title1Nodes_tmp[i].Line != title1Nodes_tmp[i - 1].Line)))
                        {
                            str_title1List.Add(title1Nodes_tmp[i].InnerText.Trim().Replace("&nbsp;", " ").Replace("\r\n", ""));
                            title1Nodes.Add(title1Nodes_tmp[i]);
                        }
                    }
                }
                #endregion

                #region 提取二级标题节点,生成二级目录的节点集合title2Nodes,和字符串集合str_title2List

                HtmlNodeCollection tempNodes = new HtmlNodeCollection(htmlRootNode.Clone());
                if (RecogOptions.title2RecogMethod == 1)
                {
                    title2Nodes_init = htmlRootNode.SelectNodes(@"//p");
                    if (title2Nodes_init != null)
                    {
                        for (int i = 0; i < title2Nodes_init.Count; i++)
                        {
                            string          str_tmp = title2Nodes_init[i].InnerText.Replace("&nbsp;", " ");
                            string          regExp  = Patterns.title2_x_dot_x_XXX;
                            Regex           reg     = new Regex(regExp, RegexOptions.Multiline);
                            MatchCollection matches = reg.Matches(str_tmp);
                            if (matches.Count > 0)
                            {
                                string tmp = matches[0].Value;
                                //有些文档中形如“1 XXX”的不是二级标题,需要手动在程序中修改
                                //if(tmp.Substring(0, 1) == "第" || tmp.Substring(0, 1) == "附" || tmp.Substring(0, 1) == "修")
                                if (!tmp.Contains("。") //&& tmp.Substring(tmp.Length - 1, 1) != ":" && !tmp.Contains(";")
                                    //&&!tmp.Contains("p"))
                                    )
                                //tmp.Length>0&&(tmp.Substring(0,1)=="第"|| tmp.Substring(0, 1) == "附"|| tmp.Substring(0, 1) == "修"))
                                {
                                    foreach (var match in title2Nodes_init[i].DescendantsAndSelf())
                                    {
                                        if (RecogOptions.title2_child == 0 && match.Name == "p")
                                        {
                                            tempNodes.Add(title2Nodes_init[i]);
                                            break;
                                        }
                                        if (RecogOptions.title2_child == 1 && match.Name == "b")
                                        {
                                            tempNodes.Add(title2Nodes_init[i]);
                                            break;
                                        }
                                        if (RecogOptions.title2_child == 2 && match.Name == "a")
                                        {
                                            tempNodes.Add(title2Nodes_init[i]);
                                            break;
                                        }
                                    }
                                }
                            }
                        }
                    }
                }
                if (RecogOptions.title2RecogMethod == 0)
                {
                    title2Nodes_init = htmlRootNode.SelectNodes(@"//span[@style]");
                    if (title2Nodes_init != null)
                    {
                        for (int i = 0; i < title2Nodes_init.Count; i++)
                        {
                            string str_style = title2Nodes_init[i].Attributes["style"].Value.Replace("\r\n", "");
                            bool   condition = str_style.Contains(title2_select);
                            if (RecogOptions.title2_has_zitizihao)
                            {
                                string str_style_zihao = title2_select.Substring(0, title2_select.IndexOf(';'));
                                string str_style_ziti  = title2_select.Substring(title2_select.IndexOf(';') + 1);
                                condition = str_style.Contains(str_style_zihao) && str_style.Contains(str_style_ziti);
                            }
                            if (condition)
                            {
                                if ((RecogOptions.title2_child == 0) ||
                                    (RecogOptions.title2_child == 1 && title2Nodes_init[i].ParentNode.Name == "b") ||
                                    (RecogOptions.title2_child == 2 && title2Nodes_init[i].ParentNode.Name == "a"))
                                {
                                    foreach (var match in title2Nodes_init[i].AncestorsAndSelf())
                                    {
                                        if (match.Name == "p")
                                        {
                                            //foreach(var match1 in match.Descendants())
                                            //{
                                            //    if (match1.Name == "a")
                                            //    {
                                            //        tempNodes.Add(match);
                                            //        break;
                                            //   }
                                            //}
                                            string tmp = match.InnerText.Replace("&nbsp;", "").Replace("\r\n", "").Trim();
                                            int    a   = 0;
                                            if (tmp.Length > 1)
                                            {
                                                //有些文档中形如“1 XXX”的不是二级标题,需要手动在程序中修改
                                                //if((tmp.Contains("条") && tmp.Substring(0, 1) == "第") || tmp.Substring(0, 1) == "附" || tmp.Substring(0, 1) == "标")
                                                //if(tmp.Contains("条")&&tmp.Substring(0,1)=="第")
                                                //if(!(tmp.Substring(0,1)=="第")&& !(tmp.Substring(0, 1) == "附"))

                                                //if(int.TryParse(tmp.Substring(0, 1),out a)==true)
                                                if (!tmp.Contains("。"))//&& tmp.Substring(tmp.Length - 1, 1) != ":" && !tmp.Contains(";"))
                                                //tmp.Length>0&&(tmp.Substring(0,1)=="第"|| tmp.Substring(0, 1) == "附"|| tmp.Substring(0, 1) == "修"))
                                                {
                                                    tempNodes.Add(match);
                                                }
                                            }
                                            break;
                                        }
                                    }
                                }
                            }
                        }
                    }
                }
                for (int i = 0; i < tempNodes.Count; i++)
                {
                    if (tempNodes[i].InnerText.Replace("&nbsp;", "").Trim() != String.Empty &&
                        (i == 0 || (i > 0 && tempNodes[i].Line != tempNodes[i - 1].Line)))
                    {
                        title2Nodes.Add(tempNodes[i]);
                        string tmp = tempNodes[i].InnerText.Replace("\r\n", "").Replace("&nbsp;", " ");
                        str_title2List.Add(tmp.Trim());
                    }
                }
                #endregion
            }
            #endregion

            #region  项3:h1/h2/h3标签识别标题
            //else if (method == ReadMethod.TITLE_TAG)
            //{
            //    titleNodes_init = htmlRootNode.SelectNodes(@"//" + title1_select + @"|" + @"//" + title2_select);
            //    title1Nodes_init = htmlRootNode.SelectNodes(@"//" + title1_select);
            //    title2Nodes_init = htmlRootNode.SelectNodes(@"//" + title2_select);
            //    for (int i = 0; i < titleNodes_init.Count; i++)
            //    {
            //        string tmpstr = titleNodes_init[i].InnerText;
            //        if (titleNodes_init[i].Name == title1_select && tmpstr.Contains("第") && tmpstr.Contains("章"))
            //        {
            //            titleNodes.Add(titleNodes_init[i]);
            //            title1Nodes.Add(titleNodes_init[i]);
            //            str_titleList.Add(titleNodes_init[i].InnerText.Trim().Replace("&nbsp;", " ").Replace("\r\n", ""));
            //            str_title1List.Add(titleNodes_init[i].InnerText.Trim().Replace("&nbsp;", " ").Replace("\r\n", ""));
            //        }
            //        else if (titleNodes_init[i].Name == title2_select)
            //        {
            //            titleNodes.Add(titleNodes_init[i]);
            //            title2Nodes.Add(titleNodes_init[i]);
            //            str_titleList.Add(titleNodes_init[i].InnerText.Trim().Replace("&nbsp;", " ").Replace("\r\n", ""));
            //            str_title2List.Add(titleNodes_init[i].InnerText.Trim().Replace("&nbsp;", " ").Replace("\r\n", ""));
            //        }
            //    }
            //}
            #endregion

            #region 生成包含按序排列的一二级目录的节点集合titleNodes,和字符串集合str_titleList
            foreach (var match in title1Nodes)
            {
                titleNodes.Add(match);
            }
            foreach (var match in title2Nodes)
            {
                titleNodes.Add(match);
            }
            for (int i = 0; i < titleNodes.Count; i++)
            {
                for (int j = i; j < titleNodes.Count; j++)
                {
                    if (titleNodes[i].Line > titleNodes[j].Line)
                    {
                        var temp = titleNodes[i];
                        titleNodes[i] = titleNodes[j];
                        titleNodes[j] = temp;
                    }
                }
            }

            for (int i = 0; i < titleNodes.Count; i++)
            {
                string tmp = titleNodes[i].InnerText.Replace("&nbsp;", " ").Replace("\r\n", "");
                str_titleList.Add(tmp.Trim());
            }
            #endregion
            try
            {
                #region 找出html文本末尾可能存在的各脚注div,HtmlNode存储在ftNoteRefnodes

                foreach (var match in htmlRootNode.Descendants())
                {
                    if (match.Name == "div" && match.HasAttributes)
                    {
                        string tmp = match.GetAttributeValue("id", "notfound");
                        if (tmp != "notfound")
                        {
                            ftNoteRefnodes.Add(match);
                        }
                    }
                }
                #endregion

                #region html文档中去除文档末尾的脚注,保存在 htmlTxt 字符串
                if (ftNoteRefnodes != null)
                {
                    for (int i = 0; i < ftNoteRefnodes.Count; i++)
                    {
                        htmlTxt = htmlTxt.Replace(ftNoteRefnodes[i].OuterHtml, "");
                    }
                }
                htmlTxt = htmlTxt.Replace("</body>", "").Replace("</html>", "").Replace("<body>", "").Replace("<html>", "");
                #endregion

                #region 替换图片路径
                Regex           reg     = new Regex(Patterns.imageSrc);
                MatchCollection matches = reg.Matches(htmlTxt);
                if (matches.Count == 0)
                {
                    retInfo.picResult = "无匹配图片";
                }
                else
                {
                    htmlTxt           = reg.Replace(htmlTxt, "${1}" + imageFilePath + "${2}");
                    retInfo.picResult = "识别到图片数目:" + matches.Count.ToString();
                }
                //System.IO.File.WriteAllText(@"../../../htmlRcgTest/全文.html", htmlTxt);
                #endregion

                #region 提取一级标题下可能有的正文,此标题序号和正文键值对 存储在字典dic_title1Content(包含脚注)dic_title1Content_tmp(不含脚注)
                Dictionary <int, string> dic_title1Content_tmp = new Dictionary <int, string>();
                for (int i = 0; i < titleNodes.Count; i++)
                {
                    for (int j = 0; j < title1Nodes.Count - 1; j++)
                    {
                        if (titleNodes[i].Line == title1Nodes[j].Line)
                        {
                            if ((i < titleNodes.Count - 1 && titleNodes[i + 1].Line == title1Nodes[j + 1].Line))
                            {
                                int start = htmlTxt.IndexOf(title1Nodes[j].OuterHtml);
                                int end   = htmlTxt.IndexOf(title1Nodes[j + 1].OuterHtml, start + 1);
                                if (start != -1 && end > start)
                                {
                                    dic_title1Content_tmp.Add(j, htmlTxt.Substring(start, end - start));
                                    break;
                                }
                                else
                                {
                                    throw new Exception("title1 content提取出错");
                                }
                            }
                        }
                    }
                }
                for (int i = 0; i < title1Nodes.Count; i++)
                {
                    if (titleNodes.Last().Line == title1Nodes[i].Line)
                    {
                        int start = htmlTxt.IndexOf(title1Nodes.Last().OuterHtml);
                        if (start != -1)
                        {
                            dic_title1Content_tmp.Add(title1Nodes.Count - 1, htmlTxt.Substring(start));
                            break;
                        }
                        else
                        {
                            throw new Exception("title1 last content提取出错");
                        }
                    }
                }
                foreach (var pair in dic_title1Content_tmp)
                {
                    string v = pair.Value;
                    foreach (var ftnref in ftNoteRefnodes)
                    {
                        if (pair.Value.Contains("href=\"#_" + ftnref.Attributes["id"].Value + "\""))
                        {
                            v = v + ftnref.OuterHtml;
                        }
                    }
                    dic_title1Content.Add(pair.Key, v);
                }
                #endregion

                #region 更新 htmlTxt 字符串,将html文本中一级标题和一级标题下直接的正文  删除
                if (title1Nodes != null)
                {
                    for (int i = 0; i < title1Nodes.Count; i++)
                    {
                        if (dic_title1Content_tmp.Count != 0)//若存在一级标题下直接的正文
                        {
                            foreach (var pair in dic_title1Content_tmp)
                            {
                                int index = htmlTxt.IndexOf(pair.Value);
                                htmlTxt = htmlTxt.Replace(pair.Value, "");
                                if (i != pair.Key)
                                {
                                    htmlTxt = htmlTxt.Replace(title1Nodes[i].OuterHtml, "");
                                }
                            }
                        }
                        else //若不存在一级标题下直接的正文
                        {
                            htmlTxt = htmlTxt.Replace(title1Nodes[i].OuterHtml, "");
                        }
                    }
                }

                #endregion

                #region 提取二级标题下Html正文,分小节存储,HtmlNode节点存储在contentNodes,文本存储在str_contentList
                int index_PartStart = 0, index_PartEnd = 0;
                for (int i = 0; i < title2Nodes.Count; i++)
                {
                    HtmlAgilityPack.HtmlDocument contentNodeDoc = new HtmlAgilityPack.HtmlDocument();
                    string str_content;

                    if (i < title2Nodes.Count - 1)
                    {
                        index_PartStart = htmlTxt.IndexOf(title2Nodes[i].OuterHtml, index_PartStart + 1);
                        index_PartEnd   = htmlTxt.IndexOf(title2Nodes[i + 1].OuterHtml, index_PartStart + 1);
                        if (index_PartStart != -1 && index_PartEnd > index_PartStart)
                        {
                            str_content = htmlTxt.Substring(index_PartStart, index_PartEnd - index_PartStart);
                        }
                        else
                        {
                            throw new Exception("提取出错");
                        }
                    }
                    else
                    {
                        index_PartStart = htmlTxt.IndexOf(title2Nodes[title2Nodes.Count - 1].OuterHtml, index_PartStart + 1);
                        if (index_PartStart != -1)
                        {
                            str_content = htmlTxt.Substring(index_PartStart);
                        }
                        else
                        {
                            throw new Exception("提取出错");
                        }
                    }
                    foreach (var ftnref in ftNoteRefnodes)
                    {
                        if (str_content.Contains("href=\"#_" + ftnref.Attributes["id"].Value + "\""))
                        {
                            str_content = str_content + ftnref.OuterHtml;
                        }
                    }
                    contentNodeDoc.LoadHtml(str_content);
                    contentNodes.Add(contentNodeDoc.DocumentNode);
                    str_contentList.Add(contentNodes[i].OuterHtml);
                    System.IO.File.WriteAllText(@"../../../htmlRcgTest/" + i + @".html", str_contentList[i]);
                }
                #endregion
                //  断点位置:在局部变量窗口中检查str_contentList/str_titleList/
                //  str_title1List /str_title2List/dic_title1Content
                //  1、数目是否正确
                //  2、的内容是否正确,是否有缺失(二级标题下的正文可以在输出的文件
                //     "../../../htmlRcgTest/" + i + @".html"中查看)
            }
            catch (Exception err)
            {
                Console.WriteLine(err.Message);
            }

            #region 将一、二级标题及内容录入数据库
            try
            {
                SQLUtils sqlUtils = SQLUtils.getInstance();
                sqlUtils.makeConnect();
                ConventionRow tmp_rootConvention = rootConvention;
                for (int i = 0; i < title1Nodes.Count; i++)
                {
                    ConventionRow tempRow1 = null;
                    foreach (var pair in dic_title1Content)
                    {
                        if (pair.Key == i)//若一级标题下有内容,而无二级目录
                        {
                            tempRow1 = new ConventionRow(rootConvention, str_title1List[i],
                                                         i + 1, ConventionOptions.CATEGORY.IS_CONTENT, pair.Value);
                            sqlUtils.writeRow_local(tempRow1);
                            retInfo.title1Guids.Add(tempRow1.Guid);
                            //retInfo.retTable.Rows.Add(tempRow1);
                            break;
                        }
                    }
                    if (tempRow1 == null)////若一级标题下无内容,有二级目录
                    {
                        tempRow1 = new ConventionRow(rootConvention, str_title1List[i],
                                                     i + 1, ConventionOptions.CATEGORY.IS_CATEGORY);
                        sqlUtils.writeRow_local(tempRow1);
                        retInfo.title1Guids.Add(tempRow1.Guid);
                        //retInfo.retTable.Rows.Add(tempRow1);
                    }
                    for (int j = 0, k = 0; j < title2Nodes.Count; j++)
                    {
                        tmp_rootConvention = tempRow1;
                        if (i < title1Nodes.Count - 1)
                        {
                            if (title2Nodes[j].Line <title1Nodes[i + 1].Line && title2Nodes[j].Line> title1Nodes[i].Line)
                            {
                                ConventionRow tempRow2 = new ConventionRow(tmp_rootConvention, str_title2List[j],
                                                                           ++k, ConventionOptions.CATEGORY.IS_CONTENT, str_contentList[j]);
                                sqlUtils.writeRow_local(tempRow2);
                                //retInfo.retTable.Rows.Add(tempRow2);
                            }
                        }
                        else if (title2Nodes[j].Line > title1Nodes[i].Line)
                        {
                            ConventionRow tempRow2 = new ConventionRow(tmp_rootConvention, str_title2List[j],
                                                                       ++k, ConventionOptions.CATEGORY.IS_CONTENT, str_contentList[j]);
                            sqlUtils.writeRow_local(tempRow2);
                            //retInfo.retTable.Rows.Add(tempRow2);
                        }
                    }
                }
                retInfo.title1s           = str_title1List;
                retInfo.title2s           = str_title2List;
                retInfo.title2Contents    = str_contentList;
                retInfo.titles            = str_titleList;
                retInfo.title1ContentsNum = dic_title1Content.Count;
            }
            catch (Exception err)
            {
                Console.WriteLine(err.Message);
                retInfo.errorInfo = "录入失败。错误原因:" + err.Message;
            }
            return(retInfo);

            #endregion
        }
Exemple #5
0
        // /Uploads/imagesrc/neihechuanbo/num5/2006

        /// <summary>
        /// Click handler for the "read" button. Validates the required form fields,
        /// configures a new <c>ConventionRead</c> from the selected title-recognition
        /// mode, runs <c>ReadHtml</c> against the parent node described by the form,
        /// and shows the recognition summary and title lists in the UI.
        /// </summary>
        /// <param name="sender">Event source (not used).</param>
        /// <param name="e">Event arguments (not used).</param>
        private void btnWordRead_Click(object sender, EventArgs e)
        {
            // Every one of these six inputs is required; bail out early if any is blank.
            if (this.tbHtmlPath.Text == string.Empty || this.tbParentGuid.Text == string.Empty ||
                this.tbParentDepth.Text == string.Empty || this.tbParentIDfolder.Text == string.Empty ||
                this.tbParentTitleCnFolder.Text == string.Empty || this.tbFilesPath.Text == string.Empty)
            {
                MessageBox.Show("请输入信息!");
                return;
            }

            try
            {
                conventionRead = new ConventionRead();
                conventionRead.imageFilePath = tbFilesPath.Text;
                // A malformed GUID or depth throws here and is reported by the catch below.
                ConventionRow rootNode = new ConventionRow(new Guid(this.tbParentGuid.Text), int.Parse(this.tbParentDepth.Text),
                                                           this.tbParentIDfolder.Text, this.tbParentTitleCnFolder.Text);
                conventionRead.htmlPath = tbHtmlPath.Text;

                // Pick the title selectors according to the chosen recognition mode.
                // NOTE(review): when the selector boxes of the chosen mode are blank, nothing is
                // assigned and ReadHtml runs with whatever ConventionRead holds by default —
                // confirm whether that is intended or should be rejected up front.
                if (rdbtTitle1Bold.Checked)
                {
                    if (this.tbTitle1Xpath.Text != string.Empty && tbTitle2Xpath.Text.Trim() != String.Empty)
                    {
                        conventionRead.title1_select = tbTitle1Xpath.Text;
                        conventionRead.title2_select = tbTitle2Xpath.Text;
                        conventionRead.method        = ReadMethod.TITLE_CLASS;
                    }
                }
                else if (rdbtTitleHTag.Checked)
                {
                    if (tbTitle1TagName.Text.Trim() != string.Empty && tbTitle2TagName.Text.Trim() != string.Empty)
                    {
                        conventionRead.title1_select = tbTitle1TagName.Text;
                        conventionRead.title2_select = tbTitle2TagName.Text;
                        conventionRead.method        = ReadMethod.TITLE_TAG;
                    }
                }
                else
                {
                    if (tbTitle1SpanStyle.Text.Trim() != String.Empty && tbTitle2SpanStyle.Text.Trim() != String.Empty)
                    {
                        conventionRead.title1_select = tbTitle1SpanStyle.Text;
                        conventionRead.title2_select = tbTitle2SpanStyle.Text;
                        conventionRead.method        = ReadMethod.TITLE_SPANSTYLE;
                    }
                }

                info = conventionRead.ReadHtml(rootNode);

                // The reader signals a failed import through errorInfo rather than by throwing,
                // so it has to be checked before the result is presented as a success.
                // NOTE(review): assumes errorInfo is null/empty when the import succeeded — confirm.
                if (!string.IsNullOrEmpty(info.errorInfo))
                {
                    MessageBox.Show(info.errorInfo);
                    this.toolStripStatusLabel1.Text = info.errorInfo;
                    return;
                }

                this.toolStripStatusLabel1.Text = "Html识别成功:一级目录有" + info.title1s.Count +
                                                  "个,二级目录共有" + info.title2s.Count + "个" + "一级标题直接内容有" + info.title1ContentsNum + "个,图片识别结果 " + info.picResult;

                // Build the listing once and assign it once, instead of appending to the
                // TextBox on every iteration (each assignment re-renders the control).
                var listing = new System.Text.StringBuilder();
                for (int i = 0; i < info.title1Guids.Count; i++)
                {
                    listing.Append(info.title1s[i]).Append("\r\n");
                }
                listing.Append("\r\n\r\n");
                for (int i = 0; i < info.titles.Count; i++)
                {
                    listing.Append(info.titles[i]).Append("\r\n");
                }
                this.tbTitle1Guids.Text = listing.ToString();
            }
            catch (Exception err)
            {
                MessageBox.Show(err.Message);
                this.toolStripStatusLabel1.Text = err.Message;
            }
        }