/// <summary>
/// Appends one record to the local <c>myTable</c>, copying its column values from
/// <paramref name="conventionRow"/>.
/// </summary>
/// <remarks>
/// The columns TagEn, QueryGuid, Note, LastEditDate and CreationDate are not written by this method.
/// </remarks>
/// <param name="conventionRow">Record whose properties are copied into the new row.</param>
public void writeRow_local(ConventionRow conventionRow)
{
    DataRow newRow = myTable.NewRow();

    // Identity and position in the tree.
    newRow["Guid"] = conventionRow.Guid;
    newRow["Depth"] = conventionRow.Depth;
    newRow["ParentNodeGuid"] = conventionRow.ParentNodeGuid;
    newRow["Category"] = conventionRow.Category;

    // Titles and tag.
    newRow["TitleCn"] = conventionRow.TitleCn;
    newRow["TitleEn"] = conventionRow.TitleEn;
    newRow["TagCn"] = conventionRow.TagCn;

    // Display / ordering.
    newRow["Display"] = conventionRow.Display;
    newRow["SequenceNumber"] = conventionRow.SequenceNumber;

    // Folder paths.
    newRow["IDFolder"] = conventionRow.IDFolder;
    newRow["TitleCnFolder"] = conventionRow.TitleCnFolder;
    newRow["TitleEnFolder"] = conventionRow.TitleEnFolder;

    // Remaining descriptive columns.
    newRow["Purposes"] = conventionRow.Purposes;
    newRow["ShortTitleCn"] = conventionRow.ShortTitleCn;
    newRow["ShortTitleEn"] = conventionRow.ShortTitleEn;
    newRow["ConventionTypeKey"] = conventionRow.ConventionTypeKey;

    myTable.Rows.Add(newRow);
}
/// <summary>
/// Creates a child record underneath the given parent node.
/// </summary>
/// <param name="parentConventionRow">Parent node; its Guid, Depth, IDFolder and TitleCnFolder are read.</param>
/// <param name="titleCn">Title text; stored in the Chinese, English and both short-title fields.</param>
/// <param name="sequenceNumber">Stored unchanged as the record's sequence number.</param>
/// <param name="category">Stored as its integer value.</param>
/// <param name="tagcn">Stored as the record's TagCn; defaults to <c>null</c>.</param>
public ConventionRow(ConventionRow parentConventionRow, string titleCn, int sequenceNumber, ConventionOptions.CATEGORY category, string tagcn = null)
{
    // These fields are assigned before ConventionRow_Init() runs, and everything
    // else after it, exactly as in the original ordering.
    this._ShortTitleEn = titleCn;
    this._ShortTitleCn = titleCn;
    this._TitleEn = titleCn;
    this._TitleCn = titleCn;
    this._Category = (int)category;
    this._Guid = Guid.NewGuid();

    ConventionRow_Init();

    this._ParentNodeGuid = parentConventionRow.Guid;
    this._SequenceNumber = sequenceNumber;
    this._Depth = parentConventionRow.Depth + 1;

    // ID path: the parent's path, a '#', then this record's own Guid.
    string idPath = parentConventionRow.IDFolder + "#" + this.Guid;
    this._IDFolder = idPath;

    // Both title paths are derived from the parent's Chinese title path.
    string titlePath = parentConventionRow.TitleCnFolder + "#@`" + this.TitleCn;
    this._TitleEnFolder = titlePath;
    this._TitleCnFolder = titlePath;

    this._TagCn = tagcn;
}
/// <summary>
/// Handles the Add button: builds a <c>ConventionRow</c> from the form fields, writes it through
/// <c>SQLUtils</c>, and then hands the new row's Guid, depth and folder paths to the Word-read form
/// so that it can be used there as the parent node.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void btnAdd_Click(object sender, EventArgs e)
{
    ConventionOptions.CATEGORY isCategory = cbkIsCategory.Checked
        ? ConventionOptions.CATEGORY.IS_CATEGORY
        : ConventionOptions.CATEGORY.IS_CONTENT;

    try
    {
        // Parsing the Guid happens inside the try so that a malformed value is reported
        // through the same message box / status text as every other input error.
        Guid guid;
        if (tbGuid.Text == string.Empty)
        {
            guid = Guid.NewGuid();
            tbGuid.Text = guid.ToString();
        }
        else
        {
            guid = new Guid(this.tbGuid.Text);
        }

        Guid parentGuid = new Guid(this.tbParentGuid.Text);
        int depth = int.Parse(this.tbParentDepth.Text) + 1; // the new row sits one level below its parent
        string title = this.tbTitle.Text;
        int sequenceNumber = int.Parse(this.tbSNum.Text);
        string idFolder = this.tbParentIDfloder.Text + "#" + guid;
        string titleCnFolder = this.tbParentTitleCNFolder.Text + "#@`" + title;

        ConventionRow tempRow = new ConventionRow(guid, parentGuid, depth, title, sequenceNumber,
            isCategory, idFolder, titleCnFolder, this.tbTag.Text);

        SQLUtils sqlUtils = SQLUtils.getInstance();
        sqlUtils.writeRow_local(tempRow);
        sqlUtils.updateTable();
        this.toolStripStatusLabel1.Text = "添加成功";

        // The Word-read form treats these fields as its parent node, which is the row that was
        // just added. Pass on exactly the values stored for that row:
        //  - depth is the new row's depth (previously the old parent's depth was copied, so rows
        //    read from Word would have received the same depth as their parent);
        //  - the title folder uses the same "#@`" separator as the stored row (previously "@#`").
        this.frm_WordRead.tbParentDepth.Text = depth.ToString();
        this.frm_WordRead.tbParentGuid.Text = this.tbGuid.Text;
        this.frm_WordRead.tbParentIDfolder.Text = idFolder;
        this.frm_WordRead.tbParentTitleCnFolder.Text = titleCnFolder;
    }
    catch (Exception err)
    {
        MessageBox.Show(err.Message);
        this.toolStripStatusLabel1.Text = "添加失败" + err.Message;
    }
}
/// <summary>
/// Loads the HTML file at <c>htmlPath</c>, detects level-1 and level-2 titles, cuts the document
/// into one HTML fragment per title, and passes one <c>ConventionRow</c> per title to
/// <c>SQLUtils.writeRow_local</c>.
/// </summary>
/// <remarks>
/// Appendices have to be reformatted by hand in Word as level-1 or level-2 headings (whichever the
/// table of contents requires) before the document is exported.
/// <para>
/// Title detection depends on <c>method</c>. Only <c>ReadMethod.TITLE_CLASS</c> and
/// <c>ReadMethod.TITLE_SPANSTYLE</c> are handled here; with any other value no titles are detected.
/// </para>
/// <para>
/// Neither stage rethrows. A failure while extracting content is only written to the console; a
/// failure while creating the rows is written to the console and stored in <c>retInfo.errorInfo</c>.
/// NOTE(review): after an extraction failure the row-creation stage still runs on whatever was
/// collected so far — confirm that this is intended.
/// </para>
/// <para>
/// NOTE(review): several calls below read <c>Replace(" ", " ")</c> / <c>Replace(" ", "")</c>. As
/// written, the two-space form is a no-op; presumably the first argument was originally an
/// <c>&amp;nbsp;</c> entity or a non-breaking space — confirm against the original file. The
/// literals are reproduced unchanged.
/// </para>
/// </remarks>
/// <param name="rootConvention">Row used as the parent of every level-1 title row.</param>
/// <returns>The <c>retInfo</c> member, filled with the detected titles, contents and status text.</returns>
public ReturnInfo ReadHtml(ConventionRow rootConvention)
{
    HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
    doc.Load(htmlPath);
    HtmlNode htmlRootNode = doc.DocumentNode;

    HtmlNodeCollection title1Nodes_init;
    HtmlNodeCollection title2Nodes_init;
    List<string> str_contentList = new List<string>();
    List<string> str_titleList = new List<string>();
    List<string> str_title1List = new List<string>();
    List<string> str_title2List = new List<string>();
    HtmlNodeCollection contentNodes = new HtmlNodeCollection(htmlRootNode.Clone());
    Dictionary<int, string> dic_title1Content = new Dictionary<int, string>();
    HtmlNodeCollection titleNodes = new HtmlNodeCollection(htmlRootNode.Clone());
    HtmlNodeCollection title1Nodes = new HtmlNodeCollection(htmlRootNode.Clone());
    HtmlNodeCollection title2Nodes = new HtmlNodeCollection(htmlRootNode.Clone());
    HtmlNodeCollection ftNoteRefnodes = new HtmlNodeCollection(htmlRootNode.Clone());
    string htmlTxt = htmlRootNode.InnerHtml;

    // ---- Title detection ----
    // (An older option that detected level-1 titles by bold spans used to live here as
    // commented-out code; it has been removed.)

    // Option 1: Word file produced from a PDF converted to images; titles are the <p> nodes
    // whose class attribute is 1 (level 1) or 2 (level 2).
    if (method == ReadMethod.TITLE_CLASS)
    {
        title1Nodes_init = htmlRootNode.SelectNodes(@"//p[@class=1]");
        title2Nodes_init = htmlRootNode.SelectNodes(@"//p[@class=2]");

        // SelectNodes yields null when nothing matches, so guard both collections
        // (the span-style branch below already does the same).
        if (title1Nodes_init != null)
        {
            foreach (HtmlNode node in title1Nodes_init)
            {
                if (node.InnerText.Replace(" ", "").Trim() != string.Empty)
                {
                    str_title1List.Add(node.InnerText.Trim().Replace(" ", " ").Replace("\r\n", ""));
                    title1Nodes.Add(node);
                }
            }
        }
        if (title2Nodes_init != null)
        {
            foreach (HtmlNode node in title2Nodes_init)
            {
                if (node.InnerText.Replace(" ", "").Trim() != string.Empty)
                {
                    str_title2List.Add(node.InnerText.Trim().Replace(" ", " ").Replace("\r\n", ""));
                    title2Nodes.Add(node);
                }
            }
        }
    }
    // Option 2: titles are recognised by the style text of the spans they contain.
    else if (method == ReadMethod.TITLE_SPANSTYLE)
    {
        HtmlNodeCollection title1Nodes_tmp = new HtmlNodeCollection(htmlRootNode.Clone());

        // Level-1 titles: every <p> whose inner HTML contains title1_select.
        title1Nodes_init = htmlRootNode.SelectNodes(@"//p");
        if (title1Nodes_init != null)
        {
            for (int i = 0; i < title1Nodes_init.Count; i++)
            {
                string str_style = title1Nodes_init[i].InnerHtml.Replace("\r\n", "");
                bool condition = str_style.Contains(title1_select);
                if (RecogOptions.title1_has_zitizihao)
                {
                    // title1_select is split at its first ';' and both parts must occur,
                    // in any position, in the paragraph's HTML.
                    string str_style_zihao = title1_select.Substring(0, title1_select.IndexOf(';'));
                    string str_style_ziti = title1_select.Substring(title1_select.IndexOf(';') + 1);
                    condition = str_style.Contains(str_style_zihao) && str_style.Contains(str_style_ziti);
                }
                if (condition)
                {
                    // title1_child selects which element must be present: 0 = p, 1 = b, 2 = a.
                    foreach (var match in title1Nodes_init[i].DescendantsAndSelf())
                    {
                        if (RecogOptions.title1_child == 0 && match.Name == "p")
                        {
                            title1Nodes_tmp.Add(title1Nodes_init[i]);
                            break;
                        }
                        if (RecogOptions.title1_child == 1 && match.Name == "b")
                        {
                            title1Nodes_tmp.Add(title1Nodes_init[i]);
                            break;
                        }
                        if (RecogOptions.title1_child == 2 && match.Name == "a")
                        {
                            title1Nodes_tmp.Add(title1Nodes_init[i]);
                            break;
                        }
                    }
                }
            }

            // Keep non-empty candidates, skipping one that is on the same source line as its predecessor.
            for (int i = 0; i < title1Nodes_tmp.Count; i++)
            {
                if (title1Nodes_tmp[i].InnerText.Replace(" ", "").Trim() != string.Empty
                    && (i == 0 || (i > 0 && title1Nodes_tmp[i].Line != title1Nodes_tmp[i - 1].Line)))
                {
                    str_title1List.Add(title1Nodes_tmp[i].InnerText.Trim().Replace(" ", " ").Replace("\r\n", ""));
                    title1Nodes.Add(title1Nodes_tmp[i]);
                }
            }
        }

        // Level-2 titles.
        HtmlNodeCollection tempNodes = new HtmlNodeCollection(htmlRootNode.Clone());

        // title2RecogMethod == 1: match the paragraph text against Patterns.title2_x_dot_x_XXX.
        if (RecogOptions.title2RecogMethod == 1)
        {
            title2Nodes_init = htmlRootNode.SelectNodes(@"//p");
            if (title2Nodes_init != null)
            {
                // Built once instead of once per paragraph.
                Regex title2Regex = new Regex(Patterns.title2_x_dot_x_XXX, RegexOptions.Multiline);
                for (int i = 0; i < title2Nodes_init.Count; i++)
                {
                    string str_tmp = title2Nodes_init[i].InnerText.Replace(" ", " ");
                    MatchCollection matches = title2Regex.Matches(str_tmp);
                    if (matches.Count > 0)
                    {
                        string tmp = matches[0].Value;
                        // In some documents a line shaped like "1 XXX" is not a level-2 title;
                        // this filter has to be adjusted by hand for such documents.
                        if (!tmp.Contains("。"))
                        {
                            // title2_child selects which element must be present: 0 = p, 1 = b, 2 = a.
                            foreach (var match in title2Nodes_init[i].DescendantsAndSelf())
                            {
                                if (RecogOptions.title2_child == 0 && match.Name == "p")
                                {
                                    tempNodes.Add(title2Nodes_init[i]);
                                    break;
                                }
                                if (RecogOptions.title2_child == 1 && match.Name == "b")
                                {
                                    tempNodes.Add(title2Nodes_init[i]);
                                    break;
                                }
                                if (RecogOptions.title2_child == 2 && match.Name == "a")
                                {
                                    tempNodes.Add(title2Nodes_init[i]);
                                    break;
                                }
                            }
                        }
                    }
                }
            }
        }

        // title2RecogMethod == 0: match the style attribute of spans against title2_select.
        if (RecogOptions.title2RecogMethod == 0)
        {
            title2Nodes_init = htmlRootNode.SelectNodes(@"//span[@style]");
            if (title2Nodes_init != null)
            {
                for (int i = 0; i < title2Nodes_init.Count; i++)
                {
                    string str_style = title2Nodes_init[i].Attributes["style"].Value.Replace("\r\n", "");
                    bool condition = str_style.Contains(title2_select);
                    if (RecogOptions.title2_has_zitizihao)
                    {
                        string str_style_zihao = title2_select.Substring(0, title2_select.IndexOf(';'));
                        string str_style_ziti = title2_select.Substring(title2_select.IndexOf(';') + 1);
                        condition = str_style.Contains(str_style_zihao) && str_style.Contains(str_style_ziti);
                    }
                    if (condition)
                    {
                        if ((RecogOptions.title2_child == 0)
                            || (RecogOptions.title2_child == 1 && title2Nodes_init[i].ParentNode.Name == "b")
                            || (RecogOptions.title2_child == 2 && title2Nodes_init[i].ParentNode.Name == "a"))
                        {
                            // Walk up to the nearest enclosing <p>; that paragraph is the title node.
                            foreach (var match in title2Nodes_init[i].AncestorsAndSelf())
                            {
                                if (match.Name == "p")
                                {
                                    string tmp = match.InnerText.Replace(" ", "").Replace("\r\n", "").Trim();
                                    if (tmp.Length > 1)
                                    {
                                        // In some documents a line shaped like "1 XXX" is not a level-2
                                        // title; this filter has to be adjusted by hand for such documents.
                                        if (!tmp.Contains("。"))
                                        {
                                            tempNodes.Add(match);
                                        }
                                    }
                                    break;
                                }
                            }
                        }
                    }
                }
            }
        }

        // Keep non-empty candidates, skipping one that is on the same source line as its predecessor.
        for (int i = 0; i < tempNodes.Count; i++)
        {
            if (tempNodes[i].InnerText.Replace(" ", "").Trim() != String.Empty
                && (i == 0 || (i > 0 && tempNodes[i].Line != tempNodes[i - 1].Line)))
            {
                title2Nodes.Add(tempNodes[i]);
                string tmp = tempNodes[i].InnerText.Replace("\r\n", "").Replace(" ", " ");
                str_title2List.Add(tmp.Trim());
            }
        }
    }
    // Option 3 (titles recognised by h1/h2/h3 tag name, ReadMethod.TITLE_TAG) existed only as
    // commented-out code and is not implemented here.

    // ---- Merge level-1 and level-2 titles into titleNodes, ordered by source line ----
    foreach (var match in title1Nodes)
    {
        titleNodes.Add(match);
    }
    foreach (var match in title2Nodes)
    {
        titleNodes.Add(match);
    }
    for (int i = 0; i < titleNodes.Count; i++)
    {
        for (int j = i; j < titleNodes.Count; j++)
        {
            if (titleNodes[i].Line > titleNodes[j].Line)
            {
                var temp = titleNodes[i];
                titleNodes[i] = titleNodes[j];
                titleNodes[j] = temp;
            }
        }
    }
    for (int i = 0; i < titleNodes.Count; i++)
    {
        string tmp = titleNodes[i].InnerText.Replace(" ", " ").Replace("\r\n", "");
        str_titleList.Add(tmp.Trim());
    }

    try
    {
        // Collect every <div> that has an id attribute; these are treated as the footnote
        // blocks that may appear at the end of the document.
        foreach (var match in htmlRootNode.Descendants())
        {
            if (match.Name == "div" && match.HasAttributes)
            {
                string tmp = match.GetAttributeValue("id", "notfound");
                if (tmp != "notfound")
                {
                    ftNoteRefnodes.Add(match);
                }
            }
        }

        // Strip the footnote blocks and the html/body tags from the working text.
        for (int i = 0; i < ftNoteRefnodes.Count; i++)
        {
            htmlTxt = htmlTxt.Replace(ftNoteRefnodes[i].OuterHtml, "");
        }
        htmlTxt = htmlTxt.Replace("</body>", "").Replace("</html>", "").Replace("<body>", "").Replace("<html>", "");

        // Rewrite image paths: imageFilePath is inserted between groups 1 and 2 of Patterns.imageSrc.
        Regex imageRegex = new Regex(Patterns.imageSrc);
        MatchCollection imageMatches = imageRegex.Matches(htmlTxt);
        if (imageMatches.Count == 0)
        {
            retInfo.picResult = "无匹配图片";
        }
        else
        {
            htmlTxt = imageRegex.Replace(htmlTxt, "${1}" + imageFilePath + "${2}");
            retInfo.picResult = "识别到图片数目:" + imageMatches.Count.ToString();
        }

        // Content that sits directly under a level-1 title (i.e. the next title is also level 1),
        // keyed by the level-1 title's index. dic_title1Content_tmp holds it without footnotes,
        // dic_title1Content with the referenced footnote blocks appended.
        Dictionary<int, string> dic_title1Content_tmp = new Dictionary<int, string>();
        for (int i = 0; i < titleNodes.Count; i++)
        {
            for (int j = 0; j < title1Nodes.Count - 1; j++)
            {
                if (titleNodes[i].Line == title1Nodes[j].Line)
                {
                    if ((i < titleNodes.Count - 1 && titleNodes[i + 1].Line == title1Nodes[j + 1].Line))
                    {
                        int start = htmlTxt.IndexOf(title1Nodes[j].OuterHtml);
                        int end = htmlTxt.IndexOf(title1Nodes[j + 1].OuterHtml, start + 1);
                        if (start != -1 && end > start)
                        {
                            dic_title1Content_tmp.Add(j, htmlTxt.Substring(start, end - start));
                            break;
                        }
                        else
                        {
                            throw new Exception("title1 content提取出错");
                        }
                    }
                }
            }
        }

        // If the very last title is a level-1 title, everything from it to the end is its content.
        for (int i = 0; i < title1Nodes.Count; i++)
        {
            if (titleNodes.Last().Line == title1Nodes[i].Line)
            {
                int start = htmlTxt.IndexOf(title1Nodes.Last().OuterHtml);
                if (start != -1)
                {
                    dic_title1Content_tmp.Add(title1Nodes.Count - 1, htmlTxt.Substring(start));
                    break;
                }
                else
                {
                    throw new Exception("title1 last content提取出错");
                }
            }
        }

        foreach (var pair in dic_title1Content_tmp)
        {
            string v = pair.Value;
            foreach (var ftnref in ftNoteRefnodes)
            {
                if (pair.Value.Contains("href=\"#_" + ftnref.Attributes["id"].Value + "\""))
                {
                    v = v + ftnref.OuterHtml;
                }
            }
            dic_title1Content.Add(pair.Key, v);
        }

        // Remove level-1 titles and their direct content from the working text so that only
        // level-2 sections remain. Statement order is kept exactly as before.
        for (int i = 0; i < title1Nodes.Count; i++)
        {
            if (dic_title1Content_tmp.Count != 0) // some level-1 title has direct content
            {
                foreach (var pair in dic_title1Content_tmp)
                {
                    htmlTxt = htmlTxt.Replace(pair.Value, "");
                    if (i != pair.Key)
                    {
                        htmlTxt = htmlTxt.Replace(title1Nodes[i].OuterHtml, "");
                    }
                }
            }
            else // no level-1 title has direct content
            {
                htmlTxt = htmlTxt.Replace(title1Nodes[i].OuterHtml, "");
            }
        }

        // Cut the working text into one fragment per level-2 title: from the title up to the
        // next level-2 title (or to the end for the last one).
        int index_PartStart = 0, index_PartEnd = 0;
        for (int i = 0; i < title2Nodes.Count; i++)
        {
            HtmlAgilityPack.HtmlDocument contentNodeDoc = new HtmlAgilityPack.HtmlDocument();
            string str_content;
            if (i < title2Nodes.Count - 1)
            {
                index_PartStart = htmlTxt.IndexOf(title2Nodes[i].OuterHtml, index_PartStart + 1);
                index_PartEnd = htmlTxt.IndexOf(title2Nodes[i + 1].OuterHtml, index_PartStart + 1);
                if (index_PartStart != -1 && index_PartEnd > index_PartStart)
                {
                    str_content = htmlTxt.Substring(index_PartStart, index_PartEnd - index_PartStart);
                }
                else
                {
                    throw new Exception("提取出错");
                }
            }
            else
            {
                index_PartStart = htmlTxt.IndexOf(title2Nodes[title2Nodes.Count - 1].OuterHtml, index_PartStart + 1);
                if (index_PartStart != -1)
                {
                    str_content = htmlTxt.Substring(index_PartStart);
                }
                else
                {
                    throw new Exception("提取出错");
                }
            }

            // Append every footnote block that this fragment links to.
            foreach (var ftnref in ftNoteRefnodes)
            {
                if (str_content.Contains("href=\"#_" + ftnref.Attributes["id"].Value + "\""))
                {
                    str_content = str_content + ftnref.OuterHtml;
                }
            }

            contentNodeDoc.LoadHtml(str_content);
            contentNodes.Add(contentNodeDoc.DocumentNode);
            str_contentList.Add(contentNodes[i].OuterHtml);

            // Debug dump of each section. It is best-effort: a missing or unwritable dump
            // directory must not abort the extraction of the remaining sections.
            try
            {
                System.IO.File.WriteAllText(@"../../../htmlRcgTest/" + i + @".html", str_contentList[i]);
            }
            catch (System.IO.IOException dumpErr)
            {
                Console.WriteLine(dumpErr.Message);
            }
            catch (UnauthorizedAccessException dumpErr)
            {
                Console.WriteLine(dumpErr.Message);
            }
        }

        // Breakpoint spot: inspect str_contentList / str_titleList / str_title1List /
        // str_title2List / dic_title1Content in the Locals window and check that
        //  1. the counts are right;
        //  2. the contents are right and nothing is missing (the level-2 sections can be
        //     viewed in the dumped "../../../htmlRcgTest/<i>.html" files).
    }
    catch (Exception err)
    {
        Console.WriteLine(err.Message);
    }

    // ---- Create rows for the level-1 and level-2 titles and their content ----
    try
    {
        SQLUtils sqlUtils = SQLUtils.getInstance();
        sqlUtils.makeConnect();
        for (int i = 0; i < title1Nodes.Count; i++)
        {
            ConventionRow tempRow1;
            string title1Content;
            if (dic_title1Content.TryGetValue(i, out title1Content))
            {
                // The level-1 title has direct content and no level-2 titles.
                tempRow1 = new ConventionRow(rootConvention, str_title1List[i], i + 1, ConventionOptions.CATEGORY.IS_CONTENT, title1Content);
            }
            else
            {
                // The level-1 title has no direct content; it is a category for level-2 titles.
                tempRow1 = new ConventionRow(rootConvention, str_title1List[i], i + 1, ConventionOptions.CATEGORY.IS_CATEGORY);
            }
            sqlUtils.writeRow_local(tempRow1);
            retInfo.title1Guids.Add(tempRow1.Guid);

            // A level-2 title belongs to this level-1 title when its source line lies after it
            // and (unless this is the last level-1 title) before the next level-1 title.
            for (int j = 0, k = 0; j < title2Nodes.Count; j++)
            {
                int title2Line = title2Nodes[j].Line;
                bool afterThisTitle1 = title2Line > title1Nodes[i].Line;
                bool beforeNextTitle1 = i == title1Nodes.Count - 1 || title2Line < title1Nodes[i + 1].Line;
                if (afterThisTitle1 && beforeNextTitle1)
                {
                    ConventionRow tempRow2 = new ConventionRow(tempRow1, str_title2List[j], ++k, ConventionOptions.CATEGORY.IS_CONTENT, str_contentList[j]);
                    sqlUtils.writeRow_local(tempRow2);
                }
            }
        }
        retInfo.title1s = str_title1List;
        retInfo.title2s = str_title2List;
        retInfo.title2Contents = str_contentList;
        retInfo.titles = str_titleList;
        retInfo.title1ContentsNum = dic_title1Content.Count;
    }
    catch (Exception err)
    {
        Console.WriteLine(err.Message);
        retInfo.errorInfo = "录入失败。错误原因:" + err.Message;
    }
    return (retInfo);
}
// Example path: /Uploads/imagesrc/neihechuanbo/num5/2006
// NOTE(review): presumably a sample value for the image folder field (tbFilesPath) — confirm.

/// <summary>
/// Handles the "read Word/HTML" button: builds a <c>ConventionRead</c> from the form fields,
/// runs <c>ReadHtml</c> with the configured parent node, and shows the detected titles.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void btnWordRead_Click(object sender, EventArgs e)
{
    bool allRequiredFilled = this.tbHtmlPath.Text != string.Empty
        && this.tbParentGuid.Text != string.Empty
        && this.tbParentDepth.Text != string.Empty
        && this.tbParentIDfolder.Text != string.Empty
        && this.tbParentTitleCnFolder.Text != string.Empty
        && this.tbFilesPath.Text != string.Empty;
    if (!allRequiredFilled)
    {
        MessageBox.Show("请输入信息!");
        return;
    }

    try
    {
        conventionRead = new ConventionRead();
        conventionRead.imageFilePath = tbFilesPath.Text;
        ConventionRow rootNode = new ConventionRow(new Guid(this.tbParentGuid.Text), int.Parse(this.tbParentDepth.Text), this.tbParentIDfolder.Text, this.tbParentTitleCnFolder.Text);
        conventionRead.htmlPath = tbHtmlPath.Text;

        // Each radio option configures the selectors and the read method only when both of its
        // text boxes are filled; otherwise conventionRead keeps its initial settings.
        if (rdbtTitle1Bold.Checked)
        {
            // Note: this radio button selects ReadMethod.TITLE_CLASS.
            if (this.tbTitle1Xpath.Text != string.Empty && tbTitle2Xpath.Text.Trim() != String.Empty)
            {
                conventionRead.title1_select = tbTitle1Xpath.Text;
                conventionRead.title2_select = tbTitle2Xpath.Text;
                conventionRead.method = ReadMethod.TITLE_CLASS;
            }
        }
        else if (rdbtTitleHTag.Checked)
        {
            if (tbTitle1TagName.Text.Trim() != string.Empty && tbTitle2TagName.Text.Trim() != string.Empty)
            {
                conventionRead.title1_select = tbTitle1TagName.Text;
                conventionRead.title2_select = tbTitle2TagName.Text;
                conventionRead.method = ReadMethod.TITLE_TAG;
            }
        }
        else
        {
            if (tbTitle1SpanStyle.Text.Trim() != String.Empty && tbTitle2SpanStyle.Text.Trim() != String.Empty)
            {
                conventionRead.title1_select = tbTitle1SpanStyle.Text;
                conventionRead.title2_select = tbTitle2SpanStyle.Text;
                conventionRead.method = ReadMethod.TITLE_SPANSTYLE;
            }
        }

        info = conventionRead.ReadHtml(rootNode);
        this.toolStripStatusLabel1.Text = "Html识别成功:一级目录有" + info.title1s.Count + "个,二级目录共有" + info.title2s.Count + "个" + "一级标题直接内容有" + info.title1ContentsNum + "个,图片识别结果 " + info.picResult;

        // Build the listing once and assign it once, instead of appending to the text box line
        // by line. The box is cleared first so that a failure below does not leave stale text.
        this.tbTitle1Guids.Text = "";
        System.Text.StringBuilder listing = new System.Text.StringBuilder();
        for (int i = 0; i < info.title1Guids.Count; i++)
        {
            listing.Append(info.title1s[i]).Append("\r\n");
        }
        listing.Append("\r\n\r\n");
        for (int i = 0; i < info.titles.Count; i++)
        {
            listing.Append(info.titles[i]).Append("\r\n");
        }
        this.tbTitle1Guids.Text = listing.ToString();
    }
    catch (Exception err)
    {
        MessageBox.Show(err.Message);
        this.toolStripStatusLabel1.Text = err.Message;
    }

    // An abandoned prototype that read the document through Word automation and split it with
    // regular expressions used to follow here as commented-out code; it has been removed.
}