/// <summary>
/// Builds the subject list export from the locally saved list pages.
/// For every list-sheet row that is not flagged as given up, the saved page is parsed and
/// each subject row of its first table produces one row in the "_List.xlsx" workbook plus
/// one row per comment user in the "_List.xml" file.
/// </summary>
/// <param name="listSheet">Sheet whose rows supply the page URL and the give-up flag.</param>
/// <returns>Always <c>true</c> when the method completes; any failure surfaces as an exception.</returns>
private bool GenerateNewPage(IListSheet listSheet)
{
    bool succeed = true;
    string exportDir = this.RunPage.GetExportDir();
    string pageSourceDir = this.RunPage.GetDetailSourceFileDir();

    Dictionary<string, int> subjectColumnDic = CommonUtil.InitStringIndexDic(new string[] {
        "detailPageUrl", "detailPageName", "cookie", "grabStatus", "giveUpGrab", "subjectIndex",
        "id", "title", "googleUrl", "creator", "createDate", "messageCount", "authorCount" });
    string subjectFileExcelPath = Path.Combine(exportDir, this.RunPage.Project.Name + "_List.xlsx");
    ExcelWriter subjectEW = new ExcelWriter(subjectFileExcelPath, "List", subjectColumnDic);

    Dictionary<string, int> subjectImportColumnDic = CommonUtil.InitStringIndexDic(new string[] {
        "index", "id", "creator", "createDate", "commentPublisher", "commentPublished",
        "googleUrl", "title", "url" });
    string subjectFileXmlPath = Path.Combine(exportDir, this.RunPage.Project.Name + "_List.xml");
    XmlWriter subjectXW = new XmlWriter(subjectFileXmlPath, subjectImportColumnDic);

    string[] commentUsers = new string[] { "sunhua", "shizhengzhong", "liyuzhu" };
    int subjectIndex = 1;

    // Sheet rows are walked from last to first; subjectIndex therefore numbers subjects
    // in reverse sheet order.
    for (int i = listSheet.RowCount - 1; i >= 0; i--)
    {
        Dictionary<string, string> row = listSheet.GetRow(i);
        if ("Y".Equals(row[SysConfig.GiveUpGrabFieldName]))
        {
            continue;
        }

        string url = row[SysConfig.DetailPageUrlFieldName];
        string localFilePath = this.RunPage.GetFilePath(url, pageSourceDir);

        // Read the whole page inside a using block so the file handle is released on
        // success as well as on failure; exceptions propagate with their original stack trace.
        string webPageHtml;
        using (StreamReader reader = new StreamReader(localFilePath))
        {
            webPageHtml = reader.ReadToEnd();
        }

        HtmlAgilityPack.HtmlDocument htmlDoc = new HtmlAgilityPack.HtmlDocument();
        htmlDoc.LoadHtml(webPageHtml);
        HtmlNodeCollection subjectNodes = htmlDoc.DocumentNode.SelectNodes("//body[1]/table[1]/tr");
        if (subjectNodes == null)
        {
            continue;
        }

        // Table rows are also walked bottom-up; row 0 is skipped (presumably the header row).
        for (int j = subjectNodes.Count - 1; j >= 1; j--)
        {
            HtmlNode subjectNode = subjectNodes[j];

            // NOTE(review): a row without a subject link (or without href) still raises
            // NullReferenceException here, as before - confirm whether such rows can occur.
            HtmlNode titleNode = subjectNode.SelectSingleNode("./td[@class=\"subject\"]/a");
            string title = CommonUtil.HtmlDecode(CommonUtil.ReplaceAsciiByString(titleNode.InnerText.Trim()));
            string detailPageUrl = titleNode.Attributes["href"].Value.Trim();
            string detailPageName = detailPageUrl.Substring(detailPageUrl.LastIndexOf("/") + 1);

            HtmlNode authorNode = subjectNode.SelectSingleNode("./td[@class=\"author\"]");
            string author = authorNode == null
                ? ""
                : CommonUtil.HtmlDecode(CommonUtil.ReplaceAsciiByString(authorNode.InnerText.Trim()));

            HtmlNode lastPostDateNode = subjectNode.SelectSingleNode("./td[@class=\"lastPostDate\"]");
            string lastPostDate = lastPostDateNode == null ? "" : lastPostDateNode.InnerText.Trim();

            // One Excel row per subject.
            Dictionary<string, string> commentF2vs = new Dictionary<string, string>();
            commentF2vs.Add("subjectIndex", subjectIndex.ToString());
            commentF2vs.Add("detailPageUrl", detailPageUrl);
            commentF2vs.Add("detailPageName", detailPageName);
            commentF2vs.Add("id", detailPageName);
            commentF2vs.Add("title", title);
            commentF2vs.Add("googleUrl", detailPageUrl);
            commentF2vs.Add("creator", author);
            commentF2vs.Add("createDate", lastPostDate);
            subjectEW.AddRow(commentF2vs);

            // One XML row per (subject, comment user) pair, all initially unpublished.
            foreach (string user in commentUsers)
            {
                Dictionary<string, string> commentXF2vs = new Dictionary<string, string>();
                commentXF2vs.Add("index", subjectIndex.ToString());
                commentXF2vs.Add("id", user + "_" + detailPageName);
                commentXF2vs.Add("title", title);
                commentXF2vs.Add("url", detailPageName + ".html");
                commentXF2vs.Add("googleUrl", detailPageUrl);
                commentXF2vs.Add("creator", author);
                commentXF2vs.Add("createDate", lastPostDate);
                commentXF2vs.Add("commentPublisher", user);
                commentXF2vs.Add("commentPublished", "No");
                subjectXW.AddRow(commentXF2vs);
            }

            subjectIndex++;
        }
    }

    subjectXW.SaveToDisk();
    subjectEW.SaveToDisk();
    return succeed;
}
/// <summary>
/// Builds the per-subject exports from the locally saved detail pages.
/// For every list-sheet row that is not flagged as given up, the saved page is parsed and:
/// each message row is written to "_AllComments.xlsx"; a bordered copy of the page is saved
/// under "export\html"; one row is added to "_List.xlsx" (with a file hyperlink to that copy);
/// three rows (one per comment user) are added to "_List.xml"; and the ASCII-only title is
/// written to "titleText\{index}.txt" when it is non-empty.
/// </summary>
/// <param name="listSheet">Sheet whose rows supply index, URL, title, creator and create date.</param>
/// <returns>
/// Always <c>true</c>. A failure while processing a page is logged and that page is skipped.
/// NOTE(review): the flag is never set to false on a logged error - confirm callers expect that.
/// </returns>
private bool GenerateNewPage(IListSheet listSheet)
{
    bool succeed = true;
    string exportDir = this.RunPage.GetExportDir();
    string pageSourceDir = this.RunPage.GetDetailSourceFileDir();
    string outputTitleTextDir = Path.Combine(exportDir, "titleText");

    // The Excel list and the XML list share the same column layout.
    Dictionary<string, int> subjectColumnDic = CommonUtil.InitStringIndexDic(new string[] {
        "index", "id", "title", "url", "googleUrl", "creator", "createDate",
        "messageCount", "authorCount", "commentPublisher", "commentPublished" });
    string subjectFileExcelPath = Path.Combine(exportDir, this.RunPage.Project.Name + "_List.xlsx");
    ExcelWriter subjectEW = new ExcelWriter(subjectFileExcelPath, "List", subjectColumnDic);
    string subjectFileXmlPath = Path.Combine(exportDir, this.RunPage.Project.Name + "_List.xml");
    XmlWriter subjectXW = new XmlWriter(subjectFileXmlPath, subjectColumnDic);

    Dictionary<string, int> allCommentsColumnDic = new Dictionary<string, int>();
    allCommentsColumnDic.Add("subjectIndex", 0);
    allCommentsColumnDic.Add("googleUrl", 1);
    allCommentsColumnDic.Add("creator", 2);
    allCommentsColumnDic.Add("author", 3);
    allCommentsColumnDic.Add("lastPostDate", 4);
    string allCommentsFilePath = Path.Combine(exportDir, this.RunPage.Project.Name + "_AllComments.xlsx");
    ExcelWriter allCommentsListEW = new ExcelWriter(allCommentsFilePath, "List", allCommentsColumnDic);

    string[] commentUsers = new string[] { "sunhua", "shizhengzhong", "liyuzhu" };

    for (int i = 0; i < listSheet.RowCount; i++)
    {
        Dictionary<string, string> row = listSheet.GetRow(i);
        if ("Y".Equals(row[SysConfig.GiveUpGrabFieldName]))
        {
            continue;
        }

        string index = row[SysConfig.ListPageIndexFieldName].PadLeft(4, '0');
        string googleUrl = row[SysConfig.DetailPageUrlFieldName];
        string title = row["title"];
        string creator = row["creator"];
        string createDate = row["createDate"];
        string localFilePath = this.RunPage.GetFilePath(googleUrl, pageSourceDir);

        try
        {
            // Read the page inside a using block so the file handle is always released,
            // not only when an exception occurs.
            string webPageHtml;
            using (StreamReader reader = new StreamReader(localFilePath))
            {
                webPageHtml = reader.ReadToEnd();
            }

            HtmlAgilityPack.HtmlDocument htmlDoc = new HtmlAgilityPack.HtmlDocument();
            htmlDoc.LoadHtml(webPageHtml);

            // NOTE(review): the parsed "of N messages" value is not written anywhere (the
            // messageCount column uses the table row count below). The parse is kept because
            // a page lacking this marker currently fails here and is skipped with a log entry.
            string messagesNumStr = htmlDoc.DocumentNode.SelectSingleNode("//body/i").InnerText;
            int ofIndex = messagesNumStr.IndexOf("of");
            int messagesIndex = messagesNumStr.IndexOf("messages");
            string messageCount = messagesNumStr.Substring(ofIndex + 2, messagesIndex - ofIndex - 2).Trim();

            List<string> authorNames = new List<string>();
            HtmlNodeCollection messageNodes = htmlDoc.DocumentNode.SelectNodes("//body/table/tr");
            if (messageNodes != null)
            {
                for (int j = 0; j < messageNodes.Count; j++)
                {
                    HtmlNode messageNode = messageNodes[j];
                    HtmlNode authorNode = messageNode.SelectSingleNode("./td[2]");
                    HtmlNode lastPostDateNode = messageNode.SelectSingleNode("./td[3]");
                    if (authorNode == null)
                    {
                        continue;
                    }

                    string author = HttpUtility.HtmlDecode(authorNode.InnerText).Trim();
                    if (j == 0)
                    {
                        // The author of the first table row overrides the creator from the sheet.
                        creator = author;
                    }
                    if (!authorNames.Contains(author))
                    {
                        authorNames.Add(author);
                    }
                    string lastPostDate = lastPostDateNode == null ? "" : lastPostDateNode.InnerText;

                    Dictionary<string, string> commentF2vs = new Dictionary<string, string>();
                    commentF2vs.Add("subjectIndex", index);
                    commentF2vs.Add("googleUrl", googleUrl);
                    commentF2vs.Add("creator", creator);
                    commentF2vs.Add("author", author);
                    commentF2vs.Add("lastPostDate", lastPostDate);
                    allCommentsListEW.AddRow(commentF2vs);
                }
            }

            // Modify the HTML content: give the table a visible border.
            // SetAttributeValue creates the attribute when it is missing, whereas indexing
            // Attributes["border"] yields null for a table that has no border attribute.
            HtmlNode tableNode = htmlDoc.DocumentNode.SelectSingleNode("//body/table");
            tableNode.SetAttributeValue("border", "1");

            int localHtmlUrlStartIndex = googleUrl.IndexOf("/idempiere/") + "/idempiere/".Length;
            string htmlLocalName = CommonUtil.ProcessFileName(googleUrl.Substring(localHtmlUrlStartIndex), "_") + ".html";
            string htmlLocalUrl = Path.Combine(Path.GetDirectoryName(pageSourceDir), "export\\html\\" + htmlLocalName);
            CommonUtil.CreateFileDirectory(htmlLocalUrl);
            htmlDoc.Save(htmlLocalUrl);

            // messageNodes is null when the table has no rows; report 0 instead of failing.
            int messageRowCount = messageNodes == null ? 0 : messageNodes.Count;

            Dictionary<string, string> f2vs = new Dictionary<string, string>();
            f2vs.Add("index", index);
            f2vs.Add("title", title);
            f2vs.Add("googleUrl", googleUrl);
            f2vs.Add("url", htmlLocalName);
            f2vs.Add("creator", creator);
            f2vs.Add("createDate", createDate);
            f2vs.Add("messageCount", messageRowCount.ToString());
            f2vs.Add("authorCount", authorNames.Count.ToString());
            f2vs.Add("commentPublisher", "");
            f2vs.Add("commentPublished", "");
            f2vs.Add("id", "");
            IRow newPageListRow = subjectEW.AddRow(f2vs);

            // The same dictionary is reused for the XML rows: one row per comment user,
            // each marked as not yet published.
            f2vs["commentPublished"] = "No";
            foreach (string user in commentUsers)
            {
                f2vs["commentPublisher"] = user;
                f2vs["id"] = user + index;
                subjectXW.AddRow(f2vs);
            }

            // Turn the Excel "url" cell into a file hyperlink to the saved HTML copy.
            ICell localUrlCell = subjectEW.GetCell(newPageListRow, "url", true);
            IHyperlink hyperlink = new XSSFHyperlink(HyperlinkType.File);
            hyperlink.Address = "html/" + htmlLocalName;
            localUrlCell.Hyperlink = hyperlink;

            // Save the message text: the title round-tripped through ASCII, then trimmed.
            string msg = Encoding.UTF8.GetString(Encoding.Convert(Encoding.ASCII, Encoding.UTF8, Encoding.ASCII.GetBytes(title))).Trim();
            if (msg.Length > 0)
            {
                string titleTextFilePath = Path.Combine(outputTitleTextDir, index + ".txt");
                CommonUtil.CreateFileDirectory(titleTextFilePath);
                // UTF-8 without BOM; using guarantees the writer is flushed and closed.
                using (StreamWriter writer = new StreamWriter(titleTextFilePath, false, new UTF8Encoding(false)))
                {
                    writer.Write(msg);
                    writer.Flush();
                }
            }
        }
        catch (Exception ex)
        {
            // Best effort: log the failure for this page and continue with the next row.
            this.RunPage.InvokeAppendLogText("读取出错. " + ex.Message + " LocalPath = " + localFilePath, LogLevelType.Error, true);
        }
    }

    subjectXW.SaveToDisk();
    subjectEW.SaveToDisk();
    allCommentsListEW.SaveToDisk();
    return succeed;
}