/// <summary>
/// Runs the base implementation for the row and then converts the row's
/// downloaded source document into a text file.
/// </summary>
/// <param name="listRow">
/// Values of the current list row. The keys "detailPageUrl" and "adjunctType"
/// are read; a missing key raises <see cref="KeyNotFoundException"/>.
/// </param>
/// <remarks>
/// Exceptions raised by the converters propagate unchanged, with their
/// original stack trace intact.
/// </remarks>
public override void GetDataByOtherAccessType(Dictionary<string, string> listRow)
{
    string pageSourceDir = this.RunPage.GetDetailSourceFileDir();
    base.GetDataByOtherAccessType(listRow);

    string detailUrl = listRow["detailPageUrl"];
    // Invariant lower-casing keeps the type match independent of the current culture.
    string adjunctType = listRow["adjunctType"].ToLowerInvariant().Trim();

    // The source file is looked up under GongGaoSourceFileDir; the converted text
    // is written under the detail-page source directory.
    string destFilePath = this.RunPage.GetFilePath(detailUrl, pageSourceDir);
    string sourceFilePath = this.RunPage.GetFilePath(detailUrl, this.GongGaoSourceFileDir);

    switch (adjunctType)
    {
        case "pdf":
            Pdf2Txt.Pdf2TxtByITextSharp(sourceFilePath, destFilePath, true);
            break;

        // "txt" and any unrecognized type are deliberately run through the HTML
        // converter as well, instead of being copied verbatim or rejected.
        case "html":
        case "txt":
        default:
            Html2Txt.Html2TxtByHtmlAgilityPack(sourceFilePath, destFilePath, true, "gb2312");
            break;
    }
}
/// <summary>
/// Converts the PDF behind every non-abandoned row of <paramref name="listSheet"/>
/// into text and records one row per paper in the PDF-page Excel workbook.
/// </summary>
/// <param name="listSheet">Sheet whose rows describe the detail pages to process.</param>
/// <remarks>
/// For each row an "allText.txt" file is produced in the row's export directory
/// unless it already exists. The workbook is saved only after all rows have been
/// processed; if any row fails, the exception propagates (with its original
/// stack trace) and <c>SaveToDisk</c> is not reached.
/// </remarks>
private void ConvertToTxt(IListSheet listSheet)
{
    string sourceDir = this.RunPage.GetDetailSourceFileDir();
    string exportDir = this.RunPage.GetExportDir();
    string pdfUrlFilePath = Path.Combine(exportDir, "论文_ScienceDirect_论文PDF页.xlsx");
    ExcelWriter pdfUrlWriter = this.GetDownloadPdfExcelWriter(pdfUrlFilePath);

    for (int i = 0; i < listSheet.RowCount; i++)
    {
        // Progress log: "<rows handled so far>/<total rows>".
        this.RunPage.InvokeAppendLogText("已转换" + i.ToString() + "/" + listSheet.RowCount.ToString(), LogLevelType.System, true);

        Dictionary<string, string> listRow = listSheet.GetRow(i);
        string pageUrl = listRow[SysConfig.DetailPageUrlFieldName];
        bool giveUp = "Y".Equals(listRow[SysConfig.GiveUpGrabFieldName]);
        if (giveUp)
        {
            // Rows flagged as given up are skipped entirely.
            continue;
        }

        try
        {
            string textFileDir = this.RunPage.GetReadFilePath(pageUrl, exportDir);
            string fullTextFilePath = Path.Combine(textFileDir, "allText.txt");

            // An existing allText.txt means this row was converted before; do not redo it.
            if (!File.Exists(fullTextFilePath))
            {
                HtmlAgilityPack.HtmlDocument htmlDoc = this.RunPage.GetLocalHtmlDocument(listSheet, i);

                // NOTE(review): SelectSingleNode yields null when nothing matches, in which
                // case the next line throws and the row is reported by the catch below.
                HtmlNode linkNode = htmlDoc.DocumentNode.SelectSingleNode("//div[@id=\"redirect-message\"]/p/a");
                string pdfUrl = linkNode.GetAttributeValue("href", "");
                string pdfFilePath = this.RunPage.GetFilePath(pdfUrl, sourceDir);

                if (!Directory.Exists(textFileDir))
                {
                    Directory.CreateDirectory(textFileDir);
                }

                // The PDF is split into part files, each part is converted on its own,
                // and the resulting texts are concatenated in part order.
                string[] pdfPartFilePaths = PdfSpliter.ExtractPages(pdfFilePath, textFileDir);
                StringBuilder fullText = new StringBuilder();
                for (int j = 0; j < pdfPartFilePaths.Length; j++)
                {
                    string pdfPartFilePath = pdfPartFilePaths[j];
                    string textPartFilePath = Path.Combine(textFileDir, (j + 1).ToString() + ".txt");
                    try
                    {
                        Pdf2Txt.Pdf2TxtByITextSharp(pdfPartFilePath, textPartFilePath, true);
                        string text = FileHelper.GetTextFromFile(textPartFilePath, Encoding.UTF8);
                        fullText.Append(text);
                    }
                    catch (Exception pdf2TxtEx)
                    {
                        if (pdf2TxtEx.Message.Contains("System.FormatException"))
                        {
                            // A part whose failure message mentions a format exception is
                            // logged and left out of the full text; other parts continue.
                            this.RunPage.InvokeAppendLogText("转换txt失败, pdfPartFilePath = " + pdfPartFilePath, LogLevelType.Error, true);
                        }
                        else
                        {
                            // Rethrow without resetting the stack trace.
                            throw;
                        }
                    }
                }

                FileHelper.SaveTextToFile(fullText.ToString(), fullTextFilePath, Encoding.UTF8);
            }

            // The row is recorded whether the text was just produced or already existed.
            Dictionary<string, string> pdfUrlRow = new Dictionary<string, string>();
            pdfUrlRow.Add("publication", listRow["publication"]);
            pdfUrlRow.Add("host", listRow["host"]);
            pdfUrlRow.Add("title", listRow["title"]);
            pdfUrlRow.Add("authors", listRow["authors"]);
            pdfUrlRow.Add("abstracts", listRow["abstracts"]);
            pdfUrlRow.Add("refs", listRow["refs"]);
            pdfUrlRow.Add("pageUrl", pageUrl);
            pdfUrlRow.Add("txtUrl", fullTextFilePath);
            pdfUrlWriter.AddRow(pdfUrlRow);
        }
        catch (Exception)
        {
            // Log which page failed, then let the original exception (and its
            // stack trace) propagate to the caller.
            string filePath = this.RunPage.GetFilePath(pageUrl, sourceDir);
            this.RunPage.InvokeAppendLogText("错误,filePath = " + filePath + ", pageUrl = " + pageUrl, LogLevelType.Error, true);
            throw;
        }
    }

    pdfUrlWriter.SaveToDisk();
}