/// <summary>
/// Collects the non-empty text lines of every page in <c>_document</c>.
/// </summary>
/// <remarks>
/// Each page is visited with its own text absorber. If visiting a page throws,
/// the exception is logged and whatever text the absorber holds is still used.
/// The page text is split on CR+LF line breaks only.
/// </remarks>
/// <returns>All non-empty lines, in page order.</returns>
internal string[] GetParagraphs()
{
    var collected = new List<string>();

    foreach (AsposePdf.Page currentPage in _document.Pages)
    {
        var absorber = new AsposePdf.Text.TextAbsorber();
        try
        {
            currentPage.Accept(absorber);
        }
        catch (Exception ex)
        {
            // Best effort: a page that fails to parse must not abort the whole document.
            Logger.LogError(ex);
        }

        if (!String.IsNullOrEmpty(absorber.Text))
        {
            // Todo: There should be a better way to get the text per paragraph!
            foreach (string line in Regex.Split(absorber.Text, "\r\n"))
            {
                // Drop blank lines as they are gathered.
                if (!String.IsNullOrEmpty(line))
                {
                    collected.Add(line);
                }
            }
        }
    }

    return collected.ToArray();
}
/// <summary>
/// Reads the text contained in a PDF file.
/// </summary>
/// <param name="filename">Path of the PDF file to open.</param>
/// <returns>The text gathered by a text absorber after visiting all pages of the document.</returns>
public static string GetTextFromPdf(string filename)
{
    var textAbsorber = new Aspose.Pdf.Text.TextAbsorber();

    // Document is disposable; release it (and the file it opened) as soon as
    // the text has been extracted instead of waiting for finalization.
    using (var pdf = new Aspose.Pdf.Document(filename))
    {
        pdf.Pages.Accept(textAbsorber);
        return textAbsorber.Text;
    }
}
/// <summary>
/// Extracts the text of the leading pages of a PDF file.
/// </summary>
/// <remarks>
/// The page limit is read from the "PDFMaxPage" application setting via
/// <c>Util.GetAppSetting</c>, with <c>PDFMaxPage</c> passed as the second
/// argument (presumably the fallback value; verify against <c>Util</c>).
/// </remarks>
/// <param name="filepath">Path of the PDF file to open.</param>
/// <returns>The text of each processed page, concatenated in page order.</returns>
public static string GetPdfContent(string filepath)
{
    int maxPages = Util.GetAppSetting("PDFMaxPage", PDFMaxPage.ToString()).ToInt();
    StringBuilder sb = new StringBuilder();

    // Document is disposable; make sure the opened file is released.
    using (Aspose.Pdf.Document doc = new Aspose.Pdf.Document(filepath))
    {
        // Page indexing is 1-based here, as in the original loop.
        for (int i = 1; i <= doc.Pages.Count && i <= maxPages; i++)
        {
            // Use a fresh absorber per page: a shared absorber that keeps the
            // text of earlier visits would make every previous page be appended
            // again on each iteration.
            Aspose.Pdf.Text.TextAbsorber pageAbsorber = new Aspose.Pdf.Text.TextAbsorber();
            doc.Pages[i].Accept(pageAbsorber);
            sb.Append(pageAbsorber.Text);
        }
    }

    return sb.ToString();
}