/// <summary>
/// Downloads the PDF at <paramref name="url"/> and returns the concatenated text of all its pages.
/// </summary>
/// <param name="url">Address of the PDF document to download.</param>
/// <param name="saveTo">
/// Optional directory. When it is not null/blank and the download produced data, the raw bytes are
/// also written to the path returned by <c>GeneratePath(saveTo, url)</c>.
/// </param>
/// <returns>
/// The text extracted from every page, in page order, or <see cref="String.Empty"/> if any exception
/// is thrown while downloading or parsing.
/// </returns>
public static String Extract(String url, String saveTo = null)
{
    try
    {
        byte[] data;

        // WebClient is IDisposable; release it as soon as the download has finished.
        using (WebClient wc = new WebClient())
        {
            data = wc.DownloadData(url);
        }

        // If a save location was supplied, write the downloaded bytes to a file.
        if (!String.IsNullOrWhiteSpace(saveTo) && data != null && data.Length > 0)
        {
            // CreateDirectory does nothing when the directory already exists.
            Directory.CreateDirectory(saveTo);

            String file = GeneratePath(saveTo, url);
            try
            {
                // CreateNew throws IOException when the file already exists, so an
                // existing copy is never overwritten.
                using (FileStream fs = new FileStream(file, FileMode.CreateNew))
                {
                    fs.Write(data, 0, data.Length);
                }
            }
            catch (IOException)
            {
                // Saving is best effort: a failure to write the local copy (for example
                // because the file already exists) must not prevent text extraction.
            }
        }

        // The using statement disposes the reader, so no explicit Close() is needed.
        using (PdfReader reader = new PdfReader(data))
        {
            StringBuilder sb = new StringBuilder();
            for (int i = 1; i <= reader.NumberOfPages; i++)
            {
                // Use a fresh strategy per page. A SimpleTextExtractionStrategy keeps
                // appending to its internal buffer, so sharing one instance across pages
                // makes each page's result repeat the text of all previous pages.
                ITextExtractionStrategy strategy = new SimpleTextExtractionStrategy();
                sb.Append(PdfTextExtractor.GetTextFromPage(reader, i, strategy));
                reader.ReleasePage(i);
            }
            return sb.ToString();
        }
    }
    catch (Exception)
    {
        // Callers receive an empty string for any download or parsing failure.
        return String.Empty;
    }
}
/**
 * Collects the bookmarks found beneath <CODE>outline</CODE>.
 * A table mapping each page's original reference number to its 1-based page number
 * is built first and handed to <CODE>BookmarkDepth</CODE>, which produces the result.
 * @param reader the document
 * @param outline the outline dictionary to get bookmarks from
 * @param includeRoot when true the traversal starts at <CODE>outline</CODE> itself;
 * otherwise it starts at the entry stored under <CODE>PdfName.FIRST</CODE>
 * @return the value returned by <CODE>BookmarkDepth</CODE>, or <CODE>null</CODE> when
 * <CODE>outline</CODE> is <CODE>null</CODE>
 */
public static IList<Dictionary<String, Object>> GetBookmark(PdfReader reader, PdfDictionary outline, bool includeRoot)
{
    if (outline == null)
    {
        return null;
    }

    // Page original-reference number -> 1-based page number.
    IntHashtable pageNumbersByRef = new IntHashtable();
    int pageCount = reader.NumberOfPages;
    int pageNo = 1;
    while (pageNo <= pageCount)
    {
        int refNumber = reader.GetPageOrigRef(pageNo).Number;
        pageNumbersByRef[refNumber] = pageNo;
        reader.ReleasePage(pageNo);
        ++pageNo;
    }

    // Start either at the supplied outline or at its first child.
    PdfDictionary start = includeRoot
        ? outline
        : (PdfDictionary)PdfReader.GetPdfObjectRelease(outline.Get(PdfName.FIRST));

    return BookmarkDepth(reader, start, pageNumbersByRef, includeRoot);
}
/**
 * Returns the bookmarks located under <CODE>outline</CODE>.
 * The actual list is assembled by <CODE>BookmarkDepth</CODE>; this method only prepares
 * the lookup from page reference numbers to page numbers and selects the start node.
 * @param reader the document
 * @param outline the outline dictionary to get bookmarks from
 * @param includeRoot indicates whether <CODE>outline</CODE> itself is passed to
 * <CODE>BookmarkDepth</CODE> (true) or its <CODE>PdfName.FIRST</CODE> entry (false)
 * @return whatever <CODE>BookmarkDepth</CODE> returns, or <CODE>null</CODE> if
 * <CODE>outline</CODE> is <CODE>null</CODE>
 */
public static IList<Dictionary<String, Object>> GetBookmark(PdfReader reader, PdfDictionary outline, bool includeRoot)
{
    if (outline == null)
    {
        return null;
    }

    // Build the table: original page reference number -> 1-based page number.
    IntHashtable pageLookup = new IntHashtable();
    int total = reader.NumberOfPages;
    for (int page = 1; page <= total; page++)
    {
        int origRefNumber = reader.GetPageOrigRef(page).Number;
        pageLookup[origRefNumber] = page;
        reader.ReleasePage(page);
    }

    if (!includeRoot)
    {
        // Skip the root and begin with its first child entry.
        PdfObject firstChild = PdfReader.GetPdfObjectRelease(outline.Get(PdfName.FIRST));
        return BookmarkDepth(reader, (PdfDictionary)firstChild, pageLookup, false);
    }

    return BookmarkDepth(reader, outline, pageLookup, true);
}
/**
 * Returns the bookmarks of a document, starting from the catalog's
 * <CODE>PdfName.OUTLINES</CODE> entry.
 * The list itself is produced by <CODE>BookmarkDepth</CODE>, which receives the first
 * child of the outlines dictionary and a table mapping page reference numbers to
 * 1-based page numbers.
 * @param reader the document
 * @return the value returned by <CODE>BookmarkDepth</CODE>, or <CODE>null</CODE> when the
 * catalog has no outlines entry or that entry is not a dictionary
 */
public static IList<Dictionary<String, Object>> GetBookmark(PdfReader reader)
{
    PdfDictionary root = reader.Catalog;
    PdfObject outlinesObj = PdfReader.GetPdfObjectRelease(root.Get(PdfName.OUTLINES));

    // Only proceed when an outlines dictionary is actually present.
    bool hasOutlines = outlinesObj != null && outlinesObj.IsDictionary();
    if (!hasOutlines)
    {
        return null;
    }
    PdfDictionary outlineDict = (PdfDictionary)outlinesObj;

    // Original page reference number -> 1-based page number.
    IntHashtable pageLookup = new IntHashtable();
    int pageCount = reader.NumberOfPages;
    for (int page = 1; page <= pageCount; page++)
    {
        int origRefNumber = reader.GetPageOrigRef(page).Number;
        pageLookup[origRefNumber] = page;
        reader.ReleasePage(page);
    }

    PdfObject firstEntry = PdfReader.GetPdfObjectRelease(outlineDict.Get(PdfName.FIRST));
    return BookmarkDepth(reader, (PdfDictionary)firstEntry, pageLookup);
}