///////////////////////////////////////////////////////////////////////
// ParsePage
///////////////////////////////////////////////////////////////////////
/// <summary>
/// Acquires the page map of <paramref name="page"/>, creates its elements
/// and hands the root element to <c>ParseElement(container, outDir)</c>.
/// </summary>
/// <param name="pdfix">Engine instance; only used to read the error text.</param>
/// <param name="page">Page whose page map is parsed.</param>
/// <param name="outDir">Output directory, passed through to <c>ParseElement</c>.</param>
/// <exception cref="Exception">
/// Thrown with <c>pdfix.GetError()</c> as message when the page map cannot be
/// acquired, its elements cannot be created, or it has no root element.
/// </exception>
private static void ParsePage(Pdfix pdfix, PdfPage page, String outDir)
{
    // get pageMap for the current page
    PdePageMap pageMap = page.AcquirePageMap();
    if (pageMap == null)
    {
        throw new Exception(pdfix.GetError());
    }

    // From here on the page map is held by this method, so it has to be
    // released on every exit path, including the throws below.
    try
    {
        if (!pageMap.CreateElements(null, null))
        {
            throw new Exception(pdfix.GetError());
        }

        // get page container
        PdeElement container = pageMap.GetElement();
        if (container == null)
        {
            throw new Exception(pdfix.GetError());
        }

        // parse children recursively
        ParseElement(container, outDir);
    }
    finally
    {
        pageMap.Release();
    }
}
///////////////////////////////////////////////////////////////////////
// ParseTable
///////////////////////////////////////////////////////////////////////
/// <summary>
/// Writes <paramref name="table"/> as CSV into a new file
/// <c>ExtractTables&lt;tableIndex&gt;.csv</c> under <paramref name="outDir"/>
/// and post-increments the shared <c>tableIndex</c> counter.
/// </summary>
/// <remarks>
/// Each row becomes one line terminated by "\n"; columns are separated by ",".
/// A cell is emitted as a double-quoted field only when its row span and
/// column span are both non-zero and it has children; the text children are
/// written through <c>ParseText</c>, with a single space between children.
/// Every other cell position (including a missing cell) yields an empty field,
/// so the column positions stay aligned across rows.
/// </remarks>
/// <param name="table">Table element to export.</param>
/// <param name="outDir">Directory the CSV file is created in.</param>
private static void ParseTable(PdeTable table, String outDir)
{
    // "using" closes the file even when a cell fails to parse part-way
    // through the table; the original only closed it on success.
    using (StreamWriter file = new System.IO.StreamWriter(outDir + "\\ExtractTables" + tableIndex++ + ".csv"))
    {
        int rowCount = table.GetNumRows();
        int colCount = table.GetNumCols();

        for (int row = 0; row < rowCount; row++)
        {
            for (int col = 0; col < colCount; col++)
            {
                PdeCell cell = (PdeCell)table.GetCell(row, col);
                if (cell != null)
                {
                    int rowSpan = cell.GetRowSpan();
                    int colSpan = cell.GetColSpan();
                    int count = cell.GetNumChildren();

                    if ((rowSpan != 0) && (colSpan != 0) && (count > 0))
                    {
                        file.Write("\"");
                        for (int i = 0; i < count; i++)
                        {
                            PdeElement child = cell.GetChild(i);
                            if (child != null && (child.GetType_() == PdfElementType.kPdeText))
                            {
                                ParseText((PdeText)child, file, false);
                            }
                            if (i < count - 1)
                            {
                                file.Write(" ");
                            }
                        }
                        file.Write("\"");
                    }
                }

                // Separator goes BETWEEN columns only. It is written even for a
                // missing cell so that later columns do not shift to the left.
                if (col < colCount - 1)
                {
                    file.Write(",");
                }
            }

            // one line per table row
            file.Write("\n");
        }
    }
}
///////////////////////////////////////////////////////////////////////
// ParseElement
///////////////////////////////////////////////////////////////////////
/// <summary>
/// Walks the element tree rooted at <paramref name="element"/>. A table
/// element is passed to <c>ParseTable</c> and not descended into; for any
/// other element type each child is visited recursively in index order.
/// </summary>
/// <param name="element">Root of the subtree to walk.</param>
/// <param name="outDir">Output directory, passed through to <c>ParseTable</c>.</param>
private static void ParseElement(PdeElement element, String outDir)
{
    // Tables are handled as a whole and end the recursion on this branch.
    if (element.GetType_() == PdfElementType.kPdeTable)
    {
        ParseTable((PdeTable)element, outDir);
        return;
    }

    // Everything else: recurse into the children. The child count is read
    // once, before the first child is visited.
    int childTotal = element.GetNumChildren();
    int index = 0;
    while (index < childTotal)
    {
        ParseElement(element.GetChild(index), outDir);
        index++;
    }
}
///////////////////////////////////////////////////////////////////////
// ParseElement
///////////////////////////////////////////////////////////////////////
/// <summary>
/// Walks the element tree rooted at <paramref name="element"/>. A text
/// element is passed to <c>ParseText</c> and not descended into; for any
/// other element type each child is visited recursively in index order.
/// </summary>
/// <param name="element">Root of the subtree to walk.</param>
/// <param name="file">Writer passed through to <c>ParseText</c>.</param>
private static void ParseElement(PdeElement element, StreamWriter file)
{
    bool isText = element.GetType_() == PdfElementType.kPdeText;
    if (isText)
    {
        // Text elements are written out directly; nothing below them is visited.
        ParseText((PdeText)element, file);
        return;
    }

    // Non-text element: visit the children. The child count is read once,
    // before the first child is visited.
    int childTotal = element.GetNumChildren();
    int index = 0;
    while (index < childTotal)
    {
        PdeElement child = element.GetChild(index);
        ParseElement(child, file);
        index++;
    }
}
///////////////////////////////////////////////////////////////////////
// ParsePage
///////////////////////////////////////////////////////////////////////
/// <summary>
/// Acquires the page map of <paramref name="page"/> and hands its root
/// element to <c>ParseElement(container, file)</c>.
/// </summary>
/// <param name="pdfix">Engine instance; only used to read the error text.</param>
/// <param name="page">Page whose page map is parsed.</param>
/// <param name="file">Writer passed through to <c>ParseElement</c>.</param>
/// <exception cref="Exception">
/// Thrown with <c>pdfix.GetError()</c> as message when the page map cannot be
/// acquired or it has no root element.
/// </exception>
private static void ParsePage(Pdfix pdfix, PdfPage page, StreamWriter file)
{
    // get pageMap for the current page
    PdePageMap pageMap = page.AcquirePageMap(null, IntPtr.Zero);
    if (pageMap == null)
    {
        throw new Exception(pdfix.GetError());
    }

    // The page map must be released on every exit path, not only when
    // parsing succeeds.
    try
    {
        // get page container
        PdeElement container = pageMap.GetElement();
        if (container == null)
        {
            throw new Exception(pdfix.GetError());
        }

        // parse children recursively
        ParseElement(container, file);
    }
    finally
    {
        pageMap.Release();
    }
}
/// <summary>
/// Opens <paramref name="openPath"/>, rebuilds its tags, and re-tags the
/// region of the first table found in the structure tree before saving the
/// document to <paramref name="savePath"/>.
/// </summary>
/// <remarks>
/// Steps, in order: remove existing tags, auto-tag the document, locate the
/// first table via <c>GetFirstTable</c>, read its bounding box via
/// <c>GetStructElementBBox</c>, remove all of the table's children, create a
/// <c>kPdeImage</c> page-map element on page 0 with that bounding box and an
/// alternate text, set the template property "ignore_tags" to 1, create the
/// page elements, add tags under the table element, change the table
/// element's type to "Sect", and save.
/// </remarks>
/// <exception cref="Exception">
/// Thrown when any SDK step fails or when the document has no table. The
/// document is closed and the acquired page maps are released before the
/// exception leaves this method.
/// </exception>
public static void Run(
    String openPath,                            // source PDF document
    String savePath                             // dest PDF document
    )
{
    Pdfix pdfix = PdfixEngine.Instance;

    PdfDoc doc = pdfix.OpenDoc(openPath, "");
    if (doc == null)
    {
        throw new Exception(pdfix.GetError());
    }

    // Both page maps are tracked here so the finally block can release
    // whichever ones were actually acquired.
    PdePageMap page_map = null;
    PdePageMap pageMap = null;
    try
    {
        // cleanup any previous structure tree
        if (!doc.RemoveTags(null, null))
        {
            throw new Exception(pdfix.GetErrorType().ToString());
        }

        // autotag document first
        if (!doc.AddTags(null, null))
        {
            throw new Exception(pdfix.GetErrorType().ToString());
        }

        // get the struct tree
        PdsStructTree struct_tree = doc.GetStructTree();
        if (struct_tree == null)
        {
            throw new Exception(pdfix.GetErrorType().ToString());
        }

        PdsStructElement table = GetFirstTable(struct_tree);
        if (table == null)
        {
            throw new Exception("No table found.");
        }

        PdfRect bbox = new PdfRect();
        GetStructElementBBox(table, ref bbox);

        // remove all items from the table to make it untagged content
        // (iterating backwards keeps the remaining indices valid)
        for (int i = table.GetNumChildren() - 1; i >= 0; i--)
        {
            table.RemoveChild(i);
        }

        // tag page
        // NOTE(review): the page acquired here is never released in this
        // method; confirm whether the SDK expects an explicit release.
        PdfPage page = doc.AcquirePage(0);
        if (page == null)
        {
            throw new Exception(pdfix.GetError());
        }

        page_map = page.AcquirePageMap();
        if (page_map == null)
        {
            throw new Exception(pdfix.GetError());
        }

        PdeElement elem = page_map.CreateElement(PdfElementType.kPdeImage, null);
        if (elem == null)
        {
            throw new Exception(pdfix.GetError());
        }
        elem.SetBBox(bbox);
        elem.SetAlt("This is image caption");

        // prepare document template to ignore already tagged content
        var doc_prelight = doc.GetTemplate();
        doc_prelight.SetProperty("ignore_tags", 1);

        // re-tag non-tagged page content
        pageMap = page.AcquirePageMap();
        if (pageMap == null)
        {
            throw new Exception(pdfix.GetError());
        }
        if (!pageMap.CreateElements(null, null))
        {
            throw new Exception(pdfix.GetError());
        }

        if (!page_map.AddTags(table, null, null))
        {
            throw new Exception(pdfix.GetErrorType().ToString());
        }

        // update the table element type
        if (!table.SetType("Sect"))
        {
            throw new Exception(pdfix.GetErrorType().ToString());
        }

        if (!doc.Save(savePath, Pdfix.kSaveFull))
        {
            throw new Exception(pdfix.GetError());
        }
    }
    finally
    {
        // One Release per successful AcquirePageMap, then close the document.
        if (pageMap != null)
        {
            pageMap.Release();
        }
        if (page_map != null)
        {
            page_map.Release();
        }
        doc.Close();
    }
}
/// <summary>
/// Creates and authorizes a <c>Pdfix</c> instance, opens
/// <paramref name="openPath"/>, rebuilds its tags, and re-tags the region of
/// the first table found in the structure tree before saving the document to
/// <paramref name="savePath"/>.
/// </summary>
/// <remarks>
/// Steps, in order: remove existing tags, auto-tag the document, locate the
/// first table via <c>GetFirstTable</c>, read its bounding box via
/// <c>GetStructElementBBox</c>, remove all of the table's kids, create a
/// <c>kPdeImage</c> page-map element on page 0 with that bounding box and an
/// alternate text, set the template property "ignore_tags" to 1, acquire the
/// page elements, add tags under the table element, change the table
/// element's type to "Sect", and save.
/// </remarks>
/// <exception cref="Exception">
/// Thrown when initialization, authorization or any SDK step fails, or when
/// the document has no table. An opened document is closed and the
/// <c>Pdfix</c> instance is destroyed before the exception leaves this method.
/// </exception>
public static void Run(
    String email,                               // authorization email
    String licenseKey,                          // authorization license key
    String openPath,                            // source PDF document
    String savePath                             // dest PDF document
    )
{
    pdfix = new Pdfix();
    if (pdfix == null)
    {
        throw new Exception("Pdfix initialization fail");
    }

    // Outer try/finally: the engine instance is destroyed on every exit path.
    try
    {
        if (!pdfix.Authorize(email, licenseKey))
        {
            throw new Exception(pdfix.GetErrorType().ToString());
        }

        PdfDoc doc = pdfix.OpenDoc(openPath, "");
        if (doc == null)
        {
            throw new Exception(pdfix.GetError());
        }

        // Inner try/finally: the document is closed before the engine goes away.
        try
        {
            // cleanup any previous structure tree
            if (!doc.RemoveTags(null, IntPtr.Zero))
            {
                throw new Exception(pdfix.GetErrorType().ToString());
            }

            // autotag document first
            if (!doc.AddTags(null, IntPtr.Zero))
            {
                throw new Exception(pdfix.GetErrorType().ToString());
            }

            // get the struct tree
            PdsStructTree struct_tree = doc.GetStructTree();
            if (struct_tree == null)
            {
                throw new Exception(pdfix.GetErrorType().ToString());
            }

            PdsStructElement table = GetFirstTable(struct_tree);
            if (table == null)
            {
                throw new Exception("No table found.");
            }

            PdfRect bbox = new PdfRect();
            GetStructElementBBox(table, ref bbox);

            // remove all items from the table to make it untagged content
            // (iterating backwards keeps the remaining indices valid)
            for (int i = table.GetNumKids() - 1; i >= 0; i--)
            {
                table.RemoveKid(i);
            }

            // tag page
            // NOTE(review): neither the page nor the page map created below is
            // released in this method; confirm what this SDK version expects.
            PdfPage page = doc.AcquirePage(0);
            if (page == null)
            {
                throw new Exception(pdfix.GetError());
            }

            PdePageMap page_map = page.CreatePageMap();
            if (page_map == null)
            {
                throw new Exception(pdfix.GetError());
            }

            PdeElement elem = page_map.CreateElement(PdfElementType.kPdeImage, null);
            if (elem == null)
            {
                throw new Exception(pdfix.GetError());
            }
            elem.SetBBox(bbox);
            elem.SetAlt("This is image caption");

            // prepare document template to ignore already tagged content
            PdfDocTemplate doc_tmpl = doc.GetDocTemplate();
            doc_tmpl.SetProperty("ignore_tags", 1);

            // re-tag non-tagged page content
            if (!page_map.AcquireElements(null, IntPtr.Zero))
            {
                throw new Exception(pdfix.GetErrorType().ToString());
            }
            if (!page_map.AddTags(table, null, IntPtr.Zero))
            {
                throw new Exception(pdfix.GetErrorType().ToString());
            }

            // update the table element type
            if (!table.SetType("Sect"))
            {
                throw new Exception(pdfix.GetErrorType().ToString());
            }

            if (!doc.Save(savePath, PdfSaveFlags.kSaveFull))
            {
                throw new Exception(pdfix.GetError());
            }
        }
        finally
        {
            doc.Close();
        }
    }
    finally
    {
        pdfix.Destroy();
    }
}