///////////////////////////////////////////////////////////////////////
        // ParsePage
        ///////////////////////////////////////////////////////////////////////
        /// <summary>
        /// Acquires the page map of <paramref name="page"/>, creates its elements and
        /// walks the resulting element tree with <c>ParseElement(PdeElement, String)</c>.
        /// </summary>
        /// <param name="pdfix">Library instance; used here only to read the last error text.</param>
        /// <param name="page">Page whose content is analysed.</param>
        /// <param name="outDir">Output directory, passed through unchanged to the element parser.</param>
        /// <exception cref="Exception">
        /// The page map could not be acquired, its elements could not be created,
        /// or it has no root element. The message is <c>pdfix.GetError()</c>.
        /// </exception>
        private static void ParsePage(Pdfix pdfix, PdfPage page, String outDir)
        {
            // get pageMap for the current page
            PdePageMap pageMap = page.AcquirePageMap();
            if (pageMap == null)
            {
                throw new Exception(pdfix.GetError());
            }

            // From here on the page map is owned by this method, so it is released
            // on every exit path - including the error throws below.
            try
            {
                if (!pageMap.CreateElements(null, null))
                {
                    throw new Exception(pdfix.GetError());
                }

                // get page container
                PdeElement container = pageMap.GetElement();
                if (container == null)
                {
                    throw new Exception(pdfix.GetError());
                }

                // parse children recursively
                ParseElement(container, outDir);
            }
            finally
            {
                pageMap.Release();
            }
        }
        ///////////////////////////////////////////////////////////////////////
        // ParseTable
        ///////////////////////////////////////////////////////////////////////
        /// <summary>
        /// Writes one table to a new CSV file named
        /// <c>ExtractTables&lt;tableIndex&gt;.csv</c> inside <paramref name="outDir"/>.
        /// The shared <c>tableIndex</c> counter is incremented for every call.
        /// </summary>
        /// <remarks>
        /// Each row becomes one line terminated by "\n"; fields are separated by ",".
        /// A field is written (quoted) only when the cell exists, both of its spans are
        /// non-zero and it has children; otherwise the field is left empty.
        /// The text itself is emitted by <c>ParseText</c>; no CSV escaping is done here.
        /// </remarks>
        /// <param name="table">Table element to export.</param>
        /// <param name="outDir">Directory that receives the CSV file.</param>
        private static void ParseTable(PdeTable table, String outDir)
        {
            // `using` closes the writer even when reading a cell throws.
            using (StreamWriter file = new System.IO.StreamWriter(outDir + "\\ExtractTables" + tableIndex++ + ".csv"))
            {
                int rowCount = table.GetNumRows();
                int colCount = table.GetNumCols();

                for (int row = 0; row < rowCount; row++)
                {
                    for (int col = 0; col < colCount; col++)
                    {
                        PdeCell cell = (PdeCell)table.GetCell(row, col);

                        // A missing cell still occupies a column: only its content is
                        // skipped, never the separator, so later columns stay aligned.
                        if (cell != null)
                        {
                            int rowSpan = cell.GetRowSpan();
                            int colSpan = cell.GetColSpan();
                            int count = cell.GetNumChildren();

                            // NOTE(review): a zero span presumably marks a position covered by
                            // a neighbouring spanned cell - confirm against the SDK docs.
                            if ((rowSpan != 0) && (colSpan != 0) && (count > 0))
                            {
                                file.Write("\"");
                                for (int i = 0; i < count; i++)
                                {
                                    PdeElement child = cell.GetChild(i);
                                    if (child != null && (child.GetType_() == PdfElementType.kPdeText))
                                    {
                                        ParseText((PdeText)child, file, false);
                                    }
                                    if (i < count - 1)
                                    {
                                        file.Write(" ");
                                    }
                                }
                                file.Write("\"");
                            }
                        }

                        // separator between fields only - no trailing comma after the last column
                        if (col < colCount - 1)
                        {
                            file.Write(",");
                        }
                    }

                    // every row, including the last one, is terminated by a newline
                    file.Write("\n");
                }
            }
        }
        ///////////////////////////////////////////////////////////////////////
        // ParseElement
        ///////////////////////////////////////////////////////////////////////
        /// <summary>
        /// Depth-first walk over an element tree that exports every table it meets.
        /// A table is handed to <c>ParseTable</c> and is not descended into;
        /// any other element is searched through its children.
        /// </summary>
        /// <param name="element">Root of the subtree to scan; a null element is ignored.</param>
        /// <param name="outDir">Output directory, passed through to <c>ParseTable</c>.</param>
        private static void ParseElement(PdeElement element, String outDir)
        {
            // GetChild may hand back null (ParseTable guards against it as well),
            // so a missing node is skipped instead of being dereferenced.
            if (element == null)
            {
                return;
            }

            // parse element based on type
            if (element.GetType_() == PdfElementType.kPdeTable)
            {
                ParseTable((PdeTable)element, outDir);
                return;
            }

            int childCount = element.GetNumChildren();
            for (int i = 0; i < childCount; i++)
            {
                ParseElement(element.GetChild(i), outDir);
            }
        }
        ///////////////////////////////////////////////////////////////////////
        // ParseElement
        ///////////////////////////////////////////////////////////////////////
        /// <summary>
        /// Depth-first walk over an element tree that writes every text element it meets.
        /// A text element is handed to <c>ParseText</c> and is not descended into;
        /// any other element is searched through its children.
        /// </summary>
        /// <param name="element">Root of the subtree to scan; a null element is ignored.</param>
        /// <param name="file">Writer passed through to <c>ParseText</c>.</param>
        private static void ParseElement(PdeElement element, StreamWriter file)
        {
            // A child lookup can yield null; skip it rather than dereference it.
            if (element == null)
            {
                return;
            }

            // parse element based on type
            if (element.GetType_() == PdfElementType.kPdeText)
            {
                ParseText((PdeText)element, file);
                return;
            }

            int childCount = element.GetNumChildren();
            for (int i = 0; i < childCount; i++)
            {
                ParseElement(element.GetChild(i), file);
            }
        }
        // Example no. 5
        // 0
        ///////////////////////////////////////////////////////////////////////
        // ParsePage
        ///////////////////////////////////////////////////////////////////////
        /// <summary>
        /// Acquires the page map of <paramref name="page"/> and walks its element tree
        /// with <c>ParseElement(PdeElement, StreamWriter)</c>.
        /// </summary>
        /// <param name="pdfix">Library instance; used here only to read the last error text.</param>
        /// <param name="page">Page whose content is analysed.</param>
        /// <param name="file">Writer passed through unchanged to the element parser.</param>
        /// <exception cref="Exception">
        /// The page map could not be acquired or it has no root element.
        /// The message is <c>pdfix.GetError()</c>.
        /// </exception>
        private static void ParsePage(Pdfix pdfix, PdfPage page, StreamWriter file)
        {
            // get pageMap for the current page
            PdePageMap pageMap = page.AcquirePageMap(null, IntPtr.Zero);
            if (pageMap == null)
            {
                throw new Exception(pdfix.GetError());
            }

            // Release the page map on every exit path, not only on success.
            try
            {
                // get page container
                PdeElement container = pageMap.GetElement();
                if (container == null)
                {
                    throw new Exception(pdfix.GetError());
                }

                // parse children recursively
                ParseElement(container, file);
            }
            finally
            {
                pageMap.Release();
            }
        }
        /// <summary>
        /// Opens a PDF, removes and regenerates its tags, then turns the first table
        /// structure element into a "Sect" that holds a single image element with
        /// alternate text, and saves the document to <paramref name="savePath"/>.
        /// </summary>
        /// <param name="openPath">Path of the source PDF document.</param>
        /// <param name="savePath">Path the modified PDF document is saved to.</param>
        /// <exception cref="Exception">
        /// Any SDK step failed (the message is the SDK error text or error type),
        /// or the structure tree contains no table ("No table found.").
        /// </exception>
        public static void Run(
            String openPath,                            // source PDF document
            String savePath                             // dest PDF document
            )
        {
            Pdfix pdfix = PdfixEngine.Instance;

            PdfDoc doc = pdfix.OpenDoc(openPath, "");

            if (doc == null)
            {
                throw new Exception(pdfix.GetError());
            }

            // Both page-map acquisitions are tracked out here so the finally block
            // can release whichever of them succeeded.
            PdePageMap page_map = null;
            PdePageMap pageMap = null;

            try
            {
                // cleanup any previous structure tree
                if (!doc.RemoveTags(null, null))
                {
                    throw new Exception(pdfix.GetErrorType().ToString());
                }

                // autotag document first
                if (!doc.AddTags(null, null))
                {
                    throw new Exception(pdfix.GetErrorType().ToString());
                }

                // get the struct tree
                PdsStructTree struct_tree = doc.GetStructTree();

                if (struct_tree == null)
                {
                    throw new Exception(pdfix.GetErrorType().ToString());
                }

                PdsStructElement table = GetFirstTable(struct_tree);

                if (table == null)
                {
                    throw new Exception("No table found.");
                }

                PdfRect bbox = new PdfRect();

                GetStructElementBBox(table, ref bbox);

                // remove all items from the table to make it untagged content;
                // iterate backwards so removals do not shift the remaining indices
                for (int i = table.GetNumChildren() - 1; i >= 0; i--)
                {
                    table.RemoveChild(i);
                }

                // tag page
                // NOTE(review): the page acquired here is never released; SOURCE does not
                // show the page-release call of this SDK version - confirm and add it.
                PdfPage page = doc.AcquirePage(0);

                if (page == null)
                {
                    throw new Exception(pdfix.GetError());
                }

                page_map = page.AcquirePageMap();

                if (page_map == null)
                {
                    throw new Exception(pdfix.GetError());
                }

                // the image element takes over the area the table occupied
                PdeElement elem = page_map.CreateElement(PdfElementType.kPdeImage, null);

                if (elem == null)
                {
                    throw new Exception(pdfix.GetError());
                }

                elem.SetBBox(bbox);
                elem.SetAlt("This is image caption");

                // prepare document template to ignore already tagged content
                var doc_prelight = doc.GetTemplate();

                if (doc_prelight == null)
                {
                    throw new Exception(pdfix.GetError());
                }

                doc_prelight.SetProperty("ignore_tags", 1);

                // re-tag non-tagged page content
                // NOTE(review): this is a second acquisition of the same page's map, while
                // AddTags below is called on the first one - presumably both refer to the
                // same underlying map; confirm against the SDK documentation.
                pageMap = page.AcquirePageMap();

                if (pageMap == null)
                {
                    throw new Exception(pdfix.GetError());
                }
                if (!pageMap.CreateElements(null, null))
                {
                    throw new Exception(pdfix.GetError());
                }

                if (!page_map.AddTags(table, null, null))
                {
                    throw new Exception(pdfix.GetErrorType().ToString());
                }

                // update the table element type
                if (!table.SetType("Sect"))
                {
                    throw new Exception(pdfix.GetErrorType().ToString());
                }

                if (!doc.Save(savePath, Pdfix.kSaveFull))
                {
                    throw new Exception(pdfix.GetError());
                }
            }
            finally
            {
                // one Release per successful AcquirePageMap, in reverse order,
                // then close the document whether or not the steps above succeeded
                if (pageMap != null)
                {
                    pageMap.Release();
                }
                if (page_map != null)
                {
                    page_map.Release();
                }

                doc.Close();
            }
        }
        /// <summary>
        /// Creates and authorizes a <c>Pdfix</c> instance, opens a PDF, removes and
        /// regenerates its tags, then turns the first table structure element into a
        /// "Sect" that holds a single image element with alternate text, and saves the
        /// document to <paramref name="savePath"/>. The library instance is destroyed
        /// before the method returns or throws.
        /// </summary>
        /// <param name="email">Authorization e-mail.</param>
        /// <param name="licenseKey">Authorization license key.</param>
        /// <param name="openPath">Path of the source PDF document.</param>
        /// <param name="savePath">Path the modified PDF document is saved to.</param>
        /// <exception cref="Exception">
        /// Authorization or any SDK step failed (the message is the SDK error text or
        /// error type), or the structure tree contains no table ("No table found.").
        /// </exception>
        public static void Run(
            String email,                               // authorization email
            String licenseKey,                          // authorization license key
            String openPath,                            // source PDF document
            String savePath                             // dest PDF document
            )
        {
            // `new` either returns an instance or throws, so no null check is needed here.
            pdfix = new Pdfix();

            try
            {
                if (!pdfix.Authorize(email, licenseKey))
                {
                    throw new Exception(pdfix.GetErrorType().ToString());
                }

                PdfDoc doc = pdfix.OpenDoc(openPath, "");

                if (doc == null)
                {
                    throw new Exception(pdfix.GetError());
                }

                try
                {
                    // cleanup any previous structure tree
                    if (!doc.RemoveTags(null, IntPtr.Zero))
                    {
                        throw new Exception(pdfix.GetErrorType().ToString());
                    }

                    // autotag document first
                    if (!doc.AddTags(null, IntPtr.Zero))
                    {
                        throw new Exception(pdfix.GetErrorType().ToString());
                    }

                    // get the struct tree
                    PdsStructTree struct_tree = doc.GetStructTree();

                    if (struct_tree == null)
                    {
                        throw new Exception(pdfix.GetErrorType().ToString());
                    }

                    PdsStructElement table = GetFirstTable(struct_tree);

                    if (table == null)
                    {
                        throw new Exception("No table found.");
                    }

                    PdfRect bbox = new PdfRect();

                    GetStructElementBBox(table, ref bbox);

                    // remove all items from the table to make it untagged content;
                    // iterate backwards so removals do not shift the remaining indices
                    for (int i = table.GetNumKids() - 1; i >= 0; i--)
                    {
                        table.RemoveKid(i);
                    }

                    // tag page
                    // NOTE(review): neither the page nor the page map created below is
                    // released; SOURCE does not show the matching release calls for this
                    // SDK version - confirm and add them.
                    PdfPage page = doc.AcquirePage(0);

                    if (page == null)
                    {
                        throw new Exception(pdfix.GetError());
                    }

                    PdePageMap page_map = page.CreatePageMap();

                    if (page_map == null)
                    {
                        throw new Exception(pdfix.GetError());
                    }

                    // the image element takes over the area the table occupied
                    PdeElement elem = page_map.CreateElement(PdfElementType.kPdeImage, null);

                    if (elem == null)
                    {
                        throw new Exception(pdfix.GetError());
                    }

                    elem.SetBBox(bbox);
                    elem.SetAlt("This is image caption");

                    // prepare document template to ignore already tagged content
                    PdfDocTemplate doc_tmpl = doc.GetDocTemplate();

                    if (doc_tmpl == null)
                    {
                        throw new Exception(pdfix.GetError());
                    }

                    doc_tmpl.SetProperty("ignore_tags", 1);

                    // re-tag non-tagged page content
                    if (!page_map.AcquireElements(null, IntPtr.Zero))
                    {
                        throw new Exception(pdfix.GetErrorType().ToString());
                    }
                    if (!page_map.AddTags(table, null, IntPtr.Zero))
                    {
                        throw new Exception(pdfix.GetErrorType().ToString());
                    }

                    // update the table element type
                    if (!table.SetType("Sect"))
                    {
                        throw new Exception(pdfix.GetErrorType().ToString());
                    }

                    if (!doc.Save(savePath, PdfSaveFlags.kSaveFull))
                    {
                        throw new Exception(pdfix.GetError());
                    }
                }
                finally
                {
                    // close the document whether or not the steps above succeeded
                    doc.Close();
                }
            }
            finally
            {
                // tear the library down on every exit path, including failed authorization
                pdfix.Destroy();
            }
        }