//////////////////////////////////////////////////////////////////////////////////////////////////
        // GetFirstParagraph
        // get reference to the first paragraph on the page
        //////////////////////////////////////////////////////////////////////////////////////////////////
        private static PdsStructElement GetFirstParagraph(PdsStructElement struct_elem)
        {
            // Depth-first search in kid order: a kid whose type string is "P" is returned
            // immediately; otherwise that kid's subtree is searched before the next sibling.
            // Returns null when no such element is found below struct_elem.
            for (int childIndex = 0; childIndex < struct_elem.GetNumChildren(); childIndex++)
            {
                // only kids that are struct elements themselves are inspected
                if (struct_elem.GetChildType(childIndex) != PdfStructElementType.kPdsStructChildElement)
                {
                    continue;
                }

                PdsObject        childObject = struct_elem.GetChildObject(childIndex);
                PdsStructElement child       = struct_elem.GetStructTree().GetStructElementFromObject(childObject);
                if (child == null)
                {
                    PdfixEngine.ThrowException();
                }

                string tag = child.GetType_(true);
                if (tag == "P")
                {
                    return child;
                }

                // not a paragraph itself - look inside it
                PdsStructElement nested = GetFirstParagraph(child);
                if (nested != null)
                {
                    return nested;
                }
            }
            return null;
        }
Example #2
0
        // Handles one page object: an image object is passed to ExtractImage, a form object
        // has each object of its content processed recursively, any other type is ignored.
        private static void ProcessPageObject(PdfPage page, PdsPageObject obj, string savePath)
        {
            if (obj == null)
            {
                PdfixEngine.ThrowException();
            }

            var objectType = obj.GetObjectType();
            if (objectType == PdfPageObjectType.kPdsPageImage)
            {
                ExtractImage(page, (PdsImage)obj, savePath);
            }
            else if (objectType == PdfPageObjectType.kPdsPageForm)
            {
                // a form carries its own content; descend into every object it holds
                var formContent = ((PdsForm)obj).GetContent();
                int childIndex  = 0;
                while (childIndex < formContent.GetNumObjects())
                {
                    ProcessPageObject(page, formContent.GetObject(childIndex), savePath);
                    childIndex++;
                }
            }
        }
        //////////////////////////////////////////////////////////////////////////////////////////////////
        // MoveParagraphToParent
        // find a paragraph (P) struct element and move it to the last kid position of the element that contains it
        //////////////////////////////////////////////////////////////////////////////////////////////////
        private static bool MoveParagraphToParent(PdsStructElement struct_elem)
        {
            // Searches the kids of struct_elem depth-first for a struct element whose type
            // string is "P". The first one found is moved to the last kid position of the
            // element that holds it. Returns true when a paragraph was moved, false when the
            // subtree contains none. Throws InvalidOperationException when the move fails.
            for (int i = 0; i < struct_elem.GetNumChildren(); i++)
            {
                // only kids that are struct elements themselves are inspected
                if (struct_elem.GetChildType(i) != PdfStructElementType.kPdsStructChildElement)
                {
                    continue;
                }

                PdsObject        kid_obj  = struct_elem.GetChildObject(i);
                PdsStructElement kid_elem = struct_elem.GetStructTree().GetStructElementFromObject(kid_obj);
                if (kid_elem == null)
                {
                    PdfixEngine.ThrowException();
                }

                string type = kid_elem.GetType_(true);
                if (type == "P")
                {
                    if (!struct_elem.MoveChild(i, struct_elem, struct_elem.GetNumChildren() - 1))
                    {
                        throw new InvalidOperationException("Failed to move the paragraph struct element.");
                    }
                    return true;
                }

                // Stop only when the nested search actually moved a paragraph. Returning the
                // nested result unconditionally would end the search at the first struct
                // element kid and never look at the remaining siblings.
                if (MoveParagraphToParent(kid_elem))
                {
                    return true;
                }
            }
            return false;
        }
        //////////////////////////////////////////////////////////////////////////////////////////////////
        // Remove paragraph from structure tree
        // empties P elements and Figure elements without alt text, then drops kids left with no children
        //////////////////////////////////////////////////////////////////////////////////////////////////
        internal static void RemoveParagraph(PdsStructElement struct_elem)
        {
            // Walks the kids of struct_elem from last to first, so removing kid i does not
            // shift the indices that are still to be visited. For each struct element kid:
            //  - type "P": all of its kids are removed
            //  - type "Figure": all of its kids are removed when its alt text is empty
            //  - any other type: processed recursively
            // A kid that ends up with no children is removed from struct_elem.
            for (int i = struct_elem.GetNumChildren() - 1; i >= 0; i--)
            {
                if (struct_elem.GetChildType(i) == PdfStructElementType.kPdsStructChildElement)
                {
                    PdsObject        kid_obj  = struct_elem.GetChildObject(i);
                    PdsStructElement kid_elem = struct_elem.GetStructTree().GetStructElementFromObject(kid_obj);
                    // same check the other tree walkers perform before touching the element
                    if (kid_elem == null)
                    {
                        PdfixEngine.ThrowException();
                    }

                    string type     = kid_elem.GetType_(true);
                    bool   isFigure = type == "Figure";

                    if (type == "P" || (isFigure && kid_elem.GetAlt().Length == 0))
                    {
                        // paragraphs, and figures without an alt text, lose all of their kids
                        for (int j = kid_elem.GetNumChildren() - 1; j >= 0; j--)
                        {
                            if (!kid_elem.RemoveChild(j))
                            {
                                PdfixEngine.ThrowException();
                            }
                        }
                    }
                    else if (!isFigure)
                    {
                        // figures with an alt text are kept untouched; everything else is searched
                        RemoveParagraph(kid_elem);
                    }

                    // remove this element if it has no kids
                    // NOTE(review): the result of this RemoveChild call is not checked, unlike the calls above
                    if (kid_elem.GetNumChildren() == 0)
                    {
                        struct_elem.RemoveChild(i);
                    }
                }
                // remove only 2 paragraphs in this sample
                // NOTE(review): count is declared outside this method (not visible here) and is
                // incremented for every visited kid, whatever its type - confirm this is intended
                if (++count >= 2)
                {
                    break;
                }
            }
        }
Example #5
0
 // Entry point: calls PdfixEngine.Init, runs the samples and prints SUCCESS. When an
 // exception is thrown its message is printed instead. PdfixEngine.Terminate is always called.
 static void Main()
 {
     const string samplesRoot = "..\\..\\..\\";

     try
     {
         PdfixEngine.Init();
         PdfixSamples.Run(samplesRoot);
         Console.WriteLine("SUCCESS");
     }
     catch (System.Exception failure)
     {
         // report the failure text only; the process still ends normally
         Console.WriteLine(failure.Message);
     }
     finally
     {
         PdfixEngine.Terminate();
     }
 }
        // Waits on semaphore, acquires page 0 of doc, keeps it for one second, then releases
        // the page and the semaphore. Progress is logged with the current thread's name.
        // semaphore and doc are declared outside this method.
        static void DoSomething()
        {
            Console.WriteLine("{0} = waiting", Thread.CurrentThread.Name);
            semaphore.WaitOne();
            try
            {
                Console.WriteLine("{0} begins!", Thread.CurrentThread.Name);
                PdfPage page = doc.AcquirePage(0);

                if (page == null)
                {
                    PdfixEngine.ThrowException();
                }

                try
                {
                    Thread.Sleep(1000);
                }
                finally
                {
                    // give the page back even if the sleep is interrupted
                    page.Release();
                }
            }
            finally
            {
                // always hand the semaphore slot back; without this a failure while acquiring
                // the page would leave the other waiting threads blocked
                Console.WriteLine("{0} releasing...", Thread.CurrentThread.Name);
                semaphore.Release();
            }
        }
Example #7
0
        // Renders the page area covered by an image page object and saves it as a JPEG
        // (quality 80) to savePath + "/image_<n>.jpg", where <n> is _imageIndex after being
        // incremented. _imageIndex is declared outside this method.
        private static void ExtractImage(PdfPage page, PdsImage image, string savePath)
        {
            var bbox = image.GetBBox();

            // the page view is only needed to map the bounding box into device space
            var page_view = page.AcquirePageView(1, PdfRotate.kRotate0);
            if (page_view == null)
            {
                PdfixEngine.ThrowException();
            }
            var       rect   = page_view.RectToDevice(bbox);
            PdfMatrix matrix = page_view.GetDeviceMatrix();

            page_view.Release();

            string imgPath = savePath + "/image_" + (++_imageIndex).ToString() + ".jpg";

            Console.WriteLine("Image Found: " + imgPath);

            // target bitmap sized to the image's device rectangle
            var img = PdfixEngine.Instance.CreateImage(rect.right - rect.left, rect.bottom - rect.top, PsImageDIBFormat.kImageDIBFormatArgb);
            if (img == null)
            {
                PdfixEngine.ThrowException();
            }

            try
            {
                // draw only the part of the page inside the image's bounding box
                PdfPageRenderParams renderParams = new PdfPageRenderParams
                {
                    clip_box = bbox,
                    matrix   = matrix,
                    image    = img
                };

                if (!page.DrawContent(renderParams, null, null))
                {
                    PdfixEngine.ThrowException();
                }

                PdfImageParams imgParams = new PdfImageParams
                {
                    format  = PdfImageFormat.kImageFormatJpg,
                    quality = 80
                };

                if (!img.Save(imgPath, imgParams))
                {
                    PdfixEngine.ThrowException();
                }
            }
            finally
            {
                // destroy the bitmap even when rendering or saving fails
                img.Destroy();
            }
        }
Example #8
0
        // process struct element childs and collect all bboxes of the content elements
        // bounding boxes of child elements and content can be placed on different pages
        static List <PdfRect> GetStructElementBboxes(PdfDoc doc, PdsStructElement struct_elem)
        {
            // Rectangles come from two sources: nested struct elements (collected recursively)
            // and stream/page content kids (looked up through GetMcidBBoxes by page number
            // and MCID). Object kids contribute nothing.
            var collected = new List <PdfRect>();
            int kidCount  = struct_elem.GetNumChildren();

            for (int kid = 0; kid < kidCount; kid++)
            {
                var kidObject = struct_elem.GetChildObject(kid);
                var kidType   = struct_elem.GetChildType(kid);

                if (kidType == PdfStructElementType.kPdsStructChildElement)
                {
                    var nested = struct_elem.GetStructTree().GetStructElementFromObject(kidObject);
                    if (nested == null)
                    {
                        PdfixEngine.ThrowException();
                    }
                    collected.AddRange(GetStructElementBboxes(doc, nested));
                }
                else if (kidType == PdfStructElementType.kPdsStructChildStreamContent ||
                         kidType == PdfStructElementType.kPdsStructChildPageContent)
                {
                    var pageNumber = struct_elem.GetChildPageNumber(kid);
                    var contentId  = struct_elem.GetChildMcid(kid);
                    collected.AddRange(GetMcidBBoxes(doc, pageNumber, contentId));
                }
            }
            return collected;
        }
Example #9
0
        // Prints a struct element and, recursively, its struct element kids to the console.
        // Every nesting level adds one space to indent. For each element the type, bounding
        // boxes (skipped for "Document"), title, actual text, alt text and page number are
        // printed when present, followed by the page number / MCID of its content kids.
        static void ProcessStructElement(PdfDoc doc, PdsStructElement struct_elem, string indent)
        {
            if (struct_elem == null)
            {
                PdfixEngine.ThrowException();
            }
            indent += " ";

            // get the element type
            string type_str = struct_elem.GetType_(true);

            if (type_str.Length != 0)
            {
                Console.WriteLine(indent + "Struct Element: " + type_str);
            }

            // get struct element bounding boxes (can be on multiple pages)
            // skip document struct element - can take long time to collect all bboxes
            if (type_str != "Document")
            {
                foreach (PdfRect bbox in GetStructElementBboxes(doc, struct_elem))
                {
                    Console.WriteLine(indent + "[" + bbox.left + ", " + bbox.bottom + ", " + bbox.right + ", " +
                                      bbox.top + "]");
                }
            }

            string title_str = struct_elem.GetTitle();

            if (title_str.Length != 0)
            {
                Console.WriteLine(indent + "title: " + title_str);
            }

            string actual_text_str = struct_elem.GetActualText();

            if (actual_text_str.Length != 0)
            {
                Console.WriteLine(indent + "actual text: " + actual_text_str);
            }

            string alt_str = struct_elem.GetAlt();

            if (alt_str.Length != 0)
            {
                // concatenate: passing indent as a separate first argument would make it the
                // format string and the text would never be printed
                Console.WriteLine(indent + "alt: " + alt_str);
            }

            var page_num = struct_elem.GetPageNumber();

            if (page_num != -1)
            {
                Console.WriteLine(indent + "Page number: " + page_num);
            }

            int num_kids = struct_elem.GetNumChildren();

            for (int i = 0; i < num_kids; i++)
            {
                var kid_obj = struct_elem.GetChildObject(i);
                // based on structure element you can obtain different data
                switch (struct_elem.GetChildType(i))
                {
                case PdfStructElementType.kPdsStructChildElement:
                {
                    var kid_struct_elem = struct_elem.GetStructTree().GetStructElementFromObject(kid_obj);
                    if (kid_struct_elem == null)
                    {
                        PdfixEngine.ThrowException();
                    }
                    ProcessStructElement(doc, kid_struct_elem, indent);
                }
                break;

                case PdfStructElementType.kPdsStructChildObject:
                    // object kids are not reported
                    break;

                case PdfStructElementType.kPdsStructChildStreamContent:
                {
                    var kid_page_num = struct_elem.GetChildPageNumber(i);
                    Console.WriteLine(indent + "Kid Page number: " + kid_page_num);
                    var mcid = struct_elem.GetChildMcid(i);
                    Console.WriteLine(indent + "MCID: " + mcid);
                }
                break;

                case PdfStructElementType.kPdsStructChildPageContent:
                {
                    var mcid = struct_elem.GetChildMcid(i);
                    Console.WriteLine(indent + "MCID: " + mcid);
                }
                break;
                }
            }
            Console.WriteLine("");
        }