Пример #1
0
        // Dispatches a single page object: image objects are handed to ExtractImage,
        // form objects are walked recursively through their content. Other object
        // types are ignored.
        //   page     - page passed through to ExtractImage
        //   obj      - page object to process; a null value is reported via PdfixEngine.ThrowException
        //   savePath - passed through to ExtractImage unchanged
        private static void ProcessPageObject(PdfPage page, PdsPageObject obj, string savePath)
        {
            if (obj == null)
            {
                PdfixEngine.ThrowException();
            }

            var kind = obj.GetObjectType();
            if (kind == PdfPageObjectType.kPdsPageImage)
            {
                ExtractImage(page, (PdsImage)obj, savePath);
            }
            else if (kind == PdfPageObjectType.kPdsPageForm)
            {
                // a form carries its own content; visit each nested object in order
                var nested = ((PdsForm)obj).GetContent();
                int index  = 0;
                while (index < nested.GetNumObjects())
                {
                    ProcessPageObject(page, nested.GetObject(index), savePath);
                    index++;
                }
            }
        }
Пример #2
0
        //////////////////////////////////////////////////////////////////////////////////////////////////
        // GetPageObjectTextState
        // Returns the text state of the text object tagged with the specified mcid.
        // Form objects are searched recursively. When no matching text object with a
        // font is found, a newly constructed PdfTextState is returned.
        //////////////////////////////////////////////////////////////////////////////////////////////////
        static PdfTextState GetPageObjectTextState(PdsPageObject page_object, int mcid)
        {
            var object_type = page_object.GetObjectType();

            if (object_type == PdfPageObjectType.kPdsPageText)
            {
                var text_object = (PdsText)page_object;

                // only a text object carrying the requested mcid is of interest
                PdsContentMark mark = page_object.GetContentMark();
                bool same_mcid      = mark != null && mark.GetTagMcid() == mcid;
                if (same_mcid)
                {
                    PdfTextState state = text_object.GetTextState();
                    if (state.font != null)
                    {
                        return state;
                    }
                }
                return new PdfTextState();
            }

            if (object_type == PdfPageObjectType.kPdsPageForm)
            {
                // look for the text object among the objects nested in the form
                var nested = ((PdsForm)page_object).GetContent();
                int index  = 0;
                while (index < nested.GetNumObjects())
                {
                    var state = GetPageObjectTextState(nested.GetObject(index), mcid);
                    if (state.font != null)
                    {
                        return state;
                    }
                    index++;
                }
            }

            return new PdfTextState();
        }
Пример #3
0
        // Collects the optional content groups (OCG layers) referenced by the "OC" tags
        // of the page object's content mark.
        // Returns a list of (layer name, object id) pairs in the order they are found;
        // the list is empty when the object has no content mark or no "OC" tag.
        internal static List <KeyValuePair <string, int> > GetPageObjectLayers(PdsPageObject page_object)
        {
            var layers = new List <KeyValuePair <string, int> >();

            // Records one OCG dictionary as a (name, id) pair. A null dictionary is
            // skipped: the OCGs array of a membership dictionary may hold null or
            // non-dictionary entries, which are to be ignored rather than dereferenced.
            void push_ocg(PdsDictionary ocg)
            {
                if (ocg == null)
                {
                    return;
                }
                layers.Add(new KeyValuePair <string, int>(ocg.GetText("Name"), ocg.GetId()));
            }

            var content_mark = page_object.GetContentMark();
            if (content_mark == null)
            {
                return(layers);
            }

            for (var i = 0; i < content_mark.GetNumTags(); i++)
            {
                if (content_mark.GetTagName(i) != "OC")
                {
                    continue;
                }

                var content_mark_obj = content_mark.GetTagObject(i);
                if (content_mark_obj == null)
                {
                    continue;
                }

                var type = content_mark_obj.GetText("Type");
                if (type == "OCMD")
                {
                    // membership dictionary: "OCGs" is either a single dictionary or an array of them
                    push_ocg(content_mark_obj.GetDictionary("OCGs"));

                    var ocgs_arr = content_mark_obj.GetArray("OCGs");
                    if (ocgs_arr != null)
                    {
                        for (var j = 0; j < ocgs_arr.GetNumObjects(); j++)
                        {
                            push_ocg(ocgs_arr.GetDictionary(j));
                        }
                    }
                }
                else if (type == "OCG")
                {
                    // the tag object is itself the optional content group
                    push_ocg(content_mark_obj);
                }
            }
            return(layers);
        }
        //////////////////////////////////////////////////////////////////////////////////////////////////
        // GetStructElementBBox
        // Computes the union of the bounding boxes of all page objects that belong to the
        // structure element, descending into child structure elements.
        //   struct_elem - structure element whose children are inspected
        //   bbox        - in/out accumulator; a box with zero width or zero height is treated
        //                 as empty and replaced by the first matching object's box, otherwise
        //                 it is grown to include each matching object's box
        // Returns true when at least one page object (directly or in a nested element)
        // contributed to bbox.
        //////////////////////////////////////////////////////////////////////////////////////////////////
        private static bool GetStructElementBBox(PdsStructElement struct_elem, ref PdfRect bbox)
        {
            bool result = false;

            for (int i = 0; i < struct_elem.GetNumChildren(); i++)
            {
                var child_type = struct_elem.GetChildType(i);

                if (child_type == PdfStructElementType.kPdsStructChildPageContent)
                {
                    // acquire page on which the element is present
                    PdfDoc  doc  = struct_elem.GetStructTree().GetDoc();
                    PdfPage page = doc.AcquirePage(struct_elem.GetChildPageNumber(i));
                    if (page == null)
                    {
                        // page could not be acquired, so this child cannot contribute a box
                        continue;
                    }

                    try
                    {
                        // find the page objects tagged with this child's mcid
                        int mcid    = struct_elem.GetChildMcid(i);
                        var content = page.GetContent();
                        for (int j = 0; j < content.GetNumObjects(); j++)
                        {
                            PdsPageObject page_object = content.GetObject(j);

                            // check if this page object has the same mcid
                            PdsContentMark content_mark = page_object.GetContentMark();
                            if (content_mark != null && content_mark.GetTagMcid() == mcid)
                            {
                                PdfRect elem_bbox = page_object.GetBBox();
                                if ((bbox.left - bbox.right == 0) || (bbox.top - bbox.bottom == 0))
                                {
                                    // accumulator is still empty: start from this object's box
                                    bbox = elem_bbox;
                                }
                                else
                                {
                                    bbox.left   = Math.Min(bbox.left, elem_bbox.left);
                                    bbox.right  = Math.Max(bbox.right, elem_bbox.right);
                                    bbox.top    = Math.Max(bbox.top, elem_bbox.top);
                                    bbox.bottom = Math.Min(bbox.bottom, elem_bbox.bottom);
                                }
                                result = true;
                            }
                        }
                    }
                    finally
                    {
                        // every AcquirePage must be paired with a Release, even on error
                        page.Release();
                    }
                }
                else if (child_type == PdfStructElementType.kPdsStructChildElement)
                {
                    PdsObject        kid_obj  = struct_elem.GetChildObject(i);
                    PdsStructElement kid_elem = struct_elem.GetStructTree().GetStructElementFromObject(kid_obj);

                    // a nested element that contributed a box makes the overall result true
                    if (kid_elem != null && GetStructElementBBox(kid_elem, ref bbox))
                    {
                        result = true;
                    }
                }
            }
            return(result);
        }
        //////////////////////////////////////////////////////////////////////////////////////////////////
        // MarkUntaggedObjectsAsArtifact
        // Finds page objects that are neither tagged as artifact nor carry an mcid and adds
        // an "Artifact" tag (Type "Pagination", Subtype "Footer") to their content mark,
        // then writes the page content back with SetContent.
        //   page - page whose objects are inspected and updated
        //////////////////////////////////////////////////////////////////////////////////////////////////
        internal static void MarkUntaggedObjectsAsArtifact(PdfPage page)
        {
            PdfDoc doc = page.GetDoc();

            for (int i = 0; i < page.GetNumPageObjects(); i++)
            {
                PdsPageObject page_obj = page.GetPageObject(i);
                if (page_obj == null)
                {
                    continue;
                }

                // without a content mark there is nothing to attach the tag to
                PdsContentMark content_mark = page_obj.GetContentMark();
                if (content_mark == null)
                {
                    continue;
                }

                // an mcid of -1 means the object is not referenced by marked content
                if (!content_mark.GetTagArtifact() && content_mark.GetTagMcid() == -1)
                {
                    PdsDictionary artifact_dict = doc.CreateDictObject(false);
                    artifact_dict.Put("Type", doc.CreateNameObject(false, "Pagination"));
                    artifact_dict.Put("Subtype", doc.CreateNameObject(false, "Footer"));
                    content_mark.AddTag("Artifact", artifact_dict, false);
                }
            }
            page.SetContent();
        }
Пример #6
0
 // Prints the text of a text page object to the console when the object belongs
 // to the given layer (matching both layer name and id). Non-text objects are ignored.
 //   page_object - page object to inspect
 //   layer       - (name, id) pair of the layer to match
 internal static void CheckPageObject(PdsPageObject page_object, KeyValuePair <string, int> layer)
 {
     if (page_object.GetObjectType() != PdfPageObjectType.kPdsPageText)
     {
         return;
     }

     string content    = ((PdsText)page_object).GetText();
     var object_layers = GetPageObjectLayers(page_object);

     foreach (KeyValuePair <string, int> candidate in object_layers)
     {
         bool same_name = layer.Key == candidate.Key;
         if (same_name && layer.Value == candidate.Value)
         {
             // print once, even if the layer is listed several times
             Console.WriteLine(content);
             return;
         }
     }
 }
Пример #7
0
        // Collects the bounding boxes of page objects tagged with the specified mcid.
        // An object whose own content mark matches contributes its box and is not
        // searched further; a non-matching form object is searched recursively.
        static List <PdfRect> GetMcidBBoxes(PdsPageObject obj, int mcid)
        {
            var found = new List <PdfRect>();

            var mark     = obj.GetContentMark();
            bool matches = mark != null && mark.GetTagMcid() == mcid;
            if (matches)
            {
                found.Add(obj.GetBBox());
                return found;
            }

            if (obj.GetObjectType() != PdfPageObjectType.kPdsPageForm)
            {
                return found;
            }

            // descend into the form and gather matches from each nested object
            var form  = (PdsForm)obj;
            int index = 0;
            while (index < form.GetNumPageObjects())
            {
                var nested_boxes = GetMcidBBoxes(form.GetPageObject(index), mcid);
                found.AddRange(nested_boxes);
                index++;
            }
            return found;
        }
Пример #8
0
        // Processes one page object: an image object is passed to ExtractImage, a form
        // object has each of its nested page objects processed recursively. Other object
        // types are left alone.
        //   page     - page passed through to ExtractImage
        //   obj      - page object to process; null raises an Exception carrying _pdfix.GetError()
        //   savePath - passed through to ExtractImage unchanged
        private static void ProcessPageObject(PdfPage page, PdsPageObject obj, string savePath)
        {
            if (obj == null)
            {
                throw new Exception(_pdfix.GetError());
            }

            var kind = obj.GetObjectType();

            if (kind == PdfPageObjectType.kPdsPageImage)
            {
                ExtractImage(page, (PdsImage)obj, savePath);
                return;
            }

            if (kind != PdfPageObjectType.kPdsPageForm)
            {
                return;
            }

            // visit the objects nested inside the form in order
            var container = (PdsForm)obj;
            int index     = 0;
            while (index < container.GetNumPageObjects())
            {
                var nested = container.GetPageObject(index);
                ProcessPageObject(page, nested, savePath);
                index++;
            }
        }