コード例 #1
0
        //////////////////////////////////////////////////////////////////////////////////////////////////
        // MoveParagraphToParent
        // find the first P struct element below the given element and move it to the last child index of its parent
        //////////////////////////////////////////////////////////////////////////////////////////////////
        /// <summary>
        /// Searches the struct-element children of <paramref name="struct_elem"/>, and recursively their
        /// descendants, for the first element whose type is "P" and moves it to the last child index of
        /// its own parent.
        /// </summary>
        /// <param name="struct_elem">Struct element whose subtree is searched.</param>
        /// <returns>true when a "P" element was found and moved; false when the subtree holds none.</returns>
        /// <exception cref="Exception">The move of the paragraph element failed.</exception>
        private static bool MoveParagraphToParent(PdsStructElement struct_elem)
        {
            for (int i = 0; i < struct_elem.GetNumChildren(); i++)
            {
                // only struct-element children can be (or contain) a paragraph
                if (struct_elem.GetChildType(i) != PdfStructElementType.kPdsStructChildElement)
                {
                    continue;
                }

                PdsObject        kid_obj  = struct_elem.GetChildObject(i);
                PdsStructElement kid_elem = struct_elem.GetStructTree().GetStructElementFromObject(kid_obj);
                if (kid_elem == null)
                {
                    PdfixEngine.ThrowException();
                }

                if (kid_elem.GetType_(true) == "P")
                {
                    // move the paragraph behind all of its current siblings
                    if (!struct_elem.MoveChild(i, struct_elem, struct_elem.GetNumChildren() - 1))
                    {
                        throw new Exception("Failed to move the paragraph struct element");
                    }
                    return(true);
                }

                // A subtree without a paragraph must not end the search: keep scanning the
                // remaining siblings instead of returning the first child's result unconditionally.
                if (MoveParagraphToParent(kid_elem))
                {
                    return(true);
                }
            }
            return(false);
        }
コード例 #2
0
        //////////////////////////////////////////////////////////////////////////////////////////////////
        // GetParagraphTextState
        // get the text state of the text objects inside paragraph by iterating content kid objects
        //////////////////////////////////////////////////////////////////////////////////////////////////
        /// <summary>
        /// Looks through the page-content kids of <paramref name="struct_elem"/> and returns the first
        /// text state with a non-zero font size that <c>GetPageObjectTextState</c> reports for a page
        /// object and the kid's mcid.
        /// </summary>
        /// <param name="struct_elem">Struct element whose content kids are inspected.</param>
        /// <returns>The first text state with a non-zero font size, or a new empty
        /// <c>PdfTextState</c> when none is found.</returns>
        /// <exception cref="Exception">The page of a content kid could not be acquired.</exception>
        static PdfTextState GetParagraphTextState(PdsStructElement struct_elem)
        {
            for (int i = 0; i < struct_elem.GetNumKids(); i++)
            {
                if (struct_elem.GetKidType(i) != PdfStructElementType.kPdsStructKidPageContent)
                {
                    continue;
                }

                // acquire page on which the element is present
                PdfDoc  doc  = struct_elem.GetStructTree().GetDoc();
                PdfPage page = doc.AcquirePage(struct_elem.GetKidPageNumber(i));
                if (page == null)
                {
                    throw new Exception("Failed to acquire the page of a struct element content kid");
                }

                try
                {
                    // find text object with mcid on the page to get the text state
                    int mcid = struct_elem.GetKidMcid(i);
                    for (int j = 0; j < page.GetNumPageObjects(); j++)
                    {
                        var ts = GetPageObjectTextState(page.GetPageObject(j), mcid);

                        // a zero font size is treated as "no text state for this object"
                        if (ts.font_size != 0)
                        {
                            return(ts);
                        }
                    }
                }
                finally
                {
                    // release the page on every path, including when a helper throws
                    page.Release();
                }
            }
            return(new PdfTextState());
        }
コード例 #3
0
        //////////////////////////////////////////////////////////////////////////////////////////////////
        // GetFirstParagraph
        // get reference to the first paragraph on the page
        //////////////////////////////////////////////////////////////////////////////////////////////////
        /// <summary>
        /// Walks the struct-element kids of <paramref name="struct_elem"/>, recursing into each kid,
        /// and returns the first element whose type is "P".
        /// </summary>
        /// <param name="struct_elem">Struct element whose descendants are searched.</param>
        /// <returns>The acquired "P" element (it is not released here), or null when none exists.</returns>
        /// <exception cref="Exception">A kid struct element could not be acquired.</exception>
        private static PdsStructElement GetFirstParagraph(PdsStructElement struct_elem)
        {
            for (int index = 0; index < struct_elem.GetNumKids(); index++)
            {
                // skip kids that are not struct elements
                if (struct_elem.GetKidType(index) != PdfStructElementType.kPdsStructKidElement)
                {
                    continue;
                }

                PdsObject        childObject = struct_elem.GetKidObject(index);
                PdsStructElement child       = struct_elem.GetStructTree().AcquireStructElement(childObject);
                if (child == null)
                {
                    throw new Exception(pdfix.GetErrorType().ToString());
                }

                // the kid itself is a paragraph: hand it out still acquired
                if (child.GetType_(true) == "P")
                {
                    return(child);
                }

                // otherwise search below the kid; the kid is released whatever the outcome
                PdsStructElement nested = GetFirstParagraph(child);
                child.Release();
                if (nested != null)
                {
                    return(nested);
                }
            }
            return(null);
        }
コード例 #4
0
        //////////////////////////////////////////////////////////////////////////////////////////////////
        // GetFirstTable
        // get reference to the first Table struct element below the given element
        //////////////////////////////////////////////////////////////////////////////////////////////////
        /// <summary>
        /// Searches the struct-element children of <paramref name="struct_elem"/>, child by child and
        /// recursively inside each child, for the first element whose type is "Table".
        /// </summary>
        /// <param name="struct_elem">Struct element whose descendants are searched.</param>
        /// <returns>The first "Table" element found, or null when there is none.</returns>
        /// <exception cref="Exception">A child struct element could not be obtained.</exception>
        private static PdsStructElement GetFirstTable(PdsStructElement struct_elem)
        {
            for (int index = 0; index < struct_elem.GetNumChildren(); index++)
            {
                // children that are not struct elements are ignored
                if (struct_elem.GetChildType(index) != PdfStructElementType.kPdsStructChildElement)
                {
                    continue;
                }

                PdsObject        childObject = struct_elem.GetChildObject(index);
                PdsStructElement child       = struct_elem.GetStructTree().GetStructElementFromObject(childObject);
                if (child == null)
                {
                    throw new Exception(pdfix.GetErrorType().ToString());
                }

                if (child.GetType_(true) == "Table")
                {
                    return(child);
                }

                // not a table itself: look inside it before moving on to the next sibling
                PdsStructElement nested = GetFirstTable(child);
                if (nested != null)
                {
                    return(nested);
                }
            }
            return(null);
        }
コード例 #5
0
        //////////////////////////////////////////////////////////////////////////////////////////////////
        // GetFirstParagraph
        // get reference to the first paragraph on the page
        //////////////////////////////////////////////////////////////////////////////////////////////////
        /// <summary>
        /// Searches the struct-element children of <paramref name="struct_elem"/>, recursing into each
        /// child, and returns the first element whose type is "P".
        /// </summary>
        /// <param name="struct_elem">Struct element whose descendants are searched.</param>
        /// <returns>The first "P" element found, or null when there is none.</returns>
        private static PdsStructElement GetFirstParagraph(PdsStructElement struct_elem)
        {
            int index = 0;
            while (index < struct_elem.GetNumChildren())
            {
                if (struct_elem.GetChildType(index) == PdfStructElementType.kPdsStructChildElement)
                {
                    PdsObject        childObject = struct_elem.GetChildObject(index);
                    PdsStructElement child       = struct_elem.GetStructTree().GetStructElementFromObject(childObject);
                    if (child == null)
                    {
                        PdfixEngine.ThrowException();
                    }

                    // either the child is the paragraph, or the paragraph may sit somewhere below it
                    PdsStructElement hit = child.GetType_(true) == "P" ? child : GetFirstParagraph(child);
                    if (hit != null)
                    {
                        return(hit);
                    }
                }
                index++;
            }
            return(null);
        }
コード例 #6
0
        /// <summary>
        /// Authorizes the SDK, opens <paramref name="openPath"/>, removes existing tags, auto-tags the
        /// document and passes every top-level struct element to <c>ProcessStructElement</c>.
        /// </summary>
        /// <remarks>
        /// The document and the Pdfix instance are cleaned up in a <c>finally</c> block so that a failing
        /// step no longer leaves them open. <paramref name="savePath"/> is not used by this method.
        /// </remarks>
        /// <exception cref="Exception">Authorization, opening, tag removal or auto-tagging failed.</exception>
        public static void Run(
            String email,                               // authorization email
            String licenseKey,                          // authorization license key
            String openPath,                            // source PDF document
            String savePath                             // dest PDF document
            )
        {
            // "new" never yields null, so no null check is needed after construction
            pdfix = new Pdfix();

            PdfDoc doc = null;
            try
            {
                if (!pdfix.Authorize(email, licenseKey))
                {
                    throw new Exception(pdfix.GetErrorType().ToString());
                }

                doc = pdfix.OpenDoc(openPath, "");
                if (doc == null)
                {
                    throw new Exception(pdfix.GetError());
                }

                // cleanup any previous structure tree
                if (!doc.RemoveTags(null, IntPtr.Zero))
                {
                    throw new Exception(pdfix.GetErrorType().ToString());
                }

                // autotag document first
                if (!doc.AddTags(null, IntPtr.Zero))
                {
                    throw new Exception(pdfix.GetErrorType().ToString());
                }

                // read document structure tree
                PdsStructTree struct_tree = doc.GetStructTree();
                if (struct_tree == null)
                {
                    Console.WriteLine("No Tags available");
                }
                else
                {
                    for (var i = 0; i < struct_tree.GetNumKids(); i++)
                    {
                        PdsObject        kid_object  = struct_tree.GetKidObject(i);
                        PdsStructElement struct_elem = struct_tree.AcquireStructElement(kid_object);
                        try
                        {
                            ProcessStructElement(doc, struct_elem, "");
                        }
                        finally
                        {
                            // release the element even when processing throws
                            if (struct_elem != null)
                            {
                                struct_elem.Release();
                            }
                        }
                    }
                }
            }
            finally
            {
                if (doc != null)
                {
                    doc.Close();
                }
                pdfix.Destroy();
            }
        }
コード例 #7
0
        /// <summary>
        /// Opens <paramref name="openPath"/>, removes existing tags, auto-tags the document, runs
        /// <c>RemoveParagraph</c> on every top-level struct element, runs
        /// <c>MarkUntaggedObjectsAsArtifact</c> on every page and saves the result to
        /// <paramref name="savePath"/>.
        /// </summary>
        /// <remarks>
        /// The document is closed in a <c>finally</c> block, and each page is released in one, so a
        /// failing step does not leave them open.
        /// </remarks>
        /// <exception cref="Exception">Any of the SDK steps failed.</exception>
        public static void Run(
            String openPath,                            // source PDF document
            String savePath                             // dest PDF document
            )
        {
            Pdfix pdfix = PdfixEngine.Instance;

            PdfDoc doc = pdfix.OpenDoc(openPath, "");

            if (doc == null)
            {
                throw new Exception(pdfix.GetError());
            }

            try
            {
                // cleanup any previous structure tree
                if (!doc.RemoveTags(null, null))
                {
                    throw new Exception(pdfix.GetErrorType().ToString());
                }

                // autotag document first
                if (!doc.AddTags(null, null))
                {
                    throw new Exception(pdfix.GetErrorType().ToString());
                }

                // get the struct tree
                PdsStructTree struct_tree = doc.GetStructTree();
                if (struct_tree == null)
                {
                    throw new Exception(pdfix.GetErrorType().ToString());
                }

                // run the paragraph removal on every top-level struct element
                for (int i = 0; i < struct_tree.GetNumChildren(); i++)
                {
                    PdsObject        kid_obj  = struct_tree.GetChildObject(i);
                    PdsStructElement kid_elem = struct_tree.GetStructElementFromObject(kid_obj);
                    if (kid_elem == null)
                    {
                        // fail with the SDK error instead of a null dereference inside RemoveParagraph
                        throw new Exception(pdfix.GetErrorType().ToString());
                    }
                    RemoveParagraph(kid_elem);
                }

                // the struct tree was updated, process the content of each page to apply changes
                for (int i = 0; i < doc.GetNumPages(); i++)
                {
                    PdfPage page = doc.AcquirePage(i);
                    if (page == null)
                    {
                        throw new Exception(pdfix.GetErrorType().ToString());
                    }

                    try
                    {
                        MarkUntaggedObjectsAsArtifact(page);
                    }
                    finally
                    {
                        page.Release();
                    }
                }

                if (!doc.Save(savePath, Pdfix.kSaveFull))
                {
                    throw new Exception(pdfix.GetError());
                }
            }
            finally
            {
                doc.Close();
            }
        }
コード例 #8
0
        //////////////////////////////////////////////////////////////////////////////////////////////////
        // Remove paragraph from structure tree
        // strip the kids of P elements (and of Figure elements without alt text) and drop elements left without kids
        //////////////////////////////////////////////////////////////////////////////////////////////////
        /// <summary>
        /// Walks the kids of <paramref name="struct_elem"/> from last to first. For struct-element
        /// kids: a "P" element has all of its kids removed, a "Figure" element has its kids removed
        /// only when its alt text is empty, and any other element is processed recursively. A kid
        /// element that ends up without kids is removed from <paramref name="struct_elem"/>.
        /// </summary>
        /// <remarks>
        /// The <c>count</c> counter is declared outside this method. It is incremented once for every
        /// kid visited, whatever its kind, and the loop stops as soon as it reaches 2.
        /// NOTE(review): confirm that counting visited kids rather than removed paragraphs is intended.
        /// </remarks>
        /// <param name="struct_elem">Struct element whose kids are processed.</param>
        /// <exception cref="Exception">Removing a kid of a "P" or "Figure" element failed.</exception>
        internal static void RemoveParagraph(PdsStructElement struct_elem)
        {
            for (int i = struct_elem.GetNumKids() - 1; i >= 0; i--)
            {
                if (struct_elem.GetKidType(i) == PdfStructElementType.kPdsStructKidElement)
                {
                    PdsObject        childObject = struct_elem.GetKidObject(i);
                    PdsStructElement child       = struct_elem.GetStructTree().AcquireStructElement(childObject);

                    string childType = child.GetType_(true);
                    if (childType == "P" || childType == "Figure")
                    {
                        // paragraphs are always emptied, figures only when they carry no alt text
                        bool strip = childType == "P" || child.GetAlt().Length == 0;
                        if (strip)
                        {
                            int j = child.GetNumKids();
                            while (--j >= 0)
                            {
                                if (!child.RemoveKid(j))
                                {
                                    throw new Exception(pdfix.GetErrorType().ToString());
                                }
                            }
                        }
                    }
                    else
                    {
                        RemoveParagraph(child);
                    }

                    // an element left without kids is dropped from its parent
                    if (child.GetNumKids() == 0)
                    {
                        struct_elem.RemoveKid(i);
                    }

                    child.Release();
                }

                // the sample stops after two visited kids
                if (++count >= 2)
                {
                    break;
                }
            }
        }
コード例 #9
0
        /// <summary>
        /// Authorizes the SDK, opens <paramref name="openPath"/>, sets a new alternate text on the
        /// element returned by <c>GetFirstFigure</c> and saves the document to
        /// <paramref name="savePath"/>.
        /// </summary>
        /// <remarks>
        /// The document and the Pdfix instance are cleaned up in a <c>finally</c> block so that a failing
        /// step no longer leaves them open.
        /// NOTE(review): whether the element returned by GetFirstFigure must be released is not visible
        /// here, so no release call was added; confirm against that helper.
        /// </remarks>
        /// <exception cref="Exception">Authorization, opening, the figure lookup, setting the alternate
        /// text or saving failed.</exception>
        public static void Run(
            String email,                               // authorization email
            String licenseKey,                          // authorization license key
            String openPath,                            // source PDF document
            String savePath                             // dest PDF document
            )
        {
            // "new" never yields null, so no null check is needed after construction
            pdfix = new Pdfix();

            PdfDoc doc = null;
            try
            {
                if (!pdfix.Authorize(email, licenseKey))
                {
                    throw new Exception(pdfix.GetErrorType().ToString());
                }

                doc = pdfix.OpenDoc(openPath, "");
                if (doc == null)
                {
                    throw new Exception(pdfix.GetError());
                }

                // get the struct tree
                PdsStructTree struct_tree = doc.GetStructTree();
                if (struct_tree == null)
                {
                    throw new Exception(pdfix.GetErrorType().ToString());
                }

                PdsStructElement figure = GetFirstFigure(struct_tree);
                if (figure == null)
                {
                    throw new Exception("No figure found.");
                }

                if (!figure.SetAlt("This is a new alternate text"))
                {
                    throw new Exception(pdfix.GetError());
                }

                if (!doc.Save(savePath, PdfSaveFlags.kSaveFull))
                {
                    throw new Exception(pdfix.GetError());
                }
            }
            finally
            {
                if (doc != null)
                {
                    doc.Close();
                }
                pdfix.Destroy();
            }
        }
コード例 #10
0
 /// <summary>
 /// Runs the element overload of MoveParagraphToParent on each top-level child of
 /// <paramref name="struct_tree"/> until one of them reports that a paragraph was moved.
 /// </summary>
 /// <param name="struct_tree">Structure tree whose top-level children are searched.</param>
 /// <returns>true when a paragraph was moved below one of the children; otherwise false.</returns>
 private static bool MoveParagraphToParent(PdsStructTree struct_tree)
 {
     // The loop must advance and must not return the first child's result unconditionally,
     // otherwise only the first top-level child is ever searched.
     for (int i = 0; i < struct_tree.GetNumChildren(); i++)
     {
         PdsObject        kid_obj  = struct_tree.GetChildObject(i);
         PdsStructElement kid_elem = struct_tree.GetStructElementFromObject(kid_obj);
         if (MoveParagraphToParent(kid_elem))
         {
             return(true);
         }
     }
     return(false);
 }
コード例 #11
0
        //////////////////////////////////////////////////////////////////////////////////////////////////
        // Remove paragraph from structure tree
        // strip the children of P elements (and of Figure elements without alt text) and drop elements left without children
        //////////////////////////////////////////////////////////////////////////////////////////////////
        /// <summary>
        /// Walks the children of <paramref name="struct_elem"/> from last to first. For struct-element
        /// children: a "P" element has all of its children removed, a "Figure" element has its children
        /// removed only when its alt text is empty, and any other element is processed recursively. A
        /// child element that ends up without children is removed from <paramref name="struct_elem"/>.
        /// </summary>
        /// <remarks>
        /// The <c>count</c> counter is declared outside this method. It is incremented once for every
        /// child visited, whatever its kind, and the loop stops as soon as it reaches 2.
        /// NOTE(review): confirm that counting visited children rather than removed paragraphs is intended.
        /// </remarks>
        /// <param name="struct_elem">Struct element whose children are processed.</param>
        internal static void RemoveParagraph(PdsStructElement struct_elem)
        {
            for (int i = struct_elem.GetNumChildren() - 1; i >= 0; i--)
            {
                if (struct_elem.GetChildType(i) == PdfStructElementType.kPdsStructChildElement)
                {
                    PdsObject        childObject = struct_elem.GetChildObject(i);
                    PdsStructElement child       = struct_elem.GetStructTree().GetStructElementFromObject(childObject);

                    string childType = child.GetType_(true);
                    if (childType == "P" || childType == "Figure")
                    {
                        // paragraphs are always emptied, figures only when they carry no alt text
                        bool strip = childType == "P" || child.GetAlt().Length == 0;
                        if (strip)
                        {
                            int j = child.GetNumChildren();
                            while (--j >= 0)
                            {
                                if (!child.RemoveChild(j))
                                {
                                    PdfixEngine.ThrowException();
                                }
                            }
                        }
                    }
                    else
                    {
                        RemoveParagraph(child);
                    }

                    // an element left without children is dropped from its parent
                    if (child.GetNumChildren() == 0)
                    {
                        struct_elem.RemoveChild(i);
                    }
                }

                // the sample stops after two visited children
                if (++count >= 2)
                {
                    break;
                }
            }
        }
コード例 #12
0
 /// <summary>
 /// Runs the element overload of GetFirstTable on each top-level child of
 /// <paramref name="struct_tree"/> and returns the first non-null result.
 /// </summary>
 /// <param name="struct_tree">Structure tree whose top-level children are searched.</param>
 /// <returns>The first table element found below a top-level child, or null when there is none.</returns>
 private static PdsStructElement GetFirstTable(PdsStructTree struct_tree)
 {
     int index = 0;
     while (index < struct_tree.GetNumChildren())
     {
         PdsObject        childObject = struct_tree.GetChildObject(index);
         PdsStructElement child       = struct_tree.GetStructElementFromObject(childObject);

         // search below this top-level child; stop at the first hit
         PdsStructElement table = GetFirstTable(child);
         if (table != null)
         {
             return(table);
         }
         index++;
     }
     return(null);
 }
コード例 #13
0
        //////////////////////////////////////////////////////////////////////////////////////////////////
        // GetStructElementBBox
        // merge into bbox the bounding boxes of the page objects whose mcid matches the element's content children
        //////////////////////////////////////////////////////////////////////////////////////////////////
        /// <summary>
        /// Merges into <paramref name="bbox"/> the bounding boxes of all page objects whose content
        /// mark carries the mcid of a page-content child of <paramref name="struct_elem"/>, recursing
        /// into struct-element children.
        /// </summary>
        /// <param name="struct_elem">Struct element whose content is measured.</param>
        /// <param name="bbox">Rectangle that is extended. A rectangle with zero width or zero height is
        /// treated as empty and replaced by the first matching object's box.</param>
        /// <returns>true when at least one matching page object was found in this element or in any of
        /// its descendant elements; otherwise false.</returns>
        /// <exception cref="Exception">The page of a content child could not be acquired.</exception>
        private static bool GetStructElementBBox(PdsStructElement struct_elem, ref PdfRect bbox)
        {
            bool result = false;

            for (int i = 0; i < struct_elem.GetNumChildren(); i++)
            {
                var child_type = struct_elem.GetChildType(i);
                if (child_type == PdfStructElementType.kPdsStructChildPageContent)
                {
                    // acquire page on which the element is present
                    PdfDoc  doc  = struct_elem.GetStructTree().GetDoc();
                    PdfPage page = doc.AcquirePage(struct_elem.GetChildPageNumber(i));
                    if (page == null)
                    {
                        throw new Exception("Failed to acquire the page of a struct element content child");
                    }

                    try
                    {
                        // find page objects with the child's mcid and merge their bounding boxes
                        int mcid    = struct_elem.GetChildMcid(i);
                        var content = page.GetContent();
                        for (int j = 0; j < content.GetNumObjects(); j++)
                        {
                            PdsPageObject page_object = content.GetObject(j);

                            // check if this page object has the same mcid
                            PdsContentMark content_mark = page_object.GetContentMark();
                            if (content_mark != null && content_mark.GetTagMcid() == mcid)
                            {
                                PdfRect elem_bbox = page_object.GetBBox();
                                if ((bbox.left - bbox.right == 0) || (bbox.top - bbox.bottom == 0))
                                {
                                    // nothing collected so far: start from this object's box
                                    bbox = elem_bbox;
                                }
                                else
                                {
                                    bbox.left   = Math.Min(bbox.left, elem_bbox.left);
                                    bbox.right  = Math.Max(bbox.right, elem_bbox.right);
                                    bbox.top    = Math.Max(bbox.top, elem_bbox.top);
                                    bbox.bottom = Math.Min(bbox.bottom, elem_bbox.bottom);
                                }
                                result = true;
                            }
                        }
                    }
                    finally
                    {
                        // the acquired page was previously never released
                        page.Release();
                    }
                }
                else if (child_type == PdfStructElementType.kPdsStructChildElement)
                {
                    PdsObject        kid_obj  = struct_elem.GetChildObject(i);
                    PdsStructElement kid_elem = struct_elem.GetStructTree().GetStructElementFromObject(kid_obj);

                    // content found in a nested element counts as a hit for this element too
                    if (GetStructElementBBox(kid_elem, ref bbox))
                    {
                        result = true;
                    }
                }
            }
            return(result);
        }
コード例 #14
0
 /// <summary>
 /// Runs the element overload of GetFirstParagraph on each top-level kid of
 /// <paramref name="struct_tree"/> and returns the first non-null result.
 /// </summary>
 /// <param name="struct_tree">Structure tree whose top-level kids are searched.</param>
 /// <returns>The first paragraph element found below a top-level kid, or null when there is none.</returns>
 private static PdsStructElement GetFirstParagraph(PdsStructTree struct_tree)
 {
     int index = 0;
     while (index < struct_tree.GetNumKids())
     {
         PdsObject        rootObject = struct_tree.GetKidObject(index);
         PdsStructElement root       = struct_tree.AcquireStructElement(rootObject);

         // the top-level kid is released whether or not a paragraph was found below it
         PdsStructElement found = GetFirstParagraph(root);
         root.Release();
         if (found != null)
         {
             return(found);
         }
         index++;
     }
     return(null);
 }
コード例 #15
0
        /// <summary>
        /// Opens <paramref name="openPath"/>, removes existing tags, auto-tags the document and passes
        /// every top-level struct element to <c>ProcessStructElement</c>.
        /// </summary>
        /// <remarks>
        /// The document is closed in a <c>finally</c> block so that a failing step no longer leaves it
        /// open.
        /// </remarks>
        /// <exception cref="Exception">Opening, tag removal or auto-tagging failed.</exception>
        public static void Run(
            String openPath                             // source PDF document
            )
        {
            Pdfix pdfix = PdfixEngine.Instance;

            PdfDoc doc = pdfix.OpenDoc(openPath, "");

            if (doc == null)
            {
                throw new Exception(pdfix.GetError());
            }

            try
            {
                // cleanup any previous structure tree
                if (!doc.RemoveTags(null, null))
                {
                    throw new Exception(pdfix.GetErrorType().ToString());
                }

                // autotag document first
                if (!doc.AddTags(null, null))
                {
                    throw new Exception(pdfix.GetErrorType().ToString());
                }

                // read document structure tree
                PdsStructTree struct_tree = doc.GetStructTree();
                if (struct_tree == null)
                {
                    Console.WriteLine("No Tags available");
                }
                else
                {
                    for (var i = 0; i < struct_tree.GetNumChildren(); i++)
                    {
                        PdsObject        kid_object  = struct_tree.GetChildObject(i);
                        PdsStructElement struct_elem = struct_tree.GetStructElementFromObject(kid_object);
                        ProcessStructElement(doc, struct_elem, "");
                    }
                }
            }
            finally
            {
                doc.Close();
            }
        }
コード例 #16
0
        /// <summary>
        /// Opens <paramref name="openPath"/>, sets a new alternate text on the element returned by
        /// <c>GetFirstFigure</c> and saves the document to <paramref name="savePath"/>.
        /// </summary>
        /// <remarks>
        /// The document is closed in a <c>finally</c> block so that a failing step no longer leaves it
        /// open.
        /// </remarks>
        /// <exception cref="Exception">Opening, the figure lookup, setting the alternate text or saving
        /// failed.</exception>
        public static void Run(
            String openPath,                            // source PDF document
            String savePath                             // dest PDF document
            )
        {
            Pdfix pdfix = PdfixEngine.Instance;

            PdfDoc doc = pdfix.OpenDoc(openPath, "");

            if (doc == null)
            {
                throw new Exception(pdfix.GetError());
            }

            try
            {
                // get the struct tree
                PdsStructTree struct_tree = doc.GetStructTree();
                if (struct_tree == null)
                {
                    throw new Exception(pdfix.GetErrorType().ToString());
                }

                PdsStructElement figure = GetFirstFigure(struct_tree);
                if (figure == null)
                {
                    throw new Exception("No figure found.");
                }

                if (!figure.SetAlt("This is a new alternate text"))
                {
                    throw new Exception(pdfix.GetError());
                }

                if (!doc.Save(savePath, Pdfix.kSaveFull))
                {
                    throw new Exception(pdfix.GetError());
                }
            }
            finally
            {
                doc.Close();
            }
        }
コード例 #17
0
        //////////////////////////////////////////////////////////////////////////////////////////////////
        // TagParagraphAsHeading
        // re-tag the struct element to heading based on font properties
        //////////////////////////////////////////////////////////////////////////////////////////////////
        /// <summary>
        /// Re-tags "P" struct elements as headings based on the font of their text: a font name
        /// containing "Black" with a size of at least 25 becomes "H1", a font name containing "Bold"
        /// with a size of at least 16 becomes "H2". Elements of any other type are searched recursively
        /// through their struct-element kids.
        /// </summary>
        /// <param name="struct_elem">Struct element to inspect.</param>
        static void TagParagraphAsHeading(PdsStructElement struct_elem)
        {
            if (struct_elem.GetType_(true) != "P")
            {
                // not a paragraph: descend into the struct-element kids
                for (int index = 0; index < struct_elem.GetNumKids(); index++)
                {
                    if (struct_elem.GetKidType(index) != PdfStructElementType.kPdsStructKidElement)
                    {
                        continue;
                    }

                    PdsObject        childObject = struct_elem.GetKidObject(index);
                    PdsStructElement child       = struct_elem.GetStructTree().AcquireStructElement(childObject);
                    TagParagraphAsHeading(child);
                    child.Release();
                }
                return;
            }

            // paragraph: decide on a heading level from its text state, kids are not visited
            PdfTextState state = GetParagraphTextState(struct_elem);

            string fontName = "";
            if (state.font != null)
            {
                fontName = state.font.GetFontName();
            }

            string heading = null;
            if (fontName.Contains("Black") && state.font_size >= 25)
            {
                heading = "H1";
            }
            else if (fontName.Contains("Bold") && state.font_size >= 16)
            {
                heading = "H2";
            }

            // only touch the tag when one of the heading rules matched
            if (heading != null)
            {
                struct_elem.SetType(heading);
            }
        }
コード例 #18
0
        // process the children of a struct element and collect all bboxes of the content elements
        // bounding boxes of child elements and content can be placed on different pages
        /// <summary>
        /// Collects bounding boxes for <paramref name="struct_elem"/>: struct-element kids contribute
        /// the boxes gathered recursively from them, stream-content and page-content kids contribute
        /// the boxes that <c>GetMcidBBoxes</c> returns for their page number and mcid. Other kid kinds
        /// contribute nothing.
        /// </summary>
        /// <param name="doc">Document passed on to <c>GetMcidBBoxes</c>.</param>
        /// <param name="struct_elem">Struct element whose kids are processed.</param>
        /// <returns>The collected rectangles, in kid order.</returns>
        /// <exception cref="Exception">A kid struct element could not be acquired.</exception>
        static List<PdfRect> GetStructElementBboxes(PdfDoc doc, PdsStructElement struct_elem)
        {
            var collected = new List<PdfRect>();
            int kidCount  = struct_elem.GetNumKids();

            for (int index = 0; index < kidCount; index++)
            {
                var kidObject = struct_elem.GetKidObject(index);
                var kidType   = struct_elem.GetKidType(index);

                if (kidType == PdfStructElementType.kPdsStructKidElement)
                {
                    var child = struct_elem.GetStructTree().AcquireStructElement(kidObject);
                    if (child == null)
                    {
                        throw new Exception(pdfix.GetErrorType().ToString());
                    }
                    collected.AddRange(GetStructElementBboxes(doc, child));
                    child.Release();
                }
                else if (kidType == PdfStructElementType.kPdsStructKidStreamContent ||
                         kidType == PdfStructElementType.kPdsStructKidPageContent)
                {
                    // marked content: look the boxes up by page number and mcid
                    var pageNumber = struct_elem.GetKidPageNumber(index);
                    var mcid       = struct_elem.GetKidMcid(index);
                    collected.AddRange(GetMcidBBoxes(doc, pageNumber, mcid));
                }
                // object kids and any other kid kind add no boxes
            }
            return(collected);
        }
コード例 #19
0
        //////////////////////////////////////////////////////////////////////////////////////////////////
        // GetParagraphTextState
        // get the text state of the text objects inside paragraph by iterating content kid objects
        //////////////////////////////////////////////////////////////////////////////////////////////////
        /// <summary>
        /// Looks through the page-content children of <paramref name="struct_elem"/> and returns the
        /// first text state with a non-zero font size that <c>GetPageObjectTextState</c> reports for a
        /// page object and the child's mcid.
        /// </summary>
        /// <param name="struct_elem">Struct element whose content children are inspected.</param>
        /// <returns>The first text state with a non-zero font size, or a new empty
        /// <c>PdfTextState</c> when none is found.</returns>
        /// <exception cref="Exception">The page of a content child could not be acquired.</exception>
        static PdfTextState GetParagraphTextState(PdsStructElement struct_elem)
        {
            for (int i = 0; i < struct_elem.GetNumChildren(); i++)
            {
                if (struct_elem.GetChildType(i) != PdfStructElementType.kPdsStructChildPageContent)
                {
                    continue;
                }

                // acquire page on which the element is present
                PdfDoc  doc  = struct_elem.GetStructTree().GetDoc();
                PdfPage page = doc.AcquirePage(struct_elem.GetChildPageNumber(i));
                if (page == null)
                {
                    throw new Exception("Failed to acquire the page of a struct element content child");
                }

                try
                {
                    // find text object with mcid on the page to get the text state
                    int mcid    = struct_elem.GetChildMcid(i);
                    var content = page.GetContent();

                    // Visit every page object: the loop previously never advanced and returned the
                    // state of the first object only, whatever its mcid.
                    for (int j = 0; j < content.GetNumObjects(); j++)
                    {
                        var ts = GetPageObjectTextState(content.GetObject(j), mcid);

                        // assumes a zero font size marks "no text state for this object" - TODO confirm
                        // against GetPageObjectTextState
                        if (ts.font_size != 0)
                        {
                            return(ts);
                        }
                    }
                }
                finally
                {
                    // release the page on every path, including when a helper throws
                    page.Release();
                }
            }
            return(new PdfTextState());
        }
コード例 #20
0
        // process the children of a struct element and collect all bboxes of the content elements
        // bounding boxes of child elements and content can be placed on different pages
        /// <summary>
        /// Collects bounding boxes for <paramref name="struct_elem"/>: struct-element children
        /// contribute the boxes gathered recursively from them, stream-content and page-content
        /// children contribute the boxes that <c>GetMcidBBoxes</c> returns for their page number and
        /// mcid. Other child kinds contribute nothing.
        /// </summary>
        /// <param name="doc">Document passed on to <c>GetMcidBBoxes</c>.</param>
        /// <param name="struct_elem">Struct element whose children are processed.</param>
        /// <returns>The collected rectangles, in child order.</returns>
        static List<PdfRect> GetStructElementBboxes(PdfDoc doc, PdsStructElement struct_elem)
        {
            var collected  = new List<PdfRect>();
            int childCount = struct_elem.GetNumChildren();

            for (int index = 0; index < childCount; index++)
            {
                var childObject = struct_elem.GetChildObject(index);
                var childType   = struct_elem.GetChildType(index);

                if (childType == PdfStructElementType.kPdsStructChildElement)
                {
                    var child = struct_elem.GetStructTree().GetStructElementFromObject(childObject);
                    if (child == null)
                    {
                        PdfixEngine.ThrowException();
                    }
                    collected.AddRange(GetStructElementBboxes(doc, child));
                }
                else if (childType == PdfStructElementType.kPdsStructChildStreamContent ||
                         childType == PdfStructElementType.kPdsStructChildPageContent)
                {
                    // marked content: look the boxes up by page number and mcid
                    var pageNumber = struct_elem.GetChildPageNumber(index);
                    var mcid       = struct_elem.GetChildMcid(index);
                    collected.AddRange(GetMcidBBoxes(doc, pageNumber, mcid));
                }
                // object children and any other child kind add no boxes
            }
            return(collected);
        }
コード例 #21
0
        // ProcessStructElement
        // Prints a structure element and, recursively, all of its kid elements to the console.
        // The type, title, actual text and alternate text are written when non-empty, the page
        // number when it is not -1, and the bounding boxes for every element except "Document".
        // Marked-content kids are reported by their MCID (stream content also by page number).
        //   doc         - document the element belongs to (used to collect bounding boxes)
        //   struct_elem - element to print; an Exception is thrown when it is null
        //   indent      - prefix of the parent level; one space is appended for this level
        static void ProcessStructElement(PdfDoc doc, PdsStructElement struct_elem, string indent)
        {
            indent += " ";
            if (struct_elem == null)
            {
                throw new Exception(pdfix.GetErrorType().ToString());
            }

            // get the element type
            string type_str = struct_elem.GetType_(true);

            if (type_str.Length != 0)
            {
                Console.WriteLine(indent + "Struct Element: " + type_str);
            }

            // get struct element bounding boxes (can be on multiple pages)
            // skip document struct element - can take long time to collect all bboxes
            if (type_str != "Document")
            {
                foreach (PdfRect bbox in GetStructElementBboxes(doc, struct_elem))
                {
                    Console.WriteLine(indent + "[" + bbox.left + ", " + bbox.bottom + ", " + bbox.right + ", " +
                                      bbox.top + "]");
                }
            }

            string title_str = struct_elem.GetTitle();

            if (title_str.Length != 0)
            {
                Console.WriteLine(indent + "title: " + title_str);
            }

            string actual_text_str = struct_elem.GetActualText();

            if (actual_text_str.Length != 0)
            {
                Console.WriteLine(indent + "actual text: " + actual_text_str);
            }

            string alt_str = struct_elem.GetAlt();

            if (alt_str.Length != 0)
            {
                // concatenate: WriteLine(indent, text) would treat indent as a format string
                // and drop the text entirely
                Console.WriteLine(indent + "alt: " + alt_str);
            }

            var page_num = struct_elem.GetPageNumber();

            if (page_num != -1)
            {
                Console.WriteLine(indent + "Page number: " + page_num);
            }

            int num_kids = struct_elem.GetNumKids();

            for (int i = 0; i < num_kids; i++)
            {
                var kid_obj = struct_elem.GetKidObject(i);
                // based on structure element you can obtain different data
                switch (struct_elem.GetKidType(i))
                {
                case PdfStructElementType.kPdsStructKidElement:
                {
                    var kid_struct_elem = struct_elem.GetStructTree().AcquireStructElement(kid_obj);
                    if (kid_struct_elem == null)
                    {
                        throw new Exception(pdfix.GetErrorType().ToString());
                    }
                    try
                    {
                        ProcessStructElement(doc, kid_struct_elem, indent);
                    }
                    finally
                    {
                        // release the acquired kid even when processing it throws
                        kid_struct_elem.Release();
                    }
                }
                break;

                case PdfStructElementType.kPdsStructKidObject:
                    break;

                case PdfStructElementType.kPdsStructKidStreamContent:
                {
                    var kid_page_num = struct_elem.GetKidPageNumber(i);
                    Console.WriteLine(indent + "Kid Page number: " + kid_page_num);
                    var mcid = struct_elem.GetKidMcid(i);
                    Console.WriteLine(indent + "MCID: " + mcid);
                }
                break;

                case PdfStructElementType.kPdsStructKidPageContent:
                {
                    var mcid = struct_elem.GetKidMcid(i);
                    Console.WriteLine(indent + "MCID: " + mcid);
                }
                break;
                }
            }
            Console.WriteLine("");
        }
コード例 #22
0
        // Run
        // Re-tags the document, looks up its first paragraph and re-inserts that paragraph at
        // index GetNumKids() - 1 of its parent struct element, then saves the result.
        // Throws an Exception as soon as one of the SDK calls reports a failure.
        public static void Run(
            String email,                               // authorization email
            String licenseKey,                          // authorization license key
            String openPath,                            // source PDF document
            String savePath                             // dest PDF document
            )
        {
            pdfix = new Pdfix();
            if (pdfix == null)
            {
                throw new Exception("Pdfix initialization fail");
            }

            if (!pdfix.Authorize(email, licenseKey))
            {
                throw new Exception(pdfix.GetErrorType().ToString());
            }

            PdfDoc doc = pdfix.OpenDoc(openPath, "");

            if (doc == null)
            {
                throw new Exception(pdfix.GetError());
            }

            // cleanup any previous structure tree
            if (!doc.RemoveTags(null, IntPtr.Zero))
            {
                throw new Exception(pdfix.GetErrorType().ToString());
            }

            // autotag document first
            if (!doc.AddTags(null, IntPtr.Zero))
            {
                throw new Exception(pdfix.GetErrorType().ToString());
            }

            // get the struct tree
            PdsStructTree struct_tree = doc.GetStructTree();

            if (struct_tree == null)
            {
                throw new Exception(pdfix.GetErrorType().ToString());
            }

            PdsStructElement paragraph = GetFirstParagraph(struct_tree);

            if (paragraph == null)
            {
                // this sample looks for a paragraph, so report the missing paragraph
                throw new Exception("No paragraph found.");
            }

            // move paragraph to the back of its parent
            PdsStructElement parent = struct_tree.AcquireStructElement(paragraph.GetParentObject());

            if (parent == null)
            {
                throw new Exception(pdfix.GetErrorType().ToString());
            }

            try
            {
                if (!paragraph.SetParent(parent, parent.GetNumKids() - 1))
                {
                    throw new Exception(pdfix.GetErrorType().ToString());
                }
            }
            finally
            {
                // the parent was acquired above, so hand it back once it is no longer needed
                parent.Release();
            }

            if (!doc.Save(savePath, PdfSaveFlags.kSaveFull))
            {
                throw new Exception(pdfix.GetError());
            }

            doc.Close();
            pdfix.Destroy();
        }
コード例 #23
0
        // Run
        // Re-tags the document and passes every top-level kid of the structure tree to
        // TagParagraphAsHeading, then saves the result to savePath.
        // Throws an Exception as soon as one of the SDK calls reports a failure.
        public static void Run(
            String email,                               // authorization email
            String licenseKey,                          // authorization license key
            String openPath,                            // source PDF document
            String savePath                             // dest PDF document
            )
        {
            pdfix = new Pdfix();
            if (pdfix == null)
            {
                throw new Exception("Pdfix initialization fail");
            }

            if (!pdfix.Authorize(email, licenseKey))
            {
                throw new Exception(pdfix.GetErrorType().ToString());
            }

            PdfDoc doc = pdfix.OpenDoc(openPath, "");

            if (doc == null)
            {
                throw new Exception(pdfix.GetError());
            }

            // cleanup any previous structure tree
            if (!doc.RemoveTags(null, IntPtr.Zero))
            {
                throw new Exception(pdfix.GetErrorType().ToString());
            }

            // autotag document first
            if (!doc.AddTags(null, IntPtr.Zero))
            {
                throw new Exception(pdfix.GetErrorType().ToString());
            }

            // get the struct tree
            PdsStructTree struct_tree = doc.GetStructTree();

            if (struct_tree == null)
            {
                throw new Exception(pdfix.GetErrorType().ToString());
            }

            // re-tag the top-level struct elements (and their kids) as headings
            for (int i = 0; i < struct_tree.GetNumKids(); i++)
            {
                PdsObject        kid_obj  = struct_tree.GetKidObject(i);
                PdsStructElement kid_elem = struct_tree.AcquireStructElement(kid_obj);
                if (kid_elem == null)
                {
                    // TagParagraphAsHeading dereferences the element right away
                    throw new Exception(pdfix.GetErrorType().ToString());
                }
                try
                {
                    TagParagraphAsHeading(kid_elem);
                }
                finally
                {
                    // release the acquired kid even when re-tagging throws
                    kid_elem.Release();
                }
            }

            if (!doc.Save(savePath, PdfSaveFlags.kSaveFull))
            {
                throw new Exception(pdfix.GetError());
            }

            doc.Close();
            pdfix.Destroy();
        }
コード例 #24
0
        // Run
        // Auto-tags the document, empties its first table element, describes the table area
        // as an image element with alternate text, re-tags the untagged page content under
        // the former table element and finally turns that element into a "Sect".
        // Throws an Exception as soon as one of the checked SDK calls reports a failure.
        public static void Run(
            String openPath,                            // source PDF document
            String savePath                             // dest PDF document
            )
        {
            Pdfix  pdfix = PdfixEngine.Instance;
            PdfDoc doc   = pdfix.OpenDoc(openPath, "");

            if (doc == null)
            {
                throw new Exception(pdfix.GetError());
            }

            // drop any previous structure tree, then let the SDK auto-tag the document
            if (!doc.RemoveTags(null, null) || !doc.AddTags(null, null))
            {
                throw new Exception(pdfix.GetErrorType().ToString());
            }

            // the structure tree is the entry point for the table lookup
            PdsStructTree tree = doc.GetStructTree();

            if (tree == null)
            {
                throw new Exception(pdfix.GetErrorType().ToString());
            }

            PdsStructElement table = GetFirstTable(tree);

            if (table == null)
            {
                throw new Exception("No table found.");
            }

            // remember where the table is located before its content gets detached
            PdfRect table_bbox = new PdfRect();

            GetStructElementBBox(table, ref table_bbox);

            // detach all children, last to first, so the table content becomes untagged
            int child_index = table.GetNumChildren() - 1;

            while (child_index >= 0)
            {
                table.RemoveChild(child_index);
                child_index--;
            }

            // describe the table area on the first page as an image element
            PdfPage    first_page  = doc.AcquirePage(0);
            PdePageMap element_map = first_page.AcquirePageMap();
            PdeElement image_elem  = element_map.CreateElement(PdfElementType.kPdeImage, null);

            image_elem.SetBBox(table_bbox);
            image_elem.SetAlt("This is image caption");

            // set the "ignore_tags" template property before the elements are re-created
            var template = doc.GetTemplate();

            template.SetProperty("ignore_tags", 1);

            // recognize the page content that is not tagged yet
            PdePageMap retag_map = first_page.AcquirePageMap();

            if (retag_map == null || !retag_map.CreateElements(null, null))
            {
                throw new Exception(pdfix.GetError());
            }

            // hang the new tags under the former table element and rename it to a section
            if (!element_map.AddTags(table, null, null) || !table.SetType("Sect"))
            {
                throw new Exception(pdfix.GetErrorType().ToString());
            }

            if (!doc.Save(savePath, Pdfix.kSaveFull))
            {
                throw new Exception(pdfix.GetError());
            }

            doc.Close();
        }
コード例 #25
0
        // Run
        // Auto-tags the document, adds a link annotation covering the bounding box of the
        // first paragraph to the first page, re-tags the document so the new annotation is
        // picked up, and saves the result to savePath.
        // Throws an Exception as soon as one of the checked SDK calls reports a failure.
        public static void Run(
            String email,                               // authorization email
            String licenseKey,                          // authorization license key
            String openPath,                            // source PDF document
            String savePath                             // dest PDF document
            )
        {
            pdfix = new Pdfix();
            if (pdfix == null)
            {
                throw new Exception("Pdfix initialization fail");
            }

            if (!pdfix.Authorize(email, licenseKey))
            {
                throw new Exception(pdfix.GetErrorType().ToString());
            }

            PdfDoc document = pdfix.OpenDoc(openPath, "");

            if (document == null)
            {
                throw new Exception(pdfix.GetError());
            }

            // drop any previous structure tree, then let the SDK auto-tag the document
            if (!document.RemoveTags(null, IntPtr.Zero) || !document.AddTags(null, IntPtr.Zero))
            {
                throw new Exception(pdfix.GetErrorType().ToString());
            }

            // the structure tree is the entry point for the paragraph lookup
            PdsStructTree tree = document.GetStructTree();

            if (tree == null)
            {
                throw new Exception(pdfix.GetErrorType().ToString());
            }

            PdsStructElement first_paragraph = GetFirstParagraph(tree);

            if (first_paragraph == null)
            {
                throw new Exception("No paragraph found.");
            }

            // the annotation is placed over the paragraph's bounding box
            PdfRect link_rect = new PdfRect();

            GetStructElementBBox(first_paragraph, ref link_rect);

            // create the link annotation on the first page
            PdfPage      first_page = document.AcquirePage(0);
            PdfLinkAnnot link       = first_page.AddLinkAnnot(0, link_rect);

            if (link == null)
            {
                throw new Exception(pdfix.GetErrorType().ToString());
            }

            // rebuild the tags so that the new link annotation becomes part of the structure
            if (!document.RemoveTags(null, IntPtr.Zero) || !document.AddTags(null, IntPtr.Zero))
            {
                throw new Exception(pdfix.GetErrorType().ToString());
            }

            if (!document.Save(savePath, PdfSaveFlags.kSaveFull))
            {
                throw new Exception(pdfix.GetError());
            }

            document.Close();
            pdfix.Destroy();
        }
コード例 #26
0
        //////////////////////////////////////////////////////////////////////////////////////////////////
        // TagParagraphAsHeading
        // re-tag the struct element, and recursively its kid struct elements, as a heading
        // based on the font size of the text state returned by GetParagraphTextState:
        //   font_size >= 14 -> "H1", >= 12 -> "H2", >= 6 -> "H3", otherwise the type is kept
        // NOTE: the rule is applied to every struct element, not only to "P" elements
        // throws an Exception when a kid struct element cannot be acquired
        //////////////////////////////////////////////////////////////////////////////////////////////////
        static void TagParagraphAsHeading(PdsStructElement struct_elem)
        {
            // get the text state used to pick the heading level
            PdfTextState ts = GetParagraphTextState(struct_elem);

            string tag_type = "";

            Console.WriteLine("Before replacing..");

            if (ts.font_size >= 14)
            {
                tag_type = "H1";
                Console.WriteLine("Replacing paragraph with Heading 1");
            }
            else if (ts.font_size >= 12)
            {
                tag_type = "H2";
                Console.WriteLine("Replacing paragraph with Heading 2");
            }
            else if (ts.font_size >= 6)
            {
                tag_type = "H3";
                Console.WriteLine("Replacing paragraph with Heading 3");
            }

            // update tag type
            if (tag_type.Length != 0)
            {
                struct_elem.SetType(tag_type);
            }

            // search kid struct elements; content kids carry no struct element to re-tag
            for (int i = 0; i < struct_elem.GetNumKids(); i++)
            {
                if (struct_elem.GetKidType(i) != PdfStructElementType.kPdsStructKidElement)
                {
                    continue;
                }

                PdsObject        kid_obj  = struct_elem.GetKidObject(i);
                PdsStructElement kid_elem = struct_elem.GetStructTree().AcquireStructElement(kid_obj);

                if (kid_elem == null)
                {
                    // fail with a clear message instead of a null dereference in the recursion
                    throw new Exception("Failed to acquire kid struct element.");
                }

                try
                {
                    TagParagraphAsHeading(kid_elem);
                }
                finally
                {
                    // release the acquired kid even when re-tagging throws
                    kid_elem.Release();
                }
            }
        }
コード例 #27
0
        // Run
        // Auto-tags the document, creates a link annotation over the bounding box of the
        // first paragraph and adds it to the first page, re-tags the document so the new
        // annotation is picked up, and saves the result to savePath.
        // Throws an Exception as soon as one of the checked SDK calls reports a failure.
        public static void Run(
            String openPath,                            // source PDF document
            String savePath                             // dest PDF document
            )
        {
            Pdfix pdfix = PdfixEngine.Instance;

            PdfDoc doc = pdfix.OpenDoc(openPath, "");

            if (doc == null)
            {
                throw new Exception(pdfix.GetError());
            }

            // cleanup any previous structure tree
            if (!doc.RemoveTags(null, null))
            {
                throw new Exception(pdfix.GetErrorType().ToString());
            }

            // autotag document first
            if (!doc.AddTags(null, null))
            {
                throw new Exception(pdfix.GetErrorType().ToString());
            }

            // get the struct tree
            PdsStructTree struct_tree = doc.GetStructTree();

            if (struct_tree == null)
            {
                throw new Exception(pdfix.GetErrorType().ToString());
            }

            PdsStructElement paragraph = GetFirstParagraph(struct_tree);

            if (paragraph == null)
            {
                throw new Exception("No paragraph found.");
            }

            PdfRect annot_bbox = new PdfRect();

            GetStructElementBBox(paragraph, ref annot_bbox);

            // add new link annotation to the page
            PdfPage      page  = doc.AcquirePage(0);
            PdfLinkAnnot annot = (PdfLinkAnnot)page.CreateAnnot(PdfAnnotSubtype.kAnnotLink, annot_bbox);

            // verify the annotation was created before handing it to the page
            if (annot == null)
            {
                throw new Exception(pdfix.GetErrorType().ToString());
            }
            page.AddAnnot(0, annot);

            // re-tag the document to include the link annotation
            if (!doc.RemoveTags(null, null))
            {
                throw new Exception(pdfix.GetErrorType().ToString());
            }
            if (!doc.AddTags(null, null))
            {
                throw new Exception(pdfix.GetErrorType().ToString());
            }

            if (!doc.Save(savePath, Pdfix.kSaveFull))
            {
                throw new Exception(pdfix.GetError());
            }

            doc.Close();
        }
コード例 #28
0
        // ProcessStructElement
        // Prints a structure element and, recursively, all of its kid elements to the console.
        // For every element the type, title, actual text, alternate text and page number are
        // written, each prefixed with the current indent; marked-content kids are reported by
        // their MCID (stream content also by page number).
        //   struct_elem - element to print; an Exception is thrown when it is null
        //   indent      - prefix of the parent level; one space is appended for this level
        static void ProcessStructElement(PdsStructElement struct_elem, string indent)
        {
            indent += " ";
            if (struct_elem == null)
            {
                throw new Exception(pdfix.GetErrorType().ToString());
            }

            string type_str = struct_elem.GetType_(true);

            Console.WriteLine(indent + type_str);

            string title_str = struct_elem.GetTitle();

            Console.WriteLine(indent + title_str);

            string actual_text_str = struct_elem.GetActualText();

            Console.WriteLine(indent + actual_text_str);

            string alt_str = struct_elem.GetAlt();

            // concatenate: WriteLine(indent, text) would treat indent as a format string
            // and drop the text entirely
            Console.WriteLine(indent + alt_str);

            var page_num = struct_elem.GetPageNumber();

            Console.WriteLine(indent + "Page number: " + page_num);

            int num_kids = struct_elem.GetNumKids();

            for (int i = 0; i < num_kids; i++)
            {
                var kid_obj = struct_elem.GetKidObject(i);
                // based on structure element you can obtain different data
                switch (struct_elem.GetKidType(i))
                {
                case PdfStructElementType.kPdsStructKidElement:
                {
                    var kid_struct_elem = struct_elem.GetStructTree().AcquireStructElement(kid_obj);
                    if (kid_struct_elem == null)
                    {
                        throw new Exception(pdfix.GetErrorType().ToString());
                    }
                    try
                    {
                        ProcessStructElement(kid_struct_elem, indent);
                    }
                    finally
                    {
                        // release the acquired kid even when processing it throws
                        kid_struct_elem.Release();
                    }
                }
                break;

                case PdfStructElementType.kPdsStructKidObject:
                    break;

                case PdfStructElementType.kPdsStructKidStreamContent:
                {
                    var kid_page_num = struct_elem.GetKidPageNumber(i);
                    Console.WriteLine(indent + "Kid Page number: " + kid_page_num);
                    var mcid = struct_elem.GetKidMcid(i);
                    Console.WriteLine(indent + "MCID: " + mcid);
                }
                break;

                case PdfStructElementType.kPdsStructKidPageContent:
                {
                    var mcid = struct_elem.GetKidMcid(i);
                    Console.WriteLine(indent + "MCID: " + mcid);
                }
                break;
                }
            }
        }
コード例 #29
0
        // Run
        // Auto-tags the document, empties its first table element, describes the table area
        // as an image element with alternate text, re-tags the untagged page content under
        // the former table element and finally turns that element into a "Sect".
        // Throws an Exception as soon as one of the checked SDK calls reports a failure.
        public static void Run(
            String email,                               // authorization email
            String licenseKey,                          // authorization license key
            String openPath,                            // source PDF document
            String savePath                             // dest PDF document
            )
        {
            pdfix = new Pdfix();
            if (pdfix == null)
            {
                throw new Exception("Pdfix initialization fail");
            }

            if (!pdfix.Authorize(email, licenseKey))
            {
                throw new Exception(pdfix.GetErrorType().ToString());
            }

            PdfDoc document = pdfix.OpenDoc(openPath, "");

            if (document == null)
            {
                throw new Exception(pdfix.GetError());
            }

            // drop any previous structure tree, then let the SDK auto-tag the document
            if (!document.RemoveTags(null, IntPtr.Zero) || !document.AddTags(null, IntPtr.Zero))
            {
                throw new Exception(pdfix.GetErrorType().ToString());
            }

            // the structure tree is the entry point for the table lookup
            PdsStructTree tree = document.GetStructTree();

            if (tree == null)
            {
                throw new Exception(pdfix.GetErrorType().ToString());
            }

            PdsStructElement table = GetFirstTable(tree);

            if (table == null)
            {
                throw new Exception("No table found.");
            }

            // remember where the table is located before its content gets detached
            PdfRect table_bbox = new PdfRect();

            GetStructElementBBox(table, ref table_bbox);

            // detach all kids, last to first, so the table content becomes untagged
            int kid_index = table.GetNumKids() - 1;

            while (kid_index >= 0)
            {
                table.RemoveKid(kid_index);
                kid_index--;
            }

            // describe the table area on the first page as an image element
            PdfPage    first_page = document.AcquirePage(0);
            PdePageMap map        = first_page.CreatePageMap();
            PdeElement image_elem = map.CreateElement(PdfElementType.kPdeImage, null);

            image_elem.SetBBox(table_bbox);
            image_elem.SetAlt("This is image caption");

            // set the "ignore_tags" template property before the elements are acquired
            PdfDocTemplate template = document.GetDocTemplate();

            template.SetProperty("ignore_tags", 1);

            // recognize the untagged page content and hang its tags under the former table
            if (!map.AcquireElements(null, IntPtr.Zero) || !map.AddTags(table, null, IntPtr.Zero))
            {
                throw new Exception(pdfix.GetErrorType().ToString());
            }

            // the element no longer holds a table, so rename it to a section
            if (!table.SetType("Sect"))
            {
                throw new Exception(pdfix.GetErrorType().ToString());
            }

            if (!document.Save(savePath, PdfSaveFlags.kSaveFull))
            {
                throw new Exception(pdfix.GetError());
            }

            document.Close();
            pdfix.Destroy();
        }