//////////////////////////////////////////////////////////////////////////////////////////////////
        // GetFirstParagraph
        // get reference to the first paragraph on the page
        //////////////////////////////////////////////////////////////////////////////////////////////////
        private static PdsStructElement GetFirstParagraph(PdsStructElement struct_elem)
        {
            // Children are visited in order. Each struct-element child is tested
            // itself and then searched recursively before the next sibling is
            // looked at, so the result is the first "P" in pre-order.
            for (int index = 0; index < struct_elem.GetNumChildren(); index++)
            {
                // Skip children that are not struct elements.
                if (struct_elem.GetChildType(index) != PdfStructElementType.kPdsStructChildElement)
                {
                    continue;
                }

                PdsObject child_obj = struct_elem.GetChildObject(index);
                var       tree      = struct_elem.GetStructTree();
                var       child     = tree.GetStructElementFromObject(child_obj);
                if (child == null)
                {
                    // NOTE(review): presumably raises the SDK's last error - confirm.
                    PdfixEngine.ThrowException();
                }

                // Direct hit: this child is a paragraph.
                if (child.GetType_(true) == "P")
                {
                    return child;
                }

                // Otherwise look inside the child's own subtree.
                var nested = GetFirstParagraph(child);
                if (nested != null)
                {
                    return nested;
                }
            }

            // No paragraph anywhere below struct_elem.
            return null;
        }
예제 #2
0
        // Authorizes the SDK, opens the PDF at openPath and dumps its root object
        // to the console through ParseObject.
        // Throws Exception carrying the SDK error text when authorization or
        // opening the document fails. The document is closed and the SDK instance
        // destroyed on every exit path, including when an exception is thrown.
        public static void Run(
            String email,                               // authorization email
            String licenseKey,                          // authorization license key
            String openPath                             // source PDF document
            )
        {
            // `new` never yields null in C#, so no null check is needed here.
            Pdfix pdfix = new Pdfix();

            try
            {
                if (!pdfix.Authorize(email, licenseKey))
                {
                    throw new Exception(pdfix.GetError());
                }

                PdfDoc doc = pdfix.OpenDoc(openPath, "");

                if (doc == null)
                {
                    throw new Exception(pdfix.GetError());
                }

                try
                {
                    PdsObject rootObj = doc.GetRootObject();

                    ParseObject(rootObj, 1);
                }
                finally
                {
                    // close the document even if parsing throws
                    doc.Close();
                }
            }
            finally
            {
                // the error text above is read before the SDK is torn down
                pdfix.Destroy();
            }
        }
        //////////////////////////////////////////////////////////////////////////////////////////////////
        // GetFirstParagraph
        // get reference to the first paragraph on the page
        //////////////////////////////////////////////////////////////////////////////////////////////////
        private static PdsStructElement GetFirstParagraph(PdsStructElement struct_elem)
        {
            // Pre-order search through the kid struct elements for the first "P".
            for (int index = 0; index < struct_elem.GetNumKids(); index++)
            {
                // Only kids that are struct elements are inspected.
                if (struct_elem.GetKidType(index) != PdfStructElementType.kPdsStructKidElement)
                {
                    continue;
                }

                PdsObject kid_obj = struct_elem.GetKidObject(index);
                var       tree    = struct_elem.GetStructTree();
                var       kid     = tree.AcquireStructElement(kid_obj);
                if (kid == null)
                {
                    throw new Exception(pdfix.GetErrorType().ToString());
                }

                if (kid.GetType_(true) == "P")
                {
                    // Returned without being released: the caller is left holding
                    // the acquired element.
                    return kid;
                }

                // Search the kid's subtree. The intermediate element is released
                // whether or not a paragraph was found beneath it.
                var nested = GetFirstParagraph(kid);
                kid.Release();
                if (nested != null)
                {
                    return nested;
                }
            }

            return null;
        }
        //////////////////////////////////////////////////////////////////////////////////////////////////
        // MoveParagraphToParent
        // find a paragraph ("P") struct element below struct_elem and move it to the last child position of its parent
        //////////////////////////////////////////////////////////////////////////////////////////////////
        private static bool MoveParagraphToParent(PdsStructElement struct_elem)
        {
            // Searches the subtree below struct_elem in pre-order for the first "P"
            // struct element and moves it to the last child index of the element
            // that contains it. Returns true when a paragraph was moved, false when
            // the subtree holds no paragraph. Throws Exception when MoveChild fails.
            for (int i = 0; i < struct_elem.GetNumChildren(); i++)
            {
                if (struct_elem.GetChildType(i) != PdfStructElementType.kPdsStructChildElement)
                {
                    continue;
                }

                PdsObject        kid_obj  = struct_elem.GetChildObject(i);
                PdsStructElement kid_elem = struct_elem.GetStructTree().GetStructElementFromObject(kid_obj);
                if (kid_elem == null)
                {
                    PdfixEngine.ThrowException();
                }

                string type = kid_elem.GetType_(true);
                if (type == "P")
                {
                    if (!struct_elem.MoveChild(i, struct_elem, struct_elem.GetNumChildren() - 1))
                    {
                        throw new Exception("Failed to move the paragraph struct element");
                    }
                    return(true);
                }

                // Previously the result of the first child's subtree was returned
                // unconditionally, so later siblings were never examined. Nothing
                // has been moved when the recursion reports false, so it is safe
                // to keep scanning.
                if (MoveParagraphToParent(kid_elem))
                {
                    return(true);
                }
            }
            return(false);
        }
        //////////////////////////////////////////////////////////////////////////////////////////////////
        // GetFirstTable
        // get reference to the first table ("Table") struct element below struct_elem
        //////////////////////////////////////////////////////////////////////////////////////////////////
        private static PdsStructElement GetFirstTable(PdsStructElement struct_elem)
        {
            // Pre-order search: each struct-element child is checked for the
            // "Table" type and then searched recursively before its next sibling.
            for (int index = 0; index < struct_elem.GetNumChildren(); index++)
            {
                // Children that are not struct elements cannot be tables.
                if (struct_elem.GetChildType(index) != PdfStructElementType.kPdsStructChildElement)
                {
                    continue;
                }

                PdsObject child_obj = struct_elem.GetChildObject(index);
                var       tree      = struct_elem.GetStructTree();
                var       child     = tree.GetStructElementFromObject(child_obj);
                if (child == null)
                {
                    throw new Exception(pdfix.GetErrorType().ToString());
                }

                if (child.GetType_(true) == "Table")
                {
                    return child;
                }

                // Not a table itself - look inside it.
                var nested = GetFirstTable(child);
                if (nested != null)
                {
                    return nested;
                }
            }

            // No table below struct_elem.
            return null;
        }
예제 #6
0
        // Authorizes the SDK, opens the PDF at openPath, rebuilds its tags
        // (RemoveTags followed by AddTags) and passes every top-level struct
        // element to ProcessStructElement.
        // savePath is not used by this method: the document is never saved.
        // Throws Exception with the SDK error when any SDK call fails. The
        // document is closed and the SDK destroyed on every exit path.
        public static void Run(
            String email,                               // authorization email
            String licenseKey,                          // authorization license key
            String openPath,                            // source PDF document
            String savePath                             // dest PDF document
            )
        {
            // `new` never yields null in C#, so no null check is needed here.
            pdfix = new Pdfix();

            try
            {
                if (!pdfix.Authorize(email, licenseKey))
                {
                    throw new Exception(pdfix.GetErrorType().ToString());
                }

                PdfDoc doc = pdfix.OpenDoc(openPath, "");

                if (doc == null)
                {
                    throw new Exception(pdfix.GetError());
                }

                try
                {
                    // cleanup any previous structure tree
                    if (!doc.RemoveTags(null, IntPtr.Zero))
                    {
                        throw new Exception(pdfix.GetErrorType().ToString());
                    }

                    // autotag document first
                    if (!doc.AddTags(null, IntPtr.Zero))
                    {
                        throw new Exception(pdfix.GetErrorType().ToString());
                    }

                    // read document structure tree
                    PdsStructTree struct_tree = doc.GetStructTree();

                    if (struct_tree == null)
                    {
                        Console.WriteLine("No Tags available");
                    }
                    else
                    {
                        for (var i = 0; i < struct_tree.GetNumKids(); i++)
                        {
                            PdsObject        kid_object  = struct_tree.GetKidObject(i);
                            PdsStructElement struct_elem = struct_tree.AcquireStructElement(kid_object);
                            if (struct_elem == null)
                            {
                                // fail with the SDK error instead of a NullReferenceException
                                throw new Exception(pdfix.GetErrorType().ToString());
                            }

                            try
                            {
                                ProcessStructElement(doc, struct_elem, "");
                            }
                            finally
                            {
                                // release the acquired element even if processing throws
                                struct_elem.Release();
                            }
                        }
                    }
                }
                finally
                {
                    doc.Close();
                }
            }
            finally
            {
                pdfix.Destroy();
            }
        }
        // Opens the PDF at openPath, rebuilds its tags (RemoveTags followed by
        // AddTags), runs RemoveParagraph on every top-level struct element, calls
        // MarkUntaggedObjectsAsArtifact on every page and saves the result to
        // savePath.
        // Throws Exception with the SDK error when any SDK call fails. The
        // document is closed on every exit path.
        public static void Run(
            String openPath,                            // source PDF document
            String savePath                             // dest PDF document
            )
        {
            Pdfix pdfix = PdfixEngine.Instance;

            PdfDoc doc = pdfix.OpenDoc(openPath, "");

            if (doc == null)
            {
                throw new Exception(pdfix.GetError());
            }

            try
            {
                // cleanup any previous structure tree
                if (!doc.RemoveTags(null, null))
                {
                    throw new Exception(pdfix.GetErrorType().ToString());
                }

                // autotag document first
                if (!doc.AddTags(null, null))
                {
                    throw new Exception(pdfix.GetErrorType().ToString());
                }

                // get the struct tree
                PdsStructTree struct_tree = doc.GetStructTree();

                if (struct_tree == null)
                {
                    throw new Exception(pdfix.GetErrorType().ToString());
                }

                // remove paragraphs from each top-level struct element
                for (int i = 0; i < struct_tree.GetNumChildren(); i++)
                {
                    PdsObject        kid_obj  = struct_tree.GetChildObject(i);
                    PdsStructElement kid_elem = struct_tree.GetStructElementFromObject(kid_obj);
                    if (kid_elem == null)
                    {
                        // fail with the SDK error instead of a NullReferenceException
                        throw new Exception(pdfix.GetErrorType().ToString());
                    }
                    RemoveParagraph(kid_elem);
                }

                // the struct tree was updated; mark the now untagged objects on each page as artifacts
                for (int i = 0; i < doc.GetNumPages(); i++)
                {
                    PdfPage page = doc.AcquirePage(i);
                    if (page == null)
                    {
                        throw new Exception(pdfix.GetError());
                    }

                    try
                    {
                        MarkUntaggedObjectsAsArtifact(page);
                    }
                    finally
                    {
                        // release the acquired page even if marking throws
                        page.Release();
                    }
                }

                if (!doc.Save(savePath, Pdfix.kSaveFull))
                {
                    throw new Exception(pdfix.GetError());
                }
            }
            finally
            {
                doc.Close();
            }
        }
        //////////////////////////////////////////////////////////////////////////////////////////////////
        // Remove paragraph from structure tree
        // clear the kids of "P" elements and of "Figure" elements without alt text, then drop elements left with no kids
        //////////////////////////////////////////////////////////////////////////////////////////////////
        internal static void RemoveParagraph(PdsStructElement struct_elem)
        {
            // Kids are walked from last to first, so removing kid i does not
            // disturb the indices that are still to be visited.
            // NOTE(review): `count` is not declared in this method (presumably a
            // static field - confirm). It is incremented once per visited kid of
            // any kind, not per removed paragraph, and is shared with the
            // recursive calls below.
            for (int i = struct_elem.GetNumKids() - 1; i >= 0; i--)
            {
                if (struct_elem.GetKidType(i) == PdfStructElementType.kPdsStructKidElement)
                {
                    PdsObject child_obj = struct_elem.GetKidObject(i);
                    var       child     = struct_elem.GetStructTree().AcquireStructElement(child_obj);

                    string tag       = child.GetType_(true);
                    bool   is_figure = tag == "Figure";

                    // Paragraphs are always emptied; figures only when they carry
                    // no alt text (GetAlt is queried for figures only).
                    if (tag == "P" || (is_figure && child.GetAlt().Length == 0))
                    {
                        for (int j = child.GetNumKids() - 1; j >= 0; j--)
                        {
                            if (!child.RemoveKid(j))
                            {
                                throw new Exception(pdfix.GetErrorType().ToString());
                            }
                        }
                    }
                    else if (!is_figure)
                    {
                        // Any other element type is processed recursively.
                        RemoveParagraph(child);
                    }

                    // An element left without kids is removed from its parent.
                    if (child.GetNumKids() == 0)
                    {
                        struct_elem.RemoveKid(i);
                    }

                    child.Release();
                }

                // Stop once the shared counter reaches 2.
                count++;
                if (count >= 2)
                {
                    break;
                }
            }
        }
 // Runs MoveParagraphToParent on each top-level struct element of the tree, in
 // order, and stops at the first one that reports a moved paragraph.
 // Returns true when a paragraph was moved, false otherwise.
 private static bool MoveParagraphToParent(PdsStructTree struct_tree)
 {
     // The original loop had no increment and returned inside its first
     // iteration, so only top-level child 0 was ever searched.
     for (int i = 0; i < struct_tree.GetNumChildren(); i++)
     {
         PdsObject        kid_obj  = struct_tree.GetChildObject(i);
         PdsStructElement kid_elem = struct_tree.GetStructElementFromObject(kid_obj);
         if (MoveParagraphToParent(kid_elem))
         {
             return(true);
         }
     }
     return(false);
 }
        //////////////////////////////////////////////////////////////////////////////////////////////////
        // Remove paragraph from structure tree
        // clear the children of "P" elements and of "Figure" elements without alt text, then drop elements left with no children
        //////////////////////////////////////////////////////////////////////////////////////////////////
        internal static void RemoveParagraph(PdsStructElement struct_elem)
        {
            // Children are walked from last to first, so removing child i does
            // not disturb the indices that are still to be visited.
            // NOTE(review): `count` is not declared in this method (presumably a
            // static field - confirm). It is incremented once per visited child of
            // any kind, not per removed paragraph, and is shared with the
            // recursive calls below.
            for (int i = struct_elem.GetNumChildren() - 1; i >= 0; i--)
            {
                if (struct_elem.GetChildType(i) == PdfStructElementType.kPdsStructChildElement)
                {
                    PdsObject child_obj = struct_elem.GetChildObject(i);
                    var       child     = struct_elem.GetStructTree().GetStructElementFromObject(child_obj);

                    string tag       = child.GetType_(true);
                    bool   is_figure = tag == "Figure";

                    // Paragraphs are always emptied; figures only when they carry
                    // no alt text (GetAlt is queried for figures only).
                    if (tag == "P" || (is_figure && child.GetAlt().Length == 0))
                    {
                        for (int j = child.GetNumChildren() - 1; j >= 0; j--)
                        {
                            if (!child.RemoveChild(j))
                            {
                                PdfixEngine.ThrowException();
                            }
                        }
                    }
                    else if (!is_figure)
                    {
                        // Any other element type is processed recursively.
                        RemoveParagraph(child);
                    }

                    // An element left without children is removed from its parent.
                    if (child.GetNumChildren() == 0)
                    {
                        struct_elem.RemoveChild(i);
                    }
                }

                // Stop once the shared counter reaches 2.
                count++;
                if (count >= 2)
                {
                    break;
                }
            }
        }
 // Searches the top-level struct elements of the tree, in order, and returns
 // the first table found below any of them, or null when there is none.
 private static PdsStructElement GetFirstTable(PdsStructTree struct_tree)
 {
     int index = 0;
     while (index < struct_tree.GetNumChildren())
     {
         PdsObject top_obj  = struct_tree.GetChildObject(index);
         var       top_elem = struct_tree.GetStructElementFromObject(top_obj);

         // Delegate to the per-element search, which looks below top_elem.
         var table = GetFirstTable(top_elem);
         if (table != null)
         {
             return table;
         }

         index++;
     }
     return null;
 }
        //////////////////////////////////////////////////////////////////////////////////////////////////
        // GetStructElementBBox
        // get the union of the bounding boxes of the page objects whose content mark MCID belongs to the element, including nested struct elements
        //////////////////////////////////////////////////////////////////////////////////////////////////
        private static bool GetStructElementBBox(PdsStructElement struct_elem, ref PdfRect bbox)
        {
            // Grows bbox to cover every page object whose content mark carries the
            // MCID of one of struct_elem's page-content children, and recurses
            // into child struct elements.
            // bbox is treated as "not set yet" while its width or height is zero;
            // in that state the first matching object's box replaces it.
            // Returns true when at least one page object contributed to bbox,
            // either directly or through a nested struct element.
            bool result = false;

            for (int i = 0; i < struct_elem.GetNumChildren(); i++)
            {
                var child_type = struct_elem.GetChildType(i);

                if (child_type == PdfStructElementType.kPdsStructChildPageContent)
                {
                    // acquire page on which the element is present
                    PdfDoc  doc  = struct_elem.GetStructTree().GetDoc();
                    PdfPage page = doc.AcquirePage(struct_elem.GetChildPageNumber(i));

                    try
                    {
                        // find page objects with the same mcid on the page
                        int mcid    = struct_elem.GetChildMcid(i);
                        var content = page.GetContent();
                        for (int j = 0; j < content.GetNumObjects(); j++)
                        {
                            PdsPageObject page_object = content.GetObject(j);

                            // check if this page object has the same mcid
                            PdsContentMark content_mark = page_object.GetContentMark();
                            if (content_mark != null && content_mark.GetTagMcid() == mcid)
                            {
                                PdfRect elem_bbox = page_object.GetBBox();
                                if ((bbox.left - bbox.right == 0) || (bbox.top - bbox.bottom == 0))
                                {
                                    bbox = elem_bbox;
                                }
                                else
                                {
                                    bbox.left   = Math.Min(bbox.left, elem_bbox.left);
                                    bbox.right  = Math.Max(bbox.right, elem_bbox.right);
                                    bbox.top    = Math.Max(bbox.top, elem_bbox.top);
                                    bbox.bottom = Math.Min(bbox.bottom, elem_bbox.bottom);
                                }
                                result = true;
                            }
                        }
                    }
                    finally
                    {
                        // the acquired page was previously never released
                        page.Release();
                    }
                }
                else if (child_type == PdfStructElementType.kPdsStructChildElement)
                {
                    PdsObject        kid_obj  = struct_elem.GetChildObject(i);
                    PdsStructElement kid_elem = struct_elem.GetStructTree().GetStructElementFromObject(kid_obj);

                    // propagate success from nested elements; the result used to
                    // be discarded even though bbox had been updated
                    if (GetStructElementBBox(kid_elem, ref bbox))
                    {
                        result = true;
                    }
                }
            }
            return(result);
        }
 // Searches the top-level struct elements of the tree, in order, and returns
 // the first paragraph found below any of them, or null when there is none.
 private static PdsStructElement GetFirstParagraph(PdsStructTree struct_tree)
 {
     int index = 0;
     while (index < struct_tree.GetNumKids())
     {
         PdsObject top_obj  = struct_tree.GetKidObject(index);
         var       top_elem = struct_tree.AcquireStructElement(top_obj);

         // Search below the top-level element, then release it whether or not
         // the search succeeded.
         var found = GetFirstParagraph(top_elem);
         top_elem.Release();
         if (found != null)
         {
             return found;
         }

         index++;
     }
     return null;
 }
예제 #14
0
        // Opens the PDF at openPath, rebuilds its tags (RemoveTags followed by
        // AddTags) and passes every top-level struct element to
        // ProcessStructElement. Prints "No Tags available" when the document has
        // no struct tree.
        // Throws Exception with the SDK error when opening or tagging fails. The
        // document is closed on every exit path.
        public static void Run(
            String openPath                             // source PDF document
            )
        {
            Pdfix pdfix = PdfixEngine.Instance;

            PdfDoc doc = pdfix.OpenDoc(openPath, "");

            if (doc == null)
            {
                throw new Exception(pdfix.GetError());
            }

            try
            {
                // cleanup any previous structure tree
                if (!doc.RemoveTags(null, null))
                {
                    throw new Exception(pdfix.GetErrorType().ToString());
                }

                // autotag document first
                if (!doc.AddTags(null, null))
                {
                    throw new Exception(pdfix.GetErrorType().ToString());
                }

                // read document structure tree
                PdsStructTree struct_tree = doc.GetStructTree();

                if (struct_tree == null)
                {
                    Console.WriteLine("No Tags available");
                }
                else
                {
                    for (var i = 0; i < struct_tree.GetNumChildren(); i++)
                    {
                        PdsObject        kid_object  = struct_tree.GetChildObject(i);
                        PdsStructElement struct_elem = struct_tree.GetStructElementFromObject(kid_object);
                        ProcessStructElement(doc, struct_elem, "");
                    }
                }
            }
            finally
            {
                // close the document even when tagging or processing throws
                doc.Close();
            }
        }
        // Opens the PDF at openPath and dumps its root object to the console
        // through ParseObject.
        // Throws Exception with the SDK error text when the document cannot be
        // opened. The document is closed on every exit path.
        public static void Run(
            String openPath                             // source PDF document
            )
        {
            Pdfix pdfix = PdfixEngine.Instance;

            PdfDoc doc = pdfix.OpenDoc(openPath, "");

            if (doc == null)
            {
                throw new Exception(pdfix.GetError());
            }

            try
            {
                PdsObject rootObj = doc.GetRootObject();

                ParseObject(rootObj, 1);
            }
            finally
            {
                // close the document even if parsing throws
                doc.Close();
            }
        }
        //////////////////////////////////////////////////////////////////////////////////////////////////
        // TagParagraphAsHeading
        // re-tag the struct element to heading based on font properties
        //////////////////////////////////////////////////////////////////////////////////////////////////
        static void TagParagraphAsHeading(PdsStructElement struct_elem)
        {
            if (struct_elem.GetType_(true) != "P")
            {
                // Not a paragraph: descend into every kid that is a struct element.
                for (int index = 0; index < struct_elem.GetNumKids(); index++)
                {
                    if (struct_elem.GetKidType(index) != PdfStructElementType.kPdsStructKidElement)
                    {
                        continue;
                    }

                    PdsObject kid_obj = struct_elem.GetKidObject(index);
                    var       kid     = struct_elem.GetStructTree().AcquireStructElement(kid_obj);
                    TagParagraphAsHeading(kid);
                    kid.Release();
                }
                return;
            }

            // Paragraph: choose a heading level from its text state. Kids of a
            // paragraph are not visited.
            PdfTextState ts = GetParagraphTextState(struct_elem);

            // An absent font is treated as an empty font name.
            string font_name = "";
            if (ts.font != null)
            {
                font_name = ts.font.GetFontName();
            }

            // "Black" fonts of size 25+ become H1, "Bold" fonts of size 16+ become
            // H2; anything else keeps its current type.
            if (font_name.Contains("Black") && ts.font_size >= 25)
            {
                struct_elem.SetType("H1");
            }
            else if (font_name.Contains("Bold") && ts.font_size >= 16)
            {
                struct_elem.SetType("H2");
            }
        }
        ///////////////////////////////////////////////////////////////////////
        // ParseObject
        // dump a PDF object and its nested objects to the console, down to a fixed depth
        ///////////////////////////////////////////////////////////////////////
        private static void ParseObject(PdsObject obj, int level)
        {
            // Writes a textual dump of obj to the console. Each line is prefixed
            // with `level` dashes (except the "array:" and "dictionary:" headers,
            // which are printed without a prefix). Streams, arrays and
            // dictionaries are expanded recursively with level + 1; recursion
            // stops silently when level reaches 3.
            if (level == 3)
            {
                return;             // Don't go too deep, it's just a sample.
            }
            Action <string> dump = Console.WriteLine;

            String indent = new String('-', level);

            dump(indent);

            // a missing object is reported like a PDF null instead of being dereferenced
            if (obj == null)
            {
                dump(indent + "null");
                return;
            }

            // parse element based on type;
            PdfObjectType objType = obj.GetObjectType();

            switch (objType)
            {
            case PdfObjectType.kPdsNull:
                // a null object carries no value; the previous cast to PdsBoolean
                // was a copy-paste of the boolean case
                dump(indent + "null");
                break;

            case PdfObjectType.kPdsBoolean:
                dump(indent + "boolean:" + ((PdsBoolean)obj).GetValue());
                break;

            case PdfObjectType.kPdsNumber:
                dump(indent + "number:" + ((PdsNumber)obj).GetValue());
                break;

            case PdfObjectType.kPdsString:
                dump(indent + "string:" + ((PdsString)obj).GetText());
                break;

            case PdfObjectType.kPdsStream:
                dump(indent + "stream:" + ((PdsStream)obj).GetRawDataSize());
                ParseObject(((PdsStream)obj).GetStreamDict(), level + 1);
                break;

            case PdfObjectType.kPdsArray:
            {
                dump("array:");
                PdsArray arr = (PdsArray)obj;
                for (int i = 0; i < arr.GetNumObjects(); i++)
                {
                    dump(indent + " [" + i + "]");
                    ParseObject(arr.Get(i), level + 1);
                }
            }
            break;

            case PdfObjectType.kPdsDictionary:
            {
                dump("dictionary:");
                PdsDictionary dict = (PdsDictionary)obj;
                for (int i = 0; i < dict.GetNumKeys(); i++)
                {
                    String key = dict.GetKey(i);
                    dump(indent + " /" + key);
                    ParseObject(dict.Get(key), level + 1);
                }
            }
            break;
            }
        }
        // Authorizes the SDK, opens the PDF at openPath, rebuilds its tags
        // (RemoveTags followed by AddTags), runs TagParagraphAsHeading on every
        // top-level struct element and saves the result to savePath.
        // Throws Exception with the SDK error when any SDK call fails. The
        // document is closed and the SDK destroyed on every exit path.
        public static void Run(
            String email,                               // authorization email
            String licenseKey,                          // authorization license key
            String openPath,                            // source PDF document
            String savePath                             // dest PDF document
            )
        {
            // `new` never yields null in C#, so no null check is needed here.
            pdfix = new Pdfix();

            try
            {
                if (!pdfix.Authorize(email, licenseKey))
                {
                    throw new Exception(pdfix.GetErrorType().ToString());
                }

                PdfDoc doc = pdfix.OpenDoc(openPath, "");

                if (doc == null)
                {
                    throw new Exception(pdfix.GetError());
                }

                try
                {
                    // cleanup any previous structure tree
                    if (!doc.RemoveTags(null, IntPtr.Zero))
                    {
                        throw new Exception(pdfix.GetErrorType().ToString());
                    }

                    // autotag document first
                    if (!doc.AddTags(null, IntPtr.Zero))
                    {
                        throw new Exception(pdfix.GetErrorType().ToString());
                    }

                    // get the struct tree
                    PdsStructTree struct_tree = doc.GetStructTree();

                    if (struct_tree == null)
                    {
                        throw new Exception(pdfix.GetErrorType().ToString());
                    }

                    // re-tag paragraphs as headings in each top-level struct element
                    for (int i = 0; i < struct_tree.GetNumKids(); i++)
                    {
                        PdsObject        kid_obj  = struct_tree.GetKidObject(i);
                        PdsStructElement kid_elem = struct_tree.AcquireStructElement(kid_obj);
                        if (kid_elem == null)
                        {
                            // fail with the SDK error instead of a NullReferenceException
                            throw new Exception(pdfix.GetErrorType().ToString());
                        }

                        try
                        {
                            TagParagraphAsHeading(kid_elem);
                        }
                        finally
                        {
                            // release the acquired element even if re-tagging throws
                            kid_elem.Release();
                        }
                    }

                    if (!doc.Save(savePath, PdfSaveFlags.kSaveFull))
                    {
                        throw new Exception(pdfix.GetError());
                    }
                }
                finally
                {
                    doc.Close();
                }
            }
            finally
            {
                pdfix.Destroy();
            }
        }
        //////////////////////////////////////////////////////////////////////////////////////////////////
        // TagParagraphAsHeading
        // re-tag the struct element to heading based on font properties
        //////////////////////////////////////////////////////////////////////////////////////////////////
        static void TagParagraphAsHeading(PdsStructElement struct_elem)
        {
            // Re-tags struct_elem as H1/H2/H3 from the font size reported by
            // GetParagraphTextState, then recurses into its kid struct elements.
            // This variant does not test for the "P" type, so it is applied to
            // every struct element it visits, and it always descends into kids.
            PdfTextState ts = GetParagraphTextState(struct_elem);

            string tag_type = "";

            Console.WriteLine("Before replacing..");

            // font size thresholds: 14+ -> H1, 12+ -> H2, 6+ -> H3, smaller -> unchanged
            if (ts.font_size >= 14)
            {
                tag_type = "H1";
                Console.WriteLine("Replacing paragraph with Heading 1");
            }
            else if (ts.font_size >= 12)
            {
                tag_type = "H2";
                Console.WriteLine("Replacing paragraph with Heading 2");
            }
            else if (ts.font_size >= 6)
            {
                tag_type = "H3";
                // this branch used to log "Heading 2" although it assigns H3
                Console.WriteLine("Replacing paragraph with Heading 3");
            }

            // update tag type
            if (tag_type.Length != 0)
            {
                struct_elem.SetType(tag_type);
            }

            // search kid struct elements; page-content kids need no processing
            for (int i = 0; i < struct_elem.GetNumKids(); i++)
            {
                if (struct_elem.GetKidType(i) == PdfStructElementType.kPdsStructKidElement)
                {
                    PdsObject        kid_obj  = struct_elem.GetKidObject(i);
                    PdsStructElement kid_elem = struct_elem.GetStructTree().AcquireStructElement(kid_obj);

                    TagParagraphAsHeading(kid_elem);
                    kid_elem.Release();
                }
            }
        }