Ejemplo n.º 1
0
    /**
     * Write the general properties of a document to the console and, unless
     * the pCOS mode is 0, continue with the reports that need at least the
     * user password.
     *
     * @param tet The TET object
     * @param doc The TET document handle
     *
     * @throws TETException
     */
    private static void print_infos(TET tet, int doc)
    {
        /* The pCOS mode is fetched before anything is printed; it decides at
         * the end whether further information is reported. */
        int mode = (int)tet.pcos_get_number(doc, "pcosmode");

        /* --------- general information (always available) */
        String fileName = tet.pcos_get_string(doc, "filename");
        Console.WriteLine("   File name: {0}", fileName);

        String pdfVersion = tet.pcos_get_string(doc, "pdfversionstring");
        Console.WriteLine(" PDF version: {0}", pdfVersion);

        String encryption = tet.pcos_get_string(doc, "encrypt/description");
        Console.WriteLine("  Encryption: {0}", encryption);

        bool hasMasterPw = tet.pcos_get_number(doc, "encrypt/master") != 0;
        Console.WriteLine("   Master pw: {0}", hasMasterPw ? "yes" : "no");

        bool hasUserPw = tet.pcos_get_number(doc, "encrypt/user") != 0;
        Console.WriteLine("     User pw: {0}", hasUserPw ? "yes" : "no");

        /* "nocopy" is a restriction, so the printed answer is inverted */
        bool copyForbidden = tet.pcos_get_number(doc, "encrypt/nocopy") != 0;
        Console.WriteLine("Text copying: {0}", copyForbidden ? "no" : "yes");

        bool isLinearized = tet.pcos_get_number(doc, "linearized") != 0;
        Console.WriteLine("  Linearized: {0}", isLinearized ? "yes" : "no");

        if (mode != 0)
        {
            print_userpassword_infos(tet, doc, mode);
            return;
        }

        Console.WriteLine("Minimum mode: no more information available\n\n");
    }
Ejemplo n.º 2
0
    /* Print a one-line description of an image resource:
     * - image number, i.e. the pCOS id used to index the images[] array
     * - pixel size (Width x Height) of the image
     * - number of components, bits per component, and colorspace name
     * - name of the base colorspace if the colorspace is "Indexed"
     * - "mergetype=artificial" if the mergetype entry is 1
     * - a note if the "stencilmask" entry is 1
     * - pCOS id of the mask image if the maskid entry is not -1
     */
    static void report_image_info(TET tet, int doc, int imageid)
    {
        /* pCOS path prefix shared by all queries about this image */
        String imagePath = "images[" + imageid + "]";

        int pixelWidth       = (int)tet.pcos_get_number(doc, imagePath + "/Width");
        int pixelHeight      = (int)tet.pcos_get_number(doc, imagePath + "/Height");
        int bitsPerComponent = (int)tet.pcos_get_number(doc, imagePath + "/bpc");
        int colorSpaceId     = (int)tet.pcos_get_number(doc, imagePath + "/colorspaceid");

        /* pCOS path prefix of the colorspace used by the image */
        String colorSpacePath = "colorspaces[" + colorSpaceId + "]";
        int    componentCount = (int)tet.pcos_get_number(doc, colorSpacePath + "/components");

        Console.Write("image " + imageid + ": " + pixelWidth + "x" + pixelHeight + " pixel, ");

        String colorSpaceName = tet.pcos_get_string(doc, colorSpacePath + "/name");
        Console.Write("{0}x{1} bit {2}", componentCount, bitsPerComponent, colorSpaceName);

        /* For indexed colorspaces the name of the base colorspace is appended */
        if ("Indexed".Equals(colorSpaceName))
        {
            int    baseId   = (int)tet.pcos_get_number(doc, colorSpacePath + "/baseid");
            String baseName = tet.pcos_get_string(doc, "colorspaces[" + baseId + "]/name");
            Console.Write(" " + baseName);
        }

        /* Check whether this image has been created by merging smaller images */
        bool isArtificial = (int)tet.pcos_get_number(doc, imagePath + "/mergetype") == 1;
        if (isArtificial)
        {
            Console.Write(", mergetype=artificial");
        }

        bool isStencilMask = (int)tet.pcos_get_number(doc, imagePath + "/stencilmask") == 1;
        if (isStencilMask)
        {
            Console.Write(", used as stencil mask");
        }

        /* Check whether the image has an attached mask (-1 means none) */
        int attachedMask = (int)tet.pcos_get_number(doc, imagePath + "/maskid");
        if (attachedMask != -1)
        {
            Console.Write(", masked with image " + attachedMask);
        }

        /* terminate the line */
        Console.WriteLine();
    }
Ejemplo n.º 3
0
    /**
     * Entry point of the dumper sample: opens the PDF named on the command
     * line with "requiredmode=minimum" and prints the document information
     * via print_infos().
     *
     * @param args Command line arguments; exactly one file name is expected
     *
     * @return 0 on success, 2 for wrong usage, 1 if the document cannot be
     *         opened or an exception is raised while processing it
     */
    static int Main(String[] args)
    {
        int    exitstat   = 0;
        string searchpath = "{../data} {../../data}";

        if (args.Length != 1)
        {
            Console.WriteLine("usage: dumper <filename>");
            exitstat = 2;
        }
        else
        {
            TET tet = null;
            try
            {
                tet = new TET();
                String docoptlist    = "requiredmode=minimum";
                String globaloptlist = "";
                String optlist;

                optlist = "searchpath={" + searchpath + "}";
                tet.set_option(optlist);

                tet.set_option(globaloptlist);

                int doc = tet.open_document(args[0], docoptlist);
                if (doc == -1)
                {
                    Console.WriteLine("ERROR: " + tet.get_errmsg());

                    /* A document that cannot be opened is a failure; report
                     * it through the exit status like the exception paths. */
                    exitstat = 1;
                }
                else
                {
                    print_infos(tet, doc);
                    tet.close_document(doc);
                }
            }
            catch (TETException e)
            {
                Console.WriteLine("Error " + e.get_errnum() + " in "
                                  + e.get_apiname() + "(): " + e.get_errmsg());
                exitstat = 1;
            }
            catch (Exception e)
            {
                Console.WriteLine("General Exception: " + e.ToString());
                exitstat = 1;
            }
            finally
            {
                /* release the TET object on every path */
                if (tet != null)
                {
                    tet.Dispose();
                }
            }
        }

        return(exitstat);
    }
Ejemplo n.º 4
0
    /**
     * Print the document properties that require at least the user
     * password: PDF/X and PDF/A status, XFA and tagging flags, page count,
     * size of the first page and the list of fonts. Afterwards either a
     * "Restricted mode" notice is printed or print_masterpassword_infos()
     * is called.
     *
     * @param tet The tet object
     * @param doc The tet document handle
     * @param pcosmode The pCOS mode for the document
     *
     * @throws TETException
     */
    private static void print_userpassword_infos(TET tet, int doc, int pcosmode)
    {
        String pdfxStatus = tet.pcos_get_string(doc, "pdfx");
        Console.WriteLine("PDF/X status: {0}", pdfxStatus);

        String pdfaStatus = tet.pcos_get_string(doc, "pdfa");
        Console.WriteLine("PDF/A status: {0}", pdfaStatus);

        bool hasXfa = tet.pcos_get_number(doc, "type:/Root/AcroForm/XFA") != 0;
        Console.WriteLine("    XFA data: {0}", hasXfa ? "yes" : "no");

        bool isTagged = tet.pcos_get_number(doc, "tagged") != 0;
        Console.WriteLine("  Tagged PDF: {0}", isTagged ? "yes" : "no");
        Console.WriteLine();

        int pageTotal = (int)tet.pcos_get_number(doc, "length:pages");
        Console.WriteLine("No. of pages: {0}", pageTotal);

        /* width is queried before height, both before the line is printed */
        Console.WriteLine(" Page 1 size: width={0}, height={1}",
                          tet.pcos_get_number(doc, "pages[0]/width"),
                          tet.pcos_get_number(doc, "pages[0]/height"));

        int fontTotal = (int)tet.pcos_get_number(doc, "length:fonts");
        Console.WriteLine("No. of fonts: {0}", fontTotal);

        /* one line per font: embedding state, font type, font name */
        for (int fontIndex = 0; fontIndex < fontTotal; fontIndex++)
        {
            String fontPath = "fonts[" + fontIndex + "]";

            bool isEmbedded = tet.pcos_get_number(doc, fontPath + "/embedded") != 0;
            Console.Write(isEmbedded ? "embedded " : "unembedded ");

            String fontType = tet.pcos_get_string(doc, fontPath + "/type");
            Console.Write(fontType + " font ");

            String fontName = tet.pcos_get_string(doc, fontPath + "/name");
            Console.WriteLine(fontName);
        }

        Console.WriteLine();

        bool plainMetadata = tet.pcos_get_number(doc, "encrypt/plainmetadata") != 0;

        /* The nocopy flag is only queried when the first two conditions
         * hold (short-circuit evaluation). */
        bool restricted = pcosmode == 1
                          && !plainMetadata
                          && tet.pcos_get_number(doc, "encrypt/nocopy") != 0;

        if (!restricted)
        {
            print_masterpassword_infos(tet, doc);
            return;
        }

        Console.WriteLine("Restricted mode: no more information available");
    }
    /**
     * Write the text of every page of an already opened document to the
     * output writer as UTF-16 bytes, each text chunk followed by the
     * separator. Pages that cannot be opened are reported on the console
     * and skipped.
     *
     * @param tet
     *            The TET object
     * @param doc
     *            A valid TET document handle
     * @param outfp
     *            Output file handle
     *
     * @throws TETException
     * @throws IOException
     */
    static void extract_text(TET tet, int doc, BinaryWriter outfp)
    {
        /* little-endian UTF-16 encoder used for everything written to outfp */
        UnicodeEncoding utf16 = new UnicodeEncoding(false, true);

        /* number of pages in the document */
        int pageTotal = (int)tet.pcos_get_number(doc, "length:pages");

        /* page numbers passed to open_page() start at 1 */
        for (int pageno = 1; pageno <= pageTotal; pageno++)
        {
            int handle = tet.open_page(doc, pageno, pageoptlist);

            if (handle == -1)
            {
                Console.WriteLine("Error {0} in  {1}() on page {2}: {3}",
                                  tet.get_errnum(), tet.get_apiname(),
                                  pageno, tet.get_errmsg());
                continue; /* try next page */
            }

            /*
             * Fetch text fragments until get_text() returns null. A single
             * call would do for granularity=page, but other granularities
             * need the loop.
             */
            for (;;)
            {
                String chunk = tet.get_text(handle);
                if (chunk == null)
                {
                    break;
                }

                /* the retrieved text, then the separator between chunks */
                outfp.Write(utf16.GetBytes(chunk));
                outfp.Write(utf16.GetBytes(separator));
            }

            /* a non-zero error number after the loop is reported */
            if (tet.get_errnum() != 0)
            {
                Console.WriteLine("Error {0} in  {1}() on page {2}: {3}",
                                  tet.get_errnum(), tet.get_apiname(),
                                  pageno, tet.get_errmsg());
            }

            tet.close_page(handle);
        }
    }
Ejemplo n.º 6
0
    /**
     * Print document info keys and XMP metadata (requires master pw or
     * plaintext metadata).
     *
     * Each entry of the /Info dictionary is printed with its key
     * right-aligned in a 12 character column; keys longer than that are
     * printed unpadded. For the XMP metadata stream only its size in bytes
     * and in decoded UTF-8 characters is reported.
     *
     * @param tet The TET object
     * @param doc The TET document handle
     * @throws TETException
     */
    private static void print_masterpassword_infos(TET tet, int doc)
    {
        String objtype;
        int    count = (int)tet.pcos_get_number(doc, "length:/Info");

        for (int i = 0; i < count; i++)
        {
            objtype = tet.pcos_get_string(doc, "type:/Info[" + i + "]");
            String key = tet.pcos_get_string(doc, "/Info[" + i + "].key");

            /* Pad the key itself: PadLeft(12) leaves keys of 12 or more
             * characters untouched, whereas padding an empty string to
             * 12 - key.Length throws for keys longer than 12 characters. */
            Console.Write(key.PadLeft(12) + ": ");

            /* Info entries can be stored as string or name objects */
            if (objtype == "string" || objtype == "name")
            {
                Console.WriteLine("'"
                                  + tet.pcos_get_string(doc, "/Info[" + i + "]") + "'");
            }
            else
            {
                /* other object types are only reported by their type name */
                Console.WriteLine("(" + objtype + "object)");
            }
        }

        Console.WriteLine();
        Console.Write("XMP meta data: ");

        objtype = tet.pcos_get_string(doc, "type:/Root/Metadata");
        if (objtype == "stream")
        {
            byte[] contents = tet.pcos_get_stream(doc, "", "/Root/Metadata");
            Console.Write(contents.Length + " bytes ");

            /* decode as UTF-8 to report the length in characters */
            UTF8Encoding utf8 = new UTF8Encoding();
            String       str  = utf8.GetString(contents);
            Console.WriteLine("(" + str.Length
                              + " Unicode characters)");
        }
        else
        {
            Console.WriteLine("not present\n\n");
        }
    }
    /**
     * Process a single opened document: extract the text of its pages, then
     * hand every file attachment stream found at document level
     * (names/EmbeddedFiles) and in page annotations of subtype
     * FileAttachment to the data-based process_document() overload. The
     * document handle is closed at the end.
     *
     * @param outfp Output writer for the extracted text and marker lines
     * @param tet The TET object
     * @param doc The TET document handle
     *
     * @throws TETException
     * @throws IOException
     */
    private static void process_document(BinaryWriter outfp, TET tet, int doc)
    {
        UnicodeEncoding utf16 = new UnicodeEncoding(false, true);
        String          kind;

        // -------------------- Extract the document's own page contents
        extract_text(tet, doc, outfp);

        // -------------------- Process all document-level file attachments

        int attachmentTotal = (int)tet.pcos_get_number(doc,
                                                       "length:names/EmbeddedFiles");

        for (int index = 0; index < attachmentTotal; index++)
        {
            // pCOS path of this attachment entry
            String entry = "names/EmbeddedFiles[" + index + "]";
            String label = "(unnamed)";

            /*
             * Name of the attachment: the Unicode file name /UF (a PDF 1.7
             * feature) is tried first, then /F; otherwise the default
             * label is kept.
             */
            kind = tet.pcos_get_string(doc, "type:" + entry + "/UF");
            if (kind == "string")
            {
                label = tet.pcos_get_string(doc, entry + "/UF");
            }
            else
            {
                kind = tet.pcos_get_string(doc, "type:" + entry + "/F");
                if (kind == "string")
                {
                    label = tet.pcos_get_string(doc, entry + "/F");
                }
            }

            /* only entries that carry an embedded stream are processed */
            kind = tet.pcos_get_string(doc, "type:" + entry + "/EF/F");
            if (kind != "stream")
            {
                continue;
            }

            outfp.Write(utf16.GetBytes("----- File attachment '" + label + "':\n"));
            byte[] payload = tet.pcos_get_stream(doc, "", entry + "/EF/F");

            process_document(outfp, null, label, payload);
            outfp.Write(utf16.GetBytes("----- End file attachment '" + label + "'\n"));
        }

        // -------------------- Process all page-level file attachments

        int pageTotal = (int)tet.pcos_get_number(doc, "length:pages");

        // Check all pages for annotations of type FileAttachment
        for (int pageIndex = 0; pageIndex < pageTotal; pageIndex++)
        {
            String annotsPath = "pages[" + pageIndex + "]/Annots";
            int    annotTotal = (int)tet.pcos_get_number(doc, "length:" + annotsPath);

            for (int annotIndex = 0; annotIndex < annotTotal; annotIndex++)
            {
                String annotPath = annotsPath + "[" + annotIndex + "]";
                String subtype   = tet.pcos_get_string(doc, annotPath + "/Subtype");

                if (subtype != "FileAttachment")
                {
                    continue;
                }

                // page and annotation numbers in the label are 1-based
                String label      = "page " + (pageIndex + 1) + ", annotation " + (annotIndex + 1);
                String streamPath = annotPath + "/FS/EF/F";

                /* fetch the contents of the attachment and process it */
                kind = tet.pcos_get_string(doc, "type:" + streamPath);
                if (kind != "stream")
                {
                    continue;
                }

                outfp.Write(utf16.GetBytes("----- Page level attachment '" + label + "':\n"));
                byte[] payload = tet.pcos_get_stream(doc, "", streamPath);
                process_document(outfp, null, label, payload);
                outfp.Write(utf16.GetBytes("----- End page level attachment '" + label + "'\n"));
            }
        }

        tet.close_document(doc);
    }
    /**
     * Open a named physical or virtual file, extract the text from it, search
     * for document or page attachments, and process these recursively.
     * Either filename must be supplied for physical files, or data from
     * which a virtual (PVF) file is created. A new TET object is created
     * here, so an exception raised for an embedded document is caught in
     * this method and the caller can continue with its own TET object.
     *
     * @param outfp Output writer for the extracted text
     * @param filename Name of a physical file, or null/empty to use data
     * @param realname Name used for the document in error messages
     * @param data Contents of the document if no filename is given
     *
     * @return 0 if successful, otherwise a non-zero code to be used as exit
     *         status (5 if the document cannot be opened, 1 on exceptions)
     */
    static int process_document(BinaryWriter outfp, String filename, String realname,
                                byte[] data)
    {
        const String virtualName = "/pvf/attachment";

        int status  = 0;
        TET session = null;

        try
        {
            session = new TET();

            /*
             * Construct a PVF file if data instead of a filename was provided
             */
            if (String.IsNullOrEmpty(filename))
            {
                session.create_pvf(virtualName, data, "");
                filename = virtualName;
            }

            session.set_option(globaloptlist);

            int handle = session.open_document(filename, docoptlist);

            if (handle != -1)
            {
                process_document(outfp, session, handle);
            }
            else
            {
                Console.WriteLine("Error " + session.get_errnum() + " in  "
                                  + session.get_apiname() + "() (source: attachment '"
                                  + realname + "'): " + session.get_errmsg());

                status = 5;
            }

            /*
             * If there was no PVF file deleting it won't do any harm
             */
            session.delete_pvf(virtualName);
        }
        catch (TETException tetError)
        {
            Console.WriteLine("Error " + tetError.get_errnum() + " in  "
                              + tetError.get_apiname() + "() (source: attachment '" + realname
                              + "'): " + tetError.get_errmsg());
            status = 1;
        }
        catch (Exception otherError)
        {
            Console.WriteLine("General Exception: " + otherError.ToString());
            status = 1;
        }
        finally
        {
            /* release the TET object on every path */
            if (session != null)
            {
                session.Dispose();
            }
        }

        return status;
    }
Ejemplo n.º 9
0
    /**
     * Entry point of the per-page image extraction sample: opens the PDF
     * named on the command line, reports every image placed on each page
     * and writes the image data (and the data of an attached mask image,
     * if any) to files whose names are derived from the input file name.
     *
     * @param args Command line arguments; exactly one file name is expected
     *
     * @return 0 on success, 2 for wrong usage, if the document cannot be
     *         opened, or if an exception is raised
     */
    static int Main(string[] args)
    {
        /* global option list */
        string globaloptlist = "searchpath={{../data} {../../data}}";

        /* document-specific  option list */
        string docoptlist = "";

        /* page-specific option list  e.g
         * "imageanalysis={merge={gap=1} smallimages={maxwidth=20}}"
         */
        string pageoptlist = "";


        TET    tet;
        int    pageno = 0;
        string outfilebase;

        if (args.Length != 1)
        {
            Console.WriteLine("usage: image_resources <filename>");
            return(2);
        }

        /* Strip a trailing ".pdf"/".PDF" to get the base name of the output
         * files. Both suffix tests sit behind the length check so that names
         * shorter than the suffix are never sliced at a negative index.
         */
        outfilebase = args[0];
        if (outfilebase.Length > 4
            && (outfilebase.EndsWith(".pdf", StringComparison.Ordinal)
                || outfilebase.EndsWith(".PDF", StringComparison.Ordinal)))
        {
            outfilebase = outfilebase.Substring(0, outfilebase.Length - 4);
        }

        tet = new TET();

        try
        {
            int n_pages;

            tet.set_option(globaloptlist);

            int doc = tet.open_document(args[0], docoptlist);

            if (doc == -1)
            {
                Console.WriteLine("Error {0} in {1}(): {2}",
                                  tet.get_errnum(), tet.get_apiname(), tet.get_errmsg());
                return(2);
            }
            /* Get number of pages in the document */
            n_pages = (int)tet.pcos_get_number(doc, "length:pages");

            /* Loop over pages and extract images  */
            for (pageno = 1; pageno <= n_pages; ++pageno)
            {
                int page;
                int imagecount = 0;

                page = tet.open_page(doc, pageno, pageoptlist);

                if (page == -1)
                {
                    Console.WriteLine("Error {0} in {1}() on page {2}: {3}",
                                      tet.get_errnum(), tet.get_apiname(), pageno, tet.get_errmsg());
                    continue; /* try next page */
                }

                /*
                 * Retrieve all images on the page
                 */
                while ((tet.get_image_info(page)) == 1)
                {
                    String imageoptlist;
                    int    maskid;

                    imagecount++;

                    /* Report image details: pixel geometry, color space etc. */
                    report_image_info(tet, doc, tet.imageid);

                    /* Report placement geometry */
                    Console.WriteLine("  placed on page " + pageno +
                                      " at position (" + tet.x.ToString("f2") + ", " + tet.y.ToString("f2") + "): " +
                                      (int)tet.width + "x" + (int)tet.height + "pt, alpha=" + tet.alpha + ", beta=" +
                                      tet.beta);
                    /* Write image data to file */
                    imageoptlist = "filename={" + outfilebase + "_p" + pageno + "_" + imagecount + "_I" + tet.imageid + "}";

                    if (tet.write_image_file(doc, tet.imageid, imageoptlist) == -1)
                    {
                        Console.WriteLine("\nError [" + tet.get_errnum() +
                                          " in " + tet.get_apiname() + "(): " + tet.get_errmsg());
                        continue; /* try next image */
                    }

                    /* Check whether the image has a mask attached... */
                    maskid = (int)tet.pcos_get_number(doc,
                                                      "images[" + tet.imageid + "]/maskid");

                    /* and retrieve it if present */
                    if (maskid != -1)
                    {
                        Console.WriteLine("  masked with ");
                        report_image_info(tet, doc, maskid);

                        imageoptlist = "filename={" + outfilebase + "_p" + pageno + "_" + imagecount + "_I" + tet.imageid + "mask_I" + maskid + "}";

                        /* Write the mask image itself (maskid), not the
                         * masked image a second time. */
                        if (tet.write_image_file(doc, maskid, imageoptlist) == -1)
                        {
                            Console.WriteLine("\nError [" + tet.get_errnum() +
                                              " in " + tet.get_apiname() +
                                              "() for mask image: " + tet.get_errmsg());
                            continue; /* try next image */
                        }
                    }

                    if (tet.get_errnum() != 0)
                    {
                        Console.WriteLine("Error {0} in {1}() on page {2}: {3}",
                                          tet.get_errnum(), tet.get_apiname(), pageno, tet.get_errmsg());
                    }
                }
                tet.close_page(page);
            }

            tet.close_document(doc);
        }
        catch (TETException e)
        {
            /* caught exception thrown by TET */
            Console.WriteLine("Error {0} in {1}(): {2}",
                              e.get_errnum(), e.get_apiname(), e.get_errmsg());
            return(2);
        }
        catch (Exception e)
        {
            Console.WriteLine("General Exception: " + e.ToString());
            return(2);
        }
        finally
        {
            /* release the TET object on every path, including early returns */
            if (tet != null)
            {
                tet.Dispose();
            }
        }

        return(0);
    }
Ejemplo n.º 10
0
    /**
     * Entry point of the image resource extraction sample: opens the PDF
     * named on the command line, opens every page once so that image
     * merging takes place, then reports and writes every image resource of
     * the document that is neither consumed by merging nor flagged as small.
     *
     * @param args Command line arguments; exactly one file name is expected
     *
     * @return 0 on success, 2 for wrong usage, if the document cannot be
     *         opened, or if an exception is raised
     */
    static int Main(string[] args)
    {
        /* global option list */
        string globaloptlist = "searchpath={{../data} {../../data}}";

        /* document-specific  option list */
        string docoptlist = "";

        /* page-specific option list  e.g
         * "imageanalysis={merge={gap=1} smallimages={maxwidth=20}}"
         */
        string pageoptlist = "";


        TET    tet;
        int    pageno = 0;
        string outfilebase;

        if (args.Length != 1)
        {
            Console.WriteLine("usage: image_resources <filename>");
            return(2);
        }

        /* Strip a trailing ".pdf"/".PDF" to get the base name of the output
         * files. Both suffix tests sit behind the length check so that names
         * shorter than the suffix are never sliced at a negative index.
         */
        outfilebase = args[0];
        if (outfilebase.Length > 4
            && (outfilebase.EndsWith(".pdf", StringComparison.Ordinal)
                || outfilebase.EndsWith(".PDF", StringComparison.Ordinal)))
        {
            outfilebase = outfilebase.Substring(0, outfilebase.Length - 4);
        }

        tet = new TET();

        try
        {
            int n_pages;

            tet.set_option(globaloptlist);

            int doc = tet.open_document(args[0], docoptlist);

            if (doc == -1)
            {
                Console.WriteLine("Error {0} in {1}(): {2}",
                                  tet.get_errnum(), tet.get_apiname(), tet.get_errmsg());
                return(2);
            }


            /* Images will only be merged upon opening a page.
             * In order to enumerate all merged image resources
             * we open all pages before extracting the images.
             */

            /* get number of pages in the document */
            n_pages = (int)tet.pcos_get_number(doc, "length:pages");

            /* Loop over all pages to trigger image merging */
            for (pageno = 1; pageno <= n_pages; ++pageno)
            {
                int page;

                page = tet.open_page(doc, pageno, pageoptlist);

                if (page == -1)
                {
                    Console.WriteLine("Error {0} in {1}() on page {2}: {3}",
                                      tet.get_errnum(), tet.get_apiname(), pageno,
                                      tet.get_errmsg());
                    continue;                    /* process next page */
                }

                if (tet.get_errnum() != 0)
                {
                    Console.WriteLine("Error {0} in {1}() on page {2}: {3}",
                                      tet.get_errnum(), tet.get_apiname(), pageno,
                                      tet.get_errmsg());
                }
                tet.close_page(page);
            }

            int imageid, n_images;

            /* Get the number of images in the document */
            n_images = (int)tet.pcos_get_number(doc, "length:images");

            /* Loop over image resources in the document */
            for (imageid = 0; imageid < n_images; ++imageid)
            {
                string imageoptlist;
                /* Skip images which have been consumed by merging */
                int mergetype = (int)tet.pcos_get_number(doc,
                                                         "images[" + imageid + "]/mergetype");

                if (mergetype == 2)
                {
                    continue;
                }

                /* Skip small images (see "smallimages" option) */
                if (tet.pcos_get_number(doc, "images[" + imageid + "]/small") > 0)
                {
                    continue;
                }
                /* Report image details: pixel geometry, color space etc. */
                report_image_info(tet, doc, imageid);

                /* Write image data to file */

                imageoptlist = " filename={" + outfilebase + "_I" + imageid + "}";

                if (tet.write_image_file(doc, imageid, imageoptlist) == -1)
                {
                    Console.WriteLine(
                        "Error {0} in {1}(): {2}",
                        tet.get_errnum(), tet.get_apiname(), tet.get_errmsg());
                    continue;              /* process next image */
                }
            }
            tet.close_document(doc);
        }
        catch (TETException e) {
            /* caught exception thrown by TET */
            Console.WriteLine("Error {0} in {1}(): {2}",
                              e.get_errnum(), e.get_apiname(), e.get_errmsg());
            return(2);
        }
        catch (Exception e)
        {
            Console.WriteLine("General Exception: " + e.ToString());
            return(2);
        }
        finally
        {
            /* release the TET object on every path, including early returns */
            if (tet != null)
            {
                tet.Dispose();
            }
        }

        return(0);
    }
Ejemplo n.º 11
0
    /**
     * Entry point of the text extractor sample: extracts the text of all
     * pages of the input PDF and writes it, preceded by the encoding's byte
     * order mark, as UTF-16 to the output file.
     *
     * @param args Command line arguments: input file name, output file name
     *
     * @return 0 on success, 2 for wrong usage, if the document cannot be
     *         opened, or if an exception is raised
     */
    static int Main(string[] args)
    {
        /* global option list */
        string globaloptlist = "searchpath={{../data} {../../data}}";

        /* document-specific  option list */
        string docoptlist = "";

        /* page-specific option list */
        string pageoptlist = "granularity=page";

        /* separator to emit after each chunk of text. This depends on the
         * applications needs; for granularity=word a space character may be useful.
         */
        string separator = "\n";

        TET          tet;
        FileStream   outfile;
        BinaryWriter w;
        int          pageno = 0;

        UnicodeEncoding unicode = new UnicodeEncoding(false, true);

        Byte[] byteOrderMark = unicode.GetPreamble();


        if (args.Length != 2)
        {
            Console.WriteLine("usage: extractor <infilename> <outfilename>");
            return(2);
        }

        outfile = File.Create(args.GetValue(1).ToString());
        w       = new BinaryWriter(outfile);
        w.Write(byteOrderMark);

        tet = new TET();

        try
        {
            int n_pages;

            tet.set_option(globaloptlist);

            int doc = tet.open_document(args.GetValue(0).ToString(), docoptlist);

            if (doc == -1)
            {
                Console.WriteLine("Error {0} in {1}(): {2}",
                                  tet.get_errnum(), tet.get_apiname(), tet.get_errmsg());
                return(2);
            }

            /* get number of pages in the document */
            n_pages = (int)tet.pcos_get_number(doc, "length:pages");

            /* loop over pages in the document */
            for (pageno = 1; pageno <= n_pages; ++pageno)
            {
                string text;
                int    page;

                page = tet.open_page(doc, pageno, pageoptlist);

                if (page == -1)
                {
                    Console.WriteLine("Error {0} in {1}() on page {2}: {3}",
                                      tet.get_errnum(), tet.get_apiname(), pageno,
                                      tet.get_errmsg());
                    continue;                    /* try next page */
                }

                /* Retrieve all text fragments; This is actually not required
                 * for granularity=page, but must be used for other
                 * granularities.
                 */
                while ((text = tet.get_text(page)) != null)
                {
                    /* print the retrieved text */
                    w.Write(unicode.GetBytes(text));

                    /* print a separator between chunks of text */
                    w.Write(unicode.GetBytes(separator));
                }

                if (tet.get_errnum() != 0)
                {
                    /* Every placeholder index must have a matching
                     * argument; otherwise the report itself raises a
                     * FormatException. */
                    Console.WriteLine("Error {0} in {1}() on page {2}: {3}",
                                      tet.get_errnum(), tet.get_apiname(), pageno,
                                      tet.get_errmsg());
                }
                tet.close_page(page);
            }
            tet.close_document(doc);
        }
        catch (TETException e) {
            /* caught exception thrown by TET; report failure to the caller */
            Console.WriteLine("Error {0} in {1}(): {2}",
                              e.get_errnum(), e.get_apiname(), e.get_errmsg());
            return(2);
        }
        catch (Exception e)
        {
            Console.WriteLine("General Exception: " + e.ToString());
            return(2);
        }
        finally
        {
            /* close the output file and release the TET object on every path */
            outfile.Close();
            if (tet != null)
            {
                tet.Dispose();
            }
        }

        return(0);
    }
Ejemplo n.º 12
0
    /**
     * Sample "glyphinfo": for every page of the input PDF, write the
     * extracted text fragments followed by one line per glyph (Unicode value,
     * character, font name, size, position, color id, glyph type and
     * attribute flags) to a UTF-8 encoded text file.
     *
     * @param args args[0] is the input PDF file name, args[1] is the name of
     *             the text file to create
     */
    public static void Main(String[] args)
    {
        /* Global option list. */
        string globaloptlist = "searchpath={{../data} {../../data}}";

        /* Document specific option list. */
        string docoptlist = "";

        /* Page-specific option list. */
        string pageoptlist = "granularity=word";

        /* Bit flags tested against the "attributes" member of each glyph */
        const int ATTR_NONE    = 0;
        const int ATTR_SUB     = 1;
        const int ATTR_SUP     = 2;
        const int ATTR_DROPCAP = 4;
        const int ATTR_SHADOW  = 8;
        const int ATTR_DH_PRE  = 16;
        const int ATTR_DH_ARTF = 32;
        const int ATTR_DH_POST = 64;

        if (args.Length != 2)
        {
            Console.WriteLine("usage: glyphinfo <infilename> <outfilename>");
            return;
        }

        /* The using statements make sure that the buffered output is flushed
         * and the file is closed on every exit path, including the early
         * return when the document cannot be opened and any exception.
         */
        using (FileStream outfile = File.Create(args[1]))
        using (StreamWriter outfp = new StreamWriter(outfile, System.Text.Encoding.UTF8))
        {
            TET tet = null;

            try
            {
                tet = new TET();

                tet.set_option(globaloptlist);

                int doc = tet.open_document(args[0], docoptlist);

                if (doc == -1)
                {
                    Console.WriteLine("Error " + tet.get_errnum() + " in "
                                      + tet.get_apiname() + "(): " + tet.get_errmsg());
                    return;
                }

                /* get number of pages in the document */
                int n_pages = (int)tet.pcos_get_number(doc, "length:pages");

                /* Loop over pages in the document */
                for (int pageno = 1; pageno <= n_pages; ++pageno)
                {
                    string text;
                    int    page;

                    /* -1 forces the color details to be printed for the first
                     * glyph of each page
                     */
                    int previouscolor = -1;

                    page = tet.open_page(doc, pageno, pageoptlist);

                    if (page == -1)
                    {
                        Console.WriteLine("Error " + tet.get_errnum() + " in "
                                          + tet.get_apiname() + "() on page "
                                          + pageno + ": " + tet.get_errmsg());
                        continue;                    /* try next page */
                    }

                    /* Administrative information */
                    outfp.WriteLine("[ Document: '" +
                                    tet.pcos_get_string(doc, "filename") + "' ]");

                    outfp.WriteLine("[ Document options: '" + docoptlist + "' ]");

                    outfp.WriteLine("[ Page options: '" + pageoptlist + "' ]");

                    outfp.WriteLine("[ ----- Page " + pageno + " ----- ]");

                    /* Retrieve all text fragments */
                    while ((text = tet.get_text(page)) != null)
                    {
                        /* print the retrieved text */
                        outfp.WriteLine("[" + text + "]");

                        /* Loop over all glyphs and print their details */
                        while (tet.get_char_info(page) != -1)
                        {
                            string str;
                            string fontname;
                            string glyph;

                            /* Fetch the font name with pCOS (based on its ID) */
                            fontname = tet.pcos_get_string(doc,
                                                           "fonts[" + tet.fontid + "]/name");

                            /* Print the Unicode value */
                            str = String.Format("U+{0}", tet.uv.ToString("X4"));

                            /* ...and the character itself. Values above U+FFFF
                             * do not fit into a single char and need a
                             * surrogate pair. Plain concatenation is used on
                             * purpose: the glyph may be '{' or '}', which
                             * String.Format would treat as a format item.
                             */
                            if (tet.uv > 0xFFFF && tet.uv <= 0x10FFFF)
                            {
                                glyph = char.ConvertFromUtf32((int)tet.uv);
                            }
                            else
                            {
                                glyph = ((char)tet.uv).ToString();
                            }
                            str = str + " '" + glyph + "'";

                            /* Print font name, size, and position */
                            str = str + String.Format(" {0} size={1} x={2} y={3}",
                                                      fontname, tet.fontsize.ToString("f2"),
                                                      tet.x.ToString("f2"), tet.y.ToString("f2"));

                            /* Print the color id */
                            str = str + String.Format(" colorid={0}", tet.colorid);

                            /* Check whether the text color changes; the color
                             * details are printed only on a change
                             */
                            if (tet.colorid != previouscolor)
                            {
                                str           = print_color_value(str, tet, doc, tet.colorid);
                                previouscolor = tet.colorid;
                            }

                            /* Examine the "type" member */
                            if (tet.type == 1)
                            {
                                str = str + " ligature_start";
                            }
                            else if (tet.type == 10)
                            {
                                str = str + " ligature_cont";
                            }
                            /* Separators are only inserted for granularity > word */
                            else if (tet.type == 12)
                            {
                                str = str + " inserted";
                            }

                            /* Examine the bit flags in the "attributes" member */
                            if (tet.attributes != ATTR_NONE)
                            {
                                if ((tet.attributes & ATTR_SUB) == ATTR_SUB)
                                {
                                    str = str + "/sub";
                                }
                                if ((tet.attributes & ATTR_SUP) == ATTR_SUP)
                                {
                                    str = str + "/sup";
                                }
                                if ((tet.attributes & ATTR_DROPCAP) == ATTR_DROPCAP)
                                {
                                    str = str + "/dropcap";
                                }
                                if ((tet.attributes & ATTR_SHADOW) == ATTR_SHADOW)
                                {
                                    str = str + "/shadow";
                                }
                                if ((tet.attributes & ATTR_DH_PRE) == ATTR_DH_PRE)
                                {
                                    str = str + "/dehyphenation_pre";
                                }
                                if ((tet.attributes & ATTR_DH_ARTF) == ATTR_DH_ARTF)
                                {
                                    str = str + "/dehyphenation_artifact";
                                }
                                if ((tet.attributes & ATTR_DH_POST) == ATTR_DH_POST)
                                {
                                    str = str + "/dehyphenation_post";
                                }
                            }
                            outfp.WriteLine(str);
                        }
                        outfp.WriteLine("");
                    }

                    /* get_text() returns null both at the end of the page and
                     * on error, so the error number is checked afterwards
                     */
                    if (tet.get_errnum() != 0)
                    {
                        Console.WriteLine("Error " + tet.get_errnum() + " in "
                                          + tet.get_apiname() + "() on page "
                                          + pageno + ": " + tet.get_errmsg());
                    }

                    tet.close_page(page);
                }

                tet.close_document(doc);
            }
            catch (TETException e)
            {
                /* caught exception thrown by TET */
                Console.WriteLine("Error " + e.get_errnum() + " in "
                                  + e.get_apiname() + "(): " + e.get_errmsg());
            }
            catch (Exception e)
            {
                Console.WriteLine("General Exception: " + e.ToString());
            }
            finally
            {
                if (tet != null)
                {
                    tet.Dispose();
                }
            }
        }
    }
Ejemplo n.º 13
0
    /**
     * Append a description of a text color to a string.
     *
     * Only the fill color is handled; the stroke color is ignored. The
     * stroke color can be retrieved analogously with the keyword "stroke".
     *
     * @param str     The string to which the description is appended
     * @param tet     The TET object
     * @param doc     The TET document handle
     * @param colorid The color id passed to get_color_info()
     *
     * @return str followed by " (not filled)" or by a parenthesized
     *         description of the pattern and/or color space and its
     *         component values
     */
    public static String print_color_value(string str, TET tet, int doc, int colorid)
    {
        String csname;                  /* color space name */

        /* The return value is not used. The members read below (colorspaceid,
         * patternid, components) are presumably filled in by this call --
         * TODO confirm against the TET API reference.
         */
        tet.get_color_info(doc, colorid, "usage=fill");
        if (tet.colorspaceid == -1 && tet.patternid == -1)
        {
            return(str + " (not filled)");
        }

        str = str + " (";

        if (tet.patternid != -1)
        {
            int patterntype =
                (int)tet.pcos_get_number(doc, "patterns[" + tet.patternid + "]/PatternType");

            if (patterntype == 1)       /* Tiling pattern */
            {
                int painttype =
                    (int)tet.pcos_get_number(doc, "patterns[" + tet.patternid + "]/PaintType");
                if (painttype == 1)
                {
                    return(str + "colored Pattern)");
                }
                else if (painttype == 2)
                {
                    str = str + "uncolored Pattern, base color: ";
                    /* FALLTHROUGH to colorspaceid output */
                }
            }
            else if (patterntype == 2)  /* Shading pattern */
            {
                int shadingtype =
                    (int)tet.pcos_get_number(doc,
                                             "patterns[" + tet.patternid + "]/Shading/ShadingType");

                return(str + String.Format("shading Pattern, ShadingType={0})", shadingtype));
            }
        }

        csname = tet.pcos_get_string(doc, "colorspaces[" + tet.colorspaceid + "]/name");

        str = str + csname;

        /* Emit more details depending on the colorspace type */
        if (csname.Equals("ICCBased"))
        {
            int iccprofileid = (int)tet.pcos_get_number(doc,
                                                        "colorspaces[" + tet.colorspaceid + "]/iccprofileid");

            String errormessage = tet.pcos_get_string(doc,
                                                      "iccprofiles[" + iccprofileid + "]/errormessage");

            /* Check whether the embedded profile is damaged: a non-empty
             * error message is reported instead of the profile details.
             */
            if (!String.IsNullOrEmpty(errormessage))
            {
                str = str + String.Format(" ({0})", errormessage);
            }
            else
            {
                String profilename =
                    tet.pcos_get_string(doc,
                                        "iccprofiles[" + iccprofileid + "]/profilename");
                str = str + String.Format(" '{0}'", profilename);

                String profilecs =
                    tet.pcos_get_string(doc,
                                        "iccprofiles[" + iccprofileid + "]/profilecs");
                str = str + String.Format(" '{0}'", profilecs);
            }
        }
        else if (csname.Equals("Separation"))
        {
            String colorantname =
                tet.pcos_get_string(doc, "colorspaces[" + tet.colorspaceid + "]/colorantname");
            str = str + String.Format(" '{0}'", colorantname);
        }
        else if (csname.Equals("DeviceN"))
        {
            str = str + " ";

            /* One colorant name per color component, separated by "/" */
            for (int i = 0; i < tet.components.Length; i++)
            {
                String colorantname =
                    tet.pcos_get_string(doc,
                                        "colorspaces[" + tet.colorspaceid + "]/colorantnames[" + i + "]");

                str = str + colorantname;

                if (i != tet.components.Length - 1)
                {
                    str = str + "/";
                }
            }
        }
        else if (csname.Equals("Indexed"))
        {
            int baseid =
                (int)tet.pcos_get_number(doc, "colorspaces[" + tet.colorspaceid + "]/baseid");

            /* Report the name of the base color space as well */
            csname = tet.pcos_get_string(doc, "colorspaces[" + baseid + "]/name");

            str = str + String.Format(" {0}", csname);
        }

        /* Finally the component values, separated by "/" */
        str = str + " ";
        for (int i = 0; i < tet.components.Length; i++)
        {
            str = str + String.Format("{0}", tet.components[i]);

            if (i != tet.components.Length - 1)
            {
                str = str + "/";
            }
        }
        str = str + ")";
        return(str);
    }
    /**
     * Sample "fontfilter": print every text line of the input PDF whose
     * first character uses a font of at least a given size and whose font
     * name contains a given string.
     *
     * @param args args[0] is the input PDF file name
     */
    public static void Main(String[] args)
    {
        /* Global option list. */
        string globaloptlist = "searchpath={{../data} {../../data}}";

        /* Document specific option list. */
        string docoptlist = "";

        /* Page-specific option list. */
        string pageoptlist = "granularity=line";

        /* Search text with at least this size (use 0 to catch all sizes). */
        double fontsizetrigger = 10;

        /* Catch text where the font name contains this string (use empty string
         * to catch all font names).
         */
        String fontnametrigger = "Bold";

        TET tet    = null;

        /* Stays 0 until the page loop starts; the exception handler uses this
         * to decide whether a page number can be reported.
         */
        int pageno = 0;

        if (args.Length != 1)
        {
            Console.WriteLine("usage: fontfilter <infilename>");
            return;
        }

        try
        {
            tet = new TET();
            tet.set_option(globaloptlist);

            int doc = tet.open_document(args[0], docoptlist);
            if (doc == -1)
            {
                Console.WriteLine("Error " + tet.get_errnum() + " in "
                                  + tet.get_apiname() + "(): " + tet.get_errmsg());
                return;
            }

            /* Loop over pages in the document */
            int n_pages = (int)tet.pcos_get_number(doc, "length:pages");
            for (pageno = 1; pageno <= n_pages; ++pageno)
            {
                int page = tet.open_page(doc, pageno, pageoptlist);

                if (page == -1)
                {
                    Console.WriteLine("Error " + tet.get_errnum() + " in "
                                      + tet.get_apiname() + "(): " + tet.get_errmsg());
                    continue; /* try next page */
                }

                /* Retrieve all text fragments for the page */
                String text;
                while ((text = tet.get_text(page)) != null)
                {
                    /* In this sample we check only the first character of
                     * each fragment, so get_char_info() is called just once.
                     */
                    if (tet.get_char_info(page) != -1)
                    {
                        /* We need only the font name and size; the text
                         * position could be fetched from tet.x and tet.y.
                         */
                        String fontname = tet.pcos_get_string(doc,
                                                              "fonts[" + tet.fontid + "]/name");

                        /* Check whether we found a match */
                        if (tet.fontsize >= fontsizetrigger &&
                            fontname.IndexOf(fontnametrigger) != -1)
                        {
                            /* print the retrieved font name, size, and text */
                            Console.WriteLine("[{0} {1:0.00}] {2}", fontname,
                                              tet.fontsize, text);
                        }
                    }
                }

                /* get_text() returns null both at the end of the page and on
                 * error, so the error number is checked afterwards
                 */
                if (tet.get_errnum() != 0)
                {
                    Console.WriteLine("Error " + tet.get_errnum() + " in "
                                      + tet.get_apiname() + "(): " + tet.get_errmsg());
                }

                tet.close_page(page);
            }

            tet.close_document(doc);
        }
        catch (TETException e)
        {
            if (pageno == 0)
            {
                Console.WriteLine("Error " + e.get_errnum() + " in "
                                  + e.get_apiname() + "(): " + e.get_errmsg() + "\n");
            }
            else
            {
                Console.WriteLine("Error " + e.get_errnum() + " in "
                                  + e.get_apiname() + "() on page " + pageno + ": "
                                  + e.get_errmsg() + "\n");
            }
        }
        catch (Exception e)
        {
            Console.WriteLine("General Exception: " + e.ToString());
        }
        finally
        {
            /* tet is still null if the TET constructor itself threw */
            if (tet != null)
            {
                tet.Dispose();
            }
        }
    }
Ejemplo n.º 15
0
    /**
     * Sample "tetml": generate TETML for all pages of the input PDF, either
     * written to a file by TET or kept in memory and then examined with the
     * XML DOM classes.
     *
     * @param args args[0] is the input PDF file name, args[1] is the name of
     *             the TETML file (only used when the output is not kept in
     *             memory)
     */
    public static void Main(String[] args)
    {
        /* Global option list. */
        string globaloptlist = "searchpath={{../data} {../../data}}";

        /* Document specific option list. */
        string basedocoptlist = "";

        /* Page-specific option list. Remove the tetml= option if you don't
         * need font and geometry information.
         */
        string pageoptlist = "granularity=word tetml={glyphdetails={all}}";

        /* set this to true to generate TETML output in memory */
        bool inmemory = false;

        if (args.Length != 2)
        {
            Console.WriteLine("usage: tetml <pdffilename> <xmlfilename>");
            return;
        }

        TET tet = null;

        try
        {
            tet = new TET();
            tet.set_option(globaloptlist);

            /* In-memory mode: the TETML data is fetched as UTF-8 bytes and
             * afterwards converted to a .NET string, which is UTF-16. While
             * not strictly necessary for this program, it is cleaner to
             * instruct TET to put 'encoding="UTF-16"' into the XML header.
             * File mode: TET is told the name of the output file instead.
             */
            String docoptlist = inmemory
                ? "tetml={encodingname=UTF-16} " + basedocoptlist
                : "tetml={filename={" + args[1] + "}} " + basedocoptlist;

            int doc = tet.open_document(args[0], docoptlist);

            if (doc == -1)
            {
                Console.WriteLine("Error " + tet.get_errnum() + " in "
                                  + tet.get_apiname() + "(): " + tet.get_errmsg());
                return;
            }

            int pagecount = (int)tet.pcos_get_number(doc, "length:pages");

            /* Process every page of the document */
            for (int pageno = 1; pageno <= pagecount; pageno++)
            {
                tet.process_page(doc, pageno, pageoptlist);
            }

            /* Emit the trailer; this could be combined with the last
             * page-related call.
             */
            tet.process_page(doc, 0, "tetml={trailer}");

            if (inmemory)
            {
                /* Fetch the XML document as a byte array. */
                byte[] tetml = tet.get_tetml(doc, "");

                if (tetml == null)
                {
                    Console.WriteLine("tetml: couldn't retrieve XML data");
                    return;
                }

                /* Load the in-memory XML into a DOM and print some of the
                 * information it contains.
                 */
                XmlDocument xmldoc = new XmlDocument();
                xmldoc.LoadXml(new UTF8Encoding().GetString(tetml));

                XmlElement root = xmldoc.DocumentElement;

                /* The namespace manager resolves the "tet" prefix used in
                 * the XPath expressions below.
                 */
                XmlNamespaceManager nsmgr =
                    new XmlNamespaceManager(xmldoc.NameTable);
                nsmgr.AddNamespace("tet",
                                   "http://www.pdflib.com/XML/TET5/TET-5.0");

                /* List name and type of every font element */
                foreach (XmlNode font in root.SelectNodes("//tet:Font", nsmgr))
                {
                    XmlAttributeCollection attrs = font.Attributes;

                    XmlAttribute fontname =
                        (XmlAttribute)attrs.GetNamedItem("name");
                    XmlAttribute fonttype =
                        (XmlAttribute)attrs.GetNamedItem("type");

                    Console.WriteLine("Font " + fontname.Value + " "
                                      + fonttype.Value);
                }

                /* Count the word elements */
                XmlNodeList words = root.SelectNodes("//tet:Word", nsmgr);
                Console.WriteLine("Found " + words.Count
                                  + " words in document");
            }

            tet.close_document(doc);
        }
        catch (TETException e)
        {
            /* caught exception thrown by TET */
            Console.WriteLine("Error " + e.get_errnum() + " in "
                              + e.get_apiname() + "(): " + e.get_errmsg());
        }
        catch (Exception e)
        {
            Console.WriteLine("General Exception: " + e.ToString());
        }
        finally
        {
            if (tet != null)
            {
                tet.Dispose();
            }
        }
    }