Exemplo n.º 1
    /*
     * Print infos about the document.
     * @param tet The TET object
     * @param doc The TET document handle
     * @throws TETException
     */
    /// <summary>
    /// Prints the general document properties (file name, PDF version,
    /// encryption description, password flags, copy permission and
    /// linearization) and, unless the document was opened in pCOS mode 0,
    /// continues with <c>print_userpassword_infos</c>.
    /// </summary>
    /// <param name="tet">The TET object.</param>
    /// <param name="doc">The TET document handle.</param>
    /// <remarks>
    /// The listing this method was recovered from had lost its braces and the
    /// <c>else</c> keyword; without the <c>else</c> the user-password section
    /// would run even after "no more information available" was printed.
    /// </remarks>
    private static void print_infos(TET tet, int doc)
    {
        /* --------- general information (always available) */
        int pcosmode = (int)tet.pcos_get_number(doc, "pcosmode");

        Console.WriteLine("   File name: "
                          + tet.pcos_get_string(doc, "filename"));

        Console.WriteLine(" PDF version: "
                          + tet.pcos_get_string(doc, "pdfversionstring"));

        Console.WriteLine("  Encryption: "
                          + tet.pcos_get_string(doc, "encrypt/description"));

        Console.WriteLine("   Master pw: "
                          + (tet.pcos_get_number(doc, "encrypt/master") != 0 ? "yes" : "no"));

        Console.WriteLine("     User pw: "
                          + (tet.pcos_get_number(doc, "encrypt/user") != 0 ? "yes" : "no"));

        /* "nocopy" is a restriction flag, hence the inverted yes/no */
        Console.WriteLine("Text copying: "
                          + (tet.pcos_get_number(doc, "encrypt/nocopy") != 0 ? "no" : "yes"));

        Console.WriteLine("  Linearized: "
                          + (tet.pcos_get_number(doc, "linearized") != 0 ? "yes" : "no"));

        if (pcosmode == 0)
        {
            Console.WriteLine("Minimum mode: no more information available\n\n");
        }
        else
        {
            print_userpassword_infos(tet, doc, pcosmode);
        }
    }
Exemplo n.º 2
    /// <summary>
    /// Writes a one-line description of an image resource to the console
    /// (without a terminating newline):
    /// <list type="bullet">
    /// <item>pCOS id of the image (index into the images[] array)</item>
    /// <item>pixel size of the underlying PDF Image XObject</item>
    /// <item>number of components, bits per component and color space name,
    /// plus the base color space name for "Indexed" color spaces</item>
    /// <item>"mergetype=artificial" when images[id]/mergetype is 1</item>
    /// <item>the "stencilmask" property, i.e. /ImageMask in PDF</item>
    /// <item>pCOS id of the mask image, i.e. /Mask or /SMask, if any</item>
    /// </list>
    /// </summary>
    /// <param name="tet">The TET object.</param>
    /// <param name="doc">The TET document handle.</param>
    /// <param name="imageid">pCOS id of the image to describe.</param>
    /// <remarks>
    /// NOTE(review): the recovered listing ends right after the mask output;
    /// a final line terminator may have been lost there - confirm against
    /// the callers' expected console layout.
    /// </remarks>
    static void report_image_info(TET tet, int doc, int imageid)
    {
        /* all image properties live below this pCOS path */
        String imagepath = "images[" + imageid + "]";

        int width = (int)tet.pcos_get_number(doc, imagepath + "/Width");
        int height = (int)tet.pcos_get_number(doc, imagepath + "/Height");
        int bpc = (int)tet.pcos_get_number(doc, imagepath + "/bpc");
        int cs = (int)tet.pcos_get_number(doc, imagepath + "/colorspaceid");
        int components = (int)tet.pcos_get_number(doc,
                                                  "colorspaces[" + cs + "]/components");

        Console.Write("image {0}: {1}x{2} pixel, ", imageid, width, height);

        String csname = tet.pcos_get_string(doc, "colorspaces[" + cs + "]/name");
        Console.Write(components + "x" + bpc + " bit " + csname);

        if (csname == "Indexed")
        {
            /* an indexed color space additionally names its base space */
            int basecs = (int)tet.pcos_get_number(doc,
                                                  "colorspaces[" + cs + "]/baseid");
            String basecsname = tet.pcos_get_string(doc,
                                                    "colorspaces[" + basecs + "]/name");
            Console.Write(" " + basecsname);
        }

        /* Check whether this image has been created by merging smaller images */
        int mergetype = (int)tet.pcos_get_number(doc, imagepath + "/mergetype");
        if (mergetype == 1)
        {
            Console.Write(", mergetype=artificial");
        }

        int stencilmask = (int)tet.pcos_get_number(doc, imagepath + "/stencilmask");
        if (stencilmask == 1)
        {
            Console.Write(", used as stencil mask");
        }

        /* Check whether the image has an attached mask (-1 means none) */
        int maskid = (int)tet.pcos_get_number(doc, imagepath + "/maskid");
        if (maskid != -1)
        {
            Console.Write(", masked with image " + maskid);
        }
    }

Exemplo n.º 3
    /// <summary>
    /// Entry point of the dumper sample: opens the PDF named by the single
    /// command-line argument with "requiredmode=minimum" and prints its
    /// properties via <c>print_infos</c>.
    /// </summary>
    /// <param name="args">Command-line arguments; exactly one file name.</param>
    /// <returns>
    /// 2 for a wrong argument count, 1 when an exception was caught, 0
    /// otherwise (including the case where the document could not be opened,
    /// which is only reported on the console).
    /// </returns>
    /// <remarks>
    /// NOTE(review): the recovered listing had lost its braces, the
    /// else/try/finally keywords and some statements. Statements tagged
    /// "restored" were reconstructed from the surrounding code; confirm them
    /// against the TET .NET API.
    /// </remarks>
    static int Main(String[] args)
    {
        int    exitstat   = 0;
        string searchpath = "{../data} {../../data}";

        if (args.Length != 1)
        {
            Console.WriteLine("usage: dumper <filename>");
            exitstat = 2;
        }
        else
        {
            TET tet = null;
            try
            {
                tet = new TET();
                String docoptlist    = "requiredmode=minimum";
                String globaloptlist = "";
                String optlist;

                optlist = "searchpath={" + searchpath + "}";
                tet.set_option(optlist);            // restored: optlist was otherwise unused

                tet.set_option(globaloptlist);      // restored: globaloptlist was otherwise unused

                int doc = tet.open_document(args[0], docoptlist);
                if (doc == -1)
                {
                    Console.WriteLine("ERROR: " + tet.get_errmsg());
                }
                else
                {
                    print_infos(tet, doc);
                    tet.close_document(doc);        // restored: release the document handle
                }
            }
            catch (TETException e)
            {
                Console.WriteLine("Error " + e.get_errnum() + " in "
                                  + e.get_apiname() + "(): " + e.get_errmsg());
                exitstat = 1;
            }
            catch (Exception e)
            {
                Console.WriteLine("General Exception: " + e.ToString());
                exitstat = 1;
            }
            finally
            {
                if (tet != null)
                {
                    tet.Dispose();                  // restored: body of the dangling null check
                }
            }
        }

        return exitstat;
    }

Exemplo n.º 4
    /*
     * Print infos that require at least the user password.
     * @param tet The TET object
     * @param doc The TET document handle
     * @param pcosmode The pCOS mode for the document
     * @throws TETException
     */
    /// <summary>
    /// Prints PDF/X and PDF/A status, XFA and tagging flags, the page count,
    /// the size of the first page and one line per font. Unless the document
    /// is in restricted mode it then continues with
    /// <c>print_masterpassword_infos</c>.
    /// </summary>
    /// <param name="tet">The TET object.</param>
    /// <param name="doc">The TET document handle.</param>
    /// <param name="pcosmode">The pCOS mode for the document.</param>
    /// <remarks>
    /// NOTE(review): in the recovered listing the braces, both <c>else</c>
    /// keywords and the heads of the two font output calls were missing; the
    /// call heads are reconstructed from their surviving argument lines. A
    /// blank-line print after the font list may also have been lost.
    /// </remarks>
    private static void print_userpassword_infos(TET tet, int doc, int pcosmode)
    {
        Console.WriteLine("PDF/X status: " + tet.pcos_get_string(doc, "pdfx"));

        Console.WriteLine("PDF/A status: " + tet.pcos_get_string(doc, "pdfa"));

        Console.WriteLine("    XFA data: "
                          + (tet.pcos_get_number(doc, "type:/Root/AcroForm/XFA") != 0 ? "yes" : "no"));

        Console.WriteLine("  Tagged PDF: "
                          + (tet.pcos_get_number(doc, "tagged") != 0 ? "yes" : "no"));

        Console.WriteLine("No. of pages: "
                          + (int)tet.pcos_get_number(doc, "length:pages"));

        Console.WriteLine(" Page 1 size: width="
                          + tet.pcos_get_number(doc, "pages[0]/width") + ", height="
                          + tet.pcos_get_number(doc, "pages[0]/height"));

        int count = (int)tet.pcos_get_number(doc, "length:fonts");

        Console.WriteLine("No. of fonts: " + count);

        for (int i = 0; i < count; i++)
        {
            String fontpath = "fonts[" + i + "]";

            if (tet.pcos_get_number(doc, fontpath + "/embedded") != 0)
            {
                Console.Write("embedded ");
            }
            else
            {
                Console.Write("unembedded ");
            }

            /* restored call heads: "<type> font <name>" on one line */
            Console.Write(tet.pcos_get_string(doc, fontpath + "/type")
                          + " font ");
            Console.WriteLine(tet.pcos_get_string(doc, fontpath + "/name"));
        }

        bool plainmetadata =
            tet.pcos_get_number(doc, "encrypt/plainmetadata") != 0;

        /* Restricted: pCOS mode 1, metadata encrypted and copying forbidden */
        if (pcosmode == 1 && !plainmetadata &&
            tet.pcos_get_number(doc, "encrypt/nocopy") != 0)
        {
            Console.WriteLine("Restricted mode: no more information available");
        }
        else
        {
            print_masterpassword_infos(tet, doc);
        }
    }
    /*
     * Extract text from a document for which a TET handle is already available.
     * @param tet
     *            The TET object
     * @param doc
     *            A valid TET document handle
     * @param outfp
     *            Output file handle
     * @throws TETException
     * @throws IOException
     */
    /// <summary>
    /// Extracts the text of every page of an already opened document and
    /// writes it to <paramref name="outfp"/> as UTF-16 bytes. Pages that
    /// cannot be opened are reported on the console and skipped.
    /// </summary>
    /// <param name="tet">The TET object.</param>
    /// <param name="doc">A valid TET document handle.</param>
    /// <param name="outfp">Output file handle.</param>
    /// <remarks>
    /// Uses <c>pageoptlist</c>, which is defined outside this method.
    /// NOTE(review): the recovered listing had lost its braces, comment
    /// delimiters and some statements. The statement that wrote a separator
    /// after each text chunk is missing and was NOT reconstructed because the
    /// separator value is not visible here; <c>close_page</c> was restored so
    /// that page handles are released.
    /// </remarks>
    static void extract_text(TET tet, int doc, BinaryWriter outfp)
    {
        /* little-endian UTF-16 with BOM enabled (the BOM itself is not written here) */
        UnicodeEncoding unicode = new UnicodeEncoding(false, true);

        /*
         * Get number of pages in the document.
         */
        int n_pages = (int)tet.pcos_get_number(doc, "length:pages");

        /* loop over pages */
        for (int pageno = 1; pageno <= n_pages; ++pageno)
        {
            String text;
            int    page;

            page = tet.open_page(doc, pageno, pageoptlist);

            if (page == -1)
            {
                Console.WriteLine("Error " + tet.get_errnum() + " in  "
                                  + tet.get_apiname() + "() on page " + pageno + ": "
                                  + tet.get_errmsg());
                continue; /* try next page */
            }

            /*
             * Retrieve all text fragments; This loop is actually not required
             * for granularity=page, but must be used for other granularities.
             */
            while ((text = tet.get_text(page)) != null)
            {
                outfp.Write(unicode.GetBytes(text)); // print the retrieved text

                /* print a separator between chunks of text */
                /* TODO(review): separator output statement lost in the listing */
            }

            /* get_text() returning null may also signal an error */
            if (tet.get_errnum() != 0)
            {
                Console.WriteLine("Error " + tet.get_errnum() + " in  "
                                  + tet.get_apiname() + "() on page " + pageno + ": "
                                  + tet.get_errmsg());
            }

            tet.close_page(page);   // restored: release the page handle
        }
    }

Exemplo n.º 6
    /*
     * Print document info keys and XMP metadata (requires master pw or
     * plaintext metadata).
     * @param tet The TET object
     * @param doc The TET document handle
     * @throws TETException
     */
    /// <summary>
    /// Prints every entry of the document info dictionary (/Info) and the
    /// size of the XMP metadata stream (/Root/Metadata), if present.
    /// </summary>
    /// <param name="tet">The TET object.</param>
    /// <param name="doc">The TET document handle.</param>
    /// <remarks>
    /// Fix: the key column was padded with
    /// <c>String.Empty.PadLeft(12 - key.Length)</c>, which throws
    /// ArgumentOutOfRangeException for keys longer than 12 characters;
    /// <c>key.PadLeft(12)</c> gives the same layout and leaves longer keys
    /// untouched.
    /// NOTE(review): the heads of the two value output calls were missing in
    /// the recovered listing; the quoting and parenthesis prefixes are
    /// reconstructed from the surviving tails ("'" and "object)").
    /// </remarks>
    private static void print_masterpassword_infos(TET tet, int doc)
    {
        String objtype;
        int    count = (int)tet.pcos_get_number(doc, "length:/Info");

        for (int i = 0; i < count; i++)
        {
            objtype = tet.pcos_get_string(doc, "type:/Info[" + i + "]");
            String key = tet.pcos_get_string(doc, "/Info[" + i + "].key");

            /* right-align the key in a 12-character column */
            Console.Write(key.PadLeft(12) + ": ");

            /* Info entries can be stored as string or name objects */
            if (objtype == "string" || objtype == "name")
            {
                Console.WriteLine("'"
                                  + tet.pcos_get_string(doc, "/Info[" + i + "]") + "'");
            }
            else
            {
                /* other object types: print the type instead of a value
                 * (objtype already holds "type:/Info[i]") */
                Console.WriteLine("("
                                  + objtype
                                  + "object)");
            }
        }

        Console.Write("XMP meta data: ");

        objtype = tet.pcos_get_string(doc, "type:/Root/Metadata");
        if (objtype == "stream")
        {
            byte[] contents = tet.pcos_get_stream(doc, "", "/Root/Metadata");
            Console.Write(contents.Length + " bytes ");

            /* decode as UTF-8 to report the character count as well */
            UTF8Encoding utf8 = new UTF8Encoding();
            String       str  = utf8.GetString(contents);
            Console.WriteLine("(" + str.Length
                              + " Unicode characters)");
        }
        else
        {
            Console.WriteLine("not present\n\n");
        }
    }
    /*
     * Process a single file.
     * @param outfp Output stream for messages
     * @param tet The TET object
     * @param doc The TET document handle
     * @throws TETException
     * @throws IOException
     */
    /// <summary>
    /// Processes one opened document: extracts its own page text, then
    /// processes every document-level file attachment (names/EmbeddedFiles)
    /// and every page-level FileAttachment annotation by handing the
    /// attachment bytes to the four-argument <c>process_document</c> overload.
    /// </summary>
    /// <param name="outfp">Output stream for messages and extracted text.</param>
    /// <param name="tet">The TET object.</param>
    /// <param name="doc">The TET document handle.</param>
    /// <remarks>
    /// The status returned by the overload is ignored, as in the recovered
    /// listing. NOTE(review): that listing had lost its braces, the
    /// <c>else</c> keywords, comment delimiters and the second line of the
    /// attachment-count query; the pCOS path "length:names/EmbeddedFiles" is
    /// reconstructed from the paths used in the loop below.
    /// </remarks>
    private static void process_document(BinaryWriter outfp, TET tet, int doc)
    {
        String          objtype;
        UnicodeEncoding unicode = new UnicodeEncoding(false, true);

        // -------------------- Extract the document's own page contents
        extract_text(tet, doc, outfp);

        // -------------------- Process all document-level file attachments

        // Get the number of document-level file attachments.
        int filecount = (int)tet.pcos_get_number(doc,
                                                 "length:names/EmbeddedFiles");   // restored

        for (int file = 0; file < filecount; file++)
        {
            String attname;
            String filepath = "names/EmbeddedFiles[" + file + "]";

            /*
             * fetch the name of the file attachment; check for Unicode file
             * name (a PDF 1.7 feature)
             */
            objtype = tet.pcos_get_string(doc, "type:" + filepath + "/UF");

            if (objtype == "string")
            {
                attname = tet.pcos_get_string(doc, filepath + "/UF");
            }
            else
            {
                /* fall back to the non-Unicode name, then to a placeholder */
                objtype = tet.pcos_get_string(doc, "type:" + filepath + "/F");

                if (objtype == "string")
                {
                    attname = tet.pcos_get_string(doc, filepath + "/F");
                }
                else
                {
                    attname = "(unnamed)";
                }
            }

            /* fetch the contents of the file attachment and process it */
            objtype = tet.pcos_get_string(doc, "type:" + filepath + "/EF/F");

            if (objtype == "stream")
            {
                outfp.Write(unicode.GetBytes("----- File attachment '" + attname + "':\n"));
                byte[] attdata = tet.pcos_get_stream(doc, "", filepath + "/EF/F");

                process_document(outfp, null, attname, attdata);
                outfp.Write(unicode.GetBytes("----- End file attachment '" + attname + "'\n"));
            }
        }

        // -------------------- Process all page-level file attachments

        int pagecount = (int)tet.pcos_get_number(doc, "length:pages");

        // Check all pages for annotations of type FileAttachment
        for (int page = 0; page < pagecount; page++)
        {
            int annotcount = (int)tet.pcos_get_number(doc, "length:pages["
                                                      + page + "]/Annots");

            for (int annot = 0; annot < annotcount; annot++)
            {
                String annotpath = "pages[" + page + "]/Annots[" + annot + "]";
                String val = tet.pcos_get_string(doc, annotpath + "/Subtype");

                /* page and annotation numbers are reported 1-based */
                String attname = "page " + (page + 1) + ", annotation " + (annot + 1);
                if (val == "FileAttachment")
                {
                    String attpath = annotpath + "/FS/EF/F";

                    /*
                     * fetch the contents of the attachment and process it
                     */
                    objtype = tet.pcos_get_string(doc, "type:" + attpath);

                    if (objtype == "stream")
                    {
                        outfp.Write(unicode.GetBytes("----- Page level attachment '" + attname + "':\n"));
                        byte[] attdata = tet.pcos_get_stream(doc, "", attpath);
                        process_document(outfp, null, attname, attdata);
                        outfp.Write(unicode.GetBytes("----- End page level attachment '" + attname + "'\n"));
                    }
                }
            }
        }
    }

    /*
     * Open a named physical or virtual file, extract the text from it, search
     * for document or page attachments, and process these recursively. Either
     * filename must be supplied for physical files, or data+length from which a
     * virtual file will be created. The caller cannot create the PVF file since
     * we create a new TET object here in case an exception happens with the
     * embedded document - the caller can happily continue with its TET object
     * even in case of an exception here.
     * @param outfp Output stream for messages and extracted text
     * @param filename Name of a physical file, or null/empty to use data
     * @param realname Name used in messages to identify the document
     * @param data Contents of the virtual file if no filename is given
     * @return 0 if successful, otherwise a non-zero code to be used as exit
     *         status
     */
    /// <summary>
    /// Opens a physical file, or a virtual (PVF) file created from
    /// <paramref name="data"/> when no file name is given, and processes it
    /// with the three-argument <c>process_document</c> overload. A separate
    /// TET object is used so that a failure in an embedded document does not
    /// affect the caller's TET object.
    /// </summary>
    /// <param name="outfp">Output stream for messages and extracted text.</param>
    /// <param name="filename">Physical file name, or null/empty to use <paramref name="data"/>.</param>
    /// <param name="realname">Name used in error messages to identify the document.</param>
    /// <param name="data">Contents for the virtual file when no file name is given.</param>
    /// <returns>0 if successful, 5 if the document could not be opened, 1 if an exception was caught.</returns>
    /// <remarks>
    /// Uses <c>docoptlist</c>, which is defined outside this method.
    /// NOTE(review): the recovered listing had lost its braces, the
    /// try/else/finally keywords and some statements. Statements tagged
    /// "restored" were reconstructed from the surrounding code and comments.
    /// A global option call between PVF creation and open_document may also
    /// have been lost; it is not reconstructed because its argument is not
    /// visible here.
    /// </remarks>
    static int process_document(BinaryWriter outfp, String filename, String realname,
                                byte[] data)
    {
        int retval = 0;
        TET tet    = null;

        try
        {
            String pvfname = "/pvf/attachment";

            tet = new TET();

            /*
             * Construct a PVF file if data instead of a filename was provided
             */
            if (String.IsNullOrEmpty(filename))
            {
                tet.create_pvf(pvfname, data, "");
                filename = pvfname;
            }

            int doc = tet.open_document(filename, docoptlist);

            if (doc == -1)
            {
                Console.WriteLine("Error " + tet.get_errnum() + " in  "
                                  + tet.get_apiname() + "() (source: attachment '"
                                  + realname + "'): " + tet.get_errmsg());

                retval = 5;
            }
            else
            {
                process_document(outfp, tet, doc);
                tet.close_document(doc);    // restored: release the document handle
            }

            /*
             * If there was no PVF file deleting it won't do any harm
             */
            tet.delete_pvf(pvfname);        // restored: the statement the comment refers to
        }
        catch (TETException e)
        {
            Console.WriteLine("Error " + e.get_errnum() + " in  "
                              + e.get_apiname() + "() (source: attachment '" + realname
                              + "'): " + e.get_errmsg());
            retval = 1;
        }
        catch (Exception e)
        {
            Console.WriteLine("General Exception: " + e.ToString());
            retval = 1;
        }
        finally
        {
            if (tet != null)
            {
                tet.Dispose();              // restored: body of the dangling null check
            }
        }

        return retval;
    }

Exemplo n.º 9
    /// <summary>
    /// Entry point: walks all pages of the PDF named on the command line,
    /// reports every placed image (pixel geometry via
    /// <c>report_image_info</c> plus placement geometry) and writes the image
    /// and, if present, its mask to files derived from the input file name.
    /// </summary>
    /// <param name="args">Command-line arguments; exactly one file name.</param>
    /// <returns>0 on success, non-zero on usage errors, open failures or exceptions.</returns>
    /// <remarks>
    /// Fixes:
    /// (1) the ".pdf"/".PDF" suffix test mixed &amp;&amp; and || without
    /// grouping, so the length guard did not protect the ".PDF" branch and
    /// Substring threw for names shorter than four characters;
    /// (2) the mask image was written using the id of the masked image
    /// instead of <c>maskid</c>, producing a second copy of the base image.
    /// NOTE(review): the recovered listing had lost its braces, the
    /// try/finally keywords and some statements. Statements tagged "restored"
    /// and the non-zero exit code 2 were reconstructed from the surrounding
    /// code; confirm them against the TET .NET API.
    /// </remarks>
    static int Main(string[] args)
    {
        /* global option list */
        string globaloptlist = "searchpath={{../data} {../../data}}";

        /* document-specific  option list */
        string docoptlist = "";

        /* page-specific option list  e.g
         * "imageanalysis={merge={gap=1} smallimages={maxwidth=20}}"
         */
        string pageoptlist = "";

        TET    tet;
        int    pageno = 0;
        string outfilebase;

        if (args.Length != 1)
        {
            Console.WriteLine("usage: image_resources <filename>");
            return 2;                                   // restored
        }

        /* derive the output base name by stripping a .pdf/.PDF suffix */
        outfilebase = args[0];
        if (outfilebase.Length > 4
            && (outfilebase.EndsWith(".pdf", StringComparison.Ordinal)
                || outfilebase.EndsWith(".PDF", StringComparison.Ordinal)))
        {
            outfilebase = outfilebase.Substring(0, outfilebase.Length - 4);
        }

        tet = new TET();

        try
        {
            int n_pages;

            tet.set_option(globaloptlist);              // restored: globaloptlist was otherwise unused

            int doc = tet.open_document(args[0], docoptlist);

            if (doc == -1)
            {
                Console.WriteLine("Error {0} in {1}(): {2}",
                                  tet.get_errnum(), tet.get_apiname(), tet.get_errmsg());
                return 2;                               // restored: cannot continue without a document
            }

            /* Get number of pages in the document */
            n_pages = (int)tet.pcos_get_number(doc, "length:pages");

            /* Loop over pages and extract images  */
            for (pageno = 1; pageno <= n_pages; ++pageno)
            {
                int page;
                int imagecount = 0;

                page = tet.open_page(doc, pageno, pageoptlist);

                if (page == -1)
                {
                    Console.WriteLine("Error {0} in {1}() on page {2}: {3}",
                                      tet.get_errnum(), tet.get_apiname(), pageno, tet.get_errmsg());
                    continue; /* try next page */
                }

                /*
                 * Retrieve all images on the page
                 */
                while ((tet.get_image_info(page)) == 1)
                {
                    String imageoptlist;
                    int    maskid;

                    imagecount++;                       // restored: counter was never advanced

                    /* Report image details: pixel geometry, color space etc. */
                    report_image_info(tet, doc, tet.imageid);

                    /* Report placement geometry */
                    Console.WriteLine("  placed on page " + pageno +
                                      " at position (" + tet.x.ToString("f2") + ", " + tet.y.ToString("f2") + "): " +
                                      (int)tet.width + "x" + (int)tet.height + "pt, alpha=" + tet.alpha + ", beta=" +
                                      tet.beta);        // restored: tail of the truncated expression

                    /* Write image data to file */
                    imageoptlist = "filename={" + outfilebase + "_p" + pageno + "_" + imagecount + "_I" + tet.imageid + "}";

                    if (tet.write_image_file(doc, tet.imageid, imageoptlist) == -1)
                    {
                        Console.WriteLine("\nError [" + tet.get_errnum() +
                                          " in " + tet.get_apiname() + "(): " + tet.get_errmsg());
                        continue; /* try next image */
                    }

                    /* Check whether the image has a mask attached... */
                    maskid = (int)tet.pcos_get_number(doc,
                                                      "images[" + tet.imageid + "]/maskid");

                    /* and retrieve it if present */
                    if (maskid != -1)
                    {
                        Console.WriteLine("  masked with ");
                        report_image_info(tet, doc, maskid);

                        imageoptlist = "filename={" + outfilebase + "_p" + pageno + "_" + imagecount + "_I" + tet.imageid + "mask_I" + maskid + "}";

                        /* write the mask itself, not the masked image again */
                        if (tet.write_image_file(doc, maskid, imageoptlist) == -1)
                        {
                            Console.WriteLine("\nError [" + tet.get_errnum() +
                                              " in " + tet.get_apiname() +
                                              "() for mask image: " + tet.get_errmsg());
                            continue; /* try next image */
                        }
                    }
                }

                /* the image loop may also have ended because of an error */
                if (tet.get_errnum() != 0)
                {
                    Console.WriteLine("Error {0} in {1}() on page {2}: {3}",
                                      tet.get_errnum(), tet.get_apiname(), pageno, tet.get_errmsg());
                }

                tet.close_page(page);                   // restored: release the page handle
            }

            tet.close_document(doc);                    // restored: release the document handle
        }
        catch (TETException e)
        {
            /* caught exception thrown by TET */
            Console.WriteLine("Error {0} in {1}(): {2}",
                              e.get_errnum(), e.get_apiname(), e.get_errmsg());
            return 2;                                   // restored
        }
        catch (Exception e)
        {
            Console.WriteLine("General Exception: " + e.ToString());
            return 2;                                   // restored
        }
        finally
        {
            if (tet != null)
            {
                tet.Dispose();                          // restored: body of the dangling null check
            }
        }

        return 0;
    }

Exemplo n.º 10
    /// <summary>
    /// Entry point: opens every page of the PDF named on the command line so
    /// that image merging takes place, then enumerates the document's image
    /// resources, reports each one via <c>report_image_info</c> and writes it
    /// to a file derived from the input file name. Images consumed by merging
    /// (mergetype 2) and images flagged as small are skipped.
    /// </summary>
    /// <param name="args">Command-line arguments; exactly one file name.</param>
    /// <returns>0 on success, non-zero on usage errors, open failures or exceptions.</returns>
    /// <remarks>
    /// Fix: the ".pdf"/".PDF" suffix test mixed &amp;&amp; and || without
    /// grouping, so the length guard did not protect the ".PDF" branch and
    /// Substring threw for names shorter than four characters.
    /// NOTE(review): the recovered listing had lost its braces, the
    /// try/finally keywords, comment delimiters and some statements (the
    /// bodies of both "skip" tests, call heads/tails of three WriteLine
    /// calls). Statements tagged "restored" and the non-zero exit code 2 were
    /// reconstructed from the surrounding code; confirm them against the TET
    /// .NET API.
    /// </remarks>
    static int Main(string[] args)
    {
        /* global option list */
        string globaloptlist = "searchpath={{../data} {../../data}}";

        /* document-specific  option list */
        string docoptlist = "";

        /* page-specific option list  e.g
         * "imageanalysis={merge={gap=1} smallimages={maxwidth=20}}"
         */
        string pageoptlist = "";

        TET    tet;
        int    pageno = 0;
        string outfilebase;

        if (args.Length != 1)
        {
            Console.WriteLine("usage: image_resources <filename>");
            return 2;                                   // restored
        }

        /* derive the output base name by stripping a .pdf/.PDF suffix */
        outfilebase = args[0];
        if (outfilebase.Length > 4
            && (outfilebase.EndsWith(".pdf", StringComparison.Ordinal)
                || outfilebase.EndsWith(".PDF", StringComparison.Ordinal)))
        {
            outfilebase = outfilebase.Substring(0, outfilebase.Length - 4);
        }

        tet = new TET();

        try
        {
            int n_pages;

            tet.set_option(globaloptlist);              // restored: globaloptlist was otherwise unused

            int doc = tet.open_document(args[0], docoptlist);

            if (doc == -1)
            {
                Console.WriteLine("Error {0} in {1}(): {2}",
                                  tet.get_errnum(), tet.get_apiname(), tet.get_errmsg());
                return 2;                               // restored: cannot continue without a document
            }

            /* Images will only be merged upon opening a page.
             * In order to enumerate all merged image resources
             * we open all pages before extracting the images.
             */

            /* get number of pages in the document */
            n_pages = (int)tet.pcos_get_number(doc, "length:pages");

            /* Loop over all pages to trigger image merging */
            for (pageno = 1; pageno <= n_pages; ++pageno)
            {
                int page = tet.open_page(doc, pageno, pageoptlist);

                if (page == -1)
                {
                    Console.WriteLine("Error {0} in {1}() on page {2}: {3}",
                                      tet.get_errnum(), tet.get_apiname(), pageno,
                                      tet.get_errmsg());    // restored: fourth format argument
                    continue;                    /* process next page */
                }

                if (tet.get_errnum() != 0)
                {
                    Console.WriteLine("Error {0} in {1}() on page {2}: {3}",
                                      tet.get_errnum(), tet.get_apiname(), pageno,
                                      tet.get_errmsg());    // restored: fourth format argument
                }

                tet.close_page(page);                   // restored: release the page handle
            }

            int imageid, n_images;

            /* Get the number of images in the document */
            n_images = (int)tet.pcos_get_number(doc, "length:images");

            /* Loop over image resources in the document */
            for (imageid = 0; imageid < n_images; ++imageid)
            {
                string imageoptlist;

                /* Skip images which have been consumed by merging */
                int mergetype = (int)tet.pcos_get_number(doc,
                                                         "images[" + imageid + "]/mergetype");

                if (mergetype == 2)
                {
                    continue;                           // restored
                }

                /* Skip small images (see "smallimages" option) */
                if (tet.pcos_get_number(doc, "images[" + imageid + "]/small") > 0)
                {
                    continue;                           // restored
                }

                /* Report image details: pixel geometry, color space etc. */
                report_image_info(tet, doc, imageid);

                /* Write image data to file */
                imageoptlist = " filename={" + outfilebase + "_I" + imageid + "}";

                if (tet.write_image_file(doc, imageid, imageoptlist) == -1)
                {
                    Console.WriteLine(                  // restored call head
                        "Error {0} in {1}(): {2}",
                        tet.get_errnum(), tet.get_apiname(), tet.get_errmsg());
                    continue;              /* process next image */
                }
            }

            tet.close_document(doc);                    // restored: release the document handle
        }
        catch (TETException e)
        {
            /* caught exception thrown by TET */
            Console.WriteLine("Error {0} in {1}(): {2}",
                              e.get_errnum(), e.get_apiname(), e.get_errmsg());
            return 2;                                   // restored
        }
        catch (Exception e)
        {
            Console.WriteLine("General Exception: " + e.ToString());
            return 2;                                   // restored
        }
        finally
        {
            if (tet != null)
            {
                tet.Dispose();                          // restored: body of the dangling null check
            }
        }

        return 0;
    }

Exemplo n.º 11
    /// <summary>
    /// Entry point of the extractor sample: extracts the text of every page
    /// of the input PDF (first argument) with "granularity=page" and writes
    /// it, UTF-16 encoded and preceded by a byte order mark, to the output
    /// file (second argument).
    /// </summary>
    /// <param name="args">Command-line arguments: input file name, output file name.</param>
    /// <returns>0 on success, non-zero on usage errors, open failures or exceptions.</returns>
    /// <remarks>
    /// Fixes:
    /// (1) the error message after the text loop used placeholder {3} with
    /// only three arguments, which makes Console.WriteLine throw
    /// FormatException; it now uses the same four-argument "on page" form as
    /// the open_page error above it;
    /// (2) the output stream and writer are wrapped in <c>using</c> so they
    /// are flushed and closed on every exit path.
    /// NOTE(review): the recovered listing had lost its braces, the
    /// try/finally keywords and the statements that actually write to the
    /// output (<c>w</c>, <c>byteOrderMark</c> and <c>separator</c> were
    /// declared but never used). Statements tagged "restored" and the
    /// non-zero exit code 2 were reconstructed; confirm them.
    /// </remarks>
    static int Main(string[] args)
    {
        /* global option list */
        string globaloptlist = "searchpath={{../data} {../../data}}";

        /* document-specific  option list */
        string docoptlist = "";

        /* page-specific option list */
        string pageoptlist = "granularity=page";

        /* separator to emit after each chunk of text. This depends on the
         * applications needs; for granularity=word a space character may be useful.
         */
        string separator = "\n";

        TET tet;
        int pageno = 0;

        /* little-endian UTF-16 with byte order mark */
        UnicodeEncoding unicode = new UnicodeEncoding(false, true);

        Byte[] byteOrderMark = unicode.GetPreamble();

        if (args.Length != 2)
        {
            Console.WriteLine("usage: extractor <infilename> <outfilename>");
            return 2;                                   // restored
        }

        using (FileStream outfile = File.Create(args[1]))
        using (BinaryWriter w = new BinaryWriter(outfile))
        {
            w.Write(byteOrderMark);                     // restored: BOM was computed but never written

            tet = new TET();

            try
            {
                int n_pages;

                tet.set_option(globaloptlist);          // restored: globaloptlist was otherwise unused

                int doc = tet.open_document(args[0], docoptlist);

                if (doc == -1)
                {
                    Console.WriteLine("Error {0} in {1}(): {2}",
                                      tet.get_errnum(), tet.get_apiname(), tet.get_errmsg());
                    return 2;                           // restored: cannot continue without a document
                }

                /* get number of pages in the document */
                n_pages = (int)tet.pcos_get_number(doc, "length:pages");

                /* loop over pages in the document */
                for (pageno = 1; pageno <= n_pages; ++pageno)
                {
                    string text;
                    int    page;

                    page = tet.open_page(doc, pageno, pageoptlist);

                    if (page == -1)
                    {
                        Console.WriteLine("Error {0} in {1}() on page {2}: {3}",
                                          tet.get_errnum(), tet.get_apiname(), pageno,
                                          tet.get_errmsg());    // restored: fourth format argument
                        continue;                    /* try next page */
                    }

                    /* Retrieve all text fragments; This is actually not required
                     * for granularity=page, but must be used for other
                     * granularities.
                     */
                    while ((text = tet.get_text(page)) != null)
                    {
                        /* print the retrieved text */
                        w.Write(unicode.GetBytes(text));        // restored

                        /* print a separator between chunks of text */
                        w.Write(unicode.GetBytes(separator));   // restored
                    }

                    /* get_text() returning null may also signal an error */
                    if (tet.get_errnum() != 0)
                    {
                        Console.WriteLine("Error {0} in {1}() on page {2}: {3}",
                                          tet.get_errnum(), tet.get_apiname(), pageno,
                                          tet.get_errmsg());
                    }

                    tet.close_page(page);               // restored: release the page handle
                }

                tet.close_document(doc);                // restored: release the document handle
            }
            catch (TETException e)
            {
                /* caught exception thrown by TET */
                Console.WriteLine("Error {0} in {1}(): {2}",
                                  e.get_errnum(), e.get_apiname(), e.get_errmsg());
                return 2;                               // restored
            }
            catch (Exception e)
            {
                Console.WriteLine("General Exception: " + e.ToString());
                return 2;                               // restored
            }
            finally
            {
                if (tet != null)
                {
                    tet.Dispose();                      // restored: body of the dangling null check
                }
            }
        }

        return 0;
    }

Exemplo n.º 12
    /// <summary>
    /// Entry point of the glyphinfo sample: extracts the text of the input
    /// PDF (first argument) word by word and writes, for every glyph, its
    /// Unicode value, font name, size, position, color id, type and
    /// attribute flags to the UTF-8 output file (second argument).
    /// </summary>
    /// <param name="args">Command-line arguments: input file name, output file name.</param>
    /// <remarks>
    /// Fixes:
    /// (1) the glyph itself was passed through String.Format as the format
    /// string, so a '{' or '}' glyph raised FormatException; it is now
    /// appended by plain concatenation;
    /// (2) the <c>(char)</c> cast truncated code points above U+FFFF; those
    /// are now converted with <c>char.ConvertFromUtf32</c>;
    /// (3) the output writer is wrapped in <c>using</c> so buffered output is
    /// flushed and the file closed on every exit path.
    /// NOTE(review): the recovered listing had lost its braces, the
    /// try/finally keywords and some statements; in particular the assembled
    /// glyph line was never written. Statements tagged "restored" were
    /// reconstructed from the surrounding code; confirm them against the TET
    /// .NET API.
    /// </remarks>
    public static void Main(String[] args)
    {
        /* Global option list. */
        string globaloptlist = "searchpath={{../data} {../../data}}";

        /* Document specific option list. */
        string docoptlist = "";

        /* Page-specific option list. */
        string pageoptlist = "granularity=word";

        /* Bit flags in the "attributes" member */
        const int ATTR_NONE    = 0;
        const int ATTR_SUB     = 1;
        const int ATTR_SUP     = 2;
        const int ATTR_DROPCAP = 4;
        const int ATTR_SHADOW  = 8;
        const int ATTR_DH_PRE  = 16;
        const int ATTR_DH_ARTF = 32;
        const int ATTR_DH_POST = 64;

        if (args.Length != 2)
        {
            Console.WriteLine("usage: glyphinfo <infilename> <outfilename>");
            return;                                     // restored
        }

        using (FileStream outfile = File.Create(args[1]))
        using (StreamWriter outfp = new StreamWriter(outfile, System.Text.Encoding.UTF8))
        {
            TET tet = null;

            try
            {
                tet = new TET();

                tet.set_option(globaloptlist);          // restored: globaloptlist was otherwise unused

                int doc = tet.open_document(args[0], docoptlist);

                if (doc == -1)
                {
                    Console.WriteLine("Error " + tet.get_errnum() + " in "
                                      + tet.get_apiname() + "(): " + tet.get_errmsg());
                    return;                             // restored: cannot continue without a document
                }

                /* get number of pages in the document */
                int n_pages = (int)tet.pcos_get_number(doc, "length:pages");

                /* Loop over pages in the document */
                for (int pageno = 1; pageno <= n_pages; ++pageno)
                {
                    string text;
                    int    page;
                    int    previouscolor = -1;

                    page = tet.open_page(doc, pageno, pageoptlist);

                    if (page == -1)
                    {
                        Console.WriteLine("Error " + tet.get_errnum() + " in "
                                          + tet.get_apiname() + "() on page "
                                          + pageno + ": " + tet.get_errmsg());
                        continue;                        /* try next page */
                    }

                    /* Administrative information */
                    outfp.WriteLine("[ Document: '" +
                                    tet.pcos_get_string(doc, "filename") + "' ]");

                    outfp.WriteLine("[ Document options: '" + docoptlist + "' ]");

                    outfp.WriteLine("[ Page options: '" + pageoptlist + "' ]");

                    outfp.WriteLine("[ ----- Page " + pageno + " ----- ]");

                    /* Retrieve all text fragments */
                    while ((text = tet.get_text(page)) != null)
                    {
                        /* print the retrieved text */
                        outfp.WriteLine("[" + text + "]");

                        /* Loop over all glyphs and print their details */
                        while (tet.get_char_info(page) != -1)
                        {
                            string str;
                            string fontname;

                            /* Fetch the font name with pCOS (based on its ID) */
                            fontname = tet.pcos_get_string(doc,
                                                           "fonts[" + tet.fontid + "]/name");

                            /* Print the character */
                            str = String.Format("U+{0}", tet.uv.ToString("X4"));

                            /* ...and the character itself; code points beyond
                             * the BMP need a surrogate pair */
                            string glyph = tet.uv > 0xFFFF
                                ? char.ConvertFromUtf32(tet.uv)
                                : ((char)tet.uv).ToString();
                            str = str + " '" + glyph + "'";

                            /* Print font name, size, and position */
                            str = str + String.Format(" {0} size={1} x={2} y={3}",
                                                      fontname, tet.fontsize.ToString("f2"),
                                                      tet.x.ToString("f2"), tet.y.ToString("f2"));

                            /* Print the color id */
                            str = str + String.Format(" colorid={0}", tet.colorid);

                            /* check whether the text color changes */
                            if (tet.colorid != previouscolor)
                            {
                                str           = print_color_value(str, tet, doc, tet.colorid);
                                previouscolor = tet.colorid;
                            }

                            /* Examine the "type" member */
                            if (tet.type == 1)
                            {
                                str = str + " ligature_start";
                            }
                            else if (tet.type == 10)
                            {
                                str = str + " ligature_cont";
                            }
                            /* Separators are only inserted for granularity > word */
                            else if (tet.type == 12)
                            {
                                str = str + " inserted";
                            }

                            /* Examine the bit flags in the "attributes" member */
                            if (tet.attributes != ATTR_NONE)
                            {
                                if ((tet.attributes & ATTR_SUB) == ATTR_SUB)
                                    str = str + "/sub";
                                if ((tet.attributes & ATTR_SUP) == ATTR_SUP)
                                    str = str + "/sup";
                                if ((tet.attributes & ATTR_DROPCAP) == ATTR_DROPCAP)
                                    str = str + "/dropcap";
                                if ((tet.attributes & ATTR_SHADOW) == ATTR_SHADOW)
                                    str = str + "/shadow";
                                if ((tet.attributes & ATTR_DH_PRE) == ATTR_DH_PRE)
                                    str = str + "/dehyphenation_pre";
                                if ((tet.attributes & ATTR_DH_ARTF) == ATTR_DH_ARTF)
                                    str = str + "/dehyphenation_artifact";
                                if ((tet.attributes & ATTR_DH_POST) == ATTR_DH_POST)
                                    str = str + "/dehyphenation_post";
                            }

                            outfp.WriteLine(str);       // restored: str was assembled but never written
                        }
                    }

                    /* get_text() returning null may also signal an error */
                    if (tet.get_errnum() != 0)
                    {
                        Console.WriteLine("Error " + tet.get_errnum() + " in "
                                          + tet.get_apiname() + "() on page "
                                          + pageno + ": " + tet.get_errmsg());
                    }

                    tet.close_page(page);               // restored: release the page handle
                }

                tet.close_document(doc);                // restored: release the document handle
            }
            catch (TETException e)
            {
                Console.WriteLine("Error " + e.get_errnum() + " in "
                                  + e.get_apiname() + "(): " + e.get_errmsg());
            }
            catch (Exception e)
            {
                Console.WriteLine("General Exception: " + e.ToString());
            }
            finally
            {
                if (tet != null)
                {
                    tet.Dispose();                      // restored: body of the dangling null check
                }
            }
        }
    }
Exemplo n.º 13
    /**
     * Append a description of the fill color with the given color id to str.
     *
     * The appended text has one of these shapes:
     *   " (not filled)"
     *   " (colored Pattern)"
     *   " (shading Pattern, ShadingType=<n>)"
     *   " ([uncolored Pattern, base color: ]<colorspace><details> <c1>/<c2>/...)"
     *
     * We handle only the fill color, but ignore the stroke color.
     * The stroke color can be retrieved analogously with the
     * keyword "stroke".
     *
     * @param str     The text assembled so far
     * @param tet     The TET object
     * @param doc     The TET document handle
     * @param colorid The id of the color to describe
     * @return str with the color description appended
     */
    public static String print_color_value(string str, TET tet, int doc, int colorid)
    {
        String csname;                  /* color space name */
        int    i;

        /* The return value is not needed. tet.colorspaceid, tet.patternid and
         * tet.components are read below; presumably this call fills them in
         * (confirm against the TET API reference).
         */
        tet.get_color_info(doc, colorid, "usage=fill");

        if (tet.colorspaceid == -1 && tet.patternid == -1)
        {
            return str + " (not filled)";
        }

        str = str + " (";

        if (tet.patternid != -1)
        {
            String patternpath = "patterns[" + tet.patternid + "]";
            int    patterntype =
                (int)tet.pcos_get_number(doc, patternpath + "/PatternType");

            if (patterntype == 1)       /* Tiling pattern */
            {
                int painttype =
                    (int)tet.pcos_get_number(doc, patternpath + "/PaintType");

                if (painttype == 1)
                {
                    /* The description is complete, parenthesis already closed */
                    return str + "colored Pattern)";
                }
                else if (painttype == 2)
                {
                    str = str + "uncolored Pattern, base color: ";
                    /* Fall through to the color space output below */
                }
            }
            else if (patterntype == 2)  /* Shading pattern */
            {
                int shadingtype =
                    (int)tet.pcos_get_number(doc, patternpath + "/Shading/ShadingType");

                return str + "shading Pattern, ShadingType=" + shadingtype + ")";
            }
        }

        String cspath = "colorspaces[" + tet.colorspaceid + "]";

        csname = tet.pcos_get_string(doc, cspath + "/name");
        str    = str + csname;

        /* Emit more details depending on the colorspace type */
        switch (csname)
        {
            case "ICCBased":
            {
                int iccprofileid =
                    (int)tet.pcos_get_number(doc, cspath + "/iccprofileid");
                String profilepath  = "iccprofiles[" + iccprofileid + "]";
                String errormessage =
                    tet.pcos_get_string(doc, profilepath + "/errormessage");

                /* A non-empty error message means the embedded profile is
                 * damaged: report the message instead of the profile details.
                 */
                if (!String.IsNullOrEmpty(errormessage))
                {
                    str = str + " (" + errormessage + ")";
                }
                else
                {
                    String profilename =
                        tet.pcos_get_string(doc, profilepath + "/profilename");
                    String profilecs =
                        tet.pcos_get_string(doc, profilepath + "/profilecs");

                    str = str + " '" + profilename + "'";
                    str = str + " '" + profilecs + "'";
                }
                break;
            }

            case "Separation":
            {
                String colorantname =
                    tet.pcos_get_string(doc, cspath + "/colorantname");

                str = str + " '" + colorantname + "'";
                break;
            }

            case "DeviceN":
            {
                str = str + " ";

                /* One colorant name per color component, separated by "/" */
                for (i = 0; i < tet.components.Length; i++)
                {
                    String colorantname =
                        tet.pcos_get_string(doc, cspath + "/colorantnames[" + i + "]");

                    str = str + colorantname;

                    if (i != tet.components.Length - 1)
                    {
                        str = str + "/";
                    }
                }
                break;
            }

            case "Indexed":
            {
                int baseid = (int)tet.pcos_get_number(doc, cspath + "/baseid");

                /* Report the name of the base color space as well */
                csname = tet.pcos_get_string(doc, "colorspaces[" + baseid + "]/name");
                str    = str + " " + csname;
                break;
            }
        }

        /* Finally the color component values, separated by "/" */
        str = str + " ";
        for (i = 0; i < tet.components.Length; i++)
        {
            str = str + String.Format("{0}", tet.components[i]);

            if (i != tet.components.Length - 1)
            {
                str = str + "/";
            }
        }

        return str + ")";
    }
    /**
     * Print every text fragment of a PDF document whose first glyph uses a
     * font matching the size and name triggers configured below.
     *
     * Errors are reported on the console; the method does not throw.
     *
     * @param args Command line arguments; exactly one is expected, the name
     *             of the input PDF file
     */
    public static void Main(String[] args)
    {
        /* Global option list. */
        string globaloptlist = "searchpath={{../data} {../../data}}";

        /* Document specific option list. */
        string docoptlist = "";

        /* Page-specific option list. */
        string pageoptlist = "granularity=line";

        /* Search text with at least this size (use 0 to catch all sizes). */
        double fontsizetrigger = 10;

        /* Catch text where the font name contains this string (use empty string
         * to catch all font names).
         */
        String fontnametrigger = "Bold";

        TET tet    = null;
        int pageno = 0;     /* stays 0 until the page loop starts; used for error reporting */

        if (args.Length != 1)
        {
            Console.WriteLine("usage: fontfilter <infilename>");
            return;         /* without a file name there is nothing to do */
        }

        try
        {
            tet = new TET();

            /* Apply the global options, otherwise the search path is never used */
            tet.set_option(globaloptlist);

            int doc = tet.open_document(args[0], docoptlist);
            if (doc == -1)
            {
                Console.WriteLine("Error " + tet.get_errnum() + " in "
                                  + tet.get_apiname() + "(): " + tet.get_errmsg());
                return;     /* no valid document handle to work with */
            }

            /* Loop over pages in the document */
            int n_pages = (int)tet.pcos_get_number(doc, "length:pages");
            for (pageno = 1; pageno <= n_pages; ++pageno)
            {
                int page = tet.open_page(doc, pageno, pageoptlist);

                if (page == -1)
                {
                    Console.WriteLine("Error " + tet.get_errnum() + " in "
                                      + tet.get_apiname() + "(): " + tet.get_errmsg());
                    continue; /* try next page */
                }

                /* Retrieve all text fragments for the page */
                String text;
                while ((text = tet.get_text(page)) != null)
                {
                    /* In this sample we check only the first character of
                     * each fragment, hence a single get_char_info() call
                     * instead of a loop over all characters.
                     */
                    if (tet.get_char_info(page) == -1)
                    {
                        continue;
                    }

                    /* We need only the font name and size; the text
                     * position could be fetched from tet.x and tet.y.
                     */
                    String fontname = tet.pcos_get_string(doc,
                                                          "fonts[" + tet.fontid + "]/name");

                    /* Check whether we found a match. Font names are
                     * identifiers, so compare ordinally; an empty trigger
                     * matches every name.
                     */
                    if (tet.fontsize >= fontsizetrigger &&
                        fontname.IndexOf(fontnametrigger, StringComparison.Ordinal) != -1)
                    {
                        /* print the retrieved font name, size, and text */
                        Console.WriteLine("[{0} {1:0.00}] {2}", fontname,
                                          tet.fontsize, text);
                    }
                }

                if (tet.get_errnum() != 0)
                {
                    Console.WriteLine("Error " + tet.get_errnum() + " in "
                                      + tet.get_apiname() + "(): " + tet.get_errmsg());
                }

                tet.close_page(page);
            }

            tet.close_document(doc);
        }
        catch (TETException e)
        {
            if (pageno == 0)
            {
                Console.WriteLine("Error " + e.get_errnum() + " in "
                                  + e.get_apiname() + "(): " + e.get_errmsg() + "\n");
            }
            else
            {
                Console.WriteLine("Error " + e.get_errnum() + " in "
                                  + e.get_apiname() + "() on page " + pageno + ": "
                                  + e.get_errmsg() + "\n");
            }
        }
        catch (Exception e)
        {
            Console.WriteLine("General Exception: " + e.ToString());
        }
        finally
        {
            /* Release the TET object on every exit path */
            if (tet != null)
            {
                tet.Dispose();
            }
        }
    }
Exemplo n.º 15
    public static void Main(String[] args)
        /* Global option list. */
        string globaloptlist = "searchpath={{../data} {../../data}}";

        /* Document specific option list. */
        string basedocoptlist = "";

        /* Page-specific option list. */

        /* Remove the tetml= option if you don't need font and geometry
         * information */
        string pageoptlist = "granularity=word tetml={glyphdetails={all}}";

        /* set this to true to generate TETML output in memory */
        bool inmemory = false;

        if (args.Length != 2)
            Console.WriteLine("usage: tetml <pdffilename> <xmlfilename>");

        TET tet = null;

            String docoptlist;

            tet = new TET();

            if (inmemory)
                 * This program fetches the TETML data encoded in UTF-8.
                 * Subsequently the data is converted to a .NET String,
                 * which is encoded in UTF-16.
                 * While it is not strictly necessary in case of this program, it
                 * is more clean to instruct TET to put 'encoding="UTF-16"' into
                 * the XML header.
                docoptlist = "tetml={encodingname=UTF-16} " + basedocoptlist;
                docoptlist = "tetml={filename={" + args[1] + "}} "
                             + basedocoptlist;

            int doc = tet.open_document(args[0], docoptlist);

            if (doc == -1)
                Console.WriteLine("Error " + tet.get_errnum() + " in "
                                  + tet.get_apiname() + "(): " + tet.get_errmsg());

            int n_pages = (int)tet.pcos_get_number(doc, "length:pages");

            /* Loop over pages in the document */
            for (int pageno = 1; pageno <= n_pages; ++pageno)
                tet.process_page(doc, pageno, pageoptlist);

            /* This could be combined with the last page-related call. */
            tet.process_page(doc, 0, "tetml={trailer}");

            if (inmemory)
                /* Get the XML document as a byte array. */
                byte[] tetml = tet.get_tetml(doc, "");

                if (tetml == null)
                    Console.WriteLine("tetml: couldn't retrieve XML data");

                /* Process the in-memory XML document to print out some
                 * information that is extracted with the sax_handler class.
                XmlDocument  xmldoc   = new XmlDocument();
                UTF8Encoding utf8_enc = new UTF8Encoding();
                String       stetml   = utf8_enc.GetString(tetml);

                XmlNodeList nodeList;
                XmlElement  root = xmldoc.DocumentElement;

                /* Create an XmlNamespaceManager for resolving namespaces. */
                XmlNamespaceManager nsmgr =
                    new XmlNamespaceManager(xmldoc.NameTable);

                nodeList = root.SelectNodes("//tet:Font", nsmgr);
                IEnumerator ienum = nodeList.GetEnumerator();
                while (ienum.MoveNext())
                    XmlNode font = (XmlNode)ienum.Current;
                    XmlAttributeCollection attrColl = font.Attributes;

                    XmlAttribute name_attr =
                    XmlAttribute type_attr =
                    Console.WriteLine("Font " + name_attr.Value + " "
                                      + type_attr.Value);
                nodeList = root.SelectNodes("//tet:Word", nsmgr);
                Console.WriteLine("Found " + nodeList.Count
                                  + " words in document");

        catch (TETException e)
            Console.WriteLine("Error " + e.get_errnum() + " in "
                              + e.get_apiname() + "(): " + e.get_errmsg());
        catch (Exception e)
            Console.WriteLine("General Exception: " + e.ToString());
            if (tet != null)