/**
     * Extract text from a document for which a TET handle is already available.
     *
     * @param tet
     *            The TET object
     * @param doc
     *            A valid TET document handle
     * @param outfp
     *            Output file handle
     *
     * @throws TETException
     * @throws IOException
     */
    static void extract_text(TET tet, int doc, BinaryWriter outfp)
    {
        /* Encoder for everything written to outfp: UTF-16, little-endian. */
        UnicodeEncoding utf16 = new UnicodeEncoding(false, true);

        /* The page count is queried through the pCOS path "length:pages". */
        int pageCount = (int)tet.pcos_get_number(doc, "length:pages");

        /* Page numbers passed to open_page() start at 1. */
        for (int pageno = 1; pageno <= pageCount; pageno++)
        {
            int page = tet.open_page(doc, pageno, pageoptlist);

            if (page != -1)
            {
                /*
                 * Drain every text fragment of the page. With
                 * granularity=page one call would suffice; the loop is
                 * needed for the other granularities.
                 */
                for (String chunk = tet.get_text(page);
                     chunk != null;
                     chunk = tet.get_text(page))
                {
                    /* the fragment itself ... */
                    outfp.Write(utf16.GetBytes(chunk));

                    /* ... followed by the separator between fragments */
                    outfp.Write(utf16.GetBytes(separator));
                }

                /* A non-zero error number after the loop is reported. */
                if (tet.get_errnum() != 0)
                {
                    string problem = "Error " + tet.get_errnum() + " in  "
                                     + tet.get_apiname() + "() on page " + pageno + ": "
                                     + tet.get_errmsg();
                    Console.WriteLine(problem);
                }

                tet.close_page(page);
            }
            else
            {
                /* The page could not be opened: report it, then go on with the next page. */
                string problem = "Error " + tet.get_errnum() + " in  "
                                 + tet.get_apiname() + "() on page " + pageno + ": "
                                 + tet.get_errmsg();
                Console.WriteLine(problem);
            }
        }
    }
예제 #2
0
    /*
     * Sample entry point ("image_resources"): enumerates the image resources
     * of a PDF document and writes each of them to a file.
     *
     * args must hold exactly one element, the input file name. A trailing
     * ".pdf" or ".PDF" is removed from it to form the base of the output
     * names; the filename option passed to write_image_file() is
     * "<base>_I<imageid>".
     *
     * Returns 0 when processing ran to the end, and 2 for a usage error, a
     * document that cannot be opened, or a caught exception. A failure to
     * open a single page or to write a single image is reported on the
     * console and does not change the return value.
     */
    static int Main(string[] args)
    {
        /* global option list */
        string globaloptlist = "searchpath={{../data} {../../data}}";

        /* document-specific option list */
        string docoptlist = "";

        /* page-specific option list, e.g.
         * "imageanalysis={merge={gap=1} smallimages={maxwidth=20}}"
         */
        string pageoptlist = "";

        TET    tet;
        int    pageno = 0;
        string outfilebase;

        if (args.Length != 1)
        {
            Console.WriteLine("usage: image_resources <filename>");
            return 2;
        }

        outfilebase = args[0];

        /* Strip a trailing ".pdf" or ".PDF". The length test must guard BOTH
         * suffix comparisons: without the extra parentheses "&&" binds
         * tighter than "||", and the second comparison would run (and take a
         * substring at a negative index) for names shorter than 4 characters.
         */
        if (outfilebase.Length > 4 &&
            (outfilebase.EndsWith(".pdf", StringComparison.Ordinal) ||
             outfilebase.EndsWith(".PDF", StringComparison.Ordinal)))
        {
            outfilebase = outfilebase.Substring(0, outfilebase.Length - 4);
        }

        tet = new TET();

        try
        {
            int n_pages;

            tet.set_option(globaloptlist);

            int doc = tet.open_document(args[0], docoptlist);

            if (doc == -1)
            {
                Console.WriteLine("Error {0} in {1}(): {2}",
                                  tet.get_errnum(), tet.get_apiname(), tet.get_errmsg());
                return 2;
            }

            /* Images will only be merged upon opening a page.
             * In order to enumerate all merged image resources
             * we open all pages before extracting the images.
             */

            /* get number of pages in the document */
            n_pages = (int)tet.pcos_get_number(doc, "length:pages");

            /* Loop over all pages to trigger image merging */
            for (pageno = 1; pageno <= n_pages; ++pageno)
            {
                int page = tet.open_page(doc, pageno, pageoptlist);

                if (page == -1)
                {
                    Console.WriteLine("Error {0} in {1}() on page {2}: {3}",
                                      tet.get_errnum(), tet.get_apiname(), pageno,
                                      tet.get_errmsg());
                    continue;                    /* process next page */
                }

                if (tet.get_errnum() != 0)
                {
                    Console.WriteLine("Error {0} in {1}() on page {2}: {3}",
                                      tet.get_errnum(), tet.get_apiname(), pageno,
                                      tet.get_errmsg());
                }
                tet.close_page(page);
            }

            /* Get the number of images in the document */
            int n_images = (int)tet.pcos_get_number(doc, "length:images");

            /* Loop over image resources in the document */
            for (int imageid = 0; imageid < n_images; ++imageid)
            {
                /* Skip images which have been consumed by merging */
                int mergetype = (int)tet.pcos_get_number(doc,
                                                         "images[" + imageid + "]/mergetype");

                if (mergetype == 2)
                {
                    continue;
                }

                /* Skip small images (see "smallimages" option) */
                if (tet.pcos_get_number(doc, "images[" + imageid + "]/small") > 0)
                {
                    continue;
                }

                /* Report image details: pixel geometry, color space etc. */
                report_image_info(tet, doc, imageid);

                /* Write image data to file */
                string imageoptlist = " filename={" + outfilebase + "_I" + imageid + "}";

                if (tet.write_image_file(doc, imageid, imageoptlist) == -1)
                {
                    Console.WriteLine(
                        "Error {0} in {1}(): {2}",
                        tet.get_errnum(), tet.get_apiname(), tet.get_errmsg());
                    continue;              /* process next image */
                }
            }
            tet.close_document(doc);
        }
        catch (TETException e)
        {
            /* caught exception thrown by TET */
            Console.WriteLine("Error {0} in {1}(): {2}",
                              e.get_errnum(), e.get_apiname(), e.get_errmsg());
            return 2;
        }
        catch (Exception e)
        {
            Console.WriteLine("General Exception: " + e.ToString());
            return 2;
        }
        finally
        {
            /* Runs on every exit path, including the early returns above. */
            if (tet != null)
            {
                tet.Dispose();
            }
        }

        return 0;
    }
예제 #3
0
    /*
     * Sample entry point: walks over all pages of a PDF document, reports
     * every placed image (and its mask, if one is attached) and writes the
     * image data to files.
     *
     * args must hold exactly one element, the input file name. A trailing
     * ".pdf" or ".PDF" is removed from it to form the base of the output
     * names. The filename option passed to write_image_file() is
     * "<base>_p<page>_<n>_I<imageid>" for an image and
     * "<base>_p<page>_<n>_I<imageid>mask_I<maskid>" for its mask.
     *
     * Returns 0 when processing ran to the end, and 2 for a usage error, a
     * document that cannot be opened, or a caught exception. Per-page and
     * per-image failures are reported on the console only.
     */
    static int Main(string[] args)
    {
        /* global option list */
        string globaloptlist = "searchpath={{../data} {../../data}}";

        /* document-specific option list */
        string docoptlist = "";

        /* page-specific option list, e.g.
         * "imageanalysis={merge={gap=1} smallimages={maxwidth=20}}"
         */
        string pageoptlist = "";

        TET    tet;
        int    pageno = 0;
        string outfilebase;

        if (args.Length != 1)
        {
            Console.WriteLine("usage: image_resources <filename>");
            return 2;
        }

        outfilebase = args[0];

        /* Strip a trailing ".pdf" or ".PDF". The length test must guard BOTH
         * suffix comparisons: without the extra parentheses "&&" binds
         * tighter than "||", and the second comparison would run (and take a
         * substring at a negative index) for names shorter than 4 characters.
         */
        if (outfilebase.Length > 4 &&
            (outfilebase.EndsWith(".pdf", StringComparison.Ordinal) ||
             outfilebase.EndsWith(".PDF", StringComparison.Ordinal)))
        {
            outfilebase = outfilebase.Substring(0, outfilebase.Length - 4);
        }

        tet = new TET();

        try
        {
            int n_pages;

            tet.set_option(globaloptlist);

            int doc = tet.open_document(args[0], docoptlist);

            if (doc == -1)
            {
                Console.WriteLine("Error {0} in {1}(): {2}",
                                  tet.get_errnum(), tet.get_apiname(), tet.get_errmsg());
                return 2;
            }

            /* Get number of pages in the document */
            n_pages = (int)tet.pcos_get_number(doc, "length:pages");

            /* Loop over pages and extract images */
            for (pageno = 1; pageno <= n_pages; ++pageno)
            {
                int page;
                int imagecount = 0;   /* running number of the image on this page */

                page = tet.open_page(doc, pageno, pageoptlist);

                if (page == -1)
                {
                    Console.WriteLine("Error {0} in {1}() on page {2}: {3}",
                                      tet.get_errnum(), tet.get_apiname(), pageno, tet.get_errmsg());
                    continue; /* try next page */
                }

                /*
                 * Retrieve all images on the page
                 */
                while ((tet.get_image_info(page)) == 1)
                {
                    String imageoptlist;
                    int    maskid;

                    imagecount++;

                    /* Report image details: pixel geometry, color space etc. */
                    report_image_info(tet, doc, tet.imageid);

                    /* Report placement geometry */
                    Console.WriteLine("  placed on page " + pageno +
                                      " at position (" + tet.x.ToString("f2") + ", " + tet.y.ToString("f2") + "): " +
                                      (int)tet.width + "x" + (int)tet.height + "pt, alpha=" + tet.alpha + ", beta=" +
                                      tet.beta);

                    /* Write image data to file */
                    imageoptlist = "filename={" + outfilebase + "_p" + pageno + "_" + imagecount + "_I" + tet.imageid + "}";

                    if (tet.write_image_file(doc, tet.imageid, imageoptlist) == -1)
                    {
                        Console.WriteLine("\nError [" + tet.get_errnum() +
                                          " in " + tet.get_apiname() + "(): " + tet.get_errmsg());
                        continue; /* try next image */
                    }

                    /* Check whether the image has a mask attached... */
                    maskid = (int)tet.pcos_get_number(doc,
                                                      "images[" + tet.imageid + "]/maskid");

                    /* and retrieve it if present */
                    if (maskid != -1)
                    {
                        Console.WriteLine("  masked with ");
                        report_image_info(tet, doc, maskid);

                        imageoptlist = "filename={" + outfilebase + "_p" + pageno + "_" + imagecount + "_I" + tet.imageid + "mask_I" + maskid + "}";

                        /* Write the MASK image: the id passed here must be
                         * maskid. Passing tet.imageid would store a second
                         * copy of the base image under the mask's file name.
                         */
                        if (tet.write_image_file(doc, maskid, imageoptlist) == -1)
                        {
                            Console.WriteLine("\nError [" + tet.get_errnum() +
                                              " in " + tet.get_apiname() +
                                              "() for mask image: " + tet.get_errmsg());
                            continue; /* try next image */
                        }
                    }

                    /* NOTE(review): this check runs once per image, so a
                     * failure that ends the get_image_info() loop is not
                     * reported here. Confirm whether it was meant to follow
                     * the while loop instead.
                     */
                    if (tet.get_errnum() != 0)
                    {
                        Console.WriteLine("Error {0} in {1}() on page {2}: {3}",
                                          tet.get_errnum(), tet.get_apiname(), pageno, tet.get_errmsg());
                    }
                }
                tet.close_page(page);
            }

            tet.close_document(doc);
        }
        catch (TETException e)
        {
            /* caught exception thrown by TET */
            Console.WriteLine("Error {0} in {1}(): {2}",
                              e.get_errnum(), e.get_apiname(), e.get_errmsg());
            return 2;
        }
        catch (Exception e)
        {
            Console.WriteLine("General Exception: " + e.ToString());
            return 2;
        }
        finally
        {
            /* Runs on every exit path, including the early returns above. */
            if (tet != null)
            {
                tet.Dispose();
            }
        }

        return 0;
    }
예제 #4
0
    /*
     * Sample entry point ("extractor"): extracts the text of all pages of a
     * PDF document and writes it, UTF-16 encoded and preceded by a byte
     * order mark, to an output file.
     *
     * args[0] is the input file name, args[1] the output file name.
     *
     * Returns 0 when processing ran to the end, and 2 for a usage error, a
     * document that cannot be opened, or a caught exception. A page that
     * cannot be opened is reported on the console and skipped.
     */
    static int Main(string[] args)
    {
        /* global option list */
        string globaloptlist = "searchpath={{../data} {../../data}}";

        /* document-specific option list */
        string docoptlist = "";

        /* page-specific option list */
        string pageoptlist = "granularity=page";

        /* separator to emit after each chunk of text. This depends on the
         * application's needs; for granularity=word a space character may be
         * useful.
         */
        string separator = "\n";

        TET          tet = null;
        FileStream   outfile;
        BinaryWriter w;
        int          pageno = 0;

        UnicodeEncoding unicode = new UnicodeEncoding(false, true);

        Byte[] byteOrderMark = unicode.GetPreamble();

        if (args.Length != 2)
        {
            Console.WriteLine("usage: extractor <infilename> <outfilename>");
            return 2;
        }

        outfile = File.Create(args[1]);
        w       = new BinaryWriter(outfile);

        try
        {
            int n_pages;

            /* Everything that can fail after the output file was created
             * runs inside the try block, so the finally clause closes the
             * file on every path.
             */
            w.Write(byteOrderMark);

            tet = new TET();

            tet.set_option(globaloptlist);

            int doc = tet.open_document(args[0], docoptlist);

            if (doc == -1)
            {
                Console.WriteLine("Error {0} in {1}(): {2}",
                                  tet.get_errnum(), tet.get_apiname(), tet.get_errmsg());
                return 2;
            }

            /* get number of pages in the document */
            n_pages = (int)tet.pcos_get_number(doc, "length:pages");

            /* loop over pages in the document */
            for (pageno = 1; pageno <= n_pages; ++pageno)
            {
                string text;
                int    page;

                page = tet.open_page(doc, pageno, pageoptlist);

                if (page == -1)
                {
                    Console.WriteLine("Error {0} in {1}() on page {2}: {3}",
                                      tet.get_errnum(), tet.get_apiname(), pageno,
                                      tet.get_errmsg());
                    continue;                    /* try next page */
                }

                /* Retrieve all text fragments; This is actually not required
                 * for granularity=page, but must be used for other
                 * granularities.
                 */
                while ((text = tet.get_text(page)) != null)
                {
                    /* print the retrieved text */
                    w.Write(unicode.GetBytes(text));

                    /* print a separator between chunks of text */
                    w.Write(unicode.GetBytes(separator));
                }

                if (tet.get_errnum() != 0)
                {
                    /* The format items must match the argument list: an item
                     * index beyond the last argument makes WriteLine throw a
                     * FormatException instead of printing the error.
                     */
                    Console.WriteLine("Error {0} in {1}() on page {2}: {3}",
                                      tet.get_errnum(), tet.get_apiname(), pageno,
                                      tet.get_errmsg());
                }
                tet.close_page(page);
            }
            tet.close_document(doc);
        }
        catch (TETException e)
        {
            /* caught exception thrown by TET */
            Console.WriteLine("Error {0} in {1}(): {2}",
                              e.get_errnum(), e.get_apiname(), e.get_errmsg());
            return 2;   /* a TET failure is not a successful run */
        }
        catch (Exception e)
        {
            Console.WriteLine("General Exception: " + e.ToString());
            return 2;
        }
        finally
        {
            /* Closing the writer also closes the underlying file stream. */
            w.Close();
            if (tet != null)
            {
                tet.Dispose();
            }
        }

        return 0;
    }
예제 #5
0
    /*
     * Sample entry point ("glyphinfo"): extracts the text of a PDF document
     * with granularity=word and writes, for every text fragment, the
     * fragment itself followed by one line of details per glyph (Unicode
     * value, font name, size, position, color id, type and attributes) to a
     * UTF-8 encoded output file.
     *
     * args[0] is the input file name, args[1] the output file name.
     *
     * Errors are reported on the console; the method returns no status.
     */
    public static void Main(String[] args)
    {
        /* Global option list. */
        string globaloptlist = "searchpath={{../data} {../../data}}";

        /* Document specific option list. */
        string docoptlist = "";

        /* Page-specific option list. */
        string pageoptlist = "granularity=word";

        FileStream   outfile;
        StreamWriter outfp;

        if (args.Length != 2)
        {
            Console.WriteLine("usage: glyphinfo <infilename> <outfilename>");
            return;
        }

        outfile = File.Create(args[1]);
        outfp   = new StreamWriter(outfile, System.Text.Encoding.UTF8);

        TET tet = null;

        try
        {
            tet = new TET();

            tet.set_option(globaloptlist);

            int doc = tet.open_document(args[0], docoptlist);

            if (doc == -1)
            {
                Console.WriteLine("Error " + tet.get_errnum() + " in "
                                  + tet.get_apiname() + "(): " + tet.get_errmsg());
                return;
            }

            /* get number of pages in the document */
            int n_pages = (int)tet.pcos_get_number(doc, "length:pages");

            /* Loop over pages in the document */
            for (int pageno = 1; pageno <= n_pages; ++pageno)
            {
                string text;
                int    page;
                int    previouscolor = -1;   /* no color seen yet on this page */

                page = tet.open_page(doc, pageno, pageoptlist);

                if (page == -1)
                {
                    Console.WriteLine("Error " + tet.get_errnum() + " in "
                                      + tet.get_apiname() + "() on page "
                                      + pageno + ": " + tet.get_errmsg());
                    continue;                        /* try next page */
                }

                /* Administrative information */
                outfp.WriteLine("[ Document: '" +
                                tet.pcos_get_string(doc, "filename") + "' ]");

                outfp.WriteLine("[ Document options: '" + docoptlist + "' ]");

                outfp.WriteLine("[ Page options: '" + pageoptlist + "' ]");

                outfp.WriteLine("[ ----- Page " + pageno + " ----- ]");

                /* Retrieve all text fragments */
                while ((text = tet.get_text(page)) != null)
                {
                    /* print the retrieved text */
                    outfp.WriteLine("[" + text + "]");

                    /* Loop over all glyphs and print their details */
                    while (tet.get_char_info(page) != -1)
                    {
                        string str;
                        string fontname;

                        /* Fetch the font name with pCOS (based on its ID) */
                        fontname = tet.pcos_get_string(doc,
                                                       "fonts[" + tet.fontid + "]/name");

                        /* Print the character */
                        str = String.Format("U+{0}", tet.uv.ToString("X4"));

                        /* ...and the character itself. Plain concatenation
                         * is required here: the character comes from the
                         * document, and using it inside a format string
                         * throws a FormatException for '{' and '}'.
                         * NOTE(review): the cast to char keeps only 16 bits;
                         * confirm how values above U+FFFF should be printed.
                         */
                        str = str + " '" + (char)(tet.uv) + "'";

                        /* Print font name, size, and position */
                        str = str + String.Format(" {0} size={1} x={2} y={3}",
                                                  fontname, tet.fontsize.ToString("f2"),
                                                  tet.x.ToString("f2"), tet.y.ToString("f2"));
                        /* Print the color id */
                        str = str + String.Format(" colorid={0}", tet.colorid);

                        /* check whether the text color changes */
                        if (tet.colorid != previouscolor)
                        {
                            str           = print_color_value(str, tet, doc, tet.colorid);
                            previouscolor = tet.colorid;
                        }
                        /* Examine the "type" member */
                        if (tet.type == 1)
                        {
                            str = str + " ligature_start";
                        }

                        else if (tet.type == 10)
                        {
                            str = str + " ligature_cont";
                        }

                        /* Separators are only inserted for granularity > word */
                        else if (tet.type == 12)
                        {
                            str = str + " inserted";
                        }

                        /* Examine the bit flags in the "attributes" member */
                        const int ATTR_NONE    = 0;
                        const int ATTR_SUB     = 1;
                        const int ATTR_SUP     = 2;
                        const int ATTR_DROPCAP = 4;
                        const int ATTR_SHADOW  = 8;
                        const int ATTR_DH_PRE  = 16;
                        const int ATTR_DH_ARTF = 32;
                        const int ATTR_DH_POST = 64;

                        if (tet.attributes != ATTR_NONE)
                        {
                            if ((tet.attributes & ATTR_SUB) == ATTR_SUB)
                            {
                                str = str + "/sub";
                            }
                            if ((tet.attributes & ATTR_SUP) == ATTR_SUP)
                            {
                                str = str + "/sup";
                            }
                            if ((tet.attributes & ATTR_DROPCAP) == ATTR_DROPCAP)
                            {
                                str = str + "/dropcap";
                            }
                            if ((tet.attributes & ATTR_SHADOW) == ATTR_SHADOW)
                            {
                                str = str + "/shadow";
                            }
                            if ((tet.attributes & ATTR_DH_PRE) == ATTR_DH_PRE)
                            {
                                str = str + "/dehyphenation_pre";
                            }
                            if ((tet.attributes & ATTR_DH_ARTF) == ATTR_DH_ARTF)
                            {
                                str = str + "/dehyphenation_artifact";
                            }
                            if ((tet.attributes & ATTR_DH_POST) == ATTR_DH_POST)
                            {
                                str = str + "/dehyphenation_post";
                            }
                        }
                        outfp.WriteLine(str);
                    }
                    outfp.WriteLine("");
                }

                if (tet.get_errnum() != 0)
                {
                    Console.WriteLine("Error " + tet.get_errnum() + " in "
                                      + tet.get_apiname() + "() on page "
                                      + pageno + ": " + tet.get_errmsg());
                }

                tet.close_page(page);
            }

            tet.close_document(doc);
        }
        catch (TETException e)
        {
            Console.WriteLine("Error " + e.get_errnum() + " in "
                              + e.get_apiname() + "(): " + e.get_errmsg());
        }
        catch (Exception e)
        {
            Console.WriteLine("General Exception: " + e.ToString());
        }
        finally
        {
            /* StreamWriter buffers its output: without closing it the tail
             * of the output (or all of it) may never reach the file. Closing
             * the writer flushes the buffer and closes the underlying stream.
             */
            outfp.Close();

            if (tet != null)
            {
                tet.Dispose();
            }
        }
    }
    /*
     * Sample entry point ("fontfilter"): prints every line of text in a PDF
     * document whose first character is set in a font whose name contains
     * fontnametrigger and whose size is at least fontsizetrigger.
     *
     * args must hold exactly one element, the input file name.
     *
     * Errors are reported on the console; the method returns no status.
     */
    public static void Main(String[] args)
    {
        /* Global option list. */
        string globaloptlist = "searchpath={{../data} {../../data}}";

        /* Document specific option list. */
        string docoptlist = "";

        /* Page-specific option list. */
        string pageoptlist = "granularity=line";

        /* Search text with at least this size (use 0 to catch all sizes). */
        double fontsizetrigger = 10;

        /* Catch text where the font name contains this string (use empty string
         * to catch all font names).
         */
        String fontnametrigger = "Bold";

        TET tet    = null;
        int pageno = 0;   /* stays 0 until the page loop starts; read by the catch block */

        if (args.Length != 1)
        {
            Console.WriteLine("usage: fontfilter <infilename>");
            return;
        }

        try
        {
            tet = new TET();
            tet.set_option(globaloptlist);

            int doc = tet.open_document(args[0], docoptlist);
            if (doc == -1)
            {
                Console.WriteLine("Error " + tet.get_errnum() + " in "
                                  + tet.get_apiname() + "(): " + tet.get_errmsg());
                return;
            }

            /* Loop over pages in the document */
            int n_pages = (int)tet.pcos_get_number(doc, "length:pages");
            for (pageno = 1; pageno <= n_pages; ++pageno)
            {
                int page = tet.open_page(doc, pageno, pageoptlist);

                if (page == -1)
                {
                    Console.WriteLine("Error " + tet.get_errnum() + " in "
                                      + tet.get_apiname() + "(): " + tet.get_errmsg());
                    continue; /* try next page */
                }

                /* Retrieve all text fragments for the page */
                String text;
                while ((text = tet.get_text(page)) != null)
                {
                    /* In this sample we check only the first character of
                     * each fragment, so a single get_char_info() call is
                     * enough.
                     */
                    if (tet.get_char_info(page) != -1)
                    {
                        /* We need only the font name and size; the text
                         * position could be fetched from tet.x and tet.y.
                         */
                        String fontname = tet.pcos_get_string(doc,
                                                              "fonts[" + tet.fontid + "]/name");

                        /* Check whether we found a match */
                        if (tet.fontsize >= fontsizetrigger &&
                            fontname.IndexOf(fontnametrigger) != -1)
                        {
                            /* print the retrieved font name, size, and text */
                            Console.WriteLine("[{0} {1:0.00}] {2}", fontname,
                                              tet.fontsize, text);
                        }
                    }
                }

                if (tet.get_errnum() != 0)
                {
                    Console.WriteLine("Error " + tet.get_errnum() + " in "
                                      + tet.get_apiname() + "(): " + tet.get_errmsg());
                }

                tet.close_page(page);
            }

            tet.close_document(doc);
        }
        catch (TETException e)
        {
            /* pageno is still 0 when the failure happened before the page loop */
            if (pageno == 0)
            {
                Console.WriteLine("Error " + e.get_errnum() + " in "
                                  + e.get_apiname() + "(): " + e.get_errmsg() + "\n");
            }
            else
            {
                Console.WriteLine("Error " + e.get_errnum() + " in "
                                  + e.get_apiname() + "() on page " + pageno + ": "
                                  + e.get_errmsg() + "\n");
            }
        }
        catch (Exception e)
        {
            Console.WriteLine("General Exception: " + e.ToString());
        }
        finally
        {
            /* tet is still null when the TET constructor itself failed */
            if (tet != null)
            {
                tet.Dispose();
            }
        }
    }