/**
 * Print the general properties of a document and, unless the document
 * was opened in pCOS minimum mode, continue with the details printed by
 * print_userpassword_infos().
 *
 * @param tet The TET object
 * @param doc The TET document handle
 *
 * @throws TETException
 */
private static void print_infos(TET tet, int doc)
{
    /* The pCOS mode decides at the end how much more can be reported */
    int mode = (int)tet.pcos_get_number(doc, "pcosmode");

    /* --------- general information (always available) */
    string fileName = tet.pcos_get_string(doc, "filename");
    Console.WriteLine(" File name: " + fileName);

    string pdfVersion = tet.pcos_get_string(doc, "pdfversionstring");
    Console.WriteLine(" PDF version: " + pdfVersion);

    string encryption = tet.pcos_get_string(doc, "encrypt/description");
    Console.WriteLine(" Encryption: " + encryption);

    bool hasMasterPw = tet.pcos_get_number(doc, "encrypt/master") != 0;
    Console.WriteLine(" Master pw: " + (hasMasterPw ? "yes" : "no"));

    bool hasUserPw = tet.pcos_get_number(doc, "encrypt/user") != 0;
    Console.WriteLine(" User pw: " + (hasUserPw ? "yes" : "no"));

    /* "nocopy" is a restriction, hence the inverted yes/no wording */
    bool copyForbidden = tet.pcos_get_number(doc, "encrypt/nocopy") != 0;
    Console.WriteLine("Text copying: " + (copyForbidden ? "no" : "yes"));

    bool linearized = tet.pcos_get_number(doc, "linearized") != 0;
    Console.WriteLine(" Linearized: " + (linearized ? "yes" : "no"));

    if (mode == 0)
    {
        Console.WriteLine("Minimum mode: no more information available\n\n");
        return;
    }

    print_userpassword_infos(tet, doc, mode);
}
/*
 * Print one line describing an image resource:
 *   - pCOS id (the index into the images[] array)
 *   - pixel size of the underlying PDF Image XObject
 *   - number of components, bits per component, and colorspace
 *     (plus the base colorspace for Indexed colorspaces)
 *   - "mergetype=artificial" if the mergetype value is 1
 *   - "stencilmask" property, i.e. /ImageMask in PDF
 *   - pCOS id of the mask image, i.e. /Mask or /SMask, if any
 */
static void report_image_info(TET tet, int doc, int imageid)
{
    /* All image properties live below this pCOS path */
    String image = "images[" + imageid + "]";

    int pixelWidth = (int)tet.pcos_get_number(doc, image + "/Width");
    int pixelHeight = (int)tet.pcos_get_number(doc, image + "/Height");
    int bitsPerComponent = (int)tet.pcos_get_number(doc, image + "/bpc");
    int colorspaceId = (int)tet.pcos_get_number(doc, image + "/colorspaceid");

    String colorspace = "colorspaces[" + colorspaceId + "]";
    int componentCount = (int)tet.pcos_get_number(doc, colorspace + "/components");

    Console.Write("image {0}: {1}x{2} pixel, ", imageid, pixelWidth, pixelHeight);

    String colorspaceName = tet.pcos_get_string(doc, colorspace + "/name");
    Console.Write("{0}x{1} bit {2}", componentCount, bitsPerComponent, colorspaceName);

    if ("Indexed".Equals(colorspaceName))
    {
        /* For Indexed colorspaces also name the underlying base colorspace */
        int baseId = (int)tet.pcos_get_number(doc, colorspace + "/baseid");
        String baseName = tet.pcos_get_string(doc, "colorspaces[" + baseId + "]/name");
        Console.Write(" " + baseName);
    }

    /* Check whether this image has been created by merging smaller images */
    if ((int)tet.pcos_get_number(doc, image + "/mergetype") == 1)
    {
        Console.Write(", mergetype=artificial");
    }

    if ((int)tet.pcos_get_number(doc, image + "/stencilmask") == 1)
    {
        Console.Write(", used as stencil mask");
    }

    /* Check whether the image has an attached mask (-1 means none) */
    int maskId = (int)tet.pcos_get_number(doc, image + "/maskid");
    if (maskId != -1)
    {
        Console.Write(", masked with image " + maskId);
    }

    Console.WriteLine();
}
/**
 * Entry point of the "dumper" sample: open the PDF named on the command
 * line in pCOS minimum mode and print its properties via print_infos().
 *
 * @param args Command line; exactly one argument, the PDF file name
 *
 * @return 0 on success, 2 on wrong usage, 1 if the document could not be
 *         opened or an exception occurred
 */
static int Main(String[] args)
{
    string searchpath = "{../data} {../../data}";

    if (args.Length != 1)
    {
        Console.WriteLine("usage: dumper <filename>");
        return 2;
    }

    int exitstat = 0;
    TET tet = null;

    try
    {
        tet = new TET();

        /* "minimum" lets the document open even without a password */
        String docoptlist = "requiredmode=minimum";
        String globaloptlist = "";

        tet.set_option("searchpath={" + searchpath + "}");
        tet.set_option(globaloptlist);

        int doc = tet.open_document(args[0], docoptlist);
        if (doc == -1)
        {
            Console.WriteLine("ERROR: " + tet.get_errmsg());
            /* A document that cannot be opened must not look like success */
            exitstat = 1;
        }
        else
        {
            print_infos(tet, doc);
            tet.close_document(doc);
        }
    }
    catch (TETException e)
    {
        Console.WriteLine("Error " + e.get_errnum() + " in "
            + e.get_apiname() + "(): " + e.get_errmsg());
        exitstat = 1;
    }
    catch (Exception e)
    {
        Console.WriteLine("General Exception: " + e.ToString());
        exitstat = 1;
    }
    finally
    {
        if (tet != null)
        {
            tet.Dispose();
        }
    }

    return exitstat;
}
/**
 * Print the document properties that require at least the user password:
 * PDF/X and PDF/A status, XFA and tagging flags, page count, size of the
 * first page and the list of fonts. Afterwards continue with
 * print_masterpassword_infos() unless the document is in restricted mode.
 *
 * @param tet The tet object
 * @param doc The tet document handle
 * @param pcosmode The pCOS mode for the document
 *
 * @throws TETException
 */
private static void print_userpassword_infos(TET tet, int doc, int pcosmode)
{
    String pdfx = tet.pcos_get_string(doc, "pdfx");
    Console.WriteLine("PDF/X status: " + pdfx);

    String pdfa = tet.pcos_get_string(doc, "pdfa");
    Console.WriteLine("PDF/A status: " + pdfa);

    bool hasXfa = tet.pcos_get_number(doc, "type:/Root/AcroForm/XFA") != 0;
    Console.WriteLine(" XFA data: " + (hasXfa ? "yes" : "no"));

    bool isTagged = tet.pcos_get_number(doc, "tagged") != 0;
    Console.WriteLine(" Tagged PDF: " + (isTagged ? "yes" : "no"));
    Console.WriteLine();

    int pageTotal = (int)tet.pcos_get_number(doc, "length:pages");
    Console.WriteLine("No. of pages: " + pageTotal);

    double firstWidth = tet.pcos_get_number(doc, "pages[0]/width");
    double firstHeight = tet.pcos_get_number(doc, "pages[0]/height");
    Console.WriteLine(" Page 1 size: width=" + firstWidth + ", height=" + firstHeight);

    int fontTotal = (int)tet.pcos_get_number(doc, "length:fonts");
    Console.WriteLine("No. of fonts: " + fontTotal);

    /* One output line per font: embedding status, font type, font name */
    int fontIndex = 0;
    while (fontIndex < fontTotal)
    {
        String font = "fonts[" + fontIndex + "]";

        bool embedded = tet.pcos_get_number(doc, font + "/embedded") != 0;
        Console.Write(embedded ? "embedded " : "unembedded ");

        Console.Write(tet.pcos_get_string(doc, font + "/type") + " font ");
        Console.WriteLine(tet.pcos_get_string(doc, font + "/name"));

        fontIndex++;
    }

    Console.WriteLine();

    bool plainmetadata = tet.pcos_get_number(doc, "encrypt/plainmetadata") != 0;

    /*
     * Everything else is withheld only if all three conditions hold:
     * pCOS mode 1, encrypted metadata, and text copying not allowed.
     */
    if (pcosmode != 1 || plainmetadata
        || tet.pcos_get_number(doc, "encrypt/nocopy") == 0)
    {
        print_masterpassword_infos(tet, doc);
    }
    else
    {
        Console.WriteLine("Restricted mode: no more information available");
    }
}
/**
 * Write the text of all pages of an already opened document to the
 * output file, encoded with the UnicodeEncoding created below. Pages
 * that cannot be opened are reported on the console and skipped.
 *
 * @param tet
 *            The TET object
 * @param doc
 *            A valid TET document handle
 * @param outfp
 *            Output file handle
 *
 * @throws TETException
 * @throws IOException
 */
static void extract_text(TET tet, int doc, BinaryWriter outfp)
{
    UnicodeEncoding encoder = new UnicodeEncoding(false, true);

    /* Get number of pages in the document. */
    int pageTotal = (int)tet.pcos_get_number(doc, "length:pages");

    /* Page numbers passed to open_page() start at 1 */
    int pageno = 1;
    while (pageno <= pageTotal)
    {
        int page = tet.open_page(doc, pageno, pageoptlist);

        if (page != -1)
        {
            /*
             * Retrieve all text fragments; this loop is actually not
             * required for granularity=page, but must be used for other
             * granularities.
             */
            for (;;)
            {
                String fragment = tet.get_text(page);
                if (fragment == null)
                {
                    break;
                }

                /* the retrieved text, followed by the chunk separator */
                outfp.Write(encoder.GetBytes(fragment));
                outfp.Write(encoder.GetBytes(separator));
            }

            /* get_text() returned null; check the error number too */
            if (tet.get_errnum() != 0)
            {
                Console.WriteLine("Error " + tet.get_errnum() + " in "
                    + tet.get_apiname() + "() on page " + pageno + ": "
                    + tet.get_errmsg());
            }

            tet.close_page(page);
        }
        else
        {
            /* Report the problem and try the next page */
            Console.WriteLine("Error " + tet.get_errnum() + " in "
                + tet.get_apiname() + "() on page " + pageno + ": "
                + tet.get_errmsg());
        }

        ++pageno;
    }
}
/**
 * Print document info keys and XMP metadata (requires master pw or
 * plaintext metadata).
 *
 * Each entry of the /Info dictionary is printed as "key: 'value'" with
 * the key right-aligned in a 12 character column; entries that are
 * neither strings nor names are printed with their pCOS type instead.
 * For the XMP stream only its size in bytes and in decoded characters
 * is printed.
 *
 * @param tet The TET object
 * @param doc The TET document handle
 *
 * @throws TETException
 */
private static void print_masterpassword_infos(TET tet, int doc)
{
    String objtype;
    int count = (int)tet.pcos_get_number(doc, "length:/Info");

    for (int i = 0; i < count; i++)
    {
        String entry = "/Info[" + i + "]";

        objtype = tet.pcos_get_string(doc, "type:" + entry);
        String key = tet.pcos_get_string(doc, entry + ".key");

        /*
         * key.PadLeft(12) right-aligns short keys exactly like the former
         * String.Empty.PadLeft(12 - key.Length) + key, but does not throw
         * ArgumentOutOfRangeException for keys longer than 12 characters.
         */
        Console.Write(key.PadLeft(12) + ": ");

        /* Info entries can be stored as string or name objects */
        if (objtype == "string" || objtype == "name")
        {
            Console.WriteLine("'" + tet.pcos_get_string(doc, entry) + "'");
        }
        else
        {
            /* objtype already holds the value of "type:/Info[i]" */
            Console.WriteLine("(" + objtype + "object)");
        }
    }

    Console.WriteLine();
    Console.Write("XMP meta data: ");

    objtype = tet.pcos_get_string(doc, "type:/Root/Metadata");
    if (objtype == "stream")
    {
        byte[] contents = tet.pcos_get_stream(doc, "", "/Root/Metadata");
        Console.Write(contents.Length + " bytes ");

        /* Decode the stream as UTF-8 to count characters instead of bytes */
        UTF8Encoding utf8 = new UTF8Encoding();
        String str = utf8.GetString(contents);
        Console.WriteLine("(" + str.Length + " Unicode characters)");
    }
    else
    {
        Console.WriteLine("not present\n\n");
    }
}
/**
 * Process a single, already opened document: extract its page text, then
 * descend into every document-level and page-level file attachment by
 * handing the attachment data to the four-argument process_document()
 * overload. The document is closed at the end.
 *
 * @param outfp Output stream for messages
 * @param tet The TET object
 * @param doc The TET document handle
 *
 * @throws TETException
 * @throws IOException
 */
private static void process_document(BinaryWriter outfp, TET tet, int doc)
{
    UnicodeEncoding encoder = new UnicodeEncoding(false, true);

    // -------------------- Extract the document's own page contents
    extract_text(tet, doc, outfp);

    // -------------------- Process all document-level file attachments
    int embeddedTotal = (int)tet.pcos_get_number(doc, "length:names/EmbeddedFiles");

    for (int index = 0; index < embeddedTotal; index++)
    {
        String entry = "names/EmbeddedFiles[" + index + "]";

        /*
         * Fetch the name of the file attachment; prefer the Unicode file
         * name (a PDF 1.7 feature), then the plain file name.
         */
        String label = "(unnamed)";
        if (tet.pcos_get_string(doc, "type:" + entry + "/UF") == "string")
        {
            label = tet.pcos_get_string(doc, entry + "/UF");
        }
        else if (tet.pcos_get_string(doc, "type:" + entry + "/F") == "string")
        {
            label = tet.pcos_get_string(doc, entry + "/F");
        }

        /* Only attachments that are stored as a stream can be processed */
        if (tet.pcos_get_string(doc, "type:" + entry + "/EF/F") != "stream")
        {
            continue;
        }

        outfp.Write(encoder.GetBytes("----- File attachment '" + label + "':\n"));

        byte[] payload = tet.pcos_get_stream(doc, "", entry + "/EF/F");
        process_document(outfp, null, label, payload);

        outfp.Write(encoder.GetBytes("----- End file attachment '" + label + "'\n"));
    }

    // -------------------- Process all page-level file attachments
    int pageTotal = (int)tet.pcos_get_number(doc, "length:pages");

    // Check all pages for annotations of type FileAttachment
    for (int pageIndex = 0; pageIndex < pageTotal; pageIndex++)
    {
        String annots = "pages[" + pageIndex + "]/Annots";
        int annotTotal = (int)tet.pcos_get_number(doc, "length:" + annots);

        for (int annotIndex = 0; annotIndex < annotTotal; annotIndex++)
        {
            String annotation = annots + "[" + annotIndex + "]";

            String subtype = tet.pcos_get_string(doc, annotation + "/Subtype");
            if (subtype != "FileAttachment")
            {
                continue;
            }

            /* Fetch the contents of the attachment and process it */
            String streamPath = annotation + "/FS/EF/F";
            if (tet.pcos_get_string(doc, "type:" + streamPath) != "stream")
            {
                continue;
            }

            /* Page attachments are labelled by 1-based position */
            String label = "page " + (pageIndex + 1)
                + ", annotation " + (annotIndex + 1);

            outfp.Write(encoder.GetBytes("----- Page level attachment '" + label + "':\n"));

            byte[] payload = tet.pcos_get_stream(doc, "", streamPath);
            process_document(outfp, null, label, payload);

            outfp.Write(encoder.GetBytes("----- End page level attachment '" + label + "'\n"));
        }
    }

    tet.close_document(doc);
}
/**
 * Open a named physical or virtual file, extract the text from it, search
 * for document or page attachments, and process these recursively.
 * Either filename is supplied for a physical file, or data from which a
 * virtual (PVF) file is created here. A separate TET object is created
 * for every call, so an exception raised for an embedded document does
 * not affect the TET object of the caller.
 *
 * @param outfp
 *            Output file handle
 * @param filename
 *            Name of a physical file; null or empty to use data instead
 * @param realname
 *            Name used in error messages
 * @param data
 *            File contents, used only when no filename is given
 *
 * @return 0 if successful, otherwise a non-zero code to be used as exit
 *         status
 */
static int process_document(BinaryWriter outfp, String filename,
                            String realname, byte[] data)
{
    const string VirtualName = "/pvf/attachment";

    int status = 0;
    TET tet = null;

    try
    {
        tet = new TET();

        /* Construct a PVF file if data instead of a filename was provided */
        String target = filename;
        if (String.IsNullOrEmpty(target))
        {
            tet.create_pvf(VirtualName, data, "");
            target = VirtualName;
        }

        tet.set_option(globaloptlist);

        int doc = tet.open_document(target, docoptlist);
        if (doc != -1)
        {
            process_document(outfp, tet, doc);
        }
        else
        {
            Console.WriteLine("Error " + tet.get_errnum() + " in "
                + tet.get_apiname() + "() (source: attachment '"
                + realname + "'): " + tet.get_errmsg());
            status = 5;
        }

        /* If there was no PVF file deleting it won't do any harm */
        tet.delete_pvf(VirtualName);
    }
    catch (TETException e)
    {
        Console.WriteLine("Error " + e.get_errnum() + " in "
            + e.get_apiname() + "() (source: attachment '"
            + realname + "'): " + e.get_errmsg());
        status = 1;
    }
    catch (Exception e)
    {
        Console.WriteLine("General Exception: " + e.ToString());
        status = 1;
    }
    finally
    {
        if (tet != null)
        {
            tet.Dispose();
        }
    }

    return status;
}
/**
 * Entry point of the page-based image extraction sample: for every page
 * report each placed image (pixel data via report_image_info() plus the
 * placement geometry) and write the image, and its mask if present, to
 * files named after the input file.
 *
 * @param args Command line; exactly one argument, the PDF file name
 *
 * @return 0 on success, 2 on wrong usage or on any error
 */
static int Main(string[] args)
{
    /* global option list */
    string globaloptlist = "searchpath={{../data} {../../data}}";

    /* document-specific option list */
    string docoptlist = "";

    /* page-specific option list e.g
     * "imageanalysis={merge={gap=1} smallimages={maxwidth=20}}"
     */
    string pageoptlist = "";

    TET tet;
    int pageno = 0;
    string outfilebase;

    if (args.Length != 1)
    {
        Console.WriteLine("usage: image_resources <filename>");
        return(2);
    }

    /*
     * Strip a trailing ".pdf"/".PDF" to get the base name of the output
     * files. The length test must guard BOTH suffix tests: the former
     * "A && B || C" form evaluated Substring(Length - 4) for names
     * shorter than four characters and threw.
     */
    outfilebase = args[0];
    if (outfilebase.Length > 4
        && (outfilebase.EndsWith(".pdf", StringComparison.Ordinal)
            || outfilebase.EndsWith(".PDF", StringComparison.Ordinal)))
    {
        outfilebase = outfilebase.Substring(0, outfilebase.Length - 4);
    }

    tet = new TET();

    try
    {
        int n_pages;

        tet.set_option(globaloptlist);

        int doc = tet.open_document(args[0], docoptlist);
        if (doc == -1)
        {
            Console.WriteLine("Error {0} in {1}(): {2}",
                tet.get_errnum(), tet.get_apiname(), tet.get_errmsg());
            return(2);
        }

        /* Get number of pages in the document */
        n_pages = (int)tet.pcos_get_number(doc, "length:pages");

        /* Loop over pages and extract images */
        for (pageno = 1; pageno <= n_pages; ++pageno)
        {
            int page;
            int imagecount = 0;

            page = tet.open_page(doc, pageno, pageoptlist);
            if (page == -1)
            {
                Console.WriteLine("Error {0} in {1}() on page {2}: {3}",
                    tet.get_errnum(), tet.get_apiname(), pageno,
                    tet.get_errmsg());
                continue; /* try next page */
            }

            /* Retrieve all images on the page */
            while ((tet.get_image_info(page)) == 1)
            {
                String imageoptlist;
                int maskid;

                imagecount++;

                /* Report image details: pixel geometry, color space etc. */
                report_image_info(tet, doc, tet.imageid);

                /* Report placement geometry */
                Console.WriteLine(" placed on page " + pageno
                    + " at position (" + tet.x.ToString("f2") + ", "
                    + tet.y.ToString("f2") + "): "
                    + (int)tet.width + "x" + (int)tet.height
                    + "pt, alpha=" + tet.alpha + ", beta=" + tet.beta);

                /* Write image data to file */
                imageoptlist = "filename={" + outfilebase + "_p" + pageno
                    + "_" + imagecount + "_I" + tet.imageid + "}";

                if (tet.write_image_file(doc, tet.imageid, imageoptlist) == -1)
                {
                    Console.WriteLine("\nError [" + tet.get_errnum()
                        + " in " + tet.get_apiname() + "(): "
                        + tet.get_errmsg());
                    continue; /* try next image */
                }

                /* Check whether the image has a mask attached... */
                maskid = (int)tet.pcos_get_number(doc,
                    "images[" + tet.imageid + "]/maskid");

                /* and retrieve it if present */
                if (maskid != -1)
                {
                    Console.WriteLine(" masked with ");
                    report_image_info(tet, doc, maskid);

                    imageoptlist = "filename={" + outfilebase + "_p" + pageno
                        + "_" + imagecount + "_I" + tet.imageid
                        + "mask_I" + maskid + "}";

                    /*
                     * Write the MASK image: passing tet.imageid here
                     * stored the base image a second time under the
                     * mask file name.
                     */
                    if (tet.write_image_file(doc, maskid, imageoptlist) == -1)
                    {
                        Console.WriteLine("\nError [" + tet.get_errnum()
                            + " in " + tet.get_apiname()
                            + "() for mask image: " + tet.get_errmsg());
                        continue; /* try next image */
                    }
                }

                if (tet.get_errnum() != 0)
                {
                    Console.WriteLine("Error {0} in {1}() on page {2}: {3}",
                        tet.get_errnum(), tet.get_apiname(), pageno,
                        tet.get_errmsg());
                }
            }

            tet.close_page(page);
        }

        tet.close_document(doc);
    }
    catch (TETException e)
    {
        /* caught exception thrown by TET */
        Console.WriteLine("Error {0} in {1}(): {2}",
            e.get_errnum(), e.get_apiname(), e.get_errmsg());
        return(2);
    }
    catch (Exception e)
    {
        Console.WriteLine("General Exception: " + e.ToString());
        return(2);
    }
    finally
    {
        if (tet != null)
        {
            tet.Dispose();
        }
    }

    return(0);
}
/**
 * Entry point of the resource-based image extraction sample: open every
 * page once (which triggers image merging), then walk the images[]
 * array of the document, report each image via report_image_info() and
 * write it to a file named after the input file.
 *
 * @param args Command line; exactly one argument, the PDF file name
 *
 * @return 0 on success, 2 on wrong usage or on any error
 */
static int Main(string[] args)
{
    /* global option list */
    string globaloptlist = "searchpath={{../data} {../../data}}";

    /* document-specific option list */
    string docoptlist = "";

    /* page-specific option list e.g
     * "imageanalysis={merge={gap=1} smallimages={maxwidth=20}}"
     */
    string pageoptlist = "";

    TET tet;
    int pageno = 0;
    string outfilebase;

    if (args.Length != 1)
    {
        Console.WriteLine("usage: image_resources <filename>");
        return(2);
    }

    /*
     * Strip a trailing ".pdf"/".PDF" to get the base name of the output
     * files. The length test must guard BOTH suffix tests: the former
     * "A && B || C" form evaluated Substring(Length - 4) for names
     * shorter than four characters and threw.
     */
    outfilebase = args[0];
    if (outfilebase.Length > 4
        && (outfilebase.EndsWith(".pdf", StringComparison.Ordinal)
            || outfilebase.EndsWith(".PDF", StringComparison.Ordinal)))
    {
        outfilebase = outfilebase.Substring(0, outfilebase.Length - 4);
    }

    tet = new TET();

    try
    {
        int n_pages;

        tet.set_option(globaloptlist);

        int doc = tet.open_document(args[0], docoptlist);
        if (doc == -1)
        {
            Console.WriteLine("Error {0} in {1}(): {2}",
                tet.get_errnum(), tet.get_apiname(), tet.get_errmsg());
            return(2);
        }

        /* Images will only be merged upon opening a page.
         * In order to enumerate all merged image resources
         * we open all pages before extracting the images.
         */

        /* get number of pages in the document */
        n_pages = (int)tet.pcos_get_number(doc, "length:pages");

        /* Loop over all pages to trigger image merging */
        for (pageno = 1; pageno <= n_pages; ++pageno)
        {
            int page = tet.open_page(doc, pageno, pageoptlist);

            if (page == -1)
            {
                Console.WriteLine("Error {0} in {1}() on page {2}: {3}",
                    tet.get_errnum(), tet.get_apiname(), pageno,
                    tet.get_errmsg());
                continue; /* process next page */
            }

            if (tet.get_errnum() != 0)
            {
                Console.WriteLine("Error {0} in {1}() on page {2}: {3}",
                    tet.get_errnum(), tet.get_apiname(), pageno,
                    tet.get_errmsg());
            }

            tet.close_page(page);
        }

        int imageid, n_images;

        /* Get the number of images in the document */
        n_images = (int)tet.pcos_get_number(doc, "length:images");

        /* Loop over image resources in the document */
        for (imageid = 0; imageid < n_images; ++imageid)
        {
            string imageoptlist;

            /* Skip images which have been consumed by merging */
            int mergetype = (int)tet.pcos_get_number(doc,
                "images[" + imageid + "]/mergetype");
            if (mergetype == 2)
            {
                continue;
            }

            /* Skip small images (see "smallimages" option) */
            if (tet.pcos_get_number(doc, "images[" + imageid + "]/small") > 0)
            {
                continue;
            }

            /* Report image details: pixel geometry, color space etc. */
            report_image_info(tet, doc, imageid);

            /* Write image data to file */
            imageoptlist = " filename={" + outfilebase + "_I" + imageid + "}";

            if (tet.write_image_file(doc, imageid, imageoptlist) == -1)
            {
                Console.WriteLine("Error {0} in {1}(): {2}",
                    tet.get_errnum(), tet.get_apiname(), tet.get_errmsg());
                continue; /* process next image */
            }
        }

        tet.close_document(doc);
    }
    catch (TETException e)
    {
        /* caught exception thrown by TET */
        Console.WriteLine("Error {0} in {1}(): {2}",
            e.get_errnum(), e.get_apiname(), e.get_errmsg());
        return(2);
    }
    catch (Exception e)
    {
        Console.WriteLine("General Exception: " + e.ToString());
        return(2);
    }
    finally
    {
        if (tet != null)
        {
            tet.Dispose();
        }
    }

    return(0);
}
/**
 * Entry point of the "extractor" sample: extract the text of all pages
 * of the input PDF and write it, preceded by the preamble of the
 * UnicodeEncoding created below, to the output file.
 *
 * @param args Command line: input PDF file name, output file name
 *
 * @return 0 on success, 2 on wrong usage or on any error
 */
static int Main(string[] args)
{
    /* global option list */
    string globaloptlist = "searchpath={{../data} {../../data}}";

    /* document-specific option list */
    string docoptlist = "";

    /* page-specific option list */
    string pageoptlist = "granularity=page";

    /* separator to emit after each chunk of text. This depends on the
     * application's needs; for granularity=word a space character may be
     * useful.
     */
    string separator = "\n";

    TET tet;
    FileStream outfile;
    BinaryWriter w;
    int pageno = 0;

    UnicodeEncoding unicode = new UnicodeEncoding(false, true);
    Byte[] byteOrderMark = unicode.GetPreamble();

    if (args.Length != 2)
    {
        Console.WriteLine("usage: extractor <infilename> <outfilename>");
        return(2);
    }

    outfile = File.Create(args[1]);
    w = new BinaryWriter(outfile);
    w.Write(byteOrderMark);

    tet = new TET();

    try
    {
        int n_pages;

        tet.set_option(globaloptlist);

        int doc = tet.open_document(args[0], docoptlist);
        if (doc == -1)
        {
            Console.WriteLine("Error {0} in {1}(): {2}",
                tet.get_errnum(), tet.get_apiname(), tet.get_errmsg());
            return(2);
        }

        /* get number of pages in the document */
        n_pages = (int)tet.pcos_get_number(doc, "length:pages");

        /* loop over pages in the document */
        for (pageno = 1; pageno <= n_pages; ++pageno)
        {
            string text;
            int page;

            page = tet.open_page(doc, pageno, pageoptlist);
            if (page == -1)
            {
                Console.WriteLine("Error {0} in {1}() on page {2}: {3}",
                    tet.get_errnum(), tet.get_apiname(), pageno,
                    tet.get_errmsg());
                continue; /* try next page */
            }

            /* Retrieve all text fragments; This is actually not required
             * for granularity=page, but must be used for other
             * granularities.
             */
            while ((text = tet.get_text(page)) != null)
            {
                /* print the retrieved text */
                w.Write(unicode.GetBytes(text));

                /* print a separator between chunks of text */
                w.Write(unicode.GetBytes(separator));
            }

            if (tet.get_errnum() != 0)
            {
                /*
                 * The former format string used {3} with only three
                 * arguments, which throws FormatException exactly when
                 * an error should be reported. Include the page number
                 * like the open_page() message above.
                 */
                Console.WriteLine("Error {0} in {1}() on page {2}: {3}",
                    tet.get_errnum(), tet.get_apiname(), pageno,
                    tet.get_errmsg());
            }

            tet.close_page(page);
        }

        tet.close_document(doc);
    }
    catch (TETException e)
    {
        /* caught exception thrown by TET */
        Console.WriteLine("Error {0} in {1}(): {2}",
            e.get_errnum(), e.get_apiname(), e.get_errmsg());
        /* A TET failure must not be reported as success */
        return(2);
    }
    catch (Exception e)
    {
        Console.WriteLine("General Exception: " + e.ToString());
        return(2);
    }
    finally
    {
        /* Closing the writer flushes it and closes the file stream too */
        w.Close();

        if (tet != null)
        {
            tet.Dispose();
        }
    }

    return(0);
}
/**
 * Entry point of the "glyphinfo" sample: for every page write each text
 * fragment to the UTF-8 output file, followed by one line per glyph with
 * its Unicode value, font name, size, position, color id (with the
 * color value from print_color_value() whenever the color id changes),
 * type and attribute flags.
 *
 * @param args Command line: input PDF file name, output file name
 */
public static void Main(String[] args)
{
    /* Global option list. */
    string globaloptlist = "searchpath={{../data} {../../data}}";

    /* Document specific option list. */
    string docoptlist = "";

    /* Page-specific option list. */
    string pageoptlist = "granularity=word";

    /* Bit flags tested against the "attributes" member */
    const int ATTR_NONE = 0;
    const int ATTR_SUB = 1;
    const int ATTR_SUP = 2;
    const int ATTR_DROPCAP = 4;
    const int ATTR_SHADOW = 8;
    const int ATTR_DH_PRE = 16;
    const int ATTR_DH_ARTF = 32;
    const int ATTR_DH_POST = 64;

    FileStream outfile;
    StreamWriter outfp;

    if (args.Length != 2)
    {
        Console.WriteLine("usage: glyphinfo <infilename> <outfilename>");
        return;
    }

    outfile = File.Create(args[1]);
    outfp = new StreamWriter(outfile, System.Text.Encoding.UTF8);

    TET tet = null;

    try
    {
        tet = new TET();
        tet.set_option(globaloptlist);

        int doc = tet.open_document(args[0], docoptlist);
        if (doc == -1)
        {
            Console.WriteLine("Error " + tet.get_errnum() + " in "
                + tet.get_apiname() + "(): " + tet.get_errmsg());
            return;
        }

        /* get number of pages in the document */
        int n_pages = (int)tet.pcos_get_number(doc, "length:pages");

        /* Loop over pages in the document */
        for (int pageno = 1; pageno <= n_pages; ++pageno)
        {
            string text;
            int page;
            int previouscolor = -1;

            page = tet.open_page(doc, pageno, pageoptlist);
            if (page == -1)
            {
                Console.WriteLine("Error " + tet.get_errnum() + " in "
                    + tet.get_apiname() + "() on page " + pageno + ": "
                    + tet.get_errmsg());
                continue; /* try next page */
            }

            /* Administrative information */
            outfp.WriteLine("[ Document: '"
                + tet.pcos_get_string(doc, "filename") + "' ]");
            outfp.WriteLine("[ Document options: '" + docoptlist + "' ]");
            outfp.WriteLine("[ Page options: '" + pageoptlist + "' ]");
            outfp.WriteLine("[ ----- Page " + pageno + " ----- ]");

            /* Retrieve all text fragments */
            while ((text = tet.get_text(page)) != null)
            {
                /* print the retrieved text */
                outfp.WriteLine("[" + text + "]");

                /* Loop over all glyphs and print their details */
                while (tet.get_char_info(page) != -1)
                {
                    string str;
                    string fontname;

                    /* Fetch the font name with pCOS (based on its ID) */
                    fontname = tet.pcos_get_string(doc,
                        "fonts[" + tet.fontid + "]/name");

                    /* Print the character */
                    str = "U+" + tet.uv.ToString("X4");

                    /*
                     * ...and the character itself. This must be plain
                     * concatenation: passing the text through
                     * String.Format() throws FormatException when the
                     * glyph is '{' or '}'.
                     */
                    str = str + " '" + (char)(tet.uv) + "'";

                    /* Print font name, size, and position */
                    str = str + String.Format(" {0} size={1} x={2} y={3}",
                        fontname, tet.fontsize.ToString("f2"),
                        tet.x.ToString("f2"), tet.y.ToString("f2"));

                    /* Print the color id */
                    str = str + String.Format(" colorid={0}", tet.colorid);

                    /* check whether the text color changes */
                    if (tet.colorid != previouscolor)
                    {
                        str = print_color_value(str, tet, doc, tet.colorid);
                        previouscolor = tet.colorid;
                    }

                    /* Examine the "type" member */
                    if (tet.type == 1)
                    {
                        str = str + " ligature_start";
                    }
                    else if (tet.type == 10)
                    {
                        str = str + " ligature_cont";
                    }
                    /* Separators are only inserted for granularity > word */
                    else if (tet.type == 12)
                    {
                        str = str + " inserted";
                    }

                    /* Examine the bit flags in the "attributes" member */
                    if (tet.attributes != ATTR_NONE)
                    {
                        if ((tet.attributes & ATTR_SUB) == ATTR_SUB)
                        {
                            str = str + "/sub";
                        }
                        if ((tet.attributes & ATTR_SUP) == ATTR_SUP)
                        {
                            str = str + "/sup";
                        }
                        if ((tet.attributes & ATTR_DROPCAP) == ATTR_DROPCAP)
                        {
                            str = str + "/dropcap";
                        }
                        if ((tet.attributes & ATTR_SHADOW) == ATTR_SHADOW)
                        {
                            str = str + "/shadow";
                        }
                        if ((tet.attributes & ATTR_DH_PRE) == ATTR_DH_PRE)
                        {
                            str = str + "/dehyphenation_pre";
                        }
                        if ((tet.attributes & ATTR_DH_ARTF) == ATTR_DH_ARTF)
                        {
                            str = str + "/dehyphenation_artifact";
                        }
                        if ((tet.attributes & ATTR_DH_POST) == ATTR_DH_POST)
                        {
                            str = str + "/dehyphenation_post";
                        }
                    }

                    outfp.WriteLine(str);
                }

                outfp.WriteLine("");
            }

            if (tet.get_errnum() != 0)
            {
                Console.WriteLine("Error " + tet.get_errnum() + " in "
                    + tet.get_apiname() + "() on page " + pageno + ": "
                    + tet.get_errmsg());
            }

            tet.close_page(page);
        }

        tet.close_document(doc);
    }
    catch (TETException e)
    {
        Console.WriteLine("Error " + e.get_errnum() + " in "
            + e.get_apiname() + "(): " + e.get_errmsg());
    }
    catch (Exception e)
    {
        Console.WriteLine("General Exception: " + e.ToString());
    }
    finally
    {
        if (tet != null)
        {
            tet.Dispose();
        }

        /*
         * StreamWriter buffers its output; without closing it the file
         * ends up truncated or empty. Close() also closes the stream.
         */
        outfp.Close();
    }
}
/**
 * Append a parenthesized description of the fill color with the given
 * color id to str: pattern kind (if any), colorspace name, colorspace
 * specific details (ICC profile, colorant names, Indexed base space) and
 * the color component values.
 *
 * @param str Text built so far; the description is appended to it
 * @param tet The TET object
 * @param doc The TET document handle
 * @param colorid The color id to describe
 *
 * @return str with the color description appended
 *
 * @throws TETException
 */
public static String print_color_value(string str, TET tet, int doc, int colorid)
{
    String csname; /* color space name */
    int i;

    /* We handle only the fill color, but ignore the stroke color.
     * The stroke color can be retrieved analogously with the
     * keyword "stroke".
     * NOTE(review): the return value is not needed here; the call
     * presumably fills tet.colorspaceid, tet.patternid and
     * tet.components, which are read below - confirm against the TET
     * API reference.
     */
    tet.get_color_info(doc, colorid, "usage=fill");

    if (tet.colorspaceid == -1 && tet.patternid == -1)
    {
        return(str + " (not filled)");
    }

    str = str + " (";

    if (tet.patternid != -1)
    {
        int patterntype = (int)tet.pcos_get_number(doc,
            "patterns[" + tet.patternid + "]/PatternType");

        if (patterntype == 1) /* Tiling pattern */
        {
            int painttype = (int)tet.pcos_get_number(doc,
                "patterns[" + tet.patternid + "]/PaintType");

            if (painttype == 1)
            {
                return(str + "colored Pattern)");
            }
            else if (painttype == 2)
            {
                str = str + "uncolored Pattern, base color: ";
                /* FALLTHROUGH to colorspaceid output */
            }
        }
        else if (patterntype == 2) /* Shading pattern */
        {
            int shadingtype = (int)tet.pcos_get_number(doc,
                "patterns[" + tet.patternid + "]/Shading/ShadingType");

            return(str + String.Format("shading Pattern, ShadingType={0})",
                shadingtype));
        }
    }

    csname = tet.pcos_get_string(doc,
        "colorspaces[" + tet.colorspaceid + "]/name");
    str = str + csname;

    /* Emit more details depending on the colorspace type */
    if (csname.Equals("ICCBased"))
    {
        int iccprofileid = (int)tet.pcos_get_number(doc,
            "colorspaces[" + tet.colorspaceid + "]/iccprofileid");

        String errormessage = tet.pcos_get_string(doc,
            "iccprofiles[" + iccprofileid + "]/errormessage");

        /*
         * Check whether the embedded profile is damaged: a non-empty
         * error message is printed instead of the profile details.
         * (The test used to be inverted, printing " ()" for intact
         * profiles and querying the details of damaged ones.)
         */
        if (!errormessage.Equals(""))
        {
            str = str + String.Format(" ({0})", errormessage);
        }
        else
        {
            String profilename = tet.pcos_get_string(doc,
                "iccprofiles[" + iccprofileid + "]/profilename");
            str = str + String.Format(" '{0}'", profilename);

            String profilecs = tet.pcos_get_string(doc,
                "iccprofiles[" + iccprofileid + "]/profilecs");
            str = str + String.Format(" '{0}'", profilecs);
        }
    }
    else if (csname.Equals("Separation"))
    {
        String colorantname = tet.pcos_get_string(doc,
            "colorspaces[" + tet.colorspaceid + "]/colorantname");
        str = str + String.Format(" '{0}'", colorantname);
    }
    else if (csname.Equals("DeviceN"))
    {
        str = str + " ";

        /* One colorant name per color component, separated by "/" */
        for (i = 0; i < tet.components.Length; i++)
        {
            String colorantname = tet.pcos_get_string(doc,
                "colorspaces[" + tet.colorspaceid + "]/colorantnames[" + i + "]");
            str = str + colorantname;

            if (i != tet.components.Length - 1)
            {
                str = str + "/";
            }
        }
    }
    else if (csname.Equals("Indexed"))
    {
        int baseid = (int)tet.pcos_get_number(doc,
            "colorspaces[" + tet.colorspaceid + "]/baseid");
        csname = tet.pcos_get_string(doc, "colorspaces[" + baseid + "]/name");
        str = str + String.Format(" {0}", csname);
    }

    /* Finally the component values themselves, separated by "/" */
    str = str + " ";
    for (i = 0; i < tet.components.Length; i++)
    {
        str = str + String.Format("{0}", tet.components[i]);

        if (i != tet.components.Length - 1)
        {
            str = str + "/";
        }
    }

    return(str + ")");
}
/**
 * Entry point of the "fontfilter" sample: print every text line whose
 * first character uses a font of at least fontsizetrigger size and with
 * fontnametrigger in its name.
 *
 * @param args Command line; exactly one argument, the PDF file name
 */
public static void Main(String[] args)
{
    /* Global option list. */
    string globaloptlist = "searchpath={{../data} {../../data}}";

    /* Document specific option list. */
    string docoptlist = "";

    /* Page-specific option list. */
    string pageoptlist = "granularity=line";

    /* Search text with at least this size (use 0 to catch all sizes). */
    double fontsizetrigger = 10;

    /* Catch text where the font name contains this string (use empty string
     * to catch all font names).
     */
    String fontnametrigger = "Bold";

    TET tet = null;
    int pageno = 0;

    if (args.Length != 1)
    {
        Console.WriteLine("usage: fontfilter <infilename>");
        return;
    }

    try
    {
        tet = new TET();
        tet.set_option(globaloptlist);

        int doc = tet.open_document(args[0], docoptlist);
        if (doc == -1)
        {
            Console.WriteLine("Error " + tet.get_errnum() + " in "
                + tet.get_apiname() + "(): " + tet.get_errmsg());
            return;
        }

        /* Loop over pages in the document */
        int n_pages = (int)tet.pcos_get_number(doc, "length:pages");

        for (pageno = 1; pageno <= n_pages; ++pageno)
        {
            int page = tet.open_page(doc, pageno, pageoptlist);

            if (page == -1)
            {
                Console.WriteLine("Error " + tet.get_errnum() + " in "
                    + tet.get_apiname() + "(): " + tet.get_errmsg());
                /* One bad page must not end the run (this was "return") */
                continue; /* try next page */
            }

            /* Retrieve all text fragments for the page */
            String text;
            while ((text = tet.get_text(page)) != null)
            {
                /* In this sample we check only the first character of
                 * each fragment, hence "if" rather than a loop.
                 */
                if (tet.get_char_info(page) != -1)
                {
                    /* We need only the font name and size; the text
                     * position could be fetched from tet.x and tet.y.
                     */
                    String fontname = tet.pcos_get_string(doc,
                        "fonts[" + tet.fontid + "]/name");

                    /* Check whether we found a match */
                    if (tet.fontsize >= fontsizetrigger
                        && fontname.IndexOf(fontnametrigger) != -1)
                    {
                        /* print the retrieved font name, size, and text */
                        Console.WriteLine("[{0} {1:0.00}] {2}",
                            fontname, tet.fontsize, text);
                    }
                }
            }

            if (tet.get_errnum() != 0)
            {
                Console.WriteLine("Error " + tet.get_errnum() + " in "
                    + tet.get_apiname() + "(): " + tet.get_errmsg());
            }

            tet.close_page(page);
        }

        tet.close_document(doc);
    }
    catch (TETException e)
    {
        /* pageno is still 0 if the failure happened before the page loop */
        if (pageno == 0)
        {
            Console.WriteLine("Error " + e.get_errnum() + " in "
                + e.get_apiname() + "(): " + e.get_errmsg() + "\n");
        }
        else
        {
            Console.WriteLine("Error " + e.get_errnum() + " in "
                + e.get_apiname() + "() on page " + pageno + ": "
                + e.get_errmsg() + "\n");
        }
    }
    catch (Exception e)
    {
        Console.WriteLine("General Exception: " + e.ToString());
    }
    finally
    {
        /* tet is still null if the TET constructor itself failed */
        if (tet != null)
        {
            tet.Dispose();
        }
    }
}
/**
 * Entry point of the "tetml" sample: convert the input PDF to TETML.
 * By default the TETML is written to the file named by the second
 * argument; with inmemory set to true it is fetched as a byte array,
 * parsed as XML, and the fonts and the word count are printed.
 *
 * @param args Command line: input PDF file name, output XML file name
 */
public static void Main(String[] args)
{
    /* Global option list. */
    string globaloptlist = "searchpath={{../data} {../../data}}";

    /* Document specific option list. */
    string basedocoptlist = "";

    /* Page-specific option list.
     * Remove the tetml= option if you don't need font and geometry
     * information.
     */
    string pageoptlist = "granularity=word tetml={glyphdetails={all}}";

    /* set this to true to generate TETML output in memory */
    bool inmemory = false;

    if (args.Length != 2)
    {
        Console.WriteLine("usage: tetml <pdffilename> <xmlfilename>");
        return;
    }

    TET tet = null;

    try
    {
        tet = new TET();
        tet.set_option(globaloptlist);

        /*
         * In-memory case: this program fetches the TETML data encoded in
         * UTF-8 and subsequently converts it to a .NET String, which is
         * encoded in UTF-16. While it is not strictly necessary in case
         * of this program, it is cleaner to instruct TET to put
         * 'encoding="UTF-16"' into the XML header.
         * Otherwise TET writes the TETML to the named output file.
         */
        String docoptlist = inmemory
            ? "tetml={encodingname=UTF-16} " + basedocoptlist
            : "tetml={filename={" + args[1] + "}} " + basedocoptlist;

        int doc = tet.open_document(args[0], docoptlist);
        if (doc == -1)
        {
            Console.WriteLine("Error " + tet.get_errnum() + " in "
                + tet.get_apiname() + "(): " + tet.get_errmsg());
            return;
        }

        int pageTotal = (int)tet.pcos_get_number(doc, "length:pages");

        /* Loop over pages in the document */
        int pageno = 1;
        while (pageno <= pageTotal)
        {
            tet.process_page(doc, pageno, pageoptlist);
            ++pageno;
        }

        /* This could be combined with the last page-related call. */
        tet.process_page(doc, 0, "tetml={trailer}");

        if (inmemory)
        {
            /* Get the XML document as a byte array. */
            byte[] rawTetml = tet.get_tetml(doc, "");
            if (rawTetml == null)
            {
                Console.WriteLine("tetml: couldn't retrieve XML data");
                return;
            }

            /* Parse the in-memory XML document to print out some
             * information about the fonts and words it contains.
             */
            String xmlText = new UTF8Encoding().GetString(rawTetml);

            XmlDocument xmldoc = new XmlDocument();
            xmldoc.LoadXml(xmlText);
            XmlElement root = xmldoc.DocumentElement;

            /* Create an XmlNamespaceManager for resolving namespaces. */
            XmlNamespaceManager nsmgr = new XmlNamespaceManager(xmldoc.NameTable);
            nsmgr.AddNamespace("tet", "http://www.pdflib.com/XML/TET5/TET-5.0");

            /* One output line per Font element: its name and type */
            XmlNodeList fontNodes = root.SelectNodes("//tet:Font", nsmgr);
            foreach (XmlNode font in fontNodes)
            {
                XmlAttributeCollection attributes = font.Attributes;
                XmlAttribute nameAttr = (XmlAttribute)attributes.GetNamedItem("name");
                XmlAttribute typeAttr = (XmlAttribute)attributes.GetNamedItem("type");

                Console.WriteLine("Font " + nameAttr.Value + " " + typeAttr.Value);
            }

            XmlNodeList wordNodes = root.SelectNodes("//tet:Word", nsmgr);
            Console.WriteLine("Found " + wordNodes.Count + " words in document");
        }

        tet.close_document(doc);
    }
    catch (TETException e)
    {
        Console.WriteLine("Error " + e.get_errnum() + " in "
            + e.get_apiname() + "(): " + e.get_errmsg());
    }
    catch (Exception e)
    {
        Console.WriteLine("General Exception: " + e.ToString());
    }
    finally
    {
        if (tet != null)
        {
            tet.Dispose();
        }
    }
}