/**
 * Write the text of every page of an already opened document to a binary
 * writer, encoded as little-endian UTF-16.
 *
 * Pages which cannot be opened are reported on the console and skipped.
 * The page option list ("pageoptlist") and the chunk separator
 * ("separator") are defined outside of this method.
 *
 * @param tet
 *            The TET object which owns the document
 * @param doc
 *            A valid TET document handle
 * @param outfp
 *            Writer which receives the encoded text
 */
static void extract_text(TET tet, int doc, BinaryWriter outfp)
{
    UnicodeEncoding utf16 = new UnicodeEncoding(false, true);

    /* Number of pages in the document, fetched via pCOS */
    int pageCount = (int)tet.pcos_get_number(doc, "length:pages");

    for (int pageno = 1; pageno <= pageCount; pageno++)
    {
        int page = tet.open_page(doc, pageno, pageoptlist);
        bool opened = page != -1;

        if (opened)
        {
            /*
             * Fetch text chunks until none are left. A single pass would
             * suffice for granularity=page, but other granularities
             * deliver more than one chunk per page.
             */
            String chunk = tet.get_text(page);
            while (chunk != null)
            {
                outfp.Write(utf16.GetBytes(chunk));

                /* each chunk is followed by the separator */
                outfp.Write(utf16.GetBytes(separator));

                chunk = tet.get_text(page);
            }
        }

        /*
         * The same message covers both failure modes: the page could not
         * be opened, or text retrieval stopped because of an error.
         */
        if (!opened || tet.get_errnum() != 0)
        {
            Console.WriteLine("Error " + tet.get_errnum() + " in "
                + tet.get_apiname() + "() on page " + pageno + ": "
                + tet.get_errmsg());
        }

        if (opened)
        {
            tet.close_page(page);
        }
    }
}
/**
 * Sample "image_resources": write all image resources of a PDF document
 * to files, independently of the pages on which they are placed.
 *
 * All pages are opened and closed once before the images are enumerated,
 * because images are only merged when a page is opened.
 *
 * @param args
 *            Exactly one element: the name of the input PDF document
 *
 * @return 0 on success; 2 on a usage error, if the document cannot be
 *         opened, or if an exception was caught
 */
static int Main(string[] args)
{
    /* global option list */
    string globaloptlist = "searchpath={{../data} {../../data}}";

    /* document-specific option list */
    string docoptlist = "";

    /* page-specific option list, e.g.
     * "imageanalysis={merge={gap=1} smallimages={maxwidth=20}}"
     */
    string pageoptlist = "";

    if (args.Length != 1)
    {
        Console.WriteLine("usage: image_resources <filename>");
        return 2;
    }

    string infilename = args[0];

    /*
     * Output names are derived from the input name without its ".pdf" or
     * ".PDF" suffix. The length test has to guard BOTH suffix comparisons;
     * otherwise names shorter than the suffix raise an
     * ArgumentOutOfRangeException before any error handling is in place.
     */
    string outfilebase = infilename;
    if (outfilebase.Length > 4
        && (outfilebase.EndsWith(".pdf", StringComparison.Ordinal)
            || outfilebase.EndsWith(".PDF", StringComparison.Ordinal)))
    {
        outfilebase = outfilebase.Substring(0, outfilebase.Length - 4);
    }

    TET tet = null;

    try
    {
        /* Created inside the try block so that constructor failures are
         * reported like any other error. */
        tet = new TET();
        tet.set_option(globaloptlist);

        int doc = tet.open_document(infilename, docoptlist);
        if (doc == -1)
        {
            Console.WriteLine("Error {0} in {1}(): {2}",
                tet.get_errnum(), tet.get_apiname(), tet.get_errmsg());
            return 2;
        }

        /* Images will only be merged upon opening a page.
         * In order to enumerate all merged image resources
         * we open all pages before extracting the images.
         */

        /* get number of pages in the document */
        int n_pages = (int)tet.pcos_get_number(doc, "length:pages");

        /* Loop over all pages to trigger image merging */
        for (int pageno = 1; pageno <= n_pages; ++pageno)
        {
            int page = tet.open_page(doc, pageno, pageoptlist);
            if (page == -1)
            {
                Console.WriteLine("Error {0} in {1}() on page {2}: {3}",
                    tet.get_errnum(), tet.get_apiname(), pageno,
                    tet.get_errmsg());
                continue; /* process next page */
            }

            if (tet.get_errnum() != 0)
            {
                Console.WriteLine("Error {0} in {1}() on page {2}: {3}",
                    tet.get_errnum(), tet.get_apiname(), pageno,
                    tet.get_errmsg());
            }

            tet.close_page(page);
        }

        /* Get the number of images in the document */
        int n_images = (int)tet.pcos_get_number(doc, "length:images");

        /* Loop over image resources in the document */
        for (int imageid = 0; imageid < n_images; ++imageid)
        {
            /* Skip images which have been consumed by merging */
            int mergetype = (int)tet.pcos_get_number(doc,
                "images[" + imageid + "]/mergetype");
            if (mergetype == 2)
            {
                continue;
            }

            /* Skip small images (see "smallimages" option) */
            if (tet.pcos_get_number(doc, "images[" + imageid + "]/small") > 0)
            {
                continue;
            }

            /* Report image details: pixel geometry, color space etc. */
            report_image_info(tet, doc, imageid);

            /* Write image data to file */
            string imageoptlist =
                " filename={" + outfilebase + "_I" + imageid + "}";

            if (tet.write_image_file(doc, imageid, imageoptlist) == -1)
            {
                Console.WriteLine("Error {0} in {1}(): {2}",
                    tet.get_errnum(), tet.get_apiname(), tet.get_errmsg());
                continue; /* process next image */
            }
        }

        tet.close_document(doc);
    }
    catch (TETException e)
    {
        /* caught exception thrown by TET */
        Console.WriteLine("Error {0} in {1}(): {2}",
            e.get_errnum(), e.get_apiname(), e.get_errmsg());
        return 2;
    }
    catch (Exception e)
    {
        Console.WriteLine("General Exception: " + e.ToString());
        return 2;
    }
    finally
    {
        if (tet != null)
        {
            tet.Dispose();
        }
    }

    return 0;
}
/**
 * Open a physical or virtual file and hand the opened document to the
 * process_document(outfp, tet, doc) overload.
 *
 * Either "filename" names a physical file, or "data" holds the file
 * contents, from which a virtual (PVF) file is created here. A separate
 * TET object is created for each call, so an exception raised while
 * working on this document is handled locally and does not affect a TET
 * object held by the caller.
 *
 * @param outfp
 *            Writer passed through to the document-level overload
 * @param filename
 *            Name of a physical file; null or empty if "data" is to be
 *            used instead
 * @param realname
 *            Name of the attachment, used in error messages only
 * @param data
 *            File contents, used only if no filename was supplied
 *
 * @return 0 if successful, otherwise a non-zero code to be used as exit
 *         status: 5 if the document could not be opened, 1 if an
 *         exception was caught
 */
static int process_document(BinaryWriter outfp, String filename, String realname, byte[] data)
{
    const String virtualName = "/pvf/attachment";

    int status = 0;
    TET tet = null;

    try
    {
        tet = new TET();

        /* Fall back to a PVF file if no physical file name was given */
        String source = filename;
        if (String.IsNullOrEmpty(source))
        {
            tet.create_pvf(virtualName, data, "");
            source = virtualName;
        }

        tet.set_option(globaloptlist);

        int doc = tet.open_document(source, docoptlist);
        if (doc != -1)
        {
            process_document(outfp, tet, doc);
        }
        else
        {
            Console.WriteLine("Error " + tet.get_errnum() + " in "
                + tet.get_apiname() + "() (source: attachment '"
                + realname + "'): " + tet.get_errmsg());
            status = 5;
        }

        /* Called unconditionally, as in the case where no PVF file
         * was created above */
        tet.delete_pvf(virtualName);
    }
    catch (TETException e)
    {
        Console.WriteLine("Error " + e.get_errnum() + " in "
            + e.get_apiname() + "() (source: attachment '"
            + realname + "'): " + e.get_errmsg());
        status = 1;
    }
    catch (Exception e)
    {
        Console.WriteLine("General Exception: " + e.ToString());
        status = 1;
    }
    finally
    {
        if (tet != null)
        {
            tet.Dispose();
        }
    }

    return status;
}
/**
 * Sample which walks over the pages of a PDF document, reports every
 * image placed on each page together with its placement geometry, and
 * writes the image (and its mask, if one is attached) to a file.
 *
 * @param args
 *            Exactly one element: the name of the input PDF document
 *
 * @return 0 on success; 2 on a usage error, if the document cannot be
 *         opened, or if an exception was caught
 */
static int Main(string[] args)
{
    /* global option list */
    string globaloptlist = "searchpath={{../data} {../../data}}";

    /* document-specific option list */
    string docoptlist = "";

    /* page-specific option list, e.g.
     * "imageanalysis={merge={gap=1} smallimages={maxwidth=20}}"
     */
    string pageoptlist = "";

    if (args.Length != 1)
    {
        Console.WriteLine("usage: image_resources <filename>");
        return 2;
    }

    string infilename = args[0];

    /*
     * Output names are derived from the input name without its ".pdf" or
     * ".PDF" suffix. The length test has to guard BOTH suffix comparisons;
     * otherwise names shorter than the suffix raise an
     * ArgumentOutOfRangeException before any error handling is in place.
     */
    string outfilebase = infilename;
    if (outfilebase.Length > 4
        && (outfilebase.EndsWith(".pdf", StringComparison.Ordinal)
            || outfilebase.EndsWith(".PDF", StringComparison.Ordinal)))
    {
        outfilebase = outfilebase.Substring(0, outfilebase.Length - 4);
    }

    TET tet = null;

    try
    {
        /* Created inside the try block so that constructor failures are
         * reported like any other error. */
        tet = new TET();
        tet.set_option(globaloptlist);

        int doc = tet.open_document(infilename, docoptlist);
        if (doc == -1)
        {
            Console.WriteLine("Error {0} in {1}(): {2}",
                tet.get_errnum(), tet.get_apiname(), tet.get_errmsg());
            return 2;
        }

        /* Get number of pages in the document */
        int n_pages = (int)tet.pcos_get_number(doc, "length:pages");

        /* Loop over pages and extract images */
        for (int pageno = 1; pageno <= n_pages; ++pageno)
        {
            int imagecount = 0;

            int page = tet.open_page(doc, pageno, pageoptlist);
            if (page == -1)
            {
                Console.WriteLine("Error {0} in {1}() on page {2}: {3}",
                    tet.get_errnum(), tet.get_apiname(), pageno,
                    tet.get_errmsg());
                continue; /* try next page */
            }

            /* Retrieve all images on the page */
            while (tet.get_image_info(page) == 1)
            {
                string imageoptlist;

                imagecount++;

                /* Report image details: pixel geometry, color space etc. */
                report_image_info(tet, doc, tet.imageid);

                /* Report placement geometry */
                Console.WriteLine(" placed on page " + pageno
                    + " at position (" + tet.x.ToString("f2") + ", "
                    + tet.y.ToString("f2") + "): "
                    + (int)tet.width + "x" + (int)tet.height
                    + "pt, alpha=" + tet.alpha + ", beta=" + tet.beta);

                /* Write image data to file */
                imageoptlist = "filename={" + outfilebase + "_p" + pageno
                    + "_" + imagecount + "_I" + tet.imageid + "}";

                if (tet.write_image_file(doc, tet.imageid, imageoptlist) == -1)
                {
                    Console.WriteLine("\nError [" + tet.get_errnum()
                        + " in " + tet.get_apiname() + "(): "
                        + tet.get_errmsg());
                    continue; /* try next image */
                }

                /* Check whether the image has a mask attached... */
                int maskid = (int)tet.pcos_get_number(doc,
                    "images[" + tet.imageid + "]/maskid");

                /* ...and retrieve it if present */
                if (maskid != -1)
                {
                    Console.WriteLine(" masked with ");
                    report_image_info(tet, doc, maskid);

                    imageoptlist = "filename={" + outfilebase + "_p" + pageno
                        + "_" + imagecount + "_I" + tet.imageid
                        + "mask_I" + maskid + "}";

                    /* The mask is an image resource of its own: it must be
                     * written via its own id, not the id of the masked
                     * image. */
                    if (tet.write_image_file(doc, maskid, imageoptlist) == -1)
                    {
                        Console.WriteLine("\nError [" + tet.get_errnum()
                            + " in " + tet.get_apiname()
                            + "() for mask image: " + tet.get_errmsg());
                        continue; /* try next image */
                    }
                }
            }

            /* Checked after the loop so that an error which terminated
             * the image enumeration is actually reported. */
            if (tet.get_errnum() != 0)
            {
                Console.WriteLine("Error {0} in {1}() on page {2}: {3}",
                    tet.get_errnum(), tet.get_apiname(), pageno,
                    tet.get_errmsg());
            }

            tet.close_page(page);
        }

        tet.close_document(doc);
    }
    catch (TETException e)
    {
        /* caught exception thrown by TET */
        Console.WriteLine("Error {0} in {1}(): {2}",
            e.get_errnum(), e.get_apiname(), e.get_errmsg());
        return 2;
    }
    catch (Exception e)
    {
        Console.WriteLine("General Exception: " + e.ToString());
        return 2;
    }
    finally
    {
        if (tet != null)
        {
            tet.Dispose();
        }
    }

    return 0;
}
/**
 * Sample "extractor": extract the text of all pages of a PDF document
 * and write it, encoded as little-endian UTF-16 with a leading byte order
 * mark, to an output file.
 *
 * @param args
 *            Two elements: the input PDF file name and the output file
 *            name
 *
 * @return 0 on success; 2 on a usage error, if the document cannot be
 *         opened, or if an exception was caught
 */
static int Main(string[] args)
{
    /* global option list */
    string globaloptlist = "searchpath={{../data} {../../data}}";

    /* document-specific option list */
    string docoptlist = "";

    /* page-specific option list */
    string pageoptlist = "granularity=page";

    /* separator to emit after each chunk of text. This depends on the
     * application's needs; for granularity=word a space character may be
     * useful.
     */
    string separator = "\n";

    UnicodeEncoding unicode = new UnicodeEncoding(false, true);

    if (args.Length != 2)
    {
        Console.WriteLine("usage: extractor <infilename> <outfilename>");
        return 2;
    }

    /* The using statements close writer and file on every exit path,
     * including the early returns below. */
    using (FileStream outfile = File.Create(args[1]))
    using (BinaryWriter w = new BinaryWriter(outfile))
    {
        w.Write(unicode.GetPreamble());

        TET tet = null;

        try
        {
            tet = new TET();
            tet.set_option(globaloptlist);

            int doc = tet.open_document(args[0], docoptlist);
            if (doc == -1)
            {
                Console.WriteLine("Error {0} in {1}(): {2}",
                    tet.get_errnum(), tet.get_apiname(), tet.get_errmsg());
                return 2;
            }

            /* get number of pages in the document */
            int n_pages = (int)tet.pcos_get_number(doc, "length:pages");

            /* loop over pages in the document */
            for (int pageno = 1; pageno <= n_pages; ++pageno)
            {
                string text;

                int page = tet.open_page(doc, pageno, pageoptlist);
                if (page == -1)
                {
                    Console.WriteLine("Error {0} in {1}() on page {2}: {3}",
                        tet.get_errnum(), tet.get_apiname(), pageno,
                        tet.get_errmsg());
                    continue; /* try next page */
                }

                /* Retrieve all text fragments; this loop is actually not
                 * required for granularity=page, but must be used for
                 * other granularities.
                 */
                while ((text = tet.get_text(page)) != null)
                {
                    /* print the retrieved text */
                    w.Write(unicode.GetBytes(text));

                    /* print a separator between chunks of text */
                    w.Write(unicode.GetBytes(separator));
                }

                if (tet.get_errnum() != 0)
                {
                    /* Placeholders must match the argument list; a
                     * dangling index makes Console.WriteLine throw a
                     * FormatException instead of printing the error. */
                    Console.WriteLine("Error {0} in {1}() on page {2}: {3}",
                        tet.get_errnum(), tet.get_apiname(), pageno,
                        tet.get_errmsg());
                }

                tet.close_page(page);
            }

            tet.close_document(doc);
        }
        catch (TETException e)
        {
            /* caught exception thrown by TET */
            Console.WriteLine("Error {0} in {1}(): {2}",
                e.get_errnum(), e.get_apiname(), e.get_errmsg());
            return 2; /* a TET failure must not be reported as success */
        }
        catch (Exception e)
        {
            Console.WriteLine("General Exception: " + e.ToString());
            return 2;
        }
        finally
        {
            if (tet != null)
            {
                tet.Dispose();
            }
        }
    }

    return 0;
}
/**
 * Sample "glyphinfo": extract the text of a PDF document word by word
 * and write, for each glyph, its Unicode value, font name, font size,
 * position, color id, type and attributes to a UTF-8 text file.
 *
 * @param args
 *            Two elements: the input PDF file name and the output file
 *            name
 */
public static void Main(String[] args)
{
    /* Global option list. */
    string globaloptlist = "searchpath={{../data} {../../data}}";

    /* Document specific option list. */
    string docoptlist = "";

    /* Page-specific option list. */
    string pageoptlist = "granularity=word";

    /* Bit flags in the "attributes" member */
    const int ATTR_NONE = 0;
    const int ATTR_SUB = 1;
    const int ATTR_SUP = 2;
    const int ATTR_DROPCAP = 4;
    const int ATTR_SHADOW = 8;
    const int ATTR_DH_PRE = 16;
    const int ATTR_DH_ARTF = 32;
    const int ATTR_DH_POST = 64;

    if (args.Length != 2)
    {
        Console.WriteLine("usage: glyphinfo <infilename> <outfilename>");
        return;
    }

    /* StreamWriter buffers its output. Without closing it, the tail of
     * the report (or all of it) never reaches the file; the using
     * statement flushes and closes it on every exit path. */
    using (StreamWriter outfp = new StreamWriter(File.Create(args[1]),
        System.Text.Encoding.UTF8))
    {
        TET tet = null;

        try
        {
            tet = new TET();
            tet.set_option(globaloptlist);

            int doc = tet.open_document(args[0], docoptlist);
            if (doc == -1)
            {
                Console.WriteLine("Error " + tet.get_errnum() + " in "
                    + tet.get_apiname() + "(): " + tet.get_errmsg());
                return;
            }

            /* get number of pages in the document */
            int n_pages = (int)tet.pcos_get_number(doc, "length:pages");

            /* Loop over pages in the document */
            for (int pageno = 1; pageno <= n_pages; ++pageno)
            {
                string text;
                int previouscolor = -1;

                int page = tet.open_page(doc, pageno, pageoptlist);
                if (page == -1)
                {
                    Console.WriteLine("Error " + tet.get_errnum() + " in "
                        + tet.get_apiname() + "() on page " + pageno + ": "
                        + tet.get_errmsg());
                    continue; /* try next page */
                }

                /* Administrative information */
                outfp.WriteLine("[ Document: '"
                    + tet.pcos_get_string(doc, "filename") + "' ]");
                outfp.WriteLine("[ Document options: '" + docoptlist + "' ]");
                outfp.WriteLine("[ Page options: '" + pageoptlist + "' ]");
                outfp.WriteLine("[ ----- Page " + pageno + " ----- ]");

                /* Retrieve all text fragments */
                while ((text = tet.get_text(page)) != null)
                {
                    /* print the retrieved text */
                    outfp.WriteLine("[" + text + "]");

                    /* Loop over all glyphs and print their details */
                    while (tet.get_char_info(page) != -1)
                    {
                        /* Fetch the font name with pCOS (based on its ID) */
                        string fontname = tet.pcos_get_string(doc,
                            "fonts[" + tet.fontid + "]/name");

                        /* Print the character code */
                        string str = String.Format("U+{0}",
                            tet.uv.ToString("X4"));

                        /* ...and the character itself. Plain concatenation
                         * is required here: the character comes from the
                         * document, and using it as part of a format
                         * string would throw for '{' and '}'. */
                        str = str + " '" + (char)(tet.uv) + "'";

                        /* Print font name, size, and position */
                        str = str + String.Format(
                            " {0} size={1} x={2} y={3}",
                            fontname,
                            tet.fontsize.ToString("f2"),
                            tet.x.ToString("f2"),
                            tet.y.ToString("f2"));

                        /* Print the color id */
                        str = str + String.Format(" colorid={0}", tet.colorid);

                        /* check whether the text color changes */
                        if (tet.colorid != previouscolor)
                        {
                            str = print_color_value(str, tet, doc, tet.colorid);
                            previouscolor = tet.colorid;
                        }

                        /* Examine the "type" member */
                        if (tet.type == 1)
                        {
                            str = str + " ligature_start";
                        }
                        else if (tet.type == 10)
                        {
                            str = str + " ligature_cont";
                        }
                        /* Separators are only inserted for granularity > word */
                        else if (tet.type == 12)
                        {
                            str = str + " inserted";
                        }

                        /* Examine the bit flags in the "attributes" member */
                        if (tet.attributes != ATTR_NONE)
                        {
                            if ((tet.attributes & ATTR_SUB) == ATTR_SUB)
                            {
                                str = str + "/sub";
                            }
                            if ((tet.attributes & ATTR_SUP) == ATTR_SUP)
                            {
                                str = str + "/sup";
                            }
                            if ((tet.attributes & ATTR_DROPCAP) == ATTR_DROPCAP)
                            {
                                str = str + "/dropcap";
                            }
                            if ((tet.attributes & ATTR_SHADOW) == ATTR_SHADOW)
                            {
                                str = str + "/shadow";
                            }
                            if ((tet.attributes & ATTR_DH_PRE) == ATTR_DH_PRE)
                            {
                                str = str + "/dehyphenation_pre";
                            }
                            if ((tet.attributes & ATTR_DH_ARTF) == ATTR_DH_ARTF)
                            {
                                str = str + "/dehyphenation_artifact";
                            }
                            if ((tet.attributes & ATTR_DH_POST) == ATTR_DH_POST)
                            {
                                str = str + "/dehyphenation_post";
                            }
                        }

                        outfp.WriteLine(str);
                    }

                    outfp.WriteLine("");
                }

                if (tet.get_errnum() != 0)
                {
                    Console.WriteLine("Error " + tet.get_errnum() + " in "
                        + tet.get_apiname() + "() on page " + pageno + ": "
                        + tet.get_errmsg());
                }

                tet.close_page(page);
            }

            tet.close_document(doc);
        }
        catch (TETException e)
        {
            Console.WriteLine("Error " + e.get_errnum() + " in "
                + e.get_apiname() + "(): " + e.get_errmsg());
        }
        catch (Exception e)
        {
            Console.WriteLine("General Exception: " + e.ToString());
        }
        finally
        {
            if (tet != null)
            {
                tet.Dispose();
            }
        }
    }
}
/**
 * Sample "fontfilter": print all lines of text of a PDF document whose
 * first character uses a font matching the name and size triggers
 * defined below.
 *
 * @param args
 *            Exactly one element: the name of the input PDF document
 */
public static void Main(String[] args)
{
    /* Global option list. */
    string globaloptlist = "searchpath={{../data} {../../data}}";

    /* Document specific option list. */
    string docoptlist = "";

    /* Page-specific option list. */
    string pageoptlist = "granularity=line";

    /* Search text with at least this size (use 0 to catch all sizes). */
    double fontsizetrigger = 10;

    /* Catch text where the font name contains this string (use empty string
     * to catch all font names).
     */
    String fontnametrigger = "Bold";

    TET tet = null;

    /* Declared outside the try block because the exception handler uses
     * it to tell document-level from page-level errors. */
    int pageno = 0;

    if (args.Length != 1)
    {
        Console.WriteLine("usage: fontfilter <infilename>");
        return;
    }

    try
    {
        tet = new TET();
        tet.set_option(globaloptlist);

        int doc = tet.open_document(args[0], docoptlist);
        if (doc == -1)
        {
            Console.WriteLine("Error " + tet.get_errnum() + " in "
                + tet.get_apiname() + "(): " + tet.get_errmsg());
            return;
        }

        /* Loop over pages in the document */
        int n_pages = (int)tet.pcos_get_number(doc, "length:pages");
        for (pageno = 1; pageno <= n_pages; ++pageno)
        {
            int page = tet.open_page(doc, pageno, pageoptlist);
            if (page == -1)
            {
                Console.WriteLine("Error " + tet.get_errnum() + " in "
                    + tet.get_apiname() + "(): " + tet.get_errmsg());
                continue; /* try next page */
            }

            /* Retrieve all text fragments for the page */
            String text;
            while ((text = tet.get_text(page)) != null)
            {
                /* In this sample we check only the first character of
                 * each fragment, so a single get_char_info() call is
                 * sufficient.
                 */
                if (tet.get_char_info(page) != -1)
                {
                    /* We need only the font name and size; the text
                     * position could be fetched from tet.x and tet.y.
                     */
                    String fontname = tet.pcos_get_string(doc,
                        "fonts[" + tet.fontid + "]/name");

                    /* Check whether we found a match. Font names are
                     * identifiers, hence the ordinal comparison. */
                    if (tet.fontsize >= fontsizetrigger
                        && fontname.IndexOf(fontnametrigger,
                            StringComparison.Ordinal) != -1)
                    {
                        /* print the retrieved font name, size, and text */
                        Console.WriteLine("[{0} {1:0.00}] {2}",
                            fontname, tet.fontsize, text);
                    }
                }
            }

            if (tet.get_errnum() != 0)
            {
                Console.WriteLine("Error " + tet.get_errnum() + " in "
                    + tet.get_apiname() + "(): " + tet.get_errmsg());
            }

            tet.close_page(page);
        }

        tet.close_document(doc);
    }
    catch (TETException e)
    {
        if (pageno == 0)
        {
            Console.WriteLine("Error " + e.get_errnum() + " in "
                + e.get_apiname() + "(): " + e.get_errmsg() + "\n");
        }
        else
        {
            Console.WriteLine("Error " + e.get_errnum() + " in "
                + e.get_apiname() + "() on page " + pageno + ": "
                + e.get_errmsg() + "\n");
        }
    }
    catch (Exception e)
    {
        Console.WriteLine("General Exception: " + e.ToString());
    }
    finally
    {
        /* tet is still null if the constructor itself failed */
        if (tet != null)
        {
            tet.Dispose();
        }
    }
}
/**
 * Sample "tetml": convert a PDF document to TETML. The XML output is
 * written to a file, or - if the "inmemory" switch below is set - created
 * in memory and examined with the .NET XML classes.
 *
 * @param args
 *            Two elements: the input PDF file name and the TETML output
 *            file name
 */
public static void Main(String[] args)
{
    /* Global option list. */
    string globaloptlist = "searchpath={{../data} {../../data}}";

    /* Document specific option list. */
    string basedocoptlist = "";

    /* Page-specific option list. Remove the tetml= option if you don't
     * need font and geometry information.
     */
    string pageoptlist = "granularity=word tetml={glyphdetails={all}}";

    /* set this to true to generate TETML output in memory */
    bool inmemory = false;

    if (args.Length != 2)
    {
        Console.WriteLine("usage: tetml <pdffilename> <xmlfilename>");
        return;
    }

    TET tet = null;

    try
    {
        tet = new TET();
        tet.set_option(globaloptlist);

        /*
         * In-memory mode: the TETML data is fetched encoded in UTF-8 and
         * subsequently converted to a .NET string, which is encoded in
         * UTF-16. While not strictly necessary for this program, it is
         * cleaner to instruct TET to put 'encoding="UTF-16"' into the
         * XML header.
         *
         * File mode: TET writes the TETML output to the named file.
         */
        String docoptlist = inmemory
            ? "tetml={encodingname=UTF-16} " + basedocoptlist
            : "tetml={filename={" + args[1] + "}} " + basedocoptlist;

        int doc = tet.open_document(args[0], docoptlist);
        if (doc == -1)
        {
            Console.WriteLine("Error " + tet.get_errnum() + " in "
                + tet.get_apiname() + "(): " + tet.get_errmsg());
            return;
        }

        /* Process all pages of the document */
        int pageCount = (int)tet.pcos_get_number(doc, "length:pages");
        int current = 1;
        while (current <= pageCount)
        {
            tet.process_page(doc, current, pageoptlist);
            current++;
        }

        /* This could be combined with the last page-related call. */
        tet.process_page(doc, 0, "tetml={trailer}");

        if (inmemory)
        {
            /* Get the XML document as a byte array. */
            byte[] rawTetml = tet.get_tetml(doc, "");
            if (rawTetml == null)
            {
                Console.WriteLine("tetml: couldn't retrieve XML data");
                return;
            }

            /* Load the in-memory XML document and print some information
             * about the fonts and words it contains.
             */
            XmlDocument xml = new XmlDocument();
            xml.LoadXml(new UTF8Encoding().GetString(rawTetml));
            XmlElement top = xml.DocumentElement;

            /* Namespace manager for resolving the "tet" prefix. */
            XmlNamespaceManager namespaces =
                new XmlNamespaceManager(xml.NameTable);
            namespaces.AddNamespace("tet",
                "http://www.pdflib.com/XML/TET5/TET-5.0");

            foreach (XmlNode font in top.SelectNodes("//tet:Font", namespaces))
            {
                XmlAttributeCollection attributes = font.Attributes;
                XmlAttribute fontName = attributes["name"];
                XmlAttribute fontType = attributes["type"];

                Console.WriteLine("Font " + fontName.Value + " "
                    + fontType.Value);
            }

            XmlNodeList words = top.SelectNodes("//tet:Word", namespaces);
            Console.WriteLine("Found " + words.Count + " words in document");
        }

        tet.close_document(doc);
    }
    catch (TETException e)
    {
        Console.WriteLine("Error " + e.get_errnum() + " in "
            + e.get_apiname() + "(): " + e.get_errmsg());
    }
    catch (Exception e)
    {
        Console.WriteLine("General Exception: " + e.ToString());
    }
    finally
    {
        if (tet != null)
        {
            tet.Dispose();
        }
    }
}