//////////////////////////////////////////////////////////////////////////////////////////////////
// GetFirstParagraph
// Depth-first search below struct_elem for the first struct element whose type is "P".
// Returns that element, or null when no such element exists in the subtree.
//////////////////////////////////////////////////////////////////////////////////////////////////
private static PdsStructElement GetFirstParagraph(PdsStructElement struct_elem)
{
    for (int index = 0; index < struct_elem.GetNumChildren(); index++)
    {
        // only children that are struct elements themselves can be (or contain) a paragraph
        if (struct_elem.GetChildType(index) != PdfStructElementType.kPdsStructChildElement)
        {
            continue;
        }

        PdsObject childObject = struct_elem.GetChildObject(index);
        PdsStructElement child = struct_elem.GetStructTree().GetStructElementFromObject(childObject);
        if (child == null)
        {
            PdfixEngine.ThrowException();
        }

        string childType = child.GetType_(true);
        if (childType == "P")
        {
            return child;
        }

        // not a paragraph itself - look inside it before moving on to the next sibling
        PdsStructElement nested = GetFirstParagraph(child);
        if (nested != null)
        {
            return nested;
        }
    }

    return null;
}
// Dispatches on the type of a page object: images are handed to ExtractImage,
// form objects are descended into so that every object of their content is
// processed the same way. Any other object type is ignored.
private static void ProcessPageObject(PdfPage page, PdsPageObject obj, string savePath)
{
    if (obj == null)
    {
        PdfixEngine.ThrowException();
    }

    var objectType = obj.GetObjectType();
    if (objectType == PdfPageObjectType.kPdsPageImage)
    {
        ExtractImage(page, (PdsImage)obj, savePath);
    }
    else if (objectType == PdfPageObjectType.kPdsPageForm)
    {
        var formContent = ((PdsForm)obj).GetContent();
        for (int index = 0; index < formContent.GetNumObjects(); index++)
        {
            ProcessPageObject(page, formContent.GetObject(index), savePath);
        }
    }
}
//////////////////////////////////////////////////////////////////////////////////////////////////
// MoveParagraphToParent
// Depth-first search below struct_elem for the first "P" struct element; when found, MoveChild
// is called on its direct parent with that same parent as destination and its last child
// index as the target position.
// NOTE(review): presumably this moves the paragraph to the end of its parent's kids - confirm
// against the MoveChild documentation.
// Returns true when a paragraph was found and moved, false when the subtree contains none.
//////////////////////////////////////////////////////////////////////////////////////////////////
private static bool MoveParagraphToParent(PdsStructElement struct_elem)
{
    for (int i = 0; i < struct_elem.GetNumChildren(); i++)
    {
        // only children that are struct elements themselves can be (or contain) a paragraph
        if (struct_elem.GetChildType(i) != PdfStructElementType.kPdsStructChildElement)
        {
            continue;
        }

        PdsObject kid_obj = struct_elem.GetChildObject(i);
        PdsStructElement kid_elem = struct_elem.GetStructTree().GetStructElementFromObject(kid_obj);
        if (kid_elem == null)
        {
            PdfixEngine.ThrowException();
        }

        string type = kid_elem.GetType_(true);
        if (type == "P")
        {
            if (!struct_elem.MoveChild(i, struct_elem, struct_elem.GetNumChildren() - 1))
            {
                // report the failure the same way as every other SDK call in this file
                PdfixEngine.ThrowException();
            }
            return true;
        }

        // Stop only when the subtree really contained a paragraph; otherwise keep
        // scanning the remaining siblings instead of giving up after the first kid.
        if (MoveParagraphToParent(kid_elem))
        {
            return true;
        }
    }

    return false;
}
//////////////////////////////////////////////////////////////////////////////////////////////////
// RemoveParagraph
// Walks the kids of struct_elem from last to first and strips content from the structure tree:
//  - "P" elements lose all of their kids,
//  - "Figure" elements lose all of their kids when their alt text is empty,
//  - any other element is processed recursively.
// A kid element that ends up with no kids is removed from struct_elem.
// The walk stops as soon as the counter `count` (declared outside this method) reaches 2.
//////////////////////////////////////////////////////////////////////////////////////////////////
internal static void RemoveParagraph(PdsStructElement struct_elem)
{
    // iterate backwards so that removing kid i does not shift the indices still to be visited
    for (int i = struct_elem.GetNumChildren() - 1; i >= 0; i--)
    {
        if (struct_elem.GetChildType(i) == PdfStructElementType.kPdsStructChildElement)
        {
            PdsObject kid_obj = struct_elem.GetChildObject(i);
            PdsStructElement kid_elem = struct_elem.GetStructTree().GetStructElementFromObject(kid_obj);
            if (kid_elem == null)
            {
                // same guard as the other tree walkers in this file; avoids a NullReferenceException
                PdfixEngine.ThrowException();
            }

            string type = kid_elem.GetType_(true);
            bool isParagraph = type == "P";
            bool isFigure = type == "Figure";

            // the alt text is only queried for figures
            if (isParagraph || (isFigure && kid_elem.GetAlt().Length == 0))
            {
                for (int j = kid_elem.GetNumChildren() - 1; j >= 0; j--)
                {
                    if (!kid_elem.RemoveChild(j))
                    {
                        PdfixEngine.ThrowException();
                    }
                }
            }
            else if (!isFigure)
            {
                RemoveParagraph(kid_elem);
            }

            // remove this element if it has no kids
            // NOTE(review): the result of this RemoveChild call is not checked - confirm that is intended
            if (kid_elem.GetNumChildren() == 0)
            {
                struct_elem.RemoveChild(i);
            }
        }

        // remove only 2 paragraphs in this sample
        // NOTE(review): count is incremented for every visited kid, whatever its type, and is
        // shared with the recursive calls - confirm this matches the "2 paragraphs" intent
        if (++count >= 2)
        {
            break;
        }
    }
}
// Entry point: initializes the engine, runs the samples and prints "SUCCESS".
// Any exception is reported by printing its message; the engine is terminated
// in every case.
static void Main()
{
    const string samplesRoot = "..\\..\\..\\";

    try
    {
        PdfixEngine.Init();
        PdfixSamples.Run(samplesRoot);
        Console.WriteLine("SUCCESS");
    }
    catch (System.Exception error)
    {
        Console.WriteLine(error.Message);
    }
    finally
    {
        PdfixEngine.Terminate();
    }
}
// Thread body: waits on `semaphore`, acquires page 0 of `doc`, keeps it for one second,
// releases the page and then releases the semaphore, logging each step with the
// current thread's name. `semaphore` and `doc` are declared outside this method.
static void DoSomething()
{
    Console.WriteLine("{0} = waiting", Thread.CurrentThread.Name);
    semaphore.WaitOne();
    try
    {
        Console.WriteLine("{0} begins!", Thread.CurrentThread.Name);

        PdfPage page = doc.AcquirePage(0);
        if (page == null)
        {
            PdfixEngine.ThrowException();
        }

        try
        {
            Thread.Sleep(1000);
        }
        finally
        {
            // give the page back even when the work above is interrupted
            page.Release();
        }

        Console.WriteLine("{0} releasing...", Thread.CurrentThread.Name);
    }
    finally
    {
        // always hand the slot back; otherwise a failure in one thread would leave
        // the remaining threads blocked in WaitOne
        semaphore.Release();
    }
}
// Renders the area covered by `image` on `page` and saves it as a JPEG (quality 80)
// named "image_<n>.jpg" under savePath, where <n> is the incremented _imageIndex counter.
// The rendering uses the device rectangle and device matrix of a page view acquired with
// arguments (1, PdfRotate.kRotate0), clipped to the image's bounding box.
private static void ExtractImage(PdfPage page, PdsImage image, string savePath)
{
    // map the image bounding box to device space
    var bbox = image.GetBBox();
    var page_view = page.AcquirePageView(1, PdfRotate.kRotate0);
    if (page_view == null)
    {
        PdfixEngine.ThrowException();
    }
    var rect = page_view.RectToDevice(bbox);
    PdfMatrix matrix = page_view.GetDeviceMatrix();
    page_view.Release();

    string imgPath = savePath + "/image_" + (++_imageIndex).ToString() + ".jpg";
    Console.WriteLine("Image Found: " + imgPath);

    var img = PdfixEngine.Instance.CreateImage(rect.right - rect.left, rect.bottom - rect.top, PsImageDIBFormat.kImageDIBFormatArgb);
    if (img == null)
    {
        PdfixEngine.ThrowException();
    }

    try
    {
        PdfPageRenderParams renderParams = new PdfPageRenderParams
        {
            clip_box = bbox,
            matrix = matrix,
            image = img
        };
        if (!page.DrawContent(renderParams, null, null))
        {
            PdfixEngine.ThrowException();
        }

        PdfImageParams imgParams = new PdfImageParams
        {
            format = PdfImageFormat.kImageFormatJpg,
            quality = 80
        };
        if (!img.Save(imgPath, imgParams))
        {
            PdfixEngine.ThrowException();
        }
    }
    finally
    {
        // destroy the image even when drawing or saving failed
        img.Destroy();
    }
}
// Collects the bounding boxes of the content below struct_elem.
// Element kids are visited recursively; stream-content and page-content kids are
// resolved through GetMcidBBoxes using the kid's page number and MCID.
// The boxes of kids and content can be placed on different pages.
static List<PdfRect> GetStructElementBboxes(PdfDoc doc, PdsStructElement struct_elem)
{
    var collected = new List<PdfRect>();
    int childCount = struct_elem.GetNumChildren();

    for (int index = 0; index < childCount; index++)
    {
        // the kid object is fetched for every kid, although only element kids use it
        var childObject = struct_elem.GetChildObject(index);
        var childType = struct_elem.GetChildType(index);

        if (childType == PdfStructElementType.kPdsStructChildElement)
        {
            var childElement = struct_elem.GetStructTree().GetStructElementFromObject(childObject);
            if (childElement == null)
            {
                PdfixEngine.ThrowException();
            }
            collected.AddRange(GetStructElementBboxes(doc, childElement));
        }
        else if (childType == PdfStructElementType.kPdsStructChildStreamContent ||
                 childType == PdfStructElementType.kPdsStructChildPageContent)
        {
            var contentPage = struct_elem.GetChildPageNumber(index);
            var contentMcid = struct_elem.GetChildMcid(index);
            collected.AddRange(GetMcidBBoxes(doc, contentPage, contentMcid));
        }
        // kPdsStructChildObject kids (and any other type) contribute no boxes
    }

    return collected;
}
// Prints a description of struct_elem and, recursively, of all its kids to the console.
// For each element it prints the type, the bounding boxes (skipped for "Document"),
// title, actual text, alt text and page number when present, followed by the page
// number and/or MCID of its content kids. Every nesting level appends one space to
// `indent`, and a blank line is printed after each element.
static void ProcessStructElement(PdfDoc doc, PdsStructElement struct_elem, string indent)
{
    indent += " ";
    if (struct_elem == null)
    {
        PdfixEngine.ThrowException();
    }

    // get the element type
    string type_str = struct_elem.GetType_(true);
    if (type_str.Length != 0)
    {
        Console.WriteLine(indent + "Struct Element: " + type_str);
    }

    // get struct element bounding boxes (can be on multiple pages)
    // skip document struct element - can take long time to collect all bboxes
    if (type_str != "Document")
    {
        List<PdfRect> bboxes = GetStructElementBboxes(doc, struct_elem);
        foreach (PdfRect bbox in bboxes)
        {
            Console.WriteLine(indent + "[" + bbox.left + ", " + bbox.bottom + ", " + bbox.right + ", " + bbox.top + "]");
        }
    }

    string title_str = struct_elem.GetTitle();
    if (title_str.Length != 0)
    {
        Console.WriteLine(indent + "title: " + title_str);
    }

    string actual_text_str = struct_elem.GetActualText();
    if (actual_text_str.Length != 0)
    {
        Console.WriteLine(indent + "actual text: " + actual_text_str);
    }

    // Concatenate instead of passing indent as the first argument: the two-argument
    // overload treats the first string as a format string, so only the indent was
    // printed and the alt text / page number were silently dropped.
    string alt_str = struct_elem.GetAlt();
    if (alt_str.Length != 0)
    {
        Console.WriteLine(indent + "alt: " + alt_str);
    }

    var page_num = struct_elem.GetPageNumber();
    if (page_num != -1)
    {
        Console.WriteLine(indent + "Page number: " + page_num);
    }

    int num_kids = struct_elem.GetNumChildren();
    for (int i = 0; i < num_kids; i++)
    {
        var kid_obj = struct_elem.GetChildObject(i);

        // based on structure element you can obtain different data
        switch (struct_elem.GetChildType(i))
        {
            case PdfStructElementType.kPdsStructChildElement:
                {
                    var kid_struct_elem = struct_elem.GetStructTree().GetStructElementFromObject(kid_obj);
                    if (kid_struct_elem == null)
                    {
                        PdfixEngine.ThrowException();
                    }
                    ProcessStructElement(doc, kid_struct_elem, indent);
                }
                break;

            case PdfStructElementType.kPdsStructChildObject:
                break;

            case PdfStructElementType.kPdsStructChildStreamContent:
                {
                    var kid_page_num = struct_elem.GetChildPageNumber(i);
                    Console.WriteLine(indent + "Kid Page number: " + kid_page_num);
                    var mcid = struct_elem.GetChildMcid(i);
                    Console.WriteLine(indent + "MCID: " + mcid);
                }
                break;

            case PdfStructElementType.kPdsStructChildPageContent:
                {
                    var mcid = struct_elem.GetChildMcid(i);
                    Console.WriteLine(indent + "MCID: " + mcid);
                }
                break;
        }
    }

    Console.WriteLine("");
}