/// <summary> Runs the image derivative creation over every folder reported as containing TIFF files </summary>
/// <remarks> Candidate folders are gathered by recursively_check_for_folders, starting at the
/// configured directory.  Progress is announced through OnNewTask and OnNewVolume, and
/// OnProcessComplete is always raised exactly once before this method returns. </remarks>
public void Process()
{
    // Gather the folders which need to be processed
    OnNewTask("Checking for folders with TIFFs", true);
    var foldersWithTiffs = new List<string>();
    recursively_check_for_folders(directory, foldersWithTiffs);

    if (foldersWithTiffs.Count > 0)
    {
        // A single processor is shared across all folders; its events are relayed through this object's handlers
        var derivativeCreator = new Image_Derivative_Creation_Processor(
            imagemagick_path, kakadu_path,
            create_jpegs, create_jp2s,
            jpeg_width, jpeg_height,
            false,
            thumbnail_width, thumbnail_height);
        derivativeCreator.New_Progress += processor_New_Progress;
        derivativeCreator.New_Task_String += processor_New_Task_String;
        derivativeCreator.Process_Complete += processor_Process_Complete;

        // Handle each folder in turn
        foreach (string folder in foldersWithTiffs)
        {
            OnNewVolume(folder);
            string[] tiffsInFolder = Directory.GetFiles(folder, "*.tif");
            derivativeCreator.Process(folder, String.Empty, String.Empty, tiffsInFolder);
        }
    }
    else
    {
        // Nothing to do, so just let any listener know
        OnNewTask("No folders with TIFF files found", true);
    }

    OnProcessComplete();
}
/// <summary> Pre-process any resource files in the incoming folder </summary>
/// <param name="Resource"> Incoming digital resource; its folder, BibID, VID, METS type and builder log id are used </param>
/// <param name="NewImageFiles"> List which receives the name of every page image file that shares a base name with a processed TIFF </param>
/// <remarks> The stages run in this order, each one only creating output that does not already exist: <br />
/// 1) optional conversion of PowerPoint and Word files to PDF, <br />
/// 2) text extraction and thumbnail creation for PDFs, <br />
/// 3) text extraction for HTML and XML files, <br />
/// 4) optional OCR of TIFFs that have no matching text file, <br />
/// 5) cleaning of all text files plus a scan for a possible social security number, <br />
/// 6) creation of image derivatives from JPEG and TIFF files. <br />
/// Failures in stages 1, 4 and 5 are written to the log and do not stop the remaining stages. </remarks>
private void PreProcess_Any_Resource_Files(Incoming_Digital_Resource Resource, List<string> NewImageFiles)
{
    string resourceFolder = Resource.Resource_Folder;
    string bibID = Resource.BibID;
    string vid = Resource.VID;
    string itemKey = bibID + ":" + vid;

    // Should we try to convert office files?
    if (SobekCM_Library_Settings.Convert_Office_Files_To_PDF)
    {
        try
        {
            // Preprocess each Powerpoint document to PDF
            foreach (string thisPowerpoint in Directory.GetFiles(resourceFolder, "*.ppt*"))
            {
                // Does a PDF version exist for this item?
                string pdf_version = Path.Combine(resourceFolder, Path.GetFileNameWithoutExtension(thisPowerpoint) + ".pdf");
                if (!File.Exists(pdf_version))
                {
                    int conversion_error = Word_Powerpoint_to_PDF_Converter.Powerpoint_To_PDF(thisPowerpoint, pdf_version);
                    Log_Office_Conversion_Error(conversion_error, "PPT", Resource);
                }
            }

            // Preprocess each Word document to PDF
            foreach (string thisWordDoc in Directory.GetFiles(resourceFolder, "*.doc*"))
            {
                // Does a PDF version exist for this item?
                string pdf_version = Path.Combine(resourceFolder, Path.GetFileNameWithoutExtension(thisWordDoc) + ".pdf");
                if (!File.Exists(pdf_version))
                {
                    int conversion_error = Word_Powerpoint_to_PDF_Converter.Word_To_PDF(thisWordDoc, pdf_version);
                    Log_Office_Conversion_Error(conversion_error, "Word DOC", Resource);
                }
            }
        }
        catch (Exception ee)
        {
            // The using block guarantees the log file handle is released even if a write fails
            using (StreamWriter errorWriter = new StreamWriter(Application.StartupPath + "\\Logs\\error.log", true))
            {
                errorWriter.WriteLine("Message: " + ee.Message);
                errorWriter.WriteLine("Stack Trace: " + ee.StackTrace);
            }

            Add_Error_To_Log("Unknown error converting office files to PDF", itemKey, Resource.METS_Type_String, Resource.BuilderLogId);
            Add_Error_To_Log(ee.Message, itemKey, Resource.METS_Type_String, Resource.BuilderLogId);
        }
    }

    // Preprocess each PDF (the listing is taken after the office conversion so new PDFs are included)
    bool canCreatePdfThumbnails = !String.IsNullOrEmpty(ghostscript_executable) && !String.IsNullOrEmpty(imagemagick_executable);
    foreach (string thisPdf in Directory.GetFiles(resourceFolder, "*.pdf"))
    {
        string fileName = Path.GetFileNameWithoutExtension(thisPdf);

        // Does the full text exist for this item?
        string pdfTextFile = Path.Combine(resourceFolder, fileName + "_pdf.txt");
        if (!File.Exists(pdfTextFile))
        {
            PDF_Tools.Extract_Text(thisPdf, pdfTextFile);
        }

        // Does the thumbnail exist for this item?
        if (canCreatePdfThumbnails)
        {
            string pdfThumbnail = Path.Combine(resourceFolder, fileName + "thm.jpg");
            if (!File.Exists(pdfThumbnail))
            {
                PDF_Tools.Create_Thumbnail(resourceFolder, thisPdf, pdfThumbnail, ghostscript_executable, imagemagick_executable);
            }
        }
    }

    // Preprocess each HTML file for the text
    string staticPageName = bibID + "_" + vid + ".HTML";
    foreach (string thisHtml in Directory.GetFiles(resourceFolder, "*.htm*"))
    {
        string htmlName = Path.GetFileName(thisHtml);

        // Exclude QC_Error.html, and don't pull text for the static page either.
        // Ordinal comparison keeps the match independent of the current culture's casing rules.
        if (String.Equals(htmlName, "QC_ERROR.HTML", StringComparison.OrdinalIgnoreCase) ||
            String.Equals(htmlName, staticPageName, StringComparison.OrdinalIgnoreCase))
        {
            continue;
        }

        // Does the full text exist for this item?
        string htmlTextFile = Path.Combine(resourceFolder, htmlName.Replace(".", "_") + ".txt");
        if (!File.Exists(htmlTextFile))
        {
            HTML_XML_Text_Extractor.Extract_Text(thisHtml, htmlTextFile);
        }
    }

    // Preprocess each XML file for the text
    foreach (string thisXml in Directory.GetFiles(resourceFolder, "*.xml"))
    {
        string xmlName = Path.GetFileName(thisXml);

        // Skip the METS files and the other known metadata files
        bool isMetadataFile =
            (xmlName.IndexOf(".METS", StringComparison.OrdinalIgnoreCase) >= 0) ||
            String.Equals(xmlName, "DOC.XML", StringComparison.OrdinalIgnoreCase) ||
            String.Equals(xmlName, "CITATION_METS.XML", StringComparison.OrdinalIgnoreCase) ||
            String.Equals(xmlName, "MARC.XML", StringComparison.OrdinalIgnoreCase);
        if (isMetadataFile)
        {
            continue;
        }

        // Does the full text exist for this item?
        string xmlTextFile = Path.Combine(resourceFolder, xmlName.Replace(".", "_") + ".txt");
        if (!File.Exists(xmlTextFile))
        {
            HTML_XML_Text_Extractor.Extract_Text(thisXml, xmlTextFile);
        }
    }

    // Run OCR for any TIFF files that do not have any corresponding TXT files
    if (!String.IsNullOrEmpty(SobekCM_Library_Settings.OCR_Command_Prompt))
    {
        foreach (string thisTiffFile in Directory.GetFiles(resourceFolder, "*.tif"))
        {
            string text_file = Path.Combine(resourceFolder, Path.GetFileNameWithoutExtension(thisTiffFile) + ".txt");
            if (File.Exists(text_file))
            {
                continue;
            }

            try
            {
                // NOTE(review): the whole formatted command is handed over as the process file name,
                // so a command prompt setting that includes arguments may fail to launch - confirm
                string command = String.Format(SobekCM_Library_Settings.OCR_Command_Prompt, thisTiffFile, text_file);
                using (Process ocrProcess = new Process { StartInfo = { FileName = command } })
                {
                    ocrProcess.Start();
                    ocrProcess.WaitForExit();
                }
            }
            catch (Exception)
            {
                // A failed OCR run should not stop the rest of the pre-processing
                Add_Error_To_Log("Error launching OCR on (" + Path.GetFileName(thisTiffFile) + ")", itemKey, Resource.METS_Type_String, Resource.BuilderLogId);
            }
        }
    }

    // Clean any incoming text files first and look for SSN in text
    string ssn_text_file_name = String.Empty;
    string ssn_match = String.Empty;
    try
    {
        foreach (string textFile in Directory.GetFiles(resourceFolder, "*.txt"))
        {
            // Clean the text file first
            Text_Cleaner.Clean_Text_File(textFile);

            // Only the first possible SSN is reported, but every file is still cleaned
            if (ssn_match.Length == 0)
            {
                // Guard against a null result so the length checks below cannot fail
                ssn_match = Text_Cleaner.Has_SSN(textFile) ?? String.Empty;
                if (ssn_match.Length > 0)
                {
                    ssn_text_file_name = Path.GetFileName(textFile);
                }
            }
        }
    }
    catch (Exception ee)
    {
        // Still best-effort, but leave a trace rather than silently dropping the failure
        Add_Error_To_Log("Error cleaning text files or checking for SSN: " + ee.Message, itemKey, Resource.METS_Type_String, Resource.BuilderLogId);
    }

    // Send a database email if there appears to have been a SSN
    if (ssn_match.Length > 0)
    {
        if (!String.IsNullOrEmpty(SobekCM_Library_Settings.Privacy_Email_Address))
        {
            SobekCM_Database.Send_Database_Email(SobekCM_Library_Settings.Privacy_Email_Address, "Possible Social Security Number Located", "A string which appeared to be a possible social security number was found while bulk loading or post-processing an item.\n\nThe SSN was found in package " + bibID + ":" + vid + " in file '" + ssn_text_file_name + "'.\n\nThe text which may be a SSN is '" + ssn_match + "'.\n\nPlease review this item and remove any private information which should not be on the web server.", false, false, -1, -1);
        }

        Add_NonError_To_Log("Possible SSN Located (" + ssn_text_file_name + ")", "Privacy Warning", itemKey, Resource.METS_Type_String, Resource.BuilderLogId);
    }

    // Are there images that need to be processed here?  (This is the final stage, so returning early is safe)
    if (String.IsNullOrEmpty(imagemagick_executable))
    {
        return;
    }

    // Get the list of jpeg and tiff files, and only continue if some exist
    string[] jpeg_files = Directory.GetFiles(resourceFolder, "*.jpg");
    string[] tiff_files = Directory.GetFiles(resourceFolder, "*.tif");
    if ((jpeg_files.Length == 0) && (tiff_files.Length == 0))
    {
        return;
    }

    // Create the image process object for creating
    Image_Derivative_Creation_Processor imageProcessor = new Image_Derivative_Creation_Processor(imagemagick_executable, Application.StartupPath + "\\Kakadu", true, true, SobekCM_Library_Settings.JPEG_Width, SobekCM_Library_Settings.JPEG_Height, false, SobekCM_Library_Settings.Thumbnail_Width, SobekCM_Library_Settings.Thumbnail_Height);
    imageProcessor.New_Task_String += imageProcessor_New_Task_String;
    imageProcessor.Error_Encountered += imageProcessor_Error_Encountered;

    // Step through the JPEGS and ensure they have thumbnails (TIFF generation below makes them as well)
    foreach (string jpegFile in jpeg_files)
    {
        // Thumbnails and QC images are derivatives themselves, so they never get a thumbnail of their own
        string jpegName = Path.GetFileName(jpegFile);
        if ((jpegName.IndexOf("THM.JPG", StringComparison.OrdinalIgnoreCase) >= 0) ||
            (jpegName.IndexOf(".QC.JPG", StringComparison.OrdinalIgnoreCase) >= 0))
        {
            continue;
        }

        string jpegThumbnail = Path.Combine(resourceFolder, Path.GetFileNameWithoutExtension(jpegFile) + "thm.jpg");
        if (!File.Exists(jpegThumbnail))
        {
            imageProcessor.ImageMagick_Create_JPEG(jpegFile, jpegThumbnail, SobekCM_Library_Settings.Thumbnail_Width, SobekCM_Library_Settings.Thumbnail_Height, Resource.BuilderLogId, itemKey);
        }
    }

    // Step through any TIFFs as well
    if (tiff_files.Length > 0)
    {
        // Do a complete image derivative creation process on these TIFF files
        imageProcessor.Process(resourceFolder, bibID, vid, tiff_files, Resource.BuilderLogId);

        // Since we are actually creating page images here (most likely) try to add
        // them to the package as well
        foreach (string thisTiffFile in tiff_files)
        {
            // Get every file which shares the base name of this tiff file
            string tiffFileName = Path.GetFileNameWithoutExtension(thisTiffFile);
            string[] matching_files = Directory.GetFiles(resourceFolder, tiffFileName + ".*");

            foreach (string derivativeFile in matching_files)
            {
                // If this is a page image type file, add it
                string extension = Path.GetExtension(derivativeFile).ToUpperInvariant().Replace(".", "");
                if (SobekCM_Library_Settings.PAGE_IMAGE_EXTENSIONS.Contains(extension))
                {
                    NewImageFiles.Add(Path.GetFileName(derivativeFile));
                }
            }
        }
    }
}

/// <summary> Writes the log entry matching a non-zero result code from an office-to-PDF conversion </summary>
/// <param name="ConversionError"> Code returned by the converter; only codes 1 through 4 produce a log entry </param>
/// <param name="SourceType"> Label for the source document type, used within the logged message </param>
/// <param name="Resource"> Incoming digital resource the converted file belongs to </param>
private void Log_Office_Conversion_Error(int ConversionError, string SourceType, Incoming_Digital_Resource Resource)
{
    string reason;
    switch (ConversionError)
    {
        case 1:
            reason = "Can't open input file";
            break;

        case 2:
            reason = "Can't create output file";
            break;

        case 3:
            reason = "Converting failed";
            break;

        case 4:
            reason = "MS Office not installed";
            break;

        default:
            // Any other code is not reported
            return;
    }

    Add_Error_To_Log("Error converting " + SourceType + " to PDF: " + reason, Resource.BibID + ":" + Resource.VID, Resource.METS_Type_String, Resource.BuilderLogId);
}
/// <summary> Creates all the image derivative files from original jpeg and tiff files </summary>
/// <param name="Resource"> Incoming digital resource object </param>
/// <returns> TRUE if processing can continue, FALSE if a critical error occurred which should stop all processing </returns>
/// <remarks> Nothing is done when no ImageMagick executable is configured or when the resource folder
/// holds neither JPEG nor TIFF files.  JPEGs only receive a missing thumbnail, while TIFFs go through the
/// complete derivative creation and the resulting page image files are added to the resource's NewImageFiles. </remarks>
public override bool DoWork(Incoming_Digital_Resource Resource)
{
    // NOTE(review): returnValue is a field reset here and returned at the end; presumably the
    // imageProcessor_Error_Encountered handler is what lowers it - confirm
    returnValue = true;

    string resourceFolder = Resource.Resource_Folder;
    string bibID = Resource.BibID;
    string vid = Resource.VID;

    // Are there images that need to be processed here?
    string imagemagick_executable = MultiInstance_Builder_Settings.ImageMagick_Executable;
    if (String.IsNullOrEmpty(imagemagick_executable))
    {
        return returnValue;
    }

    // Get the list of jpeg and tiff files, and only continue if some exist
    string[] jpeg_files = Directory.GetFiles(resourceFolder, "*.jpg");
    string[] tiff_files = Directory.GetFiles(resourceFolder, "*.tif");
    if ((jpeg_files.Length == 0) && (tiff_files.Length == 0))
    {
        return returnValue;
    }

    // NOTE(review): GetCallingAssembly resolves to the assembly of whatever invoked DoWork, so the
    // Kakadu folder is expected beside that assembly - confirm this is the intended location
    string startupPath = Path.GetDirectoryName(Assembly.GetCallingAssembly().Location);
    if (startupPath == null)
    {
        OnError("Unable to find the startup path in CreateImageDerivativesModule!", String.Empty, String.Empty, -1);
        return false;
    }
    string kakadu_path = Path.Combine(startupPath, "Kakadu");

    // Create the image process object for creating
    Image_Derivative_Creation_Processor imageProcessor = new Image_Derivative_Creation_Processor(imagemagick_executable, kakadu_path, true, true, Settings.Resources.JPEG_Width, Settings.Resources.JPEG_Height, false, Settings.Resources.Thumbnail_Width, Settings.Resources.Thumbnail_Height, null);
    imageProcessor.New_Task_String += imageProcessor_New_Task_String;
    imageProcessor.Error_Encountered += imageProcessor_Error_Encountered;

    // Step through the JPEGS and ensure they have thumbnails (TIFF generation below makes them as well)
    string itemKey = Resource.BibID + ":" + Resource.VID;
    foreach (string jpegFile in jpeg_files)
    {
        // Thumbnails and QC images are derivatives themselves, so they never get a thumbnail of their own.
        // Ordinal comparison keeps the match independent of the current culture's casing rules.
        string jpegName = Path.GetFileName(jpegFile);
        if ((jpegName.IndexOf("THM.JPG", StringComparison.OrdinalIgnoreCase) >= 0) ||
            (jpegName.IndexOf(".QC.JPG", StringComparison.OrdinalIgnoreCase) >= 0))
        {
            continue;
        }

        // Only the final extension is removed, so a base name which itself contains ".jpg" is kept whole
        string jpegThumbnail = Path.Combine(resourceFolder, Path.GetFileNameWithoutExtension(jpegFile) + "thm.jpg");
        if (!File.Exists(jpegThumbnail))
        {
            imageProcessor.ImageMagick_Create_JPEG(jpegFile, jpegThumbnail, Settings.Resources.Thumbnail_Width, Settings.Resources.Thumbnail_Height, Resource.BuilderLogId, itemKey);
        }
    }

    // Step through any TIFFs as well
    if (tiff_files.Length > 0)
    {
        // Do a complete image derivative creation process on these TIFF files
        imageProcessor.Process(resourceFolder, bibID, vid, tiff_files, Resource.BuilderLogId);

        // Since we are actually creating page images here (most likely) try to add
        // them to the package as well
        foreach (string thisTiffFile in tiff_files)
        {
            // Get every file which shares the base name of this tiff file
            string tiffFileName = Path.GetFileNameWithoutExtension(thisTiffFile);
            string[] matching_files = Directory.GetFiles(resourceFolder, tiffFileName + ".*");

            foreach (string derivativeFile in matching_files)
            {
                // If this is a page image type file, add it
                string extension = Path.GetExtension(derivativeFile).ToUpperInvariant().Replace(".", "");
                if (Settings.System.Page_Image_Extensions.Contains(extension))
                {
                    Resource.NewImageFiles.Add(Path.GetFileName(derivativeFile));
                }
            }
        }
    }

    return returnValue;
}