/// <summary>
/// Extracts the text layer of every page of the given PDF, appending each
/// page's text (when non-empty) as its own line block in the result.
/// Extraction is best-effort: any exception is logged and whatever text was
/// gathered up to that point is returned.
/// </summary>
/// <param name="pdfFilePath">Full path to the PDF file to read.</param>
/// <returns>The concatenated page text, or an empty string if nothing could be extracted.</returns>
private static string ExtractText(string pdfFilePath)
{
    var result = new StringBuilder();
    try
    {
        using (PdfReader reader = new PdfReader(pdfFilePath))
        {
            var parser = new iTextSharp.text.pdf.parser.PdfReaderContentParser(reader);
            // iTextSharp page numbers are 1-based.
            for (int pageNumber = 1; pageNumber <= reader.NumberOfPages; pageNumber++)
            {
                iTextSharp.text.pdf.parser.ITextExtractionStrategy pageStrategy =
                    parser.ProcessContent(pageNumber, new iTextSharp.text.pdf.parser.SimpleTextExtractionStrategy());
                string pageText = pageStrategy.GetResultantText();
                if (String.IsNullOrEmpty(pageText))
                {
                    continue; // page has no text layer (or is blank) — skip it
                }
                result.AppendLine(pageText);
            }
        }
    }
    catch (Exception ex)
    {
        // Best-effort: swallow and log so callers still get partial text.
        Debug.WriteLine(ex.ToString());
    }
    return result.ToString();
}
/// <summary>
/// Claims the next file from the <c>source</c> directory, OCRs it into a searchable
/// PDF (via Ghostscript + Tesseract when needed), writes previews and a text dump,
/// and archives the original. Each runner works in its own GUID-named temp directory.
/// NOTE(review): this method always returns false, even after successfully processing
/// a file — confirm whether callers expect true on success.
/// </summary>
/// <param name="runnerId">Identifier of the calling worker, used only for progress reporting.</param>
public async Task <bool> ProcessNextFile(int runnerId)
{
    FileInfo nextFile;
    // Per-run scratch directory; GUID name avoids collisions between concurrent runners.
    DirectoryInfo tempDir = new DirectoryInfo(Path.Combine(working.FullName, Guid.NewGuid().ToString()));
    tempDir.Create();
    try
    {
        // Serialize the "claim next file" step so two runners cannot grab the same file.
        await fetchLock.WaitAsync();
        try
        {
            nextFile = source.GetFiles().FirstOrDefault();
            if (nextFile != null)
            {
                progress.Report(new Tuple <int, string>(runnerId, nextFile.Name));
                var oldPath = nextFile.FullName;
                var workingPath = Path.Combine(tempDir.FullName, nextFile.Name);
                // Move the file into our temp dir to claim it.
                nextFile.MoveTo(workingPath);
                // NOTE(review): MoveTo is synchronous, so this wait looks redundant —
                // presumably it guards against slow network-share/antivirus deletes; confirm.
                while (File.Exists(oldPath))
                {
                    //wait for the delete to go through
                    await Task.Delay(100);
                }
                nextFile = new FileInfo(workingPath);
            }
        }
        finally
        {
            fetchLock.Release();
        }
        if (nextFile != null)
        {
            Document document = Document.CreateNewFromRootDirectory(processed, nextFile.NameWithoutExtension());
            if (nextFile.Extension.Equals(".pdf", StringComparison.InvariantCultureIgnoreCase))
            {
                bool hasText = false;
                //if pdf, open the pdf and determine if it has a text layer
                using (PdfReader reader = new PdfReader(nextFile.FullName))
                {
                    iTextSharp.text.pdf.parser.PdfReaderContentParser parser = new iTextSharp.text.pdf.parser.PdfReaderContentParser(reader);
                    iTextSharp.text.pdf.parser.ITextExtractionStrategy strategy;
                    // One non-gibberish page is enough to treat the PDF as already searchable.
                    for (int i = 1; i <= reader.NumberOfPages; i++)
                    {
                        //byte[] pageContent = reader.GetPageContent(i);
                        strategy = parser.ProcessContent(i, new iTextSharp.text.pdf.parser.SimpleTextExtractionStrategy());
                        string text = strategy.GetResultantText();
                        if (!text.IsNullOrWhiteSpaceOrGibberish())
                        {
                            hasText = true;
                            break;
                        }
                    }
                }
                if (!hasText)//If it does not have a text layer, convert to images with ghostscript
                {
                    // Rasterize every page to output-N.png at 600 dpi for OCR.
                    RunSilentProcess(ghostScriptPath, tempDir.FullName, $"-q -dNOPAUSE -dBATCH -sDEVICE=pngalpha -dUseCropBox -sOutputFile=output-%d.png -r600 \"{nextFile.Name}\"");
                    List <string> pages = new List <string>();
                    foreach (var pageImageFile in tempDir.GetFiles("output-*.png"))
                    {
                        var imageFile = pageImageFile;
                        //verify image size is reasonable
                        try
                        {
                            bool converted = false;
                            using (var i =
                                   Bitmap.FromFile(pageImageFile.FullName))
                            {
                                //5100 width for 600 dpi 8.5 inch
                                // Oversized renders (e.g. odd page boxes) are downscaled to
                                // letter-width at 600 dpi, preserving aspect ratio.
                                if (i.Size.Width > 6000 || i.Size.Height > 12000)
                                {
                                    using (var outputImage = new Bitmap(5100, (int)(((double)i.Size.Height / i.Size.Width) * 5100), PixelFormat.Format24bppRgb))
                                    {
                                        outputImage.SetResolution(600, 600);
                                        using (var g = Graphics.FromImage(outputImage))
                                        {
                                            g.DrawImage(i, 0, 0, outputImage.Width, outputImage.Height);
                                        }
                                        // "F" suffix marks the fixed (resized) copy.
                                        imageFile = new FileInfo(Path.Combine(pageImageFile.Directory.FullName, pageImageFile.NameWithoutExtension() + "F.png"));
                                        outputImage.Save(imageFile.FullName, ImageFormat.Png);
                                        converted = true;
                                    }
                                }
                            }
                            if (converted)
                            {
                                pageImageFile.Delete();
                            }
                        }
                        catch (Exception ex)
                        {
                            // Best-effort resize: on failure, OCR the original render instead.
                            Debug.WriteLine(ex.ToString());
                        }
                        //run tesseract.exe source output(without extension) -l eng PDF to get PDF file
                        RunSilentProcess(tesseractPath, tempDir.FullName, $"{imageFile.Name} {pageImageFile.NameWithoutExtension()} --oem 1 -l eng PDF");
                        pages.Add(pageImageFile.NameWithoutExtension() + ".pdf");
                        imageFile.Delete();
                    }
                    if (pages.Count == 1)
                    {
                        MoveAndOverwrite(Path.Combine(tempDir.FullName, pages[0]), document.ProcessedFile.FullName);
                    }
                    else if (pages.Count > 1)
                    {
                        //if multiple pages use ghostscript to combine to one pdf
                        string inputFilesCommandLine = String.Join(" ", pages);
                        RunSilentProcess(ghostScriptPath, tempDir.FullName, $"-q -dNOPAUSE -dBATCH -sDEVICE=pdfwrite -dPDFSETTINGS=/prepress -sOutputFile=output.pdf {inputFilesCommandLine}");
                        MoveAndOverwrite(Path.Combine(tempDir.FullName, "output.pdf"), document.ProcessedFile.FullName);
                        foreach (var page in pages)
                        {
                            File.Delete(Path.Combine(tempDir.FullName, page));
                        }
                    }
                    // NOTE(review): if Ghostscript produced no pages (pages.Count == 0),
                    // no processed file is written but previews are still attempted below.
                }
                else//If it has a text layer, leave it as is.
                {
                    File.Copy(nextFile.FullName, document.ProcessedFile.FullName, true);
                }
            }
            else//assume image
            {
                //run tesseract.exe source output(without extension) -l eng PDF to get PDF file
                RunSilentProcess(tesseractPath, tempDir.FullName, $"\"{nextFile.Name}\" \"{nextFile.NameWithoutExtension()}\" --oem 1 -l eng PDF");
                MoveAndOverwrite(Path.Combine(tempDir.FullName, nextFile.NameWithoutExtension() + ".pdf"), document.ProcessedFile.FullName);
            }
            //make previews
            RenderPDFToJpegFile(document.ProcessedFile.FullName, 50, document.Preview50File.FullName);
            RenderPDFToJpegFile(document.ProcessedFile.FullName, 300, document.Preview300File.FullName);
            //output text
            File.WriteAllText(document.PreviewTextFile.FullName, ExtractText(document.ProcessedFile.FullName));
            //save original file
            nextFile.MoveTo(document.OriginalFile.FullName);
        }
        tempDir.Delete(true);
    }
    catch (Exception ex)
    {
        // NOTE(review): on failure tempDir is NOT deleted, so the claimed file is
        // stranded in the temp directory — confirm whether that is intentional
        // (e.g. for manual recovery) or a leak.
        Console.WriteLine(ex.ToString());
    }
    return(false);
}
/// <summary>
/// Merges the body files of a brief into a single PDF at <paramref name="src"/>,
/// inserting blank pages so that divider pages get a printed back side and text
/// sections start on odd-numbered pages; pads saddle-stitch jobs to a multiple of 4.
/// Returns true on success, false if there are no usable files or an exception occurs.
/// </summary>
/// <param name="srcFiles">All source PDFs; covers, foldouts, z-folds, sideways and unrecognized files are excluded from the merge.</param>
/// <param name="src">Output path; overwritten with the combined PDF.</param>
/// <param name="bind">Binding type; drives the padding rules (SaddleStitch vs PerfectBind).</param>
public static bool CombineBriefPages_AddingBlanks( List <CockleFilePdf> srcFiles, string src, TypeOfBindEnum bind)
{
    // new attempt Dec 28, to account for divider pages (or any page without text)
    // text has to start on odd-numbered page, if followed by divider page
    // first, add 2 pages for each divider page, to account for front and back.
    // then, when everything is together, cycle through doc to add extra dividers...
    // ... so that text always falls on odd-numbered page
    // should work for both Saddle Stitch and Perfect Bind
    // create new list without cover, ordered by rank
    List <CockleFilePdf> files = new List <CockleFilePdf>( srcFiles
        .Where(f => f.FileType != SourceFileTypeEnum.Cover)
        .Where(f => f.FileType != SourceFileTypeEnum.InsideCv)
        .Where(f => f.FileType != SourceFileTypeEnum.SidewaysPage)
        .Where(f => f.FileType != SourceFileTypeEnum.Brief_Foldout)
        .Where(f => f.FileType != SourceFileTypeEnum.Brief_ZFold)
        .Where(f => f.FileType != SourceFileTypeEnum.App_Foldout)
        .Where(f => f.FileType != SourceFileTypeEnum.App_ZFold)
        .Where(f => f.FileType != SourceFileTypeEnum.Unrecognized)
        .OrderBy(f => f.Rank));
    if (files.Count < 1)
    {
        return(false);
    }
    // what if files.Count == 1 ??? just return ???
    int pageCount = 0;                // running page total of the merged document
    bool hasDividers = false;         // true once any file contained a divider/blank page
    bool firstAppFileFound = false;
    int firstPageOfApp = -1;          // 1-based page where the appendix starts in the merged doc
    try
    {
        using (var stream = new System.IO.FileStream(src, System.IO.FileMode.Create))
        {
            // initiate iTextSharp processes
            iTextSharp.text.Document pdfdoc = new iTextSharp.text.Document(iTextSharp.text.PageSize.LETTER);
            iTextSharp.text.pdf.PdfCopy pdfcopy = new iTextSharp.text.pdf.PdfCopy(pdfdoc, stream);
            pdfdoc.Open();
            // merge pdfs in folder
            CockleFilePdf f;
            for (int i = 0; i < files.Count; i++)
            {
                f = files[i];
                // read file
                iTextSharp.text.pdf.PdfReader reader = new iTextSharp.text.pdf.PdfReader(f.FullName);
                int filePageCount = reader.NumberOfPages;
                // set up pdfstamper
                // NOTE(review): this stamper shares 'stream' with pdfcopy and is never
                // closed — it appears to be used only for InsertPage's in-memory effect
                // on 'reader'; confirm this is intentional.
                iTextSharp.text.pdf.PdfStamper stamper = new iTextSharp.text.pdf.PdfStamper(reader, stream);
                // look for divider pages here, add blank if exists
                List <int> divider_pages = new List <int>();
                iTextSharp.text.pdf.parser.PdfReaderContentParser parser = new iTextSharp.text.pdf.parser.PdfReaderContentParser(reader);
                for (int j = 1; j <= reader.NumberOfPages; j++)
                {
                    iTextSharp.text.pdf.parser.SimpleTextExtractionStrategy extract = new iTextSharp.text.pdf.parser.SimpleTextExtractionStrategy();
                    var extractedText = parser.ProcessContent(j, extract);
                    string textFromPage = extractedText.GetResultantText();
                    int cnt = textFromPage.ToCharArray().Count();
                    int mch_cnt = System.Text.RegularExpressions.Regex.Matches(textFromPage, @"A(PPENDIX|ppendix)").Count;
                    if (System.Text.RegularExpressions.Regex.Matches(textFromPage, @"\S").Count == 0)
                    {
                        // collect blank pages
                        divider_pages.Add(j);
                    }
                    else if (cnt < 50 && mch_cnt > 0)
                    {
                        // collect other divider pages: short pages (< 50 chars) that mention "Appendix"
                        divider_pages.Add(j);
                    }
                }
                if (divider_pages.Count > 0)
                {
                    hasDividers = true;
                    int k = 0; // adjust for total page number change
                    // Insert a blank BEFORE each divider so the divider gets a back side.
                    foreach (int page in divider_pages)
                    {
                        stamper.InsertPage(page + k, reader.GetPageSizeWithRotation(1));
                        filePageCount = reader.NumberOfPages; // NOTE(review): assigned but never read afterwards
                        k++;
                    }
                }
                // add blank page if needed to make even number
                if (files[i].FileType == SourceFileTypeEnum.Index || files[i].FileType == SourceFileTypeEnum.Brief || files[i].FileType == SourceFileTypeEnum.App_Index || files[i].FileType == SourceFileTypeEnum.Motion || files[i].FileType == SourceFileTypeEnum.Divider_Page)
                {
                    f.AssignNeedsBlankPage(files, reader.NumberOfPages);
                    if (f.NeedsBlankPage)
                    {
                        //PdfStamper stamper2 = new PdfStamper(reader, stream);
                        stamper.InsertPage(reader.NumberOfPages + 1, reader.GetPageSizeWithRotation(1));
                        filePageCount = reader.NumberOfPages;
                    }
                }
                // with last document in 'files', add extra pages to make divisible by 4
                // (saddle stitch prints 4 logical pages per sheet)
                if (bind == TypeOfBindEnum.SaddleStitch && i == files.Count - 1)
                {
                    if (bind == TypeOfBindEnum.SaddleStitch && (pageCount + reader.NumberOfPages) % 4 != 0)
                    {
                        //PdfStamper stamper3 = new PdfStamper(reader, stream);
                        while ((pageCount + reader.NumberOfPages) % 4 != 0)
                        {
                            stamper.InsertPage(reader.NumberOfPages + 1, reader.GetPageSizeWithRotation(1));
                        }
                    }
                }
                // get first page of first app file
                if (!firstAppFileFound && files[i].FileType == SourceFileTypeEnum.App_File)
                {
                    firstAppFileFound = true;
                    firstPageOfApp = pageCount + 1;
                }
                // add document to 'src'
                // (fresh PdfReader over 'reader' picks up the pages inserted above)
                pdfcopy.AddDocument(new iTextSharp.text.pdf.PdfReader(reader));
                pageCount += reader.NumberOfPages;
            }
            pdfcopy.Close();
            pdfdoc.CloseDocument();
        }
        // final cycle, if dividers, to make sure text starts on odd-sided pages
        if (bind == TypeOfBindEnum.PerfectBind && hasDividers)
        {
            // Re-stamp into a temp file, then swap it back over 'src'.
            string dest = (System.IO.Path.GetDirectoryName(src) + @"\temp " + DateTime.Now.ToString("ddMMyyyyhhmmssffff"));
            using (var stream = new System.IO.FileStream(dest, System.IO.FileMode.Create))
            {
                iTextSharp.text.pdf.PdfReader reader = new iTextSharp.text.pdf.PdfReader(src);
                iTextSharp.text.pdf.PdfStamper stamper = new iTextSharp.text.pdf.PdfStamper(reader, stream);
                // get all blank pages in appendix
                iTextSharp.text.pdf.parser.PdfReaderContentParser parser = new iTextSharp.text.pdf.parser.PdfReaderContentParser(reader);
                List <List <int> > groupsOfBlanks = new List <List <int> >(); // NOTE(review): never used — dead variable
                List <int> group_list = new List <int>();
                int x;
                // NOTE(review): if firstPageOfApp is still -1 (no App_File found),
                // this loop starts at page -1; iTextSharp page numbers are 1-based — confirm.
                for (x = firstPageOfApp; x <= reader.NumberOfPages; x++)
                {
                    iTextSharp.text.pdf.parser.SimpleTextExtractionStrategy extract = new iTextSharp.text.pdf.parser.SimpleTextExtractionStrategy();
                    var extractedText = parser.ProcessContent(x, extract);
                    string textFromPage = extractedText.GetResultantText();
                    // find blank pages and cluster into group_list
                    if (System.Text.RegularExpressions.Regex.Matches(textFromPage, @"\S").Count == 0)
                    {
                        // capture blank page cluster (??? but what if only 1 page ???)
                        if (group_list.Count == 0 || group_list.Contains(x - 1))
                        {
                            group_list.Add(x);
                        }
                    }
                    else
                    {
                        // find first page after cluster
                        if (group_list.Count > 0)
                        {
                            if (group_list.Last() % 2 == 1)
                            {
                                // add blank page so following text starts on an odd page
                                stamper.InsertPage(group_list.Last() + 1, reader.GetPageSizeWithRotation(1));
                            }
                        }
                        // clear list
                        group_list.Clear();
                    }
                }
                stamper.Close();
                reader.Close();
            }
            System.IO.File.Delete(src);
            System.IO.File.Move(dest, src);
        }
    }
    catch (Exception excpt)
    {
        System.Diagnostics.Debug.WriteLine(excpt);
        return(false);
    }
    return(true);
}