Example #1
0
        /// <summary>
        /// Collects the text of every page of a PDF into a single string.
        /// </summary>
        /// <param name="pdfFilePath">Path of the PDF file to read.</param>
        /// <returns>
        /// The non-empty page texts, each followed by a line break. If reading fails part-way,
        /// the exception is written to the debug output and whatever text was gathered so far
        /// (possibly an empty string) is returned.
        /// </returns>
        private static string ExtractText(string pdfFilePath)
        {
            var extracted = new StringBuilder();

            try
            {
                using (var pdf = new PdfReader(pdfFilePath))
                {
                    var contentParser = new iTextSharp.text.pdf.parser.PdfReaderContentParser(pdf);

                    // PDF page numbers are 1-based.
                    for (int pageNumber = 1; pageNumber <= pdf.NumberOfPages; pageNumber++)
                    {
                        iTextSharp.text.pdf.parser.ITextExtractionStrategy pageStrategy =
                            contentParser.ProcessContent(pageNumber, new iTextSharp.text.pdf.parser.SimpleTextExtractionStrategy());
                        string pageText = pageStrategy.GetResultantText();

                        // Pages that yield no text contribute nothing, not even a blank line.
                        if (String.IsNullOrEmpty(pageText))
                        {
                            continue;
                        }

                        extracted.AppendLine(pageText);
                    }
                }
            }
            catch (Exception ex)
            {
                // Best effort: log and fall through with whatever was extracted before the failure.
                Debug.WriteLine(ex.ToString());
            }

            return extracted.ToString();
        }
Example #2
0
        /// <summary>
        /// Claims the next file from the <c>source</c> directory, produces a PDF for it (running OCR
        /// when the input is an image or a PDF without a usable text layer), then writes two preview
        /// images, the extracted text and the original file to the locations given by the
        /// <c>Document</c> created for it.
        /// </summary>
        /// <param name="runnerId">Identifier reported through <c>progress</c> together with the claimed file name.</param>
        /// <returns>
        /// Always <c>false</c>, whether or not a file was processed. Exceptions are written to the
        /// console and not rethrown.
        /// </returns>
        public async Task <bool> ProcessNextFile(int runnerId)
        {
            FileInfo      nextFile;
            DirectoryInfo tempDir = new DirectoryInfo(Path.Combine(working.FullName, Guid.NewGuid().ToString()));

            tempDir.Create();
            try
            {
                // Only one caller at a time may pick and move a file out of the source directory.
                await fetchLock.WaitAsync();

                try
                {
                    nextFile = source.GetFiles().FirstOrDefault();
                    if (nextFile != null)
                    {
                        progress.Report(new Tuple <int, string>(runnerId, nextFile.Name));
                        var oldPath     = nextFile.FullName;
                        var workingPath = Path.Combine(tempDir.FullName, nextFile.Name);
                        nextFile.MoveTo(workingPath);
                        while (File.Exists(oldPath))
                        {
                            //wait for the delete to go through
                            await Task.Delay(100);
                        }
                        nextFile = new FileInfo(workingPath);
                    }
                }
                finally
                {
                    fetchLock.Release();
                }

                if (nextFile != null)
                {
                    Document document = Document.CreateNewFromRootDirectory(processed, nextFile.NameWithoutExtension());

                    if (nextFile.Extension.Equals(".pdf", StringComparison.InvariantCultureIgnoreCase))
                    {
                        bool hasText = false;

                        //if pdf, open the pdf and determine if it has a text layer
                        using (PdfReader reader = new PdfReader(nextFile.FullName))
                        {
                            iTextSharp.text.pdf.parser.PdfReaderContentParser  parser = new iTextSharp.text.pdf.parser.PdfReaderContentParser(reader);
                            iTextSharp.text.pdf.parser.ITextExtractionStrategy strategy;
                            for (int i = 1; i <= reader.NumberOfPages; i++)
                            {
                                strategy = parser.ProcessContent(i, new iTextSharp.text.pdf.parser.SimpleTextExtractionStrategy());
                                string text = strategy.GetResultantText();
                                if (!text.IsNullOrWhiteSpaceOrGibberish())
                                {
                                    // One page with real text is enough to treat the PDF as already searchable.
                                    hasText = true;
                                    break;
                                }
                            }
                        }

                        if (!hasText)//If it does not have a text layer, convert to images with ghostscript
                        {
                            RunSilentProcess(ghostScriptPath, tempDir.FullName, $"-q -dNOPAUSE -dBATCH -sDEVICE=pngalpha -dUseCropBox -sOutputFile=output-%d.png -r600 \"{nextFile.Name}\"");

                            List <string> pages = new List <string>();

                            // GetFiles does not guarantee any ordering, and a name-based ordering puts
                            // "output-10.png" before "output-2.png". Sort on the numeric page index so the
                            // pages are OCRed and merged in document order. Names without a parsable index
                            // (not expected) are kept at the end.
                            var pageImageFiles = tempDir.GetFiles("output-*.png")
                                                 .OrderBy(candidate =>
                                                 {
                                                     int    pageIndex;
                                                     string stem = Path.GetFileNameWithoutExtension(candidate.Name);
                                                     return int.TryParse(stem.Substring("output-".Length), out pageIndex) ? pageIndex : int.MaxValue;
                                                 })
                                                 .ToList();

                            foreach (var pageImageFile in pageImageFiles)
                            {
                                var imageFile = pageImageFile;
                                //verify image size is reasonable
                                try
                                {
                                    bool converted = false;
                                    using (var pageImage = Bitmap.FromFile(pageImageFile.FullName))
                                    {
                                        //5100 width for 600 dpi 8.5 inch
                                        if (pageImage.Size.Width > 6000 || pageImage.Size.Height > 12000)
                                        {
                                            // Scale down to 5100 pixels wide, keeping the aspect ratio.
                                            using (var outputImage = new Bitmap(5100, (int)(((double)pageImage.Size.Height / pageImage.Size.Width) * 5100), PixelFormat.Format24bppRgb))
                                            {
                                                outputImage.SetResolution(600, 600);
                                                using (var g = Graphics.FromImage(outputImage))
                                                {
                                                    g.DrawImage(pageImage, 0, 0, outputImage.Width, outputImage.Height);
                                                }
                                                imageFile = new FileInfo(Path.Combine(pageImageFile.Directory.FullName, pageImageFile.NameWithoutExtension() + "F.png"));
                                                outputImage.Save(imageFile.FullName, ImageFormat.Png);
                                                converted = true;
                                            }
                                        }
                                    }
                                    if (converted)
                                    {
                                        // The resized copy replaces the oversized original as OCR input.
                                        pageImageFile.Delete();
                                    }
                                }
                                catch (Exception ex)
                                {
                                    // Resizing is best effort; OCR still runs on whatever imageFile points at.
                                    Debug.WriteLine(ex.ToString());
                                }

                                //run tesseract.exe source output(without extension) -l eng PDF to get PDF file
                                RunSilentProcess(tesseractPath, tempDir.FullName, $"{imageFile.Name} {pageImageFile.NameWithoutExtension()} --oem 1 -l eng PDF");

                                pages.Add(pageImageFile.NameWithoutExtension() + ".pdf");
                                imageFile.Delete();
                            }

                            if (pages.Count == 1)
                            {
                                MoveAndOverwrite(Path.Combine(tempDir.FullName, pages[0]), document.ProcessedFile.FullName);
                            }
                            else if (pages.Count > 1)
                            {
                                //if multiple pages use ghostscript to combine to one pdf
                                string inputFilesCommandLine = String.Join(" ", pages);
                                RunSilentProcess(ghostScriptPath, tempDir.FullName, $"-q -dNOPAUSE -dBATCH -sDEVICE=pdfwrite -dPDFSETTINGS=/prepress -sOutputFile=output.pdf {inputFilesCommandLine}");

                                MoveAndOverwrite(Path.Combine(tempDir.FullName, "output.pdf"), document.ProcessedFile.FullName);
                                foreach (var page in pages)
                                {
                                    File.Delete(Path.Combine(tempDir.FullName, page));
                                }
                            }
                        }
                        else//If it has a text layer, leave it as is.
                        {
                            File.Copy(nextFile.FullName, document.ProcessedFile.FullName, true);
                        }
                    }
                    else//assume image
                    {
                        //run tesseract.exe source output(without extension) -l eng PDF to get PDF file
                        RunSilentProcess(tesseractPath, tempDir.FullName, $"\"{nextFile.Name}\" \"{nextFile.NameWithoutExtension()}\" --oem 1 -l eng PDF");

                        MoveAndOverwrite(Path.Combine(tempDir.FullName, nextFile.NameWithoutExtension() + ".pdf"), document.ProcessedFile.FullName);
                    }

                    //make previews
                    RenderPDFToJpegFile(document.ProcessedFile.FullName, 50, document.Preview50File.FullName);
                    RenderPDFToJpegFile(document.ProcessedFile.FullName, 300, document.Preview300File.FullName);

                    //output text
                    File.WriteAllText(document.PreviewTextFile.FullName, ExtractText(document.ProcessedFile.FullName));

                    //save original file
                    nextFile.MoveTo(document.OriginalFile.FullName);
                }

                // Only reached on success: after a failure the temp directory is left in place,
                // since it may still hold the file that was moved out of the source directory.
                tempDir.Delete(true);
            }
            catch (Exception ex)
            {
                Console.WriteLine(ex.ToString());
            }
            return(false);
        }
Example #3
0
        /// <summary>
        /// Merges the body files of a brief into a single PDF written to <paramref name="src"/>,
        /// inserting blank pages after divider pages, at the end of certain file types, and (for
        /// saddle stitch) at the end of the last file until the total page count is a multiple of four.
        /// For perfect bind with dividers, a second pass over the appendix inserts a blank page after
        /// any run of blank pages that ends on an odd page number.
        /// </summary>
        /// <param name="srcFiles">Candidate files; covers, foldouts, sideways pages and unrecognized files are skipped.</param>
        /// <param name="src">Path of the combined PDF to create (overwritten if it exists).</param>
        /// <param name="bind">Binding type that selects which padding rules apply.</param>
        /// <returns>
        /// <c>true</c> on success; <c>false</c> if no eligible files remain after filtering or if any
        /// exception occurs (the exception is written to the debug output).
        /// </returns>
        public static bool CombineBriefPages_AddingBlanks(
            List <CockleFilePdf> srcFiles, string src, TypeOfBindEnum bind)
        {
            // new attempt Dec 28, to account for divider pages (or any page without text)
            // text has to start on odd-numbered page, if followed by divider page

            // first, add 2 pages for each divider page, to account for front and back.
            // then, when everything is together, cycle through doc to add extra dividers...
            // ... so that text always falls on odd-numbered page

            // should work for both Saddle Stitch and Perfect Bind

            // create new list without cover, ordered by rank
            List <CockleFilePdf> files = new List <CockleFilePdf>(
                srcFiles
                .Where(f => f.FileType != SourceFileTypeEnum.Cover)
                .Where(f => f.FileType != SourceFileTypeEnum.InsideCv)
                .Where(f => f.FileType != SourceFileTypeEnum.SidewaysPage)
                .Where(f => f.FileType != SourceFileTypeEnum.Brief_Foldout)
                .Where(f => f.FileType != SourceFileTypeEnum.Brief_ZFold)
                .Where(f => f.FileType != SourceFileTypeEnum.App_Foldout)
                .Where(f => f.FileType != SourceFileTypeEnum.App_ZFold)
                .Where(f => f.FileType != SourceFileTypeEnum.Unrecognized)
                .OrderBy(f => f.Rank));

            if (files.Count < 1)
            {
                return(false);
            }

            int  pageCount         = 0;     // pages already added to the combined document
            bool hasDividers       = false;
            bool firstAppFileFound = false;
            int  firstPageOfApp    = -1;    // 1-based page in the combined document; -1 until an App_File is seen

            try
            {
                using (var stream = new System.IO.FileStream(src, System.IO.FileMode.Create))
                {
                    // initiate iTextSharp processes
                    iTextSharp.text.Document    pdfdoc  = new iTextSharp.text.Document(iTextSharp.text.PageSize.LETTER);
                    iTextSharp.text.pdf.PdfCopy pdfcopy = new iTextSharp.text.pdf.PdfCopy(pdfdoc, stream);
                    pdfdoc.Open();

                    // merge pdfs in folder
                    for (int i = 0; i < files.Count; i++)
                    {
                        CockleFilePdf file = files[i];

                        // read file
                        iTextSharp.text.pdf.PdfReader reader = new iTextSharp.text.pdf.PdfReader(file.FullName);

                        // The stamper is used for InsertPage calls against 'reader'; the modified reader
                        // is what gets copied into the output further down.
                        // NOTE(review): the stamper shares the output stream with pdfcopy and is never
                        // closed, and 'reader' is never closed either - confirm this is intentional.
                        iTextSharp.text.pdf.PdfStamper stamper = new iTextSharp.text.pdf.PdfStamper(reader, stream);

                        // look for divider pages here, add blank if exists
                        List <int> dividerPages = new List <int>();
                        iTextSharp.text.pdf.parser.PdfReaderContentParser parser = new iTextSharp.text.pdf.parser.PdfReaderContentParser(reader);
                        for (int j = 1; j <= reader.NumberOfPages; j++)
                        {
                            var    extractedText = parser.ProcessContent(j, new iTextSharp.text.pdf.parser.SimpleTextExtractionStrategy());
                            string textFromPage  = extractedText.GetResultantText();

                            // A divider is either a page with no non-whitespace text, or a page with
                            // fewer than 50 characters that mentions "APPENDIX"/"Appendix".
                            bool isBlank             = !System.Text.RegularExpressions.Regex.IsMatch(textFromPage, @"\S");
                            bool isShortAppendixPage = textFromPage.Length < 50 &&
                                                       System.Text.RegularExpressions.Regex.IsMatch(textFromPage, @"A(PPENDIX|ppendix)");

                            if (isBlank || isShortAppendixPage)
                            {
                                dividerPages.Add(j);
                            }
                        }
                        if (dividerPages.Count > 0)
                        {
                            hasDividers = true;

                            int inserted = 0; // adjust for total page number change
                            foreach (int page in dividerPages)
                            {
                                stamper.InsertPage(page + inserted, reader.GetPageSizeWithRotation(1));
                                inserted++;
                            }
                        }

                        // add blank page if needed to make even number
                        if (file.FileType == SourceFileTypeEnum.Index ||
                            file.FileType == SourceFileTypeEnum.Brief ||
                            file.FileType == SourceFileTypeEnum.App_Index ||
                            file.FileType == SourceFileTypeEnum.Motion ||
                            file.FileType == SourceFileTypeEnum.Divider_Page)
                        {
                            file.AssignNeedsBlankPage(files, reader.NumberOfPages);
                            if (file.NeedsBlankPage)
                            {
                                stamper.InsertPage(reader.NumberOfPages + 1, reader.GetPageSizeWithRotation(1));
                            }
                        }

                        // with last document in 'files', add extra pages to make divisible by 4
                        if (bind == TypeOfBindEnum.SaddleStitch && i == files.Count - 1)
                        {
                            while ((pageCount + reader.NumberOfPages) % 4 != 0)
                            {
                                stamper.InsertPage(reader.NumberOfPages + 1, reader.GetPageSizeWithRotation(1));
                            }
                        }

                        // get first page of first app file
                        if (!firstAppFileFound && file.FileType == SourceFileTypeEnum.App_File)
                        {
                            firstAppFileFound = true;
                            firstPageOfApp    = pageCount + 1;
                        }

                        // add document to 'src'
                        pdfcopy.AddDocument(new iTextSharp.text.pdf.PdfReader(reader));
                        pageCount += reader.NumberOfPages;
                    }

                    pdfcopy.Close();
                    pdfdoc.CloseDocument();
                }

                // final cycle, if dividers, to make sure text starts on odd-sided pages.
                // Skipped when no appendix file was found: firstPageOfApp would still be -1, which is
                // not a valid page number to start scanning from, and there is no appendix to adjust.
                if (bind == TypeOfBindEnum.PerfectBind && hasDividers && firstAppFileFound)
                {
                    string dest = (System.IO.Path.GetDirectoryName(src) + @"\temp " + DateTime.Now.ToString("ddMMyyyyhhmmssffff"));

                    using (var stream = new System.IO.FileStream(dest, System.IO.FileMode.Create))
                    {
                        iTextSharp.text.pdf.PdfReader  reader  = new iTextSharp.text.pdf.PdfReader(src);
                        iTextSharp.text.pdf.PdfStamper stamper = new iTextSharp.text.pdf.PdfStamper(reader, stream);

                        // get all blank pages in appendix
                        iTextSharp.text.pdf.parser.PdfReaderContentParser parser = new iTextSharp.text.pdf.parser.PdfReaderContentParser(reader);
                        List <int> blankRun = new List <int>(); // consecutive blank pages seen so far
                        for (int x = firstPageOfApp; x <= reader.NumberOfPages; x++)
                        {
                            var    extractedText = parser.ProcessContent(x, new iTextSharp.text.pdf.parser.SimpleTextExtractionStrategy());
                            string textFromPage  = extractedText.GetResultantText();

                            // find blank pages and cluster into blankRun
                            if (!System.Text.RegularExpressions.Regex.IsMatch(textFromPage, @"\S"))
                            {
                                if (blankRun.Count == 0 || blankRun.Contains(x - 1))
                                {
                                    blankRun.Add(x);
                                }
                            }
                            else
                            {
                                // first page with text after a run of blanks: if the run ended on an
                                // odd page, this text page is on an even page, so push it forward by one
                                if (blankRun.Count > 0 && blankRun.Last() % 2 == 1)
                                {
                                    stamper.InsertPage(blankRun.Last() + 1, reader.GetPageSizeWithRotation(1));
                                }
                                blankRun.Clear();
                            }
                        }
                        stamper.Close();
                        reader.Close();
                    }

                    // replace the combined file with the adjusted copy
                    System.IO.File.Delete(src);
                    System.IO.File.Move(dest, src);
                }
            }
            catch (Exception excpt)
            {
                System.Diagnostics.Debug.WriteLine(excpt); return(false);
            }
            return(true);
        }