/// <summary>
/// Splits "sample.pdf" using a comma-separated list that names every page
/// number individually ("1,2,3,...,n") and prints the full path of each
/// resulting file.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    string inputFile = "sample.pdf";

    using (var splitter = new DocumentSplitter("demo", "demo"))
    {
        splitter.OptimizeSplittedDocuments = true;

        // Build the range string "1,2,...,n" where n is the page count of the input.
        // Each page number is its own entry in the list.
        int pageCount = splitter.GetPageCount(inputFile);
        string splitRange = string.Join(",", Enumerable.Range(1, pageCount));

        // Split by the generated ranges; the call returns the produced file names
        string[] files = splitter.Split(inputFile, splitRange);

        Console.WriteLine(@"Splitted by parts: ");
        for (int i = 0; i < files.Length; i++)
        {
            Console.WriteLine(" " + Path.GetFullPath(files[i]));
        }
    }

    Console.WriteLine();
    Console.WriteLine("Press any key...");
    Console.ReadKey();
}
/// <summary>
/// Saves the given document through a <c>DocumentSplitter</c> into a
/// destination folder that already contains a copy of the source named
/// "{name}-01{ext}", and asserts on the resulting files.
/// </summary>
/// <param name="filename">Name handed to <c>GetSource</c> to resolve the source path.</param>
/// <param name="password">Password handed to the <c>DocumentReader</c>.</param>
/// <returns>The number of entries in the splitter's results after saving.</returns>
public int Split(string filename, string password)
{
    var source = GetSource(filename);
    var entry = IO.Get(source);
    var baseName = entry.BaseName;
    var extension = entry.Extension;
    var folder = Path(Args(baseName));

    // Pre-populate the destination with a file named like the first output.
    // NOTE(review): presumably this forces a name collision so the
    // "-01 (1)" rename can be checked below — confirm.
    IO.Copy(source, IO.Combine(folder, $"{baseName}-01{extension}"), true);

    int produced;
    using (var splitter = new DocumentSplitter(IO))
    {
        splitter.Add(new DocumentReader(source, password, new OpenOption { SaveMemory = false }));
        splitter.Save(folder);
        produced = splitter.Results.Count;

        // The folder must hold every result plus the pre-copied file
        Assert.That(IO.GetFiles(folder).Length, Is.EqualTo(produced + 1));
        Assert.That(IO.Exists(IO.Combine(folder, $"{baseName}-01 (1){extension}")));

        // After Reset() the result list must be empty
        splitter.Reset();
        Assert.That(splitter.Results.Count, Is.EqualTo(0));
    }

    return produced;
}
/// <summary>
/// Produces a copy of <c>InputFile</c> without its empty pages: collects the
/// numbers of pages that contain text, splits the document by those pages
/// into <c>TempFolder</c>, merges the parts into <c>OutputFile</c>, removes
/// the temp folder and opens the result.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    // 1-based numbers of the pages that contain some text
    List<string> nonEmptyPages = new List<string>();

    // "using" guarantees the extractor is released even if loading or extraction throws
    using (TextExtractor extractor = new TextExtractor("demo", "demo"))
    {
        // Load PDF document
        extractor.LoadDocumentFromFile(InputFile);

        // Query the page count once instead of on every loop iteration
        int pageCount = extractor.GetPageCount();

        for (int pageIndex = 0; pageIndex < pageCount; pageIndex++)
        {
            // Extract page text (page index is 0-based here)
            string pageText = extractor.GetTextFromPage(pageIndex);

            // Keep the page number only if some text was extracted;
            // a null result is treated like an empty page instead of crashing
            if (!string.IsNullOrEmpty(pageText))
            {
                nonEmptyPages.Add((pageIndex + 1).ToString());
            }
        }
    }

    // Form comma-separated list of page numbers to split ("1,3,5")
    string ranges = string.Join(",", nonEmptyPages);

    try
    {
        string[] parts;

        // Split document by non-empty pages into the temp folder
        using (DocumentSplitter splitter = new DocumentSplitter("demo", "demo"))
        {
            splitter.OptimizeSplittedDocuments = true;
            parts = splitter.Split(InputFile, ranges, TempFolder);
        }

        // Merge the parts into the output document
        using (DocumentMerger merger = new DocumentMerger("demo", "demo"))
        {
            merger.Merge(parts, OutputFile);
        }
    }
    finally
    {
        // Remove the temp folder even when split or merge failed.
        // The existence check keeps a cleanup error from hiding the original failure.
        if (Directory.Exists(TempFolder))
        {
            Directory.Delete(TempFolder, true);
        }
    }

    // Open the result file in default PDF viewer (for demo purposes)
    Process.Start(OutputFile);
}
/// <summary>
/// Worker routine: extracts a page range of the input document into a
/// temporary chunk file, runs OCR on the chunk to produce a searchable PDF,
/// and always signals completion and releases the thread limiter.
/// </summary>
/// <param name="stateInfo">
/// An <c>object[]</c> holding, in order: thread index (int), completion event
/// (ManualResetEvent), input file (string), output file (string),
/// start page (int) and end page (int).
/// </param>
private static void ThreadProc(object stateInfo)
{
    // Unpack the state array once instead of re-casting it for every field
    object[] state = (object[])stateInfo;
    int threadIndex = (int)state[0];
    ManualResetEvent doneEvent = (ManualResetEvent)state[1];
    string inputFile = (string)state[2];
    string outputFile = (string)state[3];
    int startPage = (int)state[4];
    int endPage = (int)state[5];

    try
    {
        Console.WriteLine("Thread #{0} started with the page range from {1} to {2}.", threadIndex, startPage, endPage);

        Stopwatch stopwatch = Stopwatch.StartNew();

        // Name of the temporary file that receives this thread's piece of the document
        string chunk = string.Format("temp-{0}-{1}", startPage, endPage);

        try
        {
            // Extract a piece of document.
            // NOTE(review): the "+ 1" suggests the incoming pages are 0-based while
            // ExtractPageRange() expects 1-based numbers — confirm against the caller.
            using (DocumentSplitter splitter = new DocumentSplitter("demo", "demo"))
            {
                splitter.ExtractPageRange(inputFile, chunk, startPage + 1, endPage + 1);
            }

            // Process the piece
            using (SearchablePDFMaker searchablePdfMaker = new SearchablePDFMaker("demo", "demo"))
            {
                searchablePdfMaker.OCRDetectPageRotation = true;
                searchablePdfMaker.OCRLanguageDataFolder = @"C:\Program Files\Bytescout PDF Extractor SDK\net4.00\tessdata";

                searchablePdfMaker.LoadDocumentFromFile(chunk);

                // 300 DPI resolution is recommended.
                // Using of higher values will slow down the processing but does not guarantee the higher quality.
                searchablePdfMaker.OCRResolution = 300;

                searchablePdfMaker.MakePDFSearchable(outputFile);
            }
        }
        finally
        {
            // Remove the temporary chunk even when extraction or OCR failed,
            // so failed runs do not leave temp files behind.
            File.Delete(chunk);
        }

        Console.WriteLine("Thread #{0} finished in {1}.", threadIndex, stopwatch.Elapsed);
    }
    finally
    {
        // Signal the thread is finished
        doneEvent.Set();

        // Release semaphore
        ThreadLimiter.Release();
    }
}
/// <summary>
/// Splits a password-protected PDF into two files at page 3. The password is
/// supplied through the <c>PasswordRequired</c> event and document
/// permission checks are switched off.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    string inputFile = @".\encrypted (password is 'password').pdf";
    const string firstPart = "part1.pdf";
    const string secondPart = "part2.pdf";
    const int splitPage = 3;

    using (var splitter = new DocumentSplitter("demo", "demo"))
    {
        // The handler is invoked when the document asks for a password
        splitter.PasswordRequired += splitter_PasswordRequired;

        // Do not enforce the permissions stored in the document
        splitter.CheckPermissions = false;

        // Split into two files at the given page
        splitter.Split(inputFile, firstPart, secondPart, splitPage);
    }

    Console.WriteLine();
    Console.WriteLine("Press any key...");
    Console.ReadKey();
}
/// <summary>
/// Searches every page of "sample2.pdf" for the keyword "bombardment" and
/// extracts each matching page into its own file named "page{N}.pdf".
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    string inputFile = @".\sample2.pdf";

    // Both objects are released by "using" even if searching or extraction throws.
    // A single splitter serves all matching pages instead of one instance per page.
    using (TextExtractor extractor = new TextExtractor())
    using (DocumentSplitter splitter = new DocumentSplitter("demo", "demo"))
    {
        extractor.RegistrationName = "demo";
        extractor.RegistrationKey = "demo";

        splitter.OptimizeSplittedDocuments = true;

        // Load sample PDF document
        extractor.LoadDocumentFromFile(inputFile);

        int pageCount = extractor.GetPageCount();

        // Search each page for a keyword (page index is 0-based here)
        for (int i = 0; i < pageCount; i++)
        {
            if (!extractor.Find(i, "bombardment", false))
            {
                continue;
            }

            int pageNumber = i + 1; // (!) page number in ExtractPage() is 1-based
            string outputFile = @".\page" + pageNumber + ".pdf";

            // Extract the matching page into its own file
            splitter.ExtractPage(inputFile, outputFile, pageNumber);

            Console.WriteLine("Extracted page " + pageNumber + " to file \"" + outputFile + "\"");
        }
    }

    Console.WriteLine();
    Console.WriteLine("Press any key...");
    Console.ReadKey();
}
/// <summary>
/// Demonstrates three splitter operations on "sample.pdf": extracting a
/// single page, splitting the document in two at a page, and splitting it by
/// a list of page ranges.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    string inputFile = @".\sample.pdf";

    using (var splitter = new DocumentSplitter("demo", "demo"))
    {
        splitter.OptimizeSplittedDocuments = true;

        // --- Extract one page (page numbers are 1-based) ---
        splitter.ExtractPage(inputFile, "page3.pdf", 3);
        Console.WriteLine(@"Extracted page 3 to file ""page3.pdf""");
        Console.WriteLine();

        // --- Split into two parts at page 3 (1-based) ---
        splitter.Split(inputFile, "part1.pdf", "part2.pdf", 3);
        Console.WriteLine(@"Splitted at page 3 to files ""part1.pdf"" and ""part2.pdf""");
        Console.WriteLine();

        // --- Split by ranges ---
        // Page numbers are 1-based; a trailing "-" means "to the end".
        string[] files = splitter.Split(inputFile, "1-3,4-6,7,8-");
        Console.WriteLine(@"Splitted by ranges: ");
        for (int i = 0; i < files.Length; i++)
        {
            Console.WriteLine(" " + Path.GetFileName(files[i]));
        }
    }

    Console.WriteLine();
    Console.WriteLine("Press any key...");
    Console.ReadKey();
}
/// <summary>
/// Worker routine: extracts a page range of the input document into a
/// temporary chunk file, runs OCR on the chunk to produce a searchable PDF,
/// and always signals completion and releases the thread limiter.
/// </summary>
/// <param name="stateInfo">
/// An <c>object[]</c> holding, in order: thread index (int), completion event
/// (ManualResetEvent), input file (string), output file (string),
/// start page (int) and end page (int).
/// </param>
private static void ThreadProc(object stateInfo)
{
    // Unpack the state array once instead of re-casting it for every field
    object[] state = (object[])stateInfo;
    int threadIndex = (int)state[0];
    ManualResetEvent doneEvent = (ManualResetEvent)state[1];
    string inputFile = (string)state[2];
    string outputFile = (string)state[3];
    int startPage = (int)state[4];
    int endPage = (int)state[5];

    try
    {
        Console.WriteLine("Thread #{0} started with the page range from {1} to {2}.", threadIndex, startPage, endPage);

        Stopwatch stopwatch = Stopwatch.StartNew();

        // Name of the temporary file that receives this thread's piece of the document
        string chunk = string.Format("temp-{0}-{1}", startPage, endPage);

        try
        {
            // Extract a piece of document.
            // NOTE(review): the "+ 1" suggests the incoming pages are 0-based while
            // ExtractPageRange() expects 1-based numbers — confirm against the caller.
            using (DocumentSplitter splitter = new DocumentSplitter("demo", "demo"))
            {
                splitter.ExtractPageRange(inputFile, chunk, startPage + 1, endPage + 1);
            }

            /*
             * By default, "SearchablePDFMaker" uses one of the standard PDF fonts to apply
             * recognized text over the scanned document. Such fonts contain only basic characters
             * from ISO-8859-1 charset.
             * If you run OCR for one of the languages with characters that are not present in the default
             * encoding, you should explicitly specify the font that contains the required characters
             * using ".LabelingFont" property.
             * If you run the application in Windows with a selected locale that matches OCR language,
             * it will be enough to specify the usual font "Arial". But if your app will run in an unknown
             * environment (for example, in some virtual machine) you will need to install some full Unicode
             * font (e.g. "Arial Unicode MS") and then use it with SearchablePDFMaker:
             *
             * //searchablePDFMaker.LabelingFont = "Arial Unicode MS";
             */

            // Process the piece
            using (SearchablePDFMaker searchablePdfMaker = new SearchablePDFMaker("demo", "demo"))
            {
                searchablePdfMaker.OCRDetectPageRotation = true;
                searchablePdfMaker.OCRLanguageDataFolder = @"c:\Program Files\Bytescout PDF Extractor SDK\ocrdata_best\";

                searchablePdfMaker.LoadDocumentFromFile(chunk);

                // 300 DPI resolution is recommended.
                // Using of higher values will slow down the processing but does not guarantee the higher quality.
                searchablePdfMaker.OCRResolution = 300;

                searchablePdfMaker.MakePDFSearchable(outputFile);
            }
        }
        finally
        {
            // Remove the temporary chunk even when extraction or OCR failed,
            // so failed runs do not leave temp files behind.
            File.Delete(chunk);
        }

        Console.WriteLine("Thread #{0} finished in {1}.", threadIndex, stopwatch.Elapsed);
    }
    finally
    {
        // Signal the thread is finished
        doneEvent.Set();

        // Release semaphore
        ThreadLimiter.Release();
    }
}