/// <summary> /// Split input PDF file by page numbers /// </summary> /// <param name="args"></param> static void Main(string[] args) { var inputFile = new FileInfo(args[0]); if (!inputFile.Exists) { throw new FileNotFoundException(inputFile.FullName); } // Load, if there, the exam roster id stuff var examRoster = RosterForExamUtils.LoadDBFileForScanFile(inputFile); // Build a catalog of the PDF file var inputPDFFile = new PDFFileHandler(inputFile); var pageGroups = inputPDFFile .GetPagesInfo() .Select(qrTagText => (qrTagText.pageNum, qrTagText.tagText.ParseTagString())) .GroupBy(tinfo => GroupID(tinfo.Item2.pageNumber)); foreach (var g in pageGroups) { Console.WriteLine($"{g.Key}:"); // Order the pages to make it easy to deal with var pages = g .OrderBy(qrInfo => GetName(examRoster, qrInfo.Item2.paperID, qrInfo.Item1).last) .ThenBy(qrInfo => GetName(examRoster, qrInfo.Item2.paperID, qrInfo.Item1).first) .ThenBy(qrInfo => qrInfo.Item2.pageNumber); inputPDFFile.CopyPages(new FileInfo($"{inputFile.Directory.FullName}\\{Path.GetFileNameWithoutExtension(inputFile.Name)} - {g.Key}.pdf"), pages.Select(p => p.Item1)); //foreach (var p in pages) //{ // Console.WriteLine($" {p.Item1}: {p.Item2.paperID} - {p.Item2.pageNumber}"); //} Console.WriteLine(); } // Save out the roster file RosterForExamUtils.SaveDBFile(inputFile, examRoster); }
/// <summary> /// Given a set of PDF files, scan them, extract all pages, and put them into individual exams by /// exam ID number /// </summary> /// <param name="args"></param> static void Main(string[] args) { var sourceDirectory = new DirectoryInfo(args[0]); if (!sourceDirectory.Exists) { throw new DirectoryNotFoundException($"Can't find ${args[0]}"); } var rosterDirectory = new DirectoryInfo(args[1]); if (!rosterDirectory.Exists) { throw new DirectoryNotFoundException($"Can't find roster directory ${args[1]}"); } // Load in the roster. var roster = rosterDirectory .EnumerateFiles("*-roster-db.csv") .SelectMany(f => RosterForExamUtils.LoadDBFile(f)) .ToDictionary(k => k.Key, k => k.Value); // for all PDF files in that directory, get a scan of the file, page number, etc. var allpages = sourceDirectory.GetFiles("*.pdf") .Select(pdf => new PDFFileHandler(pdf)) .SelectMany(pdfScanner => pdfScanner.GetPagesInfo().Select(pg => (scanner: pdfScanner, pnum: pg.pageNum, tag: pg.tagText.ParseTagString()))); // Organize them by exam number and pages in that exam. var byexam = allpages .GroupBy(e => e.tag.paperID); // Prepare the directory for output. var outputDir = new DirectoryInfo($"{sourceDirectory.FullName}\\Collated"); if (!outputDir.Exists) { outputDir.Create(); outputDir.Refresh(); } // Loop over every exam, and write them out. foreach (var exam in byexam) { var exam_name = roster.ContainsKey(exam.Key) ? $"{roster[exam.Key].LastName}, {roster[exam.Key].FirstName}" : exam.Key.ToString(); Console.WriteLine(exam_name); using (var pdf = new PDFFileWriter(new FileInfo($"{outputDir.FullName}\\{exam_name}.pdf"))) { var orderedPages = exam.OrderBy(pgs => pgs.tag.pageNumber); foreach (var p in orderedPages) { pdf.AddPage(p.scanner, p.pnum); } // Loook for missing pages var pageNumbers = orderedPages.Select(p => p.tag.pageNumber).ToArray(); var maxPage = pageNumbers.Max(); var missingPages = Enumerable.Range(1, maxPage) .Where(n => !pageNumbers.Contains(n)) .ToArray(); if (missingPages.Length > 0) { var l = string.Join(", ", missingPages.Select(i => i.ToString())); Console.WriteLine($" Missing pages: {l}"); } } } }