/// <summary>
/// Gets the pages of a PDF together with their text encoded as base64.
/// </summary>
/// <remarks>
/// Pages are processed in parallel. For each page the embedded text layer is read first;
/// when that text is null or empty, the page is passed to <c>RecognizePageText</c> instead.
/// </remarks>
/// <param name="pdfPath">Path to the PDF file.</param>
/// <param name="bookId">ID of the book the pages belong to.</param>
/// <param name="progressAction">
/// Optional callback invoked once per successfully processed page with the value
/// <c>1 / numberOfPages</c>. It is called from the parallel worker threads.
/// </param>
/// <returns>The extracted pages. They are enqueued concurrently, so their order is not guaranteed.</returns>
/// <exception cref="AggregateException">One or more pages failed; contains every per-page exception.</exception>
public static IEnumerable<Page> GetTextLayerWithPages(string pdfPath, Guid bookId, Action<double> progressAction)
{
    // NOTE(review): the license key is hard-coded in source; consider loading it from configuration or a secret store.
    BitMiracle.Docotic.LicenseManager.AddLicenseData("5P66M-TH5YU-KMPKP-ZU4U3-SSBP0");

    var pages = new ConcurrentQueue<Page>();
    var exceptions = new ConcurrentQueue<Exception>();

    var reader = new PdfReader(pdfPath);
    try
    {
        var numberOfPages = reader.NumberOfPages;
        var options = new ParallelOptions
        {
            MaxDegreeOfParallelism = Environment.ProcessorCount * 4
        };

        // Page numbers passed to the text extractor are 1-based, hence the [1, numberOfPages + 1) range.
        Parallel.For(1, numberOfPages + 1, options, i =>
        {
            try
            {
                string text;

                // The single reader instance is shared by all workers, so access to it is serialized.
                lock (Lock)
                {
                    text = PdfTextExtractor.GetTextFromPage(reader, i, new LocationTextExtractionStrategy()).Normalize();
                }

                if (string.IsNullOrEmpty(text))
                {
                    // No text layer: open the document with Docotic and recognize the page text instead.
                    using var pdfDocument = new PdfDocument(pdfPath);
                    var page = pdfDocument.Pages[i - 1]; // Pages is indexed from 0
                    page.Rotation = PdfRotation.None;
                    var recognizedPageText = RecognizePageText(page);
                    pages.Enqueue(new Page(i, bookId, GetStringInBase64(recognizedPageText)));
                }
                else
                {
                    pages.Enqueue(new Page(i, bookId, GetStringInBase64(text.Trim())));
                }

                progressAction?.Invoke(1.0 / numberOfPages);
            }
            catch (Exception e)
            {
                // Collect the failure and keep processing the remaining pages.
                exceptions.Enqueue(e);
            }
        });
    }
    finally
    {
        // Release the file handle held by the reader even when processing fails.
        reader.Close();
    }

    if (!exceptions.IsEmpty)
    {
        throw new AggregateException(exceptions);
    }

    return pages;
}
/// <summary>
/// Extracts one value per mapping from the PDF referenced by <paramref name="fileTableInfo"/>.
/// </summary>
/// <remarks>
/// For each page referenced by the mappings, the page's formatted text is stripped of
/// <c>'</c> and <c>$</c> characters and split on runs of two or more whitespace characters.
/// Each mapping's value is then looked up with <c>PdfFactory.GetNextPositionValue</c>,
/// commas are removed, and a value fully wrapped in parentheses is rewritten with a leading minus sign.
/// </remarks>
/// <param name="mappings">Mappings to resolve; grouped by their <c>PageNumber</c>.</param>
/// <param name="fileTableInfo">Supplies the PDF location (<c>FileLocation</c>) and the <c>FileId</c> copied to every result.</param>
/// <returns>One <see cref="OutputData"/> entry per mapping.</returns>
public static List<OutputData> ProcessData(List<Mappings> mappings, FileTableInfo fileTableInfo)
{
    //Add key
    BitMiracle.Docotic.LicenseManager.AddLicenseData("");

    var outputData = new List<OutputData>();

    using (var pdf = new BitMiracle.Docotic.Pdf.PdfDocument(fileTableInfo.FileLocation))
    {
        foreach (var pageGroup in mappings.GroupBy(pg => pg.PageNumber))
        {
            // NOTE(review): PageNumber is passed to GetPage unchanged - confirm it uses the index base GetPage expects.
            var currentPage = pdf.GetPage(pageGroup.Key);

            // The page text is prepared once and shared by every mapping on that page.
            string formattedText = currentPage.GetTextWithFormatting();
            string modifiedText = Regex.Replace(formattedText, @"['$]", string.Empty);
            var strArray = Regex.Split(modifiedText, @"\s{2,}");

            foreach (var map in pageGroup)
            {
                var amount = PdfFactory.GetNextPositionValue(strArray, map.MapName, map.Position);
                amount = Regex.Replace(amount, @"[,]", string.Empty);

                // The length check prevents an IndexOutOfRangeException when the extracted value is empty.
                if (amount.Length > 0 && amount[0] == '(' && amount[amount.Length - 1] == ')')
                {
                    // "(123)" becomes "-123".
                    amount = "-" + Regex.Replace(amount, @"[)(]", string.Empty);
                }

                outputData.Add(new OutputData
                {
                    FileId = fileTableInfo.FileId,
                    DataType = map.MapType,
                    Value = amount,
                    PageNumber = map.PageNumber
                });
            }
        }
    }

    return outputData;
}
/// <summary>
/// Console entry point. Asks for a PDF folder, file name, text alignment and page range,
/// splits the requested pages into single-page PDFs in a "temp" sub-folder, copies the words
/// with a font size above 6 from each page into a .docx saved next to the source PDF,
/// deletes the temp folder and finally tries to open the .docx.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    // The Japanese prompts need code page 932; setting it once covers every later write.
    Console.OutputEncoding = Encoding.GetEncoding(932);

    // Folder containing the PDF; "\" is replaced with "/" in the entered path.
    Console.WriteLine("FILE PATH: ");
    Console.WriteLine("ファイル パス: ");
    string pdfFolder = Console.ReadLine().Replace(@"\", @"/");

    // File name without the ".pdf" extension.
    Console.WriteLine(".pdf`s Name: ");
    Console.WriteLine("ファイルの名前は: ");
    string pdfFileName = Console.ReadLine();
    string sourcePdfPath = pdfFolder + "/" + pdfFileName + ".pdf";

    // Text alignment of the source: "1" = left and right columns, "2" = middle.
    Console.WriteLine("Text alignment: ");
    Console.WriteLine("テキスト配置: ");
    Console.WriteLine("1: Left (左), and Right (右), or 2: Middle (真ん中)");
    string alignmentChoice = Console.ReadLine();
    Console.WriteLine();

    // Inclusive page range to convert.
    Console.WriteLine(".pdf`s Initial Page ");
    Console.WriteLine("最初のページは: ");
    int startPage = Int32.Parse(Console.ReadLine());
    Console.WriteLine(".pdf`s Last Page ");
    Console.WriteLine("最後のページは: ");
    int endPage = Int32.Parse(Console.ReadLine());

    // Created up front so the folder exists even for an empty page range
    // (the delete at the end would otherwise fail).
    string tempFolder = pdfFolder + "/" + "temp";
    System.IO.Directory.CreateDirectory(tempFolder);

    SplitPagesIntoTempFolder(sourcePdfPath, tempFolder, startPage, endPage);

    // Word document that receives the extracted text.
    Spire.Doc.Document wordDoc = new Spire.Doc.Document();
    Spire.Doc.Section wordSection = wordDoc.AddSection();
    Spire.Doc.Documents.Paragraph paragraph = wordSection.AddParagraph();

    // Each entry is one pass over the words of a page; the passes run in order.
    Func<PdfTextData, bool>[] wordPasses;
    switch (alignmentChoice)
    {
        // Left/right layout: first the words left of X = 600, then the words right of it.
        // NOTE(review): words positioned exactly at X == 600 match neither pass - confirm this is intended.
        case "1":
            wordPasses = new Func<PdfTextData, bool>[]
            {
                word => word.Position.X < 600,
                word => word.Position.X > 600
            };
            break;

        // Centered layout: a single pass over all words.
        case "2":
            wordPasses = new Func<PdfTextData, bool>[] { word => true };
            break;

        default:
            wordPasses = null;
            Console.WriteLine("error! (エラー)");
            Console.ReadKey();
            break;
    }

    if (wordPasses != null)
    {
        Console.WriteLine();
        for (int i = startPage; i <= endPage; i++)
        {
            string pageLabel = i.ToString();
            Console.WriteLine("今のページ: " + pageLabel);
            Console.WriteLine("Current Page: " + pageLabel);

            using (BitMiracle.Docotic.Pdf.PdfDocument pagePdf = new BitMiracle.Docotic.Pdf.PdfDocument(tempFolder + "/" + pageLabel + ".pdf"))
            {
                BitMiracle.Docotic.Pdf.PdfPage page = pagePdf.Pages[0];
                foreach (Func<PdfTextData, bool> pass in wordPasses)
                {
                    foreach (PdfTextData word in page.GetWords())
                    {
                        // Only words with a font size above 6 are copied.
                        if (word.FontSize > 6 && pass(word))
                        {
                            // Strings are immutable: the trimmed value must be captured to have any effect.
                            string text = word.Text.TrimEnd();
                            Console.WriteLine(text);
                            paragraph.AppendText(text);
                        }
                    }
                }
            }

            // Blank lines separating pages in the console output.
            Console.WriteLine();
            Console.WriteLine();
            Console.WriteLine();
            Console.WriteLine();
            Console.WriteLine();

            // Page footer in the Word text, then a page break on this page's own paragraph.
            // (Indexing Paragraphs by page number went out of range for start pages above 2.)
            paragraph.AppendText(" ");
            paragraph.AppendText("CURRENT PAGE: " + pageLabel);
            paragraph.AppendBreak(BreakType.PageBreak);
            paragraph = wordSection.AddParagraph();
        }
    }

    // Save the .docx next to the source PDF and remove the temporary single-page PDFs.
    string docxPath = pdfFolder + "/" + pdfFileName + ".docx";
    wordDoc.SaveToFile(docxPath, FileFormat.Docx);
    System.IO.Directory.Delete(tempFolder, true);

    // Opening the result is best-effort: the file is already saved, so a failure is only reported.
    try
    {
        System.Diagnostics.Process.Start(docxPath);
    }
    catch
    {
        Console.WriteLine("Error! ");
        Console.WriteLine("エラー ! ");
    }
}

/// <summary>
/// Copies each page in the inclusive range into its own PDF named "&lt;page number&gt;.pdf"
/// inside <paramref name="tempFolder"/>.
/// </summary>
/// <param name="sourcePdfPath">Path of the PDF to split.</param>
/// <param name="tempFolder">Existing folder that receives the single-page PDFs.</param>
/// <param name="startPage">First page number to copy.</param>
/// <param name="endPage">Last page number to copy.</param>
private static void SplitPagesIntoTempFolder(string sourcePdfPath, string tempFolder, int startPage, int endPage)
{
    iTextSharp.text.pdf.PdfReader reader = new iTextSharp.text.pdf.PdfReader(sourcePdfPath);
    try
    {
        iTextSharp.text.Document sourceDocument = new iTextSharp.text.Document(reader.GetPageSizeWithRotation(startPage));
        sourceDocument.Open();

        try
        {
            for (int i = startPage; i <= endPage; i++)
            {
                // FileMode.Create yields an empty file, matching the former "create blank file, then append" sequence.
                System.IO.FileStream pageStream = new System.IO.FileStream(tempFolder + "/" + i.ToString() + ".pdf", System.IO.FileMode.Create);
                PdfCopy pageCopy = new PdfCopy(sourceDocument, pageStream);

                // Open() is called again after each PdfCopy is attached, as before;
                // presumably this opens the newly attached writer - confirm against the iTextSharp docs.
                sourceDocument.Open();
                pageCopy.AddPage(pageCopy.GetImportedPage(reader, i));
            }

            sourceDocument.Close();
        }
        catch (Exception)
        {
            Console.WriteLine("Error! ");
            Console.OutputEncoding = Encoding.GetEncoding(932);
            Console.WriteLine("エラー ! ");

            // "throw;" keeps the original stack trace ("throw ex;" would reset it).
            throw;
        }
    }
    finally
    {
        // Release the source file whether or not the split succeeded.
        reader.Close();
    }
}