Ejemplo n.º 1
0
        /// <summary>
        /// Распознать текст нечитаемого PDF-файла.
        /// </summary>
        /// <param name="page">Нечитаемая PDF-страница.</param>
        /// <returns>Распознанный текст со страницы.</returns>
        private static string RecognizePageText(PdfPage page)
        {
            var options = PdfDrawOptions.Create();

            options.BackgroundColor      = new PdfRgbColor(255, 255, 255);
            options.HorizontalResolution = 200;
            options.VerticalResolution   = 200;

            using var memoryStream = new MemoryStream();
            page.Save(memoryStream, options);

            using var engine         = new TesseractEngine(@"tessdata\fast", "rus+eng", EngineMode.LstmOnly);
            using var img            = Pix.LoadFromMemory(memoryStream.GetBuffer());
            using var recognizedPage = engine.Process(img);

            return(recognizedPage.GetText());
        }
Ejemplo n.º 2
0
        static void Main(string[] args)
        {
            // Changes \ to / in the path string
            string oldstr            = @"\";
            string newstr            = @"/";
            string pdf_path_original = string.Empty;

            // Tells the file path
            Console.WriteLine("FILE PATH: ");
            Console.OutputEncoding = Encoding.GetEncoding(932);
            Console.WriteLine("ファイル パス: ");
            pdf_path_original = Console.ReadLine();
            string pdf_path = pdf_path_original.Replace(oldstr, newstr);

            // Tells the file name
            Console.WriteLine(".pdf`s Name: ");
            Console.OutputEncoding = Encoding.GetEncoding(932);
            Console.WriteLine("ファイルの名前は: ");
            string pdf_file_name = Console.ReadLine();
            string path_n_name   = pdf_path + "/" + pdf_file_name + ".pdf";

            // Tells the file alignment
            Console.WriteLine("Text alignment: ");
            Console.OutputEncoding = Encoding.GetEncoding(932);
            Console.WriteLine("テキスト配置: ");
            Console.WriteLine("1: Left (左), and Right (右), or 2: Middle (真ん中)");
            string caseSwitch = Console.ReadLine();

            Console.WriteLine();


            // Tells the file start page and end page
            Console.WriteLine(".pdf`s Initial Page ");
            Console.OutputEncoding = Encoding.GetEncoding(932);
            Console.WriteLine("最初のページは: ");
            string startpage_string = Console.ReadLine();
            int    startpage        = Int32.Parse(startpage_string);

            Console.WriteLine(".pdf`s Last Page ");
            Console.OutputEncoding = Encoding.GetEncoding(932);
            Console.WriteLine("最後のページは: ");
            string endpage_string = Console.ReadLine();
            int    endpage        = Int32.Parse(endpage_string);

            // Creates blank pdf files
            for (int i = startpage; i <= endpage; i++)
            {
                string convi = i.ToString();
                // Creates a temp folder for the pdf files
                System.IO.Directory.CreateDirectory(pdf_path + "/" + "temp");
                System.IO.FileStream pdf_creator = new System.IO.FileStream(pdf_path + "/" + "temp" + "/" + convi + ".pdf", System.IO.FileMode.Create);
                pdf_creator.Close();
            }


            // Tells the attributes from the new pdf files, and the original pdf source
            iTextSharp.text.pdf.PdfReader       reader          = null;
            iTextSharp.text.Document            sourceDocument  = null;
            iTextSharp.text.pdf.PdfCopy         pdfCopyProvider = null;
            iTextSharp.text.pdf.PdfImportedPage importedPage    = null;

            reader         = new iTextSharp.text.pdf.PdfReader(path_n_name);
            sourceDocument = new iTextSharp.text.Document(reader.GetPageSizeWithRotation(startpage));
            sourceDocument.Open();

            // Creates a .docx to receive the pdf's text
            Spire.Doc.Document word_doc = new Spire.Doc.Document();

            // Word doc formatting
            Spire.Doc.Section             word_doc_section        = word_doc.AddSection();
            Spire.Doc.Documents.Paragraph word_doc_paragraph      = word_doc_section.AddParagraph();
            Spire.Doc.Documents.Paragraph word_doc_paragraph_page = word_doc_section.AddParagraph();


            // Update those blank pdf files, inserting the copied pages into it
            try
            {
                for (int i = startpage; i <= endpage; i++)
                {
                    string convi = i.ToString();
                    pdfCopyProvider = new PdfCopy(sourceDocument, new System.IO.FileStream(pdf_path + "/" + "temp" + "/" + convi + ".pdf", System.IO.FileMode.Append));
                    sourceDocument.Open();
                    importedPage = pdfCopyProvider.GetImportedPage(reader, i);
                    pdfCopyProvider.AddPage(importedPage);
                }


                sourceDocument.Close();
                reader.Close();
            }


            // ERROR
            catch (Exception ex)
            {
                Console.WriteLine("Error! ");
                Console.OutputEncoding = Encoding.GetEncoding(932);
                Console.WriteLine("エラー ! ");
                throw ex;
            }


            // Collects the text without furigana from the listed pdf files
            switch (caseSwitch)
            {
            // case 1 reffers to the left and right alignments of the pdf text
            case "1":
                Console.WriteLine();
                for (int i = startpage; i <= endpage; i++)
                {
                    // the following refers to the int counter of pages being converted into string
                    string convi = i.ToString();
                    Console.OutputEncoding = Encoding.GetEncoding(932);
                    Console.WriteLine("今のページ: " + convi);
                    Console.WriteLine("Current Page: " + convi);

                    // the following refers to the bitmiracle api pdf to get the texts
                    using (BitMiracle.Docotic.Pdf.PdfDocument pdf_1 = new BitMiracle.Docotic.Pdf.PdfDocument(pdf_path + "/" + "temp" + "/" + convi + ".pdf"))
                    {
                        BitMiracle.Docotic.Pdf.PdfPage page = pdf_1.Pages[0];
                        foreach (PdfTextData data in page.GetWords())
                        {
                            if (data.FontSize > 6 && data.Position.X < 600)
                            {
                                string text = data.Text;
                                text.TrimEnd();
                                Console.OutputEncoding = Encoding.GetEncoding(932);
                                Console.WriteLine(text);
                                //word_builder.Writeln(text);
                                word_doc_paragraph.AppendText(text);
                            }
                        }
                        foreach (PdfTextData data in page.GetWords())
                        {
                            if (data.FontSize > 6 && data.Position.X > 600)
                            {
                                string text = data.Text;
                                text.TrimEnd();
                                Console.OutputEncoding = Encoding.GetEncoding(932);
                                Console.WriteLine(text);
                                word_doc_paragraph.AppendText(text);
                            }
                        }
                    }
                    // the following lines reffers to the space between pages of the pdf text
                    Console.WriteLine();
                    Console.WriteLine();
                    Console.WriteLine();
                    Console.WriteLine();
                    Console.WriteLine();
                    // the followin reffers to the extra lines on word text


                    word_doc_paragraph.AppendText("                                        ");
                    word_doc_paragraph.AppendText("CURRENT PAGE: " + convi);
                    word_doc_paragraph = word_doc_section.AddParagraph();
                    word_doc.Sections[0].Paragraphs[i].AppendBreak(BreakType.PageBreak);
                }

                break;


            // case 2 reffers to the alignment of the pdf text that is centralized
            case "2":
                Console.WriteLine();
                for (int i = startpage; i <= endpage; i++)
                {
                    // the following refers to the int counter of pages being converted into string
                    string convi = i.ToString();
                    Console.OutputEncoding = Encoding.GetEncoding(932);
                    Console.WriteLine("今のページ: " + convi);
                    Console.WriteLine("Current Page: " + convi);

                    // the following refers to the bitmiracle api pdf to get the texts
                    using (BitMiracle.Docotic.Pdf.PdfDocument pdf_1 = new BitMiracle.Docotic.Pdf.PdfDocument(pdf_path + "/" + "temp" + "/" + convi + ".pdf"))
                    {
                        BitMiracle.Docotic.Pdf.PdfPage page = pdf_1.Pages[0];
                        foreach (PdfTextData data in page.GetWords())
                        {
                            if (data.FontSize > 6)
                            {
                                string text = data.Text;
                                text.TrimEnd();
                                Console.OutputEncoding = Encoding.GetEncoding(932);
                                Console.WriteLine(text);
                                word_doc_paragraph.AppendText(text);
                            }
                        }
                    }
                    Console.WriteLine();
                    Console.WriteLine();
                    Console.WriteLine();
                    Console.WriteLine();
                    Console.WriteLine();

                    word_doc_paragraph.AppendText("                                        ");
                    word_doc_paragraph.AppendText("CURRENT PAGE: " + convi);
                    word_doc_paragraph = word_doc_section.AddParagraph();
                    word_doc.Sections[0].Paragraphs[i].AppendBreak(BreakType.PageBreak);
                }
                break;

            default:
                Console.OutputEncoding = Encoding.GetEncoding(932);
                Console.WriteLine("error! (エラー)");
                Console.ReadKey();
                break;
            }

            // The following refers to creating a .docx file, opening up the file and deleting the temp folder
            word_doc.SaveToFile(pdf_path + "/" + pdf_file_name + ".docx", FileFormat.Docx);
            System.IO.Directory.Delete(pdf_path + "/" + "temp", true);
            try
            {
                System.Diagnostics.Process.Start(pdf_path + "/" + pdf_file_name + ".docx");
            }
            catch
            {
                Console.WriteLine("Error! ");
                Console.OutputEncoding = Encoding.GetEncoding(932);
                Console.WriteLine("エラー ! ");
            }
        }