C# (CSharp) TextExtractor.GetTextFromPageの例

プログラミング言語: C# (CSharp)

クラス/型: TextExtractor

メソッド/関数: GetTextFromPage

hotexamples.comのコード掲載数: 9

C# (CSharp) TextExtractor.GetTextFromPage - 9件のコード例が見つかりました。すべてオープンソースプロジェクトから抽出されたC# (CSharp)のTextExtractor.GetTextFromPageの実例で、最も評価が高いものを厳選しています。コード例の評価を行っていただくことで、より質の高いコード例が表示されるようになります。

よく使われるメソッド

表示 / 非表示

LoadDocumentFromFile(30)

Extract(29)

GetPageCount(22)

Find(19)

SaveTextToFile(18)

FindNext(15)

ExtractText(15)

Dispose(13)

SetExtractionArea(13)

GetText(11)

GetTextFromPage(9)

FindAll(8)

IsValidFileType(6)

Begin(6)

Open(5)

SavePageTextToFile(5)

ExtractLine(4)

SaveTextToStream(4)

ExtractAll(4)

GetAsXML(4)

SavePageTextToStream(4)

GetFirstLine(3)

Reset(3)

TextExtractingWillBePotentiallySlow(3)

ResetExtractionArea(2)

PostImageAsync(2)

LoadProfiles(2)

ToString(2)

GetValue(2)

LoadDocumentFromStream(2)

GetPageRectangle(2)

Filter(2)

GetPageRect_Width(2)

GetPageRect_Height(2)

GetTextFromBitmapAsync(2)

GetWordCount(2)

Replace(1)

NextPage(1)

GetListValues(1)

IsOCRRecommendedForPage(1)

SelectStrategy(1)

ExtractFullText(1)

SupportedFormats(1)

SupportedLanguages(1)

CreateDocument(1)

AddFilter(1)

コード例 #1

ファイルを表示

ファイル: Converter.cs プロジェクト: zulfiqar1982/zulfiqar

        //public string ReadFromPositionSpire()
        //{
        //    PdfPageBase page = Document.Pages[0];
        //    string text = page.ExtractText(new RectangleF(50, 50, 500, 100));
        //    StringBuilder sb = new StringBuilder();
        //    sb.AppendLine(text);
        //    return sb.ToString();
        //    return string.Empty;
        //}

        /// <summary>
        /// Extracts text from page index 0 of a PDF file, limited to the 200x200 rectangle at origin (0, 0).
        /// </summary>
        /// <param name="path">Path of the PDF document to load.</param>
        /// <returns>The text returned by the extractor for page index 0 within the extraction area.</returns>
        public string BytescoutPDFExtractor(string path)
        {
            // The caller-supplied path is honoured as-is rather than being replaced by a fixed local path.
            // The using block releases the extractor even when loading or extraction throws.
            using (TextExtractor extractor = new TextExtractor("demo", "demo"))
            {
                // load the document
                extractor.LoadDocumentFromFile(path);

                // define rectangle location to extract from
                RectangleF location = new RectangleF(0, 0, 200, 200);

                // set extraction area
                extractor.SetExtractionArea(location);

                // extract text bounded by the extraction area
                return extractor.GetTextFromPage(0);
            }
        }

コード例 #2

ファイルを表示

ファイル: Program.cs プロジェクト: babylon3389/ByteScout-SDK-SourceCode

        /// <summary>
        /// Searches every page of sample2.pdf for the keyword "References" and prints the text
        /// extracted from the (0, 0, 600, 200) area of each page where the keyword is found.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // Create Bytescout.PDFExtractor.TextExtractor instance; the using block disposes it
            // once the search is finished, including when an exception is thrown.
            using (TextExtractor extractor = new TextExtractor())
            {
                extractor.RegistrationName = "demo";
                extractor.RegistrationKey  = "demo";

                // Load sample PDF document
                extractor.LoadDocumentFromFile(@".\sample2.pdf");

                int pageCount = extractor.GetPageCount();

                // Search each page for some keyword
                for (int i = 0; i < pageCount; i++)
                {
                    if (extractor.Find(i, "References", false))
                    {
                        // If page contains the keyword, extract a text from it.
                        // For demonstration we'll extract the text from top part of the page only
                        extractor.SetExtractionArea(0, 0, 600, 200);
                        string text = extractor.GetTextFromPage(i);
                        Console.WriteLine(text);
                    }
                }
            }

            Console.WriteLine();
            Console.WriteLine("Press any key to continue...");
            Console.ReadLine();
        }

コード例 #3

ファイルを表示

ファイル: Program.cs プロジェクト: bytescout/pdf-extractor-sdk-samples-c-sharp

        /// <summary>
        /// Reads the text of one page, restricted to a rectangular region.
        /// </summary>
        /// <param name="textExtractor">Extractor whose extraction area is set to the region before reading.</param>
        /// <param name="extractionRegion">Rectangle handed to SetExtractionArea.</param>
        /// <param name="pageIndex">Index of the page to read; defaults to 0.</param>
        /// <returns>The string returned by GetTextFromPage for the requested page.</returns>
        private static string GetTextFromRegion(TextExtractor textExtractor, RectangleF extractionRegion, int pageIndex = 0)
        {
            // Narrow the extractor to the requested region; the area stays set on the extractor afterwards.
            textExtractor.SetExtractionArea(extractionRegion);

            string regionText = textExtractor.GetTextFromPage(pageIndex);
            return regionText;
        }

コード例 #4

ファイルを表示

ファイル: Program.cs プロジェクト: wushian/ByteScout-SDK-SourceCode

        /// <summary>
        /// Builds a copy of <c>InputFile</c> that keeps only the pages containing text: collects the
        /// non-empty page numbers, splits the document by them into <c>TempFolder</c>, merges the parts
        /// into <c>OutputFile</c>, removes the temp folder and opens the result.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // List to keep non-empty page numbers (1-based, as strings, ready to be joined)
            List <string> nonEmptyPages = new List <string>();

            // Create and setup Bytescout.PDFExtractor.TextExtractor instance.
            // Each SDK object lives in a using block so it is released even if a step throws.
            using (TextExtractor extractor = new TextExtractor("demo", "demo"))
            {
                // Load PDF document
                extractor.LoadDocumentFromFile(InputFile);

                // Query the page count once instead of on every loop iteration
                int pageCount = extractor.GetPageCount();

                // Iterate through pages
                for (int pageIndex = 0; pageIndex < pageCount; pageIndex++)
                {
                    // Extract page text
                    string pageText = extractor.GetTextFromPage(pageIndex);
                    // If extracted text is not empty keep the page number
                    if (!string.IsNullOrEmpty(pageText))
                    {
                        nonEmptyPages.Add((pageIndex + 1).ToString());
                    }
                }
            }

            // Form comma-separated list of page numbers to split("1,3,5")
            string ranges = string.Join(",", nonEmptyPages);

            string[] parts;

            // Create Bytescout.PDFExtractor.DocumentSplitter instance
            using (DocumentSplitter splitter = new DocumentSplitter("demo", "demo"))
            {
                splitter.OptimizeSplittedDocuments = true;

                // Split document by non-empty in temp folder
                parts = splitter.Split(InputFile, ranges, TempFolder);
            }

            // Create Bytescout.PDFExtractor.DocumentMerger instance
            using (DocumentMerger merger = new DocumentMerger("demo", "demo"))
            {
                // Merge parts
                merger.Merge(parts, OutputFile);
            }

            // Delete temp folder
            Directory.Delete(TempFolder, true);

            // Open the result file in default PDF viewer (for demo purposes)
            Process.Start(OutputFile);
        }

コード例 #5

ファイルを表示

        /// <summary>
        /// Lists every *.pdf file found in the <c>..\..\..\..</c> directory and prints its document
        /// information followed by the first two text lines of page index 0.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // Create Bytescout.PDFExtractor.InfoExtractor and TextExtractor instances.
            // Both are reused for every file; the using blocks release them even if a file fails to load.
            using (InfoExtractor infoExtractor = new InfoExtractor())
            using (TextExtractor textExtractor = new TextExtractor())
            {
                infoExtractor.RegistrationName = "demo";
                infoExtractor.RegistrationKey  = "demo";

                textExtractor.RegistrationName = "demo";
                textExtractor.RegistrationKey  = "demo";

                // List all PDF files in directory
                foreach (string file in Directory.GetFiles(@"..\..\..\..", "*.pdf"))
                {
                    infoExtractor.LoadDocumentFromFile(file);

                    Console.WriteLine("File Name:      " + Path.GetFileName(file));
                    Console.WriteLine("Page Count:     " + infoExtractor.GetPageCount());
                    Console.WriteLine("Author:         " + infoExtractor.Author);
                    Console.WriteLine("Title:          " + infoExtractor.Title);
                    Console.WriteLine("Producer:       " + infoExtractor.Producer);
                    Console.WriteLine("Subject:        " + infoExtractor.Subject);
                    Console.WriteLine("CreationDate:   " + infoExtractor.CreationDate);
                    Console.WriteLine("Text (first 2 lines): ");

                    // Load a couple of lines from each document
                    textExtractor.LoadDocumentFromFile(file);
                    using (StringReader stringReader = new StringReader(textExtractor.GetTextFromPage(0)))
                    {
                        // ReadLine returns null past the end of the text; WriteLine then prints an empty line
                        Console.WriteLine(stringReader.ReadLine());
                        Console.WriteLine(stringReader.ReadLine());
                    }
                    Console.WriteLine();
                }
            }

            Console.WriteLine();
            Console.WriteLine("Press any key to continue...");
            Console.ReadLine();
        }

コード例 #6

ファイルを表示

        /// <summary>
        /// Prints, for every page of sample2.pdf, the text extracted from the
        /// (0, 0, 200, 200) rectangle of that page.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // The using block disposes the extractor even when loading or extraction throws
            using (TextExtractor extractor = new TextExtractor("demo", "demo"))
            {
                // Load document
                extractor.LoadDocumentFromFile(@".\sample2.pdf");

                // Get page count
                int pageCount = extractor.GetPageCount();

                // Rectangle location to extract from; identical for every page, so built once
                RectangleF location = new RectangleF(0, 0, 200, 200);

                // Iterate through pages
                for (int i = 0; i < pageCount; i++)
                {
                    // Set extraction area
                    extractor.SetExtractionArea(location);

                    // Extract text from the extraction area
                    string text = extractor.GetTextFromPage(i);

                    Console.WriteLine("Extracted from page #" + i + ":");
                    Console.WriteLine();
                    Console.WriteLine(text);

                    // Reset the extraction area
                    extractor.ResetExtractionArea();

                    Console.WriteLine();
                }
            }

            Console.WriteLine("Press any key to exit...");
            Console.ReadKey();
        }

コード例 #7

ファイルを表示

ファイル: Program.cs プロジェクト: remlex/ByteScout-SDK-SourceCode

        /// <summary>
        /// Prints, for every page of ../../sample2.pdf, the text extracted from the
        /// (0, 0, 200, 200) rectangle of that page.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // The using block disposes the extractor when the loop is done or an exception is thrown
            using (TextExtractor extractor = new TextExtractor("demo", "demo"))
            {
                // load the document
                extractor.LoadDocumentFromFile("../../sample2.pdf");

                // get page count
                int pageCount = extractor.GetPageCount();

                // rectangle location to extract from; the same for every page
                RectangleF location = new RectangleF(0, 0, 200, 200);

                // iterate through pages
                for (int i = 0; i < pageCount; i++)
                {
                    // set extraction area
                    extractor.SetExtractionArea(location);

                    // extract text bounded by the extraction area
                    string extractedString = extractor.GetTextFromPage(i);

                    Console.WriteLine("Extracted from page #" + i + ":\r\n" + extractedString);

                    // reset extraction area to full page (by default)
                    extractor.ResetExtractionArea();

                    Console.WriteLine("\r\n");
                }
            }

            Console.WriteLine("Press any key to exit...");
            Console.ReadKey();
        }

コード例 #8

ファイルを表示

ファイル: Program.cs プロジェクト: remlex/ByteScout-SDK-SourceCode

        /// <summary>
        /// Reads Invoice.pdf and prints the invoice number, invoice date, total and the tabular data
        /// (as XML) found between the "Quantity" and "TOTAL" keywords.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // Results
            string invoiceNo   = string.Empty;
            string invoiceDate = string.Empty;
            string total       = string.Empty;
            string tableData   = string.Empty;

            // Create TextExtractor and XMLExtractor instances; the using blocks dispose both
            // when extraction is finished or an exception is thrown.
            using (TextExtractor textExtractor = new TextExtractor("demo", "demo"))
            using (XMLExtractor xmlExtractor = new XMLExtractor("demo", "demo"))
            {
                textExtractor.WordMatchingMode = WordMatchingMode.ExactMatch;             // Set exact search (default is SmartSearch that works like in Adobe Reader)

                // Load document
                textExtractor.LoadDocumentFromFile("Invoice.pdf");
                xmlExtractor.LoadDocumentFromFile("Invoice.pdf");

                // Query the page count once instead of on every loop iteration
                int pageCount = textExtractor.GetPageCount();

                // Iterate pages
                for (int i = 0; i < pageCount; i++)
                {
                    RectangleF pageRectangle = textExtractor.GetPageRectangle(i);
                    RectangleF tableRect     = new RectangleF(0, 0, pageRectangle.Width, 0);

                    // Search for "Invoice No." and assume the text at right is the invoice number
                    if (textExtractor.Find(i, "Invoice No.", false))
                    {
                        invoiceNo = ExtractTextRightOf(textExtractor, i, textExtractor.FoundText.Bounds, pageRectangle);
                    }

                    // Search for "Invoice Date" and extract text at right
                    if (textExtractor.Find(i, "Invoice Date", false))
                    {
                        invoiceDate = ExtractTextRightOf(textExtractor, i, textExtractor.FoundText.Bounds, pageRectangle);
                    }

                    // Search for "Quantity" keyword to detect the top of the tabular data rectangle
                    if (textExtractor.Find(i, "Quantity", false))
                    {
                        // Keep the top table coordinate
                        tableRect.Y = textExtractor.FoundText.Bounds.Top;                     // use Bounds.Bottom if you want to skip column headers
                    }

                    // Search for "TOTAL" (it will be also the bottom of tabular data rectangle)
                    if (textExtractor.Find(i, "TOTAL", true /* case sensitive! */))
                    {
                        // Capture the bounds before extracting; the top edge also closes the table rectangle
                        RectangleF totalBounds = textExtractor.FoundText.Bounds;
                        total = ExtractTextRightOf(textExtractor, i, totalBounds, pageRectangle);

                        // Calculate the table height
                        tableRect.Height = totalBounds.Top - tableRect.Top;
                    }

                    // Extract tabular data using XMLExtractor
                    if (tableRect.Height > 0)
                    {
                        xmlExtractor.SetExtractionArea(tableRect);
                        tableData = xmlExtractor.GetXMLFromPage(i);
                    }
                }
            }

            // Display extracted data
            Console.WriteLine("Invoice No.: " + invoiceNo);
            Console.WriteLine("Invoice Date: " + invoiceDate);
            Console.WriteLine("TOTAL: " + total);
            Console.WriteLine("Table Data: ");
            Console.WriteLine(tableData);

            Console.WriteLine("Press any key...");
            Console.ReadKey();
        }

        /// <summary>
        /// Extracts the trimmed text located between the right edge of <paramref name="anchor"/>
        /// and the right edge of the page, on the same vertical band as the anchor.
        /// </summary>
        /// <param name="extractor">Extractor used to set the extraction area and read the text.</param>
        /// <param name="pageIndex">Index of the page to read.</param>
        /// <param name="anchor">Bounds of the label whose right-hand neighbour is wanted (passed by value).</param>
        /// <param name="pageRectangle">Rectangle of the page, used for its right edge.</param>
        /// <returns>The trimmed text returned by GetTextFromPage for the shifted rectangle.</returns>
        private static string ExtractTextRightOf(TextExtractor extractor, int pageIndex, RectangleF anchor, RectangleF pageRectangle)
        {
            // Shift the rectangle to start where the label ends, then stretch it to the page's right edge
            anchor.X     = anchor.Right;
            anchor.Width = pageRectangle.Right - anchor.Left;

            // Set the extraction region and extract the text
            extractor.SetExtractionArea(anchor);
            return extractor.GetTextFromPage(pageIndex).Trim();
        }

コード例 #9

ファイルを表示

        /// <summary>
        /// Extracts part-list rows from the given PDF files and returns them keyed by a running index.
        /// </summary>
        /// <remarks>
        /// For every file, pages from index 1 onward are read as text and split into lines. The first
        /// line containing "part" is taken as the title (via <c>detectTitle</c>), the next line that
        /// mentions one of the header keywords selects the column layout (1, 2 or 3), and each following
        /// line that splits on double spaces into the expected number of columns becomes one
        /// <c>PDFLineInfo</c>. Progress is written to the <c>Form1</c> controls after each file.
        /// The method relies on the <c>findTitle</c>, <c>findheader</c> and <c>nTotalIndex</c> members
        /// declared elsewhere in the class.
        /// </remarks>
        /// <param name="urllist">Paths of the PDF documents to load.</param>
        /// <returns>Dictionary of extracted rows; keys are successive values of <c>nTotalIndex</c>.</returns>
        /// <exception cref="FormatException">A quantity column cannot be parsed by <c>Int32.Parse</c>.</exception>
        public static Dictionary <int, PDFLineInfo> ScrapyDataFromPDFiles(string[] urllist)
        {
            Dictionary <int, PDFLineInfo> dictionary = new Dictionary <int, PDFLineInfo>();

            // Create Bytescout.PDFExtractor.TextExtractor instance; the using block disposes it
            // when all files are processed or an exception is thrown.
            using (TextExtractor extractor = new TextExtractor())
            {
                extractor.RegistrationName = "demo";
                extractor.RegistrationKey  = "demo";

                for (int nFileIndex = 0; nFileIndex < urllist.Length; nFileIndex++)
                {
                    string currentFileName  = urllist[nFileIndex];
                    string currentTitleName = "";

                    // Load each PDF Document
                    extractor.LoadDocumentFromFile(currentFileName);
                    int pageCount = extractor.GetPageCount();

                    int pdfDocumentType = -1;// 1: material type 2: spirit type 3: Empty type.

                    // NOTE(review): the loop starts at page index 1, so page index 0 is never read - confirm intended.
                    for (int i = 1; i < pageCount; i++)
                    {
                        // "notable" marks a document in which no title line was found; skip its remaining pages
                        if (currentTitleName.Contains("notable"))
                        {
                            break;
                        }

                        string wholetext = extractor.GetTextFromPage(i);

                        string[] lines = wholetext.Split(new[] { "\r\n", "\r", "\n" }, StringSplitOptions.None);

                        int j = 0;

                        while (j < lines.Length)
                        {
                            string line        = lines[j];
                            string loweredLine = line.ToLower();

                            // The first line mentioning "part" is treated as the title line
                            if (loweredLine.Contains("part") && (findTitle == false))
                            {
                                currentTitleName = detectTitle(line);
                                findTitle        = true;

                                j++;
                                continue;
                            }
                            if (currentTitleName.Contains("notable"))
                            {
                                break;
                            }

                            if (findTitle == false)
                            {
                                // Give up on this page once more than three lines have passed without a title
                                if (j > 2)
                                {
                                    currentTitleName = "notable";
                                    break;
                                }
                                j++;
                                continue;
                            }

                            // Columns are separated by runs of two spaces
                            string[] array = line.Split(new string[] { "  " }, StringSplitOptions.RemoveEmptyEntries);

                            if ((findheader == false) && (findTitle == true))
                            {
                                if (loweredLine.Contains("dyaco") || loweredLine.Contains("material") ||
                                    loweredLine.Contains("spirit") || loweredLine.Contains("no") ||
                                    loweredLine.Contains("part") || loweredLine.Contains("qty"))
                                {
                                    findheader = true;

                                    // Pick the column layout from the header line
                                    if (loweredLine.Contains("material"))
                                    {
                                        pdfDocumentType = 1;
                                    }
                                    else if (loweredLine.Contains("spirit"))
                                    {
                                        pdfDocumentType = 2;
                                    }
                                    else if ((array.Length > 2) && array[2].ToLower().Contains("part"))
                                    {
                                        // A third header column naming the part selects layout 2;
                                        // this branch must not fall through to the layout 3 default.
                                        pdfDocumentType = 2;
                                    }
                                    else
                                    {
                                        pdfDocumentType = 3;
                                    }

                                    j++;
                                    continue;
                                }

                                if (array.Length < 4)
                                {
                                    j++;
                                    continue;
                                }
                            }

                            // Skip the watermark lines inserted by the trial version of the SDK
                            if (line.Contains("(TRIAL VER. PDF Extractor SDK 8.4.1.2829.888331924)") || line.Contains("TRIAL VERSION EXPIRES 90 DAYS AFTER INSTALLATION"))
                            {
                                j++;
                                continue;
                            }

                            if (pdfDocumentType == -1)
                            {
                                // No header has selected a column layout yet, so the row cannot be mapped;
                                // report it and skip the line instead of storing an unfilled record.
                                MessageBox.Show("Can not get pdf Type");
                                j++;
                                continue;
                            }

                            if (pdfDocumentType == 3)
                            {
                                // Layout 3 rows have exactly three columns and a key of at most five characters
                                if ((array.Length != 3) || (array[0].Length > 5))
                                {
                                    j++;
                                    continue;
                                }
                            }
                            else
                            {
                                if (array.Length != 4)
                                {
                                    j++;
                                    continue;
                                }
                            }

                            // A fresh record per row, so stored entries do not share one instance
                            // (matters if PDFLineInfo is a reference type).
                            PDFLineInfo temp = new PDFLineInfo();

                            switch (pdfDocumentType)
                            {
                            case 1:
                                temp.PartID   = RemoveSpace(array[1]);
                                temp.PartName = RemoveSpace(array[2]);
                                temp.PartKey  = RemoveSpace(array[0]);
                                temp.Quantity = Int32.Parse(RemoveSpace(array[3]));

                                break;

                            case 2:
                                temp.PartID   = RemoveSpace(array[2]);
                                temp.PartName = RemoveSpace(array[1]);
                                temp.PartKey  = RemoveSpace(array[0]);
                                temp.Quantity = Int32.Parse(RemoveSpace(array[3]));

                                break;

                            case 3:
                                temp.PartID   = "";                                     //empty
                                temp.PartName = RemoveSpace(array[1]);
                                temp.PartKey  = RemoveSpace(array[0]);
                                temp.Quantity = Int32.Parse(RemoveSpace(array[2]));

                                break;
                            }

                            temp.ProductName = currentTitleName;

                            j++;

                            // Add values to PDFLineInfo dictionary
                            dictionary.Add(nTotalIndex, temp);

                            nTotalIndex++;
                        }
                    }

                    // Reset the per-document parsing state before the next file
                    findheader       = false;
                    findTitle        = false;
                    currentTitleName = "";

                    // NOTE(review): this scales progress to 0..19 rather than 0..100 - confirm intended.
                    int currentpercent = (int)(20 * nFileIndex / urllist.Length);

                    Console.WriteLine("*******" + nFileIndex + "*********" + currentpercent + "********");

                    // Update the progress display.
                    // NOTE(review): percentlabel is assigned directly while progressBar1 goes through
                    // BeginInvoke - confirm which thread this method runs on.
                    Form1.progressvalue = currentpercent;
                    Form1.progressBar1.BeginInvoke(new Action(() => Form1.progressBar1.Value = currentpercent));
                    Form1.percentlabel.Text = currentpercent.ToString() + "%";
                }
            }

            return(dictionary);
        }