TextExtractor.Find C# (CSharp)代码示例

示例#1

0

显示文件

文件： Default.aspx.cs 项目： remlex/ByteScout-SDK-SourceCode

        /// <summary>
        /// Handles the page load: searches the first page (index 0) of "words-with-hyphens.pdf"
        /// for the string "hyphen" and writes the bounds of each occurrence to the response as HTML.
        /// </summary>
        /// <param name="sender">Event source; not used.</param>
        /// <param name="e">Event arguments; not used.</param>
        protected void Page_Load(object sender, EventArgs e)
        {
            // This test file will be copied to the project directory on the pre-build event (see the project properties).
            String inputFile = Server.MapPath("words-with-hyphens.pdf");

            // Create Bytescout.PDFExtractor.TextExtractor instance. The using block disposes it
            // once the search is finished, including when an exception is thrown.
            using (TextExtractor extractor = new TextExtractor())
            {
                extractor.RegistrationName = "demo";
                extractor.RegistrationKey  = "demo";

                // Load sample PDF document
                extractor.LoadDocumentFromFile(inputFile);

                Response.Clear();
                Response.ContentType = "text/html";

                Response.Write("Searching for \"hyphen\" string:<br><br>");

                // Search for "hyphen" string on the first page, then walk every further occurrence
                if (extractor.Find(0, "hyphen", false))
                {
                    do
                    {
                        Response.Write("Found at location " + extractor.FoundText.Bounds.ToString() + "<br>");
                    } while (extractor.FindNext());
                }
            }

            // End the response only after the extractor has been released
            Response.End();
        }

示例#2

0

显示文件

文件： Program.cs 项目： repohoarder/ByteScout-SDK-SourceCode

        /// <summary>
        /// Searches every page of "words-with-hyphens.pdf" for the string "hyphen" and prints
        /// the page index and bounds of each occurrence to the console.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            // Create Bytescout.PDFExtractor.TextExtractor instance. The using block guarantees
            // disposal even if loading or searching throws.
            using (TextExtractor extractor = new TextExtractor())
            {
                extractor.RegistrationName = "demo";
                extractor.RegistrationKey  = "demo";

                // Load sample PDF document
                extractor.LoadDocumentFromFile("words-with-hyphens.pdf");

                int pageCount = extractor.GetPageCount();

                for (int i = 0; i < pageCount; i++)
                {
                    // Search each page for "hyphen" string
                    if (extractor.Find(i, "hyphen", false))
                    {
                        do
                        {
                            Console.WriteLine("Found on page " + i + " at location " + extractor.FoundText.Bounds.ToString());
                        } while (extractor.FindNext());
                    }
                }
            }

            Console.WriteLine();
            Console.WriteLine("Press any key to continue...");
            Console.ReadLine();
        }

示例#3

0

显示文件

文件： MainWindow.xaml.cs 项目： remlex/ByteScout-SDK-SourceCode

        /// <summary>
        /// Click handler: searches the first page of the loaded document for the text typed into
        /// <c>textBoxFind</c> and shows a report of every occurrence (bounds plus per-element
        /// font details) in <c>textBox1</c>. Does nothing when the search box is empty.
        /// </summary>
        /// <remarks>
        /// <c>extractor</c>, <c>textBoxFind</c> and <c>textBox1</c> are members declared outside this method.
        /// </remarks>
        /// <param name="sender">Event source; not used.</param>
        /// <param name="e">Event arguments; not used.</param>
        private void Button_Find(object sender, RoutedEventArgs e)
        {
            // Only the first page is searched. The original message referenced an undeclared
            // variable "i"; the page index is now named and shared by the search and the report.
            const int pageIndex = 0;

            if (textBoxFind.Text.Length > 0)
            {
                StringBuilder builder = new StringBuilder();

                builder.AppendLine("Searching for \"" + textBoxFind.Text + "\"");

                if (extractor.Find(pageIndex, textBoxFind.Text, false))
                {
                    do
                    {
                        builder.AppendLine("");
                        builder.AppendLine("Found on page " + pageIndex + " at location " + extractor.FoundText.Bounds.ToString());
                        builder.AppendLine("");
                        // Iterate through each element in the found text
                        foreach (SearchResultElement element in extractor.FoundText.Elements)
                        {
                            builder.AppendLine("Element #" + element.Index + " at left=" + element.Left + "; top=" + element.Top + "; width=" + element.Width + "; height=" + element.Height);
                            builder.AppendLine("Text: " + element.Text);
                            builder.AppendLine("Font is bold: " + element.FontIsBold);
                            builder.AppendLine("Font is italic:" + element.FontIsItalic);
                            builder.AppendLine("Font name: " + element.FontName);
                            builder.AppendLine("Font size:" + element.FontSize);
                            builder.AppendLine("Font color:" + element.FontColor);
                        }
                    } while (extractor.FindNext());
                }

                builder.AppendLine("Finished.");

                textBox1.Text = builder.ToString();
            }
        }

示例#4

0

显示文件

文件： Program.cs 项目： babylon3389/ByteScout-SDK-SourceCode

        /// <summary>
        /// Scans every page of "sample2.pdf" for the keyword "References" and, for each page that
        /// contains it, prints the text extracted from the area (0, 0, 600, 200) of that page.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            // Create Bytescout.PDFExtractor.TextExtractor instance. The using block disposes it
            // when the scan is over; previously it was never released.
            using (TextExtractor extractor = new TextExtractor())
            {
                extractor.RegistrationName = "demo";
                extractor.RegistrationKey  = "demo";

                // Load sample PDF document
                extractor.LoadDocumentFromFile(@".\sample2.pdf");

                int pageCount = extractor.GetPageCount();

                // Search each page for some keyword
                for (int i = 0; i < pageCount; i++)
                {
                    if (extractor.Find(i, "References", false))
                    {
                        // If page contains the keyword, extract a text from it.
                        // For demonstration we'll extract the text from top part of the page only.
                        // NOTE(review): the extraction area is set after the first hit and never reset
                        // here; whether it also narrows later Find calls is not visible from this
                        // code - confirm against the SDK documentation.
                        extractor.SetExtractionArea(0, 0, 600, 200);
                        string text = extractor.GetTextFromPage(i);
                        Console.WriteLine(text);
                    }
                }
            }

            Console.WriteLine();
            Console.WriteLine("Press any key to continue...");
            Console.ReadLine();
        }

示例#5

0

显示文件

        /// <summary>
        /// Sorts the PDF files found in the "InputFiles" folder into category sub-folders: every
        /// page of every file is tested against each regex from "settings.json", and on a match the
        /// file is copied (overwriting) into "{MainFolderName}/{category}".
        /// Any exception is reported to the console as "Error: ..." instead of being rethrown.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            try
            {
                // Get all settings VM
                var allSettings = GetSettingsVM("settings.json");

                // Create Bytescout.PDFExtractor.TextExtractor instance. The using block releases it
                // even when an exception jumps to the catch below (the trailing Dispose() call
                // used previously was skipped in that case).
                using (TextExtractor extractor = new TextExtractor())
                {
                    extractor.RegistrationName = "demo";
                    extractor.RegistrationKey  = "demo";

                    foreach (var fileName in Directory.GetFiles("InputFiles"))
                    {
                        // Load sample PDF document
                        extractor.LoadDocumentFromFile(fileName);

                        // Enable regex search
                        extractor.RegexSearch = true;

                        // Get Number of pages PDF contains
                        int pageCount = extractor.GetPageCount();

                        for (int iPage = 0; iPage < pageCount; iPage++)
                        {
                            // Loop through all search settings
                            foreach (var itmSearchSetting in allSettings.Settings)
                            {
                                if (!extractor.Find(iPage, itmSearchSetting.regex, false))
                                {
                                    continue;
                                }

                                // Found: copy the file to its sub-category folder.
                                string categoryFolder = $"{allSettings.MainFolderName}/{itmSearchSetting.category}";

                                // CreateDirectory does nothing when the folder already exists,
                                // so no separate existence check is needed.
                                Directory.CreateDirectory(categoryFolder);

                                // Copy File (overwrite an earlier copy of the same file)
                                File.Copy(fileName, $"{categoryFolder}/{Path.GetFileName(fileName)}", true);
                            }
                        }
                    }
                }
            }
            catch (Exception ex)
            {
                Console.WriteLine("Error: " + ex.Message);
            }

            Console.WriteLine();
            Console.WriteLine("Press enter key to continue...");
            Console.ReadLine();
        }

示例#6

0

显示文件

文件： Program.cs 项目： jboddiford/ByteScout-SDK-SourceCode

        /// <summary>
        /// Searches every page of "Invoice.pdf" for dates written like 12/31/1999 (regex search)
        /// and prints, for each match, its page index, bounds and the text and font details of
        /// every element the match consists of.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            // Create Bytescout.PDFExtractor.TextExtractor instance. The using block guarantees
            // disposal even if loading or searching throws.
            using (TextExtractor extractor = new TextExtractor())
            {
                extractor.RegistrationName = "demo";
                extractor.RegistrationKey  = "demo";

                // Load sample PDF document
                extractor.LoadDocumentFromFile(@".\Invoice.pdf");

                extractor.RegexSearch = true; // Enable the regular expressions

                int pageCount = extractor.GetPageCount();

                // Search through pages
                for (int i = 0; i < pageCount; i++)
                {
                    // Search dates in format 12/31/1999
                    string regexPattern = "[0-9]{2}/[0-9]{2}/[0-9]{4}";
                    // See the complete regular expressions reference at https://msdn.microsoft.com/en-us/library/az24scfc(v=vs.110).aspx

                    // Search each page for the pattern
                    if (extractor.Find(i, regexPattern, false))
                    {
                        do
                        {
                            Console.WriteLine("");
                            Console.WriteLine("Found on page " + i + " at location " + extractor.FoundText.Bounds);
                            Console.WriteLine("");

                            // Iterate through each element in the found text
                            foreach (ISearchResultElement element in extractor.FoundText.Elements)
                            {
                                Console.WriteLine("   Text: " + element.Text);
                                Console.WriteLine("   Font is bold: " + element.FontIsBold);
                                Console.WriteLine("   Font is italic: " + element.FontIsItalic);
                                Console.WriteLine("   Font name: " + element.FontName);
                                Console.WriteLine("   Font size: " + element.FontSize);
                                Console.WriteLine("   Font color: " + element.FontColor);
                                Console.WriteLine();
                            }
                        } while (extractor.FindNext());
                    }
                }
            }

            Console.WriteLine();
            Console.WriteLine("Press any key to continue...");
            Console.ReadLine();
        }

示例#7

0

显示文件

文件： Program.cs 项目： babylon3389/ByteScout-SDK-SourceCode

        /// <summary>
        /// Renders the first page of "sample.pdf" to an image, paints a half-transparent yellow
        /// rectangle over every piece of text matching the regex <c>\d+\.\d+</c>, saves the image
        /// as "result.png" and opens it with the associated application.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            const string inputFile           = @"sample.pdf";
            const int    pageIndex           = 0;
            const float  renderingResolution = 300f;
            const string searchPattern       = "\\d+\\.\\d+";

            // Prepare TextExtractor
            using (TextExtractor textExtractor = new TextExtractor("demo", "demo"))
            {
                textExtractor.RegexSearch = true;
                textExtractor.LoadDocumentFromFile(inputFile);

                // Prepare RasterRenderer
                using (RasterRenderer rasterRenderer = new RasterRenderer("demo", "demo"))
                {
                    rasterRenderer.LoadDocumentFromFile(inputFile);

                    // Render document page to image and prepare the highlight brush.
                    // Both are disposable and are now released when painting and saving are done.
                    using (Image image = rasterRenderer.GetImage(pageIndex, renderingResolution))
                    using (Brush highlightBrush = new SolidBrush(Color.FromArgb(128, Color.Yellow)))
                    {
                        using (Graphics graphics = Graphics.FromImage(image))
                        {
                            // Search for pattern and paint found pieces
                            if (textExtractor.Find(pageIndex, searchPattern, caseSensitive: false))
                            {
                                do
                                {
                                    foreach (var foundPiece in textExtractor.FoundText.Elements)
                                    {
                                        // Convert from document Points to pixels
                                        Rectangle pixelRect = new Rectangle(
                                            (int)(foundPiece.Bounds.Left / 72f * renderingResolution),
                                            (int)(foundPiece.Bounds.Top / 72f * renderingResolution),
                                            (int)(foundPiece.Bounds.Width / 72f * renderingResolution),
                                            (int)(foundPiece.Bounds.Height / 72f * renderingResolution)
                                            );

                                        // Paint rectangle
                                        graphics.FillRectangle(highlightBrush, pixelRect);
                                    }
                                } while (textExtractor.FindNext());
                            }
                        }

                        // Save after the Graphics object has been disposed
                        image.Save("result.png");
                    }

                    Process.Start("result.png");
                }
            }
        }

示例#8

0

显示文件

文件： Default.aspx.cs 项目： repohoarder/ByteScout-SDK-SourceCode

        /*
         * IF YOU SEE TEMPORARY FOLDER ACCESS ERRORS:
         *
         * Temporary folder access is required for web application when you use ByteScout SDK in it.
         * If you are getting errors related to the access to temporary folder like "Access to the path 'C:\Windows\TEMP\... is denied" then you need to add permission for this temporary folder to make ByteScout SDK working on that machine and IIS configuration because ByteScout SDK requires access to temp folder to cache some of its data for more efficient work.
         *
         * SOLUTION:
         *
         * If your IIS Application Pool has the "Load User Profile" option enabled, IIS provides access to the user's temp folder. Check that user's temporary folder.
         *
         * If you are running Web Application under an impersonated account or IIS_IUSRS group, IIS may redirect all requests into separate temp folder like "c:\temp\".
         *
         * In this case
         * - check the User or User Group your web application is running under
         * - then add permissions for this User or User Group to read and write into that temp folder (c:\temp or c:\windows\temp\ folder)
         * - restart your web application and try again
         *
         */

        /// <summary>
        /// Handles the page load: searches the first page of "sample1.pdf" for the whole word
        /// "ipsum" (exact word matching) and writes each occurrence, with the bounds, text and
        /// font details of its elements, to the response as HTML.
        /// </summary>
        /// <param name="sender">Event source; not used.</param>
        /// <param name="e">Event arguments; not used.</param>
        protected void Page_Load(object sender, EventArgs e)
        {
            String inputFile = Server.MapPath(@".\bin\sample1.pdf");

            // Create Bytescout.PDFExtractor.TextExtractor instance. The using block disposes it
            // once the search is finished; previously it was never released.
            using (TextExtractor extractor = new TextExtractor())
            {
                extractor.RegistrationName = "demo";
                extractor.RegistrationKey  = "demo";

                // Load sample PDF document
                extractor.LoadDocumentFromFile(inputFile);

                // Set the matching mode.
                // WordMatchingMode.None - treats the search string as substring
                // WordMatchingMode.ExactMatch - treats the search string as separate word
                // WordMatchingMode.SmartMatch - will find the word in various forms (like Adobe Reader).
                extractor.WordMatchingMode = WordMatchingMode.ExactMatch;

                Response.Clear();
                Response.ContentType = "text/html";

                Response.Write("Searching for \"ipsum\" string:<br>");

                // Search for "ipsum" string
                if (extractor.Find(0, "ipsum", false))
                {
                    do
                    {
                        Response.Write("<br/>");
                        Response.Write("Found on page 1 at location " + extractor.FoundText.Bounds + "<br/>");
                        Response.Write("<br/>");

                        // The found text may be split into parts.
                        // Iterate through each part of the found text.
                        for (var i = 0; i < extractor.FoundText.Elements.Count; i++)
                        {
                            ISearchResultElement element = extractor.FoundText.Elements[i];

                            Response.Write("Element #" + i + " at " + element.Bounds + "<br/>");
                            // Text and font name come from the PDF content, so HTML-encode them
                            // before placing them into the page markup.
                            Response.Write("Text: " + Server.HtmlEncode(element.Text) + "<br/>");
                            Response.Write("Font is bold: " + element.FontIsBold + "<br/>");
                            Response.Write("Font is italic:" + element.FontIsItalic + "<br/>");
                            Response.Write("Font name: " + Server.HtmlEncode(element.FontName) + "<br/>");
                            Response.Write("Font size:" + element.FontSize + "<br/>");
                            Response.Write("Font color:" + element.FontColor + "<br/>");
                        }
                    } while (extractor.FindNext());
                }
            }

            // End the response only after the extractor has been released
            Response.End();
        }

示例#9

0

显示文件

        /// <summary>
        /// Searches every page of "sample1.pdf" for the whole word "ipsum" (exact word matching)
        /// and prints each occurrence with the position, text and font details of its elements.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            // Create Bytescout.PDFExtractor.TextExtractor instance. The using block guarantees
            // disposal even if loading or searching throws.
            using (TextExtractor extractor = new TextExtractor())
            {
                extractor.RegistrationName = "demo";
                extractor.RegistrationKey  = "demo";

                // Load sample PDF document
                extractor.LoadDocumentFromFile(@".\sample1.pdf");

                // Set the matching mode.
                // WordMatchingMode.None - treats the search string as substring
                // WordMatchingMode.ExactMatch - treats the search string as separate word
                // WordMatchingMode.SmartMatch - will find the word in various forms (like Adobe Reader).
                extractor.WordMatchingMode = WordMatchingMode.ExactMatch;

                int pageCount = extractor.GetPageCount();

                for (int i = 0; i < pageCount; i++)
                {
                    // Search each page for "ipsum" string
                    if (extractor.Find(i, "ipsum", false))
                    {
                        do
                        {
                            Console.WriteLine("");
                            Console.WriteLine("Found on page " + i + " at location " + extractor.FoundText.Bounds.ToString());
                            Console.WriteLine("");
                            // Iterate through each element in the found text
                            foreach (SearchResultElement element in extractor.FoundText.Elements)
                            {
                                Console.WriteLine("Element #" + element.Index + " at left=" + element.Left + "; top=" + element.Top + "; width=" + element.Width + "; height=" + element.Height);
                                Console.WriteLine("Text: " + element.Text);
                                Console.WriteLine("Font is bold: " + element.FontIsBold);
                                Console.WriteLine("Font is italic:" + element.FontIsItalic);
                                Console.WriteLine("Font name: " + element.FontName);
                                Console.WriteLine("Font size:" + element.FontSize);
                                Console.WriteLine("Font color:" + element.FontColor);
                            }
                        } while (extractor.FindNext());
                    }
                }
            }

            Console.WriteLine();
            Console.WriteLine("Press any key to continue...");
            Console.ReadLine();
        }

示例#10

0

显示文件

        /// <summary>
        /// Searches every page of "sample1.pdf" with the regex "LABORIS.*VELIT" and prints each
        /// match with the position, text and font details of its elements.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            // Create Bytescout.PDFExtractor.TextExtractor instance. The using block disposes it
            // when the search is over; previously it was never released.
            using (TextExtractor extractor = new TextExtractor())
            {
                extractor.RegistrationName = "demo";
                extractor.RegistrationKey  = "demo";

                // Load sample PDF document
                extractor.LoadDocumentFromFile("sample1.pdf");

                int pageCount = extractor.GetPageCount();

                extractor.RegexSearch = true; // Turn on the regular expression search

                // Search through pages
                for (int i = 0; i < pageCount; i++)
                {
                    // Searches for the text starting from LABORIS and ending with VELIT words
                    string regexPattern = "LABORIS.*VELIT";
                    // See the complete regular expressions reference at https://msdn.microsoft.com/en-us/library/az24scfc(v=vs.110).aspx

                    // Search each page for the pattern
                    if (extractor.Find(i, regexPattern, false))
                    {
                        do
                        {
                            Console.WriteLine("");
                            Console.WriteLine("Found on page " + i + " at location " + extractor.FoundText.Bounds.ToString());
                            Console.WriteLine("");
                            // Iterate through each element in the found text
                            foreach (SearchResultElement element in extractor.FoundText.Elements)
                            {
                                Console.WriteLine("Element #" + element.Index + " at left=" + element.Left + "; top=" + element.Top + "; width=" + element.Width + "; height=" + element.Height);
                                Console.WriteLine("Text: " + element.Text);
                                Console.WriteLine("Font is bold: " + element.FontIsBold);
                                Console.WriteLine("Font is italic:" + element.FontIsItalic);
                                Console.WriteLine("Font name: " + element.FontName);
                                Console.WriteLine("Font size:" + element.FontSize);
                                Console.WriteLine("Font color:" + element.FontColor);
                            }
                        } while (extractor.FindNext());
                    }
                }
            }

            Console.WriteLine();
            Console.WriteLine("Press any key to continue...");
            Console.ReadLine();
        }

示例#11

0

显示文件

        /// <summary>
        /// Scans every page of "SampleInvoice.pdf" for text shaped like a credit card number
        /// (four groups of four digits separated by single spaces) and prints the text of every
        /// element of each match. Exceptions are reported to the console as "Error: ...".
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            // Credit card number in format of (XXXX XXXX XXXX XXXX).
            // See the complete regular expressions reference at https://msdn.microsoft.com/en-us/library/az24scfc(v=vs.110).aspx
            const string cardNumberPattern = @"[0-9]{4} [0-9]{4} [0-9]{4} [0-9]{4}";

            try
            {
                using (TextExtractor pdf = new TextExtractor())
                {
                    pdf.RegistrationName = "demo";
                    pdf.RegistrationKey  = "demo";

                    // Open the sample invoice, then switch the search to regular-expression mode
                    pdf.LoadDocumentFromFile("SampleInvoice.pdf");
                    pdf.RegexSearch = true;

                    int totalPages = pdf.GetPageCount();

                    for (int page = 0; page < totalPages; page++)
                    {
                        // Find starts the search on this page; FindNext advances to the following match
                        for (bool hit = pdf.Find(page, cardNumberPattern, false); hit; hit = pdf.FindNext())
                        {
                            // A match can consist of several elements; report each of them
                            foreach (ISearchResultElement piece in pdf.FoundText.Elements)
                            {
                                Console.WriteLine("Found Credit Card Number: " + piece.Text);
                            }
                        }
                    }
                }
            }
            catch (Exception failure)
            {
                Console.WriteLine("Error: " + failure.Message);
            }

            Console.WriteLine();
            Console.WriteLine("Press enter key to continue...");
            Console.ReadLine();
        }

示例#12

0

显示文件

        /// <summary>
        /// Finds every match of the regex <c>\d+\.\d+</c> on the first page of "sample.pdf",
        /// draws a transparent red rectangle over each matched piece on that page, saves the
        /// document as "result.pdf" and opens it with the associated application.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            const string sourcePdf     = @"sample.pdf";
            const int    targetPage    = 0;
            const string numberPattern = "\\d+\\.\\d+";

            // Text search side: extractor in regex mode over the source file
            using (TextExtractor finder = new TextExtractor("demo", "demo"))
            {
                finder.RegexSearch = true;
                finder.LoadDocumentFromFile(sourcePdf);

                // Drawing side: the same file opened with the PDF SDK
                using (Document document = new Document(sourcePdf))
                {
                    document.RegistrationName = "demo";
                    document.RegistrationKey  = "demo";

                    Canvas pageCanvas = document.Pages[targetPage].Canvas;

                    // Red brush, made transparent through its opacity setting
                    SolidBrush highlight = new SolidBrush(new ColorRGB(255, 0, 0));
                    highlight.Opacity = 50;

                    // Find starts the search; FindNext advances to each following match
                    for (bool matched = finder.Find(targetPage, numberPattern, caseSensitive: false);
                         matched;
                         matched = finder.FindNext())
                    {
                        foreach (var piece in finder.FoundText.Elements)
                        {
                            // Grow the piece bounds slightly (1 horizontally, 2 vertically) and paint over the page
                            RectangleF padded = RectangleF.Inflate(piece.Bounds, 1, 2);
                            pageCanvas.DrawRectangle(highlight, padded);
                        }
                    }

                    // Write the highlighted copy
                    document.Save("result.pdf");

                    // Show it in the default associated application (for demo purposes)
                    Process.Start("result.pdf");
                }
            }
        }

示例#13

0

显示文件

        /// <summary>
        /// Handles the page load: searches the first page of "sample1.pdf" for the string "ipsum"
        /// and writes each occurrence, with the position, text and font details of its elements,
        /// to the response as HTML.
        /// </summary>
        /// <param name="sender">Event source; not used.</param>
        /// <param name="e">Event arguments; not used.</param>
        protected void Page_Load(object sender, EventArgs e)
        {
            // This test file will be copied to the project directory on the pre-build event (see the project properties).
            String inputFile = Server.MapPath("sample1.pdf");

            // Create Bytescout.PDFExtractor.TextExtractor instance. The using block disposes it
            // once the search is finished; previously it was never released.
            using (TextExtractor extractor = new TextExtractor())
            {
                extractor.RegistrationName = "demo";
                extractor.RegistrationKey  = "demo";

                // Load sample PDF document
                extractor.LoadDocumentFromFile(inputFile);

                Response.Clear();
                Response.ContentType = "text/html";

                Response.Write("Searching for \"ipsum\" string:<br><br>");

                // Search for "ipsum" string
                if (extractor.Find(0, "ipsum", false))
                {
                    do
                    {
                        Response.Write("<br/>");
                        Response.Write("Found on page 1 at location " + extractor.FoundText.Bounds.ToString() + "<br/>");
                        Response.Write("<br/>");
                        // Iterate through each element in the found text
                        foreach (SearchResultElement element in extractor.FoundText.Elements)
                        {
                            Response.Write("Element #" + element.Index + " at left=" + element.Left + "; top=" + element.Top + "; width=" + element.Width + "; height=" + element.Height + "<br/>");
                            // Text and font name come from the PDF content, so HTML-encode them
                            // before placing them into the page markup.
                            Response.Write("Text: " + Server.HtmlEncode(element.Text) + "<br/>");
                            Response.Write("Font is bold: " + element.FontIsBold + "<br/>");
                            Response.Write("Font is italic:" + element.FontIsItalic + "<br/>");
                            Response.Write("Font name: " + Server.HtmlEncode(element.FontName) + "<br/>");
                            Response.Write("Font size:" + element.FontSize + "<br/>");
                            Response.Write("Font color:" + element.FontColor + "<br/>");
                        }
                    } while (extractor.FindNext());
                }
            }

            // End the response only after the extractor has been released
            Response.End();
        }

示例#14

0

显示文件

文件： Program.cs 项目： remlex/ByteScout-SDK-SourceCode

        /// <summary>
        /// Searches every page of "sample1.pdf" for the string "ipsum" and prints each occurrence
        /// with the position, text and font details of its elements.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            // Create Bytescout.PDFExtractor.TextExtractor instance. The using block disposes it
            // when the search is over; previously it was never released.
            using (TextExtractor extractor = new TextExtractor())
            {
                extractor.RegistrationName = "demo";
                extractor.RegistrationKey  = "demo";

                // Load sample PDF document
                extractor.LoadDocumentFromFile("sample1.pdf");

                int pageCount = extractor.GetPageCount();

                for (int i = 0; i < pageCount; i++)
                {
                    // Search each page for "ipsum" string
                    if (extractor.Find(i, "ipsum", false))
                    {
                        do
                        {
                            Console.WriteLine("");
                            Console.WriteLine("Found on page " + i + " at location " + extractor.FoundText.Bounds.ToString());
                            Console.WriteLine("");
                            // Iterate through each element in the found text
                            foreach (SearchResultElement element in extractor.FoundText.Elements)
                            {
                                Console.WriteLine("Element #" + element.Index + " at left=" + element.Left + "; top=" + element.Top + "; width=" + element.Width + "; height=" + element.Height);
                                Console.WriteLine("Text: " + element.Text);
                                Console.WriteLine("Font is bold: " + element.FontIsBold);
                                Console.WriteLine("Font is italic:" + element.FontIsItalic);
                                Console.WriteLine("Font name: " + element.FontName);
                                Console.WriteLine("Font size:" + element.FontSize);
                                Console.WriteLine("Font color:" + element.FontColor);
                            }
                        } while (extractor.FindNext());
                    }
                }
            }

            Console.WriteLine();
            Console.WriteLine("Press any key to continue...");
            Console.ReadLine();
        }

示例#15

0

显示文件

        /// <summary>
        /// Searches every page of "sample2.pdf" for the string "land" using smart word matching
        /// and prints the page index, bounds and matched text of each occurrence.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            // The using block guarantees the extractor is disposed even if loading or searching throws.
            using (TextExtractor extractor = new TextExtractor("demo", "demo"))
            {
                // Load the document
                extractor.LoadDocumentFromFile("sample2.pdf");

                // Smart match the search string like Adobe Reader
                extractor.WordMatchingMode = WordMatchingMode.SmartMatch;

                string searchString = "land";

                // Get page count
                int pageCount = extractor.GetPageCount();

                // Iterate through pages
                for (int i = 0; i < pageCount; i++)
                {
                    // Search for text string
                    if (extractor.Find(i, searchString, false))
                    {
                        do
                        {
                            // Output search results
                            Console.WriteLine("Found on page " + i + " at location " + extractor.FoundText.Bounds.ToString());

                            // Now we are getting the found text
                            string extractedString = extractor.FoundText.Text;
                            Console.WriteLine("Found text: " + extractedString);
                        } while (extractor.FindNext()); // Search next occurrence of the search string
                    }
                }
            }

            Console.WriteLine();
            Console.WriteLine("Press any key to exit...");
            Console.ReadKey();
        }

示例#16

0

显示文件

文件： Program.cs 项目： jboddiford/ByteScout-SDK-SourceCode

        /// <summary>
        /// Scans every page of "sample2.pdf" for the keyword "bombardment" and extracts each page
        /// that contains it into its own file named "page{N}.pdf", where N is the 1-based page number.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            string inputFile = @".\sample2.pdf";

            // Create Bytescout.PDFExtractor.TextExtractor instance. The using block guarantees
            // disposal even if loading, searching or splitting throws.
            using (TextExtractor extractor = new TextExtractor())
            {
                extractor.RegistrationName = "demo";
                extractor.RegistrationKey  = "demo";

                // Load sample PDF document
                extractor.LoadDocumentFromFile(inputFile);

                int pageCount = extractor.GetPageCount();

                // Search each page for a keyword
                for (int i = 0; i < pageCount; i++)
                {
                    if (extractor.Find(i, "bombardment", false))
                    {
                        // Extract page
                        using (DocumentSplitter splitter = new DocumentSplitter("demo", "demo"))
                        {
                            splitter.OptimizeSplittedDocuments = true;

                            int    pageNumber = i + 1; // (!) page number in ExtractPage() is 1-based
                            string outputFile = @".\page" + pageNumber + ".pdf";
                            splitter.ExtractPage(inputFile, outputFile, pageNumber);

                            Console.WriteLine("Extracted page " + pageNumber + " to file \"" + outputFile + "\"");
                        }
                    }
                }
            }

            Console.WriteLine();
            Console.WriteLine("Press any key...");
            Console.ReadKey();
        }

示例#17

0

显示文件

文件： MainWindow.xaml.cs 项目： remlex/ByteScout-SDK-SourceCode

        // Click handler for the "Find" button: searches page 0 of the loaded
        // document for the text typed into textBoxFind and writes a report of
        // every match location into textBox1. Does nothing when the box is empty.
        private void Button_Find(object sender, RoutedEventArgs e)
        {
            string query = textBoxFind.Text;

            // Nothing to search for
            if (query.Length == 0)
            {
                return;
            }

            StringBuilder report = new StringBuilder();
            report.AppendLine("Searching for \"" + query + "\"");

            // Walk through all occurrences on the first page (index 0)
            bool found = extractor.Find(0, query, false);
            while (found)
            {
                report.AppendLine("Found on page 0 at location " + extractor.FoundText.Location.ToString());
                found = extractor.FindNext();
            }

            report.AppendLine("Finished.");

            textBox1.Text = report.ToString();
        }

示例#18

0

显示文件

        /// <summary>
        /// Searches every page of "sample2.pdf" for the string "what" and prints the
        /// bounds and the text of each occurrence to the console.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // The "using" statement releases the extractor when the search is done
            // or when any SDK call throws (the original never disposed it).
            using (TextExtractor extractor = new TextExtractor("demo", "demo"))
            {
                // load the document
                extractor.LoadDocumentFromFile("sample2.pdf");

                string searchString = "what";

                // get page count
                int pageCount = extractor.GetPageCount();

                // iterate through pages
                for (int i = 0; i < pageCount; i++)
                {
                    // search for text string
                    if (extractor.Find(i, searchString, false))
                    {
                        do
                        {
                            // output search results
                            Console.WriteLine("Found on page " + i + " at location " + extractor.FoundText.Bounds.ToString());

                            // now we are getting the found text
                            string extractedString = extractor.FoundText.Text;
                            Console.WriteLine("Extracted string: " + extractedString);
                        }
                        while (extractor.FindNext()); // search next occurrence of the search string
                    }
                }
            }

            Console.WriteLine("Press any key to exit...");
            Console.ReadKey();
        }

示例#19

0

显示文件

文件： Program.cs 项目： remlex/ByteScout-SDK-SourceCode

        /// <summary>
        /// Parses "Invoice.pdf": finds the "Invoice No.", "Invoice Date" and "TOTAL"
        /// labels, reads the text to the right of each one, extracts the area between
        /// the "Quantity" header and the "TOTAL" line as XML, and prints the results.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // Results
            string invoiceNo   = string.Empty;
            string invoiceDate = string.Empty;
            string total       = string.Empty;
            string tableData   = string.Empty;

            // Both extractors are disposed by "using", even if an SDK call throws.
            using (TextExtractor textExtractor = new TextExtractor("demo", "demo"))
            using (XMLExtractor xmlExtractor = new XMLExtractor("demo", "demo"))
            {
                // Set exact search (default is SmartSearch that works like in Adobe Reader)
                textExtractor.WordMatchingMode = WordMatchingMode.ExactMatch;

                // Load document
                textExtractor.LoadDocumentFromFile("Invoice.pdf");
                xmlExtractor.LoadDocumentFromFile("Invoice.pdf");

                // Page count does not change while iterating, so query it once
                int pageCount = textExtractor.GetPageCount();

                // Iterate pages
                for (int i = 0; i < pageCount; i++)
                {
                    RectangleF pageRectangle = textExtractor.GetPageRectangle(i);
                    // Full page width; top and height are filled in from the keywords found below
                    RectangleF tableRect     = new RectangleF(0, 0, pageRectangle.Width, 0);

                    // NOTE(review): the extraction area set while reading a value stays in
                    // effect for the following Find() calls, exactly as in the original code.
                    // Confirm against the SDK documentation whether it should be reset in between.

                    // Search for "Invoice No." and assume the text at right is the invoice number
                    if (textExtractor.Find(i, "Invoice No.", false))
                    {
                        invoiceNo = ExtractTextRightOf(textExtractor, i, textExtractor.FoundText.Bounds, pageRectangle);
                    }

                    // Search for "Invoice Date" and extract text at right
                    if (textExtractor.Find(i, "Invoice Date", false))
                    {
                        invoiceDate = ExtractTextRightOf(textExtractor, i, textExtractor.FoundText.Bounds, pageRectangle);
                    }

                    // Search for "Quantity" keyword to detect the top of the tabular data rectangle
                    if (textExtractor.Find(i, "Quantity", false))
                    {
                        // Keep the top table coordinate
                        // (use Bounds.Bottom instead if you want to skip column headers)
                        tableRect.Y = textExtractor.FoundText.Bounds.Top;
                    }

                    // Search for "TOTAL" (it will be also the bottom of tabular data rectangle)
                    if (textExtractor.Find(i, "TOTAL", true /* case sensitive! */))
                    {
                        // Capture the bounds before the extraction area is changed
                        RectangleF totalBounds = textExtractor.FoundText.Bounds;
                        total = ExtractTextRightOf(textExtractor, i, totalBounds, pageRectangle);

                        // Calculate the table height
                        tableRect.Height = totalBounds.Top - tableRect.Top;
                    }

                    // Extract tabular data using XMLExtractor
                    if (tableRect.Height > 0)
                    {
                        xmlExtractor.SetExtractionArea(tableRect);
                        tableData = xmlExtractor.GetXMLFromPage(i);
                    }
                }
            }

            // Display extracted data
            Console.WriteLine("Invoice No.: " + invoiceNo);
            Console.WriteLine("Invoice Date: " + invoiceDate);
            Console.WriteLine("TOTAL: " + total);
            Console.WriteLine("Table Data: ");
            Console.WriteLine(tableData);

            Console.WriteLine("Press any key...");
            Console.ReadKey();
        }

        /// <summary>
        /// Extracts the trimmed text located to the right of a label, from the label's
        /// right edge to the right edge of the page, at the label's vertical position.
        /// </summary>
        /// <param name="textExtractor">Extractor with the document already loaded.</param>
        /// <param name="pageIndex">Index of the page to read from.</param>
        /// <param name="labelBounds">Bounds of the label found on the page.</param>
        /// <param name="pageRectangle">Rectangle of the whole page.</param>
        /// <returns>The text found in the area to the right of the label, trimmed.</returns>
        private static string ExtractTextRightOf(TextExtractor textExtractor, int pageIndex, RectangleF labelBounds, RectangleF pageRectangle)
        {
            // Shift the rectangle to the right of the label and stretch it to the page edge
            RectangleF area = labelBounds;
            area.X     = labelBounds.Right;
            area.Width = pageRectangle.Right - area.Left;

            // Set the extraction region and extract the text
            textExtractor.SetExtractionArea(area);
            return textExtractor.GetTextFromPage(pageIndex).Trim();
        }

C# (CSharp) TextExtractor.Find示例