Ejemplo n.º 1
0
        //public string ReadFromPositionSpire()
        //{
        //    PdfPageBase page = Document.Pages[0];
        //    string text = page.ExtractText(new RectangleF(50, 50, 500, 100));
        //    StringBuilder sb = new StringBuilder();
        //    sb.AppendLine(text);
        //    return sb.ToString();
        //    return string.Empty;
        //}

        /// <summary>
        /// Extracts text from a fixed rectangle on the first page of a PDF file.
        /// </summary>
        /// <param name="path">Path of the PDF document to load.</param>
        /// <returns>
        /// The text the extractor returns for page index 0, restricted to the
        /// rectangle at (0, 0) with a width and height of 200.
        /// </returns>
        /// <exception cref="System.ArgumentException">
        /// <paramref name="path"/> is null or empty.
        /// </exception>
        public string BytescoutPDFExtractor(string path)
        {
            // The caller's path is honoured; it is no longer replaced by a
            // hard-coded, machine-specific file name.
            if (string.IsNullOrEmpty(path))
            {
                throw new System.ArgumentException("A PDF file path must be supplied.", "path");
            }

            // Dispose the extractor even when loading or extraction throws.
            using (TextExtractor extractor = new TextExtractor("demo", "demo"))
            {
                // Load the document
                extractor.LoadDocumentFromFile(path);

                // Restrict extraction to the rectangle of interest
                RectangleF location = new RectangleF(0, 0, 200, 200);
                extractor.SetExtractionArea(location);

                // Extract text bounded by the extraction area
                return extractor.GetTextFromPage(0);
            }
        }
        /// <summary>
        /// Searches every page of "sample2.pdf" for the keyword "References" and,
        /// for each page that contains it, prints the text found in the top part
        /// of that page.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // Create the extractor inside a using block so it is released even
            // if loading or searching the document throws.
            using (TextExtractor extractor = new TextExtractor())
            {
                extractor.RegistrationName = "demo";
                extractor.RegistrationKey  = "demo";

                // Load sample PDF document
                extractor.LoadDocumentFromFile(@".\sample2.pdf");

                int pageCount = extractor.GetPageCount();

                // Search each page for the keyword (third argument: not case sensitive)
                for (int i = 0; i < pageCount; i++)
                {
                    if (extractor.Find(i, "References", false))
                    {
                        // If the page contains the keyword, extract text from it.
                        // For demonstration only the top part of the page is extracted.
                        extractor.SetExtractionArea(0, 0, 600, 200);
                        string text = extractor.GetTextFromPage(i);
                        Console.WriteLine(text);
                    }
                }
            }

            Console.WriteLine();
            Console.WriteLine("Press any key to continue...");
            Console.ReadLine();
        }
        /// <summary>
        /// Limits the extractor to a rectangular region and reads one page's text from it.
        /// </summary>
        /// <param name="textExtractor">Extractor with a document already loaded.</param>
        /// <param name="extractionRegion">Rectangle to extract from.</param>
        /// <param name="pageIndex">Index of the page to read; defaults to 0.</param>
        /// <returns>The text returned by the extractor for that page and region.</returns>
        /// <remarks>
        /// The extraction area is left set on <paramref name="textExtractor"/> after the call.
        /// </remarks>
        private static string GetTextFromRegion(TextExtractor textExtractor, RectangleF extractionRegion, int pageIndex = 0)
        {
            textExtractor.SetExtractionArea(extractionRegion);

            string regionText = textExtractor.GetTextFromPage(pageIndex);
            return regionText;
        }
Ejemplo n.º 4
0
        /// <summary>
        /// Removes empty pages from a PDF: finds the pages that contain text,
        /// splits those pages out into a temporary folder, merges them into the
        /// output file and opens the result.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // List to keep non-empty page numbers (1-based, as strings)
            List <string> nonEmptyPages = new List <string>();

            // The using block releases the extractor even if reading a page throws.
            using (TextExtractor extractor = new TextExtractor("demo", "demo"))
            {
                // Load PDF document
                extractor.LoadDocumentFromFile(InputFile);

                // Query the page count once instead of on every iteration
                int pageCount = extractor.GetPageCount();

                for (int pageIndex = 0; pageIndex < pageCount; pageIndex++)
                {
                    // Extract page text
                    string pageText = extractor.GetTextFromPage(pageIndex);

                    // If extracted text is not empty keep the page number
                    if (!string.IsNullOrEmpty(pageText))
                    {
                        nonEmptyPages.Add((pageIndex + 1).ToString());
                    }
                }
            }

            // Form comma-separated list of page numbers to split ("1,3,5")
            string ranges = string.Join(",", nonEmptyPages);

            try
            {
                string[] parts;

                // Split document by non-empty pages into the temp folder
                using (DocumentSplitter splitter = new DocumentSplitter("demo", "demo"))
                {
                    splitter.OptimizeSplittedDocuments = true;
                    parts = splitter.Split(InputFile, ranges, TempFolder);
                }

                // Merge the parts into the output file
                using (DocumentMerger merger = new DocumentMerger("demo", "demo"))
                {
                    merger.Merge(parts, OutputFile);
                }
            }
            finally
            {
                // Delete the temp folder whether or not split/merge succeeded.
                // The existence check keeps a failed split from being masked
                // by a DirectoryNotFoundException raised here.
                if (Directory.Exists(TempFolder))
                {
                    Directory.Delete(TempFolder, true);
                }
            }

            // Open the result file in default PDF viewer (for demo purposes)
            Process.Start(OutputFile);
        }
Ejemplo n.º 5
0
        /// <summary>
        /// Lists every PDF file in the directory four levels above the working
        /// directory and prints its document information followed by the first
        /// two text lines of its first page.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // Both extractors are wrapped in using blocks so they are released
            // even when one of the files fails to load or parse.
            using (InfoExtractor infoExtractor = new InfoExtractor())
            using (TextExtractor textExtractor = new TextExtractor())
            {
                infoExtractor.RegistrationName = "demo";
                infoExtractor.RegistrationKey  = "demo";

                textExtractor.RegistrationName = "demo";
                textExtractor.RegistrationKey  = "demo";

                // List all PDF files in directory
                foreach (string file in Directory.GetFiles(@"..\..\..\..", "*.pdf"))
                {
                    infoExtractor.LoadDocumentFromFile(file);

                    Console.WriteLine("File Name:      " + Path.GetFileName(file));
                    Console.WriteLine("Page Count:     " + infoExtractor.GetPageCount());
                    Console.WriteLine("Author:         " + infoExtractor.Author);
                    Console.WriteLine("Title:          " + infoExtractor.Title);
                    Console.WriteLine("Producer:       " + infoExtractor.Producer);
                    Console.WriteLine("Subject:        " + infoExtractor.Subject);
                    Console.WriteLine("CreationDate:   " + infoExtractor.CreationDate);
                    Console.WriteLine("Text (first 2 lines): ");

                    // Load a couple of lines from each document
                    textExtractor.LoadDocumentFromFile(file);
                    using (StringReader stringReader = new StringReader(textExtractor.GetTextFromPage(0)))
                    {
                        // ReadLine returns null past the end of the text;
                        // Console.WriteLine then just prints an empty line.
                        Console.WriteLine(stringReader.ReadLine());
                        Console.WriteLine(stringReader.ReadLine());
                    }
                    Console.WriteLine();
                }
            }

            Console.WriteLine();
            Console.WriteLine("Press any key to continue...");
            Console.ReadLine();
        }
Ejemplo n.º 6
0
        /// <summary>
        /// Prints, for each page of "sample2.pdf", the text found inside a fixed
        /// rectangle at (0, 0) with a width and height of 200.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            TextExtractor pdfText = new TextExtractor("demo", "demo");
            pdfText.LoadDocumentFromFile(@".\sample2.pdf");

            int totalPages = pdfText.GetPageCount();

            // The same rectangle is applied to every page
            RectangleF region = new RectangleF(0, 0, 200, 200);

            int pageIndex = 0;
            while (pageIndex < totalPages)
            {
                // Narrow extraction to the rectangle and read this page
                pdfText.SetExtractionArea(region);
                string regionText = pdfText.GetTextFromPage(pageIndex);

                Console.WriteLine("Extracted from page #" + pageIndex + ":");
                Console.WriteLine();
                Console.WriteLine(regionText);

                // Clear the extraction area again before the next page
                pdfText.ResetExtractionArea();

                Console.WriteLine();

                pageIndex++;
            }

            // Release the extractor
            pdfText.Dispose();

            Console.WriteLine("Press any key to exit...");
            Console.ReadKey();
        }
Ejemplo n.º 7
0
        /// <summary>
        /// Prints, for each page of "../../sample2.pdf", the text found inside a
        /// fixed rectangle at (0, 0) with a width and height of 200.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // The using block releases the extractor, which the previous
            // version never disposed.
            using (TextExtractor extractor = new TextExtractor("demo", "demo"))
            {
                // Load the document
                extractor.LoadDocumentFromFile("../../sample2.pdf");

                // Get page count
                int pageCount = extractor.GetPageCount();

                // Iterate through pages
                for (int i = 0; i < pageCount; i++)
                {
                    // Define rectangle location to extract from
                    RectangleF location = new RectangleF(0, 0, 200, 200);

                    // Set extraction area
                    extractor.SetExtractionArea(location);

                    // Extract text bounded by the extraction area
                    string extractedString = extractor.GetTextFromPage(i);

                    Console.WriteLine("Extracted from page #" + i + ":\r\n" + extractedString);

                    // Reset extraction area to full page (the default)
                    extractor.ResetExtractionArea();

                    Console.WriteLine("\r\n");
                }
            }

            Console.WriteLine("Press any key to exit...");
            Console.ReadKey();
        }
Ejemplo n.º 8
0
        /// <summary>
        /// Extracts the invoice number, invoice date, total and the tabular data
        /// from "Invoice.pdf" by locating keywords on each page, then prints the
        /// results to the console.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // Results
            string invoiceNo   = string.Empty;
            string invoiceDate = string.Empty;
            string total       = string.Empty;
            string tableData   = string.Empty;

            // Both extractors are released when the block exits, including on failure.
            using (TextExtractor textExtractor = new TextExtractor("demo", "demo"))
            using (XMLExtractor xmlExtractor = new XMLExtractor("demo", "demo"))
            {
                // Set exact search (default is SmartSearch that works like in Adobe Reader)
                textExtractor.WordMatchingMode = WordMatchingMode.ExactMatch;

                // Load document
                textExtractor.LoadDocumentFromFile("Invoice.pdf");
                xmlExtractor.LoadDocumentFromFile("Invoice.pdf");

                // Query the page count once instead of on every iteration
                int pageCount = textExtractor.GetPageCount();

                for (int i = 0; i < pageCount; i++)
                {
                    RectangleF pageRectangle = textExtractor.GetPageRectangle(i);

                    // Full page width; top and height are filled in from the keywords below
                    RectangleF tableRect = new RectangleF(0, 0, pageRectangle.Width, 0);

                    // Search for "Invoice No." and assume the text at right is the number
                    if (textExtractor.Find(i, "Invoice No.", false))
                    {
                        invoiceNo = ExtractTextRightOfMatch(textExtractor, pageRectangle, i);
                    }

                    // Search for "Invoice Date" and extract text at right
                    if (textExtractor.Find(i, "Invoice Date", false))
                    {
                        invoiceDate = ExtractTextRightOfMatch(textExtractor, pageRectangle, i);
                    }

                    // Search for "Quantity" keyword to detect the top of the tabular data rectangle
                    if (textExtractor.Find(i, "Quantity", false))
                    {
                        // Keep the top table coordinate
                        // (use Bounds.Bottom instead to skip the column headers)
                        tableRect.Y = textExtractor.FoundText.Bounds.Top;
                    }

                    // Search for "TOTAL" (it is also the bottom of the tabular data rectangle)
                    if (textExtractor.Find(i, "TOTAL", true /* case sensitive! */))
                    {
                        // Capture the top of the match before extracting the value next to it
                        float totalTop = textExtractor.FoundText.Bounds.Top;

                        total = ExtractTextRightOfMatch(textExtractor, pageRectangle, i);

                        // Calculate the table height
                        tableRect.Height = totalTop - tableRect.Top;
                    }

                    // Extract tabular data using XMLExtractor
                    if (tableRect.Height > 0)
                    {
                        xmlExtractor.SetExtractionArea(tableRect);
                        tableData = xmlExtractor.GetXMLFromPage(i);
                    }
                }
            }

            // Display extracted data
            Console.WriteLine("Invoice No.: " + invoiceNo);
            Console.WriteLine("Invoice Date: " + invoiceDate);
            Console.WriteLine("TOTAL: " + total);
            Console.WriteLine("Table Data: ");
            Console.WriteLine(tableData);

            Console.WriteLine("Press any key...");
            Console.ReadKey();
        }

        /// <summary>
        /// Returns the trimmed text located between the right edge of the
        /// extractor's current search match and the right edge of the page.
        /// </summary>
        /// <param name="textExtractor">Extractor whose last Find call succeeded.</param>
        /// <param name="pageRectangle">Rectangle of the page being processed.</param>
        /// <param name="pageIndex">Index of the page being processed.</param>
        /// <returns>The trimmed text extracted from that area.</returns>
        /// <remarks>The extraction area is left set on the extractor.</remarks>
        private static string ExtractTextRightOfMatch(TextExtractor textExtractor, RectangleF pageRectangle, int pageIndex)
        {
            // Get the found text rectangle (RectangleF is a value type, so this is a copy)
            RectangleF textRect = textExtractor.FoundText.Bounds;

            // Shift the rectangle to the right of the match and stretch it to the page edge
            textRect.X     = textRect.Right;
            textRect.Width = pageRectangle.Right - textRect.Left;

            // Set the extraction region and extract the text
            textExtractor.SetExtractionArea(textRect);
            return textExtractor.GetTextFromPage(pageIndex).Trim();
        }
Ejemplo n.º 9
0
        /// <summary>
        /// Reads part-list rows from the given PDF files and collects them into a dictionary.
        /// </summary>
        /// <remarks>
        /// For every file, pages from index 1 onwards are split into lines. The first line that
        /// contains "part" supplies the product title (via detectTitle); if none of the first four
        /// lines of a page does, the file is marked "notable" and abandoned. The next line that
        /// contains a header keyword selects the column layout (1: material, 2: spirit, 3: no part
        /// id). Each following line whose double-space separated column count matches the layout
        /// becomes one <c>PDFLineInfo</c> entry. Progress is pushed to Form1 after each file.
        /// </remarks>
        /// <param name="urllist">Paths of the PDF files to read.</param>
        /// <returns>The extracted rows, keyed by the running nTotalIndex counter.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="urllist"/> is null.</exception>
        /// <exception cref="FormatException">A quantity column is not a valid integer.</exception>
        public static Dictionary <int, PDFLineInfo> ScrapyDataFromPDFiles(string[] urllist)
        {
            if (urllist == null)
            {
                throw new ArgumentNullException("urllist");
            }

            Dictionary <int, PDFLineInfo> dictionary = new Dictionary <int, PDFLineInfo>();

            // Release the extractor when done, including on failure.
            using (TextExtractor extractor = new TextExtractor())
            {
                extractor.RegistrationName = "demo";
                extractor.RegistrationKey  = "demo";

                for (int nFileIndex = 0; nFileIndex < urllist.Length; nFileIndex++)
                {
                    string currentFileName  = urllist[nFileIndex];
                    string currentTitleName = "";

                    // Load each PDF Document
                    extractor.LoadDocumentFromFile(currentFileName);
                    int pageCount = extractor.GetPageCount();

                    // 1: material type, 2: spirit type, 3: empty type (no part id column), -1: not detected yet
                    int pdfDocumentType = -1;

                    // NOTE(review): page index 0 is never read. The original comment at this point
                    // was ambiguous; confirm that skipping the first page is intended.
                    for (int i = 1; i < pageCount; i++)
                    {
                        if (currentTitleName.Contains("notable"))
                        {
                            break;
                        }

                        string   wholetext = extractor.GetTextFromPage(i);
                        string[] lines     = wholetext.Split(new[] { "\r\n", "\r", "\n" }, StringSplitOptions.None);

                        for (int j = 0; j < lines.Length; j++)
                        {
                            string line      = lines[j];
                            string lowerLine = line.ToLower(); // computed once per line for the keyword checks

                            // 1. Title: the first line mentioning "part"
                            if (lowerLine.Contains("part") && !findTitle)
                            {
                                currentTitleName = detectTitle(line);
                                findTitle        = true;
                                continue;
                            }

                            if (currentTitleName.Contains("notable"))
                            {
                                break;
                            }

                            if (!findTitle)
                            {
                                // No title within the first lines: give up on this file
                                if (j > 2)
                                {
                                    currentTitleName = "notable";
                                    break;
                                }
                                continue;
                            }

                            // Columns are separated by two spaces
                            string[] array = line.Split(new string[] { "  " }, StringSplitOptions.RemoveEmptyEntries);

                            // 2. Header: the first line after the title that contains a header keyword
                            if (!findheader && findTitle)
                            {
                                if (lowerLine.Contains("dyaco") || lowerLine.Contains("material") ||
                                    lowerLine.Contains("spirit") || lowerLine.Contains("no") ||
                                    lowerLine.Contains("part") || lowerLine.Contains("qty"))
                                {
                                    findheader = true;

                                    if (lowerLine.Contains("material"))
                                    {
                                        pdfDocumentType = 1;
                                    }
                                    else if (lowerLine.Contains("spirit"))
                                    {
                                        pdfDocumentType = 2;
                                    }
                                    else if ((array.Length > 2) && array[2].ToLower().Contains("part"))
                                    {
                                        // Previously this assignment was always overwritten by the
                                        // fallback below, so this layout was never detected.
                                        pdfDocumentType = 2;
                                    }
                                    else
                                    {
                                        pdfDocumentType = 3;
                                    }

                                    continue;
                                }

                                if (array.Length < 4)
                                {
                                    continue;
                                }
                            }

                            // Skip the watermark lines inserted by the trial version of the SDK
                            if (line.Contains("(TRIAL VER. PDF Extractor SDK 8.4.1.2829.888331924)") ||
                                line.Contains("TRIAL VERSION EXPIRES 90 DAYS AFTER INSTALLATION"))
                            {
                                continue;
                            }

                            if (pdfDocumentType == -1)
                            {
                                MessageBox.Show("Can not get pdf Type");

                                // Without a known layout the columns cannot be mapped, so the
                                // line is skipped instead of being stored as a meaningless row.
                                continue;
                            }

                            // Layout 3 has three columns and a short key; the others have four columns
                            int expectedColumns = (pdfDocumentType == 3) ? 3 : 4;
                            if ((array.Length != expectedColumns) ||
                                ((pdfDocumentType == 3) && (array[0].Length > 5)))
                            {
                                continue;
                            }

                            // A fresh instance per row, so entries never share state
                            PDFLineInfo row = new PDFLineInfo();

                            switch (pdfDocumentType)
                            {
                            case 1:
                                row.PartID   = RemoveSpace(array[1]);
                                row.PartName = RemoveSpace(array[2]);
                                row.PartKey  = RemoveSpace(array[0]);
                                row.Quantity = Int32.Parse(RemoveSpace(array[3]));
                                break;

                            case 2:
                                row.PartID   = RemoveSpace(array[2]);
                                row.PartName = RemoveSpace(array[1]);
                                row.PartKey  = RemoveSpace(array[0]);
                                row.Quantity = Int32.Parse(RemoveSpace(array[3]));
                                break;

                            case 3:
                                row.PartID   = "";     // this layout has no part id column
                                row.PartName = RemoveSpace(array[1]);
                                row.PartKey  = RemoveSpace(array[0]);
                                row.Quantity = Int32.Parse(RemoveSpace(array[2]));
                                break;
                            }

                            row.ProductName = currentTitleName;

                            // 3. Store the row under the running index
                            dictionary.Add(nTotalIndex, row);
                            nTotalIndex++;
                        }
                    }

                    // Reset the per-file detection state before the next file
                    findheader       = false;
                    findTitle        = false;
                    currentTitleName = "";

                    int currentpercent = (int)(20 * nFileIndex / urllist.Length);

                    Console.WriteLine("*******" + nFileIndex + "*********" + currentpercent + "********");

                    // Update the progress UI. Both controls are updated through BeginInvoke;
                    // previously the label was written directly while the bar was marshalled.
                    Form1.progressvalue = currentpercent;
                    Form1.progressBar1.BeginInvoke(new Action(() =>
                    {
                        Form1.progressBar1.Value = currentpercent;
                        Form1.percentlabel.Text  = currentpercent.ToString() + "%";
                    }));
                }
            }

            return(dictionary);
        }