//public string ReadFromPositionSpire()
//{
//    PdfPageBase page = Document.Pages[0];
//    string text = page.ExtractText(new RectangleF(50, 50, 500, 100));
//    StringBuilder sb = new StringBuilder();
//    sb.AppendLine(text);
//    return sb.ToString();
//    return string.Empty;
//}

/// <summary>
/// Extracts the text inside the rectangle (0, 0, 200, 200) of the first page
/// (page index 0) of a PDF document.
/// </summary>
/// <param name="path">Path of the PDF file to load.</param>
/// <returns>The text the extractor returns for that area of page index 0.</returns>
public string BytescoutPDFExtractor(string path)
{
    // The caller-supplied path is used as-is; it used to be overwritten by a
    // hard-coded file on a developer machine, which made the parameter useless.
    // The using block releases the extractor even when loading or extraction throws.
    using (TextExtractor extractor = new TextExtractor("demo", "demo"))
    {
        // Load the document
        extractor.LoadDocumentFromFile(path);

        // Limit extraction to the rectangle of interest
        RectangleF location = new RectangleF(0, 0, 200, 200);
        extractor.SetExtractionArea(location);

        // Extract the text bounded by the extraction area
        return extractor.GetTextFromPage(0);
    }
}
/// <summary>
/// Searches every page of sample2.pdf for the keyword "References" and prints
/// the text of the area (0, 0, 600, 200) of each page that contains it.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    // Create the Bytescout.PDFExtractor.TextExtractor instance inside a using
    // block so it is disposed even if loading or extraction throws.
    using (TextExtractor extractor = new TextExtractor())
    {
        extractor.RegistrationName = "demo";
        extractor.RegistrationKey = "demo";

        // Load sample PDF document
        extractor.LoadDocumentFromFile(@".\sample2.pdf");

        int pageCount = extractor.GetPageCount();

        // Search each page for the keyword
        for (int i = 0; i < pageCount; i++)
        {
            if (extractor.Find(i, "References", false))
            {
                // If the page contains the keyword, extract text from it.
                // For demonstration only the top part of the page is extracted.
                extractor.SetExtractionArea(0, 0, 600, 200);
                string text = extractor.GetTextFromPage(i);
                Console.WriteLine(text);
            }
        }
    }

    Console.WriteLine();
    Console.WriteLine("Press any key to continue...");
    Console.ReadLine();
}
/// <summary>
/// Reads the text that lies inside a rectangular region of a page.
/// </summary>
/// <param name="textExtractor">Extractor with the document already loaded.</param>
/// <param name="extractionRegion">Rectangle to read from.</param>
/// <param name="pageIndex">Index of the page to read; defaults to 0.</param>
/// <returns>The text the extractor returns for the region.</returns>
private static string GetTextFromRegion(TextExtractor textExtractor, RectangleF extractionRegion, int pageIndex = 0)
{
    // Narrow the extractor to the requested rectangle first; the area stays
    // set on the extractor after this method returns.
    textExtractor.SetExtractionArea(extractionRegion);

    // Then read whatever the page holds inside that rectangle.
    string regionText = textExtractor.GetTextFromPage(pageIndex);
    return regionText;
}
/// <summary>
/// Builds a copy of InputFile that contains only the pages with extractable
/// text: the document is split by non-empty page numbers into TempFolder, the
/// parts are merged into OutputFile, and the result is opened.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    // 1-based numbers of the pages that contain text
    List<string> nonEmptyPages = new List<string>();

    // Create and setup Bytescout.PDFExtractor.TextExtractor instance;
    // using guarantees disposal even when extraction throws.
    using (TextExtractor extractor = new TextExtractor("demo", "demo"))
    {
        // Load PDF document
        extractor.LoadDocumentFromFile(InputFile);

        // The page count does not change while iterating, so read it once
        int pageCount = extractor.GetPageCount();

        for (int pageIndex = 0; pageIndex < pageCount; pageIndex++)
        {
            // Extract page text
            string pageText = extractor.GetTextFromPage(pageIndex);

            // If extracted text is not empty keep the page number
            if (!string.IsNullOrEmpty(pageText))
            {
                nonEmptyPages.Add((pageIndex + 1).ToString());
            }
        }
    }

    // Form comma-separated list of page numbers to split ("1,3,5")
    // NOTE(review): when no page has text this is an empty string - confirm
    // how DocumentSplitter.Split handles an empty range list.
    string ranges = string.Join(",", nonEmptyPages);

    try
    {
        string[] parts;

        // Split document by non-empty pages in temp folder
        using (DocumentSplitter splitter = new DocumentSplitter("demo", "demo"))
        {
            splitter.OptimizeSplittedDocuments = true;
            parts = splitter.Split(InputFile, ranges, TempFolder);
        }

        // Merge parts
        using (DocumentMerger merger = new DocumentMerger("demo", "demo"))
        {
            merger.Merge(parts, OutputFile);
        }
    }
    finally
    {
        // Delete temp folder even when split or merge failed; skip the call
        // when the folder was never created so the original error is not masked.
        if (Directory.Exists(TempFolder))
        {
            Directory.Delete(TempFolder, true);
        }
    }

    // Open the result file in default PDF viewer (for demo purposes)
    Process.Start(OutputFile);
}
/// <summary>
/// Lists every PDF file in the directory four levels above the working
/// directory and prints its metadata plus the first two text lines of its
/// first page.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    // Both extractors live in using blocks so they are disposed even when
    // one of the files fails to load or to extract.
    using (InfoExtractor infoExtractor = new InfoExtractor())
    using (TextExtractor textExtractor = new TextExtractor())
    {
        infoExtractor.RegistrationName = "demo";
        infoExtractor.RegistrationKey = "demo";

        textExtractor.RegistrationName = "demo";
        textExtractor.RegistrationKey = "demo";

        // List all PDF files in directory
        foreach (string file in Directory.GetFiles(@"..\..\..\..", "*.pdf"))
        {
            infoExtractor.LoadDocumentFromFile(file);

            Console.WriteLine("File Name: " + Path.GetFileName(file));
            Console.WriteLine("Page Count: " + infoExtractor.GetPageCount());
            Console.WriteLine("Author: " + infoExtractor.Author);
            Console.WriteLine("Title: " + infoExtractor.Title);
            Console.WriteLine("Producer: " + infoExtractor.Producer);
            Console.WriteLine("Subject: " + infoExtractor.Subject);
            Console.WriteLine("CreationDate: " + infoExtractor.CreationDate);
            Console.WriteLine("Text (first 2 lines): ");

            // Load a couple of lines from each document
            textExtractor.LoadDocumentFromFile(file);
            using (StringReader stringReader = new StringReader(textExtractor.GetTextFromPage(0)))
            {
                // ReadLine returns null past the end of the text; WriteLine
                // then just prints an empty line.
                Console.WriteLine(stringReader.ReadLine());
                Console.WriteLine(stringReader.ReadLine());
            }

            Console.WriteLine();
        }
    }

    Console.WriteLine();
    Console.WriteLine("Press any key to continue...");
    Console.ReadLine();
}
/// <summary>
/// Prints, for every page of sample2.pdf, the text found inside the
/// rectangle (0, 0, 200, 200).
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    TextExtractor pdfText = new TextExtractor("demo", "demo");

    // Open the sample document and find out how many pages it has
    pdfText.LoadDocumentFromFile(@".\sample2.pdf");
    int totalPages = pdfText.GetPageCount();

    int pageNo = 0;
    while (pageNo < totalPages)
    {
        // Restrict extraction to the rectangle of interest
        pdfText.SetExtractionArea(new RectangleF(0, 0, 200, 200));

        // Read the text inside that rectangle
        string regionText = pdfText.GetTextFromPage(pageNo);

        Console.WriteLine("Extracted from page #{0}:", pageNo);
        Console.WriteLine();
        Console.WriteLine(regionText);

        // Back to the default extraction area before the next page
        pdfText.ResetExtractionArea();

        Console.WriteLine();
        pageNo++;
    }

    // Release the extractor
    pdfText.Dispose();

    Console.WriteLine("Press any key to exit...");
    Console.ReadKey();
}
/// <summary>
/// Prints, for every page of ../../sample2.pdf, the text bounded by the
/// rectangle (0, 0, 200, 200).
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    // using guarantees the extractor is disposed, including when loading or
    // extraction throws.
    using (TextExtractor extractor = new TextExtractor("demo", "demo"))
    {
        // load the document
        extractor.LoadDocumentFromFile("../../sample2.pdf");

        // get page count
        int pageCount = extractor.GetPageCount();

        // iterate through pages
        for (int i = 0; i < pageCount; i++)
        {
            // define rectangle location to extract from
            RectangleF location = new RectangleF(0, 0, 200, 200);

            // set extraction area
            extractor.SetExtractionArea(location);

            // extract text bounded by the extraction area
            string extractedString = extractor.GetTextFromPage(i);

            Console.WriteLine("Extracted from page #" + i + ":\r\n" + extractedString);

            // reset extraction area to full page (by default)
            extractor.ResetExtractionArea();

            Console.WriteLine("\r\n");
        }
    }

    Console.WriteLine("Press any key to exit...");
    Console.ReadKey();
}
/// <summary>
/// Reads Invoice.pdf and prints the invoice number, invoice date, total and
/// the tabular data found between the "Quantity" and "TOTAL" keywords.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    // Results
    string invoiceNo = string.Empty;
    string invoiceDate = string.Empty;
    string total = string.Empty;
    string tableData = string.Empty;

    // Both extractors are disposed when the block exits, even on an exception.
    using (TextExtractor textExtractor = new TextExtractor("demo", "demo"))
    using (XMLExtractor xmlExtractor = new XMLExtractor("demo", "demo"))
    {
        // Set exact search (default is SmartSearch that works like in Adobe Reader)
        textExtractor.WordMatchingMode = WordMatchingMode.ExactMatch;

        // Load document
        textExtractor.LoadDocumentFromFile("Invoice.pdf");
        xmlExtractor.LoadDocumentFromFile("Invoice.pdf");

        // Iterate pages
        for (int i = 0; i < textExtractor.GetPageCount(); i++)
        {
            RectangleF pageRectangle = textExtractor.GetPageRectangle(i);

            // Full page width; Y and Height are filled in from the keywords below
            RectangleF tableRect = new RectangleF(0, 0, pageRectangle.Width, 0);

            string foundText;
            float keywordTop;

            // "Invoice No." - assume the text at right is the invoice number
            if (TryExtractTextRightOf(textExtractor, i, pageRectangle, "Invoice No.", false, out foundText, out keywordTop))
            {
                invoiceNo = foundText;
            }

            // "Invoice Date" - the text at right is the date
            if (TryExtractTextRightOf(textExtractor, i, pageRectangle, "Invoice Date", false, out foundText, out keywordTop))
            {
                invoiceDate = foundText;
            }

            // Search for "Quantity" keyword to detect the top of the tabular data rectangle
            if (textExtractor.Find(i, "Quantity", false))
            {
                // Keep the top table coordinate
                // (use the Bottom of the found bounds if you want to skip column headers)
                tableRect.Y = textExtractor.FoundText.Bounds.Top;
            }

            // Search for "TOTAL", case sensitive (it is also the bottom of the tabular data rectangle)
            if (TryExtractTextRightOf(textExtractor, i, pageRectangle, "TOTAL", true, out foundText, out keywordTop))
            {
                total = foundText;

                // Calculate the table height
                tableRect.Height = keywordTop - tableRect.Top;
            }

            // Extract tabular data using XMLExtractor
            if (tableRect.Height > 0)
            {
                xmlExtractor.SetExtractionArea(tableRect);
                tableData = xmlExtractor.GetXMLFromPage(i);
            }
        }
    }

    // Display extracted data
    Console.WriteLine("Invoice No.: " + invoiceNo);
    Console.WriteLine("Invoice Date: " + invoiceDate);
    Console.WriteLine("TOTAL: " + total);
    Console.WriteLine("Table Data: ");
    Console.WriteLine(tableData);

    Console.WriteLine("Press any key...");
    Console.ReadKey();
}

/// <summary>
/// Searches a page for a keyword and, when it is found, extracts the trimmed
/// text between the keyword's right edge and the right edge of the page.
/// </summary>
/// <param name="textExtractor">Extractor with the document loaded; its extraction area is changed on success.</param>
/// <param name="pageIndex">Index of the page to search.</param>
/// <param name="pageRectangle">Rectangle of that page, used for its right edge.</param>
/// <param name="keyword">Text to search for.</param>
/// <param name="caseSensitive">Passed through to the extractor's Find call.</param>
/// <param name="text">Receives the extracted text, or an empty string when the keyword is not found.</param>
/// <param name="keywordTop">Receives the Top coordinate of the found keyword, or 0 when it is not found.</param>
/// <returns>true when the keyword was found on the page; otherwise false.</returns>
private static bool TryExtractTextRightOf(TextExtractor textExtractor, int pageIndex, RectangleF pageRectangle, string keyword, bool caseSensitive, out string text, out float keywordTop)
{
    text = string.Empty;
    keywordTop = 0;

    if (!textExtractor.Find(pageIndex, keyword, caseSensitive))
    {
        return false;
    }

    // Get the found text rectangle
    RectangleF textRect = textExtractor.FoundText.Bounds;

    // Shift the rectangle to the right of the keyword and stretch it to the page edge.
    // Left already equals the shifted X when the width is computed.
    textRect.X = textRect.Right;
    textRect.Width = pageRectangle.Right - textRect.Left;

    // Set the extraction region and extract the text
    textExtractor.SetExtractionArea(textRect);
    text = textExtractor.GetTextFromPage(pageIndex).Trim();

    // Only X and Width were changed, so Top is still the keyword's top
    keywordTop = textRect.Top;
    return true;
}
/// <summary>
/// Loads each PDF in <paramref name="urllist"/>, parses the part-list rows
/// found on its pages and collects them in a dictionary. After every file the
/// progress value, progress bar and percent label of Form1 are updated.
/// </summary>
/// <param name="urllist">Paths of the PDF files to load, processed in order.</param>
/// <returns>The parsed rows, keyed by the running nTotalIndex counter.</returns>
public static Dictionary<int, PDFLineInfo> ScrapyDataFromPDFiles(string[] urllist)
{
    Dictionary<int, PDFLineInfo> dictionary = new Dictionary<int, PDFLineInfo>();

    // Create Bytescout.PDFExtractor.TextExtractor instance; using disposes it
    // on every exit path, including exceptions thrown while parsing.
    using (TextExtractor extractor = new TextExtractor())
    {
        extractor.RegistrationName = "demo";
        extractor.RegistrationKey = "demo";

        for (int nFileIndex = 0; nFileIndex < urllist.Length; nFileIndex++)
        {
            string currentFileName = urllist[nFileIndex];
            string currentTitleName = "";

            // Load each PDF Document
            extractor.LoadDocumentFromFile(currentFileName);
            int pageCount = extractor.GetPageCount();

            // 1: material type, 2: spirit type, 3: empty type (no part id), -1: not detected yet
            int pdfDocumentType = -1;

            // NOTE(review): the loop starts at page index 1, so page index 0 is
            // never read - confirm this is intended.
            for (int i = 1; i < pageCount; i++)
            {
                // "notable" marks a document without a usable table; stop reading it
                if (currentTitleName.Contains("notable"))
                {
                    break;
                }

                string wholetext = extractor.GetTextFromPage(i);
                string[] lines = wholetext.Split(new[] { "\r\n", "\r", "\n" }, StringSplitOptions.None);

                for (int j = 0; j < lines.Length; j++)
                {
                    string line = lines[j];
                    string lowered = line.ToLower();

                    // 1. Title: the first line containing "part" before a title was found
                    if (lowered.Contains("part") && !findTitle)
                    {
                        currentTitleName = detectTitle(line);
                        findTitle = true;
                        continue;
                    }

                    if (currentTitleName.Contains("notable"))
                    {
                        break;
                    }

                    if (!findTitle)
                    {
                        // No title within the first lines of the page: give up on this document
                        if (j > 2)
                        {
                            currentTitleName = "notable";
                            break;
                        }
                        continue;
                    }

                    string[] array = line.Split(new string[] { " " }, StringSplitOptions.RemoveEmptyEntries);

                    // 2. Header: the first line after the title that contains a header keyword
                    if (!findheader && findTitle)
                    {
                        if (lowered.Contains("dyaco") || lowered.Contains("material") || lowered.Contains("spirit")
                            || lowered.Contains("no") || lowered.Contains("part") || lowered.Contains("qty"))
                        {
                            findheader = true;

                            if (lowered.Contains("material"))
                            {
                                pdfDocumentType = 1;
                            }
                            else if (lowered.Contains("spirit"))
                            {
                                pdfDocumentType = 2;
                            }
                            else
                            {
                                // NOTE(review): the original assigned type 2 here when the
                                // third header column contained "part", but then always
                                // overwrote it with 3. The effective result (3) is kept;
                                // confirm whether type 2 was intended for that case.
                                pdfDocumentType = 3;
                            }
                            continue;
                        }

                        if (array.Length < 4)
                        {
                            continue;
                        }
                    }

                    // Skip the watermark lines added by the trial version of the SDK
                    if (line.Contains("(TRIAL VER. PDF Extractor SDK 8.4.1.2829.888331924)")
                        || line.Contains("TRIAL VERSION EXPIRES 90 DAYS AFTER INSTALLATION"))
                    {
                        continue;
                    }

                    if (pdfDocumentType == -1)
                    {
                        // Without a known layout the columns cannot be mapped, so the line
                        // is skipped instead of being stored with left-over values.
                        MessageBox.Show("Can not get pdf Type");
                        continue;
                    }

                    // Rows must have exactly the column count of the detected layout
                    if (pdfDocumentType == 3)
                    {
                        if ((array.Length != 3) || (array[0].Length > 5))
                        {
                            continue;
                        }
                    }
                    else if (array.Length != 4)
                    {
                        continue;
                    }

                    // 3. Row: a fresh instance per row, so dictionary entries do not
                    // alias one shared object.
                    PDFLineInfo row = new PDFLineInfo();
                    switch (pdfDocumentType)
                    {
                        case 1:
                            row.PartID = RemoveSpace(array[1]);
                            row.PartName = RemoveSpace(array[2]);
                            row.PartKey = RemoveSpace(array[0]);
                            row.Quantity = Int32.Parse(RemoveSpace(array[3]));
                            break;

                        case 2:
                            row.PartID = RemoveSpace(array[2]);
                            row.PartName = RemoveSpace(array[1]);
                            row.PartKey = RemoveSpace(array[0]);
                            row.Quantity = Int32.Parse(RemoveSpace(array[3]));
                            break;

                        case 3:
                            row.PartID = ""; // this layout has no part id column
                            row.PartName = RemoveSpace(array[1]);
                            row.PartKey = RemoveSpace(array[0]);
                            row.Quantity = Int32.Parse(RemoveSpace(array[2]));
                            break;
                    }
                    row.ProductName = currentTitleName;

                    // 4. Add values to the result
                    dictionary.Add(nTotalIndex, row);
                    nTotalIndex++;
                }
            }

            // Reset the per-document parser state before the next file
            findheader = false;
            findTitle = false;

            // NOTE(review): progress is scaled to 0-20 rather than 0-100;
            // presumably this step is only a share of the overall work - confirm.
            int currentpercent = (int)(20 * nFileIndex / urllist.Length);
            Console.WriteLine("*******" + nFileIndex + "*********" + currentpercent + "********");

            // Update the progress display; both controls are updated through
            // BeginInvoke so neither is touched directly from this thread.
            Form1.progressvalue = currentpercent;
            Form1.progressBar1.BeginInvoke(new Action(() =>
            {
                Form1.progressBar1.Value = currentpercent;
                Form1.percentlabel.Text = currentpercent.ToString() + "%";
            }));
        }
    }

    return dictionary;
}