/// <summary>
/// Searches every page of "words-with-hyphens.pdf" for the string "hyphen"
/// and prints the page index and bounds of each occurrence.
/// </summary>
static void Main(string[] args)
{
    // Set up the Bytescout.PDFExtractor.TextExtractor with the demo registration
    TextExtractor pdf = new TextExtractor
    {
        RegistrationName = "demo",
        RegistrationKey = "demo"
    };

    // Open the sample document
    pdf.LoadDocumentFromFile("words-with-hyphens.pdf");

    int totalPages = pdf.GetPageCount();
    int page = 0;
    while (page < totalPages)
    {
        // Find() positions on the first occurrence on the page, FindNext() on each following one
        bool hit = pdf.Find(page, "hyphen", false);
        while (hit)
        {
            Console.WriteLine("Found on page " + page + " at location " + pdf.FoundText.Bounds.ToString());
            hit = pdf.FindNext();
        }

        page++;
    }

    // Release the extractor
    pdf.Dispose();

    Console.WriteLine();
    Console.WriteLine("Press any key to continue...");
    Console.ReadLine();
}
/// <summary>
/// Saves the text of every page of ".\sample2.pdf" to "page{index}.txt"
/// and then opens ".\page1.txt" in its associated application.
/// </summary>
static void Main(string[] args)
{
    // The using block releases the extractor even when loading or saving throws.
    using (TextExtractor extractor = new TextExtractor())
    {
        extractor.RegistrationName = "demo";
        extractor.RegistrationKey = "demo";

        // Load sample PDF document
        extractor.LoadDocumentFromFile(@".\sample2.pdf");

        // Get page count
        int pageCount = extractor.GetPageCount();

        for (int i = 0; i < pageCount; i++)
        {
            string fileName = "page" + i + ".txt";

            // Save extracted page text to file
            extractor.SavePageTextToFile(i, fileName);
        }
    }

    // Open an output file in the default associated application.
    // NOTE(review): page indexes start at 0, so the first file written is page0.txt and
    // page1.txt only exists when the document has at least two pages - confirm which is meant.
    System.Diagnostics.Process.Start(@".\page1.txt");
}
/// <summary>
/// Writes the text of each page of ".\sample2.pdf" to "page{index}.txt",
/// then shell-opens ".\page1.txt".
/// </summary>
static void Main(string[] args)
{
    // Set up the Bytescout.PDFExtractor.TextExtractor with the demo registration
    TextExtractor pdf = new TextExtractor
    {
        RegistrationName = "demo",
        RegistrationKey = "demo"
    };

    // Open the sample document
    pdf.LoadDocumentFromFile(@".\sample2.pdf");

    // One text file per page, named after the zero-based page index
    int totalPages = pdf.GetPageCount();
    int page = 0;
    while (page < totalPages)
    {
        pdf.SavePageTextToFile(page, "page" + page + ".txt");
        page++;
    }

    // Release the extractor before handing the output to another application
    pdf.Dispose();

    // Let the shell pick the application associated with .txt files
    Process.Start(new ProcessStartInfo(@".\page1.txt") { UseShellExecute = true });
}
/// <summary>
/// Prints the text from the top part of every page of ".\sample2.pdf"
/// that contains the keyword "References".
/// </summary>
static void Main(string[] args)
{
    // The using block releases the extractor; the original never disposed it.
    using (TextExtractor extractor = new TextExtractor())
    {
        extractor.RegistrationName = "demo";
        extractor.RegistrationKey = "demo";

        // Load sample PDF document
        extractor.LoadDocumentFromFile(@".\sample2.pdf");

        int pageCount = extractor.GetPageCount();

        // Search each page for some keyword
        for (int i = 0; i < pageCount; i++)
        {
            if (extractor.Find(i, "References", false))
            {
                // If page contains the keyword, extract a text from it.
                // For demonstration we'll extract the text from top part of the page only
                extractor.SetExtractionArea(0, 0, 600, 200);
                string text = extractor.GetTextFromPage(i);
                Console.WriteLine(text);

                // Go back to the full page so the keyword search on the following pages
                // is not limited to the area that was set for this extraction.
                extractor.ResetExtractionArea();
            }
        }
    }

    Console.WriteLine();
    Console.WriteLine("Press any key to continue...");
    Console.ReadLine();
}
/// <summary>
/// Streams the text of each page of ".\sample1.pdf" into ".\page{index}.txt",
/// then shell-opens ".\page1.txt".
/// </summary>
static void Main(string[] args)
{
    // Set up the Bytescout.PDFExtractor.TextExtractor with the demo registration
    TextExtractor pdf = new TextExtractor
    {
        RegistrationName = "demo",
        RegistrationKey = "demo"
    };

    // Open the sample document
    pdf.LoadDocumentFromFile(@".\sample1.pdf");

    int totalPages = pdf.GetPageCount();
    int page = 0;
    while (page < totalPages)
    {
        // Any System.IO.Stream works here; a file stream is used for the demo
        string target = @".\page" + page + ".txt";
        FileStream output = new FileStream(target, FileMode.Create);

        // Write the page text into the stream, then close it
        pdf.SavePageTextToStream(page, output);
        output.Dispose();

        page++;
    }

    // Release the extractor
    pdf.Dispose();

    // Let the shell pick the application associated with .txt files
    Process.Start(new ProcessStartInfo(@".\page1.txt") { UseShellExecute = true });
}
/// <summary>
/// Streams the text of each page of "sample1.pdf" into "page{index}.txt"
/// and then opens "page1.txt" in its associated application.
/// </summary>
static void Main(string[] args)
{
    // The using block releases the extractor; the original never disposed it.
    using (TextExtractor extractor = new TextExtractor())
    {
        extractor.RegistrationName = "demo";
        extractor.RegistrationKey = "demo";

        // Load sample PDF document
        extractor.LoadDocumentFromFile("sample1.pdf");

        // Get page count
        int pageCount = extractor.GetPageCount();

        for (int i = 0; i < pageCount; i++)
        {
            // The using block closes the file even if saving the page text throws
            using (FileStream fStream = new FileStream("page" + i.ToString() + ".txt", FileMode.Create))
            {
                // save text from page #i to the file stream
                extractor.SavePageTextToStream(i, fStream);
            }
        }
    }

    // Open an output file in the default associated application.
    // NOTE(review): page indexes start at 0, so the first file written is page0.txt and
    // page1.txt only exists when the document has at least two pages - confirm which is meant.
    System.Diagnostics.Process.Start("page1.txt");
}
/// <summary>
/// Builds a copy of InputFile without its empty pages: collects the 1-based numbers
/// of pages that yield text, splits those pages into TempFolder, merges the parts
/// into OutputFile, removes TempFolder and opens the result.
/// </summary>
static void Main(string[] args)
{
    // Text extractor used only to detect which pages contain text
    TextExtractor reader = new TextExtractor("demo", "demo");
    reader.LoadDocumentFromFile(InputFile);

    // 1-based numbers of the pages whose extracted text is not empty
    List<string> pagesWithText = new List<string>();

    int index = 0;
    while (index < reader.GetPageCount())
    {
        string content = reader.GetTextFromPage(index);
        if (content.Length > 0)
        {
            pagesWithText.Add((index + 1).ToString());
        }

        index++;
    }

    reader.Dispose();

    // Comma-separated page list for the splitter, e.g. "1,3,5"
    string ranges = string.Join(",", pagesWithText);

    // Split the selected pages into the temp folder
    DocumentSplitter splitter = new DocumentSplitter("demo", "demo");
    splitter.OptimizeSplittedDocuments = true;
    string[] parts = splitter.Split(InputFile, ranges, TempFolder);
    splitter.Dispose();

    // Join the parts into the output document
    DocumentMerger merger = new DocumentMerger("demo", "demo");
    merger.Merge(parts, OutputFile);
    merger.Dispose();

    // Remove the intermediate files
    Directory.Delete(TempFolder, true);

    // Show the result in the default PDF viewer (for demo purposes)
    Process.Start(OutputFile);
}
/// <summary>
/// Sorts the PDF files in "InputFiles" into category folders: every file whose pages match
/// a search setting's regular expression is copied to
/// "{MainFolderName}/{category}/{file name}". Settings are read from "settings.json".
/// Any exception is reported on the console.
/// </summary>
static void Main(string[] args)
{
    try
    {
        // Get all settings VM
        var allSettings = GetSettingsVM("settings.json");

        // The using block releases the extractor on the error path too; the original
        // called Dispose() inside the try, so any exception skipped it.
        using (TextExtractor extractor = new TextExtractor())
        {
            extractor.RegistrationName = "demo";
            extractor.RegistrationKey = "demo";

            foreach (var fileName in Directory.GetFiles("InputFiles"))
            {
                // Load sample PDF document
                extractor.LoadDocumentFromFile(fileName);

                // Enable regex search
                extractor.RegexSearch = true;

                // Get Number of pages PDF contains
                int pageCount = extractor.GetPageCount();

                for (int iPage = 0; iPage < pageCount; iPage++)
                {
                    // Loop through all search settings
                    foreach (var itmSearchSetting in allSettings.Settings)
                    {
                        if (!extractor.Find(iPage, itmSearchSetting.regex, false))
                        {
                            continue;
                        }

                        // Found: copy the file to the sub-category folder.
                        // CreateDirectory does nothing when the folder already exists,
                        // so no separate Exists check is needed.
                        string categoryFolder = $"{allSettings.MainFolderName}/{itmSearchSetting.category}";
                        Directory.CreateDirectory(categoryFolder);

                        // Overwrite so a file matching on several pages does not fail the copy
                        File.Copy(fileName, $"{categoryFolder}/{Path.GetFileName(fileName)}", true);
                    }
                }
            }
        }
    }
    catch (Exception ex)
    {
        Console.WriteLine("Error: " + ex.Message);
    }

    Console.WriteLine();
    Console.WriteLine("Press enter key to continue...");
    Console.ReadLine();
}
/// <summary>
/// Searches ".\Invoice.pdf" for dates written like 12/31/1999 using a regular expression
/// and prints the location and font details of every match.
/// </summary>
static void Main(string[] args)
{
    // Dates in format 12/31/1999.
    // Regular expressions reference: https://msdn.microsoft.com/en-us/library/az24scfc(v=vs.110).aspx
    string datePattern = "[0-9]{2}/[0-9]{2}/[0-9]{4}";

    // Set up the Bytescout.PDFExtractor.TextExtractor with the demo registration
    TextExtractor pdf = new TextExtractor
    {
        RegistrationName = "demo",
        RegistrationKey = "demo"
    };

    // Open the sample document and switch the search to regular expressions
    pdf.LoadDocumentFromFile(@".\Invoice.pdf");
    pdf.RegexSearch = true;

    int totalPages = pdf.GetPageCount();
    for (int page = 0; page < totalPages; page++)
    {
        // Find() positions on the first match on the page, FindNext() on each following one
        for (bool hit = pdf.Find(page, datePattern, false); hit; hit = pdf.FindNext())
        {
            Console.WriteLine("");
            Console.WriteLine("Found on page " + page + " at location " + pdf.FoundText.Bounds);
            Console.WriteLine("");

            // A match can consist of several elements; report each one
            foreach (ISearchResultElement piece in pdf.FoundText.Elements)
            {
                Console.WriteLine(" Text: " + piece.Text);
                Console.WriteLine(" Font is bold: " + piece.FontIsBold);
                Console.WriteLine(" Font is italic: " + piece.FontIsItalic);
                Console.WriteLine(" Font name: " + piece.FontName);
                Console.WriteLine(" Font size: " + piece.FontSize);
                Console.WriteLine(" Font color: " + piece.FontColor);
                Console.WriteLine();
            }
        }
    }

    // Release the extractor
    pdf.Dispose();

    Console.WriteLine();
    Console.WriteLine("Press any key to continue...");
    Console.ReadLine();
}
/// <summary>
/// Searches ".\sample1.pdf" for the separate word "ipsum" (exact word matching)
/// and prints the position and font details of every element of every match.
/// </summary>
static void Main(string[] args)
{
    // Set up the Bytescout.PDFExtractor.TextExtractor with the demo registration
    TextExtractor pdf = new TextExtractor
    {
        RegistrationName = "demo",
        RegistrationKey = "demo"
    };

    // Open the sample document
    pdf.LoadDocumentFromFile(@".\sample1.pdf");

    // Matching modes:
    //   WordMatchingMode.None       - search string is treated as a substring
    //   WordMatchingMode.ExactMatch - search string is treated as a separate word
    //   WordMatchingMode.SmartMatch - finds the word in various forms (like Adobe Reader)
    pdf.WordMatchingMode = WordMatchingMode.ExactMatch;

    int totalPages = pdf.GetPageCount();
    for (int page = 0; page < totalPages; page++)
    {
        // Find() positions on the first match on the page, FindNext() on each following one
        for (bool hit = pdf.Find(page, "ipsum", false); hit; hit = pdf.FindNext())
        {
            Console.WriteLine("");
            Console.WriteLine("Found on page " + page + " at location " + pdf.FoundText.Bounds.ToString());
            Console.WriteLine("");

            // A match can consist of several elements; report each one
            foreach (SearchResultElement piece in pdf.FoundText.Elements)
            {
                Console.WriteLine("Element #" + piece.Index + " at left=" + piece.Left + "; top=" + piece.Top + "; width=" + piece.Width + "; height=" + piece.Height);
                Console.WriteLine("Text: " + piece.Text);
                Console.WriteLine("Font is bold: " + piece.FontIsBold);
                Console.WriteLine("Font is italic:" + piece.FontIsItalic);
                Console.WriteLine("Font name: " + piece.FontName);
                Console.WriteLine("Font size:" + piece.FontSize);
                Console.WriteLine("Font color:" + piece.FontColor);
            }
        }
    }

    // Release the extractor
    pdf.Dispose();

    Console.WriteLine();
    Console.WriteLine("Press any key to continue...");
    Console.ReadLine();
}
/// <summary>
/// Searches "sample1.pdf" with the regular expression "LABORIS.*VELIT" and prints
/// the position and font details of every element of every match.
/// </summary>
static void Main(string[] args)
{
    // The using block releases the extractor; the original never disposed it.
    using (TextExtractor extractor = new TextExtractor())
    {
        extractor.RegistrationName = "demo";
        extractor.RegistrationKey = "demo";

        // Load sample PDF document
        extractor.LoadDocumentFromFile("sample1.pdf");

        int pageCount = extractor.GetPageCount();

        // Turn on the regular expression search
        extractor.RegexSearch = true;

        // Searches for the text starting from LABORIS and ending with VELIT words.
        // See the complete regular expressions reference at https://msdn.microsoft.com/en-us/library/az24scfc(v=vs.110).aspx
        string regexPattern = "LABORIS.*VELIT";

        // Search through pages
        for (int i = 0; i < pageCount; i++)
        {
            // Search each page for the pattern
            if (extractor.Find(i, regexPattern, false))
            {
                do
                {
                    Console.WriteLine("");
                    Console.WriteLine("Found on page " + i + " at location " + extractor.FoundText.Bounds.ToString());
                    Console.WriteLine("");

                    // Iterate through each element in the found text
                    foreach (SearchResultElement element in extractor.FoundText.Elements)
                    {
                        Console.WriteLine("Element #" + element.Index + " at left=" + element.Left + "; top=" + element.Top + "; width=" + element.Width + "; height=" + element.Height);
                        Console.WriteLine("Text: " + element.Text);
                        Console.WriteLine("Font is bold: " + element.FontIsBold);
                        Console.WriteLine("Font is italic:" + element.FontIsItalic);
                        Console.WriteLine("Font name: " + element.FontName);
                        Console.WriteLine("Font size:" + element.FontSize);
                        Console.WriteLine("Font color:" + element.FontColor);
                    }
                }
                while (extractor.FindNext());
            }
        }
    }

    Console.WriteLine();
    Console.WriteLine("Press any key to continue...");
    Console.ReadLine();
}
/// <summary>
/// Searches "SampleInvoice.pdf" for credit card numbers written as four groups of
/// four digits separated by spaces and prints each matching element.
/// Any exception is reported on the console.
/// </summary>
static void Main(string[] args)
{
    // Credit card number in format of (XXXX XXXX XXXX XXXX).
    // Regular expressions reference: https://msdn.microsoft.com/en-us/library/az24scfc(v=vs.110).aspx
    string cardPattern = @"[0-9]{4} [0-9]{4} [0-9]{4} [0-9]{4}";

    try
    {
        // The using block releases the Bytescout.PDFExtractor.TextExtractor when done
        using (TextExtractor pdf = new TextExtractor())
        {
            pdf.RegistrationName = "demo";
            pdf.RegistrationKey = "demo";

            // Open the sample document and switch the search to regular expressions
            pdf.LoadDocumentFromFile("SampleInvoice.pdf");
            pdf.RegexSearch = true;

            int totalPages = pdf.GetPageCount();
            for (int page = 0; page < totalPages; page++)
            {
                // Find() positions on the first match on the page, FindNext() on each following one
                for (bool hit = pdf.Find(page, cardPattern, false); hit; hit = pdf.FindNext())
                {
                    foreach (ISearchResultElement piece in pdf.FoundText.Elements)
                    {
                        Console.WriteLine("Found Credit Card Number: " + piece.Text);
                    }
                }
            }
        }
    }
    catch (Exception ex)
    {
        Console.WriteLine("Error: " + ex.Message);
    }

    Console.WriteLine();
    Console.WriteLine("Press enter key to continue...");
    Console.ReadLine();
}
/// <summary>
/// Searches "sample1.pdf" for the string "ipsum" and prints the position and
/// font details of every element of every match.
/// </summary>
static void Main(string[] args)
{
    // The using block releases the extractor; the original never disposed it.
    using (TextExtractor extractor = new TextExtractor())
    {
        extractor.RegistrationName = "demo";
        extractor.RegistrationKey = "demo";

        // Load sample PDF document
        extractor.LoadDocumentFromFile("sample1.pdf");

        int pageCount = extractor.GetPageCount();

        for (int i = 0; i < pageCount; i++)
        {
            // Search each page for "ipsum" string
            if (extractor.Find(i, "ipsum", false))
            {
                do
                {
                    Console.WriteLine("");
                    Console.WriteLine("Found on page " + i + " at location " + extractor.FoundText.Bounds.ToString());
                    Console.WriteLine("");

                    // Iterate through each element in the found text
                    foreach (SearchResultElement element in extractor.FoundText.Elements)
                    {
                        Console.WriteLine("Element #" + element.Index + " at left=" + element.Left + "; top=" + element.Top + "; width=" + element.Width + "; height=" + element.Height);
                        Console.WriteLine("Text: " + element.Text);
                        Console.WriteLine("Font is bold: " + element.FontIsBold);
                        Console.WriteLine("Font is italic:" + element.FontIsItalic);
                        Console.WriteLine("Font name: " + element.FontName);
                        Console.WriteLine("Font size:" + element.FontSize);
                        Console.WriteLine("Font color:" + element.FontColor);
                    }
                }
                while (extractor.FindNext());
            }
        }
    }

    Console.WriteLine();
    Console.WriteLine("Press any key to continue...");
    Console.ReadLine();
}
/// <summary>
/// Extracts every page of ".\sample2.pdf" that contains the keyword "bombardment"
/// into its own file ".\page{number}.pdf", where the number is 1-based.
/// </summary>
static void Main(string[] args)
{
    string inputFile = @".\sample2.pdf";

    // Set up the Bytescout.PDFExtractor.TextExtractor with the demo registration
    TextExtractor pdf = new TextExtractor
    {
        RegistrationName = "demo",
        RegistrationKey = "demo"
    };

    // Open the sample document
    pdf.LoadDocumentFromFile(inputFile);

    int totalPages = pdf.GetPageCount();
    for (int page = 0; page < totalPages; page++)
    {
        // Pages without the keyword are left alone
        if (!pdf.Find(page, "bombardment", false))
        {
            continue;
        }

        // A splitter is created for each matching page and disposed right after
        using (DocumentSplitter splitter = new DocumentSplitter("demo", "demo"))
        {
            splitter.OptimizeSplittedDocuments = true;

            // (!) ExtractPage() takes a 1-based page number, the search uses a 0-based index
            int pageNumber = page + 1;
            string outputFile = @".\page" + pageNumber + ".pdf";

            splitter.ExtractPage(inputFile, outputFile, pageNumber);
            Console.WriteLine("Extracted page " + pageNumber + " to file \"" + outputFile + "\"");
        }
    }

    // Release the extractor
    pdf.Dispose();

    Console.WriteLine();
    Console.WriteLine("Press any key...");
    Console.ReadKey();
}
/// <summary>
/// Searches "sample2.pdf" for "land" using smart word matching and prints the
/// location and the matched text of every occurrence.
/// </summary>
static void Main(string[] args)
{
    string searchString = "land";

    TextExtractor pdf = new TextExtractor("demo", "demo");

    // Open the document
    pdf.LoadDocumentFromFile("sample2.pdf");

    // Match the search string the way Adobe Reader does
    pdf.WordMatchingMode = WordMatchingMode.SmartMatch;

    int totalPages = pdf.GetPageCount();
    int page = 0;
    while (page < totalPages)
    {
        // Find() positions on the first occurrence on the page, FindNext() on each following one
        for (bool hit = pdf.Find(page, searchString, false); hit; hit = pdf.FindNext())
        {
            // Where the occurrence is
            Console.WriteLine("Found on page " + page + " at location " + pdf.FoundText.Bounds.ToString());

            // What text was actually matched
            string extractedString = pdf.FoundText.Text;
            Console.WriteLine("Found text: " + extractedString);
        }

        page++;
    }

    // Release the extractor
    pdf.Dispose();

    Console.WriteLine();
    Console.WriteLine("Press any key to exit...");
    Console.ReadKey();
}
/// <summary>
/// Prints, for every page of ".\sample2.pdf", the text found inside the
/// rectangle (0, 0, 200, 200).
/// </summary>
static void Main(string[] args)
{
    // Region to extract from; the same rectangle is applied to every page
    RectangleF region = new RectangleF(0, 0, 200, 200);

    TextExtractor pdf = new TextExtractor("demo", "demo");

    // Open the document
    pdf.LoadDocumentFromFile(@".\sample2.pdf");

    int totalPages = pdf.GetPageCount();
    int page = 0;
    while (page < totalPages)
    {
        // Limit extraction to the region and read the page text
        pdf.SetExtractionArea(region);
        string pageText = pdf.GetTextFromPage(page);

        Console.WriteLine("Extracted from page #" + page + ":");
        Console.WriteLine();
        Console.WriteLine(pageText);

        // Drop the region again before moving on
        pdf.ResetExtractionArea();

        Console.WriteLine();
        page++;
    }

    // Release the extractor
    pdf.Dispose();

    Console.WriteLine("Press any key to exit...");
    Console.ReadKey();
}
/// <summary>
/// Searches "sample2.pdf" for the string "what" and prints the location and
/// the matched text of every occurrence.
/// </summary>
static void Main(string[] args)
{
    // The using block releases the extractor; the original never disposed it.
    // The unused occurrence counter of the original has been removed.
    using (TextExtractor extractor = new TextExtractor("demo", "demo"))
    {
        // Load the document
        extractor.LoadDocumentFromFile("sample2.pdf");

        string searchString = "what";

        // Get page count
        int pageCount = extractor.GetPageCount();

        // Iterate through pages
        for (int i = 0; i < pageCount; i++)
        {
            // Search for text string
            if (extractor.Find(i, searchString, false))
            {
                do
                {
                    // Output search results
                    Console.WriteLine("Found on page " + i + " at location " + extractor.FoundText.Bounds.ToString());

                    // Now we are getting the found text
                    string extractedString = extractor.FoundText.Text;
                    Console.WriteLine("Extracted string: " + extractedString);
                }
                while (extractor.FindNext()); // Search next occurrence of the search string
            }
        }
    }

    Console.WriteLine("Press any key to exit...");
    Console.ReadKey();
}
/// <summary>
/// Click handler of the "find all" button: searches every page of the document shown in
/// the viewer for the text in tbSearchExpression, stores the bounds of the matches per page
/// in _foundTextRectangles and selects the matches of the current page in the viewer.
/// Search strings shorter than two characters are rejected with a message box.
/// </summary>
private void BtnFindAll_Click(object sender, EventArgs e)
{
    // Reject search strings of length 0 or 1
    if (tbSearchExpression.Text.Length <= 1)
    {
        MessageBox.Show(@"Try larger search string");
        return;
    }

    using (TextExtractor finder = new TextExtractor("demo", "demo"))
    {
        // Search the same file the viewer displays
        finder.LoadDocumentFromFile(pdfViewerControl1.InputFile);

        // Regex mode comes from the UI; word matching is plain substring search
        finder.RegexSearch = cbRegex.Checked;
        finder.WordMatchingMode = WordMatchingMode.None;

        // Remember the rectangles of all matches, page by page; pages without matches are not touched
        for (int page = 0; page < finder.GetPageCount(); page++)
        {
            ISearchResult[] matches = finder.FindAll(page, tbSearchExpression.Text, caseSensitive: true);
            if (matches.Length == 0)
            {
                continue;
            }

            _foundTextRectangles[page] = matches.Select(match => match.Bounds).ToArray();
        }
    }

    // Highlight the matches on the page currently shown, if there are any
    if (_foundTextRectangles.ContainsKey(pdfViewerControl1.CurrentPageIndex))
    {
        pdfViewerControl1.SelectionInPoints = _foundTextRectangles[pdfViewerControl1.CurrentPageIndex];
    }
}
/// <summary>
/// Prints, for every page of "../../sample2.pdf", the text found inside the
/// rectangle (0, 0, 200, 200).
/// </summary>
static void Main(string[] args)
{
    // The using block releases the extractor; the original never disposed it.
    // The unused counter of the original has been removed.
    using (TextExtractor extractor = new TextExtractor("demo", "demo"))
    {
        // Load the document
        extractor.LoadDocumentFromFile("../../sample2.pdf");

        // Get page count
        int pageCount = extractor.GetPageCount();

        // Iterate through pages
        for (int i = 0; i < pageCount; i++)
        {
            // Define rectangle location to extract from
            RectangleF location = new RectangleF(0, 0, 200, 200);

            // Set extraction area
            extractor.SetExtractionArea(location);

            // Extract text bounded by the extraction area
            string extractedString = extractor.GetTextFromPage(i);

            Console.WriteLine("Extracted from page #" + i + ":\r\n" + extractedString);

            // Reset extraction area to full page (by default)
            extractor.ResetExtractionArea();

            Console.WriteLine("\r\n");
        }
    }

    Console.WriteLine("Press any key to exit...");
    Console.ReadKey();
}
/// <summary>
/// Finds social security numbers written like 202-55-0130 in ".\samplePDF_SSNNo.pdf",
/// removes them with a mask, saves the result to ".\samplePDF_SSNNo_edited.pdf"
/// and opens it. Any exception is reported on the console.
/// </summary>
static void Main(string[] args)
{
    string inputDocument = @".\samplePDF_SSNNo.pdf";
    string outputDocument = @".\samplePDF_SSNNo_edited.pdf";

    // Search SSN in format 202-55-0130 using regular expression.
    // See the complete regular expressions reference at https://msdn.microsoft.com/en-us/library/az24scfc(v=vs.110).aspx
    string regexPattern = "[0-9]{3}-[0-9]{2}-[0-9]{4}";

    try
    {
        // Create Bytescout.PDFExtractor.TextExtractor and Remover2 instances
        using (TextExtractor extractor = new TextExtractor("demo", "demo"))
        using (Remover2 remover = new Remover2("demo", "demo"))
        {
            // Both objects must work on the same document: the rectangles found by the
            // extractor are applied by the remover. The original passed a separate
            // hard-coded path to the extractor; use the single inputDocument for both.
            extractor.LoadDocumentFromFile(inputDocument);
            remover.LoadDocumentFromFile(inputDocument);

            // Enable the regular expressions
            extractor.RegexSearch = true;

            int pageCount = extractor.GetPageCount();

            // Search through pages
            for (int pageIndex = 0; pageIndex < pageCount; pageIndex++)
            {
                // Search each page for the pattern
                ISearchResult[] searchResults = extractor.FindAll(pageIndex, regexPattern, caseSensitive: false);

                foreach (var element in searchResults)
                {
                    Console.WriteLine("Found SSN No: " + element.Text);

                    // Add rectangle of the found SSN to Remover
                    remover.AddTextToRemove(pageIndex, element.Bounds);
                }
            }

            // Mask replaced text with black rectangle
            remover.MaskRemovedText = true;

            // Change the color of the mask rectangle, if necessary
            //remover.MaskColor = Color.Red;

            remover.PerformRemoval(outputDocument);

            Console.WriteLine("Found SSNs removed, result saved to file \"" + outputDocument + "\"");
        }

        // Open result file in default associated application (for the demonstration purpose)
        var processStartInfo = new ProcessStartInfo(outputDocument) { UseShellExecute = true };
        Process.Start(processStartInfo);
    }
    catch (Exception ex)
    {
        Console.WriteLine("Error: " + ex.Message);
    }

    Console.WriteLine();
    Console.WriteLine("Press enter key to continue...");
    Console.ReadLine();
}
/// <summary>
/// Reads part-list rows from each PDF in <paramref name="urllist"/> and collects them in a
/// dictionary. For every file, the pages starting at index 1 are split into text lines; a
/// title line (first line containing "part") and a header line are detected, and each
/// following line with the expected number of space-separated columns becomes one
/// PDFLineInfo entry. Progress is reported to the console and to Form1 after each file.
/// </summary>
/// <param name="urllist">Paths of the PDF files to load, processed in order.</param>
/// <returns>
/// The parsed rows, keyed by the value of nTotalIndex (declared outside this method) at the
/// time each row was added.
/// </returns>
public static Dictionary<int, PDFLineInfo> ScrapyDataFromPDFiles(string[] urllist)
{
    Dictionary<int, PDFLineInfo> dictionary = new Dictionary<int, PDFLineInfo>();

    // The using block releases the extractor even when parsing throws;
    // the original never disposed it.
    using (TextExtractor extractor = new TextExtractor())
    {
        extractor.RegistrationName = "demo";
        extractor.RegistrationKey = "demo";

        for (int nFileIndex = 0; nFileIndex < urllist.Length; nFileIndex++)
        {
            string currentFileName = urllist[nFileIndex];
            string currentTitleName = "";

            // Load each PDF Document
            extractor.LoadDocumentFromFile(currentFileName);
            int pageCount = extractor.GetPageCount();

            // -1: not detected yet, 1: material type, 2: spirit type, 3: empty type
            int pdfDocumentType = -1;

            // NOTE(review): the loop starts at page index 1, so page 0 is never read - confirm intended.
            for (int i = 1; i < pageCount; i++)
            {
                // "notable" marks a document in which no title line was found
                if (currentTitleName.Contains("notable"))
                {
                    break;
                }

                string wholetext = extractor.GetTextFromPage(i);
                string[] lines = wholetext.Split(new[] { "\r\n", "\r", "\n" }, StringSplitOptions.None);

                int j = 0;
                while (j < lines.Length)
                {
                    string lowered = lines[j].ToLower();

                    // 1. The first line containing "part" supplies the title
                    if (lowered.Contains("part") && !findTitle)
                    {
                        currentTitleName = detectTitle(lines[j]);
                        findTitle = true;
                        j++;
                        continue;
                    }

                    if (currentTitleName.Contains("notable"))
                    {
                        break;
                    }

                    if (!findTitle)
                    {
                        // Give up on the document when no title shows up within the first lines
                        if (j > 2)
                        {
                            currentTitleName = "notable";
                            break;
                        }

                        j++;
                        continue;
                    }

                    var array = lines[j].Split(new string[] { " " }, StringSplitOptions.RemoveEmptyEntries);

                    if (!findheader && findTitle)
                    {
                        bool looksLikeHeader =
                            lowered.Contains("dyaco") ||
                            lowered.Contains("material") ||
                            lowered.Contains("spirit") ||
                            lowered.Contains("no") ||
                            lowered.Contains("part") ||
                            lowered.Contains("qty");

                        if (looksLikeHeader)
                        {
                            findheader = true;

                            if (lowered.Contains("material"))
                            {
                                pdfDocumentType = 1;
                            }
                            else if (lowered.Contains("spirit"))
                            {
                                pdfDocumentType = 2;
                            }
                            else
                            {
                                if (array.Length > 2)
                                {
                                    if (array[2].ToLower().Contains("part"))
                                    {
                                        pdfDocumentType = 2;
                                    }
                                }

                                // NOTE(review): this assignment always overrides the 2 set just above,
                                // so that branch has no effect. Kept as in the original because the
                                // column handling below depends on the resulting type - confirm whether
                                // a missing "else" was intended.
                                pdfDocumentType = 3;
                            }

                            j++;
                            continue;
                        }

                        if (array.Length < 4)
                        {
                            j++;
                            continue;
                        }
                    }

                    // Skip the watermark lines added by the trial version of the SDK
                    if (lines[j].Contains("(TRIAL VER. PDF Extractor SDK 8.4.1.2829.888331924)") ||
                        lines[j].Contains("TRIAL VERSION EXPIRES 90 DAYS AFTER INSTALLATION"))
                    {
                        j++;
                        continue;
                    }

                    if (pdfDocumentType == -1)
                    {
                        MessageBox.Show("Can not get pdf Type");
                    }

                    // Type 3 rows have three columns (and a short key), the others four
                    if (pdfDocumentType == 3)
                    {
                        if ((array.Length != 3) || (array[0].Length > 5))
                        {
                            j++;
                            continue;
                        }
                    }
                    else
                    {
                        if (array.Length != 4)
                        {
                            j++;
                            continue;
                        }
                    }

                    // A new instance per row: the original filled one shared instance and added it
                    // for every row, so if PDFLineInfo is a reference type all dictionary entries
                    // pointed at the same object holding the values of the last row.
                    PDFLineInfo temp = new PDFLineInfo();

                    switch (pdfDocumentType)
                    {
                        case 1:
                            temp.PartID = RemoveSpace(array[1]);
                            temp.PartName = RemoveSpace(array[2]);
                            temp.PartKey = RemoveSpace(array[0]);
                            temp.Quantity = Int32.Parse(RemoveSpace(array[3]));
                            break;

                        case 2:
                            temp.PartID = RemoveSpace(array[2]);
                            temp.PartName = RemoveSpace(array[1]);
                            temp.PartKey = RemoveSpace(array[0]);
                            temp.Quantity = Int32.Parse(RemoveSpace(array[3]));
                            break;

                        case 3:
                            temp.PartID = ""; // this layout has no part id column
                            temp.PartName = RemoveSpace(array[1]);
                            temp.PartKey = RemoveSpace(array[0]);
                            temp.Quantity = Int32.Parse(RemoveSpace(array[2]));
                            break;
                    }

                    temp.ProductName = currentTitleName;

                    j++;

                    // 2. Add values to PDFLineInfo
                    dictionary.Add(nTotalIndex, temp);
                    nTotalIndex++;
                }
            }

            // Reset the per-document detection state for the next file
            findheader = false;
            findTitle = false;
            currentTitleName = "";

            int currentpercent = (int)(20 * nFileIndex / urllist.Length);
            Console.WriteLine("*******" + nFileIndex + "*********" + currentpercent + "********");

            // Updating value. Both controls are updated through BeginInvoke; the original
            // marshalled only the progress bar and wrote the label text directly.
            Form1.progressvalue = currentpercent;
            Form1.progressBar1.BeginInvoke(new Action(() =>
            {
                Form1.progressBar1.Value = currentpercent;
                Form1.percentlabel.Text = currentpercent.ToString() + "%";
            }));
        }
    }

    return dictionary;
}
/// <summary>
/// Reads the invoice number, invoice date, total and the table data from "Invoice.pdf".
/// Each value is taken from the area to the right of its label ("Invoice No.",
/// "Invoice Date", "TOTAL"); the table is the region between the "Quantity" header and
/// the "TOTAL" line and is extracted as XML. The results are printed to the console.
/// </summary>
static void Main(string[] args)
{
    // Results
    string invoiceNo = string.Empty;
    string invoiceDate = string.Empty;
    string total = string.Empty;
    string tableData = string.Empty;

    // The using blocks release both extractors; the original never disposed them.
    using (TextExtractor textExtractor = new TextExtractor("demo", "demo"))
    using (XMLExtractor xmlExtractor = new XMLExtractor("demo", "demo"))
    {
        // Set exact search (default is SmartSearch that works like in Adobe Reader)
        textExtractor.WordMatchingMode = WordMatchingMode.ExactMatch;

        // Load document
        textExtractor.LoadDocumentFromFile("Invoice.pdf");
        xmlExtractor.LoadDocumentFromFile("Invoice.pdf");

        int pageCount = textExtractor.GetPageCount();

        // Iterate pages
        for (int i = 0; i < pageCount; i++)
        {
            RectangleF pageRectangle = textExtractor.GetPageRectangle(i);
            RectangleF tableRect = new RectangleF(0, 0, pageRectangle.Width, 0);

            // Search for "Invoice No."
            if (textExtractor.Find(i, "Invoice No.", false))
            {
                // Get the found text rectangle
                RectangleF textRect = textExtractor.FoundText.Bounds;

                // Assume the text at right is the invoice number.
                // Shift the rectangle to the right:
                textRect.X = textRect.Right;
                textRect.Width = pageRectangle.Right - textRect.Left;

                // Set the extraction region and extract the text
                textExtractor.SetExtractionArea(textRect);
                invoiceNo = textExtractor.GetTextFromPage(i).Trim();

                // Back to the full page so the next search is not limited to this region
                textExtractor.ResetExtractionArea();
            }

            // Search for "Invoice Date" and extract text at right
            if (textExtractor.Find(i, "Invoice Date", false))
            {
                RectangleF textRect = textExtractor.FoundText.Bounds;
                textRect.X = textRect.Right;
                textRect.Width = pageRectangle.Right - textRect.Left;

                textExtractor.SetExtractionArea(textRect);
                invoiceDate = textExtractor.GetTextFromPage(i).Trim();
                textExtractor.ResetExtractionArea();
            }

            // Search for "Quantity" keyword to detect the top of the tabular data rectangle
            if (textExtractor.Find(i, "Quantity", false))
            {
                // Keep the top table coordinate
                // (use the bottom of the found bounds instead if you want to skip column headers)
                tableRect.Y = textExtractor.FoundText.Bounds.Top;
            }

            // Search for "TOTAL" (it will be also the bottom of tabular data rectangle)
            if (textExtractor.Find(i, "TOTAL", true /* case sensitive! */))
            {
                RectangleF textRect = textExtractor.FoundText.Bounds;
                textRect.X = textRect.Right;
                textRect.Width = pageRectangle.Right - textRect.Left;

                textExtractor.SetExtractionArea(textRect);
                total = textExtractor.GetTextFromPage(i).Trim();
                textExtractor.ResetExtractionArea();

                // Calculate the table height
                tableRect.Height = textRect.Top - tableRect.Top;
            }

            // Extract tabular data using XMLExtractor
            if (tableRect.Height > 0)
            {
                xmlExtractor.SetExtractionArea(tableRect);
                tableData = xmlExtractor.GetXMLFromPage(i);
            }
        }
    }

    // Display extracted data
    Console.WriteLine("Invoice No.: " + invoiceNo);
    Console.WriteLine("Invoice Date: " + invoiceDate);
    Console.WriteLine("TOTAL: " + total);
    Console.WriteLine("Table Data: ");
    Console.WriteLine(tableData);

    Console.WriteLine("Press any key...");
    Console.ReadKey();
}