/// <summary>
/// Extracts the text of the first page of the sample PDF (keeping white space),
/// writes it to a text file and shows the output location in a message box.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void button1_Click(object sender, EventArgs e)
{
    String input = @"..\..\..\..\..\..\Data\PDFTemplate-Az.pdf";

    // Read a pdf file
    PdfDocument doc = new PdfDocument();
    doc.LoadFromFile(input);

    String text;
    try
    {
        // Get the first page
        PdfPageBase page = doc.Pages[0];

        // Extract text from page keeping white space
        // (pass false instead to extract without keeping white space)
        text = page.ExtractText(true);
    }
    finally
    {
        // Release the document even when extraction fails
        doc.Close();
    }

    String result = Path.GetFullPath("ExtractTextFromParticularPage_out.txt");

    // The using block closes the writer even if WriteLine throws
    using (TextWriter tw = new StreamWriter(result))
    {
        tw.WriteLine(text);
    }

    MessageBox.Show("\nText extracted successfully from particular pages of PDF Document.\nFile saved at " + result);
}
/// <summary>
/// Loads the sample invoice, finds the first word on the first page whose bounds
/// intersect a fixed rectangle and writes that word to "data.txt".
/// An empty string is written when no word intersects the rectangle.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    //Load the PDF document
    PdfLoadedDocument loadedDocument = new PdfLoadedDocument("../../../../../../Data/Invoice.pdf");

    string invoiceNumber = "";
    try
    {
        // Get the first page of the loaded PDF document
        PdfPageBase page = loadedDocument.Pages[0];

        // Extract the text lines of the first page together with their bounds
        TextLines lineCollection;
        page.ExtractText(out lineCollection);

        RectangleF textBounds = new RectangleF(474, 161, 50, 9);

        // Stop at the first word inside the bounds. A plain `break` would only
        // leave the inner loop and let later lines overwrite the result.
        bool found = false;
        foreach (TextLine txtLine in lineCollection)
        {
            foreach (TextWord word in txtLine.WordCollection)
            {
                if (textBounds.IntersectsWith(word.Bounds))
                {
                    invoiceNumber = word.Text;
                    found = true;
                    break;
                }
            }

            if (found)
            {
                break;
            }
        }
    }
    finally
    {
        //Close the PDF document, also when extraction fails
        loadedDocument.Close(true);
    }

    File.WriteAllText("data.txt", invoiceNumber);
}
/// <summary>
/// Pulls the text found inside a fixed rectangle on the first page of the sample PDF,
/// stores it (followed by a line break) in a text file and passes that file to Viewer.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void button1_Click(object sender, EventArgs e)
{
    const string outputFile = "ExtractText_result.txt";

    // Open the sample document
    PdfDocument document = new PdfDocument();
    document.LoadFromFile(@"..\..\..\..\..\..\Data\ExtractTextFromSpecificArea.pdf");

    // Only the text inside this region of the first page is extracted
    RectangleF region = new RectangleF(80, 180, 500, 200);
    string areaText = document.Pages[0].ExtractText(region);

    // Write the text plus a trailing newline, then display the result
    File.WriteAllText(outputFile, areaText + Environment.NewLine);
    Viewer(outputFile);
}
/// <summary>
/// Collects the text covered by every text-markup annotation on the first page of the
/// sample PDF, writes the collected lines to a text file and passes that file to DocumentViewer.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void button1_Click(object sender, EventArgs e)
{
    StringBuilder stringBuilder = new StringBuilder();
    stringBuilder.AppendLine("Extracted hightlighted text:");

    //Create a pdf document and load the sample file
    PdfDocument doc = new PdfDocument();
    doc.LoadFromFile(@"..\..\..\..\..\..\Data\ExtractHighlightedText.pdf");

    try
    {
        PdfPageBase page = doc.Pages[0];

        for (int i = 0; i < page.AnnotationsWidget.Count; i++)
        {
            // A single `as` cast replaces the former `is` check followed by `as`
            PdfTextMarkupAnnotationWidget textMarkupAnnotation =
                page.AnnotationsWidget[i] as PdfTextMarkupAnnotationWidget;
            if (textMarkupAnnotation == null)
            {
                continue;
            }

            //Get the text located inside the annotation bounds
            stringBuilder.AppendLine(page.ExtractText(textMarkupAnnotation.Bounds));
        }
    }
    finally
    {
        // Release the document even when extraction fails
        doc.Close();
    }

    String result = "ExtractHighlightedText.txt";
    File.WriteAllText(result, stringBuilder.ToString());
    DocumentViewer(result);
}
/// <summary>
/// Reads the first page of the sample invoice with layout-aware extraction
/// and stores the resulting text in "data.txt".
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    const string sourcePdf = "../../../../../../Data/Invoice.pdf";
    const string outputFile = "data.txt";

    // Open the invoice and grab its first page
    PdfLoadedDocument invoice = new PdfLoadedDocument(sourcePdf);
    PdfPageBase firstPage = invoice.Pages[0];

    // Passing true requests extraction with layout
    string layoutText = firstPage.ExtractText(true);

    // Persist the text, then release the document
    File.WriteAllText(outputFile, layoutText);
    invoice.Close(true);
}
/// <summary>
/// Extracts the text inside a rectangle derived from the fields x and y on the first page
/// of the PDF at index currentPageNum - 1 of pdfFileName, strips the Spire.PDF evaluation
/// warning, writes the result to "Extract.txt" and starts that file.
/// </summary>
private void getPDFMsg()
{
    const string outputFile = "Extract.txt";
    const string evaluationBanner = "Evaluation Warning : The document was created with Spire.PDF for .NET.";

    PdfDocument document = new PdfDocument();
    document.LoadFromFile(pdfFileName[currentPageNum - 1]);
    PdfPageBase firstPage = document.Pages[0];

    // Each rectangle component is truncated to an int before the rectangle is built
    int left = (int)(x.X - 70);
    int top = (int)(x.Y - 30);
    int width = (int)(y.X - 80);
    int height = (int)(y.Y - 135);

    // Extract the text inside the specified rectangular area of the first page
    string regionText = firstPage.ExtractText(new RectangleF(left, top, width, height));

    StringBuilder buffer = new StringBuilder();
    buffer.AppendLine(regionText);

    // Remove the evaluation banner before saving
    string cleaned = buffer.ToString().Replace(evaluationBanner, "");
    File.WriteAllText(outputFile, cleaned);

    Process.Start(outputFile);
}
/// <summary>
/// Remembers the given file in SaveFilePdf.openFile, extracts the text of its first page
/// and adds the file to the most-recently-used list.
/// </summary>
/// <param name="openFile">The PDF file to read; may be null.</param>
/// <returns>The text of the first page, or an empty string when <paramref name="openFile"/> is null.</returns>
public static async Task <string> Read(StorageFile openFile)
{
    SaveFilePdf.openFile = openFile;

    if (openFile == null)
    {
        return "";
    }

    PdfLoadedDocument loadedDocument = new PdfLoadedDocument();
    await loadedDocument.OpenAsync(openFile).ConfigureAwait(true);

    string extractedText;
    try
    {
        PdfPageBase page = loadedDocument.Pages[0];
        extractedText = page.ExtractText();
    }
    finally
    {
        // Release the document even when extraction fails
        loadedDocument.Close(true);
        loadedDocument.Dispose();
    }

    // Register the file in the MRU list; the returned token is not needed here
    var mru = Windows.Storage.AccessCache.StorageApplicationPermissions.MostRecentlyUsedList;
    mru.Add(openFile, "Pdf file");

    return extractedText;
}
/// <summary>
/// Opens the sample invoice through a read-only file stream, extracts the text
/// of its first page and writes it to "data.txt".
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    string extractedText;

    // Read-only access is sufficient for extraction and also works for read-only files.
    // The using block guarantees the stream is disposed.
    using (FileStream inputStream = new FileStream("../../../../../../../Data/Invoice.pdf", FileMode.Open, FileAccess.Read))
    {
        //Load the PDF document
        PdfLoadedDocument loadedDocument = new PdfLoadedDocument(inputStream);
        try
        {
            // Get the first page of the loaded PDF document
            PdfPageBase page = loadedDocument.Pages[0];

            // Extract the text of the first page
            extractedText = page.ExtractText();
        }
        finally
        {
            //Close the document, also when extraction fails
            loadedDocument.Close(true);
        }
    }

    //Save the text to file
    File.WriteAllText("data.txt", extractedText);
}
/// <summary>
/// Returns the text of the first page of the PDF located at <c>filename.Path</c>.
/// </summary>
/// <param name="filename">Object whose Path property is passed to the PDF loader.</param>
/// <returns>The text extracted from the first page.</returns>
public string StripPDF(PDF filename)
{
    // Open the document referenced by the argument
    PdfLoadedDocument document = new PdfLoadedDocument(filename.Path);

    // Read the first page in one step
    string firstPageText = document.Pages[0].ExtractText();

    // Release the document before handing the text back
    document.Close(true);

    return firstPageText;
}
/// <summary>
/// Extracts the text of the first page of the given PDF with a simple extraction strategy
/// and writes it to "result_spire.txt" in the same directory as the PDF.
/// Any failure is reported on the console instead of being thrown.
/// </summary>
/// <param name="fileName">Path of the PDF file to read.</param>
public static void GetElements(string fileName)
{
    try
    {
        PdfDocument doc = new PdfDocument();
        doc.LoadFromFile(fileName);

        string text;
        try
        {
            PdfPageBase page = doc.Pages[0];
            SimpleTextExtractionStrategy strategy = new SimpleTextExtractionStrategy();
            text = page.ExtractText(strategy);
        }
        finally
        {
            // Release the document even when extraction fails
            doc.Close();
        }

        // Path.Combine keeps a bare file name relative to the current directory;
        // concatenating "\\result_spire.txt" to an empty directory name would
        // instead point at the root of the current drive.
        string outputPath = Path.Combine(Path.GetDirectoryName(fileName) ?? string.Empty, "result_spire.txt");

        // The using blocks flush and close both streams even if Write throws
        using (FileStream fs = new FileStream(outputPath, FileMode.Create))
        using (StreamWriter sw = new StreamWriter(fs))
        {
            sw.Write(text);
        }
    }
    catch (Exception e)
    {
        // Best-effort helper: report the problem and carry on
        Console.WriteLine(e.Message);
    }
}
/// <summary>
/// Extracts the text of the third page of the PDF chosen in fileDialog, writes it to
/// "Result_PDF.txt" and shows the content of that file in textBox1.
/// Shows a message box instead when no file has been chosen.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void button2_Click(object sender, EventArgs e)
{
    if (fileDialog.FileName.Length == 0)
    {
        MessageBox.Show("PDF 파일을 선택해 주세요.");
        return;
    }

    PdfDocument doc = new PdfDocument();
    doc.LoadFromFile(fileDialog.FileName);

    string text;
    try
    {
        // NOTE(review): index 2 addresses the third page; a document with fewer
        // pages presumably fails here - confirm whether a page-count check is wanted.
        PdfPageBase page = doc.Pages[2];
        SimpleTextExtractionStrategy strategy = new SimpleTextExtractionStrategy();
        text = page.ExtractText(strategy);
    }
    finally
    {
        // Release the document even when extraction fails
        doc.Close();
    }

    // The using blocks flush and close both streams even if Write throws
    using (FileStream fs = new FileStream("Result_PDF.txt", FileMode.Create))
    using (StreamWriter sw = new StreamWriter(fs))
    {
        sw.Write(text);
    }

    textBox1.Text = System.IO.File.ReadAllText("Result_PDF.txt");
}
/// <summary>
/// Lets the user pick a PDF, extracts the text of its first page and, for every extracted
/// line containing "$", sends the line to APIRequest and appends a Success/Failure entry
/// to displayText. Progress is reported through statusBox and messageBox.
/// Does nothing further when the file dialog is cancelled.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void button1_Click(object sender, EventArgs e)
{
    var filePath = string.Empty;

    using (OpenFileDialog openFileDialog = new OpenFileDialog())
    {
        this.statusBox.Text += Environment.NewLine + "Selecting file..." + Environment.NewLine;
        openFileDialog.InitialDirectory = "c:\\";
        openFileDialog.Filter = "pdf files (*.pdf)|*.pdf|All files (*.*)|*.*";
        // FilterIndex is 1-based, so 2 preselects "All files".
        // NOTE(review): confirm the pdf filter was not the intended default.
        openFileDialog.FilterIndex = 2;
        openFileDialog.RestoreDirectory = true;

        // Without a selection there is no file to load; bail out instead of
        // passing an empty path to the PDF loader.
        if (openFileDialog.ShowDialog() != DialogResult.OK)
        {
            return;
        }

        this.messageBox.Text = " ";

        //Get the path of specified file
        filePath = openFileDialog.FileName;
    }

    this.statusBox.Text += "Processing PDF file..." + Environment.NewLine;
    this.fileName.Text = filePath;

    PdfDocument doc = new PdfDocument();
    doc.LoadFromFile(filePath);

    string text;
    try
    {
        PdfPageBase page = doc.Pages[0];
        SimpleTextExtractionStrategy strategy = new SimpleTextExtractionStrategy();
        text = page.ExtractText(strategy);
    }
    finally
    {
        // Release the document even when extraction fails
        doc.Close();
    }

    string[] stringArray = text.Split(Environment.NewLine);

    this.statusBox.Text += "API Request/Response..." + Environment.NewLine;

    foreach (string line in stringArray)
    {
        // Only lines containing a dollar sign are submitted
        if (!line.Contains("$"))
        {
            continue;
        }

        string[] lineSplit = line.Split(' ');
        string requestString = "method=test&requestString=" + line;

        // Show the first two space-separated tokens, or only the first
        // when the line contains no space.
        string label = lineSplit.Length > 1
            ? lineSplit[0] + " " + lineSplit[1]
            : lineSplit[0];
        this.displayText.Text += "--> " + label;

        string responseString = APIRequest(requestString);
        string[] responseArray = responseString.Split(':');

        // The text after the first ':' (closing braces removed) must be "0" for success;
        // a response without ':' is reported as a failure instead of throwing.
        bool succeeded = responseArray.Length > 1
            && responseArray[1].Replace("}", string.Empty) == "0";

        if (succeeded)
        {
            this.displayText.Text += ": Success" + Environment.NewLine;
        }
        else
        {
            this.displayText.Text += ": Failure" + Environment.NewLine;
        }
    }

    this.messageBox.Text = "Click the 'Exit' button to end the application.";
    this.statusBox.Text += "Job End...";
}
/// <summary>
/// Extracts the text of all given pages and fills chartObj with sentence, paragraph and
/// word statistics, the ten most frequent words and the details produced by
/// GetOtherPDFReportData.
/// </summary>
/// <param name="pages">Pages to analyse; null is treated as an empty document.</param>
/// <returns>The populated chartObj instance.</returns>
/// <exception cref="Exception">Wraps any failure raised while the report is generated.</exception>
public ChartData GeneratePDFReport(PdfLoadedPageCollection pages)
{
    try
    {
        StringBuilder extractedText = new StringBuilder();
        if (pages != null)
        {
            foreach (PdfPageBase page in pages)
            {
                extractedText.Append(page.ExtractText());
            }
        }

        // Materialize the text once instead of calling ToString() for every query
        string fullText = extractedText.ToString();

        // NOTE(review): the second separator is CR LF followed by a backslash;
        // presumably a plain "\r\n" was intended - confirm before changing.
        var sentences = fullText.Split(new string[] { ". ", "\r\n\\" }, StringSplitOptions.None);

        // Grouping by the sentence text already yields one entry per distinct sentence
        var sentenceReport = (from sentence in sentences
                              where sentence != string.Empty
                              group sentence by sentence into tempBag
                              select new
                              {
                                  Value = tempBag.Key,
                                  Length = tempBag.Key.Length
                              }).ToList();

        chartObj.NumberOfSentences = sentenceReport.Count;
        if (chartObj.NumberOfSentences > 0)
        {
            chartObj.AvgSetenceLength = sentenceReport.Sum(x => x.Length) / chartObj.NumberOfSentences;
        }

        string[] source = fullText.Split(
            new char[] { '.', '?', '!', ' ', ';', ':', ',', '_' },
            StringSplitOptions.RemoveEmptyEntries);

        // A token consisting solely of a line break is counted as a paragraph boundary
        chartObj.ParagraphCount = source.Count(word => word == "\r\n");

        // One entry per distinct word, most frequent first
        // (RemoveEmptyEntries above guarantees there are no empty tokens)
        var wordReport = (from word in source
                          group word by word into tempBag
                          let count = tempBag.Count()
                          orderby count descending
                          select new
                          {
                              Value = tempBag.Key,
                              Count = count,
                              Length = tempBag.Key.Length
                          }).ToList();

        chartObj.TotalUniqueWords = wordReport.Count;
        if (chartObj.TotalUniqueWords > 0)
        {
            chartObj.AverageWordLength = wordReport.Sum(x => x.Length) / chartObj.TotalUniqueWords;
        }

        // Drop the line-break token before taking ten entries so it cannot
        // use up one of the ten slots; wordReport is already sorted by frequency.
        var topTenOccuringWords = wordReport
            .Where(obj => obj.Value != "\r\n")
            .Take(10)
            .Select(obj => new ChartData { Word = obj.Value, Count = obj.Count })
            .ToList();

        chartObj.Top10WordsFromPDFLoaded = new ObservableCollection <ChartData>(topTenOccuringWords);
        chartObj.ListOfDetailsToPrint = GetOtherPDFReportData(chartObj);
    }
    catch (Exception ex)
    {
        // Exception type and message are kept as-is because callers may depend on them
        throw new Exception("Error while generating PDF Reports in PdfViewer.ViewModels.PDFViewModel.GeneratePDFReport", ex);
    }

    return(chartObj);
}