/// <summary>
/// Extracts the text of every page of a PDF file and returns it as one string.
/// </summary>
/// <param name="sFilePath">Path of the PDF file to read.</param>
/// <returns>
/// The concatenated text of all pages, or <c>null</c> when reading failed
/// (in that case an error dialog is shown before returning).
/// </returns>
private async Task<string> pdfTextExtract(string sFilePath)
{
    try
    {
        // StringBuilder avoids re-copying the accumulated text for every page.
        System.Text.StringBuilder texto = new System.Text.StringBuilder();

        // The using blocks release the reader and the document even when a page fails to parse.
        using (PdfReader reader = new PdfReader(sFilePath))
        using (iText.Kernel.Pdf.PdfDocument pdf = new iText.Kernel.Pdf.PdfDocument(reader))
        {
            int pageCount = pdf.GetNumberOfPages();

            // Pages are addressed starting at 1.
            for (int page = 1; page <= pageCount; page++)
            {
                // A fresh strategy per page, as in the original implementation.
                ITextExtractionStrategy its = new SimpleTextExtractionStrategy();
                texto.Append(PdfTextExtractor.GetTextFromPage(pdf.GetPage(page), its));
            }
        }

        return texto.ToString();
    }
    catch (Exception Ex)
    {
        await new MessageDialog("Error al abrir archivo: " + Ex.Message).ShowAsync();
        return null;
    }
}
//////////////////////////////////////////////////////////
//// public properties

#region public properties

/// <summary>
/// Extracts the text of a portable document, one page at a time.
/// </summary>
/// <param name="file">The portable document file stream.</param>
/// <param name="errors">
/// Exceptions raised while extracting individual pages.
/// <c>null</c> when every page was read successfully.
/// </param>
/// <returns>The concatenated, cleaned text of all pages that could be read.</returns>
public static string Convert(Stream file, out List<Exception> errors)
{
    errors = null;
    var text = new StringBuilder();

    using (var pdfReader = new iText7.PdfReader(file))
    using (var document = new iText7.PdfDocument(pdfReader))
    {
        int lastPage = document.GetNumberOfPages();
        int pageNumber = 1;

        while (pageNumber <= lastPage)
        {
            try
            {
                string rawText = iText7.Canvas.Parser.PdfTextExtractor.GetTextFromPage(document.GetPage(pageNumber));
                text.Append(Common.CleanPdfText(rawText));
            }
            catch (Exception pageError)
            {
                // A failing page is recorded and skipped; the list is only created on the first failure.
                (errors ?? (errors = new List<Exception>())).Add(pageError);
            }

            pageNumber++;
        }
    }

    return text.ToString();
}
/// <summary>
/// Reads the PDF at <c>Location</c> page by page, passes each page's text to
/// <c>AddToIndex</c>, and writes progress and timing information to the console.
/// </summary>
protected override void Index()
{
    // Stopwatch is monotonic; DateTime.Now can jump when the system clock is adjusted.
    var stopwatch = System.Diagnostics.Stopwatch.StartNew();
    Console.Write($"Indexing {Name}");

    #region PDF
    // Get the PDF document from a `FileStream` via `PdfReader`.
    // The using blocks release the file handle even when a page fails to parse.
    using (var stream = new FileStream(Location, FileMode.Open, FileAccess.Read))
    using (var reader = new Pdf.PdfReader(stream))
    using (var pdfDocument = new Pdf.PdfDocument(reader))
    {
        var totalPageNumber = pdfDocument.GetNumberOfPages();

        // Pages are addressed starting at 1.
        for (var i = 1; i <= totalPageNumber; i++)
        {
            var text = PdfTextExtractor.GetTextFromPage(pdfDocument.GetPage(i));
            this.AddToIndex(texts: text);
        }
    }
    #endregion

    Console.Write($" >==> {Thumbnail.Count} unique words. {stopwatch.Elapsed.TotalMilliseconds}ms\n");
}
/// <summary>
/// Copies the PDF at <c>PATH</c> to <c>PATH_PAGED</c>, writing a
/// "Strona i z n" page counter near the bottom-right corner of every page.
/// </summary>
public static void AddPages()
{
    // The using blocks release both files even when stamping fails part-way.
    using (PdfReader reader = new PdfReader(PATH))
    using (FileStream output = new FileStream(PATH_PAGED, FileMode.Create, FileAccess.Write))
    using (PdfWriter writer = new PdfWriter(output))
    using (PdfDocument pdfDocument = new PdfDocument(reader, writer))
    {
        Document doc = new Document(pdfDocument);
        int numberOfPages = pdfDocument.GetNumberOfPages();

        for (int i = 1; i <= numberOfPages; i++)
        {
            // Measure each page individually so the counter stays at the right
            // edge when the document mixes page sizes.
            var size = pdfDocument.GetPage(i).GetPageSize();

            // Write aligned text to the specified by parameters point
            doc.ShowTextAligned(
                new Paragraph("Strona " + i + " z " + numberOfPages),
                size.GetWidth() - 50,
                20,
                i,
                TextAlignment.RIGHT,
                VerticalAlignment.BOTTOM,
                0);
        }

        doc.Close();
    }
}
/// <summary>
/// Scales the image file to fit the area described by a button field's widget
/// rectangle, adds it to the document at a computed fixed position on the given
/// page, and clears the button's value.
/// </summary>
/// <param name="toSet">The form field to replace; must be a <c>PdfButtonFormField</c>.</param>
/// <param name="pdfDoc">The document the image is added to.</param>
/// <param name="filename">Path of the image file.</param>
/// <param name="pagina">Page number on which the image is placed.</param>
/// <exception cref="ArgumentException"><paramref name="toSet"/> is not a button field.</exception>
/// <exception cref="InvalidOperationException">The field's widgets expose fewer than four rectangle values.</exception>
private void SetLogo(PdfFormField toSet, iText.Kernel.Pdf.PdfDocument pdfDoc, string filename, int pagina)
{
    var b = toSet as PdfButtonFormField;
    if (b == null)
    {
        throw new ArgumentException("The form field must be a button field.", nameof(toSet));
    }

    // Rectangle entries of all widgets, flattened; only the first four values are used.
    var afmetingen = b.GetWidgets().SelectMany(f => f.GetRectangle()).ToArray();
    if (afmetingen.Length < 4)
    {
        throw new InvalidOperationException("The button field has no widget rectangle to place the image in.");
    }

    // The values are parsed from their text form, which uses '.' as the decimal separator
    // (the original code replaced '.' with ',' before parsing). Parsing with the invariant
    // culture gives the same result on every machine, not only on comma-decimal cultures.
    Func<int, int> readCoordinate = index =>
        (int)Convert.ToDouble(afmetingen[index].ToString(), System.Globalization.CultureInfo.InvariantCulture);

    var x = readCoordinate(0);
    if (x < 10)
    {
        x = 100;
    }

    var y = readCoordinate(1);

    var wWidth = readCoordinate(2);
    // NOTE(review): the width limit is taken from page 1 even when placing on another page — confirm intended.
    var pageWidth = (int)pdfDoc.GetPage(1).GetPageSizeWithRotation().GetWidth();
    if (wWidth > pageWidth - 20)
    {
        wWidth = pageWidth - 20;
    }

    var wHeight = readCoordinate(3);
    if (pagina == 1)
    {
        wHeight -= 10;
    }

    ImageData img = ImageDataFactory.Create(filename);
    var pdfImage = new iText.Layout.Element.Image(img);
    var scaled = pdfImage.ScaleToFit(wWidth, wHeight - y);
    var scaledWidth = scaled.GetImageScaledWidth();
    var scaledHeight = scaled.GetImageScaledHeight();

    // NOTE(review): this Document is never closed here — presumably because the caller still owns pdfDoc; confirm.
    Document d = new Document(pdfDoc);
    var berekendeX = (x + wWidth - scaledWidth) / 2;
    var berekendeY = (y + wHeight - scaledHeight) / 2;
    scaled.SetFixedPosition(pagina, berekendeX, berekendeY);
    d.Add(scaled);

    b.SetValue("");
}
/// <summary>
/// Extracts the text of all pages of a PDF and stores it in the <c>text</c> member.
/// </summary>
/// <param name="url">Location of the PDF, passed to <c>PdfReader</c>.</param>
/// <returns>
/// The concatenated page text, or the literal "Error reading file" when any
/// exception occurs while reading.
/// </returns>
public string GetTextFromPDF(string url)
{
    try
    {
        StringBuilder ster = new StringBuilder();

        // The using blocks release the reader and the document even when a page fails to parse.
        using (PdfReader read = new PdfReader(url))
        using (iText.Kernel.Pdf.PdfDocument doc = new iText.Kernel.Pdf.PdfDocument(read))
        {
            int pageCount = doc.GetNumberOfPages();

            // Page numbers start at 1. Starting at 0 made GetPage throw, so the
            // method always ended up in the catch block.
            for (int i = 1; i <= pageCount; i++)
            {
                ster.Append(PdfTextExtractor.GetTextFromPage(doc.GetPage(i)));
            }
        }

        text = ster.ToString();
        return text;
    }
    catch (Exception)
    {
        text = "Error reading file";
        return text;
    }
}
/// <summary>
/// Lazily enumerates the paragraphs of a portable document, page by page.
/// </summary>
/// <param name="file">The portable document file stream.</param>
/// <returns>
/// An <see cref="IEnumerable{T}"/> yielding each non-empty line of the cleaned page
/// text, split on <see cref="Environment.NewLine"/>.
/// </returns>
public static IEnumerable<string> Paragraphs(Stream file)
{
    using (var pdfReader = new iText7.PdfReader(file))
    using (var document = new iText7.PdfDocument(pdfReader))
    {
        string[] lineBreaks = { Environment.NewLine };
        int lastPage = document.GetNumberOfPages();

        for (int pageNumber = 1; pageNumber <= lastPage; pageNumber++)
        {
            string rawText = iText7.Canvas.Parser.PdfTextExtractor.GetTextFromPage(document.GetPage(pageNumber));
            string cleaned = Common.CleanPdfText(rawText);

            // Each non-empty line of the page is treated as one paragraph.
            string[] pieces = cleaned.Split(lineBreaks, StringSplitOptions.RemoveEmptyEntries);
            for (int index = 0; index < pieces.Length; index++)
            {
                yield return pieces[index];
            }
        }
    }
}
/// <summary>
/// Updates the links: every link annotation whose URI action points at a local file that
/// matches the input of one of <paramref name="htmlToPdfFiles"/> is replaced by an
/// in-document "go to page" action. The PDF is rewritten through a temporary file.
/// </summary>
/// <param name="pdfFilePath">The PDF file path.</param>
/// <param name="htmlToPdfFiles">The HTML to PDF files.</param>
/// <param name="logger">The logger.</param>
/// <exception cref="PdfPageNotFoundException">The page recorded for a linked file does not exist in the PDF.</exception>
internal static void UpdateLinks(
    string pdfFilePath,
    IReadOnlyCollection<HtmlToPdfFile> htmlToPdfFiles,
    ILogger logger)
{
    string tempFilePath = Path.GetTempFileName();

    try
    {
        using (PdfReader pdfReader = new PdfReader(pdfFilePath))
        using (PdfWriter pdfWriter = new PdfWriter(tempFilePath))
        using (iText.Kernel.Pdf.PdfDocument pdfDocument = new iText.Kernel.Pdf.PdfDocument(pdfReader, pdfWriter))
        {
            int pageCount = pdfDocument.GetNumberOfPages();
            for (int i = 1; i <= pageCount; i++)
            {
                // get page
                PdfPage pdfPage = pdfDocument.GetPage(i);

                // get link annotations
                IEnumerable<PdfLinkAnnotation> linkAnnotations = pdfPage.GetAnnotations().OfType<PdfLinkAnnotation>();
                foreach (PdfLinkAnnotation linkAnnotation in linkAnnotations)
                {
                    // get action
                    PdfDictionary action = linkAnnotation.GetAction();
                    if (action == null)
                    {
                        continue;
                    }

                    // Compare by value so the check does not depend on the parser
                    // returning the shared PdfName.URI instance.
                    PdfName s = action.GetAsName(PdfName.S);
                    if (!PdfName.URI.Equals(s))
                    {
                        continue;
                    }

                    PdfString uriPdfString = action.GetAsString(PdfName.URI);
                    if (uriPdfString == null)
                    {
                        // URI action without a string target: nothing to rewrite.
                        continue;
                    }

                    if (!Uri.TryCreate(uriPdfString.GetValue(), UriKind.RelativeOrAbsolute, out Uri uri))
                    {
                        continue;
                    }

                    // Uri.IsFile throws InvalidOperationException on relative URIs,
                    // so rule those out first.
                    if (!uri.IsAbsoluteUri || !uri.IsFile)
                    {
                        continue;
                    }

                    // Invariant lower-casing keeps the result independent of the current culture.
                    string htmlFilePath = uri.LocalPath.ToLowerInvariant();

                    // One case-insensitive lookup replaces the previous case-insensitive
                    // existence check followed by a case-sensitive Single(), which threw
                    // whenever an input path contained upper-case characters.
                    HtmlToPdfFile linkedHtmlToPdfFile = htmlToPdfFiles.FirstOrDefault(
                        x => string.Equals(x.Input, htmlFilePath, StringComparison.OrdinalIgnoreCase));
                    if (linkedHtmlToPdfFile == null)
                    {
                        // ex. when printing PDF from TOC.html by itself
                        logger.LogDebug($"Could not find '{htmlFilePath}'. Referenced in '{pdfFilePath}' on page {i}.");
                        continue;
                    }

                    int linkedPageNumber = linkedHtmlToPdfFile.OutputPdfFilePageNumber;

                    PdfPage linkedPage;
                    try
                    {
                        // http://api.itextpdf.com/itext/com/itextpdf/text/pdf/PdfDestination.html
                        linkedPage = pdfDocument.GetPage(linkedPageNumber);
                    }
                    catch (Exception ex)
                    {
                        throw new PdfPageNotFoundException(linkedPageNumber, linkedHtmlToPdfFile.Input, ex);
                    }

                    // Jump to the top edge of the linked page.
                    float top = linkedPage.GetPageSize().GetTop();
                    PdfExplicitDestination destination = PdfExplicitDestination.CreateFitH(linkedPage, top);
                    PdfAction newAction = PdfAction.CreateGoTo(destination);

                    linkAnnotation.SetAction(newAction);
                }
            }
        }
    }
    catch
    {
        // Do not leave the temporary file behind when rewriting failed.
        try
        {
            File.Delete(tempFilePath);
        }
        catch (IOException)
        {
            // Best effort only; the original failure is the one worth reporting.
        }
        catch (UnauthorizedAccessException)
        {
            // Best effort only; the original failure is the one worth reporting.
        }

        throw;
    }

    // Replace the original file with the rewritten copy.
    File.Delete(pdfFilePath);
    File.Move(tempFilePath, pdfFilePath);
}
/// <summary>
/// Finds and sets the page numbers of links mapped to HTML headings in the specified PDF file.
/// A link is considered when its URI action targets a local file and its query string carries
/// <c>headingLevel</c> and <c>headingText</c> values matching one of the file's headings.
/// </summary>
/// <param name="htmlToPdfFile">The HTML to PDF file.</param>
internal static void SetHeadingPageNumbers(HtmlToPdfFile htmlToPdfFile)
{
    using (PdfReader pdfReader = new PdfReader(htmlToPdfFile.PdfFilePath))
    using (iText.Kernel.Pdf.PdfDocument pdfDocument = new iText.Kernel.Pdf.PdfDocument(pdfReader))
    {
        int pageCount = pdfDocument.GetNumberOfPages();
        for (int i = 1; i <= pageCount; i++)
        {
            // get page
            PdfPage pdfPage = pdfDocument.GetPage(i);

            // get link annotations
            IEnumerable<PdfLinkAnnotation> linkAnnotations = pdfPage.GetAnnotations().OfType<PdfLinkAnnotation>();
            foreach (PdfLinkAnnotation linkAnnotation in linkAnnotations)
            {
                // get action
                PdfDictionary action = linkAnnotation.GetAction();
                if (action == null)
                {
                    continue;
                }

                // Compare by value so the check does not depend on the parser
                // returning the shared PdfName.URI instance.
                PdfName s = action.GetAsName(PdfName.S);
                if (!PdfName.URI.Equals(s))
                {
                    continue;
                }

                PdfString uriPdfString = action.GetAsString(PdfName.URI);
                if (uriPdfString == null)
                {
                    // URI action without a string target: nothing to inspect.
                    continue;
                }

                if (!Uri.TryCreate(uriPdfString.GetValue(), UriKind.RelativeOrAbsolute, out Uri uri))
                {
                    continue;
                }

                // Uri.IsFile and Uri.Query throw InvalidOperationException on relative
                // URIs, so rule those out first.
                if (!uri.IsAbsoluteUri || !uri.IsFile)
                {
                    continue;
                }

                // get query string
                NameValueCollection queryString = HttpUtility.ParseQueryString(uri.Query);

                // ex. ?headingLevel={level}&headingText
                string headingLevel = queryString["headingLevel"];
                if (headingLevel == null)
                {
                    continue;
                }

                if (!int.TryParse(headingLevel, out int level))
                {
                    continue;
                }

                string headingText = queryString["headingText"];
                if (headingText == null)
                {
                    continue;
                }

                // NOTE(review): SingleOrDefault throws when two headings share the same level
                // and text — confirm that such duplicates cannot occur.
                HtmlHeading htmlHeading = htmlToPdfFile.TitleAndHeadings.SingleOrDefault(x => (x.Level == level) && (x.Text == headingText));
                if (htmlHeading == null)
                {
                    continue;
                }

                // Record the page on which the heading's link was found.
                htmlHeading.Page = i;
            }
        }
    }
}
/// <summary>
/// Builds a PDF report with one section per entry of <paramref name="lista"/> (details,
/// answers, comments, risk box and downloaded image), stamps two logo images at the top of
/// every page, and returns the document as an "application/pdf" stream result.
/// </summary>
/// <param name="lista">The report entries to render.</param>
/// <param name="nombreAsada">Name appended to the report title.</param>
/// <returns>A <c>FileStreamResult</c> containing the generated PDF.</returns>
public ActionResult buildPDF(List<InformeResponse> lista, string nombreAsada)
{
    MemoryStream ms = new MemoryStream();
    PdfWriter pw = new PdfWriter(ms);
    PdfDocument pdfDocument = new PdfDocument(pw);
    // Created with immediateFlush = false; the page headers are added after all content below.
    Document doc = new Document(pdfDocument, PageSize.LETTER, false);

    // One client, disposed when done, serves every download instead of
    // leaking a new WebClient per image.
    using (WebClient webClient = new WebClient())
    {
        doc.Add(new Paragraph("Reporte " + nombreAsada).SetFontSize(20).SetTextAlignment(TextAlignment.CENTER).SetFontColor(new DeviceRgb(4, 124, 188)));

        foreach (InformeResponse item in lista)
        {
            Preguntas preguntasObj = TipoFormulario(item.tipo);

            doc.Add(new Paragraph(item.acueducto).SetFontSize(15).SetBold());
            doc.Add(new Paragraph("Fecha: " + item.fecha).SetFontSize(12));
            doc.Add(new Paragraph("Encargado: " + item.encargado).SetFontSize(12).SetPaddingBottom(2));
            doc.Add(new Paragraph("Respuestas ").SetFontSize(12).SetUnderline());

            // Missing or empty answer data produces an empty answers section instead of a crash.
            Dictionary<string, string> infra = string.IsNullOrWhiteSpace(item.infraestructura)
                ? null
                : JsonConvert.DeserializeObject<Dictionary<string, string>>(item.infraestructura);

            if (infra != null)
            {
                foreach (var kv in infra)
                {
                    // Map the answer key to its question text; other keys are not printed.
                    string linea;
                    switch (kv.Key)
                    {
                        case "P1": linea = preguntasObj.p1 + ": " + kv.Value; break;
                        case "P2": linea = preguntasObj.p2 + ": " + kv.Value; break;
                        case "P3": linea = preguntasObj.p3 + ": " + kv.Value; break;
                        case "P4": linea = preguntasObj.p4 + ": " + kv.Value; break;
                        case "P5": linea = preguntasObj.p5 + ": " + kv.Value; break;
                        case "P6": linea = preguntasObj.p6 + ": " + kv.Value; break;
                        case "P7": linea = preguntasObj.p7 + ": " + kv.Value; break;
                        case "P8": linea = preguntasObj.p8 + ": " + kv.Value; break;
                        case "P9": linea = preguntasObj.p9 + ": " + kv.Value; break;
                        default: continue;
                    }

                    doc.Add(new Paragraph(linea).SetFontSize(10));
                }
            }

            doc.Add(new Paragraph("Comentarios: " + item.comentarios).SetFontSize(12));
            doc.Add(new Paragraph("Tipo de formulario: " + preguntasObj.tipo).SetFontSize(12));

            // Risk box: border and background use the same colour returned by colorRiesgo.
            Cell cell = new Cell();
            cell.Add(new Paragraph("Riesgo " + item.riesgo).SetBorder(new SolidBorder(colorRiesgo(item.riesgo), 1)).SetBackgroundColor(colorRiesgo(item.riesgo)).SetTextAlignment(iText.Layout.Properties.TextAlignment.CENTER).SetFontSize(14).SetBold());
            doc.Add(cell);

            // Image of the report entry, downloaded and scaled to 40 %.
            byte[] data = webClient.DownloadData(item.imagen);
            Image image = new Image(ImageDataFactory.Create(data));
            doc.Add(image.Scale(0.4f, 0.4f).SetHorizontalAlignment(HorizontalAlignment.CENTER).SetMarginBottom(15).SetMarginTop(15));
        }

        // Logo downloaded from logoletra (SERSA logo per the original comment), scaled to 8 %.
        Image image2 = new Image(ImageDataFactory.Create(webClient.DownloadData(logoletra)));
        Paragraph header = new Paragraph("");
        header.Add(image2.Scale(0.08f, 0.08f).SetMarginBottom(15));

        // Logo downloaded from logotec (TEC logo per the original comment), scaled to 40 %.
        Image image3 = new Image(ImageDataFactory.Create(webClient.DownloadData(logotec)));
        Paragraph header2 = new Paragraph("");
        header2.Add(image3.Scale(0.4f, 0.4f)).SetMarginBottom(10);

        // Stamp both logos at the top of every page.
        int totalPages = pdfDocument.GetNumberOfPages();
        for (int i = 1; i <= totalPages; i++)
        {
            Rectangle pageSize = pdfDocument.GetPage(i).GetPageSize();
            float x1 = 20;
            float y1 = pageSize.GetTop() - 55;
            float x2 = pageSize.GetRight() - 30;
            float y2 = pageSize.GetTop() - 40;
            doc.ShowTextAligned(header, x1, y1, i, TextAlignment.LEFT, VerticalAlignment.BOTTOM, 0);
            doc.ShowTextAligned(header2, x2, y2, i, TextAlignment.RIGHT, VerticalAlignment.BOTTOM, 0);
        }
    }

    doc.Close();

    // Hand the result a fresh stream over a copy of the written bytes, positioned at 0.
    return new FileStreamResult(new MemoryStream(ms.ToArray()), "application/pdf");
}