/**
 * Displays a summary of the entries in the XObject dictionary for the stream.
 * For every entry that is not an image the decoded content bytes are appended as well.
 * Entries that cannot be resolved to a stream are reported instead of causing a
 * NullReferenceException.
 * @param resourceDic the resource dictionary for the stream
 * @return a string with the summary of the entries, or "No XObjects" when the
 *         dictionary has no XObject sub-dictionary
 * @throws IOException declared by the original contract (content bytes are read for non-image entries)
 * @since 5.0.2
 */
public static String GetXObjectDetail(PdfDictionary resourceDic)
{
    PdfDictionary xobjects = resourceDic.GetAsDict(PdfName.XOBJECT);
    if (xobjects == null)
    {
        return "No XObjects";
    }

    StringBuilder sb = new StringBuilder();
    foreach (PdfName entryName in xobjects.Keys)
    {
        PdfStream xobjectStream = xobjects.GetAsStream(entryName);
        if (xobjectStream == null)
        {
            // GetAsStream yields null when the entry is not a stream; report it and move on.
            sb.Append("------ " + entryName + " - not a stream ------\n");
            continue;
        }

        sb.Append("------ " + entryName + " - subtype = " + xobjectStream.Get(PdfName.SUBTYPE) + " = " + xobjectStream.GetAsNumber(PdfName.LENGTH) + " bytes ------\n");

        // PdfName.IMAGE is the receiver so that a missing /Subtype (null) does not throw.
        if (!PdfName.IMAGE.Equals(xobjectStream.Get(PdfName.SUBTYPE)))
        {
            byte[] contentBytes = ContentByteUtils.GetContentBytesFromContentObject(xobjectStream);
            foreach (byte b in contentBytes)
            {
                sb.Append((char)b);
            }

            sb.Append("------ " + entryName + " - subtype = " + xobjectStream.Get(PdfName.SUBTYPE) + "End of Content" + "------\n");
        }
    }

    return sb.ToString();
}
/**
 * Processes a form XObject: pushes a graphics state, applies the form's /Matrix
 * (when present) to the current transformation matrix, processes the form's
 * content with its own resources, and pops the graphics state again.
 * @param processor the content stream processor that is executing the Do operator
 * @param stream the form XObject stream
 * @param refi the indirect reference of the XObject (not used by this handler)
 */
public void HandleXObject(PdfContentStreamProcessor processor, PdfStream stream, PdfIndirectReference refi)
{
    PdfDictionary resources = stream.GetAsDict(PdfName.RESOURCES);

    // The content bytes are read before the graphics state is pushed, so a failure
    // while decoding the stream cannot leave the graphics state stack unbalanced.
    byte[] contentBytes = ContentByteUtils.GetContentBytesFromContentObject(stream);
    PdfArray matrix = stream.GetAsArray(PdfName.MATRIX);

    new PushGraphicsState().Invoke(processor, null, null);
    try
    {
        if (matrix != null)
        {
            float a = matrix.GetAsNumber(0).FloatValue;
            float b = matrix.GetAsNumber(1).FloatValue;
            float c = matrix.GetAsNumber(2).FloatValue;
            float d = matrix.GetAsNumber(3).FloatValue;
            float e = matrix.GetAsNumber(4).FloatValue;
            float f = matrix.GetAsNumber(5).FloatValue;
            Matrix formMatrix = new Matrix(a, b, c, d, e, f);
            processor.Gs().ctm = formMatrix.Multiply(processor.Gs().ctm);
        }

        processor.ProcessContent(contentBytes, resources);
    }
    finally
    {
        // Always undo the push above, even when the matrix is malformed or the
        // nested content fails, so the caller's graphics state stack stays balanced.
        new PopGraphicsState().Invoke(processor, null, null);
    }
}
/**
 * Writes a document containing a single underlined paragraph to OUTSPTRIMCT,
 * reads page 1 back through a MyTextRenderListener and asserts that the text
 * reported by the listener is 60 characters long.
 */
public virtual void SpaceTrimColumnTextTest()
{
    // --- write the document ---
    Document doc = new Document(PageSize.A4, 50, 30, 50, 30);
    PdfWriter writer = PdfWriter.GetInstance(doc, new FileStream(OUTSPTRIMCT, FileMode.Create));
    doc.Open();

    Phrase underlinedPhrase = new Phrase();
    underlinedPhrase.Font = new Font(Font.FontFamily.TIMES_ROMAN, 12, Font.UNDERLINE);
    underlinedPhrase.Add(new Chunk(" 1 1 9 "));

    Paragraph paragraph = new Paragraph(underlinedPhrase);
    paragraph.KeepTogether = true;
    doc.Add(paragraph);

    doc.Close();
    writer.Close();

    // --- read it back and extract the text of page 1 ---
    PdfReader reader = new PdfReader(OUTSPTRIMCT);
    MyTextRenderListener listener = new MyTextRenderListener();
    PdfContentStreamProcessor processor = new PdfContentStreamProcessor(listener);
    PdfDictionary resources = reader.GetPageN(1).GetAsDict(PdfName.RESOURCES);
    byte[] pageBytes = ContentByteUtils.GetContentBytesForPage(reader, 1);
    processor.ProcessContent(pageBytes, resources);

    bool hasExpectedLength = listener.GetText().Length == 60;
    Assert.IsTrue(hasExpectedLength, "Unexpected text length");
}
/**
 * Edits the immediate contents of a page, i.e. its content stream.
 * The existing /Contents entry is removed from the page and the original bytes
 * are passed to EditContent together with the page resources and the stamper's
 * under-content canvas for that page.
 * It explicitly does not descend into form xobjects, patterns, or annotations.
 * @param pdfStamper the stamper whose reader supplies the page
 * @param pageNum the number of the page to edit
 */
public void EditPage(PdfStamper pdfStamper, int pageNum)
{
    var sourceReader = pdfStamper.Reader;
    var pageDictionary = sourceReader.GetPageN(pageNum);

    // Capture the original content before the /Contents entry is dropped.
    var originalContent = ContentByteUtils.GetContentBytesForPage(sourceReader, pageNum);
    pageDictionary.Remove(PdfName.CONTENTS);

    var pageResources = pageDictionary.GetAsDict(PdfName.RESOURCES);
    var underCanvas = pdfStamper.GetUnderContent(pageNum);
    EditContent(originalContent, pageResources, underCanvas);
}
/**
 * Runs the content stream of one page through a PdfContentStreamProcessor that
 * reports to the given listener.
 * @param <E> the type of the renderListener - returning it makes call chaining easy
 * @param pageNumber the page number to process
 * @param renderListener the listener that will receive render callbacks
 * @return the very same renderListener instance that was passed in
 * @throws IOException if operations on the reader fail
 */
virtual public E ProcessContent <E>(int pageNumber, E renderListener) where E : IRenderListener
{
    PdfDictionary resources = reader.GetPageN(pageNumber).GetAsDict(PdfName.RESOURCES);
    PdfContentStreamProcessor contentProcessor = new PdfContentStreamProcessor(renderListener);

    byte[] pageBytes = ContentByteUtils.GetContentBytesForPage(reader, pageNumber);
    contentProcessor.ProcessContent(pageBytes, resources);

    return renderListener;
}
/**
 * Extracts the text of one page of a PDF file using a MyTextRenderListener.
 * @param pdfFilename path of the PDF file to open
 * @param pageNumber the page whose content stream is processed
 * @return the text collected by the listener for that page
 */
public static string ExtractText(string pdfFilename, int pageNumber)
{
    PdfReader reader = new PdfReader(pdfFilename);
    try
    {
        MyTextRenderListener listener = new MyTextRenderListener();
        PdfContentStreamProcessor processor = new PdfContentStreamProcessor(listener);
        PdfDictionary pageDic = reader.GetPageN(pageNumber);
        PdfDictionary resourcesDic = pageDic.GetAsDict(PdfName.RESOURCES);

        // The content bytes must come from the requested page; previously page 1 was
        // always read while the resources were taken from pageNumber.
        processor.ProcessContent(ContentByteUtils.GetContentBytesForPage(reader, pageNumber), resourcesDic);
        return listener.Text.ToString();
    }
    finally
    {
        // Release the underlying file even when parsing fails.
        reader.Close();
    }
}
/**
 * Feeds the content stream of a single page to the given extraction strategy
 * through a Test_iTextSharp.PdfContentStreamProcessor.
 * @param reader the reader that supplies the page
 * @param page the number of the page to process
 * @param strategy the strategy that receives the render callbacks
 */
public static void ProcessContentPage(PdfReader reader, int page, Test_iTextSharp.ITextExtractionStrategy strategy)
{
    // Constructed first, as before; it is not used any further in this method.
    PdfReaderContentParser parser = new PdfReaderContentParser(reader);

    PdfDictionary resources = reader.GetPageN(page).GetAsDict(PdfName.RESOURCES);
    Test_iTextSharp.PdfContentStreamProcessor contentProcessor =
        new Test_iTextSharp.PdfContentStreamProcessor(strategy);

    byte[] pageBytes = ContentByteUtils.GetContentBytesForPage(reader, page);
    contentProcessor.ProcessContent(pageBytes, resources);
}
/**
 * Processes the normal appearance stream (/AP /N) of the named signature field
 * with a MyImageRenderListener and returns the image held by that listener.
 * @param signatureName the name of the signature field in the reader's AcroFields
 * @return the listener's image after the appearance stream has been processed
 */
public PdfImageObject extractImage(String signatureName)
{
    MyImageRenderListener imageListener = new MyImageRenderListener();

    // Walk from the merged field dictionary down to the normal appearance stream.
    PdfDictionary mergedField = reader.AcroFields.GetFieldItem(signatureName).GetMerged(0);
    PdfStream normalAppearance = mergedField.GetAsDict(PdfName.AP).GetAsStream(PdfName.N);
    PdfDictionary resources = normalAppearance.GetAsDict(PdfName.RESOURCES);

    PdfContentStreamProcessor contentProcessor = new PdfContentStreamProcessor(imageListener);
    byte[] appearanceBytes = ContentByteUtils.GetContentBytesFromContentObject(normalAppearance);
    contentProcessor.ProcessContent(appearanceBytes, resources);

    return imageListener.image;
}
/**
 * Rewrites the content of a form XObject in place: the decoded content bytes are
 * passed through _modifier together with the form's resources and the result is
 * stored back into the stream.
 * @param stream the form XObject stream; must be a PRStream because the new
 *               bytes are written back with PRStream.SetData
 * @throws ArgumentException if the stream is null or not a PRStream
 */
private void Do_Form(PdfStream stream)
{
    // Validate up front: previously a non-PRStream surfaced as a NullReferenceException
    // only after the content had already been modified.
    PRStream prStream = stream as PRStream;
    if (prStream == null)
    {
        throw new System.ArgumentException("Form XObject stream must be a PRStream.", "stream");
    }

    PdfDictionary resources = stream.GetAsDict(PdfName.RESOURCES);
    byte[] contentBytes = ContentByteUtils.GetContentBytesFromContentObject(stream);
    contentBytes = _modifier.Modify(contentBytes, resources);
    prStream.SetData(contentBytes);
}
/**
 * Opens the PDF at the given path and runs the content stream of page 1 through
 * a PdfContentStreamProcessor backed by a DummyRenderListner, then closes the reader.
 * @param path the path of the PDF file to check
 */
private void CheckPageContent(String path)
{
    PdfReader pdfReader = new PdfReader(path);
    PdfDictionary firstPage = pdfReader.GetPageN(1);

    IRenderListener sink = new DummyRenderListner();
    PdfContentStreamProcessor contentProcessor = new PdfContentStreamProcessor(sink);

    PdfDictionary resources = firstPage.GetAsDict(PdfName.RESOURCES);
    byte[] pageBytes = ContentByteUtils.GetContentBytesForPage(pdfReader, 1);
    contentProcessor.ProcessContent(pageBytes, resources);

    pdfReader.Close();
}
/**
 * Processes page 1 of the source PDF with a MyTextRenderListener that writes to
 * the destination file.
 * @param src path of the PDF file to read
 * @param dest path of the output file (created or overwritten)
 */
public void extractSnippets(String src, String dest)
{
    // `using` guarantees the output file is flushed and closed even if parsing throws.
    using (TextWriter output = new StreamWriter(new FileStream(dest, FileMode.Create)))
    {
        PdfReader reader = new PdfReader(src);
        try
        {
            IRenderListener listener = new MyTextRenderListener(output);
            PdfContentStreamProcessor processor = new PdfContentStreamProcessor(listener);
            PdfDictionary pageDic = reader.GetPageN(1);
            PdfDictionary resourcesDic = pageDic.GetAsDict(PdfName.RESOURCES);
            processor.ProcessContent(ContentByteUtils.GetContentBytesForPage(reader, 1), resourcesDic);
            output.Flush();
        }
        finally
        {
            // Previously the reader stayed open whenever processing failed.
            reader.Close();
        }
    }
}
/**
 * Put all data from given day into given table.
 * Downloads the chart PDF for the given date, runs every page through an
 * SBTextRenderer and, when `pages` is non-empty afterwards, lets a DataHandler
 * process it on a worker thread (waiting for it to finish).
 * @param dateTime the day whose chart is fetched
 * @param m_dbConnection the database connection handed to the DataHandler
 */
public static void collectDataforDay(DateTime dateTime, SQLiteConnection m_dbConnection)
{
    // MM/DD/YYYY with zero-padded month and day, used as the DT query parameter.
    string date = string.Format("{0:00}/{1:00}/{2}", dateTime.Month, dateTime.Day, dateTime.Year);
    string url = "http://www.equibase.com/premium/eqbPDFChartPlus.cfm?RACE=A&BorP=P&TID=SAR&CTRY=USA&DT=" + date + "&DAY=D&STYLE=EQB";

    PdfReader reader;
    try
    {
        reader = new PdfReader(url);
    }
    catch (Exception)
    {
        // First attempt failed: prompt the operator, wait for two key presses, retry once.
        Console.WriteLine("CAPTCHA TIME");
        Console.ReadKey();
        Console.ReadKey();
        reader = new PdfReader(url);
    }

    try
    {
        StringBuilder builder = new StringBuilder();
        for (int x = 1; x <= reader.NumberOfPages; x++)
        {
            IRenderListener listener = new SBTextRenderer(builder);
            PdfContentStreamProcessor processor = new PdfContentStreamProcessor(listener);
            PdfDictionary pageDic = reader.GetPageN(x);
            PdfDictionary resourcesDic = pageDic.GetAsDict(PdfName.RESOURCES);
            processor.ProcessContent(ContentByteUtils.GetContentBytesForPage(reader, x), resourcesDic);
        }

        if (pages.Count != 0)
        {
            DataHandler handler = new DataHandler(dateTime, pages, m_dbConnection);
            Thread thread = new Thread(new ThreadStart(handler.extractPdfData));
            thread.Start();
            thread.Join();
            pages.Clear();
        }
        else
        {
            // If there were no races on this particular day, simply skip it.
            Console.WriteLine("Invalid Date: " + date);
        }
    }
    finally
    {
        // Previously the reader was disposed only on the success branch.
        reader.Dispose();
    }
}
/**
 * Processes page 1 of "WeirdHyphens.pdf" with a MyTextRenderListener that collects
 * text chunks and checks the chunk at index 18.
 */
virtual public void WeirdHyphensTest()
{
    PdfReader reader = TestResourceUtils.GetResourceAsPdfReader(TEST_RESOURCES_PATH, "WeirdHyphens.pdf");

    List <String> collectedChunks = new List <String>();
    IRenderListener chunkListener = new MyTextRenderListener(collectedChunks);
    PdfContentStreamProcessor contentProcessor = new PdfContentStreamProcessor(chunkListener);

    PdfDictionary resources = reader.GetPageN(1).GetAsDict(PdfName.RESOURCES);
    byte[] pageBytes = ContentByteUtils.GetContentBytesForPage(reader, 1);
    contentProcessor.ProcessContent(pageBytes, resources);

    /**
     * This assertion makes sure that the encoding has been read properly from the FontDescriptor.
     * If not, the value will be "\u0000 14".
     */
    Assert.AreEqual("\u0096 14", collectedChunks[18]);

    reader.Close();
}
/**
 * Processes the first page of "type3font_text.pdf" with a TextPositionRenderListener
 * and compares the first coordinate of the start and end point of the first reported
 * line segment with a reference segment, using a tolerance of 1 / 2f.
 */
public void TestType3FontWidth()
{
    String inFile = "type3font_text.pdf";
    LineSegment origLineSegment = new LineSegment(
        new Vector(20.3246f, 769.4974f, 1.0f),
        new Vector(151.22923f, 769.4974f, 1.0f));

    PdfReader reader = TestResourceUtils.GetResourceAsPdfReader(TEST_RESOURCES_PATH, inFile);
    TextPositionRenderListener renderListener = new TextPositionRenderListener();
    PdfContentStreamProcessor contentProcessor = new PdfContentStreamProcessor(renderListener);

    PdfDictionary resources = reader.GetPageN(FIRST_PAGE).GetAsDict(PdfName.RESOURCES);
    byte[] pageBytes = ContentByteUtils.GetContentBytesForPage(reader, FIRST_PAGE);
    contentProcessor.ProcessContent(pageBytes, resources);

    // Start point, then end point: first component only.
    Assert.AreEqual(
        renderListener.LineSegments[FIRST_ELEMENT_INDEX].GetStartPoint()[FIRST_ELEMENT_INDEX],
        origLineSegment.GetStartPoint()[FIRST_ELEMENT_INDEX],
        1 / 2f);
    Assert.AreEqual(
        renderListener.LineSegments[FIRST_ELEMENT_INDEX].GetEndPoint()[FIRST_ELEMENT_INDEX],
        origLineSegment.GetEndPoint()[FIRST_ELEMENT_INDEX],
        1 / 2f);
}
/**
 * Cleans up one page: the page's /Contents entry is removed and its original
 * content is re-processed through a PdfCleanUpRenderListener that is configured
 * with a filter built from the given locations and registered against the
 * stamper's under-content canvas. Afterwards the cleaned locations are coloured
 * and, when redact annotation references are tracked, DeleteRedactAnnots is
 * called for the page.
 * Does nothing when the location list is empty.
 * @param pageNum the number of the page to clean
 * @param cleanUpLocations the locations to clean on that page
 */
private void CleanUpPage(int pageNum, IList <PdfCleanUpLocation> cleanUpLocations)
{
    if (cleanUpLocations.Count == 0)
    {
        return;
    }

    PdfReader sourceReader = pdfStamper.Reader;
    PdfDictionary pageDictionary = sourceReader.GetPageN(pageNum);
    PdfContentByte underCanvas = pdfStamper.GetUnderContent(pageNum);

    // Keep the original bytes, then detach them from the page.
    byte[] originalContent = ContentByteUtils.GetContentBytesForPage(sourceReader, pageNum);
    pageDictionary.Remove(PdfName.CONTENTS);

    underCanvas.SaveState();

    PdfCleanUpRegionFilter regionFilter = CreateFilter(cleanUpLocations);
    PdfCleanUpRenderListener cleanUpListener = new PdfCleanUpRenderListener(pdfStamper, regionFilter);
    cleanUpListener.RegisterNewContext(sourceReader.GetPageResources(pageDictionary), underCanvas);

    PdfContentStreamProcessor contentProcessor = new PdfContentStreamProcessor(cleanUpListener);
    PdfCleanUpContentOperator.PopulateOperators(contentProcessor, cleanUpListener);
    contentProcessor.ProcessContent(originalContent, pageDictionary.GetAsDict(PdfName.RESOURCES));
    cleanUpListener.PopContext();

    underCanvas.RestoreState();

    ColorCleanedLocations(underCanvas, cleanUpLocations);

    // A non-null collection means we are in "extract locations from redact annots" mode.
    bool extractedFromRedactAnnots = redactAnnotIndirRefs != null;
    if (extractedFromRedactAnnots)
    {
        DeleteRedactAnnots(pageNum);
    }
}
/**
 * Opens the PDF at a fixed path and, for every page, prints the crop box width
 * and height, runs the page content through an SBTextRenderer, and prints every
 * value of the Columbs collection filled by a LocationTextExtractionStrategyEx2.
 * @param args command line arguments (not used)
 */
static void Main(string[] args)
{
    string fileName = @"D:\mobi.pdf";

    // Open the PDF document; the reader is disposed when the block exits.
    using (PdfReader reader = new PdfReader(fileName))
    {
        var sb = new TextRenderEx();

        for (int page = 1; page <= reader.NumberOfPages; page++)
        {
            var size = reader.GetCropBox(page);
            Console.WriteLine(size.Width);
            Console.WriteLine(size.Height);

            // Feed the page content to the render listener.
            IRenderListener listener = new SBTextRenderer(sb);
            PdfContentStreamProcessor processor = new PdfContentStreamProcessor(listener);
            PdfDictionary pageDic = reader.GetPageN(page);
            PdfDictionary resourcesDic = pageDic.GetAsDict(PdfName.RESOURCES);
            processor.ProcessContent(ContentByteUtils.GetContentBytesForPage(reader, page), resourcesDic);

            // The extraction call is made for its effect on the strategy; the returned
            // text itself is not needed here.
            var its = new LocationTextExtractionStrategyEx2();
            PdfTextExtractor.GetTextFromPage(reader, page, its);

            foreach (var t in its.Columbs.Values)
            {
                Console.WriteLine(t.ToString());
            }
        }
    }
}
/**
 * Step one of the import: reads the selected PDF into a StringBuilder, records
 * per-page text and page headers, locates the start and end markers of the data
 * section, copies the data between them (skipping "TOTA" ... "USG" runs) and
 * passes the result to LlenarGrid.
 * @param sender the control that raised the event
 * @param e the event data
 */
private void RunBtn_Click(object sender, EventArgs e)
{
    if (TxtUrl.Text == "") // nothing has been selected yet
    {
        MessageBox.Show("Please upload a PDF file first !!");
        return;
    }

    progressBar1.Visible = true;
    this.timer1.Start();

    StringBuilder builder = new StringBuilder(); // the document text is accumulated here
    PdfReader reader = new PdfReader(TxtUrl.Text.ToString()); // reads the file at the selected location
    try
    {
        for (int x = 1; x <= reader.NumberOfPages; x++)
        {
            IRenderListener listener = new SBTextRenderer(builder);
            PdfContentStreamProcessor processor = new PdfContentStreamProcessor(listener);
            PdfDictionary pageDic = reader.GetPageN(x);
            PdfDictionary resourcesDic = pageDic.GetAsDict(PdfName.RESOURCES);
            processor.ProcessContent(ContentByteUtils.GetContentBytesForPage(reader, x), resourcesDic);

            string accumulated = builder.ToString();
            if (listPages.Count != 0)
            {
                // NOTE(review): listPages is not cleared in this method; if it already holds
                // entries when x == 1, the index x - 2 is -1 - confirm it is reset elsewhere.
                string previous = listPages[x - 2].ToString();

                // string.Replace throws on an empty search value, so an empty previous
                // page leaves the accumulated text untouched.
                listPages.Add(previous.Length == 0 ? accumulated : accumulated.Replace(previous, ""));
            }
            else
            {
                listPages.Add(accumulated);
            }

            if (x == reader.NumberOfPages)
            {
                foreach (string pagina in listPages)
                {
                    // The header of a page ends 7 characters after the first "es Da".
                    int marker = pagina.IndexOf("es Da", StringComparison.Ordinal);
                    if (marker >= 0)
                    {
                        int index = Math.Min(marker + 7, pagina.Length);
                        listHeaders.Add(pagina.Substring(0, index));
                    }
                }
            }
        } // end of the PDF-to-string conversion loop
    }
    finally
    {
        // The reader was never closed before.
        reader.Close();
    }

    // The builder is not modified past this point, so materialise it once instead of
    // calling ToString() for every single character access.
    string text = builder.ToString();

    StringBuilder comenzar = new StringBuilder();
    int Inicio = 0; // position in the text from which characters may be copied
    int final = 0;  // position of the end marker
    int scannedLength = text.Length; // number of characters visited by the scan below

    for (int i = 0; i < text.Length; i++)
    {
        // "ATP" or "Base:" marks where copying starts; only the first hit counts.
        if (!banderaInicio && (StartsWithAt(text, i, "ATP") || StartsWithAt(text, i, "Base:")))
        {
            Inicio = i + 2; // skip the two characters "TP" of "ATP"
            banderaInicio = true;

            // Detect the airport name: the text between a ':' and a following digit.
            for (int r = i; r < text.Length; r++)
            {
                if (text[r] == ':')
                {
                    inicioAirport = r + 1;
                }
                else if (char.IsDigit(text[r]))
                {
                    finalAirport = r;
                }

                if (inicioAirport != 0 && finalAirport != 0 && airport == "")
                {
                    if (finalAirport < inicioAirport)
                    {
                        // The last digit seen precedes the colon; keep scanning rather
                        // than calling Substring with a negative length.
                        continue;
                    }

                    airport = text.Substring(inicioAirport, finalAirport - inicioAirport);
                    break;
                }
            }
        }

        // End markers. A leftover debugging branch used to skip this check at i == 2900.
        if (StartsWithAt(text, i, "UCT 221") || StartsWithAt(text, i, "TTO 221"))
        {
            final = i;
            scannedLength = i + 1;
            break;
        }
    } // end of the start/end detection loop

    // The header is everything visited by the scan above (up to and including the
    // end marker position); it is removed from the copied text further down.
    encabezado = encabezado + text.Substring(0, scannedLength);

    bool bandera = false;
    for (int i = Inicio; i < final; i++)
    {
        // Copying starts at the first digit.
        if (!bandera && char.IsDigit(text[i]))
        {
            bandera = true;
        }

        if (!bandera)
        {
            continue;
        }

        if (StartsWithAt(text, i, "TOTA"))
        {
            // Jump behind the next "USG" that starts at least 18 characters ahead.
            for (int y = i + 18; y < text.Length; y++)
            {
                if (StartsWithAt(text, y, "USG"))
                {
                    i = y + 2;
                    break;
                }
            }
        }
        else if (StartsWithAt(text, i, "Airport"))
        {
            break;
        }
        else
        {
            comenzar.Append(text[i]);
        }
    }

    encabezado = encabezado.Replace("DateA", "Date");

    string nuevo = comenzar.ToString();
    if (encabezado.Length != 0)
    {
        // string.Replace throws on an empty search value.
        nuevo = nuevo.Replace(encabezado, "");
    }

    LlenarGrid(nuevo);
}

// Returns true when `token` occurs in `text` exactly at `index`; never reads past the end of `text`.
private static bool StartsWithAt(string text, int index, string token)
{
    return index >= 0
        && index + token.Length <= text.Length
        && string.CompareOrdinal(text, index, token, 0, token.Length) == 0;
}