/// <summary>
/// Strips the named optional-content (OCG) layers from a PDF document: page content,
/// annotations and properties are processed per page, then the catalog's
/// OCProperties arrays are cleaned and unused objects are removed.
/// </summary>
/// <param name="reader">a PdfReader containing a PDF document</param>
/// <param name="layers">a sequence of names of OCG layers</param>
/// <exception cref="IOException"></exception>
public virtual void RemoveLayers(PdfReader reader, params string[] layers) {
    int pageCount = reader.NumberOfPages;

    // Re-set every page's content to the bytes the reader returns for it.
    for (int pageNo = 1; pageNo <= pageCount; pageNo++) {
        byte[] content = reader.GetPageContent(pageNo);
        reader.SetPageContent(pageNo, content);
    }

    ICollection<string> ocgs = new HashSet2<string>();
    foreach (string layerName in layers) {
        ocgs.Add(layerName);
    }

    OCGParser parser = new OCGParser(ocgs);
    for (int pageNo = 1; pageNo <= pageCount; pageNo++) {
        PdfDictionary pageDict = reader.GetPageN(pageNo);
        Parse(parser, pageDict);
        pageDict.Remove(new PdfName("PieceInfo"));
        RemoveAnnots(pageDict, ocgs);
        RemoveProperties(pageDict, ocgs);
    }

    PdfDictionary ocProps = reader.Catalog.GetAsDict(PdfName.OCPROPERTIES);
    if (ocProps != null) {
        RemoveOCGsFromArray(ocProps, PdfName.OCGS, ocgs);
        PdfDictionary defaults = ocProps.GetAsDict(PdfName.D);
        if (defaults != null) {
            // Same keys, same order as the individual calls they replace.
            PdfName[] arrayKeys = {
                PdfName.ON, PdfName.OFF, PdfName.LOCKED,
                PdfName.RBGROUPS, PdfName.ORDER, PdfName.AS
            };
            foreach (PdfName key in arrayKeys) {
                RemoveOCGsFromArray(defaults, key, ocgs);
            }
        }
    }

    reader.RemoveUnusedObjects();
}
/// <summary>
/// Reports whether the text extracted from any page of a PDF contains a given string.
/// </summary>
/// <param name="inFileName">File name handed to the PdfReader constructor.</param>
/// <param name="textToFind">Text searched for in each page's extracted text.</param>
/// <returns>
/// <c>true</c> as soon as one page's extracted text contains <paramref name="textToFind"/>;
/// <c>false</c> when no page matches or when any exception is thrown along the way.
/// </returns>
public bool IsTextInPdf(string inFileName, string textToFind)
{
    try
    {
        using (PdfReader pdf = new PdfReader(inFileName))
        {
            int pageNo = 1;
            while (pageNo <= pdf.NumberOfPages)
            {
                string pageText = ExtractTextFromPDFBytes(pdf.GetPageContent(pageNo));
                if (pageText.IndexOf(textToFind) >= 0)
                {
                    return true;
                }
                pageNo++;
            }
        }
        return false;
    }
    catch
    {
        // Every failure (unreadable file, extraction error, ...) is reported as "not found".
        return false;
    }
}
/// <summary>
/// Compress a pdf
/// </summary>
/// <param name="base64Pdf">A small model whose <c>data</c> property holds the base64 encoded pdf</param>
/// <returns>
/// A JSON result carrying a <c>Base64Pdf</c> whose <c>data</c> is the re-written pdf in base64;
/// a bad-request result when the model or its data is missing or is not valid base64;
/// an internal-server-error result for any other failure.
/// </returns>
public IHttpActionResult Post(Base64Pdf base64Pdf)
{
    // The model itself may be null (not only its data), so guard both before dereferencing.
    if (base64Pdf == null || base64Pdf.data == null)
        return BadRequest("Check supplied pdf model");

    byte[] data;
    try
    {
        data = Convert.FromBase64String(base64Pdf.data);
    }
    catch (FormatException)
    {
        // Malformed base64 is a client error, not a server fault.
        return BadRequest("Check supplied pdf model");
    }

    try
    {
        byte[] compressedData;
        using (var memStream = new MemoryStream())
        {
            var reader = new PdfReader(data);
            try
            {
                var stamper = new PdfStamper(reader, memStream, PdfWriter.VERSION_1_4);
                var pageNum = reader.NumberOfPages;
                for (var i = 1; i <= pageNum; i++)
                    reader.SetPageContent(i, reader.GetPageContent(i));
                stamper.SetFullCompression();
                // The stamper must be closed before the stream is read back.
                stamper.Close();
            }
            finally
            {
                // Release the reader even when stamping fails part-way.
                reader.Close();
            }
            compressedData = memStream.ToArray();
        }

        var compressedBase64 = Convert.ToBase64String(compressedData);
        return Json(new Base64Pdf { data = compressedBase64 });
    }
    catch (Exception ex)
    {
        return InternalServerError(ex);
    }
}
/// <summary>
/// Builds a form XObject stream from the content stream of a page.
/// @since 2.1.3 (the method already existed without param compressionLevel)
/// </summary>
/// <param name="pageNumber">the page of which you want the stream</param>
/// <param name="compressionLevel">the compression level you want to apply to the stream (this body does not read it)</param>
/// <returns>a PdfStream object</returns>
internal PdfStream GetFormXObject(int pageNumber, int compressionLevel)
{
    var pageDict = reader.GetPageNRelease(pageNumber);
    var pageContents = PdfReader.GetPdfObjectRelease(pageDict.Get(PdfName.Contents));
    var formDict = new PdfDictionary();

    // rawContent stays null only when the page content is itself a stream;
    // in that case the stream's dictionary entries are merged into formDict.
    byte[] rawContent;
    if (pageContents == null)
    {
        rawContent = new byte[0];
    }
    else if (pageContents.IsStream())
    {
        rawContent = null;
        formDict.Merge((PrStream)pageContents);
    }
    else
    {
        rawContent = reader.GetPageContent(pageNumber, File);
    }

    formDict.Put(PdfName.Resources, PdfReader.GetPdfObjectRelease(pageDict.Get(PdfName.Resources)));
    formDict.Put(PdfName.TYPE, PdfName.Xobject);
    formDict.Put(PdfName.Subtype, PdfName.Form);

    var imported = (PdfImportedPage)ImportedPages[pageNumber];
    formDict.Put(PdfName.Bbox, new PdfRectangle(imported.BoundingBox));

    var transform = imported.Matrix;
    if (transform != null)
    {
        formDict.Put(PdfName.Matrix, transform);
    }
    else
    {
        formDict.Put(PdfName.Matrix, Identitymatrix);
    }
    formDict.Put(PdfName.Formtype, One);

    if (rawContent != null)
    {
        PrStream rebuilt = new PrStream(reader, rawContent);
        rebuilt.Merge(formDict);
        return rebuilt;
    }

    return new PrStream((PrStream)pageContents, formDict);
}
/// <summary>
/// Converts two copies of the same .docx into a single PDF and checks that the result
/// has exactly two pages whose content streams have the same length.
/// </summary>
public void TestMultipleDocuments()
{
    string documentPath = Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "test_files\\documents\\document.docx");
    byte[] testFile1 = File.ReadAllBytes(documentPath);
    byte[] testFile2 = File.ReadAllBytes(documentPath);

    Dictionary<string, byte[]> files = new Dictionary<string, byte[]>();
    files.Add("document1.docx", testFile1);
    files.Add("document2.docx", testFile2);

    PdfConverter converter = new PdfConverter();
    byte[] pdf = converter.ConvertFiles(files, new ConversionOptions());
    Assert.IsNotNull(pdf);

    PdfReader reader = new PdfReader(pdf);
    try
    {
        // Check the page count first so a wrong count fails with a clear message
        // instead of an exception from reading a missing page.
        Assert.AreEqual(2, reader.NumberOfPages);

        byte[] page1 = reader.GetPageContent(1);
        byte[] page2 = reader.GetPageContent(2);

        // Both inputs are the same document, so the two pages must match in size.
        // (The previous assertion compared page1 with itself and could never fail.)
        Assert.AreEqual(page1.Length, page2.Length);
    }
    finally
    {
        reader.Close();
    }
}
/// <summary>
/// Concatenates the string tokens found in the first page's content stream, followed by
/// every document-info entry formatted as "key => value".
/// </summary>
/// <param name="filePath">File name handed to the PdfReader constructor.</param>
/// <returns>The concatenated text; no separators are inserted between the pieces.</returns>
public string ParsePdf(string filePath)
{
    StringBuilder text = new StringBuilder();
    PdfReader reader = new iTextSharp.text.pdf.PdfReader(filePath);
    try
    {
        // Only string tokens of page 1 contribute to the result.
        byte[] streamBytes = reader.GetPageContent(1);
        PRTokeniser tokenizer = new PRTokeniser(streamBytes);
        while (tokenizer.NextToken())
        {
            if (tokenizer.TokenType == PRTokeniser.TK_STRING)
            {
                text.Append(tokenizer.StringValue);
            }
        }

        // file properties
        Hashtable infoHash = reader.Info;
        foreach (string key in infoHash.Keys)
        {
            text.Append(key).Append(" => ").Append(infoHash[key]);
        }
    }
    finally
    {
        reader.Close();
    }

    // The earlier version also re-read the whole file and decoded the page bytes several
    // times into locals that were never used; one of those decodes asked for
    // (length - 1) bytes and therefore threw on an empty content stream.
    return text.ToString();
}
/**
 * Builds a form XObject stream from the content stream of a page.
 * @param pageNumber the page of which you want the stream
 * @param compressionLevel the compression level you want to apply to the stream
 *        (this body does not read it)
 * @return a PdfStream object
 * @since 2.1.3 (the method already existed without param compressionLevel)
 */
internal PdfStream GetFormXObject(int pageNumber, int compressionLevel) {
    PdfDictionary pageDict = reader.GetPageNRelease(pageNumber);
    PdfObject pageContents = PdfReader.GetPdfObjectRelease(pageDict.Get(PdfName.CONTENTS));
    PdfDictionary formDict = new PdfDictionary();

    // A null byte array means "reuse the existing content stream as-is".
    byte[] contentBytes = null;
    bool hasContents = pageContents != null;
    if (!hasContents) {
        contentBytes = new byte[0];
    }
    else if (!pageContents.IsStream()) {
        contentBytes = reader.GetPageContent(pageNumber, file);
    }
    else {
        formDict.Merge((PRStream)pageContents);
    }

    PdfObject resources = PdfReader.GetPdfObjectRelease(pageDict.Get(PdfName.RESOURCES));
    formDict.Put(PdfName.RESOURCES, resources);
    formDict.Put(PdfName.TYPE, PdfName.XOBJECT);
    formDict.Put(PdfName.SUBTYPE, PdfName.FORM);

    PdfImportedPage imported = importedPages[pageNumber];
    formDict.Put(PdfName.BBOX, new PdfRectangle(imported.BoundingBox));

    PdfArray pageMatrix = imported.Matrix;
    if (pageMatrix != null) {
        formDict.Put(PdfName.MATRIX, pageMatrix);
    }
    else {
        formDict.Put(PdfName.MATRIX, IDENTITYMATRIX);
    }
    formDict.Put(PdfName.FORMTYPE, ONE);

    if (contentBytes == null) {
        return new PRStream((PRStream)pageContents, formDict);
    }

    PRStream rebuilt = new PRStream(reader, contentBytes);
    rebuilt.Merge(formDict);
    return rebuilt;
}
/// <summary>
/// Re-sets each page's content on a stamped copy of "in.pdf" after eliminating shared
/// streams, then compares the output against "cmp_out1.pdf" by content.
/// </summary>
public void SetPageContentTest01()
{
    String resultPath = DestFolder + "out1.pdf";

    PdfReader pdf = new PdfReader(TestResourceUtils.GetResourceAsStream(TestResourcesPath, "in.pdf"));
    PdfStamper stamper = new PdfStamper(pdf, new FileStream(resultPath, FileMode.Create));
    pdf.EliminateSharedStreams();

    int pageCount = pdf.NumberOfPages;
    for (int pageNo = 1; pageNo <= pageCount; pageNo++)
    {
        pdf.SetPageContent(pageNo, pdf.GetPageContent(pageNo));
    }

    stamper.Close();

    // The assertion expects CompareByContent to return null for matching documents.
    Assert.Null(
        new CompareTool().CompareByContent(
            resultPath,
            TestResourceUtils.GetResourceAsTempFile(TestResourcesPath, "cmp_out1.pdf"),
            DestFolder,
            "diff_"));
}
/// <summary>
/// Reads every page of the PDF at <c>_filePath</c>, re-encodes the raw page content from
/// the default encoding to UTF-8, and passes the resulting text through
/// <c>GetDataConvertedData</c>.
/// </summary>
/// <returns>One converted entry per page, in page order.</returns>
public static List<String> Read()
{
    var reader = new PdfReader(_filePath);
    var result = new List<String>();

    for (int pageNo = 1; pageNo <= reader.NumberOfPages; pageNo++)
    {
        byte[] rawContent = reader.GetPageContent(pageNo);
        byte[] utf8Content = Encoding.Convert(Encoding.Default, Encoding.UTF8, rawContent);
        string pageText = Encoding.UTF8.GetString(utf8Content);
        result.Add(GetDataConvertedData(pageText));
    }

    return result;
}
/// <summary> Extracts the text of every page of a PDF file and writes it to a UTF-8 text file </summary>
/// <param name="PDF_In_Name">Full path to the pdf file</param>
/// <param name="Text_Out_Name">Output file name for the extracted text </param>
/// <returns>TRUE if successful, otherwise FALSE</returns>
/// <remarks>
/// Pages whose extraction throws are skipped silently; pages whose text is blank are not written.
/// </remarks>
public static bool Extract_Text(string PDF_In_Name, string Text_Out_Name)
{
    PdfReader pdf = null;
    StreamWriter writer = null;
    bool succeeded = false;

    try
    {
        pdf = new PdfReader(PDF_In_Name);
        writer = new StreamWriter(Text_Out_Name, false, Encoding.UTF8);

        for (int pageNo = 1; pageNo <= pdf.NumberOfPages; pageNo++)
        {
            try
            {
                string pageText = ExtractTextFromPDFBytes(pdf.GetPageContent(pageNo));
                if (pageText.Trim().Length == 0)
                {
                    continue;
                }

                writer.WriteLine();
                writer.WriteLine("PAGE " + pageNo);
                writer.WriteLine();
                writer.WriteLine(pageText);
            }
            catch
            {
                // Best effort: a failing page does not abort the whole extraction.
            }
        }

        succeeded = true;
    }
    catch
    {
        // Opening the reader or the writer failed; reported through the return value.
    }
    finally
    {
        if (writer != null)
        {
            writer.Close();
        }
        if (pdf != null)
        {
            pdf.Close();
        }
    }

    return succeeded;
}
// ---------------------------------------------------------------------------
/**
 * Builds a text report for every page of a PDF: the page dictionary, an XObject
 * summary, the raw content stream and the extracted text.
 * @param pdf the original PDF
 *
 * this method uses code from;
 * PdfContentReaderTool.ListContentStreamForPage()
 * so a byte array can be passed in instead of a file path
 * @return the report text
 */
public string InspectPdf(byte[] pdf) {
    PdfReader reader = new PdfReader(pdf);
    StringBuilder report = new StringBuilder();
    int lastPage = reader.NumberOfPages;

    for (int pageNo = 1; pageNo <= lastPage; pageNo++) {
        report.AppendLine("==============Page " + pageNo + "====================");

        report.AppendLine("- - - - - Dictionary - - - - - -");
        PdfDictionary pageDict = reader.GetPageN(pageNo);
        report.AppendLine(PdfContentReaderTool.GetDictionaryDetail(pageDict));

        report.AppendLine("- - - - - XObject Summary - - - - - -");
        PdfDictionary resources = pageDict.GetAsDict(PdfName.RESOURCES);
        report.AppendLine(PdfContentReaderTool.GetXObjectDetail(resources));

        report.AppendLine("- - - - - Content Stream - - - - - -");
        RandomAccessFileOrArray source = reader.SafeFile;
        byte[] content = reader.GetPageContent(pageNo, source);
        source.Close();
        // Each byte is appended as the char with the same numeric value.
        for (int k = 0; k < content.Length; k++) {
            report.Append((char)content[k]);
        }

        report.AppendLine("- - - - - Text Extraction - - - - - -");
        String pageText = PdfTextExtractor.GetTextFromPage(
            reader, pageNo, new LocationTextExtractionStrategy());
        if (pageText.Length == 0) {
            report.AppendLine("No text found on page " + pageNo);
        }
        else {
            report.AppendLine(pageText);
        }
        report.AppendLine();
    }

    return report.ToString();
}
/// <summary>
/// Extracts the text of pages 1 through <paramref name="topage"/> of a PDF file.
/// </summary>
/// <param name="inFileName">the full path to the pdf file.</param>
/// <param name="topage">the last page (1-based, inclusive) to extract.</param>
/// <returns>
/// the extracted text, each page followed by a single space; null when no page was
/// extracted (topage below 1) or when any exception occurred (the file name is then
/// appended to "log_extract.txt").
/// </returns>
public String ExtractText(string inFileName, int topage)
{
    PdfReader reader = null;
    try
    {
        // Create a reader for the given PDF file
        reader = new PdfReader(inFileName);

        Console.Write("Processing: ");

        // StringBuilder avoids re-copying the accumulated text for every page.
        System.Text.StringBuilder outputText = new System.Text.StringBuilder();
        for (int page = 1; page <= topage; page++)
        {
            outputText.Append(ExtractTextFromPDFBytes(reader.GetPageContent(page))).Append(" ");
        }

        return outputText.Length == 0 ? null : outputText.ToString();
    }
    catch
    {
        File.AppendAllText("log_extract.txt", DateTime.Now.ToShortDateString() + " " + DateTime.Now.ToShortTimeString() + ": " + inFileName + Environment.NewLine);
        return null;
    }
    finally
    {
        if (reader != null) reader.Close();
    }
}
// ---------------------------------------------------------------------------
/**
 * Re-writes a PDF through a PdfStamper (PDF version 1.5) with writer compression
 * level 9 and full compression requested, re-setting every page's content.
 * @param src the original PDF
 * @return the bytes written by the stamper
 */
public byte[] CompressPdf(byte[] src) {
    PdfReader pdf = new PdfReader(src);
    using (MemoryStream output = new MemoryStream()) {
        using (PdfStamper stamper = new PdfStamper(pdf, output, PdfWriter.VERSION_1_5)) {
            stamper.Writer.CompressionLevel = 9;

            int pageCount = pdf.NumberOfPages;
            for (int pageNo = 1; pageNo <= pageCount; pageNo++) {
                byte[] content = pdf.GetPageContent(pageNo);
                pdf.SetPageContent(pageNo, content);
            }

            stamper.SetFullCompression();
        }
        // Read the stream only after the stamper has been disposed.
        return output.ToArray();
    }
}
/// <summary>
/// Extracts the text of every page of a PDF file into a UTF-8 text file, printing a
/// '#' progress bar of up to 68 characters to the console.
/// </summary>
/// <param name="inFileName">the full path to the pdf file.</param>
/// <param name="outFileName">the output file name.</param>
/// <returns>true when extraction completed; false when any exception occurred</returns>
public bool ExtractText(string inFileName, string outFileName)
{
    StreamWriter outFile = null;
    PdfReader reader = null;
    try
    {
        // Create a reader for the given PDF file
        reader = new PdfReader(inFileName);
        outFile = new StreamWriter(outFileName, false, System.Text.Encoding.UTF8);

        Console.Write("Processing: ");

        int totalLen = 68;
        // Number of '#' characters each page is worth; may be fractional.
        float charUnit = ((float)totalLen) / (float)reader.NumberOfPages;
        int totalWritten = 0;
        float curUnit = 0;

        for (int page = 1; page <= reader.NumberOfPages; page++)
        {
            outFile.Write(ExtractTextFromPDFBytes(reader.GetPageContent(page)) + " ");

            // Write the progress.
            if (charUnit >= 1.0f)
            {
                for (int i = 0; i < (int)charUnit; i++)
                {
                    Console.Write("#");
                    totalWritten++;
                }
            }
            else
            {
                // Accumulate fractions until at least one whole '#' is due.
                curUnit += charUnit;
                if (curUnit >= 1.0f)
                {
                    for (int i = 0; i < (int)curUnit; i++)
                    {
                        Console.Write("#");
                        totalWritten++;
                    }
                    curUnit = 0;
                }
            }
        }

        // Pad the bar to its full width.
        if (totalWritten < totalLen)
        {
            for (int i = 0; i < (totalLen - totalWritten); i++)
            {
                Console.Write("#");
            }
        }
        return true;
    }
    catch (Exception)
    {
        return false;
    }
    finally
    {
        if (outFile != null) outFile.Close();
        // The reader was previously never closed.
        if (reader != null) reader.Close();
    }
}
/// <summary>
/// Compares the content streams of a run of pages in one PDF with the run of pages
/// starting at <paramref name="secondStartPage"/> in another PDF.
/// </summary>
/// <param name="firstPdf">Path of the first PDF.</param>
/// <param name="firstStartPage">First page of the first PDF to compare.</param>
/// <param name="firstLastPage">
/// Upper bound of the page range in the first PDF; the loop stops BEFORE this page.
/// NOTE(review): the name suggests an inclusive bound — confirm against callers.
/// </param>
/// <param name="secondPdf">Path of the second PDF.</param>
/// <param name="secondStartPage">Page of the second PDF matched against <paramref name="firstStartPage"/>.</param>
/// <returns>true when every compared pair of page contents is byte-for-byte equal.</returns>
private bool ArePagesIdentical(String firstPdf, int firstStartPage, int firstLastPage, String secondPdf, int secondStartPage)
{
    iTextSharpPDF.PdfReader firstPdfReader = null;
    iTextSharpPDF.PdfReader secondPdfReader = null;
    try
    {
        // Constructed inside the try so the first reader is closed even if opening the second fails.
        firstPdfReader = new iTextSharpPDF.PdfReader(new iTextSharpPDF.RandomAccessFileOrArray(firstPdf), null);
        secondPdfReader = new iTextSharpPDF.PdfReader(new iTextSharpPDF.RandomAccessFileOrArray(secondPdf), null);

        int secondPdfPage = secondStartPage;
        for (int currentFirstPage = firstStartPage; currentFirstPage < firstLastPage; currentFirstPage++, secondPdfPage++)
        {
            byte[] firstContent = firstPdfReader.GetPageContent(currentFirstPage);
            byte[] secondContent = secondPdfReader.GetPageContent(secondPdfPage);

            // Compare the bytes directly. The previous check compared only the first
            // 32 bits of an MD5 digest, so different pages could be reported as identical.
            if (firstContent.Length != secondContent.Length)
            {
                return false;
            }
            for (int i = 0; i < firstContent.Length; i++)
            {
                if (firstContent[i] != secondContent[i])
                {
                    return false;
                }
            }
        }

        return true;
    }
    finally
    {
        if (firstPdfReader != null) firstPdfReader.Close();
        if (secondPdfReader != null) secondPdfReader.Close();
    }
}
/// <summary>
/// Scans every page of a PDF for <paramref name="memberNumber"/> enclosed in "\n\r"
/// sequences and, when a qualifying page is found and an output file is given,
/// extracts that page via <c>ExtractPages</c>.
/// </summary>
/// <param name="inFileName">the full path to the pdf file.</param>
/// <param name="outputFile">target file for the matching page; when null or empty nothing is extracted.</param>
/// <param name="memberNumber">the member number to look for.</param>
/// <param name="notUsed">not read by this method.</param>
/// <returns>true when at least one page matched and its second '\r'-separated segment does not start with "-".</returns>
/// <remarks>When several pages match, the last one is the page that gets extracted.</remarks>
private static Boolean GetInvoice(string inFileName, String outputFile, String memberNumber, Boolean notUsed)
{
    Boolean memberFound = false;
    int pageFound = -1;
    string marker = "\n\r" + memberNumber + "\n\r";

    // Create a reader for the given PDF file
    PdfReader reader = new PdfReader(inFileName);
    try
    {
        for (int page = 1; page <= reader.NumberOfPages; page++)
        {
            string txt = ExtractTextFromPDFBytes(reader.GetPageContent(page));
            if (!txt.Contains(marker))
            {
                continue;
            }

            // Split only on matching pages: the marker itself contains '\r', so index 1
            // always exists here. Splitting every page up front threw
            // IndexOutOfRangeException on pages without any '\r'.
            string number = txt.Split('\r')[1];
            if (number.StartsWith("-"))
            {
                continue;
            }

            if (!String.IsNullOrEmpty(outputFile))
            {
                pageFound = page;
            }
            memberFound = true;
        }
    }
    finally
    {
        reader.Close();
    }

    if (memberFound && pageFound > -1)
    {
        ExtractPages(inFileName, outputFile, pageFound, pageFound);
    }

    return memberFound;
}
/// <summary>
/// Parses the first page of a PDF report: locates the text between the "Category" and
/// "Sale/ Budget incl. VAT - Month to Date" markers, and turns each recognised
/// category line plus the two lines after it into a budget element.
/// </summary>
/// <param name="bytestream">The PDF document bytes.</param>
/// <returns>
/// The parsed budget, or null when a marker is not verified or when any exception is
/// thrown (the exception is passed to <c>Log.Unhandled</c>).
/// </returns>
private KpiBudget ParseC810(byte[] bytestream)
{
    try
    {
        PdfReader pdf = new PdfReader(bytestream);
        string pageText = System.Text.Encoding.UTF8.GetString(pdf.GetPageContent(1));

        // The three markers must appear in this order; each search starts after the previous hit.
        int totalPos = FindIndex(pageText, 0, "[(Total)]TJ");
        if (!VerifyIndex(totalPos, "Fant ikke først element 'Total'"))
        {
            return null;
        }

        int categoryPos = FindIndex(pageText, totalPos + 1, "[(Category)]TJ");
        if (!VerifyIndex(categoryPos, "Fant ikke andre element 'Category'"))
        {
            return null;
        }

        int salePos = FindIndex(pageText, categoryPos + 1, "[(Sale/ Budget incl. VAT - Month to Date )]TJ");
        if (!VerifyIndex(salePos, "Fant ikke tredje element 'Sale/ Budget incl. VAT - Month to Date'"))
        {
            return null;
        }

        string section = pageText.Substring(categoryPos, salePos - categoryPos);

        // Keep only lines starting with "[(", then strip the operator wrapping, spaces and slashes.
        List<string> rows = section.Split('\n').ToList();
        rows.RemoveAll(row => !row.StartsWith("[("));
        for (int r = 0; r < rows.Count; r++)
        {
            string cleaned = rows[r].Trim();
            cleaned = cleaned.Replace("[(", string.Empty);
            cleaned = cleaned.Replace(")]TJ", string.Empty);
            cleaned = cleaned.Replace(" ", string.Empty);
            cleaned = cleaned.Replace("/", string.Empty);
            rows[r] = cleaned;
        }

        var budget = new KpiBudget();
        budget.Date = selectedDate;

        string[] categories = { "MDA", "AudioVideo", "SDA", "Telecom", "Computing", "Kitchen", "Other", "Total" };

        int pos = 0;
        while (pos < rows.Count)
        {
            string label = rows[pos];
            if (!categories.Any(name => label.Contains(name)))
            {
                pos++;
                continue;
            }

            // The two rows after a category row hold its two numeric values;
            // values that fail to parse are left at 0.
            decimal sales;
            decimal.TryParse(rows[pos + 1], out sales);
            decimal grossMargin;
            decimal.TryParse(rows[pos + 2], out grossMargin);

            var element = new KpiBudgetElement();
            element.Insert(label, sales, grossMargin);
            budget.element.Add(element);

            pos += 3;
        }

        return budget;
    }
    catch (Exception ex)
    {
        Log.Unhandled(ex);
    }

    return null;
}
/// <summary>
/// Extracts the text of every page of a PDF file, each page followed by a single space.
/// </summary>
/// <param name="inFileName">File name handed to the PdfReader constructor.</param>
/// <returns>
/// The text gathered so far; when a page fails, extraction stops and the text of the
/// preceding pages is returned.
/// </returns>
internal string ExtractText(string inFileName)
{
    PdfReader reader = new PdfReader(inFileName);
    // StringBuilder avoids re-copying the accumulated text for every page.
    System.Text.StringBuilder results = new System.Text.StringBuilder();
    try
    {
        for (int page = 1; page <= reader.NumberOfPages; page++)
        {
            results.Append(ExtractTextFromPDFBytes(reader.GetPageContent(page))).Append(" ");
        }
    }
    catch (Exception m)
    {
        // NOTE(review): the exception object is created but never thrown or stored.
        // It is kept in case MyException's constructor has side effects (e.g. logging) —
        // confirm, and either throw/log explicitly or remove it.
        MyException mobj = new MyException("ExtractText() : " + m.Message);
    }
    finally
    {
        // The reader was previously never closed.
        reader.Close();
    }
    return results.ToString();
}
// ---------------------------------------------------------------------------
/**
 * Returns the content stream of the first page of a PDF, decoded as UTF-8 text.
 * @param src the PDF document bytes
 * @return the decoded content of page 1
 */
public string ReadContent(byte[] src) {
    byte[] firstPage = new PdfReader(src).GetPageContent(1);
    return Encoding.UTF8.GetString(firstPage);
}
/// <summary>
/// Clears the loaded pages, lets the user pick a PDF file and stores, per page, the
/// concatenation of all string tokens found in that page's content stream.
/// Raises a change notification for "MaxIndex" and resets <c>CurrentIndex</c> to 1.
/// </summary>
private void OpenPdf()
{
    _pdfPages.Clear();

    try
    {
        var dialog = new OpenFileDialog();
        dialog.DefaultExt = ".pdf";
        dialog.Filter = "Pdf documents (.pdf)|*.pdf";

        if (dialog.ShowDialog() == true)
        {
            var reader = new PdfReader(dialog.FileName);
            int pageNo = 1;
            while (pageNo <= reader.NumberOfPages)
            {
                var tokeniser = new PRTokeniser(reader.GetPageContent(pageNo));
                var pageText = new StringBuilder();

                while (tokeniser.NextToken())
                {
                    if (tokeniser.TokenType != PRTokeniser.TokType.STRING)
                    {
                        continue;
                    }
                    pageText.Append(tokeniser.StringValue);
                }

                _pdfPages.Add(pageText.ToString());
                pageNo++;
            }
        }

        // Raised whether or not a file was chosen.
        RaisePropertyChanged("MaxIndex");
    }
    catch (Exception)
    {
        MessageBox.Show("Fail to load file");
    }

    CurrentIndex = 1;
}
/// <summary>
/// Decodes the raw content of every page of a PDF with the "koi8-r" encoding and
/// concatenates the results, each followed by a single space. Progress marks are
/// written through <c>Debug.WriteLine</c>.
/// </summary>
/// <param name="inFileName">File name handed to the PdfReader constructor.</param>
/// <param name="tot">
/// Number of progress marks written inside the page loop (the final padding marks are
/// not counted); -1 when an exception occurred.
/// </param>
/// <returns>The concatenated text, or "-1" when an exception occurred.</returns>
public string ExtractText(string inFileName, out int tot)
{
    try
    {
        PdfReader pdf = new PdfReader(inFileName);
        Debug.WriteLine("Processing: ");

        const int barWidth = 68;
        // Progress marks per page; may be fractional.
        float marksPerPage = ((float)barWidth) / (float)pdf.NumberOfPages;
        int marksWritten = 0;
        float pending = 0;
        StringBuilder collected = new StringBuilder();

        for (int pageNo = 1; pageNo <= pdf.NumberOfPages; pageNo++)
        {
            string decoded = Encoding.GetEncoding("koi8-r").GetString(pdf.GetPageContent(pageNo));
            collected.Append(decoded).Append(" ");

            // Write the progress.
            if (marksPerPage >= 1.0f)
            {
                int burst = (int)marksPerPage;
                for (int k = 0; k < burst; k++)
                {
                    Debug.WriteLine("#");
                    marksWritten++;
                }
            }
            else
            {
                pending += marksPerPage;
                if (pending >= 1.0f)
                {
                    int burst = (int)pending;
                    for (int k = 0; k < burst; k++)
                    {
                        Debug.WriteLine("#");
                        marksWritten++;
                    }
                    pending = 0;
                }
            }
        }

        // Pad the bar to its full width.
        int missing = barWidth - marksWritten;
        for (int k = 0; k < missing; k++)
        {
            Debug.WriteLine("#");
        }

        tot = marksWritten;
        return collected.ToString();
    }
    catch (Exception ex)
    {
        Debug.WriteLine("2" + ex.Message);
        tot = -1;
        return "-1";
    }
}
/// <summary>
/// Compares the first-page content streams of two PDF files token by token.
/// Number tokens are equal when they differ by at most 0.001; all other tokens must
/// have the same type and the same string value.
/// </summary>
/// <param name="path1">File name of the first PDF.</param>
/// <param name="path2">File name of the second PDF.</param>
/// <returns>true when both streams yield the same token sequence; otherwise false.</returns>
virtual public bool CompareInnerText(String path1, String path2)
{
    PdfReader reader1 = null;
    PdfReader reader2 = null;
    try
    {
        // Both readers are opened inside the try so the first one is closed even if
        // opening the second one throws.
        reader1 = new PdfReader(path1);
        byte[] streamBytes1 = reader1.GetPageContent(1);
        PRTokeniser tokenizer1 = new PRTokeniser(new RandomAccessFileOrArray(new RandomAccessSourceFactory().CreateSource(streamBytes1)));

        reader2 = new PdfReader(path2);
        byte[] streamBytes2 = reader2.GetPageContent(1);
        PRTokeniser tokenizer2 = new PRTokeniser(new RandomAccessFileOrArray(new RandomAccessSourceFactory().CreateSource(streamBytes2)));

        while (tokenizer1.NextToken())
        {
            // Second stream ended early.
            if (!tokenizer2.NextToken())
                return false;

            if (tokenizer1.TokenType != tokenizer2.TokenType)
                return false;

            if (tokenizer1.TokenType == PRTokeniser.TokType.NUMBER)
            {
                float first = float.Parse(tokenizer1.StringValue, CultureInfo.InvariantCulture);
                float second = float.Parse(tokenizer2.StringValue, CultureInfo.InvariantCulture);
                if (Math.Abs(first - second) > 0.001)
                    return false;
            }
            else if (!tokenizer1.StringValue.Equals(tokenizer2.StringValue))
            {
                return false;
            }
        }

        // The first stream is exhausted; the streams are equal only if the second one
        // is exhausted too. Previously extra trailing tokens in the second stream were ignored.
        return !tokenizer2.NextToken();
    }
    finally
    {
        if (reader1 != null) reader1.Close();
        if (reader2 != null) reader2.Close();
    }
}
/// <summary>
/// For a successful PDF response, copies the document's "Title" info entry (when
/// non-empty) into the property bag and stores the text assembled from the string
/// tokens of every page's content stream.
/// </summary>
/// <param name="crawler">The crawler; must not be null.</param>
/// <param name="propertyBag">The response data to inspect and fill in; must not be null.</param>
public void Process(Crawler crawler, PropertyBag propertyBag)
{
    AspectF.Define.
        NotNull(crawler, "crawler").
        NotNull(propertyBag, "propertyBag");

    // Only successful responses carrying PDF content are processed.
    bool statusOk = propertyBag.StatusCode == HttpStatusCode.OK;
    if (!statusOk || !IsPdfContent(propertyBag.ContentType))
    {
        return;
    }

    PdfReader pdfReader = new PdfReader(propertyBag.Response);
    try
    {
        object rawTitle = pdfReader.Info["Title"];
        if (!rawTitle.IsNull())
        {
            string trimmedTitle = Convert.ToString(rawTitle, CultureInfo.InvariantCulture).Trim();
            if (!trimmedTitle.IsNullOrEmpty())
            {
                propertyBag.Title = trimmedTitle;
            }
        }

        // Following code from:
        // http://www.vbforums.com/showthread.php?t=475759
        StringBuilder text = new StringBuilder();
        int pageCount = pdfReader.NumberOfPages;
        for (int pageNo = 1; pageNo <= pageCount; pageNo++)
        {
            byte[] content = pdfReader.GetPageContent(pageNo);
            if (content.IsNull())
            {
                continue;
            }

            PRTokeniser tokeniser = new PRTokeniser(content);
            while (tokeniser.NextToken())
            {
                int type = tokeniser.TokenType;
                string value = tokeniser.StringValue;

                if (type == PRTokeniser.TK_STRING)
                {
                    text.Append(value);
                    text.Append(" ");
                }
                else if ((type == 1 && value == "-600") || (type == 10 && value == "TJ"))
                {
                    // These two token type/value pairs each contribute a separating space.
                    text.Append(" ");
                }
            }
        }

        propertyBag.Text = text.ToString();
    }
    finally
    {
        pdfReader.Close();
    }
}
// ---------------------------------------------------------------------------
/**
 * Tokenises the content stream of the first page with PRTokeniser and returns
 * every string token on its own line.
 * @param src the original PDF file
 * @return the string tokens of page 1, one per line
 */
public string ParsePdf(byte[] src) {
    PdfReader pdf = new PdfReader(src);
    PRTokeniser tokens = new PRTokeniser(pdf.GetPageContent(1));
    StringBuilder lines = new StringBuilder();

    while (tokens.NextToken()) {
        if (tokens.TokenType != PRTokeniser.TokType.STRING) {
            continue;
        }
        lines.AppendLine(tokens.StringValue);
    }

    return lines.ToString();
}
// ---------------------------------------------------------------------------
/**
 * Re-writes a PDF through a PdfStamper while Document.Compress is switched off,
 * re-setting every page's content.
 * @param src the original PDF
 * @return the bytes written by the stamper
 */
public byte[] DecompressPdf(byte[] src) {
    PdfReader reader = new PdfReader(src);

    // Document.Compress is shared static state: remember the current value and restore
    // it in a finally block, so a failure cannot leave compression switched off and a
    // caller-chosen value is not overwritten with true.
    bool previousCompress = Document.Compress;
    try {
        using (MemoryStream ms = new MemoryStream()) {
            using (PdfStamper stamper = new PdfStamper(reader, ms)) {
                Document.Compress = false;
                int total = reader.NumberOfPages + 1;
                for (int i = 1; i < total; i++) {
                    reader.SetPageContent(i, reader.GetPageContent(i));
                }
            }
            // The stamper is disposed (and the output written) before the flag is restored.
            return ms.ToArray();
        }
    }
    finally {
        Document.Compress = previousCompress;
    }
}
/// <summary>
/// Checks whether a run of pages in one PDF has the same content streams as the run
/// of pages starting at <paramref name="secondStartPage"/> in another PDF.
/// </summary>
/// <param name="firstPdf">Path of the first PDF.</param>
/// <param name="firstStartPage">First page of the first PDF to compare.</param>
/// <param name="firstLastPage">
/// Upper bound of the page range in the first PDF; the loop stops BEFORE this page.
/// NOTE(review): the name suggests an inclusive bound — confirm against callers.
/// </param>
/// <param name="secondPdf">Path of the second PDF.</param>
/// <param name="secondStartPage">Page of the second PDF matched against <paramref name="firstStartPage"/>.</param>
/// <returns>true when every compared pair of page contents is byte-for-byte equal.</returns>
private bool ArePagesIdentical(String firstPdf, int firstStartPage, int firstLastPage, String secondPdf, int secondStartPage)
{
    bool pagesAreIdentical = true;
    iTextSharpPDF.PdfReader firstPdfReader = null;
    iTextSharpPDF.PdfReader secondPdfReader = null;

    try
    {
        // Opened inside the try so the first reader is released if the second fails to open.
        firstPdfReader = new iTextSharpPDF.PdfReader(new iTextSharpPDF.RandomAccessFileOrArray(firstPdf), null);
        secondPdfReader = new iTextSharpPDF.PdfReader(new iTextSharpPDF.RandomAccessFileOrArray(secondPdf), null);

        int secondPdfPage = secondStartPage;
        for (int currentFirstPage = firstStartPage; pagesAreIdentical && currentFirstPage < firstLastPage; currentFirstPage++)
        {
            byte[] firstBytes = firstPdfReader.GetPageContent(currentFirstPage);
            byte[] secondBytes = secondPdfReader.GetPageContent(secondPdfPage);

            // Exact byte comparison. The earlier check compared only the first four bytes
            // of each page's MD5 digest, which can report different pages as identical.
            pagesAreIdentical = firstBytes.Length == secondBytes.Length;
            for (int i = 0; pagesAreIdentical && i < firstBytes.Length; i++)
            {
                pagesAreIdentical = firstBytes[i] == secondBytes[i];
            }

            secondPdfPage++;
        }
    }
    finally
    {
        if (firstPdfReader != null)
        {
            firstPdfReader.Close();
        }
        if (secondPdfReader != null)
        {
            secondPdfReader.Close();
        }
    }

    return pagesAreIdentical;
}
/**
 * Writes a report about one page to the given writer: the page dictionary, an
 * XObject summary, the raw content stream and the extracted text.
 * @since 2.1.5
 * @param reader the PdfReader to read the page content from
 * @param pageNum the page number to read
 * @param outp the writer to send the content to
 * @throws IOException
 */
public static void ListContentStreamForPage(PdfReader reader, int pageNum, TextWriter outp) {
    outp.WriteLine("==============Page " + pageNum + "====================");

    outp.WriteLine("- - - - - Dictionary - - - - - -");
    PdfDictionary pageDict = reader.GetPageN(pageNum);
    outp.WriteLine(GetDictionaryDetail(pageDict));

    outp.WriteLine("- - - - - XObject Summary - - - - - -");
    PdfDictionary resources = pageDict.GetAsDict(PdfName.RESOURCES);
    outp.WriteLine(GetXObjectDetail(resources));

    outp.WriteLine("- - - - - Content Stream - - - - - -");
    RandomAccessFileOrArray source = reader.SafeFile;
    byte[] content = reader.GetPageContent(pageNum, source);
    source.Close();

    outp.Flush();
    // Each byte is written as the char with the same numeric value.
    for (int k = 0; k < content.Length; k++) {
        outp.Write((char)content[k]);
    }
    outp.Flush();

    outp.WriteLine("- - - - - Text Extraction - - - - - -");
    String pageText = PdfTextExtractor.GetTextFromPage(reader, pageNum, new LocationTextExtractionStrategy());
    if (pageText.Length == 0) {
        outp.WriteLine("No text found on page " + pageNum);
    }
    else {
        outp.WriteLine(pageText);
    }

    outp.WriteLine();
}
/// <summary>
/// Writes a report about one page to <paramref name="outp"/>. Each section
/// (dictionary, XObject summary, content stream, extracted text and three text-block
/// listings) is emitted only when its corresponding <c>_output*</c> switch is set.
/// The page content is always run through the extraction strategy, whichever
/// sections are enabled.
/// </summary>
/// <param name="reader">the PdfReader to read the page from</param>
/// <param name="pageNum">the page number to report on</param>
/// <param name="outp">the writer receiving the report</param>
public static void xpdfPage(PdfReader reader, int pageNum, TextWriter outp)
{
    outp.WriteLine("==============Page " + pageNum + "====================");
    PdfDictionary pageDict = reader.GetPageN(pageNum);

    if (_outputDictionary)
    {
        outp.WriteLine("- - - - - Dictionary - - - - - -");
        outp.WriteLine(GetDictionaryDetail(pageDict));
    }

    if (_outputXObject)
    {
        outp.WriteLine("- - - - - XObject summary - - - - - -");
        PdfDictionary resources = pageDict.GetAsDict(PdfName.RESOURCES);
        outp.WriteLine(PdfContentReaderTool.GetXObjectDetail(resources));
    }

    if (_outputContentStream)
    {
        outp.WriteLine("- - - - - Content stream - - - - - -");
        RandomAccessFileOrArray source = reader.SafeFile;
        byte[] content = reader.GetPageContent(pageNum, source);
        source.Close();

        outp.Flush();
        // Each byte is written as the char with the same numeric value.
        for (int k = 0; k < content.Length; k++)
        {
            outp.Write((char)content[k]);
        }
        outp.Flush();
    }

    Test_iTextSharp.LocationTextExtractionStrategy strategy = new Test_iTextSharp.LocationTextExtractionStrategy();
    Test_iTextSharp.PdfTools.ProcessContentPage(reader, pageNum, strategy);

    if (_outputText)
    {
        outp.WriteLine("- - - - - Text extraction - - - - - -");
        string pageText = strategy.GetResultantText();
        if (pageText.Length == 0)
        {
            outp.WriteLine("No text found on page " + pageNum);
        }
        else
        {
            outp.WriteLine(pageText);
            outp.WriteLine();
        }
    }

    if (_outputTextBlocks1)
    {
        outp.WriteLine("- - - - - Text blocks extraction 1 - - - - - -");
        foreach (Test_iTextSharp.TextBlock block in strategy.textBlocks)
        {
            PrintTextBlock(outp, block, 0);
        }
        outp.WriteLine();
    }

    if (_outputTextBlocks2)
    {
        outp.WriteLine("- - - - - Text blocks extraction 2 - - - - - -");
        foreach (Test_iTextSharp.TextBlock block in strategy.textBlocks)
        {
            outp.Write("block ");
            outp.WriteLine(block.GetText());
            if (block.childs.Count > 0)
            {
                outp.WriteLine(" **** warning childs blocks not printed ****");
            }
        }
        outp.WriteLine();
    }

    if (_outputTextBlocks3)
    {
        outp.WriteLine("- - - - - Text blocks extraction 3 - - - - - -");
        foreach (Test_iTextSharp.TextBlock block in strategy.textBlocks)
        {
            // Only the first line of a block carries the "block " prefix.
            bool isFirstLine = true;
            foreach (string line in block.GetTextByLines(_outputMaxCol))
            {
                outp.Write(isFirstLine ? "block " : " ");
                isFirstLine = false;
                outp.WriteLine(line);
            }
            if (block.childs.Count > 0)
            {
                outp.WriteLine(" **** warning childs blocks not printed ****");
            }
        }
        outp.WriteLine();
    }

    outp.WriteLine();
}