/// <summary>
/// Reads a PDF document from the specified stream and extracts its text,
/// title and keywords.
/// </summary>
/// <param name="stream">The stream containing the PDF document.</param>
/// <returns>
/// A <see cref="GenericFile"/> built from the extracted text, the document title and the
/// document keywords. If reading fails part-way, whatever was gathered up to that point
/// is returned.
/// </returns>
public override IGenericFile Read(Stream stream)
{
    var builder = new StringBuilder();
    string title = "";
    string meta = "";
    try
    {
        using PdfDocument inputDocument = PdfReader.Open(stream, PdfDocumentOpenMode.ReadOnly);
        title = inputDocument.Info.Title;
        meta = inputDocument.Info.Keywords;
        foreach (PdfPage page in inputDocument.Pages)
        {
            for (int index = 0; index < page.Contents.Elements.Count; index++)
            {
                // A content element may not resolve to a dictionary, or the dictionary may
                // carry no stream. Skip it instead of letting a NullReferenceException
                // abort the extraction of all remaining pages.
                var contentStream = page.Contents.Elements.GetDictionary(index)?.Stream;
                if (contentStream is null)
                {
                    continue;
                }

                builder.Append(ExtractTextFromPDFBytes(contentStream.Value));
            }
        }
    }
    catch
    {
        // Best-effort read: any failure (unreadable stream, malformed PDF, ...) yields the
        // data gathered so far rather than an exception.
        // NOTE(review): consider logging the exception here so failures are not invisible.
    }

    return new GenericFile(builder.ToString(), title, meta);
}
/// <summary>
/// Scans the content streams of a page for a person name and an ID number.
/// </summary>
/// <param name="page">The page whose content streams are searched.</param>
/// <param name="idStartKey">Text that immediately precedes the ID number.</param>
/// <param name="idEndKey">Text that immediately follows the ID number.</param>
/// <param name="nameKey">Text that immediately precedes the person name; the name runs up to the next '@'.</param>
/// <returns>
/// The identifier as soon as <c>identifier.isLoaded</c> reports true; otherwise <c>null</c>.
/// </returns>
private Identifier getIdentifier(PdfPage page, string idStartKey, string idEndKey, string nameKey)
{
    Identifier identifier = new Identifier();
    for (int index = 0; index < page.Contents.Elements.Count; index++)
    {
        PdfDictionary.PdfStream stream = page.Contents.Elements.GetDictionary(index).Stream;
        var outputText = new PDFTextExtractor().ExtractTextFromPDFBytes(stream.Value);

        // --- person name: the text between nameKey and the next '@' ---
        var searchNameIndex = outputText.IndexOf(nameKey);
        if (searchNameIndex > -1)
        {
            var nameStart = searchNameIndex + nameKey.Length;

            // Search for '@' only after the key. Previously a missing '@' (or one inside
            // the key itself) produced a negative length and Substring threw.
            var attIndex = outputText.IndexOf('@', nameStart);
            if (attIndex > -1)
            {
                identifier.PersonName = outputText.Substring(nameStart, attIndex - nameStart);
                if (identifier.isLoaded)
                {
                    return identifier;
                }
            }
        }

        // --- ID number: the text between idStartKey and idEndKey ---
        var searchIdIndex = outputText.IndexOf(idStartKey);
        if (searchIdIndex > -1)
        {
            var endKeyIndex = outputText.IndexOf(idEndKey);

            // NOTE(review): this compares the *name* position with the end-key position and
            // gives up when the name key comes first (or is absent while the end key is
            // present). Kept as in the original; confirm this is the intended rule.
            if (searchNameIndex < endKeyIndex)
            {
                return null;
            }

            if (endKeyIndex > -1)
            {
                var idStart = searchIdIndex + idStartKey.Length;
                var idLength = endKeyIndex - idStart;

                // The end key may precede the start key; skip instead of throwing.
                if (idLength >= 0)
                {
                    identifier.IdNumber = outputText.Substring(idStart, idLength);
                    if (identifier.isLoaded)
                    {
                        return identifier;
                    }
                }
            }
        }
    }

    return null;
}
/// <summary>
/// Splits a PDF into several documents and writes each one next to the source file as
/// <c>SplitPDF_&lt;guid&gt;.pdf</c>.
/// </summary>
/// <remarks>
/// A page whose extracted text matches <paramref name="regex"/> marks a split point: the
/// document being built is saved once the page that follows the match has been added.
/// NOTE(review): that means the page after a match ends a part rather than starting the
/// next one; this ordering is preserved from the original and should be confirmed.
/// Pages remaining after the last split point are now saved as a final part instead of
/// being dropped.
/// </remarks>
/// <param name="filepath">Path of the PDF file to split.</param>
/// <param name="regex">Regular expression applied to the text of every content stream.</param>
public void SplitPDF(string filepath, string regex)
{
    // Path.Combine copes with a file name that has no directory part; plain concatenation
    // produced a path rooted at the directory separator in that case.
    string outputDirectory = Path.GetDirectoryName(filepath) ?? string.Empty;

    using (PdfDocument pdf = PdfReader.Open(filepath, PdfDocumentOpenMode.Import))
    {
        // Build the pattern once instead of once per content stream.
        Regex reg = new Regex(regex);

        PdfDocument neodoc = new PdfDocument();
        try
        {
            bool firstrun = true;
            bool firstsubpage = true;
            foreach (PdfPage page in pdf.Pages)
            {
                neodoc.AddPage(page);
                if (firstsubpage)
                {
                    if (!firstrun)
                    {
                        neodoc.Save(BuildSplitFilePath(outputDirectory));
                        neodoc.Dispose();
                        neodoc = new PdfDocument();
                    }
                    else
                    {
                        firstrun = false;
                    }

                    firstsubpage = false;
                }

                for (int index = 0; index < page.Contents.Elements.Count; index++)
                {
                    PdfDictionary.PdfStream stream = page.Contents.Elements.GetDictionary(index).Stream;
                    string outputText = new PDFTextExtractor().ExtractTextFromPDFBytes(stream.Value);
                    if (reg.IsMatch(outputText))
                    {
                        firstsubpage = true;
                    }
                }
            }

            // Save whatever is left after the last split point; skip an empty document.
            if (neodoc.PageCount > 0)
            {
                neodoc.Save(BuildSplitFilePath(outputDirectory));
            }
        }
        finally
        {
            neodoc.Dispose();
        }
    }
}

/// <summary>
/// Creates a unique output path of the form <c>SplitPDF_&lt;guid&gt;.pdf</c> in the given directory.
/// </summary>
private static string BuildSplitFilePath(string directory)
{
    return Path.Combine(directory, "SplitPDF_" + System.Guid.NewGuid().ToString() + ".pdf");
}
/// <summary>
/// Reads PDF object from input stream.
/// </summary>
/// <param name="pdfObject">Either the instance of a derived type or null. If it is null
/// an appropriate object is created.</param>
/// <param name="objectID">The address of the object.</param>
/// <param name="includeReferences">If true, specifies that all indirect objects
/// are included recursively.</param>
/// <returns>The object that was read. Arrays and dictionaries reuse <paramref name="pdfObject"/>
/// when it is not null; all other object kinds are newly created.</returns>
/// <exception cref="NotImplementedException">The token after <c>obj</c> does not start a
/// supported object.</exception>
/// <exception cref="PdfReaderException">The object is not terminated by <c>endobj</c>.</exception>
public PdfObject ReadObject(PdfObject pdfObject, PdfObjectID objectID, bool includeReferences)
{
    MoveToObject(objectID);
    int objectNumber = ReadInteger();
    int generationNumber = ReadInteger();
#if DEBUG
    // The following assertion sometime failed (see below)
    //Debug.Assert(objectID == new PdfObjectID(objectNumber, generationNumber));
    if (objectID != new PdfObjectID(objectNumber, generationNumber))
    {
        // A special kind of bug? Or is this an undocumented PDF feature?
        // PDF4NET 2.6 provides a sample called 'Unicode', which produces a file 'unicode.pdf'
        // The iref table of this file contains the following entries:
        //    iref
        //    0 148
        //    0000000000 65535 f
        //    0000000015 00000 n
        //    0000000346 00000 n
        //    ....
        //    0000083236 00000 n
        //    0000083045 00000 n
        //    0000083045 00000 n
        //    0000083045 00000 n
        //    0000083045 00000 n
        //    0000080334 00000 n
        //    ....
        // Object 84, 85, 86, and 87 maps to the same dictionary, but all PDF readers I tested
        // ignores this mismatch! The following assertion failed about 50 times with this file.
#if true_
        string message = String.Format("xref entry {0} {1} maps to object {2} {3}.",
            objectID.ObjectNumber, objectID.GenerationNumber, objectNumber, generationNumber);
        Debug.Assert(false, message);
#endif
    }
#endif
    // Always use object ID from iref table (see above)
    objectNumber = objectID.ObjectNumber;
    generationNumber = objectID.GenerationNumber;
#if true_
    Debug.WriteLine(String.Format("obj: {0} {1}", objectNumber, generationNumber));
#endif
    ReadSymbol(Symbol.Obj);

    // Only a dictionary may be followed by a stream; remembered for the assertion below.
    bool checkForStream = false;
    Symbol symbol = ScanNextToken();
    switch (symbol)
    {
        case Symbol.BeginArray:
            PdfArray array;
            if (pdfObject == null)
            {
                array = new PdfArray(this.document);
            }
            else
            {
                array = (PdfArray)pdfObject;
            }
            //PdfObject.RegisterObject(array, objectID, generation);
            pdfObject = ReadArray(array, includeReferences);
            pdfObject.SetObjectID(objectNumber, generationNumber);
            break;

        case Symbol.BeginDictionary:
            PdfDictionary dict;
            if (pdfObject == null)
            {
                dict = new PdfDictionary(this.document);
            }
            else
            {
                dict = (PdfDictionary)pdfObject;
            }
            //PdfObject.RegisterObject(dict, objectID, generation);
            checkForStream = true;
            pdfObject = ReadDictionary(dict, includeReferences);
            pdfObject.SetObjectID(objectNumber, generationNumber);
            break;

        // Acrobat 6 Professional proudly presents: The Null object!
        // Even with a one-digit object number an indirect reference «x 0 R» to this object is
        // one character larger than the direct use of «null». Probable this is the reason why
        // it is true that Acrobat Web Capture 6.0 creates this object, but obviously never
        // creates a reference to it!
        case Symbol.Null:
            pdfObject = new PdfNullObject(this.document);
            pdfObject.SetObjectID(objectNumber, generationNumber);
            ReadSymbol(Symbol.EndObj);
            return pdfObject;

        case Symbol.Boolean:
            // The token is a file-format keyword, not user text: compare it ordinally rather
            // than with the current culture (same comparison as the four-parameter overload).
            pdfObject = new PdfBooleanObject(this.document,
                string.Compare(this.lexer.Token, Boolean.TrueString, StringComparison.OrdinalIgnoreCase) == 0); //!!!mod THHO 19.11.09
            pdfObject.SetObjectID(objectNumber, generationNumber);
            ReadSymbol(Symbol.EndObj);
            return pdfObject;

        case Symbol.Integer:
            pdfObject = new PdfIntegerObject(this.document, this.lexer.TokenToInteger);
            pdfObject.SetObjectID(objectNumber, generationNumber);
            ReadSymbol(Symbol.EndObj);
            return pdfObject;

        case Symbol.UInteger:
            pdfObject = new PdfUIntegerObject(this.document, this.lexer.TokenToUInteger);
            pdfObject.SetObjectID(objectNumber, generationNumber);
            ReadSymbol(Symbol.EndObj);
            return pdfObject;

        case Symbol.Real:
            pdfObject = new PdfRealObject(this.document, this.lexer.TokenToReal);
            pdfObject.SetObjectID(objectNumber, generationNumber);
            ReadSymbol(Symbol.EndObj);
            return pdfObject;

        case Symbol.String:
            pdfObject = new PdfStringObject(this.document, this.lexer.Token);
            pdfObject.SetObjectID(objectNumber, generationNumber);
            ReadSymbol(Symbol.EndObj);
            return pdfObject;

        case Symbol.Name:
            pdfObject = new PdfNameObject(this.document, this.lexer.Token);
            pdfObject.SetObjectID(objectNumber, generationNumber);
            ReadSymbol(Symbol.EndObj);
            return pdfObject;

        case Symbol.Keyword:
            // Should not come here anymore
            throw new NotImplementedException("Keyword");

        default:
            // Should not come here anymore
            throw new NotImplementedException("unknown token \"" + symbol + "\"");
    }

    // Only arrays and dictionaries reach this point: expect either 'stream' or 'endobj'.
    symbol = ScanNextToken();
    if (symbol == Symbol.BeginStream)
    {
        PdfDictionary dict = (PdfDictionary)pdfObject;
        Debug.Assert(checkForStream, "Unexpected stream...");
        int length = GetStreamLength(dict);
        byte[] bytes = this.lexer.ReadStream(length);
#if true_
        if (dict.Elements.GetString("/Filter") == "/FlateDecode")
        {
            if (dict.Elements["/Subtype"] == null)
            {
                try
                {
                    byte[] decoded = Filtering.FlateDecode.Decode(bytes);
                    if (decoded.Length == 0)
                    {
                        goto End;
                    }
                    string pageContent = Filtering.FlateDecode.DecodeToString(bytes);
                    if (pageContent.Length > 100)
                    {
                        pageContent = pageContent.Substring(pageContent.Length - 100);
                    }
                    pageContent.GetType();
                    bytes = decoded;
                    dict.Elements.Remove("/Filter");
                    dict.Elements.SetInteger("/Length", bytes.Length);
                }
                catch
                {
                }
            }
        End: ;
        }
#endif
        PdfDictionary.PdfStream stream = new PdfDictionary.PdfStream(bytes, dict);
        dict.Stream = stream;
        ReadSymbol(Symbol.EndStream);
        symbol = ScanNextToken();
    }

    if (symbol != Symbol.EndObj)
    {
        throw new PdfReaderException(PSSR.UnexpectedToken(this.lexer.Token));
    }

    return pdfObject;
}
//public PdfObject ReadObject(PdfObject obj, bool includeReferences)

/// <summary>
/// Reads the stream data belonging to a dictionary and attaches it to that dictionary.
/// </summary>
/// <param name="dict">The dictionary that receives the stream; its stream length is
/// obtained through <c>GetStreamLength</c>.</param>
private void ReadStream(PdfDictionary dict)
{
    // The lexer is expected to have just delivered the 'stream' keyword.
    Symbol current = _lexer.Symbol;
    Debug.Assert(current == Symbol.BeginStream);

    // Pull the raw stream bytes and wrap them in a stream object owned by the dictionary.
    byte[] data = _lexer.ReadStream(GetStreamLength(dict));
    PdfDictionary.PdfStream pdfStream = new PdfDictionary.PdfStream(data, dict);

    Debug.Assert(dict.Stream == null, "Dictionary already has a stream.");
    dict.Stream = pdfStream;

    // Consume 'endstream' and advance to the token that follows it.
    ReadSymbol(Symbol.EndStream);
    ScanNextToken();
}
/// <summary>
/// Reads a PDF object from the input stream, either as an indirect object located
/// through its object ID or as an object embedded in an object stream.
/// </summary>
/// <param name="pdfObject">Either the instance of a derived type or null. If it is null
/// an appropriate object is created. It is only reused for arrays and dictionaries.</param>
/// <param name="objectID">The address of the object. Its object and generation numbers are
/// always the ones assigned to the returned object.</param>
/// <param name="includeReferences">If true, specifies that all indirect objects
/// are included recursively.</param>
/// <param name="fromObjecStream">If true, the object is parsed from an object stream: the
/// parser is not repositioned and neither the "n g obj" header nor the trailing "endobj"
/// is read.</param>
/// <returns>The object that was read.</returns>
public PdfObject ReadObject(PdfObject pdfObject, PdfObjectID objectID, bool includeReferences, bool fromObjecStream)
{
#if DEBUG_
    Debug.WriteLine("ReadObject: " + objectID);
    if (objectID.ObjectNumber == 20)
        GetType();
#endif
    int objectNumber = objectID.ObjectNumber;
    int generationNumber = objectID.GenerationNumber;
    if (!fromObjecStream)
    {
        // Indirect object: seek to it and consume the object/generation numbers from the file.
        MoveToObject(objectID);
        objectNumber = ReadInteger();
        generationNumber = ReadInteger();
    }
#if DEBUG
    // The following assertion sometime failed (see below)
    //Debug.Assert(objectID == new PdfObjectID(objectNumber, generationNumber));
    if (!fromObjecStream && objectID != new PdfObjectID(objectNumber, generationNumber))
    {
        // A special kind of bug? Or is this an undocumented PDF feature?
        // PDF4NET 2.6 provides a sample called 'Unicode', which produces a file 'unicode.pdf'
        // The iref table of this file contains the following entries:
        //    iref
        //    0 148
        //    0000000000 65535 f
        //    0000000015 00000 n
        //    0000000346 00000 n
        //    ....
        //    0000083236 00000 n
        //    0000083045 00000 n
        //    0000083045 00000 n
        //    0000083045 00000 n
        //    0000083045 00000 n
        //    0000080334 00000 n
        //    ....
        // Object 84, 85, 86, and 87 maps to the same dictionary, but all PDF readers I tested
        // ignores this mismatch! The following assertion failed about 50 times with this file.
#if true_
        string message = String.Format("xref entry {0} {1} maps to object {2} {3}.", objectID.ObjectNumber, objectID.GenerationNumber, objectNumber, generationNumber);
        Debug.Assert(false, message);
#endif
    }
#endif
    // Always use object ID from iref table (see above).
    // The numbers read from the file above are discarded here; they are only consumed so
    // that the parser advances past them.
    objectNumber = objectID.ObjectNumber;
    generationNumber = objectID.GenerationNumber;
#if true_
    Debug.WriteLine(String.Format("obj: {0} {1}", objectNumber, generationNumber));
#endif
    if (!fromObjecStream)
        ReadSymbol(Symbol.Obj);

    // Set only for dictionaries; used by the assertion in the stream branch below.
    bool checkForStream = false;
    Symbol symbol = ScanNextToken();
    switch (symbol)
    {
        case Symbol.BeginArray:
            PdfArray array;
            if (pdfObject == null)
                array = new PdfArray(_document);
            else
                array = (PdfArray)pdfObject;
            //PdfObject.RegisterObject(array, objectID, generation);
            pdfObject = ReadArray(array, includeReferences);
            pdfObject.SetObjectID(objectNumber, generationNumber);
            break;

        case Symbol.BeginDictionary:
            PdfDictionary dict;
            if (pdfObject == null)
                dict = new PdfDictionary(_document);
            else
                dict = (PdfDictionary)pdfObject;
            //PdfObject.RegisterObject(dict, objectID, generation);
            checkForStream = true;
            pdfObject = ReadDictionary(dict, includeReferences);
            pdfObject.SetObjectID(objectNumber, generationNumber);
            break;

        // Acrobat 6 Professional proudly presents: The Null object!
        // Even with a one-digit object number an indirect reference «x 0 R» to this object is
        // one character larger than the direct use of «null». Probable this is the reason why
        // it is true that Acrobat Web Capture 6.0 creates this object, but obviously never
        // creates a reference to it!
        //
        // The simple (non-container) cases below return directly; "endobj" is only consumed
        // when the object was not read from an object stream.
        case Symbol.Null:
            pdfObject = new PdfNullObject(_document);
            pdfObject.SetObjectID(objectNumber, generationNumber);
            if (!fromObjecStream)
                ReadSymbol(Symbol.EndObj);
            return pdfObject;

        case Symbol.Boolean:
            // True when the token equals Boolean.TrueString, ignoring case (ordinal comparison).
            pdfObject = new PdfBooleanObject(_document, String.Compare(_lexer.Token, Boolean.TrueString, StringComparison.OrdinalIgnoreCase) == 0);
            pdfObject.SetObjectID(objectNumber, generationNumber);
            if (!fromObjecStream)
                ReadSymbol(Symbol.EndObj);
            return pdfObject;

        case Symbol.Integer:
            pdfObject = new PdfIntegerObject(_document, _lexer.TokenToInteger);
            pdfObject.SetObjectID(objectNumber, generationNumber);
            if (!fromObjecStream)
                ReadSymbol(Symbol.EndObj);
            return pdfObject;

        case Symbol.UInteger:
            pdfObject = new PdfUIntegerObject(_document, _lexer.TokenToUInteger);
            pdfObject.SetObjectID(objectNumber, generationNumber);
            if (!fromObjecStream)
                ReadSymbol(Symbol.EndObj);
            return pdfObject;

        case Symbol.Real:
            pdfObject = new PdfRealObject(_document, _lexer.TokenToReal);
            pdfObject.SetObjectID(objectNumber, generationNumber);
            if (!fromObjecStream)
                ReadSymbol(Symbol.EndObj);
            return pdfObject;

        case Symbol.String:
            pdfObject = new PdfStringObject(_document, _lexer.Token);
            pdfObject.SetObjectID(objectNumber, generationNumber);
            if (!fromObjecStream)
                ReadSymbol(Symbol.EndObj);
            return pdfObject;

        case Symbol.Name:
            pdfObject = new PdfNameObject(_document, _lexer.Token);
            pdfObject.SetObjectID(objectNumber, generationNumber);
            if (!fromObjecStream)
                ReadSymbol(Symbol.EndObj);
            return pdfObject;

        case Symbol.Keyword:
            // Should not come here anymore.
            // NOTE(review): HandleUnexpectedToken presumably throws; if it returns, parsing
            // continues below with pdfObject left as passed in. Confirm against ParserDiagnostics.
            ParserDiagnostics.HandleUnexpectedToken(_lexer.Token);
            break;

        default:
            // Should not come here anymore.
            ParserDiagnostics.HandleUnexpectedToken(_lexer.Token);
            break;
    }

    // Reached for arrays and dictionaries (and for unexpected tokens if the handler returns):
    // the next token is checked for the start of a stream.
    symbol = ScanNextToken();
    if (symbol == Symbol.BeginStream)
    {
        PdfDictionary dict = (PdfDictionary)pdfObject;
        Debug.Assert(checkForStream, "Unexpected stream...");
#if true_
        ReadStream(dict);
#else
        // Read the raw stream bytes using the length obtained from GetStreamLength.
        int length = GetStreamLength(dict);
        byte[] bytes = _lexer.ReadStream(length);
#if true_
        if (dict.Elements.GetString("/Filter") == "/FlateDecode")
        {
            if (dict.Elements["/Subtype"] == null)
            {
                try
                {
                    byte[] decoded = Filtering.FlateDecode.Decode(bytes);
                    if (decoded.Length == 0)
                        goto End;
                    string pageContent = Filtering.FlateDecode.DecodeToString(bytes);
                    if (pageContent.Length > 100)
                        pageContent = pageContent.Substring(pageContent.Length - 100);
                    pageContent.GetType();
                    bytes = decoded;
                    dict.Elements.Remove("/Filter");
                    dict.Elements.SetInteger("/Length", bytes.Length);
                }
                catch
                {
                }
            }
        End: ;
        }
#endif
        PdfDictionary.PdfStream stream = new PdfDictionary.PdfStream(bytes, dict);
        dict.Stream = stream;
        ReadSymbol(Symbol.EndStream);
        // Advance past 'endstream' so that the 'endobj' check below sees the following token.
        symbol = ScanNextToken();
#endif
    }

    // Objects inside an object stream have no 'endobj' terminator, so it is only enforced
    // for regular indirect objects.
    if (!fromObjecStream && symbol != Symbol.EndObj)
        ParserDiagnostics.ThrowParserException(PSSR.UnexpectedToken(_lexer.Token));
    return pdfObject;
}
/// <summary>
/// Processes every input PDF: records its page count and aspect ratio, renders each page to
/// a PNG with the external <c>pdfdraw.exe</c>, converts each PNG into a raw thumbnail byte
/// array, and collects the page text plus the URLs and e-mail addresses found in it.
/// </summary>
/// <remarks>
/// Results are appended to <c>FSpreadCountLocal</c>, <c>FScaleFactorLocal</c>, <c>texBytes</c>,
/// <c>FStringLocal</c> and <c>FHTTPLocal</c>; <c>pagesCount</c> is reset and accumulated.
/// The rendered PNG files are written to and deleted from the directory of the executing assembly.
/// </remarks>
private void Initialize()
{
    // Number of leading bytes skipped in the saved BMP (presumably the 14-byte file header
    // plus the 40-byte info header, leaving only pixel data).
    const int bmpHeaderSize = 54;

    // find plugins folder
    string pluginPath = Path.GetDirectoryName(System.Reflection.Assembly.GetExecutingAssembly().Location);

    // Build the patterns once instead of once per page.
    System.Text.RegularExpressions.Regex linkRegex = new System.Text.RegularExpressions.Regex(
        @"(http|ftp|https):\/\/[\w\-_]+(\.[\w\-_]+)+([\w\-\.,@?^=%&:/~\+#]*[\w\-\@?^=%&/~\+#])");
    System.Text.RegularExpressions.Regex mailRegex = new System.Text.RegularExpressions.Regex(
        @"[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*@(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?");
    Image.GetThumbnailImageAbort myCallBack = new Image.GetThumbnailImageAbort(ThumbnailCallback);

    pagesCount = 0;

    for (int i = 0; i < FFileNameIn.SliceCount; i++)
    {
        // open and parse each .pdf; dispose the document once all its pages are processed
        using (PdfDocument document = PdfReader.Open(FFileNameIn[i], PdfDocumentOpenMode.ReadOnly))
        {
            FSpreadCountLocal.Add(document.PageCount);
            FScaleFactorLocal.Add(document.Pages[0].Width / document.Pages[0].Height);

            // render .png files for each page ("%d" is expanded by pdfdraw to the page number)
            string pngName = i + "_%d.png";
            using (Process exe = new System.Diagnostics.Process())
            {
                exe.StartInfo.UseShellExecute = true;
                exe.StartInfo.FileName = "pdfdraw.exe";
                exe.StartInfo.WorkingDirectory = pluginPath;
                exe.StartInfo.Arguments = "-o " + pngName + " -r 144 \"" + FFileNameIn[i] + "\" ";
                exe.StartInfo.ErrorDialog = false;
                exe.StartInfo.CreateNoWindow = true;
                exe.Start();
                exe.WaitForExit();
            }

            for (int j = 0; j < FSpreadCountLocal[i]; j++)
            {
                // get image
                string curPngPath = Path.Combine(pluginPath, i + "_" + (j + 1) + ".png");
                Image texImg;
                using (Image img = Image.FromFile(curPngPath))
                {
                    texImg = img.GetThumbnailImage(FWeightIn[0], FWeightIn[0], myCallBack, IntPtr.Zero);
                }

                // create a byte-array for a picture content
                byte[] bytes;
                using (texImg)
                using (MemoryStream ms = new MemoryStream())
                {
                    // The source image is disposed above, so the file is no longer locked.
                    File.Delete(curPngPath);

                    texImg.Save(ms, System.Drawing.Imaging.ImageFormat.Bmp);
                    ms.Seek(bmpHeaderSize, SeekOrigin.Begin);
                    bytes = new byte[(int)ms.Length - bmpHeaderSize];
                    ms.Read(bytes, 0, bytes.Length);
                }
                texBytes.Add(bytes);

                // get text
                System.Text.StringBuilder sb = new System.Text.StringBuilder();
                PdfPage curPage = document.Pages[j];
                for (int index = 0; index < curPage.Contents.Elements.Count; index++)
                {
                    PdfDictionary.PdfStream stream = curPage.Contents.Elements.GetDictionary(index).Stream;
                    sb.Append(new PDFParser().ExtractTextFromPDFBytes(stream.Value));
                }
                string pageText = sb.ToString();
                FStringLocal.Add(pageText);

                // parse text and find hyperlinks, then email addresses; every hit is
                // followed by a single space
                System.Text.StringBuilder hyperLinks = new System.Text.StringBuilder();
                foreach (System.Text.RegularExpressions.Match match in linkRegex.Matches(pageText))
                {
                    hyperLinks.Append(match.Value).Append(" ");
                }
                foreach (System.Text.RegularExpressions.Match match in mailRegex.Matches(pageText))
                {
                    hyperLinks.Append(match.Value).Append(" ");
                }
                FHTTPLocal.Add(hyperLinks.ToString());
            }

            pagesCount += FSpreadCountLocal[i];
        }
    }
}