示例#1
0
        /// <summary>
        /// Reads a PDF document from the specified stream and extracts its text,
        /// title and keywords.
        /// </summary>
        /// <param name="stream">The stream containing the PDF data.</param>
        /// <returns>
        /// A <see cref="GenericFile"/> built from the extracted text, the document title and
        /// the document keywords. Extraction is best effort: if the document cannot be opened
        /// or parsed, whatever was collected up to that point is returned (possibly all empty).
        /// </returns>
        public override IGenericFile Read(Stream stream)
        {
            var    Builder = new StringBuilder();
            string Title   = "";
            string Meta    = "";

            try
            {
                using PdfDocument inputDocument = PdfReader.Open(stream, PdfDocumentOpenMode.ReadOnly);

                // Never hand null strings to the GenericFile constructor.
                Title = inputDocument.Info.Title ?? "";
                Meta  = inputDocument.Info.Keywords ?? "";

                foreach (PdfPage page in inputDocument.Pages)
                {
                    for (int index = 0; index < page.Contents.Elements.Count; index++)
                    {
                        // A content element that is not a dictionary, or a dictionary without
                        // a stream, carries no text. Skip it instead of letting a
                        // NullReferenceException abort the extraction of all remaining pages.
                        PdfDictionary.PdfStream tempStream = page.Contents.Elements.GetDictionary(index)?.Stream;
                        if (tempStream?.Value == null)
                        {
                            continue;
                        }

                        Builder.Append(ExtractTextFromPDFBytes(tempStream.Value));
                    }
                }
            }
            catch
            {
                // Deliberate best effort: an unreadable or malformed PDF yields the content
                // gathered so far rather than an exception for the caller.
            }

            return new GenericFile(Builder.ToString(), Title, Meta);
        }
示例#2
0
        /// <summary>
        /// Scans the content streams of a page for a person name and an id number.
        /// </summary>
        /// <param name="page">The page whose content streams are searched.</param>
        /// <param name="idStartKey">Text that directly precedes the id number.</param>
        /// <param name="idEndKey">Text that directly follows the id number.</param>
        /// <param name="nameKey">Text that directly precedes the person name; the name runs up to the next "@".</param>
        /// <returns>
        /// The identifier as soon as its <c>isLoaded</c> flag is set after assigning the name or
        /// the id; otherwise <c>null</c>.
        /// </returns>
        private Identifier getIdentifier(PdfPage page, string idStartKey, string idEndKey, string nameKey)
        {
            Identifier identifier = new Identifier();

            for (int index = 0; index < page.Contents.Elements.Count; index++)
            {
                // Skip content elements that have no stream instead of failing on them.
                PdfDictionary dictionary = page.Contents.Elements.GetDictionary(index);
                PdfDictionary.PdfStream stream = dictionary == null ? null : dictionary.Stream;
                if (stream == null || stream.Value == null)
                {
                    continue;
                }

                var outputText = new PDFTextExtractor().ExtractTextFromPDFBytes(stream.Value);
                if (outputText == null)
                {
                    continue;
                }

                // --- person name: text between nameKey and the next "@" ---
                var searchNameIndex = outputText.IndexOf(nameKey);
                if (searchNameIndex > -1)
                {
                    var startIndex = searchNameIndex + nameKey.Length;
                    var attIndex   = outputText.IndexOf("@", searchNameIndex);

                    // Only extract when the "@" terminator exists and lies behind the key;
                    // otherwise the length would be negative and Substring would throw.
                    if (attIndex >= startIndex)
                    {
                        identifier.PersonName = outputText.Substring(startIndex, attIndex - startIndex);
                        if (identifier.isLoaded)
                        {
                            return(identifier);
                        }
                    }
                }

                // --- id number: text between idStartKey and idEndKey ---
                var searchIdIndex = outputText.IndexOf(idStartKey);
                if (searchIdIndex > -1)
                {
                    var endKeyIndex = outputText.IndexOf(idEndKey);

                    // NOTE(review): this compares the position of the *name* key (or -1 when the
                    // name was not found in this stream) with the end key, not the id start key.
                    // Behavior is kept as-is; confirm that this is the intended layout check.
                    if (searchNameIndex < endKeyIndex)
                    {
                        return(null);
                    }

                    if (endKeyIndex > -1)
                    {
                        var startIndex = searchIdIndex + idStartKey.Length;
                        var idLength   = endKeyIndex - startIndex;

                        // An end key located before the end of the start key cannot delimit an id.
                        if (idLength >= 0)
                        {
                            identifier.IdNumber = outputText.Substring(startIndex, idLength);
                            if (identifier.isLoaded)
                            {
                                return(identifier);
                            }
                        }
                    }
                }
            }
            return(null);
        }
示例#3
0
 /// <summary>
 /// Splits a PDF file into several files. A page whose text matches <paramref name="regex"/>
 /// is treated as the last page of a segment; the following page starts a new output file.
 /// Each segment is written next to the input file as "SplitPDF_&lt;guid&gt;.pdf".
 /// </summary>
 /// <param name="filepath">Path of the PDF file to split.</param>
 /// <param name="regex">Regular expression that is matched against the extracted text of every page.</param>
 public void SplitPDF(string filepath, string regex)
 {
     // Build the expression once instead of once per content stream.
     Regex  reg             = new Regex(regex);
     string outputDirectory = Path.GetDirectoryName(filepath) ?? string.Empty;

     using (PdfDocument pdf = PdfReader.Open(filepath, PdfDocumentOpenMode.Import))
     {
         PdfDocument neodoc           = new PdfDocument();
         bool        startNewDocument = false;

         foreach (PdfPage page in pdf.Pages)
         {
             // Close the previous segment BEFORE adding the current page, so the page that
             // opens a new segment does not end up in the previous file.
             if (startNewDocument)
             {
                 SaveSplitDocument(neodoc, outputDirectory);
                 neodoc           = new PdfDocument();
                 startNewDocument = false;
             }

             neodoc.AddPage(page);

             for (int index = 0; index < page.Contents.Elements.Count; index++)
             {
                 PdfDictionary dictionary = page.Contents.Elements.GetDictionary(index);
                 PdfDictionary.PdfStream stream = dictionary == null ? null : dictionary.Stream;
                 if (stream == null || stream.Value == null)
                 {
                     continue;
                 }

                 string outputText = new PDFTextExtractor().ExtractTextFromPDFBytes(stream.Value);
                 if (outputText != null && reg.IsMatch(outputText))
                 {
                     // One match is enough; the next page begins a new segment.
                     startNewDocument = true;
                     break;
                 }
             }
         }

         // The trailing segment was never written by the original loop.
         SaveSplitDocument(neodoc, outputDirectory);
     }
 }

 /// <summary>
 /// Saves a split segment under a unique name in the given directory; empty segments are skipped.
 /// </summary>
 private static void SaveSplitDocument(PdfDocument document, string directory)
 {
     if (document.PageCount == 0)
     {
         return;
     }

     string fileName = "SplitPDF_" + System.Guid.NewGuid().ToString() + ".pdf";
     document.Save(Path.Combine(directory, fileName));
 }
示例#4
0
        /// <summary>
        /// Reads PDF object from input stream.
        /// </summary>
        /// <param name="pdfObject">Either the instance of a derived type or null. If it is null
        /// an appropriate object is created.</param>
        /// <param name="objectID">The address of the object.</param>
        /// <param name="includeReferences">If true, specifies that all indirect objects
        /// are included recursively.</param>
        /// <returns>The object that was read, with its object ID set to <paramref name="objectID"/>.</returns>
        /// <exception cref="NotImplementedException">The first token of the object is a keyword or an unknown token.</exception>
        /// <exception cref="PdfReaderException">An array or dictionary object is not terminated by 'endobj'.</exception>
        public PdfObject ReadObject(PdfObject pdfObject, PdfObjectID objectID, bool includeReferences)
        {
            // Position the reader at the object and read the 'objectNumber generationNumber' header.
            MoveToObject(objectID);
            int objectNumber     = ReadInteger();
            int generationNumber = ReadInteger();

#if DEBUG
            // The following assertion sometime failed (see below)
            //Debug.Assert(objectID == new PdfObjectID(objectNumber, generationNumber));
            if (objectID != new PdfObjectID(objectNumber, generationNumber))
            {
                // A special kind of bug? Or is this an undocumented PDF feature?
                // PDF4NET 2.6 provides a sample called 'Unicode', which produces a file 'unicode.pdf'
                // The iref table of this file contains the following entries:
                //    iref
                //    0 148
                //    0000000000 65535 f
                //    0000000015 00000 n
                //    0000000346 00000 n
                //    ....
                //    0000083236 00000 n
                //    0000083045 00000 n
                //    0000083045 00000 n
                //    0000083045 00000 n
                //    0000083045 00000 n
                //    0000080334 00000 n
                //    ....
                // Object 84, 85, 86, and 87 maps to the same dictionary, but all PDF readers I tested
                // ignores this mismatch! The following assertion failed about 50 times with this file.
#if true_
                string message = String.Format("xref entry {0} {1} maps to object {2} {3}.",
                                               objectID.ObjectNumber, objectID.GenerationNumber, objectNumber, generationNumber);
                Debug.Assert(false, message);
#endif
            }
#endif
            // Always use object ID from iref table (see above)
            objectNumber     = objectID.ObjectNumber;
            generationNumber = objectID.GenerationNumber;
#if true_
            Debug.WriteLine(String.Format("obj: {0} {1}", objectNumber, generationNumber));
#endif
            ReadSymbol(Symbol.Obj);

            // Set for dictionaries only; it is merely asserted when a stream is found below.
            bool   checkForStream = false;
            Symbol symbol         = ScanNextToken();
            switch (symbol)
            {
            // Arrays and dictionaries leave the switch via 'break' and continue with the
            // stream/'endobj' handling below; all primitive objects consume 'endobj' and
            // return directly from their case.
            case Symbol.BeginArray:
                PdfArray array;
                if (pdfObject == null)
                {
                    array = new PdfArray(this.document);
                }
                else
                {
                    array = (PdfArray)pdfObject;
                }
                //PdfObject.RegisterObject(array, objectID, generation);
                pdfObject = ReadArray(array, includeReferences);
                pdfObject.SetObjectID(objectNumber, generationNumber);
                break;

            case Symbol.BeginDictionary:
                PdfDictionary dict;
                if (pdfObject == null)
                {
                    dict = new PdfDictionary(this.document);
                }
                else
                {
                    dict = (PdfDictionary)pdfObject;
                }
                //PdfObject.RegisterObject(dict, objectID, generation);
                checkForStream = true;
                pdfObject      = ReadDictionary(dict, includeReferences);
                pdfObject.SetObjectID(objectNumber, generationNumber);
                break;

            // Acrobat 6 Professional proudly presents: The Null object!
            // Even with a one-digit object number an indirect reference «x 0 R» to this object is
            // one character larger than the direct use of «null». Probably this is the reason why
            // it is true that Acrobat Web Capture 6.0 creates this object, but obviously never
            // creates a reference to it!
            case Symbol.Null:
                pdfObject = new PdfNullObject(this.document);
                pdfObject.SetObjectID(objectNumber, generationNumber);
                ReadSymbol(Symbol.EndObj);
                return(pdfObject);

            case Symbol.Boolean:
                // The value is true when the token equals Boolean.TrueString, ignoring case.
                pdfObject = new PdfBooleanObject(this.document, string.Compare(this.lexer.Token, Boolean.TrueString, true) == 0); //!!!mod THHO 19.11.09
                pdfObject.SetObjectID(objectNumber, generationNumber);
                ReadSymbol(Symbol.EndObj);
                return(pdfObject);

            case Symbol.Integer:
                pdfObject = new PdfIntegerObject(this.document, this.lexer.TokenToInteger);
                pdfObject.SetObjectID(objectNumber, generationNumber);
                ReadSymbol(Symbol.EndObj);
                return(pdfObject);

            case Symbol.UInteger:
                pdfObject = new PdfUIntegerObject(this.document, this.lexer.TokenToUInteger);
                pdfObject.SetObjectID(objectNumber, generationNumber);
                ReadSymbol(Symbol.EndObj);
                return(pdfObject);

            case Symbol.Real:
                pdfObject = new PdfRealObject(this.document, this.lexer.TokenToReal);
                pdfObject.SetObjectID(objectNumber, generationNumber);
                ReadSymbol(Symbol.EndObj);
                return(pdfObject);

            case Symbol.String:
                pdfObject = new PdfStringObject(this.document, this.lexer.Token);
                pdfObject.SetObjectID(objectNumber, generationNumber);
                ReadSymbol(Symbol.EndObj);
                return(pdfObject);

            case Symbol.Name:
                pdfObject = new PdfNameObject(this.document, this.lexer.Token);
                pdfObject.SetObjectID(objectNumber, generationNumber);
                ReadSymbol(Symbol.EndObj);
                return(pdfObject);

            case Symbol.Keyword:
                // Should not come here anymore
                throw new NotImplementedException("Keyword");

            default:
                // Should not come here anymore
                throw new NotImplementedException("unknown token \"" + symbol + "\"");
            }
            // Only arrays and dictionaries get here. The next token is either the start of a
            // stream that belongs to the object or the closing 'endobj'.
            symbol = ScanNextToken();
            if (symbol == Symbol.BeginStream)
            {
                PdfDictionary dict = (PdfDictionary)pdfObject;
                Debug.Assert(checkForStream, "Unexpected stream...");
                // Read the raw stream bytes using the length obtained from the dictionary.
                int    length = GetStreamLength(dict);
                byte[] bytes  = this.lexer.ReadStream(length);
#if true_
                if (dict.Elements.GetString("/Filter") == "/FlateDecode")
                {
                    if (dict.Elements["/Subtype"] == null)
                    {
                        try
                        {
                            byte[] decoded = Filtering.FlateDecode.Decode(bytes);
                            if (decoded.Length == 0)
                            {
                                goto End;
                            }
                            string pageContent = Filtering.FlateDecode.DecodeToString(bytes);
                            if (pageContent.Length > 100)
                            {
                                pageContent = pageContent.Substring(pageContent.Length - 100);
                            }
                            pageContent.GetType();
                            bytes = decoded;
                            dict.Elements.Remove("/Filter");
                            dict.Elements.SetInteger("/Length", bytes.Length);
                        }
                        catch
                        {
                        }
                    }
                    End :;
                }
#endif
                // Attach the stream to its dictionary, then expect 'endstream' followed by 'endobj'.
                PdfDictionary.PdfStream stream = new PdfDictionary.PdfStream(bytes, dict);
                dict.Stream = stream;
                ReadSymbol(Symbol.EndStream);
                symbol = ScanNextToken();
            }
            if (symbol != Symbol.EndObj)
            {
                throw new PdfReaderException(PSSR.UnexpectedToken(this.lexer.Token));
            }
            return(pdfObject);
        }
示例#5
0
文件: Parser.cs 项目: Sl0vi/PDFsharp
        //public PdfObject ReadObject(PdfObject obj, bool includeReferences)

        /// <summary>
        /// Consumes the stream that follows a dictionary and attaches it to that dictionary.
        /// The lexer must currently be positioned on the 'stream' token; on return the token
        /// after 'endstream' has been scanned.
        /// </summary>
        /// <param name="dict">The dictionary that receives the stream; it must not have one yet.</param>
        private void ReadStream(PdfDictionary dict)
        {
            Symbol current = _lexer.Symbol;
            Debug.Assert(current == Symbol.BeginStream);

            // The byte count is obtained from the dictionary before reading the raw data.
            int byteCount = GetStreamLength(dict);
            byte[] content = _lexer.ReadStream(byteCount);

            var pdfStream = new PdfDictionary.PdfStream(content, dict);
            Debug.Assert(dict.Stream == null, "Dictionary already has a stream.");
            dict.Stream = pdfStream;

            // Expect 'endstream' and advance to the token behind it.
            ReadSymbol(Symbol.EndStream);
            ScanNextToken();
        }
示例#6
0
文件: Parser.cs 项目: Sl0vi/PDFsharp
        /// <summary>
        /// Reads PDF object from input stream.
        /// </summary>
        /// <param name="pdfObject">Either the instance of a derived type or null. If it is null
        /// an appropriate object is created.</param>
        /// <param name="objectID">The address of the object.</param>
        /// <param name="includeReferences">If true, specifies that all indirect objects
        /// are included recursively.</param>
        /// <param name="fromObjecStream">If true, the object is parsed from an object stream. In that
        /// case the 'n g obj' header and the closing 'endobj' are neither read nor checked.</param>
        /// <returns>The object that was read, with its object ID set to <paramref name="objectID"/>.</returns>
        public PdfObject ReadObject(PdfObject pdfObject, PdfObjectID objectID, bool includeReferences, bool fromObjecStream)
        {
#if DEBUG_
            Debug.WriteLine("ReadObject: " + objectID);
            if (objectID.ObjectNumber == 20)
                GetType();
#endif
            int objectNumber = objectID.ObjectNumber;
            int generationNumber = objectID.GenerationNumber;
            // Only objects read directly from the file are positioned via MoveToObject and
            // carry their own 'objectNumber generationNumber' header.
            if (!fromObjecStream)
            {
                MoveToObject(objectID);
                objectNumber = ReadInteger();
                generationNumber = ReadInteger();
            }
#if DEBUG
            // The following assertion sometime failed (see below)
            //Debug.Assert(objectID == new PdfObjectID(objectNumber, generationNumber));
            if (!fromObjecStream && objectID != new PdfObjectID(objectNumber, generationNumber))
            {
                // A special kind of bug? Or is this an undocumented PDF feature?
                // PDF4NET 2.6 provides a sample called 'Unicode', which produces a file 'unicode.pdf'
                // The iref table of this file contains the following entries:
                //    iref
                //    0 148
                //    0000000000 65535 f
                //    0000000015 00000 n
                //    0000000346 00000 n
                //    ....
                //    0000083236 00000 n
                //    0000083045 00000 n
                //    0000083045 00000 n
                //    0000083045 00000 n
                //    0000083045 00000 n
                //    0000080334 00000 n
                //    ....
                // Object 84, 85, 86, and 87 maps to the same dictionary, but all PDF readers I tested
                // ignores this mismatch! The following assertion failed about 50 times with this file.
#if true_
                string message = String.Format("xref entry {0} {1} maps to object {2} {3}.",
                    objectID.ObjectNumber, objectID.GenerationNumber, objectNumber, generationNumber);
                Debug.Assert(false, message);
#endif
            }
#endif
            // Always use object ID from iref table (see above).
            objectNumber = objectID.ObjectNumber;
            generationNumber = objectID.GenerationNumber;
#if true_
            Debug.WriteLine(String.Format("obj: {0} {1}", objectNumber, generationNumber));
#endif
            if (!fromObjecStream)
                ReadSymbol(Symbol.Obj);

            // Set for dictionaries only; it is merely asserted when a stream is found below.
            bool checkForStream = false;
            Symbol symbol = ScanNextToken();
            switch (symbol)
            {
                // Arrays and dictionaries leave the switch via 'break' and continue with the
                // stream/'endobj' handling below; all primitive objects return directly from
                // their case (after consuming 'endobj' when not read from an object stream).
                case Symbol.BeginArray:
                    PdfArray array;
                    if (pdfObject == null)
                        array = new PdfArray(_document);
                    else
                        array = (PdfArray)pdfObject;
                    //PdfObject.RegisterObject(array, objectID, generation);
                    pdfObject = ReadArray(array, includeReferences);
                    pdfObject.SetObjectID(objectNumber, generationNumber);
                    break;

                case Symbol.BeginDictionary:
                    PdfDictionary dict;
                    if (pdfObject == null)
                        dict = new PdfDictionary(_document);
                    else
                        dict = (PdfDictionary)pdfObject;
                    //PdfObject.RegisterObject(dict, objectID, generation);
                    checkForStream = true;
                    pdfObject = ReadDictionary(dict, includeReferences);
                    pdfObject.SetObjectID(objectNumber, generationNumber);
                    break;

                // Acrobat 6 Professional proudly presents: The Null object!
                // Even with a one-digit object number an indirect reference «x 0 R» to this object is
                // one character larger than the direct use of «null». Probably this is the reason why
                // it is true that Acrobat Web Capture 6.0 creates this object, but obviously never
                // creates a reference to it!
                case Symbol.Null:
                    pdfObject = new PdfNullObject(_document);
                    pdfObject.SetObjectID(objectNumber, generationNumber);
                    if (!fromObjecStream)
                        ReadSymbol(Symbol.EndObj);
                    return pdfObject;

                case Symbol.Boolean:
                    // The value is true when the token equals Boolean.TrueString, ignoring case.
                    pdfObject = new PdfBooleanObject(_document, String.Compare(_lexer.Token, Boolean.TrueString, StringComparison.OrdinalIgnoreCase) == 0);
                    pdfObject.SetObjectID(objectNumber, generationNumber);
                    if (!fromObjecStream)
                        ReadSymbol(Symbol.EndObj);
                    return pdfObject;

                case Symbol.Integer:
                    pdfObject = new PdfIntegerObject(_document, _lexer.TokenToInteger);
                    pdfObject.SetObjectID(objectNumber, generationNumber);
                    if (!fromObjecStream)
                        ReadSymbol(Symbol.EndObj);
                    return pdfObject;

                case Symbol.UInteger:
                    pdfObject = new PdfUIntegerObject(_document, _lexer.TokenToUInteger);
                    pdfObject.SetObjectID(objectNumber, generationNumber);
                    if (!fromObjecStream)
                        ReadSymbol(Symbol.EndObj);
                    return pdfObject;

                case Symbol.Real:
                    pdfObject = new PdfRealObject(_document, _lexer.TokenToReal);
                    pdfObject.SetObjectID(objectNumber, generationNumber);
                    if (!fromObjecStream)
                        ReadSymbol(Symbol.EndObj);
                    return pdfObject;

                case Symbol.String:
                    pdfObject = new PdfStringObject(_document, _lexer.Token);
                    pdfObject.SetObjectID(objectNumber, generationNumber);
                    if (!fromObjecStream)
                        ReadSymbol(Symbol.EndObj);
                    return pdfObject;

                case Symbol.Name:
                    pdfObject = new PdfNameObject(_document, _lexer.Token);
                    pdfObject.SetObjectID(objectNumber, generationNumber);
                    if (!fromObjecStream)
                        ReadSymbol(Symbol.EndObj);
                    return pdfObject;

                case Symbol.Keyword:
                    // Should not come here anymore.
                    // NOTE(review): if HandleUnexpectedToken returns, execution continues below
                    // with pdfObject possibly still null - confirm whether it always throws.
                    ParserDiagnostics.HandleUnexpectedToken(_lexer.Token);
                    break;

                default:
                    // Should not come here anymore.
                    ParserDiagnostics.HandleUnexpectedToken(_lexer.Token);
                    break;
            }
            // The next token is either the start of a stream that belongs to the object
            // or the closing 'endobj'.
            symbol = ScanNextToken();
            if (symbol == Symbol.BeginStream)
            {
                PdfDictionary dict = (PdfDictionary)pdfObject;
                Debug.Assert(checkForStream, "Unexpected stream...");
#if true_
                ReadStream(dict);
#else
                // Read the raw stream bytes using the length obtained from the dictionary.
                int length = GetStreamLength(dict);
                byte[] bytes = _lexer.ReadStream(length);
#if true_
                if (dict.Elements.GetString("/Filter") == "/FlateDecode")
                {
                    if (dict.Elements["/Subtype"] == null)
                    {
                        try
                        {
                            byte[] decoded = Filtering.FlateDecode.Decode(bytes);
                            if (decoded.Length == 0)
                                goto End;
                            string pageContent = Filtering.FlateDecode.DecodeToString(bytes);
                            if (pageContent.Length > 100)
                                pageContent = pageContent.Substring(pageContent.Length - 100);
                            pageContent.GetType();
                            bytes = decoded;
                            dict.Elements.Remove("/Filter");
                            dict.Elements.SetInteger("/Length", bytes.Length);
                        }
                        catch
                        {
                        }
                    }
                End: ;
                }
#endif
                // Attach the stream to its dictionary, then expect 'endstream' and scan the next token.
                PdfDictionary.PdfStream stream = new PdfDictionary.PdfStream(bytes, dict);
                dict.Stream = stream;
                ReadSymbol(Symbol.EndStream);
                symbol = ScanNextToken();
#endif
            }
            // Objects inside an object stream are not terminated by 'endobj', so the check is skipped for them.
            if (!fromObjecStream && symbol != Symbol.EndObj)
                ParserDiagnostics.ThrowParserException(PSSR.UnexpectedToken(_lexer.Token));
            return pdfObject;
        }
示例#7
0
        /// <summary>
        /// Loads every input PDF and collects per-page data into the local caches:
        /// page count and aspect ratio per document, and for each page a thumbnail
        /// (raw bitmap bytes), the extracted text and the hyperlinks / e-mail addresses
        /// found in that text. Pages are rendered to PNG files by the external
        /// "pdfdraw.exe", which is started in the directory of the executing assembly.
        /// </summary>
        private void Initialize()
        {
            // find plugins folder
            string pluginPath = Path.GetDirectoryName(System.Reflection.Assembly.GetExecutingAssembly().Location);

            // Build both expressions once instead of once per page.
            // NOTE(review): the "&amp;" inside the URL pattern looks like HTML-escaping residue;
            // it is runtime text and therefore left unchanged here.
            System.Text.RegularExpressions.Regex linkRegex = new System.Text.RegularExpressions.Regex(
                @"(http|ftp|https):\/\/[\w\-_]+(\.[\w\-_]+)+([\w\-\.,@?^=%&amp;:/~\+#]*[\w\-\@?^=%&amp;/~\+#])");
            System.Text.RegularExpressions.Regex mailRegex = new System.Text.RegularExpressions.Regex(
                @"[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*@(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?");

            pagesCount = 0;
            for (int i = 0; i < FFileNameIn.SliceCount; i++)
            {
                // open and parse each .pdf; the document is released even if a later step fails
                using (PdfDocument document = PdfReader.Open(FFileNameIn[i], PdfDocumentOpenMode.ReadOnly))
                {
                    FSpreadCountLocal.Add(document.PageCount);
                    FScaleFactorLocal.Add(document.Pages[0].Width / document.Pages[0].Height);

                    // render .png files for each page ("%d" is replaced by pdfdraw with the page number)
                    string pngName = i + "_%d.png";
                    using (Process exe = new System.Diagnostics.Process())
                    {
                        exe.StartInfo.UseShellExecute  = true;
                        exe.StartInfo.FileName         = "pdfdraw.exe";
                        exe.StartInfo.WorkingDirectory = pluginPath;
                        exe.StartInfo.Arguments        = "-o " + pngName + " -r 144 \"" + FFileNameIn[i] + "\" ";
                        exe.StartInfo.ErrorDialog      = false;
                        exe.StartInfo.CreateNoWindow   = true;
                        exe.Start();
                        exe.WaitForExit();
                    }

                    for (int j = 0; j < FSpreadCountLocal[i]; j++)
                    {
                        // get image (page files are numbered starting at 1)
                        string curPngName = i + "_" + (j + 1) + ".png";
                        string curPngPath = Path.Combine(pluginPath, curPngName);

                        Image texImg;
                        using (Image img = Image.FromFile(curPngPath))
                        {
                            Image.GetThumbnailImageAbort myCallBack = new Image.GetThumbnailImageAbort(ThumbnailCallback);
                            texImg = img.GetThumbnailImage(FWeightIn[0], FWeightIn[0], myCallBack, IntPtr.Zero);
                        }
                        // The source image must be disposed before its file can be deleted.
                        File.Delete(curPngPath);

                        // create a byte-array for a picture content
                        using (texImg)
                        using (MemoryStream ms = new MemoryStream())
                        {
                            texImg.Save(ms, System.Drawing.Imaging.ImageFormat.Bmp);

                            // Skip the first 54 bytes (presumably the BMP file + info header)
                            // so that only the pixel data is kept.
                            ms.Seek(54, SeekOrigin.Begin);
                            byte[] bytes = new byte[(int)ms.Length - 54];
                            ms.Read(bytes, 0, bytes.Length);
                            texBytes.Add(bytes);
                        }

                        // get text
                        System.Text.StringBuilder sb = new System.Text.StringBuilder();
                        PdfPage curPage = document.Pages[j];
                        for (int index = 0; index < curPage.Contents.Elements.Count; index++)
                        {
                            // Content elements without a stream carry no text.
                            PdfDictionary contentDictionary = curPage.Contents.Elements.GetDictionary(index);
                            PdfDictionary.PdfStream stream = contentDictionary == null ? null : contentDictionary.Stream;
                            if (stream == null || stream.Value == null)
                            {
                                continue;
                            }
                            sb.Append(new PDFParser().ExtractTextFromPDFBytes(stream.Value));
                        }
                        string pageText = sb.ToString();
                        FStringLocal.Add(pageText);

                        // parse text and find hyperlinks, then email addresses;
                        // every hit is followed by a single space
                        System.Text.StringBuilder hyperLinks = new System.Text.StringBuilder();
                        foreach (System.Text.RegularExpressions.Match match in linkRegex.Matches(pageText))
                        {
                            hyperLinks.Append(match.Value).Append(' ');
                        }
                        foreach (System.Text.RegularExpressions.Match match in mailRegex.Matches(pageText))
                        {
                            hyperLinks.Append(match.Value).Append(' ');
                        }
                        FHTTPLocal.Add(hyperLinks.ToString());
                    }

                    pagesCount += FSpreadCountLocal[i];
                }
            }
        }