Пример #1
0
        /// <summary>
        /// Build the summary of a PDF indirect object and return it as text.
        /// </summary>
        /// <param name="ReaderObject">Indirect object whose summary is requested.</param>
        /// <returns>
        /// The bytes the object wrote into an <c>OutputCtrl</c>, converted to a
        /// string by <c>ByteArrayToString</c>.
        /// </returns>
        public static string ObjectSummary(PdfIndirectObject ReaderObject)
        {
            // the object writes its own summary into the output control
            OutputCtrl Output = new OutputCtrl();
            ReaderObject.ObjectSummary(Output);

            // hand the collected bytes to the byte-array-to-text converter
            var SummaryBytes = Output.ToArray();
            return ByteArrayToString(SummaryBytes);
        }
Пример #2
0
        /// <summary>
        /// Open a PDF file, walk its <c>/Pages</c> objects and add one row to
        /// <c>dataGridView1</c> for every annotation found on the listed pages.
        /// </summary>
        /// <remarks>
        /// Object relations are recovered by scraping the text returned by
        /// <c>Reports.ObjectSummary</c>; the exact layout of that text is defined
        /// outside this method. A page whose summary cannot be scraped is skipped
        /// and a trace warning is written. Any other failure is shown to the user
        /// in a message box. The reader is always disposed before
        /// <c>CreateOutput</c> runs or the error is reported.
        /// </remarks>
        /// <param name="PDFInput">PDF file name handed to <c>PdfReader.OpenPdfFile</c> and <c>CreateOutput</c>.</param>
        private void Parse(string PDFInput)
        {
            try
            {
                dataGridView1.Rows.Clear();
                PdfFileAnalyzer.PdfReader Reader = new PdfFileAnalyzer.PdfReader();

                try
                {
                    Reader.OpenPdfFile(PDFInput);

                    // NOTE(review): the counter only advances after the page's
                    // "Annots" reference was extracted, so pages that fail that step
                    // are not counted. Kept as in the original; confirm this is intended.
                    int page_number = 0;

                    for (int Index = Reader.ObjectArray.Length - 1; Index >= 0; Index--)
                    {
                        PdfFileAnalyzer.PdfIndirectObject PagesObject = Reader.ObjectArray[Index];

                        // only page tree nodes are of interest
                        if (PagesObject == null || PagesObject.PdfObjectType != "/Pages")
                        {
                            continue;
                        }

                        string PagesSummary = Reports.ObjectSummary(PagesObject).Replace(System.Environment.NewLine, "");

                        foreach (string PageId in ExtractReferenceNumbers(PagesSummary))
                        {
                            try
                            {
                                PdfFileAnalyzer.PdfIndirectObject PageObject = Reader.ObjectArray[Convert.ToInt32(PageId)];
                                string PageSummary = Reports.ObjectSummary(PageObject).Replace(System.Environment.NewLine, "");

                                // "key : ".Length is 6: the slice starts on the separator that
                                // follows "Annots" and the leftover blanks are stripped below
                                int AnnotsFrom = PageSummary.IndexOf("Annots ") + "key : ".Length;
                                int AnnotsTo   = PageSummary.LastIndexOf(" 0 R/Contents");
                                string AnnotsId = PageSummary.Substring(AnnotsFrom, AnnotsTo - AnnotsFrom).Replace(" ", "");
                                page_number = page_number + 1;

                                // the object listing the page's annotations
                                PdfFileAnalyzer.PdfIndirectObject AnnotsObject = Reader.ObjectArray[Convert.ToInt32(AnnotsId)];
                                string AnnotsSummary = Reports.ObjectSummary(AnnotsObject).Replace(System.Environment.NewLine, "");

                                foreach (string AnnotationId in ExtractReferenceNumbers(AnnotsSummary))
                                {
                                    AddAnnotationRow(Reader, AnnotationId, page_number);
                                }
                            }
                            catch (Exception PageError)
                            {
                                // best effort: skip what cannot be scraped, but leave a trace
                                // instead of hiding the failure completely
                                System.Diagnostics.Trace.TraceWarning("Parse: skipped object " + PageId + " of " + PDFInput + ": " + PageError.Message);
                            }
                        }
                    }
                }
                finally
                {
                    // release the file even when the scan fails
                    Reader.Dispose();
                }

                CreateOutput(PDFInput);
            }
            catch (Exception ex)
            {
                MessageBox.Show(ex.ToString());
            }
        }

        /// <summary>
        /// Return the object numbers found in the first bracketed list of an object summary.
        /// </summary>
        /// <param name="Summary">Object summary text with line breaks already removed.</param>
        /// <returns>The list items with their " 0 R" reference suffix removed.</returns>
        private static string[] ExtractReferenceNumbers(string Summary)
        {
            string ListText = Regex.Match(Summary, @"\[([^)]*)\]").Groups[1].Value;
            string Separated = ListText.Replace(" 0 R ", ",").Replace(" 0 R", ",");
            return Separated.Split(new[] { ',' }, System.StringSplitOptions.RemoveEmptyEntries);
        }

        /// <summary>
        /// Read one annotation object and its referenced stream and add the matching grid row.
        /// </summary>
        /// <param name="Reader">Open PDF reader.</param>
        /// <param name="AnnotationId">Object number of the annotation, as text.</param>
        /// <param name="PageNumber">Page counter value shown in the row.</param>
        private void AddAnnotationRow(PdfFileAnalyzer.PdfReader Reader, string AnnotationId, int PageNumber)
        {
            PdfFileAnalyzer.PdfIndirectObject AnnotationObject = Reader.ObjectArray[Convert.ToInt32(AnnotationId)];
            string RawSummary  = Reports.ObjectSummary(AnnotationObject);
            string FlatSummary = RawSummary.Replace(System.Environment.NewLine, "");

            // object number that follows "sData " up to the first " 0 R"
            int StreamFrom = FlatSummary.IndexOf("sData ") + "key : ".Length;
            int StreamTo   = FlatSummary.IndexOf(" 0 R");
            string StreamId = FlatSummary.Substring(StreamFrom, StreamTo - StreamFrom).Replace(" ", "");

            // first bracketed list of the summary, used as the annotation rectangle
            string RectText = Regex.Match(RawSummary, @"\[(.*?)\]").Groups[1].Value;

            // read and decompress the referenced stream
            PdfFileAnalyzer.PdfIndirectObject StreamObject = Reader.ObjectArray[Convert.ToInt32(StreamId)];
            StreamByteArray = StreamObject.ReadStream();
            StreamByteArray = StreamObject.DecompressStream(StreamByteArray);
            string StreamText = Reports.ByteArrayToString(StreamByteArray);

            if (StreamText.Contains("STARTTAG"))
            {
                // strip line breaks and punctuation, then take the text between the
                // tags up to the first ':' as field name
                string Flattened = Regex.Replace(StreamText, @"\t|\n|\r", "");
                string Cleaned   = Flattened.Replace(".", "").Replace("!", "").Replace("%", "").Replace("(", "").Replace(")", "").Replace("'", "").Replace("&", "").Replace("#", "").Replace("$", "");

                // literal "STARTTAG": the former pattern began with \S, which matches
                // any non-whitespace character rather than the letter S
                string Tagged    = Regex.Match(Cleaned, @"STARTTAG(.+)ENDTAG").Groups[1].Value;
                string FieldName = Tagged.Split(new char[] { ':' })[0];

                AddFieldRow(ValidateMe(FieldName.Trim()), RectText, PageNumber);
            }
            else
            {
                AddFieldRow(ToCamelCase(StreamText), RectText, PageNumber);
            }
        }

        /// <summary>
        /// Append one row to <c>dataGridView1</c>: name, rectangle text, page number and a kind code.
        /// </summary>
        /// <param name="FieldName">Value for the first cell.</param>
        /// <param name="RectText">Four blank separated numbers (llx lly urx ury).</param>
        /// <param name="PageNumber">Page counter value for the third cell.</param>
        private void AddFieldRow(object FieldName, string RectText, int PageNumber)
        {
            // rectangles wider than this get "TE", all others "CB"
            // (presumably text entry versus check box - confirm)
            const float WidthLimit = 15;

            DataGridViewRow Row = (DataGridViewRow)dataGridView1.Rows[0].Clone();
            Row.Cells[0].Value = FieldName;
            Row.Cells[1].Value = RectText;
            Row.Cells[2].Value = Convert.ToString(PageNumber);

            // PDF numbers always use '.' as decimal separator, so parse with the
            // invariant culture; all four values are parsed so that a malformed
            // rectangle is still rejected
            string[] Parts = RectText.Split(' ');
            float[] Rect = new float[4];
            for (int Corner = 0; Corner < Rect.Length; Corner++)
            {
                Rect[Corner] = float.Parse(Parts[Corner], System.Globalization.CultureInfo.InvariantCulture);
            }

            Row.Cells[3].Value = (Rect[2] - Rect[0]) > WidthLimit ? "TE" : "CB";

            dataGridView1.Rows.Add(Row);
        }
        ////////////////////////////////////////////////////////////////////
        // Process objects stream
        // Reads this object's stream, parses the table of (object number,
        // offset) pairs at its start and loads every listed object that is
        // still marked free into the reader's object array.
        // Throws ApplicationException if /N or /First is missing or the
        // table is invalid.
        ////////////////////////////////////////////////////////////////////

        internal void ProcessObjectsStream()
        {
            // read decrypt and decompress the stream
            byte[] ByteArray = ReadStream();
            ByteArray = DecompressStream(ByteArray);

            // get the count of objects in this cross reference object stream
            if (!Dictionary.FindValue("/N").GetInteger(out int ObjectCount) || ObjectCount <= 0)
            {
                throw new ApplicationException("Object stream: count (/N) is missing");
            }

            // get first byte offset
            if (!Dictionary.FindValue("/First").GetInteger(out int FirstPos))
            {
                throw new ApplicationException("Object stream: first byte offset (/First) is missing");
            }

            // get /Extends (must be a reference)
            PdfBase Extends = Dictionary.FindValue("/Extends");

            if (Extends.IsReference)
            {
                ParentObjectNo = ((PdfReference)Extends).ObjectNumber;
            }

            // create temp array of child objects
            // entries stay null for objects that are not loaded from this stream
            PdfIndirectObject[] Children = new PdfIndirectObject[ObjectCount];

            // read all byte offset array
            PdfByteArrayParser PC = new PdfByteArrayParser(Reader, ByteArray, false);

            PC.ReadFirstChar();
            for (int Index = 0; Index < ObjectCount; Index++)
            {
                // object number
                if (!PC.ParseNextItem().GetInteger(out int ObjNo))
                {
                    throw new ApplicationException("Cross reference object stream: object number error");
                }

                // object offset
                if (!PC.ParseNextItem().GetInteger(out int ObjPos))
                {
                    throw new ApplicationException("Cross reference object stream: object offset error");
                }

                // the object number comes from file data: range check it before
                // using it as an index so that a malformed file is reported with
                // the same exception type as the other table errors
                if (ObjNo < 0 || ObjNo >= Reader.ObjectArray.Length)
                {
                    throw new ApplicationException("Cross reference object stream: object number out of range");
                }

                // find object
                PdfIndirectObject ReadObject = Reader.ObjectArray[ObjNo];
                if (ReadObject == null)
                {
                    throw new ApplicationException("Cross reference object stream: object not found");
                }

                // object is free
                if (ReadObject.ObjectType == ObjectType.Free)
                {
                    // save child
                    Children[Index] = ReadObject;

                    // save position (offsets in the table are relative to /First)
                    ReadObject.FilePosition = FirstPos + ObjPos;
                }
            }

            // copy the object from the stream to the corresponding indirect object
            for (int Index = 0; Index < ObjectCount; Index++)
            {
                // shortcut
                PdfIndirectObject Child = Children[Index];

                // object was loaded by later update
                if (Child == null)
                {
                    continue;
                }

                // parse the object value at its position within the stream
                PC.SetPos(Child.FilePosition);
                PC.ReadFirstChar();
                PdfBase Obj = PC.ParseNextItem();

                // we have a dictionary
                if (Obj.IsDictionary)
                {
                    // set object value type to dictionary
                    Child.ObjectType = ObjectType.Dictionary;
                    Child.Dictionary = (PdfDictionary)Obj;

                    // set object type if available in the dictionary
                    string ObjectTypeStr = Child.Dictionary.FindValue("/Type").ToName;

                    // set special object
                    if (ObjectTypeStr != null)
                    {
                        Child._PdfObjectType = ObjectTypeStr;
                    }
                }

                // we have other type of object
                // note: stream object is not allowed
                else
                {
                    // set object value type to other and keep the parsed value
                    Child.ObjectType = ObjectType.Other;
                    Child.Value      = Obj;
                }
            }
        }
        ////////////////////////////////////////////////////////////////////
        // Resolve the stream length
        // The /Length entry is either an integer or a reference to an
        // indirect object holding the integer.
        // This method must run after ReadObject was run for all objects
        ////////////////////////////////////////////////////////////////////
        internal void GetStreamLength()
        {
            PdfBase LengthEntry = Dictionary.FindValue("/Length");

            if (LengthEntry.IsReference)
            {
                // length is stored in another indirect object
                PdfIndirectObject Target = Reader.ToPdfIndirectObject((PdfReference)LengthEntry);

                bool TargetIsInteger = Target != null && Target.ObjectType == ObjectType.Other && Target.Value.IsInteger;
                if (TargetIsInteger)
                {
                    StreamLength = ((PdfInteger)Target.Value).IntValue;
                }

                // store the current stream length in the dictionary in place of the reference
                Dictionary.AddInteger("/Length", StreamLength);
            }
            else if (LengthEntry.IsInteger)
            {
                // length is given directly
                StreamLength = ((PdfInteger)LengthEntry).IntValue;
            }

            // nothing to verify for an empty stream or an unresolved length
            if (StreamLength == 0)
            {
                return;
            }

            // The stream might extend beyond the file boundary.
            // HP Scanners Scanned PDF does not conform to PDF standards
            // https://www.google.com/search?client=firefox-b-d&q=hp+officejet+PDF+scan+files+not+standard
            bool EndVerified;
            try
            {
                // move to the first byte after the stream data
                Reader.SetFilePosition(StreamFilePosition + StreamLength);
                Reader.ParseFile.ReadFirstChar();

                // the next two tokens must be endstream followed by endobj;
                // the second token is only read when the first one matched
                EndVerified = Reader.ParseFile.ParseNextItem().ToKeyWord == KeyWord.EndStream
                    && Reader.ParseFile.ParseNextItem().ToKeyWord == KeyWord.EndObj;
            }
            catch
            {
                // positioning or parsing failed
                EndVerified = false;
            }

            // the length cannot be trusted: clear it and flag the file
            if (!EndVerified)
            {
                StreamLength          = 0;
                Reader.InvalidPdfFile = true;
            }
        }
        /// <summary>
        /// Build contents array for PdfPage
        /// </summary>
        /// <remarks>
        /// Sets <c>ContentsArray</c> to the page's contents streams: empty when
        /// the page has no <c>/Contents</c> entry, one item when the entry
        /// refers to a single stream, otherwise one item per array element.
        /// Each contents stream is marked with the object type "/Contents".
        /// When the array form is invalid, nothing is modified.
        /// </remarks>
        /// <exception cref="ApplicationException">
        /// The object is not a page, <c>/Contents</c> is not an array, or an
        /// array item is not a reference to a stream.
        /// </exception>
        public void BuildContentsArray()
        {
            // must be a page
            if (PdfObjectType != "/Page")
            {
                throw new ApplicationException("Build contents array: Object must be page");
            }

            // get Contents dictionary value
            PdfBase ContentsValue = Dictionary.FindValue("/Contents");

            // page is blank no contents
            if (ContentsValue.IsEmpty)
            {
                ContentsArray = new PdfIndirectObject[0];
                return;
            }

            // test if contents value is a reference
            if (ContentsValue.IsReference)
            {
                // find the object with Object number
                PdfIndirectObject IndirectObject = Reader.ToPdfIndirectObject((PdfReference)ContentsValue);
                if (IndirectObject != null)
                {
                    // the object is a stream return array with one contents object
                    if (IndirectObject.ObjectType == ObjectType.Stream)
                    {
                        IndirectObject._PdfObjectType = "/Contents";
                        ContentsArray = new PdfIndirectObject[] { IndirectObject };
                        return;
                    }

                    // read object must be an array
                    if (IndirectObject.ObjectType == ObjectType.Other)
                    {
                        ContentsValue = IndirectObject.Value;
                    }
                }
            }

            // test if contents value is an array
            if (!ContentsValue.IsArray)
            {
                throw new ApplicationException("Build contents array: /Contents must be array");
            }

            // array of reference numbers to contents objects
            PdfBase[] ReferenceArray = ((PdfArray)ContentsValue).ArrayItems;

            // collect into a local array first so that a validation failure
            // does not leave ContentsArray half filled with null entries
            PdfIndirectObject[] ContentsStreams = new PdfIndirectObject[ReferenceArray.Length];

            // verify that all array items are references to streams
            for (int Index = 0; Index < ReferenceArray.Length; Index++)
            {
                // shortcut
                PdfBase ContentsRef = ReferenceArray[Index];

                // each item must be a reference
                if (!ContentsRef.IsReference)
                {
                    throw new ApplicationException("Build contents array: Array item must be reference");
                }

                // get read object
                PdfIndirectObject Contents = Reader.ToPdfIndirectObject((PdfReference)ContentsRef);

                // the object is not a stream
                if (Contents == null || Contents.ObjectType != ObjectType.Stream)
                {
                    throw new ApplicationException("Build contents array: Contents must be a stream");
                }

                ContentsStreams[Index] = Contents;
            }

            // everything is valid: mark the streams as page's contents and publish the result
            foreach (PdfIndirectObject ContentsStream in ContentsStreams)
            {
                ContentsStream._PdfObjectType = "/Contents";
            }

            ContentsArray = ContentsStreams;
        }