/// <summary>
/// Build the textual summary of one PDF indirect object.
/// </summary>
/// <param name="ReaderObject">Indirect object whose summary is requested.</param>
/// <returns>
/// The summary as a string: the object writes its summary into an output
/// buffer and the buffer contents are converted with ByteArrayToString.
/// </returns>
public static string ObjectSummary(PdfIndirectObject ReaderObject)
{
    // the object renders its own summary into this buffer
    OutputCtrl summaryBuffer = new OutputCtrl();
    ReaderObject.ObjectSummary(summaryBuffer);

    // convert the collected output to a string for the caller
    var summaryBytes = summaryBuffer.ToArray();
    return ByteArrayToString(summaryBytes);
}
/// <summary>
/// Open a PDF file, walk every /Pages object, and add one grid row per
/// annotation found on each page listed in it.
/// </summary>
/// <remarks>
/// Each row holds: cell 0 = field name, cell 1 = rectangle text taken from the
/// annotation summary, cell 2 = page counter, cell 3 = "TE" or "CB".
/// The object numbers are scraped from the text produced by
/// Reports.ObjectSummary, so the parsing is best effort: a page that cannot be
/// parsed is skipped (and reported to the debug output) instead of aborting
/// the whole file. Any other failure is shown in a message box.
/// </remarks>
/// <param name="PDFInput">PDF file name passed to the reader and to CreateOutput.</param>
private void Parse(string PDFInput)
{
    try
    {
        dataGridView1.Rows.Clear();

        PdfFileAnalyzer.PdfReader Reader = new PdfFileAnalyzer.PdfReader();
        try
        {
            Reader.OpenPdfFile(PDFInput);

            // running page counter shared by all /Pages objects
            int pageNumber = 0;

            // objects are scanned from the last one to the first one
            for (int Index = Reader.ObjectArray.Length - 1; Index >= 0; Index--)
            {
                PdfFileAnalyzer.PdfIndirectObject pagesObject = Reader.ObjectArray[Index];
                if (pagesObject == null || pagesObject.PdfObjectType != "/Pages")
                {
                    continue;
                }

                // object numbers listed between [ and ] in the /Pages summary
                string[] pageObjectNumbers = ExtractObjectNumbers(FlattenObjectSummary(pagesObject));
                foreach (string pageObjectNo in pageObjectNumbers)
                {
                    try
                    {
                        ParsePageAnnotations(Reader, pageObjectNo, ref pageNumber);
                    }
                    catch (Exception pageError)
                    {
                        // best effort: pages without a parsable annotation list are
                        // skipped, but the reason is no longer silently discarded
                        System.Diagnostics.Debug.WriteLine(
                            "Parse: skipped page object " + pageObjectNo + ": " + pageError.Message);
                    }
                }
            }
        }
        finally
        {
            // release the reader even when parsing fails part way through
            Reader.Dispose();
        }

        CreateOutput(PDFInput);
    }
    catch (Exception ex)
    {
        MessageBox.Show(ex.ToString());
    }
}

/// <summary>
/// Return the summary text of an object with all line breaks removed.
/// </summary>
private static string FlattenObjectSummary(PdfFileAnalyzer.PdfIndirectObject pdfObject)
{
    return Reports.ObjectSummary(pdfObject).Replace(System.Environment.NewLine, "");
}

/// <summary>
/// Extract the object numbers of the first bracketed reference list
/// ("[4 0 R 7 0 R]" gives "4", "7") found in a flattened summary.
/// </summary>
private static string[] ExtractObjectNumbers(string flattenedSummary)
{
    string listBody = Regex.Match(flattenedSummary, @"\[([^)]*)\]").Groups[1].Value;

    // turn every " 0 R" reference suffix into a separator
    string separated = listBody.Replace(" 0 R ", ",").Replace(" 0 R", ",");
    return separated.Split(new[] { ',' }, System.StringSplitOptions.RemoveEmptyEntries);
}

/// <summary>
/// Process one page: locate its annotation list object and add a row for each
/// annotation. The page counter is incremented only after the annotation
/// object number was cut out of the page summary, exactly as before, so the
/// numbering of existing output does not change.
/// </summary>
private void ParsePageAnnotations(PdfFileAnalyzer.PdfReader Reader, string pageObjectNo, ref int pageNumber)
{
    // number of characters skipped after the start of the marker text; kept at
    // the historical value (length of "key : ") because the following
    // Replace(" ", "") removes the one blank this leaves in front of the number
    const int MarkerSkip = 6;

    PdfFileAnalyzer.PdfIndirectObject pageObject = Reader.ObjectArray[Convert.ToInt32(pageObjectNo)];
    string pageSummary = FlattenObjectSummary(pageObject);

    // text between "Annots " and the reference that precedes /Contents
    int annotsFrom = pageSummary.IndexOf("Annots ") + MarkerSkip;
    int annotsTo = pageSummary.LastIndexOf(" 0 R/Contents");
    string annotsObjectNo = pageSummary.Substring(annotsFrom, annotsTo - annotsFrom).Replace(" ", "");

    pageNumber = pageNumber + 1;

    // the annotation list object holds references to the individual annotations
    PdfFileAnalyzer.PdfIndirectObject annotsObject = Reader.ObjectArray[Convert.ToInt32(annotsObjectNo)];
    foreach (string annotationObjectNo in ExtractObjectNumbers(FlattenObjectSummary(annotsObject)))
    {
        ParseAnnotationField(Reader, annotationObjectNo, pageNumber);
    }
}

/// <summary>
/// Process one annotation: read the stream it refers to, derive the field
/// name from the stream text, and add the grid row.
/// </summary>
private void ParseAnnotationField(PdfFileAnalyzer.PdfReader Reader, string annotationObjectNo, int pageNumber)
{
    // same skip value as used for the page summary ("sData " is six characters)
    const int MarkerSkip = 6;

    PdfFileAnalyzer.PdfIndirectObject annotationObject = Reader.ObjectArray[Convert.ToInt32(annotationObjectNo)];
    string rawSummary = Reports.ObjectSummary(annotationObject);
    string flatSummary = rawSummary.Replace(System.Environment.NewLine, "");

    // object number of the stream that follows the "sData " marker
    int streamFrom = flatSummary.IndexOf("sData ") + MarkerSkip;
    int streamTo = flatSummary.IndexOf(" 0 R");
    string streamObjectNo = flatSummary.Substring(streamFrom, streamTo - streamFrom).Replace(" ", "");

    // first bracketed value of the unflattened summary is used as the rectangle
    string rectangleText = Regex.Match(rawSummary, @"\[(.*?)\]").Groups[1].Value;

    // read and decompress the stream; the result is kept in the StreamByteArray field
    PdfFileAnalyzer.PdfIndirectObject streamObject = Reader.ObjectArray[Convert.ToInt32(streamObjectNo)];
    StreamByteArray = streamObject.ReadStream();
    byte[] decompressed = streamObject.DecompressStream(StreamByteArray);
    StreamByteArray = decompressed;

    string streamText = Reports.ByteArrayToString(StreamByteArray);

    object fieldName;
    if (streamText.Contains("STARTTAG"))
    {
        // drop control characters and punctuation, then take the text between
        // STARTTAG and ENDTAG up to the first ':'
        // (the pattern used to be "\STARTTAG", where "\S" is the regex class
        // "any non white space character" rather than the letter S)
        string withoutBreaks = Regex.Replace(streamText, @"\t|\n|\r", "");
        string cleaned = withoutBreaks.Replace(".", "").Replace("!", "").Replace("%", "").Replace("(", "").Replace(")", "").Replace("'", "").Replace("&", "").Replace("#", "").Replace("$", "");
        string tagBody = Regex.Match(cleaned, @"STARTTAG(.+)ENDTAG").Groups[1].Value;
        string tagName = tagBody.Split(new char[] { ':' })[0];
        fieldName = ValidateMe(tagName.Trim());
    }
    else
    {
        fieldName = ToCamelCase(streamText);
    }

    AddFieldRow(fieldName, rectangleText, pageNumber);
}

/// <summary>
/// Add one row to the grid. The rectangle text must hold at least four blank
/// separated numbers; the row is typed "TE" when the distance between the
/// third and the first number exceeds 15 and "CB" otherwise
/// (presumably text entry versus check box - confirm with the consumer of the grid).
/// </summary>
private void AddFieldRow(object fieldName, string rectangleText, int pageNumber)
{
    const float WidthThreshold = 15;

    // the first grid row is used as the template for new rows
    DataGridViewRow row = (DataGridViewRow)dataGridView1.Rows[0].Clone();
    row.Cells[0].Value = fieldName;
    row.Cells[1].Value = rectangleText;
    row.Cells[2].Value = Convert.ToString(pageNumber);

    // PDF numbers always use '.' as decimal separator, so they must be parsed
    // with the invariant culture; all four values are parsed so that a
    // malformed rectangle is rejected before the row is added
    string[] parts = rectangleText.Split(' ');
    float[] corners = new float[4];
    for (int i = 0; i < corners.Length; i++)
    {
        corners[i] = float.Parse(parts[i], System.Globalization.CultureInfo.InvariantCulture);
    }

    float llx = corners[0];
    float urx = corners[2];
    row.Cells[3].Value = (urx - llx) > WidthThreshold ? "TE" : "CB";

    dataGridView1.Rows.Add(row);
}
////////////////////////////////////////////////////////////////////
// Process objects stream
// Reads the object number / offset table at the start of the stream
// and loads every listed object that is still marked free into its
// indirect object. Objects already loaded are left untouched.
// Throws ApplicationException if /N or /First is missing or if the
// table refers to an object that does not exist.
////////////////////////////////////////////////////////////////////
internal void ProcessObjectsStream()
{
    // read decrypt and decompress the stream
    byte[] ByteArray = ReadStream();
    ByteArray = DecompressStream(ByteArray);

    // get the count of objects in this cross reference object stream
    if (!Dictionary.FindValue("/N").GetInteger(out int ObjectCount) || ObjectCount <= 0)
    {
        throw new ApplicationException("Object stream: count (/N) is missing");
    }

    // get first byte offset
    if (!Dictionary.FindValue("/First").GetInteger(out int FirstPos))
    {
        throw new ApplicationException("Object stream: first byte offset (/First) is missing");
    }

    // get /Extends (must be a reference)
    PdfBase Extends = Dictionary.FindValue("/Extends");
    if (Extends.IsReference)
    {
        ParentObjectNo = ((PdfReference)Extends).ObjectNumber;
    }

    // create temp array of child objects
    PdfIndirectObject[] Children = new PdfIndirectObject[ObjectCount];

    // read all byte offset array
    PdfByteArrayParser PC = new PdfByteArrayParser(Reader, ByteArray, false);
    PC.ReadFirstChar();
    for (int Index = 0; Index < ObjectCount; Index++)
    {
        // object number
        if (!PC.ParseNextItem().GetInteger(out int ObjNo))
        {
            throw new ApplicationException("Cross reference object stream: object number error");
        }

        // object offset
        if (!PC.ParseNextItem().GetInteger(out int ObjPos))
        {
            throw new ApplicationException("Cross reference object stream: object offset error");
        }

        // the object number comes from the file: reject numbers outside the
        // object table instead of failing with an index out of range error
        if (ObjNo < 0 || ObjNo >= Reader.ObjectArray.Length)
        {
            throw new ApplicationException("Cross reference object stream: object not found");
        }

        // find object
        PdfIndirectObject ReadObject = Reader.ObjectArray[ObjNo];
        if (ReadObject == null)
        {
            throw new ApplicationException("Cross reference object stream: object not found");
        }

        // object is free
        if (ReadObject.ObjectType == ObjectType.Free)
        {
            // save child
            Children[Index] = ReadObject;

            // save position (offsets in the table are relative to /First)
            ReadObject.FilePosition = FirstPos + ObjPos;
        }
    }

    // copy the object from the stream to the corresponding indirect object
    for (int Index = 0; Index < ObjectCount; Index++)
    {
        // shortcut
        PdfIndirectObject Child = Children[Index];

        // object was loaded by later update
        if (Child == null)
        {
            continue;
        }

        PC.SetPos(Child.FilePosition);
        PC.ReadFirstChar();
        PdfBase Obj = PC.ParseNextItem();

        // we have a dictionary
        if (Obj.IsDictionary)
        {
            // set object value type to dictionary
            Child.ObjectType = ObjectType.Dictionary;
            Child.Dictionary = (PdfDictionary)Obj;

            // set object type if available in the dictionary
            string ObjectTypeStr = Child.Dictionary.FindValue("/Type").ToName;

            // set special object
            if (ObjectTypeStr != null)
            {
                Child._PdfObjectType = ObjectTypeStr;
            }
        }

        // we have other type of object
        // note: stream object is not allowed
        else
        {
            // set object value type to other and keep the parsed value
            Child.ObjectType = ObjectType.Other;
            Child.Value = Obj;
        }
    }
    return;
}
////////////////////////////////////////////////////////////////////
// Get stream length
// The /Length value is either an integer or a reference to another
// indirect object holding the integer. For that reason this method
// must run after ReadObject was run for all objects.
// When the stream is not followed by endstream and endobj the length
// is reset to zero and the reader is flagged as an invalid PDF file.
////////////////////////////////////////////////////////////////////
internal void GetStreamLength()
{
    PdfBase lengthEntry = Dictionary.FindValue("/Length");

    if (lengthEntry.IsReference)
    {
        // length is stored in another indirect object
        PdfIndirectObject lengthHolder = Reader.ToPdfIndirectObject((PdfReference)lengthEntry);
        bool holdsInteger = lengthHolder != null
            && lengthHolder.ObjectType == ObjectType.Other
            && lengthHolder.Value.IsInteger;
        if (holdsInteger)
        {
            StreamLength = ((PdfInteger)lengthHolder.Value).IntValue;
        }

        // the reference in the dictionary is replaced by the actual value
        Dictionary.AddInteger("/Length", StreamLength);
    }
    else if (lengthEntry.IsInteger)
    {
        // length is stored directly in the dictionary
        StreamLength = ((PdfInteger)lengthEntry).IntValue;
    }

    // nothing to verify: stream is empty or its length is in error
    if (StreamLength == 0)
    {
        return;
    }

    // The stream might extend outside the file boundary.
    // PDF files produced by HP scanners do not conform to the PDF standard
    // https://www.google.com/search?client=firefox-b-d&q=hp+officejet+PDF+scan+files+not+standard
    bool terminatorMissing;
    try
    {
        // move to the first byte after the stream and start parsing there
        Reader.SetFilePosition(StreamFilePosition + StreamLength);
        Reader.ParseFile.ReadFirstChar();

        // endstream must come first; endobj is only parsed when endstream was found
        terminatorMissing = Reader.ParseFile.ParseNextItem().ToKeyWord != KeyWord.EndStream
            || Reader.ParseFile.ParseNextItem().ToKeyWord != KeyWord.EndObj;
    }
    catch
    {
        // positioning or parsing failed: treated the same as a missing token
        terminatorMissing = true;
    }

    if (terminatorMissing)
    {
        StreamLength = 0;
        Reader.InvalidPdfFile = true;
    }
}
/// <summary>
/// Build the ContentsArray of a page object from its /Contents entry.
/// </summary>
/// <remarks>
/// /Contents may be missing (empty array), a reference to a single stream,
/// a reference to an object holding an array, or an array of references
/// to streams. Every stream found is marked with object type "/Contents".
/// </remarks>
/// <exception cref="ApplicationException">
/// The object is not a page, /Contents is not an array, an array item is
/// not a reference, or a referenced object is not a stream.
/// </exception>
public void BuildContentsArray()
{
    // only page objects have a contents array
    if (PdfObjectType != "/Page")
    {
        throw new ApplicationException("Build contents array: Object must be page");
    }

    PdfBase contentsEntry = Dictionary.FindValue("/Contents");

    // blank page
    if (contentsEntry.IsEmpty)
    {
        ContentsArray = new PdfIndirectObject[0];
        return;
    }

    if (contentsEntry.IsReference)
    {
        PdfIndirectObject target = Reader.ToPdfIndirectObject((PdfReference)contentsEntry);

        // single contents stream: the array has exactly one item
        if (target != null && target.ObjectType == ObjectType.Stream)
        {
            target._PdfObjectType = "/Contents";
            ContentsArray = new PdfIndirectObject[] { target };
            return;
        }

        // indirect value: continue with the value the object holds
        if (target != null && target.ObjectType == ObjectType.Other)
        {
            contentsEntry = target.Value;
        }
    }

    // at this point only an array is acceptable
    if (!contentsEntry.IsArray)
    {
        throw new ApplicationException("Build contents array: /Contents must be array");
    }

    PdfBase[] items = ((PdfArray)contentsEntry).ArrayItems;

    // the result array is allocated before the items are verified
    ContentsArray = new PdfIndirectObject[items.Length];

    int slot = 0;
    foreach (PdfBase item in items)
    {
        if (!item.IsReference)
        {
            throw new ApplicationException("Build contents array: Array item must be reference");
        }

        PdfIndirectObject stream = Reader.ToPdfIndirectObject((PdfReference)item);
        if (stream == null || stream.ObjectType != ObjectType.Stream)
        {
            throw new ApplicationException("Build contents array: Contents must be a stream");
        }

        // mark the stream as page contents and store it in order
        stream._PdfObjectType = "/Contents";
        ContentsArray[slot] = stream;
        slot++;
    }
}