/// <summary>
/// Add a key/value pair to the dictionary, replacing the entry
/// that BinarySearch reports as equal if there is one
/// </summary>
/// <param name="Key">Dictionary key</param>
/// <param name="Value">Derived object of PdfBase</param>
public void AddKeyValue
		(
		string Key,
		PdfBase Value
		)
	{
	PdfKeyValue Pair = new PdfKeyValue(Key, Value);

	// a negative result means no equal entry was found; its bitwise
	// complement is used as the insertion index so the order is kept
	int Position = KeyValueArray.BinarySearch(Pair);
	if (Position < 0)
		{
		KeyValueArray.Insert(~Position, Pair);
		return;
		}

	// an equal entry exists: overwrite it with the new pair
	KeyValueArray[Position] = Pair;
	}
/// <summary>
/// Add one object to the array items
/// </summary>
/// <param name="Obj">Object to add</param>
public void Add(PdfBase Obj) => Items.Add(Obj);
/// <summary>
/// Advance NextChar past any run of white space characters.
/// Stops at the first non white space character or at EOF.
/// </summary>
public void SkipWhiteSpace()
	{
	while (PdfBase.IsWhiteSpace(NextChar))
		{
		NextChar = ReadChar();

		// EOF is never passed to the white space test
		if (NextChar == EOF)
			{
			break;
			}
		}
	}
/// <summary>
/// Constructor
/// </summary>
/// <param name="Key">Dictionary key. Must be non-empty and start with /</param>
/// <param name="Value">Dictionary value</param>
/// <exception cref="ApplicationException">Key is null, empty or does not start with /</exception>
public PdfKeyValue
		(
		string Key,
		PdfBase Value
		)
	{
	// a null or empty key is reported the same way as any other malformed key
	// instead of surfacing as NullReferenceException or IndexOutOfRangeException
	if (string.IsNullOrEmpty(Key) || Key[0] != '/')
		{
		throw new ApplicationException("Key must start with /");
		}

	this.Key = Key;
	this.Value = Value;
	return;
	}
////////////////////////////////////////////////////////////////////
// Parse Array
// Called with NextChar on the opening [. Items are collected with
// ParseNextItem until the closing ] is found. On return NextChar
// holds the character following the ].
// Throws ApplicationException if the contents end before the ].
////////////////////////////////////////////////////////////////////
internal PdfArray ParseArray()
	{
	List<PdfBase> Items = new List<PdfBase>();

	// step over the opening [
	NextChar = ReadChar();

	while (true)
		{
		SkipWhiteSpace();

		// contents ended before the closing ]
		if (NextChar == EOF)
			{
			throw new ApplicationException("Invalid array (end of contents)");
			}

		// closing ]: step over it and return the collected items
		if (NextChar == ']')
			{
			NextChar = ReadChar();
			return new PdfArray(Items.ToArray());
			}

		// an empty item means the contents ended in the middle of the array
		PdfBase Item = ParseNextItem();
		if (Item.IsEmpty)
			{
			throw new ApplicationException("Invalid array (end of contents)");
			}
		Items.Add(Item);
		}
	}
////////////////////////////////////////////////////////////////////
// Get filter names
// Returns null when the /Filter value is empty, a one element
// array when /Filter is a single name, or one element per entry
// when /Filter is an array made only of names.
// Throws ApplicationException for any other /Filter value.
////////////////////////////////////////////////////////////////////
internal string[] GetFilterNameArray()
	{
	// look for filter
	PdfBase Filter = Dictionary.FindValue("/Filter");

	// no filter
	if (Filter.IsEmpty)
		{
		return null;
		}

	// one filter name
	if (Filter.IsName)
		{
		return new string[] { ((PdfName)Filter).NameValue };
		}

	// array of filters
	if (Filter.IsArray)
		{
		PdfBase[] FilterNames = ((PdfArray)Filter).ArrayItems;
		string[] FilterNameArray = new string[FilterNames.Length];

		// copy names; stop at the first item that is not a name
		int Index = 0;
		while (Index < FilterNames.Length && FilterNames[Index].IsName)
			{
			FilterNameArray[Index] = ((PdfName)FilterNames[Index]).NameValue;
			Index++;
			}

		// every item was a name
		if (Index == FilterNames.Length)
			{
			return FilterNameArray;
			}
		}

	// filter is in error
	throw new ApplicationException("/Filter must be a name or an array of names");
	}
/// <summary>
/// Append text string.
/// A single space is inserted first when neither the last byte already
/// in the buffer nor the first character of the text is a delimiter.
/// Each character is stored as one byte (cast to byte).
/// </summary>
/// <param name="Text">Text to append. Null or empty text appends nothing.</param>
public void AppendText
		(
		string Text
		)
	{
	// nothing to append; also protects the Text[0] access below
	if (string.IsNullOrEmpty(Text))
		{
		return;
		}

	// separate two adjacent non-delimiter tokens with one space
	if (ByteArray.Count > 0 && !PdfBase.IsDelimiter(ByteArray[ByteArray.Count - 1]) && !PdfBase.IsDelimiter(Text[0]))
		{
		ByteArray.Add((byte)' ');
		}

	// move characters to bytes
	foreach (char Chr in Text)
		{
		ByteArray.Add((byte)Chr);
		}
	return;
	}
////////////////////////////////////////////////////////////////////
// Parse inline image
// Reads the inline image dictionary (abbreviated keys /W /H /BPC
// /CS /IM /F), then the raw image bytes and the closing EI.
// Returns a BeginInlineImage operator whose arguments are the image
// dictionary and a PdfString holding the image bytes.
// Only unfiltered images with /G, /RGB or /CMYK color space, or
// image masks (/IM true), are supported.
// Throws ApplicationException on any malformed or unsupported input.
////////////////////////////////////////////////////////////////////
internal PdfOp ParseInlineImage()
	{
	// parse the image dictionary; parsing ends at the ID operator
	PdfDictionary ImageDict = ParseDictionary(true);

	// get image width
	if (!ImageDict.FindValue("/W").GetInteger(out int Width) || Width <= 0)
		{
		throw new ApplicationException("Parse inline image: Width error");
		}

	// get image height
	if (!ImageDict.FindValue("/H").GetInteger(out int Height) || Height <= 0)
		{
		throw new ApplicationException("Parse inline image: Height error");
		}

	// get image bits per component (1, 2, 4 or 8)
	if (!ImageDict.FindValue("/BPC").GetInteger(out int BitPerComp) ||
		BitPerComp != 1 && BitPerComp != 2 && BitPerComp != 4 && BitPerComp != 8)
		{
		throw new ApplicationException("Parse inline image: BPC error");
		}

	// number of color components per sample
	// stays zero when there is neither /CS nor /IM, in which case no data bytes are read
	int Components = 0;

	// get color space
	string ColorSpace = ImageDict.FindValue("/CS").ToName;
	if (ColorSpace != null)
		{
		switch (ColorSpace)
			{
			case "/G":
				Components = 1;
				break;

			case "/RGB":
				Components = 3;
				break;

			case "/CMYK":
				Components = 4;
				break;

			default:
				throw new ApplicationException("Parse inline image: ColorSpace error");
			}
		}

	// image mask has one component
	ImageDict.FindValue("/IM").GetBoolean(out bool IM);
	if (IM)
		{
		Components = 1;
		}

	// filters (for example ASCIIHexDecode AHx or ASCII85Decode A85) are not supported
	PdfBase Filter = ImageDict.FindValue("/F");
	if (!Filter.IsEmpty)
		{
		throw new ApplicationException("Parse inline image: No filter support");
		}

	// ID must be followed by one white space character before the data
	if (!PdfBase.IsWhiteSpace(NextChar))
		{
		throw new ApplicationException("Parse inline image: ID must be followed by white space");
		}

	// Each row of samples is packed (all components of all pixels) and only
	// the end of the row is padded to a byte boundary. The row size is
	// therefore computed from the total bits per row, not per component.
	long RowBytes = ((long)Width * Components * BitPerComp + 7) / 8;

	// reject sizes that do not fit an int instead of wrapping around
	if (RowBytes > int.MaxValue / Height)
		{
		throw new ApplicationException("Parse inline image: Image size error");
		}

	// image size
	int Size = (int)(RowBytes * Height);

	// image stream
	byte[] ImageStream = new byte[Size];
	for (int Index = 0; Index < Size; Index++)
		{
		// read next character
		NextChar = ReadChar();

		// end of file error
		if (NextChar == EOF)
			{
			throw new ApplicationException("Invalid inline image (end of contents)");
			}

		// save it in bitmap
		ImageStream[Index] = (byte)NextChar;
		}

	// get termination: optional white space followed by EI
	NextChar = ReadChar();
	SkipWhiteSpace();
	if (NextChar != 'E' || ReadChar() != 'I')
		{
		throw new ApplicationException("Parse inline image: EI is missing");
		}

	// read next character after EI
	NextChar = ReadChar();

	PdfOp InlineImage = new PdfOp(Operator.BeginInlineImage);
	InlineImage.ArgumentArray = new PdfBase[] { ImageDict, new PdfString(ImageStream) };

	// exit
	return InlineImage;
	}
////////////////////////////////////////////////////////////////////
// Parse Dictionary
// Reads name/value pairs until the terminator is found. The
// terminator is >> for a regular dictionary and the ID operator
// when InlineImage is true. On return NextChar holds the character
// following the terminator.
// Throws ApplicationException when the contents end early or an
// entry does not start with a / name.
////////////////////////////////////////////////////////////////////
internal PdfDictionary ParseDictionary
		(
		bool InlineImage
		)
	{
	PdfDictionary Result = new PdfDictionary();

	// step over the character that brought us here
	NextChar = ReadChar();

	while (true)
		{
		SkipWhiteSpace();

		// contents ended before the terminator
		if (NextChar == EOF)
			{
			throw new ApplicationException("Invalid dictionary (end of contents)");
			}

		// anything other than a name must be the terminator
		if (NextChar != '/')
			{
			bool Terminator = InlineImage ?
				(NextChar == 'I' && ReadChar() == 'D') :
				(NextChar == '>' && ReadChar() == '>');
			if (Terminator)
				{
				break;
				}
			throw new ApplicationException("Invalid dictionary (name entry must have /)");
			}

		// collect the name: the leading / plus everything up to the next delimiter
		StringBuilder Name = new StringBuilder();
		do
			{
			Name.Append((char)NextChar);
			NextChar = ReadChar();
			}
		while (NextChar != EOF && !PdfBase.IsDelimiter(NextChar));

		// the value follows the name; empty means the contents ended
		PdfBase Value = ParseNextItem();
		if (Value.IsEmpty)
			{
			throw new ApplicationException("Invalid dictionary (end of contents)");
			}

		Result.AddKeyValue(Name.ToString(), Value);
		}

	// read next character after >> or ID
	NextChar = ReadChar();
	return Result;
	}
/// <summary>
/// Parse an indirect object header of the form "n 0 obj".
/// Only generation number zero is accepted.
/// </summary>
/// <returns>Object number (greater than zero), or zero if the input does not match</returns>
public int ParseObjectRefNo()
	{
	// skip any comments before the object number
	SkipComments();

	// object number must start with a digit
	if (!(NextChar >= '0' && NextChar <= '9'))
		{
		return 0;
		}

	// collect characters up to the next delimiter
	StringBuilder Digits = new StringBuilder();
	do
		{
		Digits.Append((char)NextChar);
		NextChar = ReadChar();
		}
	while (NextChar != EOF && !PdfBase.IsDelimiter(NextChar));

	// must be a positive integer
	if (!int.TryParse(Digits.ToString(), out int ObjNo) || ObjNo <= 0)
		{
		return 0;
		}

	// one or more white space characters
	if (!PdfBase.IsWhiteSpace(NextChar))
		{
		return 0;
		}
	do
		{
		NextChar = ReadChar();
		}
	while (NextChar != EOF && PdfBase.IsWhiteSpace(NextChar));

	// generation number must be a single zero
	if (NextChar != '0')
		{
		return 0;
		}

	// one or more white space characters
	NextChar = ReadChar();
	if (!PdfBase.IsWhiteSpace(NextChar))
		{
		return 0;
		}
	do
		{
		NextChar = ReadChar();
		}
	while (NextChar != EOF && PdfBase.IsWhiteSpace(NextChar));

	// the word obj
	if (NextChar != 'o' || ReadChar() != 'b' || ReadChar() != 'j')
		{
		return 0;
		}

	// obj must be followed by a delimiter
	NextChar = ReadChar();
	return PdfBase.IsDelimiter(NextChar) ? ObjNo : 0;
	}
////////////////////////////////////////////////////////////////////
// Parse hex string item and return PdfString
// Reads hexadecimal digits up to the closing >. White space inside
// the string is ignored. An odd final digit is treated as the high
// nibble of a byte whose low nibble is zero. On return NextChar
// holds the character following the >.
// Throws ApplicationException on a non-hex character or if the
// contents end before the closing >.
////////////////////////////////////////////////////////////////////
internal PdfBase ParseHexString()
	{
	List<byte> Bytes = new List<byte>();

	// pending high nibble, or -1 when the next digit starts a new byte
	int HighNibble = -1;

	while (true)
		{
		NextChar = ReadChar();
		if (NextChar == EOF)
			{
			throw new ApplicationException("Invalid hex string (End of contents)");
			}

		// end of string
		if (NextChar == '>')
			{
			break;
			}

		// white space is not significant
		if (PdfBase.IsWhiteSpace(NextChar))
			{
			continue;
			}

		// convert one hex digit to its value
		int Digit;
		if (NextChar >= '0' && NextChar <= '9')
			{
			Digit = NextChar - '0';
			}
		else if (NextChar >= 'A' && NextChar <= 'F')
			{
			Digit = NextChar - 'A' + 10;
			}
		else if (NextChar >= 'a' && NextChar <= 'f')
			{
			Digit = NextChar - 'a' + 10;
			}
		else
			{
			throw new ApplicationException("Invalid hex string");
			}

		// first digit of a pair is held; second digit completes the byte
		if (HighNibble < 0)
			{
			HighNibble = Digit;
			}
		else
			{
			Bytes.Add((byte)((HighNibble << 4) | Digit));
			HighNibble = -1;
			}
		}

	// odd number of digits: the last digit is the high nibble
	if (HighNibble >= 0)
		{
		Bytes.Add((byte)(HighNibble << 4));
		}

	// read next character after closing >
	NextChar = ReadChar();
	return new PdfString(Bytes.ToArray());
	}
////////////////////////////////////////////////////////////////////
// Get stream length
// The /Length value is either a direct integer or a reference to
// an indirect object holding an integer. In the reference case the
// /Length entry of the dictionary is replaced with the resolved
// value. A non-zero length is then verified by positioning the
// reader at the end of the stream and expecting the endstream and
// endobj keywords. If verification fails for any reason the stream
// length is reset to zero and the reader is flagged as an invalid
// PDF file.
// This method must run after ReadObject was run for all objects
////////////////////////////////////////////////////////////////////
internal void GetStreamLength()
	{
	PdfBase LengthEntry = Dictionary.FindValue("/Length");

	// length is held by another indirect object
	if (LengthEntry.IsReference)
		{
		PdfIndirectObject Target = Reader.ToPdfIndirectObject((PdfReference)LengthEntry);

		// use the target only if it holds an integer
		if (Target != null && Target.ObjectType == ObjectType.Other && Target.Value.IsInteger)
			{
			StreamLength = ((PdfInteger)Target.Value).IntValue;
			}

		// replace /Length in dictionary with the current stream length
		Dictionary.AddInteger("/Length", StreamLength);
		}

	// length is a direct integer
	else if (LengthEntry.IsInteger)
		{
		StreamLength = ((PdfInteger)LengthEntry).IntValue;
		}

	// stream is empty or stream length is in error
	if (StreamLength == 0)
		{
		return;
		}

	// stream might be outside file boundary
	// HP Scanners Scanned PDF does not conform to PDF standards
	// https://www.google.com/search?client=firefox-b-d&q=hp+officejet+PDF+scan+files+not+standard
	try
		{
		// position the reader just after the last byte of the stream
		Reader.SetFilePosition(StreamFilePosition + StreamLength);
		Reader.ParseFile.ReadFirstChar();

		// the next two items must be endstream and endobj
		if (Reader.ParseFile.ParseNextItem().ToKeyWord != KeyWord.EndStream)
			{
			throw new ApplicationException("Endstream token missing");
			}
		if (Reader.ParseFile.ParseNextItem().ToKeyWord != KeyWord.EndObj)
			{
			throw new ApplicationException("Endobj token missing");
			}
		}

	// any failure above (including the two throws) lands here
	catch
		{
		StreamLength = 0;
		Reader.InvalidPdfFile = true;
		}
	}
/// <summary>
/// Parse the next item starting at NextChar.
/// Handles strings, arrays, dictionaries, hex strings, names, numbers,
/// booleans, null, and either file keywords (when not parsing a
/// contents stream) or contents operators (when parsing one).
/// </summary>
/// <returns>Derived class from PdfBase. PdfBase.Empty at end of input.</returns>
public PdfBase ParseNextItem()
	{
	// skip any comments before the item
	SkipComments();

	// end of file
	if (NextChar == EOF)
		{
		return PdfBase.Empty;
		}

	// items recognised by their first character
	switch (NextChar)
		{
		// string
		case '(':
			return ParseString();

		// array
		case '[':
			return ParseArray();

		// << is a dictionary, a single < is a hex string
		case '<':
			if (ReadChar() == '<')
				{
				return ParseDictionary(false);
				}

			// not a dictionary: undo the look ahead
			StepBack();
			return ParseHexString();
		}

	// collect the token: first character plus everything up to the next delimiter
	StringBuilder Builder = new StringBuilder();
	do
		{
		Builder.Append((char)NextChar);
		NextChar = ReadChar();
		}
	while (NextChar != EOF && !PdfBase.IsDelimiter(NextChar));
	string Token = Builder.ToString();

	// name
	if (Token[0] == '/')
		{
		// a lone / is not a valid name
		if (Token.Length == 1)
			{
			throw new ApplicationException("Empty name token");
			}
		return new PdfName(Token);
		}

	// integer
	if (int.TryParse(Token, out int IntVal))
		{
		// outside contents streams a positive integer can be the
		// object number of an indirect reference
		if (!ContentsStream && IntVal > 0 && TestReference())
			{
			return new PdfReference(IntVal);
			}
		return new PdfInteger(IntVal);
		}

	// real number with period as decimal separator regardless of region
	if (float.TryParse(Token, NumberStyles.AllowDecimalPoint | NumberStyles.AllowLeadingSign, NumFormatInfo.PeriodDecSep, out float RealVal))
		{
		// a real number without fraction is returned as integer
		int TestInt = (int)Math.Truncate(RealVal);
		if (RealVal == (double)TestInt)
			{
			return new PdfInteger(TestInt);
			}
		return new PdfReal(RealVal);
		}

	// literals common to files and contents streams
	switch (Token)
		{
		case "false":
			return new PdfBoolean(false);

		case "true":
			return new PdfBoolean(true);

		case "null":
			return new PdfNull();
		}

	// contents stream: the token must be a known operator
	if (ContentsStream)
		{
		int OpIndex = Array.BinarySearch(OpCtrl.OpCtrlArray, new OpCtrl(Token));
		if (OpIndex < 0)
			{
			throw new ApplicationException("Parsing failed: Unknown contents operator");
			}

		Operator OpCode = OpCtrl.OpCtrlArray[OpIndex].OpCode;

		// inline image is parsed as one unit
		if (OpCode == Operator.BeginInlineImage)
			{
			return ParseInlineImage();
			}

		// the inline image data and end operators are not valid on their own
		if (!(OpCode == Operator.BeginInlineImageData || OpCode == Operator.EndInlineImage))
			{
			return new PdfOp(OpCode);
			}
		}

	// everything else: file level keywords
	else
		{
		switch (Token)
			{
			case "stream":
				// stream followed by NL
				if (NextChar == '\n')
					{
					return new PdfKeyword(KeyWord.Stream);
					}

				// anything other than CR at this point is an error
				if (NextChar != '\r')
					{
					throw new ApplicationException("Stream word must be followed by EOL");
					}

				// the PDF spec is very clear that stream must be followed by NL or CR and NL
				// CR by itself is not acceptable
				if (ReadChar() != '\n')
					{
					// HP Scanners Scanned PDF does not conform to PDF standards
					// https://www.google.com/search?client=firefox-b-d&q=hp+officejet+PDF+scan+files+not+standard
					// step back to allow re-parsing of the last character
					StepBack();
					Reader.InvalidPdfFile = true;
					}
				return new PdfKeyword(KeyWord.Stream);

			case "endstream":
				return new PdfKeyword(KeyWord.EndStream);

			case "endobj":
				return new PdfKeyword(KeyWord.EndObj);

			case "xref":
				return new PdfKeyword(KeyWord.XRef);

			// xref entry in use
			case "n":
				return new PdfKeyword(KeyWord.N);

			// xref entry free
			case "f":
				return new PdfKeyword(KeyWord.F);

			case "trailer":
				return new PdfKeyword(KeyWord.Trailer);
			}
		}

	// error
	throw new ApplicationException("Parsing failed: Unknown token: " + Token);
	}
/// <summary>
/// Apply ASCII 85 decode.
/// Every group of five characters in the range ! to u is decoded to four
/// bytes. The character z at the start of a group stands for four zero
/// bytes. White space is ignored. Decoding stops at the first ~ character
/// or at the end of the input. A final partial group of two, three or four
/// characters yields one, two or three bytes.
/// </summary>
/// <param name="InputBuffer">Input buffer</param>
/// <returns>Output buffer</returns>
/// <exception cref="ApplicationException">
/// Illegal character, a group value above 2^32-1, or a final group of one character
/// </exception>
internal byte[] Ascii85Decode
		(
		byte[] InputBuffer
		)
	{
	// output buffer
	List<byte> OutputBuffer = new List<byte>();

	// number of characters accumulated in the current group (0 to 4)
	int State = 0;

	// value of the current group in base 85
	// 64 bits are used so that an out of range group is detected instead of wrapping around
	ulong Group = 0;

	for (int Index = 0; Index < InputBuffer.Length; Index++)
		{
		// next character
		char Chr = (char)InputBuffer[Index];

		// end of stream "~>"
		if (Chr == '~')
			{
			break;
			}

		// ignore white space
		if (PdfBase.IsWhiteSpace(Chr))
			{
			continue;
			}

		// special case of four zero bytes; only valid at the start of a group
		if (Chr == 'z' && State == 0)
			{
			OutputBuffer.Add(0);
			OutputBuffer.Add(0);
			OutputBuffer.Add(0);
			OutputBuffer.Add(0);
			continue;
			}

		// test for valid characters (this also rejects z inside a group)
		if (Chr < '!' || Chr > 'u')
			{
			throw new ApplicationException("Illegal character in ASCII85Decode");
			}

		// accumulate one base 85 digit, most significant first
		Group = Group * 85 + (uint)(Chr - '!');
		State++;

		// we have 4 output bytes
		if (State == 5)
			{
			// the group must fit in 32 bits
			if (Group > uint.MaxValue)
				{
				throw new ApplicationException("Illegal value in ASCII85Decode");
				}

			OutputBuffer.Add((byte)(Group >> 24));
			OutputBuffer.Add((byte)(Group >> 16));
			OutputBuffer.Add((byte)(Group >> 8));
			OutputBuffer.Add((byte)Group);

			// reset state
			State = 0;
			Group = 0;
			}
		}

	// final partial group: add one, two or three terminating bytes
	if (State != 0)
		{
		// a single character cannot encode even one byte
		if (State == 1)
			{
			throw new ApplicationException("Illegal length in ASCII85Decode");
			}

		// pad the missing digits with the highest digit value (84, character u)
		for (int PadState = State; PadState < 5; PadState++)
			{
			Group = Group * 85 + (uint)('u' - '!');
			}

		// the padded group must fit in 32 bits
		if (Group > uint.MaxValue)
			{
			throw new ApplicationException("Illegal value in ASCII85Decode");
			}

		// State characters produce State - 1 bytes
		OutputBuffer.Add((byte)(Group >> 24));
		if (State >= 3)
			{
			OutputBuffer.Add((byte)(Group >> 16));
			if (State >= 4)
				{
				OutputBuffer.Add((byte)(Group >> 8));
				}
			}
		}

	// exit
	return OutputBuffer.ToArray();
	}
////////////////////////////////////////////////////////////////////
// process objects stream
// The decompressed stream starts with /N pairs of integers (object
// number, offset relative to /First) followed by the objects.
// Every listed object that is still marked as free in the reader's
// object array is loaded from the stream: a dictionary item becomes
// a Dictionary object, anything else becomes an Other object.
// Objects that are no longer free are left untouched.
// Throws ApplicationException when /N or /First is missing or an
// index entry cannot be read or resolved.
////////////////////////////////////////////////////////////////////
internal void ProcessObjectsStream()
	{
	// read the stream and decompress it
	byte[] StreamBytes = DecompressStream(ReadStream());

	// number of objects held by this object stream
	if (!Dictionary.FindValue("/N").GetInteger(out int ObjectCount) || ObjectCount <= 0)
		{
		throw new ApplicationException("Object stream: count (/N) is missing");
		}

	// offset of the first object within the stream
	if (!Dictionary.FindValue("/First").GetInteger(out int FirstPos))
		{
		throw new ApplicationException("Object stream: first byte offset (/First) is missing");
		}

	// /Extends is used only when it is a reference
	PdfBase Extends = Dictionary.FindValue("/Extends");
	if (Extends.IsReference)
		{
		ParentObjectNo = ((PdfReference)Extends).ObjectNumber;
		}

	// one slot per index entry; a slot stays null when the object is not loaded from here
	PdfIndirectObject[] Children = new PdfIndirectObject[ObjectCount];

	// pass 1: read the index of (object number, offset) pairs
	PdfByteArrayParser Parser = new PdfByteArrayParser(Reader, StreamBytes, false);
	Parser.ReadFirstChar();
	for (int Slot = 0; Slot < ObjectCount; Slot++)
		{
		// object number
		if (!Parser.ParseNextItem().GetInteger(out int ObjNo))
			{
			throw new ApplicationException("Cross reference object stream: object number error");
			}

		// object offset
		if (!Parser.ParseNextItem().GetInteger(out int ObjPos))
			{
			throw new ApplicationException("Cross reference object stream: object offset error");
			}

		// the object must be known to the reader
		PdfIndirectObject Target = Reader.ObjectArray[ObjNo];
		if (Target == null)
			{
			throw new ApplicationException("Cross reference object stream: object not found");
			}

		// only objects still marked free are taken from this stream
		if (Target.ObjectType != ObjectType.Free)
			{
			continue;
			}
		Children[Slot] = Target;
		Target.FilePosition = FirstPos + ObjPos;
		}

	// pass 2: parse each selected object at its saved position
	foreach (PdfIndirectObject Child in Children)
		{
		// slot was not selected in pass 1
		if (Child == null)
			{
			continue;
			}

		Parser.SetPos(Child.FilePosition);
		Parser.ReadFirstChar();
		PdfBase Item = Parser.ParseNextItem();

		// anything but a dictionary is stored as a plain value
		// note: stream object is not allowed
		if (!Item.IsDictionary)
			{
			Child.ObjectType = ObjectType.Other;
			Child.Value = Item;
			continue;
			}

		// dictionary object
		Child.ObjectType = ObjectType.Dictionary;
		Child.Dictionary = (PdfDictionary)Item;

		// record the /Type name when the dictionary has one
		string TypeName = Child.Dictionary.FindValue("/Type").ToName;
		if (TypeName != null)
			{
			Child._PdfObjectType = TypeName;
			}
		}
	}
////////////////////////////////////////////////////////////////////
// Test for reference
// A positive integer was already parsed. Look ahead for white
// space, a single zero, white space, the letter R and a delimiter.
// On a match NextChar is set to that delimiter and true is
// returned. Otherwise the saved position is restored, NextChar is
// left unchanged and false is returned.
// Generation numbers other than zero are not supported.
////////////////////////////////////////////////////////////////////
internal bool TestReference()
	{
	// remember where the look ahead started
	int SavedPos = GetPos();

	if (MatchZeroAndR(out int Following))
		{
		NextChar = Following;
		return true;
		}

	// no match: rewind
	SetPos(SavedPos);
	return false;

	// Look ahead for "0 R" and a delimiter. Current is the last character examined.
	bool MatchZeroAndR(out int Current)
		{
		// one or more white space characters after the object number
		Current = NextChar;
		if (!PdfBase.IsWhiteSpace(Current))
			{
			return false;
			}
		do
			{
			Current = ReadChar();
			}
		while (Current != EOF && PdfBase.IsWhiteSpace(Current));

		// generation number must be zero
		if (Current != '0')
			{
			return false;
			}

		// one or more white space characters after the generation number
		Current = ReadChar();
		if (!PdfBase.IsWhiteSpace(Current))
			{
			return false;
			}
		do
			{
			Current = ReadChar();
			}
		while (Current != EOF && PdfBase.IsWhiteSpace(Current));

		// the letter R
		if (Current != 'R')
			{
			return false;
			}

		// R must be followed by a delimiter
		Current = ReadChar();
		return PdfBase.IsDelimiter(Current);
		}
	}
/// <summary>
/// Build contents array for PdfPage.
/// The /Contents value may be empty (blank page), a reference to a single
/// stream, a reference to an object holding an array, or a direct array.
/// Each array item must be a reference to a stream. Every stream placed
/// in ContentsArray is marked with the object type "/Contents".
/// </summary>
/// <exception cref="ApplicationException">
/// The object is not a page or /Contents does not have one of the forms above
/// </exception>
public void BuildContentsArray()
	{
	// must be a page
	if (PdfObjectType != "/Page")
		{
		throw new ApplicationException("Build contents array: Object must be page");
		}

	PdfBase ContentsValue = Dictionary.FindValue("/Contents");

	// blank page: no contents
	if (ContentsValue.IsEmpty)
		{
		ContentsArray = new PdfIndirectObject[0];
		return;
		}

	// a reference is either the single contents stream or points to an array
	if (ContentsValue.IsReference)
		{
		PdfIndirectObject Target = Reader.ToPdfIndirectObject((PdfReference)ContentsValue);
		if (Target != null)
			{
			// single stream: the array has exactly this one object
			if (Target.ObjectType == ObjectType.Stream)
				{
				Target._PdfObjectType = "/Contents";
				ContentsArray = new PdfIndirectObject[] { Target };
				return;
				}

			// other object: continue with its value, which must be an array
			if (Target.ObjectType == ObjectType.Other)
				{
				ContentsValue = Target.Value;
				}
			}
		}

	// at this point the value must be an array
	if (!ContentsValue.IsArray)
		{
		throw new ApplicationException("Build contents array: /Contents must be array");
		}

	// one result slot per array item
	PdfBase[] Refs = ((PdfArray)ContentsValue).ArrayItems;
	ContentsArray = new PdfIndirectObject[Refs.Length];

	for (int Slot = 0; Slot < Refs.Length; Slot++)
		{
		// each item must be a reference
		PdfBase Item = Refs[Slot];
		if (!Item.IsReference)
			{
			throw new ApplicationException("Build contents array: Array item must be reference");
			}

		// the referenced object must exist and be a stream
		PdfIndirectObject Contents = Reader.ToPdfIndirectObject((PdfReference)Item);
		if (Contents == null || Contents.ObjectType != ObjectType.Stream)
			{
			throw new ApplicationException("Build contents array: Contents must be a stream");
			}

		// mark as page's contents and store
		Contents._PdfObjectType = "/Contents";
		ContentsArray[Slot] = Contents;
		}
	}