/// <summary>
/// Reads one subsection of a classic (text) cross-reference table and adds an
/// <c>XrefItem</c> to <paramref name="Xref"/> for every entry flagged as in use ("n").
/// </summary>
/// <param name="pdf">Raw bytes of the PDF; handed to <c>GetReferenceNumberForward</c> together with each entry's offset to obtain the object id.</param>
/// <param name="parts">Tokens of the cross-reference table, one token per array position.</param>
/// <param name="xrefStartPointer">Index in <paramref name="parts"/> of the subsection header's first token.</param>
/// <param name="Xref">Dictionary that receives the in-use entries, keyed by object id.</param>
/// <returns>
/// The index in <paramref name="parts"/> of the token that follows the last entry of this
/// subsection, or <c>parts.Length</c> when the token at <paramref name="xrefStartPointer"/>
/// is not a plain unsigned number (i.e. there is no further subsection to read).
/// </returns>
private int ReadXrefTable(byte[] pdf, string[] parts, int xrefStartPointer, Dictionary<int, XrefItem> Xref) {
    // The parts array covers more than this subsection; each token is one position.
    //   0 6                  <- xrefStartPointer points at the "0"
    //   0000000003 65535 f
    //   0000000017 00000 n
    //   0000000081 00000 n
    //   0000000000 00007 f
    //   0000000331 00000 n
    //   0000000409 00000 n

    // The first header token must be a number; it is only validated here because the
    // object id is read from the object itself (GetReferenceNumberForward) below.
    int firstObjectNumber;
    if (!Int32.TryParse(parts[xrefStartPointer], NumberStyles.None, CultureInfo.InvariantCulture, out firstObjectNumber)) {
        return parts.Length;
    }

    // The second header token is the NUMBER OF ENTRIES in the subsection, not an end
    // index, so the loop must run exactly that many times regardless of the first
    // object number (a header such as "3 2" still carries two entries).
    int entryCount = Convert.ToInt32(parts[xrefStartPointer + 1], CultureInfo.InvariantCulture);
    int entriesStart = xrefStartPointer + 2;

    int count = 0;
    for (; count < entryCount; count++) {
        // Every entry is three tokens: offset, generation, "n"/"f".
        int entryBase = entriesStart + count * 3;
        uint offset = Convert.ToUInt32(parts[entryBase], CultureInfo.InvariantCulture);
        bool isUsed = parts[entryBase + 2] == "n";

        if (isUsed) {
            // The generation token is ignored: the new pdf will have generation 0.
            XrefItem item = new XrefItem() {
                id = GetReferenceNumberForward(pdf, offset),
                pos = offset,
                IsUsed = true
            };
            Xref.Add(item.id, item);
        }
    }

    return entriesStart + count * 3;
}
/// <summary>
/// Reads the cross-reference stream object that starts at <paramref name="XrefPos"/>,
/// follows its <c>Prev</c> entry (if any) recursively, and registers the entries it
/// describes in <paramref name="Xref"/>.
/// </summary>
/// <param name="pdf">Raw bytes of the PDF.</param>
/// <param name="Xref">Dictionary that receives the type 1 entries, keyed by object id; type 2 entries are recorded in the <c>stmobjUsed</c> list of the item they reference.</param>
/// <param name="XrefPos">Byte offset in <paramref name="pdf"/> where the cross-reference stream object starts.</param>
/// <exception cref="Exception">
/// Thrown when the data ends before a <c>stream</c> keyword is found, when <c>endobj</c> is
/// reached first, when the dictionary has no <c>Length</c>, or when a filter other than
/// <c>FlateDecode</c> is declared.
/// </exception>
private void ReadXRefStream(byte[] pdf, Dictionary<int, XrefItem> Xref, uint XrefPos) {
    // Cross-reference stream layout:
    //   12 0 obj            % Cross-reference stream
    //   << /Type /XRef      % Cross-reference stream dictionary
    //      /Size ...
    //      /Root ...
    //   >>
    //   stream
    //   ...                 % Stream data containing cross-reference information
    //   endstream
    //   endobj

    // Scan forward from the object start until the "stream" keyword is found.
    uint currentIndex = XrefPos;
    while (true) {
        if (currentIndex >= pdf.Length - 5) {
            throw new Exception("xref stream table ends abruptly");
        }

        if (pdf[currentIndex] == 's' && pdf[currentIndex + 1] == 't' && pdf[currentIndex + 2] == 'r'
            && pdf[currentIndex + 3] == 'e' && pdf[currentIndex + 4] == 'a' && pdf[currentIndex + 5] == 'm') {
            break;
        }

        if (pdf[currentIndex] == 'e' && pdf[currentIndex + 1] == 'n' && pdf[currentIndex + 2] == 'd'
            && pdf[currentIndex + 3] == 'o' && pdf[currentIndex + 4] == 'b' && pdf[currentIndex + 5] == 'j') {
            throw new Exception("xref stream without stream");
        }

        currentIndex++;
    }

    // The stream data begins after the keyword (6 bytes) and one optional CRLF or LF.
    uint startStream = currentIndex + 6;
    if (pdf[startStream] == '\r' && pdf[startStream + 1] == '\n') {
        startStream += 2;
    } else if (pdf[startStream] == '\n') {
        startStream++;
    }

    // Everything from the object start up to the stream data is tokenized as the dictionary.
    string dictionary = GetString(pdf, XrefPos, startStream);
    string[] tokens = dictionary.Split(separator, StringSplitOptions.RemoveEmptyEntries);

    uint? streamLength = null;
    int typeWidth = 1;
    int secondWidth = 1;
    int thirdWidth = 1;
    int size = 1;
    int predictor = 1;
    List<int> indexArray = new List<int>();

    for (int i = 0; i < tokens.Length; i++) {
        switch (tokens[i]) {
            case "Length":
                streamLength = Convert.ToUInt32(tokens[i + 1]);
                break;

            case "W":
                // Byte widths of the three fields of every entry.
                typeWidth = Convert.ToInt32(tokens[i + 1]);
                secondWidth = Convert.ToInt32(tokens[i + 2]);
                thirdWidth = Convert.ToInt32(tokens[i + 3]);
                break;

            case "Index":
                // Collect every consecutive number after the key; the scan is bounded so
                // an Index array at the very end of the tokens cannot overrun them.
                int number;
                for (int j = i + 1; j < tokens.Length && Int32.TryParse(tokens[j], out number); j++) {
                    indexArray.Add(number);
                }
                break;

            case "Size":
                // Parsed as a 32-bit value: files with more than 32767 objects are
                // common and would overflow a 16-bit conversion.
                size = Convert.ToInt32(tokens[i + 1]);
                break;

            case "Predictor":
                predictor = Convert.ToInt32(tokens[i + 1]);
                break;

            case "Prev":
                // The previous cross-reference stream is read immediately, i.e. before
                // the entries of this stream are added to Xref.
                uint prev = Convert.ToUInt32(tokens[i + 1]);
                ReadXRefStream(pdf, Xref, prev);
                break;

            case "Filter":
                string filter = tokens[i + 1];
                if (filter != "FlateDecode") {
                    throw new Exception("only flatedecode filter implemented, but " + filter + " found");
                }
                break;
        }
    }

    // Without an Index entry the stream is a single subsection: (0, Size).
    if (indexArray.Count == 0) {
        indexArray.Add(0);
        indexArray.Add(size);
    }

    if (streamLength == null) {
        throw new Exception("stream without length definition");
    }

    uint endStream = startStream + streamLength.Value;
    byte[] stream = pdf.Slice(startStream, endStream);
    byte[] deflated = Deflate(stream, predictor, typeWidth + secondWidth + thirdWidth);

    int deflatedIndex = 0;
    List<StmObjSubItem> lstObjStm = new List<StmObjSubItem>();

    // indexArray holds (first object number, entry count) pairs. Only the count is
    // needed because the object id is read from the object itself.
    for (int pairIndex = 0; pairIndex < indexArray.Count; pairIndex += 2) {
        for (int j = 0; j < indexArray[pairIndex + 1]; j++) {
            // NOTE(review): the last argument looks like the value used when the field
            // width is 0 - confirm against GetUInt.
            uint type = GetUInt(deflated, typeWidth, ref deflatedIndex, 1);
            uint value2 = GetUInt(deflated, secondWidth, ref deflatedIndex, 0);
            uint value3 = GetUInt(deflated, thirdWidth, ref deflatedIndex, 0);

            // type 0 => free item, ignored.
            if (type == 1) {
                // type 1 => object located at byte offset value2 (value3, the
                // generation, is not used).
                // NOTE(review): Add throws if an id was already registered, e.g. by the
                // Prev stream read above - confirm this is intended.
                XrefItem item = new XrefItem() {
                    id = GetReferenceNumberForward(pdf, value2),
                    pos = value2,
                    IsUsed = true
                };
                Xref.Add(item.id, item);
            } else if (type == 2) {
                // type 2 => object stored inside object stream number value2, at
                // index value3 within that stream.
                StmObjSubItem item = new StmObjSubItem() {
                    StmObjId = (int)value2,
                    Position = (int)value3
                };
                lstObjStm.Add(item);
            }
        }
    }

    // Mark, on each object stream, which of its elements are used.
    foreach (StmObjSubItem item in lstObjStm) {
        Xref[item.StmObjId].stmobjUsed.Add(item.Position);
    }
}
/// <summary>
/// Reads the object that starts at <c>item.pos</c> into <paramref name="item"/>: its text
/// segments go to <c>item.text</c> and, when it has a stream, the stream bytes go to
/// <c>item.streamContent</c>. When the object is an object stream (<c>/ObjStm</c>), the
/// elements listed in <c>item.stmobjUsed</c> are extracted into new items.
/// </summary>
/// <param name="pdf">Bytes that contain the object; the object is read starting at <c>item.pos</c>.</param>
/// <param name="item">Item to fill. Nothing is done when its <c>text</c> list is already populated.</param>
/// <param name="newItems">Receives one new <c>XrefItem</c> per element extracted from an object stream.</param>
/// <param name="dctXref1">Known items keyed by object id; used to resolve an indirect <c>/Length</c>.</param>
/// <exception cref="Exception">
/// Thrown when an indirect <c>Length</c> points to an object that is not a plain number,
/// when a stream has no <c>Length</c>, or when an object stream declares a filter other
/// than <c>FlateDecode</c>.
/// </exception>
private void ReadContent(byte[] pdf, XrefItem item, List<XrefItem> newItems, Dictionary<int, XrefItem> dctXref1) {
    // |--TEXT--||- optional binary -||--TEXT-..-|
    // OBJ.......STREAM......ENDSTREAM.TEXT.ENDOBJ

    // Already read.
    if (item.text.Count > 0) {
        return;
    }

    uint startIndex = item.pos;
    uint startTextIndex = startIndex;
    uint currentIndex = startIndex;
    bool continueWorking = true;

    while (continueWorking) {
        if (currentIndex + 5 < pdf.Length
            && pdf[currentIndex] == 's' && pdf[currentIndex + 1] == 't' && pdf[currentIndex + 2] == 'r'
            && pdf[currentIndex + 3] == 'e' && pdf[currentIndex + 4] == 'a' && pdf[currentIndex + 5] == 'm') {
            // Text segment: everything up to and including the "stream" keyword.
            currentIndex += 6;
            item.text.Add(GetString(pdf, startTextIndex, currentIndex));

            // The stream data begins after one optional CRLF or LF.
            uint startStream = currentIndex;
            if (pdf[startStream] == '\r' && pdf[startStream + 1] == '\n') {
                startStream += 2;
            } else if (pdf[startStream] == '\n') {
                startStream++;
            }

            string dictionary = GetString(pdf, startIndex, startStream);
            string[] parts = dictionary.Split(separator, StringSplitOptions.RemoveEmptyEntries);

            uint? streamLength = null;
            for (int i = 0; i < parts.Length; i++) {
                // Ordinal, case-insensitive match: independent of the current culture
                // and does not allocate a lowered copy of every token.
                if (!string.Equals(parts[i], "length", StringComparison.OrdinalIgnoreCase)) {
                    continue;
                }

                // easy: /Length 1234   => 1234 bytes
                // hard: /Length 3 0 R  => look in object 3 to obtain the 1234
                if (i + 3 < parts.Length && parts[i + 2] == "0" && parts[i + 3] == "R") {
                    int lengthPointer = Convert.ToInt32(parts[i + 1]);
                    XrefItem lengthItem = dctXref1[lengthPointer];
                    ReadContent(pdf, lengthItem, newItems, dctXref1);
                    if (lengthItem.text.Count != 1) {
                        throw new Exception("Length points to an object that is not a number");
                    }

                    // "3 0 obj 1234 endobj" => the fourth token is the length.
                    string[] partsLength = lengthItem.text[0].Split(separator, StringSplitOptions.RemoveEmptyEntries);
                    streamLength = Convert.ToUInt32(partsLength[3]);
                } else {
                    streamLength = Convert.ToUInt32(parts[i + 1]);
                }

                // Only the first length key is considered.
                break;
            }

            if (streamLength == null) {
                throw new Exception("stream without length definition");
            }

            // NOTE(review): the extra + 1 is kept from the original; whether Slice
            // treats the end index as inclusive or exclusive is not visible here.
            uint endStream = startStream + streamLength.Value + 1;
            startTextIndex = endStream;
            currentIndex = endStream + 9; // 9 = endstream text length
            item.streamContent = pdf.Slice(startStream, endStream);
        }

        if (currentIndex + 5 < pdf.Length
            && pdf[currentIndex] == 'e' && pdf[currentIndex + 1] == 'n' && pdf[currentIndex + 2] == 'd'
            && pdf[currentIndex + 3] == 'o' && pdf[currentIndex + 4] == 'b' && pdf[currentIndex + 5] == 'j') {
            currentIndex += 6;
            continueWorking = false;
        } else {
            currentIndex++;
        }

        if (currentIndex >= pdf.Length) {
            continueWorking = false;
        }
    }

    // Trailing text segment (after the stream, or the whole object when there is none).
    item.text.Add(GetString(pdf, startTextIndex, currentIndex));

    // ObjStm
    //   10 0 obj
    //   <</Filter/FlateDecode/First 94/Length 773/N 13/Type/ObjStm>>stream
    //   11 0 12 547 13 665   <- num obj, offset, num obj, offset...
    //   << obj1 >>
    //   << obj2 >>
    //   ...
    //   endstream endobj
    if (!item.text[0].Contains("/ObjStm")) {
        return;
    }

    // The container itself is not kept; only the elements extracted from it are.
    item.IsUsed = false;

    string[] headerParts = item.text[0].Split(separator, StringSplitOptions.RemoveEmptyEntries);
    uint firstElementOffset = 0;
    uint elementCount = 0;
    for (int i = 0; i < headerParts.Length; i++) {
        switch (headerParts[i]) {
            case "First":
                firstElementOffset = Convert.ToUInt32(headerParts[i + 1]);
                break;

            case "N":
                elementCount = Convert.ToUInt32(headerParts[i + 1]);
                break;

            case "Filter":
                string filter = headerParts[i + 1];
                if (filter != "FlateDecode") {
                    throw new Exception("only flatedecode filter implemented, but " + filter + " found");
                }
                break;
        }
    }

    byte[] deflated = Deflate(item.streamContent, 1, 0);

    // The preamble (before the First offset) holds "object number, offset" pairs.
    string preamble = GetString(deflated, 0, firstElementOffset);
    string[] preambleParts = preamble.Split(separator, StringSplitOptions.RemoveEmptyEntries);

    foreach (int subitemUsed in item.stmobjUsed) {
        int objectIndex = Convert.ToInt32(preambleParts[subitemUsed * 2]);
        uint offset = Convert.ToUInt32(preambleParts[subitemUsed * 2 + 1]) + firstElementOffset;

        // An element ends where the next one starts; the last one ends with the data.
        // Written without "elementCount - 1" so that a count of 0 cannot wrap around
        // in unsigned arithmetic.
        uint offsetContinuous;
        if ((long)subitemUsed + 1 < elementCount) {
            offsetContinuous = Convert.ToUInt32(preambleParts[(subitemUsed + 1) * 2 + 1]) + firstElementOffset;
        } else {
            offsetContinuous = (uint)deflated.Length;
        }

        XrefItem newItem = new XrefItem() {
            id = objectIndex,
            pos = 0,
            IsUsed = true
        };
        newItems.Add(newItem);

        // Read the element from its own slice, then wrap it as a regular object.
        ReadContent(deflated.Slice(offset, offsetContinuous), newItem, newItems, dctXref1);
        newItem.text[0] = objectIndex.ToString() + " 0 obj\n" + newItem.text[0] + "\nendobj";
    }
}