/// <summary>
/// Rebuilds <c>_widths</c> from the font's "Widths" array, keyed by consecutive
/// character codes starting at the font's "FirstChar" value.
/// </summary>
/// <remarks>
/// Every width is divided by 1000 before it is stored. A width that is (almost)
/// zero is replaced by 500, the same value used as the fallback for
/// <c>PdfElementUtils.GetReal</c>. When the "Widths" entry is missing or is not a
/// <c>PdfArray</c>, <c>_widths</c> is left empty instead of throwing.
/// </remarks>
private void ParseSizes()
{
    // TODO: PdfFont.ParseSizes: SubType:Type3 Uses a FontMatrix that may not correspond to 1/1000th
    double glyphSpaceToTextSpace = 1000.0;

    _widths = new Dictionary<char, double>();

    // Without a usable "Widths" array there is nothing to measure; keep the table empty.
    PdfArray widths = null;
    if (_baseData.Values.ContainsKey("Widths"))
    {
        widths = _baseData.Values["Widths"] as PdfArray;
    }
    if (widths == null)
    {
        return;
    }

    char actualChar = (char)_baseData.GetParamAsInt("FirstChar");
    foreach (IPdfElement elem in widths.Values)
    {
        double width = PdfElementUtils.GetReal(elem, 500);
        if (width < 0.0001f && width > -0.0001f)
        {
            // Treat a (nearly) zero width as "unknown" and use the default.
            width = 500;
        }
        _widths.Add(actualChar, width / glyphSpaceToTextSpace);
        actualChar++;
    }

    // FIXME: Calculate real height
}
/// <summary>
/// Processes each element of <paramref name="array"/> in order: strings are handed to
/// <c>OpTextPut</c>, integer and real numbers reduce <c>_textWidth</c>, and nested
/// arrays are processed recursively. Does nothing while <c>inText</c> is false.
/// </summary>
/// <param name="array">Array whose <c>Values</c> are walked.</param>
private void OpTextPutMultiple(PdfArray array)
{
    if (!inText)
    {
        return;
    }

    foreach (IPdfElement item in array.Values)
    {
        PdfString shownText = item as PdfString;
        if (shownText != null)
        {
            OpTextPut(shownText.Value);
            continue;
        }

        if (item is PdfInteger || item is PdfReal)
        {
            // The number is scaled by 1/1000 and by the font size, then subtracted from the running width.
            double adjustment = PdfElementUtils.GetReal(item, 0);
            _textWidth -= (adjustment / 1000) * _fontSize;
            continue;
        }

        PdfArray nested = item as PdfArray;
        if (nested != null)
        {
            OpTextPutMultiple(nested);
        }
    }
}
/// <summary>
/// Scans forward from the current stream position and returns the next indirect
/// object of the form "id generation obj ... endobj".
/// </summary>
/// <remarks>
/// Comments are skipped, and "startxref", "xref" and "trailer" sections are skipped
/// without being interpreted. Any other unexpected token is treated as garbage: the
/// parser looks ahead for the next " obj" marker and backtracks over the two numbers
/// that precede it, or jumps to the end of the stream when no marker is left.
/// </remarks>
/// <param name="knownObjects">
/// Objects parsed so far; searched when a stream's "Length" is given as an object reference.
/// </param>
/// <returns>The parsed object, or <c>null</c> when the end of the stream is reached first.</returns>
/// <exception cref="Exception">
/// A "stream" keyword follows an element that is not a dictionary, or the stream data
/// is not followed by an "endstream" token.
/// </exception>
public PdfObject ParseObject(List<PdfObject> knownObjects)
{
    PdfObject obj = null;
    do
    {
        SkipWhitespace();
        byte character = PeekChar();
        if (character == '%')
        {
            SkipComment();
        }
        else if (IsDigit(character))
        {
            IPdfElement objectID = ParseNumber();
            SkipWhitespace();
            IPdfElement objectGeneration = ParseNumber();
            SkipWhitespace();
            string token = ParseToken();
            if (token == "obj")
            {
                SkipWhitespace();
                IPdfElement element = ParseElement();
                string endToken = ParseToken();

                // Intercept streams
                if (endToken == "stream")
                {
                    PdfDictionary streamDict = element as PdfDictionary;
                    if (streamDict == null)
                    {
                        throw new Exception(string.Format("Stream after a not dictionary element at: {0}", _streamPosition));
                    }
                    SkipEndOfLine();

                    // Find the length of the stream
                    long length = -1;
                    if (streamDict.Values.ContainsKey("Length"))
                    {
                        length = PdfElementUtils.GetInt(streamDict.Values["Length"], -1);
                        if (length == -1 && streamDict.Values["Length"] is PdfObjectReference)
                        {
                            // The length lives in another object; look it up among those already parsed.
                            IPdfElement lengthObj = SearchObjectID(knownObjects, ((PdfObjectReference)streamDict.Values["Length"]).ObjectID);
                            length = PdfElementUtils.GetInt(lengthObj, -1);
                        }
                    }
                    if (length == -1)
                    {
                        // No usable length: measure up to the closest "endstream" marker instead.
                        const char lineFeed = (char)0x0A;
                        const char carriageReturn = (char)0x0D;
                        length = MeasureToMarkers(new char[][] {
                            new char[] { carriageReturn, lineFeed, 'e', 'n', 'd', 's', 't', 'r', 'e', 'a', 'm' },
                            new char[] { lineFeed, 'e', 'n', 'd', 's', 't', 'r', 'e', 'a', 'm' },
                            new char[] { 'e', 'n', 'd', 's', 't', 'r', 'e', 'a', 'm', lineFeed },
                            new char[] { 'e', 'n', 'd', 's', 't', 'r', 'e', 'a', 'm', carriageReturn, lineFeed },
                        });
                    }

                    // Get the stream
                    byte[] streamBody = GetRawData(length);
                    SkipEndOfLine();
                    endToken = ParseToken();
                    if (endToken != "endstream")
                    {
                        // Report the token that was actually read here, not the earlier "obj" keyword.
                        throw new Exception(string.Format("Expected \"endstream\" token, \"{0}\" found at: {1}", endToken, _streamPosition));
                    }
                    SkipWhitespace();
                    endToken = ParseToken();

                    PdfStream stream = new PdfStream();
                    stream.Dictionary = streamDict;
                    stream.Data = streamBody;
                    element = stream;
                }

                if (endToken == "endobj")
                {
                    obj = new PdfObject();
                    obj.ObjectID = (int)((PdfInteger)objectID).Value;
                    obj.ObjectGeneration = (int)((PdfInteger)objectGeneration).Value;
                    obj.Data = element;
                    break;
                }
            }
        }
        else
        {
            string token = ParseToken();
            if (token == "startxref")
            {
                // TODO: PdfParser: Ignoring startxref for now
                SkipEndOfLine();
                SkipToEndOfLine();
                SkipEndOfLine();
                SkipToEndOfLine();
                SkipEndOfLine();
                SkipWhitespace();
                continue;
            }
            if (token == "xref")
            {
                // TODO: PdfParser: Ignoring xref for now
                SkipToEndOfLine();
                SkipEndOfLine();
                do
                {
                    // Each subsection starts with two numbers; the first is discarded and
                    // the second is used as the number of lines to skip.
                    SkipWhitespace();
                    ParseNumber();
                    SkipWhitespace();
                    PdfInteger refNumber = ParseNumber() as PdfInteger;
                    SkipEndOfLine();
                    if (refNumber == null)
                    {
                        // Malformed subsection header: stop skipping and let the outer loop recover.
                        break;
                    }
                    for (int i = 0; i < refNumber.Value; i++)
                    {
                        SkipToEndOfLine();
                        SkipEndOfLine();
                    }

                    // Peek at the next element without consuming it: another integer
                    // means another subsection follows.
                    long currentPosition = _streamPosition;
                    IPdfElement testElem = ParseElement();
                    _streamPosition = currentPosition;
                    if ((testElem is PdfInteger) == false)
                    {
                        break;
                    }
                } while (IsEndOfStream() == false);
                continue;
            }
            if (token == "trailer")
            {
                // TODO: PdfParser: Ignoring trailer for now
                SkipEndOfLine();
                ParseElement();
                SkipWhitespace();
                SkipToEndOfLine();
                SkipEndOfLine();
                SkipToEndOfLine();
                SkipEndOfLine();
                SkipToEndOfLine();
                SkipEndOfLine();
                SkipWhitespace();
                continue;
            }

            // Try to find an object marker
            const char lineFeed = (char)0x0A;
            const char carriageReturn = (char)0x0D;
            long distToObject = MeasureToMarkers(new char[][] {
                new char[] { ' ', 'o', 'b', 'j', lineFeed },
                new char[] { ' ', 'o', 'b', 'j', carriageReturn, lineFeed },
            });
            if (distToObject > 0)
            {
                // Object marker found, backtrack and retry
                long originalPosition = _streamPosition;

                // Where to resume when the backtrack fails: just past the 4-character " obj" marker.
                long skipGarbagePosition = originalPosition + distToObject + 4;

                _streamPosition += distToObject;

                // Walk backwards over "<digits> <whitespace> <digits> <whitespace>"; every step
                // must move the position, otherwise this marker is not a real object header.
                long marker = _streamPosition;
                SkipWhitespaceBack();
                if (_streamPosition == marker)
                {
                    // Abort backtrack, skip garbage
                    _streamPosition = skipGarbagePosition;
                    continue;
                }

                marker = _streamPosition;
                SkipDigitsBack();
                if (_streamPosition == marker)
                {
                    // Abort backtrack, skip garbage
                    _streamPosition = skipGarbagePosition;
                    continue;
                }

                marker = _streamPosition;
                SkipWhitespaceBack();
                if (_streamPosition == marker)
                {
                    // Abort backtrack, skip garbage
                    _streamPosition = skipGarbagePosition;
                    continue;
                }

                marker = _streamPosition;
                SkipDigitsBack();
                if (_streamPosition == marker)
                {
                    // Abort backtrack, skip garbage
                    _streamPosition = skipGarbagePosition;
                    continue;
                }

                NextChar();
            }
            else
            {
                // No more obj markers found, abort all.
                _streamPosition = _stream.Length;
            }
        }
    } while (IsEndOfStream() == false);

    return obj;
}
/// <summary>
/// Replays every content action of <c>_page</c> in order, dispatching on the operator
/// token to the matching <c>Op*</c> handler, then calls <c>FlushTextElement</c>.
/// </summary>
/// <remarks>
/// Graphics-state, text-state, text-positioning and text-showing operators are handled.
/// A number of path, colour, clipping and image operators are recognised but deliberately
/// ignored (see the TODO notes). Any other token is only counted as unknown.
/// </remarks>
private void ProcessPageContent()
{
    // Diagnostic tallies: incremented below but not read anywhere in this method.
    int unknowCount = 0;
    int lineCount = 0;
    int strokeCount = 0;
    int pathCount = 0;

    for (int i = 0; i < _page.ContentActions.Count; i++)
    {
        PdfContentAction action = _page.ContentActions[i];

        switch (action.Token)
        {
            // Special graphics state
            case "q":
                OpPushGraphState();
                break;

            case "Q":
                OpPopGraphState();
                break;

            case "cm":
            {
                double a = PdfElementUtils.GetReal(action.Parameters[0], 0);
                double b = PdfElementUtils.GetReal(action.Parameters[1], 0);
                double c = PdfElementUtils.GetReal(action.Parameters[2], 0);
                double d = PdfElementUtils.GetReal(action.Parameters[3], 0);
                double e = PdfElementUtils.GetReal(action.Parameters[4], 0);
                double f = PdfElementUtils.GetReal(action.Parameters[5], 0);
                OpSetGraphMatrix(a, b, c, d, e, f);
                break;
            }

            // Text Operations
            case "BT":
                OpBeginText();
                break;

            case "ET":
                OpEndText();
                break;

            case "Tc":
            {
                double charSpacing = PdfElementUtils.GetReal(action.Parameters[0], 0);
                OpTextCharSpacing(charSpacing);
                break;
            }

            case "Tw":
            {
                double wordSpacing = PdfElementUtils.GetReal(action.Parameters[0], 0);
                OpTextWordSpacing(wordSpacing);
                break;
            }

            case "Tf":
            {
                string fontName = PdfElementUtils.GetString(action.Parameters[0], string.Empty);
                double fontSize = PdfElementUtils.GetReal(action.Parameters[1], 0);
                OpTextFont(fontName, fontSize);
                break;
            }

            case "TL":
            {
                double leading = PdfElementUtils.GetReal(action.Parameters[0], 0);
                OpTextLeading(leading);
                break;
            }

            case "Td":
            {
                double x = PdfElementUtils.GetReal(action.Parameters[0], 0);
                double y = PdfElementUtils.GetReal(action.Parameters[1], 0);
                OpTextDisplace(x, y);
                break;
            }

            case "TD":
            {
                // Same as Td, but the leading is first set to the negated vertical offset.
                double x = PdfElementUtils.GetReal(action.Parameters[0], 0);
                double y = PdfElementUtils.GetReal(action.Parameters[1], 0);
                OpTextLeading(-y);
                OpTextDisplace(x, y);
                break;
            }

            case "Tm":
            {
                double a = PdfElementUtils.GetReal(action.Parameters[0], 0);
                double b = PdfElementUtils.GetReal(action.Parameters[1], 0);
                double c = PdfElementUtils.GetReal(action.Parameters[2], 0);
                double d = PdfElementUtils.GetReal(action.Parameters[3], 0);
                double e = PdfElementUtils.GetReal(action.Parameters[4], 0);
                double f = PdfElementUtils.GetReal(action.Parameters[5], 0);
                OpSetTextMatrix(a, b, c, d, e, f);
                break;
            }

            case "T*":
                OpTextLineFeed();
                break;

            case "Tj":
            {
                string text = PdfElementUtils.GetString(action.Parameters[0], string.Empty);
                OpTextPut(text);
                break;
            }

            case "'":
            {
                string text = PdfElementUtils.GetString(action.Parameters[0], string.Empty);
                OpTextLineFeed();
                OpTextPut(text);
                break;
            }

            case "\"":
            {
                // Operand order is: word spacing, character spacing, string.
                // Per the PDF text-showing operator table this operator also moves to the
                // next line before showing the string, exactly like the ' operator.
                double wordSpacing = PdfElementUtils.GetReal(action.Parameters[0], 0);
                double charSpacing = PdfElementUtils.GetReal(action.Parameters[1], 0);
                string text = PdfElementUtils.GetString(action.Parameters[2], string.Empty);
                OpTextCharSpacing(charSpacing);
                OpTextWordSpacing(wordSpacing);
                OpTextLineFeed();
                OpTextPut(text);
                break;
            }

            case "TJ":
                OpTextPutMultiple((PdfArray)action.Parameters[0]);
                break;

            // Recognised but not interpreted yet.
            case "Tz": // TODO: PdfTextExtractor: Horizontal Scale
            case "Tr": // TODO: PdfTextExtractor: Rendering mode
            case "Ts": // TODO: PdfTextExtractor: Text rise
            case "re": // TODO: PdfTextExtractor: Interpret this
            case "f":  // TODO: PdfTextExtractor: Interpret this
            case "g":  // TODO: PdfTextExtractor: Interpret this
            case "rg": // TODO: PdfTextExtractor: Interpret this
            case "BI": // TODO: PdfTextExtractor: Interpret this
            case "ID": // TODO: PdfTextExtractor: Interpret this
            case "EI": // TODO: PdfTextExtractor: Interpret this
            case "n":  // TODO: PdfTextExtractor: Interpret this
            case "Do": // TODO: PdfTextExtractor: Interpret this
            case "m":  // TODO: PdfTextExtractor: Interpret this "moveto: Begin new subpath"
            case "W":  // TODO: PdfTextExtractor: Interpret this "clip: Set clipping path using nonzero winding number rule"
            case "W*": // TODO: PdfTextExtractor: Interpret this "eoclip: Set clipping path using even-odd rule"
            case "w":  // TODO: PdfTextExtractor: Interpret this "setlinewidth: Set line width"
            case "G":  // TODO: PdfTextExtractor: Interpret this "setgray: Set gray level for stroking operations"
            case "M":  // TODO: PdfTextExtractor: Interpret this "setmiterlimit: Set miter limit"
                break;

            case "l":
                // TODO: PdfTextExtractor: Interpret this "lineto: Append straight line segment to path"
                lineCount++;
                break;

            case "h":
                // TODO: PdfTextExtractor: Interpret this "closepath: Close subpath"
                pathCount++;
                break;

            case "S":
                // TODO: PdfTextExtractor: Interpret this "stroke: Stroke path"
                strokeCount++;
                break;

            default:
                unknowCount++;
                break;
        }
    }

    FlushTextElement();
}
/// <summary>
/// Click handler: loads the PDF named in <c>txtPdfPath</c> and writes a textual report
/// to <c>txtOutput</c>.
/// </summary>
/// <remarks>
/// The report starts with document totals (objects, objects with a usage count of zero,
/// streams, pages). For every page it then lists a header (with the four "CropBox"
/// numbers when that entry is a <c>PdfArray</c>) followed by one line per extracted text
/// element. When the file does not exist a message box is shown and nothing else happens.
/// </remarks>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void btnProcess_Click(object sender, EventArgs e)
{
    string pdfPath = txtPdfPath.Text;
    if (System.IO.File.Exists(pdfPath) == false)
    {
        MessageBox.Show("File does not exist");
        return;
    }

    PdfDocument doc = PdfDocument.Load(pdfPath);

    int nObjects = doc.Objects.Count;
    int nRootObject = doc.Objects.Count(obj => obj.UsageCount == 0);
    int nStreams = doc.Objects.Count(obj => obj.Data.Type == PdfElementTypes.Stream);
    int nPages = doc.Pages.Count;

    List<string> lines = new List<string>();
    lines.Add(string.Format("Filename : {0}", System.IO.Path.GetFileNameWithoutExtension(pdfPath)));
    lines.Add(string.Format("Number of Objects : {0}", nObjects));
    lines.Add(string.Format("Number of Roots : {0}", nRootObject));
    lines.Add(string.Format("Number of Streams : {0}", nStreams));
    lines.Add(string.Format("Number of Pages : {0}", nPages));

    int pageNumber = 1;
    foreach (PdfDocumentPage page in doc.Pages)
    {
        lines.Add("-----------------------------------------------------------------------------------------");

        // Only print the box when the entry really is an array; otherwise fall back
        // to the plain page header instead of dereferencing a null cast result.
        PdfArray cropBox = null;
        if (page.BaseData.Values.ContainsKey("CropBox"))
        {
            cropBox = page.BaseData.Values["CropBox"] as PdfArray;
        }

        if (cropBox != null)
        {
            lines.Add(string.Format("Page({0} of {1}): {2} {3} {4} {5}",
                pageNumber,
                doc.Pages.Count,
                PdfElementUtils.GetReal(cropBox.Values[0], 0),
                PdfElementUtils.GetReal(cropBox.Values[1], 0),
                PdfElementUtils.GetReal(cropBox.Values[2], 0),
                PdfElementUtils.GetReal(cropBox.Values[3], 0)));
        }
        else
        {
            lines.Add(string.Format("Page({0} of {1}): ", pageNumber, doc.Pages.Count));
        }
        pageNumber++;

        PdfTextExtractor extractor = new PdfTextExtractor(page);
        foreach (PdfTextElement textElement in extractor.Elements)
        {
            string fontName = textElement.Font == null ? "#NULL#" : textElement.Font.Name;
            if (fontName == "#NULL#" && textElement.Childs.Count > 0)
            {
                // The element has no font of its own: list the fonts of its children, ';'-separated.
                fontName = string.Join(
                    ";",
                    textElement.Childs
                        .Select(c => c.Font == null ? "#NULL#" : c.Font.Name)
                        .ToArray());
            }

            lines.Add(string.Format("Text({0}, {1})({2}, {3})[{4}]: \"{5}\"",
                Math.Round(textElement.Matrix.Matrix[0, 2], 2),
                Math.Round(textElement.Matrix.Matrix[1, 2], 2),
                Math.Round(textElement.VisibleWidth, 2),
                Math.Round(textElement.VisibleHeight, 2),
                fontName,
                textElement.VisibleText));
        }
    }

    txtOutput.Lines = lines.ToArray();
}