public static void ApplyToUnitCode(
  DictionaryToken toUnicodeStream,
  ref char[] encoding,
  out string toUnicodeHeader,
  out SortedDictionary<int, char> cMap)
{
  var tokeniser = toUnicodeStream.GetStreamBytes();
  if (tokeniser.GetStreamOpCode("begincmap") is null) {
    throw tokeniser.StreamException("ToUnicode stream is missing 'begincmap'.");
  }
  tokeniser.SetStreamMark();
  if (tokeniser.GetStreamOpCode("endcodespacerange") is null) {
    throw tokeniser.StreamException("ToUnicode stream is missing 'endcodespacerange'.");
  }
  toUnicodeHeader = tokeniser.GetStreamMarkedText();

  cMap = new SortedDictionary<int, char>();
  while (true) {
    var opCode = tokeniser.GetStreamOpCode();
    if (opCode is null) {
      throw tokeniser.StreamException("ToUnicode stream incomplete.");
    }

    var opCodeSpan = opCode.Value.Span;
    if (isEqual("beginbfchar", opCodeSpan)) {
      //read the number of bfchar lines, then one cid / unicode pair per line
      tokeniser.StartStreamArgumentReading();
      var linesCount = tokeniser.GetStreamInt();
      tokeniser.EndStreamArgumentReading();
      for (int lineIndex = 0; lineIndex < linesCount; lineIndex++) {
        var cid = tokeniser.GetStreamCid();
        var unicodeChar = (char)tokeniser.GetStreamUnicode();
        encoding[cid] = unicodeChar;
        cMap.Add(cid, unicodeChar);
      }
      opCode = tokeniser.GetStreamOpCode();
      opCodeSpan = opCode!.Value.Span;
      if (!isEqual("endbfchar", opCodeSpan)) {
        throw tokeniser.StreamException("ToUnicode stream is missing 'endbfchar' after 'beginbfchar'.");
      }

    } else if (isEqual("beginbfrange", opCodeSpan)) {
      //read the number of bfrange lines, then one cidStart / cidEnd / unicode triple per line
      tokeniser.StartStreamArgumentReading();
      var linesCount = tokeniser.GetStreamInt();
      tokeniser.EndStreamArgumentReading();
      for (int lineIndex = 0; lineIndex < linesCount; lineIndex++) {
        var cidStart = tokeniser.GetStreamCid();
        var cidEnd = tokeniser.GetStreamCid();
        if (cidEnd < cidStart) {
          throw tokeniser.StreamException($"ToUnicode: beginbfrange cid1 '{cidStart}' must not be greater than cid2 '{cidEnd}'.");
        }

        var unicodeChar = tokeniser.GetStreamUnicode();
        for (ushort cidIndex = cidStart; cidIndex <= cidEnd; cidIndex++) {
          cMap.Add(cidIndex, (char)unicodeChar);
          encoding[cidIndex] = (char)unicodeChar++;
        }
      }
      opCode = tokeniser.GetStreamOpCode();
      opCodeSpan = opCode!.Value.Span;
      if (!isEqual("endbfrange", opCodeSpan)) {
        throw tokeniser.StreamException("ToUnicode stream is missing 'endbfrange' after 'beginbfrange'.");
      }

    } else if (isEqual("endcmap", opCodeSpan)) {
      break;
    }
  }
}
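//Usage sketch (assumption, not part of the original source): shows how ApplyToUnitCode could be
//called for a font whose dictionary contains a /ToUnicode stream. 'toUnicodeStream' is assumed to
//be that stream's DictionaryToken; the encoding array is sized for the full <0000> <FFFF>
//codespace shown in the CMap example further below.
public static void DumpToUnicodeMap(DictionaryToken toUnicodeStream) {
  var encoding = new char[0x10000];
  for (int i = 0; i < encoding.Length; i++) {
    encoding[i] = (char)i;                                   //start with the identity mapping
  }
  ApplyToUnitCode(toUnicodeStream, ref encoding, out var toUnicodeHeader, out var cMap);
  System.Console.WriteLine(toUnicodeHeader);                 //header text up to endcodespacerange
  foreach (var cidUnicodePair in cMap) {
    System.Console.WriteLine($"cid {cidUnicodePair.Key:X4} => U+{(int)cidUnicodePair.Value:X4}");
  }
}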
public PdfContent(DictionaryToken contentsDictionaryToken, IReadOnlyDictionary<string, PdfFont> fonts) {
  contentsDictionaryToken.PdfObject = this;
  var tokeniser = contentsDictionaryToken.GetStreamBytes();
  Fonts = fonts;
  decimal? lastLineOffset = null;
  string? newText = null;
  try {
    //example of a content stream:
    //q 0.12 0 0 0.12 0 0 cm
    ///R7 gs
    //0 0 0 rg
    //q
    //8.33333 0 0 8.33333 0 0 cm
    //BT
    //  /F1 24 Tf
    //  100 100 Td
    //  (Hello World) Tj
    //ET
    while (true) {
      //find BT (begin text object)
      ReadOnlySpan<byte> opCodeSpan;
      do {
        var opCode = tokeniser.GetStreamOpCode();
        if (opCode is null) {
          return; //end of stream reached
        }

        opCodeSpan = opCode.Value.Span;
        if (opCodeSpan.Length == 2 && opCodeSpan[0] == 'B' && opCodeSpan[1] == 'I') {
          //BI starts an inline image whose binary data must not be parsed as operators
          tokeniser.SkipInlineImage();
          continue;
        }
      } while (opCodeSpan.Length != 2 || opCodeSpan[0] != 'B' || opCodeSpan[1] != 'T');

      //process text operators until ET (end text object)
      PdfFont? font = null;
      while (true) {
        var opCode = tokeniser.GetStreamOpCode();
        if (opCode is null) {
          //the matching ET should appear before the end of the stream
          Error += "Error Content stream: stream end found but 'ET' still missing." + Environment.NewLine;
          return;
        }

        opCodeSpan = opCode.Value.Span;
        if (opCodeSpan.Length == 1) {
          if (opCodeSpan[0] == '\'') {
            //': move to next line and show text
            tokeniser.StartStreamArgumentReading();
            newText = tokeniser.GetStreamString(font);
            tokeniser.EndStreamArgumentReading();
          } else if (opCodeSpan[0] == '"') {
            //": set word and character spacing, move to next line and show text
            tokeniser.StartStreamArgumentReading();
            tokeniser.SkipStreamArgument();
            tokeniser.SkipStreamArgument();
            newText = tokeniser.GetStreamString(font);
            tokeniser.EndStreamArgumentReading();
          } else {
            continue;
          }

        } else if (opCodeSpan.Length == 2) {
          if (opCodeSpan[0] == 'T') {
            var opCodeChar1 = opCodeSpan[1];
            if (opCodeChar1 == 'j') {
              //Tj: show text
              tokeniser.StartStreamArgumentReading();
              newText = tokeniser.GetStreamString(font);
              tokeniser.EndStreamArgumentReading();
            } else if (opCodeChar1 == 'J') {
              //TJ: show text array with individual glyph positioning
              tokeniser.StartStreamArgumentReading();
              newText = tokeniser.GetStreamArrayString(font);
              tokeniser.EndStreamArgumentReading();
            } else if (opCodeChar1 == 'f') {
              //Tf: select font
              tokeniser.StartStreamArgumentReading();
              PdfFontName = tokeniser.GetStreamName();
              if (!fonts.TryGetValue(PdfFontName, out font)) {
                Error += $"Could not find font '{PdfFontName}'." + Environment.NewLine;
              }
              tokeniser.EndStreamArgumentReading();
              continue;
            } else if (opCodeChar1 == 'd' || opCodeChar1 == 'D' || opCodeChar1 == '*') {
              //Td, TD, T*: move to a new line
              Text += Environment.NewLine;
              continue;
            } else if (opCodeChar1 == 'm') {
              //Tm: set text matrix; a changed vertical offset is treated as a new line
              tokeniser.StartStreamArgumentReading();
              tokeniser.GetStreamInt();
              tokeniser.GetStreamInt();
              tokeniser.GetStreamInt();
              tokeniser.GetStreamInt();
              tokeniser.GetStreamNumber();
              var lineOffset = tokeniser.GetStreamNumber();
              if (lastLineOffset != lineOffset) {
                lastLineOffset = lineOffset;
                if (Text != null) {
                  Text += Environment.NewLine;
                }
              }
              tokeniser.EndStreamArgumentReading();
              continue;
            } else {
              //skip operators like TL
              continue;
            }
          } else if (opCodeSpan[0] == 'E' && opCodeSpan[1] == 'T') {
            break;
          } else {
            //other 2 character opcodes
            continue;
          }

        } else {
          //opcode is longer than 2 characters
          continue;
        }

        Text += newText + tokeniser.ContentDelimiter;
      }
    }
    //var s = tokeniser.StreamBytesToString();

  } catch (Exception ex) {
    if (Exception is null) {
      Exception = "";
    } else {
      Exception += Environment.NewLine + Environment.NewLine;
    }
    if (ex is PdfStreamException || ex is PdfException) {
      Exception += ex.ToDetailString();
    } else {
      Exception += ex.ToDetailString() + Environment.NewLine + tokeniser.ShowStreamContentAtIndex();
    }
  }
}
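//Usage sketch (assumption, not part of the original source): extracts the text of one page.
//'PdfPage', 'page.GetContentsToken()' and 'page.Fonts' are hypothetical placeholders for however
//the surrounding code exposes the page's /Contents DictionaryToken and its fonts; Text, Error and
//Exception are the properties filled in by the PdfContent constructor above.
public static string GetPageText(PdfPage page) {
  var pdfContent = new PdfContent(page.GetContentsToken(), page.Fonts);
  if (pdfContent.Exception != null) {
    throw new Exception("Content stream could not be parsed: " + pdfContent.Exception);
  }
  if (pdfContent.Error != null) {
    System.Console.WriteLine(pdfContent.Error);              //non fatal problems, e.g. a missing font
  }
  return pdfContent.Text ?? "";
}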
// example of a ToUnicode CMap stream:
// /CIDInit /ProcSet findresource begin
// 11 dict begin
// begincmap
// /CIDSystemInfo
// << /Registry (Adobe)
//    /Ordering (UCS)
//    /Supplement 0
// >> def
// /CMapName /Adobe-Identity-UCS def
// /CMapType 2 def
// 1 begincodespacerange
// <0000> <FFFF>
// endcodespacerange
// 2 beginbfchar
// <0003> <0020>
// <00B1> <2013>
// endbfchar
// 1 beginbfrange
// <00B5> <00B6> <2018>
// endbfrange
// endcmap
// CMapName currentdict /CMap defineresource pop
// end
// end
public PdfToUnitCode(DictionaryToken toUnicodeStream) {
  var tokeniser = toUnicodeStream.GetStreamBytes();
  if (tokeniser.GetStreamOpCode("begincmap") is null) {
    throw tokeniser.StreamException("ToUnicode stream is missing 'begincmap'.");
  }
  tokeniser.SetStreamMark();
  if (tokeniser.GetStreamOpCode("endcodespacerange") is null) {
    throw tokeniser.StreamException("ToUnicode stream is missing 'endcodespacerange'.");
  }
  Header = tokeniser.GetStreamMarkedText();

  //collect all bfchar and bfrange entries and track the highest cid used
  var cidUniCodes = new List<(ushort, char)>();
  var cidRangeUniCodes = new List<(ushort, ushort, char)>();
  var minCid = int.MaxValue;
  var maxCid = int.MinValue;
  while (true) {
    var opCode = tokeniser.GetStreamOpCode();
    if (opCode is null) {
      throw tokeniser.StreamException("ToUnicode stream incomplete.");
    }

    var opCodeSpan = opCode.Value.Span;
    if (isEqual("beginbfchar", opCodeSpan)) {
      tokeniser.StartStreamArgumentReading();
      var linesCount = tokeniser.GetStreamInt();
      tokeniser.EndStreamArgumentReading();
      for (int lineIndex = 0; lineIndex < linesCount; lineIndex++) {
        var cid = tokeniser.GetStreamCid();
        minCid = Math.Min(minCid, cid);
        maxCid = Math.Max(maxCid, cid);
        var unicodeChar = tokeniser.GetStreamUnicode();
        cidUniCodes.Add((cid, (char)unicodeChar));
      }
      opCode = tokeniser.GetStreamOpCode();
      opCodeSpan = opCode!.Value.Span;
      if (!isEqual("endbfchar", opCodeSpan)) {
        throw tokeniser.StreamException("ToUnicode stream is missing 'endbfchar' after 'beginbfchar'.");
      }

    } else if (isEqual("beginbfrange", opCodeSpan)) {
      tokeniser.StartStreamArgumentReading();
      var linesCount = tokeniser.GetStreamInt();
      tokeniser.EndStreamArgumentReading();
      for (int lineIndex = 0; lineIndex < linesCount; lineIndex++) {
        var cidStart = tokeniser.GetStreamCid();
        minCid = Math.Min(minCid, cidStart);
        var cidEnd = tokeniser.GetStreamCid();
        maxCid = Math.Max(maxCid, cidEnd);
        if (cidEnd < cidStart) {
          throw tokeniser.StreamException($"ToUnicode: beginbfrange cid1 '{cidStart}' must not be greater than cid2 '{cidEnd}'.");
        }

        var unicodeChar = tokeniser.GetStreamUnicode();
        cidRangeUniCodes.Add((cidStart, cidEnd, (char)unicodeChar));
      }
      opCode = tokeniser.GetStreamOpCode();
      opCodeSpan = opCode!.Value.Span;
      if (!isEqual("endbfrange", opCodeSpan)) {
        throw tokeniser.StreamException("ToUnicode stream is missing 'endbfrange' after 'beginbfrange'.");
      }

    } else if (isEqual("endcmap", opCodeSpan)) {
      break;
    }
  }

  //initialise the cid => unicode table with the identity mapping, then apply the collected
  //bfchar and bfrange entries; a cid that gets mapped twice is reported as an error
  unicodes = new char[maxCid + 1];
  for (int unicodesIndex = 0; unicodesIndex < unicodes.Length; unicodesIndex++) {
    unicodes[unicodesIndex] = (char)unicodesIndex;
  }
  foreach ((ushort cid, char unicodeChar) cidUniCode in cidUniCodes) {
    if (unicodes[cidUniCode.cid] != cidUniCode.cid) {
      throw tokeniser.StreamException($"ToUnicode defines the same cid '{cidUniCode.cid}' twice.");
    }
    unicodes[cidUniCode.cid] = cidUniCode.unicodeChar;
  }
  foreach ((ushort cidStart, ushort cidEnd, char unicodeChar) cidRangeUniCode in cidRangeUniCodes) {
    var unicodeIndex = cidRangeUniCode.unicodeChar;
    for (ushort cidIndex = cidRangeUniCode.cidStart; cidIndex <= cidRangeUniCode.cidEnd; cidIndex++) {
      if (unicodes[cidIndex] != cidIndex) {
        throw tokeniser.StreamException($"ToUnicode defines the same cid '{cidIndex}' twice.");
      }
      unicodes[cidIndex] = unicodeIndex++;
    }
  }
  //var s = tokeniser.StreamBytesToString();
}