コード例 #1
0
        /// <summary>
        /// Parses a ToUnicode CMap stream and applies its cid-to-unicode mappings to
        /// <paramref name="encoding"/>, collecting the same mappings in <paramref name="cMap"/>.
        /// </summary>
        /// <param name="toUnicodeStream">Token whose stream bytes contain the ToUnicode CMap.</param>
        /// <param name="encoding">Character table indexed by cid. Every mapped cid is overwritten in place;
        /// the array is not reallocated here, so it must already be long enough for the highest mapped cid.</param>
        /// <param name="toUnicodeHeader">Receives the marked stream text, where the mark is set right after
        /// 'begincmap' and the text is fetched once 'endcodespacerange' has been found.</param>
        /// <param name="cMap">Receives every mapping read from the 'bfchar' and 'bfrange' sections.
        /// A cid that is mapped twice makes <c>SortedDictionary.Add</c> throw.</param>
        /// <remarks>
        /// The exception created by the tokeniser's <c>StreamException</c> is thrown when 'begincmap',
        /// 'endcodespacerange', 'endbfchar', 'endbfrange' or 'endcmap' is missing, or when a range ends
        /// before it starts. Opcodes other than 'beginbfchar', 'beginbfrange' and 'endcmap' are skipped.
        /// </remarks>
        public static void ApplyToUnitCode(
            DictionaryToken toUnicodeStream,
            ref char[] encoding,
            out string toUnicodeHeader,
            out SortedDictionary <int, char> cMap)
        {
            var tokeniser = toUnicodeStream.GetStreamBytes();

            if (tokeniser.GetStreamOpCode("begincmap") is null)
            {
                throw tokeniser.StreamException("ToUnicode stream is missing 'begincmap'.");
            }

            tokeniser.SetStreamMark();
            if (tokeniser.GetStreamOpCode("endcodespacerange") is null)
            {
                throw tokeniser.StreamException("ToUnicode stream is missing 'endcodespacerange'.");
            }

            toUnicodeHeader = tokeniser.GetStreamMarkedText();
            cMap            = new SortedDictionary <int, char>();
            while (true)
            {
                var opCode = tokeniser.GetStreamOpCode();
                if (opCode is null)
                {
                    throw tokeniser.StreamException("ToUnicode stream incomplete.");
                }

                var opCodeSpan = opCode.Value.Span;
                if (isEqual("beginbfchar", opCodeSpan))
                {
                    // "<count> beginbfchar" is followed by <count> lines of "<cid> <unicode>".
                    tokeniser.StartStreamArgumentReading();
                    var linesCount = tokeniser.GetStreamInt();
                    tokeniser.EndStreamArgumentReading();
                    for (int lineIndex = 0; lineIndex < linesCount; lineIndex++)
                    {
                        var cid         = tokeniser.GetStreamCid();
                        var unicodeChar = (char)tokeniser.GetStreamUnicode();
                        encoding[cid] = unicodeChar;
                        cMap.Add(cid, unicodeChar);
                    }

                    // A truncated stream returns null here; report that as a stream error
                    // instead of failing on the null opcode.
                    opCode = tokeniser.GetStreamOpCode();
                    if (opCode is null || !isEqual("endbfchar", opCode.Value.Span))
                    {
                        throw tokeniser.StreamException("ToUnicode stream is missing 'endbfchar' after 'beginbfchar'.");
                    }
                }
                else if (isEqual("beginbfrange", opCodeSpan))
                {
                    // "<count> beginbfrange" is followed by <count> lines of "<cidStart> <cidEnd> <unicode>".
                    tokeniser.StartStreamArgumentReading();
                    var linesCount = tokeniser.GetStreamInt();
                    tokeniser.EndStreamArgumentReading();
                    for (int lineIndex = 0; lineIndex < linesCount; lineIndex++)
                    {
                        var cidStart = tokeniser.GetStreamCid();
                        var cidEnd   = tokeniser.GetStreamCid();
                        if (cidEnd < cidStart)
                        {
                            throw tokeniser.StreamException($"ToUnicode: beginbfrange cid1 '{cidStart}' should be smaller than cid2 '{cidEnd}'.");
                        }

                        var unicodeChar = tokeniser.GetStreamUnicode();
                        // The counter is an int on purpose: a ushort counter wraps to 0 after 0xFFFF,
                        // so a range ending at 0xFFFF would never satisfy the exit condition.
                        for (int cidIndex = cidStart; cidIndex <= cidEnd; cidIndex++)
                        {
                            // Consecutive cids map to consecutive unicode values.
                            cMap.Add(cidIndex, (char)unicodeChar);
                            encoding[cidIndex] = (char)unicodeChar++;
                        }
                    }

                    opCode = tokeniser.GetStreamOpCode();
                    if (opCode is null || !isEqual("endbfrange", opCode.Value.Span))
                    {
                        throw tokeniser.StreamException("ToUnicode stream is missing 'endbfrange' after 'beginbfrange'.");
                    }
                }
                else if (isEqual("endcmap", opCodeSpan))
                {
                    break;
                }
            }
        }
コード例 #2
0
        /// <summary>
        /// Extracts the text shown by a content stream and stores it in <c>Text</c>.
        /// </summary>
        /// <param name="contentsDictionaryToken">Token whose stream bytes hold the content stream; its
        /// <c>PdfObject</c> is set to this instance.</param>
        /// <param name="fonts">Fonts by name, used to look up the font selected by a 'Tf' operator.</param>
        /// <remarks>
        /// The stream is scanned for 'BT' ... 'ET' sections; 'BI' outside such a section is passed to
        /// <c>SkipInlineImage</c>. Inside a section:
        /// strings shown by ', ", 'Tj' and 'TJ' are appended to <c>Text</c>, each followed by the
        /// tokeniser's <c>ContentDelimiter</c>; 'Td', 'TD' and 'T*' append a line break; 'Tm' appends a
        /// line break when its last argument differs from the one of the previous 'Tm'; every other
        /// operator is ignored. A missing font or a missing 'ET' is reported in <c>Error</c>. Exceptions
        /// raised while scanning are not propagated but recorded in <c>Exception</c>.
        /// </remarks>
        public PdfContent(DictionaryToken contentsDictionaryToken, IReadOnlyDictionary <string, PdfFont> fonts)
        {
            contentsDictionaryToken.PdfObject = this;
            var tokeniser = contentsDictionaryToken.GetStreamBytes();

            Fonts = fonts;
            decimal? lastLineOffset = null;
            string?  newText        = null;

            try {
                // Sample content stream:
                //   q 0.12 0 0 0.12 0 0 cm
                //   /R7 gs
                //   0 0 0 rg
                //   q
                //   8.33333 0 0 8.33333 0 0 cm BT
                //
                //   BT
                //     /F1 24 Tf
                //     100 100 Td
                //     ( Hello World ) Tj
                //   ET

                while (true)
                {
                    // Skip everything up to the next 'BT'; a stream end here is the normal exit.
                    ReadOnlySpan <byte> opCodeSpan;
                    do
                    {
                        var opCode = tokeniser.GetStreamOpCode();
                        if (opCode is null)
                        {
                            return;
                        }

                        opCodeSpan = opCode.Value.Span;
                        if (opCodeSpan.Length == 2 && opCodeSpan[0] == 'B' && opCodeSpan[1] == 'I')
                        {
                            // 'continue' re-evaluates the loop condition, which is true for "BI".
                            tokeniser.SkipInlineImage();
                            continue;
                        }
                    } while (opCodeSpan.Length != 2 || opCodeSpan[0] != 'B' || opCodeSpan[1] != 'T');

                    // Process text operators until 'ET'.
                    PdfFont? font = null;
                    while (true)
                    {
                        // A well-formed stream has 'ET' before its end, so null here is an error.
                        var opCode = tokeniser.GetStreamOpCode();
                        if (opCode is null)
                        {
                            Error += "Error Content stream: stream end found but 'ET' still missing." + Environment.NewLine;
                            return;
                        }

                        opCodeSpan = opCode.Value.Span;
                        if (opCodeSpan.Length == 1)
                        {
                            if (opCodeSpan[0] == '\'')
                            {
                                tokeniser.StartStreamArgumentReading();
                                newText = tokeniser.GetStreamString(font);
                                tokeniser.EndStreamArgumentReading();
                            }
                            else if (opCodeSpan[0] == '"')
                            {
                                // The string is the third argument; the first two are skipped.
                                tokeniser.StartStreamArgumentReading();
                                tokeniser.SkipStreamArgument();
                                tokeniser.SkipStreamArgument();
                                newText = tokeniser.GetStreamString(font);
                                tokeniser.EndStreamArgumentReading();
                            }
                            else
                            {
                                continue;
                            }
                        }
                        else if (opCodeSpan.Length == 2)
                        {
                            if (opCodeSpan[0] == 'T')
                            {
                                var opCodeChar1 = opCodeSpan[1];
                                if (opCodeChar1 == 'j')
                                {
                                    tokeniser.StartStreamArgumentReading();
                                    newText = tokeniser.GetStreamString(font);
                                    tokeniser.EndStreamArgumentReading();
                                }
                                else if (opCodeChar1 == 'J')
                                {
                                    tokeniser.StartStreamArgumentReading();
                                    newText = tokeniser.GetStreamArrayString(font);
                                    tokeniser.EndStreamArgumentReading();
                                }
                                else if (opCodeChar1 == 'f')
                                {
                                    // Font selection; an unknown name is reported in Error.
                                    tokeniser.StartStreamArgumentReading();
                                    PdfFontName = tokeniser.GetStreamName();
                                    if (!fonts.TryGetValue(PdfFontName, out font))
                                    {
                                        Error += $"Could not find font '{PdfFontName}'." + Environment.NewLine;
                                    }
                                    tokeniser.EndStreamArgumentReading();
                                    continue;
                                }
                                else if (opCodeChar1 == 'd' || opCodeChar1 == 'D' || opCodeChar1 == '*')
                                {
                                    Text += Environment.NewLine;
                                    continue;
                                }
                                else if (opCodeChar1 == 'm')
                                {
                                    // Six arguments are read; only the last one is used, as the line offset.
                                    // NOTE(review): the first four are read with GetStreamInt, but PDF allows
                                    // non-integer values there - confirm GetStreamInt accepts them.
                                    tokeniser.StartStreamArgumentReading();
                                    tokeniser.GetStreamInt();
                                    tokeniser.GetStreamInt();
                                    tokeniser.GetStreamInt();
                                    tokeniser.GetStreamInt();
                                    tokeniser.GetStreamNumber();
                                    var lineOffset = tokeniser.GetStreamNumber();
                                    if (lastLineOffset != lineOffset)
                                    {
                                        lastLineOffset = lineOffset;
                                        // No line break before the very first text.
                                        if (Text != null)
                                        {
                                            Text += Environment.NewLine;
                                        }
                                    }
                                    tokeniser.EndStreamArgumentReading();
                                    continue;
                                }
                                else
                                {
                                    // Skip other 'T' operators such as 'TL'.
                                    continue;
                                }
                            }
                            else if (opCodeSpan[0] == 'E' && opCodeSpan[1] == 'T')
                            {
                                break;
                            }
                            else
                            {
                                // Other two-character opcodes.
                                continue;
                            }
                        }
                        else
                        {
                            // Opcode is longer than two characters.
                            continue;
                        }

                        // Only the text-showing operators (', ", Tj, TJ) reach this point.
                        Text += newText + tokeniser.ContentDelimiter;
                    }
                }
            } catch (Exception ex) {
                // Append to whatever is already recorded, separated by an empty line.
                if (Exception is null)
                {
                    Exception = "";
                }
                else
                {
                    Exception += Environment.NewLine + Environment.NewLine;
                }
                if (ex is PdfStreamException || ex is PdfException)
                {
                    Exception += ex.ToDetailString();
                }
                else
                {
                    // For other exception types the stream content at the current index is added as well.
                    Exception += ex.ToDetailString() + Environment.NewLine + tokeniser.ShowStreamContentAtIndex();
                }
            }
        }
コード例 #3
0
        // /CIDInit /ProcSet findresource begin
        // 11 dict begin
        // begincmap
        // /CIDSystemInfo
        // << /Registry(Adobe)
        // /Ordering(UCS)
        // /Supplement 0
        // >> def
        // /CMapName /Adobe-Identity-UCS def
        // /CMapType 2 def
        // 1 begincodespacerange
        // <0000> <FFFF>
        // endcodespacerange
        // 2 beginbfchar
        // <0003> <0020>
        // <00B1> <2013>
        // endbfchar
        // 1 beginbfrange
        // <00B5> <00B6> <2018>
        // endbfrange
        // endcmap
        // CMapName currentdict /CMap defineresource pop
        // end
        // end

        /// <summary>
        /// Parses a ToUnicode CMap stream into a cid-indexed lookup table.
        /// </summary>
        /// <param name="toUnicodeStream">Token whose stream bytes contain the ToUnicode CMap.</param>
        /// <remarks>
        /// <c>Header</c> receives the marked stream text, where the mark is set right after 'begincmap'
        /// and the text is fetched once 'endcodespacerange' has been found. <c>unicodes</c> gets one
        /// entry per cid from 0 to the highest mapped cid; cids without a mapping keep their own value.
        /// When the stream defines no mapping at all, <c>unicodes</c> is empty.
        /// The exception created by the tokeniser's <c>StreamException</c> is thrown when a required
        /// opcode is missing, when a range ends before it starts, or when a cid is detected as mapped twice.
        /// </remarks>
        public PdfToUnitCode(DictionaryToken toUnicodeStream)
        {
            var tokeniser = toUnicodeStream.GetStreamBytes();

            if (tokeniser.GetStreamOpCode("begincmap") is null)
            {
                throw tokeniser.StreamException("ToUnicode stream is missing 'begincmap'.");
            }

            tokeniser.SetStreamMark();
            if (tokeniser.GetStreamOpCode("endcodespacerange") is null)
            {
                throw tokeniser.StreamException("ToUnicode stream is missing 'endcodespacerange'.");
            }

            Header = tokeniser.GetStreamMarkedText();
            var cidUniCodes      = new List <(ushort Cid, char Unicode)>();
            var cidRangeUniCodes = new List <(ushort CidStart, ushort CidEnd, char Unicode)>();
            // -1 means "no mapping seen yet"; it makes the table allocated below empty
            // instead of requesting a negative array length.
            var maxCid = -1;

            // First pass: collect all mappings and find the highest cid, which fixes the table size.
            while (true)
            {
                var opCode = tokeniser.GetStreamOpCode();
                if (opCode is null)
                {
                    throw tokeniser.StreamException("ToUnicode stream incomplete.");
                }

                var opCodeSpan = opCode.Value.Span;
                if (isEqual("beginbfchar", opCodeSpan))
                {
                    // "<count> beginbfchar" is followed by <count> lines of "<cid> <unicode>".
                    tokeniser.StartStreamArgumentReading();
                    var linesCount = tokeniser.GetStreamInt();
                    tokeniser.EndStreamArgumentReading();
                    for (int lineIndex = 0; lineIndex < linesCount; lineIndex++)
                    {
                        var cid = tokeniser.GetStreamCid();
                        maxCid = Math.Max(maxCid, cid);
                        var unicodeChar = tokeniser.GetStreamUnicode();
                        cidUniCodes.Add((cid, (char)unicodeChar));
                    }

                    // A truncated stream returns null here; report that as a stream error
                    // instead of failing on the null opcode.
                    opCode = tokeniser.GetStreamOpCode();
                    if (opCode is null || !isEqual("endbfchar", opCode.Value.Span))
                    {
                        throw tokeniser.StreamException("ToUnicode stream is missing 'endbfchar' after 'beginbfchar'.");
                    }
                }
                else if (isEqual("beginbfrange", opCodeSpan))
                {
                    // "<count> beginbfrange" is followed by <count> lines of "<cidStart> <cidEnd> <unicode>".
                    tokeniser.StartStreamArgumentReading();
                    var linesCount = tokeniser.GetStreamInt();
                    tokeniser.EndStreamArgumentReading();
                    for (int lineIndex = 0; lineIndex < linesCount; lineIndex++)
                    {
                        var cidStart = tokeniser.GetStreamCid();
                        var cidEnd   = tokeniser.GetStreamCid();
                        if (cidEnd < cidStart)
                        {
                            throw tokeniser.StreamException($"ToUnicode: beginbfrange cid1 '{cidStart}' should be smaller than cid2 '{cidEnd}'.");
                        }

                        // cidEnd >= cidStart holds here, so cidEnd is the highest cid of this range.
                        maxCid = Math.Max(maxCid, cidEnd);
                        var unicodeChar = tokeniser.GetStreamUnicode();
                        cidRangeUniCodes.Add((cidStart, cidEnd, (char)unicodeChar));
                    }

                    opCode = tokeniser.GetStreamOpCode();
                    if (opCode is null || !isEqual("endbfrange", opCode.Value.Span))
                    {
                        throw tokeniser.StreamException("ToUnicode stream is missing 'endbfrange' after 'beginbfrange'.");
                    }
                }
                else if (isEqual("endcmap", opCodeSpan))
                {
                    break;
                }
            }

            // Second pass: start with every cid mapped to itself, then overwrite the mapped ones.
            unicodes = new char[maxCid + 1];
            for (int unicodesIndex = 0; unicodesIndex < unicodes.Length; unicodesIndex++)
            {
                unicodes[unicodesIndex] = (char)unicodesIndex;
            }

            // An entry that no longer equals its own index was already mapped. A cid that was mapped
            // to its own value cannot be told apart from an unmapped one by this check.
            foreach (var (cid, unicode) in cidUniCodes)
            {
                if (unicodes[cid] != cid)
                {
                    throw tokeniser.StreamException($"ToUnicode defines the same cid '{cid}' twice.");
                }

                unicodes[cid] = unicode;
            }
            foreach (var (cidStart, cidEnd, firstUnicode) in cidRangeUniCodes)
            {
                var unicode = firstUnicode;
                // The counter is an int on purpose: a ushort counter wraps to 0 after 0xFFFF,
                // so a range ending at 0xFFFF would never satisfy the exit condition.
                for (int cidIndex = cidStart; cidIndex <= cidEnd; cidIndex++)
                {
                    if (unicodes[cidIndex] != cidIndex)
                    {
                        throw tokeniser.StreamException($"ToUnicode defines the same cid '{cidIndex}' twice.");
                    }

                    // Consecutive cids map to consecutive unicode values.
                    unicodes[cidIndex] = unicode++;
                }
            }
        }