コード例 #1
0
        /**
         * <summary>Reads the entries of a CID-char section and records each of them in the
         * given CMap.</summary>
         * <param name="codes">CMap receiving the parsed mappings.</param>
         * <param name="operands">Operands collected before the section operator; the first one
         * is read as the number of entries that follow.</param>
         */
        private void ParseCIDChar(CMap codes, IList<object> operands)
        {
            for (int remaining = (int)operands[0]; remaining > 0; remaining--)
            {
                // First token of the entry: the input code, reduced to its integer value.
                MoveNext();
                int cid = ConvertUtils.ByteArrayToInt(ParseInputCode());

                // Second token of the entry: the value mapped to that code.
                MoveNext();
                codes.AddCIDMapping(ParseUnicode(), cid);
            }
        }
コード例 #2
0
 /**
  * <summary>Reads the entries of a base-font-char section and records each of them in the
  * given CMap.</summary>
  * <remarks>Each entry consists of the input code of the template font followed by the code
  * or name of the character.</remarks>
  * <param name="codes">CMap receiving the parsed mappings.</param>
  * <param name="operands">Operands collected before the section operator; the first one
  * is read as the number of entries that follow.</param>
  */
 private void ParseBFChar(CMap codes, IList<object> operands)
 {
     for (int remaining = (int)operands[0]; remaining > 0; remaining--)
     {
         MoveNext();
         var sourceCode = ParseInputCode();

         MoveNext();
         try
         {
             // FIXME: Unicode character sequences (such as ligatures) have not been supported yet [BUG:72].
             var unicode = ParseUnicode();
             codes.AddCharMapping(sourceCode, unicode);
         }
         catch (OverflowException)
         {
             // The entry is skipped; only a diagnostic message is left behind.
             Debug.WriteLine($"WARN: Unable to process Unicode sequence from {codes.CMapName} CMap: {Token}");
         }
     }
 }
コード例 #3
0
 /**
  * <summary>Reads the entries of a CID-range section and records each of them in the
  * given CMap.</summary>
  * <remarks>Each entry consists of a beginning input code, an ending input code and the
  * value mapped to the beginning code.</remarks>
  * <param name="codes">CMap receiving the parsed mappings.</param>
  * <param name="operands">Operands collected before the section operator; the first one
  * is read as the number of entries that follow.</param>
  */
 private void ParseCIDRange(CMap codes, IList<object> operands)
 {
     for (int remaining = (int)operands[0]; remaining > 0; remaining--)
     {
         // Range start.
         MoveNext();
         byte[] firstCode = ParseInputCode();
         int firstValue = ConvertUtils.ByteArrayToInt(firstCode);

         // Range end.
         MoveNext();
         byte[] lastCode = ParseInputCode();
         int lastValue = ConvertUtils.ByteArrayToInt(lastCode);

         // Value mapped to the range start.
         MoveNext();
         int target = ParseUnicode();

         if (firstCode.Length > 2 || lastCode.Length > 2)
         {
             // Codes longer than two bytes are expanded one mapping at a time, stepping the
             // input code buffer in place alongside the mapped value.
             int lastTarget = target + lastValue - firstValue;
             for (; target <= lastTarget; target++)
             {
                 codes.AddCIDMapping(target, ConvertUtils.ByteArrayToInt(firstCode));
                 OperationUtils.Increment(firstCode);
             }
             continue;
         }

         if (firstValue != lastValue)
         {
             codes.AddCIDRange((char)firstValue, (char)lastValue, target);
         }
         else
         {
             // Some CMaps use a CID range to map a single value.
             codes.AddCIDMapping(target, firstValue);
         }
     }
 }
コード例 #4
0
        /**
         * <summary>Parses the character-code-to-unicode mapping [PDF:1.6:5.9.1].</summary>
         * <remarks>The stream is read from its beginning. Non-keyword tokens are accumulated as
         * operands; when a keyword is met, it is dispatched to the matching section parser (or
         * handled inline for the use-CMap and definition operators) and the operand list is
         * cleared. Arrays and dictionaries met at the top level are skipped, as are comments.
         * </remarks>
         * <returns>The CMap populated with whatever the stream defined.</returns>
         */
        public CMap Parse()
        {
            Stream.Seek(0);
            var codes = new CMap();

            {
                IList <object> operands = new List <object>();

                while (MoveNext())
                {
                    switch (TokenType)
                    {
                    case TokenTypeEnum.Keyword:
                    {
                        string @operator = (string)Token;
                        if (@operator.Equals(BeginCodeSpaceRangeOperator, StringComparison.Ordinal))
                        {
                            ParseCodeSpaceRange(codes, operands);
                        }
                        else if (@operator.Equals(BeginBaseFontCharOperator, StringComparison.Ordinal))
                        {
                            ParseBFChar(codes, operands);
                        }
                        else if (@operator.Equals(BeginCIDCharOperator, StringComparison.Ordinal))
                        {
                            ParseCIDChar(codes, operands);
                        }
                        else if (@operator.Equals(BeginBaseFontRangeOperator, StringComparison.Ordinal))
                        {
                            ParseBFRange(codes, operands);
                        }
                        else if (@operator.Equals(BeginCIDRangeOperator, StringComparison.Ordinal))
                        {
                            ParseCIDRange(codes, operands);
                        }
                        else if (@operator.Equals(UseCMapOperator, StringComparison.Ordinal))
                        {
                            var useCMap = CMap.Get((string)operands[0]);
                            codes.UseCmap(useCMap);
                        }
                        // A definition is only meaningful as a name/value pair: requiring both
                        // operands avoids indexing past the end of the list when a recognized
                        // name is not followed by a value.
                        else if (@operator.Equals(DefOperator, StringComparison.Ordinal) && operands.Count >= 2)
                        {
                            if (CMapName.Equals((string)operands[0], StringComparison.Ordinal))
                            {
                                codes.CMapName = (string)operands[1];
                            }
                            else if (CMapType.Equals((string)operands[0], StringComparison.Ordinal))
                            {
                                codes.CMapType = (int)operands[1];
                            }
                            else if (Registry.Equals((string)operands[0], StringComparison.Ordinal))
                            {
                                codes.Registry = (string)operands[1];
                            }
                            else if (Ordering.Equals((string)operands[0], StringComparison.Ordinal))
                            {
                                codes.Ordering = (string)operands[1];
                            }
                            else if (WMode.Equals((string)operands[0], StringComparison.Ordinal))
                            {
                                codes.WMode = (int)operands[1];
                            }
                        }
                        // Operands belong to the operator just handled (or ignored).
                        operands.Clear();
                        break;
                    }

                    case TokenTypeEnum.ArrayBegin:
                    case TokenTypeEnum.DictionaryBegin:
                    {
                        // Skip everything up to the first array/dictionary end token.
                        while (MoveNext())
                        {
                            if (TokenType == TokenTypeEnum.ArrayEnd ||
                                TokenType == TokenTypeEnum.DictionaryEnd)
                            {
                                break;
                            }
                        }
                        break;
                    }

                    case TokenTypeEnum.Comment:
                        // Skip.
                        break;

                    default:
                    {
                        // Any other token is an operand of the next operator.
                        operands.Add(Token);
                        break;
                    }
                    }
                }
            }
            return(codes);
        }
コード例 #5
0
        /**
         * <summary>Reads the entries of a base-font-range section and records each of them in
         * the given CMap.</summary>
         * <remarks>The first and second elements of each entry are the beginning and ending
         * valid input codes for the template font; the third element is either the beginning
         * character code for the range or an array of character codes.</remarks>
         * <param name="codes">CMap receiving the parsed mappings.</param>
         * <param name="operands">Operands collected before the section operator; the first one
         * is read as the number of entries that follow.</param>
         */
        private void ParseBFRange(CMap codes, IList<object> operands)
        {
            for (int remaining = (int)operands[0]; remaining > 0; remaining--)
            {
                // Range start.
                MoveNext();
                byte[] firstCode = ParseInputCode();
                int firstValue = ConvertUtils.ByteArrayToInt(firstCode);

                // Range end.
                MoveNext();
                int lastValue = ConvertUtils.ByteArrayToInt(ParseInputCode());

                // Destination: an array of codes or a single starting code.
                MoveNext();
                if (TokenType == TokenTypeEnum.ArrayBegin)
                {
                    // One destination per array element; the input code buffer is stepped in place.
                    while (MoveNext() && TokenType != TokenTypeEnum.ArrayEnd)
                    {
                        try
                        {
                            // FIXME: Unicode character sequences (such as ligatures) have not been supported yet [BUG:72].
                            codes.AddCharMapping(firstCode, ParseUnicode());
                        }
                        catch (OverflowException)
                        {
                            Debug.WriteLine($"WARN: Unable to process Unicode sequence from {codes.CMapName} CMap: {Token}");
                        }
                        OperationUtils.Increment(firstCode);
                    }
                    continue;
                }

                byte[] targetBytes = ParseInputCode();
                if (targetBytes.Length == 0)
                {
                    continue;
                }

                // Some PDFs use the malformed bfrange <0000> <FFFF> <0000>: it is handled as an
                // identity mapping over the whole range instead of being cut after 255 entries.
                // TODO find a more efficient method to represent all values for an identity mapping
                bool wholeRangeIdentity = targetBytes.Length == 2 &&
                                          firstValue == 0 &&
                                          lastValue == 0xffff &&
                                          targetBytes[0] == 0 &&
                                          targetBytes[1] == 0;
                if (wholeRangeIdentity)
                {
                    for (int step = 0; step < 256; step++)
                    {
                        firstCode[1] = (byte)step;
                        targetBytes[1] = (byte)step;
                        AddMappingFrombfrange(codes, firstCode, 0xff, targetBytes);
                    }
                }
                else
                {
                    // PDFBOX-4661: avoid overflow of the last byte, all following values are undefined
                    int lastByte = targetBytes[targetBytes.Length - 1] & 0xFF;
                    int count = Math.Min(lastValue - firstValue, 255 - lastByte) + 1;
                    AddMappingFrombfrange(codes, firstCode, count, targetBytes);
                }
            }
        }
コード例 #6
0
ファイル: CMapParser.cs プロジェクト: iWeaverMan/pdf-clown
        /**
         * <summary>Parses the character-code-to-unicode mapping [PDF:1.6:5.9.1].</summary>
         * <remarks>The stream is read from its beginning. Non-keyword tokens are accumulated as
         * operands; when a keyword is met, the char/range sections are parsed inline, the
         * use-CMap and definition operators are handled, and the operand list is cleared.
         * Arrays and dictionaries met at the top level are skipped, as are comments.</remarks>
         * <returns>The mapping from input codes to character codes.</returns>
         */
        public IDictionary <ByteArray, int> Parse()
        {
            Stream.Seek(0);
            IDictionary <ByteArray, int> codes = new Dictionary <ByteArray, int>();

            {
                IList <object> operands = new List <object>();
                // Used only to make the diagnostic messages more helpful.
                string         cmapName = null;
                while (MoveNext())
                {
                    switch (TokenType)
                    {
                    case TokenTypeEnum.Keyword:
                    {
                        string @operator = (string)Token;
                        if (@operator.Equals(BeginBaseFontCharOperator, StringComparison.Ordinal) ||
                            @operator.Equals(BeginCIDCharOperator, StringComparison.Ordinal))
                        {
                            /*
                             * NOTE: The first element on each line is the input code of the template font;
                             * the second element is the code or name of the character.
                             */
                            for (int itemIndex = 0, itemCount = (int)operands[0]; itemIndex < itemCount; itemIndex++)
                            {
                                MoveNext();
                                ByteArray inputCode = new ByteArray(ParseInputCode());
                                MoveNext();
                                // FIXME: Unicode character sequences (such as ligatures) have not been supported yet [BUG:72].
                                try
                                {
                                    codes[inputCode] = ParseUnicode();
                                }
                                catch (OverflowException)
                                { Debug.WriteLine(String.Format("WARN: Unable to process Unicode sequence from {0} CMap: {1}", cmapName, Token)); }
                            }
                        }
                        else if (@operator.Equals(BeginBaseFontRangeOperator, StringComparison.Ordinal) ||
                                 @operator.Equals(BeginCIDRangeOperator, StringComparison.Ordinal))
                        {
                            /*
                             * NOTE: The first and second elements in each line are the beginning and
                             * ending valid input codes for the template font; the third element is
                             * the beginning character code for the range.
                             */
                            for (int itemIndex = 0, itemCount = (int)operands[0]; itemIndex < itemCount; itemIndex++)
                            {
                                // 1. Beginning input code.
                                MoveNext();
                                byte[] beginInputCode = ParseInputCode();
                                // 2. Ending input code.
                                MoveNext();
                                byte[] endInputCode = ParseInputCode();
                                // 3. Character codes.
                                MoveNext();
                                switch (TokenType)
                                {
                                case TokenTypeEnum.ArrayBegin:
                                {
                                    // One character code per array element; the input code
                                    // buffer is stepped in place after each element.
                                    byte[] inputCode = beginInputCode;
                                    while (MoveNext() &&
                                           TokenType != TokenTypeEnum.ArrayEnd)
                                    {
                                        // FIXME: Unicode character sequences (such as ligatures) have not been supported yet [BUG:72].
                                        try
                                        {
                                            codes[new ByteArray(inputCode)] = ParseUnicode();
                                        }
                                        catch (OverflowException)
                                        { Debug.WriteLine(String.Format("WARN: Unable to process Unicode sequence from {0} CMap: {1}", cmapName, Token)); }
                                        OperationUtils.Increment(inputCode);
                                    }
                                    break;
                                }

                                default:
                                {
                                    byte[] inputCode   = beginInputCode;
                                    int    charCode    = ParseUnicode();
                                    int    endCharCode = charCode + (ConvertUtils.ByteArrayToInt(endInputCode) - ConvertUtils.ByteArrayToInt(beginInputCode));
                                    if (endCharCode < charCode)
                                    {
                                        // An inverted range (ending code below beginning code) can never
                                        // satisfy the equality exit test below: skip it instead of
                                        // generating mappings without bound.
                                        Debug.WriteLine(String.Format("WARN: Skipping inverted code range from {0} CMap", cmapName));
                                        break;
                                    }

                                    while (true)
                                    {
                                        codes[new ByteArray(inputCode)] = charCode;
                                        if (charCode == endCharCode)
                                        {
                                            break;
                                        }

                                        OperationUtils.Increment(inputCode);
                                        charCode++;
                                    }
                                    break;
                                }
                                }
                            }
                        }
                        else if (@operator.Equals(UseCMapOperator, StringComparison.Ordinal))
                        {
                            // NOTE(review): this replaces the dictionary built so far with the one
                            // returned by CMap.Get; confirm that the returned instance may be
                            // modified by the sections parsed afterwards.
                            codes = CMap.Get((string)operands[0]);
                        }
                        // A definition is only meaningful as a name/value pair: requiring both
                        // operands avoids indexing past the end of the list when a recognized
                        // name is not followed by a value.
                        else if (@operator.Equals(DefOperator, StringComparison.Ordinal) && operands.Count >= 2)
                        {
                            if (CMapName.Equals((string)operands[0], StringComparison.Ordinal))
                            {
                                cmapName = (string)operands[1];
                            }
                        }
                        // Operands belong to the operator just handled (or ignored).
                        operands.Clear();
                        break;
                    }

                    case TokenTypeEnum.ArrayBegin:
                    case TokenTypeEnum.DictionaryBegin:
                    {
                        // Skip everything up to the first array/dictionary end token.
                        while (MoveNext())
                        {
                            if (TokenType == TokenTypeEnum.ArrayEnd ||
                                TokenType == TokenTypeEnum.DictionaryEnd)
                            {
                                break;
                            }
                        }
                        break;
                    }

                    case TokenTypeEnum.Comment:
                        // Skip.
                        break;

                    default:
                    {
                        // Any other token is an operand of the next operator.
                        operands.Add(Token);
                        break;
                    }
                    }
                }
            }
            return(codes);
        }
コード例 #7
0
        /**
         * <summary>Builds the code-to-Unicode map (when not already available) and the glyph
         * index map from the CMap referenced by the font's Encoding entry.</summary>
         */
        protected void LoadEncoding()
        {
            PdfDataObject encoding = BaseDataObject.Resolve(PdfName.Encoding);

            // CMap [PDF:1.6:5.6.4].
            IDictionary<ByteArray, int> cmap = CMap.Get(encoding);

            // 1. Unicode.
            if (codes == null)
            {
                codes = new BiDictionary<ByteArray, int>();

                bool namedNonIdentity = encoding is PdfName
                                        && !encoding.Equals(PdfName.IdentityH)
                                        && !encoding.Equals(PdfName.IdentityV);
                if (namedNonIdentity)
                {
                    /*
                     * NOTE: According to [PDF:1.6:5.9.1], the fallback method to retrieve
                     * the character-code-to-Unicode mapping implies getting the UCS2 CMap
                     * (Unicode value to CID) corresponding to the font's one (character code to CID);
                     * CIDs are the bridge from character codes to Unicode values.
                     */
                    var cidSystemInfo = (PdfDictionary)CIDFontDictionary.Resolve(PdfName.CIDSystemInfo);
                    string registry = (string)((PdfTextString)cidSystemInfo[PdfName.Registry]).Value;
                    string ordering = (string)((PdfTextString)cidSystemInfo[PdfName.Ordering]).Value;
                    var ucs2CMap = new BiDictionary<ByteArray, int>(CMap.Get(registry + "-" + ordering + "-UCS2"));

                    if (ucs2CMap.Count > 0)
                    {
                        foreach (var entry in cmap)
                        {
                            // The CID is looked up backwards in the UCS2 CMap to get the Unicode bytes.
                            codes[entry.Key] = ConvertUtils.ByteArrayToInt(ucs2CMap.GetKey(entry.Value).Data);
                        }
                    }
                }

                if (codes.Count == 0)
                {
                    /*
                     * NOTE: In case no clue is available to determine the Unicode resolution map,
                     * the font is considered symbolic and an identity map is synthesized instead.
                     */
                    symbolic = true;
                    foreach (var entry in cmap)
                    {
                        codes[entry.Key] = ConvertUtils.ByteArrayToInt(entry.Key.Data);
                    }
                }
            }

            // 2. Glyph indexes.

            /*
             * TODO: gids map for glyph indexes as glyphIndexes is used to map cids!!!
             */
            // Character-code-to-CID mapping [PDF:1.6:5.6.4,5].
            glyphIndexes = new Dictionary<int, int>();
            foreach (var entry in cmap)
            {
                // Only codes that have a Unicode counterpart are registered.
                if (codes.ContainsKey(entry.Key))
                {
                    glyphIndexes[codes[entry.Key]] = entry.Value;
                }
            }
        }