/**
 * <summary>Reads the entries that follow a CID-char operator and registers each one
 * on the given CMap.</summary>
 * <param name="codes">CMap receiving the mappings (via <c>AddCIDMapping</c>).</param>
 * <param name="operands">Operands gathered before the operator; the first one is cast
 * to <c>int</c> and used as the number of entries to read.</param>
 */
private void ParseCIDChar(CMap codes, IList<object> operands)
{
    for (int remaining = (int)operands[0]; remaining > 0; remaining--)
    {
        // First token of the entry: the input code, converted to its integer value.
        MoveNext();
        int cid = ConvertUtils.ByteArrayToInt(ParseInputCode());

        // Second token of the entry: the mapped code.
        MoveNext();
        codes.AddCIDMapping(ParseUnicode(), cid);
    }
}
/**
 * <summary>Reads the entries that follow a base-font-char operator and registers each
 * input-code-to-character mapping on the given CMap.</summary>
 * <remarks>Each entry consists of two tokens: the input code of the template font,
 * then the code or name of the character.</remarks>
 * <param name="codes">CMap receiving the mappings (via <c>AddCharMapping</c>).</param>
 * <param name="operands">Operands gathered before the operator; the first one is cast
 * to <c>int</c> and used as the number of entries to read.</param>
 */
private void ParseBFChar(CMap codes, IList<object> operands)
{
    for (int remaining = (int)operands[0]; remaining > 0; remaining--)
    {
        MoveNext();
        var sourceCode = ParseInputCode();

        MoveNext();
        // FIXME: Unicode character sequences (such as ligatures) have not been supported yet [BUG:72].
        try
        {
            var unicode = ParseUnicode();
            codes.AddCharMapping(sourceCode, unicode);
        }
        catch (OverflowException)
        {
            // Best effort: the entry is dropped and parsing goes on with the next one.
            Debug.WriteLine($"WARN: Unable to process Unicode sequence from {codes.CMapName} CMap: {Token}");
        }
    }
}
/**
 * <summary>Reads the entries that follow a CID-range operator and registers them on
 * the given CMap.</summary>
 * <remarks>Each entry consists of three tokens: first input code, last input code and
 * the code mapped to the first input code.</remarks>
 * <param name="codes">CMap receiving the mappings.</param>
 * <param name="operands">Operands gathered before the operator; the first one is cast
 * to <c>int</c> and used as the number of entries to read.</param>
 */
private void ParseCIDRange(CMap codes, IList<object> operands)
{
    for (int remaining = (int)operands[0]; remaining > 0; remaining--)
    {
        // 1. First input code of the range.
        MoveNext();
        byte[] firstBytes = ParseInputCode();
        int first = ConvertUtils.ByteArrayToInt(firstBytes);

        // 2. Last input code of the range.
        MoveNext();
        byte[] lastBytes = ParseInputCode();
        int last = ConvertUtils.ByteArrayToInt(lastBytes);

        // 3. Code mapped to the first input code.
        MoveNext();
        int target = ParseUnicode();

        bool exceedsTwoBytes = firstBytes.Length > 2 || lastBytes.Length > 2;
        if (!exceedsTwoBytes)
        {
            if (first != last)
            {
                codes.AddCIDRange((char)first, (char)last, target);
            }
            else
            {
                // Some CMaps use CID ranges to map single values.
                codes.AddCIDMapping(target, first);
            }
            continue;
        }

        // TODO Is this even possible?
        // Longer codes are expanded one mapping at a time, stepping the input bytes in place.
        int lastTarget = target + last - first;
        while (target <= lastTarget)
        {
            int cid = ConvertUtils.ByteArrayToInt(firstBytes);
            codes.AddCIDMapping(target++, cid);
            OperationUtils.Increment(firstBytes);
        }
    }
}
/**
 * <summary>Parses the character-code-to-unicode mapping [PDF:1.6:5.9.1].</summary>
 * <remarks>
 * The stream is rewound and scanned token by token. Non-keyword tokens are collected
 * as operands; each keyword consumes the collected operands and then clears them.
 * Arrays and dictionaries met at this level are skipped, as are comments.
 * </remarks>
 * <returns>A new CMap filled with the mappings and properties found in the stream.</returns>
 */
public CMap Parse()
{
    Stream.Seek(0);
    var codes = new CMap();
    {
        IList<object> operands = new List<object>();
        while (MoveNext())
        {
            switch (TokenType)
            {
                case TokenTypeEnum.Keyword:
                {
                    string @operator = (string)Token;
                    if (@operator.Equals(BeginCodeSpaceRangeOperator, StringComparison.Ordinal))
                    {
                        ParseCodeSpaceRange(codes, operands);
                    }
                    else if (@operator.Equals(BeginBaseFontCharOperator, StringComparison.Ordinal))
                    {
                        ParseBFChar(codes, operands);
                    }
                    else if (@operator.Equals(BeginCIDCharOperator, StringComparison.Ordinal))
                    {
                        ParseCIDChar(codes, operands);
                    }
                    else if (@operator.Equals(BeginBaseFontRangeOperator, StringComparison.Ordinal))
                    {
                        ParseBFRange(codes, operands);
                    }
                    else if (@operator.Equals(BeginCIDRangeOperator, StringComparison.Ordinal))
                    {
                        ParseCIDRange(codes, operands);
                    }
                    else if (@operator.Equals(UseCMapOperator, StringComparison.Ordinal))
                    {
                        var useCMap = CMap.Get((string)operands[0]);
                        codes.UseCmap(useCMap);
                    }
                    else if (@operator.Equals(DefOperator, StringComparison.Ordinal) && operands.Count >= 2)
                    {
                        // Every property below needs both a key and a value. Requiring two
                        // operands keeps a definition whose value is missing (for instance
                        // because it was a skipped array or dictionary) from reading past
                        // the end of the operand list.
                        string key = (string)operands[0];
                        object value = operands[1];
                        if (CMapName.Equals(key, StringComparison.Ordinal))
                        {
                            codes.CMapName = (string)value;
                        }
                        else if (CMapType.Equals(key, StringComparison.Ordinal))
                        {
                            codes.CMapType = (int)value;
                        }
                        else if (Registry.Equals(key, StringComparison.Ordinal))
                        {
                            codes.Registry = (string)value;
                        }
                        else if (Ordering.Equals(key, StringComparison.Ordinal))
                        {
                            codes.Ordering = (string)value;
                        }
                        else if (WMode.Equals(key, StringComparison.Ordinal))
                        {
                            codes.WMode = (int)value;
                        }
                    }
                    // Operands belong to the keyword just handled, whether it was recognized or not.
                    operands.Clear();
                    break;
                }
                case TokenTypeEnum.ArrayBegin:
                case TokenTypeEnum.DictionaryBegin:
                {
                    // Skip: consume tokens up to the first array or dictionary end.
                    while (MoveNext())
                    {
                        if (TokenType == TokenTypeEnum.ArrayEnd
                            || TokenType == TokenTypeEnum.DictionaryEnd)
                        {
                            break;
                        }
                    }
                    break;
                }
                case TokenTypeEnum.Comment:
                    // Skip.
                    break;
                default:
                {
                    operands.Add(Token);
                    break;
                }
            }
        }
    }
    return codes;
}
/**
 * <summary>Reads the entries that follow a base-font-range operator and registers the
 * resulting mappings on the given CMap.</summary>
 * <remarks>
 * The first and second elements of each entry are the beginning and ending valid input
 * codes for the template font. The third element is either an array holding one
 * character code per input code, or the beginning character code for the range.
 * </remarks>
 * <param name="codes">CMap receiving the mappings.</param>
 * <param name="operands">Operands gathered before the operator; the first one is cast
 * to <c>int</c> and used as the number of entries to read.</param>
 */
private void ParseBFRange(CMap codes, IList<object> operands)
{
    for (int remaining = (int)operands[0]; remaining > 0; remaining--)
    {
        // 1. Beginning input code.
        MoveNext();
        byte[] firstBytes = ParseInputCode();
        int first = ConvertUtils.ByteArrayToInt(firstBytes);

        // 2. Ending input code.
        MoveNext();
        byte[] lastBytes = ParseInputCode();
        int last = ConvertUtils.ByteArrayToInt(lastBytes);

        // 3. Character codes.
        MoveNext();
        if (TokenType == TokenTypeEnum.ArrayBegin)
        {
            // One array element per input code; the input code bytes are stepped in place.
            while (MoveNext() && TokenType != TokenTypeEnum.ArrayEnd)
            {
                // FIXME: Unicode character sequences (such as ligatures) have not been supported yet [BUG:72].
                try
                {
                    codes.AddCharMapping(firstBytes, ParseUnicode());
                }
                catch (OverflowException)
                {
                    Debug.WriteLine($"WARN: Unable to process Unicode sequence from {codes.CMapName} CMap: {Token}");
                }
                OperationUtils.Increment(firstBytes);
            }
            continue;
        }

        var targetBytes = ParseInputCode();
        if (targetBytes.Length == 0)
        {
            continue;
        }

        // Some PDFs use the malformed bfrange <0000> <FFFF> <0000>. Support it by adding an
        // identity mapping for the whole range instead of cutting it after 255 entries.
        // TODO find a more efficient method to represent all values for an identity mapping
        bool wholeRangeIdentity = first == 0
            && last == 0xffff
            && targetBytes.Length == 2
            && targetBytes[0] == 0
            && targetBytes[1] == 0;
        if (wholeRangeIdentity)
        {
            for (int step = 0; step <= 0xff; step++)
            {
                firstBytes[1] = (byte)step;
                targetBytes[1] = (byte)step;
                AddMappingFrombfrange(codes, firstBytes, 0xff, targetBytes);
            }
        }
        else
        {
            // PDFBOX-4661: avoid overflow of the last byte, all following values are undefined.
            int lastTargetByte = targetBytes[targetBytes.Length - 1];
            int roomInLastByte = 255 - lastTargetByte;
            int count = Math.Min(last - first, roomInLastByte) + 1;
            AddMappingFrombfrange(codes, firstBytes, count, targetBytes);
        }
    }
}
/**
 * <summary>Parses the character-code-to-unicode mapping [PDF:1.6:5.9.1].</summary>
 * <remarks>
 * The stream is rewound and scanned token by token. Non-keyword tokens are collected
 * as operands; each keyword consumes the collected operands and then clears them.
 * Char and range sections (base-font and CID alike) are stored in one dictionary keyed
 * by input code. Arrays and dictionaries met at the top level are skipped, as are
 * comments. The CMap type definition is recognized by the format but not used here.
 * </remarks>
 * <returns>The input-code-to-character-code dictionary built from the stream.</returns>
 */
public IDictionary<ByteArray, int> Parse()
{
    Stream.Seek(0);
    IDictionary<ByteArray, int> codes = new Dictionary<ByteArray, int>();
    {
        IList<object> operands = new List<object>();
        string cmapName = null;
        while (MoveNext())
        {
            switch (TokenType)
            {
                case TokenTypeEnum.Keyword:
                {
                    string @operator = (string)Token;
                    if (@operator.Equals(BeginBaseFontCharOperator, StringComparison.Ordinal)
                        || @operator.Equals(BeginCIDCharOperator, StringComparison.Ordinal))
                    {
                        /*
                         * NOTE: The first element on each line is the input code of the template font;
                         * the second element is the code or name of the character.
                         */
                        for (int itemIndex = 0, itemCount = (int)operands[0]; itemIndex < itemCount; itemIndex++)
                        {
                            MoveNext();
                            ByteArray inputCode = new ByteArray(ParseInputCode());
                            MoveNext();
                            // FIXME: Unicode character sequences (such as ligatures) have not been supported yet [BUG:72].
                            try
                            {
                                codes[inputCode] = ParseUnicode();
                            }
                            catch (OverflowException)
                            {
                                Debug.WriteLine(String.Format("WARN: Unable to process Unicode sequence from {0} CMap: {1}", cmapName, Token));
                            }
                        }
                    }
                    else if (@operator.Equals(BeginBaseFontRangeOperator, StringComparison.Ordinal)
                        || @operator.Equals(BeginCIDRangeOperator, StringComparison.Ordinal))
                    {
                        /*
                         * NOTE: The first and second elements in each line are the beginning and
                         * ending valid input codes for the template font; the third element is
                         * the beginning character code for the range.
                         */
                        for (int itemIndex = 0, itemCount = (int)operands[0]; itemIndex < itemCount; itemIndex++)
                        {
                            // 1. Beginning input code.
                            MoveNext();
                            byte[] beginInputCode = ParseInputCode();
                            // 2. Ending input code.
                            MoveNext();
                            byte[] endInputCode = ParseInputCode();
                            // 3. Character codes.
                            MoveNext();
                            switch (TokenType)
                            {
                                case TokenTypeEnum.ArrayBegin:
                                {
                                    byte[] inputCode = beginInputCode;
                                    while (MoveNext() && TokenType != TokenTypeEnum.ArrayEnd)
                                    {
                                        // FIXME: Unicode character sequences (such as ligatures) have not been supported yet [BUG:72].
                                        try
                                        {
                                            codes[new ByteArray(inputCode)] = ParseUnicode();
                                        }
                                        catch (OverflowException)
                                        {
                                            Debug.WriteLine(String.Format("WARN: Unable to process Unicode sequence from {0} CMap: {1}", cmapName, Token));
                                        }
                                        OperationUtils.Increment(inputCode);
                                    }
                                    break;
                                }
                                default:
                                {
                                    byte[] inputCode = beginInputCode;
                                    int charCode = ParseUnicode();
                                    int endCharCode = charCode
                                        + (ConvertUtils.ByteArrayToInt(endInputCode) - ConvertUtils.ByteArrayToInt(beginInputCode));
                                    if (endCharCode < charCode)
                                    {
                                        // An inverted range (ending code below beginning code) can never
                                        // reach its end by counting upwards; skip it instead of mapping
                                        // codes without bound.
                                        Debug.WriteLine(String.Format("WARN: Skipped inverted code range in {0} CMap", cmapName));
                                    }
                                    while (charCode <= endCharCode)
                                    {
                                        codes[new ByteArray(inputCode)] = charCode;
                                        if (charCode == endCharCode)
                                        {
                                            // Leave before stepping so neither the input code nor the
                                            // character code is advanced past the end of the range.
                                            break;
                                        }
                                        OperationUtils.Increment(inputCode);
                                        charCode++;
                                    }
                                    break;
                                }
                            }
                        }
                    }
                    else if (@operator.Equals(UseCMapOperator, StringComparison.Ordinal))
                    {
                        // NOTE(review): this replaces the result dictionary, so mappings gathered
                        // before this operator are dropped and later ones are written into whatever
                        // CMap.Get returned — confirm that is intended.
                        codes = CMap.Get((string)operands[0]);
                    }
                    else if (@operator.Equals(DefOperator, StringComparison.Ordinal) && operands.Count >= 2)
                    {
                        // A definition needs both key and value; requiring two operands keeps a
                        // name whose value was a skipped array/dictionary from indexing out of range.
                        if (CMapName.Equals((string)operands[0], StringComparison.Ordinal))
                        {
                            cmapName = (string)operands[1];
                        }
                    }
                    // Operands belong to the keyword just handled, whether it was recognized or not.
                    operands.Clear();
                    break;
                }
                case TokenTypeEnum.ArrayBegin:
                case TokenTypeEnum.DictionaryBegin:
                {
                    // Skip: consume tokens up to the first array or dictionary end.
                    while (MoveNext())
                    {
                        if (TokenType == TokenTypeEnum.ArrayEnd
                            || TokenType == TokenTypeEnum.DictionaryEnd)
                        {
                            break;
                        }
                    }
                    break;
                }
                case TokenTypeEnum.Comment:
                    // Skip.
                    break;
                default:
                {
                    operands.Add(Token);
                    break;
                }
            }
        }
    }
    return codes;
}
/**
 * <summary>Loads the font encoding: fills the character-code-to-Unicode map
 * (<c>codes</c>, only when it has not been set yet) and rebuilds
 * <c>glyphIndexes</c> from the CMap resolved through the font's Encoding entry.</summary>
 */
protected void LoadEncoding()
{
    PdfDataObject encodingObject = BaseDataObject.Resolve(PdfName.Encoding);
    // CMap [PDF:1.6:5.6.4].
    IDictionary<ByteArray, int> cmap = CMap.Get(encodingObject);

    // 1. Unicode.
    if (codes == null)
    {
        codes = new BiDictionary<ByteArray, int>();
        if (encodingObject is PdfName
            && !(encodingObject.Equals(PdfName.IdentityH) || encodingObject.Equals(PdfName.IdentityV)))
        {
            /*
             * NOTE: According to [PDF:1.6:5.9.1], the fallback method to retrieve
             * the character-code-to-Unicode mapping implies getting the UCS2 CMap
             * (Unicode value to CID) corresponding to the font's one (character code to CID);
             * CIDs are the bridge from character codes to Unicode values.
             */
            BiDictionary<ByteArray, int> ucs2CMap;
            {
                PdfDictionary cidSystemInfo = (PdfDictionary)CIDFontDictionary.Resolve(PdfName.CIDSystemInfo);
                String registry = (String)((PdfTextString)cidSystemInfo[PdfName.Registry]).Value;
                String ordering = (String)((PdfTextString)cidSystemInfo[PdfName.Ordering]).Value;
                String ucs2CMapName = registry + "-" + ordering + "-" + "UCS2";
                ucs2CMap = new BiDictionary<ByteArray, int>(CMap.Get(ucs2CMapName));
            }
            if (ucs2CMap.Count > 0)
            {
                foreach (KeyValuePair<ByteArray, int> cmapEntry in cmap)
                {
                    // Reverse lookup: CID -> UCS2 code bytes.
                    // NOTE(review): GetKey presumably yields null for a CID that the UCS2 CMap
                    // does not cover — confirm against BiDictionary. Such entries are left
                    // without a Unicode value instead of failing on the dereference.
                    ByteArray unicodeBytes = ucs2CMap.GetKey(cmapEntry.Value);
                    if (object.ReferenceEquals(unicodeBytes, null))
                    {
                        continue;
                    }
                    codes[cmapEntry.Key] = ConvertUtils.ByteArrayToInt(unicodeBytes.Data);
                }
            }
        }
        if (codes.Count == 0)
        {
            /*
             * NOTE: In case no clue is available to determine the Unicode resolution map,
             * the font is considered symbolic and an identity map is synthesized instead.
             */
            symbolic = true;
            foreach (KeyValuePair<ByteArray, int> cmapEntry in cmap)
            {
                codes[cmapEntry.Key] = ConvertUtils.ByteArrayToInt(cmapEntry.Key.Data);
            }
        }
    }

    // 2. Glyph indexes.
    /*
     * TODO: gids map for glyph indexes as glyphIndexes is used to map cids!!!
     */
    // Character-code-to-CID mapping [PDF:1.6:5.6.4,5].
    glyphIndexes = new Dictionary<int, int>();
    foreach (KeyValuePair<ByteArray, int> cmapEntry in cmap)
    {
        // Only character codes that received a Unicode value above get a glyph index.
        if (!codes.ContainsKey(cmapEntry.Key))
        {
            continue;
        }
        glyphIndexes[codes[cmapEntry.Key]] = cmapEntry.Value;
    }
}