Beispiel #1
0
        /// <summary>
        /// Scans the next token by delegating to TryScanNextToken. If that call reports failure,
        /// the current value of _nextChar is handed to ParserDiagnostics.HandleUnexpectedCharacter.
        /// </summary>
        /// <param name="position">Receives the position value produced by TryScanNextToken.</param>
        /// <returns>The symbol produced by TryScanNextToken, whether or not the scan succeeded.</returns>
        public Symbol ScanNextToken(out int position)
        {
            Symbol result;
            bool scanned = TryScanNextToken(out result, out position);

            if (scanned)
            {
                return result;
            }

            // Report the offending character; if the diagnostics handler returns,
            // the caller still gets whatever symbol TryScanNextToken produced.
            ParserDiagnostics.HandleUnexpectedCharacter(_nextChar);
            return result;
        }
Beispiel #2
0
        /// <summary>
        /// Scans a hexadecimal string, contained between "&lt;" and "&gt;".
        /// Each pair of hex digits forms one byte; white space between digits is ignored and a
        /// missing final digit is assumed to be 0. If the decoded bytes start with the UTF-16BE
        /// byte order mark (0xFE 0xFF) and at least one more byte follows, the remaining bytes are
        /// combined pairwise into Unicode characters.
        /// </summary>
        /// <returns>
        /// Symbol.UnicodeHexString if the string carried a UTF-16BE byte order mark followed by data,
        /// otherwise Symbol.HexString. The decoded text is left in _token.
        /// </returns>
        public Symbol ScanHexadecimalString()
        {
            Debug.Assert(_currChar == Chars.Less);

            _token = new StringBuilder();
            ScanNextChar(true);
            while (true)
            {
                MoveToNonWhiteSpace();
                if (_currChar == '>')
                {
                    ScanNextChar(true);
                    break;
                }

                if (!System.Uri.IsHexDigit(_currChar))
                {
                    // Only 0-9, A-F and a-f are valid here. Report everything else through the
                    // parser diagnostics instead of failing later with an unrelated FormatException.
                    ParserDiagnostics.HandleUnexpectedCharacter(_currChar);

                    // If the diagnostics handler returns, make sure the loop still terminates:
                    // stop at the end of the input, otherwise skip the offending character.
                    if (_currChar == Chars.EOF)
                    {
                        break;
                    }
                    ScanNextChar(true);
                    continue;
                }

                int highNibble = System.Uri.FromHex(_currChar);
                ScanNextChar(true);

                // White space may separate the two digits of a byte.
                MoveToNonWhiteSpace();

                // The second digit is optional in the PDF spec; a missing digit counts as 0.
                // Anything that is neither a hex digit nor '>' is reported by the next iteration.
                int lowNibble = 0;
                if (System.Uri.IsHexDigit(_currChar))
                {
                    lowNibble = System.Uri.FromHex(_currChar);
                    ScanNextChar(true);
                }

                _token.Append((char)(highNibble * 16 + lowNibble));
            }

            string bytes = _token.ToString();
            int count = bytes.Length;

            if (count > 2 && bytes[0] == (char)0xFE && bytes[1] == (char)0xFF)
            {
                // UTF-16BE: combine two bytes into one character, skipping the byte order mark.
                _token.Length = 0;
                for (int idx = 2; idx < count; idx += 2)
                {
                    // A dangling final byte is padded with 0 instead of indexing past the end.
                    int lowByte = idx + 1 < count ? bytes[idx + 1] : 0;
                    _token.Append((char)(bytes[idx] * 256 + lowByte));
                }
                return _symbol = Symbol.UnicodeHexString;
            }
            return _symbol = Symbol.HexString;
        }
Beispiel #3
0
        /// <summary>
        /// Scans the next token starting at the character returned by MoveToNonWhiteSpace and
        /// dispatches on that character to the matching scan routine. Comments introduced by '%'
        /// are consumed and scanning restarts behind them.
        /// </summary>
        /// <returns>
        /// The symbol of the scanned token. Symbol.Eof is returned when the character is Chars.EOF,
        /// and Symbol.None when the character cannot start a token (after it has been reported to
        /// ParserDiagnostics.HandleUnexpectedCharacter).
        /// </returns>
        public Symbol ScanNextToken()
        {
            while (true)
            {
                _token = new StringBuilder();

                char first = MoveToNonWhiteSpace();

                switch (first)
                {
                    case '%':
                        // The parser does not handle comments; swallow them and start over.
                        ScanComment();
                        continue;

                    case '/':
                        return _symbol = ScanName();

                    case '+':     // TODO: is a sign always the start of a number?
                    case '-':
                    case '.':
                        return _symbol = ScanNumber();

                    case '(':
                        return _symbol = ScanLiteralString();

                    case '[':
                        ScanNextChar(true);
                        return _symbol = Symbol.BeginArray;

                    case ']':
                        ScanNextChar(true);
                        return _symbol = Symbol.EndArray;

                    case '<':
                        if (_nextChar != '<')
                        {
                            return _symbol = ScanHexadecimalString();
                        }
                        // "<<" opens a dictionary; consume both characters.
                        ScanNextChar(true);
                        ScanNextChar(true);
                        return _symbol = Symbol.BeginDictionary;

                    case '>':
                        if (_nextChar == '>')
                        {
                            // ">>" closes a dictionary; consume both characters.
                            ScanNextChar(true);
                            ScanNextChar(true);
                            return _symbol = Symbol.EndDictionary;
                        }
                        // A single '>' is not a token on its own.
                        ParserDiagnostics.HandleUnexpectedCharacter(_nextChar);
                        break;

                    case '#':
                        // Not part of the PDF spec, but at least one program writes "#QNB"
                        // (a math error). Skip those four characters and scan the token after them.
                        if (_nextChar == 'Q')
                        {
                            for (int skipped = 0; skipped < 4; skipped++)
                            {
                                ScanNextChar(true);
                            }
                            return ScanNextToken();
                        }
                        ParserDiagnostics.HandleUnexpectedCharacter(first);
                        break;
                }

                if (char.IsDigit(first))
                {
#if true_
                    return ScanNumberOrReference();
#else
                    // The look-ahead is still performed, but both of its outcomes scan a number.
                    PeekReference();
                    return _symbol = ScanNumber();
#endif
                }

                if (char.IsLetter(first))
                {
                    return _symbol = ScanKeyword();
                }

                if (first == Chars.EOF)
                {
                    return _symbol = Symbol.Eof;
                }

                // Nothing matched: report the character and signal "no token".
                ParserDiagnostics.HandleUnexpectedCharacter(first);
                return _symbol = Symbol.None;
            }
        }
Beispiel #4
0
        /// <summary>
        /// Scans a literal string, contained between "(" and ")".
        /// Phase 1 copies the characters into _token while tracking nested parentheses and
        /// resolving backslash escapes (\n, \r, \t, \b, \f, \(, \), \\, "\ ", line continuation
        /// and one to three digit octal codes). Phase 2 checks the result for a UTF-16 byte order
        /// mark and, if one is present, combines each pair of bytes into one Unicode character.
        /// </summary>
        /// <returns>
        /// Symbol.UnicodeString if the string started with a UTF-16BE (0xFE 0xFF) or UTF-16LE
        /// (0xFF 0xFE) byte order mark, otherwise Symbol.String. The decoded text is left in _token.
        /// </returns>
        public Symbol ScanLiteralString()
        {
            // Reference: 3.2.3  String Objects / Page 53
            // Reference: TABLE 3.32  String Types / Page 157

            Debug.Assert(_currChar == Chars.ParenLeft);
            _token = new StringBuilder();
            int parenLevel = 0;
            char ch = ScanNextChar(false);

            // Phase 1: deal with escape characters.
            while (ch != Chars.EOF)
            {
                switch (ch)
                {
                    case '(':
                        parenLevel++;
                        break;

                    case ')':
                        if (parenLevel == 0)
                        {
                            // Closing parenthesis of the string itself: consume it and decode.
                            ScanNextChar(false);
                            goto Phase2;
                        }
                        parenLevel--;
                        break;

                    case '\\':
                        ch = ScanNextChar(false);
                        switch (ch)
                        {
                            case 'n':
                                ch = Chars.LF;
                                break;

                            case 'r':
                                ch = Chars.CR;
                                break;

                            case 't':
                                ch = Chars.HT;
                                break;

                            case 'b':
                                ch = Chars.BS;
                                break;

                            case 'f':
                                ch = Chars.FF;
                                break;

                            case '(':
                                ch = Chars.ParenLeft;
                                break;

                            case ')':
                                ch = Chars.ParenRight;
                                break;

                            case '\\':
                                ch = Chars.BackSlash;
                                break;

                            // AutoCAD PDFs may contain such strings: (\ )
                            case ' ':
                                ch = ' ';
                                break;

                            case Chars.CR:
                            case Chars.LF:
                                // Backslash at the end of a line: the line break is not part of the string.
                                ch = ScanNextChar(false);
                                continue;

                            default:
                                if (char.IsDigit(ch))              // First octal character.
                                {
                                    // Octal character code.
                                    if (ch >= '8')
                                    {
                                        ParserDiagnostics.HandleUnexpectedCharacter(ch);
                                    }

                                    int n = ch - '0';
                                    if (char.IsDigit(_nextChar))   // Second octal character.
                                    {
                                        ch = ScanNextChar(false);
                                        if (ch >= '8')
                                        {
                                            ParserDiagnostics.HandleUnexpectedCharacter(ch);
                                        }

                                        n = n * 8 + ch - '0';
                                        if (char.IsDigit(_nextChar))   // Third octal character.
                                        {
                                            ch = ScanNextChar(false);
                                            if (ch >= '8')
                                            {
                                                ParserDiagnostics.HandleUnexpectedCharacter(ch);
                                            }

                                            n = n * 8 + ch - '0';
                                        }
                                    }

                                    // An octal escape denotes a single byte; the PDF spec says that
                                    // high-order overflow (e.g. \777) is ignored.
                                    ch = (char)(n & 0xFF);
                                }
                                else
                                {
                                    // TODO: unknown escape character.
                                    ParserDiagnostics.HandleUnexpectedCharacter(ch);
                                }
                                break;
                        }
                        break;

                    default:
                        break;
                }

                _token.Append(ch);
                ch = ScanNextChar(false);
            }

            // Phase 2: deal with UTF-16 if necessary.
            // UTF-16BE Unicode strings start with U+FEFF ("þÿ"). There can be empty strings with UTF-16BE prefix.
            // Adobe Reader also supports UTF-16LE (prefix 0xFF 0xFE).
Phase2:
            bool bigEndian = _token.Length >= 2 && _token[0] == '\xFE' && _token[1] == '\xFF';
            bool littleEndian = _token.Length >= 2 && _token[0] == '\xFF' && _token[1] == '\xFE';
            if (!bigEndian && !littleEndian)
            {
                return _symbol = Symbol.String;
            }

            // Combine two ANSI characters to get one Unicode character.
            StringBuilder raw = _token;
            int length = raw.Length;
            if ((length & 1) == 1)
            {
                // TODO What does the PDF Reference say about this case? Assume (char)0 or treat the file as corrupted?
                // Append a real NUL character. Append(0) would pick the int overload and add the digit '0' (0x30).
                raw.Append('\0');
                ++length;
                DebugBreak.Break();
            }

            _token = new StringBuilder();
            for (int i = 2; i < length; i += 2)
            {
                char highByte = bigEndian ? raw[i] : raw[i + 1];
                char lowByte = bigEndian ? raw[i + 1] : raw[i];
                _token.Append((char)(256 * highByte + lowByte));
            }
            return _symbol = Symbol.UnicodeString;
        }