Пример #1
0
 // Flushes the pending run of source text, if any, into the output.
 // _quoteStart holds the start index of that run; -1 means no run is pending.
 // The run is handed to output.Append as (source, start, position - start),
 // after which _quoteStart is reset to -1.
 private void DetectEndOfQuoteSquence(int position, MutableStringBuilder output)
 {
     int runStart = _quoteStart;
     if (runStart == -1) {
         return;
     }

     int runLength = position - runStart;
     output.Append(_source, runStart, runLength);
     _quoteStart = -1;
 }
        // Reads the code unit that follows highSurrogate and, when it is a low
        // surrogate, writes the combined code point to output as UTF-8.
        // The following code unit may be given directly or as a backslash-u escape.
        // Anything else is reported through ThrowExceptionForInvalidUTF8.
        private void HandleLowSurrogate(char highSurrogate, MutableStringBuilder output)
        {
            _surrogatePairStart = _charStart;
            Ensure(1);

            char next = ReadUTF8Character();
            if (next == '\\')
            {
                // Escaped form: a 'u' marker must follow the backslash.
                Ensure(5);
                char marker = ReadUTF8Character();
                if (marker != 'u')
                {
                    ThrowExceptionForInvalidUTF8();
                }
                next = ReadUnicodeCodepoint();
            }

            if (!Char.IsLowSurrogate(next))
            {
                ThrowExceptionForInvalidUTF8();
                return;
            }

            int combined = Char.ConvertToUtf32(highSurrogate, next);
            WriteUTF8Character(combined, output);
        }
 // Appends the UTF-8 encoding of the code point chr to output, one byte at a time.
 private void WriteUTF8Character(int chr, MutableStringBuilder output)
 {
     // Encoding.UTF32 expects little-endian input. BitConverter.GetBytes follows the
     // host byte order, so the four bytes are laid out explicitly to keep the result
     // independent of the machine's endianness.
     byte[] utf32Bytes = new byte[]
     {
         (byte)(chr & 0xFF),
         (byte)((chr >> 8) & 0xFF),
         (byte)((chr >> 16) & 0xFF),
         (byte)((chr >> 24) & 0xFF)
     };

     byte[] utf8Bytes = Encoding.Convert(Encoding.UTF32, Encoding.UTF8, utf32Bytes);
     foreach (byte utf8Byte in utf8Bytes)
     {
         output.Append(utf8Byte);
     }
 }
 // Emits the pending run of source text, if one has been started.
 // _quoteStart is the start index of that run, with -1 meaning "no run pending";
 // it is reset to -1 once the run has been appended to output.
 private void DetectEndOfQuoteSquence(int position, MutableStringBuilder output)
 {
     if (_quoteStart == -1)
     {
         return;
     }

     int length = position - _quoteStart;
     output.Append(_source, _quoteStart, length);
     _quoteStart = -1;
 }
Пример #5
0
 // Processes _source from the current _position to its end, feeding each character
 // to HandleCharacter, and returns the accumulated result. The output builder is
 // created with RubyEncoding.Binary.
 public MutableString Unescape()
 {
     MutableStringBuilder result = new MutableStringBuilder(RubyEncoding.Binary);

     for (;;) {
         if (_position >= _source.Length) {
             break;
         }
         char current = ReadUTF8Character();
         HandleCharacter(current, result);
     }

     // Flush whatever run is still pending at the end of the source.
     DetectEndOfQuoteSquence(_source.Length, result);

     MutableString unescaped = result.ToMutableString();
     return unescaped;
 }
        // Runs the unescaper over the remainder of _source and returns the result.
        // Every character read is dispatched to HandleCharacter; a final call to
        // DetectEndOfQuoteSquence flushes any run still pending at the end.
        public MutableString Unescape()
        {
            MutableStringBuilder builder = new MutableStringBuilder(RubyEncoding.Binary);
            int end = _source.Length;

            while (_position < end)
            {
                char next = ReadUTF8Character();
                HandleCharacter(next, builder);
            }

            DetectEndOfQuoteSquence(end, builder);
            return builder.ToMutableString();
        }
        // Handles the character that follows a backslash.
        //   b, f, n, r, t  -> the matching control character is appended to output
        //   u              -> a code unit is read via ReadUnicodeCodepoint; a high surrogate
        //                     is completed by HandleLowSurrogate, a lone low surrogate is
        //                     reported as invalid, anything else is written as UTF-8
        //   anything else  -> StartQuoteSequence is called (e.g. '\\', '"', '/')
        private void HandleEscapeSequence(MutableStringBuilder output)
        {
            Ensure(1);
            char escape = ReadUTF8Character();

            switch (escape)
            {
                case 'b':
                    output.Append('\b');
                    return;

                case 'f':
                    output.Append('\f');
                    return;

                case 'n':
                    output.Append('\n');
                    return;

                case 'r':
                    output.Append('\r');
                    return;

                case 't':
                    output.Append('\t');
                    return;

                case 'u':
                {
                    Ensure(4);
                    char unit = ReadUnicodeCodepoint();
                    if (Char.IsLowSurrogate(unit))
                    {
                        // low surrogate with no high surrogate
                        ThrowExceptionForInvalidUTF8();
                    }
                    else if (Char.IsHighSurrogate(unit))
                    {
                        HandleLowSurrogate(unit, output);
                    }
                    else
                    {
                        WriteUTF8Character((int)unit, output);
                    }
                    return;
                }

                default:
                    // '\\', '"', '/'...
                    StartQuoteSequence();
                    return;
            }
        }
Пример #8
0
 // Dispatches one input character.
 // A backslash or a high surrogate first flushes the pending run (up to _charStart)
 // and is then handled by HandleEscapeSequence or HandleLowSurrogate respectively.
 // A lone low surrogate is reported as invalid; every other character goes to
 // StartQuoteSequence.
 private void HandleCharacter(char chr, MutableStringBuilder output)
 {
     bool isBackslash = chr == '\\';

     if (isBackslash || Char.IsHighSurrogate(chr)) {
         DetectEndOfQuoteSquence(_charStart, output);
         if (isBackslash) {
             HandleEscapeSequence(output);
         }
         else {
             HandleLowSurrogate(chr, output);
         }
         return;
     }

     if (Char.IsLowSurrogate(chr)) {
         // low surrogate with no high surrogate
         ThrowExceptionForInvalidUTF8();
         return;
     }

     StartQuoteSequence();
 }
 // Routes a single character to the matching handler:
 //   backslash      -> flush the pending run, then HandleEscapeSequence
 //   high surrogate -> flush the pending run, then HandleLowSurrogate
 //   low surrogate  -> reported through ThrowExceptionForInvalidUTF8
 //   anything else  -> StartQuoteSequence
 private void HandleCharacter(char chr, MutableStringBuilder output)
 {
     if (chr == '\\')
     {
         DetectEndOfQuoteSquence(_charStart, output);
         HandleEscapeSequence(output);
         return;
     }

     if (Char.IsHighSurrogate(chr))
     {
         DetectEndOfQuoteSquence(_charStart, output);
         HandleLowSurrogate(chr, output);
         return;
     }

     if (!Char.IsLowSurrogate(chr))
     {
         StartQuoteSequence();
         return;
     }

     // low surrogate with no high surrogate
     ThrowExceptionForInvalidUTF8();
 }
Пример #10
0
 // Handles the character that follows a backslash.
 // The 'u' form reads a code unit and writes it (or the surrogate pair it starts)
 // as UTF-8. The single-letter forms b, f, n, r, t append the matching control
 // character. Any other character is passed on to StartQuoteSequence.
 private void HandleEscapeSequence(MutableStringBuilder output)
 {
     Ensure(1);
     char escape = ReadUTF8Character();

     if (escape == 'u') {
         Ensure(4);
         char unit = ReadUnicodeCodepoint();
         if (Char.IsHighSurrogate(unit)) {
             HandleLowSurrogate(unit, output);
             return;
         }
         if (Char.IsLowSurrogate(unit)) {
             // low surrogate with no high surrogate
             ThrowExceptionForInvalidUTF8();
             return;
         }
         WriteUTF8Character((int)unit, output);
         return;
     }

     char control;
     switch (escape) {
         case 'b': control = '\b'; break;
         case 'f': control = '\f'; break;
         case 'n': control = '\n'; break;
         case 'r': control = '\r'; break;
         case 't': control = '\t'; break;
         default:
             // '\\', '"', '/'...
             StartQuoteSequence();
             return;
     }

     output.Append(control);
 }
Пример #11
0
 // Appends the byte b to content, except that a zero byte inside a symbol
 // (StringType.Symbol flag set) is reported as an error and not appended.
 private void AppendByte(MutableStringBuilder/*!*/ content, byte b, StringType stringType) {
     bool inSymbol = (stringType & StringType.Symbol) != 0;
     if (inSymbol && b == 0) {
         ReportError(Errors.NullCharacterInSymbol);
         return;
     }

     content.Append(b);
 }
Пример #12
0
        //
        // Produces the next token of a string-like literal whose state is carried by `info`
        // (its Properties, TerminatingCharacter, OpeningParenthesis and NestingLevel).
        //
        // Returns one of:
        // - Tokens.StringEnd / Tokens.RegexpEnd            ... string/regex closed (StringEnd is also
        //                                                      returned for an unterminated string)
        // - Tokens.WordSeparator                           ... whitespace in a word list, or the
        //                                                      terminator of a word list
        // - the result of StringEmbeddedVariableBegin()    ... #$, #@ (start of an embedded global/instance variable)
        // - the result of StringEmbeddedCodeBegin()        ... #{ (start of an embedded expression)
        // - Tokens.StringContent                           ... string data (stored via _tokenValue.SetStringContent)
        //
        internal Tokens TokenizeString(StringContentTokenizer/*!*/ info) {
            StringType stringKind = info.Properties;
            bool whitespaceSeen = false;

            // final separator in the list of words (see grammar):
            // this state is set below when the word list terminator is reached, so that the
            // terminator yields WordSeparator first and StringEnd on the following call.
            if (stringKind == StringType.FinalWordSeparator) {
                MarkTokenStart();
                MarkSingleLineTokenEnd();
                return Tokens.StringEnd;
            }

            MarkTokenStart();

            int eolnWidth;
            int c = ReadNormalizeEndOfLine(out eolnWidth);

            // unterminated string (error recovery is slightly different from MRI):
            if (c == -1) {
                ReportError(Errors.UnterminatedString);
                _unterminatedToken = true;
                MarkSingleLineTokenEnd();
                return Tokens.StringEnd;
            }

            bool isMultiline = c == '\n';

            // skip whitespace in word list:
            if ((stringKind & StringType.Words) != 0 && IsWhiteSpace(c)) {
                isMultiline |= SkipWhitespace();
                c = Read(); 
                whitespaceSeen = true;
            }

            // end of the top-level string:
            if (c == info.TerminatingCharacter && info.NestingLevel == 0) {
                
                // end of words:
                if ((stringKind & StringType.Words) != 0) {
                    // final separator in the list of words (see grammar):
                    info.Properties = StringType.FinalWordSeparator;
                    MarkTokenEnd(isMultiline);
                    return Tokens.WordSeparator;
                }

                // end of regex:
                if ((stringKind & StringType.RegularExpression) != 0) {
                    _tokenValue.SetRegexOptions(ReadRegexOptions());
                    MarkTokenEnd(isMultiline);
                    return Tokens.RegexpEnd;
                }
                
                // end of string/symbol:
                MarkTokenEnd(isMultiline);
                return Tokens.StringEnd;
            }

            // word separator:
            // the non-whitespace character read after the skipped whitespace is handed to Back(c)
            // (presumably un-read so the next call sees it -- confirm against Back).
            if (whitespaceSeen) {
                Debug.Assert(!IsWhiteSpace(c));
                Back(c);
                MarkTokenEnd(isMultiline);
                return Tokens.WordSeparator;
            }

            MutableStringBuilder content;

            // start of #$variable, #@variable, #{expression} in a string:
            if ((stringKind & StringType.ExpandsEmbedded) != 0 && c == '#') {
                switch (Peek()) {
                    case '$':
                    case '@':
                        MarkSingleLineTokenEnd();
                        return StringEmbeddedVariableBegin();

                    case '{':
                        Skip('{');
                        MarkSingleLineTokenEnd();
                        return StringEmbeddedCodeBegin();
                }
                // a '#' that does not start an embedded variable/expression is plain content:
                content = new MutableStringBuilder(_encoding);
                content.Append('#');
            } else {
                content = new MutableStringBuilder(_encoding);
                // seek back over the character read above so that ReadStringContent processes it:
                SeekRelative(-eolnWidth);
            }

            // NestingLevel is copied to a local because it is passed by ref and written back afterwards.
            int nestingLevel = info.NestingLevel;
            ReadStringContent(content, stringKind, info.TerminatingCharacter, info.OpeningParenthesis, ref nestingLevel);
            info.NestingLevel = nestingLevel;

            _tokenValue.SetStringContent(content);
            MarkMultiLineTokenEnd();
            return Tokens.StringContent;
        }
Пример #13
0
        // Reads string data into `content` until a character that ends the current run is found.
        //
        // Returns the last character read:
        // - -1 when the reader reports end of input at the top of the loop,
        // - the terminator (only when nestingLevel is 0),
        // - '#' when it is followed by '$', '@' or '{' in a string with StringType.ExpandsEmbedded,
        // - a whitespace character in a word list (StringType.Words).
        // In the last three cases SeekRelative(-eolnWidth) is called before returning
        // (presumably so the caller reads that character again -- confirm against SeekRelative).
        //
        // nestingLevel is incremented for each openingParenthesis (when non-zero) and decremented
        // for each terminator seen at a non-zero level.
        private int ReadStringContent(MutableStringBuilder/*!*/ content, StringType stringType, int terminator, int openingParenthesis, 
            ref int nestingLevel) {

            while (true) {
                int eolnWidth;
                int c = ReadNormalizeEndOfLine(out eolnWidth);
                if (c == -1) {
                    return -1;
                }

                if (openingParenthesis != 0 && c == openingParenthesis) {
                    nestingLevel++;
                } else if (c == terminator) {
                    if (nestingLevel == 0) {
                        SeekRelative(-eolnWidth);
                        return c;
                    }
                    nestingLevel--;
                } else if (((stringType & StringType.ExpandsEmbedded) != 0) && c == '#' && _bufferPos < _lineLength) {
                    // look at the character after '#' without consuming it:
                    int c2 = _lineBuffer[_bufferPos];
                    if (c2 == '$' || c2 == '@' || c2 == '{') {
                        SeekRelative(-eolnWidth);
                        return c;
                    }
                } else if ((stringType & StringType.Words) != 0 && IsWhiteSpace(c)) {
                    SeekRelative(-eolnWidth);
                    return c;
                } else if (c == '\\') {
                    // NOTE(review): -1 (end of input) is not checked here; in the branches that fall
                    // through to AppendCharacter it would be appended as (char)-1 -- confirm intended.
                    c = ReadNormalizeEndOfLine(out eolnWidth);

                    if (c == '\n') {
                        // backslash followed by end of line:
                        // - word list: only the '\n' is appended,
                        // - expanding string: the backslash and the line break are both dropped,
                        // - otherwise: both the backslash and the '\n' are appended.
                        if ((stringType & StringType.Words) == 0) {
                            if ((stringType & StringType.ExpandsEmbedded) != 0) {
                                continue;
                            }
                            content.Append('\\');
                        }
                    } else if (c == '\\') {
                        // escaped backslash: a regex keeps both backslashes, other strings keep one
                        if ((stringType & StringType.RegularExpression) != 0) {
                            content.Append('\\');
                        }
                    } else if ((stringType & StringType.RegularExpression) != 0) {
                        // regex escapes are appended in their escaped form:
                        // \uFFFF, \u{codepoint}
                        if (c == 'u' && _compatibility >= RubyCompatibility.Ruby19) {
                            content.Append('\\');
                            AppendEscapedUnicode(content);
                        } else {
                            SeekRelative(-eolnWidth);
                            AppendEscapedRegexEscape(content, terminator);
                        }
                        continue;
                    } else if ((stringType & StringType.ExpandsEmbedded) != 0) {
                        if (c == 'u' && _compatibility >= RubyCompatibility.Ruby19) {
                            // TODO: if the string contains ascii characters only => it is ok and the encoding of the string will be UTF8
                            if (_encoding != RubyEncoding.UTF8) {
                                // error recovery: report the error and keep the "\u" text as it is
                                ReportError(Errors.EncodingsMixed, RubyEncoding.UTF8.Name, _encoding.Name);
                                content.Append('\\');
                                content.Append('u');
                                continue;
                            }

                            // \uFFFF, \u{codepoint}
                            if (Peek() == '{') {
                                AppendUnicodeCodePoint(content, stringType);
                                continue;
                            } else {
                                // the value read falls through to AppendCharacter below
                                c = ReadUnicodeEscape();
                            }
                        } else {
                            // other escapes:
                            SeekRelative(-eolnWidth);
                            c = ReadEscape();
                            Debug.Assert(c <= 0xff);
                            AppendByte(content, (byte)c, stringType);
                            continue;
                        }
                    } else if ((stringType & StringType.Words) != 0 && IsWhiteSpace(c)) {
                        /* ignore backslashed spaces in %w */
                    } else if (c != terminator && !(openingParenthesis != 0 && c == openingParenthesis)) {
                        // non-expanding string: the backslash is kept unless it escapes the terminator
                        // or the opening parenthesis
                        content.Append('\\');
                    }
                }

                AppendCharacter(content, c, stringType);
            }
        }
Пример #14
0
 // Appends c (cast to char) to content, except that a zero value inside a symbol
 // (StringType.Symbol flag set) is reported as an error and not appended.
 private void AppendCharacter(MutableStringBuilder/*!*/ content, int c, StringType stringType) {
     bool inSymbol = (stringType & StringType.Symbol) != 0;
     if (inSymbol && c == 0) {
         ReportError(Errors.NullCharacterInSymbol);
         return;
     }

     content.Append((char)c);
 }
Пример #15
0
        // Consumes a Unicode escape and appends its text, starting at the 'u', to content
        // as it appears in _lineBuffer. A following '{' selects ReadUnicodeCodePoint,
        // otherwise ReadUnicodeEscape is used.
        // Expects the 'u' to be the character just before _bufferPos on entry.
        private void AppendEscapedUnicode(MutableStringBuilder/*!*/ content) {
            int escapeStart = _bufferPos - 1;

            bool braced = Peek() == '{';
            if (braced) {
                ReadUnicodeCodePoint();
            } else {
                ReadUnicodeEscape();
            }

            Debug.Assert(_lineBuffer[escapeStart] == 'u');

            int escapeLength = _bufferPos - escapeStart;
            content.Append(_lineBuffer, escapeStart, escapeLength);
        }
Пример #16
0
        // Reads a code point via ReadUnicodeCodePoint and appends it to content.
        // Values below 0x10000 are appended through AppendCharacter; larger values
        // are appended as a UTF-16 surrogate pair.
        private void AppendUnicodeCodePoint(MutableStringBuilder/*!*/ content, StringType stringType) {
            int codepoint = ReadUnicodeCodePoint();

            if (codepoint >= 0x10000) {
                // Split the 20-bit offset into the high and low 10 bits.
                int offset = codepoint - 0x10000;
                char high = (char)((offset >> 10) + 0xd800);
                char low = (char)((offset & 0x3ff) + 0xdc00);
                content.Append(high, low);
                return;
            }

            // code points [0xd800 .. 0xdfff] are not treated as invalid
            AppendCharacter(content, codepoint, stringType);
        }
Пример #17
0
        // Consumes an octal escape via ReadOctalEscape(0) and appends its text, starting at
        // the first digit, to content as it appears in _lineBuffer.
        // Expects the first octal digit to be the character just before _bufferPos on entry.
        private void AppendEscapedOctalEscape(MutableStringBuilder/*!*/ content) {
            int firstDigit = _bufferPos - 1;

            ReadOctalEscape(0);
            Debug.Assert(IsOctalDigit(_lineBuffer[firstDigit])); // first digit

            int length = _bufferPos - firstDigit;
            content.Append(_lineBuffer, firstDigit, length);
        }
Пример #18
0
        // Consumes a hex escape via ReadHexEscape and appends its text, starting at the 'x',
        // to content as it appears in _lineBuffer.
        // Expects the 'x' to be the character just before _bufferPos on entry.
        private void AppendEscapedHexEscape(MutableStringBuilder/*!*/ content) {
            int escapeStart = _bufferPos - 1;

            ReadHexEscape();
            Debug.Assert(_lineBuffer[escapeStart] == 'x');

            int escapeLength = _bufferPos - escapeStart;
            content.Append(_lineBuffer, escapeStart, escapeLength);
        }
Пример #19
0
 // Reads the character that follows a composite escape prefix (the \M-, \C- and \c
 // forms handled by AppendEscapedRegexEscape) and appends it to content.
 // A backslash starts a nested escape; end of input (-1) is reported as an invalid escape.
 private void AppendRegularExpressionCompositeEscape(MutableStringBuilder/*!*/ content, int term) {
     int next = ReadNormalizeEndOfLine();

     switch (next) {
         case -1:
             InvalidEscapeCharacter();
             break;

         case '\\':
             AppendEscapedRegexEscape(content, term);
             break;

         default:
             content.Append((char)next);
             break;
     }
 }
Пример #20
0
        // Reads the character after a backslash inside a regex and appends the escape
        // sequence to content in its escaped (textual) form.
        //   \x..            hex escape, appended via AppendEscapedHexEscape
        //   \M-x, \C-x, \cx composite escapes; \M and \C must be followed by '-'
        //   octal digit     octal escape, appended via AppendEscapedOctalEscape
        //   end of input    reported through InvalidEscapeCharacter
        //   anything else   appended after a backslash (see the note on the condition below)
        private void AppendEscapedRegexEscape(MutableStringBuilder/*!*/ content, int term) {
            int c = Read();

            if (c == -1) {
                InvalidEscapeCharacter();
                return;
            }

            switch (c) {
                case 'x':
                    content.Append('\\');
                    AppendEscapedHexEscape(content);
                    return;

                case 'M':
                case 'C':
                    if (Read('-')) {
                        // c is 'M' or 'C' here, so this emits "\M-" or "\C-"
                        content.Append('\\', (char)c, '-');
                        AppendRegularExpressionCompositeEscape(content, term);
                    } else {
                        InvalidEscapeCharacter();
                    }
                    return;

                case 'c':
                    content.Append('\\', 'c');
                    AppendRegularExpressionCompositeEscape(content, term);
                    return;
            }

            if (IsOctalDigit(c)) {
                content.Append('\\');
                AppendEscapedOctalEscape(content);
                return;
            }

            // NOTE(review): the backslash is omitted only when c is a backslash AND equals term,
            // i.e. only when the terminator itself is a backslash. Kept as is; confirm whether
            // this is intended before changing it.
            if (!(c == '\\' && c == term)) {
                content.Append('\\');
            }

            // ReadEscape is not called if the backslash is followed by an eoln:
            Debug.Assert(c != '\n' && (c != '\r' || Peek() != '\n'));
            content.Append((char)c);
        }
Пример #21
0
        // Completes a surrogate pair started by highSurrogate.
        // The next code unit is read directly or, when a backslash is found, from a
        // backslash-u escape. If it is a low surrogate, the combined code point is
        // written to output as UTF-8; otherwise ThrowExceptionForInvalidUTF8 is called.
        private void HandleLowSurrogate(char highSurrogate, MutableStringBuilder output)
        {
            _surrogatePairStart = _charStart;
            Ensure(1);

            char candidate = ReadUTF8Character();
            bool escaped = candidate == '\\';
            if (escaped) {
                Ensure(5);
                if (ReadUTF8Character() != 'u') {
                    ThrowExceptionForInvalidUTF8();
                }
                candidate = ReadUnicodeCodepoint();
            }

            if (Char.IsLowSurrogate(candidate)) {
                int codePoint = Char.ConvertToUtf32(highSurrogate, candidate);
                WriteUTF8Character(codePoint, output);
                return;
            }

            ThrowExceptionForInvalidUTF8();
        }
Пример #22
0
        // Produces the next token of the body of a heredoc.
        //
        // Returns the result of StringEmbeddedVariableBegin() for "#$" / "#@", the result of
        // StringEmbeddedCodeBegin() for "#{", and otherwise Tokens.StringContent with the text
        // read so far stored via _tokenValue.SetStringContent.
        private Tokens TokenizeExpandingHeredocContent(HeredocTokenizer/*!*/ heredoc) {
            MutableStringBuilder content;

            int c = Peek();
            if (c == '#') {
                Skip(c);
                
                switch (Peek()) {
                    case '$':
                    case '@':
                        MarkSingleLineTokenEnd();
                        return StringEmbeddedVariableBegin();

                    case '{':
                        Skip('{');
                        MarkSingleLineTokenEnd();
                        return StringEmbeddedCodeBegin();
                }

                // a '#' that does not start an embedded variable/expression is plain content:
                content = new MutableStringBuilder(_encoding);
                content.Append('#');
            } else {
                content = new MutableStringBuilder(_encoding);
            }

            bool isIndented = (heredoc.Properties & StringType.IndentedHeredoc) != 0;
            
            // reads one line per iteration until the next line matches the heredoc label
            do {
                // read string content up to the end of the line:
                // '\n' is passed as the terminator with no opening parenthesis; the nesting level
                // is a throw-away local.
                int tmp = 0;
                c = ReadStringContent(content, heredoc.Properties, '\n', 0, ref tmp);
                
                // stop reading on end-of-file or just before an embedded expression: #$, #@, #{
                if (c != '\n') {
                    break;
                }

                // adds \n
                content.Append((char)ReadNormalizeEndOfLine());

                // TODO:
                RefillBuffer();

                // first char on the next line:
                if (Peek() == -1) {
                    break;
                }

            } while (!LineContentEquals(heredoc.Label, isIndented));

            _tokenValue.SetStringContent(content);
            MarkMultiLineTokenEnd();
            return Tokens.StringContent;
        }
Пример #23
0
 // Appends the UTF-8 encoding of the code point chr to output, one byte at a time.
 private void WriteUTF8Character(int chr, MutableStringBuilder output)
 {
     // Encoding.UTF32 expects little-endian input. BitConverter.GetBytes follows the
     // host byte order, so the four bytes are laid out explicitly to keep the result
     // independent of the machine's endianness.
     byte[] utf32Bytes = new byte[] {
         (byte)(chr & 0xFF),
         (byte)((chr >> 8) & 0xFF),
         (byte)((chr >> 16) & 0xFF),
         (byte)((chr >> 24) & 0xFF)
     };

     byte[] utf8Bytes = Encoding.Convert(Encoding.UTF32, Encoding.UTF8, utf32Bytes);
     foreach (byte utf8Byte in utf8Bytes) {
         output.Append(utf8Byte);
     }
 }
Пример #24
0
 // Stores the value produced by contentBuilder.ToValue() in StringContent.
 // contentBuilder must not be null (checked with Assert.NotNull).
 internal void SetStringContent(MutableStringBuilder /*!*/ contentBuilder)
 {
     Assert.NotNull(contentBuilder);

     var builtValue = contentBuilder.ToValue();
     StringContent = builtValue;
 }
Пример #25
0
 // Stores the value produced by contentBuilder.ToValue() in StringContent and records
 // the builder's encoding in Encoding.
 // contentBuilder must not be null (checked with Assert.NotNull).
 internal void SetStringContent(MutableStringBuilder/*!*/ contentBuilder)
 {
     Assert.NotNull(contentBuilder);

     var builtValue = contentBuilder.ToValue();
     StringContent = builtValue;

     var builderEncoding = contentBuilder.Encoding;
     Encoding = builderEncoding;
 }