// Processes the character(s) following a backslash and appends the decoded
// result to 'output'.
//  - b, f, n, r, t are appended as the corresponding control characters.
//  - u reads a code unit via ReadUnicodeCodepoint() and appends it, delegating
//    to HandleLowSurrogate for a high surrogate and rejecting a lone low surrogate.
//  - anything else ('\\', '"', '/', ...) starts a new quote sequence via
//    StartQuoteSequence().
// NOTE(review): Ensure(n) presumably guarantees that n more characters are
// available before they are read - confirm against its definition.
private void HandleEscapeSequence(MutableStringBuilder output) {
    Ensure(1);

    switch (ReadUTF8Character()) {
        case 'b':
            output.Append('\b');
            break;

        case 'f':
            output.Append('\f');
            break;

        case 'n':
            output.Append('\n');
            break;

        case 'r':
            output.Append('\r');
            break;

        case 't':
            output.Append('\t');
            break;

        case 'u': {
            Ensure(4);
            char unit = ReadUnicodeCodepoint();

            if (Char.IsLowSurrogate(unit)) {
                // a low surrogate that was not preceded by a high surrogate
                ThrowExceptionForInvalidUTF8();
            } else if (Char.IsHighSurrogate(unit)) {
                HandleLowSurrogate(unit, output);
            } else {
                WriteUTF8Character((int)unit, output);
            }
            break;
        }

        default:
            // '\\', '"', '/'...
            StartQuoteSequence();
            break;
    }
}
// Flushes the pending quote sequence, if any: copies the source range
// [_quoteStart, position) into 'output' and resets _quoteStart to -1.
// Does nothing when _quoteStart is already -1.
private void DetectEndOfQuoteSquence(int position, MutableStringBuilder output) {
    if (_quoteStart == -1) {
        return;
    }

    int length = position - _quoteStart;
    output.Append(_source, _quoteStart, length);
    _quoteStart = -1;
}
// Encodes the code point 'chr' as UTF-8 and appends the resulting bytes to 'output'.
//
// The code point is first serialized as a 4-byte UTF-32 value and then transcoded.
// BitConverter.GetBytes returns the bytes in the byte order of the current machine,
// whereas Encoding.UTF32 decodes little-endian input, so the bytes are reversed on
// big-endian hosts to keep the conversion correct on every architecture.
private void WriteUTF8Character(int chr, MutableStringBuilder output) {
    byte[] utf32Bytes = BitConverter.GetBytes(chr);
    if (!BitConverter.IsLittleEndian) {
        Array.Reverse(utf32Bytes);
    }

    byte[] utf8Bytes = Encoding.Convert(Encoding.UTF32, Encoding.UTF8, utf32Bytes);
    foreach (byte b in utf8Bytes) {
        output.Append(b);
    }
}
// Appends the byte 'b' to 'content'. A zero byte inside a symbol
// (StringType.Symbol flag set) is not appended; Errors.NullCharacterInSymbol
// is reported instead.
private void AppendByte(MutableStringBuilder/*!*/ content, byte b, StringType stringType) {
    bool isSymbol = (stringType & StringType.Symbol) != 0;

    if (isSymbol && b == 0) {
        ReportError(Errors.NullCharacterInSymbol);
        return;
    }

    content.Append(b);
}
// Appends 'c' (converted to char) to 'content'. A zero character inside a symbol
// (StringType.Symbol flag set) is not appended; Errors.NullCharacterInSymbol
// is reported instead.
private void AppendCharacter(MutableStringBuilder/*!*/ content, int c, StringType stringType) {
    bool isSymbol = (stringType & StringType.Symbol) != 0;

    if (isSymbol && c == 0) {
        ReportError(Errors.NullCharacterInSymbol);
        return;
    }

    content.Append((char)c);
}
// Reads string content into 'content' until a character that ends the current
// content token is reached.
//
// Returns the last character read:
//  - -1 when ReadNormalizeEndOfLine returns -1 (presumably end of input - confirm),
//  - the terminator when it is found at nesting level 0,
//  - '#' when it is followed by '$', '@' or '{' in a string that expands embedded code,
//  - a whitespace character in a word list.
// In every case except -1 the reader is moved back (SeekRelative(-eolnWidth)) so that
// the returned character is read again by the caller.
//
// 'nestingLevel' is incremented for each 'openingParenthesis' (when non-zero) and
// decremented for each 'terminator' seen while the level is above zero.
private int ReadStringContent(MutableStringBuilder/*!*/ content, StringType stringType, int terminator, int openingParenthesis, ref int nestingLevel) {
    while (true) {
        int eolnWidth;
        int c = ReadNormalizeEndOfLine(out eolnWidth);
        if (c == -1) {
            return -1;
        }

        if (openingParenthesis != 0 && c == openingParenthesis) {
            // nested opening delimiter; it is appended to the content below
            nestingLevel++;
        } else if (c == terminator) {
            if (nestingLevel == 0) {
                // unread the terminator and let the caller handle it
                SeekRelative(-eolnWidth);
                return c;
            }
            // closes a nested delimiter; it is appended to the content below
            nestingLevel--;
        } else if (((stringType & StringType.ExpandsEmbedded) != 0) && c == '#' && _bufferPos < _lineLength) {
            // look at the character after '#' without consuming it
            int c2 = _lineBuffer[_bufferPos];
            if (c2 == '$' || c2 == '@' || c2 == '{') {
                // start of #$var, #@var or #{expr}: unread '#' and stop
                SeekRelative(-eolnWidth);
                return c;
            }
        } else if ((stringType & StringType.Words) != 0 && IsWhiteSpace(c)) {
            // whitespace separates words in a word list: unread it and stop
            SeekRelative(-eolnWidth);
            return c;
        } else if (c == '\\') {
            // NOTE(review): c is not re-checked for -1 after this read; confirm that
            // every branch below behaves sensibly when the backslash is the last character.
            c = ReadNormalizeEndOfLine(out eolnWidth);

            if (c == '\n') {
                // backslash followed by end of line
                if ((stringType & StringType.Words) == 0) {
                    if ((stringType & StringType.ExpandsEmbedded) != 0) {
                        // expanding strings drop both the backslash and the end of line
                        continue;
                    }
                    content.Append('\\');
                }
            } else if (c == '\\') {
                // escaped backslash: regexes keep both backslashes, other strings keep one
                if ((stringType & StringType.RegularExpression) != 0) {
                    content.Append('\\');
                }
            } else if ((stringType & StringType.RegularExpression) != 0) {
                // regex escapes are copied to the content in their escaped form
                // \uFFFF, \u{codepoint}
                if (c == 'u' && _compatibility >= RubyCompatibility.Ruby19) {
                    content.Append('\\');
                    AppendEscapedUnicode(content);
                } else {
                    // unread the escape character; AppendEscapedRegexEscape reads it itself
                    SeekRelative(-eolnWidth);
                    AppendEscapedRegexEscape(content, terminator);
                }
                continue;
            } else if ((stringType & StringType.ExpandsEmbedded) != 0) {
                if (c == 'u' && _compatibility >= RubyCompatibility.Ruby19) {
                    // TODO: if the string contains ascii characters only => it is ok and the encoding of the string will be UTF8
                    if (_encoding != RubyEncoding.UTF8) {
                        // report the error and keep the escape as literal text
                        ReportError(Errors.EncodingsMixed, RubyEncoding.UTF8.Name, _encoding.Name);
                        content.Append('\\');
                        content.Append('u');
                        continue;
                    }

                    // \uFFFF, \u{codepoint}
                    if (Peek() == '{') {
                        AppendUnicodeCodePoint(content, stringType);
                        continue;
                    } else {
                        // the decoded character is appended at the end of the loop body
                        c = ReadUnicodeEscape();
                    }
                } else {
                    // other escapes:
                    // unread the escape character; ReadEscape reads it itself
                    SeekRelative(-eolnWidth);
                    c = ReadEscape();
                    Debug.Assert(c <= 0xff);
                    AppendByte(content, (byte)c, stringType);
                    continue;
                }
            } else if ((stringType & StringType.Words) != 0 && IsWhiteSpace(c)) {
                /* ignore backslashed spaces in %w */
            } else if (c != terminator && !(openingParenthesis != 0 && c == openingParenthesis)) {
                // the backslash is kept unless it escapes the terminator or the opening delimiter
                content.Append('\\');
            }
        }

        // reached by every path that neither returned nor continued
        AppendCharacter(content, c, stringType);
    }
}
// Reads a code point via ReadUnicodeCodePoint() and appends it to the buffer.
// Values below 0x10000 are appended as a single character (surrogate values
// 0xd800..0xdfff are not rejected here); larger values are appended as a
// UTF-16 surrogate pair.
private void AppendUnicodeCodePoint(MutableStringBuilder/*!*/ content, StringType stringType) {
    int codepoint = ReadUnicodeCodePoint();

    if (codepoint >= 0x10000) {
        int offset = codepoint - 0x10000;
        char highSurrogate = (char)((offset / 0x400) + 0xd800);
        char lowSurrogate = (char)((offset % 0x400) + 0xdc00);
        content.Append(highSurrogate, lowSurrogate);
        return;
    }

    AppendCharacter(content, codepoint, stringType);
}
// Consumes a Unicode escape (the {...} form when the next character is '{',
// the plain form otherwise) and appends its raw source text, starting at the
// 'u' that precedes the current position, to 'content'.
private void AppendEscapedUnicode(MutableStringBuilder/*!*/ content) {
    int escapeStart = _bufferPos - 1;

    if (Peek() != '{') {
        ReadUnicodeEscape();
    } else {
        ReadUnicodeCodePoint();
    }

    Debug.Assert(_lineBuffer[escapeStart] == 'u');

    int escapeLength = _bufferPos - escapeStart;
    content.Append(_lineBuffer, escapeStart, escapeLength);
}
// Consumes a hex escape via ReadHexEscape() and appends its raw source text,
// starting at the 'x' that precedes the current position, to 'content'.
private void AppendEscapedHexEscape(MutableStringBuilder/*!*/ content) {
    int escapeStart = _bufferPos - 1;

    ReadHexEscape();
    Debug.Assert(_lineBuffer[escapeStart] == 'x');

    int escapeLength = _bufferPos - escapeStart;
    content.Append(_lineBuffer, escapeStart, escapeLength);
}
// Handles the character that follows a \M-, \C- or \c prefix in a regex:
// a backslash starts a nested escape, -1 is reported as an invalid escape,
// and any other character is appended as is.
private void AppendRegularExpressionCompositeEscape(MutableStringBuilder/*!*/ content, int term) {
    int next = ReadNormalizeEndOfLine();

    switch (next) {
        case '\\':
            AppendEscapedRegexEscape(content, term);
            break;

        case -1:
            InvalidEscapeCharacter();
            break;

        default:
            content.Append((char)next);
            break;
    }
}
// Reads the escape that follows a backslash in a regex and appends it to
// 'content' in its escaped (source) form, so that the backslash is preserved.
private void AppendEscapedRegexEscape(MutableStringBuilder/*!*/ content, int term) {
    int c = Read();

    switch (c) {
        case 'x':
            content.Append('\\');
            AppendEscapedHexEscape(content);
            break;

        case 'M':
        case 'C':
            // \M-x and \C-x: the prefix letter must be followed by '-'
            if (Read('-')) {
                content.Append('\\', (char)c, '-');
                AppendRegularExpressionCompositeEscape(content, term);
            } else {
                InvalidEscapeCharacter();
            }
            break;

        case 'c':
            content.Append('\\', 'c');
            AppendRegularExpressionCompositeEscape(content, term);
            break;

        case -1:
            InvalidEscapeCharacter();
            break;

        default: {
            if (IsOctalDigit(c)) {
                content.Append('\\');
                AppendEscapedOctalEscape(content);
                break;
            }

            // The backslash is dropped only when the character is a backslash that is
            // also the terminator; in every other case it is kept.
            bool isBackslashTerminator = c == '\\' && c == term;
            if (!isBackslashTerminator) {
                content.Append('\\');
            }

            // ReadEscape is not called if the backslash is followed by an eoln:
            Debug.Assert(c != '\n' && (c != '\r' || Peek() != '\n'));
            content.Append((char)c);
            break;
        }
    }
}
//
// Reads the next token inside a string, symbol, regex or word-list literal.
// Returns:
// - Tokens.StringEnd / Tokens.RegexpEnd ... the literal is closed (or is unterminated)
// - Tokens.WordSeparator                ... whitespace or the terminator in a word list
// - result of StringEmbeddedVariableBegin() ... #$, #@ (start of an embedded global/instance variable)
// - result of StringEmbeddedCodeBegin()     ... #{ (start of an embedded expression)
// - Tokens.StringContent                ... string data (stored via _tokenValue.SetStringContent)
//
internal Tokens TokenizeString(StringContentTokenizer/*!*/ info) {
    StringType stringKind = info.Properties;

    MarkTokenStart();

    // final separator in the list of words (see grammar):
    if (stringKind == StringType.FinalWordSeparator) {
        MarkSingleLineTokenEnd();
        return Tokens.StringEnd;
    }

    int eolnWidth;
    int current = ReadNormalizeEndOfLine(out eolnWidth);

    // unterminated string (error recovery is slightly different from MRI):
    if (current == -1) {
        ReportError(Errors.UnterminatedString);
        _unterminatedToken = true;
        MarkSingleLineTokenEnd();
        return Tokens.StringEnd;
    }

    bool isWordList = (stringKind & StringType.Words) != 0;
    bool spansLines = current == '\n';
    bool skippedWhitespace = false;

    // leading whitespace in a word list is skipped as a whole:
    if (isWordList && IsWhiteSpace(current)) {
        spansLines |= SkipWhitespace();
        current = Read();
        skippedWhitespace = true;
    }

    // terminator of the top-level literal:
    if (current == info.TerminatingCharacter && info.NestingLevel == 0) {
        Tokens closingToken;

        if (isWordList) {
            // the next call produces the final separator in the list of words (see grammar):
            info.Properties = StringType.FinalWordSeparator;
            closingToken = Tokens.WordSeparator;
        } else if ((stringKind & StringType.RegularExpression) != 0) {
            _tokenValue.SetRegexOptions(ReadRegexOptions());
            closingToken = Tokens.RegexpEnd;
        } else {
            // end of string/symbol
            closingToken = Tokens.StringEnd;
        }

        MarkTokenEnd(spansLines);
        return closingToken;
    }

    // whitespace between two words:
    if (skippedWhitespace) {
        Debug.Assert(!IsWhiteSpace(current));
        Back(current);
        MarkTokenEnd(spansLines);
        return Tokens.WordSeparator;
    }

    // start of #$variable, #@variable, #{expression} in a string:
    bool startsWithHash = (stringKind & StringType.ExpandsEmbedded) != 0 && current == '#';
    if (startsWithHash) {
        switch (Peek()) {
            case '$':
            case '@':
                MarkSingleLineTokenEnd();
                return StringEmbeddedVariableBegin();

            case '{':
                Skip('{');
                MarkSingleLineTokenEnd();
                return StringEmbeddedCodeBegin();
        }
    }

    MutableStringBuilder content = new MutableStringBuilder(_encoding);
    if (startsWithHash) {
        // a plain '#' that does not start an embedded variable or expression
        content.Append('#');
    } else {
        // unread the first character so that ReadStringContent processes it
        SeekRelative(-eolnWidth);
    }

    int nestingLevel = info.NestingLevel;
    ReadStringContent(content, stringKind, info.TerminatingCharacter, info.OpeningParenthesis, ref nestingLevel);
    info.NestingLevel = nestingLevel;

    _tokenValue.SetStringContent(content);
    MarkMultiLineTokenEnd();
    return Tokens.StringContent;
}
// Reads the content of a heredoc that expands embedded code.
// Returns the token produced by StringEmbeddedVariableBegin() /
// StringEmbeddedCodeBegin() when the content starts with #$, #@ or #{;
// otherwise accumulates text line by line and returns Tokens.StringContent.
private Tokens TokenizeExpandingHeredocContent(HeredocTokenizer/*!*/ heredoc) {
    int c = Peek();
    bool startsWithHash = c == '#';

    if (startsWithHash) {
        Skip(c);
        switch (Peek()) {
            case '$':
            case '@':
                MarkSingleLineTokenEnd();
                return StringEmbeddedVariableBegin();

            case '{':
                Skip('{');
                MarkSingleLineTokenEnd();
                return StringEmbeddedCodeBegin();
        }
    }

    MutableStringBuilder content = new MutableStringBuilder(_encoding);
    if (startsWithHash) {
        // a plain '#' that does not start an embedded variable or expression
        content.Append('#');
    }

    bool isIndented = (heredoc.Properties & StringType.IndentedHeredoc) != 0;

    while (true) {
        // read string content up to the end of the line:
        int nesting = 0;
        c = ReadStringContent(content, heredoc.Properties, '\n', 0, ref nesting);

        // stop reading on end-of-file or just before an embedded expression: #$, #@, #{
        if (c != '\n') {
            break;
        }

        // adds \n
        content.Append((char)ReadNormalizeEndOfLine());

        // TODO:
        RefillBuffer();

        // first char on the next line:
        if (Peek() == -1) {
            break;
        }

        // the next line is the heredoc end label:
        if (LineContentEquals(heredoc.Label, isIndented)) {
            break;
        }
    }

    _tokenValue.SetStringContent(content);
    MarkMultiLineTokenEnd();
    return Tokens.StringContent;
}
// Consumes an octal escape via ReadOctalEscape(0) and appends its raw source
// text, starting at the first octal digit (the character that precedes the
// current position), to 'content'.
private void AppendEscapedOctalEscape(MutableStringBuilder/*!*/ content) {
    int firstDigit = _bufferPos - 1;

    ReadOctalEscape(0);
    Debug.Assert(IsOctalDigit(_lineBuffer[firstDigit])); // first digit

    int escapeLength = _bufferPos - firstDigit;
    content.Append(_lineBuffer, firstDigit, escapeLength);
}