/// <summary>
/// Consumes a quoted string, including backslash escape sequences,
/// and reports the consumed range as a single token.
/// </summary>
/// <param name="openQuote">Quote character that closes the string when seen unescaped.</param>
/// <param name="cs">Character stream. The character at the current position is skipped as the opening quote.</param>
/// <param name="addToken">Callback that receives the token start position and its length.</param>
public static void HandleString(char openQuote, CharacterStream cs, Action<int, int> addToken) {
    int tokenStart = cs.Position;

    // Step over the opening quote without inspecting it.
    cs.MoveToNextChar();

    bool hasMore = !cs.IsEndOfStream();
    while (hasMore) {
        char ch = cs.CurrentChar;

        if (ch == openQuote) {
            // Closing quote is part of the token.
            cs.MoveToNextChar();
            break;
        }

        if (ch == '\\') {
            // Skip the backslash; the advance below then skips the escaped character.
            cs.MoveToNextChar();
        }

        // Unterminated string: stop when the stream cannot advance any further.
        hasMore = cs.MoveToNextChar();
    }

    int tokenLength = cs.Position - tokenStart;
    if (tokenLength <= 0) {
        return;
    }

    addToken(tokenStart, tokenLength);
}
/// <summary>
/// Returns the length of the operator at the current stream position.
/// Recognizes user-defined infix operators of the form %name% and
/// otherwise delegates to <see cref="Get3CharOrShorterOperatorLength"/>.
/// The stream position is restored to where it was on entry before
/// this method returns or delegates.
/// </summary>
/// <param name="cs">Character stream positioned at the candidate operator.</param>
/// <returns>
/// Length of the %name% operator including both delimiters; 1 if a line
/// break is reached before the closing '%'; otherwise the result of
/// <see cref="Get3CharOrShorterOperatorLength"/>.
/// </returns>
private static int GetNCharOperatorLength(CharacterStream cs) {
    // R allows user-defined infix operators. These have the form of
    // a string of characters delimited by the '%' character. The string
    // can contain any printable character except '%'.
    if (cs.CurrentChar == '%' && !char.IsWhiteSpace(cs.NextChar)) {
        // In case of broken or partially typed operators
        // make sure we terminate at whitespace or end of the line
        // so in 'x <- y % z' '% z' is not an operator.
        int start = cs.Position;

        cs.MoveToNextChar();

        while (!cs.IsEndOfStream() && !cs.IsWhiteSpace()) {
            if (cs.CurrentChar == '%') {
                // Closing delimiter found: measure, then rewind.
                cs.MoveToNextChar();

                int length = cs.Position - start;
                cs.Position = start;
                return length;
            }

            if (cs.IsAtNewLine()) {
                // x <- y %abcd
                cs.Position = start;
                return 1;
            }

            cs.MoveToNextChar();
        }

        // Scan ended at whitespace or end of stream without a closing '%'.
        // Rewind so the fallback below examines the original '%' position
        // and the caller sees an unmoved stream, as on the other paths.
        cs.Position = start;
    }

    return Get3CharOrShorterOperatorLength(cs);
}
/// <summary>
/// Consumes a comment that runs from the current position up to,
/// but not including, the next line break or the end of the stream,
/// and reports it as a token.
/// </summary>
/// <param name="cs">Character stream positioned at the start of the comment.</param>
/// <param name="addToken">Callback that receives the token start position and its length.</param>
public static void HandleEolComment(CharacterStream cs, Action<int, int> addToken) {
    int commentStart = cs.Position;

    while (true) {
        if (cs.IsEndOfStream() || cs.IsAtNewLine()) {
            break;
        }
        cs.MoveToNextChar();
    }

    int commentLength = cs.Position - commentStart;
    if (commentLength <= 0) {
        // Nothing was consumed, so there is no token to report.
        return;
    }

    addToken(commentStart, commentLength);
}
/// <summary>
/// Advances the stream past an identifier. Does nothing unless the
/// current position satisfies the lead-character predicate.
/// </summary>
/// <param name="cs">Character stream to advance.</param>
/// <param name="isIdentifierLeadCharacter">Predicate that tells whether the stream is at a valid first identifier character.</param>
/// <param name="isIdentifierCharacter">Predicate that tells whether the stream is at a valid identifier character.</param>
public static void SkipIdentifier(CharacterStream cs, Func<CharacterStream, bool> isIdentifierLeadCharacter, Func<CharacterStream, bool> isIdentifierCharacter) {
    // The lead-character predicate is consulted before the end-of-stream check.
    bool canStart = isIdentifierLeadCharacter(cs) && !cs.IsEndOfStream();
    if (!canStart) {
        return;
    }

    // Stop at whitespace, at the first non-identifier character,
    // or when the stream cannot advance any further.
    while (!cs.IsWhiteSpace() && isIdentifierCharacter(cs) && cs.MoveToNextChar()) {
    }
}
/// <summary>
/// Replaces every run of whitespace (as defined by the character stream)
/// with a single space and trims the result.
/// </summary>
/// <param name="s">String to normalize.</param>
/// <returns>The normalized string; null or empty input is returned unchanged.</returns>
public static string NormalizeWhitespace(this string s) {
    if (string.IsNullOrEmpty(s)) {
        return s;
    }

    var stream = new CharacterStream(new TextStream(s));
    var result = new StringBuilder();

    while (!stream.IsEndOfStream()) {
        // Collapse whatever whitespace run starts here into one space.
        int before = stream.Position;
        stream.SkipWhitespace();
        if (stream.Position > before) {
            result.Append(' ');
        }

        // Copy the following non-whitespace run verbatim.
        for (; !stream.IsEndOfStream() && !stream.IsWhiteSpace(); stream.MoveToNextChar()) {
            result.Append(stream.CurrentChar);
        }
    }

    return result.ToString().Trim();
}
/// <summary>
/// Advances the stream until it reaches a non-whitespace character
/// or the end of the stream.
/// </summary>
/// <param name="cs">Character stream to advance.</param>
internal static void SkipWhitespace(CharacterStream cs) {
    while (true) {
        if (cs.IsEndOfStream() || !cs.IsWhiteSpace()) {
            return;
        }
        cs.MoveToNextChar();
    }
}
/// <summary>
/// Consumes the exponent part of a numeric literal. The stream is
/// expected to be positioned at the 'E' or 'e' marker.
/// </summary>
/// <param name="cs">Character stream positioned at the exponent marker.</param>
/// <param name="start">Position where the whole number began.</param>
/// <returns>
/// Length of the number measured from <paramref name="start"/>, or 0 when
/// the exponent is not well formed. The stream is left wherever scanning stopped.
/// </returns>
internal static int HandleExponent(CharacterStream cs, int start) {
    Debug.Assert(cs.CurrentChar == 'E' || cs.CurrentChar == 'e');

    cs.MoveToNextChar();
    if (cs.IsWhiteSpace() || cs.IsEndOfStream()) {
        // Nothing follows the marker: 0.1E or 1e
        return 0;
    }

    char signCandidate = cs.CurrentChar;
    bool signSeen = signCandidate == '-' || signCandidate == '+';
    if (signSeen) {
        cs.MoveToNextChar();
    }

    // Collect decimal digits of the exponent.
    int firstDigit = cs.Position;
    while (cs.IsDecimal()) {
        cs.MoveToNextChar();
    }

    // A sign with no digits after it is not a number, like 1.0E-
    // Note that a missing digit run is only rejected when a sign was seen.
    bool noDigits = cs.Position == firstDigit;
    if (signSeen && noDigits) {
        return 0;
    }

    // Technically if letter or braces follows this is not
    // a number but we'll leave it alone for now.

    // TODO: This code is not language specific and yet it currently
    // handles complex 'i' as well as R-specific 'L' suffix.
    // Ideally this needs to be extended in a way so language-specific
    // tokenizer can specify options or control number format.
    char trailing = cs.CurrentChar;
    bool allowedSuffix = trailing == 'i' || trailing == 'L';
    if (char.IsLetter(trailing) && !allowedSuffix) {
        return 0;
    }

    return cs.Position - start;
}