/// <summary>
/// Returns the length of the operator that starts at the current stream
/// position. User-defined infix operators of the form %name% are tried
/// first; anything else is delegated to the fixed-length operator lookup.
/// </summary>
/// <param name="cs">Character stream positioned at the operator candidate.</param>
/// <returns>
/// Length of the %...% operator when a closing '%' is found, 1 when the
/// scan reaches a line break first, otherwise the result of
/// <see cref="Get3CharOrShorterOperatorLength"/>.
/// </returns>
/// <remarks>
/// Every path that moves the stream while scanning sets the position back
/// to where it was on entry before returning or delegating.
/// </remarks>
private static int GetNCharOperatorLength(CharacterStream cs) {
    // R allows user-defined infix operators. These have the form of
    // a string of characters delimited by the '%' character. The string
    // can contain any printable character except '%'.
    if (cs.CurrentChar == '%' && !char.IsWhiteSpace(cs.NextChar)) {
        // In case of broken or partially typed operators
        // make sure we terminate at whitespace or end of the line
        // so in 'x <- y % z' '% z' is not an operator.
        int start = cs.Position;

        cs.MoveToNextChar();

        while (!cs.IsEndOfStream() && !cs.IsWhiteSpace()) {
            if (cs.CurrentChar == '%') {
                // Closing delimiter found: include it in the length.
                cs.MoveToNextChar();

                int length = cs.Position - start;
                cs.Position = start;
                return length;
            }

            if (cs.IsAtNewLine()) {
                // x <- y %abcd
                cs.Position = start;
                return 1;
            }

            cs.MoveToNextChar();
        }

        // The scan stopped at whitespace or end of stream without finding
        // the closing '%'. Rewind so the fallback below inspects the
        // characters at the original position and the caller's stream
        // is not left advanced.
        cs.Position = start;
    }

    return Get3CharOrShorterOperatorLength(cs);
}
/// <summary>
/// Consumes a quoted string starting at the current position and reports
/// it as a single token. A backslash causes the character after it to be
/// skipped, so an escaped quote does not terminate the string.
/// </summary>
/// <param name="openQuote">Quote character that terminates the string.</param>
/// <param name="cs">Character stream positioned at the opening quote.</param>
/// <param name="addToken">Callback receiving the token start and length.</param>
public static void HandleString(char openQuote, CharacterStream cs, Action<int, int> addToken) {
    int tokenStart = cs.Position;

    // Step over the opening quote.
    cs.MoveToNextChar();

    bool hasMore = !cs.IsEndOfStream();
    while (hasMore) {
        char ch = cs.CurrentChar;

        if (ch == openQuote) {
            // Closing quote is part of the token.
            cs.MoveToNextChar();
            break;
        }

        if (ch == '\\') {
            // Skip the backslash; the escaped character is skipped below.
            cs.MoveToNextChar();
        }

        hasMore = cs.MoveToNextChar();
    }

    int tokenLength = cs.Position - tokenStart;
    if (tokenLength <= 0) {
        return;
    }

    addToken(tokenStart, tokenLength);
}
/// <summary>
/// Determines how many characters the operator at the current stream
/// position occupies. Zero means the character sequence is not an operator.
/// </summary>
/// <param name="cs">Character stream positioned at the operator candidate.</param>
/// <returns>Operator length in characters, or zero.</returns>
public static int OperatorLength(CharacterStream cs) {
    // Operator syntax reference:
    // http://stat.ethz.ch/R-manual/R-patched/library/base/html/Syntax.html
    //
    // Candidates are tried longest first.
    int operatorLength = GetNCharOperatorLength(cs);
    return operatorLength;
}
/// <summary>
/// Consumes a comment that runs from the current position to the end of
/// the line (or the end of the stream) and reports it as a token.
/// </summary>
/// <param name="cs">Character stream positioned at the start of the comment.</param>
/// <param name="addToken">Callback receiving the token start and length.</param>
public static void HandleEolComment(CharacterStream cs, Action<int, int> addToken) {
    int commentStart = cs.Position;

    while (true) {
        if (cs.IsEndOfStream() || cs.IsAtNewLine()) {
            break;
        }
        cs.MoveToNextChar();
    }

    int commentLength = cs.Position - commentStart;
    if (commentLength <= 0) {
        // Nothing was consumed, so there is no token to report.
        return;
    }

    addToken(commentStart, commentLength);
}
/// <summary>
/// Returns 3 when the three characters at the current position are found
/// in the three-character operator table; otherwise defers to the
/// two-character (and shorter) lookup.
/// </summary>
/// <param name="cs">Character stream positioned at the operator candidate.</param>
private static int Get3CharOrShorterOperatorLength(CharacterStream cs) {
    const int CandidateLength = 3;

    if (cs.DistanceFromEnd >= CandidateLength) {
        string candidate = cs.GetSubstringAt(cs.Position, CandidateLength);

        bool isKnownOperator = candidate.Length == CandidateLength
            && Array.BinarySearch<string>(_threeChars, candidate) >= 0;

        if (isKnownOperator) {
            return CandidateLength;
        }
    }

    return Get2CharOrShorterOperatorLength(cs);
}
/// <summary>
/// Returns 2 when the two characters at the current position are found
/// in the two-character operator table; otherwise returns whatever the
/// single-character lookup yields for the current character.
/// </summary>
/// <param name="cs">Character stream positioned at the operator candidate.</param>
internal static int Get2CharOrShorterOperatorLength(CharacterStream cs) {
    bool enoughCharacters = cs.DistanceFromEnd >= 2;

    if (enoughCharacters) {
        string pair = cs.GetSubstringAt(cs.Position, 2);

        if (pair.Length == 2) {
            int tableIndex = Array.BinarySearch<string>(_twoChars, pair);
            bool found = tableIndex >= 0;
            if (found) {
                return 2;
            }
        }
    }

    return GetSingleCharOperatorLength(cs.CurrentChar);
}
/// <summary>
/// Advances the stream past an identifier. Nothing is consumed unless the
/// current position satisfies <paramref name="isIdentifierLeadCharacter"/>.
/// Scanning stops at whitespace, at the first position rejected by
/// <paramref name="isIdentifierCharacter"/>, or when the stream cannot
/// advance any further.
/// </summary>
/// <param name="cs">Character stream positioned at the identifier candidate.</param>
/// <param name="isIdentifierLeadCharacter">Predicate that accepts a valid first character.</param>
/// <param name="isIdentifierCharacter">Predicate that accepts a valid identifier character.</param>
public static void SkipIdentifier(CharacterStream cs, Func<CharacterStream, bool> isIdentifierLeadCharacter, Func<CharacterStream, bool> isIdentifierCharacter) {
    if (!isIdentifierLeadCharacter(cs) || cs.IsEndOfStream()) {
        return;
    }

    // Conditions are evaluated left to right and short-circuit, so the
    // stream only advances over characters accepted by the predicate.
    while (!cs.IsWhiteSpace() && isIdentifierCharacter(cs) && cs.MoveToNextChar()) {
    }
}
/// <summary>
/// Collapses every run of whitespace in the string into a single space
/// and trims the result. A null or empty input is returned unchanged.
/// </summary>
/// <param name="s">String to normalize.</param>
/// <returns>The normalized string.</returns>
public static string NormalizeWhitespace(this string s) {
    if (string.IsNullOrEmpty(s)) {
        return s;
    }

    var stream = new CharacterStream(new TextStream(s));
    var normalized = new StringBuilder();

    while (!stream.IsEndOfStream()) {
        int beforeSkip = stream.Position;
        stream.SkipWhitespace();

        bool skippedAny = stream.Position - beforeSkip > 0;
        if (skippedAny) {
            // A whole whitespace run becomes one space.
            normalized.Append(' ');
        }

        // Copy the following non-whitespace run verbatim.
        for (; !stream.IsEndOfStream() && !stream.IsWhiteSpace(); stream.MoveToNextChar()) {
            normalized.Append(stream.CurrentChar);
        }
    }

    string result = normalized.ToString();
    return result.Trim();
}
/// <summary>
/// Consumes hexadecimal digits at the current position, followed by an
/// optional 'L' suffix, and returns the distance from
/// <paramref name="start"/> to the resulting position.
/// </summary>
/// <param name="cs">Character stream positioned after the hex prefix.</param>
/// <param name="start">Position where the whole number token began.</param>
/// <returns>Length of the token measured from <paramref name="start"/>.</returns>
internal static int HandleHex(CharacterStream cs, int start) {
    bool atHexDigit = CharacterStream.IsHex(cs.CurrentChar);
    while (atHexDigit) {
        cs.MoveToNextChar();
        atHexDigit = CharacterStream.IsHex(cs.CurrentChar);
    }

    // TODO: handle C99 floating point hex syntax like 0x1.1p-2
    bool hasSuffix = cs.CurrentChar == 'L';
    if (hasSuffix) {
        cs.MoveToNextChar();
    }

    int tokenLength = cs.Position - start;
    return tokenLength;
}
/// <summary>
/// Checks whether the stream text in the range [start, end) parses as a
/// double using the invariant culture.
/// </summary>
/// <param name="cs">Character stream that contains the text.</param>
/// <param name="start">Position of the first character of the candidate.</param>
/// <param name="end">Position just past the last character of the candidate.</param>
/// <returns>True when the text parses as a double.</returns>
private static bool IsValidDouble(CharacterStream cs, int start, int end) {
    const NumberStyles AcceptedStyles = NumberStyles.Number | NumberStyles.AllowExponent;

    string candidate = cs.GetSubstringAt(start, end - start);

    double parsedValue;
    bool parsed = Double.TryParse(candidate, AcceptedStyles, CultureInfo.InvariantCulture, out parsedValue);
    return parsed;
}
/// <summary>
/// Advances the stream until it reaches a non-whitespace character or
/// the end of the stream.
/// </summary>
/// <param name="cs">Character stream to advance.</param>
internal static void SkipWhitespace(CharacterStream cs) {
    while (true) {
        if (cs.IsEndOfStream() || !cs.IsWhiteSpace()) {
            return;
        }
        cs.MoveToNextChar();
    }
}
/// <summary>
/// Tries to consume the imaginary part of a complex number at the current
/// position: optional whitespace, one or two sign characters, a number,
/// and a trailing 'i'.
/// </summary>
/// <param name="cs">Character stream positioned right after the real part.</param>
/// <returns>
/// Number of characters consumed from the entry position (leading
/// whitespace included) when an imaginary part is recognized; otherwise
/// zero, with the stream set back to its entry position.
/// </returns>
public static int HandleImaginaryPart(CharacterStream cs) {
    int start = cs.Position;

    // Check if this is actually complex number
    NumberTokenizer.SkipWhitespace(cs);

    if (cs.CurrentChar == '+' || cs.CurrentChar == '-') {
        cs.MoveToNextChar();

        // A second sign character directly after the first is skipped too.
        if (cs.CurrentChar == '+' || cs.CurrentChar == '-') {
            cs.MoveToNextChar();
        }

        int imaginaryLength = NumberTokenizer.HandleNumber(cs);
        if (imaginaryLength > 0 && cs.CurrentChar == 'i') {
            cs.MoveToNextChar();
            return cs.Position - start;
        }
    }

    // Not an imaginary part. Whitespace, signs, or even a complete number
    // may have been consumed above; rewind so that returning zero really
    // means nothing was consumed.
    cs.Position = start;
    return 0;
}
/// <summary>
/// Scans a numeric literal at the current stream position. Recognizes an
/// optional leading sign, hexadecimal literals, decimal integer and
/// fraction parts, an exponent, and a trailing 'L' suffix.
/// </summary>
/// <param name="cs">Character stream positioned at the number candidate.</param>
/// <returns>
/// Length of the number in characters, or zero when the characters do not
/// form a number.
/// </returns>
/// <remarks>
/// When zero is returned the stream may already have been advanced past
/// the characters that were examined; this method does not rewind it.
/// </remarks>
public static int HandleNumber(CharacterStream cs) {
    int start = cs.Position;

    if (cs.CurrentChar == '-' || cs.CurrentChar == '+') {
        cs.MoveToNextChar();
    }

    // Hexadecimal literal. Both the lowercase and the uppercase prefix
    // ('0x' and '0X') are accepted.
    if (cs.CurrentChar == '0' && (cs.NextChar == 'x' || cs.NextChar == 'X')) {
        cs.Advance(2);
        return HandleHex(cs, start);
    }

    if (cs.CurrentChar == 'x' && CharacterStream.IsHex(cs.NextChar)) {
        cs.MoveToNextChar();
        return HandleHex(cs, start);
    }

    int integerPartLength = 0;
    int fractionPartLength = 0;
    bool isDouble = false;

    // Collect decimals (there may be none, like in .1e+20).
    while (cs.IsDecimal()) {
        cs.MoveToNextChar();
        integerPartLength++;
    }

    if (cs.CurrentChar == '.') {
        isDouble = true;

        // float/double
        cs.MoveToNextChar();

        // If we've seen a dot we need to collect the fractional part, if any.
        while (cs.IsDecimal()) {
            cs.MoveToNextChar();
            fractionPartLength++;
        }
    }

    if (integerPartLength + fractionPartLength == 0) {
        return 0; // +e or +.e is not a number and neither is lonely + or -
    }

    int numberLength;
    if (cs.CurrentChar == 'e' || cs.CurrentChar == 'E') {
        isDouble = true;
        numberLength = HandleExponent(cs, start);
    } else {
        numberLength = cs.Position - start;
    }

    // Verify double format
    if (isDouble && !IsValidDouble(cs, start, cs.Position)) {
        numberLength = 0;
    }

    if (numberLength > 0) {
        // Skip over trailing 'L' if any.
        if (cs.CurrentChar == 'L') {
            cs.MoveToNextChar();
            numberLength++;
        }
    }

    return numberLength;
}
/// <summary>
/// Consumes the exponent of a number. The stream must be positioned at
/// the 'e' or 'E' character. An exponent is an optional sign followed by
/// at least one decimal digit.
/// </summary>
/// <param name="cs">Character stream positioned at the exponent marker.</param>
/// <param name="start">Position where the whole number token began.</param>
/// <returns>
/// Length of the number measured from <paramref name="start"/>, or zero
/// when the exponent is malformed: nothing follows the marker, there are
/// no digits, or a letter other than 'i' or 'L' follows the digits.
/// </returns>
internal static int HandleExponent(CharacterStream cs, int start) {
    Debug.Assert(cs.CurrentChar == 'E' || cs.CurrentChar == 'e');

    cs.MoveToNextChar();
    if (cs.IsWhiteSpace() || cs.IsEndOfStream()) {
        // 0.1E or 1e
        return 0;
    }

    if (cs.CurrentChar == '-' || cs.CurrentChar == '+') {
        cs.MoveToNextChar();
    }

    int digitsStart = cs.Position;

    // collect decimals
    while (cs.IsDecimal()) {
        cs.MoveToNextChar();
    }

    // An exponent needs at least one digit whether or not it has a sign,
    // so both '1.0E-' and '1e' followed by a non-digit are rejected.
    if (digitsStart == cs.Position) {
        return 0;
    }

    // Technically if letter or braces follows this is not
    // a number but we'll leave it alone for now.

    // TODO: This code is not language specific and yet it currently
    // handles complex 'i' as well as R-specific 'L' suffix.
    // Ideally this needs to be extended in a way so language-specific
    // tokenizer can specify options or control number format.
    if (char.IsLetter(cs.CurrentChar) && cs.CurrentChar != 'i' && cs.CurrentChar != 'L') {
        return 0;
    }

    return cs.Position - start;
}