/// <summary>
/// Build a DimVariable from the tokens that describe a single variable in a Dim statement. The first token is taken as the variable
/// name. If it is the only token then the variable is returned with a null dimension set. Otherwise the remaining tokens must be
/// wrapped in "(" and ")" - an empty pair of brackets results in an empty dimension list, anything between the brackets is split
/// up by base.getEntryList and each resulting entry is wrapped in an Expression.
/// </summary>
/// <exception cref="ArgumentNullException">Thrown if tokens is null</exception>
/// <exception cref="ArgumentException">Thrown if tokens is empty</exception>
/// <exception cref="Exception">Thrown for null or unsupported tokens or if the brackets are missing</exception>
private DimStatement.DimVariable translateRawVariableData(List<IToken> tokens)
{
    if (tokens == null)
    {
        throw new ArgumentNullException("tokens");
    }
    if (tokens.Count == 0)
    {
        throw new ArgumentException("zero tokens - invalid");
    }

    // Every token must be non-null and one of the token types accepted here
    for (var tokenIndex = 0; tokenIndex < tokens.Count; tokenIndex++)
    {
        var candidate = tokens[tokenIndex];
        if (candidate == null)
        {
            throw new Exception("Invalid token - null");
        }
        var isAcceptedType = (candidate is AtomToken) || (candidate is DateLiteralToken) || (candidate is StringToken);
        if (!isAcceptedType)
        {
            throw new Exception("Invalid token - not AtomToken or StringToken");
        }
    }

    var firstToken = tokens[0];
    List<Expression> arrayDimensions;
    if (tokens.Count == 1)
    {
        // Name only: there is no dimension information at all
        arrayDimensions = null;
    }
    else
    {
        // Anything after the name has to be bracketed, which requires at least an opening and a closing token
        var lastToken = tokens[tokens.Count - 1];
        if ((tokens.Count == 2) || (tokens[1].Content != "(") || (lastToken.Content != ")"))
        {
            throw new Exception("Invalid token sequence");
        }

        arrayDimensions = new List<Expression>();
        if (tokens.Count > 3)
        {
            // There is content between the brackets; let base.getEntryList split it into one token set per dimension
            var closingBracket = AtomToken.GetNewToken(")", firstToken.LineIndex);
            List<List<IToken>> dimensionTokenSets = base.getEntryList(tokens, 2, closingBracket);
            foreach (List<IToken> dimensionTokens in dimensionTokenSets)
            {
                arrayDimensions.Add(new Expression(dimensionTokens));
            }
        }
    }

    // The NameToken is only constructed once all of the validation above has passed
    return new DimStatement.DimVariable(
        new NameToken(firstToken.Content, firstToken.LineIndex),
        arrayDimensions
    );
}
// =======================================================================================
// VBScript BASE SOURCE RE-GENERATION
// =======================================================================================
/// <summary>
/// Re-generate equivalent VBScript source code for this block - there should not be a line return at the end of the content.
/// The output is the ValueToSet tokens, an "=" token and the Expression tokens, preceded by a "Set" token when ValueSetType
/// is ValueSetTypeOptions.Set.
/// </summary>
public string GenerateBaseSource(SourceRendering.ISourceIndentHandler indenter)
{
    // Rendering is delegated to a temporary Statement built from this instance's tokens, so that Statement's GenerateBaseSource
    // can be re-used rather than any of its token-rendering logic being duplicated here
    var equalsToken = AtomToken.GetNewToken(
        "=",
        ValueToSet.Tokens.Last().LineIndex
    );
    var statementTokens = ValueToSet.Tokens
        .Concat(new[] { equalsToken })
        .Concat(Expression.Tokens)
        .ToList();

    var isObjectAssignment = (ValueSetType == ValueSetTypeOptions.Set);
    if (isObjectAssignment)
    {
        var setKeyword = AtomToken.GetNewToken("Set", ValueToSet.Tokens.First().LineIndex);
        statementTokens.Insert(0, setKeyword);
    }

    var renderableStatement = new Statement(statementTokens, Statement.CallPrefixOptions.Absent);
    return renderableStatement.GenerateBaseSource(indenter);
}
// =======================================================================================
// VBScript BASE SOURCE RE-GENERATION
// =======================================================================================
/// <summary>
/// Re-generate equivalent VBScript source code for this block - there should not be a line return at the end of the content.
/// The indenter's Indent value is written first, followed by each token's content (with a "Call" token inserted at the start
/// if CallPrefix is Present). String tokens are wrapped in double quotes (with any quotes in their content doubled-up) and
/// date literal tokens are wrapped in hashes. Trailing whitespace is trimmed from the result.
/// </summary>
public string GenerateBaseSource(SourceRendering.ISourceIndentHandler indenter)
{
    var tokensList = Tokens.ToList();
    if (CallPrefix == CallPrefixOptions.Present)
    {
        tokensList.Insert(0, AtomToken.GetNewToken("Call", tokensList[0].LineIndex));
    }

    var output = new StringBuilder();
    output.Append(indenter.Indent);
    for (int index = 0; index < tokensList.Count; index++)
    {
        var token = tokensList[index];
        if (token is StringToken)
        {
            // The content of a string token does not include its wrapping quotes and any quote characters within it are
            // stored singly, so they must be doubled-up again here or the regenerated source would terminate the string
            // literal early (eg. the content a"b must be written as "a""b")
            output.Append('"');
            output.Append((token.Content ?? "").Replace("\"", "\"\""));
            output.Append('"');
        }
        else if (token is DateLiteralToken)
        {
            output.Append('#');
            output.Append(token.Content);
            output.Append('#');
        }
        else
        {
            output.Append(token.Content);
        }

        // Nothing needs to follow the final token
        var nextToken = (index < (tokensList.Count - 1)) ? tokensList[index + 1] : null;
        if (nextToken == null)
        {
            continue;
        }

        // No space is written after a member accessor / decimal point or an opening brace, nor before a member accessor /
        // decimal point, an argument separator or a brace of either kind
        var suppressSpace =
            (token is MemberAccessorOrDecimalPointToken) ||
            (token is OpenBrace) ||
            (nextToken is MemberAccessorOrDecimalPointToken) ||
            (nextToken is ArgumentSeparatorToken) ||
            (nextToken is OpenBrace) ||
            (nextToken is CloseBrace);
        if (!suppressSpace)
        {
            output.Append(' ');
        }
    }
    return output.ToString().TrimEnd();
}
/// <summary>
/// Rewrite a token stream in two passes. Firstly, runs of consecutive addition / subtraction operators are condensed into a
/// single operator (which is dropped entirely if it is redundant). Secondly, adjacent tokens that together form a two-character
/// comparison operator ("&lt;&gt;", "&lt;=", "&gt;=") are merged into a single token. String literal tokens never take part
/// in the second pass, regardless of their content.
/// </summary>
/// <exception cref="ArgumentNullException">Thrown if tokens is null</exception>
/// <exception cref="ArgumentException">Thrown if tokens contains a null reference</exception>
public static IEnumerable<IToken> Combine(IEnumerable<IToken> tokens)
{
    if (tokens == null)
    {
        throw new ArgumentNullException("tokens");
    }

    // Handle +/- sign combinations
    var additionSubtractionRewrittenTokens = new List<IToken>();
    var buffer = new List<OperatorToken>();
    var previousTokenIfAny = (IToken)null;
    foreach (var token in tokens)
    {
        if (token == null)
        {
            throw new ArgumentException("Null reference encountered in tokens set");
        }

        // Addition / subtraction operators are gathered up until a token of any other kind is encountered
        var combinableOperator = TryToGetAsAdditionOrSubtractionToken(token);
        if (combinableOperator != null)
        {
            buffer.Add(combinableOperator);
            continue;
        }

        var bufferHadContentThatWasReducedToNothing = false;
        if (buffer.Any())
        {
            var condensedToken = CondenseNegations(buffer);
            if (IsTokenRedundant(condensedToken, previousTokenIfAny))
            {
                // If this is a "+" and the last token was an OperatorToken, then this one is redundant (eg. "1 * +1")
                bufferHadContentThatWasReducedToNothing = true;
            }
            else
            {
                additionSubtractionRewrittenTokens.Add(condensedToken);
            }
            buffer.Clear();
        }

        // When a minus-sign/addition-sign buffer is flattened and can be reduced to nothing, if the next token is a numeric value
        // then we need to apply a bit of a dirty hack since VBScript gives numeric literals special treatment in some cases but
        // does not consider --1 to be a numeric literal (for example). So we can not replace --1 with 1 since it would change the
        // meaning of some code. To illustrate, consider the following:
        //   If ("a" = 1) Then
        //   If ("a" = --1) Then
        //   If ("a" = +-1) Then
        // The first example will result in a Type Mismatch since the numeric literal forces the "a" to be parsed as a number
        // (which fails). However, the second and third examples return false since their right hand side values are not considered
        // to be numeric literals and so the left hand sides need not be parsed as numeric values. The workaround is to identify
        // these situations and to wrap the number in a CInt/CLng/CDbl call. So long as the appropriate function is used, this will
        // not affect the numeric value but it will prevent it from being identified as a numeric literal later on (this is
        // important to the StatementTranslator). Note: This is why the NumberRebuilder must have done its work before we get here,
        // since ++1.2 must be recognised as "+", "+", "1.2" so that it can be translated into "CDbl(1.2)", rather than still being
        // "+", "+", "1", ".", "2", which would be translated into "CDbl(1).2", which would be invalid.
        var numericValueToken = token as NumericValueToken;
        var wrapTokenInNumberFunctionCall = bufferHadContentThatWasReducedToNothing && (numericValueToken != null);
        if (wrapTokenInNumberFunctionCall)
        {
            additionSubtractionRewrittenTokens.Add(new BuiltInFunctionToken(numericValueToken.GetSafeWrapperFunctionName(), token.LineIndex));
            additionSubtractionRewrittenTokens.Add(new OpenBrace(token.LineIndex));
        }
        additionSubtractionRewrittenTokens.Add(token);
        if (wrapTokenInNumberFunctionCall)
        {
            additionSubtractionRewrittenTokens.Add(new CloseBrace(token.LineIndex));
        }
        previousTokenIfAny = token;
    }
    if (buffer.Any())
    {
        // Note: We don't need to copy all of the logic from above - in fact we can't, since we don't have a current token reference
        var condensedToken = CondenseNegations(buffer);
        if (!IsTokenRedundant(condensedToken, previousTokenIfAny))
        {
            additionSubtractionRewrittenTokens.Add(condensedToken);
        }
    }

    // Handle comparison token combinations (eg. ">", "=" to ">=")
    var comparisonRewrittenTokens = new List<IToken>();
    for (var index = 0; index < additionSubtractionRewrittenTokens.Count; index++)
    {
        var token = additionSubtractionRewrittenTokens[index];
        var nextToken = (index < (additionSubtractionRewrittenTokens.Count - 1))
            ? additionSubtractionRewrittenTokens[index + 1]
            : null;

        // The match is made on token content, so string literals must be explicitly excluded - otherwise a literal whose content
        // happens to be "<" or ">" (eg. the left hand side of ("<" = x)) would be merged with the operator that follows it and
        // the string value would be lost
        var combineTokens =
            (nextToken != null) &&
            !(token is StringToken) &&
            !(nextToken is StringToken) &&
            (
                ((token.Content == "<") && (nextToken.Content == ">")) ||
                ((token.Content == ">") && (nextToken.Content == "=")) ||
                ((token.Content == "<") && (nextToken.Content == "="))
            );
        if (combineTokens)
        {
            comparisonRewrittenTokens.Add(AtomToken.GetNewToken(token.Content + nextToken.Content, token.LineIndex));
            index++; // The next token has been consumed as part of the combined operator
            continue;
        }
        comparisonRewrittenTokens.Add(token);
    }
    return comparisonRewrittenTokens;
}
/// <summary>
/// Break down scriptContent into a combination of StringToken, DateLiteralToken, CommentToken, InlineCommentToken, UnprocessedContentToken,
/// AtomToken (for square-bracket-escaped variable names) and EndOfStatementSameLineToken instances (the end of statement tokens will not
/// have been comprehensively handled). This will never return null nor a set containing any null references.
/// </summary>
/// <exception cref="ArgumentNullException">Thrown if scriptContent is null</exception>
/// <exception cref="ArgumentOutOfRangeException">Thrown if a string, escaped variable name or date literal is still open when the content ends</exception>
/// <exception cref="ArgumentException">Thrown if the content of a date literal is rejected by the limited date parser</exception>
/// <exception cref="Exception">Thrown if a line return is encountered within a string, escaped variable name or date literal</exception>
public static IEnumerable<IToken> SegmentString(string scriptContent)
{
    if (scriptContent == null)
    {
        throw new ArgumentNullException("scriptContent");
    }

    // Normalise line returns
    scriptContent = scriptContent.Replace("\r\n", "\n").Replace('\r', '\n');

    var index = 0;
    var tokenContent = "";
    var tokens = new List<IToken>();
    var lineIndex = 0;
    var lineIndexForStartOfContent = 0;
    while (index < scriptContent.Length)
    {
        var chr = scriptContent.Substring(index, 1);

        // Check for comment
        bool isComment;
        if (chr == "'")
        {
            isComment = true;
        }
        else if (index <= (scriptContent.Length - 3))
        {
            var threeChars = scriptContent.Substring(index, 3);
            var fourthChar = (index == scriptContent.Length - 3) ? (char?)null : scriptContent[index + 3];

            // "REM" only starts a comment if it is a word in its own right. It must not be the tail end of a longer name (otherwise
            // "theorem = 1" would have everything from "rem" onwards swallowed as a comment) and it must be followed by whitespace, a
            // line return or the end of the content ("REM" on a line on its own is an empty comment, just as "REM" at the very end
            // of the content is).
            var previousChar = (index == 0) ? (char?)null : scriptContent[index - 1];
            var continuesLongerName = (previousChar != null) && (char.IsLetterOrDigit(previousChar.Value) || (previousChar.Value == '_'));
            var isFollowedByBreak = (fourthChar == null) || (fourthChar.Value == '\n') || _whiteSpaceCharsExceptLineReturn.Contains(fourthChar.Value);
            if (!continuesLongerName && threeChars.Equals("REM", StringComparison.InvariantCultureIgnoreCase) && isFollowedByBreak)
            {
                isComment = true;
                index += 2; // Move to the last character of the marker so that the "index++" below moves past all of it
            }
            else
            {
                isComment = false;
            }
        }
        else
        {
            isComment = false;
        }

        if (isComment)
        {
            // Store any previous token content
            bool isInlineComment;
            if (tokenContent != "")
            {
                // If there has been any content on the same line as this comment, then this is an inline comment
                var contentAfterLastLineReturn = tokenContent.Split('\n').Last();
                isInlineComment = (contentAfterLastLineReturn.Trim() != "");
                tokens.Add(new UnprocessedContentToken(tokenContent, lineIndexForStartOfContent));
                tokenContent = "";
            }
            else
            {
                isInlineComment = false;
            }

            // Move past comment marker and look for end of comment (end of the line) then store in a CommentToken instance
            // - Note: Always want an end-of-statement token to appear before comments, so ensure this is the case (if the previous token
            //   was a Comment it doesn't matter, if the previous token was a String or Date literal we'll definitely need an end-of-
            //   statement, if the previous was Unprocessed, we only need end-of-statement if the content didn't end with a line-return)
            lineIndexForStartOfContent = lineIndex;
            index++;
            int breakPoint = scriptContent.IndexOf("\n", index);
            if (breakPoint == -1)
            {
                breakPoint = scriptContent.Length;
            }
            if (tokens.Count > 0)
            {
                var prevToken = tokens[tokens.Count - 1];
                if (prevToken is UnprocessedContentToken)
                {
                    // UnprocessedContentToken MAY conclude with end-of-statement content, we'll need to check
                    if (!prevToken.Content.TrimEnd(_whiteSpaceCharsExceptLineReturn).EndsWith("\n"))
                    {
                        // Replace the token with a copy that has no trailing tabs / spaces (dropping it entirely if nothing else
                        // remains) and follow it with an end-of-statement token
                        tokens.RemoveAt(tokens.Count - 1);
                        var unprocessedContentToRecord = prevToken.Content.TrimEnd('\t', ' ');
                        if (unprocessedContentToRecord != "")
                        {
                            tokens.Add(new UnprocessedContentToken(unprocessedContentToRecord, prevToken.LineIndex));
                            tokens.Add(new EndOfStatementSameLineToken(prevToken.LineIndex));
                        }
                    }
                }
            }
            if (tokens.Any() && ((tokens.Last() is DateLiteralToken) || (tokens.Last() is StringToken)))
            {
                // Quoted literals (ie. string or date) CAN'T contain end-of-statement content so we'll definitely need an end-of-statement
                // token. Note: This has to be done after the above work in case there was a literal token then some whitespace (which is
                // removed above) then a Comment. If the work above wasn't done before this check then "prevToken" would not be a
                // StringToken, it would be the whitespace - but that would be removed and then the literal would be arranged right next
                // to the Comment, without an end-of-statement token between them!
                tokens.Add(new EndOfStatementSameLineToken(lineIndexForStartOfContent));
            }
            var commentContent = scriptContent.Substring(index, breakPoint - index);
            if (isInlineComment)
            {
                tokens.Add(new InlineCommentToken(commentContent, lineIndexForStartOfContent));
            }
            else
            {
                tokens.Add(new CommentToken(commentContent, lineIndexForStartOfContent));
            }

            // Leave the index on the line return (if any) that ended the comment, the "index++" below will move past it - this means
            // that the line index must be incremented here since the check at the bottom of the loop will not see that line return
            index = breakPoint;
            lineIndex++;
            lineIndexForStartOfContent = lineIndex;
        }

        // Check for string content
        else if (chr == "\"")
        {
            // Store any previous token content
            if (tokenContent != "")
            {
                tokens.Add(new UnprocessedContentToken(tokenContent, lineIndexForStartOfContent));
                tokenContent = "";
            }

            // Try to grab string content
            lineIndexForStartOfContent = lineIndex;
            var indexString = index + 1;
            while (true)
            {
                if (indexString >= scriptContent.Length)
                {
                    // Without this check, the Substring call below would fail with a message that says nothing about the real problem
                    throw new ArgumentOutOfRangeException(
                        "scriptContent",
                        "Encountered end of content in unterminated string content starting on line " + (lineIndexForStartOfContent + 1)
                    );
                }
                chr = scriptContent.Substring(indexString, 1);
                if (chr == "\n")
                {
                    throw new Exception("Encountered line return in string content around line " + (lineIndexForStartOfContent + 1));
                }
                if (chr != "\"")
                {
                    tokenContent += chr;
                }
                else
                {
                    // Quote character - is it doubled (ie. escaped quote)?
                    string chrNext;
                    if (indexString < (scriptContent.Length - 1))
                    {
                        chrNext = scriptContent.Substring(indexString + 1, 1);
                    }
                    else
                    {
                        chrNext = null;
                    }
                    if (chrNext == "\"")
                    {
                        // Escaped quote: push past and add single chr to content
                        indexString++;
                        tokenContent += "\"";
                    }
                    else
                    {
                        // Non-escaped quote: string end
                        tokens.Add(new StringToken(tokenContent, lineIndexForStartOfContent));
                        tokenContent = "";
                        lineIndexForStartOfContent = lineIndex;
                        index = indexString;
                        break;
                    }
                }
                indexString++;
            }
        }

        // Check for crazy VBScript escaped-name variable content
        // - It's acceptable to name a variable pretty much anything if it's wrapped in square brackets; seems to be any character other than
        //   line returns and a closing square bracket (since there is no support for escaping the closing bracket). This includes single and
        //   double quotes, whitespace, colons, numbers, underscores, anything - in fact a valid variable name is [ ], meaning a single space
        //   wrapped in square brackets! This is a little-known feature but it shouldn't be too hard to parse out at this point.
        else if (chr == "[")
        {
            // Store any previous token content
            if (tokenContent != "")
            {
                tokens.Add(new UnprocessedContentToken(tokenContent, lineIndexForStartOfContent));
            }

            lineIndexForStartOfContent = lineIndex;
            tokenContent = "[";
            var indexString = index + 1;
            while (true)
            {
                if (indexString >= scriptContent.Length)
                {
                    throw new ArgumentOutOfRangeException(
                        "scriptContent",
                        "Encountered end of content in unterminated escaped-content variable name starting on line " + (lineIndexForStartOfContent + 1)
                    );
                }
                chr = scriptContent.Substring(indexString, 1);
                if (chr == "\n")
                {
                    throw new Exception("Encountered line return in escaped-content variable name");
                }
                tokenContent += chr;
                if (chr == "]")
                {
                    // The square brackets are kept as part of the token's content
                    tokens.Add(AtomToken.GetNewToken(tokenContent, lineIndexForStartOfContent));
                    tokenContent = "";
                    lineIndexForStartOfContent = lineIndex;
                    index = indexString;
                    break;
                }
                indexString++;
            }
        }

        // VBScript supports date literals, wrapped in hashes. These introduce a range of complications - such as literal comparisons requiring
        // special logic, as string and number literals do - eg. ("a" = #2015-5-27#) will fail at runtime as "a" must be parse-able as a date,
        // and it isn't. It also has complications around culture - so the value #1 5 2015# must be parsed as 2015-5-1 in the UK when the
        // translated output is executed but as 2015-1-5 in the US. On top of that, VBScript is very flexible in its acceptance of date formats -
        // amongst these problems is that the year is optional and so #1 5# means 1st of May or 5th of January (depending upon culture) in the
        // current year - however, once a date literal has had a default year set for a given request it must stick to that year; so if the
        // request is unfortunate enough to be slow and cross years, a given date literal must consistently stick to using the year from when
        // the request started. When a new request starts, however, if the year has changed then that new request must default to that new
        // year, it would be no good if the year was determined once (at translation time) and then never changed, since this would be
        // inconsistent with VBScript's behaviour of treating each request as a whole new start-up / serve / tear-down process. This means that
        // the value #29 2# will change by year, being the 29th of February if the current year is a leap year and the 1st of February 2029 if
        // not (since #29 2# will be interpreted as year 29 and month 2 since 29 could not be a valid month - and then 29 will be treated as a
        // two-digit year which must be bumped up to 2029). Also note that even in the US #29 2# will be interpreted as the 29th of February (or
        // 1st of February 2029) since there is no way to parse that as a month-then-day format.
        // - Note: This gets the lowest priority in terms of wrapping characters, so [#1 1#] is a variable name and not something containing a
        //   date, likewise "#1 1#" is a string and nothing to do with a date. There are no escape characters. If the wrapped value can not
        //   possibly be valid then an exception will be raised at this point.
        else if (chr == "#")
        {
            // Store any previous token content
            if (tokenContent != "")
            {
                tokens.Add(new UnprocessedContentToken(tokenContent, lineIndexForStartOfContent));
            }

            lineIndexForStartOfContent = lineIndex;
            tokenContent = "";
            var indexString = index + 1;
            while (true)
            {
                if (indexString >= scriptContent.Length)
                {
                    throw new ArgumentOutOfRangeException(
                        "scriptContent",
                        "Encountered end of content in unterminated date literal content starting on line " + (lineIndexForStartOfContent + 1)
                    );
                }
                chr = scriptContent.Substring(indexString, 1);
                if (chr == "\n")
                {
                    throw new Exception("Encountered line return in date literal content");
                }
                if (chr == "#")
                {
                    // We can only catch certain kinds of invalid date literal format here since some formats are culture-dependent (eg.
                    // "1 May 2010" is valid in English but not in French) and I don't want to assume that translated programs are running
                    // with the same culture as the translation process. The "limitedDateParser" can catch some invalid formats, which is
                    // better than nothing, but others will have to be checked at runtime (see the notes around the instantiation of the
                    // limitedDateParser).
                    try
                    {
                        _limitedDateParser.Parse(tokenContent);
                    }
                    catch (Exception e)
                    {
                        // The line index is zero-based, it is reported as a one-based line number (as the string content error above is)
                        throw new ArgumentException("Invalid date literal content encountered on line " + (lineIndex + 1) + ": #" + tokenContent + "#", e);
                    }
                    tokens.Add(new DateLiteralToken(tokenContent, lineIndexForStartOfContent));
                    tokenContent = "";
                    lineIndexForStartOfContent = lineIndex;
                    index = indexString;
                    break;
                }
                else
                {
                    tokenContent += chr;
                }
                indexString++;
            }
        }

        // Mustn't be neither comment, string, date nor VBScript-escaped-variable-name..
        else
        {
            tokenContent += chr;
        }

        // Move to next character (if any)..
        index++;
        if (chr == "\n")
        {
            lineIndex++;
        }
    }

    // Don't let any unhandled content get away!
    if (tokenContent != "")
    {
        tokens.Add(new UnprocessedContentToken(tokenContent, lineIndexForStartOfContent));
    }

    return tokens;
}
/// <summary>
/// Split the content of an UnprocessedContentToken into individual tokens, each created through AtomToken.GetNewToken, and then pass the
/// set through handleLineReturnCancels. Whitespace (other than line returns) separates tokens and is discarded, while the characters in
/// TokenBreakChars (and line-continuation underscores) separate tokens and are also recorded as tokens themselves.
/// </summary>
/// <exception cref="ArgumentNullException">Thrown if token is null</exception>
public static IEnumerable<IToken> BreakUnprocessedToken(UnprocessedContentToken token)
{
    if (token == null)
    {
        throw new ArgumentNullException("token");
    }

    var content = token.Content;
    var currentLineIndex = token.LineIndex;
    var pendingContent = "";
    var tokens = new List<IToken>();
    for (var position = 0; position < content.Length; position++)
    {
        var current = content.Substring(position, 1);

        // Whitespace that is not a line return ends whatever content has been gathered so far (if any) but is not itself recorded
        if (char.IsWhiteSpace(current, 0) && (current != "\n"))
        {
            if (pendingContent != "")
            {
                tokens.Add(AtomToken.GetNewToken(pendingContent, currentLineIndex));
            }
            pendingContent = "";
            continue;
        }

        // An underscore is a line return continuation character if it follows whitespace (which makes it a token-breaker) but it must be
        // part of a variable name if it is not preceded by whitespace (in which case it belongs to the current token)
        var breaksToken =
            (TokenBreakChars.IndexOf(current) != -1) ||
            ((current == "_") && (position > 0) && char.IsWhiteSpace(content, position - 1));

        // An "&" may be a string concatenation or it may be the start of a hex number (eg. "&h001"). If it's the latter then the content
        // should be represented as a single token "&h001" and so the "&" must not be broken out on its own.
        var startsHexNumber = false;
        if (breaksToken && (current == "&") && (position <= (content.Length - 3)))
        {
            var firstFollowing = content.Substring(position + 1, 1);
            var secondFollowing = content.Substring(position + 2, 1);
            startsHexNumber =
                firstFollowing.Equals("H", StringComparison.InvariantCultureIgnoreCase) &&
                ("0123456789".IndexOf(secondFollowing) != -1);
        }

        if (breaksToken && !startsHexNumber)
        {
            // Unlike with whitespace, the break character is kept - any content gathered so far gets its own token and then the break
            // character gets one too
            if (pendingContent != "")
            {
                tokens.Add(AtomToken.GetNewToken(pendingContent, currentLineIndex));
            }
            tokens.Add(AtomToken.GetNewToken(current, currentLineIndex));
            pendingContent = "";
        }
        else
        {
            pendingContent += current;
        }

        // The line index is only moved on after the line return itself has been dealt with
        if (current == "\n")
        {
            currentLineIndex++;
        }
    }
    if (pendingContent != "")
    {
        tokens.Add(AtomToken.GetNewToken(pendingContent, currentLineIndex));
    }

    // Handle ignore-line-return / end-of-statement combinations
    return handleLineReturnCancels(tokens);
}