/// <summary>
/// Consumes a Delphi-style ("//") comment starting at the current position.
/// Block comments encountered before the line break are resolved as inner
/// tokens; each of them must itself be single-line or the input is malformed.
/// </summary>
private static CommentToken ResolveDelphiComment(LexerContext context)
{
    var start = context.GetIndex();

    // Skip the two leading slashes.
    context.IncIndex();
    context.IncIndex();

    var nested = new List<CommentToken>();
    while (!context.IsEnded() && !context.IsNewLineNow())
    {
        if (CommentToken.IsOldStyleCommentBegin(context))
        {
            nested.Add(ResolveOldStyleComment(context));
        }
        else if (CommentToken.IsTurboPascalCommentBegin(context))
        {
            nested.Add(ResolveTurboPascalComment(context));
        }
        else
        {
            context.IncIndex();
        }
    }

    // A "//" comment ends at the line break, so every nested comment must fit on this line.
    if (nested.Any(c => !c.IsSingleLineComment))
    {
        throw new SyntaxErrorException();
    }

    return new CommentToken(true, nested, start, context.GetIndex());
}
/// <summary>
/// Resolves the token starting at the current position by probing each token
/// category in a fixed order: comment, number, string, special symbol,
/// identifier. Anything else is a syntax error.
/// </summary>
private static IToken Resolve(LexerContext context)
{
    if (CommentToken.IsCommentBegin(context))
    {
        return ResolveComment(context);
    }

    if (NumberToken.IsNumberBegin(context))
    {
        return ResolveNumber(context);
    }

    if (StringToken.IsStringBegin(context))
    {
        return ResolveString(context);
    }

    if (SpecialSymbolToken.IsSpecial(context))
    {
        return ResolveSpecial(context);
    }

    if (IdentifierToken.IsIdentifierBegin(context))
    {
        return ResolveIdentifier(context);
    }

    throw new SyntaxErrorException();
}
/// <summary>
/// Extracts the document version from the header comment (e.g. "PDF-1.7" or
/// "FDF-1.2"), records the offset at which the comment started, and rewinds
/// the scanner to the start of input.
/// </summary>
private static HeaderVersion GetHeaderVersionAndResetScanner(CommentToken comment, ISeekableTokenScanner scanner, bool isLenientParsing, ILog log)
{
    var data = comment.Data;

    // The header comment must begin with "PDF-1." or "FDF-1." (case-insensitive).
    var isPdfHeader = data.IndexOf("PDF-1.", StringComparison.OrdinalIgnoreCase) == 0;
    var isFdfHeader = data.IndexOf("FDF-1.", StringComparison.OrdinalIgnoreCase) == 0;
    if (!isPdfHeader && !isFdfHeader)
    {
        return HandleMissingVersion(comment, isLenientParsing, log);
    }

    // Skip the 4-character "PDF-"/"FDF-" prefix and parse the remainder ("1.x") as a decimal.
    const int toDecimalStartLength = 4;
    if (!decimal.TryParse(data.Substring(toDecimalStartLength), NumberStyles.Number, CultureInfo.InvariantCulture, out var version))
    {
        return HandleMissingVersion(comment, isLenientParsing, log);
    }

    // Back up from the scanner's current position over the comment text plus the
    // token overhead: 1 byte when the scanner stopped at the end of input, 2 otherwise.
    var rewind = scanner.CurrentPosition == scanner.Length ? 1 : 2;
    var commentOffset = scanner.CurrentPosition - data.Length - rewind;

    scanner.Seek(0);

    return new HeaderVersion(version, data, commentOffset);
}
public void CommentToken_Uniqueness()
{
    var first = new CommentToken
    {
        Content = new StringSlice("comment"),
        Indent = 0,
        ContentStartPosition = 0,
        ContentEndPosition = 6,
        IsClosed = true,
        TagStartPosition = 0,
        TagEndPosition = 6
    };
    var second = new CommentToken
    {
        Content = new StringSlice("comment"),
        Indent = 0,
        ContentStartPosition = 0,
        ContentEndPosition = 6,
        IsClosed = true,
        TagStartPosition = 0,
        TagEndPosition = 6
    };

    // Never equal to null; two identically-initialized tokens compare
    // equal and hash identically.
    Assert.False(first.Equals(null));
    Assert.Equal(first, second);
    Assert.Equal(first.GetHashCode(), second.GetHashCode());
}
public void Test_3_5_2_Comments_B()
{
    // Sequential comments
    Scanner scanner = this.GetLexer("\" This is a comment \"\" and another comment \"");

    // 1st comment
    object scanned = scanner.GetToken();
    Assert.IsInstanceOfType(scanned, typeof(CommentToken));
    CommentToken comment = (CommentToken)scanned;
    Assert.IsTrue(comment.IsValid);
    Assert.IsNull(comment.ScanError);
    Assert.AreEqual(0, comment.StartPosition.Position);
    Assert.AreEqual(20, comment.StopPosition.Position);
    Assert.AreEqual(" This is a comment ", comment.Value);

    // 2nd comment
    scanned = scanner.GetToken();
    Assert.IsInstanceOfType(scanned, typeof(CommentToken));
    comment = (CommentToken)scanned;
    Assert.IsTrue(comment.IsValid);
    Assert.IsNull(comment.ScanError);
    Assert.AreEqual(21, comment.StartPosition.Position);
    Assert.AreEqual(43, comment.StopPosition.Position);
    Assert.AreEqual(" and another comment ", comment.Value);

    // Should be the last one
    scanned = scanner.GetToken();
    Assert.IsInstanceOfType(scanned, typeof(EofToken));
}
public void HtmlParser_OnCommentStateTest()
{
    const string text = "<!-- abcde -->";

    var parser = new HtmlParser();
    parser._cs = new HtmlCharStream(text);
    parser._tokenizer = new HtmlTokenizer(parser._cs);

    parser.CommentFound += delegate(object sender, HtmlParserCommentEventArgs args)
    {
        Assert.True(args.CommentToken is CommentToken);
        CommentToken commentToken = args.CommentToken;

        // The comment spans the entire 14-character input and holds one inner token.
        Assert.Equal(1, commentToken.Count);
        Assert.Equal(0, commentToken.Start);
        Assert.Equal(14, commentToken.End);
        Assert.True(commentToken[0] is HtmlToken);
        Assert.True(commentToken[0] is IExpandableTextRange);
        Assert.Equal(0, commentToken[0].Start);
        Assert.Equal(14, commentToken[0].End);
    };

    parser.OnCommentState();
}
/// <summary>
/// Writes a comment to the output stream: the '%' marker, the comment text
/// encoded as Latin-1, then a line break.
/// </summary>
private static void WriteComment(CommentToken comment, Stream outputStream)
{
    outputStream.WriteByte(Comment);

    var data = OtherEncodings.StringAsLatin1Bytes(comment.Data);
    outputStream.Write(data, 0, data.Length);

    WriteLineBreak(outputStream);
}
/// <summary>
/// Reports an error on the most recent block comment that was opened but
/// never closed, if one exists.
/// </summary>
private void PostBlockCommentMissingClosingError()
{
    var unterminated = _comments.LastOrDefault(comment => comment.IsOpenBlock == true);
    if (unterminated == null)
    {
        return;
    }

    PostError(unterminated, TexlStrings.ErrMissingEndOfBlockComment);
}
/// <summary>
/// Handles a header comment that did not carry a parseable version.
/// Lenient parsing falls back to PDF 1.4 with a warning; strict parsing throws.
/// </summary>
private static HeaderVersion HandleMissingVersion(CommentToken comment, bool isLenientParsing, ILog log)
{
    if (!isLenientParsing)
    {
        throw new PdfDocumentFormatException($"The comment which should have provided the version was in the wrong format: {comment.Data}.");
    }

    log.Warn($"Did not find a version header of the correct format, defaulting to 1.4 since lenient. Header was: {comment.Data}.");
    return new HeaderVersion(1.4m, "PDF-1.4", 0);
}
public void When_HasSingleCommentToken()
{
    var section = new ConfigIniSection();
    var commentToken = new CommentToken(new[] { "; Hey", ";Whats", " ;Up?" }, LineEnding.None);
    section.Tokens.Add(commentToken);

    section.MergeConsecutiveTokens();

    // A lone token has nothing to merge with: it must survive unchanged.
    Assert.That(section.Tokens, Has.Count.EqualTo(1));
    Assert.That(section.Tokens[0], Is.SameAs(commentToken));
    Assert.That(commentToken.GetStringLines(), Is.EquivalentTo(new[] { "; Hey", ";Whats", " ;Up?" }));
}
public void Test_3_5_2_Comments_A()
{
    // A comment token may span multiple lines.
    Scanner scanner = this.GetLexer("\" This is a comment \n with two lines \"");

    object scanned = scanner.GetToken();
    Assert.IsInstanceOfType(scanned, typeof(CommentToken));
    CommentToken comment = (CommentToken)scanned;
    Assert.IsTrue(comment.IsValid);
    Assert.IsNull(comment.ScanError);
    Assert.AreEqual(0, comment.StartPosition.Position);
    Assert.AreEqual(37, comment.StopPosition.Position);
    Assert.AreEqual(" This is a comment \n with two lines ", comment.Value);
}
/// <summary>
/// Consumes a Turbo Pascal "{ ... }" comment starting at the current position.
/// Nested old-style and Delphi comments are resolved as inner tokens. The
/// resulting token counts as single-line only when the comment body and every
/// inner comment are single-line.
/// </summary>
private static CommentToken ResolveTurboPascalComment(LexerContext context)
{
    var begin = context.GetIndex();

    // Skip the opening brace.
    context.IncIndex();

    var inners = new List<CommentToken>();
    var isOneLine = true;

    while (!context.IsEnded())
    {
        if (CommentToken.IsOldStyleCommentBegin(context))
        {
            inners.Add(ResolveOldStyleComment(context));
            continue;
        }

        if (CommentToken.IsDelphiCommentBegin(context))
        {
            inners.Add(ResolveDelphiComment(context));
            continue;
        }

        if (context.IsNewLineNow())
        {
            isOneLine = false;
            for (var i = 0; i < Environment.NewLine.Length; i++)
            {
                context.IncIndex();
            }

            // BUGFIX: the previous version duplicated this branch and then fell
            // through, unconditionally consuming the character after the line
            // break — which could swallow a nested comment begin or the closing
            // brace. Restart the loop so the next character is classified normally.
            continue;
        }

        if (CommentToken.IsTurboPascalCommentEnd(context))
        {
            context.IncIndex();
            break;
        }

        context.IncIndex();
    }

    return new CommentToken(isOneLine && inners.All(c => c.IsSingleLineComment), inners, begin, context.GetIndex());
}
/// <summary>
/// Dispatches on the comment's opening sequence: old-style first, then
/// Turbo Pascal braces; Delphi "//" is the fallback.
/// </summary>
private static CommentToken ResolveComment(LexerContext context)
{
    if (CommentToken.IsOldStyleCommentBegin(context))
    {
        return ResolveOldStyleComment(context);
    }

    if (CommentToken.IsTurboPascalCommentBegin(context))
    {
        return ResolveTurboPascalComment(context);
    }

    return ResolveDelphiComment(context);
}
/// <summary>
/// Tries to match a comment tag from the provided slice
/// </summary>
/// <param name="processor">The processor</param>
/// <param name="slice">The slice</param>
/// <returns>If a comment tag was matched</returns>
public override bool Match(Processor processor, ref StringSlice slice)
{
    var tagStartPosition = slice.Start - processor.CurrentTags.StartTag.Length;

    // Skip any whitespace between the start tag and the comment marker.
    var markerIndex = slice.Start;
    while (slice[markerIndex].IsWhitespace())
    {
        markerIndex++;
    }

    if (slice[markerIndex] != TagId)
    {
        return false;
    }

    slice.Start = markerIndex;

    var token = new CommentToken
    {
        TagStartPosition = tagStartPosition,
        ContentStartPosition = markerIndex + 1,
        IsClosed = false
    };
    processor.CurrentToken = token;

    // Advance until the end tag or the end of the slice.
    while (!slice.IsEmpty && !slice.Match(processor.CurrentTags.EndTag))
    {
        slice.NextChar();
    }

    if (slice.IsEmpty)
    {
        // Unterminated comment: the token stays open.
        return false;
    }

    token.TagEndPosition = slice.Start + processor.CurrentTags.EndTag.Length;
    token.ContentEndPosition = slice.Start;
    token.IsClosed = true;
    slice.Start += processor.CurrentTags.EndTag.Length;
    return true;
}
/// <summary>
/// Attempts to read an FBX ASCII comment (';' through end of line) at the
/// current stream position. Returns false without consuming input when the
/// next character is not ';'.
/// </summary>
public static bool TryParseCommentToken(FbxAsciiFileInfo fbxAsciiFileInfo, out CommentToken commentToken)
{
    var current = fbxAsciiFileInfo.PeekChar();
    if (current != ';')
    {
        commentToken = null;
        return false;
    }

    // Consume characters up to the line end or end of stream.
    var text = new StringBuilder();
    while (!current.IsLineEnd() && !fbxAsciiFileInfo.IsEndOfStream())
    {
        text.Append(fbxAsciiFileInfo.ReadChar());
        current = fbxAsciiFileInfo.PeekChar();
    }

    commentToken = new CommentToken(text.ToString());
    return true;
}
/// <summary>
/// Attempts to read a comment token. Comments start with '%' and run to
/// (but do not include) the end of the line.
/// </summary>
public bool TryTokenize(byte currentByte, IInputBytes inputBytes, out IToken token)
{
    token = null;

    if (currentByte != '%')
    {
        return false;
    }

    var text = new StringBuilder();
    while (inputBytes.MoveNext() && !ReadHelper.IsEndOfLine(inputBytes.CurrentByte))
    {
        text.Append((char)inputBytes.CurrentByte);
    }

    token = new CommentToken(text.ToString());
    return true;
}
public void When_Has3NonConsecutiveTokens()
{
    var section = new ConfigIniSection();
    var firstComment = new CommentToken(new[] { "; Hey", ";Whats", " ;Up?" }, LineEnding.None);
    var whitespace = new WhitespaceToken(new[] { " ", "\t", "" }, LineEnding.None);
    var secondComment = new CommentToken(new[] { ";Baz" }, LineEnding.None);
    section.Tokens.Add(firstComment);
    section.Tokens.Add(whitespace);
    section.Tokens.Add(secondComment);

    section.MergeConsecutiveTokens();

    // The whitespace token separates the two comments, so nothing merges.
    Assert.That(section.Tokens, Has.Count.EqualTo(3));
    Assert.That(section.Tokens[0], Is.SameAs(firstComment));
    Assert.That(section.Tokens[1], Is.SameAs(whitespace));
    Assert.That(section.Tokens[2], Is.SameAs(secondComment));
    Assert.That(firstComment.GetStringLines(), Is.EquivalentTo(new[] { "; Hey", ";Whats", " ;Up?" }));
    Assert.That(whitespace.GetStringLines(), Is.EquivalentTo(new[] { " ", "\t", "" }));
    Assert.That(secondComment.GetStringLines(), Is.EquivalentTo(new[] { ";Baz" }));
}
// NOTE(review): C# tokenizer driver. Each call classifies the next run of input and
// stores a token in 'current': whitespace; "//" and "/* */" comments (with
// EndOfFileInsideCommentToken for an unterminated "/*"); char literals; verbatim
// (@"...") and regular string literals (with NewlineInsideStringToken /
// UnrecognizedStringEscapeToken error cases); identifiers and keywords, including
// '@'-escaped identifiers and true/false/null literals; '{'/'}' scope tokens;
// hex and decimal integer literals with suffix handling; operators/punctuators;
// and '#' preprocessor lines (#if/#endif get dedicated conditional-directive
// tokens). Any unrecognized character yields UnrecognizedToken. Always returns
// true, since error conditions are themselves reported as tokens. The branch
// order is significant (e.g. "@\"" must be tested before '@' identifiers), so
// the code is left untouched.
/* * Method: FindNextToken * * Find the next token. Return 'true' if one was found. False, otherwise. */ override internal bool FindNextToken() { int startPosition = _reader.Position; // Dealing with whitespace? if (_reader.SinkMultipleWhiteSpace()) { current = new WhitespaceToken(); return(true); } // Check for one-line comment else if (_reader.Sink("//")) { // Looks like a one-line comment. Follow it to the End-of-line _reader.SinkToEndOfLine(); current = new CommentToken(); return(true); } // Check for multi-line comment else if (_reader.Sink("/*")) { _reader.SinkUntil("*/"); // Was the ending */ found? if (_reader.EndOfLines) { // No. There was a /* without a */. Return this a syntax error token. current = new CSharpTokenizer.EndOfFileInsideCommentToken(); return(true); } current = new CommentToken(); return(true); } // Handle chars else if (_reader.Sink("\'")) { while (_reader.CurrentCharacter != '\'') { if (_reader.Sink("\\")) { /* reader.Skip the escape sequence. * This isn't exactly right. We should detect: * * simple-escape-sequence: one of \' \" \\ \0 \a \b \f \n \r \t \v * * hexadecimal-escape-sequence: * \x hex-digit hex-digit[opt] hex-digit[opt] hex-digit[opt] */ } _reader.SinkCharacter(); } if (_reader.SinkCharacter() != '\'') { Debug.Assert(false, "Code defect in tokenizer: Should have yielded a closing tick."); } current = new CSharpTokenizer.CharLiteralToken(); return(true); } // Check for verbatim string else if (_reader.Sink("@\"")) { do { // Inside a verbatim string "" is treated as a special character while (_reader.Sink("\"\"")) { } }while (!_reader.EndOfLines && _reader.SinkCharacter() != '\"'); // Can't end a file inside a string if (_reader.EndOfLines) { current = new EndOfFileInsideStringToken(); return(true); } // reader.Skip the ending quote. current = new StringLiteralToken(); current.InnerText = _reader.GetCurrentMatchedString(startPosition).Substring(1); return(true); } // Check for a quoted string. 
else if (_reader.Sink("\"")) { while (_reader.CurrentCharacter == '\\' || _reader.MatchRegularStringLiteral()) { // See if we have an escape sequence. if (_reader.SinkCharacter() == '\\') { // This is probably an escape character. if (_reader.SinkStringEscape()) { // This isn't nearly right. We just do barely enough to make a string // with an embedded escape sequence return _some_ string whose start and // end match the real bounds of the string. } else { // This is a compiler error. _reader.SinkCharacter(); current = new CSharpTokenizer.UnrecognizedStringEscapeToken(); return(true); } } } // Is it a newline? if (TokenChar.IsNewLine(_reader.CurrentCharacter)) { current = new CSharpTokenizer.NewlineInsideStringToken(); return(true); } // Create the token. if (_reader.SinkCharacter() != '\"') { Debug.Assert(false, "Defect in tokenizer: Should have yielded a terminating quote."); } current = new StringLiteralToken(); return(true); } // Identifier or keyword? else if ( // From 2.4.2 Identifiers: A '@' can be used to prefix an identifier so that a keyword can be used as an identifier. _reader.CurrentCharacter == '@' || _reader.MatchNextIdentifierStart() ) { if (_reader.CurrentCharacter == '@') { _reader.SinkCharacter(); } // Now, the next character must be an identifier start. if (!_reader.SinkIdentifierStart()) { current = new ExpectedIdentifierToken(); return(true); } // Sink the rest of the identifier. while (_reader.SinkIdentifierPart()) { } string identifierOrKeyword = _reader.GetCurrentMatchedString(startPosition); switch (identifierOrKeyword) { default: if (Array.IndexOf(s_keywordList, identifierOrKeyword) >= 0) { current = new KeywordToken(); return(true); } // If the identifier starts with '@' then we need to strip it off. // The '@' is for escaping so that we can have an identifier called // the same thing as a reserved keyword (i.e. 
class, if, foreach, etc) string identifier = _reader.GetCurrentMatchedString(startPosition); if (identifier.StartsWith("@", StringComparison.Ordinal)) { identifier = identifier.Substring(1); } // Create the token. current = new IdentifierToken(); current.InnerText = identifier; return(true); case "false": case "true": current = new BooleanLiteralToken(); return(true); case "null": current = new CSharpTokenizer.NullLiteralToken(); return(true); } } // Open scope else if (_reader.Sink("{")) { current = new CSharpTokenizer.OpenScopeToken(); return(true); } // Close scope else if (_reader.Sink("}")) { current = new CSharpTokenizer.CloseScopeToken(); return(true); } // Hexidecimal integer literal else if (_reader.SinkIgnoreCase("0x")) { // Sink the hex digits. if (!_reader.SinkMultipleHexDigits()) { current = new ExpectedValidHexDigitToken(); return(true); } // Skip the L, U, l, u, ul, etc. _reader.SinkLongIntegerSuffix(); current = new HexIntegerLiteralToken(); return(true); } // Decimal integer literal else if (_reader.SinkMultipleDecimalDigits()) { // reader.Skip the L, U, l, u, ul, etc. _reader.SinkLongIntegerSuffix(); current = new DecimalIntegerLiteralToken(); return(true); } // Check for single-digit operators and punctuators else if (_reader.SinkOperatorOrPunctuator()) { current = new OperatorOrPunctuatorToken(); return(true); } // Preprocessor line else if (_reader.CurrentCharacter == '#') { if (_reader.Sink("#if")) { current = new OpenConditionalDirectiveToken(); } else if (_reader.Sink("#endif")) { current = new CloseConditionalDirectiveToken(); } else { current = new PreprocessorToken(); } _reader.SinkToEndOfLine(); return(true); } // We didn't recognize the token, so this is a syntax error. _reader.SinkCharacter(); current = new UnrecognizedToken(); return(true); }
/// <summary>
/// Visits a comment token; the default implementation delegates to the
/// generic parse-tree-node handler.
/// </summary>
public virtual T applyToCommentToken(CommentToken operand) => applyToParseTreeNode(operand);
/// <summary>
/// Converts a raw comment token into a typed <c>Comment</c>. Gettext marker
/// characters select the type: ':' reference, '.' extracted, ',' flag,
/// '|' previous; anything else is a translator comment whose marker text is
/// kept as part of the value.
/// </summary>
private static Comment ReadComment(CommentToken token)
{
    CommentType type;
    var value = token.Content;

    switch (token.Type)
    {
        case ":":
            type = CommentType.Reference;
            break;
        case ".":
            type = CommentType.Extracted;
            break;
        case ",":
            type = CommentType.Flag;
            break;
        case "|":
            type = CommentType.Previous;
            break;
        default:
            type = CommentType.Translator;
            value = token.Type + token.Content;
            break;
    }

    return new Comment { Type = type, Value = value };
}
/// <summary>
/// Writes a comment token: the dialect's opening quote, then the comment
/// content, then the closing quote. The write order is significant.
/// </summary>
protected virtual void VisitCommentToken(CommentToken token) { State.Write(this.CommentOpenQuote); VisitToken(token.Content); State.Write(this.CommentCloseQuote); }
/**
 * This is the main lexing algorithm. It consumes source file as string and puts out token list.
 * Token consists of token type and range that token spans.
 */
public static IToken[] Parse(string s)
{
    var tokens = new LinkedList<IToken>();
    var position = 0;

    while (position < s.Length)
    {
        // Recognizers are tried in a fixed order: number, string, identifier,
        // comment, symbol. The first one that matches wins and advances the cursor.
        if (NumberToken.ParseNumber(s, position, out var numberToken, out var afterNumber))
        {
            tokens.AddLast(numberToken);
            position = afterNumber;
            continue;
        }

        if (StringToken.ParseString(s, position, out var stringToken, out var afterString))
        {
            tokens.AddLast(stringToken);
            position = afterString;
            continue;
        }

        if (IdentifierToken.ParseIdentifier(s, position, out var identifierToken, out var afterIdentifier))
        {
            tokens.AddLast(identifierToken);
            position = afterIdentifier;
            continue;
        }

        // A comment may expand into several tokens.
        if (CommentToken.ParseComment(s, position, out var commentTokens, out var afterComment))
        {
            foreach (var commentToken in commentTokens)
            {
                tokens.AddLast(commentToken);
            }

            position = afterComment;
            continue;
        }

        if (SymbolToken.ParseSymbol(s, position, out var symbolToken, out var afterSymbol))
        {
            tokens.AddLast(symbolToken);
            position = afterSymbol;
            continue;
        }

        // Whitespace separates tokens but produces none.
        if (Char.IsWhiteSpace(s[position]))
        {
            position++;
            continue;
        }

        throw new Exception("unknown token " + s[position] + " at position " + position);
    }

    return tokens.ToArray();
}
// NOTE(review): Visual Basic tokenizer driver. Each call classifies the next run of
// input and stores a token in 'current': whitespace with '_' line-continuation
// detection (a '_' followed by at least one newline becomes LineContinuationToken,
// otherwise the position is rewound and plain whitespace is reported); line
// terminators as their own tokens; line comments (' or REM via
// SinkLineCommentStart); identifiers and keywords, including []-escaped
// identifiers, optional trailing type characters, and the "_"/"[_]"/"[]"
// ambiguity errors; hex, octal and decimal integer literals with suffixes;
// '#' preprocessor lines (#If / #End If get conditional-directive tokens);
// separators; operators; and double-quoted strings where "" is an embedded
// quote. Unrecognized input yields UnrecognizedToken. Always returns true.
// Branch order and the save/restore of _reader.Position are significant,
// so the code is left untouched.
/* * Method: FindNextToken * * Find the next token. Return 'true' if one was found. False, otherwise. */ internal override bool FindNextToken() { int startPosition = _reader.Position; // VB docs claim whitespace is Unicode category Zs. However, // this category does not contain tabs. Assuming a less restrictive // definition for whitespace... if (_reader.SinkWhiteSpace()) { while (_reader.SinkWhiteSpace()) { } // Now, we need to check for the line continuation character. if (_reader.SinkLineContinuationCharacter()) // Line continuation is '_' { // Save the current position because we may need to come back here. int savePosition = _reader.Position - 1; // Skip all whitespace after the '_' while (_reader.SinkWhiteSpace()) { } // Now, skip all the newlines. // Need at least one newline for this to count as line continuation. int count = 0; while (_reader.SinkNewLine()) { ++count; } if (count > 0) { current = new VisualBasicTokenizer.LineContinuationToken(); return true; } // Otherwise, fall back to plain old whitespace. _reader.Position = savePosition; } current = new WhitespaceToken(); return true; } // Line terminators are separate from whitespace and are significant. else if (_reader.SinkNewLine()) { // We want one token per line terminator. current = new VisualBasicTokenizer.LineTerminatorToken(); return true; } // Check for a comment--either those that start with ' or rem. else if (_reader.SinkLineCommentStart()) { // Skip to the first EOL. _reader.SinkToEndOfLine(); current = new CommentToken(); return true; } // Identifier or keyword? else if ( // VB allows escaping of identifiers by surrounding them with [] // In other words, // Date is a keyword but, // [Date] is an identifier. _reader.CurrentCharacter == '[' || _reader.MatchNextIdentifierStart() ) { bool escapedIdentifier = false; if (_reader.CurrentCharacter == '[') { escapedIdentifier = true; _reader.SinkCharacter(); // Now, the next character must be an identifier start. 
if (!_reader.SinkIdentifierStart()) { current = new ExpectedIdentifierToken(); return true; } } // Sink the rest of the identifier. while (_reader.SinkIdentifierPart()) { } // If this was an escaped identifier the we need to get the terminating ']'. if (escapedIdentifier) { if (!_reader.Sink("]")) { current = new ExpectedIdentifierToken(); return true; } } else { // Escaped identifiers are not allowed to have trailing type character. _reader.SinkTypeCharacter(); // Type character is optional. } // An identifier that is only a '_' is illegal because it is // ambiguous with line continuation string identifierOrKeyword = _reader.GetCurrentMatchedString(startPosition); if (identifierOrKeyword == "_" || identifierOrKeyword == "[_]" || identifierOrKeyword == "[]") { current = new ExpectedIdentifierToken(); return true; } // Make an upper-case version in order to check whether this may be a keyword. string upper = identifierOrKeyword.ToUpper(CultureInfo.InvariantCulture); switch (upper) { default: if (Array.IndexOf(s_keywordList, upper) >= 0) { current = new KeywordToken(); return true; } // Create the token. current = new IdentifierToken(); // Trim off the [] if this is an escaped identifier. if (escapedIdentifier) { current.InnerText = identifierOrKeyword.Substring(1, identifierOrKeyword.Length - 2); } return true; case "FALSE": case "TRUE": current = new BooleanLiteralToken(); return true; } } // Is it a hex integer? else if (_reader.SinkHexIntegerPrefix()) { if (!_reader.SinkMultipleHexDigits()) { current = new ExpectedValidHexDigitToken(); return true; } // Sink a suffix if there is one. _reader.SinkIntegerSuffix(); current = new HexIntegerLiteralToken(); return true; } // Is it an octal integer? else if (_reader.SinkOctalIntegerPrefix()) { if (!_reader.SinkMultipleOctalDigits()) { current = new VisualBasicTokenizer.ExpectedValidOctalDigitToken(); return true; } // Sink a suffix if there is one. 
_reader.SinkIntegerSuffix(); current = new VisualBasicTokenizer.OctalIntegerLiteralToken(); return true; } // Is it a decimal integer? else if (_reader.SinkMultipleDecimalDigits()) { // Sink a suffix if there is one. _reader.SinkDecimalIntegerSuffix(); current = new DecimalIntegerLiteralToken(); return true; } // Preprocessor line else if (_reader.CurrentCharacter == '#') { if (_reader.SinkIgnoreCase("#if")) { current = new OpenConditionalDirectiveToken(); } else if (_reader.SinkIgnoreCase("#end if")) { current = new CloseConditionalDirectiveToken(); } else { current = new PreprocessorToken(); } _reader.SinkToEndOfLine(); return true; } // Is it a separator? else if (_reader.SinkSeparatorCharacter()) { current = new VisualBasicTokenizer.SeparatorToken(); return true; } // Is it an operator? else if (_reader.SinkOperator()) { current = new OperatorToken(); return true; } // A string? else if (_reader.Sink("\"")) { do { // Inside a verbatim string "" is treated as a special character while (_reader.Sink("\"\"")) { } } while (!_reader.EndOfLines && _reader.SinkCharacter() != '\"'); // Can't end a file inside a string if (_reader.EndOfLines) { current = new EndOfFileInsideStringToken(); return true; } current = new StringLiteralToken(); return true; } // We didn't recognize the token, so this is a syntax error. _reader.SinkCharacter(); current = new UnrecognizedToken(); return true; }
// http://www.w3.org/TR/html5/syntax.html#insert-a-comment
public void InsertComment(CommentToken token, IDocument doc)
{
    //TODO - make sure the steps conform with the specs in the link above.
    // Wrap the token's text in a comment node and append it to the document.
    IComment commentNode = new Comment(doc, token.Comment);
    doc.appendChild(commentNode);
}
// NOTE(review): hand-rolled lexer implemented as an iterator over a SafeStreamReader.
// Tracks absolute position, line number and the lexeme's start column while reading
// one character at a time; non-delimiter characters accumulate in a buffer that is
// flushed to a token (via FlushBuffer) whenever a delimiter is seen. Handles:
// whitespace (with \r\n vs \n newline tokens); '.' disambiguated by peeking into
// a preceding integer/identifier plus '..' (range) vs '.' (dot); '/' into
// '//' comment, '/=' not-equals, or divide; ':' into ':=' assignment or colon;
// '<'/'>' into their compound <=, >= forms; and a table of single-character
// symbols, with UnrecognizedToken as the fallback. A final flush emits any
// trailing buffered lexeme. The position/line/column bookkeeping is updated at
// many distinct points and is highly order-sensitive, so the body is left
// untouched; a restyle here risks silently shifting reported token positions.
public IEnumerable <IToken> Tokenize(SafeStreamReader source) { uint lineNumber = 1; uint lexemeStartPositionInLine = 1; uint absolutePosition = 1; var maybeCurrentChar = Option <int> .None; var currentLexemeBuffer = new StringBuilder(); var maybeToken = Option <IToken> .None;; while ((maybeCurrentChar = source.Read()).IsSome) { var currentChar = maybeCurrentChar.Value(); maybeToken = Option <IToken> .None; switch (currentChar) { case var c when string.IsNullOrWhiteSpace(char.ConvertFromUtf32(c)): // if a whitespace was encountered - strip it // and yield whatever in the buffer to the output maybeToken = FlushBuffer( currentLexemeBuffer, ref absolutePosition, lineNumber, ref lexemeStartPositionInLine ); if (maybeToken.IsSome) { yield return(maybeToken.ValueUnsafe()); } switch (c) { case '\r': yield return(source.Read() .Some <IToken>(cn => cn == '\n' ? (IToken) new NewLineSymbolToken( absolutePosition, lineNumber, lexemeStartPositionInLine ) : (IToken) new UnrecognizedToken( $"\r{cn}", absolutePosition, lineNumber, lexemeStartPositionInLine ) ) .None(new UnrecognizedToken( $"\r", absolutePosition, lineNumber, lexemeStartPositionInLine )) ); absolutePosition += 2; lineNumber += 1; lexemeStartPositionInLine = 1; break; case '\n': yield return(new NewLineSymbolToken( absolutePosition, lineNumber, lexemeStartPositionInLine )); absolutePosition += 1; lineNumber += 1; lexemeStartPositionInLine = 1; break; default: absolutePosition += 1; lexemeStartPositionInLine += 1; break; } break; case '.': var currentLexeme = currentLexemeBuffer.ToString(); var maybeBeforeToken = IntegerLiteralToken.FromString( currentLexeme, absolutePosition, lineNumber, lexemeStartPositionInLine ) || IdentifierToken.FromString( currentLexeme, absolutePosition, lineNumber, lexemeStartPositionInLine ) || UnrecognizedToken.FromString( currentLexeme, absolutePosition, lineNumber, lexemeStartPositionInLine ) ; var tokes = source.Peek() .Some <ImmutableList <IToken> >(c => { var result = ImmutableList 
<IToken> .Empty; IToken tokenToAdd = null; switch (c) { case var _ when IsDigit(char.ConvertFromUtf32(c)): currentLexemeBuffer.Append('.'); return(ImmutableList <IToken> .Empty); case '.': absolutePosition += maybeBeforeToken .Map(t => (uint)t.Lexeme.Length) .IfNone(0); lexemeStartPositionInLine += maybeBeforeToken .Some(t => (uint)t.Lexeme.Length) .None(0u); tokenToAdd = new RangeSymbolToken( absolutePosition, lineNumber, lexemeStartPositionInLine ); result = maybeBeforeToken .ToImmutableList() .Add(tokenToAdd); source.Read(); currentLexemeBuffer.Clear(); lexemeStartPositionInLine += (uint)(tokenToAdd?.Lexeme.Length ?? 0); absolutePosition += (uint)(tokenToAdd?.Lexeme.Length ?? 0); return(result); default: absolutePosition += maybeBeforeToken .Map(t => (uint)t.Lexeme.Length) .IfNone(0); lexemeStartPositionInLine += maybeBeforeToken .Some(t => (uint)t.Lexeme.Length) .None(0u); tokenToAdd = new DotSymbolToken( absolutePosition, lineNumber, lexemeStartPositionInLine ); result = maybeBeforeToken .ToImmutableList() .Add(tokenToAdd); currentLexemeBuffer.Clear(); lexemeStartPositionInLine += (uint)(tokenToAdd?.Lexeme.Length ?? 0); absolutePosition += (uint)(tokenToAdd?.Lexeme.Length ?? 0); return(result); } }) .None(() => { absolutePosition += maybeBeforeToken .Map(t => (uint)t.Lexeme.Length) .IfNone(0); lexemeStartPositionInLine += maybeBeforeToken .Some(t => (uint)t.Lexeme.Length) .None(0u); var tokenToAdd = new DotSymbolToken( absolutePosition, lineNumber, lexemeStartPositionInLine ); var result = maybeBeforeToken .ToImmutableList() .Add(tokenToAdd); currentLexemeBuffer.Clear(); lexemeStartPositionInLine += (uint)(tokenToAdd?.Lexeme.Length ?? 0); absolutePosition += (uint)(tokenToAdd?.Lexeme.Length ?? 
0); return(result); }) ; foreach (var token in tokes) { yield return(token); } break; case '/': maybeToken = FlushBuffer( currentLexemeBuffer, ref absolutePosition, lineNumber, ref lexemeStartPositionInLine ); if (maybeToken.IsSome) { yield return(maybeToken.ValueUnsafe()); } yield return(source.Peek() .Some <IToken>(c => { switch (c) { case '/': var commentContent = source.ReadLine(); var commentToken = new CommentToken( $"/{commentContent}", absolutePosition, lineNumber, lexemeStartPositionInLine ); absolutePosition += (uint)commentContent.Length; lineNumber += 1; lexemeStartPositionInLine = 0; return commentToken; case '=': var notEqualsToken = new NotEqualsOperatorToken( absolutePosition, lineNumber, lexemeStartPositionInLine ); source.Read(); absolutePosition += 1; lexemeStartPositionInLine = 1; return notEqualsToken; default: return new DivideOperatorToken( (uint)source.BaseStream.Position, lineNumber, lexemeStartPositionInLine ); } }) .None(() => new DivideOperatorToken( (uint)source.BaseStream.Position, lineNumber, lexemeStartPositionInLine ))); absolutePosition += 1; lexemeStartPositionInLine += 1; break; case ':': maybeToken = FlushBuffer( currentLexemeBuffer, ref absolutePosition, lineNumber, ref lexemeStartPositionInLine ); if (maybeToken.IsSome) { yield return(maybeToken.ValueUnsafe()); } yield return(source.Peek() .Filter(c => c == '=') .Some <IToken>(c => { var result = new AssignmentOperatorToken( absolutePosition, lineNumber, lexemeStartPositionInLine ); source.Read(); absolutePosition += 1; lexemeStartPositionInLine += 1; return result; }) .None(new ColonSymbolToken( absolutePosition, lineNumber, lexemeStartPositionInLine ))); absolutePosition += 1; lexemeStartPositionInLine += 1; break; case '>': maybeToken = FlushBuffer( currentLexemeBuffer, ref absolutePosition, lineNumber, ref lexemeStartPositionInLine ); if (maybeToken.IsSome) { yield return(maybeToken.ValueUnsafe()); } yield return(source.Peek() .Filter(c => c == '=') .Some <IToken>(_ => { 
var result = new GeOperatorToken( absolutePosition, lineNumber, lexemeStartPositionInLine ); source.Read(); absolutePosition += 1; lexemeStartPositionInLine += 1; return result; }) .None(new GtOperatorToken( (uint)absolutePosition, lineNumber, lexemeStartPositionInLine ))); absolutePosition += 1; lexemeStartPositionInLine += 1; break; case '<': maybeToken = FlushBuffer( currentLexemeBuffer, ref absolutePosition, lineNumber, ref lexemeStartPositionInLine ); if (maybeToken.IsSome) { yield return(maybeToken.ValueUnsafe()); } yield return(source.Peek() .Filter(c => c == '=') .Some <IToken>(_ => { var result = new LeOperatorToken( absolutePosition, lineNumber, lexemeStartPositionInLine ); source.Read(); absolutePosition += 1; lexemeStartPositionInLine += 1; return result; }) .None(new LtOperatorToken( absolutePosition, lineNumber, lexemeStartPositionInLine ))); absolutePosition += 1; lexemeStartPositionInLine += 1; break; case '*': case '%': case '+': case '-': case '=': case ',': case '[': case ']': case '(': case ')': case ';': maybeToken = FlushBuffer( currentLexemeBuffer, ref absolutePosition, lineNumber, ref lexemeStartPositionInLine ); if (maybeToken.IsSome) { yield return(maybeToken.ValueUnsafe()); } yield return(SymbolLexemes .TryGetValue(((char)currentChar).ToString()) .Some(cons => cons( absolutePosition, lineNumber, lexemeStartPositionInLine )) .None(() => new UnrecognizedToken( currentChar.ToString(), absolutePosition, lineNumber, lexemeStartPositionInLine ) )); absolutePosition += 1; lexemeStartPositionInLine += 1; break; default: currentLexemeBuffer.Append(char.ConvertFromUtf32(currentChar)); break; } } maybeToken = FlushBuffer( currentLexemeBuffer, ref absolutePosition, lineNumber, ref lexemeStartPositionInLine ); if (maybeToken.IsSome) { yield return(maybeToken.ValueUnsafe()); } }
/*
 * Method: FindNextToken
 *
 * Find the next token. Return 'true' if one was found. False, otherwise.
 * On success, 'current' is set to the recognized token and the reader is
 * left positioned just past the matched text. Note that every branch below
 * returns true; unknown input yields an UnrecognizedToken rather than false.
 */
override internal bool FindNextToken()
{
    // Remember where this token started so we can recover its matched text.
    int startPosition = _reader.Position;

    // Dealing with whitespace?
    if (_reader.SinkMultipleWhiteSpace())
    {
        current = new WhitespaceToken();
        return true;
    }
    // Check for one-line comment
    else if (_reader.Sink("//"))
    {
        // Looks like a one-line comment. Follow it to the End-of-line
        _reader.SinkToEndOfLine();
        current = new CommentToken();
        return true;
    }
    // Check for multi-line comment
    else if (_reader.Sink("/*"))
    {
        _reader.SinkUntil("*/");

        // Was the ending */ found?
        if (_reader.EndOfLines)
        {
            // No. There was a /* without a */. Return this a syntax error token.
            current = new CSharpTokenizer.EndOfFileInsideCommentToken();
            return true;
        }
        current = new CommentToken();
        return true;
    }
    // Handle chars
    else if (_reader.Sink("\'"))
    {
        while (_reader.CurrentCharacter != '\'')
        {
            if (_reader.Sink("\\"))
            {
                /* reader.Skip the escape sequence. This isn't exactly right. We should detect:

                   simple-escape-sequence: one of \' \" \\ \0 \a \b \f \n \r \t \v

                   hexadecimal-escape-sequence: \x hex-digit hex-digit[opt] hex-digit[opt] hex-digit[opt]
                */
            }
            _reader.SinkCharacter();
        }
        // The loop above stopped on the closing tick, so this sink must consume it.
        if (_reader.SinkCharacter() != '\'')
        {
            Debug.Assert(false, "Code defect in tokenizer: Should have yielded a closing tick.");
        }
        current = new CSharpTokenizer.CharLiteralToken();
        return true;
    }
    // Check for verbatim string
    else if (_reader.Sink("@\""))
    {
        do
        {
            // Inside a verbatim string "" is treated as a special character
            while (_reader.Sink("\"\""))
            {
            }
        }
        while (!_reader.EndOfLines && _reader.SinkCharacter() != '\"');

        // Can't end a file inside a string
        if (_reader.EndOfLines)
        {
            current = new EndOfFileInsideStringToken();
            return true;
        }

        // reader.Skip the ending quote.
        current = new StringLiteralToken();
        // Strip the leading '@' so InnerText holds just the quoted text.
        current.InnerText = _reader.GetCurrentMatchedString(startPosition).Substring(1);
        return true;
    }
    // Check for a quoted string.
    else if (_reader.Sink("\""))
    {
        while (_reader.CurrentCharacter == '\\' || _reader.MatchRegularStringLiteral())
        {
            // See if we have an escape sequence.
            if (_reader.SinkCharacter() == '\\')
            {
                // This is probably an escape character.
                if (_reader.SinkStringEscape())
                {
                    // This isn't nearly right. We just do barely enough to make a string
                    // with an embedded escape sequence return _some_ string whose start and
                    // end match the real bounds of the string.
                }
                else
                {
                    // This is a compiler error.
                    _reader.SinkCharacter();
                    current = new CSharpTokenizer.UnrecognizedStringEscapeToken();
                    return true;
                }
            }
        }

        // Is it a newline? A plain string literal may not span lines.
        if (TokenChar.IsNewLine(_reader.CurrentCharacter))
        {
            current = new CSharpTokenizer.NewlineInsideStringToken();
            return true;
        }

        // Create the token.
        if (_reader.SinkCharacter() != '\"')
        {
            Debug.Assert(false, "Defect in tokenizer: Should have yielded a terminating quote.");
        }
        current = new StringLiteralToken();
        return true;
    }
    // Identifier or keyword?
    else if
    (
        // From 2.4.2 Identifiers: A '@' can be used to prefix an identifier so that a keyword can be used as an identifier.
        _reader.CurrentCharacter == '@' ||
        _reader.MatchNextIdentifierStart()
    )
    {
        if (_reader.CurrentCharacter == '@')
        {
            _reader.SinkCharacter();
        }

        // Now, the next character must be an identifier start.
        if (!_reader.SinkIdentifierStart())
        {
            current = new ExpectedIdentifierToken();
            return true;
        }

        // Sink the rest of the identifier.
        while (_reader.SinkIdentifierPart())
        {
        }

        string identifierOrKeyword = _reader.GetCurrentMatchedString(startPosition);

        switch (identifierOrKeyword)
        {
            default:
                if (Array.IndexOf(s_keywordList, identifierOrKeyword) >= 0)
                {
                    current = new KeywordToken();
                    return true;
                }

                // If the identifier starts with '@' then we need to strip it off.
                // The '@' is for escaping so that we can have an identifier called
                // the same thing as a reserved keyword (i.e. class, if, foreach, etc)
                string identifier = _reader.GetCurrentMatchedString(startPosition);
                if (identifier.StartsWith("@", StringComparison.Ordinal))
                {
                    identifier = identifier.Substring(1);
                }

                // Create the token.
                current = new IdentifierToken();
                current.InnerText = identifier;
                return true;

            case "false":
            case "true":
                current = new BooleanLiteralToken();
                return true;

            case "null":
                current = new CSharpTokenizer.NullLiteralToken();
                return true;
        }
    }
    // Open scope
    else if (_reader.Sink("{"))
    {
        current = new CSharpTokenizer.OpenScopeToken();
        return true;
    }
    // Close scope
    else if (_reader.Sink("}"))
    {
        current = new CSharpTokenizer.CloseScopeToken();
        return true;
    }
    // Hexidecimal integer literal
    else if (_reader.SinkIgnoreCase("0x"))
    {
        // Sink the hex digits.
        if (!_reader.SinkMultipleHexDigits())
        {
            current = new ExpectedValidHexDigitToken();
            return true;
        }

        // Skip the L, U, l, u, ul, etc.
        _reader.SinkLongIntegerSuffix();

        current = new HexIntegerLiteralToken();
        return true;
    }
    // Decimal integer literal
    else if (_reader.SinkMultipleDecimalDigits())
    {
        // reader.Skip the L, U, l, u, ul, etc.
        _reader.SinkLongIntegerSuffix();

        current = new DecimalIntegerLiteralToken();
        return true;
    }
    // Check for single-digit operators and punctuators
    else if (_reader.SinkOperatorOrPunctuator())
    {
        current = new OperatorOrPunctuatorToken();
        return true;
    }
    // Preprocessor line
    else if (_reader.CurrentCharacter == '#')
    {
        if (_reader.Sink("#if"))
        {
            current = new OpenConditionalDirectiveToken();
        }
        else if (_reader.Sink("#endif"))
        {
            current = new CloseConditionalDirectiveToken();
        }
        else
        {
            current = new PreprocessorToken();
        }
        // The remainder of the directive line is consumed as part of the token.
        _reader.SinkToEndOfLine();
        return true;
    }

    // We didn't recognize the token, so this is a syntax error.
    _reader.SinkCharacter();
    current = new UnrecognizedToken();
    return true;
}
/// <summary>
/// Initializes the event arguments with the comment token that triggered the event.
/// </summary>
/// <param name="commentToken">The parsed comment token; stored in the <c>CommentToken</c> property.</param>
public HtmlParserCommentEventArgs(CommentToken commentToken)
{
    CommentToken = commentToken;
}
/*
 * Method: FindNextToken
 *
 * Find the next token. Return 'true' if one was found. False, otherwise.
 * On success, 'current' holds the recognized Visual Basic token. Note that
 * every branch below returns true; unknown input yields an UnrecognizedToken.
 */
override internal bool FindNextToken()
{
    // Remember where this token started so we can recover its matched text.
    int startPosition = _reader.Position;

    // VB docs claim whitespace is Unicode category Zs. However,
    // this category does not contain tabs. Assuming a less restrictive
    // definition for whitespace...
    if (_reader.SinkWhiteSpace())
    {
        while (_reader.SinkWhiteSpace())
        {
        }

        // Now, we need to check for the line continuation character.
        if (_reader.SinkLineContinuationCharacter()) // Line continuation is '_'
        {
            // Save the current position because we may need to come back here.
            int savePosition = _reader.Position - 1;

            // Skip all whitespace after the '_'
            while (_reader.SinkWhiteSpace())
            {
            }

            // Now, skip all the newlines.
            // Need at least one newline for this to count as line continuation.
            int count = 0;
            while (_reader.SinkNewLine())
            {
                ++count;
            }

            if (count > 0)
            {
                current = new VisualBasicTokenizer.LineContinuationToken();
                return(true);
            }

            // Otherwise, fall back to plain old whitespace.
            _reader.Position = savePosition;
        }

        current = new WhitespaceToken();
        return(true);
    }
    // Line terminators are separate from whitespace and are significant.
    else if (_reader.SinkNewLine())
    {
        // We want one token per line terminator.
        current = new VisualBasicTokenizer.LineTerminatorToken();
        return(true);
    }
    // Check for a comment--either those that start with ' or rem.
    else if (_reader.SinkLineCommentStart())
    {
        // Skip to the first EOL.
        _reader.SinkToEndOfLine();
        current = new CommentToken();
        return(true);
    }
    // Identifier or keyword?
    else if
    (
        // VB allows escaping of identifiers by surrounding them with []
        // In other words,
        //     Date is a keyword but,
        //     [Date] is an identifier.
        _reader.CurrentCharacter == '[' ||
        _reader.MatchNextIdentifierStart()
    )
    {
        bool escapedIdentifier = false;
        if (_reader.CurrentCharacter == '[')
        {
            escapedIdentifier = true;
            _reader.SinkCharacter();

            // Now, the next character must be an identifier start.
            if (!_reader.SinkIdentifierStart())
            {
                current = new ExpectedIdentifierToken();
                return(true);
            }
        }

        // Sink the rest of the identifier.
        while (_reader.SinkIdentifierPart())
        {
        }

        // If this was an escaped identifier the we need to get the terminating ']'.
        if (escapedIdentifier)
        {
            if (!_reader.Sink("]"))
            {
                current = new ExpectedIdentifierToken();
                return(true);
            }
        }
        else
        {
            // Escaped identifiers are not allowed to have trailing type character.
            _reader.SinkTypeCharacter(); // Type character is optional.
        }

        // An identifier that is only a '_' is illegal because it is
        // ambiguous with line continuation
        string identifierOrKeyword = _reader.GetCurrentMatchedString(startPosition);
        if (identifierOrKeyword == "_" || identifierOrKeyword == "[_]" || identifierOrKeyword == "[]")
        {
            current = new ExpectedIdentifierToken();
            return(true);
        }

        // Make an upper-case version in order to check whether this may be a keyword.
        // (VB keywords are case-insensitive, hence the invariant upper-casing.)
        string upper = identifierOrKeyword.ToUpperInvariant();

        switch (upper)
        {
            default:
                if (Array.IndexOf(s_keywordList, upper) >= 0)
                {
                    current = new KeywordToken();
                    return(true);
                }

                // Create the token.
                current = new IdentifierToken();

                // Trim off the [] if this is an escaped identifier.
                if (escapedIdentifier)
                {
                    current.InnerText = identifierOrKeyword.Substring(1, identifierOrKeyword.Length - 2);
                }
                return(true);

            case "FALSE":
            case "TRUE":
                current = new BooleanLiteralToken();
                return(true);
        }
    }
    // Is it a hex integer?
    else if (_reader.SinkHexIntegerPrefix())
    {
        if (!_reader.SinkMultipleHexDigits())
        {
            current = new ExpectedValidHexDigitToken();
            return(true);
        }

        // Sink a suffix if there is one.
        _reader.SinkIntegerSuffix();

        current = new HexIntegerLiteralToken();
        return(true);
    }
    // Is it an octal integer?
    else if (_reader.SinkOctalIntegerPrefix())
    {
        if (!_reader.SinkMultipleOctalDigits())
        {
            current = new VisualBasicTokenizer.ExpectedValidOctalDigitToken();
            return(true);
        }

        // Sink a suffix if there is one.
        _reader.SinkIntegerSuffix();

        current = new VisualBasicTokenizer.OctalIntegerLiteralToken();
        return(true);
    }
    // Is it a decimal integer?
    else if (_reader.SinkMultipleDecimalDigits())
    {
        // Sink a suffix if there is one.
        _reader.SinkDecimalIntegerSuffix();

        current = new DecimalIntegerLiteralToken();
        return(true);
    }
    // Preprocessor line
    else if (_reader.CurrentCharacter == '#')
    {
        if (_reader.SinkIgnoreCase("#if"))
        {
            current = new OpenConditionalDirectiveToken();
        }
        else if (_reader.SinkIgnoreCase("#end if"))
        {
            current = new CloseConditionalDirectiveToken();
        }
        else
        {
            current = new PreprocessorToken();
        }
        // The remainder of the directive line is consumed as part of the token.
        _reader.SinkToEndOfLine();
        return(true);
    }
    // Is it a separator?
    else if (_reader.SinkSeparatorCharacter())
    {
        current = new VisualBasicTokenizer.SeparatorToken();
        return(true);
    }
    // Is it an operator?
    else if (_reader.SinkOperator())
    {
        current = new OperatorToken();
        return(true);
    }
    // A string?
    else if (_reader.Sink("\""))
    {
        do
        {
            // Inside a verbatim string "" is treated as a special character
            while (_reader.Sink("\"\""))
            {
            }
        }
        while (!_reader.EndOfLines && _reader.SinkCharacter() != '\"');

        // Can't end a file inside a string
        if (_reader.EndOfLines)
        {
            current = new EndOfFileInsideStringToken();
            return(true);
        }

        current = new StringLiteralToken();
        return(true);
    }

    // We didn't recognize the token, so this is a syntax error.
    _reader.SinkCharacter();
    current = new UnrecognizedToken();
    return(true);
}
/// <summary>
/// Asserts that two tokens are equal: same runtime type, same extent
/// (start/end position, line number, column number, and text), and equal
/// according to the type-specific AreEqual comparison for the concrete
/// token type. Two nulls compare equal; a single null fails.
/// </summary>
/// <param name="expectedToken">The token the test expects.</param>
/// <param name="actualToken">The token actually produced.</param>
/// <param name="index">Optional position of the token in the stream, used only to label failure messages (-1 = unknown).</param>
private static void AssertAreEqualInternal(Token expectedToken, Token actualToken, int index = -1)
{
    // Both null counts as equal.
    if ((expectedToken == null) && (actualToken == null))
    {
        return;
    }
    // Assert.Fail throws, so execution does not continue past a null mismatch.
    if (expectedToken == null)
    {
        Assert.Fail(LexerHelper.GetAssertErrorMessage("expected is null, but actual is not null", index));
    }
    if (actualToken == null)
    {
        Assert.Fail(LexerHelper.GetAssertErrorMessage("expected is not null, but actual is null", index));
    }

    // Compare the common Token surface first: runtime type, then every extent field.
    Assert.AreEqual(expectedToken.GetType(), actualToken.GetType(),
        LexerHelper.GetAssertErrorMessage($"actual type does not match expected value", index));
    Assert.AreEqual(expectedToken.Extent.StartPosition.Position, actualToken.Extent.StartPosition.Position,
        LexerHelper.GetAssertErrorMessage($"actual Start Position does not match expected value", index));
    Assert.AreEqual(expectedToken.Extent.StartPosition.LineNumber, actualToken.Extent.StartPosition.LineNumber,
        LexerHelper.GetAssertErrorMessage($"actual Start Line does not match expected value", index));
    Assert.AreEqual(expectedToken.Extent.StartPosition.ColumnNumber, actualToken.Extent.StartPosition.ColumnNumber,
        LexerHelper.GetAssertErrorMessage($"actual Start Column does not match expected value", index));
    Assert.AreEqual(expectedToken.Extent.EndPosition.Position, actualToken.Extent.EndPosition.Position,
        LexerHelper.GetAssertErrorMessage($"actual End Position does not match expected value", index));
    Assert.AreEqual(expectedToken.Extent.EndPosition.LineNumber, actualToken.Extent.EndPosition.LineNumber,
        LexerHelper.GetAssertErrorMessage($"actual End Line does not match expected value", index));
    Assert.AreEqual(expectedToken.Extent.EndPosition.ColumnNumber, actualToken.Extent.EndPosition.ColumnNumber,
        LexerHelper.GetAssertErrorMessage($"actual End Column does not match expected value", index));
    Assert.AreEqual(expectedToken.Extent.Text, actualToken.Extent.Text,
        LexerHelper.GetAssertErrorMessage($"actual Text does not match expected value", index));

    // Dispatch to the concrete token type's own AreEqual for field-level
    // comparison. (The GetType equality assert above guarantees both tokens
    // share the concrete type, so the casts below are safe.)
    switch (expectedToken)
    {
        case AliasIdentifierToken token:
            Assert.IsTrue(
                AliasIdentifierToken.AreEqual((AliasIdentifierToken)expectedToken, (AliasIdentifierToken)actualToken),
                LexerHelper.GetAssertErrorMessage($"actual token does not match expected token", index)
            );
            break;
        case AttributeCloseToken token:
            Assert.IsTrue(
                AttributeCloseToken.AreEqual((AttributeCloseToken)expectedToken, (AttributeCloseToken)actualToken),
                LexerHelper.GetAssertErrorMessage($"actual token does not match expected token", index)
            );
            break;
        case AttributeOpenToken token:
            Assert.IsTrue(
                AttributeOpenToken.AreEqual((AttributeOpenToken)expectedToken, (AttributeOpenToken)actualToken),
                LexerHelper.GetAssertErrorMessage($"actual token does not match expected token", index)
            );
            break;
        case BlockCloseToken token:
            Assert.IsTrue(
                BlockCloseToken.AreEqual((BlockCloseToken)expectedToken, (BlockCloseToken)actualToken),
                LexerHelper.GetAssertErrorMessage($"actual token does not match expected token", index)
            );
            break;
        case BlockOpenToken token:
            Assert.IsTrue(
                BlockOpenToken.AreEqual((BlockOpenToken)expectedToken, (BlockOpenToken)actualToken),
                LexerHelper.GetAssertErrorMessage($"actual token does not match expected token", index)
            );
            break;
        case BooleanLiteralToken token:
            Assert.IsTrue(
                BooleanLiteralToken.AreEqual((BooleanLiteralToken)expectedToken, (BooleanLiteralToken)actualToken),
                LexerHelper.GetAssertErrorMessage($"actual token does not match expected token", index)
            );
            break;
        case ColonToken token:
            Assert.IsTrue(
                ColonToken.AreEqual((ColonToken)expectedToken, (ColonToken)actualToken),
                LexerHelper.GetAssertErrorMessage($"actual token does not match expected token", index)
            );
            break;
        case CommaToken token:
            Assert.IsTrue(
                CommaToken.AreEqual((CommaToken)expectedToken, (CommaToken)actualToken),
                LexerHelper.GetAssertErrorMessage($"actual token does not match expected token", index)
            );
            break;
        case CommentToken token:
            Assert.IsTrue(
                CommentToken.AreEqual((CommentToken)expectedToken, (CommentToken)actualToken),
                LexerHelper.GetAssertErrorMessage($"actual token does not match expected token", index)
            );
            break;
        case DotOperatorToken token:
            Assert.IsTrue(
                DotOperatorToken.AreEqual((DotOperatorToken)expectedToken, (DotOperatorToken)actualToken),
                LexerHelper.GetAssertErrorMessage($"actual token does not match expected token", index)
            );
            break;
        case EqualsOperatorToken token:
            Assert.IsTrue(
                EqualsOperatorToken.AreEqual((EqualsOperatorToken)expectedToken, (EqualsOperatorToken)actualToken),
                LexerHelper.GetAssertErrorMessage($"actual token does not match expected token", index)
            );
            break;
        case IdentifierToken token:
            Assert.IsTrue(
                IdentifierToken.AreEqual((IdentifierToken)expectedToken, (IdentifierToken)actualToken),
                LexerHelper.GetAssertErrorMessage($"actual token does not match expected token", index)
            );
            break;
        case IntegerLiteralToken token:
            Assert.IsTrue(
                IntegerLiteralToken.AreEqual((IntegerLiteralToken)expectedToken, (IntegerLiteralToken)actualToken),
                LexerHelper.GetAssertErrorMessage($"actual token does not match expected token", index)
            );
            break;
        case NullLiteralToken token:
            Assert.IsTrue(
                NullLiteralToken.AreEqual((NullLiteralToken)expectedToken, (NullLiteralToken)actualToken),
                LexerHelper.GetAssertErrorMessage($"actual token does not match expected token", index)
            );
            break;
        case ParenthesisCloseToken token:
            Assert.IsTrue(
                ParenthesisCloseToken.AreEqual((ParenthesisCloseToken)expectedToken, (ParenthesisCloseToken)actualToken),
                LexerHelper.GetAssertErrorMessage($"actual token does not match expected token", index)
            );
            break;
        case ParenthesisOpenToken token:
            Assert.IsTrue(
                ParenthesisOpenToken.AreEqual((ParenthesisOpenToken)expectedToken, (ParenthesisOpenToken)actualToken),
                LexerHelper.GetAssertErrorMessage($"actual token does not match expected token", index)
            );
            break;
        case PragmaToken token:
            Assert.IsTrue(
                PragmaToken.AreEqual((PragmaToken)expectedToken, (PragmaToken)actualToken),
                LexerHelper.GetAssertErrorMessage($"actual token does not match expected token", index)
            );
            break;
        case RealLiteralToken token:
            Assert.IsTrue(
                RealLiteralToken.AreEqual((RealLiteralToken)expectedToken, (RealLiteralToken)actualToken),
                LexerHelper.GetAssertErrorMessage($"actual token does not match expected token", index)
            );
            break;
        case StatementEndToken token:
            Assert.IsTrue(
                StatementEndToken.AreEqual((StatementEndToken)expectedToken, (StatementEndToken)actualToken),
                LexerHelper.GetAssertErrorMessage($"actual token does not match expected token", index)
            );
            break;
        case StringLiteralToken token:
            Assert.IsTrue(
                StringLiteralToken.AreEqual((StringLiteralToken)expectedToken, (StringLiteralToken)actualToken),
                LexerHelper.GetAssertErrorMessage($"actual token does not match expected token", index)
            );
            break;
        case WhitespaceToken token:
            Assert.IsTrue(
                WhitespaceToken.AreEqual((WhitespaceToken)expectedToken, (WhitespaceToken)actualToken),
                LexerHelper.GetAssertErrorMessage($"actual token does not match expected token", index)
            );
            break;
        default:
            // A token type without a registered comparison is a test-suite defect.
            throw new NotImplementedException($"Cannot compare type '{expectedToken.GetType().Name}'");
    }
}
/// <summary>
/// Scans a comment beginning at <c>chBaseIndex</c>. Handles '//' line
/// comments (terminated by a newline separator or end of data) and '/* */'
/// block comments (which may be left unterminated at end of data; the token's
/// <c>Terminated</c> flag records which). On a match, a CommentToken is
/// appended to <c>Tokens</c> with debug range data attached, and
/// <c>chBaseIndex</c> is advanced past the comment. If no comment starts
/// here, nothing is emitted and the index is unchanged.
/// </summary>
/// <returns>The (possibly advanced) value of <c>chBaseIndex</c>.</returns>
private int Comment()
{
    char ch = data[chBaseIndex];
    if (ch == '/')
    {
        int chNextIndex = chBaseIndex + 1;
        if (chNextIndex < data.Length)
        {
            char nextCh = data[chNextIndex];
            if (nextCh == '/')
            {
                // Line comment - scan for end of line, and collect.
                int chScanningIndex = chNextIndex;
                while (++chScanningIndex <= data.Length)
                {
                    bool eof = chScanningIndex >= data.Length - 1;
                    bool proceed = eof;
                    if (!proceed)
                    {
                        char chScanning = data[chScanningIndex];
                        // Single TryGetValue lookup instead of the previous
                        // ContainsKey-then-indexer pair (same result, one hash probe per character).
                        proceed = SeparatorToken.Map.TryGetValue(chScanning, out var separator) &&
                            separator == NssSeparators.NewLine;
                    }

                    if (proceed)
                    {
                        CommentToken comment = new CommentToken();
                        comment.CommentType = CommentType.LineComment;

                        // Comment text starts just past the second '/'.
                        int chStartIndex = chNextIndex + 1;
                        int chEndIndex = eof ? data.Length : chScanningIndex;
                        if (chStartIndex == chEndIndex)
                        {
                            comment.Comment = "";
                        }
                        else
                        {
                            comment.Comment = data.Substring(chStartIndex, chEndIndex - chStartIndex);
                        }

                        int chNewBaseIndex = chEndIndex;
                        AttachDebugData(comment, DebugRanges, chBaseIndex, chNewBaseIndex - 1);

                        Tokens.Add(comment);
                        chBaseIndex = chNewBaseIndex;
                        break;
                    }
                }
            }
            else if (nextCh == '*')
            {
                // Block comment - scan for the closing */, ignoring everything else.
                bool terminated = false;

                int chScanningIndex = chNextIndex + 1;
                while (++chScanningIndex < data.Length)
                {
                    char chScanning = data[chScanningIndex];
                    if (chScanning == '/')
                    {
                        char chScanningLast = data[chScanningIndex - 1];
                        if (chScanningLast == '*')
                        {
                            terminated = true;
                            break;
                        }
                    }
                }

                bool eof = chScanningIndex >= data.Length - 1;

                CommentToken comment = new CommentToken();
                comment.CommentType = CommentType.BlockComment;
                comment.Terminated = terminated;

                // Comment text starts just past the '/*'; when terminated, back up
                // over the '*' of the closing '*/'.
                int chStartIndex = chBaseIndex + 2;
                int chEndIndex = !terminated && eof ? data.Length : chScanningIndex + (terminated ? -1 : 0);
                comment.Comment = data.Substring(chStartIndex, chEndIndex - chStartIndex);

                int chNewBaseIndex = eof ? data.Length : chScanningIndex + 1;
                AttachDebugData(comment, DebugRanges, chBaseIndex, chNewBaseIndex - 1);

                Tokens.Add(comment);
                chBaseIndex = chNewBaseIndex;
            }
        }
    }

    return chBaseIndex;
}
/// <summary>
/// Factory method: instantiates the concrete Token subclass appropriate for
/// the given kind, then stamps it with that kind and its source location.
/// Kinds without a dedicated subclass fall back to the base Token type.
/// </summary>
/// <param name="kind">The lexical kind of the token to create.</param>
/// <param name="location">Where the token appears in the source.</param>
/// <returns>A freshly constructed token of the matching subclass.</returns>
public static Token Create(TokenKind kind, SourceLocation location)
{
    Token created;

    switch (kind)
    {
        case TokenKind.T_COMMENT:
            created = new CommentToken();
            break;

        case TokenKind.T_MULTILINE_STRING_LITERAL:
        case TokenKind.T_STRING_LITERAL:
            created = new StringToken();
            break;

        case TokenKind.T_NUMERIC_LITERAL:
            created = new NumberToken();
            break;

        #region case KEYWORD:
        case TokenKind.T_AS:
        case TokenKind.T_BREAK:
        case TokenKind.T_CASE:
        case TokenKind.T_CATCH:
        case TokenKind.T_CONST:
        case TokenKind.T_CONTINUE:
        case TokenKind.T_DEFAULT:
        case TokenKind.T_DELETE:
        case TokenKind.T_DO:
        case TokenKind.T_ELSE:
        case TokenKind.T_ENUM:
        case TokenKind.T_FALSE:
        case TokenKind.T_FINALLY:
        case TokenKind.T_FOR:
        case TokenKind.T_FUNCTION:
        case TokenKind.T_IF:
        case TokenKind.T_IMPORT:
        case TokenKind.T_IN:
        case TokenKind.T_INSTANCEOF:
        case TokenKind.T_LET:
        case TokenKind.T_NEW:
        case TokenKind.T_NULL:
        case TokenKind.T_ON:
        case TokenKind.T_PRAGMA:
        case TokenKind.T_PROPERTY:
        case TokenKind.T_PUBLIC:
        case TokenKind.T_READONLY:
        case TokenKind.T_RESERVED_WORD:
        case TokenKind.T_RETURN:
        case TokenKind.T_SET:
        case TokenKind.T_SIGNAL:
        case TokenKind.T_SWITCH:
        case TokenKind.T_THIS:
        case TokenKind.T_THROW:
        case TokenKind.T_TRUE:
        case TokenKind.T_TRY:
        case TokenKind.T_TYPEOF:
        case TokenKind.T_VAR:
        case TokenKind.T_VOID:
        case TokenKind.T_WHILE:
        case TokenKind.T_WITH:
        #endregion
            created = new KeywordToken();
            break;

        default:
            created = new Token();
            break;
    }

    // Every token, whatever its subclass, carries its kind and location.
    created.Kind = kind;
    created.Location = location;
    return created;
}
/// <summary>
/// Converts an HTML fragment into well-formed XML text: balances missing
/// start/end tags, rewrites always-empty elements using empty-element syntax,
/// wraps script bodies in CDATA, and maps entity references to XML-safe
/// equivalents.
/// </summary>
/// <param name="html">The HTML text to convert.</param>
/// <returns>The XML string, or null when an unrecoverable parse error occurred.</returns>
public static string ToXml(string html)
{
    StringBuilder result = new StringBuilder();

    // Standard XML file header, including entities that are likely to be used.
    result.Append("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n");

    ParseReader reader = new ParseReader(html);
    TagParser parser = new TagParser(reader);
    // Stack of open element names, used to balance elements with missing end-tags.
    Stack<string> nestingStack = new Stack<string>();

    try
    {
        ParseToken token = parser.GetNextToken();

        // Ignore leading white-space.
        while (token is SpacesToken || token is NewlineToken || token is DoctypeToken)
        {
            token = parser.GetNextToken();
        }

        while (!(token is EOFToken))
        {
            Log.DebugFormat("Token = {0}", token);

            if (token is TagToken)
            {
                TagToken t = (TagToken)token;
                if (!t.Tag.IsEndTag)
                {
                    // Deal with start-tag. Typically this will be new element nesting.
                    Tag startTag = t.Tag;
                    if (startTag is EmptyElement)
                    {
                        result.Append(((EmptyElement)startTag).ToString());
                    }
                    else
                    {
                        // Tags that are always empty elements are converted to empty elements here.
                        // Element names are pushed onto the stack to balance elements with missing end-tag.
                        string startTagName = startTag.Name.ToLower();
                        Log.DebugFormat("startTagName = {0}", startTagName);
                        if (EmptyElements.Contains(startTagName))
                        {
                            result.Append((new EmptyElement(startTag)).ToString());
                        }
                        else
                        {
                            result.Append(startTag.ToString());
                            nestingStack.Push(startTagName);
                        }
                    }
                }
                else
                {
                    // Deal with end-tag.
                    Tag endTag = t.Tag;

                    // Remove the '/' from beginning of the tag-name for comparison.
                    string endTagName = endTag.Name.Substring(1).ToLower();
                    Log.DebugFormat("endTagName = {0}", endTagName);

                    // Ignore some end-tags for empty elements that are handled with or without empty element syntax.
                    if (EmptyElements.Contains(endTagName))
                    {
                        Log.InfoFormat("Ignoring redundant end-tag: {0}", endTagName);
                    }
                    // BUG FIX: Stack<string>.Peek() throws InvalidOperationException on an
                    // empty stack, so the previous "Peek() == null" guard could never fire;
                    // an unmatched trailing end-tag escaped to the outer catch and made the
                    // whole conversion return null. Test Count before peeking instead.
                    else if (nestingStack.Count == 0)
                    {
                        Log.WarnFormat("Ignoring extra content at end of document! </{0}> ({1})", endTagName, parser.GetCharacterPosition());
                    }
                    else
                    {
                        // Keep element tags matched appropriately.
                        string peek = nestingStack.Peek();
                        if (peek.Equals(endTagName))
                        {
                            nestingStack.Pop();
                        }
                        else
                        {
                            // Pair all the previous unmatched tags for these important structural elements.
                            // These elements appear only once, so should never be automatically closed.
                            if (SingleElements.Contains(endTagName))
                            {
                                while (peek != endTagName)
                                {
                                    StringBuilder endtag = (new StringBuilder()).Append("</").Append(peek).Append('>');
                                    Log.WarnFormat("Adding a missing end-tag! {0} ({1})", endtag, parser.GetCharacterPosition());
                                    result.Append(endtag);
                                    nestingStack.Pop();
                                    peek = nestingStack.Peek();
                                }

                                // Remove the current item from the stack, as it has been paired now.
                                nestingStack.Pop();
                            }
                            else
                            {
                                // Insert a matching start-tag before the unbalanced end-tag found.
                                StringBuilder startTag = (new StringBuilder()).Append("<").Append(endTagName).Append('>');
                                Log.WarnFormat("Adding a missing start-tag! {0} ({1})", startTag, parser.GetCharacterPosition());
                                result.Append(startTag);
                            }
                        }

                        // Write the current element end-tag.
                        result.Append("</").Append(endTagName).Append('>');
                    }
                }
            }
            else if (token is WordToken)
            {
                WordToken t = (WordToken)token;
                result.Append(t.Word);
            }
            else if (token is SpacesToken)
            {
                SpacesToken t = (SpacesToken)token;
                result.Append(t.Spaces);
            }
            else if (token is NumberToken)
            {
                NumberToken t = (NumberToken)token;
                result.Append(t.Number);
            }
            else if (token is EntityReferenceToken)
            {
                EntityReferenceToken t = (EntityReferenceToken)token;
                result.Append(XmlEntity(t.Name));
            }
            else if (token is PunctuationToken)
            {
                PunctuationToken t = (PunctuationToken)token;
                result.Append(t.Character);
            }
            else if (token is CharacterEntityToken)
            {
                CharacterEntityToken t = (CharacterEntityToken)token;
                result.Append(t.Character);
            }
            else if (token is NewlineToken)
            {
                result.Append('\n');
            }
            else if (token is ScriptToken)
            {
                ScriptToken t = (ScriptToken)token;
                if (t.Script.Length > 0)
                {
                    // Script element contents are often empty.
                    // NOTE: Removing any prior use of CDATA section in script, to avoid conflict.
                    string script = t.Script.Replace("<![CDATA[", "").Replace("]]>", "");
                    result.Append("/*<![CDATA[*/").Append(script).Append("/*]]>*/");
                }
            }
            else if (token is CDataToken)
            {
                CDataToken t = (CDataToken)token;
                result.Append("<![CDATA[").Append(t.Data).Append("]]>");
            }
            else if (token is CommentToken)
            {
                CommentToken t = (CommentToken)token;
                result.Append("<!--").Append(t.Comment).Append("-->");
            }
            else if (token is DoctypeToken)
            {
                // Ignore.
            }
            else if (token is ProcessingInstructionToken)
            {
                // Ignore.
            }
            else
            {
                Log.WarnFormat("Unexpected token! {0}", token);
            }

            token = parser.GetNextToken();
        }

        Log.Info(parser.GetCompletionReport());
    }
    catch (Exception ex)
    {
        // Any parse failure is logged and surfaced to the caller as a null result.
        Log.Error("EXCEPTION", ex);
        result = null;
    }

    return (result == null ? null : result.ToString());
}
//protected override void VisitCommentToken(CommentToken token) -- related visitor stubs kept for reference:
//protected override void VisitInToken(InToken token) { throw new NotImplementedException(); }
//protected override void VisitNotInToken(NotInToken token) { throw new NotImplementedException(); }

/// <summary>
/// Visits a comment token. This visitor does not support comments and always
/// throws <see cref="NotImplementedException"/>.
/// </summary>
/// <param name="token">The comment token encountered; unused.</param>
protected override void VisitCommentToken(CommentToken token)
{
    throw new NotImplementedException();
}