Exemple #1
0
        /// <summary>
        /// Lexes a Delphi-style comment that begins at the current position of
        /// <paramref name="context"/> and runs until the end of the line or the end of input.
        /// </summary>
        /// <param name="context">Lexer state; its index is advanced past the scanned text.</param>
        /// <returns>A comment token flagged as single-line, carrying any nested comments found on the way.</returns>
        /// <exception cref="SyntaxErrorException">A nested comment is not a single-line comment.</exception>
        private static CommentToken ResolveDelphiComment(LexerContext context)
        {
            var startIndex = context.GetIndex();

            // Advance two positions before scanning — presumably the two-character
            // opener; confirm against CommentToken.IsDelphiCommentBegin.
            context.IncIndex();
            context.IncIndex();

            var nested = new List<CommentToken>();

            while (!(context.IsEnded() || context.IsNewLineNow()))
            {
                if (CommentToken.IsOldStyleCommentBegin(context))
                {
                    nested.Add(ResolveOldStyleComment(context));
                }
                else if (CommentToken.IsTurboPascalCommentBegin(context))
                {
                    nested.Add(ResolveTurboPascalComment(context));
                }
                else
                {
                    // Ordinary comment text: step over it.
                    context.IncIndex();
                }
            }

            // The token is built as single-line, so a nested comment that is
            // not single-line is rejected.
            if (nested.Any(inner => !inner.IsSingleLineComment))
            {
                throw new SyntaxErrorException();
            }

            return new CommentToken(true, nested, startIndex, context.GetIndex());
        }
Exemple #2
0
 /// <summary>
 /// Determines which kind of token starts at the current position of
 /// <paramref name="context"/> and hands off to the matching resolver.
 /// </summary>
 /// <param name="context">Lexer state positioned at the start of a token.</param>
 /// <returns>The token produced by the first resolver whose probe matched.</returns>
 /// <exception cref="SyntaxErrorException">No probe recognised the input.</exception>
 private static IToken Resolve(LexerContext context)
 {
     // Probes run in a fixed priority order; the first match wins.
     if (CommentToken.IsCommentBegin(context))
     {
         return ResolveComment(context);
     }

     if (NumberToken.IsNumberBegin(context))
     {
         return ResolveNumber(context);
     }

     if (StringToken.IsStringBegin(context))
     {
         return ResolveString(context);
     }

     if (SpecialSymbolToken.IsSpecial(context))
     {
         return ResolveSpecial(context);
     }

     if (IdentifierToken.IsIdentifierBegin(context))
     {
         return ResolveIdentifier(context);
     }

     throw new SyntaxErrorException();
 }
Exemple #3
0
        /// <summary>
        /// Extracts the version number from a header comment of the form "PDF-1.x" or "FDF-1.x"
        /// (case-insensitive) and rewinds the scanner to position 0.
        /// </summary>
        /// <param name="comment">The comment expected to hold the version header.</param>
        /// <param name="scanner">Scanner whose position is used to compute the comment offset; sought to 0 on success.</param>
        /// <param name="isLenientParsing">Forwarded to <c>HandleMissingVersion</c> when the header is malformed.</param>
        /// <param name="log">Forwarded to <c>HandleMissingVersion</c> when the header is malformed.</param>
        /// <returns>The parsed header version, or whatever <c>HandleMissingVersion</c> returns for a malformed header.</returns>
        private static HeaderVersion GetHeaderVersionAndResetScanner(CommentToken comment, ISeekableTokenScanner scanner, bool isLenientParsing, ILog log)
        {
            var headerText = comment.Data;

            var hasKnownPrefix = headerText.StartsWith("PDF-1.", StringComparison.OrdinalIgnoreCase)
                                 || headerText.StartsWith("FDF-1.", StringComparison.OrdinalIgnoreCase);

            if (!hasKnownPrefix)
            {
                return HandleMissingVersion(comment, isLenientParsing, log);
            }

            // Skipping the first four characters ("PDF-" / "FDF-") leaves the numeric part "1.x".
            const int versionDigitsOffset = 4;
            var versionText = headerText.Substring(versionDigitsOffset);

            if (!decimal.TryParse(versionText, NumberStyles.Number, CultureInfo.InvariantCulture, out var parsedVersion))
            {
                return HandleMissingVersion(comment, isLenientParsing, log);
            }

            // One position is rewound when the scanner sits at the end of its input, two otherwise.
            var rewind = 2;
            if (scanner.CurrentPosition == scanner.Length)
            {
                rewind = 1;
            }

            var commentOffset = scanner.CurrentPosition - headerText.Length - rewind;

            scanner.Seek(0);

            return new HeaderVersion(parsedVersion, headerText, commentOffset);
        }
Exemple #4
0
        /// <summary>
        /// Two comment tokens built with identical property values must compare equal,
        /// share a hash code, and never equal <c>null</c>.
        /// </summary>
        public void CommentToken_Uniqueness()
        {
            // Arrange: two separate instances with the same state.
            var first = new CommentToken
            {
                Content = new StringSlice("comment"),
                Indent = 0,
                ContentStartPosition = 0,
                ContentEndPosition = 6,
                IsClosed = true,
                TagStartPosition = 0,
                TagEndPosition = 6
            };

            var second = new CommentToken
            {
                Content = new StringSlice("comment"),
                Indent = 0,
                ContentStartPosition = 0,
                ContentEndPosition = 6,
                IsClosed = true,
                TagStartPosition = 0,
                TagEndPosition = 6
            };

            // Assert: value equality and a consistent hash code.
            Assert.False(first.Equals(null));
            Assert.Equal(first, second);
            Assert.Equal(first.GetHashCode(), second.GetHashCode());
        }
Exemple #5
0
        /// <summary>
        /// Two comments placed back to back must be scanned as two separate
        /// comment tokens followed by an end-of-file token.
        /// </summary>
        public void Test_3_5_2_Comments_B()
        {
            Scanner scanner = this.GetLexer("\" This is a comment \"\" and another comment \"");

            // First comment.
            object firstRaw = scanner.GetToken();
            Assert.IsInstanceOfType(firstRaw, typeof(CommentToken));
            CommentToken firstComment = (CommentToken)firstRaw;
            Assert.IsTrue(firstComment.IsValid);
            Assert.IsNull(firstComment.ScanError);
            Assert.AreEqual(0, firstComment.StartPosition.Position);
            Assert.AreEqual(20, firstComment.StopPosition.Position);
            Assert.AreEqual(" This is a comment ", firstComment.Value);

            // Second comment starts directly after the first one's closing quote.
            object secondRaw = scanner.GetToken();
            Assert.IsInstanceOfType(secondRaw, typeof(CommentToken));
            CommentToken secondComment = (CommentToken)secondRaw;
            Assert.IsTrue(secondComment.IsValid);
            Assert.IsNull(secondComment.ScanError);
            Assert.AreEqual(21, secondComment.StartPosition.Position);
            Assert.AreEqual(43, secondComment.StopPosition.Position);
            Assert.AreEqual(" and another comment ", secondComment.Value);

            // Nothing else may follow.
            object trailing = scanner.GetToken();
            Assert.IsInstanceOfType(trailing, typeof(EofToken));
        }
Exemple #6
0
        /// <summary>
        /// Feeding an HTML comment to <c>OnCommentState</c> must raise <c>CommentFound</c>
        /// with a single-item comment token spanning the whole input (0..14).
        /// </summary>
        public void HtmlParser_OnCommentStateTest()
        {
            string text   = "<!-- abcde -->";
            var    target = new HtmlParser();

            target._cs = new HtmlCharStream(text);

            target._tokenizer = new HtmlTokenizer(target._cs);

            // All expectations live inside the handler, so without this flag the
            // test would pass even if the event were never raised.
            bool commentFoundRaised = false;

            target.CommentFound += (sender, args) =>
            {
                commentFoundRaised = true;

                Assert.True(args.CommentToken is CommentToken);
                CommentToken ct = args.CommentToken;

                Assert.Equal(1, ct.Count);

                Assert.Equal(0, ct.Start);
                Assert.Equal(14, ct.End);

                Assert.True(ct[0] is HtmlToken);
                Assert.True(ct[0] is IExpandableTextRange);

                Assert.Equal(0, ct[0].Start);
                Assert.Equal(14, ct[0].End);
            };

            target.OnCommentState();

            Assert.True(commentFoundRaised, "CommentFound was not raised by OnCommentState.");
        }
Exemple #7
0
        /// <summary>
        /// Writes a comment token to <paramref name="outputStream"/>: the <c>Comment</c>
        /// marker byte, the comment text converted by <c>OtherEncodings.StringAsLatin1Bytes</c>,
        /// then a line break.
        /// </summary>
        /// <param name="comment">The comment whose <c>Data</c> is written.</param>
        /// <param name="outputStream">The destination stream.</param>
        private static void WriteComment(CommentToken comment, Stream outputStream)
        {
            // Encode first so that a failure here leaves the stream untouched.
            var encodedText = OtherEncodings.StringAsLatin1Bytes(comment.Data);

            outputStream.WriteByte(Comment);
            outputStream.Write(encodedText, 0, encodedText.Length);

            WriteLineBreak(outputStream);
        }
Exemple #8
0
        /// <summary>
        /// Posts <c>ErrMissingEndOfBlockComment</c> against the last comment in
        /// <c>_comments</c> that is still an open block, if there is one.
        /// </summary>
        private void PostBlockCommentMissingClosingError()
        {
            CommentToken unclosed = _comments.LastOrDefault(candidate => candidate.IsOpenBlock == true);

            if (unclosed == null)
            {
                // Every block comment was closed; nothing to report.
                return;
            }

            PostError(unclosed, TexlStrings.ErrMissingEndOfBlockComment);
        }
Exemple #9
0
        /// <summary>
        /// Handles a header comment that does not carry a usable version.
        /// </summary>
        /// <param name="comment">The offending comment; its <c>Data</c> is quoted in the message.</param>
        /// <param name="isLenientParsing">When true, a warning is logged and version 1.4 is assumed.</param>
        /// <param name="log">Receives the warning in lenient mode.</param>
        /// <returns>A default 1.4 header version ("PDF-1.4", offset 0) in lenient mode.</returns>
        /// <exception cref="PdfDocumentFormatException">Lenient parsing is disabled.</exception>
        private static HeaderVersion HandleMissingVersion(CommentToken comment, bool isLenientParsing, ILog log)
        {
            if (!isLenientParsing)
            {
                throw new PdfDocumentFormatException($"The comment which should have provided the version was in the wrong format: {comment.Data}.");
            }

            log.Warn($"Did not find a version header of the correct format, defaulting to 1.4 since lenient. Header was: {comment.Data}.");

            return new HeaderVersion(1.4m, "PDF-1.4", 0);
        }
            /// <summary>
            /// Merging a section that holds a single comment token must leave that
            /// token in place with its lines unchanged.
            /// </summary>
            public void When_HasSingleCommentToken()
            {
                // Arrange
                var iniSection = new ConfigIniSection();
                var onlyComment = new CommentToken(new[] { "; Hey", ";Whats", " ;Up?" }, LineEnding.None);
                iniSection.Tokens.Add(onlyComment);

                // Act
                iniSection.MergeConsecutiveTokens();

                // Assert
                Assert.That(iniSection.Tokens, Has.Count.EqualTo(1));
                Assert.That(iniSection.Tokens[0], Is.SameAs(onlyComment));
                Assert.That(onlyComment.GetStringLines(), Is.EquivalentTo(new[] { "; Hey", ";Whats", " ;Up?" }));
            }
Exemple #11
0
        /// <summary>
        /// A comment containing a line break must be scanned as one valid
        /// comment token covering positions 0 through 37.
        /// </summary>
        public void Test_3_5_2_Comments_A()
        {
            Scanner scanner = this.GetLexer("\" This is a comment \n with two lines \"");

            object raw = scanner.GetToken();
            Assert.IsInstanceOfType(raw, typeof(CommentToken));

            CommentToken comment = (CommentToken)raw;
            Assert.IsTrue(comment.IsValid);
            Assert.IsNull(comment.ScanError);
            Assert.AreEqual(0, comment.StartPosition.Position);
            Assert.AreEqual(37, comment.StopPosition.Position);
            Assert.AreEqual(" This is a comment \n with two lines ", comment.Value);
        }
Exemple #12
0
        /// <summary>
        /// Lexes a Turbo Pascal-style comment that begins at the current position of
        /// <paramref name="context"/>, collecting any old-style or Delphi comments nested inside it.
        /// </summary>
        /// <param name="context">Lexer state; its index is advanced past the scanned text.</param>
        /// <returns>
        /// A comment token that is flagged single-line only when no line break was seen
        /// and every nested comment is itself single-line.
        /// </returns>
        /// <remarks>
        /// If the input ends before the end marker is found, the token simply extends to the
        /// end of input; no error is raised here.
        /// </remarks>
        private static CommentToken ResolveTurboPascalComment(LexerContext context)
        {
            var begin = context.GetIndex();

            // Step over the opening marker.
            context.IncIndex();
            var inners    = new List<CommentToken>();
            var isOneLine = true;

            while (!context.IsEnded())
            {
                if (CommentToken.IsOldStyleCommentBegin(context))
                {
                    inners.Add(ResolveOldStyleComment(context));
                    continue;
                }

                if (CommentToken.IsDelphiCommentBegin(context))
                {
                    inners.Add(ResolveDelphiComment(context));
                    continue;
                }

                if (context.IsNewLineNow())
                {
                    isOneLine = false;
                    for (var i = 0; i < Environment.NewLine.Length; i++)
                    {
                        context.IncIndex();
                    }

                    // Restart the loop so that the character after the line break gets
                    // the full set of checks (end of input, nested comment openers,
                    // further line breaks) instead of being skipped blindly.
                    continue;
                }

                if (CommentToken.IsTurboPascalCommentEnd(context))
                {
                    // Consume the closing marker and stop.
                    context.IncIndex();
                    break;
                }

                context.IncIndex();
            }

            return new CommentToken(isOneLine && inners.All(c => c.IsSingleLineComment), inners,
                                    begin, context.GetIndex());
        }
Exemple #13
0
 /// <summary>
 /// Dispatches to the resolver for the comment flavour that starts at the
 /// current position of <paramref name="context"/>.
 /// </summary>
 /// <param name="context">Lexer state positioned at a comment opener.</param>
 /// <returns>The comment token produced by the selected resolver.</returns>
 private static CommentToken ResolveComment(LexerContext context)
 {
     if (CommentToken.IsOldStyleCommentBegin(context))
     {
         return ResolveOldStyleComment(context);
     }

     if (CommentToken.IsTurboPascalCommentBegin(context))
     {
         return ResolveTurboPascalComment(context);
     }

     // Neither of the other two flavours matched, so treat it as a Delphi comment.
     return ResolveDelphiComment(context);
 }
Exemple #14
0
        /// <summary>
        /// Attempts to read a comment tag at the start of the given slice.
        /// </summary>
        /// <param name="processor">The processor supplying the current start/end tags; its current token is set once the tag id is seen</param>
        /// <param name="slice">The slice to read from; advanced past the end tag on success</param>
        /// <returns>True if a complete comment tag was read; false if the tag id is absent or no end tag was found</returns>
        public override bool Match(Processor processor, ref StringSlice slice)
        {
            var tagStart = slice.Start - processor.CurrentTags.StartTag.Length;

            // Skip any whitespace between the start tag and the tag id.
            var cursor = slice.Start;
            while (slice[cursor].IsWhitespace())
            {
                cursor += 1;
            }

            if (slice[cursor] != TagId)
            {
                return false;
            }

            slice.Start = cursor;

            var commentTag = new CommentToken
            {
                TagStartPosition     = tagStart,
                ContentStartPosition = cursor + 1,
                IsClosed             = false
            };

            // Registered before the end tag is found, so it stays set (and unclosed)
            // when the slice runs out below.
            processor.CurrentToken = commentTag;

            // Advance until the end tag matches or the slice is exhausted.
            while (!(slice.IsEmpty || slice.Match(processor.CurrentTags.EndTag)))
            {
                slice.NextChar();
            }

            if (slice.IsEmpty)
            {
                return false;
            }

            commentTag.TagEndPosition     = slice.Start + processor.CurrentTags.EndTag.Length;
            commentTag.ContentEndPosition = slice.Start;
            commentTag.IsClosed           = true;

            slice.Start += processor.CurrentTags.EndTag.Length;
            return true;
        }
Exemple #15
0
        /// <summary>
        /// Reads a ';'-prefixed comment from the current position of the file, up to
        /// (but not including) the line end or the end of the stream.
        /// </summary>
        /// <param name="fbxAsciiFileInfo">The source being read; characters are consumed only when a comment is present.</param>
        /// <param name="commentToken">The parsed comment, or <c>null</c> when the next character is not ';'.</param>
        /// <returns>True when a comment was read; otherwise false.</returns>
        public static bool TryParseCommentToken(FbxAsciiFileInfo fbxAsciiFileInfo, out CommentToken commentToken)
        {
            var upcoming = fbxAsciiFileInfo.PeekChar();

            if (upcoming == ';')
            {
                var text = new StringBuilder();

                // Consume characters one at a time, peeking ahead to decide when to stop.
                while (!(upcoming.IsLineEnd() || fbxAsciiFileInfo.IsEndOfStream()))
                {
                    text.Append(fbxAsciiFileInfo.ReadChar());
                    upcoming = fbxAsciiFileInfo.PeekChar();
                }

                commentToken = new CommentToken(text.ToString());
                return true;
            }

            commentToken = null;
            return false;
        }
Exemple #16
0
        /// <summary>
        /// Reads a '%'-introduced comment: every byte after <paramref name="currentByte"/>
        /// up to the first end-of-line byte (or the end of input) becomes the comment text.
        /// </summary>
        /// <param name="currentByte">The byte already read by the caller; must be '%' for a comment.</param>
        /// <param name="inputBytes">The byte source, advanced while reading the comment.</param>
        /// <param name="token">The resulting comment token, or <c>null</c> when this is not a comment.</param>
        /// <returns>True when a comment token was produced; otherwise false.</returns>
        public bool TryTokenize(byte currentByte, IInputBytes inputBytes, out IToken token)
        {
            if (currentByte != '%')
            {
                token = null;
                return false;
            }

            var text = new StringBuilder();

            while (inputBytes.MoveNext())
            {
                if (ReadHelper.IsEndOfLine(inputBytes.CurrentByte))
                {
                    break;
                }

                // Each byte is widened directly to a char.
                text.Append((char)inputBytes.CurrentByte);
            }

            token = new CommentToken(text.ToString());
            return true;
        }
            /// <summary>
            /// Two comment tokens separated by a whitespace token are not consecutive,
            /// so merging must leave all three tokens and their lines untouched.
            /// </summary>
            public void When_Has3NonConsecutiveTokens()
            {
                // Arrange
                var iniSection = new ConfigIniSection();
                var leadingComment = new CommentToken(new[] { "; Hey", ";Whats", " ;Up?" }, LineEnding.None);
                var separator = new WhitespaceToken(new[] { " ", "\t", "" }, LineEnding.None);
                var trailingComment = new CommentToken(new[] { ";Baz" }, LineEnding.None);

                iniSection.Tokens.Add(leadingComment);
                iniSection.Tokens.Add(separator);
                iniSection.Tokens.Add(trailingComment);

                // Act
                iniSection.MergeConsecutiveTokens();

                // Assert: same instances, same order, same content.
                Assert.That(iniSection.Tokens, Has.Count.EqualTo(3));
                Assert.That(iniSection.Tokens[0], Is.SameAs(leadingComment));
                Assert.That(iniSection.Tokens[1], Is.SameAs(separator));
                Assert.That(iniSection.Tokens[2], Is.SameAs(trailingComment));
                Assert.That(leadingComment.GetStringLines(), Is.EquivalentTo(new[] { "; Hey", ";Whats", " ;Up?" }));
                Assert.That(separator.GetStringLines(), Is.EquivalentTo(new[] { " ", "\t", "" }));
                Assert.That(trailingComment.GetStringLines(), Is.EquivalentTo(new[] { ";Baz" }));
            }
        /*
         * Method:  FindNextToken
         *
         * Scans a single token beginning at the reader's current position and
         * stores it in 'current'. Branches are tried in this order: whitespace,
         * comments, char literals, verbatim strings, regular strings,
         * identifiers/keywords, scopes, hex and decimal integers,
         * operators/punctuators, preprocessor lines.
         *
         * Every path visible in this method returns 'true': malformed input is
         * reported through dedicated error token types, and input that matches no
         * branch consumes one character and yields an UnrecognizedToken.
         */
        override internal bool FindNextToken()
        {
            // Remembered so the matched text can be retrieved after sinking characters.
            int startPosition = _reader.Position;

            // Dealing with whitespace?
            if (_reader.SinkMultipleWhiteSpace())
            {
                current = new WhitespaceToken();
                return(true);
            }
            // Check for one-line comment
            else if (_reader.Sink("//"))
            {
                // Looks like a one-line comment. Follow it to the End-of-line
                _reader.SinkToEndOfLine();

                current = new CommentToken();
                return(true);
            }
            // Check for multi-line comment
            else if (_reader.Sink("/*"))
            {
                _reader.SinkUntil("*/");

                // Was the ending */ found?
                if (_reader.EndOfLines)
                {
                    // No. There was a /* without a */. Return this a syntax error token.
                    current = new CSharpTokenizer.EndOfFileInsideCommentToken();
                    return(true);
                }

                current = new CommentToken();
                return(true);
            }
            // Handle chars
            else if (_reader.Sink("\'"))
            {
                // NOTE(review): this loop has no explicit end-of-input check; whether an
                // unterminated char literal terminates depends on what CurrentCharacter
                // and SinkCharacter do at end of input — confirm.
                while (_reader.CurrentCharacter != '\'')
                {
                    if (_reader.Sink("\\"))
                    {
                        /* reader.Skip the escape sequence.
                         *  This isn't exactly right. We should detect:
                         *
                         *  simple-escape-sequence: one of
                         \' \" \\ \0 \a \b \f \n \r \t \v
                         *
                         *  hexadecimal-escape-sequence:
                         *  \x   hex-digit   hex-digit[opt]   hex-digit[opt]  hex-digit[opt]
                         */
                    }

                    // Consumes the literal character, or the character following a backslash.
                    _reader.SinkCharacter();
                }

                if (_reader.SinkCharacter() != '\'')
                {
                    Debug.Assert(false, "Code defect in tokenizer: Should have yielded a closing tick.");
                }
                current = new CSharpTokenizer.CharLiteralToken();
                return(true);
            }
            // Check for verbatim string
            else if (_reader.Sink("@\""))
            {
                do
                {
                    // Inside a verbatim string "" is treated as a special character
                    while (_reader.Sink("\"\""))
                    {
                    }
                }while (!_reader.EndOfLines && _reader.SinkCharacter() != '\"');

                // Can't end a file inside a string
                if (_reader.EndOfLines)
                {
                    current = new EndOfFileInsideStringToken();
                    return(true);
                }

                // The closing quote was already consumed by the loop above.
                // Substring(1) drops the first character of the matched text (the '@').
                current           = new StringLiteralToken();
                current.InnerText = _reader.GetCurrentMatchedString(startPosition).Substring(1);
                return(true);
            }
            // Check for a quoted string.
            else if (_reader.Sink("\""))
            {
                while (_reader.CurrentCharacter == '\\' || _reader.MatchRegularStringLiteral())
                {
                    // See if we have an escape sequence.
                    if (_reader.SinkCharacter() == '\\')
                    {
                        // This is probably an escape character.
                        if (_reader.SinkStringEscape())
                        {
                            // This isn't nearly right. We just do barely enough to make a string
                            // with an embedded escape sequence return _some_ string whose start and
                            // end match the real bounds of the string.
                        }
                        else
                        {
                            // This is a compiler error.
                            _reader.SinkCharacter();
                            current = new CSharpTokenizer.UnrecognizedStringEscapeToken();
                            return(true);
                        }
                    }
                }

                // Is it a newline?
                if (TokenChar.IsNewLine(_reader.CurrentCharacter))
                {
                    current = new CSharpTokenizer.NewlineInsideStringToken();
                    return(true);
                }

                // Create the token.
                if (_reader.SinkCharacter() != '\"')
                {
                    Debug.Assert(false, "Defect in tokenizer: Should have yielded a terminating quote.");
                }
                // NOTE(review): unlike the verbatim-string branch, InnerText is not
                // assigned here — confirm whether that is intentional.
                current = new StringLiteralToken();
                return(true);
            }
            // Identifier or keyword?
            else if
            (
                // From 2.4.2 Identifiers: A '@' can be used to prefix an identifier so that a keyword can be used as an identifier.
                _reader.CurrentCharacter == '@' ||
                _reader.MatchNextIdentifierStart()
            )
            {
                if (_reader.CurrentCharacter == '@')
                {
                    _reader.SinkCharacter();
                }

                // Now, the next character must be an identifier start.
                if (!_reader.SinkIdentifierStart())
                {
                    current = new ExpectedIdentifierToken();
                    return(true);
                }

                // Sink the rest of the identifier.
                while (_reader.SinkIdentifierPart())
                {
                }
                string identifierOrKeyword = _reader.GetCurrentMatchedString(startPosition);

                // Every section of this switch returns, so control never falls out of it.
                // The default section is listed first but only runs when no case label matches.
                switch (identifierOrKeyword)
                {
                default:

                    if (Array.IndexOf(s_keywordList, identifierOrKeyword) >= 0)
                    {
                        current = new KeywordToken();
                        return(true);
                    }

                    // If the identifier starts with '@' then we need to strip it off.
                    // The '@' is for escaping so that we can have an identifier called
                    // the same thing as a reserved keyword (i.e. class, if, foreach, etc)
                    string identifier = _reader.GetCurrentMatchedString(startPosition);
                    if (identifier.StartsWith("@", StringComparison.Ordinal))
                    {
                        identifier = identifier.Substring(1);
                    }

                    // Create the token.
                    current           = new IdentifierToken();
                    current.InnerText = identifier;
                    return(true);

                case "false":
                case "true":
                    current = new BooleanLiteralToken();
                    return(true);

                case "null":
                    current = new CSharpTokenizer.NullLiteralToken();
                    return(true);
                }
            }
            // Open scope
            else if (_reader.Sink("{"))
            {
                current = new CSharpTokenizer.OpenScopeToken();
                return(true);
            }
            // Close scope
            else if (_reader.Sink("}"))
            {
                current = new CSharpTokenizer.CloseScopeToken();
                return(true);
            }
            // Hexadecimal integer literal
            else if (_reader.SinkIgnoreCase("0x"))
            {
                // Sink the hex digits.
                if (!_reader.SinkMultipleHexDigits())
                {
                    current = new ExpectedValidHexDigitToken();
                    return(true);
                }

                // Skip the L, U, l, u, ul, etc.
                _reader.SinkLongIntegerSuffix();

                current = new HexIntegerLiteralToken();
                return(true);
            }
            // Decimal integer literal
            else if (_reader.SinkMultipleDecimalDigits())
            {
                // Skip the L, U, l, u, ul, etc.
                _reader.SinkLongIntegerSuffix();

                current = new DecimalIntegerLiteralToken();
                return(true);
            }
            // Check for operators and punctuators
            else if (_reader.SinkOperatorOrPunctuator())
            {
                current = new OperatorOrPunctuatorToken();
                return(true);
            }
            // Preprocessor line
            else if (_reader.CurrentCharacter == '#')
            {
                // Only #if and #endif get dedicated token types; any other
                // line starting with '#' becomes a generic PreprocessorToken.
                if (_reader.Sink("#if"))
                {
                    current = new OpenConditionalDirectiveToken();
                }
                else if (_reader.Sink("#endif"))
                {
                    current = new CloseConditionalDirectiveToken();
                }
                else
                {
                    current = new PreprocessorToken();
                }

                // The remainder of the directive line is consumed as part of the token.
                _reader.SinkToEndOfLine();

                return(true);
            }

            // We didn't recognize the token, so this is a syntax error.
            // One character is consumed so that the next call makes progress.
            _reader.SinkCharacter();
            current = new UnrecognizedToken();
            return(true);
        }
Exemple #19
0
 /// <summary>
 /// Handles a comment token by forwarding it to <c>applyToParseTreeNode</c>.
 /// Override to give comment tokens their own treatment.
 /// </summary>
 /// <param name="operand">The comment token being processed.</param>
 /// <returns>Whatever <c>applyToParseTreeNode</c> returns for <paramref name="operand"/>.</returns>
 public virtual T applyToCommentToken(CommentToken operand)
 {
     T outcome = applyToParseTreeNode(operand);
     return outcome;
 }
Exemple #20
0
        /// <summary>
        /// Converts a comment token into a <c>Comment</c>, choosing the comment type from
        /// the token's type marker.
        /// </summary>
        /// <param name="token">The token to convert.</param>
        /// <returns>
        /// A comment whose value is the token content; for an unrecognised marker the type
        /// is <c>Translator</c> and the marker is prepended to the content.
        /// </returns>
        private static Comment ReadComment(CommentToken token)
        {
            CommentType category;
            var markerBelongsToText = false;

            switch (token.Type)
            {
                case ":":
                    category = CommentType.Reference;
                    break;
                case ".":
                    category = CommentType.Extracted;
                    break;
                case ",":
                    category = CommentType.Flag;
                    break;
                case "|":
                    category = CommentType.Previous;
                    break;
                default:
                    // Not a known marker: treat it as the first part of a translator comment.
                    category = CommentType.Translator;
                    markerBelongsToText = true;
                    break;
            }

            string text;
            if (markerBelongsToText)
            {
                text = token.Type + token.Content;
            }
            else
            {
                text = token.Content;
            }

            return new Comment { Type = category, Value = text };
        }
Exemple #21
0
 /// <summary>
 /// Emits a comment token: the opening quote, the visited content, then the closing quote.
 /// </summary>
 /// <param name="token">The comment token whose content is visited.</param>
 protected virtual void VisitCommentToken(CommentToken token)
 {
     State.Write(CommentOpenQuote);

     VisitToken(token.Content);

     State.Write(CommentCloseQuote);
 }
Exemple #22
0
        /// <summary>
        /// Main lexing routine: consumes the source text and produces the list of tokens.
        /// At each position the token kinds are tried in a fixed order — number, string,
        /// identifier, comment, symbol — and whitespace between tokens is skipped.
        /// </summary>
        /// <param name="s">The source text to tokenize.</param>
        /// <returns>The tokens in the order they appear in <paramref name="s"/>.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="s"/> is null.</exception>
        /// <exception cref="FormatException">
        /// A character was found that starts no known token and is not whitespace.
        /// </exception>
        public static IToken[] Parse(string s)
        {
            if (s == null)
            {
                throw new ArgumentNullException(nameof(s));
            }

            // Tokens are only ever appended, so a plain list is sufficient.
            var tokens = new List<IToken>();
            var index  = 0;
            int next;

            while (index < s.Length)
            {
                // try to parse as number
                if (NumberToken.ParseNumber(s, index, out var tokenNumber, out next))
                {
                    tokens.Add(tokenNumber);
                    index = next;
                    continue;
                }

                // then try to parse as string
                if (StringToken.ParseString(s, index, out var tokenString, out next))
                {
                    tokens.Add(tokenString);
                    index = next;
                    continue;
                }

                // then try to parse as identifier
                if (IdentifierToken.ParseIdentifier(s, index, out var tokenIdentifier, out next))
                {
                    tokens.Add(tokenIdentifier);
                    index = next;
                    continue;
                }

                // then try to parse as comment; a comment may expand to several tokens
                if (CommentToken.ParseComment(s, index, out var tokensComment, out next))
                {
                    foreach (var t in tokensComment)
                    {
                        tokens.Add(t);
                    }
                    index = next;
                    continue;
                }

                // then try to parse as symbol token
                if (SymbolToken.ParseSymbol(s, index, out var tokenSymbol, out next))
                {
                    tokens.Add(tokenSymbol);
                    index = next;
                    continue;
                }

                if (Char.IsWhiteSpace(s[index]))
                {
                    // skip spaces
                    index++;
                    continue;
                }

                // Otherwise the token is unknown. FormatException derives from Exception,
                // so existing catch (Exception) handlers keep working.
                throw new FormatException("unknown token " + s[index] + " at position " + index);
            }

            // return collected tokens
            return tokens.ToArray();
        }
Exemple #23
0
        /*
        * Method:  FindNextToken
        *
        * Find the next token. Return 'true' if one was found. False, otherwise.
        */
        internal override bool FindNextToken()
        {
            int startPosition = _reader.Position;

            // VB docs claim whitespace is Unicode category Zs. However,
            // this category does not contain tabs. Assuming a less restrictive
            // definition for whitespace...
            if (_reader.SinkWhiteSpace())
            {
                while (_reader.SinkWhiteSpace())
                {
                }

                // Now, we need to check for the line continuation character.
                if (_reader.SinkLineContinuationCharacter())    // Line continuation is '_'
                {
                    // Save the current position because we may need to come back here.
                    int savePosition = _reader.Position - 1;

                    // Skip all whitespace after the '_'
                    while (_reader.SinkWhiteSpace())
                    {
                    }

                    // Now, skip all the newlines.
                    // Need at least one newline for this to count as line continuation.
                    int count = 0;
                    while (_reader.SinkNewLine())
                    {
                        ++count;
                    }

                    if (count > 0)
                    {
                        current = new VisualBasicTokenizer.LineContinuationToken();
                        return true;
                    }

                    // Otherwise, fall back to plain old whitespace.
                    _reader.Position = savePosition;
                }

                current = new WhitespaceToken();
                return true;
            }
            // Line terminators are separate from whitespace and are significant.
            else if (_reader.SinkNewLine())
            {
                // We want one token per line terminator.
                current = new VisualBasicTokenizer.LineTerminatorToken();
                return true;
            }
            // Check for a comment--either those that start with ' or rem.
            else if (_reader.SinkLineCommentStart())
            {
                // Skip to the first EOL.
                _reader.SinkToEndOfLine();

                current = new CommentToken();
                return true;
            }
            // Identifier or keyword?
            else if
            (
                // VB allows escaping of identifiers by surrounding them with []
                // In other words,
                //      Date is a keyword but,
                //      [Date] is an identifier.
                _reader.CurrentCharacter == '[' ||
                _reader.MatchNextIdentifierStart()
            )
            {
                bool escapedIdentifier = false;
                if (_reader.CurrentCharacter == '[')
                {
                    escapedIdentifier = true;
                    _reader.SinkCharacter();

                    // Now, the next character must be an identifier start.
                    if (!_reader.SinkIdentifierStart())
                    {
                        current = new ExpectedIdentifierToken();
                        return true;
                    }
                }

                // Sink the rest of the identifier.
                while (_reader.SinkIdentifierPart())
                {
                }

                // If this was an escaped identifier the we need to get the terminating ']'.
                if (escapedIdentifier)
                {
                    if (!_reader.Sink("]"))
                    {
                        current = new ExpectedIdentifierToken();
                        return true;
                    }
                }
                else
                {
                    // Escaped identifiers are not allowed to have trailing type character.
                    _reader.SinkTypeCharacter(); // Type character is optional.
                }

                // An identifier that is only a '_' is illegal because it is
                // ambiguous with line continuation
                string identifierOrKeyword = _reader.GetCurrentMatchedString(startPosition);
                if (identifierOrKeyword == "_" || identifierOrKeyword == "[_]" || identifierOrKeyword == "[]")
                {
                    current = new ExpectedIdentifierToken();
                    return true;
                }

                // Make an upper-case version in order to check whether this may be a keyword.
                string upper = identifierOrKeyword.ToUpper(CultureInfo.InvariantCulture);

                switch (upper)
                {
                    default:

                        if (Array.IndexOf(s_keywordList, upper) >= 0)
                        {
                            current = new KeywordToken();
                            return true;
                        }

                        // Create the token.
                        current = new IdentifierToken();

                        // Trim off the [] if this is an escaped identifier.
                        if (escapedIdentifier)
                        {
                            current.InnerText = identifierOrKeyword.Substring(1, identifierOrKeyword.Length - 2);
                        }
                        return true;
                    case "FALSE":
                    case "TRUE":
                        current = new BooleanLiteralToken();
                        return true;
                }
            }
            // Is it a hex integer?
            else if (_reader.SinkHexIntegerPrefix())
            {
                if (!_reader.SinkMultipleHexDigits())
                {
                    current = new ExpectedValidHexDigitToken();
                    return true;
                }

                // Sink a suffix if there is one.
                _reader.SinkIntegerSuffix();

                current = new HexIntegerLiteralToken();
                return true;
            }
            // Is it an octal integer?
            else if (_reader.SinkOctalIntegerPrefix())
            {
                if (!_reader.SinkMultipleOctalDigits())
                {
                    current = new VisualBasicTokenizer.ExpectedValidOctalDigitToken();
                    return true;
                }

                // Sink a suffix if there is one.
                _reader.SinkIntegerSuffix();

                current = new VisualBasicTokenizer.OctalIntegerLiteralToken();
                return true;
            }
            // Is it a decimal integer?
            else if (_reader.SinkMultipleDecimalDigits())
            {
                // Sink a suffix if there is one.
                _reader.SinkDecimalIntegerSuffix();

                current = new DecimalIntegerLiteralToken();
                return true;
            }
            // Preprocessor line
            else if (_reader.CurrentCharacter == '#')
            {
                if (_reader.SinkIgnoreCase("#if"))
                {
                    current = new OpenConditionalDirectiveToken();
                }
                else if (_reader.SinkIgnoreCase("#end if"))
                {
                    current = new CloseConditionalDirectiveToken();
                }
                else
                {
                    current = new PreprocessorToken();
                }

                _reader.SinkToEndOfLine();

                return true;
            }
            // Is it a separator?
            else if (_reader.SinkSeparatorCharacter())
            {
                current = new VisualBasicTokenizer.SeparatorToken();
                return true;
            }
            // Is it an operator?
            else if (_reader.SinkOperator())
            {
                current = new OperatorToken();
                return true;
            }
            // A string?
            else if (_reader.Sink("\""))
            {
                do
                {
                    // Inside a verbatim string "" is treated as a special character
                    while (_reader.Sink("\"\""))
                    {
                    }
                }
                while (!_reader.EndOfLines && _reader.SinkCharacter() != '\"');

                // Can't end a file inside a string
                if (_reader.EndOfLines)
                {
                    current = new EndOfFileInsideStringToken();
                    return true;
                }

                current = new StringLiteralToken();
                return true;
            }

            // We didn't recognize the token, so this is a syntax error.
            _reader.SinkCharacter();
            current = new UnrecognizedToken();
            return true;
        }
 /// <summary>
 /// Builds a comment node from the text carried by <paramref name="token"/> and
 /// appends it to <paramref name="doc"/>.
 /// </summary>
 /// <remarks>
 /// Spec reference: http://www.w3.org/TR/html5/syntax.html#insert-a-comment
 /// TODO - make sure the steps conform with the specs in the link above.
 /// </remarks>
 /// <param name="token">Comment token whose <c>Comment</c> value becomes the node content.</param>
 /// <param name="doc">Document that both owns the new node and receives it as a child.</param>
 public void InsertComment(CommentToken token, IDocument doc)
 {
     IComment node = new Comment(doc, token.Comment);

     doc.appendChild(node);
 }
Exemple #25
0
        /// <summary>
        /// Lazily converts the characters of <paramref name="source"/> into a sequence of tokens.
        /// </summary>
        /// <remarks>
        /// Characters that are neither whitespace nor one of the punctuation characters handled
        /// below are accumulated in a buffer. The buffer is handed to <c>FlushBuffer</c> whenever
        /// whitespace, punctuation or the end of input is reached, and any token it returns is
        /// yielded. Line number, column and absolute position all start at 1.
        /// </remarks>
        /// <param name="source">Reader to pull characters from; consumed until <c>Read()</c> yields no value.</param>
        /// <returns>The tokens in source order, produced on demand.</returns>
        public IEnumerable <IToken> Tokenize(SafeStreamReader source)
        {
            uint lineNumber = 1;
            uint lexemeStartPositionInLine = 1;
            uint absolutePosition          = 1;
            var  maybeCurrentChar          = Option <int> .None;

            var currentLexemeBuffer = new StringBuilder();

            var maybeToken = Option <IToken> .None;

            while ((maybeCurrentChar = source.Read()).IsSome)
            {
                var currentChar = maybeCurrentChar.Value();

                maybeToken = Option <IToken> .None;

                switch (currentChar)
                {
                case var c when string.IsNullOrWhiteSpace(char.ConvertFromUtf32(c)):
                    // Whitespace terminates the current lexeme: emit whatever the
                    // buffer produced, then handle the whitespace character itself.
                    maybeToken = FlushBuffer(
                        currentLexemeBuffer,
                        ref absolutePosition,
                        lineNumber,
                        ref lexemeStartPositionInLine
                        );

                    if (maybeToken.IsSome)
                    {
                        yield return(maybeToken.ValueUnsafe());
                    }

                    switch (c)
                    {
                    case '\r':
                        // "\r\n" is a new line; a '\r' followed by anything else (or by
                        // the end of input) is reported as unrecognized. The following
                        // character is consumed either way.
                        // NOTE(review): position/line are advanced as for "\r\n" even when
                        // the follower was not '\n' or was missing - confirm this is intended.
                        yield return(source.Read()
                                     .Some <IToken>(cn =>
                                                    cn == '\n' ?
                                                    (IToken) new NewLineSymbolToken(
                                                        absolutePosition,
                                                        lineNumber,
                                                        lexemeStartPositionInLine
                                                        ) :
                                                    (IToken) new UnrecognizedToken(
                                                        // Cast to char so the lexeme holds the character,
                                                        // not the decimal value of the int code.
                                                        $"\r{(char)cn}",
                                                        absolutePosition,
                                                        lineNumber,
                                                        lexemeStartPositionInLine
                                                        )
                                                    )
                                     .None(new UnrecognizedToken(
                                               "\r",
                                               absolutePosition,
                                               lineNumber,
                                               lexemeStartPositionInLine
                                               ))
                                     );

                        absolutePosition         += 2;
                        lineNumber               += 1;
                        lexemeStartPositionInLine = 1;

                        break;

                    case '\n':
                        yield return(new NewLineSymbolToken(
                                         absolutePosition,
                                         lineNumber,
                                         lexemeStartPositionInLine
                                         ));

                        absolutePosition         += 1;
                        lineNumber               += 1;
                        lexemeStartPositionInLine = 1;

                        break;

                    default:
                        // Any other whitespace is skipped; only the counters move.
                        absolutePosition          += 1;
                        lexemeStartPositionInLine += 1;
                        break;
                    }

                    break;

                case '.':
                    // A dot may be a decimal point (next char is a digit), the first half
                    // of the range symbol "..", or a plain dot symbol.
                    var currentLexeme = currentLexemeBuffer.ToString();

                    // Token for the text buffered before the dot: the first factory
                    // that yields a value wins.
                    var maybeBeforeToken =
                        IntegerLiteralToken.FromString(
                            currentLexeme,
                            absolutePosition,
                            lineNumber,
                            lexemeStartPositionInLine
                            ) ||
                        IdentifierToken.FromString(
                            currentLexeme,
                            absolutePosition,
                            lineNumber,
                            lexemeStartPositionInLine
                            ) ||
                        UnrecognizedToken.FromString(
                            currentLexeme,
                            absolutePosition,
                            lineNumber,
                            lexemeStartPositionInLine
                            )
                    ;

                    var dotTokens =
                        source.Peek()
                        .Some <ImmutableList <IToken> >(c =>
                    {
                        var result        = ImmutableList <IToken> .Empty;
                        IToken tokenToAdd = null;

                        switch (c)
                        {
                        case var _ when IsDigit(char.ConvertFromUtf32(c)):
                            // Decimal point: keep buffering, nothing to emit yet.
                            currentLexemeBuffer.Append('.');
                            return(ImmutableList <IToken> .Empty);

                        case '.':
                            // Range symbol "..": step over the preceding token first
                            // so the symbol is positioned right after it.
                            absolutePosition += maybeBeforeToken
                                                .Map(t => (uint)t.Lexeme.Length)
                                                .IfNone(0);
                            lexemeStartPositionInLine += maybeBeforeToken
                                                         .Some(t => (uint)t.Lexeme.Length)
                                                         .None(0u);

                            tokenToAdd = new RangeSymbolToken(
                                absolutePosition,
                                lineNumber,
                                lexemeStartPositionInLine
                                );

                            result = maybeBeforeToken
                                     .ToImmutableList()
                                     .Add(tokenToAdd);
                            // Consume the second dot that was only peeked so far.
                            source.Read();
                            currentLexemeBuffer.Clear();
                            lexemeStartPositionInLine += (uint)(tokenToAdd?.Lexeme.Length ?? 0);
                            absolutePosition          += (uint)(tokenToAdd?.Lexeme.Length ?? 0);

                            return(result);

                        default:
                            // Plain dot symbol following the buffered token.
                            absolutePosition += maybeBeforeToken
                                                .Map(t => (uint)t.Lexeme.Length)
                                                .IfNone(0);
                            lexemeStartPositionInLine += maybeBeforeToken
                                                         .Some(t => (uint)t.Lexeme.Length)
                                                         .None(0u);

                            tokenToAdd = new DotSymbolToken(
                                absolutePosition,
                                lineNumber,
                                lexemeStartPositionInLine
                                );

                            result = maybeBeforeToken
                                     .ToImmutableList()
                                     .Add(tokenToAdd);
                            currentLexemeBuffer.Clear();
                            lexemeStartPositionInLine += (uint)(tokenToAdd?.Lexeme.Length ?? 0);
                            absolutePosition          += (uint)(tokenToAdd?.Lexeme.Length ?? 0);

                            return(result);
                        }
                    })
                        .None(() =>
                    {
                        // The dot is the last character of the input: plain dot symbol.
                        absolutePosition += maybeBeforeToken
                                            .Map(t => (uint)t.Lexeme.Length)
                                            .IfNone(0);
                        lexemeStartPositionInLine += maybeBeforeToken
                                                     .Some(t => (uint)t.Lexeme.Length)
                                                     .None(0u);

                        var tokenToAdd = new DotSymbolToken(
                            absolutePosition,
                            lineNumber,
                            lexemeStartPositionInLine
                            );

                        var result = maybeBeforeToken
                                     .ToImmutableList()
                                     .Add(tokenToAdd);
                        currentLexemeBuffer.Clear();
                        lexemeStartPositionInLine += (uint)(tokenToAdd?.Lexeme.Length ?? 0);
                        absolutePosition          += (uint)(tokenToAdd?.Lexeme.Length ?? 0);

                        return(result);
                    })
                    ;

                    foreach (var token in dotTokens)
                    {
                        yield return(token);
                    }

                    break;

                case '/':
                    // '/' starts a line comment ("//"), the not-equals operator ("/=")
                    // or is the divide operator on its own.
                    maybeToken = FlushBuffer(
                        currentLexemeBuffer,
                        ref absolutePosition,
                        lineNumber,
                        ref lexemeStartPositionInLine
                        );
                    if (maybeToken.IsSome)
                    {
                        yield return(maybeToken.ValueUnsafe());
                    }

                    yield return(source.Peek()
                                 .Some <IToken>(c =>
                    {
                        switch (c)
                        {
                        case '/':
                            // ReadLine starts at the second slash, so the first one
                            // is prepended to rebuild the full comment text.
                            var commentContent = source.ReadLine();

                            var commentToken = new CommentToken(
                                $"/{commentContent}",
                                absolutePosition,
                                lineNumber,
                                lexemeStartPositionInLine
                                );

                            // NOTE(review): the line terminator consumed by ReadLine is not
                            // added to absolutePosition and no new-line token is emitted for
                            // it - confirm this is intended.
                            absolutePosition += (uint)commentContent.Length;
                            lineNumber += 1;
                            // Set to 0 because the shared increment after this
                            // expression brings the column back to 1.
                            lexemeStartPositionInLine = 0;

                            return commentToken;

                        case '=':
                            var notEqualsToken = new NotEqualsOperatorToken(
                                absolutePosition,
                                lineNumber,
                                lexemeStartPositionInLine
                                );

                            // Consume the '=' and account for it. The column must advance
                            // (not reset) - same bookkeeping as ":=", ">=" and "<=".
                            source.Read();
                            absolutePosition += 1;
                            lexemeStartPositionInLine += 1;

                            return notEqualsToken;

                        default:
                            // Positioned with the tokenizer's own character counter, like
                            // every other token, rather than the underlying stream offset.
                            return new DivideOperatorToken(
                                absolutePosition,
                                lineNumber,
                                lexemeStartPositionInLine
                                );
                        }
                    })
                                 .None(() => new DivideOperatorToken(
                                           absolutePosition,
                                           lineNumber,
                                           lexemeStartPositionInLine
                                           )));

                    // Accounts for the leading '/' itself.
                    absolutePosition          += 1;
                    lexemeStartPositionInLine += 1;

                    break;

                case ':':
                    // ":=" is assignment; a lone ':' is the colon symbol.
                    maybeToken = FlushBuffer(
                        currentLexemeBuffer,
                        ref absolutePosition,
                        lineNumber,
                        ref lexemeStartPositionInLine
                        );
                    if (maybeToken.IsSome)
                    {
                        yield return(maybeToken.ValueUnsafe());
                    }

                    yield return(source.Peek()
                                 .Filter(c => c == '=')
                                 .Some <IToken>(c =>
                    {
                        var result = new AssignmentOperatorToken(
                            absolutePosition,
                            lineNumber,
                            lexemeStartPositionInLine
                            );

                        source.Read();
                        absolutePosition += 1;
                        lexemeStartPositionInLine += 1;

                        return result;
                    })
                                 .None(new ColonSymbolToken(
                                           absolutePosition,
                                           lineNumber,
                                           lexemeStartPositionInLine
                                           )));

                    absolutePosition          += 1;
                    lexemeStartPositionInLine += 1;

                    break;

                case '>':
                    // ">=" or '>'.
                    maybeToken = FlushBuffer(
                        currentLexemeBuffer,
                        ref absolutePosition,
                        lineNumber,
                        ref lexemeStartPositionInLine
                        );
                    if (maybeToken.IsSome)
                    {
                        yield return(maybeToken.ValueUnsafe());
                    }

                    yield return(source.Peek()
                                 .Filter(c => c == '=')
                                 .Some <IToken>(_ =>
                    {
                        var result = new GeOperatorToken(
                            absolutePosition,
                            lineNumber,
                            lexemeStartPositionInLine
                            );

                        source.Read();
                        absolutePosition += 1;
                        lexemeStartPositionInLine += 1;

                        return result;
                    })
                                 .None(new GtOperatorToken(
                                           absolutePosition,
                                           lineNumber,
                                           lexemeStartPositionInLine
                                           )));

                    absolutePosition          += 1;
                    lexemeStartPositionInLine += 1;

                    break;

                case '<':
                    // "<=" or '<'.
                    maybeToken = FlushBuffer(
                        currentLexemeBuffer,
                        ref absolutePosition,
                        lineNumber,
                        ref lexemeStartPositionInLine
                        );
                    if (maybeToken.IsSome)
                    {
                        yield return(maybeToken.ValueUnsafe());
                    }

                    yield return(source.Peek()
                                 .Filter(c => c == '=')
                                 .Some <IToken>(_ =>
                    {
                        var result = new LeOperatorToken(
                            absolutePosition,
                            lineNumber,
                            lexemeStartPositionInLine
                            );

                        source.Read();
                        absolutePosition += 1;
                        lexemeStartPositionInLine += 1;

                        return result;
                    })
                                 .None(new LtOperatorToken(
                                           absolutePosition,
                                           lineNumber,
                                           lexemeStartPositionInLine
                                           )));

                    absolutePosition          += 1;
                    lexemeStartPositionInLine += 1;

                    break;

                case '*':
                case '%':
                case '+':
                case '-':
                case '=':
                case ',':
                case '[':
                case ']':
                case '(':
                case ')':
                case ';':
                    // Single-character symbols: the token constructor is looked up in
                    // SymbolLexemes by the character's string form.
                    maybeToken = FlushBuffer(
                        currentLexemeBuffer,
                        ref absolutePosition,
                        lineNumber,
                        ref lexemeStartPositionInLine
                        );
                    if (maybeToken.IsSome)
                    {
                        yield return(maybeToken.ValueUnsafe());
                    }

                    yield return(SymbolLexemes
                                 .TryGetValue(((char)currentChar).ToString())
                                 .Some(cons => cons(
                                           absolutePosition,
                                           lineNumber,
                                           lexemeStartPositionInLine
                                           ))
                                 .None(() => new UnrecognizedToken(
                                           // Same char conversion as the lookup key above, so the
                                           // lexeme is the symbol and not its numeric code.
                                           ((char)currentChar).ToString(),
                                           absolutePosition,
                                           lineNumber,
                                           lexemeStartPositionInLine
                                           )
                                       ));

                    absolutePosition          += 1;
                    lexemeStartPositionInLine += 1;

                    break;

                default:
                    // Part of an identifier/literal: keep buffering.
                    currentLexemeBuffer.Append(char.ConvertFromUtf32(currentChar));
                    break;
                }
            }

            // End of input: emit whatever is still buffered.
            maybeToken = FlushBuffer(
                currentLexemeBuffer,
                ref absolutePosition,
                lineNumber,
                ref lexemeStartPositionInLine
                );
            if (maybeToken.IsSome)
            {
                yield return(maybeToken.ValueUnsafe());
            }
        }
        /// <summary>
        /// Scans the next C# token starting at the reader's current position and stores it in
        /// <c>current</c>.
        /// </summary>
        /// <remarks>
        /// The checks run in a fixed order and the first one that matches wins; most checks
        /// consume input as a side effect of matching, so the order must not be changed.
        /// Malformed input is reported through dedicated error token types rather than exceptions.
        /// </remarks>
        /// <returns>
        /// <c>true</c> on every path visible in this method: input that matches nothing is
        /// consumed one character at a time and reported as an <c>UnrecognizedToken</c>.
        /// </returns>
        override internal bool FindNextToken()
        {
            int startPosition = _reader.Position;

            // Run of whitespace.
            if (_reader.SinkMultipleWhiteSpace())
            {
                current = new WhitespaceToken();
                return true;
            }

            // One-line comment: swallow everything up to the end of the line.
            if (_reader.Sink("//"))
            {
                _reader.SinkToEndOfLine();

                current = new CommentToken();
                return true;
            }

            // Multi-line comment.
            if (_reader.Sink("/*"))
            {
                _reader.SinkUntil("*/");

                // Reaching the end of input means the closing */ was never found,
                // which is reported as a syntax error token.
                if (_reader.EndOfLines)
                {
                    current = new CSharpTokenizer.EndOfFileInsideCommentToken();
                    return true;
                }

                current = new CommentToken();
                return true;
            }

            // Character literal.
            if (_reader.Sink("\'"))
            {
                // NOTE(review): unlike the verbatim-string loop below, this loop has no
                // EndOfLines check - confirm how the reader behaves for an unterminated
                // literal at the end of the input.
                while (_reader.CurrentCharacter != '\'')
                {
                    // Step over a backslash so the character after it is consumed blindly.
                    // This isn't exactly right; the following should really be detected:
                    //
                    //   simple-escape-sequence: one of
                    //     \' \" \\ \0 \a \b \f \n \r \t \v
                    //
                    //   hexadecimal-escape-sequence:
                    //     \x   hex-digit   hex-digit[opt]   hex-digit[opt]  hex-digit[opt]
                    _reader.Sink("\\");

                    _reader.SinkCharacter();
                }

                // Consume the closing tick; the loop above only stops on one.
                if (_reader.SinkCharacter() != '\'')
                {
                    Debug.Assert(false, "Code defect in tokenizer: Should have yielded a closing tick.");
                }

                current = new CSharpTokenizer.CharLiteralToken();
                return true;
            }

            // Verbatim string (@"...").
            if (_reader.Sink("@\""))
            {
                do
                {
                    // A doubled quote inside a verbatim string is an escaped quote,
                    // so skip as many of those pairs as appear in a row.
                    while (_reader.Sink("\"\""))
                    {
                    }
                }
                while (!_reader.EndOfLines && _reader.SinkCharacter() != '\"');

                // The input ended before the closing quote was seen.
                if (_reader.EndOfLines)
                {
                    current = new EndOfFileInsideStringToken();
                    return true;
                }

                // The closing quote has been consumed; drop the first character of
                // the matched text (the '@') for the inner text.
                current = new StringLiteralToken();
                current.InnerText = _reader.GetCurrentMatchedString(startPosition).Substring(1);
                return true;
            }

            // Regular quoted string.
            if (_reader.Sink("\""))
            {
                while (_reader.CurrentCharacter == '\\' || _reader.MatchRegularStringLiteral())
                {
                    // Consume one character; when it is a backslash an escape sequence
                    // must follow. This does only barely enough to make a string with
                    // embedded escapes come out with the right start and end.
                    if (_reader.SinkCharacter() == '\\' && !_reader.SinkStringEscape())
                    {
                        // Unknown escape: this would be a compiler error.
                        _reader.SinkCharacter();
                        current = new CSharpTokenizer.UnrecognizedStringEscapeToken();
                        return true;
                    }
                }

                // A regular string may not span lines.
                if (TokenChar.IsNewLine(_reader.CurrentCharacter))
                {
                    current = new CSharpTokenizer.NewlineInsideStringToken();
                    return true;
                }

                // NOTE(review): no EndOfLines check here either - presumably the loop
                // above only stops on a quote or a new line; confirm for end of input.
                if (_reader.SinkCharacter() != '\"')
                {
                    Debug.Assert(false, "Defect in tokenizer: Should have yielded a terminating quote.");
                }

                current = new StringLiteralToken();
                return true;
            }

            // Identifier or keyword. From 2.4.2 Identifiers: a '@' can prefix an
            // identifier so that a keyword can be used as an identifier.
            if (_reader.CurrentCharacter == '@' || _reader.MatchNextIdentifierStart())
            {
                if (_reader.CurrentCharacter == '@')
                {
                    _reader.SinkCharacter();
                }

                // With or without the prefix, an identifier start must come next.
                if (!_reader.SinkIdentifierStart())
                {
                    current = new ExpectedIdentifierToken();
                    return true;
                }

                // Consume the remaining identifier characters.
                while (_reader.SinkIdentifierPart())
                {
                }

                string identifierOrKeyword = _reader.GetCurrentMatchedString(startPosition);

                // Literal keywords get their own token types.
                if (identifierOrKeyword == "false" || identifierOrKeyword == "true")
                {
                    current = new BooleanLiteralToken();
                    return true;
                }

                if (identifierOrKeyword == "null")
                {
                    current = new CSharpTokenizer.NullLiteralToken();
                    return true;
                }

                // The matched text still includes any '@' prefix at this point, so an
                // escaped keyword does not compare equal to the bare keyword here.
                if (Array.IndexOf(s_keywordList, identifierOrKeyword) >= 0)
                {
                    current = new KeywordToken();
                    return true;
                }

                // Plain identifier: strip the escaping '@', if any, before storing it
                // (the '@' only exists so reserved words such as class, if or foreach
                // can be used as names).
                string identifier = _reader.GetCurrentMatchedString(startPosition);
                if (identifier.StartsWith("@", StringComparison.Ordinal))
                {
                    identifier = identifier.Substring(1);
                }

                current = new IdentifierToken();
                current.InnerText = identifier;
                return true;
            }

            // Open scope.
            if (_reader.Sink("{"))
            {
                current = new CSharpTokenizer.OpenScopeToken();
                return true;
            }

            // Close scope.
            if (_reader.Sink("}"))
            {
                current = new CSharpTokenizer.CloseScopeToken();
                return true;
            }

            // Hexadecimal integer literal.
            if (_reader.SinkIgnoreCase("0x"))
            {
                // At least one hex digit must follow the prefix.
                if (!_reader.SinkMultipleHexDigits())
                {
                    current = new ExpectedValidHexDigitToken();
                    return true;
                }

                // Optional suffix (L, U, l, u, ul, etc.).
                _reader.SinkLongIntegerSuffix();

                current = new HexIntegerLiteralToken();
                return true;
            }

            // Decimal integer literal.
            if (_reader.SinkMultipleDecimalDigits())
            {
                // Optional suffix (L, U, l, u, ul, etc.).
                _reader.SinkLongIntegerSuffix();

                current = new DecimalIntegerLiteralToken();
                return true;
            }

            // Operators and punctuators.
            if (_reader.SinkOperatorOrPunctuator())
            {
                current = new OperatorOrPunctuatorToken();
                return true;
            }

            // Preprocessor line: classify by directive, then consume the whole line.
            if (_reader.CurrentCharacter == '#')
            {
                if (_reader.Sink("#if"))
                {
                    current = new OpenConditionalDirectiveToken();
                }
                else if (_reader.Sink("#endif"))
                {
                    current = new CloseConditionalDirectiveToken();
                }
                else
                {
                    current = new PreprocessorToken();
                }

                _reader.SinkToEndOfLine();

                return true;
            }

            // Nothing matched: consume a single character and report a syntax error.
            _reader.SinkCharacter();
            current = new UnrecognizedToken();
            return true;
        }
Exemple #27
0
 /// <summary>
 /// Creates the event arguments and stores the supplied comment token
 /// in the CommentToken member of this instance.
 /// </summary>
 /// <param name="commentToken">The comment token to expose through these event arguments.</param>
 public HtmlParserCommentEventArgs(CommentToken commentToken)
 {
     this.CommentToken = commentToken;
 }
        /*
         * Method:  FindNextToken
         *
         * Find the next token, starting at the reader's current position, and store it in 'current'.
         *
         * The branches below are tried strictly in order; the first reader probe that succeeds
         * decides the token type. The order is therefore significant (for example identifiers are
         * tried before numbers, and hex/octal prefixes before plain decimal digits).
         *
         * Every path visible in this method returns 'true': malformed input is reported through
         * dedicated error tokens (ExpectedIdentifierToken, ExpectedValidHexDigitToken, ...) and
         * anything else becomes an UnrecognizedToken after consuming one character.
         * NOTE(review): the historical description said this returns 'false' when no token is
         * found, but no 'return false' exists here -- confirm how callers detect end of input.
         */
        override internal bool FindNextToken()
        {
            // Remember where this token starts so the matched text can be retrieved later.
            int startPosition = _reader.Position;

            // VB docs claim whitespace is Unicode category Zs. However,
            // this category does not contain tabs. Assuming a less restrictive
            // definition for whitespace...
            if (_reader.SinkWhiteSpace())
            {
                // Consume the rest of the whitespace run.
                while (_reader.SinkWhiteSpace())
                {
                }

                // Now, we need to check for the line continuation character.
                if (_reader.SinkLineContinuationCharacter())    // Line continuation is '_'
                {
                    // Save the current position because we may need to come back here.
                    // The '_' has already been consumed, so Position - 1 points back at it.
                    int savePosition = _reader.Position - 1;

                    // Skip all whitespace after the '_'
                    while (_reader.SinkWhiteSpace())
                    {
                    }

                    // Now, skip all the newlines.
                    // Need at least one newline for this to count as line continuation.
                    int count = 0;
                    while (_reader.SinkNewLine())
                    {
                        ++count;
                    }

                    if (count > 0)
                    {
                        current = new VisualBasicTokenizer.LineContinuationToken();
                        return(true);
                    }

                    // Otherwise, fall back to plain old whitespace.
                    // Rewinding to the '_' leaves it to be tokenized by the next call.
                    _reader.Position = savePosition;
                }

                current = new WhitespaceToken();
                return(true);
            }
            // Line terminators are separate from whitespace and are significant.
            else if (_reader.SinkNewLine())
            {
                // We want one token per line terminator.
                current = new VisualBasicTokenizer.LineTerminatorToken();
                return(true);
            }
            // Check for a comment--either those that start with ' or rem.
            else if (_reader.SinkLineCommentStart())
            {
                // Skip to the first EOL.
                _reader.SinkToEndOfLine();

                current = new CommentToken();
                return(true);
            }
            // Identifier or keyword?
            else if
            (
                // VB allows escaping of identifiers by surrounding them with []
                // In other words,
                //      Date is a keyword but,
                //      [Date] is an identifier.
                _reader.CurrentCharacter == '[' ||
                _reader.MatchNextIdentifierStart()
            )
            {
                bool escapedIdentifier = false;
                if (_reader.CurrentCharacter == '[')
                {
                    escapedIdentifier = true;
                    // Consume the opening '['.
                    _reader.SinkCharacter();

                    // Now, the next character must be an identifier start.
                    if (!_reader.SinkIdentifierStart())
                    {
                        current = new ExpectedIdentifierToken();
                        return(true);
                    }
                }

                // Sink the rest of the identifier.
                while (_reader.SinkIdentifierPart())
                {
                }

                // If this was an escaped identifier then we need to get the terminating ']'.
                if (escapedIdentifier)
                {
                    if (!_reader.Sink("]"))
                    {
                        current = new ExpectedIdentifierToken();
                        return(true);
                    }
                }
                else
                {
                    // Escaped identifiers are not allowed to have trailing type character.
                    _reader.SinkTypeCharacter(); // Type character is optional.
                }

                // An identifier that is only a '_' is illegal because it is
                // ambiguous with line continuation.
                // The matched text still includes the surrounding [] for escaped identifiers.
                // NOTE(review): "[]" looks unreachable here because a '[' must be followed by a
                // successful SinkIdentifierStart above -- confirm before removing the check.
                string identifierOrKeyword = _reader.GetCurrentMatchedString(startPosition);
                if (identifierOrKeyword == "_" || identifierOrKeyword == "[_]" || identifierOrKeyword == "[]")
                {
                    current = new ExpectedIdentifierToken();
                    return(true);
                }

                // Make an upper-case version in order to check whether this may be a keyword.
                // This makes the keyword and boolean-literal checks below case-insensitive.
                string upper = identifierOrKeyword.ToUpperInvariant();

                switch (upper)
                {
                default:

                    // Anything listed in s_keywordList is a keyword; everything else is an identifier.
                    if (Array.IndexOf(s_keywordList, upper) >= 0)
                    {
                        current = new KeywordToken();
                        return(true);
                    }

                    // Create the token.
                    current = new IdentifierToken();

                    // Trim off the [] if this is an escaped identifier.
                    // InnerText is only assigned explicitly in the escaped case.
                    if (escapedIdentifier)
                    {
                        current.InnerText = identifierOrKeyword.Substring(1, identifierOrKeyword.Length - 2);
                    }
                    return(true);

                // Boolean literals get their own token type instead of being keywords.
                case "FALSE":
                case "TRUE":
                    current = new BooleanLiteralToken();
                    return(true);
                }
            }
            // Is it a hex integer?
            else if (_reader.SinkHexIntegerPrefix())
            {
                // A hex prefix must be followed by at least one hex digit.
                if (!_reader.SinkMultipleHexDigits())
                {
                    current = new ExpectedValidHexDigitToken();
                    return(true);
                }

                // Sink a suffix if there is one.
                _reader.SinkIntegerSuffix();

                current = new HexIntegerLiteralToken();
                return(true);
            }
            // Is it an octal integer?
            else if (_reader.SinkOctalIntegerPrefix())
            {
                // An octal prefix must be followed by at least one octal digit.
                if (!_reader.SinkMultipleOctalDigits())
                {
                    current = new VisualBasicTokenizer.ExpectedValidOctalDigitToken();
                    return(true);
                }

                // Sink a suffix if there is one.
                _reader.SinkIntegerSuffix();

                current = new VisualBasicTokenizer.OctalIntegerLiteralToken();
                return(true);
            }
            // Is it a decimal integer?
            else if (_reader.SinkMultipleDecimalDigits())
            {
                // Sink a suffix if there is one.
                _reader.SinkDecimalIntegerSuffix();

                current = new DecimalIntegerLiteralToken();
                return(true);
            }
            // Preprocessor line
            else if (_reader.CurrentCharacter == '#')
            {
                // "#if" and "#end if" (matched case-insensitively) open and close a conditional
                // block; every other directive is a generic preprocessor token.
                if (_reader.SinkIgnoreCase("#if"))
                {
                    current = new OpenConditionalDirectiveToken();
                }
                else if (_reader.SinkIgnoreCase("#end if"))
                {
                    current = new CloseConditionalDirectiveToken();
                }
                else
                {
                    current = new PreprocessorToken();
                }

                // The directive owns the remainder of its line.
                _reader.SinkToEndOfLine();

                return(true);
            }
            // Is it a separator?
            else if (_reader.SinkSeparatorCharacter())
            {
                current = new VisualBasicTokenizer.SeparatorToken();
                return(true);
            }
            // Is it an operator?
            else if (_reader.SinkOperator())
            {
                current = new OperatorToken();
                return(true);
            }
            // A string?
            else if (_reader.Sink("\""))
            {
                // Consume characters until an unpaired closing quote or the end of the input.
                do
                {
                    // A doubled quote ("") inside the literal is consumed as a pair,
                    // so it never counts as the terminating quote.
                    while (_reader.Sink("\"\""))
                    {
                    }
                }while (!_reader.EndOfLines && _reader.SinkCharacter() != '\"');

                // Can't end a file inside a string
                // NOTE(review): if EndOfLines is already true right after the closing quote has
                // been consumed (a literal that is the very last thing in the input), this branch
                // would report a terminated string as unterminated -- confirm EndOfLines semantics.
                if (_reader.EndOfLines)
                {
                    current = new EndOfFileInsideStringToken();
                    return(true);
                }

                current = new StringLiteralToken();
                return(true);
            }


            // We didn't recognize the token, so this is a syntax error.
            // Consume one character so the tokenizer always makes progress.
            _reader.SinkCharacter();
            current = new UnrecognizedToken();
            return(true);
        }
        /// <summary>
        /// Asserts that two tokens are equivalent: same runtime type, same extent
        /// (start/end position, line, column and text) and equal according to the
        /// token type's own static <c>AreEqual</c> comparison.
        /// </summary>
        /// <param name="expectedToken">The token the test expects; may be null.</param>
        /// <param name="actualToken">The token that was actually produced; may be null.</param>
        /// <param name="index">Value forwarded to <c>LexerHelper.GetAssertErrorMessage</c> with every failure message.</param>
        /// <exception cref="NotImplementedException">The expected token's type has no comparison case below.</exception>
        private static void AssertAreEqualInternal(Token expectedToken, Token actualToken, int index = -1)
        {
            // Builds the failure message for one comparison.
            string Describe(string problem) => LexerHelper.GetAssertErrorMessage(problem, index);

            // Shared tail of every type-specific comparison in the switch below.
            void AssertTokensMatch(bool areEqual) =>
                Assert.IsTrue(areEqual, Describe("actual token does not match expected token"));

            // Two nulls are trivially equal; a single null is an immediate failure.
            if (expectedToken == null)
            {
                if (actualToken == null)
                {
                    return;
                }
                Assert.Fail(Describe("expected is null, but actual is not null"));
            }
            if (actualToken == null)
            {
                Assert.Fail(Describe("expected is not null, but actual is null"));
            }

            // The runtime types must agree before any type-specific comparison is attempted;
            // this is also what makes the casts of actualToken in the switch safe.
            Assert.AreEqual(expectedToken.GetType(), actualToken.GetType(),
                            Describe("actual type does not match expected value"));

            // Compare the source extents field by field so a failure names the exact mismatch.
            var expectedExtent = expectedToken.Extent;
            var actualExtent   = actualToken.Extent;
            Assert.AreEqual(expectedExtent.StartPosition.Position, actualExtent.StartPosition.Position,
                            Describe("actual Start Position does not match expected value"));
            Assert.AreEqual(expectedExtent.StartPosition.LineNumber, actualExtent.StartPosition.LineNumber,
                            Describe("actual Start Line does not match expected value"));
            Assert.AreEqual(expectedExtent.StartPosition.ColumnNumber, actualExtent.StartPosition.ColumnNumber,
                            Describe("actual Start Column does not match expected value"));
            Assert.AreEqual(expectedExtent.EndPosition.Position, actualExtent.EndPosition.Position,
                            Describe("actual End Position does not match expected value"));
            Assert.AreEqual(expectedExtent.EndPosition.LineNumber, actualExtent.EndPosition.LineNumber,
                            Describe("actual End Line does not match expected value"));
            Assert.AreEqual(expectedExtent.EndPosition.ColumnNumber, actualExtent.EndPosition.ColumnNumber,
                            Describe("actual End Column does not match expected value"));
            Assert.AreEqual(expectedExtent.Text, actualExtent.Text,
                            Describe("actual Text does not match expected value"));

            // Finally delegate to the comparison defined by the concrete token type.
            switch (expectedToken)
            {
                case AliasIdentifierToken expected:
                    AssertTokensMatch(AliasIdentifierToken.AreEqual(expected, (AliasIdentifierToken)actualToken));
                    break;

                case AttributeCloseToken expected:
                    AssertTokensMatch(AttributeCloseToken.AreEqual(expected, (AttributeCloseToken)actualToken));
                    break;

                case AttributeOpenToken expected:
                    AssertTokensMatch(AttributeOpenToken.AreEqual(expected, (AttributeOpenToken)actualToken));
                    break;

                case BlockCloseToken expected:
                    AssertTokensMatch(BlockCloseToken.AreEqual(expected, (BlockCloseToken)actualToken));
                    break;

                case BlockOpenToken expected:
                    AssertTokensMatch(BlockOpenToken.AreEqual(expected, (BlockOpenToken)actualToken));
                    break;

                case BooleanLiteralToken expected:
                    AssertTokensMatch(BooleanLiteralToken.AreEqual(expected, (BooleanLiteralToken)actualToken));
                    break;

                case ColonToken expected:
                    AssertTokensMatch(ColonToken.AreEqual(expected, (ColonToken)actualToken));
                    break;

                case CommaToken expected:
                    AssertTokensMatch(CommaToken.AreEqual(expected, (CommaToken)actualToken));
                    break;

                case CommentToken expected:
                    AssertTokensMatch(CommentToken.AreEqual(expected, (CommentToken)actualToken));
                    break;

                case DotOperatorToken expected:
                    AssertTokensMatch(DotOperatorToken.AreEqual(expected, (DotOperatorToken)actualToken));
                    break;

                case EqualsOperatorToken expected:
                    AssertTokensMatch(EqualsOperatorToken.AreEqual(expected, (EqualsOperatorToken)actualToken));
                    break;

                case IdentifierToken expected:
                    AssertTokensMatch(IdentifierToken.AreEqual(expected, (IdentifierToken)actualToken));
                    break;

                case IntegerLiteralToken expected:
                    AssertTokensMatch(IntegerLiteralToken.AreEqual(expected, (IntegerLiteralToken)actualToken));
                    break;

                case NullLiteralToken expected:
                    AssertTokensMatch(NullLiteralToken.AreEqual(expected, (NullLiteralToken)actualToken));
                    break;

                case ParenthesisCloseToken expected:
                    AssertTokensMatch(ParenthesisCloseToken.AreEqual(expected, (ParenthesisCloseToken)actualToken));
                    break;

                case ParenthesisOpenToken expected:
                    AssertTokensMatch(ParenthesisOpenToken.AreEqual(expected, (ParenthesisOpenToken)actualToken));
                    break;

                case PragmaToken expected:
                    AssertTokensMatch(PragmaToken.AreEqual(expected, (PragmaToken)actualToken));
                    break;

                case RealLiteralToken expected:
                    AssertTokensMatch(RealLiteralToken.AreEqual(expected, (RealLiteralToken)actualToken));
                    break;

                case StatementEndToken expected:
                    AssertTokensMatch(StatementEndToken.AreEqual(expected, (StatementEndToken)actualToken));
                    break;

                case StringLiteralToken expected:
                    AssertTokensMatch(StringLiteralToken.AreEqual(expected, (StringLiteralToken)actualToken));
                    break;

                case WhitespaceToken expected:
                    AssertTokensMatch(WhitespaceToken.AreEqual(expected, (WhitespaceToken)actualToken));
                    break;

                default:
                    throw new NotImplementedException($"Cannot compare type '{expectedToken.GetType().Name}'");
            }
        }
Exemple #30
0
        /// <summary>
        /// Tries to lex a comment that starts at <c>chBaseIndex</c> in <c>data</c>.
        /// A "//" line comment runs up to (but not including) the next newline separator or
        /// the end of the data; a "/*" block comment runs up to its closing "*/" or, when
        /// unterminated, to the end of the data. On success a <c>CommentToken</c> is appended
        /// to <c>Tokens</c> and <c>chBaseIndex</c> is moved past the consumed text.
        /// </summary>
        /// <returns>
        /// The current value of <c>chBaseIndex</c>: advanced when a comment was lexed,
        /// unchanged when the text at the base index does not start a comment.
        /// </returns>
        private int Comment()
        {
            // A comment needs a leading '/' followed by at least one more character.
            int chNextIndex = chBaseIndex + 1;
            if (data[chBaseIndex] != '/' || chNextIndex >= data.Length)
            {
                return(chBaseIndex);
            }

            char nextCh = data[chNextIndex];
            if (nextCh == '/')
            {
                // Line comment - scan for end of line, and collect.
                int chStartIndex = chNextIndex + 1;
                int chEndIndex   = chStartIndex;

                // Every character, including the very last one, is tested for being a newline.
                // A newline that ends the data therefore terminates the comment (and is left
                // for the caller to lex) instead of being absorbed into the comment text.
                while (chEndIndex < data.Length)
                {
                    char chScanning = data[chEndIndex];
                    if (SeparatorToken.Map.ContainsKey(chScanning) &&
                        SeparatorToken.Map[chScanning] == NssSeparators.NewLine)
                    {
                        break;
                    }

                    ++chEndIndex;
                }

                CommentToken comment = new CommentToken();
                comment.CommentType = CommentType.LineComment;

                // Substring with a zero length yields "" even when chStartIndex == data.Length,
                // so an empty comment needs no special case.
                comment.Comment = data.Substring(chStartIndex, chEndIndex - chStartIndex);

                int chNewBaseIndex = chEndIndex;
                AttachDebugData(comment, DebugRanges, chBaseIndex, chNewBaseIndex - 1);

                Tokens.Add(comment);
                chBaseIndex = chNewBaseIndex;
            }
            else if (nextCh == '*')
            {
                // Block comment - scan for the closing */, ignoring everything else.
                // The first '/' examined sits three characters after the base index, so the
                // '*' of the opening "/*" can never double as the '*' of the closing "*/"
                // (i.e. "/*/" is not a complete comment).
                bool terminated      = false;
                int  chScanningIndex = chNextIndex + 1;
                while (++chScanningIndex < data.Length)
                {
                    if (data[chScanningIndex] == '/' && data[chScanningIndex - 1] == '*')
                    {
                        terminated = true;
                        break;
                    }
                }

                CommentToken comment = new CommentToken();
                comment.CommentType = CommentType.BlockComment;
                comment.Terminated  = terminated;

                // Terminated:   the text stops just before "*/" and lexing resumes right after it.
                // Unterminated: the scan ran off the end, so everything that is left is comment text.
                int chStartIndex = chBaseIndex + 2;
                int chEndIndex   = terminated ? chScanningIndex - 1 : data.Length;
                comment.Comment = data.Substring(chStartIndex, chEndIndex - chStartIndex);

                int chNewBaseIndex = terminated ? chScanningIndex + 1 : data.Length;
                AttachDebugData(comment, DebugRanges, chBaseIndex, chNewBaseIndex - 1);

                Tokens.Add(comment);
                chBaseIndex = chNewBaseIndex;
            }

            return(chBaseIndex);
        }
Exemple #31
0
        /// <summary>
        /// Creates the token object that represents <paramref name="kind"/> and stamps it
        /// with that kind and the given source location.
        /// </summary>
        /// <param name="kind">The kind of token to create; it also selects the concrete token class.</param>
        /// <param name="location">The source location assigned to the new token.</param>
        /// <returns>
        /// A <c>KeywordToken</c>, <c>NumberToken</c>, <c>StringToken</c> or <c>CommentToken</c>
        /// for the matching kinds, and a plain <c>Token</c> for every other kind.
        /// </returns>
        public static Token Create(TokenKind kind, SourceLocation location)
        {
            Token created = CreateInstanceForKind(kind);
            created.Kind     = kind;
            created.Location = location;
            return(created);
        }

        // Chooses the concrete Token class for a token kind; Kind and Location are set by the caller.
        private static Token CreateInstanceForKind(TokenKind kind)
        {
            switch (kind)
            {
                // Literals and comments have dedicated token classes.
                case TokenKind.T_NUMERIC_LITERAL:
                    return(new NumberToken());

                case TokenKind.T_STRING_LITERAL:
                case TokenKind.T_MULTILINE_STRING_LITERAL:
                    return(new StringToken());

                case TokenKind.T_COMMENT:
                    return(new CommentToken());

                // All keyword kinds share a single token class.
                case TokenKind.T_AS:
                case TokenKind.T_BREAK:
                case TokenKind.T_CASE:
                case TokenKind.T_CATCH:
                case TokenKind.T_CONST:
                case TokenKind.T_CONTINUE:
                case TokenKind.T_DEFAULT:
                case TokenKind.T_DELETE:
                case TokenKind.T_DO:
                case TokenKind.T_ELSE:
                case TokenKind.T_ENUM:
                case TokenKind.T_FALSE:
                case TokenKind.T_FINALLY:
                case TokenKind.T_FOR:
                case TokenKind.T_FUNCTION:
                case TokenKind.T_IF:
                case TokenKind.T_IMPORT:
                case TokenKind.T_IN:
                case TokenKind.T_INSTANCEOF:
                case TokenKind.T_LET:
                case TokenKind.T_NEW:
                case TokenKind.T_NULL:
                case TokenKind.T_ON:
                case TokenKind.T_PRAGMA:
                case TokenKind.T_PROPERTY:
                case TokenKind.T_PUBLIC:
                case TokenKind.T_READONLY:
                case TokenKind.T_RESERVED_WORD:
                case TokenKind.T_RETURN:
                case TokenKind.T_SET:
                case TokenKind.T_SIGNAL:
                case TokenKind.T_SWITCH:
                case TokenKind.T_THIS:
                case TokenKind.T_THROW:
                case TokenKind.T_TRUE:
                case TokenKind.T_TRY:
                case TokenKind.T_TYPEOF:
                case TokenKind.T_VAR:
                case TokenKind.T_VOID:
                case TokenKind.T_WHILE:
                case TokenKind.T_WITH:
                    return(new KeywordToken());

                // Everything else is represented by the base token class.
                default:
                    return(new Token());
            }
        }
        public static string ToXml(string html)
        {
            StringBuilder result = new StringBuilder();

            // Standard XML file header, including entities that are likely to be used.
            result.Append("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n");

            ParseReader    reader       = new ParseReader(html);
            TagParser      parser       = new TagParser(reader);
            Stack <string> nestingStack = new Stack <string>();

            try
            {
                ParseToken token = parser.GetNextToken();

                // Ignore leading white-space.
                while (token is SpacesToken || token is NewlineToken || token is DoctypeToken)
                {
                    token = parser.GetNextToken();
                }

                while (!(token is EOFToken))
                {
                    Log.DebugFormat("Token = {0}", token);
                    if (token is TagToken)
                    {
                        TagToken t = (TagToken)token;
                        if (!t.Tag.IsEndTag)
                        {
                            // Deal with start-tag. Typically this will be new element nesting.
                            Tag startTag = t.Tag;
                            if (startTag is EmptyElement)
                            {
                                result.Append(((EmptyElement)startTag).ToString());
                            }
                            else
                            {
                                // Tags that are always empty elements are converted to empty elements here.
                                // Element names are pushed onto the stack to balance elements with missing end-tag.
                                string startTagName = startTag.Name.ToLower();
                                Log.DebugFormat("startTagName = {0}", startTagName);
                                if (EmptyElements.Contains(startTagName))
                                {
                                    result.Append((new EmptyElement(startTag)).ToString());
                                }
                                else
                                {
                                    result.Append(startTag.ToString());
                                    nestingStack.Push(startTagName);
                                }
                            }
                        }
                        else
                        {
                            // Deal with end-tag.
                            Tag endTag = t.Tag;

                            // Remove the '/' from beginning of the tag-name for comparison.
                            string endTagName = endTag.Name.Substring(1).ToLower();
                            Log.DebugFormat("endTagName = {0}", endTagName);

                            // Ignore some end-tags for empty elements that are handled with or without empty element syntax.
                            if (EmptyElements.Contains(endTagName))
                            {
                                Log.InfoFormat("Ignoring redundant end-tag: {0}", endTagName);
                            }
                            else
                            {
                                // Keep element tags matched appropriately.
                                string peek = nestingStack.Peek();
                                if (peek == null)
                                {
                                    Log.WarnFormat("Ignoring extra content at end of document! </{0}> ({1})", endTagName, parser.GetCharacterPosition());
                                }
                                else
                                {
                                    if (peek.Equals(endTagName))
                                    {
                                        nestingStack.Pop();
                                    }
                                    else
                                    {
                                        // Pair all the previous unmatched tags for these important structural elements.
                                        // These elements appear only once, so should never be automatically closed.
                                        if (SingleElements.Contains(endTagName))
                                        {
                                            while (peek != endTagName)
                                            {
                                                StringBuilder endtag = (new StringBuilder()).Append("</").Append(peek).Append('>');
                                                Log.WarnFormat("Adding a missing end-tag! {0} ({1})", endtag, parser.GetCharacterPosition());
                                                result.Append(endtag);
                                                nestingStack.Pop();
                                                peek = nestingStack.Peek();
                                            }

                                            // Remove the current item from the stack, as it has been paired now.
                                            nestingStack.Pop();
                                        }
                                        else
                                        {
                                            // Insert a matching start-tag before the unbalanced end-tag found.
                                            StringBuilder startTag = (new StringBuilder()).Append("<").Append(endTagName).Append('>');
                                            Log.WarnFormat("Adding a missing start-tag! {0} ({1})", startTag, parser.GetCharacterPosition());
                                            result.Append(startTag);
                                        }
                                    }

                                    // Write the current element end-tag.
                                    result.Append("</").Append(endTagName).Append('>');
                                }
                            }
                        }
                    }
                    else if (token is WordToken)
                    {
                        WordToken t = (WordToken)token;
                        result.Append(t.Word);
                    }
                    else if (token is SpacesToken)
                    {
                        SpacesToken t = (SpacesToken)token;
                        result.Append(t.Spaces);
                    }
                    else if (token is NumberToken)
                    {
                        NumberToken t = (NumberToken)token;
                        result.Append(t.Number);
                    }
                    else if (token is EntityReferenceToken)
                    {
                        EntityReferenceToken t = (EntityReferenceToken)token;
                        result.Append(XmlEntity(t.Name));
                    }
                    else if (token is PunctuationToken)
                    {
                        PunctuationToken t = (PunctuationToken)token;
                        result.Append(t.Character);
                    }
                    else if (token is CharacterEntityToken)
                    {
                        CharacterEntityToken t = (CharacterEntityToken)token;
                        result.Append(t.Character);
                    }
                    else if (token is NewlineToken)
                    {
                        result.Append('\n');
                    }
                    else if (token is ScriptToken)
                    {
                        ScriptToken t = (ScriptToken)token;
                        if (t.Script.Length > 0)
                        {
                            // Script element contents are often empty.
                            // NOTE: Removing any prior use of CDATA section in script, to avoid conflict.
                            string script = t.Script.Replace("<![CDATA[", "").Replace("]]>", "");
                            result.Append("/*<![CDATA[*/").Append(script).Append("/*]]>*/");
                        }
                    }
                    else if (token is CDataToken)
                    {
                        CDataToken t = (CDataToken)token;
                        result.Append("<![CDATA[").Append(t.Data).Append("]]>");
                    }
                    else if (token is CommentToken)
                    {
                        CommentToken t = (CommentToken)token;
                        result.Append("<!--").Append(t.Comment).Append("-->");
                    }
                    else if (token is DoctypeToken)
                    {
                        // Ignore.
                    }
                    else if (token is ProcessingInstructionToken)
                    {
                        // Ignore.
                    }
                    else
                    {
                        Log.WarnFormat("Unexpected token! {0}", token);
                    }
                    token = parser.GetNextToken();
                }

                Log.Info(parser.GetCompletionReport());
            }
            catch (Exception ex)
            {
                Log.Error("EXCEPTION", ex);
                result = null;
            }

            return(result == null ? null : result.ToString());
        }
 //protected override void VisitInToken(InToken token) { throw new NotImplementedException(); }
 //protected override void VisitNotInToken(NotInToken token) { throw new NotImplementedException(); }
 /// <summary>
 /// Visitor hook for comment tokens. No handling is provided here:
 /// the method unconditionally throws.
 /// </summary>
 /// <param name="token">The comment token being visited; it is not inspected.</param>
 /// <exception cref="NotImplementedException">Always thrown.</exception>
 protected override void VisitCommentToken(CommentToken token) => throw new NotImplementedException();