Пример #1
0
        /// <summary>
        ///     Discards a "&lt;!" directive that is neither a comment nor a "&lt;![" section
        ///     (for example DOCTYPE, which is not supported). Everything up to and including
        ///     the first "&gt;" for which IsNextCharacterEntity is false is skipped, and the
        ///     pending token is reported as empty text.
        /// </summary>
        private void ReadUnknownDirective()
        {
            // precondition: positioned on the '!' of "<!", not followed by '-' or '['
            Debug.Assert(_previousCharacter == '<' && NextCharacter == '!' &&
                         _lookAheadCharacter != '-' && _lookAheadCharacter != '[');

            // the directive contributes nothing: report it as an empty text token
            NextTokenType     = HtmlTokenType.Text;
            _nextToken.Length = 0;

            // step over the '!'
            GetNextCharacter();

            while (!IsAtEndOfStream)
            {
                bool closesDirective = NextCharacter == '>' && !IsNextCharacterEntity;

                // consume the current character; if it was the closing '>' we are done
                GetNextCharacter();
                if (closesDirective)
                {
                    return;
                }
            }
        }
Пример #2
0
 internal HtmlToken(
     HtmlTokenType type,
     bool isSelfClosing,
     ReadOnlyMemory <char> name,
     ReadOnlyMemory <char> rawText,
     Memory <HtmlAttribute> attributes,
     in HtmlTextRange range,
Пример #3
0
 /// <summary>
 /// Wraps <paramref name="token"/> (forwarded to the base constructor), copies its
 /// token type and stores the supplied opening and closing quote characters.
 /// </summary>
 /// <param name="token">Token to wrap; its TokenType is copied.</param>
 /// <param name="openQuote">Opening quote character to store.</param>
 /// <param name="closeQuote">Closing quote character to store.</param>
 public ComplexAttributeValueToken(IHtmlToken token, char openQuote, char closeQuote)
     : base(token)
 {
     _openQuote  = openQuote;
     _closeQuote = closeQuote;
     _tokenType  = token.TokenType;
 }
Пример #4
0
        /// <summary>
        /// Returns the textual form of a token type, exactly as produced by ToString().
        /// </summary>
        /// <param name="t">Token type to convert.</param>
        /// <returns>The result of <c>t.ToString()</c>.</returns>
        string TokenTypeAsString(HtmlTokenType t)
        {
            return t.ToString();
        }
Пример #5
0
        /// <summary>
        /// Scans a quoted attribute value. Records the value start, then consumes
        /// characters until the closing <paramref name="quote"/> is found, at which
        /// point the attribute is added and scanning continues with BeforeAttributeName().
        /// </summary>
        /// <param name="quote">The quote character that terminates the value.</param>
        private void AttributeValue(char quote)
        {
            _attributeValueStart = Position();

            for (;;)
            {
                char current = Current();

                if (current == '\0')
                {
                    // '\0' is presumably the end-of-input sentinel (verify against Current());
                    // the token is reclassified as a comment and scanning stops
                    _type = HtmlTokenType.Comment;
                    Consume();
                    return;
                }

                if (current == quote)
                {
                    // the value ends before the quote, the attribute ends after it
                    _attributeValueEnd = Position();
                    Consume();
                    _attributeEnd = Position();
                    AddAttribute();
                    BeforeAttributeName();
                    return;
                }

                // ordinary value character
                Consume();
            }
        }
Пример #6
0
        /// <summary>
        /// Handles the character that follows "&lt;/". Marks the token as an end tag and
        /// then dispatches on the current character.
        /// </summary>
        private void EndTagOpen()
        {
            _type = HtmlTokenType.EndTag;

            char current = Current();
            if (current == '>')
            {
                // "</>": end tag with an empty name
                Consume();
                _nameEnd = _nameStart;
            }
            else if (current == '\0')
            {
                // '\0' is consumed and the token is downgraded to text
                Consume();
                _type = HtmlTokenType.Text;
            }
            else if (IsASCIIAlpha(current))
            {
                // a real end tag name; attributes are not read for end tags
                TagName(readAttributes: false);
            }
            else
            {
                // anything else is swallowed as a bogus comment
                Consume();
                BogusComment();
            }
        }
Пример #7
0
 /// <summary>
 /// Reads an attribute value into the pending token and marks it as Atom.
 /// A value that starts with a non-entity single or double quote runs to the
 /// matching non-entity quote (or end of stream); an unquoted value runs to the
 /// next whitespace, '>' or end of stream.
 /// </summary>
 internal void GetNextAtomToken()
 {
     _nextToken.Length = 0;
     SkipWhiteSpace();
     _nextTokenType = HtmlTokenType.Atom;

     bool startsWithQuote = (NextCharacter == '\'' || NextCharacter == '"') && !IsNextCharacterEntity;
     if (!startsWithQuote)
     {
         // unquoted value
         while (!IsAtEndOfStream && !char.IsWhiteSpace(NextCharacter) && NextCharacter != '>')
         {
             _nextToken.Append(NextCharacter);
             GetNextCharacter();
         }
         return;
     }

     // quoted value: remember which quote opened it, then step past it
     char quote = NextCharacter;
     GetNextCharacter();

     while ((NextCharacter != quote || IsNextCharacterEntity) && !IsAtEndOfStream)
     {
         _nextToken.Append(NextCharacter);
         GetNextCharacter();
     }

     // step past the closing quote when there is one
     if (NextCharacter == quote)
     {
         GetNextCharacter();
     }
 }
Пример #8
0
        /// <summary>
        ///     Discards a "&lt;![" section (which may be CDATA) up to and including the
        ///     first "]&gt;" sequence, and reports the pending token as empty text.
        /// </summary>
        /// <remarks>
        ///     The scan deliberately stops at "]&gt;" rather than "]]&gt;", because some
        ///     directives that open with "&lt;![" simply end with "]&gt;". As a consequence,
        ///     bracketed content inside the directive cannot itself contain "]&gt;".
        /// </remarks>
        private void ReadDynamicContent()
        {
            // precondition: positioned on the '!' of "<!["
            Debug.Assert(_previousCharacter == '<' && NextCharacter == '!' && _lookAheadCharacter == '[');

            // the section contributes nothing: report it as an empty text token
            NextTokenType     = HtmlTokenType.Text;
            _nextToken.Length = 0;

            // step over the '!' and then over the '['
            GetNextCharacter();
            GetNextCharacter();

            while (!IsAtEndOfStream)
            {
                if (NextCharacter == ']' && _lookAheadCharacter == '>')
                {
                    // move onto the '>' and then past it
                    GetNextCharacter();
                    GetNextCharacter();
                    return;
                }

                GetNextCharacter();
            }
        }
Пример #9
0
        /// <summary>
        /// Reads an HTML comment into the pending token (type Comment). Content is
        /// collected until the standard "--&gt;" terminator or the non-standard "!&gt;"
        /// terminator; the terminator is consumed and is not part of the token text.
        /// If the input ends before a terminator, the text collected so far is kept.
        /// </summary>
        private void ReadComment()
        {
            this._nextTokenType    = HtmlTokenType.Comment;
            this._nextToken.Length = 0;

            // Step over the three opener characters (presumably '!', '-', '-' - confirm
            // against the caller), but never advance once the input is exhausted.
            for (int skipped = 0; skipped < 3 && !this.IsAtEndOfStream; skipped++)
            {
                this.GetNextCharacter();
            }

            // Stop at end of stream: without this guard the only exits are the two
            // terminators, so an unterminated comment would keep calling
            // GetNextCharacter() after the input has run out.
            while (!this.IsAtEndOfStream)
            {
                bool atDashPair  = this._nextCharacter == '-' && this._lookAheadCharacter == '-';
                bool atBangClose = this._nextCharacter == '!' && this._lookAheadCharacter == '>';

                if (!atDashPair && !atBangClose)
                {
                    // plain comment content
                    this._nextToken.Append(this.NextCharacter);
                    this.GetNextCharacter();
                    continue;
                }

                // possible terminator: look one character further
                this.GetNextCharacter();

                if (this._previousCharacter == '-' && this._nextCharacter == '-' && this._lookAheadCharacter == '>')
                {
                    // standard "-->": move onto the '>'
                    this.GetNextCharacter();
                    break;
                }

                if (this._previousCharacter == '!' && this._nextCharacter == '>')
                {
                    // non-standard "!>": already on the '>'
                    break;
                }

                // false alarm: the skipped character is ordinary content
                this._nextToken.Append(this._previousCharacter);
            }

            // consume the closing '>' when a terminator was found
            if (!this.IsAtEndOfStream && this._nextCharacter == '>')
            {
                this.GetNextCharacter();
            }
        }
Пример #10
0
        /// <summary>
        /// Handles the character that follows '&lt;'. Marks the token as a start tag and
        /// then dispatches on the current character.
        /// </summary>
        private void TagOpen()
        {
            _type = HtmlTokenType.StartTag;

            char current = Current();
            if (current == '!')
            {
                // "<!": declaration, comment, etc.
                Consume();
                MarkdownDeclarationOpen();
            }
            else if (current == '?')
            {
                // "<?" is swallowed as a bogus comment (the '?' is not consumed here)
                BogusComment();
            }
            else if (current == '/')
            {
                // "</": end tag
                Consume();
                EndTagOpen();
            }
            else if (IsASCIIAlpha(current))
            {
                // a start tag name, followed by its attributes
                TagName(readAttributes: true);
            }
            else
            {
                // not a tag after all: continue as character data
                Data();
            }
        }
Пример #11
0
 /// <summary>
 /// Tells whether <paramref name="type"/> is accepted: every type is valid except
 /// WhiteSpace, NewLine, the three Razor comment types and Transition.
 /// </summary>
 /// <param name="type">Token type to check.</param>
 /// <returns><c>false</c> for the excluded types listed above; otherwise <c>true</c>.</returns>
 private bool IsValidTokenType(HtmlTokenType type)
 {
     bool isExcluded = type == HtmlTokenType.WhiteSpace ||
                       type == HtmlTokenType.NewLine ||
                       type == HtmlTokenType.RazorComment ||
                       type == HtmlTokenType.RazorCommentStar ||
                       type == HtmlTokenType.RazorCommentTransition ||
                       type == HtmlTokenType.Transition;

     return !isExcluded;
 }
Пример #12
0
        /// <summary>
        /// Handles the characters that follow an attribute name: whitespace is skipped,
        /// '=' starts a value, '>' or '/' closes the attribute (and the tag), and any
        /// other character closes the attribute and starts the next attribute name.
        /// </summary>
        private void AfterAttributeName()
        {
            for (;;)
            {
                char current = Current();

                if (current == '\0')
                {
                    // '\0' reclassifies the token as a comment and stops scanning
                    _type = HtmlTokenType.Comment;
                    Consume();
                    return;
                }

                if (current == '=')
                {
                    Consume();
                    BeforeAttributeValue();
                    return;
                }

                bool isWhitespace = current == '\t' || current == '\r' || current == '\n' ||
                                    current == '\f' || current == ' ';
                bool closesTag = current == '>' || current == '/';

                if (!isWhitespace && !closesTag)
                {
                    // a new attribute begins right here; finish the current one first
                    AddAttribute();
                    AttributeName();
                    return;
                }

                // whitespace, '>' and '/' all end the attribute: record the end once
                if (_attributeEnd.Index == _attributeStart.Index)
                {
                    _attributeEnd = Position();
                }
                Consume();

                if (isWhitespace)
                {
                    continue;
                }

                AddAttribute();
                if (current == '/')
                {
                    SelfClosingStartTag();
                }
                return;
            }
        }
Пример #13
0
 /// <summary>
 /// Always produces an EqualSign token with the text "=". After skipping whitespace,
 /// a real '=' in the input is consumed if present; otherwise the reader stays put.
 /// </summary>
 internal void GetNextEqualSignToken()
 {
     _nextToken.Length = 0;
     _nextToken.Append('=');
     _nextTokenType = HtmlTokenType.EqualSign;

     SkipWhiteSpace();

     if (NextCharacter != '=')
     {
         // no equal sign in the input: the token is reported anyway
         return;
     }

     GetNextCharacter();
 }
Пример #14
0
        /// <summary>
        /// Scans a tag name. The name ends at the first whitespace, '/' or '>'.
        /// </summary>
        /// <param name="readAttributes">
        /// When <c>true</c>, whitespace after the name hands over to BeforeAttributeName();
        /// when <c>false</c>, scanning simply continues in this method.
        /// </param>
        private void TagName(bool readAttributes)
        {
            _nameStart = _nameEnd = Position();

            for (;;)
            {
                char current = Current();

                if (current == '\0')
                {
                    // '\0' reclassifies the token as a comment and stops scanning
                    _type = HtmlTokenType.Comment;
                    Consume();
                    return;
                }

                bool isWhitespace = current == '\t' || current == '\r' || current == '\n' ||
                                    current == '\f' || current == ' ';

                if (current != '>' && current != '/' && !isWhitespace)
                {
                    // still inside the name (or skipping text after it)
                    Consume();
                    continue;
                }

                // a delimiter: record the end of the name the first time one is seen
                if (_nameEnd.Index == _nameStart.Index)
                {
                    _nameEnd = Position();
                }
                Consume();

                if (current == '>')
                {
                    return;
                }

                if (current == '/')
                {
                    SelfClosingStartTag();
                    return;
                }

                if (readAttributes)
                {
                    BeforeAttributeName();
                    return;
                }
            }
        }
Пример #15
0
        /// <summary>
        /// Produces the next token inside a tag: TagEnd, EmptyTagEnd, Name or Atom, or EOF
        /// when the stream is already exhausted on entry.
        /// Does not guarantee that the reader advances.
        /// </summary>
        internal void GetNextTagToken()
        {
            this.nextToken.Length = 0;

            if (this.IsAtEndOfStream)
            {
                this.nextTokenType = HtmlTokenType.EOF;
                return;
            }

            this.SkipWhiteSpace();

            // a '>' that came from an entity (&gt;) must not close the tag
            if (this.NextCharacter == '>' && !this.IsNextCharacterEntity)
            {
                this.nextTokenType = HtmlTokenType.TagEnd;
                this.nextToken.Append('>');
                this.GetNextCharacter();
                //// Note: ignoreNextWhitespace must be set appropriately on tag start processing
                return;
            }

            // "/>" closes an empty tag
            if (this.NextCharacter == '/' && this.lookAheadCharacter == '>')
            {
                this.nextTokenType = HtmlTokenType.EmptyTagEnd;
                this.nextToken.Append("/>");
                this.GetNextCharacter();
                this.GetNextCharacter();
                this.ignoreNextWhitespace = false; // whitespace after no-scope tags is significant
                return;
            }

            if (!this.IsGoodForNameStart(this.NextCharacter))
            {
                // Unexpected character for a tag. Report it as a one-character Atom,
                // expecting that the parser will ignore it.
                this.nextTokenType = HtmlTokenType.Atom;
                this.nextToken.Append(this.NextCharacter);
                this.GetNextCharacter();
                return;
            }

            // A name. Character entities are allowed here. Hitting the end of the stream is
            // not an error at this level: the name collected so far is returned and the
            // parser decides what to do on its next request.
            this.nextTokenType = HtmlTokenType.Name;
            while (this.IsGoodForName(this.NextCharacter) && !this.IsAtEndOfStream)
            {
                this.nextToken.Append(this.NextCharacter);
                this.GetNextCharacter();
            }
        }
Пример #16
0
 /// <summary>
 /// Creates an analyzer over <paramref name="inputTextString"/>: wraps the text in a
 /// StringReader, reads one character of look-ahead, resets the token state and then
 /// calls GetNextCharacter() once.
 /// </summary>
 /// <param name="inputTextString">Text handed to the StringReader.</param>
 internal HtmlLexicalAnalyzer(string inputTextString)
 {
     // token state
     _nextToken            = new StringBuilder(100);
     _nextTokenType        = HtmlTokenType.Text;
     _ignoreNextWhitespace = true;

     // character state before anything has been read
     _previousCharacter = ' ';
     _nextCharacter     = ' ';
     _nextCharacterCode = 0;

     // input and the first look-ahead character
     _inputStringReader      = new StringReader(inputTextString);
     _lookAheadCharacterCode = _inputStringReader.Read();
     _lookAheadCharacter     = (char)_lookAheadCharacterCode;

     GetNextCharacter();
 }
Пример #17
0
        /// <summary>
        /// Reads a comment that starts with "&lt;!-" into the pending token (type Comment).
        /// Content is collected until the standard "--&gt;" terminator or the "!&gt;"
        /// terminator that many real pages use; the terminator is consumed and is not
        /// part of the token text. If the input ends before a terminator, the text
        /// collected so far is kept.
        /// </summary>
        private void ReadComment()
        {
            // verify that we are at a comment
            Debug.Assert(this.previousCharacter == '<' && this.nextCharacter == '!' && this.lookAheadCharacter == '-', "HtmlToXamlConverter");

            // Initialize a token
            this.nextTokenType    = HtmlTokenType.Comment;
            this.nextToken.Length = 0;

            // Step over '!', the first '-' and the second '-' so that we stand on the first
            // character of the comment content; never advance once the input is exhausted.
            for (int skipped = 0; skipped < 3 && !this.IsAtEndOfStream; skipped++)
            {
                this.GetNextCharacter();
            }

            // Stop at end of stream: without this guard the only exits are the two
            // terminators, so an unterminated comment would keep calling
            // GetNextCharacter() after the input has run out.
            while (!this.IsAtEndOfStream)
            {
                bool atDashPair  = this.nextCharacter == '-' && this.lookAheadCharacter == '-';
                bool atBangClose = this.nextCharacter == '!' && this.lookAheadCharacter == '>';

                if (!atDashPair && !atBangClose)
                {
                    // plain comment content
                    this.nextToken.Append(this.NextCharacter);
                    this.GetNextCharacter();
                    continue;
                }

                // possible terminator: look one character further
                this.GetNextCharacter();

                if (this.previousCharacter == '-' && this.nextCharacter == '-' && this.lookAheadCharacter == '>')
                {
                    // Standard comment end "-->": move onto the '>' and exit the loop
                    this.GetNextCharacter();
                    break;
                }

                if (this.previousCharacter == '!' && this.nextCharacter == '>')
                {
                    // Nonstandard but possible comment end "!>": already on the '>'
                    break;
                }

                // Not an end. Save the skipped character and continue reading
                this.nextToken.Append(this.previousCharacter);
            }

            // consume the closing '>' when a terminator was found
            if (!this.IsAtEndOfStream && this.nextCharacter == '>')
            {
                this.GetNextCharacter();
            }
        }
 /// <summary>
 /// Creates an analyzer over the given text: wraps it in a StringReader, reads one
 /// character of look-ahead, resets the character and token state, and reads the
 /// first character so that NextCharacter has a value.
 /// </summary>
 /// <param name="inputTextString">
 /// text string to be parsed for xml content
 /// </param>
 internal HtmlLexicalAnalyzer(string inputTextString)
 {
     // token state
     _nextToken = new StringBuilder(100);
     _nextTokenType = HtmlTokenType.Text;
     _ignoreNextWhitespace = true;

     // character state before anything has been read
     _previousCharacter = ' ';
     _nextCharacter = ' ';
     _nextCharacterCode = 0;

     // input and the first look-ahead character
     _inputStringReader = new StringReader(inputTextString);
     _lookAheadCharacterCode = _inputStringReader.Read();
     _lookAheadCharacter = (char)_lookAheadCharacterCode;

     // prime NextCharacter
     GetNextCharacter();
 }
Пример #19
0
 /// <summary>
 /// Discards a directive up to and including the first '>' for which
 /// IsNextCharacterEntity is false, and reports the pending token as empty text.
 /// </summary>
 private void ReadUnknownDirective()
 {
     _nextTokenType    = HtmlTokenType.Text;
     _nextToken.Length = 0;

     // step over the current character before scanning
     GetNextCharacter();

     while (!IsAtEndOfStream)
     {
         bool closesDirective = _nextCharacter == '>' && !IsNextCharacterEntity;

         // consume the current character; if it was the closing '>' we are done
         GetNextCharacter();
         if (closesDirective)
         {
             return;
         }
     }
 }
Пример #20
0
        /// <summary>
        ///     Reads a comment that starts with "&lt;!-" into the pending token (type Comment).
        ///     Content is collected until the standard "--&gt;" terminator or the "!&gt;"
        ///     terminator that many real pages use; the terminator is consumed and is not
        ///     part of the token text. If the input ends before a terminator, the text
        ///     collected so far is kept.
        /// </summary>
        private void ReadComment()
        {
            // verify that we are at a comment
            Debug.Assert(_previousCharacter == '<' && NextCharacter == '!' && _lookAheadCharacter == '-');

            // Initialize a token
            NextTokenType     = HtmlTokenType.Comment;
            _nextToken.Length = 0;

            // Step over '!', the first '-' and the second '-' so that we stand on the first
            // character of the comment content; never advance once the input is exhausted.
            for (int skipped = 0; skipped < 3 && !IsAtEndOfStream; skipped++)
            {
                GetNextCharacter();
            }

            // Stop at end of stream: without this guard the only exits are the two
            // terminators, so an unterminated comment would keep calling
            // GetNextCharacter() after the input has run out.
            while (!IsAtEndOfStream)
            {
                bool atDashPair  = NextCharacter == '-' && _lookAheadCharacter == '-';
                bool atBangClose = NextCharacter == '!' && _lookAheadCharacter == '>';

                if (!atDashPair && !atBangClose)
                {
                    // plain comment content
                    _nextToken.Append(NextCharacter);
                    GetNextCharacter();
                    continue;
                }

                // possible terminator: look one character further
                GetNextCharacter();

                if (_previousCharacter == '-' && NextCharacter == '-' && _lookAheadCharacter == '>')
                {
                    // Standard comment end "-->": move onto the '>' and exit the loop
                    GetNextCharacter();
                    break;
                }

                if (_previousCharacter == '!' && NextCharacter == '>')
                {
                    // Nonstandard but possible comment end "!>": already on the '>'
                    break;
                }

                // Not an end. Save the skipped character and continue reading
                _nextToken.Append(_previousCharacter);
            }

            // consume the closing '>' when a terminator was found
            if (!IsAtEndOfStream && NextCharacter == '>')
            {
                GetNextCharacter();
            }
        }
Пример #21
0
        // ---------------------------------------------------------------------
        //
        // Constructors
        //
        // ---------------------------------------------------------------------
        #region Constructors

        /// <summary>
        /// Initializes a new instance of the <see cref="HtmlLexicalAnalyzer" /> class.
        /// Wraps the text in a StringReader, reads one character of look-ahead, resets the
        /// character and token state, and reads the first character so that NextCharacter
        /// has a value.
        /// </summary>
        /// <param name="inputTextString">
        /// text string to be parsed for xml content
        /// </param>
        internal HtmlLexicalAnalyzer(string inputTextString)
        {
            // token state
            this.nextToken            = new StringBuilder(100);
            this.nextTokenType        = HtmlTokenType.Text;
            this.ignoreNextWhitespace = true;

            // character state before anything has been read
            this.previousCharacter = ' ';
            this.nextCharacter     = ' ';
            this.nextCharacterCode = 0;

            // input and the first look-ahead character
            this.inputStringReader      = new StringReader(inputTextString);
            this.lookAheadCharacterCode = this.inputStringReader.Read();
            this.lookAheadCharacter     = (char)this.lookAheadCharacterCode;

            // prime NextCharacter
            this.GetNextCharacter();
        }
Пример #22
0
        /// <summary>
        /// Always produces an EqualSign token with the text "=", whether or not the input
        /// actually contains an equal sign at this point. After skipping whitespace, a real
        /// '=' is consumed if present.
        /// Does not guarantee that the reader advances.
        /// </summary>
        internal void GetNextEqualSignToken()
        {
            Debug.Assert(this.nextTokenType != HtmlTokenType.EOF, "Unexpected EOF");

            this.nextToken.Length = 0;
            this.nextToken.Append('=');
            this.nextTokenType = HtmlTokenType.EqualSign;

            this.SkipWhiteSpace();

            if (this.NextCharacter != '=')
            {
                // no equal sign in the input: the token is reported anyway
                return;
            }

            // '=' is not in the list of entities, so there is no entity check here
            this.GetNextCharacter();
        }
Пример #23
0
 /// <summary>
 /// Discards input up to and including the first "]&gt;" sequence and reports the
 /// pending token as empty text.
 /// </summary>
 private void ReadDynamicContent()
 {
     _nextTokenType    = HtmlTokenType.Text;
     _nextToken.Length = 0;

     // step over the two opening characters before scanning
     GetNextCharacter();
     GetNextCharacter();

     while (!IsAtEndOfStream)
     {
         if (_nextCharacter == ']' && _lookAheadCharacter == '>')
         {
             // move onto the '>' and then past it
             GetNextCharacter();
             GetNextCharacter();
             return;
         }

         GetNextCharacter();
     }
 }
Пример #24
0
        /// <summary>
        /// Discards a directive up to and including the first '>' for which
        /// IsNextCharacterEntity is false, and reports the pending token as empty text.
        /// </summary>
        private void ReadUnknownDirective()
        {
            NextTokenType     = HtmlTokenType.Text;
            _nextToken.Length = 0;

            // step over the current character before scanning
            GetNextCharacter();

            while (!IsAtEndOfStream)
            {
                bool closesDirective = NextCharacter == '>' && !IsNextCharacterEntity;

                // consume the current character; if it was the closing '>' we are done
                GetNextCharacter();
                if (closesDirective)
                {
                    return;
                }
            }
        }
Пример #25
0
 /// <summary>
 /// Produces the next token inside a tag: TagEnd, EmptyTagEnd, Name or Atom, or EOF
 /// when the stream is already exhausted on entry.
 /// </summary>
 internal void GetNextTagToken()
 {
     _nextToken.Length = 0;

     if (IsAtEndOfStream)
     {
         _nextTokenType = HtmlTokenType.EOF;
         return;
     }

     SkipWhiteSpace();

     if (NextCharacter == '>' && !IsNextCharacterEntity)
     {
         // a non-entity '>' closes the tag
         _nextTokenType = HtmlTokenType.TagEnd;
         _nextToken.Append('>');
         GetNextCharacter();
     }
     else if (NextCharacter == '/' && _lookAheadCharacter == '>')
     {
         // "/>" closes an empty tag
         _nextTokenType = HtmlTokenType.EmptyTagEnd;
         _nextToken.Append("/>");
         GetNextCharacter();
         GetNextCharacter();
         _ignoreNextWhitespace = false;
     }
     else if (IsGoodForNameStart(NextCharacter))
     {
         // a name: collect characters until one is not valid in a name or the input ends
         _nextTokenType = HtmlTokenType.Name;
         while (IsGoodForName(NextCharacter) && !IsAtEndOfStream)
         {
             _nextToken.Append(NextCharacter);
             GetNextCharacter();
         }
     }
     else
     {
         // anything else is reported as a one-character Atom
         _nextTokenType = HtmlTokenType.Atom;
         _nextToken.Append(NextCharacter);
         GetNextCharacter();
     }
 }
Пример #26
0
        /// <summary>
        /// Reads an attribute value into the pending token and always marks it as Atom,
        /// even when there is no suitable text at the current position.
        /// Does not guarantee that the reader advances.
        /// </summary>
        /// <remarks>
        /// Recovery from a missing closing quote differs from IE's: IE keeps reading to a
        /// closing quote or end of file, treats the value as text at end of file, and eats
        /// everything between quotes otherwise.
        /// TODO: Suggestion: stop at end of file or at any angle bracket and assume a quote
        /// was there, so the value may be meaningless but is never treated as text.
        /// </remarks>
        internal void GetNextAtomToken()
        {
            Debug.Assert(this.nextTokenType != HtmlTokenType.EOF, "Unexpected EOF");

            this.nextToken.Length = 0;
            this.SkipWhiteSpace();
            this.nextTokenType = HtmlTokenType.Atom;

            bool startsWithQuote = (this.NextCharacter == '\'' || this.NextCharacter == '"') && !this.IsNextCharacterEntity;
            if (!startsWithQuote)
            {
                // unquoted value: runs to whitespace, '>' or end of stream
                while (!this.IsAtEndOfStream && !char.IsWhiteSpace(this.NextCharacter) && this.NextCharacter != '>')
                {
                    this.nextToken.Append(this.NextCharacter);
                    this.GetNextCharacter();
                }
                return;
            }

            // quoted value: remember which quote opened it, then step past it
            char quote = this.NextCharacter;
            this.GetNextCharacter();

            // collect everything up to the matching non-entity quote or end of stream
            while ((this.NextCharacter != quote || this.IsNextCharacterEntity) && !this.IsAtEndOfStream)
            {
                this.nextToken.Append(this.NextCharacter);
                this.GetNextCharacter();
            }

            // step past the closing quote when there is one
            if (this.NextCharacter == quote)
            {
                this.GetNextCharacter();
            }
        }
Пример #27
0
        /// <summary>
        /// Marks the token as a comment and consumes characters up to and including the
        /// first '>' or '\0'.
        /// </summary>
        private void BogusComment()
        {
            _type = HtmlTokenType.Comment;

            char consumed;
            do
            {
                // remember what is being consumed so the terminator itself is swallowed too
                consumed = Current();
                Consume();
            }
            while (consumed != '>' && consumed != '\0');
        }
Пример #28
0
        /// <summary>
        /// Handles the first character after a comment opener. Marks the token as a
        /// comment and dispatches on the current character.
        /// </summary>
        private void CommentStart()
        {
            _type = HtmlTokenType.Comment;

            char current = Current();
            if (current == '-')
            {
                Consume();
                CommentStartDash();
            }
            else if (current == '>')
            {
                // the comment closes immediately
                Consume();
            }
            else
            {
                // regular comment content; the character is left for Comment() to handle
                Comment();
            }
        }
Пример #29
0
        /// <summary>
        /// Handles the character that follows a '/' inside a tag. A '>' completes a
        /// self-closing tag; anything other than '\0' is treated as the start of another
        /// attribute.
        /// </summary>
        private void SelfClosingStartTag()
        {
            char current = Current();
            if (current == '>')
            {
                _isSelfClosing = true;
                Consume();
            }
            else if (current == '\0')
            {
                // '\0' reclassifies the token as a comment
                _type = HtmlTokenType.Comment;
                Consume();
            }
            else
            {
                // the '/' did not close the tag: carry on with attributes
                BeforeAttributeName();
            }
        }
Пример #30
0
        /// <summary>
        /// Discards input up to and including the first "]&gt;" sequence and reports the
        /// pending token as empty text.
        /// </summary>
        private void ReadDynamicContent()
        {
            NextTokenType     = HtmlTokenType.Text;
            _nextToken.Length = 0;

            // step over the two opening characters before scanning
            GetNextCharacter();
            GetNextCharacter();

            while (!IsAtEndOfStream)
            {
                if (NextCharacter == ']' && _lookAheadCharacter == '>')
                {
                    // move onto the '>' and then past it
                    GetNextCharacter();
                    GetNextCharacter();
                    return;
                }

                GetNextCharacter();
            }
        }
Пример #31
0
        /// <summary>
        /// Scans character data. Marks the token as text and consumes characters until a
        /// '&lt;' is reached (left unconsumed) or a '\0' has been consumed.
        /// </summary>
        private void Data()
        {
            _type = HtmlTokenType.Text;

            char current;
            while ((current = Current()) != '<')
            {
                Consume();

                if (current == '\0')
                {
                    // the '\0' is swallowed and ends the text run
                    return;
                }
            }
        }
Пример #32
0
 /// <summary>
 /// Initializes a tag token with the given type, position and name, and starts it
 /// off with an empty attribute list.
 /// </summary>
 /// <param name="type">Token type, forwarded to the base constructor.</param>
 /// <param name="position">Position of the token, forwarded to the base constructor.</param>
 /// <param name="name">Tag name, forwarded to the base constructor.</param>
 public HtmlTagToken(HtmlTokenType type, TextPosition position, String name)
     : base(type, position, name)
 {
     // attributes are stored as (name, value) pairs
     _attributes = new List<KeyValuePair<String, String>>();
 }
Пример #33
0
        /// <summary>
        /// Discards a "&lt;!" directive that is neither a comment nor a "&lt;![" section
        /// (for example DOCTYPE, which is not supported). Everything up to and including
        /// the first "&gt;" for which IsNextCharacterEntity is false is skipped, and the
        /// pending token is reported as empty text.
        /// </summary>
        private void ReadUnknownDirective()
        {
            // precondition: positioned on the '!' of "<!", not followed by '-' or '['
            Debug.Assert(_previousCharacter == '<' && _nextCharacter == '!' && _lookAheadCharacter != '-' && _lookAheadCharacter != '[');

            // the directive contributes nothing: report it as an empty text token
            _nextTokenType = HtmlTokenType.Text;
            _nextToken.Length = 0;

            // step over the '!'
            GetNextCharacter();

            while (!IsAtEndOfStream)
            {
                bool closesDirective = _nextCharacter == '>' && !IsNextCharacterEntity;

                // consume the current character; if it was the closing '>' we are done
                GetNextCharacter();
                if (closesDirective)
                {
                    return;
                }
            }
        }
Пример #34
0
        /// <summary>
        /// Discards a "&lt;![" section (which may be CDATA) up to and including the first
        /// "]&gt;" sequence, and reports the pending token as empty text.
        /// </summary>
        /// <remarks>
        /// The scan deliberately stops at "]&gt;" rather than "]]&gt;", because some
        /// directives that open with "&lt;![" simply end with "]&gt;". As a consequence,
        /// bracketed content inside the directive cannot itself contain "]&gt;".
        /// </remarks>
        private void ReadDynamicContent()
        {
            // precondition: positioned on the '!' of "<!["
            Debug.Assert(_previousCharacter == '<' && _nextCharacter == '!' && _lookAheadCharacter == '[');

            // the section contributes nothing: report it as an empty text token
            _nextTokenType = HtmlTokenType.Text;
            _nextToken.Length = 0;

            // step over the '!' and then over the '['
            GetNextCharacter();
            GetNextCharacter();

            while (!IsAtEndOfStream)
            {
                if (_nextCharacter == ']' && _lookAheadCharacter == '>')
                {
                    // move onto the '>' and then past it
                    GetNextCharacter();
                    GetNextCharacter();
                    return;
                }

                GetNextCharacter();
            }
        }
Пример #35
0
        /// <summary>
        /// Reads a comment and reports it as a Comment token whose text is the comment body.
        /// The method is entered on the '!' of "&lt;!-". The comment ends at the standard
        /// "--&gt;" sequence or at the non-standard "!&gt;" sequence, which many real HTML
        /// pages use. A comment that is never closed ends at end of stream; whatever text
        /// was collected up to that point is kept as the token.
        /// </summary>
        private void ReadComment()
        {
            // verify that we are at a comment
            Debug.Assert(_previousCharacter == '<' && _nextCharacter == '!' && _lookAheadCharacter == '-');

            // Initialize a token
            _nextTokenType = HtmlTokenType.Comment;
            _nextToken.Length = 0;

            // advance to the next character, so that to be at the start of comment value
            this.GetNextCharacter(); // get first '-'
            this.GetNextCharacter(); // get second '-'
            if (!this.IsAtEndOfStream)
            {
                // Only step onto the comment content when the input did not end right after "<!-"
                this.GetNextCharacter(); // get first character of comment content
            }

            while (true)
            {
                // Read text until a possible end of comment: "--" or "!>"
                // Note that in many actual html pages comments end with "!>" (while xml standard is "-->")
                while (!this.IsAtEndOfStream && !(_nextCharacter == '-' && _lookAheadCharacter == '-' || _nextCharacter == '!' && _lookAheadCharacter == '>'))
                {
                    _nextToken.Append(this.NextCharacter);
                    this.GetNextCharacter();
                }

                if (this.IsAtEndOfStream)
                {
                    // Unterminated comment. Stop here instead of advancing past the end of
                    // the stream, which would otherwise repeat on every pass of this loop.
                    break;
                }

                // Step onto the second character of the candidate terminator
                this.GetNextCharacter();
                if (_previousCharacter == '-' && _nextCharacter == '-' && _lookAheadCharacter == '>')
                {
                    // Standard comment end. Eat it and exit the loop
                    this.GetNextCharacter(); // get '>'
                    break;
                }
                else if (_previousCharacter == '!' && _nextCharacter == '>')
                {
                    // Nonstandard but possible comment end - '!>'. Exit the loop
                    break;
                }
                else
                {
                    // "--" not followed by '>': the first '-' belongs to the comment text.
                    // Save it and keep reading.
                    _nextToken.Append(_previousCharacter);
                }
            }

            // Step past the closing '>' of the terminator, if there is one
            if (!this.IsAtEndOfStream && _nextCharacter == '>')
            {
                this.GetNextCharacter();
            }
        }
Пример #36
0
        /// <summary>
        /// Reads the next token inside a tag. The token is always one of TagEnd, EmptyTagEnd,
        /// Name, Atom or EOF; its text is left in _nextToken and its kind in _nextTokenType.
        /// Leading whitespace is skipped first. EOF is reported both when the stream is
        /// already exhausted on entry and when skipping whitespace reaches its end.
        /// Does not guarantee token reader advancing.
        /// </summary>
        internal void GetNextTagToken()
        {
            _nextToken.Length = 0;
            if (this.IsAtEndOfStream)
            {
                _nextTokenType = HtmlTokenType.EOF;
                return;
            }

            this.SkipWhiteSpace();

            if (this.IsAtEndOfStream)
            {
                // Skipping whitespace may have consumed the rest of the input. Report EOF
                // instead of falling through to the Atom branch, which would advance past
                // the end of the stream.
                _nextTokenType = HtmlTokenType.EOF;
                return;
            }

            if (this.NextCharacter == '>' && !this.IsNextCharacterEntity)
            {
                // &gt; should not end a tag, so make sure it's not an entity
                _nextTokenType = HtmlTokenType.TagEnd;
                _nextToken.Append('>');
                this.GetNextCharacter();
            }
            else if (this.NextCharacter == '/' && _lookAheadCharacter == '>')
            {
                // "/>" closes an empty (self-closing) tag
                _nextTokenType = HtmlTokenType.EmptyTagEnd;
                _nextToken.Append("/>");
                this.GetNextCharacter();
                this.GetNextCharacter();
                _ignoreNextWhitespace = false; // Whitespace after no-scope tags is significant
            }
            else if (IsGoodForNameStart(this.NextCharacter))
            {
                _nextTokenType = HtmlTokenType.Name;

                // starts a name
                // we allow character entities here
                // we do not throw exceptions here if end of stream is encountered
                // just stop and return whatever is in the token
                // if the parser is not expecting end of file after this it will call
                // the get next token function and throw an exception
                while (IsGoodForName(this.NextCharacter) && !this.IsAtEndOfStream)
                {
                    _nextToken.Append(this.NextCharacter);
                    this.GetNextCharacter();
                }
            }
            else
            {
                // Unexpected type of token for a tag. Report one character as Atom, expecting that HtmlParser will ignore it.
                _nextTokenType = HtmlTokenType.Atom;
                _nextToken.Append(this.NextCharacter);
                this.GetNextCharacter();
            }
        }
Пример #37
0
 /// <summary>
 /// Creates a token of the given type from a start offset and an end offset.
 /// The token length passed to the constructor is <c>end - start</c>.
 /// </summary>
 public static HtmlToken FromBounds(HtmlTokenType type, int start, int end) {
     int length = end - start;
     return new HtmlToken(type, start, length);
 }
Пример #38
0
        /// <summary>
        /// Reads an attribute value and always reports it as an Atom token, even when no
        /// suitable text is present (the token is then empty).
        /// A value that opens with a single or double quote runs to the matching quote or
        /// to end of stream; any other value runs to the next whitespace, '&gt;' or end of stream.
        /// Does not guarantee token reader advancing.
        /// </summary>
        internal void GetNextAtomToken()
        {
            Debug.Assert(_nextTokenType != HtmlTokenType.EOF);
            _nextToken.Length = 0;

            SkipWhiteSpace();

            _nextTokenType = HtmlTokenType.Atom;

            // A quote written as a character entity does not open a quoted value.
            bool opensWithQuote = (NextCharacter == '\'' || NextCharacter == '"') && !IsNextCharacterEntity;

            if (!opensWithQuote)
            {
                // Unquoted value: take everything up to whitespace, '>' or end of stream.
                while (!IsAtEndOfStream && !Char.IsWhiteSpace(NextCharacter) && NextCharacter != '>')
                {
                    _nextToken.Append(NextCharacter);
                    GetNextCharacter();
                }

                return;
            }

            char quote = NextCharacter;
            GetNextCharacter();

            // Quoted value: collect everything up to the matching literal quote (an entity
            // does not count) or end of stream.
            // IE keeps reading until it finds a closing quote or end of file; at end of file
            // it treats the value as text. Stopping at end of file or at an angle bracket and
            // assuming a quote there would be an alternative; either way the value may be
            // meaningless, but here it is never treated as text.
            while ((NextCharacter != quote || IsNextCharacterEntity) && !IsAtEndOfStream)
            {
                _nextToken.Append(NextCharacter);
                GetNextCharacter();
            }

            // Step past the closing quote when there is one.
            if (NextCharacter == quote)
            {
                GetNextCharacter();
            }
        }
Пример #39
0
 /// <summary>
 /// Creates a token of the given type by forwarding to the three-argument
 /// overload with 0 for both remaining arguments.
 /// </summary>
 public HtmlToken(HtmlTokenType type)
     : this(type, 0, 0)
 {
 }
Пример #40
0
 /// <summary>
 /// Creates a token and stores the supplied type, position and name.
 /// </summary>
 /// <param name="type">The type of the token.</param>
 /// <param name="position">The token's position.</param>
 /// <param name="name">The name stored with the token.</param>
 public HtmlToken(HtmlTokenType type, TextPosition position, String name)
 {
     _name = name;
     _position = position;
     _type = type;
 }
Пример #41
0
 /// <summary>
 /// Creates a token that has a type and a position but no name
 /// (null is forwarded as the name).
 /// </summary>
 /// <param name="type">The type of the token.</param>
 /// <param name="position">The token's position.</param>
 public HtmlToken(HtmlTokenType type, TextPosition position) : this(type, position, null)
 {
 }
Пример #42
0
 /// <summary>
 /// Creates a token of the given type, start and length by forwarding to the
 /// four-argument overload with <c>true</c> as the last argument.
 /// </summary>
 public HtmlToken(HtmlTokenType type, int start, int length)
     : this(type, start, length, true)
 {
 }
Пример #43
0
 /// <summary>
 /// Creates a token: start and length go to the base constructor, while the
 /// token type and the well-formed flag are stored on this instance.
 /// </summary>
 public HtmlToken(HtmlTokenType type, int start, int length, bool wellFormed)
     : base(start, length)
 {
     _isWellFormed = wellFormed;
     _tokenType = type;
 }
Пример #44
0
 /// <summary>
 /// Creates a token of the given type from a start offset and an end offset,
 /// passing the well-formed flag through unchanged. The token length passed to
 /// the constructor is <c>end - start</c>.
 /// </summary>
 public static HtmlToken FromBounds(HtmlTokenType type, int start, int end, bool wellFormed) {
     int length = end - start;
     return new HtmlToken(type, start, length, wellFormed);
 }
Пример #45
0
 /// <summary>
 /// Creates a tag token with the given type and position and an empty
 /// string as its name.
 /// </summary>
 /// <param name="type">The type of the tag token.</param>
 /// <param name="position">The token's position.</param>
 public HtmlTagToken(HtmlTokenType type, TextPosition position) : this(type, position, String.Empty)
 {
 }
Пример #46
0
 /// <summary>
 /// Returns the token type's <c>ToString()</c> text, unmodified.
 /// </summary>
 string TokenTypeAsString(HtmlTokenType t) {
     return t.ToString();
 }
Пример #47
0
        /// <summary>
        /// Reads the next token from document content (outside of a tag), leaving its
        /// text in _nextToken and its kind in _nextTokenType.
        /// At end of stream the token type is EOF. At a tag start it is OpeningTagStart
        /// or ClosingTagStart. At a "&lt;!" directive the work is handed to
        /// ReadDynamicContent, ReadComment or ReadUnknownDirective. Anything else is
        /// collected as a Text token with whitespace runs collapsed.
        /// </summary>
        internal void GetNextContentToken()
        {
            Debug.Assert(_nextTokenType != HtmlTokenType.EOF);
            _nextToken.Length = 0;

            if (IsAtEndOfStream)
            {
                _nextTokenType = HtmlTokenType.EOF;
                return;
            }

            if (IsAtTagStart)
            {
                GetNextCharacter();

                if (NextCharacter != '/')
                {
                    _nextTokenType = HtmlTokenType.OpeningTagStart;
                    _nextToken.Append("<");
                    _ignoreNextWhitespace = true; // Whitespace after an opening tag is insignificant
                    return;
                }

                _nextToken.Append("</");
                _nextTokenType = HtmlTokenType.ClosingTagStart;

                // Step past the '/'.
                GetNextCharacter();
                _ignoreNextWhitespace = false; // Whitespace after a closing tag is significant
                return;
            }

            if (IsAtDirectiveStart)
            {
                // The character after "<!" decides which kind of directive this is.
                GetNextCharacter();
                switch (_lookAheadCharacter)
                {
                    case '[':
                        // "<![" - dynamic content such as CDATA
                        ReadDynamicContent();
                        break;

                    case '-':
                        // "<!-" - comment
                        ReadComment();
                        break;

                    default:
                        // Something like DOCTYPE: skip to the next tag end
                        ReadUnknownDirective();
                        break;
                }

                return;
            }

            // Plain text: read until a tag start, a directive start or end of stream.
            _nextTokenType = HtmlTokenType.Text;
            while (!IsAtTagStart && !IsAtEndOfStream && !IsAtDirectiveStart)
            {
                if (NextCharacter == '<' && !IsNextCharacterEntity && _lookAheadCharacter == '?')
                {
                    // "<?" - processing directive; it contributes nothing to the text
                    SkipProcessingDirective();
                    continue;
                }

                if (NextCharacter > ' ')
                {
                    _nextToken.Append(NextCharacter);
                    _ignoreNextWhitespace = false;
                }
                else
                {
                    // Whitespace or control character: a run of them becomes a single
                    // space, and is dropped entirely while whitespace is being ignored.
                    if (!_ignoreNextWhitespace)
                    {
                        _nextToken.Append(' ');
                    }

                    _ignoreNextWhitespace = true; // keep ignoring the rest of the run
                }

                GetNextCharacter();
            }
        }
Пример #48
0
 /// <summary>
 /// Wraps an existing token (passed on to the base constructor) and records its
 /// token type together with the opening and closing quote characters.
 /// </summary>
 public ComplexAttributeValueToken(IHtmlToken token, char openQuote, char closeQuote)
     : base(token)
 {
     _openQuote = openQuote;
     _closeQuote = closeQuote;
     _tokenType = token.TokenType;
 }
Пример #49
0
        /// <summary>
        /// Always reports an EqualSign token with the text "=", whether or not the
        /// stream actually contains an equal sign at this point. Leading whitespace is
        /// skipped, and a real '=' is consumed when one follows.
        /// Does not guarantee token reader advancing.
        /// </summary>
        internal void GetNextEqualSignToken()
        {
            Debug.Assert(_nextTokenType != HtmlTokenType.EOF);

            // The reported token is fixed up front; the input only decides how far to advance.
            _nextTokenType = HtmlTokenType.EqualSign;
            _nextToken.Length = 0;
            _nextToken.Append('=');

            SkipWhiteSpace();

            if (NextCharacter != '=')
            {
                // No real equal sign: behave as if there had been one and consume nothing.
                return;
            }

            // '=' is not in the list of entities, so no entity check is needed here.
            GetNextCharacter();
        }