/// <summary>
/// Skips a directive that starts with "&lt;!" but is neither a comment nor a
/// "&lt;![" section (for example DOCTYPE), which is not supported here.
/// Everything up to and including the next '&gt;' that is not a character
/// entity is discarded, and the directive is reported as an empty Text token.
/// </summary>
private void ReadUnknownDirective()
{
    // Expected position: on the '!' right after '<', with neither '-' nor '[' following.
    Debug.Assert(_previousCharacter == '<' && NextCharacter == '!' && _lookAheadCharacter != '-' && _lookAheadCharacter != '[');

    // The directive produces no text of its own.
    NextTokenType = HtmlTokenType.Text;
    _nextToken.Length = 0;

    // Step over the '!'.
    GetNextCharacter();

    // Scan forward until a real (non-entity) '>' or the end of the input.
    while (true)
    {
        bool atTagEnd = NextCharacter == '>' && !IsNextCharacterEntity;
        if (atTagEnd || IsAtEndOfStream)
        {
            break;
        }

        GetNextCharacter();
    }

    if (IsAtEndOfStream)
    {
        return;
    }

    // Step past the closing '>'.
    GetNextCharacter();
}
internal HtmlToken( HtmlTokenType type, bool isSelfClosing, ReadOnlyMemory <char> name, ReadOnlyMemory <char> rawText, Memory <HtmlAttribute> attributes, in HtmlTextRange range,
/// <summary>
/// Creates the token from an existing <paramref name="token"/>, which is
/// forwarded to the base constructor, and stores the supplied quote characters.
/// </summary>
/// <param name="token">Source token; its <c>TokenType</c> is copied.</param>
/// <param name="openQuote">Opening quote character to store.</param>
/// <param name="closeQuote">Closing quote character to store.</param>
public ComplexAttributeValueToken(IHtmlToken token, char openQuote, char closeQuote)
    : base(token)
{
    this._tokenType = token.TokenType;
    this._openQuote = openQuote;
    this._closeQuote = closeQuote;
}
/// <summary>
/// Returns the result of calling <c>ToString()</c> on <paramref name="t"/>.
/// </summary>
string TokenTypeAsString(HtmlTokenType t)
{
    return t.ToString();
}
/// <summary>
/// Consumes a quoted attribute value up to the matching <paramref name="quote"/>,
/// recording where the value starts and ends, then registers the attribute and
/// continues with the next attribute name.
/// </summary>
/// <param name="quote">The quote character that terminates the value.</param>
private void AttributeValue(char quote)
{
    _attributeValueStart = Position();

    for (;;)
    {
        char current = Current();

        if (current == '\0')
        {
            // '\0' (presumably end of input) inside a value: the token is re-typed as a comment.
            _type = HtmlTokenType.Comment;
            Consume();
            return;
        }

        if (current == quote)
        {
            // The value ends before the quote; the attribute ends after it.
            _attributeValueEnd = Position();
            Consume();
            _attributeEnd = Position();
            AddAttribute();
            BeforeAttributeName();
            return;
        }

        Consume();
    }
}
/// <summary>
/// Handles the character that follows "&lt;/". The token starts out as an
/// EndTag and is then refined according to that character.
/// </summary>
private void EndTagOpen()
{
    _type = HtmlTokenType.EndTag;

    char current = Current();

    if (current == '>')
    {
        // "</>": an end tag with an empty name.
        Consume();
        _nameEnd = _nameStart;
    }
    else if (current == '\0')
    {
        // '\0' (presumably end of input) right after "</": report as text.
        Consume();
        _type = HtmlTokenType.Text;
    }
    else if (IsASCIIAlpha(current))
    {
        TagName(readAttributes: false);
    }
    else
    {
        // Anything else is read as a bogus comment.
        Consume();
        BogusComment();
    }
}
/// <summary>
/// Reads an attribute value as an Atom token. A value that starts with a
/// (non-entity) single or double quote runs to the matching quote; otherwise
/// it runs to the next whitespace character, '&gt;' or the end of the stream.
/// The token type is set to Atom even if nothing is read.
/// </summary>
internal void GetNextAtomToken()
{
    _nextToken.Length = 0;
    SkipWhiteSpace();
    _nextTokenType = HtmlTokenType.Atom;

    bool startsWithQuote = (NextCharacter == '\'' || NextCharacter == '"') && !IsNextCharacterEntity;

    if (!startsWithQuote)
    {
        // Unquoted value.
        while (!IsAtEndOfStream && !char.IsWhiteSpace(NextCharacter) && NextCharacter != '>')
        {
            _nextToken.Append(NextCharacter);
            GetNextCharacter();
        }

        return;
    }

    char quote = NextCharacter;
    GetNextCharacter();

    // Collect everything up to the matching quote; a quote that came from an entity does not count.
    while (!(NextCharacter == quote && !IsNextCharacterEntity) && !IsAtEndOfStream)
    {
        _nextToken.Append(NextCharacter);
        GetNextCharacter();
    }

    // Step over the closing quote when there is one.
    if (NextCharacter == quote)
    {
        GetNextCharacter();
    }
}
/// <summary>
/// Skips content that starts with "&lt;![" (which may include CDATA) and ends
/// with "]&gt;". The skipped content is reported as an empty Text token.
/// </summary>
/// <remarks>
/// The scan stops at the first "]&gt;" rather than "]]&gt;", because some
/// directives open with "&lt;![" and close with a single "]&gt;". As a
/// consequence the skipped content cannot itself contain "]&gt;".
/// </remarks>
private void ReadDynamicContent()
{
    // Expected position: on the '!' right after '<', with '[' following.
    Debug.Assert(_previousCharacter == '<' && NextCharacter == '!' && _lookAheadCharacter == '[');

    // Nothing inside the section contributes to the token text.
    NextTokenType = HtmlTokenType.Text;
    _nextToken.Length = 0;

    // Move onto the '[' and then onto the first character after it.
    GetNextCharacter();
    GetNextCharacter();

    // Scan until the current character is ']' and the look-ahead is '>', or input ends.
    while (true)
    {
        bool atTerminator = NextCharacter == ']' && _lookAheadCharacter == '>';
        if (atTerminator || IsAtEndOfStream)
        {
            break;
        }

        GetNextCharacter();
    }

    if (IsAtEndOfStream)
    {
        return;
    }

    // Move onto the '>' and then past it.
    GetNextCharacter();
    GetNextCharacter();
}
/// <summary>
/// Reads a comment into the current token (type Comment). The comment body
/// ends at "--&gt;" or at the non-standard "!&gt;". If the input ends before a
/// terminator is found, the text collected so far is kept and the method
/// returns without advancing past the end of the stream.
/// </summary>
private void ReadComment()
{
    this._nextTokenType = HtmlTokenType.Comment;
    this._nextToken.Length = 0;

    // Skip three characters to reach the comment body (the caller is expected to be
    // positioned on the '!' of "<!--"; this is not asserted here). Never advance at
    // end of stream.
    for (int skipped = 0; skipped < 3 && !this.IsAtEndOfStream; skipped++)
    {
        this.GetNextCharacter();
    }

    while (true)
    {
        // Copy body text until a possible terminator ("--" or "!>") or the end of input.
        while (!this.IsAtEndOfStream
            && !((this._nextCharacter == '-' && this._lookAheadCharacter == '-')
                || (this._nextCharacter == '!' && this._lookAheadCharacter == '>')))
        {
            this._nextToken.Append(this.NextCharacter);
            this.GetNextCharacter();
        }

        if (this.IsAtEndOfStream)
        {
            // Unterminated comment: without this exit the loop would keep calling
            // GetNextCharacter at end of stream and never terminate normally.
            return;
        }

        this.GetNextCharacter();

        if (this._previousCharacter == '-' && this._nextCharacter == '-' && this._lookAheadCharacter == '>')
        {
            // Standard terminator "-->": move onto the '>'.
            this.GetNextCharacter();
            break;
        }

        if (this._previousCharacter == '!' && this._nextCharacter == '>')
        {
            // Non-standard terminator "!>": already on the '>'.
            break;
        }

        // False alarm: the character belongs to the comment body.
        this._nextToken.Append(this._previousCharacter);
    }

    // Step past the closing '>'.
    if (this._nextCharacter == '>')
    {
        this.GetNextCharacter();
    }
}
/// <summary>
/// Handles the character that follows '&lt;'. The token starts out as a
/// StartTag and the matching sub-state is entered based on that character.
/// </summary>
private void TagOpen()
{
    _type = HtmlTokenType.StartTag;

    char current = Current();

    if (current == '!')
    {
        Consume();
        MarkdownDeclarationOpen();
    }
    else if (current == '?')
    {
        // The '?' is not consumed here; BogusComment reads it.
        BogusComment();
    }
    else if (current == '/')
    {
        Consume();
        EndTagOpen();
    }
    else if (IsASCIIAlpha(current))
    {
        TagName(readAttributes: true);
    }
    else
    {
        // Not a tag after all: continue as plain data.
        Data();
    }
}
/// <summary>
/// Returns <c>false</c> for WhiteSpace, NewLine, RazorComment, RazorCommentStar,
/// RazorCommentTransition and Transition; <c>true</c> for every other token type.
/// </summary>
/// <param name="type">The token type to classify.</param>
private bool IsValidTokenType(HtmlTokenType type)
{
    bool isBlank = type == HtmlTokenType.WhiteSpace
        || type == HtmlTokenType.NewLine;

    bool isRazorComment = type == HtmlTokenType.RazorComment
        || type == HtmlTokenType.RazorCommentStar
        || type == HtmlTokenType.RazorCommentTransition;

    bool isTransition = type == HtmlTokenType.Transition;

    return !(isBlank || isRazorComment || isTransition);
}
/// <summary>
/// Handles the input that follows an attribute name: skips whitespace, then
/// either finishes the tag, starts reading a value after '=', or starts the
/// next attribute name.
/// </summary>
private void AfterAttributeName()
{
    for (;;)
    {
        char current = Current();

        if (current == '\0')
        {
            // '\0' (presumably end of input) inside a tag: the token is re-typed as a comment.
            _type = HtmlTokenType.Comment;
            Consume();
            return;
        }

        if (current == '=')
        {
            Consume();
            BeforeAttributeValue();
            return;
        }

        bool isWhiteSpace = current == '\t' || current == '\r' || current == '\n' || current == '\f' || current == ' ';
        bool endsTag = current == '>' || current == '/';

        if (!isWhiteSpace && !endsTag)
        {
            // A new attribute begins: store the pending one first.
            AddAttribute();
            AttributeName();
            return;
        }

        // Record the attribute end once, while it still equals the attribute start.
        if (_attributeEnd.Index == _attributeStart.Index)
        {
            _attributeEnd = Position();
        }

        Consume();

        if (isWhiteSpace)
        {
            continue;
        }

        AddAttribute();

        if (current == '/')
        {
            SelfClosingStartTag();
        }

        return;
    }
}
/// <summary>
/// Sets the current token to "=" with type EqualSign regardless of the input.
/// Leading whitespace is skipped, and a '=' is consumed only if one is
/// actually present, so the reader may not advance.
/// </summary>
internal void GetNextEqualSignToken()
{
    _nextTokenType = HtmlTokenType.EqualSign;
    _nextToken.Length = 0;
    _nextToken.Append('=');

    SkipWhiteSpace();

    if (NextCharacter != '=')
    {
        return;
    }

    GetNextCharacter();
}
/// <summary>
/// Reads a tag name, recording where it starts and ends, and hands over to the
/// appropriate follow-up state.
/// </summary>
/// <param name="readAttributes">
/// When <c>true</c>, whitespace after the name switches to attribute reading;
/// when <c>false</c>, scanning simply continues until the tag ends.
/// </param>
private void TagName(bool readAttributes)
{
    _nameEnd = Position();
    _nameStart = _nameEnd;

    for (;;)
    {
        char current = Current();

        if (current == '\0')
        {
            // '\0' (presumably end of input) inside a tag: the token is re-typed as a comment.
            _type = HtmlTokenType.Comment;
            Consume();
            return;
        }

        bool isWhiteSpace = current == '\t' || current == '\r' || current == '\n' || current == '\f' || current == ' ';

        if (current != '>' && current != '/' && !isWhiteSpace)
        {
            // Ordinary character: keep scanning.
            Consume();
            continue;
        }

        // Record the name end once, while it still equals the name start.
        if (_nameEnd.Index == _nameStart.Index)
        {
            _nameEnd = Position();
        }

        Consume();

        if (current == '>')
        {
            return;
        }

        if (current == '/')
        {
            SelfClosingStartTag();
            return;
        }

        // Whitespace after the name.
        if (readAttributes)
        {
            BeforeAttributeName();
            return;
        }
    }
}
/// <summary>
/// Produces the next token while inside a tag. The result is always one of
/// TagEnd, EmptyTagEnd, Name, Atom or EOF.
/// Does not guarantee that the reader advances.
/// </summary>
internal void GetNextTagToken()
{
    this.nextToken.Length = 0;

    if (this.IsAtEndOfStream)
    {
        this.nextTokenType = HtmlTokenType.EOF;
        return;
    }

    this.SkipWhiteSpace();

    if (this.NextCharacter == '>' && !this.IsNextCharacterEntity)
    {
        // Only a literal '>' ends a tag; one that came from an entity does not.
        this.nextTokenType = HtmlTokenType.TagEnd;
        this.nextToken.Append('>');
        this.GetNextCharacter();
        return;
    }

    if (this.NextCharacter == '/' && this.lookAheadCharacter == '>')
    {
        // "/>" closes an empty tag.
        this.nextTokenType = HtmlTokenType.EmptyTagEnd;
        this.nextToken.Append("/>");
        this.GetNextCharacter();
        this.GetNextCharacter();

        // Whitespace after no-scope tags is significant.
        this.ignoreNextWhitespace = false;
        return;
    }

    if (!this.IsGoodForNameStart(this.NextCharacter))
    {
        // Unexpected character inside a tag: report that one character as an Atom,
        // expecting the parser to ignore it.
        this.nextTokenType = HtmlTokenType.Atom;
        this.nextToken.Append(this.NextCharacter);
        this.GetNextCharacter();
        return;
    }

    // A name. Character entities are allowed here. Reaching the end of the stream
    // simply ends the name; no exception is raised at this point.
    this.nextTokenType = HtmlTokenType.Name;
    while (this.IsGoodForName(this.NextCharacter) && !this.IsAtEndOfStream)
    {
        this.nextToken.Append(this.NextCharacter);
        this.GetNextCharacter();
    }
}
/// <summary>
/// Creates an analyzer over <paramref name="inputTextString"/>, primes the
/// look-ahead with the first character of the input and then advances once so
/// that <c>NextCharacter</c> holds a value.
/// </summary>
/// <param name="inputTextString">The text to analyze.</param>
internal HtmlLexicalAnalyzer(string inputTextString)
{
    _inputStringReader = new StringReader(inputTextString);

    // Token state.
    _nextToken = new StringBuilder(100);
    _nextTokenType = HtmlTokenType.Text;
    _ignoreNextWhitespace = true;

    // Character state before anything has been read.
    _previousCharacter = ' ';
    _nextCharacter = ' ';
    _nextCharacterCode = 0;

    // Prime the look-ahead with the first character code of the input.
    int firstCode = _inputStringReader.Read();
    _lookAheadCharacterCode = firstCode;
    _lookAheadCharacter = (char)firstCode;

    GetNextCharacter();
}
/// <summary>
/// Reads a comment into the current token (type Comment). The body ends at the
/// standard "--&gt;" or at the non-standard "!&gt;", which many real pages use.
/// If the input ends before a terminator is found, the text collected so far
/// is kept and the method returns without advancing past the end of the stream.
/// </summary>
private void ReadComment()
{
    // Expected position: on the '!' right after '<', with '-' following.
    Debug.Assert(this.previousCharacter == '<' && this.nextCharacter == '!' && this.lookAheadCharacter == '-', "HtmlToXamlConverter");

    this.nextTokenType = HtmlTokenType.Comment;
    this.nextToken.Length = 0;

    // Move over the first '-', the second '-', and onto the first body character.
    // Never advance at end of stream (e.g. input that ends right after "<!-").
    for (int skipped = 0; skipped < 3 && !this.IsAtEndOfStream; skipped++)
    {
        this.GetNextCharacter();
    }

    while (true)
    {
        // Copy body text until a possible terminator ("--" or "!>") or the end of input.
        while (!this.IsAtEndOfStream
            && !((this.nextCharacter == '-' && this.lookAheadCharacter == '-')
                || (this.nextCharacter == '!' && this.lookAheadCharacter == '>')))
        {
            this.nextToken.Append(this.NextCharacter);
            this.GetNextCharacter();
        }

        if (this.IsAtEndOfStream)
        {
            // Unterminated comment: without this exit the loop would keep calling
            // GetNextCharacter at end of stream and never terminate normally.
            return;
        }

        this.GetNextCharacter();

        if (this.previousCharacter == '-' && this.nextCharacter == '-' && this.lookAheadCharacter == '>')
        {
            // Standard terminator "-->": move onto the '>'.
            this.GetNextCharacter();
            break;
        }

        if (this.previousCharacter == '!' && this.nextCharacter == '>')
        {
            // Non-standard terminator "!>": already on the '>'.
            break;
        }

        // Not a terminator: the character belongs to the comment body.
        this.nextToken.Append(this.previousCharacter);
    }

    // Step past the closing '>'.
    if (this.nextCharacter == '>')
    {
        this.GetNextCharacter();
    }
}
/// <summary>
/// Creates an analyzer that reads from <paramref name="inputTextString"/> and
/// sets the initial character and token state.
/// </summary>
/// <param name="inputTextString">
/// Text string to be parsed for xml content.
/// </param>
internal HtmlLexicalAnalyzer(string inputTextString)
{
    _inputStringReader = new StringReader(inputTextString);

    // Token state: an empty Text token, with leading whitespace ignored.
    _nextTokenType = HtmlTokenType.Text;
    _nextToken = new StringBuilder(100);
    _ignoreNextWhitespace = true;

    // Placeholder characters until real input has been read.
    _nextCharacterCode = 0;
    _nextCharacter = ' ';
    _previousCharacter = ' ';

    // Load the first character code of the input into the look-ahead slot.
    int lookAheadCode = _inputStringReader.Read();
    _lookAheadCharacterCode = lookAheadCode;
    _lookAheadCharacter = (char)lookAheadCode;

    // Advance once so the NextCharacter property has a value.
    this.GetNextCharacter();
}
/// <summary>
/// Discards input up to and including the next '&gt;' that is not a character
/// entity, and reports the skipped span as an empty Text token.
/// </summary>
private void ReadUnknownDirective()
{
    _nextTokenType = HtmlTokenType.Text;
    _nextToken.Length = 0;

    // Step over the current character before scanning.
    GetNextCharacter();

    // Stop on a real (non-entity) '>' or at the end of the input.
    while (!(_nextCharacter == '>' && !IsNextCharacterEntity) && !IsAtEndOfStream)
    {
        GetNextCharacter();
    }

    if (IsAtEndOfStream)
    {
        return;
    }

    // Step past the closing '>'.
    GetNextCharacter();
}
/// <summary>
/// Reads a comment into the current token (type Comment). The body ends at the
/// standard "--&gt;" or at the non-standard "!&gt;", which many real pages use.
/// If the input ends before a terminator is found, the text collected so far
/// is kept and the method returns without advancing past the end of the stream.
/// </summary>
private void ReadComment()
{
    // Expected position: on the '!' right after '<', with '-' following.
    Debug.Assert(_previousCharacter == '<' && NextCharacter == '!' && _lookAheadCharacter == '-');

    NextTokenType = HtmlTokenType.Comment;
    _nextToken.Length = 0;

    // Move over the first '-', the second '-', and onto the first body character.
    // Never advance at end of stream (e.g. input that ends right after "<!-").
    for (int skipped = 0; skipped < 3 && !IsAtEndOfStream; skipped++)
    {
        GetNextCharacter();
    }

    while (true)
    {
        // Copy body text until a possible terminator ("--" or "!>") or the end of input.
        while (!IsAtEndOfStream
            && !((NextCharacter == '-' && _lookAheadCharacter == '-')
                || (NextCharacter == '!' && _lookAheadCharacter == '>')))
        {
            _nextToken.Append(NextCharacter);
            GetNextCharacter();
        }

        if (IsAtEndOfStream)
        {
            // Unterminated comment: without this exit the loop would keep calling
            // GetNextCharacter at end of stream and never terminate normally.
            return;
        }

        GetNextCharacter();

        if (_previousCharacter == '-' && NextCharacter == '-' && _lookAheadCharacter == '>')
        {
            // Standard terminator "-->": move onto the '>'.
            GetNextCharacter();
            break;
        }

        if (_previousCharacter == '!' && NextCharacter == '>')
        {
            // Non-standard terminator "!>": already on the '>'.
            break;
        }

        // Not a terminator: the character belongs to the comment body.
        _nextToken.Append(_previousCharacter);
    }

    // Step past the closing '>'.
    if (NextCharacter == '>')
    {
        GetNextCharacter();
    }
}
// ---------------------------------------------------------------------
//
// Constructors
//
// ---------------------------------------------------------------------
#region Constructors

/// <summary>
/// Initializes a new instance of the <see cref="HtmlLexicalAnalyzer" /> class
/// that reads from <paramref name="inputTextString"/>, and sets the initial
/// character and token state.
/// </summary>
/// <param name="inputTextString">
/// Text string to be parsed for xml content.
/// </param>
internal HtmlLexicalAnalyzer(string inputTextString)
{
    this.inputStringReader = new StringReader(inputTextString);

    // Token state: an empty Text token, with leading whitespace ignored.
    this.nextTokenType = HtmlTokenType.Text;
    this.nextToken = new StringBuilder(100);
    this.ignoreNextWhitespace = true;

    // Placeholder characters until real input has been read.
    this.nextCharacterCode = 0;
    this.nextCharacter = ' ';
    this.previousCharacter = ' ';

    // Load the first character code of the input into the look-ahead slot.
    int lookAheadCode = this.inputStringReader.Read();
    this.lookAheadCharacterCode = lookAheadCode;
    this.lookAheadCharacter = (char)lookAheadCode;

    // Advance once so the NextCharacter property has a value.
    this.GetNextCharacter();
}
/// <summary>
/// Always produces an EqualSign token with the text "=", whether or not the
/// stream really contains an equal sign at this point.
/// Does not guarantee that the reader advances.
/// </summary>
internal void GetNextEqualSignToken()
{
    Debug.Assert(this.nextTokenType != HtmlTokenType.EOF, "Unexpected EOF");

    this.nextTokenType = HtmlTokenType.EqualSign;
    this.nextToken.Length = 0;
    this.nextToken.Append('=');

    this.SkipWhiteSpace();

    if (this.NextCharacter != '=')
    {
        return;
    }

    // '=' is not in the list of entities, so no entity check is needed before consuming it.
    this.GetNextCharacter();
}
/// <summary>
/// Discards input up to and including the first "]&gt;" sequence and reports
/// the skipped span as an empty Text token.
/// </summary>
private void ReadDynamicContent()
{
    _nextTokenType = HtmlTokenType.Text;
    _nextToken.Length = 0;

    // Advance two characters before scanning.
    GetNextCharacter();
    GetNextCharacter();

    // Stop when the current character is ']' and the look-ahead is '>', or when input ends.
    while (!(_nextCharacter == ']' && _lookAheadCharacter == '>') && !IsAtEndOfStream)
    {
        GetNextCharacter();
    }

    if (IsAtEndOfStream)
    {
        return;
    }

    // Move onto the '>' and then past it.
    GetNextCharacter();
    GetNextCharacter();
}
/// <summary>
/// Discards input up to and including the next '&gt;' that is not a character
/// entity, and reports the skipped span as an empty Text token.
/// </summary>
private void ReadUnknownDirective()
{
    NextTokenType = HtmlTokenType.Text;
    _nextToken.Length = 0;

    // Step over the current character before scanning.
    GetNextCharacter();

    // Stop on a real (non-entity) '>' or at the end of the input.
    while ((NextCharacter != '>' || IsNextCharacterEntity) && !IsAtEndOfStream)
    {
        GetNextCharacter();
    }

    if (IsAtEndOfStream)
    {
        return;
    }

    // Step past the closing '>'.
    GetNextCharacter();
}
/// <summary>
/// Produces the next token while inside a tag: EOF at end of stream, TagEnd for
/// a literal '&gt;', EmptyTagEnd for "/&gt;", Name for a run of name characters,
/// and otherwise a single-character Atom.
/// </summary>
internal void GetNextTagToken()
{
    _nextToken.Length = 0;

    if (IsAtEndOfStream)
    {
        _nextTokenType = HtmlTokenType.EOF;
        return;
    }

    SkipWhiteSpace();

    if (NextCharacter == '>' && !IsNextCharacterEntity)
    {
        // A '>' that came from a character entity does not end the tag.
        _nextTokenType = HtmlTokenType.TagEnd;
        _nextToken.Append('>');
        GetNextCharacter();
    }
    else if (NextCharacter == '/' && _lookAheadCharacter == '>')
    {
        _nextTokenType = HtmlTokenType.EmptyTagEnd;
        _nextToken.Append("/>");
        GetNextCharacter();
        GetNextCharacter();
        _ignoreNextWhitespace = false;
    }
    else if (IsGoodForNameStart(NextCharacter))
    {
        _nextTokenType = HtmlTokenType.Name;

        // Collect name characters; the end of the stream simply ends the name.
        while (IsGoodForName(NextCharacter) && !IsAtEndOfStream)
        {
            _nextToken.Append(NextCharacter);
            GetNextCharacter();
        }
    }
    else
    {
        // Anything else is reported as a one-character Atom.
        _nextTokenType = HtmlTokenType.Atom;
        _nextToken.Append(NextCharacter);
        GetNextCharacter();
    }
}
/// <summary>
/// Always produces an Atom token holding an attribute value, even when no
/// suitable text is present (the token is then empty).
/// Does not guarantee that the reader advances.
/// </summary>
/// <remarks>
/// For an unterminated quoted value, reading continues to the end of the
/// stream. The original authors note that this recovery differs from IE's, and
/// suggest that stopping at an angle bracket and assuming a quote there would
/// be an alternative; that suggestion is not implemented.
/// </remarks>
internal void GetNextAtomToken()
{
    Debug.Assert(this.nextTokenType != HtmlTokenType.EOF, "Unexpected EOF");

    this.nextToken.Length = 0;
    this.SkipWhiteSpace();
    this.nextTokenType = HtmlTokenType.Atom;

    bool startsWithQuote = (this.NextCharacter == '\'' || this.NextCharacter == '"') && !this.IsNextCharacterEntity;

    if (!startsWithQuote)
    {
        // Unquoted value: runs to whitespace, '>' or the end of the stream.
        while (!this.IsAtEndOfStream && !char.IsWhiteSpace(this.NextCharacter) && this.NextCharacter != '>')
        {
            this.nextToken.Append(this.NextCharacter);
            this.GetNextCharacter();
        }

        return;
    }

    char quote = this.NextCharacter;
    this.GetNextCharacter();

    // Collect everything up to the matching quote; a quote that came from an entity does not count.
    while ((this.NextCharacter != quote || this.IsNextCharacterEntity) && !this.IsAtEndOfStream)
    {
        this.nextToken.Append(this.NextCharacter);
        this.GetNextCharacter();
    }

    // Step over the closing quote when there is one.
    if (this.NextCharacter == quote)
    {
        this.GetNextCharacter();
    }
}
/// <summary>
/// Marks the token as a Comment and consumes characters up to and including
/// the first '&gt;' or '\0'.
/// </summary>
private void BogusComment()
{
    _type = HtmlTokenType.Comment;

    for (;;)
    {
        // Every character is consumed; '>' and '\0' additionally end the comment.
        char current = Current();
        Consume();

        if (current == '>' || current == '\0')
        {
            return;
        }
    }
}
/// <summary>
/// Marks the token as a Comment and dispatches on the first character of the
/// comment: '-' continues with <c>CommentStartDash</c>, '&gt;' ends the comment
/// immediately, and anything else continues with <c>Comment</c>.
/// </summary>
private void CommentStart()
{
    _type = HtmlTokenType.Comment;

    char current = Current();

    if (current == '-')
    {
        Consume();
        CommentStartDash();
    }
    else if (current == '>')
    {
        Consume();
    }
    else
    {
        // The character is left unconsumed for Comment to read.
        Comment();
    }
}
/// <summary>
/// Handles the character that follows a '/' inside a tag: '&gt;' marks the tag
/// as self-closing, '\0' re-types the token as a Comment, and anything else
/// is treated as the start of another attribute.
/// </summary>
private void SelfClosingStartTag()
{
    char current = Current();

    if (current == '>')
    {
        _isSelfClosing = true;
        Consume();
    }
    else if (current == '\0')
    {
        _type = HtmlTokenType.Comment;
        Consume();
    }
    else
    {
        // The character is left unconsumed for BeforeAttributeName to read.
        BeforeAttributeName();
    }
}
/// <summary>
/// Discards input up to and including the first "]&gt;" sequence and reports
/// the skipped span as an empty Text token.
/// </summary>
private void ReadDynamicContent()
{
    NextTokenType = HtmlTokenType.Text;
    _nextToken.Length = 0;

    // Advance two characters before scanning.
    GetNextCharacter();
    GetNextCharacter();

    // Stop when the current character is ']' and the look-ahead is '>', or when input ends.
    while ((NextCharacter != ']' || _lookAheadCharacter != '>') && !IsAtEndOfStream)
    {
        GetNextCharacter();
    }

    if (!IsAtEndOfStream)
    {
        // Move onto the '>' and then past it.
        GetNextCharacter();
        GetNextCharacter();
    }
}
/// <summary>
/// Reads plain text: marks the token as Text and consumes characters until a
/// '&lt;' is reached (left unconsumed) or a '\0' has been consumed.
/// </summary>
private void Data()
{
    _type = HtmlTokenType.Text;

    // A '<' ends the text without being consumed.
    for (char current = Current(); current != '<'; current = Current())
    {
        Consume();

        // A '\0' ends the text after being consumed.
        if (current == '\0')
        {
            return;
        }
    }
}
/// <summary>
/// Creates a new HTML tag token with the given name and an empty attribute list.
/// </summary>
/// <param name="type">The type of the tag token.</param>
/// <param name="position">The token's position.</param>
/// <param name="name">The name of the tag.</param>
public HtmlTagToken(HtmlTokenType type, TextPosition position, String name)
    : base(type, position, name)
{
    // Attributes are stored as name/value pairs and start out empty.
    this._attributes = new List<KeyValuePair<String, String>>();
}
/// <summary>
/// Skips a directive that starts with "&lt;!" but is neither a comment nor a
/// "&lt;![" section (for example DOCTYPE), which is not supported here.
/// Everything up to and including the next '&gt;' that is not a character
/// entity is discarded, and the directive is reported as an empty Text token.
/// </summary>
private void ReadUnknownDirective()
{
    // Expected position: on the '!' right after '<', with neither '-' nor '[' following.
    Debug.Assert(_previousCharacter == '<' && _nextCharacter == '!' && _lookAheadCharacter != '-' && _lookAheadCharacter != '[');

    // The directive produces no text of its own.
    _nextToken.Length = 0;
    _nextTokenType = HtmlTokenType.Text;

    // Step over the '!'.
    this.GetNextCharacter();

    // Scan forward until a real (non-entity) '>' or the end of the input.
    while ((_nextCharacter != '>' || IsNextCharacterEntity) && !this.IsAtEndOfStream)
    {
        this.GetNextCharacter();
    }

    if (this.IsAtEndOfStream)
    {
        return;
    }

    // Step past the closing '>'.
    this.GetNextCharacter();
}
/// <summary>
/// Skips content that starts with "&lt;![" (which may include CDATA) and ends
/// with "]&gt;". The skipped content is reported as an empty Text token.
/// </summary>
/// <remarks>
/// The scan stops at the first "]&gt;" rather than "]]&gt;", because some
/// directives open with "&lt;![" and close with a single "]&gt;". As a
/// consequence the skipped content cannot itself contain "]&gt;".
/// </remarks>
private void ReadDynamicContent()
{
    // Expected position: on the '!' right after '<', with '[' following.
    Debug.Assert(_previousCharacter == '<' && _nextCharacter == '!' && _lookAheadCharacter == '[');

    // Nothing inside the section contributes to the token text.
    _nextToken.Length = 0;
    _nextTokenType = HtmlTokenType.Text;

    // Move onto the '[' and then onto the first character after it.
    this.GetNextCharacter();
    this.GetNextCharacter();

    // Scan until the current character is ']' and the look-ahead is '>', or input ends.
    while ((_nextCharacter != ']' || _lookAheadCharacter != '>') && !this.IsAtEndOfStream)
    {
        this.GetNextCharacter();
    }

    if (this.IsAtEndOfStream)
    {
        return;
    }

    // Move onto the '>' and then past it.
    this.GetNextCharacter();
    this.GetNextCharacter();
}
/// <summary>
/// Reads a comment into the current token (type Comment). The body ends at the
/// standard "--&gt;" or at the non-standard "!&gt;", which many real pages use.
/// If the input ends before a terminator is found, the text collected so far
/// is kept and the method returns without advancing past the end of the stream.
/// </summary>
private void ReadComment()
{
    // Expected position: on the '!' right after '<', with '-' following.
    Debug.Assert(_previousCharacter == '<' && _nextCharacter == '!' && _lookAheadCharacter == '-');

    _nextTokenType = HtmlTokenType.Comment;
    _nextToken.Length = 0;

    // Move over the first '-', the second '-', and onto the first body character.
    // Never advance at end of stream (e.g. input that ends right after "<!-").
    for (int skipped = 0; skipped < 3 && !this.IsAtEndOfStream; skipped++)
    {
        this.GetNextCharacter();
    }

    while (true)
    {
        // Copy body text until a possible terminator ("--" or "!>") or the end of input.
        while (!this.IsAtEndOfStream
            && !((_nextCharacter == '-' && _lookAheadCharacter == '-')
                || (_nextCharacter == '!' && _lookAheadCharacter == '>')))
        {
            _nextToken.Append(this.NextCharacter);
            this.GetNextCharacter();
        }

        if (this.IsAtEndOfStream)
        {
            // Unterminated comment: without this exit the loop would keep calling
            // GetNextCharacter at end of stream and never terminate normally.
            return;
        }

        this.GetNextCharacter();

        if (_previousCharacter == '-' && _nextCharacter == '-' && _lookAheadCharacter == '>')
        {
            // Standard terminator "-->": move onto the '>'.
            this.GetNextCharacter();
            break;
        }

        if (_previousCharacter == '!' && _nextCharacter == '>')
        {
            // Non-standard terminator "!>": already on the '>'.
            break;
        }

        // Not a terminator: the character belongs to the comment body.
        _nextToken.Append(_previousCharacter);
    }

    // Step past the closing '>'.
    if (_nextCharacter == '>')
    {
        this.GetNextCharacter();
    }
}
/// <summary>
/// Produces the next token while inside a tag. The result is always one of
/// TagEnd, EmptyTagEnd, Name, Atom or EOF.
/// Does not guarantee that the reader advances.
/// </summary>
internal void GetNextTagToken()
{
    _nextToken.Length = 0;

    if (this.IsAtEndOfStream)
    {
        _nextTokenType = HtmlTokenType.EOF;
        return;
    }

    this.SkipWhiteSpace();

    if (this.NextCharacter == '>' && !this.IsNextCharacterEntity)
    {
        // Only a literal '>' ends a tag; one that came from an entity does not.
        _nextTokenType = HtmlTokenType.TagEnd;
        _nextToken.Append('>');
        this.GetNextCharacter();
        return;
    }

    if (this.NextCharacter == '/' && _lookAheadCharacter == '>')
    {
        // "/>" closes an empty tag.
        _nextTokenType = HtmlTokenType.EmptyTagEnd;
        _nextToken.Append("/>");
        this.GetNextCharacter();
        this.GetNextCharacter();

        // Whitespace after no-scope tags is significant.
        _ignoreNextWhitespace = false;
        return;
    }

    if (!IsGoodForNameStart(this.NextCharacter))
    {
        // Unexpected character inside a tag: report that one character as an Atom,
        // expecting the parser to ignore it.
        _nextTokenType = HtmlTokenType.Atom;
        _nextToken.Append(this.NextCharacter);
        this.GetNextCharacter();
        return;
    }

    // A name. Character entities are allowed here. Reaching the end of the stream
    // simply ends the name; no exception is raised at this point.
    _nextTokenType = HtmlTokenType.Name;
    while (IsGoodForName(this.NextCharacter) && !this.IsAtEndOfStream)
    {
        _nextToken.Append(this.NextCharacter);
        this.GetNextCharacter();
    }
}
/// <summary>
/// Creates a token from a start and end offset instead of a start and length.
/// </summary>
/// <param name="type">The token type.</param>
/// <param name="start">Start offset of the token.</param>
/// <param name="end">End offset; the length passed on is <c>end - start</c>.</param>
/// <returns>A new <see cref="HtmlToken"/>.</returns>
public static HtmlToken FromBounds(HtmlTokenType type, int start, int end)
{
    int length = end - start;
    return new HtmlToken(type, start, length);
}
/// <summary>
/// Always produces an Atom token holding an attribute value, even when no
/// suitable text is present (the token is then empty).
/// Does not guarantee that the reader advances.
/// </summary>
/// <remarks>
/// For an unterminated quoted value, reading continues to the end of the
/// stream. The original authors note that IE behaves similarly but then treats
/// the value as text, and that stopping at an angle bracket and assuming a
/// quote there would be an alternative; that alternative is not implemented.
/// </remarks>
internal void GetNextAtomToken()
{
    Debug.Assert(_nextTokenType != HtmlTokenType.EOF);

    _nextToken.Length = 0;
    this.SkipWhiteSpace();
    _nextTokenType = HtmlTokenType.Atom;

    bool startsWithQuote = (this.NextCharacter == '\'' || this.NextCharacter == '"') && !this.IsNextCharacterEntity;

    if (!startsWithQuote)
    {
        // Unquoted value: runs to whitespace, '>' or the end of the stream.
        while (!this.IsAtEndOfStream && !Char.IsWhiteSpace(this.NextCharacter) && this.NextCharacter != '>')
        {
            _nextToken.Append(this.NextCharacter);
            this.GetNextCharacter();
        }

        return;
    }

    char quote = this.NextCharacter;
    this.GetNextCharacter();

    // Collect everything up to the matching quote; a quote that came from an entity does not count.
    while ((this.NextCharacter != quote || this.IsNextCharacterEntity) && !this.IsAtEndOfStream)
    {
        _nextToken.Append(this.NextCharacter);
        this.GetNextCharacter();
    }

    // Step over the closing quote when there is one.
    if (this.NextCharacter == quote)
    {
        this.GetNextCharacter();
    }
}
/// <summary>
/// Creates a token of the given type by delegating to a three-argument
/// constructor, passing 0 for both remaining arguments.
/// </summary>
/// <param name="type">The token type.</param>
public HtmlToken(HtmlTokenType type) : this(type, 0, 0) { }
/// <summary>
/// Creates a token and stores its type, position and name.
/// </summary>
/// <param name="type">The token type.</param>
/// <param name="position">The token's position.</param>
/// <param name="name">The token's name.</param>
public HtmlToken(HtmlTokenType type, TextPosition position, String name)
{
    this._name = name;
    this._position = position;
    this._type = type;
}
/// <summary>
/// Creates a token of the given type at the given position by delegating to a
/// three-argument constructor, passing <c>null</c> as the third argument.
/// </summary>
/// <param name="type">The token type.</param>
/// <param name="position">The token's position.</param>
public HtmlToken(HtmlTokenType type, TextPosition position) : this(type, position, null) { }
/// <summary>
/// Creates a token by delegating to a four-argument constructor, passing
/// <c>true</c> as the final argument.
/// </summary>
/// <param name="type">The token type.</param>
/// <param name="start">Start offset of the token.</param>
/// <param name="length">Length of the token.</param>
public HtmlToken(HtmlTokenType type, int start, int length) : this(type, start, length, true) { }
/// <summary>
/// Creates a token, forwarding <paramref name="start"/> and
/// <paramref name="length"/> to the base constructor and storing the token
/// type and the well-formed flag.
/// </summary>
/// <param name="type">The token type.</param>
/// <param name="start">Start offset of the token.</param>
/// <param name="length">Length of the token.</param>
/// <param name="wellFormed">Value stored as the token's well-formed flag.</param>
public HtmlToken(HtmlTokenType type, int start, int length, bool wellFormed)
    : base(start, length)
{
    this._isWellFormed = wellFormed;
    this._tokenType = type;
}
/// <summary>
/// Creates a token from a start and end offset instead of a start and length.
/// </summary>
/// <param name="type">The token type.</param>
/// <param name="start">Start offset of the token.</param>
/// <param name="end">End offset; the length passed on is <c>end - start</c>.</param>
/// <param name="wellFormed">Passed through to the token constructor.</param>
/// <returns>A new <see cref="HtmlToken"/>.</returns>
public static HtmlToken FromBounds(HtmlTokenType type, int start, int end, bool wellFormed)
{
    int length = end - start;
    return new HtmlToken(type, start, length, wellFormed);
}
/// <summary>
/// Creates a tag token whose name is the empty string.
/// </summary>
/// <param name="type">The type of the tag token.</param>
/// <param name="position">The token's position.</param>
public HtmlTagToken(HtmlTokenType type, TextPosition position)
    : this(type, position, string.Empty)
{
}
/// <summary>
/// Returns the result of calling <c>ToString()</c> on <paramref name="t"/>.
/// </summary>
string TokenTypeAsString(HtmlTokenType t)
{
    return t.ToString();
}
/// <summary>
/// Reads the next token outside of a tag and records its text and type in the
/// analyzer's token state. The type is EOF at end of stream, OpeningTagStart
/// or ClosingTagStart at a tag start, whatever the directive readers set at a
/// directive start, and Text otherwise.
/// </summary>
internal void GetNextContentToken()
{
    Debug.Assert(_nextTokenType != HtmlTokenType.EOF);

    _nextToken.Length = 0;

    if (this.IsAtEndOfStream)
    {
        _nextTokenType = HtmlTokenType.EOF;
        return;
    }

    if (this.IsAtTagStart)
    {
        this.GetNextCharacter();

        if (this.NextCharacter != '/')
        {
            _nextTokenType = HtmlTokenType.OpeningTagStart;
            _nextToken.Append("<");

            // Whitespace after an opening tag is insignificant.
            _ignoreNextWhitespace = true;
            return;
        }

        _nextToken.Append("</");
        _nextTokenType = HtmlTokenType.ClosingTagStart;

        // Step over the '/'.
        this.GetNextCharacter();

        // Whitespace after a closing tag is significant.
        _ignoreNextWhitespace = false;
        return;
    }

    if (this.IsAtDirectiveStart)
    {
        this.GetNextCharacter();

        switch (_lookAheadCharacter)
        {
            case '[':
                // "<![" section, e.g. CDATA.
                this.ReadDynamicContent();
                break;

            case '-':
                this.ReadComment();
                break;

            default:
                // Neither a comment nor CDATA (something like DOCTYPE): skip to the next tag end.
                this.ReadUnknownDirective();
                break;
        }

        return;
    }

    // Plain text: read until a tag start, a directive start or the end of the stream.
    _nextTokenType = HtmlTokenType.Text;

    while (!(this.IsAtTagStart || this.IsAtEndOfStream || this.IsAtDirectiveStart))
    {
        if (this.NextCharacter == '<' && !this.IsNextCharacterEntity && _lookAheadCharacter == '?')
        {
            // "<?" processing directive: ignored.
            this.SkipProcessingDirective();
            continue;
        }

        if (this.NextCharacter > ' ')
        {
            _nextToken.Append(this.NextCharacter);
            _ignoreNextWhitespace = false;
        }
        else
        {
            // A run of whitespace/control characters collapses to a single space,
            // and is dropped entirely while whitespace is being ignored.
            if (!_ignoreNextWhitespace)
            {
                _nextToken.Append(' ');
            }

            _ignoreNextWhitespace = true;
        }

        this.GetNextCharacter();
    }
}
/// <summary>
/// Always produces an EqualSign token with the text "=", whether or not the
/// stream really contains an equal sign at this point.
/// Does not guarantee that the reader advances.
/// </summary>
internal void GetNextEqualSignToken()
{
    Debug.Assert(_nextTokenType != HtmlTokenType.EOF);

    _nextTokenType = HtmlTokenType.EqualSign;
    _nextToken.Length = 0;
    _nextToken.Append('=');

    this.SkipWhiteSpace();

    if (this.NextCharacter != '=')
    {
        return;
    }

    // '=' is not in the list of entities, so no entity check is needed before consuming it.
    this.GetNextCharacter();
}