/// <summary>
/// Looks ahead in the character stream and classifies the token that follows
/// the current position (after any leading whitespace).
/// </summary>
/// <param name="cs">Character stream to inspect.</param>
/// <param name="tagEnd">Stream position at which scanning must stop.</param>
/// <param name="range">
/// Receives the range from the entry position (leading whitespace included)
/// to the position where scanning stopped. Empty range when the result is
/// <see cref="NextTokenType.None"/>.
/// </param>
/// <returns>
/// <see cref="NextTokenType.None"/> if the stream is already at its end or at
/// <paramref name="tagEnd"/>; <see cref="NextTokenType.Tag"/> or
/// <see cref="NextTokenType.Equals"/> for a tag delimiter or '=';
/// <see cref="NextTokenType.Number"/>, <see cref="NextTokenType.Letters"/> or
/// <see cref="NextTokenType.Identifier"/> for a character run;
/// <see cref="NextTokenType.Unknown"/> otherwise.
/// </returns>
/// <remarks>
/// The stream position is restored before returning, except when only
/// whitespace precedes the end of stream or <paramref name="tagEnd"/>: on that
/// path the stream is left positioned after the skipped whitespace.
/// </remarks>
public static NextTokenType PeekNextToken(HtmlCharStream cs, int tagEnd, out ITextRange range) {
    int entryPosition = cs.Position;

    if (cs.IsEndOfStream() || entryPosition == tagEnd) {
        range = new TextRange();
        return NextTokenType.None;
    }

    while (cs.IsWhiteSpace()) {
        cs.MoveToNextChar();
    }

    if (cs.IsEndOfStream() || cs.Position == tagEnd) {
        // Nothing but whitespace was found. Note that the position is
        // deliberately left where the whitespace scan stopped.
        range = TextRange.FromBounds(entryPosition, cs.Position);
        return NextTokenType.Unknown;
    }

    NextTokenType result;

    if (cs.IsAtTagDelimiter()) {
        result = NextTokenType.Tag;
    } else if (cs.CurrentChar == '=') {
        result = NextTokenType.Equals;
    } else {
        // Remember whether the run opens with a letter; this decides between
        // Identifier and Unknown for runs of mixed content.
        bool startsWithLetter = cs.IsAnsiLetter();

        int runLength = 0;
        int nameLikeCount = 0;
        int numberLikeCount = 0;

        while (!(cs.IsEndOfStream()
                 || cs.IsWhiteSpace()
                 || cs.IsAtTagDelimiter()
                 || cs.CurrentChar == '='
                 || cs.Position >= tagEnd)) {
            if (cs.IsAnsiLetter() || cs.CurrentChar == '_' || cs.CurrentChar == '-') {
                nameLikeCount++;
            } else if (cs.IsDecimal() || cs.CurrentChar == '.') {
                numberLikeCount++;
            }

            cs.MoveToNextChar();
            runLength++;
        }

        if (runLength == 0) {
            result = NextTokenType.Unknown;
        } else if (runLength == numberLikeCount) {
            result = NextTokenType.Number;
        } else if (runLength == nameLikeCount) {
            result = NextTokenType.Letters;
        } else if (startsWithLetter) {
            result = NextTokenType.Identifier;
        } else {
            result = NextTokenType.Unknown;
        }
    }

    range = TextRange.FromBounds(entryPosition, cs.Position);

    // This is a peek: rewind to where the caller left the stream.
    cs.Position = entryPosition;
    return result;
}
// Checks that a plain, unprefixed name ("foo") produces a token whose overall
// range and name range both cover 0..3 and whose prefix range is empty at 0.
public void HtmlTokenizer_GetNameToken_BasicTest() {
    HtmlCharStream stream = new HtmlCharStream("foo");
    var tokenizer = new HtmlTokenizer(stream);

    NameToken token = tokenizer.GetNameToken();

    // Whole token
    Assert.Equal(3, token.Length);
    Assert.Equal(0, token.Start);
    Assert.Equal(3, token.End);

    // Name part
    Assert.Equal(3, token.NameRange.Length);
    Assert.Equal(0, token.NameRange.Start);
    Assert.Equal(3, token.NameRange.End);

    // Prefix part
    Assert.Equal(0, token.PrefixRange.Start);
    Assert.Equal(0, token.PrefixRange.End);
}
// Exercises basic HtmlCharStream behavior: relative movement with Advance,
// end-of-stream detection, LookAhead, DistanceFromEnd, and the string /
// whitespace / tag-delimiter classification helpers.
public void CharStream_BasicTest() {
    string text = "abcd\"foo\"\r\n<a href=";
    int textLength = text.Length;
    var stream = new HtmlCharStream(text);

    // Moving forward and backward inside the text.
    Assert.Equal('a', stream.CurrentChar);

    stream.Advance(2);
    Assert.False(stream.IsEndOfStream());
    Assert.Equal('c', stream.CurrentChar);

    stream.Advance(-1);
    Assert.False(stream.IsEndOfStream());
    Assert.Equal('b', stream.CurrentChar);

    // Advancing past the end lands on end-of-stream with a zero character.
    stream.Advance(textLength);
    Assert.True(stream.IsEndOfStream());
    Assert.Equal(0, stream.CurrentChar);

    // Moving back by the full length returns to the first character.
    stream.Advance(-textLength);
    Assert.False(stream.IsEndOfStream());
    Assert.Equal('a', stream.CurrentChar);

    // LookAhead yields zero for offsets outside the text.
    Assert.Equal('d', stream.LookAhead(3));
    Assert.Equal('\"', stream.LookAhead(4));
    Assert.Equal(0, stream.LookAhead(textLength));
    Assert.Equal(0, stream.LookAhead(-1));

    Assert.Equal(textLength, stream.DistanceFromEnd);
    stream.Advance(1);
    Assert.Equal(textLength - 1, stream.DistanceFromEnd);

    // Position 4 is the opening quote, position 5 is the 'f' after it.
    stream.Position = 4;
    Assert.True(stream.IsAtString());
    stream.Position = 5;
    Assert.False(stream.IsAtString());

    // Positions 9 and 10 are '\r' and '\n'; position 11 is '<'.
    stream.Position = 9;
    Assert.True(stream.IsWhiteSpace());
    stream.MoveToNextChar();
    Assert.True(stream.IsWhiteSpace());
    stream.MoveToNextChar();
    Assert.True(stream.IsAtTagDelimiter());
}
// Checks tokenization of "foo:" - a prefix followed by a colon with no name
// after it. The token spans all four characters, the prefix covers 0..3, the
// name range is empty, and the token is not reported as a qualified name.
public void HtmlTokenizer_GetNameToken_MissingNameTest() {
    HtmlCharStream stream = new HtmlCharStream("foo:");
    var tokenizer = new HtmlTokenizer(stream);

    NameToken token = tokenizer.GetNameToken();

    // Whole token
    Assert.Equal(4, token.Length);
    Assert.Equal(0, token.Start);
    Assert.Equal(4, token.End);

    // Prefix is present
    Assert.True(token.HasPrefix());
    Assert.Equal(0, token.PrefixRange.Start);
    Assert.Equal(3, token.PrefixRange.End);

    // Name is absent
    Assert.False(token.HasName());
    Assert.Equal(0, token.NameRange.Length);

    // Qualified name range still spans the full token
    Assert.False(token.HasQualifiedName());
    Assert.Equal(0, token.QualifiedName.Start);
    Assert.Equal(4, token.QualifiedName.End);
}
// Exercises HtmlCharStream over a TextStream: empty-stream state, absolute
// and relative positioning, and the tag-delimiter / name-character checks on
// the text "<h123".
public void HtmlCharStream_IsNameCharTest() {
    // An empty stream is immediately at its end.
    var emptyStream = new HtmlCharStream(new TextStream(""));
    Assert.True(emptyStream.IsEndOfStream());
    Assert.Equal(0, emptyStream.Length);

    var cs = new HtmlCharStream(new TextStream("<h123"));
    Assert.Equal(0, cs.Position);
    Assert.False(cs.IsEndOfStream());

    // Absolute positioning
    cs.Position = 5;
    Assert.True(cs.IsEndOfStream());
    cs.Position = 0;
    Assert.False(cs.IsEndOfStream());

    // Relative movement
    cs.MoveToNextChar();
    Assert.Equal(1, cs.Position);
    cs.Advance(2);
    Assert.Equal(3, cs.Position);
    cs.Advance(-2);
    Assert.Equal(1, cs.Position);
    cs.Advance(1000);
    Assert.True(cs.IsEndOfStream());

    // '<' at position 0 is a tag delimiter.
    cs.Position = 0;
    Assert.True(cs.IsAtTagDelimiter());
    Assert.Equal('<', cs.CurrentChar);
    Assert.Equal('h', cs.NextChar);

    // 'h' at position 1 is both a name character and a name start character.
    cs.Position = 1;
    Assert.False(cs.IsAtTagDelimiter());
    Assert.True(cs.IsNameChar());
    Assert.True(HtmlCharStream.IsNameStartChar(cs.CurrentChar));

    // '1' at position 2 is a name character but cannot start a name.
    cs.Position = 2;
    Assert.False(cs.IsAtTagDelimiter());
    Assert.True(cs.IsNameChar());
    Assert.False(HtmlCharStream.IsNameStartChar(cs.CurrentChar));
}
/// <summary>
/// Creates a tokenizer that operates on the supplied character stream.
/// </summary>
/// <param name="cs">
/// Character stream to tokenize. The same instance is also passed to the
/// string closure helper created here.
/// </param>
public HtmlTokenizer(HtmlCharStream cs) {
    _stringClosure = new StringClosure(cs);
    _cs = cs;
}
/// <summary>
/// Builds a <see cref="StringClosure"/> over a new character stream that
/// wraps the given text.
/// </summary>
/// <param name="text">Text to wrap in an <see cref="HtmlCharStream"/>.</param>
/// <returns>The newly created closure.</returns>
private StringClosure CreateStringClosure(string text) {
    return new StringClosure(new HtmlCharStream(text));
}
// Verifies that SkipWhitespace advances the stream over spaces, tabs and line
// breaks, and leaves the position unchanged when already on a non-whitespace
// character.
//
// Input layout (offsets): 0-2 spaces, 3-5 "abc", 6-7 tabs, 8-10 "def",
// 11-12 "\r\n", 13 space, 14-15 "gh". The three leading spaces matter: the
// expected positions 3, 8 and 14 below are only consistent with this layout.
public void HtmlTokenizer_SkipWhitespaceTest() {
    var cs = new HtmlCharStream("   abc\t\tdef\r\n gh");
    HtmlTokenizer target = new HtmlTokenizer(cs);

    // Leading spaces are skipped up to 'a'.
    target.SkipWhitespace();
    Assert.Equal(3, cs.Position);

    // A second call on a non-whitespace character does not move.
    target.SkipWhitespace();
    Assert.Equal(3, cs.Position);

    // Step over "abc", then skip the two tabs up to 'd'.
    cs.Advance(3);
    target.SkipWhitespace();
    Assert.Equal(8, cs.Position);

    // Step over "def", then skip CR, LF and the space up to 'g'.
    cs.Advance(3);
    target.SkipWhitespace();
    Assert.Equal(14, cs.Position);
}
/// <summary>
/// Creates a string closure bound to the given character stream.
/// </summary>
/// <param name="cs">Character stream stored for later use by this instance.</param>
public StringClosure(HtmlCharStream cs) {
    _cs = cs;
}
/// <summary>
/// Parse text from a text provider within a given range.
/// Raises <c>ParsingStarting</c> before and <c>ParsingComplete</c> after the
/// parse, and records timing statistics when <c>Stats.Enabled</c> is set.
/// </summary>
/// <param name="textProvider">Text provider</param>
/// <param name="range">Range to parse</param>
public void Parse(ITextProvider textProvider, ITextRange range) {
    DateTime? timeStart = null;
    if (Stats.Enabled) {
        timeStart = DateTime.UtcNow;
    }

    // Copy the delegate to a local so a concurrent unsubscribe between the
    // null check and the call cannot cause a NullReferenceException.
    var parsingStarting = ParsingStarting;
    if (parsingStarting != null) {
        parsingStarting(this, new HtmlParserRangeEventArgs(range));
    }

    DocType = DocType.Undefined;

    _cs = new HtmlCharStream(textProvider, range);
    _tokenizer = new HtmlTokenizer(_cs);
    _softRangeEnd = range.End;

    OnTextState();

    var parsingComplete = ParsingComplete;
    if (parsingComplete != null) {
        parsingComplete(this, new HtmlParserRangeEventArgs(range));
    }

    // timeStart is only set when stats were enabled at entry; checking it
    // here avoids reading an empty nullable if stats were switched on while
    // the parse was running.
    if (Stats.Enabled && timeStart.HasValue) {
        Stats.ParseTime = DateTime.UtcNow - timeStart.Value;

        double elapsedMs = Stats.ParseTime.TotalMilliseconds;
        if (elapsedMs > 0) {
            // Rounded characters-per-second, clamped so the cast to int
            // cannot overflow.
            double charsPerSecond = 1000.0 * _cs.Length / elapsedMs + 0.5;
            Stats.CharactersPerSecond = (int)Math.Min(charsPerSecond, int.MaxValue);
        } else {
            // The parse finished within clock resolution; a rate cannot be
            // computed without dividing by zero.
            Stats.CharactersPerSecond = 0;
        }
    }
}