Пример #1
0
 /// <summary>
 /// Resets the parser bookkeeping fields and then walks <c>Text</c> one character
 /// at a time, dispatching on the current parse state. After the last character
 /// the pending node is finished and <c>Lastnodes</c> is emptied.
 /// </summary>
 private void Parse()
 {
   // Quote character (' or ") recorded when a quoted attribute value is opened;
   // the QuotedAttributeValue state ends when the same character is seen again.
   int openingQuote = 0;

   if (this.OptionComputeChecksum)
   {
     this._crc32 = new Crc32();
   }

   this.Lastnodes = new Dictionary<string, HtmlNode>();
   this._c = 0;
   this._fullcomment = false;
   this._parseerrors = new List<HtmlParseError>();
   this._line = 1;
   this._lineposition = 1;
   this._maxlineposition = 1;

   this._state = HtmlDocument.ParseState.Text;
   this._oldstate = this._state;
   this._documentnode._innerlength = this.Text.Length;
   this._documentnode._outerlength = this.Text.Length;
   this._remainderOffset = this.Text.Length;

   this._lastparentnode = this._documentnode;
   this._currentnode = this.CreateNode(HtmlNodeType.Text, 0);
   this._currentattribute = null;

   this._index = 0;
   this.PushNodeStart(HtmlNodeType.Text, 0);

   while (this._index < this.Text.Length)
   {
     this._c = this.Text[this._index];
     this.IncrementPosition();

     // Every "break" below only leaves the switch; the loop condition is then
     // re-evaluated, so setting _index to Text.Length is what stops parsing.
     switch (this._state)
     {
       case HtmlDocument.ParseState.Text:
         // The result is irrelevant here: either way this character is done.
         this.NewCheck();
         break;

       case HtmlDocument.ParseState.WhichTag:
         if (this.NewCheck())
           break;

         if (this._c == '/')
         {
           this.PushNodeNameStart(false, this._index);
         }
         else
         {
           this.PushNodeNameStart(true, this._index - 1);
           this.DecrementPosition();
         }
         this._state = HtmlDocument.ParseState.Tag;
         break;

       case HtmlDocument.ParseState.Tag:
         if (this.NewCheck())
           break;

         if (HtmlDocument.IsWhiteSpace(this._c))
         {
           this.PushNodeNameEnd(this._index - 1);
           // Only advance if PushNodeNameEnd left the state untouched.
           if (this._state == HtmlDocument.ParseState.Tag)
             this._state = HtmlDocument.ParseState.BetweenAttributes;
         }
         else if (this._c == '/')
         {
           this.PushNodeNameEnd(this._index - 1);
           if (this._state == HtmlDocument.ParseState.Tag)
             this._state = HtmlDocument.ParseState.EmptyTag;
         }
         else if (this._c == '>')
         {
           this.PushNodeNameEnd(this._index - 1);
           if (this._state != HtmlDocument.ParseState.Tag)
             break;

           if (!this.PushNodeEnd(this._index, false))
           {
             // stop parsing
             this._index = this.Text.Length;
             break;
           }

           if (this._state == HtmlDocument.ParseState.Tag)
           {
             this._state = HtmlDocument.ParseState.Text;
             this.PushNodeStart(HtmlNodeType.Text, this._index);
           }
         }
         break;

       case HtmlDocument.ParseState.BetweenAttributes:
         if (this.NewCheck() || HtmlDocument.IsWhiteSpace(this._c))
           break;

         if (this._c == '/' || this._c == '?')
         {
           this._state = HtmlDocument.ParseState.EmptyTag;
         }
         else if (this._c == '>')
         {
           if (!this.PushNodeEnd(this._index, false))
           {
             // stop parsing
             this._index = this.Text.Length;
           }
           else if (this._state == HtmlDocument.ParseState.BetweenAttributes)
           {
             this._state = HtmlDocument.ParseState.Text;
             this.PushNodeStart(HtmlNodeType.Text, this._index);
           }
         }
         else
         {
           this.PushAttributeNameStart(this._index - 1);
           this._state = HtmlDocument.ParseState.AttributeName;
         }
         break;

       case HtmlDocument.ParseState.EmptyTag:
         if (this.NewCheck())
           break;

         if (this._c != '>')
         {
           this._state = HtmlDocument.ParseState.BetweenAttributes;
           break;
         }

         if (!this.PushNodeEnd(this._index, true))
         {
           // stop parsing
           this._index = this.Text.Length;
         }
         else if (this._state == HtmlDocument.ParseState.EmptyTag)
         {
           this._state = HtmlDocument.ParseState.Text;
           this.PushNodeStart(HtmlNodeType.Text, this._index);
         }
         break;

       case HtmlDocument.ParseState.AttributeName:
         if (this.NewCheck())
           break;

         if (HtmlDocument.IsWhiteSpace(this._c))
         {
           this.PushAttributeNameEnd(this._index - 1);
           this._state = HtmlDocument.ParseState.AttributeBeforeEquals;
         }
         else if (this._c == '=')
         {
           this.PushAttributeNameEnd(this._index - 1);
           this._state = HtmlDocument.ParseState.AttributeAfterEquals;
         }
         else if (this._c == '>')
         {
           this.PushAttributeNameEnd(this._index - 1);
           if (!this.PushNodeEnd(this._index, false))
           {
             // stop parsing
             this._index = this.Text.Length;
           }
           else if (this._state == HtmlDocument.ParseState.AttributeName)
           {
             this._state = HtmlDocument.ParseState.Text;
             this.PushNodeStart(HtmlNodeType.Text, this._index);
           }
         }
         break;

       case HtmlDocument.ParseState.AttributeBeforeEquals:
         if (this.NewCheck() || HtmlDocument.IsWhiteSpace(this._c))
           break;

         if (this._c == '>')
         {
           if (!this.PushNodeEnd(this._index, false))
           {
             // stop parsing
             this._index = this.Text.Length;
           }
           else if (this._state == HtmlDocument.ParseState.AttributeBeforeEquals)
           {
             this._state = HtmlDocument.ParseState.Text;
             this.PushNodeStart(HtmlNodeType.Text, this._index);
           }
         }
         else if (this._c == '=')
         {
           this._state = HtmlDocument.ParseState.AttributeAfterEquals;
         }
         else
         {
           // Neither '=' nor whitespace: hand this character back so the
           // BetweenAttributes state processes it.
           this._state = HtmlDocument.ParseState.BetweenAttributes;
           this.DecrementPosition();
         }
         break;

       case HtmlDocument.ParseState.AttributeAfterEquals:
         if (this.NewCheck() || HtmlDocument.IsWhiteSpace(this._c))
           break;

         if (this._c == '\'' || this._c == '"')
         {
           this._state = HtmlDocument.ParseState.QuotedAttributeValue;
           this.PushAttributeValueStart(this._index, this._c);
           openingQuote = this._c;
         }
         else if (this._c == '>')
         {
           if (!this.PushNodeEnd(this._index, false))
           {
             // stop parsing
             this._index = this.Text.Length;
           }
           else if (this._state == HtmlDocument.ParseState.AttributeAfterEquals)
           {
             this._state = HtmlDocument.ParseState.Text;
             this.PushNodeStart(HtmlNodeType.Text, this._index);
           }
         }
         else
         {
           this.PushAttributeValueStart(this._index - 1);
           this._state = HtmlDocument.ParseState.AttributeValue;
         }
         break;

       case HtmlDocument.ParseState.AttributeValue:
         if (this.NewCheck())
           break;

         if (HtmlDocument.IsWhiteSpace(this._c))
         {
           this.PushAttributeValueEnd(this._index - 1);
           this._state = HtmlDocument.ParseState.BetweenAttributes;
         }
         else if (this._c == '>')
         {
           this.PushAttributeValueEnd(this._index - 1);
           if (!this.PushNodeEnd(this._index, false))
           {
             // stop parsing
             this._index = this.Text.Length;
           }
           else if (this._state == HtmlDocument.ParseState.AttributeValue)
           {
             this._state = HtmlDocument.ParseState.Text;
             this.PushNodeStart(HtmlNodeType.Text, this._index);
           }
         }
         break;

       case HtmlDocument.ParseState.Comment:
         if (this._c != '>')
           break;

         // When _fullcomment is set, '>' only ends the comment if the two
         // characters right before it are both '-'.
         if (this._fullcomment &&
             (this.Text[this._index - 2] != '-' || this.Text[this._index - 3] != '-'))
           break;

         if (!this.PushNodeEnd(this._index, false))
         {
           // stop parsing
           this._index = this.Text.Length;
           break;
         }
         this._state = HtmlDocument.ParseState.Text;
         this.PushNodeStart(HtmlNodeType.Text, this._index);
         break;

       case HtmlDocument.ParseState.QuotedAttributeValue:
         if (this._c == openingQuote)
         {
           this.PushAttributeValueEnd(this._index - 1);
           this._state = HtmlDocument.ParseState.BetweenAttributes;
         }
         else if (this._c == '<' && this._index < this.Text.Length && this.Text[this._index] == '%')
         {
           // "<%" inside a quoted value: remember where to come back to.
           this._oldstate = this._state;
           this._state = HtmlDocument.ParseState.ServerSideCode;
         }
         break;

       case HtmlDocument.ParseState.ServerSideCode:
         // Stay in this state until the two-character sequence "%>" is found.
         if (this._c != '%' || this._index >= this.Text.Length || this.Text[this._index] != '>')
           break;

         if (this._oldstate == HtmlDocument.ParseState.BetweenAttributes)
         {
           this.PushAttributeNameEnd(this._index + 1);
           this._state = HtmlDocument.ParseState.BetweenAttributes;
         }
         else if (this._oldstate == HtmlDocument.ParseState.AttributeAfterEquals)
         {
           this._state = HtmlDocument.ParseState.AttributeValue;
         }
         else
         {
           this._state = this._oldstate;
         }
         this.IncrementPosition();
         break;

       case HtmlDocument.ParseState.PcData:
         // Look for "</" + current node name followed by one more character.
         // First make sure that many characters are left in the buffer.
         if (this._currentnode._namelength + 3 > this.Text.Length - (this._index - 1))
           break;

         if (string.Compare(this.Text.Substring(this._index - 1, this._currentnode._namelength + 2),
                            "</" + this._currentnode.Name,
                            StringComparison.OrdinalIgnoreCase) != 0)
           break;

         int following = this.Text[this._index - 1 + 2 + this._currentnode.Name.Length];
         if (following != '>' && !HtmlDocument.IsWhiteSpace(following))
           break;

         // Everything between the end of the current node and this closing
         // tag becomes a text child of the current node.
         HtmlNode content = this.CreateNode(HtmlNodeType.Text, this._currentnode._outerstartindex + this._currentnode._outerlength);
         content._outerlength = this._index - 1 - content._outerstartindex;
         this._currentnode.AppendChild(content);

         this.PushNodeStart(HtmlNodeType.Element, this._index - 1);
         this.PushNodeNameStart(false, this._index - 1 + 2);
         this._state = HtmlDocument.ParseState.Tag;
         this.IncrementPosition();
         break;

       default:
         break;
     }
   }

   // Finish whatever node was still open when the input ran out.
   if (this._currentnode._namestartindex > 0)
   {
     this.PushNodeNameEnd(this._index);
   }
   this.PushNodeEnd(this._index, false);

   this.Lastnodes.Clear();
 }
Пример #2
0
        /// <summary>
        /// Resets the parser bookkeeping fields and then walks <c>Text</c> one character
        /// at a time, dispatching on <c>_state</c>. When the input is exhausted the
        /// pending node is finished and <c>Lastnodes</c> is cleared.
        /// </summary>
        private void Parse()
        {
            // Quote character (' or ") recorded when a quoted attribute value is opened.
            int lastquote = 0;
            if (OptionComputeChecksum)
            {
                _crc32 = new Crc32();
            }

            Lastnodes = new Dictionary<string, HtmlNode>();
            _c = 0;
            _fullcomment = false;
            _parseerrors = new List<HtmlParseError>();
            _line = 1;
            _lineposition = 1;
            _maxlineposition = 1;

            _state = ParseState.Text;
            _oldstate = _state;
            _documentnode._innerlength = Text.Length;
            _documentnode._outerlength = Text.Length;
            _remainderOffset = Text.Length;

            _lastparentnode = _documentnode;
            _currentnode = CreateNode(HtmlNodeType.Text, 0);
            _currentattribute = null;

            _index = 0;
            PushNodeStart(HtmlNodeType.Text, 0);
            while (_index < Text.Length)
            {
                _c = Text[_index];
                IncrementPosition();

                // Inside this switch both "break" and "continue" end the handling of
                // the current character; the loop condition is evaluated next.
                switch (_state)
                {
                    case ParseState.Text:
                        if (NewCheck())
                            continue;
                        break;

                    case ParseState.WhichTag:
                        if (NewCheck())
                            continue;
                        if (_c == '/')
                        {
                            PushNodeNameStart(false, _index);
                        }
                        else
                        {
                            PushNodeNameStart(true, _index - 1);
                            DecrementPosition();
                        }
                        _state = ParseState.Tag;
                        break;

                    case ParseState.Tag:
                        if (NewCheck())
                            continue;
                        if (IsWhiteSpace(_c))
                        {
                            PushNodeNameEnd(_index - 1);
                            if (_state != ParseState.Tag)
                                continue;
                            _state = ParseState.BetweenAttributes;
                            continue;
                        }
                        if (_c == '/')
                        {
                            PushNodeNameEnd(_index - 1);
                            if (_state != ParseState.Tag)
                                continue;
                            _state = ParseState.EmptyTag;
                            continue;
                        }
                        if (_c == '>')
                        {
                            PushNodeNameEnd(_index - 1);
                            if (_state != ParseState.Tag)
                                continue;
                            EndNodeAndResumeText(ParseState.Tag, false);
                        }
                        break;

                    case ParseState.BetweenAttributes:
                        if (NewCheck())
                            continue;

                        if (IsWhiteSpace(_c))
                            continue;

                        if ((_c == '/') || (_c == '?'))
                        {
                            _state = ParseState.EmptyTag;
                            continue;
                        }

                        if (_c == '>')
                        {
                            EndNodeAndResumeText(ParseState.BetweenAttributes, false);
                            continue;
                        }

                        PushAttributeNameStart(_index - 1);
                        _state = ParseState.AttributeName;
                        break;

                    case ParseState.EmptyTag:
                        if (NewCheck())
                            continue;

                        if (_c == '>')
                        {
                            EndNodeAndResumeText(ParseState.EmptyTag, true);
                            continue;
                        }
                        _state = ParseState.BetweenAttributes;
                        break;

                    case ParseState.AttributeName:
                        if (NewCheck())
                            continue;

                        if (IsWhiteSpace(_c))
                        {
                            PushAttributeNameEnd(_index - 1);
                            _state = ParseState.AttributeBeforeEquals;
                            continue;
                        }
                        if (_c == '=')
                        {
                            PushAttributeNameEnd(_index - 1);
                            _state = ParseState.AttributeAfterEquals;
                            continue;
                        }
                        if (_c == '>')
                        {
                            PushAttributeNameEnd(_index - 1);
                            EndNodeAndResumeText(ParseState.AttributeName, false);
                            continue;
                        }
                        break;

                    case ParseState.AttributeBeforeEquals:
                        if (NewCheck())
                            continue;

                        if (IsWhiteSpace(_c))
                            continue;
                        if (_c == '>')
                        {
                            EndNodeAndResumeText(ParseState.AttributeBeforeEquals, false);
                            continue;
                        }
                        if (_c == '=')
                        {
                            _state = ParseState.AttributeAfterEquals;
                            continue;
                        }
                        // no equals, no whitespace, it's a new attribute starting
                        _state = ParseState.BetweenAttributes;
                        DecrementPosition();
                        break;

                    case ParseState.AttributeAfterEquals:
                        if (NewCheck())
                            continue;

                        if (IsWhiteSpace(_c))
                            continue;

                        if ((_c == '\'') || (_c == '"'))
                        {
                            _state = ParseState.QuotedAttributeValue;
                            PushAttributeValueStart(_index, _c);
                            lastquote = _c;
                            continue;
                        }
                        if (_c == '>')
                        {
                            EndNodeAndResumeText(ParseState.AttributeAfterEquals, false);
                            continue;
                        }
                        PushAttributeValueStart(_index - 1);
                        _state = ParseState.AttributeValue;
                        break;

                    case ParseState.AttributeValue:
                        if (NewCheck())
                            continue;

                        if (IsWhiteSpace(_c))
                        {
                            PushAttributeValueEnd(_index - 1);
                            _state = ParseState.BetweenAttributes;
                            continue;
                        }

                        if (_c == '>')
                        {
                            PushAttributeValueEnd(_index - 1);
                            EndNodeAndResumeText(ParseState.AttributeValue, false);
                            continue;
                        }
                        break;

                    case ParseState.QuotedAttributeValue:
                        if (_c == lastquote)
                        {
                            PushAttributeValueEnd(_index - 1);
                            _state = ParseState.BetweenAttributes;
                            continue;
                        }
                        if (_c == '<')
                        {
                            if (_index < Text.Length)
                            {
                                if (Text[_index] == '%')
                                {
                                    // "<%" inside a quoted value: remember the state to return to
                                    _oldstate = _state;
                                    _state = ParseState.ServerSideCode;
                                    continue;
                                }
                            }
                        }
                        break;

                    case ParseState.Comment:
                        if (_c == '>')
                        {
                            if (_fullcomment)
                            {
                                // '>' only ends the comment when preceded by "--"
                                if ((Text[_index - 2] != '-') ||
                                    (Text[_index - 3] != '-'))
                                {
                                    continue;
                                }
                            }
                            // Kept inline: unlike the tag states, the switch to Text
                            // here is not conditional on the state being unchanged.
                            if (!PushNodeEnd(_index, false))
                            {
                                // stop parsing
                                _index = Text.Length;
                                break;
                            }
                            _state = ParseState.Text;
                            PushNodeStart(HtmlNodeType.Text, _index);
                            continue;
                        }
                        break;

                    case ParseState.ServerSideCode:
                        if (_c == '%')
                        {
                            if (_index < Text.Length)
                            {
                                if (Text[_index] == '>')
                                {
                                    switch (_oldstate)
                                    {
                                        case ParseState.AttributeAfterEquals:
                                            _state = ParseState.AttributeValue;
                                            break;

                                        case ParseState.BetweenAttributes:
                                            PushAttributeNameEnd(_index + 1);
                                            _state = ParseState.BetweenAttributes;
                                            break;

                                        default:
                                            _state = _oldstate;
                                            break;
                                    }
                                    IncrementPosition();
                                }
                            }
                        }
                        break;

                    case ParseState.PcData:
                        // look for </tag + 1 char

                        // check buffer end
                        if ((_currentnode._namelength + 3) <= (Text.Length - (_index - 1)))
                        {
                            if (string.Compare(Text.Substring(_index - 1, _currentnode._namelength + 2),
                                               "</" + _currentnode.Name, StringComparison.OrdinalIgnoreCase) == 0)
                            {
                                int c = Text[_index - 1 + 2 + _currentnode.Name.Length];
                                if ((c == '>') || (IsWhiteSpace(c)))
                                {
                                    // add the script as a text node
                                    HtmlNode script = CreateNode(HtmlNodeType.Text,
                                                                 _currentnode._outerstartindex +
                                                                 _currentnode._outerlength);
                                    script._outerlength = _index - 1 - script._outerstartindex;
                                    _currentnode.AppendChild(script);

                                    PushNodeStart(HtmlNodeType.Element, _index - 1);
                                    PushNodeNameStart(false, _index - 1 + 2);
                                    _state = ParseState.Tag;
                                    IncrementPosition();
                                }
                            }
                        }
                        break;
                }
            }

            // finish the current work
            if (_currentnode._namestartindex > 0)
            {
                PushNodeNameEnd(_index);
            }
            PushNodeEnd(_index, false);

            // we don't need this anymore
            Lastnodes.Clear();
        }

        /// <summary>
        /// Shared handling for a '&gt;' that ends the tag being parsed: calls
        /// <c>PushNodeEnd</c> at the current index and, if the parse state was not
        /// changed by that call, switches to <c>ParseState.Text</c> and starts a new
        /// text node at the current index.
        /// </summary>
        /// <param name="expectedState">
        /// The state the caller was handling; the switch to the Text state happens only
        /// if <c>_state</c> still equals this value after <c>PushNodeEnd</c> returns.
        /// </param>
        /// <param name="close">Value forwarded as the second argument of <c>PushNodeEnd</c>.</param>
        private void EndNodeAndResumeText(ParseState expectedState, bool close)
        {
            if (!PushNodeEnd(_index, close))
            {
                // stop parsing: moving the index to the end terminates the loop in Parse
                _index = Text.Length;
                return;
            }

            if (_state != expectedState)
                return;

            _state = ParseState.Text;
            PushNodeStart(HtmlNodeType.Text, _index);
        }
Пример #3
0
		/// <summary>
		/// Resets the parser bookkeeping fields, opens the initial text node at
		/// offset 0 and then delegates the actual scan to <c>DoParse</c>.
		/// </summary>
		private void Parse()
		{
			// Starting value handed to DoParse; nothing has been recorded yet.
			int initialQuote = 0;

			if (OptionComputeChecksum)
				_crc32 = new Crc32();

			// Fresh collections and counters for this run.
			_lastnodes = new Hashtable();
			_c = 0;
			_fullcomment = false;
			_parseerrors = new ArrayList();
			_line = 1;
			_lineposition = 1;
			_maxlineposition = 1;

			// Start in the Text state; the document node spans the whole input.
			_state = ParseState.Text;
			_oldstate = _state;
			_documentnode._innerlength = _text.FullLength;
			_documentnode._outerlength = _text.FullLength;

			_lastparentnode = _documentnode;
			_currentnode = CreateNode(HtmlNodeType.Text, 0);
			_currentattribute = null;

			_index = 0;
			PushNodeStart(HtmlNodeType.Text, 0);

			DoParse(initialQuote);
		}