Esempio n. 1
0
        /// <summary>
        /// Splits an HTML string into a tree of <see cref="WebItem"/> nodes.
        /// </summary>
        /// <remarks>
        /// The input is scanned one character at a time. A '&lt;' starts a new tag item and a
        /// '&gt;' closes it; every other character except CR and LF is appended to the item
        /// currently being built. Open tags are kept on <c>_stack</c> until their terminator is
        /// seen, at which point they are attached to their parent (or to the returned list when
        /// no parent is open). Tags still open when the input ends are unwound the same way.
        /// Content seen between a <c>script</c> tag and its <c>/script</c> terminator is not emitted.
        /// Text that follows the last tag of the input is not emitted.
        /// NOTE(review): script detection compares the whole of <c>WebItem.Text</c> with
        /// "script"; whether a script tag carrying attributes matches depends on what
        /// <c>WebItem.Text</c> returns for a tag - confirm against WebItem.
        /// </remarks>
        /// <param name="html">The HTML text to tokenise. Must not be null.</param>
        /// <returns>The root-level items found in <paramref name="html"/>.</returns>
        public List<WebItem> Tokenise(string html)
        {
            List<WebItem> page = new List<WebItem>();
            WebItem currentItem = new WebItem(false);
            bool inScript = false;

            foreach (char c in html)
            {
                switch (c)
                {
                    case '<':
                        // Flush pending text content before the new tag starts. The helper falls
                        // back to the page root, so text ahead of the first tag no longer throws.
                        if (!String.IsNullOrEmpty(currentItem.Text) && !currentItem.IsTag && !inScript)
                            AttachToParentOrPage(currentItem, page);

                        currentItem = new WebItem(true);
                        break;

                    case '>':
                    {
                        // A '>' seen while the current item is a comment does not close a tag.
                        if (currentItem.IsComment) continue;

                        // Ordinal, case-insensitive comparison: tag names are not linguistic
                        // text, and ToLower() gives wrong answers under cultures such as tr-TR.
                        string tagText = currentItem.Text;
                        bool opensScript = String.Equals(tagText, "script", StringComparison.OrdinalIgnoreCase);
                        bool closesScript = String.Equals(tagText, "/script", StringComparison.OrdinalIgnoreCase);

                        if (opensScript) inScript = true;

                        if (closesScript)
                        {
                            inScript = false;

                            // Start a fresh content item so that text following the script
                            // block is not appended to the stale "/script" tag and lost.
                            currentItem = new WebItem(false);
                            continue;
                        }

                        // Nothing inside a script block is tokenised.
                        if (inScript) continue;

                        if (currentItem.IsTerminatorTag)
                        {
                            // Only close the innermost open tag when the terminator names it.
                            // An empty stack or a mismatch most likely means a malformed page,
                            // so the terminator is ignored and the stack is left untouched.
                            bool matchesOpenTag = _stack.Count > 0
                                && String.Equals("/" + _stack.Peek().Text, tagText, StringComparison.OrdinalIgnoreCase);

                            if (matchesOpenTag)
                                AttachToParentOrPage(_stack.Pop(), page);

                            // Reset in both cases so text after an ignored terminator is
                            // collected as content rather than appended to the terminator.
                            currentItem = new WebItem(false);
                            continue;
                        }

                        // Self-terminated tags become leaves immediately; anything else stays
                        // open on the stack until its terminator arrives.
                        currentItem.CheckText();
                        if (currentItem.LastChar == '/' || currentItem.IsAutoTerminatorTag)
                            AttachToParentOrPage(currentItem, page);
                        else
                            _stack.Push(currentItem);

                        currentItem = new WebItem(false);
                        break;
                    }

                    case '\r':
                    case '\n':
                        // Line breaks are dropped from both tags and content.
                        break;

                    default:
                        currentItem.AddChar(c);
                        break;
                }
            }

            // The stack should be empty here; anything left is an unclosed tag (malformed
            // page or a tokeniser bug), so unwind it into the tree instead of losing it.
            while (_stack.Count > 0)
                AttachToParentOrPage(_stack.Pop(), page);

            return page;
        }

        /// <summary>
        /// Adds <paramref name="item"/> as a child of the innermost open tag, or to
        /// <paramref name="page"/> when no tag is currently open.
        /// </summary>
        private void AttachToParentOrPage(WebItem item, List<WebItem> page)
        {
            if (_stack.Count > 0)
                _stack.Peek().Children.Add(item);
            else
                page.Add(item);
        }