/// <summary>
/// Collects the extension data referenced by the smart content elements found in the given HTML.
/// </summary>
/// <param name="content">HTML to scan for smart content elements.</param>
/// <returns>One entry per distinct smart content id for which extension data was found.</returns>
public IExtensionData[] CalculateReferencedExtensionData(string content)
 {
     Hashtable referenced = new Hashtable();
     ContentSourceManager.SmartContentPredicate smartContentMatcher = new ContentSourceManager.SmartContentPredicate();
     SimpleHtmlParser htmlParser = new SimpleHtmlParser(content);

     for (Element current = htmlParser.Next(); current != null; current = htmlParser.Next())
     {
         if (!smartContentMatcher.IsMatch(current))
             continue;

         BeginTag tag = current as BeginTag;
         Attr idAttr = tag.GetAttribute("id");
         if (idAttr == null) //Synchronized WP posts will strip ID attrs (bug 488143)
             continue;

         string sourceId;
         string contentId;
         ContentSourceManager.ParseContainingElementId(idAttr.Value, out sourceId, out contentId);

         // Keyed by content id, so repeated references to the same item collapse into one entry.
         IExtensionData extensionData = GetExtensionData(contentId);
         if (extensionData != null)
             referenced[contentId] = extensionData;
     }

     return (IExtensionData[])ArrayHelper.CollectionToArray(referenced.Values, typeof(IExtensionData));
 }
        /// <summary>
        /// Looks for the test title between the two marker GUIDs in the homepage HTML and records,
        /// under "requiresHtmlTitles", whether it came back entity-escaped once or twice.
        /// </summary>
        /// <param name="homepageHtml">HTML of the homepage to scan.</param>
        /// <param name="results">Receives the "requiresHtmlTitles" result.</param>
        /// <exception cref="InvalidOperationException">No text element contained the marked title.</exception>
        protected internal override void HandleResult(string homepageHtml, ITestResults results)
        {
            Regex titlePattern = new Regex(Regex.Escape(guid1) + "(.*?)" + Regex.Escape(guid2));
            SimpleHtmlParser htmlParser = new SimpleHtmlParser(homepageHtml);

            Element current;
            while ((current = htmlParser.Next()) != null)
            {
                // Only text elements can carry the marked title.
                if (!(current is Text))
                    continue;

                Match titleMatch = titlePattern.Match(current.ToString());
                if (!titleMatch.Success)
                    continue;

                string found = titleMatch.Groups[1].Value;
                if (found == HtmlUtils.EscapeEntities(TEST_STRING))
                {
                    results.AddResult("requiresHtmlTitles", YES);
                }
                else if (found == HtmlUtils.EscapeEntities(HtmlUtils.EscapeEntities(TEST_STRING)))
                {
                    results.AddResult("requiresHtmlTitles", NO);
                }
                else
                {
                    results.AddResult("requiresHtmlTitles", "[ERROR] (value was: " + found + ")");
                }

                // The first text element containing the markers decides the result.
                return;
            }

            throw new InvalidOperationException("Title encoding test failed--title was not detected");
        }
Esempio n. 3
0
 /// <summary>
 /// Creates a form factory over the HTML read from the given stream.
 /// </summary>
 /// <param name="s">Stream containing the HTML; it is read to the end through a reader that is disposed afterwards.</param>
 public FormFactory(Stream s)
 {
     string html;
     using (StreamReader streamReader = new StreamReader(s))
         html = streamReader.ReadToEnd();

     parser = new SimpleHtmlParser(html);
 }
 /// <summary>
 /// Parses the HTML and rebuilds it from the result of calling Replace on each parsed element.
 /// </summary>
 /// <param name="html">HTML to process.</param>
 /// <returns>The concatenated replacement output, in element order.</returns>
 public virtual string Execute(string html)
 {
     SimpleHtmlParser htmlParser = new SimpleHtmlParser(html);
     StringBuilder result = new StringBuilder(html.Length);
     for (Element current = htmlParser.Next(); current != null; current = htmlParser.Next())
     {
         result.Append(Replace(current));
     }
     return result.ToString();
 }
        /// <summary>
        /// Runs every element of the parsed HTML through Replace and joins the results.
        /// </summary>
        /// <param name="html">HTML to process.</param>
        /// <returns>The concatenated replacement output, in element order.</returns>
        public virtual string Execute(string html)
        {
            SimpleHtmlParser htmlParser = new SimpleHtmlParser(html);
            StringBuilder rebuilt = new StringBuilder(html.Length);

            for (Element element = htmlParser.Next(); element != null; element = htmlParser.Next())
                rebuilt.Append(Replace(element));

            return rebuilt.ToString();
        }
        /// <summary>
        /// Copies the HTML to the output, wrapping object, embed, noembed and script elements in a
        /// marker span and recording their markup in the preserved table. Smart content divs are
        /// copied through without wrapping, together with their inner HTML.
        /// </summary>
        /// <param name="html">HTML to scan.</param>
        /// <returns>The HTML with each preserved element wrapped in a span whose id is "preserve" followed by a new GUID.</returns>
        public string ScanAndPreserve(string html)
        {
            StringBuilder result = new StringBuilder(html.Length);
            SimpleHtmlParser htmlParser = new SimpleHtmlParser(html);

            for (Element current = htmlParser.Next(); current != null; current = htmlParser.Next())
            {
                BeginTag tag = current as BeginTag;
                if (tag == null)
                {
                    // Anything that is not a begin tag is copied verbatim.
                    result.Append(html, current.Offset, current.Length);
                    continue;
                }

                if (tag.NameEquals("div"))
                {
                    string cssClass = tag.GetAttributeValue("class");
                    if (cssClass == ContentSourceManager.EDITABLE_SMART_CONTENT
                        || cssClass == ContentSourceManager.SMART_CONTENT)
                    {
                        // Smart content: emit the begin tag, everything up to its end tag, and the end tag.
                        result.Append(html, current.Offset, current.Length);
                        result.Append(htmlParser.CollectHtmlUntil("div"));
                        result.Append("</div>");
                        continue;
                    }
                }

                bool mustPreserve = tag.NameEquals("object")
                    || tag.NameEquals("embed")
                    || tag.NameEquals("noembed")
                    || tag.NameEquals("script");
                if (!mustPreserve)
                {
                    result.Append(html, current.Offset, current.Length);
                    continue;
                }

                // Capture the whole element so it can be looked up later by its generated id.
                string innerHtml = htmlParser.CollectHtmlUntil(tag.Name);
                string elementMarkup = tag.RawText + innerHtml + "</" + tag.Name + ">";

                string key = Guid.NewGuid().ToString("N");
                preserved[key] = elementMarkup;

                result.AppendFormat("<span id=\"preserve{0}\" class=\"{1}\">", key, PRESERVE_CLASS);
                result.Append(elementMarkup);
                result.Append("</span>");
            }

            return result.ToString();
        }
        /// <summary>
        /// Builds an element predicate from a criterion that consists of exactly one HTML element.
        /// </summary>
        /// <param name="criterion">Markup containing a single begin tag, end tag, text run, or comment.</param>
        /// <returns>A predicate of the kind corresponding to the element found in the criterion.</returns>
        /// <exception cref="ArgumentException">
        /// The criterion yields no element, yields more than one element, is a begin tag with residue
        /// or without termination, or is an element kind that is not supported.
        /// </exception>
        internal static IElementPredicate Parse(string criterion)
        {
            SimpleHtmlParser criterionParser = new SimpleHtmlParser(criterion);

            Element only = criterionParser.Next();
            if (only == null)
            {
                Debug.Fail("Criterion was null");
                throw new ArgumentException("Criterion was null");
            }

            // A second element means the criterion is ambiguous.
            if (criterionParser.Next() != null)
            {
                Debug.Fail("Too many criteria");
                throw new ArgumentException("Too many criteria");
            }

            BeginTag beginTag = only as BeginTag;
            if (beginTag != null)
            {
                if (beginTag.HasResidue || beginTag.Unterminated)
                {
                    Debug.Fail("Malformed criterion");
                    throw new ArgumentException("Malformed criterion");
                }

                // Every attribute on the criterion tag becomes a required attribute of the predicate.
                RequiredAttribute[] required = new RequiredAttribute[beginTag.Attributes.Length];
                for (int index = 0; index < required.Length; index++)
                    required[index] = new RequiredAttribute(beginTag.Attributes[index].Name, beginTag.Attributes[index].Value);

                return new BeginTagPredicate(beginTag.Name, required);
            }

            EndTag endTag = only as EndTag;
            if (endTag != null)
                return new EndTagPredicate(endTag.Name);

            if (only is Text)
                return new TextPredicate(only.RawText);

            if (only is Comment)
                return new CommentPredicate(only.RawText);

            string message = "Invalid criterion \"" + criterion + "\"";
            Debug.Fail(message);
            throw new ArgumentException(message);
        }
        /// <summary>
        /// Clones active smart content contained in the provided HTML, and disables unknown smart content.
        /// </summary>
        /// <param name="html">HTML about to be inserted into the editor.</param>
        /// <param name="sourceContext">Context used to find and clone smart content and to find its extension data.</param>
        /// <returns>
        /// The re-serialized HTML. Smart content elements known to <paramref name="sourceContext"/> carry a
        /// containing-element id built from a freshly generated item id; all other smart content elements
        /// (including those without an id attribute) have been passed to
        /// ContentSourceManager.RemoveSmartContentAttributes.
        /// </returns>
        public static string PrepareSmartContentHtmlForEditorInsertion(string html, IContentSourceSidebarContext sourceContext)
        {
            StringBuilder output = new StringBuilder();
            ContentSourceManager.SmartContentPredicate predicate = new ContentSourceManager.SmartContentPredicate();
            SimpleHtmlParser p = new SimpleHtmlParser(html);
            for (Element el; null != (el = p.Next());)
            {
                if (predicate.IsMatch(el))
                {
                    BeginTag bt = el as BeginTag;
                    Attr idAttr = bt.GetAttribute("id");

                    String contentSourceId = null, contentItemId = null;
                    ISmartContent smartContent = null;

                    // The id attribute can be missing (for example when it was stripped from a synchronized
                    // post). Without it the element cannot be resolved to a smart content item, so it is
                    // handled below like any other unknown smart content instead of dereferencing null.
                    if (idAttr != null)
                    {
                        ContentSourceManager.ParseContainingElementId(idAttr.Value, out contentSourceId, out contentItemId);
                        smartContent = sourceContext.FindSmartContent(contentItemId);
                    }

                    if (smartContent != null)
                    {
                        String newId = Guid.NewGuid().ToString();
                        sourceContext.CloneSmartContent(contentItemId, newId);

                        if (RefreshableContentManager.ContentSourcesWithRefreshableContent.Contains(contentSourceId))
                        {
                            IExtensionData extensionData = sourceContext.FindExtentsionData(newId);
                            Debug.Assert(extensionData != null);

                            // Since we just made a new id for the smart content just about to be inserted
                            // we want to give it a chance to get a callback because its callback might have happened while
                            // it was on the clipboard(in the event of cut).  This means the refreshable content manager doesnt know
                            // to watch out for this smart content on paste, it only knows to look out for who created it.   Thus
                            // we just force the callback, and if it didnt need it, nothing will happen.
                            // The null guard keeps release builds (where the assert above is compiled out) from crashing.
                            if (extensionData != null && extensionData.RefreshCallBack == null)
                            {
                                extensionData.RefreshCallBack = DateTime.UtcNow;
                            }
                        }

                        idAttr.Value = ContentSourceManager.MakeContainingElementId(contentSourceId, newId);
                    }
                    else
                    {
                        ContentSourceManager.RemoveSmartContentAttributes(bt);
                    }
                }
                output.Append(el.ToString());
            }
            return output.ToString();
        }
Esempio n. 9
0
 /// <summary>
 /// Parses the HTML and returns every element the parser yields, in order.
 /// </summary>
 /// <param name="html">HTML to parse.</param>
 /// <returns>The parsed elements; empty when the parser yields none.</returns>
 private static Element[] Elements(string html)
 {
     ArrayList collected = new ArrayList();
     SimpleHtmlParser htmlParser = new SimpleHtmlParser(html);
     for (Element current = htmlParser.Next(); current != null; current = htmlParser.Next())
     {
         collected.Add(current);
     }
     return (Element[])collected.ToArray(typeof(Element));
 }
Esempio n. 10
0
        /// <summary>
        /// Balances the HTML and safely truncates it, using a custom algorithm
        /// to determine how much each character/string counts against maxCost.
        /// </summary>
        /// <param name="html">HTML to balance and truncate.</param>
        /// <param name="maxCost">Cost budget for the emitted elements plus the end tags still needed to close them.</param>
        /// <param name="costFilter">Computes element costs and truncates text; a DefaultCostFilter is used when null.</param>
        /// <param name="ellipsis">When true, "..." is appended if a text element had to be truncated.</param>
        /// <returns>The emitted elements followed by an end tag for every tag still registered as open.</returns>
        public static string Balance(string html, int maxCost, HTMLBalancerCostFilter costFilter, bool ellipsis)
        {
            bool textWasTruncated = false;
            SimpleHtmlParser htmlParser = new SimpleHtmlParser(html);

            ArrayList unclosedTags = new ArrayList();
            StringBuilder result = new StringBuilder();
            long spent = 0;  // long to make sure that int32.MaxValue does not cause overflow

            if (costFilter == null)
                costFilter = new DefaultCostFilter();

            for (Element current = htmlParser.Next(); current != null; current = htmlParser.Next())
            {
                // Styles, scripts, comments and markup directives are never emitted.
                bool neverEmitted = current is StyleElement
                    || current is ScriptElement
                    || current is Comment
                    || current is MarkupDirective;
                if (neverEmitted)
                    continue;

                long remaining = Math.Max(0, maxCost - spent - LengthToClose(costFilter, unclosedTags));

                Tag tag = current as Tag;
                if (tag != null)
                {
                    BeginTag begin = tag as BeginTag;
                    if (begin != null && begin.Unterminated)
                        continue;  // skip corrupted tags

                    if (TagCost(costFilter, unclosedTags, tag) > remaining)
                        break;  // don't use this tag; we're done

                    RegisterTag(unclosedTags, tag);
                }
                else if (costFilter.ElementCost(current) > remaining)
                {
                    // Out of budget: text is shrunk to fit, any other element is simply dropped.
                    Text text = current as Text;
                    if (text != null)
                    {
                        result.Append(costFilter.TruncateText(text, (int)remaining));
                        textWasTruncated = true;
                    }
                    break;
                }

                // The element fits; emit it and charge its cost.
                result.Append(current.ToString());
                spent += costFilter.ElementCost(current);
            }

            // Append an ellipsis if we truncated text
            // We use "..." here rather than TextHelper.Ellipsis, because some mail clients don't understand "\u2026".
            if (textWasTruncated && ellipsis)
                result.Append("...");

            // Close whatever is still open, innermost tag first.
            for (int index = unclosedTags.Count - 1; index >= 0; index--)
                result.Append(MakeEndTag((string)unclosedTags[index]));

            return result.ToString();
        }
        /// <summary>
        /// Inserts "&amp;nbsp;" into every td begin tag that is immediately followed by its td end tag.
        /// HTML that does not contain the text "table" is left untouched.
        /// </summary>
        /// <param name="args">Carries the HTML to fix up; its Html property is replaced with the result.</param>
        private static void EditorContext_PerformTemporaryFixupsToEditedHtml(TemporaryFixupArgs args)
        {
            string sourceHtml = args.Html;
            if (!sourceHtml.Contains("table"))
                return;

            StringBuilder fixedHtml = new StringBuilder(sourceHtml.Length);
            SimpleHtmlParser htmlParser = new SimpleHtmlParser(sourceHtml);

            Element current;
            while ((current = htmlParser.Next()) != null)
            {
                fixedHtml.Append(sourceHtml, current.Offset, current.Length);

                BeginTag opening = current as BeginTag;
                if (opening == null || !opening.NameEquals("td"))
                    continue;

                // Look ahead without consuming: an immediate </td> means the cell is empty.
                EndTag following = htmlParser.Peek(0) as EndTag;
                if (following != null && following.NameEquals("td"))
                    fixedHtml.Append("&nbsp;");
            }

            args.Html = fixedHtml.ToString();
        }
        /// <summary>
        /// Replaces the empty div whose id is EXTENDED_ENTRY_ID with BlogPost.ExtendedEntryBreak.
        /// HTML that does not contain EXTENDED_ENTRY_ID is left untouched.
        /// </summary>
        /// <param name="args">Carries the HTML to fix up; its Html property is replaced with the result.</param>
        private static void DetachExtendedEntryBehavior(TemporaryFixupArgs args)
        {
            string sourceHtml = args.Html;
            if (!sourceHtml.Contains(EXTENDED_ENTRY_ID))
                return;

            //replace the EXTENDED_ENTRY_ID behavior div with the <!--more--> comment
            StringBuilder rewritten = new StringBuilder(sourceHtml.Length);
            SimpleHtmlParser htmlParser = new SimpleHtmlParser(sourceHtml);
            SmartPredicate splitDivMatcher = new SmartPredicate(String.Format(CultureInfo.InvariantCulture, "<div id='{0}'>", EXTENDED_ENTRY_ID));

            Element current;
            while ((current = htmlParser.Next()) != null)
            {
                if (!splitDivMatcher.IsMatch(current))
                {
                    rewritten.Append(sourceHtml, current.Offset, current.Length);
                    continue;
                }

                // NOTE(review): a matching div that is NOT immediately followed by </div> is dropped
                // from the output while its later end tag is kept - confirm this is intended.
                EndTag closing = htmlParser.Peek(0) as EndTag;
                if (closing != null && closing.NameEquals("div"))
                {
                    rewritten.Append(BlogPost.ExtendedEntryBreak);
                    htmlParser.Next(); // consume the </div> that was just peeked
                }
            }

            args.Html = rewritten.ToString();
        }
 /// <summary>
 /// Creates a text source that keeps a reference to the given HTML parser.
 /// </summary>
 /// <param name="parser">Parser stored for later use by this instance.</param>
 public HtmlTextSource(SimpleHtmlParser parser)
 {
     _parser = parser;
 }
Esempio n. 14
0
 /// <summary>
 /// Namespaced tags come with Office 2007 clipboard data and result in weird
 /// namespace declarations being inserted as text into the DOM.
 /// This removes tags whose name contains ':', comments and markup directives, and strips
 /// attributes whose name matches ILLEGAL_ATTR_REGEX from the remaining begin tags.
 /// </summary>
 /// <param name="html">HTML to clean.</param>
 /// <returns>The re-serialized HTML without the removed elements and attributes.</returns>
 public static string StripNamespacedTagsAndCommentsAndMarkupDirectives(string html)
 {
     StringBuilder cleaned = new StringBuilder(html.Length);
     SimpleHtmlParser htmlParser = new SimpleHtmlParser(html);

     Element current;
     while ((current = htmlParser.Next()) != null)
     {
         bool namespacedTag = current is Tag && ((Tag)current).Name.IndexOf(':') >= 0;
         if (namespacedTag || current is Comment || current is MarkupDirective)
             continue;

         BeginTag beginTag = current as BeginTag;
         if (beginTag != null)
         {
             foreach (Attr attribute in beginTag.Attributes)
             {
                 if (ILLEGAL_ATTR_REGEX.IsMatch(attribute.Name))
                     beginTag.RemoveAttribute(attribute.Name);
             }
         }

         cleaned.Append(current.ToString());
     }

     return cleaned.ToString();
 }
Esempio n. 15
0
        /// <summary>
        /// Replaces every "\r\n" inside text elements with "&lt;br/&gt;"; all other elements are copied
        /// as their raw text. Lone "\r" or "\n" characters are not converted.
        /// </summary>
        /// <param name="html">HTML to convert.</param>
        /// <returns>The HTML with the line breaks in text converted.</returns>
        public static string ConvertNewLinesToBr(string html)
        {
            SimpleHtmlParser htmlParser = new SimpleHtmlParser(html);
            StringBuilder converted = new StringBuilder();

            for (Element current = htmlParser.Next(); current != null; current = htmlParser.Next())
            {
                string raw = current.RawText;
                converted.Append(current is Text ? raw.Replace("\r\n", "<br/>") : raw);
            }

            return converted.ToString();
        }
        /// <summary>
        /// Refuses a new line when the HTML contains an img tag whose class attribute contains
        /// Emoticon.CLASS_NAME; otherwise defers to the base implementation.
        /// </summary>
        /// <param name="html">HTML that is being inserted.</param>
        /// <returns>false for HTML containing an emoticon image, else the base class's answer.</returns>
        protected override bool ShouldAllowNewLineInsert(string html)
        {
            SimpleHtmlParser htmlParser = new SimpleHtmlParser(html);

            Element current;
            while ((current = htmlParser.Next()) != null)
            {
                BeginTag image = current as BeginTag;
                if (image == null || !image.NameEquals("img"))
                    continue;

                // Don't allow new lines after emoticons.
                string cssClasses = image.GetAttributeValue("class");
                bool isEmoticon = !String.IsNullOrEmpty(cssClasses) && cssClasses.Contains(Emoticon.CLASS_NAME);
                if (isEmoticon)
                    return false;
            }

            return base.ShouldAllowNewLineInsert(html);
        }
Esempio n. 17
0
 /// <summary>
 /// Repositions the extractor at the beginning of the HTML by discarding
 /// the last match and starting over with a new parser.
 /// </summary>
 /// <returns>Returns this, which allows calls to be chained together,
 /// like this:
 ///
 /// if (ex.Seek(...).Success || ex.Reset().Seek(...).Success) { ... }
 /// </returns>
 public HtmlExtractor Reset()
 {
     this.lastMatch = null;
     this.parser = new SimpleHtmlParser(this.html);
     return this;
 }
Esempio n. 18
0
 /// <summary>
 /// Creates an extractor over the given HTML.
 /// </summary>
 /// <param name="html">HTML kept by the extractor and handed to a new parser.</param>
 public HtmlExtractor(string html)
 {
     this.html = html;
     parser = new SimpleHtmlParser(this.html);
 }
Esempio n. 19
0
 /// <summary>
 /// Creates an extractor over the HTML read from a stream.
 /// </summary>
 /// <param name="data">Stream containing the HTML; it is read to the end through a reader that is disposed afterwards.</param>
 /// <param name="encoding">Encoding used to decode the stream.</param>
 public HtmlExtractor(Stream data, Encoding encoding)
 {
     using (StreamReader streamReader = new StreamReader(data, encoding))
     {
         this.html = streamReader.ReadToEnd();
     }

     parser = new SimpleHtmlParser(this.html);
 }
Esempio n. 20
0
        /// <summary>
        /// Turns a criterion made of exactly one HTML element into the matching element predicate.
        /// </summary>
        /// <param name="criterion">Markup containing a single begin tag, end tag, text run, or comment.</param>
        /// <returns>A predicate of the kind corresponding to the element found in the criterion.</returns>
        /// <exception cref="ArgumentException">
        /// The criterion yields no element, yields more than one element, is a begin tag with residue
        /// or without termination, or is an element kind that is not supported.
        /// </exception>
        internal static IElementPredicate Parse(string criterion)
        {
            SimpleHtmlParser criterionParser = new SimpleHtmlParser(criterion);

            Element only = criterionParser.Next();
            if (only == null)
            {
                Trace.Fail("Criterion was null");
                throw new ArgumentException("Criterion was null");
            }

            // Anything after the first element makes the criterion ambiguous.
            if (criterionParser.Next() != null)
            {
                Trace.Fail("Too many criteria");
                throw new ArgumentException("Too many criteria");
            }

            BeginTag beginTag = only as BeginTag;
            if (beginTag != null)
            {
                if (beginTag.HasResidue || beginTag.Unterminated)
                {
                    Trace.Fail("Malformed criterion");
                    throw new ArgumentException("Malformed criterion");
                }

                // Each attribute written on the criterion tag must be present on a matching element.
                RequiredAttribute[] required = new RequiredAttribute[beginTag.Attributes.Length];
                for (int index = 0; index < required.Length; index++)
                {
                    required[index] = new RequiredAttribute(beginTag.Attributes[index].Name, beginTag.Attributes[index].Value);
                }
                return new BeginTagPredicate(beginTag.Name, required);
            }

            EndTag endTag = only as EndTag;
            if (endTag != null)
                return new EndTagPredicate(endTag.Name);

            if (only is Text)
                return new TextPredicate(only.RawText);

            if (only is Comment)
                return new CommentPredicate(only.RawText);

            string message = "Invalid criterion \"" + criterion + "\"";
            Trace.Fail(message);
            throw new ArgumentException(message);
        }
        /// <summary>
        /// Walks the current contents to find smart content areas.  When one is found, it calls the operation on the smart content.  The operation has a chance
        /// to return new content.  If the content is non-null it will replace the current content.
        /// </summary>
        /// <param name="contents">the raw HTML string whose structured blocks will be replaced.</param>
        /// <param name="operation">Delegate for generating replacement content.</param>
        /// <param name="editMode">If true, then the element's stylename will be activated for editing</param>
        /// <param name="sourceContext">Context used to look up the content source and the smart content item of each block.</param>
        /// <param name="continueOnError">
        /// true - if the plugin throws an exception, it keeps crawling the DOM
        /// false - if a plugin throws an exception, it stops processing the DOM and return empty string
        /// null - if a plugin throws an exception, this function will rethrow it
        /// </param>
        /// <returns>the contents with structured blocks replaced.</returns>
        internal static string PerformOperation(string contents, SmartContentOperation operation, bool editMode, IContentSourceSidebarContext sourceContext, bool? continueOnError)
        {
            //replace all structured content blocks with their editor HTML
            //string html = PostBodyPreprocessor.Preprocess(contents);
            StringBuilder sb = new StringBuilder();
            SimpleHtmlParser parser = new SimpleHtmlParser(contents);
            for (Element e = parser.Next(); e != null; e = parser.Next())
            {

                if (e is BeginTag)
                {
                    BeginTag beginTag = (BeginTag)e;
                    string elementClassName = beginTag.GetAttributeValue("class");
                    if (ContentSourceManager.IsSmartContentClass(elementClassName))
                    {
                        ISmartContent sContent = null;

                        // Set once the block's inner HTML (and its end tag) has been pulled out of the parser,
                        // so that a failing operation can still emit the block in a well-formed way.
                        bool innerHtmlCollected = false;
                        string originalInnerHtml = null;
                        try
                        {
                            string contentSourceId, contentItemId;
                            string blockId = beginTag.GetAttributeValue("id");
                            if (blockId != null)
                            {
                                ContentSourceManager.ParseContainingElementId(blockId, out contentSourceId, out contentItemId);

                                ContentSourceInfo contentSource = sourceContext.FindContentSource(contentSourceId);
                                if (contentSource != null && contentSource.Instance is SmartContentSource)
                                {
                                    SmartContentSource sSource = (SmartContentSource)contentSource.Instance;
                                    sContent = sourceContext.FindSmartContent(contentItemId);
                                    if (sContent != null)
                                    {

                                        //write the div with the appropriate className
                                        string newClassName = editMode ? ContentSourceManager.EDITABLE_SMART_CONTENT : ContentSourceManager.SMART_CONTENT;
                                        beginTag.GetAttribute("class").Value = newClassName;

                                        //replace the inner HTML of the div with the source's editor HTML
                                        originalInnerHtml = parser.CollectHtmlUntil("div");
                                        innerHtmlCollected = true;
                                        string content = originalInnerHtml;

                                        string beginTagHtml = e.ToString();

                                        // Nothing is written until the operation has succeeded; otherwise a
                                        // throwing operation would leave a dangling begin tag in the output.
                                        operation(sourceContext, sSource, sContent, ref content);

                                        sb.Append(beginTagHtml);

                                        sb.Append(content);

                                        sb.Append("</div>");
                                        continue;
                                    }
                                }
                            }
                        }
                        catch (Exception ex)
                        {
                            Trace.WriteLine(String.Format(CultureInfo.InvariantCulture, "Error loading smart content item\r\n{0}", ex));
                            sContent = null;

                            if (continueOnError == null)
                                throw;

                            if (!continueOnError.Value)
                                return String.Empty;
                        }

                        if (sContent == null)
                        {
                            //this element references an unknown smart content, so it should not be editable
                            Attr classAttr = beginTag.GetAttribute("class");
                            classAttr.Value = ContentSourceManager.SMART_CONTENT;
                        }

                        if (innerHtmlCollected)
                        {
                            // The operation failed after the parser had already consumed the block's inner
                            // HTML and end tag. Emit the (now non-editable) block with its original content
                            // so the output stays balanced and nothing is lost.
                            sb.Append(e.ToString());
                            sb.Append(originalInnerHtml);
                            sb.Append("</div>");
                            continue;
                        }
                    }
                }
                sb.Append(e.ToString());
            }

            return sb.ToString();
        }
Esempio n. 22
0
        /// <summary>
        /// Decides if, after the given HTML is inserted, we should insert a new line.
        /// </summary>
        /// <param name="html">HTML that is being inserted.</param>
        /// <returns>true when the HTML contains a div or img begin tag; otherwise false.</returns>
        protected virtual bool ShouldAllowNewLineInsert(string html)
        {
            SimpleHtmlParser htmlParser = new SimpleHtmlParser(html);

            Element current;
            while ((current = htmlParser.Next()) != null)
            {
                BeginTag tag = current as BeginTag;
                if (tag == null)
                    continue;

                if (tag.NameEquals("div") || tag.NameEquals("img"))
                    return true;
            }

            return false;
        }
 /// <summary>
 /// Creates an extractor over the given HTML.
 /// </summary>
 /// <param name="html">HTML kept by the extractor and handed to a new parser.</param>
 public HtmlExtractor(string html)
 {
     this.html = html;
     parser = new SimpleHtmlParser(this.html);
 }
Esempio n. 24
0
 /// <summary>
 /// Namespaced tags come with Office 2007 clipboard data and result in weird
 /// namespace declarations being inserted as text into the DOM. (Bug 303784)
 /// </summary>
 /// <param name="html">HTML to clean.</param>
 /// <returns>The raw text of every element except tags whose name contains ':'.</returns>
 private static string StripNamespacedTags(string html)
 {
     StringBuilder kept = new StringBuilder(html.Length);
     SimpleHtmlParser htmlParser = new SimpleHtmlParser(html);

     Element current;
     while ((current = htmlParser.Next()) != null)
     {
         Tag tag = current as Tag;
         bool namespaced = tag != null && tag.Name.IndexOf(':') >= 0;
         if (!namespaced)
             kept.Append(current.RawText);
     }

     return kept.ToString();
 }
Esempio n. 25
0
 /// <summary>
 /// Creates a form factory over the given HTML.
 /// </summary>
 /// <param name="html">HTML handed to a new parser.</param>
 public FormFactory(string html)
 {
     SimpleHtmlParser htmlParser = new SimpleHtmlParser(html);
     parser = htmlParser;
 }
        /// <summary>
        /// Re-serializes the HTML after marking as complete every begin tag whose element,
        /// according to ElementFilters.RequiresEndTag, does not need an end tag.
        /// </summary>
        /// <param name="html">HTML to process.</param>
        /// <returns>The re-serialized HTML.</returns>
        private static string BalanceHtml(string html)
        {
            StringBuilder balanced = new StringBuilder(html.Length + 10);
            SimpleHtmlParser htmlParser = new SimpleHtmlParser(html);

            for (Element current = htmlParser.Next(); current != null; current = htmlParser.Next())
            {
                BeginTag beginTag = current as BeginTag;
                if (beginTag != null && !ElementFilters.RequiresEndTag(beginTag.Name))
                    beginTag.Complete = true;

                balanced.Append(current.ToString());
            }

            return balanced.ToString();
        }
        /// <summary>
        /// Rewrites the HTML keeping only tags that have an entry in the replacement table, substituting
        /// tag names where the entry asks for it, and re-escaping the entities of all text.
        /// </summary>
        /// <param name="html">HTML to thin.</param>
        /// <param name="preserveImages">When false, tags whose resulting name is TAG_IMG are dropped.</param>
        /// <param name="strict">When true, _tagSpecsStrict is used as the replacement table instead of _tagSpecs.</param>
        /// <param name="modifyReplacements">Optional callbacks that adjust a clone of the replacement table before it is used.</param>
        /// <returns>The thinned HTML.</returns>
        private string ThinInternal(string html, bool preserveImages, bool strict, params ModifyReplacement[] modifyReplacements)
        {
            Hashtable replacements = _tagSpecs;
            if (strict)
            {
                replacements = _tagSpecsStrict;
            }

            if (modifyReplacements != null)
            {
                // Work on a clone so the callbacks never alter the shared table.
                replacements = (Hashtable)replacements.Clone();
                foreach (ModifyReplacement modifyReplacement in modifyReplacements)
                    modifyReplacement(replacements);
            }

            // Will hold the results of the leading whitespace buffer.
            // This buffer may or may not make it into the final result,
            // depending on whether any block-level tags are present.
            StringBuilder leadingOutput = new StringBuilder(10);
            // Will hold the results of everything else.
            StringBuilder mainOutput = new StringBuilder(html.Length);

            // references whichever output buffer is current.
            StringBuilder output = leadingOutput;

            SimpleHtmlParser parser = new SimpleHtmlParser(html);
            Element el;

            bool preserveWhitespace = false;  // <pre> blocks should preserve whitespace
            WhitespaceBuffer whitespaceBuffer = new WhitespaceBuffer();
            whitespaceBuffer.Promote(WhitespaceClass.Paragraph);  // Insert an implicit <p> unless the first non-whitespace element is a block
            bool hasBlock = false;

            while (null != (el = parser.Next()))
            {
                if (el is Tag)
                {
                    Tag t = (Tag)el;
                    string lowerName = t.Name.ToLower(CultureInfo.InvariantCulture);

                    TagDesc desc = (TagDesc)replacements[lowerName];
                    // if this tag is not in the table, drop it
                    if (desc == null)
                        continue;

                    // Replace tag with substitute tag if necessary (e.g. <DIV> becomes <P>)
                    string tagName = desc.Substitute;
                    if (tagName == null)
                        tagName = lowerName;

                    // special case for images
                    if (!preserveImages && tagName == TAG_IMG)
                        continue;

                    bool beginTag = el is BeginTag;

                    ElementClass elClass = WhitespaceBuffer.ClassifyTag(tagName, desc.TagType);
                    hasBlock |= (elClass == ElementClass.Block || elClass == ElementClass.Paragraph || elClass == ElementClass.Break);
                    // Outside <pre>, the whitespace buffer sees the tag first; a true result means the tag is not written.
                    if (!preserveWhitespace && WhitespaceBuffer.ProcessElementClass(ref whitespaceBuffer, output, elClass, true))
                        continue;

                    // From the first tag that is actually written, everything goes to the main buffer.
                    output = mainOutput;

                    if (beginTag)
                    {
                        WriteBeginTag(desc, tagName, ((BeginTag)el).Attributes, output);
                        if (tagName == TAG_PRE)
                            preserveWhitespace = true;
                    }
                    else if (el is EndTag)
                    {
                        // Implicit end tags and end tags of empty-type elements are not written.
                        if (!((EndTag)el).Implicit && desc.TagType != TagType.Empty)
                        {
                            output.Append(string.Format(CultureInfo.InvariantCulture, "</{0}>", tagName));
                        }
                        if (tagName == TAG_PRE)
                            preserveWhitespace = false;
                    }
                }
                else if (el is Text)
                {
                    string text = el.RawText;
                    // Unescape and then escape again so the text is emitted with consistently escaped entities.
                    text = HtmlUtils.EscapeEntities(HtmlUtils.UnEscapeEntities(text, HtmlUtils.UnEscapeMode.NonMarkupText));

                    if (!preserveWhitespace && WhitespaceBuffer.ProcessElementClass(ref whitespaceBuffer, output, WhitespaceBuffer.ClassifyText(text), false))
                        continue;

                    output = mainOutput;

                    output.Append(text);
                }
                // Elements that are neither tags nor text are dropped.
            }

            // The leading buffer is only kept when a block-level tag was seen and the main buffer is in use.
            if (hasBlock && ReferenceEquals(mainOutput, output))
                output.Insert(0, leadingOutput.ToString());

            // The whitespace buffer may not be empty at this point.  That's OK--we want to drop trailing whitespace

            return output.ToString();
        }
        /// <summary>
        /// Normalizes HTML markup: tag names, attribute names and style text are lowercased,
        /// begin tags rejected by RemoveMeaninglessTags are dropped, and the contenteditable,
        /// atomicselection and unselectable attributes are stripped.
        /// </summary>
        /// <param name="html">The HTML to clean.</param>
        /// <param name="xml">
        /// When true, attribute values and text are escaped with EscapeEntitiesForXml; otherwise
        /// attribute values are escaped with EscapeEntities and text is only run through TidyNbsps.
        /// </param>
        /// <returns>The cleaned HTML.</returns>
        private string CleanupHtml(string html, bool xml)
        {
            bool removedTag;
            do
            {
                removedTag = false;
                StringBuilder cleaned = new StringBuilder(html.Length);
                SimpleHtmlParser parser = new SimpleHtmlParser(html);

                for (Element current = parser.Next(); current != null; current = parser.Next())
                {
                    BeginTag beginTag = current as BeginTag;
                    if (beginTag != null)
                    {
                        if (RemoveMeaninglessTags(parser, beginTag))
                        {
                            // Dropping one tag can leave another one empty, so schedule a further pass.
                            removedTag = true;
                            continue;
                        }

                        cleaned.Append("<").Append(beginTag.Name.ToLower(CultureInfo.InvariantCulture));

                        foreach (Attr attribute in beginTag.Attributes)
                        {
                            // These three attributes are never written to the output.
                            bool stripped = attribute.NameEquals("contenteditable")
                                            || attribute.NameEquals("atomicselection")
                                            || attribute.NameEquals("unselectable");
                            if (stripped)
                                continue;

                            cleaned.Append(" ").Append(attribute.Name.ToLower(CultureInfo.InvariantCulture));

                            // A null value means the attribute is emitted as a bare name.
                            if (attribute.Value == null)
                                continue;

                            string value = attribute.Value;
                            if (attribute.NameEquals("style"))
                            {
                                value = LowerCaseCss(value);
                            }
                            else if (attribute.Name == attribute.Value)
                            {
                                // Value repeats the attribute name exactly: lowercase it like the name.
                                value = value.ToLower(CultureInfo.InvariantCulture);
                            }

                            string escapedValue = xml
                                ? HtmlUtils.EscapeEntitiesForXml(value, true)
                                : HtmlUtils.EscapeEntities(value);
                            cleaned.Append("=\"").Append(escapedValue).Append("\"");
                        }

                        if (beginTag.HasResidue)
                        {
                            // With no attributes written, a separating space is added before the residue.
                            if (beginTag.Attributes.Length == 0)
                                cleaned.Append(" ");
                            cleaned.Append(beginTag.Residue);
                        }

                        cleaned.Append(beginTag.Complete ? " />" : ">");
                    }
                    else if (current is EndTag)
                    {
                        string closingName = ((EndTag)current).Name.ToLower(CultureInfo.InvariantCulture);
                        cleaned.Append("</").Append(closingName).Append(">");
                    }
                    else if (current is Text)
                    {
                        string text = HtmlUtils.TidyNbsps(current.RawText);
                        if (xml)
                        {
                            string unescaped = HtmlUtils.UnEscapeEntities(text, HtmlUtils.UnEscapeMode.NonMarkupText);
                            text = HtmlUtils.EscapeEntitiesForXml(unescaped, false);
                        }
                        cleaned.Append(text);
                    }
                    else if (current is StyleText)
                    {
                        cleaned.Append(current.RawText.ToLower(CultureInfo.InvariantCulture));
                    }
                    else
                    {
                        // Every other element kind is copied through untouched.
                        cleaned.Append(current.RawText);
                    }
                }

                html = cleaned.ToString();
            } while (removedTag);

            return html;
        }
        /// <summary>
        /// Determines whether the div tags in the given HTML fail to pair up.
        /// </summary>
        /// <param name="html">The HTML to scan.</param>
        /// <returns>
        /// true if a closing div appears while no div is open, or if the number of opening
        /// and closing div tags differs; otherwise false.
        /// </returns>
        public static bool ContainsUnbalancedDivs(string html)
        {
            int openDivs = 0;
            SimpleHtmlParser p = new SimpleHtmlParser(html);
            for (Element e; (e = p.Next()) != null;)
            {
                Tag tag = e as Tag;
                if (tag == null || !tag.NameEquals("div"))
                    continue;

                if (tag is BeginTag)
                {
                    ++openDivs;
                }
                else if (--openDivs < 0)
                {
                    // A closing div with nothing open can never be balanced by a later opening
                    // div (e.g. "</div><div>"), even though the net count would return to zero.
                    return true;
                }
            }

            return openDivs != 0;
        }
 /// <summary>
 /// Creates an extractor over the HTML read from a stream.
 /// </summary>
 /// <param name="data">Stream containing the HTML; it is read to the end.</param>
 /// <param name="encoding">Encoding used to decode the stream.</param>
 public HtmlExtractor(Stream data, Encoding encoding)
 {
     // Read the whole document up front; the reader is disposed as soon as the text is captured.
     using (StreamReader reader = new StreamReader(data, encoding))
     {
         html = reader.ReadToEnd();
     }

     parser = new SimpleHtmlParser(html);
 }
        /// <summary>
        /// Checks whether a begin tag opens a meaningless element, such as &lt;p&gt;&lt;/p&gt;,
        /// &lt;a href="..."&gt;&lt;/a&gt; or &lt;a href="..."&gt;&amp;nbsp;&lt;/a&gt;, and if so
        /// consumes the remainder of that element from the parser.
        /// </summary>
        /// <param name="htmlParser">The parser that produced <paramref name="bt"/>; it is peeked and, on a match, advanced past the element.</param>
        /// <param name="bt">The begin tag to examine.</param>
        /// <returns>true if the element is meaningless and its remaining parts were consumed; otherwise false.</returns>
        private static bool RemoveMeaninglessTags(SimpleHtmlParser htmlParser, BeginTag bt)
        {
            // Case 1: a <p> with no attributes and no residue, immediately followed by </p>.
            bool isBareParagraph = bt.NameEquals("p") && bt.Attributes.Length == 0 && !bt.HasResidue;
            if (isBareParagraph)
            {
                EndTag paragraphEnd = htmlParser.Peek(0) as EndTag;
                if (paragraphEnd != null && paragraphEnd.NameEquals("p"))
                {
                    // Swallow the end tag as well.
                    htmlParser.Next();
                    return true;
                }
            }

            // Case 2: an <a> that has an href but no name, style or id, so an empty one is of no use.
            bool isPlainLink = bt.NameEquals("a")
                               && bt.GetAttribute("name") == null
                               && bt.GetAttributeValue("style") == null
                               && bt.GetAttributeValue("id") == null
                               && bt.GetAttributeValue("href") != null;
            if (!isPlainLink)
                return false;

            // One element to consume (the end tag), or two when whitespace-only text precedes it.
            int elementsToConsume = 1;
            Element following = htmlParser.Peek(0);
            if (following is Text
                && HtmlUtils.UnEscapeEntities(following.RawText, HtmlUtils.UnEscapeMode.NonMarkupText).Trim().Length == 0)
            {
                following = htmlParser.Peek(1);
                elementsToConsume = 2;
            }

            EndTag linkEnd = following as EndTag;
            if (linkEnd == null || !linkEnd.NameEquals("a"))
                return false;

            for (int i = 0; i < elementsToConsume; i++)
                htmlParser.Next();

            return true;
        }
 /// <summary>
 /// Rewinds the extractor to the start of the HTML by clearing the
 /// last match and creating a fresh parser over the stored HTML.
 /// </summary>
 /// <returns>This instance, so that calls can be chained together,
 /// like this:
 ///
 /// if (ex.Seek(...).Success || ex.Reset().Seek(...).Success) { ... }
 /// </returns>
 public HtmlExtractor Reset()
 {
     lastMatch = null;
     parser = new SimpleHtmlParser(html);

     return this;
 }
        /// <summary>
        /// Walks the HTML element by element, invoking the matching On* callback for each one.
        /// OnDocumentBegin is called before the first element and OnDocumentEnd after the last.
        /// </summary>
        public void Parse()
        {
            SimpleHtmlParser htmlParser = new SimpleHtmlParser(_html);

            OnDocumentBegin();

            // The type checks below are order-sensitive and kept in their established sequence.
            for (Element current = htmlParser.Next(); current != null; current = htmlParser.Next())
            {
                if (current is BeginTag)
                    OnBeginTag((BeginTag)current);
                else if (current is EndTag)
                    OnEndTag((EndTag)current);
                else if (current is ScriptLiteral)
                    OnScriptLiteral((ScriptLiteral)current);
                else if (current is Comment)
                    OnComment((Comment)current);
                else if (current is MarkupDirective)
                    OnMarkupDirective((MarkupDirective)current);
                else if (current is ScriptText)
                    OnScriptText((ScriptText)current);
                else if (current is ScriptComment)
                    OnScriptComment((ScriptComment)current);
                else if (current is StyleText)
                    OnStyleText((StyleText)current);
                else if (current is StyleUrl)
                    OnStyleUrl((StyleUrl)current);
                else if (current is StyleImport)
                    OnStyleImport((StyleImport)current);
                else if (current is StyleComment)
                    OnStyleComment((StyleComment)current);
                else if (current is StyleLiteral)
                    OnStyleLiteral((StyleLiteral)current);
                else if (current is Text)
                    OnText((Text)current);
                else
                    Debug.Fail("Unrecognized element in LightWeightHTMLDocumentIterator");
            }

            // The parser returned null: the document is exhausted.
            OnDocumentEnd();
        }