예제 #1
0
 /// <summary>
 /// Rewrites the URL carried by anchor and image begin tags.
 /// </summary>
 /// <param name="el">The parsed element to (possibly) replace.</param>
 /// <returns>
 /// The re-serialized tag when it is an "a" with a non-null href value or an "img" with a
 /// non-null src value; otherwise whatever the base implementation returns.
 /// </returns>
 protected override string Replace(Element el)
 {
     BeginTag tag = el as BeginTag;
     if (tag != null)
     {
         // Pick the attribute that holds the URL for the tags we care about.
         string urlAttributeName = null;
         if (tag.NameEquals("a"))
         {
             urlAttributeName = "href";
         }
         else if (tag.NameEquals("img"))
         {
             urlAttributeName = "src";
         }

         if (urlAttributeName != null)
         {
             Attr urlAttribute = tag.GetAttribute(urlAttributeName);
             if (urlAttribute != null && urlAttribute.Value != null)
             {
                 urlAttribute.Value = ConvertUrl(urlAttribute.Value);
                 return tag.ToString();
             }
         }
     }

     // Anything else (or a tag without a usable URL) is handled by the base class.
     return base.Replace(el);
 }
        /// <summary>
        /// Walks <paramref name="html"/> element by element and rebuilds it, wrapping
        /// object/embed/noembed/script elements in a marker span and recording their markup.
        /// </summary>
        /// <param name="html">The HTML to scan.</param>
        /// <returns>
        /// The rebuilt HTML. Smart-content divs are copied together with everything collected up
        /// to their closing div; preserved elements are wrapped in a span whose id is "preserve"
        /// followed by a fresh GUID, and their markup is stored in <c>preserved</c> under that GUID.
        /// </returns>
        public string ScanAndPreserve(string html)
        {
            StringBuilder output = new StringBuilder(html.Length);
            SimpleHtmlParser parser = new SimpleHtmlParser(html);

            for (Element element = parser.Next(); element != null; element = parser.Next())
            {
                BeginTag beginTag = element as BeginTag;
                if (beginTag == null)
                {
                    // Not a begin tag: copy the source text through unchanged.
                    output.Append(html, element.Offset, element.Length);
                    continue;
                }

                if (beginTag.NameEquals("div"))
                {
                    string cssClass = beginTag.GetAttributeValue("class");
                    if (cssClass == ContentSourceManager.EDITABLE_SMART_CONTENT ||
                        cssClass == ContentSourceManager.SMART_CONTENT)
                    {
                        // Smart content is copied as a unit, up to and including its closing div.
                        output.Append(html, element.Offset, element.Length);
                        output.Append(parser.CollectHtmlUntil("div"));
                        output.Append("</div>");
                        continue;
                    }
                }

                bool mustPreserve = beginTag.NameEquals("object") ||
                                    beginTag.NameEquals("embed") ||
                                    beginTag.NameEquals("noembed") ||
                                    beginTag.NameEquals("script");
                if (!mustPreserve)
                {
                    output.Append(html, element.Offset, element.Length);
                    continue;
                }

                // Capture the whole element (begin tag, contents, synthesized end tag).
                string inner = parser.CollectHtmlUntil(beginTag.Name);
                string preservedMarkup = beginTag.RawText + inner + "</" + beginTag.Name + ">";

                string preserveId = Guid.NewGuid().ToString("N");
                preserved[preserveId] = preservedMarkup;

                output.AppendFormat("<span id=\"preserve{0}\" class=\"{1}\">", preserveId, PRESERVE_CLASS);
                output.Append(preservedMarkup);
                output.Append("</span>");
            }

            return output.ToString();
        }
            /// <summary>
            /// Tests whether an element is a meta tag whose name is "generator" and whose
            /// content equals "blogger" (compared case-insensitively, invariant culture).
            /// </summary>
            /// <param name="e">The element to test.</param>
            /// <returns><c>true</c> only when every condition above holds.</returns>
            public bool IsMatch(Element e)
            {
                BeginTag metaTag = e as BeginTag;

                // Reject anything that is not <meta name="generator" ...>.
                if (metaTag == null ||
                    !metaTag.NameEquals("meta") ||
                    metaTag.GetAttributeValue("name") != "generator")
                {
                    return false;
                }

                string content = metaTag.GetAttributeValue("content");

                return content != null &&
                       CaseInsensitiveComparer.DefaultInvariant.Compare("blogger", content) == 0;
            }
        /// <summary>
        /// Filters a begin tag: blanks illegal attribute values, tracks entry into illegal
        /// tag trees, and forwards only legal tags that are outside any suspended tree.
        /// </summary>
        /// <param name="tag">The begin tag reported by the parser.</param>
        protected override void OnBeginTag(BeginTag tag)
        {
            // Unterminated tags are dropped entirely when the corresponding flag is set.
            if (FlagIsSet(Flag.RemovePartialTags) && tag.Unterminated)
            {
                return;
            }

            // Illegal attributes keep their slot but have their value cleared.
            foreach (Attr candidate in tag.Attributes)
            {
                if (!IsIllegalAttribute(candidate))
                {
                    continue;
                }

                candidate.Value = string.Empty;
            }

            if (tag.NameEquals("script"))
            {
                Debug.WriteLine("Script tag");
            }

            // A tag matching the illegal-tree pattern deepens the suspension and is not forwarded.
            if (IsRegexMatch(IllegalTagTreeName, tag.Name))
            {
                suspendTagDepth++;
                return;
            }

            if (IsIllegalTag(tag) || suspendTagDepth != 0)
            {
                return;
            }

            PushStartTag(tag.Name);
            base.OnBeginTag(tag);
        }
예제 #5
0
        /// <summary>
        /// Detects a meaningless element such as &lt;p&gt;&lt;/p&gt;, &lt;a href="..."&gt;&lt;/a&gt;
        /// or &lt;a href="..."&gt;&amp;nbsp;&lt;/a&gt;, and advances the parser past the rest of it.
        /// </summary>
        /// <param name="htmlParser">Parser positioned just after <paramref name="bt"/>; it is peeked
        /// and, on a match, advanced with <c>Next()</c> past the whitespace text (if any) and the end tag.</param>
        /// <param name="bt">The begin tag that was just read.</param>
        /// <returns><c>true</c> if the element was meaningless and its remainder was consumed;
        /// otherwise <c>false</c>, with the parser left where it was.</returns>
        private static bool RemoveMeaninglessTags(SimpleHtmlParser htmlParser, BeginTag bt)
        {
            // Case 1: a <p> with no attributes and no residue, immediately closed.
            if (bt.NameEquals("p") && bt.Attributes.Length == 0 && !bt.HasResidue)
            {
                EndTag closingParagraph = htmlParser.Peek(0) as EndTag;
                if (closingParagraph != null && closingParagraph.NameEquals("p"))
                {
                    // Consume the end tag.
                    htmlParser.Next();
                    return true;
                }
            }

            // Case 2: an <a> that has an href but no name/style/id, so it is only useful as a link.
            bool isPlainLink = bt.NameEquals("a") &&
                               bt.GetAttribute("name") == null &&
                               bt.GetAttributeValue("style") == null &&
                               bt.GetAttributeValue("id") == null &&
                               bt.GetAttributeValue("href") != null;
            if (isPlainLink)
            {
                Element upcoming = htmlParser.Peek(0);

                // The link may contain nothing but whitespace (after entity un-escaping).
                bool onlyWhitespaceInside =
                    upcoming is Text &&
                    HtmlUtils.UnEscapeEntities(upcoming.RawText, HtmlUtils.UnEscapeMode.NonMarkupText).Trim().Length == 0;
                if (onlyWhitespaceInside)
                {
                    upcoming = htmlParser.Peek(1);
                }

                EndTag closingAnchor = upcoming as EndTag;
                if (closingAnchor != null && closingAnchor.NameEquals("a"))
                {
                    if (onlyWhitespaceInside)
                    {
                        // Consume the whitespace text first.
                        htmlParser.Next();
                    }

                    // Consume the end tag.
                    htmlParser.Next();
                    return true;
                }
            }

            return false;
        }
예제 #6
0
        /// <summary>
        /// Remembers the body begin tag when it is encountered, then defers to the base class.
        /// </summary>
        /// <param name="tag">The begin tag reported by the parser.</param>
        protected override void OnBeginTag(BeginTag tag)
        {
            bool isBodyTag = tag.NameEquals(HTMLTokens.Body);
            if (isBodyTag)
            {
                bodyBeginTag = tag;
            }

            base.OnBeginTag(tag);
        }
        /// <summary>
        /// Tracks list nesting and flags list items that appear outside any list.
        /// </summary>
        /// <param name="tag">The begin tag reported by the parser.</param>
        protected override void OnBeginTag(BeginTag tag)
        {
            bool opensUnorderedList = tag.NameEquals(HTMLTokens.Ul);
            bool opensOrderedList = !opensUnorderedList && tag.NameEquals(HTMLTokens.Ol);

            if (opensUnorderedList)
            {
                unorderedListLevel++;
            }
            else if (opensOrderedList)
            {
                orderedListLevel++;
            }
            else
            {
                // A list item seen while both nesting counters are below one has no enclosing list.
                bool outsideEveryList = unorderedListLevel < 1 && orderedListLevel < 1;
                if (outsideEveryList && tag.NameEquals(HTMLTokens.Li))
                {
                    hasIncompleteList = true;
                }
            }

            base.OnBeginTag(tag);
        }
예제 #8
0
 /// <summary>
 /// Checks whether the tag's name matches any entry in <c>_permittedBeforeBody</c>.
 /// </summary>
 /// <param name="tag">The begin tag to test.</param>
 /// <returns><c>true</c> if a matching entry exists; otherwise <c>false</c>.</returns>
 private bool TagPermittedAboveBody(BeginTag tag)
 {
     bool permitted = false;

     foreach (string allowedName in _permittedBeforeBody)
     {
         if (tag.NameEquals(allowedName))
         {
             permitted = true;
             break;
         }
     }

     return permitted;
 }
예제 #9
0
        /// <summary>
        /// Trims the tail of an HTML fragment and returns the remaining elements' raw text.
        /// </summary>
        /// <param name="html">The HTML to trim.</param>
        /// <param name="onlyTrimParagraphs">Selects which helper determines the index at which
        /// cleanup begins: <c>FindCleanupIndexForParagraphTrim</c> when <c>true</c>,
        /// <c>FindLastVisibleElementAndRemoveWhitespace</c> otherwise.</param>
        /// <returns>The concatenated <c>RawText</c> of every element that was not removed.</returns>
        public static string Trim(string html, bool onlyTrimParagraphs)
        {
            Element[] elements = Elements(html);

            // Cleanup starts one past the index reported by the selected helper.
            int cleanupStart = onlyTrimParagraphs
                ? 1 + FindCleanupIndexForParagraphTrim(elements)
                : 1 + FindLastVisibleElementAndRemoveWhitespace(elements);

            // Keep removing empty tag pairs until a pass removes nothing; removing one pair
            // (e.g. the <i></i> in <p><i></i></p>) can expose another.
            bool removedPair;
            do
            {
                removedPair = FindAndRemoveEmptyTag(cleanupStart, elements);
            }
            while (removedPair);

            // Drop any <p> begin tags left in the cleanup region.
            for (int index = cleanupStart; index < elements.Length; index++)
            {
                BeginTag beginTag = elements[index] as BeginTag;
                if (beginTag == null || !beginTag.NameEquals("p"))
                {
                    continue;
                }

                elements[index] = null;
            }

            // Stitch the surviving elements back together.
            StringBuilder result = new StringBuilder(html.Length);
            for (int index = 0; index < elements.Length; index++)
            {
                Element survivor = elements[index];
                if (survivor != null)
                {
                    result.Append(survivor.RawText);
                }
            }

            return result.ToString();
        }
예제 #10
0
        /// <summary>
        /// Notes when a non-complete title tag opens, and emits the tag either under its own
        /// name (if listed in <c>TagsToPreserve</c>) or under its mapped name (if listed in
        /// <c>ReplaceTags</c>). Tags in neither collection are not emitted.
        /// </summary>
        /// <param name="tag">The begin tag reported by the parser.</param>
        protected override void OnBeginTag(BeginTag tag)
        {
            if (tag.NameEquals(HTMLTokens.Title) && !tag.Complete)
            {
                _inTitle = true;
            }

            // Both lookups are keyed by the invariant upper-cased tag name.
            string lookupKey = tag.Name.ToUpper(CultureInfo.InvariantCulture);

            if (TagsToPreserve.Contains(lookupKey))
            {
                EmitTagAndAttributes(tag.Name, tag);
                return;
            }

            if (ReplaceTags.ContainsKey(lookupKey))
            {
                string replacementName = (string)ReplaceTags[lookupKey];
                EmitTagAndAttributes(replacementName, tag);
            }
        }
 /// <summary>
 /// Decides whether a begin tag is illegal.
 /// </summary>
 /// <param name="tag">The begin tag to test.</param>
 /// <returns>
 /// <c>true</c> when the tag name matches <c>IllegalTagName</c>, or when style removal is
 /// enabled and the tag is a link whose rel value, upper-cased and trimmed, is "STYLESHEET".
 /// </returns>
 private bool IsIllegalTag(BeginTag tag)
 {
     if (IsRegexMatch(IllegalTagName, tag.Name))
     {
         return true;
     }

     // Only <link> elements need further inspection, and only when styles are being removed.
     if (!FlagIsSet(Flag.RemoveStyles) || !tag.NameEquals("link"))
     {
         return false;
     }

     Attr rel = tag.GetAttribute("rel");
     return rel != null &&
            rel.Value != null &&
            rel.Value.ToUpperInvariant().Trim() == "STYLESHEET";
 }
예제 #12
0
        /// <summary>
        /// Advances the parser to the next form begin tag and hands it to <c>HandleForm</c>.
        /// </summary>
        /// <returns>The result of <c>HandleForm</c> for the next form tag, or <c>null</c> when the
        /// parser runs out of elements first.</returns>
        public HtmlForm NextForm()
        {
            for (Element current = parser.Next(); current != null; current = parser.Next())
            {
                BeginTag candidate = current as BeginTag;
                if (candidate != null && candidate.NameEquals("form"))
                {
                    return HandleForm(candidate);
                }
            }

            return null;
        }
        /// <summary>
        /// Escapes relative URLs found on a begin tag before passing the tag to the base class.
        /// </summary>
        /// <remarks>
        /// Two places are examined: the URL attribute registered for the tag name in
        /// <c>LightWeightHTMLDocument.AllUrlElements</c>, and, for param tags whose name attribute
        /// matches an entry of <c>LightWeightHTMLDocument.ParamsUrlElements</c>, the value attribute.
        /// A param tag whose name attribute has no value is skipped rather than dereferenced.
        /// </remarks>
        /// <param name="tag">The begin tag reported by the parser; may be <c>null</c>.</param>
        protected override void OnBeginTag(BeginTag tag)
        {
            if (tag != null)
            {
                // The URL-element table is keyed by the invariant upper-cased tag name.
                string elementKey = tag.Name.ToUpper(CultureInfo.InvariantCulture);
                if (LightWeightHTMLDocument.AllUrlElements.ContainsKey(elementKey))
                {
                    Attr attr = tag.GetAttribute((string)LightWeightHTMLDocument.AllUrlElements[elementKey]);
                    if (attr != null)
                    {
                        string url = attr.Value;
                        if (!UrlHelper.IsUrl(url) && ShouldEscapeRelativeUrl(url))
                        {
                            attr.Value = UrlHelper.EscapeRelativeURL(BaseUrl, url);
                        }
                    }
                }

                // Special case params
                if (tag.NameEquals(HTMLTokens.Param))
                {
                    // The name attribute does not change while iterating, so read and
                    // upper-case it once. A name attribute without a value cannot match.
                    Attr nameAttr = tag.GetAttribute(HTMLTokens.Name);
                    if (nameAttr != null && nameAttr.Value != null)
                    {
                        string paramName = nameAttr.Value.ToUpper(CultureInfo.InvariantCulture);

                        foreach (string paramValue in LightWeightHTMLDocument.ParamsUrlElements)
                        {
                            if (paramName != paramValue)
                            {
                                continue;
                            }

                            Attr valueAttr = tag.GetAttribute(HTMLTokens.Value);
                            if (valueAttr != null)
                            {
                                string url = valueAttr.Value;
                                if (!UrlHelper.IsUrl(url))
                                {
                                    valueAttr.Value = UrlHelper.EscapeRelativeURL(BaseUrl, url);
                                }
                            }
                        }
                    }
                }
            }

            base.OnBeginTag(tag);
        }
예제 #14
0
        /// <summary>
        /// Consumes parser elements up to the closing select tag, gathering each option's
        /// value, label and selected state, then constructs a <c>Select</c> for the form.
        /// </summary>
        /// <remarks>
        /// If the parser runs out of elements before a closing select tag is seen, no
        /// <c>Select</c> is constructed. The constructed instance is not stored here;
        /// presumably its constructor attaches it to <paramref name="parentForm"/> (confirm).
        /// </remarks>
        /// <param name="parentForm">The form passed to the <c>Select</c> constructor.</param>
        /// <param name="selectTag">The select begin tag that was just read.</param>
        private void HandleSelect(HtmlForm parentForm, BeginTag selectTag)
        {
            string name = selectTag.GetAttributeValue("name");
            int unusedIndex;
            bool multiple = selectTag.GetAttribute("multiple", true, 0, out unusedIndex) != null;

            ArrayList options = new ArrayList();

            Element current = parser.Next();
            while (current != null)
            {
                BeginTag optionTag = current as BeginTag;
                if (optionTag != null && optionTag.NameEquals("option"))
                {
                    string value = optionTag.GetAttributeValue("value");
                    bool isSelected = optionTag.GetAttribute("selected", true, 0, out unusedIndex) != null;

                    // The label is the text element directly after the option tag, if any.
                    string label = string.Empty;
                    current = parser.Next();
                    if (current is Text)
                    {
                        string unescaped = HtmlUtils.UnEscapeEntities(current.ToString(), HtmlUtils.UnEscapeMode.NonMarkupText);
                        label = unescaped.TrimEnd(' ', '\r', '\n', '\t');
                        current = parser.Next();
                    }

                    options.Add(new OptionInfo(value, label, isSelected));

                    // 'current' already holds the next unprocessed element.
                    continue;
                }

                EndTag endTag = current as EndTag;
                if (endTag != null && endTag.NameEquals("select"))
                {
                    OptionInfo[] optionArray = (OptionInfo[])options.ToArray(typeof(OptionInfo));
                    new Select(parentForm, name, multiple, optionArray);
                    return;
                }

                current = parser.Next();
            }
        }
        /// <summary>
        /// Records a begin tag in the per-name tag table, registers a frame URL substitution
        /// when needed, and starts title/anchor collection, before deferring to the base class.
        /// </summary>
        /// <param name="tag">The begin tag reported by the parser; a <c>null</c> tag is passed
        /// straight to the base class.</param>
        protected override void OnBeginTag(BeginTag tag)
        {
            if (tag == null)
            {
                base.OnBeginTag(tag);
                return;
            }

            // Reset any frame urls.
            // The HTML in this document may carry incorrect urls for frames, while the frames
            // enumeration is accurate. When a frame document with the same name exists and its
            // url differs from the src attribute, register a substitution for the src url.
            if (tag.NameEquals(HTMLTokens.Frame))
            {
                Attr name = tag.GetAttribute(HTMLTokens.Name);
                if (name != null && this._frames != null)
                {
                    LightWeightHTMLDocument frameDoc = GetFrameDocumentByName(name.Value);
                    if (frameDoc != null)
                    {
                        Attr src = tag.GetAttribute(HTMLTokens.Src);
                        if (src != null && src.Value != frameDoc.Url)
                        {
                            Generator.AddSubstitionUrl(new UrlToReplace(src.Value, frameDoc.Url));
                        }
                    }
                }
            }

            LightWeightTag currentTag = new LightWeightTag(tag);

            // The tag table is keyed by the invariant upper-cased tag name.
            string key = tag.Name.ToUpper(CultureInfo.InvariantCulture);
            if (!_tagTable.ContainsKey(key))
            {
                _tagTable[key] = new LightWeightTag[0];
            }

            // Append the new tag by copying the existing array into one that is a slot larger.
            LightWeightTag[] currentTags = (LightWeightTag[])_tagTable[key];
            LightWeightTag[] grownTags = new LightWeightTag[currentTags.Length + 1];
            currentTags.CopyTo(grownTags, 0);
            grownTags[currentTags.Length] = currentTag;
            _tagTable[key] = grownTags;

            // Accumulate the title text
            if (tag.NameEquals(HTMLTokens.Title) && !tag.Complete)
            {
                _nextTextIsTitleText = true;
            }
            else if (tag.NameEquals(HTMLTokens.A) && !tag.Complete && tag.GetAttribute(HTMLTokens.Href) != null)
            {
                // This branch is only reached for anchor tags, so no further name check is needed.
                if (_collectingForTag != null)
                {
                    // Already collecting for an earlier anchor: count the nested one.
                    _collectingForTagDepth++;
                }
                else
                {
                    _collectingForTag = currentTag;
                }
            }

            base.OnBeginTag(tag);
        }
예제 #16
0
        /// <summary>
        /// Emits a begin tag while synthesizing missing html/head/body structure, dropping
        /// script tags, adjusting base and meta tags, and rewriting attribute values.
        /// </summary>
        /// <remarks>
        /// The order of the steps below matters: structural tags are emitted before the
        /// current tag, and several branches return early without emitting the tag at all.
        /// </remarks>
        /// <param name="tag">The begin tag reported by the parser; ignored when <c>null</c>.</param>
        protected override void OnBeginTag(BeginTag tag)
        {
            if (tag == null)
            {
                return;
            }

            // If the very first tag seen is not the html tag, emit one before anything else.
            if (_firstTag)
            {
                if (!tag.NameEquals(HTMLTokens.Html))
                {
                    EmitTag(HTMLTokens.Html);
                }
                _firstTag = false;
            }

            // No head seen yet and this tag is not permitted above the body:
            // emit a complete head section (with the additional metadata) first.
            if (!_seenHead && !TagPermittedAboveBody(tag))
            {
                Emit("<head>");
                EmitAdditionalMetaData();
                Emit("</head>");
                _seenHead = true;
            }

            // Script tags are never emitted by this method; a script tag that is not
            // Complete increases the script depth counter.
            if (tag.NameEquals(HTMLTokens.Script))
            {
                if (!tag.Complete)
                {
                    _scriptDepth++;
                }
                return;
            }

            // Track head/body: a head tag marks the head as seen; before the body has been
            // seen, a non-body tag that is not permitted above the body forces a body tag
            // to be emitted, and an actual body tag simply marks the body as seen.
            if (tag.NameEquals(HTMLTokens.Head))
            {
                _seenHead = true;
            }
            else if (!_seenBody && !tag.NameEquals(HTMLTokens.Body))
            {
                if (!TagPermittedAboveBody(tag))
                {
                    EmitTag(HTMLTokens.Body);
                    _seenBody = true;
                }
            }
            else if (!_seenBody && tag.NameEquals(HTMLTokens.Body))
            {
                _seenBody = true;
            }

            // Base tag: dropped (early return) when no metadata base is available; otherwise
            // its href, if present, is overwritten with the metadata base and the base tag is
            // recorded in _emittedMetaData.
            if (tag.NameEquals(HTMLTokens.Base))
            {
                if (_metaData == null || _metaData.Base == null)
                {
                    return;
                }
                else
                {
                    Attr href = tag.GetAttribute(HTMLTokens.Href);
                    if (href != null)
                    {
                        href.Value = _metaData.Base;
                    }
                }
                _emittedMetaData.Add(HTMLTokens.Base);
            }

            if (tag.NameEquals(HTMLTokens.Meta))
            {
                ModifyMetaDataAsNecessary(tag);
            }

            // Script attributes are removed; every other attribute value goes through ReplaceValue.
            // NOTE(review): attributes are removed while tag.Attributes is being enumerated, and
            // entries may be null (hence the check) - confirm how BeginTag stores attributes.
            foreach (Attr attr in tag.Attributes)
            {
                if (attr != null)
                {
                    if (IsScriptAttribute(attr))
                    {
                        tag.RemoveAttribute(attr.Name);
                    }
                    else
                    {
                        attr.Value = ReplaceValue(attr.Value);
                    }
                }
            }

            // Emit the (possibly modified) tag, then let the base class process it.
            Emit(tag.ToString());
            base.OnBeginTag(tag);
        }