/// <summary>
/// Rewrites the URL carried by anchor (href) and image (src) begin tags through
/// <c>ConvertUrl</c>; every other element is delegated to the base implementation.
/// </summary>
/// <param name="el">The element currently being processed.</param>
/// <returns>
/// The re-serialized tag when a URL attribute with a non-null value was converted;
/// otherwise whatever the base implementation returns.
/// </returns>
protected override string Replace(Element el)
{
    BeginTag candidate = el as BeginTag;
    if (candidate != null)
    {
        // Pick the attribute that holds the URL for this kind of tag.
        Attr urlAttr = null;
        if (candidate.NameEquals("a"))
        {
            urlAttr = candidate.GetAttribute("href");
        }
        else if (candidate.NameEquals("img"))
        {
            urlAttr = candidate.GetAttribute("src");
        }

        // Only tags that actually carry a value are rewritten.
        if (urlAttr != null && urlAttr.Value != null)
        {
            urlAttr.Value = ConvertUrl(urlAttr.Value);
            return candidate.ToString();
        }
    }

    return base.Replace(el);
}
/// <summary>
/// Walks the HTML and copies it to the output, wrapping object/embed/noembed/script
/// elements in a marker span and recording their original markup in <c>preserved</c>.
/// Smart-content divs are copied through together with their collected contents.
/// </summary>
/// <param name="html">The HTML to scan. Must not be null (its length is read immediately).</param>
/// <returns>The rewritten HTML.</returns>
public string ScanAndPreserve(string html)
{
    StringBuilder output = new StringBuilder(html.Length);
    SimpleHtmlParser parser = new SimpleHtmlParser(html);

    for (Element current = parser.Next(); current != null; current = parser.Next())
    {
        BeginTag begin = current as BeginTag;
        if (begin == null)
        {
            // Anything that is not a begin tag is copied verbatim from the input.
            output.Append(html, current.Offset, current.Length);
            continue;
        }

        if (begin.NameEquals("div"))
        {
            string cssClass = begin.GetAttributeValue("class");
            if (cssClass == ContentSourceManager.EDITABLE_SMART_CONTENT
                || cssClass == ContentSourceManager.SMART_CONTENT)
            {
                // Smart content: emit the tag, everything collected up to "div", and a closing tag.
                output.Append(html, current.Offset, current.Length);
                output.Append(parser.CollectHtmlUntil("div"));
                output.Append("</div>");
                continue;
            }
        }

        bool mustPreserve = begin.NameEquals("object")
            || begin.NameEquals("embed")
            || begin.NameEquals("noembed")
            || begin.NameEquals("script");

        if (!mustPreserve)
        {
            output.Append(html, current.Offset, current.Length);
            continue;
        }

        // Capture the whole element, remember it under a fresh id, and wrap it in a marker span.
        string inner = parser.CollectHtmlUntil(begin.Name);
        string original = begin.RawText + inner + "</" + begin.Name + ">";
        string key = Guid.NewGuid().ToString("N");
        preserved[key] = original;

        output.AppendFormat("<span id=\"preserve{0}\" class=\"{1}\">", key, PRESERVE_CLASS);
        output.Append(original);
        output.Append("</span>");
    }

    return output.ToString();
}
/// <summary>
/// Tests whether the element is a <c>meta</c> begin tag whose <c>name</c> attribute is
/// exactly "generator" and whose <c>content</c> attribute compares equal to "blogger"
/// under the invariant case-insensitive comparer.
/// </summary>
/// <param name="e">The element to test.</param>
/// <returns><c>true</c> when all of the above hold; otherwise <c>false</c>.</returns>
public bool IsMatch(Element e)
{
    BeginTag meta = e as BeginTag;
    if (meta == null || !meta.NameEquals("meta") || meta.GetAttributeValue("name") != "generator")
    {
        return false;
    }

    string content = meta.GetAttributeValue("content");
    return content != null
        && CaseInsensitiveComparer.DefaultInvariant.Compare("blogger", content) == 0;
}
/// <summary>
/// Filters a begin tag: blanks out illegal attribute values, tracks entry into illegal
/// tag trees, and forwards the tag to the base class only when it is legal and no
/// illegal tree is currently open.
/// </summary>
/// <param name="tag">The begin tag being visited.</param>
protected override void OnBeginTag(BeginTag tag)
{
    // Unterminated tags are dropped entirely when the RemovePartialTags flag is set.
    bool dropPartialTag = FlagIsSet(Flag.RemovePartialTags) && tag.Unterminated;
    if (dropPartialTag)
    {
        return;
    }

    // Illegal attributes are kept on the tag but their values are emptied.
    foreach (Attr attribute in tag.Attributes)
    {
        if (IsIllegalAttribute(attribute))
        {
            attribute.Value = string.Empty;
        }
    }

    if (tag.NameEquals("script"))
    {
        Debug.WriteLine("Script tag");
    }

    // Entering an illegal tag tree: bump the suspend depth and emit nothing.
    if (IsRegexMatch(IllegalTagTreeName, tag.Name))
    {
        suspendTagDepth++;
        return;
    }

    if (IsIllegalTag(tag) || suspendTagDepth != 0)
    {
        return;
    }

    PushStartTag(tag.Name);
    base.OnBeginTag(tag);
}
/// <summary>
/// Detects an empty element that carries no content, such as &lt;p&gt;&lt;/p&gt;,
/// &lt;a href="..."&gt;&lt;/a&gt; or &lt;a href="..."&gt; &lt;/a&gt;, and consumes the rest
/// of it (optional whitespace text plus the end tag) from the parser.
/// </summary>
/// <param name="htmlParser">Parser positioned just after <paramref name="bt"/>.</param>
/// <param name="bt">The begin tag that was just read.</param>
/// <returns>
/// <c>true</c> when the element was meaningless and its remainder has been consumed;
/// <c>false</c> when the parser was left untouched.
/// </returns>
private static bool RemoveMeaninglessTags(SimpleHtmlParser htmlParser, BeginTag bt)
{
    // Case 1: a <p> with no attributes and no residue, immediately followed by </p>.
    if (bt.NameEquals("p") && bt.Attributes.Length == 0 && !bt.HasResidue)
    {
        EndTag closingParagraph = htmlParser.Peek(0) as EndTag;
        if (closingParagraph != null && closingParagraph.NameEquals("p"))
        {
            // Consume the end tag.
            htmlParser.Next();
            return true;
        }
    }

    // Case 2: an <a> that has an href but no name/style/id, so it cannot serve as an anchor target.
    bool isPlainLink = bt.NameEquals("a")
        && bt.GetAttribute("name") == null
        && bt.GetAttributeValue("style") == null
        && bt.GetAttributeValue("id") == null
        && bt.GetAttributeValue("href") != null;

    if (isPlainLink)
    {
        Element following = htmlParser.Peek(0);
        bool skippedWhitespace = false;

        // A text node that is only whitespace after unescaping is treated as empty; look past it.
        if (following is Text
            && HtmlUtils.UnEscapeEntities(following.RawText, HtmlUtils.UnEscapeMode.NonMarkupText).Trim().Length == 0)
        {
            following = htmlParser.Peek(1);
            skippedWhitespace = true;
        }

        EndTag closingAnchor = following as EndTag;
        if (closingAnchor != null && closingAnchor.NameEquals("a"))
        {
            if (skippedWhitespace)
            {
                // Consume the whitespace text first.
                htmlParser.Next();
            }

            // Consume the end tag.
            htmlParser.Next();
            return true;
        }
    }

    return false;
}
/// <summary>
/// Remembers the body begin tag in <c>bodyBeginTag</c> when it is encountered, then
/// forwards every tag to the base class.
/// </summary>
/// <param name="tag">The begin tag being visited.</param>
protected override void OnBeginTag(BeginTag tag)
{
    bool isBodyTag = tag.NameEquals(HTMLTokens.Body);
    if (isBodyTag)
    {
        bodyBeginTag = tag;
    }

    base.OnBeginTag(tag);
}
/// <summary>
/// Tracks list nesting: increments the unordered/ordered list level for list begin tags,
/// and sets <c>hasIncompleteList</c> when a list item begins while both levels are below 1.
/// Every tag is then forwarded to the base class.
/// </summary>
/// <param name="tag">The begin tag being visited.</param>
protected override void OnBeginTag(BeginTag tag)
{
    if (tag.NameEquals(HTMLTokens.Ul))
    {
        unorderedListLevel++;
    }
    else if (tag.NameEquals(HTMLTokens.Ol))
    {
        orderedListLevel++;
    }
    else
    {
        // A list item seen outside of any list means the fragment's list is incomplete.
        bool outsideAnyList = unorderedListLevel < 1 && orderedListLevel < 1;
        if (outsideAnyList && tag.NameEquals(HTMLTokens.Li))
        {
            hasIncompleteList = true;
        }
    }

    base.OnBeginTag(tag);
}
/// <summary>
/// Checks whether the tag's name matches any entry of <c>_permittedBeforeBody</c>.
/// </summary>
/// <param name="tag">The begin tag to check.</param>
/// <returns><c>true</c> when a matching entry exists; otherwise <c>false</c>.</returns>
private bool TagPermittedAboveBody(BeginTag tag)
{
    bool permitted = false;
    foreach (string allowedName in _permittedBeforeBody)
    {
        if (tag.NameEquals(allowedName))
        {
            permitted = true;
            break;
        }
    }

    return permitted;
}
/// <summary>
/// Trims trailing insignificant markup from an HTML fragment and returns what is left.
/// </summary>
/// <param name="html">The HTML to trim.</param>
/// <param name="onlyTrimParagraphs">
/// Selects the helper that locates the cleanup start:
/// <c>FindCleanupIndexForParagraphTrim</c> when <c>true</c>, otherwise
/// <c>FindLastVisibleElementAndRemoveWhitespace</c>.
/// </param>
/// <returns>The concatenated raw text of every element that was not removed.</returns>
public static string Trim(string html, bool onlyTrimParagraphs)
{
    Element[] elements = Elements(html);

    // Cleanup starts one past the index reported by the selected helper.
    int cleanupStart = 1 + (onlyTrimParagraphs
        ? FindCleanupIndexForParagraphTrim(elements)
        : FindLastVisibleElementAndRemoveWhitespace(elements));

    // Removing one empty tag pair can expose another (e.g. <p><i></i></p>),
    // so keep going until a pass removes nothing.
    bool removedPair;
    do
    {
        removedPair = FindAndRemoveEmptyTag(cleanupStart, elements);
    }
    while (removedPair);

    // Drop any <p> begin tags left in the cleanup region.
    for (int index = cleanupStart; index < elements.Length; index++)
    {
        BeginTag begin = elements[index] as BeginTag;
        if (begin != null && begin.NameEquals("p"))
        {
            elements[index] = null;
        }
    }

    // Stitch the surviving elements back together.
    StringBuilder result = new StringBuilder(html.Length);
    for (int index = 0; index < elements.Length; index++)
    {
        Element survivor = elements[index];
        if (survivor != null)
        {
            result.Append(survivor.RawText);
        }
    }

    return result.ToString();
}
/// <summary>
/// Handles a begin tag: sets <c>_inTitle</c> when a non-complete title tag opens, then
/// emits the tag under its own name if it is in <c>TagsToPreserve</c>, or under the
/// mapped name if it has an entry in <c>ReplaceTags</c>. Other tags are not emitted.
/// </summary>
/// <param name="tag">The begin tag being visited.</param>
protected override void OnBeginTag(BeginTag tag)
{
    if (tag.NameEquals(HTMLTokens.Title) && !tag.Complete)
    {
        _inTitle = true;
    }

    // Both lookup tables are keyed by the invariant upper-cased tag name.
    string lookupKey = tag.Name.ToUpper(CultureInfo.InvariantCulture);

    if (TagsToPreserve.Contains(lookupKey))
    {
        EmitTagAndAttributes(tag.Name, tag);
    }
    else if (ReplaceTags.ContainsKey(lookupKey))
    {
        string replacementName = (string)ReplaceTags[lookupKey];
        EmitTagAndAttributes(replacementName, tag);
    }
}
/// <summary>
/// Decides whether a tag is illegal: either its name matches <c>IllegalTagName</c>, or
/// the RemoveStyles flag is set and the tag is a <c>link</c> whose <c>rel</c> attribute
/// is "stylesheet" (compared after upper-casing and trimming).
/// </summary>
/// <param name="tag">The begin tag to examine.</param>
/// <returns><c>true</c> when the tag is illegal; otherwise <c>false</c>.</returns>
private bool IsIllegalTag(BeginTag tag)
{
    if (IsRegexMatch(IllegalTagName, tag.Name))
    {
        return true;
    }

    // Stylesheet links only matter when styles are being removed.
    if (!FlagIsSet(Flag.RemoveStyles) || !tag.NameEquals("link"))
    {
        return false;
    }

    Attr rel = tag.GetAttribute("rel");
    return rel != null
        && rel.Value != null
        && rel.Value.ToUpperInvariant().Trim() == "STYLESHEET";
}
/// <summary>
/// Advances the parser until the next <c>form</c> begin tag and hands it to
/// <c>HandleForm</c>.
/// </summary>
/// <returns>
/// The result of <c>HandleForm</c> for the next form tag, or <c>null</c> when the
/// parser runs out of elements first.
/// </returns>
public HtmlForm NextForm()
{
    for (Element current = parser.Next(); current != null; current = parser.Next())
    {
        BeginTag begin = current as BeginTag;
        if (begin != null && begin.NameEquals("form"))
        {
            return HandleForm(begin);
        }
    }

    return null;
}
/// <summary>
/// Escapes relative URLs found on a begin tag against <c>BaseUrl</c>, then forwards the
/// tag to the base class.
/// </summary>
/// <remarks>
/// Two sources of URLs are handled:
/// the attribute that <c>LightWeightHTMLDocument.AllUrlElements</c> maps to the tag's
/// upper-cased name, and, for <c>param</c> tags, the <c>value</c> attribute when the
/// upper-cased <c>name</c> attribute equals an entry of
/// <c>LightWeightHTMLDocument.ParamsUrlElements</c>.
/// </remarks>
/// <param name="tag">The begin tag being visited; may be null, in which case it is only forwarded.</param>
protected override void OnBeginTag(BeginTag tag)
{
    if (tag != null)
    {
        string upperTagName = tag.Name.ToUpper(CultureInfo.InvariantCulture);
        if (LightWeightHTMLDocument.AllUrlElements.ContainsKey(upperTagName))
        {
            Attr attr = tag.GetAttribute((string)LightWeightHTMLDocument.AllUrlElements[upperTagName]);
            if (attr != null)
            {
                string url = attr.Value;
                if (!UrlHelper.IsUrl(url) && ShouldEscapeRelativeUrl(url))
                {
                    attr.Value = UrlHelper.EscapeRelativeURL(BaseUrl, url);
                }
            }
        }

        // Special case params
        if (tag.NameEquals(HTMLTokens.Param))
        {
            // The name attribute does not change while iterating, so look it up once.
            // A name attribute without a value (Value == null) cannot match any entry;
            // skipping it avoids dereferencing null.
            Attr nameAttr = tag.GetAttribute(HTMLTokens.Name);
            if (nameAttr != null && nameAttr.Value != null)
            {
                string upperParamName = nameAttr.Value.ToUpper(CultureInfo.InvariantCulture);
                foreach (string paramValue in LightWeightHTMLDocument.ParamsUrlElements)
                {
                    if (upperParamName == paramValue)
                    {
                        Attr valueAttr = tag.GetAttribute(HTMLTokens.Value);
                        if (valueAttr != null)
                        {
                            string url = valueAttr.Value;
                            if (!UrlHelper.IsUrl(url))
                            {
                                valueAttr.Value = UrlHelper.EscapeRelativeURL(BaseUrl, url);
                            }
                        }
                    }
                }
            }
        }
    }

    base.OnBeginTag(tag);
}
/// <summary>
/// Consumes the contents of a <c>select</c> element from the parser, collecting one
/// <c>OptionInfo</c> per <c>option</c> begin tag, and constructs a <c>Select</c> for the
/// parent form when the matching end tag is reached.
/// </summary>
/// <remarks>
/// If the parser runs out of elements before a <c>select</c> end tag is seen, the method
/// returns without constructing a <c>Select</c>.
/// </remarks>
/// <param name="parentForm">The form passed to the <c>Select</c> constructor.</param>
/// <param name="selectTag">The <c>select</c> begin tag that was just read.</param>
private void HandleSelect(HtmlForm parentForm, BeginTag selectTag)
{
    string name = selectTag.GetAttributeValue("name");
    int dummy;
    bool multiple = selectTag.GetAttribute("multiple", true, 0, out dummy) != null;

    ArrayList options = new ArrayList();
    Element current = parser.Next();

    while (current != null)
    {
        BeginTag optionTag = current as BeginTag;
        if (optionTag != null && optionTag.NameEquals("option"))
        {
            string optionValue = optionTag.GetAttributeValue("value");
            bool optionSelected = optionTag.GetAttribute("selected", true, 0, out dummy) != null;
            string optionLabel = string.Empty;

            // The label, if any, is the text element directly following the option tag.
            current = parser.Next();
            if (current is Text)
            {
                optionLabel = HtmlUtils
                    .UnEscapeEntities(current.ToString(), HtmlUtils.UnEscapeMode.NonMarkupText)
                    .TrimEnd(' ', '\r', '\n', '\t');
                current = parser.Next();
            }

            options.Add(new OptionInfo(optionValue, optionLabel, optionSelected));

            // 'current' already points at the next unprocessed element; loop again without advancing.
        }
        else
        {
            EndTag closing = current as EndTag;
            if (closing != null && closing.NameEquals("select"))
            {
                // The constructed Select is not stored here; presumably it registers itself
                // with parentForm - TODO confirm.
                new Select(parentForm, name, multiple, (OptionInfo[])options.ToArray(typeof(OptionInfo)));
                return;
            }

            current = parser.Next();
        }
    }
}
/// <summary>
/// Records every begin tag in <c>_tagTable</c> (keyed by upper-cased tag name), registers
/// URL substitutions for frames whose src differs from the known frame document, and
/// starts title / anchor text collection where appropriate. The tag is always forwarded
/// to the base class.
/// </summary>
/// <param name="tag">The begin tag being visited; may be null, in which case it is only forwarded.</param>
protected override void OnBeginTag(BeginTag tag)
{
    if (tag == null)
    {
        base.OnBeginTag(tag);
        return;
    }

    // Frame URLs in the markup may be stale. When a frame document with the same name is
    // known, register a substitution from the markup's src to that document's URL.
    if (tag.NameEquals(HTMLTokens.Frame))
    {
        Attr frameName = tag.GetAttribute(HTMLTokens.Name);
        LightWeightHTMLDocument knownFrame = (frameName != null && this._frames != null)
            ? GetFrameDocumentByName(frameName.Value)
            : null;

        if (knownFrame != null)
        {
            Attr frameSrc = tag.GetAttribute(HTMLTokens.Src);
            if (frameSrc != null && frameSrc.Value != knownFrame.Url)
            {
                Generator.AddSubstitionUrl(new UrlToReplace(frameSrc.Value, knownFrame.Url));
            }
        }
    }

    LightWeightTag currentTag = new LightWeightTag(tag);

    // Append the tag to the array stored under its upper-cased name, growing it by one.
    string tableKey = tag.Name.ToUpper(CultureInfo.InvariantCulture);
    LightWeightTag[] previous = _tagTable.ContainsKey(tableKey)
        ? (LightWeightTag[])_tagTable[tableKey]
        : new LightWeightTag[0];

    LightWeightTag[] extended = new LightWeightTag[previous.Length + 1];
    previous.CopyTo(extended, 0);
    extended[previous.Length] = currentTag;
    _tagTable[tableKey] = extended;

    // Accumulate the title text, or begin / deepen anchor collection.
    if (tag.NameEquals(HTMLTokens.Title) && !tag.Complete)
    {
        _nextTextIsTitleText = true;
    }
    else if (tag.NameEquals(HTMLTokens.A) && !tag.Complete && tag.GetAttribute(HTMLTokens.Href) != null)
    {
        if (_collectingForTag == null)
        {
            _collectingForTag = currentTag;
        }
        else if (tag.NameEquals(HTMLTokens.A))
        {
            // Already collecting for an outer anchor: count the nesting.
            _collectingForTagDepth++;
        }
    }

    base.OnBeginTag(tag);
}
/// <summary>
/// Emits a begin tag while enforcing document structure: makes sure an html tag, a head
/// section (with additional metadata) and a body tag are emitted when the source omits
/// them, skips script tags, rewrites the base href from metadata, and strips script
/// attributes / rewrites attribute values before emitting.
/// </summary>
/// <param name="tag">The begin tag being visited; null is ignored.</param>
protected override void OnBeginTag(BeginTag tag)
{
    if (tag == null)
    {
        return;
    }

    // The very first tag must be <html>; emit one if the source starts with anything else.
    if (_firstTag)
    {
        bool startsWithHtml = tag.NameEquals(HTMLTokens.Html);
        if (!startsWithHtml)
        {
            EmitTag(HTMLTokens.Html);
        }
        _firstTag = false;
    }

    // No head seen yet and this tag is not permitted above the body: emit a head section now.
    if (!_seenHead && !TagPermittedAboveBody(tag))
    {
        Emit("<head>");
        EmitAdditionalMetaData();
        Emit("</head>");
        _seenHead = true;
    }

    // Script tags are never emitted; only their nesting depth is tracked.
    if (tag.NameEquals(HTMLTokens.Script))
    {
        if (!tag.Complete)
        {
            _scriptDepth++;
        }
        return;
    }

    if (tag.NameEquals(HTMLTokens.Head))
    {
        _seenHead = true;
    }
    else if (!_seenBody)
    {
        if (tag.NameEquals(HTMLTokens.Body))
        {
            _seenBody = true;
        }
        else if (!TagPermittedAboveBody(tag))
        {
            // Body-level content appeared before any <body>: emit one.
            EmitTag(HTMLTokens.Body);
            _seenBody = true;
        }
    }

    if (tag.NameEquals(HTMLTokens.Base))
    {
        // Without a replacement base in the metadata, the base tag is dropped entirely.
        bool haveReplacementBase = _metaData != null && _metaData.Base != null;
        if (!haveReplacementBase)
        {
            return;
        }

        Attr baseHref = tag.GetAttribute(HTMLTokens.Href);
        if (baseHref != null)
        {
            baseHref.Value = _metaData.Base;
        }

        _emittedMetaData.Add(HTMLTokens.Base);
    }

    if (tag.NameEquals(HTMLTokens.Meta))
    {
        ModifyMetaDataAsNecessary(tag);
    }

    // Remove script attributes; run every other attribute value through ReplaceValue.
    foreach (Attr attribute in tag.Attributes)
    {
        if (attribute == null)
        {
            continue;
        }

        if (IsScriptAttribute(attribute))
        {
            tag.RemoveAttribute(attribute.Name);
        }
        else
        {
            attribute.Value = ReplaceValue(attribute.Value);
        }
    }

    Emit(tag.ToString());
    base.OnBeginTag(tag);
}