/// <summary>
/// Tests whether an element is a begin tag that satisfies this matcher's
/// optional tag name and every one of its required attributes.
/// </summary>
/// <param name="e">The element to test.</param>
/// <returns>
/// true when <paramref name="e"/> is a BeginTag, its name equals tagName (when tagName is set),
/// and each required attribute is present with the required value (when a value is specified);
/// otherwise false.
/// </returns>
public bool IsMatch(Element e)
{
    BeginTag candidate = e as BeginTag;
    if (candidate == null)
    {
        return false;
    }

    // A null tagName places no constraint on the tag's name.
    bool nameMismatch = tagName != null && !candidate.NameEquals(tagName);
    if (nameMismatch)
    {
        return false;
    }

    foreach (RequiredAttribute required in attrs)
    {
        int index;
        Attr actual = candidate.GetAttribute(required.Name, true, 0, out index);

        // A null required value means only the attribute's presence is checked.
        bool satisfied = actual != null
            && (required.Value == null || required.Value == actual.Value);
        if (!satisfied)
        {
            return false;
        }
    }

    return true;
}
/// <summary>
/// Records the tag in bodyBeginTag when it is a body tag, then forwards
/// the tag to the base implementation.
/// </summary>
/// <param name="tag">The begin tag being processed.</param>
protected override void OnBeginTag(BeginTag tag)
{
    bool isBody = tag.NameEquals(HTMLTokens.Body);
    if (isBody)
    {
        bodyBeginTag = tag;
    }

    base.OnBeginTag(tag);
}
/// <summary>
/// Tracks list nesting: increments the unordered/ordered list level for ul/ol tags and
/// sets hasIncompleteList when an li tag is seen while both levels are below 1.
/// The tag is always forwarded to the base implementation.
/// </summary>
/// <param name="tag">The begin tag being processed.</param>
protected override void OnBeginTag(BeginTag tag)
{
    if (tag.NameEquals(HTMLTokens.Ul))
    {
        unorderedListLevel++;
    }
    else if (tag.NameEquals(HTMLTokens.Ol))
    {
        orderedListLevel++;
    }
    else
    {
        // An li seen while no list level has been entered marks the list as incomplete.
        bool outsideAnyList = unorderedListLevel < 1 && orderedListLevel < 1;
        if (outsideAnyList && tag.NameEquals(HTMLTokens.Li))
        {
            hasIncompleteList = true;
        }
    }

    base.OnBeginTag(tag);
}
/// <summary>
/// Sets _inTitle when an incomplete title tag begins, then emits the tag either under its
/// own name (when listed in TagsToPreserve) or under its mapped name from ReplaceTags.
/// Tags found in neither collection are not emitted.
/// </summary>
/// <param name="tag">The begin tag being processed.</param>
protected override void OnBeginTag(BeginTag tag)
{
    if (tag.NameEquals(HTMLTokens.Title) && !tag.Complete)
    {
        _inTitle = true;
    }

    // Both lookups are keyed by the invariant upper-cased tag name.
    string lookupKey = tag.Name.ToUpper(CultureInfo.InvariantCulture);

    if (TagsToPreserve.Contains(lookupKey))
    {
        EmitTagAndAttributes(tag.Name, tag);
        return;
    }

    if (ReplaceTags.ContainsKey(lookupKey))
    {
        string replacementName = (string)ReplaceTags[lookupKey];
        EmitTagAndAttributes(replacementName, tag);
    }
}
/// <summary>
/// Rewrites relative URLs carried by the tag so they are escaped against BaseUrl, then
/// forwards the tag to the base implementation.
/// </summary>
/// <remarks>
/// Two cases are handled:
/// 1. The tag's upper-cased name is a key of LightWeightHTMLDocument.AllUrlElements; the mapped
///    attribute is rewritten when its value is not a URL and ShouldEscapeRelativeUrl approves it.
/// 2. The tag is a param tag whose name attribute (upper-cased) equals one of
///    LightWeightHTMLDocument.ParamsUrlElements; its value attribute is rewritten when it is not a URL.
/// A null tag is passed straight to the base implementation.
/// </remarks>
/// <param name="tag">The begin tag being processed; may be null.</param>
protected override void OnBeginTag(BeginTag tag)
{
    if (tag != null)
    {
        string upperTagName = tag.Name.ToUpper(CultureInfo.InvariantCulture);
        if (LightWeightHTMLDocument.AllUrlElements.ContainsKey(upperTagName))
        {
            Attr urlAttr = tag.GetAttribute((string)LightWeightHTMLDocument.AllUrlElements[upperTagName]);
            if (urlAttr != null)
            {
                string url = urlAttr.Value;
                if (!UrlHelper.IsUrl(url) && ShouldEscapeRelativeUrl(url))
                {
                    urlAttr.Value = UrlHelper.EscapeRelativeURL(BaseUrl, url);
                }
            }
        }

        // Special case params
        if (tag.NameEquals(HTMLTokens.Param))
        {
            // The name attribute does not change while scanning the known parameter names,
            // so look it up (and upper-case it) once instead of once per candidate.
            Attr nameAttr = tag.GetAttribute(HTMLTokens.Name);

            // A valueless name attribute has a null Value; skip it rather than throw.
            if (nameAttr != null && nameAttr.Value != null)
            {
                string upperParamName = nameAttr.Value.ToUpper(CultureInfo.InvariantCulture);
                foreach (string paramValue in LightWeightHTMLDocument.ParamsUrlElements)
                {
                    if (upperParamName != paramValue)
                    {
                        continue;
                    }

                    Attr valueAttr = tag.GetAttribute(HTMLTokens.Value);
                    if (valueAttr != null)
                    {
                        string url = valueAttr.Value;
                        if (!UrlHelper.IsUrl(url))
                        {
                            valueAttr.Value = UrlHelper.EscapeRelativeURL(BaseUrl, url);
                        }
                    }
                }
            }
        }
    }

    base.OnBeginTag(tag);
}
/// <summary>
/// Detects a begin tag that opens a meaningless element, such as &lt;p&gt;&lt;/p&gt;,
/// &lt;a href="..."&gt;&lt;/a&gt; or &lt;a href="..."&gt; &lt;/a&gt;, and consumes the rest of
/// that element (optional whitespace text and the end tag) from the parser.
/// </summary>
/// <param name="htmlParser">The parser positioned just after <paramref name="bt"/>.</param>
/// <param name="bt">The begin tag to examine.</param>
/// <returns>
/// true when the element was meaningless and its remaining elements were consumed;
/// false when nothing was consumed.
/// </returns>
private static bool RemoveMeaninglessTags(SimpleHtmlParser htmlParser, BeginTag bt)
{
    // Case 1: a <p> with no attributes and no residue, immediately followed by </p>.
    bool isBareParagraph = bt.NameEquals("p") && bt.Attributes.Length == 0 && !bt.HasResidue;
    if (isBareParagraph)
    {
        EndTag paragraphEnd = htmlParser.Peek(0) as EndTag;
        if (paragraphEnd != null && paragraphEnd.NameEquals("p"))
        {
            // Consume the end tag.
            htmlParser.Next();
            return true;
        }
    }

    // Case 2: an <a> that has an href but no name/style/id, so an empty link is of no use.
    bool isPlainLink = bt.NameEquals("a")
        && bt.GetAttribute("name") == null
        && bt.GetAttributeValue("style") == null
        && bt.GetAttributeValue("id") == null
        && bt.GetAttributeValue("href") != null;
    if (!isPlainLink)
    {
        return false;
    }

    Element upcoming = htmlParser.Peek(0);
    bool skippedWhitespace = false;

    // A text element that is only whitespace after unescaping is ignored; look one element further.
    if (upcoming is Text && HtmlUtils.UnEscapeEntities(upcoming.RawText, HtmlUtils.UnEscapeMode.NonMarkupText).Trim().Length == 0)
    {
        upcoming = htmlParser.Peek(1);
        skippedWhitespace = true;
    }

    EndTag linkEnd = upcoming as EndTag;
    if (linkEnd == null || !linkEnd.NameEquals("a"))
    {
        return false;
    }

    if (skippedWhitespace)
    {
        // Consume the whitespace text first.
        htmlParser.Next();
    }

    // Consume the end tag.
    htmlParser.Next();
    return true;
}
/// <summary>
/// Records every begin tag in _tagTable (keyed by upper-cased tag name), registers a URL
/// substitution for frame tags whose src differs from the matching frame document's URL,
/// and updates the title / anchor collection state. The tag is always forwarded to the base
/// implementation, including when it is null.
/// </summary>
/// <param name="tag">The begin tag being processed; may be null.</param>
protected override void OnBeginTag(BeginTag tag)
{
    if (tag == null)
    {
        base.OnBeginTag(tag);
        return;
    }

    // The HTML in this document often carries incorrect frame urls, while the frames
    // enumeration is accurate. When a frame document with the same name exists and its
    // url differs from the tag's src, register a substitution to fix it up.
    if (tag.NameEquals(HTMLTokens.Frame))
    {
        Attr frameName = tag.GetAttribute(HTMLTokens.Name);
        if (frameName != null && this._frames != null)
        {
            LightWeightHTMLDocument frameDocument = GetFrameDocumentByName(frameName.Value);
            if (frameDocument != null)
            {
                Attr frameSource = tag.GetAttribute(HTMLTokens.Src);
                if (frameSource != null && frameSource.Value != frameDocument.Url)
                {
                    Generator.AddSubstitionUrl(new UrlToReplace(frameSource.Value, frameDocument.Url));
                }
            }
        }
    }

    LightWeightTag currentTag = new LightWeightTag(tag);

    // Append the tag to the array stored under its upper-cased name, starting from an
    // empty array when the name has not been seen before.
    string tableKey = tag.Name.ToUpper(CultureInfo.InvariantCulture);
    LightWeightTag[] existingTags = _tagTable.ContainsKey(tableKey)
        ? (LightWeightTag[])_tagTable[tableKey]
        : new LightWeightTag[0];
    LightWeightTag[] extendedTags = new LightWeightTag[existingTags.Length + 1];
    existingTags.CopyTo(extendedTags, 0);
    extendedTags[existingTags.Length] = currentTag;
    _tagTable[tableKey] = extendedTags;

    if (tag.NameEquals(HTMLTokens.Title) && !tag.Complete)
    {
        // Accumulate the title text
        _nextTextIsTitleText = true;
    }
    else if (tag.NameEquals(HTMLTokens.A) && !tag.Complete && tag.GetAttribute(HTMLTokens.Href) != null)
    {
        if (_collectingForTag == null)
        {
            _collectingForTag = currentTag;
        }
        else
        {
            // Already collecting for an anchor: count this nested anchor.
            _collectingForTagDepth++;
        }
    }

    base.OnBeginTag(tag);
}
/// <summary>
/// Retrieves the next element from the stream, or null
/// if the end of the stream has been reached.
/// </summary>
/// <param name="allowPeekElement">
/// When true, an element already queued in peekElements is returned (and removed from the
/// queue) before anything else is considered.
/// </param>
/// <returns>
/// The next element, taken in order of preference from peekElements (if allowed),
/// from elementStack, or freshly parsed from data at pos; null at end of data.
/// </returns>
private Element Next(bool allowPeekElement)
{
    // Previously peeked elements are handed out first, oldest first.
    if (allowPeekElement && peekElements.Count > 0)
    {
        Element peekElement = peekElements[0];
        peekElements.RemoveAt(0);
        return(peekElement);
    }
    // Elements produced by an earlier parse step but not yet returned come next.
    if (elementStack.Count != 0)
    {
        return(elementStack.Pop());
    }
    int dataLen = data.Length;
    if (dataLen == pos)
    {
        // If we're at EOF, return
        return(null);
    }
    // None of the special cases are true. Start consuming characters
    // tokenStart marks where the pending run of plain text begins.
    int tokenStart = pos;
    while (true)
    {
        // Consume everything until a tag-looking thing
        while (pos < dataLen && data[pos] != '<')
        {
            pos++;
        }
        if (pos >= dataLen)
        {
            // EOF has been reached.
            // Whatever was consumed since tokenStart is returned as one text element.
            if (tokenStart != pos)
            {
                return(new Text(data, tokenStart, pos - tokenStart));
            }
            else
            {
                return(null);
            }
        }
        // We are positioned on a tag-looking thing ('<'). Try
        // parsing it as markup. If it doesn't turn out to be a tag,
        // it stays part of the pending text run.
        int oldPos = pos;
        Element element;
        EndTag trailingEnd;
        int len = ParseMarkup(out element, out trailingEnd);
        if (len >= 0)
        {
            // A non-negative length means markup was parsed; advance past it.
            pos += len;
            if (trailingEnd != null)
            {
                // empty-element tag detected, add implicit end tag
                // (pushed first so it is popped after the element itself)
                elementStack.Push(trailingEnd);
            }
            else if (element is BeginTag)
            {
                // look for <script> or <style> body
                Regex consumeTextUntil = null;
                BeginTag tag = (BeginTag)element;
                if (tag.NameEquals("script"))
                {
                    consumeTextUntil = endScript;
                }
                else if (tag.NameEquals("style"))
                {
                    consumeTextUntil = endStyle;
                }
                if (consumeTextUntil != null)
                {
                    // Advance pos by the length ConsumeStructuredText reports for the
                    // script/style content found with the chosen regex.
                    int structuredTextLen = ConsumeStructuredText(data, pos, consumeTextUntil);
                    pos += structuredTextLen;
                }
            }
            elementStack.Push(element);
            // Text that preceded the markup is pushed last so it is returned first.
            if (oldPos != tokenStart)
            {
                elementStack.Push(new Text(data, tokenStart, oldPos - tokenStart));
            }
            return(elementStack.Pop());
        }
        else
        {
            // '<' didn't begin a tag after all;
            // consume it and continue
            pos++;
            continue;
        }
    }
}
/// <summary>
/// Decides whether a begin tag must be dropped: either its name matches IllegalTagName, or
/// styles are being removed (Flag.RemoveStyles) and the tag is a link element that
/// references a stylesheet.
/// </summary>
/// <param name="tag">The begin tag to classify.</param>
/// <returns>true when the tag is illegal; otherwise false.</returns>
private bool IsIllegalTag(BeginTag tag)
{
    if (IsRegexMatch(IllegalTagName, tag.Name))
    {
        return true;
    }

    if (FlagIsSet(Flag.RemoveStyles) && tag.NameEquals("link"))
    {
        // If this link element is a stylesheet, it is illegal.
        Attr relAttr = tag.GetAttribute("rel");
        if (relAttr != null && relAttr.Value != null)
        {
            // rel holds a whitespace-separated list of link types, so values such as
            // "alternate stylesheet" must be recognised too. Splitting with a null
            // separator breaks on white-space; empty tokens never match.
            foreach (string linkType in relAttr.Value.Split((char[])null))
            {
                if (linkType.ToUpperInvariant() == "STYLESHEET")
                {
                    return true;
                }
            }
        }
    }

    return false;
}
/// <summary>
/// Filters a begin tag: unterminated tags are dropped when Flag.RemovePartialTags is set,
/// illegal attributes have their values blanked, tags matching IllegalTagTreeName increase
/// suspendTagDepth, and every other legal tag is forwarded to the base implementation while
/// suspendTagDepth is zero.
/// </summary>
/// <param name="tag">The begin tag being processed.</param>
protected override void OnBeginTag(BeginTag tag)
{
    bool dropPartialTag = FlagIsSet(Flag.RemovePartialTags) && tag.Unterminated;
    if (dropPartialTag)
    {
        return;
    }

    // Blank out the value of every illegal attribute on the tag.
    foreach (Attr candidate in tag.Attributes)
    {
        if (IsIllegalAttribute(candidate))
        {
            candidate.Value = string.Empty;
        }
    }

    if (tag.NameEquals("script"))
    {
        Debug.WriteLine("Script tag");
    }

    if (IsRegexMatch(IllegalTagTreeName, tag.Name))
    {
        suspendTagDepth++;
        return;
    }

    // Only legal tags seen while suspendTagDepth is zero are passed on.
    if (!IsIllegalTag(tag) && suspendTagDepth == 0)
    {
        PushStartTag(tag.Name);
        base.OnBeginTag(tag);
    }
}
/// <summary>
/// Reports whether the tag's name equals any entry of _permittedBeforeBody.
/// </summary>
/// <param name="tag">The begin tag to check.</param>
/// <returns>true when the tag's name matches one of the entries; otherwise false.</returns>
private bool TagPermittedAboveBody(BeginTag tag)
{
    bool permitted = false;
    foreach (string allowedName in _permittedBeforeBody)
    {
        if (tag.NameEquals(allowedName))
        {
            permitted = true;
            break;
        }
    }

    return permitted;
}
/// <summary>
/// Emits a begin tag while keeping the output document well-formed: an html tag is emitted
/// before a non-html first tag, a head section with the additional metadata is emitted before
/// the first tag that is not permitted above the body, and a body tag is emitted before the
/// first such tag when no body has been seen. Script tags are dropped, base and meta tags are
/// adjusted from the metadata, script attributes are removed, and remaining attribute values
/// are passed through ReplaceValue.
/// </summary>
/// <param name="tag">The begin tag being processed; null is ignored.</param>
protected override void OnBeginTag(BeginTag tag)
{
    if (tag == null)
    {
        return;
    }

    // The very first tag must be html; emit one if the document did not start with it.
    if (_firstTag)
    {
        if (!tag.NameEquals(HTMLTokens.Html))
        {
            EmitTag(HTMLTokens.Html);
        }
        _firstTag = false;
    }

    // No head seen yet and this tag is not permitted above the body: emit a head section now.
    if (!_seenHead && !TagPermittedAboveBody(tag))
    {
        Emit("<head>");
        EmitAdditionalMetaData();
        Emit("</head>");
        _seenHead = true;
    }

    // Script tags are never emitted; only track nesting for scripts that are not complete.
    if (tag.NameEquals(HTMLTokens.Script))
    {
        if (!tag.Complete)
        {
            _scriptDepth++;
        }
        return;
    }

    if (tag.NameEquals(HTMLTokens.Head))
    {
        _seenHead = true;
    }
    else if (!_seenBody)
    {
        if (tag.NameEquals(HTMLTokens.Body))
        {
            _seenBody = true;
        }
        else if (!TagPermittedAboveBody(tag))
        {
            // A tag not permitted above the body arrived before any body tag: open the body first.
            EmitTag(HTMLTokens.Body);
            _seenBody = true;
        }
    }

    if (tag.NameEquals(HTMLTokens.Base))
    {
        // Without a base value in the metadata, the base tag is dropped entirely.
        if (_metaData == null || _metaData.Base == null)
        {
            return;
        }

        Attr baseHref = tag.GetAttribute(HTMLTokens.Href);
        if (baseHref != null)
        {
            baseHref.Value = _metaData.Base;
        }
        _emittedMetaData.Add(HTMLTokens.Base);
    }

    if (tag.NameEquals(HTMLTokens.Meta))
    {
        ModifyMetaDataAsNecessary(tag);
    }

    foreach (Attr current in tag.Attributes)
    {
        if (current == null)
        {
            continue;
        }

        if (IsScriptAttribute(current))
        {
            tag.RemoveAttribute(current.Name);
        }
        else
        {
            current.Value = ReplaceValue(current.Value);
        }
    }

    Emit(tag.ToString());
    base.OnBeginTag(tag);
}