/// <summary>
/// Scans the given HTML for smart content elements and gathers the extension
/// data each one refers to.
/// </summary>
/// <param name="content">The HTML to scan.</param>
/// <returns>
/// The extension data found via <c>GetExtensionData</c>, one entry per distinct
/// smart content id. Ids with no extension data are omitted.
/// </returns>
public IExtensionData[] CalculateReferencedExtensionData(string content)
{
    // Keyed by smart content id, so repeated references collapse into one entry.
    Hashtable referencedById = new Hashtable();
    ContentSourceManager.SmartContentPredicate smartContentPredicate = new ContentSourceManager.SmartContentPredicate();
    SimpleHtmlParser htmlParser = new SimpleHtmlParser(content);

    Element element;
    while ((element = htmlParser.Next()) != null)
    {
        if (!smartContentPredicate.IsMatch(element))
            continue;

        BeginTag beginTag = element as BeginTag;
        Attr idAttribute = beginTag.GetAttribute("id");

        // Synchronized WP posts will strip ID attrs (bug 488143)
        if (idAttribute == null)
            continue;

        string sourceId;
        string contentId;
        ContentSourceManager.ParseContainingElementId(idAttribute.Value, out sourceId, out contentId);

        IExtensionData extensionData = GetExtensionData(contentId);
        if (extensionData != null)
            referencedById[contentId] = extensionData;
    }

    return (IExtensionData[])ArrayHelper.CollectionToArray(referencedById.Values, typeof(IExtensionData));
}
/// <summary>
/// Looks for the test title (the text between <c>guid1</c> and <c>guid2</c>) in the
/// homepage HTML and records whether it came back escaped once or twice.
/// </summary>
/// <param name="homepageHtml">The HTML of the blog homepage.</param>
/// <param name="results">Receives the "requiresHtmlTitles" result.</param>
/// <exception cref="InvalidOperationException">
/// No text element contained the guid-delimited title.
/// </exception>
protected internal override void HandleResult(string homepageHtml, ITestResults results)
{
    Regex titlePattern = new Regex(Regex.Escape(guid1) + "(.*?)" + Regex.Escape(guid2));
    SimpleHtmlParser htmlParser = new SimpleHtmlParser(homepageHtml);

    Element element;
    while ((element = htmlParser.Next()) != null)
    {
        // Only text elements are inspected for the delimited title.
        if (!(element is Text))
            continue;

        Match match = titlePattern.Match(element.ToString());
        if (!match.Success)
            continue;

        string captured = match.Groups[1].Value;
        if (captured == HtmlUtils.EscapeEntities(TEST_STRING))
        {
            results.AddResult("requiresHtmlTitles", YES);
        }
        else if (captured == HtmlUtils.EscapeEntities(HtmlUtils.EscapeEntities(TEST_STRING)))
        {
            results.AddResult("requiresHtmlTitles", NO);
        }
        else
        {
            results.AddResult("requiresHtmlTitles", "[ERROR] (value was: " + captured + ")");
        }

        // The first match decides the outcome.
        return;
    }

    throw new InvalidOperationException("Title encoding test failed--title was not detected");
}
/// <summary>
/// Creates a form factory over the HTML read from a stream.
/// </summary>
/// <param name="s">
/// Stream containing the HTML. It is read to the end, and the reader wrapping it
/// is disposed before the constructor returns.
/// </param>
public FormFactory(Stream s)
{
    string html;
    using (StreamReader streamReader = new StreamReader(s))
    {
        html = streamReader.ReadToEnd();
    }

    parser = new SimpleHtmlParser(html);
}
/// <summary>
/// Runs every element parsed from the HTML through <c>Replace</c> and
/// concatenates the results.
/// </summary>
/// <param name="html">The HTML to process.</param>
/// <returns>The concatenation of the <c>Replace</c> results, in document order.</returns>
public virtual string Execute(string html)
{
    SimpleHtmlParser htmlParser = new SimpleHtmlParser(html);
    StringBuilder result = new StringBuilder(html.Length);

    for (Element element = htmlParser.Next(); element != null; element = htmlParser.Next())
    {
        result.Append(Replace(element));
    }

    return result.ToString();
}
/// <summary>
/// Parses the HTML element by element, appending the output of <c>Replace</c>
/// for each element to the result.
/// </summary>
/// <param name="html">The HTML to process.</param>
/// <returns>The accumulated replacement output.</returns>
public virtual string Execute(string html)
{
    SimpleHtmlParser htmlParser = new SimpleHtmlParser(html);
    StringBuilder buffer = new StringBuilder(html.Length);

    Element current = htmlParser.Next();
    while (current != null)
    {
        buffer.Append(Replace(current));
        current = htmlParser.Next();
    }

    return buffer.ToString();
}
/// <summary>
/// Copies the HTML to a new string, wrapping every object/embed/noembed/script
/// element in a marker span and remembering its original markup in
/// <c>preserved</c>. Smart content divs are copied through without wrapping.
/// </summary>
/// <param name="html">The HTML to scan.</param>
/// <returns>The HTML with preserve spans inserted.</returns>
public string ScanAndPreserve(string html)
{
    StringBuilder result = new StringBuilder(html.Length);
    SimpleHtmlParser htmlParser = new SimpleHtmlParser(html);

    for (Element element = htmlParser.Next(); element != null; element = htmlParser.Next())
    {
        BeginTag beginTag = element as BeginTag;
        if (beginTag == null)
        {
            // Anything that is not a begin tag is copied through verbatim.
            result.Append(html, element.Offset, element.Length);
            continue;
        }

        if (beginTag.NameEquals("div"))
        {
            string className = beginTag.GetAttributeValue("class");
            if (className == ContentSourceManager.EDITABLE_SMART_CONTENT
                || className == ContentSourceManager.SMART_CONTENT)
            {
                // Smart content is emitted as one unit so its contents are never wrapped.
                result.Append(html, element.Offset, element.Length);
                result.Append(htmlParser.CollectHtmlUntil("div"));
                result.Append("</div>");
                continue;
            }
        }

        bool needsPreserving = beginTag.NameEquals("object")
            || beginTag.NameEquals("embed")
            || beginTag.NameEquals("noembed")
            || beginTag.NameEquals("script");
        if (!needsPreserving)
        {
            result.Append(html, element.Offset, element.Length);
            continue;
        }

        // Rebuild the whole element and store it under a fresh id so it can be looked up later.
        string innerHtml = htmlParser.CollectHtmlUntil(beginTag.Name);
        string preservedHtml = beginTag.RawText + innerHtml + "</" + beginTag.Name + ">";
        string preserveId = Guid.NewGuid().ToString("N");
        preserved[preserveId] = preservedHtml;

        result.AppendFormat("<span id=\"preserve{0}\" class=\"{1}\">", preserveId, PRESERVE_CLASS);
        result.Append(preservedHtml);
        result.Append("</span>");
    }

    return result.ToString();
}
/// <summary>
/// Turns a single-element HTML snippet into a predicate matching elements of
/// that kind (begin tag, end tag, text, or comment).
/// </summary>
/// <param name="criterion">HTML snippet containing exactly one element.</param>
/// <returns>The predicate corresponding to the element in the snippet.</returns>
/// <exception cref="ArgumentException">
/// The snippet holds no element, more than one element, a malformed begin tag,
/// or an element kind that has no predicate.
/// </exception>
internal static IElementPredicate Parse(string criterion)
{
    SimpleHtmlParser htmlParser = new SimpleHtmlParser(criterion);

    Element element = htmlParser.Next();
    if (element == null)
    {
        Debug.Fail("Criterion was null");
        throw new ArgumentException("Criterion was null");
    }

    // Exactly one element is allowed.
    if (htmlParser.Next() != null)
    {
        Debug.Fail("Too many criteria");
        throw new ArgumentException("Too many criteria");
    }

    BeginTag beginTag = element as BeginTag;
    if (beginTag != null)
    {
        if (beginTag.HasResidue || beginTag.Unterminated)
        {
            Debug.Fail("Malformed criterion");
            throw new ArgumentException("Malformed criterion");
        }

        // Every attribute on the criterion tag becomes a required attribute.
        RequiredAttribute[] required = new RequiredAttribute[beginTag.Attributes.Length];
        for (int index = 0; index < required.Length; index++)
        {
            required[index] = new RequiredAttribute(beginTag.Attributes[index].Name, beginTag.Attributes[index].Value);
        }
        return new BeginTagPredicate(beginTag.Name, required);
    }

    EndTag endTag = element as EndTag;
    if (endTag != null)
        return new EndTagPredicate(endTag.Name);

    if (element is Text)
        return new TextPredicate(element.RawText);

    if (element is Comment)
        return new CommentPredicate(element.RawText);

    Debug.Fail("Invalid criterion \"" + criterion + "\"");
    throw new ArgumentException("Invalid criterion \"" + criterion + "\"");
}
/// <summary>
/// Clones active smart content contained in the provided HTML, and disables unknown smart content.
/// </summary>
/// <param name="html">The HTML about to be inserted into the editor.</param>
/// <param name="sourceContext">Context used to look up and clone smart content.</param>
/// <returns>
/// The HTML with each known smart content element re-pointed at a freshly cloned
/// item, and the smart content attributes removed from elements whose content
/// could not be resolved (including elements that have lost their id attribute).
/// </returns>
public static string PrepareSmartContentHtmlForEditorInsertion(string html, IContentSourceSidebarContext sourceContext)
{
    StringBuilder output = new StringBuilder();
    ContentSourceManager.SmartContentPredicate predicate = new ContentSourceManager.SmartContentPredicate();
    SimpleHtmlParser p = new SimpleHtmlParser(html);
    for (Element el; null != (el = p.Next());)
    {
        if (predicate.IsMatch(el))
        {
            BeginTag bt = el as BeginTag;
            Attr idAttr = bt.GetAttribute("id");

            String contentSourceId = null;
            String contentItemId = null;
            ISmartContent smartContent = null;

            // The id attribute may be missing (e.g. stripped by a blog service). Without it
            // the content cannot be resolved, so the element is treated as unknown smart
            // content instead of failing on a null dereference.
            if (idAttr != null)
            {
                ContentSourceManager.ParseContainingElementId(idAttr.Value, out contentSourceId, out contentItemId);
                smartContent = sourceContext.FindSmartContent(contentItemId);
            }

            if (smartContent != null)
            {
                String newId = Guid.NewGuid().ToString();
                sourceContext.CloneSmartContent(contentItemId, newId);

                if (RefreshableContentManager.ContentSourcesWithRefreshableContent.Contains(contentSourceId))
                {
                    IExtensionData extensionData = sourceContext.FindExtentsionData(newId);
                    Debug.Assert(extensionData != null);

                    // Since we just made a new id for the smart content just about to be inserted
                    // we want to give it a chance to get a callback because its callback might have happened while
                    // it was on the clipboard (in the event of cut). This means the refreshable content manager doesn't know
                    // to watch out for this smart content on paste, it only knows to look out for who created it. Thus
                    // we just force the callback, and if it didn't need it, nothing will happen.
                    // The explicit null check matters in release builds, where Debug.Assert is compiled out.
                    if (extensionData != null && extensionData.RefreshCallBack == null)
                    {
                        extensionData.RefreshCallBack = DateTime.UtcNow;
                    }
                }

                idAttr.Value = ContentSourceManager.MakeContainingElementId(contentSourceId, newId);
            }
            else
            {
                ContentSourceManager.RemoveSmartContentAttributes(bt);
            }
        }
        output.Append(el.ToString());
    }
    return output.ToString();
}
/// <summary>
/// Parses the HTML and returns every element the parser produces, in order.
/// </summary>
/// <param name="html">The HTML to parse.</param>
/// <returns>All parsed elements as an array.</returns>
private static Element[] Elements(string html)
{
    SimpleHtmlParser htmlParser = new SimpleHtmlParser(html);
    ArrayList parsed = new ArrayList();

    for (Element element = htmlParser.Next(); element != null; element = htmlParser.Next())
    {
        parsed.Add(element);
    }

    return (Element[])parsed.ToArray(typeof(Element));
}
/// <summary>
/// Balances the HTML and safely truncates it, using a custom algorithm
/// to determine how much each character/string counts against maxCost.
/// </summary>
/// <param name="html">The HTML to balance.</param>
/// <param name="maxCost">The maximum total cost the output may reach.</param>
/// <param name="costFilter">
/// Computes the cost of each element; when null a <c>DefaultCostFilter</c> is used.
/// </param>
/// <param name="ellipsis">If true, "..." is appended when a text element was truncated.</param>
/// <returns>The truncated HTML with end tags appended for all tags still open.</returns>
public static string Balance(string html, int maxCost, HTMLBalancerCostFilter costFilter, bool ellipsis)
{
    bool textWasTruncated = false;
    SimpleHtmlParser htmlParser = new SimpleHtmlParser(html);
    ArrayList openTags = new ArrayList();
    StringBuilder result = new StringBuilder();
    long balance = 0;  // long to make sure that int32.MaxValue does not cause overflow

    if (costFilter == null)
        costFilter = new DefaultCostFilter();

    for (Element el = htmlParser.Next(); el != null; el = htmlParser.Next())
    {
        // Style, script, comment and directive elements never reach the output.
        if (el is StyleElement || el is ScriptElement || el is Comment || el is MarkupDirective)
            continue;

        // Budget still available after reserving room to close the currently open tags.
        long remaining = Math.Max(0, maxCost - balance - LengthToClose(costFilter, openTags));

        Tag tag = el as Tag;
        if (tag != null)
        {
            BeginTag beginTag = el as BeginTag;
            if (beginTag != null && beginTag.Unterminated)
                continue;  // skip corrupted tags

            if (TagCost(costFilter, openTags, tag) > remaining)
                break;  // don't use this tag; we're done

            RegisterTag(openTags, tag);
            result.Append(el.ToString());
            balance += costFilter.ElementCost(el);
            continue;
        }

        Text text = el as Text;
        if (text != null)
        {
            if (costFilter.ElementCost(el) > remaining)
            {
                // shrink down the text to fit, then stop
                result.Append(costFilter.TruncateText(text, (int)remaining));
                textWasTruncated = true;
                break;
            }

            // plenty of room
            result.Append(el.ToString());
            balance += costFilter.ElementCost(el);
            continue;
        }

        // Any other element kind is all-or-nothing.
        if (costFilter.ElementCost(el) > remaining)
            break;

        result.Append(el.ToString());
        balance += costFilter.ElementCost(el);
    }

    // Append an ellipsis if we truncated text
    // We use "..." here rather than TextHelper.Ellipsis, because some mail clients don't understand "\u2026".
    if (ellipsis && textWasTruncated)
        result.Append("...");

    // Close whatever is still open, innermost first.
    int index = openTags.Count - 1;
    while (index >= 0)
    {
        result.Append(MakeEndTag((string)openTags[index]));
        index--;
    }

    return result.ToString();
}
/// <summary>
/// Temporary fixup for edited HTML: inserts a single space into every td
/// element that is immediately followed by its closing tag.
/// </summary>
/// <param name="args">Carries the HTML to fix up; <c>Html</c> is replaced with the result.</param>
private static void EditorContext_PerformTemporaryFixupsToEditedHtml(TemporaryFixupArgs args)
{
    string html = args.Html;

    // Cheap pre-check: nothing to do unless the text mentions a table at all.
    if (!html.Contains("table"))
        return;

    StringBuilder result = new StringBuilder(html.Length);
    SimpleHtmlParser htmlParser = new SimpleHtmlParser(html);

    Element element;
    while ((element = htmlParser.Next()) != null)
    {
        result.Append(html, element.Offset, element.Length);

        BeginTag beginTag = element as BeginTag;
        if (beginTag == null || !beginTag.NameEquals("td"))
            continue;

        // An empty cell is one whose very next element is the closing td tag.
        EndTag following = htmlParser.Peek(0) as EndTag;
        if (following != null && following.NameEquals("td"))
            result.Append(" ");
    }

    args.Html = result.ToString();
}
/// <summary>
/// Replaces the empty extended-entry behavior div (the div whose id is
/// <c>EXTENDED_ENTRY_ID</c>) with <c>BlogPost.ExtendedEntryBreak</c>.
/// </summary>
/// <param name="args">Carries the HTML to fix up; <c>Html</c> is replaced with the result.</param>
private static void DetachExtendedEntryBehavior(TemporaryFixupArgs args)
{
    string html = args.Html;
    if (!html.Contains(EXTENDED_ENTRY_ID))
        return;

    //replace the EXTENDED_ENTRY_ID behavior div with the <!--more--> comment
    StringBuilder result = new StringBuilder(html.Length);
    SimpleHtmlParser htmlParser = new SimpleHtmlParser(html);
    SmartPredicate splitDivPredicate = new SmartPredicate(String.Format(CultureInfo.InvariantCulture, "<div id='{0}'>", EXTENDED_ENTRY_ID));

    Element element;
    while ((element = htmlParser.Next()) != null)
    {
        if (!splitDivPredicate.IsMatch(element))
        {
            result.Append(html, element.Offset, element.Length);
            continue;
        }

        // NOTE(review): when the matching div is NOT immediately followed by </div>, the
        // begin tag is dropped and nothing is emitted in its place - confirm this is intended.
        EndTag closing = htmlParser.Peek(0) as EndTag;
        if (closing != null && closing.NameEquals("div"))
        {
            result.Append(BlogPost.ExtendedEntryBreak);
            htmlParser.Next(); // consume the closing div
        }
    }

    args.Html = result.ToString();
}
/// <summary>
/// Creates a text source that reads from the given parser.
/// </summary>
/// <param name="parser">The parser to store for later use.</param>
public HtmlTextSource(SimpleHtmlParser parser)
{
    _parser = parser;
}
/// <summary>
/// Namespaced tags come with Office 2007 clipboard data and result in weird
/// namespace declarations being inserted as text into the DOM. This removes
/// tags whose name contains a colon, comments, and markup directives, and strips
/// attributes whose name matches <c>ILLEGAL_ATTR_REGEX</c> from the remaining begin tags.
/// </summary>
/// <param name="html">The HTML to clean.</param>
/// <returns>The cleaned HTML.</returns>
public static string StripNamespacedTagsAndCommentsAndMarkupDirectives(string html)
{
    StringBuilder cleaned = new StringBuilder(html.Length);
    SimpleHtmlParser htmlParser = new SimpleHtmlParser(html);

    Element element;
    while ((element = htmlParser.Next()) != null)
    {
        bool isNamespacedTag = element is Tag && ((Tag)element).Name.IndexOf(':') >= 0;
        if (isNamespacedTag || element is Comment || element is MarkupDirective)
            continue;

        BeginTag beginTag = element as BeginTag;
        if (beginTag != null)
        {
            foreach (Attr attribute in beginTag.Attributes)
            {
                if (ILLEGAL_ATTR_REGEX.IsMatch(attribute.Name))
                    beginTag.RemoveAttribute(attribute.Name);
            }
        }

        cleaned.Append(element.ToString());
    }

    return cleaned.ToString();
}
/// <summary>
/// Replaces every "\r\n" inside text elements with a br tag. All other
/// elements are copied through unchanged.
/// </summary>
/// <param name="html">The HTML to convert.</param>
/// <returns>The converted HTML.</returns>
public static string ConvertNewLinesToBr(string html)
{
    SimpleHtmlParser htmlParser = new SimpleHtmlParser(html);
    StringBuilder result = new StringBuilder();

    for (Element element = htmlParser.Next(); element != null; element = htmlParser.Next())
    {
        // Only text is rewritten; markup keeps its original line breaks.
        string piece = element is Text
            ? element.RawText.Replace("\r\n", "<br/>")
            : element.RawText;
        result.Append(piece);
    }

    return result.ToString();
}
/// <summary>
/// Refuses a new line after HTML that contains an emoticon image; otherwise
/// defers to the base implementation.
/// </summary>
/// <param name="html">The HTML that is being inserted.</param>
/// <returns>
/// False if any img tag has a class attribute containing <c>Emoticon.CLASS_NAME</c>;
/// otherwise whatever the base class returns.
/// </returns>
protected override bool ShouldAllowNewLineInsert(string html)
{
    SimpleHtmlParser htmlParser = new SimpleHtmlParser(html);

    Element element;
    while ((element = htmlParser.Next()) != null)
    {
        BeginTag beginTag = element as BeginTag;
        if (beginTag == null || !beginTag.NameEquals("img"))
            continue;

        // Don't allow new lines after emoticons.
        string classAttributeValue = beginTag.GetAttributeValue("class");
        if (String.IsNullOrEmpty(classAttributeValue))
            continue;

        if (classAttributeValue.Contains(Emoticon.CLASS_NAME))
            return false;
    }

    return base.ShouldAllowNewLineInsert(html);
}
/// <summary>
/// Moves the extractor back to the start of the HTML by discarding the last
/// match and creating a fresh parser over the same HTML.
/// </summary>
/// <returns>
/// This instance, so calls can be chained, for example:
///
/// if (ex.Seek(...).Success || ex.Reset().Seek(...).Success) { ... }
/// </returns>
public HtmlExtractor Reset()
{
    lastMatch = null;

    SimpleHtmlParser freshParser = new SimpleHtmlParser(html);
    parser = freshParser;

    return this;
}
/// <summary>
/// Creates an extractor over the given HTML string.
/// </summary>
/// <param name="html">The HTML to extract from.</param>
public HtmlExtractor(string html)
{
    this.html = html;

    SimpleHtmlParser htmlParser = new SimpleHtmlParser(html);
    this.parser = htmlParser;
}
/// <summary>
/// Creates an extractor over HTML read from a stream.
/// </summary>
/// <param name="data">
/// Stream containing the HTML. It is read to the end, and the reader wrapping it
/// is disposed before the constructor returns.
/// </param>
/// <param name="encoding">The encoding used to decode the stream.</param>
public HtmlExtractor(Stream data, Encoding encoding)
{
    using (StreamReader streamReader = new StreamReader(data, encoding))
    {
        html = streamReader.ReadToEnd();
    }

    SimpleHtmlParser htmlParser = new SimpleHtmlParser(html);
    this.parser = htmlParser;
}
/// <summary>
/// Builds an element predicate from an HTML snippet that contains exactly one
/// element: a begin tag, an end tag, text, or a comment.
/// </summary>
/// <param name="criterion">The HTML snippet describing what to match.</param>
/// <returns>A predicate for the kind of element found in the snippet.</returns>
/// <exception cref="ArgumentException">
/// The snippet is empty, contains more than one element, contains a malformed
/// begin tag, or contains an unsupported element kind.
/// </exception>
internal static IElementPredicate Parse(string criterion)
{
    SimpleHtmlParser criterionParser = new SimpleHtmlParser(criterion);
    Element first = criterionParser.Next();

    if (first == null)
    {
        Trace.Fail("Criterion was null");
        throw new ArgumentException("Criterion was null");
    }

    Element second = criterionParser.Next();
    if (second != null)
    {
        Trace.Fail("Too many criteria");
        throw new ArgumentException("Too many criteria");
    }

    if (first is BeginTag)
    {
        BeginTag beginTag = (BeginTag)first;
        bool malformed = beginTag.HasResidue || beginTag.Unterminated;
        if (malformed)
        {
            Trace.Fail("Malformed criterion");
            throw new ArgumentException("Malformed criterion");
        }

        // Each attribute written on the criterion must be present on a matching tag.
        RequiredAttribute[] requiredAttributes = new RequiredAttribute[beginTag.Attributes.Length];
        int position = 0;
        while (position < requiredAttributes.Length)
        {
            requiredAttributes[position] = new RequiredAttribute(beginTag.Attributes[position].Name, beginTag.Attributes[position].Value);
            position++;
        }

        return new BeginTagPredicate(beginTag.Name, requiredAttributes);
    }

    if (first is EndTag)
    {
        return new EndTagPredicate(((EndTag)first).Name);
    }

    if (first is Text)
    {
        return new TextPredicate(first.RawText);
    }

    if (first is Comment)
    {
        return new CommentPredicate(first.RawText);
    }

    Trace.Fail("Invalid criterion \"" + criterion + "\"");
    throw new ArgumentException("Invalid criterion \"" + criterion + "\"");
}
/// <summary>
/// Walks the current contents to find smart content areas. When one is found, it calls the operation on the smart content. The operation has a chance
/// to return new content. If the content is non-null it will replace the current content.
/// </summary>
/// <param name="contents">the raw HTML string whose structured blocks will be replaced.</param>
/// <param name="operation">Delegate for generating replacement content.</param>
/// <param name="editMode">If true, then the element's stylename will be activated for editing</param>
/// <param name="sourceContext">Context used to look up content sources and smart content items.</param>
/// <param name="continueOnError">
/// true - if the plugin throws an exception, it keeps crawling the DOM
/// false - if a plugin throws an exception, it stops processing the DOM and return empty string
/// null - if a plugin throws an exception, this function will rethrow it
/// </param>
/// <returns>the contents with structured blocks replaced.</returns>
internal static string PerformOperation(string contents, SmartContentOperation operation, bool editMode, IContentSourceSidebarContext sourceContext, bool? continueOnError)
{
    //replace all structured content blocks with their editor HTML
    //string html = PostBodyPreprocessor.Preprocess(contents);
    StringBuilder sb = new StringBuilder();
    SimpleHtmlParser parser = new SimpleHtmlParser(contents);
    for (Element e = parser.Next(); e != null; e = parser.Next())
    {
        if (e is BeginTag)
        {
            BeginTag beginTag = (BeginTag)e;
            string elementClassName = beginTag.GetAttributeValue("class");
            if (ContentSourceManager.IsSmartContentClass(elementClassName))
            {
                ISmartContent sContent = null;

                // Non-null once the block's inner HTML has been pulled out of the parser. If the
                // operation then fails, this is what lets the block be written back unchanged
                // rather than losing its content and emitting the begin tag twice.
                string originalInnerHtml = null;
                try
                {
                    string contentSourceId, contentItemId;
                    string blockId = beginTag.GetAttributeValue("id");
                    if (blockId != null)
                    {
                        ContentSourceManager.ParseContainingElementId(blockId, out contentSourceId, out contentItemId);

                        ContentSourceInfo contentSource = sourceContext.FindContentSource(contentSourceId);
                        if (contentSource != null && contentSource.Instance is SmartContentSource)
                        {
                            SmartContentSource sSource = (SmartContentSource)contentSource.Instance;
                            sContent = sourceContext.FindSmartContent(contentItemId);
                            if (sContent != null)
                            {
                                //write the div with the appropriate className
                                string newClassName = editMode ? ContentSourceManager.EDITABLE_SMART_CONTENT : ContentSourceManager.SMART_CONTENT;
                                beginTag.GetAttribute("class").Value = newClassName;

                                //replace the inner HTML of the div with the source's editor HTML
                                originalInnerHtml = parser.CollectHtmlUntil("div");
                                string content = originalInnerHtml;

                                // Run the operation BEFORE writing anything, so a failure
                                // leaves the output buffer untouched for this block.
                                operation(sourceContext, sSource, sContent, ref content);

                                sb.Append(e.ToString());
                                sb.Append(content);
                                sb.Append("</div>");
                                continue;
                            }
                        }
                    }
                }
                catch (Exception ex)
                {
                    Trace.WriteLine(String.Format(CultureInfo.InvariantCulture, "Error loading smart content item\r\n{0}", ex));
                    sContent = null;

                    if (continueOnError == null)
                        throw;

                    if (!continueOnError.Value)
                        return String.Empty;
                }

                if (sContent == null)
                {
                    //this element references an unknown smart content, so it should not be editable
                    Attr classAttr = beginTag.GetAttribute("class");
                    classAttr.Value = ContentSourceManager.SMART_CONTENT;
                }

                if (originalInnerHtml != null)
                {
                    // The inner HTML was already consumed from the parser before the failure, so
                    // emit the whole block here (non-editable, original content), in the same
                    // shape as the success path.
                    sb.Append(e.ToString());
                    sb.Append(originalInnerHtml);
                    sb.Append("</div>");
                    continue;
                }
            }
        }
        sb.Append(e.ToString());
    }
    return sb.ToString();
}
/// <summary>
/// Decides if, after the given HTML is inserted, we should insert a new line.
/// </summary>
/// <param name="html">The HTML that is being inserted.</param>
/// <returns>True if the HTML contains at least one div or img begin tag; otherwise false.</returns>
protected virtual bool ShouldAllowNewLineInsert(string html)
{
    SimpleHtmlParser htmlParser = new SimpleHtmlParser(html);

    Element element;
    while ((element = htmlParser.Next()) != null)
    {
        BeginTag beginTag = element as BeginTag;
        if (beginTag == null)
            continue;

        if (beginTag.NameEquals("div") || beginTag.NameEquals("img"))
            return true;
    }

    return false;
}
/// <summary>
/// Namespaced tags come with Office 2007 clipboard data and result in weird
/// namespace declarations being inserted as text into the DOM. (Bug 303784)
/// Drops every tag whose name contains a colon and keeps the raw text of
/// everything else.
/// </summary>
/// <param name="html">The HTML to clean.</param>
/// <returns>The HTML without namespaced tags.</returns>
private static string StripNamespacedTags(string html)
{
    StringBuilder kept = new StringBuilder(html.Length);
    SimpleHtmlParser htmlParser = new SimpleHtmlParser(html);

    Element element;
    while ((element = htmlParser.Next()) != null)
    {
        Tag tag = element as Tag;
        bool isNamespaced = tag != null && tag.Name.IndexOf(':') >= 0;
        if (!isNamespaced)
            kept.Append(element.RawText);
    }

    return kept.ToString();
}
/// <summary>
/// Creates a form factory over the given HTML string.
/// </summary>
/// <param name="html">The HTML to parse.</param>
public FormFactory(string html)
{
    SimpleHtmlParser htmlParser = new SimpleHtmlParser(html);
    parser = htmlParser;
}
/// <summary>
/// Re-serializes the HTML, marking as complete every begin tag for which
/// <c>ElementFilters.RequiresEndTag</c> reports that no end tag is required.
/// </summary>
/// <param name="html">The HTML to process.</param>
/// <returns>The re-serialized HTML.</returns>
private static string BalanceHtml(string html)
{
    StringBuilder result = new StringBuilder(html.Length + 10);
    SimpleHtmlParser htmlParser = new SimpleHtmlParser(html);

    for (Element element = htmlParser.Next(); element != null; element = htmlParser.Next())
    {
        BeginTag beginTag = element as BeginTag;
        if (beginTag != null && !ElementFilters.RequiresEndTag(beginTag.Name))
        {
            beginTag.Complete = true;
        }

        result.Append(element.ToString());
    }

    return result.ToString();
}
/// <summary>
/// Rewrites the HTML keeping only the tags listed in the replacement table
/// (substituting tag names where the table says so) and normalizing whitespace
/// through a <c>WhitespaceBuffer</c>.
/// </summary>
/// <param name="html">The HTML to thin.</param>
/// <param name="preserveImages">If false, img tags are dropped.</param>
/// <param name="strict">If true, the strict tag table is used instead of the default one.</param>
/// <param name="modifyReplacements">
/// Optional callbacks that may adjust the tag table; when supplied they are
/// applied to a clone of the table.
/// </param>
/// <returns>The thinned HTML.</returns>
private string ThinInternal(string html, bool preserveImages, bool strict, params ModifyReplacement[] modifyReplacements)
{
    Hashtable tagTable = strict ? _tagSpecsStrict : _tagSpecs;

    if (modifyReplacements != null)
    {
        // Work on a copy so the shared table is never modified by the callbacks.
        tagTable = (Hashtable)tagTable.Clone();
        foreach (ModifyReplacement modify in modifyReplacements)
            modify(tagTable);
    }

    // Will hold the results of the leading whitespace buffer.
    // This buffer may or may not make it into the final result,
    // depending on whether any block-level tags are present.
    StringBuilder leadingOutput = new StringBuilder(10);

    // Will hold the results of everything else.
    StringBuilder mainOutput = new StringBuilder(html.Length);

    // references whichever output buffer is current.
    StringBuilder output = leadingOutput;

    SimpleHtmlParser htmlParser = new SimpleHtmlParser(html);

    bool preserveWhitespace = false;  // <pre> blocks should preserve whitespace
    WhitespaceBuffer whitespaceBuffer = new WhitespaceBuffer();
    whitespaceBuffer.Promote(WhitespaceClass.Paragraph);  // Insert an implicit <p> unless the first non-whitespace element is a block
    bool hasBlock = false;

    for (Element el = htmlParser.Next(); el != null; el = htmlParser.Next())
    {
        Tag tag = el as Tag;
        if (tag != null)
        {
            string lowerName = tag.Name.ToLower(CultureInfo.InvariantCulture);
            TagDesc desc = (TagDesc)tagTable[lowerName];

            // if this tag is not in the table, drop it
            if (desc == null)
                continue;

            // Replace tag with substitute tag if necessary (e.g. <DIV> becomes <P>)
            string tagName = desc.Substitute ?? lowerName;

            // special case for images
            if (!preserveImages && tagName == TAG_IMG)
                continue;

            bool isBeginTag = el is BeginTag;

            ElementClass elClass = WhitespaceBuffer.ClassifyTag(tagName, desc.TagType);
            if (elClass == ElementClass.Block || elClass == ElementClass.Paragraph || elClass == ElementClass.Break)
                hasBlock = true;

            if (!preserveWhitespace && WhitespaceBuffer.ProcessElementClass(ref whitespaceBuffer, output, elClass, true))
                continue;

            output = mainOutput;

            if (isBeginTag)
            {
                WriteBeginTag(desc, tagName, ((BeginTag)el).Attributes, output);
                if (tagName == TAG_PRE)
                    preserveWhitespace = true;
            }
            else if (el is EndTag)
            {
                EndTag endTag = (EndTag)el;
                if (!endTag.Implicit && desc.TagType != TagType.Empty)
                {
                    output.Append("</").Append(tagName).Append(">");
                }
                if (tagName == TAG_PRE)
                    preserveWhitespace = false;
            }

            continue;
        }

        if (el is Text)
        {
            // Normalize entity escaping by unescaping and then escaping again.
            string unescaped = HtmlUtils.UnEscapeEntities(el.RawText, HtmlUtils.UnEscapeMode.NonMarkupText);
            string text = HtmlUtils.EscapeEntities(unescaped);

            if (!preserveWhitespace && WhitespaceBuffer.ProcessElementClass(ref whitespaceBuffer, output, WhitespaceBuffer.ClassifyText(text), false))
                continue;

            output = mainOutput;
            output.Append(text);
        }
    }

    if (hasBlock && ReferenceEquals(mainOutput, output))
        output.Insert(0, leadingOutput.ToString());

    // The whitespace buffer may not be empty at this point. That's OK--we want to drop trailing whitespace

    return output.ToString();
}
/// <summary>
/// Converts tag names, attribute names, and style text to lowercase. Also
/// drops tags that <c>RemoveMeaninglessTags</c> reports as meaningless and the
/// contenteditable/atomicselection/unselectable attributes, repeating the pass
/// until no more tags are removed.
/// </summary>
/// <param name="html">The HTML to clean up.</param>
/// <param name="xml">If true, attribute values and text are escaped for XML.</param>
/// <returns>The cleaned HTML.</returns>
private string CleanupHtml(string html, bool xml)
{
    bool removedTag;
    do
    {
        removedTag = false;
        StringBuilder cleaned = new StringBuilder(html.Length);
        SimpleHtmlParser htmlParser = new SimpleHtmlParser(html);

        Element el;
        while ((el = htmlParser.Next()) != null)
        {
            BeginTag beginTag = el as BeginTag;
            if (beginTag != null)
            {
                if (RemoveMeaninglessTags(htmlParser, beginTag))
                {
                    // Since we are removing a tag, we will want to clean up again, since that might mean
                    // there will be another tag to remove
                    removedTag = true;
                    continue;
                }

                cleaned.Append("<");
                cleaned.Append(beginTag.Name.ToLower(CultureInfo.InvariantCulture));

                foreach (Attr attr in beginTag.Attributes)
                {
                    bool isDroppedAttribute = attr.NameEquals("contenteditable")
                        || attr.NameEquals("atomicselection")
                        || attr.NameEquals("unselectable");
                    if (isDroppedAttribute)
                        continue;

                    cleaned.Append(" ");
                    cleaned.Append(attr.Name.ToLower(CultureInfo.InvariantCulture));

                    if (attr.Value == null)
                        continue;

                    string value = attr.Value;
                    if (attr.NameEquals("style"))
                    {
                        value = LowerCaseCss(value);
                    }
                    else if (attr.Name == attr.Value)
                    {
                        // Value repeats the attribute name, so it is lowercased along with it.
                        value = value.ToLower(CultureInfo.InvariantCulture);
                    }

                    string escapedValue = xml
                        ? HtmlUtils.EscapeEntitiesForXml(value, true)
                        : HtmlUtils.EscapeEntities(value);
                    cleaned.AppendFormat("=\"{0}\"", escapedValue);
                }

                if (beginTag.HasResidue)
                {
                    if (beginTag.Attributes.Length == 0)
                        cleaned.Append(" ");
                    cleaned.Append(beginTag.Residue);
                }

                if (beginTag.Complete)
                    cleaned.Append(" /");

                cleaned.Append(">");
            }
            else if (el is EndTag)
            {
                string endTagName = ((EndTag)el).Name.ToLower(CultureInfo.InvariantCulture);
                cleaned.AppendFormat("</{0}>", endTagName);
            }
            else if (el is Text)
            {
                string textHtml = HtmlUtils.TidyNbsps(el.RawText);
                if (xml)
                {
                    string unescaped = HtmlUtils.UnEscapeEntities(textHtml, HtmlUtils.UnEscapeMode.NonMarkupText);
                    textHtml = HtmlUtils.EscapeEntitiesForXml(unescaped, false);
                }
                cleaned.Append(textHtml);
            }
            else if (el is StyleText)
            {
                cleaned.Append(el.RawText.ToLower(CultureInfo.InvariantCulture));
            }
            else
            {
                cleaned.Append(el.RawText);
            }
        }

        html = cleaned.ToString();
    } while (removedTag);

    return html;
}
/// <summary>
/// Reports whether the div tags in the HTML fail to pair up.
/// </summary>
/// <param name="html">The HTML to inspect.</param>
/// <returns>
/// True if a div is closed before any div is open (for example
/// <c>&lt;/div&gt;&lt;div&gt;</c>), or if the number of opening and closing div tags
/// differs; otherwise false.
/// </returns>
public static bool ContainsUnbalancedDivs(string html)
{
    // Number of div tags currently open at this point of the scan.
    int depth = 0;
    SimpleHtmlParser p = new SimpleHtmlParser(html);
    for (Element e; (e = p.Next()) != null;)
    {
        if (e is Tag && ((Tag)e).NameEquals("div"))
        {
            if (e is BeginTag)
            {
                ++depth;
            }
            else if (--depth < 0)
            {
                // A closing div with nothing open is unbalanced even if a later
                // opening div would bring the net count back to zero.
                return true;
            }
        }
    }
    return depth != 0;
}
/// <summary>
/// Is the tag a meaningless tag such as &lt;p&gt;&lt;/p&gt; or &lt;a href="..."&gt;&lt;/a&gt; or &lt;a href="..."&gt; &lt;/a&gt;.
/// If so, the rest of the element (optional whitespace text and the end tag)
/// is consumed from the parser.
/// </summary>
/// <param name="htmlParser">Parser positioned just after <paramref name="bt"/>.</param>
/// <param name="bt">The begin tag being examined.</param>
/// <returns>True if the tag was meaningless and its remainder was consumed; otherwise false.</returns>
private static bool RemoveMeaninglessTags(SimpleHtmlParser htmlParser, BeginTag bt)
{
    // Look to see if the tag is a <p> without any attributes
    bool isBareParagraph = bt.NameEquals("p") && bt.Attributes.Length == 0 && !bt.HasResidue;
    if (isBareParagraph)
    {
        // Look to see if there is a matching end tag to the element we are looking at
        EndTag paragraphEnd = htmlParser.Peek(0) as EndTag;
        if (paragraphEnd != null && paragraphEnd.NameEquals("p"))
        {
            // eat up the end tag
            htmlParser.Next();
            return true;
        }
    }

    // Look to see if the tag is an <a> without a style/id/name attribute, but has an href... meaning the link is not useful
    bool isPlainLink = bt.NameEquals("a")
        && bt.GetAttribute("name") == null
        && bt.GetAttributeValue("style") == null
        && bt.GetAttributeValue("id") == null
        && bt.GetAttributeValue("href") != null;
    if (!isPlainLink)
        return false;

    Element following = htmlParser.Peek(0);
    bool skippedWhitespace = false;

    // Look to see if the a just has whitespace inside of it
    if (following is Text
        && HtmlUtils.UnEscapeEntities(following.RawText, HtmlUtils.UnEscapeMode.NonMarkupText).Trim().Length == 0)
    {
        following = htmlParser.Peek(1);
        skippedWhitespace = true;
    }

    // Look to see if there is a matching end tag to the element we are looking at
    EndTag linkEnd = following as EndTag;
    if (linkEnd == null || !linkEnd.NameEquals("a"))
        return false;

    // if this was an <a> with whitespace in the middle eat it up
    if (skippedWhitespace)
        htmlParser.Next();

    // eat up the end tag
    htmlParser.Next();
    return true;
}
/// <summary>
/// Rewinds the extractor to the beginning of the HTML: the last match is
/// cleared and a new parser is created over the same HTML.
/// </summary>
/// <returns>
/// This instance, which allows calls to be chained together, like this:
///
/// if (ex.Seek(...).Success || ex.Reset().Seek(...).Success) { ... }
/// </returns>
public HtmlExtractor Reset()
{
    lastMatch = null;

    SimpleHtmlParser restartedParser = new SimpleHtmlParser(html);
    parser = restartedParser;

    return this;
}
/// <summary>
/// Parses <c>_html</c> and raises the matching <c>On...</c> callback for every
/// element, bracketed by <c>OnDocumentBegin</c> and <c>OnDocumentEnd</c>.
/// </summary>
public void Parse()
{
    SimpleHtmlParser htmlParser = new SimpleHtmlParser(_html);

    OnDocumentBegin();

    for (Element current = htmlParser.Next(); ; current = htmlParser.Next())
    {
        // A null element marks the end of the document.
        if (current == null)
        {
            OnDocumentEnd();
            return;
        }

        // The checks are deliberately kept in this order; the first matching type wins.
        if (current is BeginTag)
            OnBeginTag((BeginTag)current);
        else if (current is EndTag)
            OnEndTag((EndTag)current);
        else if (current is ScriptLiteral)
            OnScriptLiteral((ScriptLiteral)current);
        else if (current is Comment)
            OnComment((Comment)current);
        else if (current is MarkupDirective)
            OnMarkupDirective((MarkupDirective)current);
        else if (current is ScriptText)
            OnScriptText((ScriptText)current);
        else if (current is ScriptComment)
            OnScriptComment((ScriptComment)current);
        else if (current is StyleText)
            OnStyleText((StyleText)current);
        else if (current is StyleUrl)
            OnStyleUrl((StyleUrl)current);
        else if (current is StyleImport)
            OnStyleImport((StyleImport)current);
        else if (current is StyleComment)
            OnStyleComment((StyleComment)current);
        else if (current is StyleLiteral)
            OnStyleLiteral((StyleLiteral)current);
        else if (current is Text)
            OnText((Text)current);
        else
            Debug.Fail("Unrecognized element in LightWeightHTMLDocumentIterator");
    }
}