/// <summary>
/// Tests whether an element is a <c>meta</c> begin tag whose <c>name</c> attribute is exactly
/// "generator" and whose <c>content</c> attribute equals "blogger" (case-insensitive, invariant culture).
/// </summary>
/// <param name="e">The parsed element to test.</param>
/// <returns>true only when every condition above holds; false otherwise (including non-begin-tag elements).</returns>
public bool IsMatch(Element e)
{
    BeginTag metaTag = e as BeginTag;
    if (metaTag == null || !metaTag.NameEquals("meta"))
    {
        return false;
    }

    // The attribute value comparison is an ordinal, case-sensitive string comparison.
    if (tag_IsNotGenerator(metaTag.GetAttributeValue("name")))
    {
        return false;
    }

    string generator = metaTag.GetAttributeValue("content");
    return generator != null
        && CaseInsensitiveComparer.DefaultInvariant.Compare("blogger", generator) == 0;
}

/// <summary>Returns true when the given meta name is anything other than the exact string "generator".</summary>
private static bool tag_IsNotGenerator(string metaName)
{
    return metaName != "generator";
}
/// <summary>
/// Creates the form control object that corresponds to an <c>input</c> tag, based on its
/// (trimmed, lower-cased) <c>type</c> attribute.
/// </summary>
/// <param name="parentForm">The form passed to the control's constructor.</param>
/// <param name="inputTag">The <c>input</c> begin tag being processed.</param>
/// <remarks>
/// The created objects are not stored here; presumably each constructor registers the control
/// with <paramref name="parentForm"/> (verify against the control classes).
/// </remarks>
private void HandleInput(HtmlForm parentForm, BeginTag inputTag)
{
    string type = inputTag.GetAttributeValue("type");
    if (type != null)
    {
        type = type.Trim().ToLowerInvariant();
    }

    string name = inputTag.GetAttributeValue("name");
    string value = inputTag.GetAttributeValue("value");

    if (type == "checkbox" || type == "radio")
    {
        // Both kinds carry a "checked" state: the attribute merely has to be present.
        int ignoredIndex;
        bool isChecked = inputTag.GetAttribute("checked", true, 0, out ignoredIndex) != null;

        if (type == "checkbox")
        {
            new Checkbox(parentForm, name, value, isChecked);
        }
        else
        {
            new Radio(parentForm, name, value, isChecked);
        }
    }
    else if (type == "submit")
    {
        new SubmitButton(parentForm, name, value);
    }
    else if (type == "image")
    {
        new ImageButton(parentForm, name, value, inputTag.GetAttributeValue("src"));
    }
    else if (type == "hidden")
    {
        new Hidden(parentForm, name, value);
    }
    else
    {
        // "text", "password", a missing type and any unrecognized type are all treated as a text box.
        new Textbox(parentForm, name, value);
    }
}
/// <summary>
/// Heuristically decides whether the blog described by <c>homepageUrl</c> and <c>html</c> is hosted on Blogger.
/// </summary>
/// <returns>
/// true when the homepage URL is on blogspot.com or blogger.com (http or https), when the homepage
/// HTML matches <c>BloggerGeneratorCriterion</c>, or when the HTML contains a
/// <c>service.post</c> atom link pointing at a blogger.com/blogspot.com host; false otherwise.
/// </returns>
public bool IsBlogger()
{
    // The blogspot pattern accepts https as well as http, in line with the blogger.com
    // pattern below and the atom-link pattern further down.
    if (Regex.IsMatch(homepageUrl, @"^https?://.+\.blogspot\.com($|/)", RegexOptions.IgnoreCase)
        || Regex.IsMatch(homepageUrl, @"^http(s)?://(www\.)?blogger\.com($|/)", RegexOptions.IgnoreCase)
        || new HtmlExtractor(html).Seek(new BloggerGeneratorCriterion()).Success)
    {
        return true;
    }

    HtmlExtractor ex = new HtmlExtractor(html);
    while (ex.Seek("<link href rel='service.post' type='application/atom+xml'>").Success)
    {
        BeginTag bt = (BeginTag)ex.Element;
        string atomHref = bt.GetAttributeValue("href");
        if (atomHref == null)
        {
            // Nothing to inspect on this link (e.g. an href attribute without a value);
            // keep scanning instead of throwing on the null reference.
            continue;
        }

        // these obsolete Blogger atom links can't be used, but are
        // still a good indication that it's Blogger
        if (atomHref.StartsWith("https://www.blogger.com/atom/", StringComparison.OrdinalIgnoreCase))
        {
            return true;
        }

        // any other blogger or blogspot atom link will be considered a match
        if (Regex.IsMatch(atomHref, @"^https?\:\/\/.+\.blog(ger|spot)\.com\/.*", RegexOptions.IgnoreCase))
        {
            return true;
        }
    }

    return false;
}
/// <summary>
/// Looks for an <c>embed</c> or <c>object</c> begin tag that has both <c>width</c> and <c>height</c>
/// attributes and returns those dimensions.
/// </summary>
/// <param name="input">HTML to scan; may be null or empty.</param>
/// <returns>
/// The parsed size, or a size built from <c>_width</c>/<c>_height</c> when the input is null/empty,
/// no matching tag is found, or any exception is raised while scanning or converting.
/// </returns>
Size FindSizeAttribute(string input)
{
    Size fallback = new Size(_width, _height);
    if (string.IsNullOrEmpty(input))
    {
        return fallback;
    }

    try
    {
        RequiredAttribute[] dimensionAttrs = new RequiredAttribute[]
        {
            new RequiredAttribute("width"),
            new RequiredAttribute("height")
        };
        IElementPredicate embedOrObject = new OrPredicate(
            new BeginTagPredicate("embed", dimensionAttrs),
            new BeginTagPredicate("object", dimensionAttrs));

        HtmlExtractor extractor = new HtmlExtractor(input);
        if (extractor.Seek(embedOrObject).Success)
        {
            BeginTag tag = (BeginTag)extractor.Element;

            // Both conversions must succeed before a new size is produced; a failure in either
            // one lands in the catch block and the fallback size is returned.
            int width = Convert.ToInt32(tag.GetAttributeValue("width"), CultureInfo.InvariantCulture);
            int height = Convert.ToInt32(tag.GetAttributeValue("height"), CultureInfo.InvariantCulture);
            return new Size(width, height);
        }
    }
    catch (Exception error)
    {
        Trace.Fail("Exception thrown while trying to find video size: " + error);
    }

    return fallback;
}
/// <summary>
/// Creates a <c>Textarea</c> control from a <c>textarea</c> begin tag, using the tag's <c>name</c>
/// attribute and the text collected from the parser via <c>CollectTextUntil("textarea")</c>.
/// </summary>
/// <param name="parentForm">The form passed to the control's constructor.</param>
/// <param name="textareaTag">The <c>textarea</c> begin tag being processed.</param>
private void HandleTextarea(HtmlForm parentForm, BeginTag textareaTag)
{
    // Arguments are evaluated left to right: the name is read before the parser is advanced.
    new Textarea(
        parentForm,
        textareaTag.GetAttributeValue("name"),
        parser.CollectTextUntil("textarea"));
}
/// <summary>
/// Replaces every preserve-marker span (a <c>span</c> with class <c>PRESERVE_CLASS</c> and an id of
/// the form <c>preserve&lt;alphanumeric id&gt;</c>) whose id is known to the <c>preserved</c> map
/// with the stored value for that id.
/// </summary>
/// <param name="html">The HTML containing marker spans.</param>
/// <returns>The HTML with recognized marker spans replaced; unrecognized spans are copied through unchanged.</returns>
public string RestorePreserved(string html)
{
    string markerPattern = "<span class='" + PRESERVE_CLASS + "'>";

    StringBuilder result = new StringBuilder();
    HtmlExtractor extractor = new HtmlExtractor(html);
    int copiedUpTo = 0;

    while (extractor.Seek(markerPattern).Success)
    {
        // Copy everything between the last emitted position and the start of this span.
        int spanStart = extractor.Element.Offset;
        result.Append(html, copiedUpTo, spanStart - copiedUpTo);
        copiedUpTo = spanStart;

        BeginTag spanTag = (BeginTag)extractor.Element;
        string elementId = spanTag.GetAttributeValue("id") ?? "";
        Match idMatch = Regex.Match(elementId, @"^preserve([a-zA-Z0-9]+)$");
        if (!idMatch.Success)
        {
            // Not one of our markers: leave copiedUpTo at the span start so it is copied later.
            continue;
        }

        string preservedValue;
        if (!preserved.TryGetValue(idMatch.Groups[1].Value, out preservedValue))
        {
            continue;
        }

        result.Append(preservedValue);

        // Skip over the span's contents; resume copying after it, or at the end of the
        // document when the extractor has no current element afterwards.
        extractor.CollectTextUntil("span");
        copiedUpTo = (extractor.Element == null) ? html.Length : extractor.Parser.Position;
    }

    result.Append(html, copiedUpTo, html.Length - copiedUpTo);
    return result.ToString();
}
/// <summary>
/// Scans the <c>script</c> tags of a page and returns the first known image viewer whose
/// pattern matches a script's source URL.
/// </summary>
/// <param name="html">The page HTML to scan.</param>
/// <param name="sourceUrl">URL of the page, used to resolve non-URL <c>src</c> values.</param>
/// <returns>The matching <c>ImageViewer</c>, or null when no script source matches any viewer pattern.</returns>
public static ImageViewer DetectImageViewer(string html, string sourceUrl)
{
    List<ImageViewer> viewers = imageViewers;

    // The regex list is only built if at least one script tag with a usable src is found.
    LazyLoader<List<Regex>> regexes = new LazyLoader<List<Regex>>(delegate
    {
        List<Regex> built = new List<Regex>(viewers.Count);
        foreach (ImageViewer viewer in viewers)
        {
            built.Add(new Regex(viewer.Pattern, RegexOptions.CultureInvariant));
        }
        return built;
    });

    HtmlExtractor extractor = new HtmlExtractor(html);
    while (extractor.Seek("<script src>").Success)
    {
        string src = ((BeginTag)extractor.Element).GetAttributeValue("src");
        if (!String.IsNullOrEmpty(src))
        {
            try
            {
                if (!UrlHelper.IsUrl(src))
                {
                    // We need absolute URLs.
                    src = UrlHelper.EscapeRelativeURL(sourceUrl, src);
                }

                Uri srcUri = new Uri(src);
                if (srcUri.IsAbsoluteUri)
                {
                    // WinLive 248276: We want just the path portion since there could be an additional query or
                    // fragment on the URL that our regexs can't handle.
                    src = srcUri.GetLeftPart(UriPartial.Path);
                }
            }
            catch (UriFormatException)
            {
                // We'll just use the regex on the raw attribute value.
            }

            // Patterns are index-aligned with the viewers list.
            List<Regex> patterns = regexes.Value;
            for (int index = 0; index < patterns.Count; index++)
            {
                if (patterns[index].IsMatch(src))
                {
                    return viewers[index];
                }
            }
        }
    }

    return null;
}
/// <summary>
/// Builds an <c>HtmlForm</c> from a <c>form</c> begin tag and dispatches the <c>input</c>,
/// <c>select</c> and <c>textarea</c> begin tags that follow it to their handlers.
/// </summary>
/// <param name="formTag">The <c>form</c> begin tag; its name, action and method attributes seed the form.</param>
/// <returns>The populated form. Reading stops at the closing <c>form</c> tag or when the parser runs out of elements.</returns>
private HtmlForm HandleForm(BeginTag formTag)
{
    HtmlForm form = new HtmlForm(
        formTag.GetAttributeValue("name"),
        formTag.GetAttributeValue("action"),
        formTag.GetAttributeValue("method"));

    for (Element current = parser.Next(); current != null; current = parser.Next())
    {
        EndTag endTag = current as EndTag;
        if (endTag != null && endTag.NameEquals("form"))
        {
            break;
        }

        BeginTag beginTag = current as BeginTag;
        if (beginTag == null)
        {
            continue;
        }

        string tagName = beginTag.Name.ToLowerInvariant();
        if (tagName == "input")
        {
            HandleInput(form, beginTag);
        }
        else if (tagName == "select")
        {
            HandleSelect(form, beginTag);
        }
        else if (tagName == "textarea")
        {
            HandleTextarea(form, beginTag);
        }
    }

    return form;
}
/// <summary>
/// Copies the HTML through unchanged except for <c>object</c>, <c>embed</c>, <c>noembed</c> and
/// <c>script</c> elements, which are recorded in the <c>preserved</c> map under a fresh id and
/// wrapped in a marker span (<c>id="preserve{id}"</c>, class <c>PRESERVE_CLASS</c>).
/// </summary>
/// <param name="html">The HTML to scan.</param>
/// <returns>The HTML with the preserved elements wrapped in marker spans.</returns>
/// <remarks>
/// A <c>div</c> whose class is exactly one of the two smart-content class names is emitted together
/// with its collected inner HTML and a closing <c>&lt;/div&gt;</c>, without scanning its contents for
/// elements to preserve.
/// </remarks>
public string ScanAndPreserve(string html)
{
    StringBuilder output = new StringBuilder(html.Length);
    SimpleHtmlParser htmlParser = new SimpleHtmlParser(html);

    for (Element element = htmlParser.Next(); element != null; element = htmlParser.Next())
    {
        BeginTag beginTag = element as BeginTag;
        if (beginTag == null)
        {
            output.Append(html, element.Offset, element.Length);
            continue;
        }

        if (beginTag.NameEquals("div"))
        {
            string cssClass = beginTag.GetAttributeValue("class");
            if (cssClass == ContentSourceManager.EDITABLE_SMART_CONTENT
                || cssClass == ContentSourceManager.SMART_CONTENT)
            {
                output.Append(html, element.Offset, element.Length);
                output.Append(htmlParser.CollectHtmlUntil("div"));
                output.Append("</div>");
                continue;
            }
        }

        bool needsPreserving = beginTag.NameEquals("object")
            || beginTag.NameEquals("embed")
            || beginTag.NameEquals("noembed")
            || beginTag.NameEquals("script");
        if (!needsPreserving)
        {
            output.Append(html, element.Offset, element.Length);
            continue;
        }

        // Capture the whole element (begin tag, inner HTML, synthesized end tag) and remember it.
        string innerHtml = htmlParser.CollectHtmlUntil(beginTag.Name);
        string preservedHtml = beginTag.RawText + innerHtml + "</" + beginTag.Name + ">";
        string preserveId = Guid.NewGuid().ToString("N");
        preserved[preserveId] = preservedHtml;

        output.AppendFormat("<span id=\"preserve{0}\" class=\"{1}\">", preserveId, PRESERVE_CLASS);
        output.Append(preservedHtml);
        output.Append("</span>");
    }

    return output.ToString();
}
/// <summary>
/// Detects a meaningless element such as &lt;p&gt;&lt;/p&gt;, &lt;a href="..."&gt;&lt;/a&gt; or
/// &lt;a href="..."&gt; &lt;/a&gt; and, when one is found, consumes the rest of it from the parser.
/// </summary>
/// <param name="htmlParser">Parser positioned just after <paramref name="bt"/>; advanced past the
/// whitespace text (if any) and the end tag when the element is meaningless.</param>
/// <param name="bt">The begin tag to examine.</param>
/// <returns>true if the element was meaningless and its remaining tokens were consumed; otherwise false.</returns>
private static bool RemoveMeaninglessTags(SimpleHtmlParser htmlParser, BeginTag bt)
{
    // Case 1: a <p> without any attributes that is immediately closed.
    bool isBareParagraph = bt.NameEquals("p") && bt.Attributes.Length == 0 && !bt.HasResidue;
    if (isBareParagraph)
    {
        EndTag paragraphEnd = htmlParser.Peek(0) as EndTag;
        if (paragraphEnd != null && paragraphEnd.NameEquals("p"))
        {
            // eat up the end tag
            htmlParser.Next();
            return true;
        }
    }

    // Case 2: an <a> that has an href but no name/style/id, i.e. a link with nothing to anchor
    // or style, whose content is empty or whitespace only.
    bool isPlainLink = bt.NameEquals("a")
        && bt.GetAttribute("name") == null
        && bt.GetAttributeValue("style") == null
        && bt.GetAttributeValue("id") == null
        && bt.GetAttributeValue("href") != null;
    if (!isPlainLink)
    {
        return false;
    }

    Element following = htmlParser.Peek(0);
    bool hasWhitespaceText = following is Text
        && HtmlUtils.UnEscapeEntities(following.RawText, HtmlUtils.UnEscapeMode.NonMarkupText).Trim().Length == 0;
    if (hasWhitespaceText)
    {
        // Look past the whitespace for the end tag.
        following = htmlParser.Peek(1);
    }

    EndTag linkEnd = following as EndTag;
    if (linkEnd == null || !linkEnd.NameEquals("a"))
    {
        return false;
    }

    if (hasWhitespaceText)
    {
        // eat up the whitespace text
        htmlParser.Next();
    }

    // eat up the end tag
    htmlParser.Next();
    return true;
}
/// <summary>
/// Extracts an identifier from an <c>embed</c> tag by trying each entry of <c>_embedPatterns</c>
/// against the tag attribute named by that entry.
/// </summary>
/// <param name="input">HTML that may contain an <c>embed</c> tag; may be null or empty.</param>
/// <returns>
/// The value of the <c>id</c> regex group from the first pattern that matches, or
/// <see cref="String.Empty"/> when the input is null/empty or no pattern yields an id.
/// </returns>
private string IdFromEmbed(string input)
{
    // Nothing to scan; same early-out that FindSizeAttribute applies to its input.
    if (string.IsNullOrEmpty(input))
    {
        return String.Empty;
    }

    foreach (EmbedPattern check in _embedPatterns)
    {
        IElementPredicate predicate = new BeginTagPredicate(
            "embed",
            new RequiredAttribute[] { new RequiredAttribute(check.Attr) });

        HtmlExtractor ex = new HtmlExtractor(input);
        ex = ex.Seek(predicate);
        if (!ex.Success)
        {
            continue;
        }

        BeginTag bt = ex.Element as BeginTag;
        if (bt == null)
        {
            continue;
        }

        // GetAttributeValue can return null (other callers in this file check for it);
        // Regex.Match would throw ArgumentNullException on a null input, so skip this pattern.
        string srcRef = bt.GetAttributeValue(check.Attr);
        if (srcRef == null)
        {
            continue;
        }

        Match m = Regex.Match(srcRef, check.Pattern, RegexOptions.IgnoreCase);
        if (m.Success && m.Groups["id"].Success)
        {
            return m.Groups["id"].Value;
        }
    }

    return String.Empty;
}
/// <summary>
/// Reads the <c>option</c> tags that follow a <c>select</c> begin tag and, on reaching the closing
/// <c>select</c> tag, creates a <c>Select</c> control with the collected options.
/// </summary>
/// <param name="parentForm">The form passed to the control's constructor.</param>
/// <param name="selectTag">The <c>select</c> begin tag being processed.</param>
/// <remarks>
/// If the parser runs out of elements before a closing <c>select</c> tag is seen, no control is created.
/// </remarks>
private void HandleSelect(HtmlForm parentForm, BeginTag selectTag)
{
    string name = selectTag.GetAttributeValue("name");

    int ignoredIndex;
    bool multiple = selectTag.GetAttribute("multiple", true, 0, out ignoredIndex) != null;

    ArrayList options = new ArrayList();

    Element current = parser.Next();
    while (current != null)
    {
        BeginTag beginTag = current as BeginTag;
        if (beginTag != null && beginTag.NameEquals("option"))
        {
            string value = beginTag.GetAttributeValue("value");
            bool isSelected = beginTag.GetAttribute("selected", true, 0, out ignoredIndex) != null;
            string label = string.Empty;

            // The option's label is the text element directly after the begin tag, if any.
            current = parser.Next();
            if (current is Text)
            {
                label = HtmlUtils.UnEscapeEntities(current.ToString(), HtmlUtils.UnEscapeMode.NonMarkupText)
                    .TrimEnd(' ', '\r', '\n', '\t');
                current = parser.Next();
            }

            options.Add(new OptionInfo(value, label, isSelected));

            // 'current' already holds the next unexamined element, so do not advance again here.
        }
        else if (current is EndTag && ((EndTag)current).NameEquals("select"))
        {
            new Select(parentForm, name, multiple, (OptionInfo[])options.ToArray(typeof(OptionInfo)));
            return;
        }
        else
        {
            current = parser.Next();
        }
    }
}
/// <summary>
/// Checks that the input satisfies every entry of <c>_embedPatterns</c>: for each entry an
/// <c>embed</c> tag carrying the entry's attribute must be found, and that attribute's value must
/// match the entry's regex (case-insensitive).
/// </summary>
/// <param name="input">HTML to test.</param>
/// <returns>
/// true when all patterns are satisfied (including the case of an empty pattern list);
/// false as soon as one pattern has no matching tag, no attribute value, or a non-matching value.
/// </returns>
public bool MatchesEmbed(string input)
{
    foreach (EmbedPattern check in _embedPatterns)
    {
        IElementPredicate predicate = new BeginTagPredicate(
            "embed",
            new RequiredAttribute[] { new RequiredAttribute(check.Attr) });

        HtmlExtractor ex = new HtmlExtractor(input);
        ex = ex.Seek(predicate);
        if (!ex.Success)
        {
            return false; //didn't find embed tag with the attr
        }

        BeginTag bt = ex.Element as BeginTag;
        if (bt == null)
        {
            return false;
        }

        // A null attribute value cannot match; returning false here avoids the
        // ArgumentNullException Regex.IsMatch would throw for a null input.
        string srcRef = bt.GetAttributeValue(check.Attr);
        if (srcRef == null || !Regex.IsMatch(srcRef, check.Pattern, RegexOptions.IgnoreCase))
        {
            return false;
        }
    }

    return true; //found all predicates
}
/// <summary>
/// Any setting that is derived from the homepage html needs to be in this function. This function is turned
/// on and off when detecting blog settings through the IncludeHomePageSettings. None of these checks will be run
/// if the internet is not active. The detected values are accumulated in a local dictionary and only
/// assigned to <c>_context.HomePageOverrides</c> at the end.
/// </summary>
/// <remarks>
/// Detects, in order: the character set and XHTML doctype (from the document metadata), whether
/// the template is right-to-left (<c>dir="rtl"</c> on <c>html</c> or <c>body</c>), and the DHTML
/// image viewer used by the page. Does nothing when the homepage document is unavailable.
/// </remarks>
private void DetectHomePageSettings()
{
    if (_homepageAccessor.HtmlDocument == null)
    {
        return;
    }

    IDictionary homepageSettings = new Hashtable();

    Debug.Assert(!UseManifestCache, "This code will not run correctly under the manifest cache, due to option overrides not being set");

    // One metadata object serves both the charset and the doctype lookups.
    LightWeightHTMLMetaData metaData = new LightWeightHTMLMetaData(_homepageAccessor.HtmlDocument);
    if (metaData.Charset != null)
    {
        try
        {
            homepageSettings.Add(BlogClientOptions.CHARACTER_SET, metaData.Charset);
        }
        catch (NotSupportedException)
        {
            //not an actual encoding
        }
    }

    string docType = metaData.DocType;
    if (docType != null)
    {
        bool xhtml = docType.IndexOf("xhtml", StringComparison.OrdinalIgnoreCase) >= 0;
        if (xhtml)
        {
            homepageSettings.Add(BlogClientOptions.REQUIRES_XHTML, true.ToString(CultureInfo.InvariantCulture));
        }
    }

    //checking whether blog is rtl
    HtmlExtractor extractor = new HtmlExtractor(_homepageAccessor.HtmlDocument.RawHtml);
    if (extractor.Seek(new OrPredicate(
        new SmartPredicate("<html dir>"),
        new SmartPredicate("<body dir>"))).Success)
    {
        BeginTag tag = (BeginTag)extractor.Element;
        string dir = tag.GetAttributeValue("dir");
        if (String.Compare(dir, "rtl", StringComparison.OrdinalIgnoreCase) == 0)
        {
            homepageSettings.Add(BlogClientOptions.TEMPLATE_IS_RTL, true.ToString(CultureInfo.InvariantCulture));
        }
    }

    // The document was verified non-null on entry (and already dereferenced above),
    // so the image viewer detection needs no further guard.
    string html = _homepageAccessor.OriginalHtml;
    ImageViewer viewer = DhtmlImageViewers.DetectImageViewer(html, _context.HomepageUrl);
    if (viewer != null)
    {
        homepageSettings.Add(BlogClientOptions.DHTML_IMAGE_VIEWER, viewer.Name);
    }

    _context.HomePageOverrides = homepageSettings;
}
/// <summary>
/// Looks in the page head for an Atom service link
/// (<c>&lt;link rel='service' type='application/atomsvc+xml' href=...&gt;</c>) and, when found,
/// configures this detector for the generic Atom provider and queries the service for the user's blogs.
/// </summary>
/// <param name="url">Base URL used to resolve the link's href via <c>GetAbsoluteUrl</c>.</param>
/// <param name="html">Homepage HTML; when null the method returns false immediately.</param>
/// <param name="preferredOnly">When true, the link is only accepted if its class attribute contains the word "preferred".</param>
/// <returns>true when a service link was found and accepted; false otherwise.</returns>
/// <remarks>
/// On success this assigns <c>_providerId</c>, <c>_serviceName</c>, <c>_clientType</c>, <c>_blogName</c>,
/// <c>_postApiUrl</c> and <c>_usersBlogs</c> (and <c>_hostBlogId</c> when exactly one blog is returned),
/// and calls <c>VerifyCredentials</c> and <c>GetUsersBlogs</c> on the created client.
/// </remarks>
private bool AttemptGenericAtomLinkDetection(string url, string html, bool preferredOnly)
{
    const string GENERIC_ATOM_PROVIDER_ID = "D48F1B5A-06E6-4f0f-BD76-74F34F520792";

    if (html == null)
    {
        return false;
    }

    HtmlExtractor extractor = new HtmlExtractor(html);
    bool serviceLinkFound = extractor
        .SeekWithin("<head>", "<body>")
        .SeekWithin("<link href rel='service' type='application/atomsvc+xml'>", "</head>")
        .Success;
    if (!serviceLinkFound)
    {
        return false;
    }

    IBlogProvider atomProvider = BlogProviderManager.FindProvider(GENERIC_ATOM_PROVIDER_ID);
    BeginTag linkTag = extractor.Element as BeginTag;

    if (preferredOnly)
    {
        string classes = linkTag.GetAttributeValue("class");
        if (classes == null || !Regex.IsMatch(classes, @"\bpreferred\b"))
        {
            return false;
        }
    }

    string linkUrl = linkTag.GetAttributeValue("href");

    Debug.WriteLine("Atom service link detected in the blog homepage");

    _providerId = atomProvider.Id;
    _serviceName = atomProvider.Name;
    _clientType = atomProvider.ClientType;
    _blogName = string.Empty;
    _postApiUrl = GetAbsoluteUrl(url, linkUrl);

    IBlogClient client = BlogClientManager.CreateClient(atomProvider.ClientType, _postApiUrl, _credentials);
    client.VerifyCredentials();

    _usersBlogs = client.GetUsersBlogs();
    if (_usersBlogs.Length == 1)
    {
        _hostBlogId = _usersBlogs[0].Id;
        _blogName = _usersBlogs[0].Name;
        /*
         * if (_usersBlogs[0].HomepageUrl != null && _usersBlogs[0].HomepageUrl.Length > 0)
         *     _homepageUrl = _usersBlogs[0].HomepageUrl;
         */
    }

    // attempt to read the blog name from the homepage title
    if (String.IsNullOrEmpty(_blogName))
    {
        HtmlExtractor titleExtractor = new HtmlExtractor(html);
        if (titleExtractor.Seek("<title>").Success)
        {
            _blogName = titleExtractor.CollectTextUntil("title");
        }
    }

    return true;
}
/// <summary>
/// Walks the current contents to find smart content areas. When one is found, it calls the operation on the smart content. The operation has a chance
/// to return new content, which is written out in place of the element's original inner HTML.
/// </summary>
/// <param name="contents">the raw HTML string whose structured blocks will be replaced.</param>
/// <param name="operation">Delegate for generating replacement content. It receives the collected inner HTML by reference.</param>
/// <param name="editMode">If true, the element's class is set to the editable smart content class; otherwise to the plain smart content class.</param>
/// <param name="sourceContext">Used to look up the content source and the smart content item for each block, and passed through to <paramref name="operation"/>.</param>
/// <param name="continueOnError">
/// true - if the plugin throws an exception, it keeps crawling the DOM
/// false - if a plugin throws an exception, it stops processing the DOM and return empty string
/// null - if a plugin throws an exception, this function will rethrow it
/// </param>
/// <returns>the contents with structured blocks replaced.</returns>
internal static string PerformOperation(string contents, SmartContentOperation operation, bool editMode, IContentSourceSidebarContext sourceContext, bool? continueOnError)
{
    //replace all structured content blocks with their editor HTML
    //string html = PostBodyPreprocessor.Preprocess(contents);
    StringBuilder sb = new StringBuilder();
    SimpleHtmlParser parser = new SimpleHtmlParser(contents);
    for (Element e = parser.Next(); e != null; e = parser.Next())
    {
        if (e is BeginTag)
        {
            BeginTag beginTag = (BeginTag)e;
            string elementClassName = beginTag.GetAttributeValue("class");
            // Only the class attribute is tested here; the tag name itself is not checked,
            // even though the replacement below collects up to and emits a closing div.
            if (ContentSourceManager.IsSmartContentClass(elementClassName))
            {
                ISmartContent sContent = null;
                try
                {
                    string contentSourceId, contentItemId;
                    string blockId = beginTag.GetAttributeValue("id");
                    if (blockId != null)
                    {
                        // The element id encodes both the content source id and the content item id.
                        ContentSourceManager.ParseContainingElementId(blockId, out contentSourceId, out contentItemId);
                        ContentSourceInfo contentSource = sourceContext.FindContentSource(contentSourceId);
                        if (contentSource != null && contentSource.Instance is SmartContentSource)
                        {
                            SmartContentSource sSource = (SmartContentSource)contentSource.Instance;
                            sContent = sourceContext.FindSmartContent(contentItemId);
                            if (sContent != null)
                            {
                                //write the div with the appropriate className
                                string newClassName = editMode ? ContentSourceManager.EDITABLE_SMART_CONTENT : ContentSourceManager.SMART_CONTENT;
                                beginTag.GetAttribute("class").Value = newClassName;

                                //replace the inner HTML of the div with the source's editor HTML
                                string content = parser.CollectHtmlUntil("div");
                                sb.Append(e.ToString());
                                // NOTE(review): the begin tag has already been appended at this point. If
                                // operation throws and continueOnError is true, control falls through to the
                                // final sb.Append(e.ToString()) below, so the begin tag is appended a second
                                // time and neither the collected content nor "</div>" is emitted. Confirm
                                // whether that output is intended.
                                operation(sourceContext, sSource, sContent, ref content);
                                sb.Append(content);
                                sb.Append("</div>");
                                // The element has been fully written; skip the generic append at the loop end.
                                continue;
                            }
                        }
                    }
                }
                catch (Exception ex)
                {
                    Trace.WriteLine(String.Format(CultureInfo.InvariantCulture, "Error loading smart content item\r\n{0}", ex));
                    // Treat the block as unknown smart content from here on.
                    sContent = null;
                    if (continueOnError == null)
                    {
                        throw;
                    }
                    if (!continueOnError.Value)
                    {
                        return(String.Empty);
                    }
                }

                if (sContent == null)
                {
                    //this element references an unknown smart content, so it should not be editable
                    Attr classAttr = beginTag.GetAttribute("class");
                    classAttr.Value = ContentSourceManager.SMART_CONTENT;
                }
            }
        }
        // Default path: emit the element as-is (with any class change made above).
        sb.Append(e.ToString());
    }
    return(sb.ToString());
}