/// <summary>
/// Determines whether the homepage looks like it is hosted on Blogger.
/// </summary>
/// <remarks>
/// The check passes when the homepage URL is on blogspot.com or blogger.com, when the
/// homepage HTML satisfies <c>BloggerGeneratorCriterion</c>, or when the HTML contains a
/// <c>service.post</c> atom link that points at a blogger.com / blogspot.com host.
/// </remarks>
/// <returns>true if any of the Blogger indicators is found; otherwise false.</returns>
public bool IsBlogger()
{
    // Blogspot blogs are served over both http and https, so accept either scheme
    // (the blogger.com check below already did).
    if (Regex.IsMatch(homepageUrl, @"^https?://.+\.blogspot\.com($|/)", RegexOptions.IgnoreCase)
        || Regex.IsMatch(homepageUrl, @"^http(s)?://(www\.)?blogger\.com($|/)", RegexOptions.IgnoreCase)
        || new HtmlExtractor(html).Seek(new BloggerGeneratorCriterion()).Success)
    {
        return true;
    }

    HtmlExtractor ex = new HtmlExtractor(html);
    while (ex.Seek("<link href rel='service.post' type='application/atom+xml'>").Success)
    {
        BeginTag bt = (BeginTag)ex.Element;
        string atomHref = bt.GetAttributeValue("href");

        // An href attribute without a usable value tells us nothing; keep looking
        // instead of failing on a null string below.
        if (atomHref == null)
            continue;

        // these obsolete Blogger atom links can't be used, but are
        // still a good indication that it's Blogger
        if (atomHref.StartsWith("https://www.blogger.com/atom/", StringComparison.OrdinalIgnoreCase))
            return true;

        // any other blogger or blogspot atom link will be considered a match
        if (Regex.IsMatch(atomHref, @"^https?\:\/\/.+\.blog(ger|spot)\.com\/.*", RegexOptions.IgnoreCase))
            return true;
    }

    return false;
}
/// <summary>
/// Downloads the homepage and returns the regex match for the first text element
/// whose content the supplied regex matches.
/// </summary>
/// <param name="regex">Pattern applied to each text element of the homepage HTML.</param>
/// <returns>The match for the first matching text element, or <c>Match.Empty</c> if none matches.</returns>
public Match MatchHomepageText(Regex regex)
{
    HtmlExtractor extractor = new HtmlExtractor(DownloadHomepage());

    // Only text elements are candidates; tags and other elements are skipped.
    DelegatePredicate matchingText =
        new DelegatePredicate(element => element is Text && regex.IsMatch(element.ToString()));

    return extractor.Seek(matchingText).Success
        ? regex.Match(extractor.Element.ToString())
        : Match.Empty;
}
/// <summary>
/// Scans the script tags of an HTML document and returns the first known image viewer
/// whose pattern matches a script's source URL.
/// </summary>
/// <param name="html">The HTML to scan for <c>&lt;script src&gt;</c> tags.</param>
/// <param name="sourceUrl">URL used to resolve relative script sources.</param>
/// <returns>The matching viewer, or null when no script source matches any viewer pattern.</returns>
public static ImageViewer DetectImageViewer(string html, string sourceUrl)
{
    List<ImageViewer> knownViewers = imageViewers;

    // The patterns are only compiled if at least one usable script tag is found.
    LazyLoader<List<Regex>> compiledPatterns = new LazyLoader<List<Regex>>(delegate
    {
        List<Regex> compiled = new List<Regex>(knownViewers.Count);
        foreach (ImageViewer viewer in knownViewers)
            compiled.Add(new Regex(viewer.Pattern, RegexOptions.CultureInvariant));
        return compiled;
    });

    HtmlExtractor extractor = new HtmlExtractor(html);
    while (extractor.Seek("<script src>").Success)
    {
        string scriptSrc = ((BeginTag)extractor.Element).GetAttributeValue("src");
        if (!String.IsNullOrEmpty(scriptSrc))
        {
            try
            {
                // We need absolute URLs.
                if (!UrlHelper.IsUrl(scriptSrc))
                    scriptSrc = UrlHelper.EscapeRelativeURL(sourceUrl, scriptSrc);

                Uri parsedSrc = new Uri(scriptSrc);
                if (parsedSrc.IsAbsoluteUri)
                {
                    // WinLive 248276: We want just the path portion since there could be an additional
                    // query or fragment on the URL that our regexs can't handle.
                    scriptSrc = parsedSrc.GetLeftPart(UriPartial.Path);
                }
            }
            catch (UriFormatException)
            {
                // We'll just use the regex on the raw attribute value.
            }

            // Patterns are index-aligned with the viewer list they were built from.
            string candidate = scriptSrc;
            int matchIndex = compiledPatterns.Value.FindIndex(pattern => pattern.IsMatch(candidate));
            if (matchIndex >= 0)
                return knownViewers[matchIndex];
        }
    }

    return null;
}
/// <summary>
/// Builds a <c>BlogPost</c> from an Atom entry node.
/// </summary>
/// <param name="entryNode">The Atom entry element to read.</param>
/// <param name="includeCategories">Whether categories should be copied onto the post (also requires <c>Options.SupportsCategories</c>).</param>
/// <param name="documentUri">Document URI handed to the <c>AtomEntry</c> wrapper.</param>
/// <returns>The populated post.</returns>
public override BlogPost Parse(XmlElement entryNode, bool includeCategories, Uri documentUri)
{
    BlogPost result = new BlogPost();
    AtomEntry entry = new AtomEntry(_atomVer, _atomNS, CategoryScheme, _nsMgr, documentUri, entryNode);

    result.Title = entry.Title;
    result.Excerpt = entry.Excerpt;
    result.Id = PostUriToPostId(entry.EditUri);
    result.Permalink = entry.Permalink;

    string body = entry.ContentHtml;
    if (body.Trim().Length != 0)
    {
        HtmlExtractor extractor = new HtmlExtractor(body);
        bool hasMoreAnchor = Options.SupportsExtendedEntries && extractor.Seek("<a name=\"more\">").Success;

        if (!hasMoreAnchor)
        {
            result.Contents = body;
        }
        else
        {
            // The "more" anchor separates the main text from the extended text.
            int anchorStart = extractor.Element.Offset;
            int extendedStart = anchorStart + extractor.Element.Length;

            // When the anchor is closed, the extended text begins after the closing tag;
            // otherwise it begins directly after the opening tag.
            if (extractor.Seek("</a>").Success)
                extendedStart = extractor.Element.Offset + extractor.Element.Length;

            result.SetContents(body.Substring(0, anchorStart), body.Substring(extendedStart));
        }
    }

    result.DatePublished = entry.PublishDate;

    if (Options.SupportsCategories && includeCategories)
        result.Categories = entry.Categories;

    return result;
}
/// <summary>
/// Reads the size of the first embed or object tag in the input that carries both
/// width and height attributes.
/// </summary>
/// <param name="input">HTML to search; null or empty yields the default size.</param>
/// <returns>
/// The parsed size, or a size built from <c>_width</c> and <c>_height</c> when no suitable tag
/// is found or when either attribute is not a plain integer (for example "100%").
/// </returns>
Size FindSizeAttribute(string input)
{
    Size size = new Size(_width, _height);

    if (string.IsNullOrEmpty(input))
        return size;

    try
    {
        RequiredAttribute[] sizeAttributes = new RequiredAttribute[] { new RequiredAttribute("width"), new RequiredAttribute("height") };
        IElementPredicate predicate = new OrPredicate(new BeginTagPredicate("embed", sizeAttributes), new BeginTagPredicate("object", sizeAttributes));
        HtmlExtractor extractor = new HtmlExtractor(input);
        if (extractor.Seek(predicate).Success)
        {
            BeginTag tag = (BeginTag)extractor.Element;

            // Non-numeric dimensions are ordinary markup, not an error: parse defensively
            // and keep the default size rather than relying on an exception.
            int width;
            int height;
            if (int.TryParse(tag.GetAttributeValue("width"), NumberStyles.Integer, CultureInfo.InvariantCulture, out width)
                && int.TryParse(tag.GetAttributeValue("height"), NumberStyles.Integer, CultureInfo.InvariantCulture, out height))
            {
                size = new Size(width, height);
            }
        }
    }
    catch (Exception ex)
    {
        // Anything else (e.g. a parser failure) is unexpected; report it but still return a size.
        Trace.Fail("Exception thrown while trying to find video size: " + ex);
    }

    return size;
}
/// <summary>
/// Extracts the "id" group from the first embed tag attribute that matches one of the
/// configured embed patterns.
/// </summary>
/// <param name="input">HTML to search for embed tags.</param>
/// <returns>
/// The value of the <c>id</c> regex group for the first pattern that matches, or
/// <c>String.Empty</c> when no pattern yields an id.
/// </returns>
private string IdFromEmbed(string input)
{
    foreach (EmbedPattern check in _embedPatterns)
    {
        IElementPredicate predicate = new BeginTagPredicate("embed", new RequiredAttribute[] { new RequiredAttribute(check.Attr) });

        // Each pattern gets a fresh extractor so the search starts at the beginning of the input.
        HtmlExtractor ex = new HtmlExtractor(input);
        ex = ex.Seek(predicate);
        if (!ex.Success)
            continue;

        BeginTag bt = (BeginTag)ex.Element;
        string srcRef = bt.GetAttributeValue(check.Attr);

        // Regex.Match throws on null input; an attribute without a value cannot carry an id.
        if (srcRef == null)
            continue;

        Match m = Regex.Match(srcRef, check.Pattern, RegexOptions.IgnoreCase);
        if (m.Success && m.Groups["id"].Success)
            return m.Groups["id"].Value;
    }

    return String.Empty;
}
/// <summary>
/// Checks whether the input satisfies every configured embed pattern.
/// </summary>
/// <param name="input">HTML to search for embed tags.</param>
/// <returns>
/// true when, for every pattern, the first embed tag carrying the pattern's attribute has a
/// value matching the pattern's regex; false as soon as one pattern is not satisfied.
/// </returns>
public bool MatchesEmbed(string input)
{
    foreach (EmbedPattern check in _embedPatterns)
    {
        IElementPredicate predicate = new BeginTagPredicate("embed", new RequiredAttribute[] { new RequiredAttribute(check.Attr) });

        // Each pattern gets a fresh extractor so the search starts at the beginning of the input.
        HtmlExtractor ex = new HtmlExtractor(input);
        ex = ex.Seek(predicate);
        if (!ex.Success)
            return false; //didn't find embed tag with the attr

        BeginTag bt = (BeginTag)ex.Element;
        string srcRef = bt.GetAttributeValue(check.Attr);

        // Regex.IsMatch throws on null input; a missing value simply does not match.
        if (srcRef == null || !Regex.IsMatch(srcRef, check.Pattern, RegexOptions.IgnoreCase))
            return false;
    }

    return true; //found all predicates
}
/// <summary>
/// Returns the href of the first <c>&lt;base href&gt;</c> tag in the HTML when it is a URL,
/// falling back to the supplied default otherwise.
/// </summary>
/// <param name="html">HTML to search.</param>
/// <param name="defaultUrl">URL returned when no usable base tag is found.</param>
/// <returns>The base URL declared by the document, or <paramref name="defaultUrl"/>.</returns>
public static string GetBaseUrl(string html, string defaultUrl)
{
    HtmlExtractor baseFinder = new HtmlExtractor(html);
    if (!baseFinder.Seek("<base href>").Success)
        return defaultUrl;

    string declaredBase = ((BeginTag)baseFinder.Element).GetAttributeValue("href");
    return UrlHelper.IsUrl(declaredBase) ? declaredBase : defaultUrl;
}
/// <summary>
/// Reads the post body from an XML node, pulling the title out of a leading
/// <c>&lt;title&gt;</c> element and splitting the remainder at <c>&lt;lj-cut&gt;</c> when
/// extended entries are supported.
/// </summary>
/// <param name="xmlNode">Node holding the content, either as text or inside a <c>base64</c> child.</param>
/// <param name="blogPost">Post that receives the title and contents.</param>
private void ParsePostContent(XmlNode xmlNode, BlogPost blogPost)
{
    // Raw content: a "base64" child means the payload must be decoded first.
    XmlNode encodedNode = xmlNode.SelectSingleNode("base64");
    string body = encodedNode == null
        ? xmlNode.InnerText
        : _utf8EncodingNoBOM.GetString(Convert.FromBase64String(encodedNode.InnerText));

    // Title: taken from the title element; the body continues after it.
    HtmlExtractor titleFinder = new HtmlExtractor(body);
    if (titleFinder.Seek("<title>").Success)
    {
        string titleText = titleFinder.CollectTextUntil("title");
        SetPostTitleFromXmlValue(blogPost, titleText);
        body = body.Substring(titleFinder.Parser.Position).TrimStart('\r', '\n');
    }

    // Nothing but whitespace left: leave the post contents untouched.
    if (body.Trim().Length == 0)
        return;

    HtmlExtractor cutFinder = new HtmlExtractor(body);
    bool hasCut = Options.SupportsExtendedEntries && cutFinder.Seek("<lj-cut>").Success;
    if (!hasCut)
    {
        blogPost.Contents = body;
        return;
    }

    // Text before the cut tag is the main body, text after it the extended body.
    int cutStart = cutFinder.Element.Offset;
    int cutEnd = cutFinder.Element.Offset + cutFinder.Element.Length;
    blogPost.SetContents(body.Substring(0, cutStart), body.Substring(cutEnd));
}
/// <summary>
/// Looks for an Atom service link in the head of the homepage HTML and, when found,
/// configures the detector fields for the generic Atom provider.
/// </summary>
/// <param name="url">Not read by this method.</param>
/// <param name="html">Homepage HTML; null means nothing can be detected.</param>
/// <param name="preferredOnly">When true, the link is only accepted if its class attribute contains the word "preferred".</param>
/// <returns>true when a service link was accepted and the fields were populated; otherwise false.</returns>
private bool AttemptGenericAtomLinkDetection(string url, string html, bool preferredOnly)
{
    const string GENERIC_ATOM_PROVIDER_ID = "D48F1B5A-06E6-4f0f-BD76-74F34F520792";

    if (html == null)
        return false;

    // The service link must appear inside the head section.
    HtmlExtractor linkFinder = new HtmlExtractor(html);
    bool foundServiceLink = linkFinder
        .SeekWithin("<head>", "<body>")
        .SeekWithin("<link href rel='service' type='application/atomsvc+xml'>", "</head>")
        .Success;
    if (!foundServiceLink)
        return false;

    IBlogProvider atomProvider = BlogProviderManager.FindProvider(GENERIC_ATOM_PROVIDER_ID);
    BeginTag linkTag = linkFinder.Element as BeginTag;

    if (preferredOnly)
    {
        string cssClasses = linkTag.GetAttributeValue("class");
        if (cssClasses == null || !Regex.IsMatch(cssClasses, @"\bpreferred\b"))
            return false;
    }

    string serviceUrl = linkTag.GetAttributeValue("href");

    Debug.WriteLine("Atom service link detected in the blog homepage");

    _providerId = atomProvider.Id;
    _serviceName = atomProvider.Name;
    _clientType = atomProvider.ClientType;
    _blogName = string.Empty;
    _postApiUrl = serviceUrl;

    IBlogClient atomClient = BlogClientManager.CreateClient(atomProvider.ClientType, _postApiUrl, _credentials);
    atomClient.VerifyCredentials();

    _usersBlogs = atomClient.GetUsersBlogs();
    if (_usersBlogs.Length == 1)
    {
        // A single blog is unambiguous, so adopt its id and name directly.
        _hostBlogId = _usersBlogs[0].Id;
        _blogName = _usersBlogs[0].Name;
    }

    // attempt to read the blog name from the homepage title
    if (String.IsNullOrEmpty(_blogName))
    {
        HtmlExtractor titleFinder = new HtmlExtractor(html);
        if (titleFinder.Seek("<title>").Success)
            _blogName = titleFinder.CollectTextUntil("title");
    }

    return true;
}
/// <summary>
/// Reports whether the HTML contains an embed, object or iframe begin tag.
/// </summary>
/// <param name="html">HTML to search.</param>
/// <returns>true when one of the tags is found; false when none is found or the search throws.</returns>
private static bool ContainsEmbedOrObject(string html)
{
    try
    {
        IElementPredicate embeddingTag = new OrPredicate(
            new BeginTagPredicate("embed"),
            new BeginTagPredicate("object"),
            new BeginTagPredicate("iframe"));

        HtmlExtractor extractor = new HtmlExtractor(html);
        bool found = extractor.Seek(embeddingTag).Success;
        return found;
    }
    catch
    {
        // Best effort: any failure while scanning is treated as "not found".
        return false;
    }
}
/// <summary>
/// Decides whether content should be updated by comparing the title of the first
/// titled img tag in the old and new HTML.
/// </summary>
/// <param name="oldHTML">The existing HTML.</param>
/// <param name="newHTML">The candidate replacement HTML.</param>
/// <returns>
/// false only when both documents contain an <c>&lt;img title&gt;</c> tag and the two titles
/// are equal; true in every other case.
/// </returns>
public bool ShouldUpdateContent(string oldHTML, string newHTML)
{
    HtmlExtractor oldExtractor = new HtmlExtractor(oldHTML);
    HtmlExtractor newExtractor = new HtmlExtractor(newHTML);

    HtmlExtractor oldImage = oldExtractor.Seek("<img title>");
    HtmlExtractor newImage = newExtractor.Seek("<img title>");

    // Without a titled image on both sides there is nothing to compare.
    if (!oldImage.Success || !newImage.Success)
        return true;

    string oldTitle = ((BeginTag)oldImage.Element).GetAttributeValue("title");
    string newTitle = ((BeginTag)newImage.Element).GetAttributeValue("title");
    return oldTitle != newTitle;
}
/// <summary>
/// Removes every div whose class attribute matches the given class, together with the
/// HTML collected up to the end of that div, and returns the remaining markup.
/// </summary>
/// <remarks>
/// Warning: Does not deal with escaping properly. This is fine as long as
/// we're only using it for content we generate and there are no security
/// implications.
/// </remarks>
/// <param name="html">The HTML to filter.</param>
/// <param name="cssClass">The class value to look for; inserted verbatim into the search criterion.</param>
/// <returns>The input with the matching divs removed, or the input itself when the class name does not occur.</returns>
public static string StripDivsWithClass(string html, string cssClass)
{
    // Cheap pre-check. Class names are markup identifiers, so compare ordinally
    // rather than with the current culture's linguistic rules.
    if (html.IndexOf(cssClass, StringComparison.Ordinal) < 0)
        return html;

    string divCriterion = "<div class='" + cssClass + "'>";

    StringBuilder sb = new StringBuilder();
    HtmlExtractor ex = new HtmlExtractor(html);
    int pos = 0;
    while (ex.Seek(divCriterion).Success)
    {
        // Keep everything between the previous cut and this div...
        sb.Append(html, pos, ex.Element.Offset - pos);

        // ...then skip forward past the div's content.
        ex.Parser.CollectHtmlUntil("div");
        pos = ex.Parser.Position;
    }

    // Keep whatever follows the last removed div.
    sb.Append(html, pos, html.Length - pos);
    return sb.ToString();
}
/// <summary>
/// Any setting that is derived from the homepage html needs to be in this function. This function is turned
/// on and off when detecting blog settings through the IncludeHomePageSettings. None of these checks will be run
/// if the internet is not active. As each check is made, it does not need to be applied back to the _content until the end,
/// at which time it will write the settings back to the registry.
/// </summary>
/// <remarks>
/// Detects the character set, whether the doctype mentions XHTML, whether the html or body tag
/// declares an RTL direction, and which DHTML image viewer (if any) the page uses. The results are
/// stored in <c>_context.HomePageOverrides</c>.
/// </remarks>
private void DetectHomePageSettings()
{
    if (_homepageAccessor.HtmlDocument == null)
        return;

    IDictionary homepageSettings = new Hashtable();

    Debug.Assert(!UseManifestCache, "This code will not run correctly under the manifest cache, due to option overrides not being set");

    // One metadata reader serves both the charset and the doctype checks.
    LightWeightHTMLMetaData metaData = new LightWeightHTMLMetaData(_homepageAccessor.HtmlDocument);
    if (metaData.Charset != null)
    {
        try
        {
            homepageSettings.Add(BlogClientOptions.CHARACTER_SET, metaData.Charset);
        }
        catch (NotSupportedException)
        {
            //not an actual encoding
        }
    }

    string docType = metaData.DocType;
    if (docType != null && docType.IndexOf("xhtml", StringComparison.OrdinalIgnoreCase) >= 0)
    {
        homepageSettings.Add(BlogClientOptions.REQUIRES_XHTML, true.ToString(CultureInfo.InvariantCulture));
    }

    //checking whether blog is rtl
    HtmlExtractor extractor = new HtmlExtractor(_homepageAccessor.HtmlDocument.RawHtml);
    if (extractor.Seek(new OrPredicate(
        new SmartPredicate("<html dir>"),
        new SmartPredicate("<body dir>"))).Success)
    {
        BeginTag tag = (BeginTag)extractor.Element;
        string dir = tag.GetAttributeValue("dir");
        if (String.Compare(dir, "rtl", StringComparison.OrdinalIgnoreCase) == 0)
        {
            homepageSettings.Add(BlogClientOptions.TEMPLATE_IS_RTL, true.ToString(CultureInfo.InvariantCulture));
        }
    }

    // The document is known to be non-null here (see the guard at the top),
    // so the viewer detection always runs.
    string html = _homepageAccessor.OriginalHtml;
    ImageViewer viewer = DhtmlImageViewers.DetectImageViewer(html, _context.HomepageUrl);
    if (viewer != null)
    {
        homepageSettings.Add(BlogClientOptions.DHTML_IMAGE_VIEWER, viewer.Name);
    }

    _context.HomePageOverrides = homepageSettings;
}
/// <summary>
/// Replaces each preserve-class span whose id has the form "preserve&lt;key&gt;" with the
/// stored value for that key, leaving all other markup as it is.
/// </summary>
/// <param name="html">HTML containing the placeholder spans.</param>
/// <returns>The HTML with every recognised placeholder span replaced by its stored value.</returns>
public string RestorePreserved(string html)
{
    StringBuilder output = new StringBuilder();
    HtmlExtractor extractor = new HtmlExtractor(html);
    string spanCriterion = "<span class='" + PRESERVE_CLASS + "'>";

    // Index in html up to which content has already been written or deliberately dropped.
    int copiedUpTo = 0;

    while (extractor.Seek(spanCriterion).Success)
    {
        // Copy everything that precedes this span verbatim.
        int spanStart = extractor.Element.Offset;
        output.Append(html, copiedUpTo, spanStart - copiedUpTo);
        copiedUpTo = spanStart;

        string spanId = ((BeginTag)extractor.Element).GetAttributeValue("id");
        Match idMatch = Regex.Match(spanId ?? "", @"^preserve([a-zA-Z0-9]+)$");
        if (!idMatch.Success)
            continue; // not one of ours; the span is copied as-is by a later append

        string restoredValue;
        if (!preserved.TryGetValue(idMatch.Groups[1].Value, out restoredValue))
            continue; // unknown key; keep the span untouched

        output.Append(restoredValue);

        // Skip the placeholder's own content; if no element is left, drop the rest of the input.
        extractor.CollectTextUntil("span");
        copiedUpTo = extractor.Element == null ? html.Length : extractor.Parser.Position;
    }

    output.Append(html, copiedUpTo, html.Length - copiedUpTo);
    return output.ToString();
}
/// <summary>
/// Creates a predicate from a criterion string by handing it to <c>HtmlExtractor.Parse</c>.
/// </summary>
/// <param name="criterion">The criterion string to parse, e.g. a tag pattern such as "&lt;html dir&gt;".</param>
public SmartPredicate(string criterion)
{
    this.actualPredicate = HtmlExtractor.Parse(criterion);
}
/// <summary>
/// Reports whether the input contains an embed or object begin tag.
/// </summary>
/// <param name="input">HTML to search.</param>
/// <returns>true when an embed or object tag is found; otherwise false.</returns>
private static bool ContainsEmbed(string input)
{
    IElementPredicate embedOrObject = new OrPredicate(
        new BeginTagPredicate("embed"),
        new BeginTagPredicate("object"));

    HtmlExtractor extractor = new HtmlExtractor(input);
    bool found = extractor.Seek(embedOrObject).Success;
    return found;
}